示例#1
0
    def _combineTruthsets(self, samples):
        """ Combines the truthsets of several samples into one sorted VCF per variant type.

            Parameters
            ----------
                samples: list<dict<>>
                    A list of outputs from self._per_sample(). Keys read here:
                        * 'PatientID'
                        * 'filename-indel'
                        * 'filename-snv' (the legacy 'filename-snp' key is
                          accepted as a fallback)

            Returns
            -------
                dict
                    * 'PatientID': the comma-joined short patient ids
                    * 'filename-indel': path of the merged indel truthset
                    * 'filename-snv': path of the merged SNV truthset

            Example Picard Command
                java -jar picard.jar SortVcf I=vcf_1.vcf I=vcf_2.vcf O=sorted.vcf
        """
        truthset_folder = getPipelineFolder('truthset', 'multiple')
        sampleIds = sorted(i['PatientID'] for i in samples)
        # Extracts patient id from the barcode. ex. TCGA-2H-A9GF -> A9GF
        sampleIds = ",".join(i.split('-')[-1] for i in sampleIds)

        # NOTE(review): `training_type` is not defined inside this method;
        # presumably it is a module-level name -- confirm.
        indel_truthset_filename = os.path.join(
            truthset_folder,
            "{}.{}.merged.indel.truthset.vcf".format(sampleIds, training_type))
        snp_truthset_filename = os.path.join(
            truthset_folder,
            "{}.{}.merged.snp.truthset.vcf".format(sampleIds, training_type))

        def snv_source(sample):
            # The SNV path is documented (and returned below) under
            # 'filename-snv'; older inputs may still carry 'filename-snp'.
            if 'filename-snv' in sample:
                return sample['filename-snv']
            return sample['filename-snp']

        # One "I=<path>" argument per sample, as SortVcf expects.
        snp_cmd_string = " ".join(
            "I={}".format(snv_source(i)) for i in samples)
        indel_cmd_string = " ".join(
            "I={}".format(i['filename-indel']) for i in samples)

        picard_base_command = """java -jar {program} SortVcf {variants} O={output}"""
        indel_command = picard_base_command.format(
            program=self.picard_program,
            variants=indel_cmd_string,
            output=indel_truthset_filename)
        snv_command = picard_base_command.format(
            program=self.picard_program,
            variants=snp_cmd_string,
            output=snp_truthset_filename)

        systemtools.Terminal(indel_command, show_output=True)
        systemtools.Terminal(snv_command, show_output=True)

        return {
            'PatientID': sampleIds,
            'filename-indel': indel_truthset_filename,
            'filename-snv': snp_truthset_filename
        }
示例#2
0
    def _mergeVariantFiles(self, callset, output_folder, patientId):
        """
			Note: The only records that are merged are those that are
			unfiltered in at least one caller.
		"""
        print("Merging files...")
        output_filename = os.path.join(
            output_folder,
            "{}.{}.modified.merged.vcf".format(patientId, self.kind))

        command = """java -jar {gatk} \
			--analysis_type CombineVariants \
			--reference_sequence {reference} \
			--genotypemergeoption UNSORTED \
			--filteredrecordsmergetype KEEP_IF_ANY_UNFILTERED \
			--variant {muse} \
			--variant {mutect2} \
			--variant {somaticsniper} \
			--variant {strelka} \
			--variant {varscan} \
			--out {output}""".format(gatk=self.gatk_program,
                            reference=self.reference,
                            muse=callset['muse-snp'],
                            mutect2=callset['mutect2-snp'],
                            somaticsniper=callset['somaticsniper-snp'],
                            strelka=callset['strelka-snp'],
                            varscan=callset['varscan-snp'],
                            output=output_filename)

        if not os.path.exists(output_filename):
            systemtools.Terminal(command, use_system=True)
        return output_filename
示例#3
0
    def gatkCombineVariants(self, variants, output_file):
        """ Uses GATK CombineVariants to merge the calls from each caller into a single file.
			Parameters
			----------
				variants: dict<caller, path>
					A dictionary linkng each caller to its harmonized output.
					Format: {NormalID}_vs_{TumorID}.{CallerName}.{TAG}.harmonized.vcf
			Returns
			-------
				Output_file: string
					Format: {NormalID}_vs_{TumorID}.{CallerName}.{TAG}.merged.vcf
		"""

        order = "mutect2,varscan,strelka,muse,somaticsniper"  # ordered by VAF confidence
        variant_command = [
            '--variant:{} "{}"'.format(k, v) for k, v in variants.items()
        ]
        variant_command = ' \\\n'.join(variant_command)
        command = """java -jar "{gatk}" \
			-T CombineVariants \
			-R "{reference}" \
			{variants} \
			-o "{output}" \
			-genotypeMergeOptions PRIORITIZE \
			-priority {rod}"""
        command = command.format(gatk=self.gatk_program,
                                 reference=self.reference,
                                 variants=variant_command,
                                 rod=order,
                                 output=output_file)

        systemtools.Terminal(command)
        return output_file
示例#4
0
 def buildTrainer(self, input_filename):
     """ Run the ada trainer script on `input_filename` unless its output exists.

         Parameters
         ----------
             input_filename: str
                 Path handed to self.ada_trainer_script as its only argument.

         Returns
         -------
             str
                 `input_filename` with '.Classifier.RData' appended.
     """
     print("Building model...")
     expected_output = input_filename + '.Classifier.RData'
     command = "{script} {infile}".format(infile=input_filename,
                                          script=self.ada_trainer_script)
     print("expected_output: ", expected_output)
     print(command)

     # Reuse a classifier left behind by an earlier run.
     if os.path.exists(expected_output):
         print("The Somaticseq classifier already exists.")
         return expected_output

     systemtools.Terminal(command, use_system=True)
     return expected_output
示例#5
0
    def _convertToTable(self, sample, callset, merged_callset, output_folder):
        print("Converting to a TSV file...")
        start_time = time.time()
        output_filename = os.path.join(
            output_folder, "{}.{}.modified.merged.excluded.snp.tsv".format(
                sample['PatientID'], self.kind))
        print("Merged_callset: {}\t{}".format(os.path.exists(merged_callset),
                                              merged_callset))
        command = """python3 {script} \
			--p-scale phred \
			--genome-reference {reference} \
			--normal-bam-file {normal} \
			--tumor-bam-file {tumor} \
			--dbsnp-vcf {dbSNP} \
			--cosmic-vcf {cosmic} \
			--vcf-format {merged} \
			--muse-vcf {muse} \
			--mutect-vcf {mutect2} \
			--somaticsniper-vcf {somaticsniper} \
			--strelka-strelka-vcf {strelka} \
			--varscan-vcf {varscan} \
			--output-tsv-file {output}""".format(
            script=self.merged_vcf2tsv_script,
            reference=self.reference,
            normal=sample['NormalBAM'],
            tumor=sample['TumorBAM'],
            dbSNP=self.dbSNP,
            cosmic=self.cosmic,
            truth=self.truthset,
            merged=merged_callset,
            muse=callset['muse-snp'],
            mutect2=callset['mutect2-snp'],
            somaticsniper=callset['somaticsniper-snp'],
            strelka=callset['strelka-snp'],
            varscan=callset['varscan-snp'],
            output=output_filename)

        if self.kind == 'trainer':
            command += " --ground-truth-vcf " + self.truthset

        #if not os.path.exists(output_filename):
        if not os.path.exists(output_filename):
            systemtools.Terminal(command, use_system=True)
        else:
            print("The Somaticseq table already exists.")
        print("\tResult: {}\t{}".format(os.path.exists(output_filename),
                                        output_filename))
        stop_time = time.time()

        duration = timetools.Duration(seconds=stop_time - start_time)
        print("Converted the table in ", duration.isoformat())
        return output_filename
示例#6
0
def fixCallerOutputs(callset, somaticseq_folder, **kwargs):
    """ Run modify_VJSD.py over the callers that need it and copy the rest.

        Required Parameters
        -------------------
            callset: dict<str:str> [dict<caller_name: caller_output>]
                The callset to fix.
            somaticseq_folder: str [path]
                The folder containing the somaticseq program.
        Optional Parameters
        -------------------
            output_folder: str [path]
                Destination folder. Defaults to the folder of each source file.
            patientId: str
                When given, outputs are named "{patientId}.{caller}.corrected.vcf";
                otherwise ".corrected.vcf" replaces the last extension of the
                source filename.
        Returns
        -------
            dict<caller_name: corrected_output>
    """
    modify_vjsd_script = os.path.join(somaticseq_folder, "modify_VJSD.py")

    # Substring of the caller name -> value of -method. Checked in this
    # order; the first match wins.
    method_by_substring = (
        ('varscan', 'VarScan2'),
        ('somaticsniper', 'SomaticSniper'),
        ('muse', 'MuSE'),
    )
    command_template = ("python3 {program} -method {method} "
                        "-infile {infile} -outfile {outfile}")

    fixed_callset = {}
    for caller, source in callset.items():
        if 'output_folder' in kwargs:
            target_folder = kwargs['output_folder']
        else:
            target_folder = os.path.dirname(source)

        if 'patientId' in kwargs:
            target_name = "{}.{}.corrected.vcf".format(kwargs['patientId'],
                                                       caller)
        else:
            stem = os.path.splitext(os.path.basename(source))[0]
            target_name = "{}.corrected.vcf".format(stem)
        destination = os.path.join(target_folder, target_name)

        method = next(
            (name for key, name in method_by_substring if key in caller),
            None)
        if method is None:
            # Callers without a modify_VJSD method are copied unchanged.
            shutil.copy2(source, destination)
        else:
            command = command_template.format(program=modify_vjsd_script,
                                              method=method,
                                              infile=source,
                                              outfile=destination)
            systemtools.Terminal(command, use_system=True)
        fixed_callset[caller] = destination

    return fixed_callset
示例#7
0
    def _reduceVariantTargets(self, input_filename, output_folder, patientId):
        """ Restrict a VCF to the regions in self.targets using intersectBed.

            Parameters
            ----------
                input_filename: str
                    The VCF passed to intersectBed as -a.
                output_folder: str
                    Folder the restricted VCF is written to.
                patientId: str
                    Used as the prefix of the output filename.

            Returns
            -------
                str
                    Path the shell redirect writes to. The command is run on
                    every call.
        """
        print("Excluding non-exome targets...")
        restricted_name = "{}.{}.modified.merged.excluded.vcf".format(
            patientId, self.kind)
        output_filename = os.path.join(output_folder, restricted_name)

        template = """intersectBed -header -a {infile} -b {targets} > {output}"""
        command = template.format(output=output_filename,
                                  targets=self.targets,
                                  infile=input_filename)

        systemtools.Terminal(command, use_system=True)
        return output_filename
示例#8
0
    def runPredictor(self, input_filename):
        """ Run the ada prediction script on `input_filename`.

            Parameters
            ----------
                input_filename: str
                    Input passed to self.ada_prediction_script, after
                    self.snp_classifier.

            Returns
            -------
                str
                    "{basename}.predicted_scores.tsv" in the folder of
                    `input_filename`, where basename comes from getBasename().
        """
        scores_name = "{}.predicted_scores.tsv".format(
            getBasename(input_filename))
        output_filename = os.path.join(os.path.dirname(input_filename),
                                       scores_name)

        print("Predicting Scores...")
        arguments = {
            'script': self.ada_prediction_script,
            'classifier': self.snp_classifier,
            'infile': input_filename,
            'outfile': output_filename,
        }
        command = "{script} {classifier} {infile} {outfile}".format(**arguments)
        print(command)
        systemtools.Terminal(command, use_system=True)

        return output_filename
示例#9
0
    def catVariants(self, left, right):
        """ Combines the SNV and Indel files. Assumes both are saved in the same folder. """
        l = os.path.splitext(os.path.splitext(left)[0])[0]
        output_file = l + '.cat.vcf'

        command = """java -cp {GATK} org.broadinstitute.gatk.tools.CatVariants \
			-R {reference}\
			-V {left} \
			-V {right} \
			-out {output}
		""".format(GATK=self.gatk_program,
             reference=self.reference,
             left=left,
             right=right,
             output=output_file)
        systemtools.Terminal(command)

        return output_file
示例#10
0
    def _convertToVcf(self, sample, input_filename):
        print("Converting to a VCF file...")

        output_filename = os.path.splitext(input_filename)[0] + ".vcf"

        command = """python3 {script} \
			--tsv-in {infile} \
			--vcf-out {outfile} \
			--normal-sample-name {normalid} \
			--tumor-sample-name {tumorid} \
			--emit-all \
			--individual-mutation-tools {tools} \
			--phred-scale""".format(script=self.tsv_to_vcf_script,
                           infile=input_filename,
                           outfile=output_filename,
                           normalid=sample['NormalID'],
                           tumorid=sample['SampleID'],
                           tools="MuSE CGA SomaticSniper Strelka VarScan2")
        print(command)
        if not os.path.exists(output_filename):
            systemtools.Terminal(command, use_system=True)
        return output_filename
示例#11
0
    def _processVJSDFiles(self, callset, output_folder, patientId):
        """
			Parameters
			----------
				input_file:
				output_folder:
				caller: {'varscan', 'somaticsniper', 'muse'}
		"""
        processed_callset = dict()
        for caller, input_file in callset.items():
            print("\tProcessing {}...".format(caller))
            if caller == 'muse':
                method = 'MuSE'
            elif caller == 'somaticsniper':
                method = 'SomaticSniper'
            elif 'varscan' in caller:
                method = 'VarScan2'
            else:
                method = None

            output_filename = os.path.join(
                output_folder, "{}.training.modified.vcf".format(patientId))

            if method:
                command = """python3 {program} \
					--call-method {method} \
					--input-vcf {infile} \
					--output-vcf {outfile}""".format(program=self.modify_vjsd_script,
                                      method=method,
                                      infile=input_file,
                                      outfile=output_filename)
                systemtools.Terminal(command, use_system=True)
            else:
                shutil.copy2(input_file, output_filename)

            processed_callset[caller] = output_filename

        return processed_callset