def _combineTruthsets(self, samples):
    """
        Combine the truthsets of several samples into one sorted VCF per
        variant type (indel and snv) using Picard SortVcf.

        Parameters
        ----------
        samples: list<dict<>>
            A list of outputs from self._per_sample(). Each entry is read for
                * 'PatientID'
                * 'filename-indel'
                * 'filename-snv' ('filename-snp' is accepted as a fallback
                  because earlier versions of this method read that key)

        Returns
        -------
        dict<>
            * 'PatientID': the comma-joined short patient ids
            * 'filename-indel': path of the merged indel truthset
            * 'filename-snv': path of the merged snv truthset

        Raises
        ------
        KeyError
            If a sample provides neither 'filename-snv' nor 'filename-snp',
            or lacks 'PatientID' / 'filename-indel'.

        Example Picard Command
            java -jar picard.jar SortVcf I=vcf_1.vcf I=vcf_2.vcf O=sorted.vcf
    """
    truthset_folder = getPipelineFolder('truthset', 'multiple')

    # Sort the full barcodes first, then keep only the last field.
    # ex. TCGA-2H-A9GF -> A9GF
    sorted_barcodes = sorted(sample['PatientID'] for sample in samples)
    sampleIds = ",".join(barcode.split('-')[-1] for barcode in sorted_barcodes)

    # NOTE(review): `training_type` is not defined inside this method; it is
    # presumably a module-level name -- confirm it exists where this runs.
    indel_truthset_filename = os.path.join(
        truthset_folder,
        "{}.{}.merged.indel.truthset.vcf".format(sampleIds, training_type))
    snp_truthset_filename = os.path.join(
        truthset_folder,
        "{}.{}.merged.snp.truthset.vcf".format(sampleIds, training_type))

    def _snv_path(sample):
        # The documented key (and the one this method returns) is
        # 'filename-snv'; the previous code looked up 'filename-snp'.
        if 'filename-snv' in sample:
            return sample['filename-snv']
        return sample['filename-snp']

    indel_inputs = " ".join(
        "I={}".format(sample['filename-indel']) for sample in samples)
    snv_inputs = " ".join(
        "I={}".format(_snv_path(sample)) for sample in samples)

    picard_base_command = "java -jar {program} SortVcf {variants} O={output}"
    indel_command = picard_base_command.format(
        program=self.picard_program,
        variants=indel_inputs,
        output=indel_truthset_filename)
    snv_command = picard_base_command.format(
        program=self.picard_program,
        variants=snv_inputs,
        output=snp_truthset_filename)

    # Same execution order as before: indel files first, then snv files.
    systemtools.Terminal(indel_command, show_output=True)
    systemtools.Terminal(snv_command, show_output=True)

    return {
        'PatientID': sampleIds,
        'filename-indel': indel_truthset_filename,
        'filename-snv': snp_truthset_filename,
    }
def _mergeVariantFiles(self, callset, output_folder, patientId):
    """
        Merge the per-caller snp callsets into one VCF with GATK
        CombineVariants.

        Note: The only records that are merged are those that are
        unfiltered in at least one caller.

        Parameters
        ----------
        callset: dict<str:str>
            Read for the keys 'muse-snp', 'mutect2-snp', 'somaticsniper-snp',
            'strelka-snp' and 'varscan-snp'.
        output_folder: str
            Folder the merged file is written to.
        patientId: str
            Used as the prefix of the merged filename.

        Returns
        -------
        str
            "<output_folder>/<patientId>.<self.kind>.modified.merged.vcf".
            The GATK command is only run when this file does not exist yet.
    """
    print("Merging files...")

    merged_name = "{}.{}.modified.merged.vcf".format(patientId, self.kind)
    output_filename = os.path.join(output_folder, merged_name)

    template = """java -jar {gatk} \
        --analysis_type CombineVariants \
        --reference_sequence {reference} \
        --genotypemergeoption UNSORTED \
        --filteredrecordsmergetype KEEP_IF_ANY_UNFILTERED \
        --variant {muse} \
        --variant {mutect2} \
        --variant {somaticsniper} \
        --variant {strelka} \
        --variant {varscan} \
        --out {output}"""
    fields = {
        'gatk': self.gatk_program,
        'reference': self.reference,
        'muse': callset['muse-snp'],
        'mutect2': callset['mutect2-snp'],
        'somaticsniper': callset['somaticsniper-snp'],
        'strelka': callset['strelka-snp'],
        'varscan': callset['varscan-snp'],
        'output': output_filename,
    }
    command = template.format(**fields)

    # An existing merged file is reused as-is.
    if os.path.exists(output_filename):
        return output_filename

    systemtools.Terminal(command, use_system=True)
    return output_filename
def gatkCombineVariants(self, variants, output_file):
    """
        Use GATK CombineVariants to merge the calls from each caller into
        a single file.

        Parameters
        ----------
        variants: dict<caller, path>
            A dictionary linking each caller to its harmonized output.
            Format: {NormalID}_vs_{TumorID}.{CallerName}.{TAG}.harmonized.vcf
        output_file: string
            Path passed to GATK as the output (-o).

        Returns
        -------
        output_file: string
            The same path that was passed in.
            Format: {NormalID}_vs_{TumorID}.{CallerName}.{TAG}.merged.vcf
    """
    # Ordered by VAF confidence.
    priority = "mutect2,varscan,strelka,muse,somaticsniper"

    # Each input is tagged with its caller name (--variant:<caller>).
    tagged_inputs = []
    for caller, path in variants.items():
        tagged_inputs.append('--variant:{} "{}"'.format(caller, path))
    variant_block = ' \\\n'.join(tagged_inputs)

    template = """java -jar "{gatk}" \
        -T CombineVariants \
        -R "{reference}" \
        {variants} \
        -o "{output}" \
        -genotypeMergeOptions PRIORITIZE \
        -priority {rod}"""
    command = template.format(
        gatk=self.gatk_program,
        reference=self.reference,
        variants=variant_block,
        rod=priority,
        output=output_file)

    systemtools.Terminal(command)
    return output_file
def buildTrainer(self, input_filename):
    """
        Run the ada trainer script on a table unless its classifier
        already exists.

        Parameters
        ----------
        input_filename: str
            Passed as the only argument to self.ada_trainer_script.

        Returns
        -------
        str
            input_filename + '.Classifier.RData'. The script is only run
            when this path does not exist yet.
    """
    print("Building model...")

    command = "{script} {infile}".format(
        script=self.ada_trainer_script,
        infile=input_filename)
    expected_output = input_filename + '.Classifier.RData'

    print("expected_output: ", expected_output)
    print(command)

    if os.path.exists(expected_output):
        print("The Somaticseq classifier already exists.")
    else:
        systemtools.Terminal(command, use_system=True)

    return expected_output
def _convertToTable(self, sample, callset, merged_callset, output_folder):
    """
        Convert the merged callset into a TSV table with the merged
        vcf2tsv script (self.merged_vcf2tsv_script).

        Parameters
        ----------
        sample: dict<>
            Read for 'PatientID', 'NormalBAM' and 'TumorBAM'.
        callset: dict<str:str>
            Read for the keys 'muse-snp', 'mutect2-snp', 'somaticsniper-snp',
            'strelka-snp' and 'varscan-snp'.
        merged_callset: str
            Path to the merged vcf, passed as --vcf-format.
        output_folder: str
            Folder the table is written to.

        Returns
        -------
        str
            "<output_folder>/<PatientID>.<self.kind>.modified.merged.excluded.snp.tsv".
            The script is only run when this file does not exist yet.
    """
    print("Converting to a TSV file...")
    start_time = time.time()

    output_filename = os.path.join(
        output_folder,
        "{}.{}.modified.merged.excluded.snp.tsv".format(
            sample['PatientID'], self.kind))

    print("Merged_callset: {}\t{}".format(
        os.path.exists(merged_callset), merged_callset))

    # NOTE(review): the strelka flag was '--strelka-strelka-vcf', which looks
    # like a typo next to the sibling '--<caller>-vcf' flags; confirm
    # '--strelka-vcf' against the argument parser of the vcf2tsv script.
    command = """python3 {script} \
        --p-scale phred \
        --genome-reference {reference} \
        --normal-bam-file {normal} \
        --tumor-bam-file {tumor} \
        --dbsnp-vcf {dbSNP} \
        --cosmic-vcf {cosmic} \
        --vcf-format {merged} \
        --muse-vcf {muse} \
        --mutect-vcf {mutect2} \
        --somaticsniper-vcf {somaticsniper} \
        --strelka-vcf {strelka} \
        --varscan-vcf {varscan} \
        --output-tsv-file {output}""".format(
        script=self.merged_vcf2tsv_script,
        reference=self.reference,
        normal=sample['NormalBAM'],
        tumor=sample['TumorBAM'],
        dbSNP=self.dbSNP,
        cosmic=self.cosmic,
        merged=merged_callset,
        muse=callset['muse-snp'],
        mutect2=callset['mutect2-snp'],
        somaticsniper=callset['somaticsniper-snp'],
        strelka=callset['strelka-snp'],
        varscan=callset['varscan-snp'],
        output=output_filename)

    # The truthset is only read in trainer mode, so other kinds do not need
    # a `truthset` attribute at all.
    if self.kind == 'trainer':
        command += " --ground-truth-vcf " + self.truthset

    if not os.path.exists(output_filename):
        systemtools.Terminal(command, use_system=True)
    else:
        print("The Somaticseq table already exists.")

    print("\tResult: {}\t{}".format(
        os.path.exists(output_filename), output_filename))

    stop_time = time.time()
    duration = timetools.Duration(seconds=stop_time - start_time)
    print("Converted the table in ", duration.isoformat())

    return output_filename
def fixCallerOutputs(callset, somaticseq_folder, **kwargs):
    """
        Write a '.corrected.vcf' copy of every caller output, running
        modify_VJSD.py on the varscan, somaticsniper and muse files and
        plainly copying the rest.

        Required Parameters
        -------------------
            callset: dict<str:str> [dict<caller_name: caller_output>]
                The callset to fix
            somaticseq_folder: str [path]
                The folder containing the somaticseq program.

        Optional Parameters
        -------------------
            output_folder: str
                Destination folder. Defaults to the folder of each source file.
            patientId: str
                When given, files are named "<patientId>.<caller>.corrected.vcf".
                Otherwise the last extension of the source filename is replaced
                with ".corrected.vcf".

        Returns
        -------
        dict<str:str>
            Maps each caller to the path of its corrected file.
    """
    modify_vjsd_script = os.path.join(somaticseq_folder, "modify_VJSD.py")

    # Checked in this order; the first keyword contained in the caller name wins.
    vjsd_methods = (
        ('varscan', 'VarScan2'),
        ('somaticsniper', 'SomaticSniper'),
        ('muse', 'MuSE'),
    )
    has_folder = 'output_folder' in kwargs
    has_patient = 'patientId' in kwargs

    fixed_callset = {}
    for caller, source in callset.items():
        target_folder = kwargs['output_folder'] if has_folder else os.path.dirname(source)

        if has_patient:
            target_name = "{}.{}.corrected.vcf".format(kwargs['patientId'], caller)
        else:
            stem = os.path.splitext(os.path.basename(source))[0]
            target_name = "{}.corrected.vcf".format(stem)
        destination = os.path.join(target_folder, target_name)

        method = next(
            (name for keyword, name in vjsd_methods if keyword in caller),
            None)
        if method is None:
            # No correction is defined for this caller: copy it unchanged.
            shutil.copy2(source, destination)
        else:
            command = "python3 {program} -method {method} -infile {infile} -outfile {outfile}".format(
                program=modify_vjsd_script,
                method=method,
                infile=source,
                outfile=destination)
            systemtools.Terminal(command, use_system=True)

        fixed_callset[caller] = destination

    return fixed_callset
def _reduceVariantTargets(self, input_filename, output_folder, patientId):
    """
        Intersect a vcf with self.targets using intersectBed, keeping the
        header, and write the result to a new file.

        Parameters
        ----------
        input_filename: str
            Passed to intersectBed as -a.
        output_folder: str
            Folder the reduced file is written to.
        patientId: str
            Used as the prefix of the output filename.

        Returns
        -------
        str
            "<output_folder>/<patientId>.<self.kind>.modified.merged.excluded.vcf"
    """
    print("Excluding non-exome targets...")

    reduced_name = "{}.{}.modified.merged.excluded.vcf".format(patientId, self.kind)
    output_filename = os.path.join(output_folder, reduced_name)

    # The shell redirection is what writes the output file.
    command = "intersectBed -header -a {} -b {} > {}".format(
        input_filename, self.targets, output_filename)
    systemtools.Terminal(command, use_system=True)

    return output_filename
def runPredictor(self, input_filename):
    """
        Run the ada prediction script on a table with the snp classifier.

        Parameters
        ----------
        input_filename: str
            The table to score.

        Returns
        -------
        str
            "<basename>.predicted_scores.tsv" in the folder of input_filename,
            where <basename> is whatever getBasename(input_filename) returns.
    """
    scores_name = "{}.predicted_scores.tsv".format(getBasename(input_filename))
    output_filename = os.path.join(os.path.dirname(input_filename), scores_name)

    print("Predicting Scores...")
    command = "{} {} {} {}".format(
        self.ada_prediction_script,
        self.snp_classifier,
        input_filename,
        output_filename)
    print(command)

    systemtools.Terminal(command, use_system=True)
    return output_filename
def catVariants(self, left, right):
    """
        Combine the SNV and Indel files with GATK CatVariants.
        Assumes both are saved in the same folder.

        Parameters
        ----------
        left, right: str
            The two vcf files to concatenate, passed to GATK in that order.

        Returns
        -------
        str
            `left` with its last two extensions replaced by '.cat.vcf'.
    """
    # Strip the last two extensions from the left-hand file.
    stem = left
    for _ in range(2):
        stem = os.path.splitext(stem)[0]
    output_file = stem + '.cat.vcf'

    template = """java -cp {GATK} org.broadinstitute.gatk.tools.CatVariants \
        -R {reference}\
        -V {left} \
        -V {right} \
        -out {output}
    """
    command = template.format(
        GATK=self.gatk_program,
        reference=self.reference,
        left=left,
        right=right,
        output=output_file)

    systemtools.Terminal(command)
    return output_file
def _convertToVcf(self, sample, input_filename):
    """
        Convert a TSV table back into a VCF with self.tsv_to_vcf_script.

        Parameters
        ----------
        sample: dict<>
            Read for 'NormalID' (normal sample name) and 'SampleID'
            (tumor sample name).
        input_filename: str
            The TSV table to convert.

        Returns
        -------
        str
            input_filename with its last extension replaced by ".vcf". The
            script is only run when this file does not exist yet.
    """
    print("Converting to a VCF file...")

    stem = os.path.splitext(input_filename)[0]
    output_filename = stem + ".vcf"

    template = """python3 {script} \
        --tsv-in {infile} \
        --vcf-out {outfile} \
        --normal-sample-name {normalid} \
        --tumor-sample-name {tumorid} \
        --emit-all \
        --individual-mutation-tools {tools} \
        --phred-scale"""
    command = template.format(
        script=self.tsv_to_vcf_script,
        infile=input_filename,
        outfile=output_filename,
        normalid=sample['NormalID'],
        tumorid=sample['SampleID'],
        tools="MuSE CGA SomaticSniper Strelka VarScan2")
    print(command)

    # An existing vcf is reused as-is.
    if os.path.exists(output_filename):
        return output_filename

    systemtools.Terminal(command, use_system=True)
    return output_filename
def _processVJSDFiles(self, callset, output_folder, patientId):
    """
        Run self.modify_vjsd_script on the muse, somaticsniper and varscan
        outputs and copy every other caller's output unchanged, giving each
        caller its own file in output_folder.

        Parameters
        ----------
        callset: dict<str:str>
            Maps each caller name to its vcf file.
        output_folder: str
            Folder the processed files are written to.
        patientId: str
            Used as the prefix of every output filename.

        Returns
        -------
        dict<str:str>
            Maps each caller to
            "<output_folder>/<patientId>.<caller>.training.modified.vcf".
    """
    # Keyword contained in the caller name -> method name passed to the
    # script. Substring matching is used for every caller (previously only
    # for varscan) so that keys such as 'muse-snp' are recognised as well.
    vjsd_methods = (
        ('muse', 'MuSE'),
        ('somaticsniper', 'SomaticSniper'),
        ('varscan', 'VarScan2'),
    )

    processed_callset = dict()
    for caller, input_file in callset.items():
        print("\tProcessing {}...".format(caller))

        method = next(
            (name for keyword, name in vjsd_methods if keyword in caller),
            None)

        # The caller name is part of the filename: without it every caller
        # wrote to the same path and overwrote the previous caller's output.
        output_filename = os.path.join(
            output_folder,
            "{}.{}.training.modified.vcf".format(patientId, caller))

        if method:
            command = """python3 {program} \
                --call-method {method} \
                --input-vcf {infile} \
                --output-vcf {outfile}""".format(
                program=self.modify_vjsd_script,
                method=method,
                infile=input_file,
                outfile=output_filename)
            systemtools.Terminal(command, use_system=True)
        else:
            shutil.copy2(input_file, output_filename)

        processed_callset[caller] = output_filename

    return processed_callset