def add_gatk_variantcaller(self, normal_bam_source, tumor_bam_source):
    """
    Register the GATK somatic variant-calling steps on this workflow.

    Steps are added in this order: "generate_gatk_intervals" (only if it is
    not already registered), "bqsr_normal", "bqsr_tumor", "vc_gatk",
    "vc_gatk_merge", "vc_gatk_compressvcf", "vc_gatk_sort_combined" and
    "vc_gatk_uncompressvcf". No workflow outputs are declared here.

    :param normal_bam_source: value passed as ``bam`` to the "bqsr_normal" step
    :param tumor_bam_source: value passed as ``bam`` to the "bqsr_tumor" step
    """
    # Only register the interval generator once; a previous call may already
    # have added it, in which case its existing output is reused.
    if "generate_gatk_intervals" not in self.step_nodes:
        generated_intervals = self.step(
            "generate_gatk_intervals",
            GenerateIntervalsByChromosome(reference=self.reference),
            when=self.gatk_intervals.is_null(),
        ).out_regions
    else:
        generated_intervals = self.generate_gatk_intervals.out_regions

    # User-supplied intervals are listed first, generated ones second.
    intervals = FirstOperator([self.gatk_intervals, generated_intervals])

    # Arguments shared by the normal and tumor recalibration steps.
    recal_ins = dict(
        reference=self.reference,
        intervals=intervals,
        snps_dbsnp=self.snps_dbsnp,
        snps_1000gp=self.snps_1000gp,
        known_indels=self.known_indels,
        mills_indels=self.mills_indels,
    )
    for step_id, bam in (
        ("bqsr_normal", normal_bam_source),
        ("bqsr_tumor", tumor_bam_source),
    ):
        self.step(
            step_id,
            GATKBaseRecalBQSRWorkflow_4_1_3(bam=bam, **recal_ins),
            scatter="intervals",
        )

    # The caller is scattered over the intervals and both recalibrated BAMs.
    somatic_caller = GatkSomaticVariantCaller_4_1_3(
        normal_bam=self.bqsr_normal.out,
        tumor_bam=self.bqsr_tumor.out,
        normal_name=self.normal_name,
        intervals=intervals,
        reference=self.reference,
        gnomad=self.gnomad,
        panel_of_normals=self.panel_of_normals,
    )
    self.step(
        "vc_gatk",
        somatic_caller,
        scatter=["intervals", "normal_bam", "tumor_bam"],
    )

    # Gather the scattered VCFs, compress, sort, then uncompress again.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))
    self.step("vc_gatk_compressvcf", BGZipLatest(file=self.vc_gatk_merge.out))
    compressed_vcf = self.vc_gatk_compressvcf.out.as_type(CompressedVcf)
    self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))
    self.step(
        "vc_gatk_uncompressvcf",
        UncompressArchive(file=self.vc_gatk_sort_combined.out),
    )
def add_vardict_variantcaller(self, normal_bam_source, tumor_bam_source):
    """
    Register the VarDict somatic variant-calling steps and their outputs.

    Steps added, in order: "generate_vardict_headerlines", "vc_vardict"
    (scattered on "intervals"), "vc_vardict_merge",
    "vc_vardict_compress_for_sort", "vc_vardict_sort_combined" and
    "vc_vardict_uncompress_for_combine".

    Outputs declared: "out_variants_vardict" (the sorted, merged VCF) and
    "out_variants_vardict_split" (the per-interval caller output).

    :param normal_bam_source: value passed as ``normal_bam`` to the caller
    :param tumor_bam_source: value passed as ``tumor_bam`` to the caller
    """
    self.step(
        "generate_vardict_headerlines",
        GenerateVardictHeaderLines(reference=self.reference),
    )
    self.step(
        "vc_vardict",
        VardictSomaticVariantCaller(
            normal_bam=normal_bam_source,
            tumor_bam=tumor_bam_source,
            normal_name=self.normal_name,
            tumor_name=self.tumor_name,
            header_lines=self.generate_vardict_headerlines.out,
            intervals=self.vardict_intervals,
            reference=self.reference,
            allele_freq_threshold=self.allele_freq_threshold,
            minMappingQual=self.minMappingQual,
            filter=self.filter,
        ),
        scatter="intervals",
    )

    # Gather the scattered VCFs, compress, sort, then uncompress again so a
    # later combine step can consume the uncompressed file.
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
    self.step(
        "vc_vardict_compress_for_sort",
        BGZipLatest(file=self.vc_vardict_merge.out),
    )
    self.step(
        "vc_vardict_sort_combined",
        BcfToolsSort_1_9(
            vcf=self.vc_vardict_compress_for_sort.out.as_type(CompressedVcf)
        ),
    )
    self.step(
        "vc_vardict_uncompress_for_combine",
        UncompressArchive(file=self.vc_vardict_sort_combined.out),
    )

    self.output(
        "out_variants_vardict",
        source=self.vc_vardict_sort_combined.out,
        output_folder=["vcf"],
        output_name=StringFormatter(
            "{tumor_name}--{normal_name}_vardict",
            tumor_name=self.tumor_name,
            normal_name=self.normal_name,
        ),
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "out_variants_vardict_split",
        source=self.vc_vardict.out,
        output_folder=["vcf", "VardictByInterval"],
        # Fixed copy-paste error: this output comes from VarDict, not GATK.
        doc="Unmerged variants from the VarDict caller (by interval)",
    )
def add_vardict_variantcaller(self, bam_source):
    """
    Declare the VarDict inputs, then register the germline VarDict steps and
    their outputs.

    Inputs declared: "allele_freq_threshold", "minMappingQual" and "filter".

    Steps added, in order: "generate_vardict_headerlines", "vc_vardict"
    (scattered on "intervals"), "vc_vardict_merge",
    "vc_vardict_compress_for_sort", "vc_vardict_sort_combined" and
    "vc_vardict_uncompress_for_combine".

    Outputs declared: "out_variants_vardict" and "out_variants_vardict_split".

    :param bam_source: value passed as ``bam`` to the VarDict caller
    """
    # 0.05 is passed as the third positional argument (presumably the input's
    # default value -- confirm against the workflow API). The original had a
    # stray trailing comma after this call, which made the statement a
    # discarded one-element tuple; it is removed here.
    self.input("allele_freq_threshold", Float, 0.05)
    self.input("minMappingQual", Int(optional=True))
    self.input("filter", String(optional=True))

    # Vardict
    self.step(
        "generate_vardict_headerlines",
        GenerateVardictHeaderLines(reference=self.reference),
    )
    self.step(
        "vc_vardict",
        VardictGermlineVariantCaller(
            bam=bam_source,
            reference=self.reference,
            intervals=self.vardict_intervals,
            sample_name=self.sample_name,
            allele_freq_threshold=self.allele_freq_threshold,
            header_lines=self.generate_vardict_headerlines.out,
            minMappingQual=self.minMappingQual,
            filter=self.filter,
        ),
        scatter="intervals",
    )

    # Gather the scattered VCFs, compress, sort, then uncompress again so a
    # later combine step can consume the uncompressed file.
    self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
    self.step(
        "vc_vardict_compress_for_sort",
        BGZipLatest(file=self.vc_vardict_merge.out.as_type(Vcf)),
    )
    self.step(
        "vc_vardict_sort_combined",
        BcfToolsSort_1_9(
            vcf=self.vc_vardict_compress_for_sort.out.as_type(CompressedVcf)
        ),
    )
    self.step(
        "vc_vardict_uncompress_for_combine",
        UncompressArchive(file=self.vc_vardict_sort_combined.out),
    )

    self.output(
        "out_variants_vardict",
        source=self.vc_vardict_sort_combined.out,
        output_folder=["variants"],
        output_name="vardict",
        doc="Merged variants from the VarDict caller",
    )
    self.output(
        "out_variants_vardict_split",
        source=self.vc_vardict.out,
        output_folder=["variants", "vardict"],
        doc="Unmerged variants from the VarDict caller (by interval)",
    )
def add_combine_variants(self, normal_bam_source, tumor_bam_source):
    """
    Combine the per-caller somatic VCFs and add BAM statistics to the result.

    Relies on steps registered elsewhere under the names
    "vc_gatk_uncompressvcf", "vc_strelka" and
    "vc_vardict_uncompress_for_combine".

    Steps added, in order: "combine_variants", "combined_compress",
    "combined_sort", "combined_uncompress" and "combined_addbamstats".
    Output declared: "out_variants".

    :param normal_bam_source: value passed as ``normal_bam`` to the stats step
    :param tumor_bam_source: value passed as ``tumor_bam`` to the stats step
    """
    # The three caller results, in the order GATK, Strelka, VarDict.
    caller_vcfs = [
        self.vc_gatk_uncompressvcf.out.as_type(Vcf),
        self.vc_strelka.out,
        self.vc_vardict_uncompress_for_combine.out.as_type(Vcf),
    ]
    combiner = CombineVariants_0_0_8(
        normal=self.normal_name,
        tumor=self.tumor_name,
        vcfs=caller_vcfs,
        type="somatic",
        columns=["AD", "DP", "GT"],
    )
    self.step("combine_variants", combiner)

    # Compress, sort, then uncompress the combined VCF.
    self.step("combined_compress", BGZipLatest(file=self.combine_variants.out))
    compressed = self.combined_compress.out.as_type(CompressedVcf)
    self.step("combined_sort", BcfToolsSort_1_9(vcf=compressed))
    self.step("combined_uncompress", UncompressArchive(file=self.combined_sort.out))

    bam_stats = AddBamStatsSomatic_0_1_0(
        normal_id=self.normal_name,
        tumor_id=self.tumor_name,
        normal_bam=normal_bam_source,
        tumor_bam=tumor_bam_source,
        vcf=self.combined_uncompress.out.as_type(Vcf),
        reference=self.reference,
    )
    self.step("combined_addbamstats", bam_stats)

    combined_name = StringFormatter(
        "{tumor_name}--{normal_name}_combined",
        tumor_name=self.tumor_name,
        normal_name=self.normal_name,
    )
    self.output(
        "out_variants",
        source=self.combined_addbamstats.out,
        output_folder=["vcf"],
        output_name=combined_name,
        doc="Combined variants from GATK, VarDict and Strelka callers",
    )
def constructor(self):
    """
    Build the workflow: Manta, then Strelka germline calling, followed by
    uncompress -> split/normalise -> filter.

    Inputs: "bam", "reference", optional "intervals", optional "is_exome".
    Outputs: "sv" (Manta diploidSV), "variants" (Strelka variants) and
    "out" (result of the "filterpass" step).
    """
    # --- inputs ---
    self.input("bam", BamBai)
    self.input("reference", FastaWithDict)
    self.input("intervals", BedTabix(optional=True))
    self.input("is_exome", Boolean(optional=True))

    # --- calling ---
    manta_tool = Manta_1_5_0(
        bam=self.bam,
        reference=self.reference,
        callRegions=self.intervals,
        exome=self.is_exome,
    )
    self.step("manta", manta_tool)

    # Strelka receives Manta's candidateSmallIndels as its indel candidates.
    strelka_tool = StrelkaGermline_2_9_10(
        bam=self.bam,
        reference=self.reference,
        indelCandidates=self.manta.candidateSmallIndels,
        callRegions=self.intervals,
        exome=self.is_exome,
    )
    self.step("strelka", strelka_tool)

    # --- normalise and filter "PASS" variants ---
    self.step("uncompressvcf", UncompressArchive(file=self.strelka.variants))
    normaliser = SplitMultiAllele(
        vcf=self.uncompressvcf.out,
        reference=self.reference,
    )
    self.step("splitnormalisevcf", normaliser)
    pass_filter = VcfToolsvcftoolsLatest(
        vcf=self.splitnormalisevcf.out,
        removeFileteredAll=True,
        recode=True,
        recodeINFOAll=True,
    )
    self.step("filterpass", pass_filter)

    # --- outputs ---
    self.output("sv", source=self.manta.diploidSV)
    self.output("variants", source=self.strelka.variants)
    self.output("out", source=self.filterpass.out)
def add_combine_variants(self, bam_source):
    """
    Combine the per-caller germline VCFs and add BAM statistics to the result.

    Note: this relies on the specific step names registered by earlier
    methods ("vc_gatk_uncompress", "vc_strelka" and
    "vc_vardict_uncompress_for_combine").

    Steps added, in order: "combine_variants", "combined_compress",
    "combined_sort", "combined_uncompress" and "combined_addbamstats".
    Output declared: "out_variants".

    :param bam_source: value passed as ``bam`` to the stats step
    """
    # Combine
    caller_vcfs = [
        self.vc_gatk_uncompress.out.as_type(Vcf),
        self.vc_strelka.out,
        self.vc_vardict_uncompress_for_combine.out.as_type(Vcf),
    ]
    combiner = CombineVariants_0_0_8(
        vcfs=caller_vcfs,
        type="germline",
        columns=["AC", "AN", "AF", "AD", "DP", "GT"],
    )
    self.step("combine_variants", combiner)

    # Compress, sort, then uncompress the combined VCF.
    self.step("combined_compress", BGZipLatest(file=self.combine_variants.out))
    compressed = self.combined_compress.out.as_type(CompressedVcf)
    self.step("combined_sort", BcfToolsSort_1_9(vcf=compressed))
    self.step("combined_uncompress", UncompressArchive(file=self.combined_sort.out))

    bam_stats = AddBamStatsGermline_0_1_0(
        bam=bam_source,
        vcf=self.combined_uncompress.out.as_type(Vcf),
        reference=self.reference,
    )
    self.step("combined_addbamstats", bam_stats)

    self.output(
        "out_variants",
        source=self.combined_addbamstats.out,
        output_folder="variants",
        output_name="combined",
        doc="Combined variants from all 3 callers",
    )
def constructor(self):
    """
    Build the workflow: split reads, run HaplotypeCaller, then uncompress and
    split/normalise the resulting VCF.

    Inputs: "bam", optional "intervals", "reference", "snps_dbsnp".
    Outputs: "variants" (HaplotypeCaller ``out``), "out_bam" (HaplotypeCaller
    ``bam``) and "out" (result of the "splitnormalisevcf" step).
    """
    # --- inputs ---
    intervals_doc = (
        "This optional interval supports processing by regions. If this input resolves "
        "to null, then GATK will process the whole genome per each tool's spec"
    )
    self.input("bam", BamBai)
    self.input("intervals", Bed(optional=True), doc=intervals_doc)
    self.input("reference", FastaWithDict)
    self.input("snps_dbsnp", VcfTabix)

    # --- steps ---
    splitter = gatk4.Gatk4SplitReads_4_1_3(bam=self.bam, intervals=self.intervals)
    self.step("split_bam", splitter)

    caller = gatk4.Gatk4HaplotypeCaller_4_1_3(
        inputRead=self.split_bam.out,
        intervals=self.intervals,
        reference=self.reference,
        dbsnp=self.snps_dbsnp,
        pairHmmImplementation="LOGLESS_CACHING",
    )
    self.step("haplotype_caller", caller)

    self.step("uncompressvcf", UncompressArchive(file=self.haplotype_caller.out))
    normaliser = SplitMultiAllele(
        vcf=self.uncompressvcf.out,
        reference=self.reference,
    )
    self.step("splitnormalisevcf", normaliser)

    # --- outputs ---
    self.output("variants", source=self.haplotype_caller.out)
    self.output("out_bam", source=self.haplotype_caller.bam)
    self.output("out", source=self.splitnormalisevcf.out)
def constructor(self):
    """
    Build the tumor/normal Mutect2 workflow.

    Inputs: "normal_bam", "tumor_bam", optional "normal_name", optional
    "intervals", "reference", "gnomad", optional "panel_of_normals".

    Steps, in order: "normal_split_bam", "tumor_split_bam", "mutect2",
    "learnorientationmodel", "getpileupsummaries", "calculatecontamination",
    "filtermutect2calls", "uncompressvcf", "splitnormalisevcf", "filterpass".

    Outputs: "variants" (filtered Mutect2 calls), "out_bam" (Mutect2 ``bam``)
    and "out" (result of the "filterpass" step).
    """
    # --- inputs ---
    intervals_doc = (
        "This optional intervals file supports processing by regions. If this file resolves "
        "to null, then GATK will process the whole genome per each tool's spec"
    )
    self.input("normal_bam", BamBai)
    self.input("tumor_bam", BamBai)
    self.input("normal_name", String(optional=True))
    self.input("intervals", Bed(optional=True), doc=intervals_doc)
    self.input("reference", FastaWithDict)
    self.input("gnomad", VcfTabix)
    self.input("panel_of_normals", VcfTabix(optional=True))

    # --- split normal and tumor bam ---
    normal_splitter = self.process_subpipeline(
        bam=self.normal_bam, intervals=self.intervals
    )
    self.step("normal_split_bam", normal_splitter)
    tumor_splitter = self.process_subpipeline(
        bam=self.tumor_bam, intervals=self.intervals
    )
    self.step("tumor_split_bam", tumor_splitter)

    # --- variant calling + learn read orientation model ---
    mutect = gatk4.GatkMutect2_4_1_3(
        normalBams=[self.normal_split_bam.out],
        tumorBams=[self.tumor_split_bam.out],
        normalSample=self.normal_name,
        intervals=self.intervals,
        reference=self.reference,
        germlineResource=self.gnomad,
        panelOfNormals=self.panel_of_normals,
    )
    self.step("mutect2", mutect)
    orientation_model = gatk4.Gatk4LearnReadOrientationModelLatest(
        f1r2CountsFiles=self.mutect2.f1f2r_out,
    )
    self.step("learnorientationmodel", orientation_model)

    # --- calculate contamination and segmentation ---
    pileups = gatk4.Gatk4GetPileUpSummariesLatest(
        bam=self.tumor_split_bam.out,
        sites=self.gnomad,
        intervals=self.intervals,
    )
    self.step("getpileupsummaries", pileups)
    contamination = gatk4.Gatk4CalculateContaminationLatest(
        pileupTable=self.getpileupsummaries.out,
    )
    self.step("calculatecontamination", contamination)

    # The filter step consumes the outputs of the three preceding analyses.
    call_filter = gatk4.Gatk4FilterMutectCallsLatest(
        vcf=self.mutect2.out,
        reference=self.reference,
        segmentationFile=self.calculatecontamination.segOut,
        contaminationTable=self.calculatecontamination.contOut,
        readOrientationModel=self.learnorientationmodel.out,
        statsFile=self.mutect2.stats,
    )
    self.step("filtermutect2calls", call_filter)

    # --- normalise and filter "PASS" variants ---
    self.step("uncompressvcf", UncompressArchive(file=self.filtermutect2calls.out))
    normaliser = SplitMultiAllele(
        vcf=self.uncompressvcf.out,
        reference=self.reference,
    )
    self.step("splitnormalisevcf", normaliser)
    pass_filter = VcfToolsvcftoolsLatest(
        vcf=self.splitnormalisevcf.out,
        removeFileteredAll=True,
        recode=True,
        recodeINFOAll=True,
    )
    self.step("filterpass", pass_filter)

    # --- outputs ---
    self.output("variants", source=self.filtermutect2calls.out)
    self.output("out_bam", source=self.mutect2.bam)
    self.output("out", source=self.filterpass.out)
def add_gatk_variantcaller(self, bam_source):
    """
    Register the GATK germline variant-calling steps and their outputs.

    Steps added, in order: "generate_gatk_intervals" (conditional on
    ``gatk_intervals`` being null), "bqsr", "vc_gatk", "vc_gatk_merge",
    "vc_gatk_compressvcf", "vc_gatk_sort_combined" and "vc_gatk_uncompress".

    Outputs declared: "out_variants_gatk" and "out_variants_gatk_split".

    :param bam_source: value passed as ``bam`` to the "bqsr" step
    """
    # VARIANT CALLERS
    # User-supplied intervals are listed first, generated ones second.
    supplied_intervals = self.gatk_intervals
    generated_intervals = self.step(
        "generate_gatk_intervals",
        GenerateIntervalsByChromosome(reference=self.reference),
        when=self.gatk_intervals.is_null(),
    ).out_regions
    intervals = FirstOperator([supplied_intervals, generated_intervals])

    # GATK
    recalibrator = GATKBaseRecalBQSRWorkflow_4_1_3(
        bam=bam_source,
        reference=self.reference,
        snps_dbsnp=self.snps_dbsnp,
        snps_1000gp=self.snps_1000gp,
        known_indels=self.known_indels,
        mills_indels=self.mills_indels,
        intervals=intervals,
    )
    self.step(
        "bqsr",
        recalibrator,
        scatter="intervals",
        doc="Perform base quality score recalibration",
    )

    # The caller is scattered over both the intervals and the "bqsr" output.
    germline_caller = GatkGermlineVariantCaller_4_1_3(
        bam=self.bqsr.out,
        intervals=intervals,
        reference=self.reference,
        snps_dbsnp=self.snps_dbsnp,
    )
    self.step("vc_gatk", germline_caller, scatter=["intervals", "bam"])

    # Gather the scattered VCFs, compress, sort, then uncompress again.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))
    self.step("vc_gatk_compressvcf", BGZipLatest(file=self.vc_gatk_merge.out))
    compressed_vcf = self.vc_gatk_compressvcf.out.as_type(CompressedVcf)
    self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))
    self.step(
        "vc_gatk_uncompress",
        UncompressArchive(file=self.vc_gatk_sort_combined.out),
    )

    self.output(
        "out_variants_gatk",
        source=self.vc_gatk_sort_combined.out,
        output_folder="variants",
        output_name="gatk",
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "out_variants_gatk_split",
        source=self.vc_gatk.out,
        output_folder=["variants", "gatk"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
def add_gatk_variantcaller(self, normal_bam_source, tumor_bam_source):
    """
    Register the GATK somatic variant-calling steps and their outputs.

    Reimplemented because the combine stage needs these steps.

    Steps added, in order: "generate_gatk_intervals" (only if it is not
    already registered), "bqsr_normal", "bqsr_tumor", "vc_gatk",
    "vc_gatk_merge", "vc_gatk_compress_for_sort", "vc_gatk_sort_combined" and
    "vc_gatk_uncompressvcf".

    Outputs declared: "out_variants_gatk" and "out_variants_split".

    :param normal_bam_source: value passed as ``bam`` to the "bqsr_normal" step
    :param tumor_bam_source: value passed as ``bam`` to the "bqsr_tumor" step
    """
    # Only register the interval generator once; a previous call may already
    # have added it, in which case its existing output is reused.
    if "generate_gatk_intervals" not in self.step_nodes:
        generated_intervals = self.step(
            "generate_gatk_intervals",
            GenerateIntervalsByChromosome(reference=self.reference),
            when=self.gatk_intervals.is_null(),
        ).out_regions
    else:
        generated_intervals = self.generate_gatk_intervals.out_regions

    # User-supplied intervals are listed first, generated ones second.
    intervals = FirstOperator([self.gatk_intervals, generated_intervals])

    # Arguments shared by the normal and tumor recalibration steps.
    recal_ins = dict(
        reference=self.reference,
        intervals=intervals,
        snps_dbsnp=self.snps_dbsnp,
        snps_1000gp=self.snps_1000gp,
        known_indels=self.known_indels,
        mills_indels=self.mills_indels,
    )
    for step_id, bam in (
        ("bqsr_normal", normal_bam_source),
        ("bqsr_tumor", tumor_bam_source),
    ):
        self.step(
            step_id,
            GATKBaseRecalBQSRWorkflow_4_1_3(bam=bam, **recal_ins),
            scatter="intervals",
        )

    # The caller is scattered over the intervals and both recalibrated BAMs.
    somatic_caller = GatkSomaticVariantCaller_4_1_3(
        normal_bam=self.bqsr_normal.out,
        tumor_bam=self.bqsr_tumor.out,
        normal_name=self.normal_name,
        intervals=intervals,
        reference=self.reference,
        gnomad=self.gnomad,
        panel_of_normals=self.panel_of_normals,
    )
    self.step(
        "vc_gatk",
        somatic_caller,
        scatter=["intervals", "normal_bam", "tumor_bam"],
    )

    # Gather the scattered VCFs, compress, sort, then uncompress again.
    self.step("vc_gatk_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out))
    self.step(
        "vc_gatk_compress_for_sort",
        BGZipLatest(file=self.vc_gatk_merge.out),
    )
    compressed_vcf = self.vc_gatk_compress_for_sort.out.as_type(CompressedVcf)
    self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))
    self.step(
        "vc_gatk_uncompressvcf",
        UncompressArchive(file=self.vc_gatk_sort_combined.out),
    )

    # VCF
    merged_name = StringFormatter(
        "{tumor_name}--{normal_name}_gatk",
        tumor_name=self.tumor_name,
        normal_name=self.normal_name,
    )
    self.output(
        "out_variants_gatk",
        source=self.vc_gatk_sort_combined.out,
        output_folder=["vcf"],
        output_name=merged_name,
        doc="Merged variants from the GATK caller",
    )
    self.output(
        "out_variants_split",
        source=self.vc_gatk.out,
        output_folder=["vcf", "GATKByInterval"],
        doc="Unmerged variants from the GATK caller (by interval)",
    )
def constructor(self):
    """
    Build the Pisces (5.3.0.0) workflow with Gemini read preprocessing.

    Steps, in order: "primary_only", "index_primary_only_bam",
    "gemini_read_preprocessing", "pisces", "vqr", "fixSource", "sort",
    "normalise", "uncompress", "filterpass".

    Outputs: "variants" (result of "sort"), "out" (result of "filterpass")
    and "out_bam" (Gemini ``bam``).
    """
    ## INPUTS
    # NOTE(review): "sample_name" and "min_dp" are declared here but are not
    # referenced by any step in this method -- confirm whether that is intended.
    self.input("bam", BamBai())
    self.input("sample_name", String())
    self.input("reference_folder", Directory())
    self.input("intervals", Bed())
    self.input("gemini_chromosomes", String(optional=True))
    self.input("ploidy", String(optional=True), default="somatic")
    self.input("min_bq", Int(optional=True))
    self.input("min_mq", Int(optional=True))
    self.input("min_dp", Int(optional=True))
    self.input("min_vaf", Float(optional=True))
    self.input("vc_min_vq", Int(optional=True))
    self.input("noise_level", Int(optional=True))
    self.input("vqr_min_vq", Int(optional=True))
    self.input("pisces_awk_script", File())

    ## STEPS
    # View the BAM with flag bits 0x100 excluded, then index the result.
    primary_view = SamToolsView_1_9(
        sam=self.bam,
        doNotOutputAlignmentsWithBitsSet="0x100",
    )
    self.step("primary_only", primary_view)
    self.step(
        "index_primary_only_bam",
        SamToolsIndex_1_9(bam=self.primary_only.out),
    )

    # The indexing step itself (not a named output) is passed as inputBam.
    gemini = PiscesGemini_5_3_0_0(
        inputBam=self.index_primary_only_bam,
        referenceFolder=self.reference_folder,
        samtoolsExecutable="samtools",
        chromosomeFilter=self.gemini_chromosomes,
        outputDir=".",
        piscesVersion="5.3.0.0",
    )
    self.step("gemini_read_preprocessing", gemini)

    # "min_vaf" feeds both the calling threshold and the frequency filter.
    variant_caller = PiscesVariantCaller_5_3_0_0(
        inputBam=self.gemini_read_preprocessing.bam,
        referenceFolder=self.reference_folder,
        outputDir=".",
        intervalBedFile=self.intervals,
        ploidy=self.ploidy,
        minimumBaseQuality=self.min_bq,
        minimumMappingQuality=self.min_mq,
        minimumVariantFrequency=self.min_vaf,
        noiseLevelForQModel=self.noise_level,
        minimumVariantFrequencyFilter=self.min_vaf,
        enableSingleStrandFilter="True",
        outputSBFiles="True",
        callMNVs="False",
        maxMNVLength=1,
        RMxNFilter="5,9,0.35",
        variantQualityFilter=self.vc_min_vq,
        crushVCF="False",
        gVCF="False",
        piscesVersion="5.3.0.0",
    )
    self.step("pisces", variant_caller)

    recalibration = PiscesVariantQualityRecalibration_5_3_0_0(
        inputVcf=self.pisces.vcf,
        outputDir=".",
        baselineNoise=self.noise_level,
        minVariantQuality=self.vqr_min_vq,
        piscesVersion="5.3.0.0",
    )
    self.step("vqr", recalibration)

    # The recalibrated VCF is listed first, the raw Pisces VCF second.
    piscesVcf = FirstOperator([self.vqr.vcf, self.pisces.vcf])
    self.step(
        "fixSource",
        Awk(script=self.pisces_awk_script, input_files=piscesVcf),
    )

    self.step("sort", BcfToolsSort_1_9(vcf=self.fixSource.out))
    self.step("normalise", BcfToolsNorm_1_9(vcf=self.sort.out))
    self.step("uncompress", UncompressArchive(file=self.normalise.out))
    pass_filter = VcfToolsvcftools_0_1_16(
        vcf=self.uncompress.out.as_type(Vcf),
        removeFileteredAll=True,
        recode=True,
        recodeINFOAll=True,
    )
    self.step("filterpass", pass_filter)

    ## OUTPUTS
    self.output("variants", source=self.sort.out)
    self.output("out", source=self.filterpass.out)
    self.output("out_bam", source=self.gemini_read_preprocessing.bam)
def constructor(self):
    """
    Build the targeted, tumor-only pipeline from FASTQs to annotated VCFs.

    Stages, in order: FastQC and adaptor parsing, alignment, merge and mark,
    depth-of-coverage and performance summaries, GRIDSS, BQSR, Mutect2
    (tumor-only), HaplotypeCaller, combine/sort, AddBamStats, and the
    length-based variant filters.

    Several Molpath-specific steps are currently disabled and preserved as
    comments below. The inputs "seqrun", "panel_name", "vcfcols",
    "mutalyzer_server" and "pathos_db" are not referenced by any active step
    in this method.
    """
    # ---------------------------------------------------------------- inputs
    self.input("sample_name", String)
    self.input("fastqs", Array(FastqGzPair))
    self.input("seqrun", String, doc="SeqRun Name (for Vcf2Tsv)")
    self.input("reference", FastaWithDict)
    self.input("region_bed", Bed)
    self.input("region_bed_extended", Bed)
    self.input("region_bed_annotated", Bed)
    self.input("genecoverage_bed", Bed)
    self.input("genome_file", TextFile)
    self.input("panel_name", String)
    self.input("vcfcols", TextFile)
    self.input("black_list", Bed(optional=True))
    self.input("snps_dbsnp", VcfTabix)
    self.input("snps_1000gp", VcfTabix)
    self.input("known_indels", VcfTabix)
    self.input("mills_indels", VcfTabix)
    self.input("mutalyzer_server", String)
    self.input("pathos_db", String)
    self.input("maxRecordsInRam", Int)
    # tumor only
    self.input("gnomad", VcfTabix)
    self.input("panel_of_normals", VcfTabix(optional=True))

    # -------------------------------------------------------- QC + alignment
    # fastqc
    fastqc_tool = FastQC_0_11_5(reads=self.fastqs, threads=4)
    self.step("fastqc", fastqc_tool, scatter="reads")

    # get the overrepresentative sequence from fastqc
    adaptor_parser = ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile)
    self.step("getfastqc_adapters", adaptor_parser, scatter="fastqc_datafiles")

    # align and generate sorted index bam; the adaptor-parsing step is passed
    # to both cutadapt arguments, and all three arguments are scattered.
    aligner = BwaAligner(
        fastq=self.fastqs,
        reference=self.reference,
        sample_name=self.sample_name,
        sortsam_tmpDir=".",
        cutadapt_adapter=self.getfastqc_adapters,
        cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
    )
    self.step(
        "align_and_sort",
        aligner,
        scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
    )

    # merge into one bam and markdups
    merger = MergeAndMarkBams_4_1_3(
        bams=self.align_and_sort.out,
        sampleName=self.sample_name,
        maxRecordsInRam=self.maxRecordsInRam,
    )
    self.step("merge_and_mark", merger)

    # ------------------------------------------------------------ performance
    # performance: doc
    depth_annotator = AnnotateDepthOfCoverage_0_1_0(
        bam=self.merge_and_mark.out,
        bed=self.region_bed_annotated,
        reference=self.reference,
        sample_name=self.sample_name,
    )
    self.step("annotate_doc", depth_annotator)

    # performance
    summary_tool = PerformanceSummaryTargeted_0_1_0(
        bam=self.merge_and_mark.out,
        region_bed=self.region_bed,
        genecoverage_bed=self.genecoverage_bed,
        sample_name=self.sample_name,
        genome_file=self.genome_file,
    )
    self.step("performance_summary", summary_tool)

    # ------------------------------------------------------------------ gridss
    gridss_tool = Gridss_2_6_2(
        bams=self.merge_and_mark.out,
        reference=self.reference,
        blacklist=self.black_list,
        tmpdir=".",
    )
    self.step("gridss", gridss_tool)
    # post gridss r for tumor only + tumor only mode
    # self.step("gridss_post_r", GRIDSSProcessOutput(inp=self.gridss.out))

    # ----------------------------------------------------------- GATK calling
    # gatk bqsr bam
    recalibrator = GATKBaseRecalBQSRWorkflow_4_1_3(
        bam=self.merge_and_mark.out,
        intervals=self.region_bed_extended,
        reference=self.reference,
        snps_dbsnp=self.snps_dbsnp,
        snps_1000gp=self.snps_1000gp,
        known_indels=self.known_indels,
        mills_indels=self.mills_indels,
    )
    self.step("bqsr", recalibrator)

    # mutect2
    tumor_only_caller = GatkSomaticVariantCallerTumorOnlyTargeted(
        bam=self.bqsr.out,
        intervals=self.region_bed_extended,
        reference=self.reference,
        gnomad=self.gnomad,
        panel_of_normals=self.panel_of_normals,
    )
    self.step("mutect2", tumor_only_caller)

    # haplotypecaller
    # to do: take base recal away from the (original note ends here)
    haplotype_tool = Gatk4HaplotypeCaller_4_1_3(
        inputRead=self.bqsr.out,
        intervals=self.region_bed_extended,
        reference=self.reference,
        dbsnp=self.snps_dbsnp,
        pairHmmImplementation="LOGLESS_CACHING",
    )
    self.step("haplotype_caller", haplotype_tool)
    normaliser = SplitMultiAlleleNormaliseVcf(
        compressedVcf=self.haplotype_caller.out,
        reference=self.reference,
    )
    self.step("splitnormalisevcf", normaliser)

    # -------------------------------------------------------- combine variants
    combiner = CombineVariants_0_0_8(
        vcfs=[self.splitnormalisevcf.out, self.mutect2.out],
        type="germline",
        columns=["AD", "DP", "AF", "GT"],
    )
    self.step("combinevariants", combiner)
    self.step("compressvcf", BGZip_1_9(file=self.combinevariants.out))
    self.step("sortvcf", BcfToolsSort_1_9(vcf=self.compressvcf.out))
    self.step("uncompressvcf", UncompressArchive(file=self.sortvcf.out))

    # addbamstats
    bam_stats = AddBamStatsGermline_0_1_0(
        bam=self.merge_and_mark.out,
        vcf=self.uncompressvcf.out,
        reference=self.reference,
    )
    self.step("addbamstats", bam_stats)

    # ------------------------------------------- Molpath specific processes
    self.step("compressvcf2", BGZip_1_9(file=self.addbamstats.out))
    self.step("tabixvcf", TabixLatest(inp=self.compressvcf2.out))
    self.step(
        "calculate_variant_length",
        VcfLength_1_0_1(vcf=self.tabixvcf.out),
        doc="Add the length column for the output of AddBamStats",
    )

    # The same filter expression drives both steps below: the "_failed" step
    # applies it as-is, the other applies it inverted.
    filter_for_variants = self.input("filter_for_vcfs", str, default="length > 150")
    failed_filter = VcfFilter_1_0_1(
        vcf=self.calculate_variant_length.out,
        info_filter=filter_for_variants,
    )
    self.step("filter_variants_1_failed", failed_filter)
    passed_filter = VcfFilter_1_0_1(
        vcf=self.calculate_variant_length.out,
        info_filter=filter_for_variants,
        invert=True,  # -v param
    )
    self.step("filter_variants_1", passed_filter)

    # Jiaan: copy over from the FRCP, can take the block comment out
    # # This one is the in-house molpath step
    # self.step(
    #     "normalise_vcfs",
    #     NormaliseVcf_1_5_4(
    #         pathos_version=self.pathos_db,
    #         mutalyzer=self.mutalyzer_server,  # mutalyzer="https://vmpr-res-mutalyzer1.unix.petermac.org.au",
    #         rdb=self.pathos_db,  # rdb="pa_uat",
    #         inp=self.filter_variants_1.out,
    #     ),
    # )
    # # repeat remove 150bp variants (workaround for normalise_vcf bug)
    # self.step(
    #     "filter_variants_2_failed",
    #     VcfFilter_1_0_1(
    #         vcf=self.normalise_vcfs.out, info_filter=filter_for_variants
    #     ),
    # )
    # self.step(
    #     "filter_variants_2",
    #     VcfFilter_1_0_1(
    #         vcf=self.normalise_vcfs.out,
    #         info_filter=filter_for_variants,
    #         invert=True,  # -v param
    #     ),
    # )
    # self.step(
    #     "convert_to_tsv",
    #     Vcf2Tsv_1_5_4(
    #         pathos_version=self.pathos_db,
    #         inp=self.filter_variants_2.out,
    #         sample=self.sample_name,
    #         columns=self.vcfcols,
    #         seqrun=self.seqrun,
    #     ),
    # )
    # self.step(
    #     "index_with_igvtools", IgvIndexFeature_2_5_3(inp=self.filter_variants_2.out)
    # )

    # ----------------------------------------------------------------- output
    self.output("fastq_qc", source=self.fastqc.out, output_folder="QC")
    self.output("markdups_bam", source=self.merge_and_mark.out, output_folder="BAM")

    self.output(
        "doc_out",
        source=self.annotate_doc.out,
        output_folder="PERFORMANCE",
    )
    self.output(
        "summary",
        source=self.performance_summary.out,
        output_folder="PERFORMANCE",
    )
    self.output(
        "gene_summary",
        source=self.performance_summary.geneFileOut,
        output_folder="PERFORMANCE",
    )
    self.output(
        "region_summary",
        source=self.performance_summary.regionFileOut,
        output_folder="PERFORMANCE",
    )

    self.output("gridss_vcf", source=self.gridss.out, output_folder="SV")
    self.output("gridss_bam", source=self.gridss.assembly, output_folder="SV")

    self.output(
        "haplotypecaller_vcf",
        source=self.haplotype_caller.out,
        output_folder="VCF",
    )
    self.output(
        "haplotypecaller_bam",
        source=self.haplotype_caller.bam,
        output_folder="VCF",
    )
    self.output(
        "haplotypecaller_norm",
        source=self.splitnormalisevcf.out,
        output_folder="VCF",
    )
    self.output("mutect2_vcf", source=self.mutect2.variants, output_folder="VCF")
    self.output("mutect2_bam", source=self.mutect2.out_bam, output_folder="VCF")
    self.output("mutect2_norm", source=self.mutect2.out, output_folder="VCF")
    self.output("addbamstats_vcf", source=self.addbamstats.out)
def constructor(self):
    """
    Build the Pisces (5.2.10.49) workflow with Hygea realignment and Stitcher.

    Steps, in order: "primary_only", "index_primary_only_bam",
    "hygea_realignment", "stitcher_read_joining", "stitcher_sort",
    "stitcher_index", "pisces", "vqr", "fixSource", "sort", "normalise",
    "uncompress", "filterpass".

    Outputs: "variants", "out", "out_bam", plus the ``used_options`` of the
    Hygea, Stitcher, Pisces and VQR steps.
    """
    version = "5.2.10.49"

    ## INPUTS
    self.input("bam", BamBai())
    self.input("sample_name", String())
    self.input("reference_folder", Directory())
    self.input("intervals", Bed())
    self.input("ploidy", String(optional=True), default="somatic")
    self.input("min_bq", Int(optional=True))
    self.input("min_mq", Int(optional=True))
    self.input("min_dp", Int(optional=True), default=100)
    self.input("min_vaf", Float(optional=True))
    self.input("vc_min_vq", Int(optional=True))
    self.input("noise_level", Int(optional=True))
    self.input("vqr_min_vq", Int(optional=True))
    self.input("pisces_awk_script", File())

    ## STEPS
    # View the BAM with flag bits 0x100 excluded, then index the result.
    primary_view = SamToolsView_1_9(
        sam=self.bam,
        doNotOutputAlignmentsWithBitsSet="0x100",
    )
    self.step("primary_only", primary_view)
    self.step(
        "index_primary_only_bam",
        SamToolsIndex_1_9(bam=self.primary_only.out),
    )

    # The indexing step itself (not a named output) is passed as inputBam.
    realigner = PiscesHygeaRealigner_5_2_10_49(
        inputBam=self.index_primary_only_bam,
        outputDir=".",
        referenceFolder=self.reference_folder,
        skipAndRemoveDuplicates="true",
        piscesVersion=version,
    )
    self.step("hygea_realignment", realigner)

    stitcher = PiscesStitcher_5_2_10_49(
        inputBam=self.hygea_realignment.out,
        outputDir=".",
        sampleName=self.sample_name,
        piscesVersion=version,
    )
    self.step("stitcher_read_joining", stitcher)

    # The stitched BAM is sorted into "<sample_name>.bam" and then indexed.
    sorter = SamToolsSort_1_9(
        bam=self.stitcher_read_joining.out,
        outputFilename=self.sample_name + ".bam",
    )
    self.step("stitcher_sort", sorter)
    self.step("stitcher_index", SamToolsIndex_1_9(bam=self.stitcher_sort.out))

    # "min_vaf" feeds both the calling threshold and the frequency filter.
    variant_caller = PiscesVariantCaller_5_2_10_49(
        inputBam=self.stitcher_index.out,
        referenceFolder=self.reference_folder,
        outputDir=".",
        intervalBedFile=self.intervals,
        ploidy=self.ploidy,
        minimumBaseQuality=self.min_bq,
        minimumMappingQuality=self.min_mq,
        minimumVariantFrequency=self.min_vaf,
        minimumCoverage=self.min_dp,
        noiseLevelForQModel=self.noise_level,
        minimumVariantFrequencyFilter=self.min_vaf,
        enableSingleStrandFilter="true",
        callMNVs="false",
        maxMNVLength=1,
        RMxNFilter="5,9,0.35",
        variantQualityFilter=self.vc_min_vq,
        crushVCF="false",
        gVCF="false",
        piscesVersion=version,
    )
    self.step("pisces", variant_caller)

    recalibration = PiscesVariantQualityRecalibration_5_2_10_49(
        inputVcf=self.pisces.vcf,
        outputDir=".",
        baselineNoise=self.noise_level,
        minVariantQuality=self.vqr_min_vq,
        piscesVersion=version,
    )
    self.step("vqr", recalibration)

    # The recalibrated VCF is listed first, the raw Pisces VCF second.
    piscesVcf = FirstOperator([self.vqr.vcf, self.pisces.vcf])
    self.step(
        "fixSource",
        Awk(script=self.pisces_awk_script, input_files=piscesVcf),
    )

    self.step("sort", BcfToolsSort_1_9(vcf=self.fixSource.out))
    self.step("normalise", BcfToolsNorm_1_9(vcf=self.sort.out))
    self.step("uncompress", UncompressArchive(file=self.normalise.out))
    pass_filter = VcfToolsvcftools_0_1_16(
        vcf=self.uncompress.out.as_type(Vcf),
        removeFileteredAll=True,
        recode=True,
        recodeINFOAll=True,
    )
    self.step("filterpass", pass_filter)

    ## OUTPUTs
    self.output("variants", source=self.sort.out)
    self.output("out", source=self.filterpass.out)
    self.output("out_bam", source=self.stitcher_index.out)

    ## OPTIONAL OUTPUTs
    self.output("hygea_options", source=self.hygea_realignment.used_options)
    self.output("stitcher_options", source=self.stitcher_read_joining.used_options)
    self.output("pisces_options", source=self.pisces.used_options)
    self.output("vqr_options", source=self.vqr.used_options)
def constructor(self):
    """
    Build the samtools mpileup + VarScan2 (mpileup2cns) workflow.

    Steps, in order: "mpileup", "varscan2cns", "VSheader", "VSheaderContigs",
    "sortVcf", "normaliseVcf", "uncompress", "filterpass".

    Outputs: "variants" (result of "sortVcf") and "out" (result of
    "filterpass").
    """
    ## INPUTs
    self.input("sample_name", String())
    self.input("reference", Fasta())
    # For mpileup
    self.input("bam", BamBai())
    self.input("pileup_max_depth", Int(optional=True))
    self.input("min_bq", Int(optional=True), default=20)
    # For Varscan2
    self.input("min_dp", Int(optional=True))
    self.input("min_ad", Int(optional=True))
    self.input("min_vaf", Float(optional=True))
    self.input("pval", Float(optional=True), default=0.0001)
    # For correction of VCF header
    self.input("header_lines", File)

    ## STEPS
    # Output filename is "./<sample_name>.mpileup".
    pileup = SamToolsMpileup_1_9(
        bam=self.bam,
        outputFilename="./" + self.sample_name + ".mpileup",
        noBAQ=True,
        maxDepth=self.pileup_max_depth,
        reference=self.reference,
    )
    self.step("mpileup", pileup)

    consensus_caller = VarscanMpileup2cns_2_4_2(
        mpileup=self.mpileup.out,
        minCoverage=self.min_dp,
        minVariantReads=self.min_ad,
        minBaseQuality=self.min_bq,
        minVariantFrequency=self.min_vaf,
        pValue=self.pval,
        outputVcfFormat=1,
        variantPositionsOnly=1,
    )
    self.step("varscan2cns", consensus_caller)

    # Output VCF name is "./<sample_name>VS.reheader.vcf".
    reheader = VarscanHeader(
        inputVcf=self.varscan2cns.out,
        outputVcf="./" + self.sample_name + "VS.reheader.vcf",
    )
    self.step("VSheader", reheader)

    # Annotate the re-headered VCF with the supplied header lines.
    contig_annotator = BcfToolsAnnotate_1_5(
        vcf=self.VSheader.out,
        headerLines=self.header_lines,
    )
    self.step("VSheaderContigs", contig_annotator)

    self.step("sortVcf", BcfToolsSort_1_9(vcf=self.VSheaderContigs.out))
    self.step("normaliseVcf", BcfToolsNorm_1_9(vcf=self.sortVcf.out))
    self.step("uncompress", UncompressArchive(file=self.normaliseVcf.out))
    pass_filter = VcfToolsvcftools_0_1_16(
        vcf=self.uncompress.out.as_type(Vcf),
        removeFileteredAll=True,
        recode=True,
        recodeINFOAll=True,
    )
    self.step("filterpass", pass_filter)

    ## OUTPUTS
    self.output("variants", source=self.sortVcf.out)
    self.output("out", source=self.filterpass.out)