def add_gatk_variantcaller(self, normal_bam_source, tumor_bam_source):
        """
        Register the somatic GATK variant-calling steps on this workflow.

        Steps added, in order: ``generate_gatk_intervals`` (only if it is not
        already registered), ``bqsr_normal``, ``bqsr_tumor``, ``vc_gatk``,
        ``vc_gatk_merge``, ``vc_gatk_compressvcf``, ``vc_gatk_sort_combined``
        and ``vc_gatk_uncompressvcf``. No workflow outputs are declared here.

        :param normal_bam_source: source wired to the ``bam`` input of ``bqsr_normal``
        :param tumor_bam_source: source wired to the ``bam`` input of ``bqsr_tumor``
        """
        # Only register the interval-generation step once; it is conditional
        # on the ``gatk_intervals`` input being null.
        if "generate_gatk_intervals" not in self.step_nodes:
            interval_step = self.step(
                "generate_gatk_intervals",
                GenerateIntervalsByChromosome(reference=self.reference),
                when=self.gatk_intervals.is_null(),
            )
        else:
            interval_step = self.generate_gatk_intervals
        generated_regions = interval_step.out_regions

        # Supplied intervals are listed ahead of the generated ones.
        intervals = FirstOperator([self.gatk_intervals, generated_regions])

        # Inputs shared by both recalibration steps (everything except the bam).
        bqsr_shared = dict(
            reference=self.reference,
            intervals=intervals,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        )
        for bqsr_step_id, bam in (
            ("bqsr_normal", normal_bam_source),
            ("bqsr_tumor", tumor_bam_source),
        ):
            self.step(
                bqsr_step_id,
                GATKBaseRecalBQSRWorkflow_4_1_3(bam=bam, **bqsr_shared),
                scatter="intervals",
            )

        somatic_caller = GatkSomaticVariantCaller_4_1_3(
            normal_bam=self.bqsr_normal.out,
            tumor_bam=self.bqsr_tumor.out,
            normal_name=self.normal_name,
            intervals=intervals,
            reference=self.reference,
            gnomad=self.gnomad,
            panel_of_normals=self.panel_of_normals,
        )
        self.step(
            "vc_gatk",
            somatic_caller,
            scatter=["intervals", "normal_bam", "tumor_bam"],
        )

        # Gather the per-interval calls, then compress -> sort -> uncompress.
        gather_tool = Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out)
        self.step("vc_gatk_merge", gather_tool)

        compress_tool = BGZipLatest(file=self.vc_gatk_merge.out)
        self.step("vc_gatk_compressvcf", compress_tool)

        compressed_vcf = self.vc_gatk_compressvcf.out.as_type(CompressedVcf)
        self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))

        sorted_vcf = self.vc_gatk_sort_combined.out
        self.step("vc_gatk_uncompressvcf", UncompressArchive(file=sorted_vcf))
    def add_vardict_variantcaller(self, normal_bam_source, tumor_bam_source):
        """
        Register the somatic VarDict variant-calling steps and outputs.

        Steps added, in order: ``generate_vardict_headerlines``, ``vc_vardict``
        (scattered on ``intervals``), ``vc_vardict_merge``,
        ``vc_vardict_compress_for_sort``, ``vc_vardict_sort_combined`` and
        ``vc_vardict_uncompress_for_combine``.

        Outputs added: ``out_variants_vardict`` (from the sort step) and
        ``out_variants_vardict_split`` (per-interval results taken directly
        from ``vc_vardict``).

        This method reads ``self.allele_freq_threshold``, ``self.minMappingQual``
        and ``self.filter`` but does not declare them.

        :param normal_bam_source: source wired to the caller's ``normal_bam`` input
        :param tumor_bam_source: source wired to the caller's ``tumor_bam`` input
        """
        self.step(
            "generate_vardict_headerlines",
            GenerateVardictHeaderLines(reference=self.reference),
        )
        self.step(
            "vc_vardict",
            VardictSomaticVariantCaller(
                normal_bam=normal_bam_source,
                tumor_bam=tumor_bam_source,
                normal_name=self.normal_name,
                tumor_name=self.tumor_name,
                header_lines=self.generate_vardict_headerlines.out,
                intervals=self.vardict_intervals,
                reference=self.reference,
                allele_freq_threshold=self.allele_freq_threshold,
                minMappingQual=self.minMappingQual,
                filter=self.filter,
            ),
            scatter="intervals",
        )

        # Gather the per-interval calls, then compress -> sort -> uncompress.
        self.step("vc_vardict_merge",
                  Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
        self.step("vc_vardict_compress_for_sort",
                  BGZipLatest(file=self.vc_vardict_merge.out))
        self.step(
            "vc_vardict_sort_combined",
            BcfToolsSort_1_9(vcf=self.vc_vardict_compress_for_sort.out.as_type(
                CompressedVcf)),
        )
        self.step(
            "vc_vardict_uncompress_for_combine",
            UncompressArchive(file=self.vc_vardict_sort_combined.out),
        )

        self.output(
            "out_variants_vardict",
            source=self.vc_vardict_sort_combined.out,
            output_folder=[
                "vcf",
            ],
            output_name=StringFormatter(
                "{tumor_name}--{normal_name}_vardict",
                tumor_name=self.tumor_name,
                normal_name=self.normal_name,
            ),
            doc="Merged variants from the VarDict caller",
        )
        self.output(
            "out_variants_vardict_split",
            source=self.vc_vardict.out,
            output_folder=[
                "vcf",
                "VardictByInterval",
            ],
            # Fixed copy-paste error: this output carries VarDict results, not GATK.
            doc="Unmerged variants from the VarDict caller (by interval)",
        )
    def add_vardict_variantcaller(self, bam_source):
        """
        Register the germline VarDict inputs, steps and outputs.

        Inputs declared: ``allele_freq_threshold`` (Float, with 0.05 passed as
        the third positional argument), ``minMappingQual`` and ``filter``
        (both optional).

        Steps added, in order: ``generate_vardict_headerlines``, ``vc_vardict``
        (scattered on ``intervals``), ``vc_vardict_merge``,
        ``vc_vardict_compress_for_sort``, ``vc_vardict_sort_combined`` and
        ``vc_vardict_uncompress_for_combine``.

        Outputs added: ``out_variants_vardict`` and
        ``out_variants_vardict_split``.

        :param bam_source: source wired to the caller's ``bam`` input
        """
        # The original statement ended with a stray trailing comma, which
        # turned it into a discarded one-element tuple; it is now a plain call.
        self.input(
            "allele_freq_threshold",
            Float,
            0.05,
        )
        self.input("minMappingQual", Int(optional=True))
        self.input("filter", String(optional=True))
        # Vardict
        self.step(
            "generate_vardict_headerlines",
            GenerateVardictHeaderLines(reference=self.reference),
        )
        self.step(
            "vc_vardict",
            VardictGermlineVariantCaller(
                bam=bam_source,
                reference=self.reference,
                intervals=self.vardict_intervals,
                sample_name=self.sample_name,
                allele_freq_threshold=self.allele_freq_threshold,
                header_lines=self.generate_vardict_headerlines.out,
                minMappingQual=self.minMappingQual,
                filter=self.filter,
            ),
            scatter="intervals",
        )
        # Gather the per-interval calls, then compress -> sort -> uncompress.
        self.step("vc_vardict_merge", Gatk4GatherVcfs_4_1_3(vcfs=self.vc_vardict.out))
        self.step(
            "vc_vardict_compress_for_sort",
            BGZipLatest(file=self.vc_vardict_merge.out.as_type(Vcf)),
        )
        self.step(
            "vc_vardict_sort_combined",
            BcfToolsSort_1_9(
                vcf=self.vc_vardict_compress_for_sort.out.as_type(CompressedVcf)
            ),
        )

        self.step(
            "vc_vardict_uncompress_for_combine",
            UncompressArchive(file=self.vc_vardict_sort_combined.out),
        )

        self.output(
            "out_variants_vardict",
            source=self.vc_vardict_sort_combined.out,
            output_folder=["variants"],
            output_name="vardict",
            doc="Merged variants from the VarDict caller",
        )
        self.output(
            "out_variants_vardict_split",
            source=self.vc_vardict.out,
            output_folder=["variants", "vardict"],
            doc="Unmerged variants from the VarDict caller (by interval)",
        )
    def add_combine_variants(self, normal_bam_source, tumor_bam_source):
        """
        Register the steps that combine the somatic caller results.

        Relies on steps named ``vc_gatk_uncompressvcf``, ``vc_strelka`` and
        ``vc_vardict_uncompress_for_combine`` already existing on the workflow.

        Steps added, in order: ``combine_variants``, ``combined_compress``,
        ``combined_sort``, ``combined_uncompress`` and ``combined_addbamstats``.
        Output added: ``out_variants``.

        :param normal_bam_source: source wired to ``normal_bam`` of ``combined_addbamstats``
        :param tumor_bam_source: source wired to ``tumor_bam`` of ``combined_addbamstats``
        """
        # One VCF per caller, in the order GATK, Strelka, VarDict.
        caller_vcfs = [
            self.vc_gatk_uncompressvcf.out.as_type(Vcf),
            self.vc_strelka.out,
            self.vc_vardict_uncompress_for_combine.out.as_type(Vcf),
        ]
        combine_tool = CombineVariants_0_0_8(
            normal=self.normal_name,
            tumor=self.tumor_name,
            vcfs=caller_vcfs,
            type="somatic",
            columns=["AD", "DP", "GT"],
        )
        self.step("combine_variants", combine_tool)

        # compress -> sort -> uncompress the combined VCF
        self.step(
            "combined_compress",
            BGZipLatest(file=self.combine_variants.out),
        )
        compressed_combined = self.combined_compress.out.as_type(CompressedVcf)
        self.step("combined_sort", BcfToolsSort_1_9(vcf=compressed_combined))
        self.step(
            "combined_uncompress",
            UncompressArchive(file=self.combined_sort.out),
        )

        bamstats_tool = AddBamStatsSomatic_0_1_0(
            normal_id=self.normal_name,
            tumor_id=self.tumor_name,
            normal_bam=normal_bam_source,
            tumor_bam=tumor_bam_source,
            vcf=self.combined_uncompress.out.as_type(Vcf),
            reference=self.reference,
        )
        self.step("combined_addbamstats", bamstats_tool)

        combined_name = StringFormatter(
            "{tumor_name}--{normal_name}_combined",
            tumor_name=self.tumor_name,
            normal_name=self.normal_name,
        )
        self.output(
            "out_variants",
            source=self.combined_addbamstats.out,
            output_folder=["vcf"],
            output_name=combined_name,
            doc="Combined variants from GATK, VarDict and Strelka callers",
        )
# Example #5
# 0
    def constructor(self):
        """
        Declare the inputs, steps and outputs of this Manta + Strelka workflow.

        Inputs: ``bam``, ``reference``, optional ``intervals`` and optional
        ``is_exome``.
        Steps, in order: ``manta``, ``strelka``, ``uncompressvcf``,
        ``splitnormalisevcf`` and ``filterpass``.
        Outputs: ``sv`` (``manta.diploidSV``), ``variants``
        (``strelka.variants``) and ``out`` (``filterpass.out``).
        """
        self.input("bam", BamBai)
        self.input("reference", FastaWithDict)
        self.input("intervals", BedTabix(optional=True))
        self.input("is_exome", Boolean(optional=True))

        manta_tool = Manta_1_5_0(
            bam=self.bam,
            reference=self.reference,
            callRegions=self.intervals,
            exome=self.is_exome,
        )
        self.step("manta", manta_tool)

        # Strelka takes Manta's candidate small indels as an input.
        strelka_tool = StrelkaGermline_2_9_10(
            bam=self.bam,
            reference=self.reference,
            indelCandidates=self.manta.candidateSmallIndels,
            callRegions=self.intervals,
            exome=self.is_exome,
        )
        self.step("strelka", strelka_tool)

        # normalise and filter "PASS" variants
        self.step(
            "uncompressvcf",
            UncompressArchive(file=self.strelka.variants),
        )
        split_tool = SplitMultiAllele(
            vcf=self.uncompressvcf.out,
            reference=self.reference,
        )
        self.step("splitnormalisevcf", split_tool)

        pass_filter_tool = VcfToolsvcftoolsLatest(
            vcf=self.splitnormalisevcf.out,
            removeFileteredAll=True,
            recode=True,
            recodeINFOAll=True,
        )
        self.step("filterpass", pass_filter_tool)

        for output_id, output_source in (
            ("sv", self.manta.diploidSV),
            ("variants", self.strelka.variants),
            ("out", self.filterpass.out),
        ):
            self.output(output_id, source=output_source)
    def add_combine_variants(self, bam_source):
        """
        Register the steps that combine the germline caller results.

        Note: this relies on the specific step names registered earlier:
        ``vc_gatk_uncompress``, ``vc_strelka`` and
        ``vc_vardict_uncompress_for_combine``.

        Steps added, in order: ``combine_variants``, ``combined_compress``,
        ``combined_sort``, ``combined_uncompress`` and ``combined_addbamstats``.
        Output added: ``out_variants``.

        :param bam_source: source wired to the ``bam`` input of ``combined_addbamstats``
        """
        # Combine: one VCF per caller, in the order GATK, Strelka, VarDict.
        caller_vcfs = [
            self.vc_gatk_uncompress.out.as_type(Vcf),
            self.vc_strelka.out,
            self.vc_vardict_uncompress_for_combine.out.as_type(Vcf),
        ]
        combine_tool = CombineVariants_0_0_8(
            vcfs=caller_vcfs,
            type="germline",
            columns=["AC", "AN", "AF", "AD", "DP", "GT"],
        )
        self.step("combine_variants", combine_tool)

        # compress -> sort -> uncompress the combined VCF
        self.step(
            "combined_compress",
            BGZipLatest(file=self.combine_variants.out),
        )
        compressed_combined = self.combined_compress.out.as_type(CompressedVcf)
        self.step("combined_sort", BcfToolsSort_1_9(vcf=compressed_combined))
        self.step(
            "combined_uncompress",
            UncompressArchive(file=self.combined_sort.out),
        )

        bamstats_tool = AddBamStatsGermline_0_1_0(
            bam=bam_source,
            vcf=self.combined_uncompress.out.as_type(Vcf),
            reference=self.reference,
        )
        self.step("combined_addbamstats", bamstats_tool)

        self.output(
            "out_variants",
            source=self.combined_addbamstats.out,
            output_folder="variants",
            output_name="combined",
            doc="Combined variants from all 3 callers",
        )
    def constructor(self):
        """
        Declare the inputs, steps and outputs of this HaplotypeCaller workflow.

        Inputs: ``bam``, optional ``intervals``, ``reference`` and ``snps_dbsnp``.
        Steps, in order: ``split_bam``, ``haplotype_caller``, ``uncompressvcf``
        and ``splitnormalisevcf``.
        Outputs: ``variants`` (``haplotype_caller.out``), ``out_bam``
        (``haplotype_caller.bam``) and ``out`` (``splitnormalisevcf.out``).
        """
        intervals_doc = (
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec"
        )

        self.input("bam", BamBai)
        self.input("intervals", Bed(optional=True), doc=intervals_doc)
        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)

        split_tool = gatk4.Gatk4SplitReads_4_1_3(
            bam=self.bam,
            intervals=self.intervals,
        )
        self.step("split_bam", split_tool)

        # The caller reads the output of the split step, not the raw input bam.
        caller_tool = gatk4.Gatk4HaplotypeCaller_4_1_3(
            inputRead=self.split_bam.out,
            intervals=self.intervals,
            reference=self.reference,
            dbsnp=self.snps_dbsnp,
            pairHmmImplementation="LOGLESS_CACHING",
        )
        self.step("haplotype_caller", caller_tool)

        self.step(
            "uncompressvcf",
            UncompressArchive(file=self.haplotype_caller.out),
        )
        normalise_tool = SplitMultiAllele(
            vcf=self.uncompressvcf.out,
            reference=self.reference,
        )
        self.step("splitnormalisevcf", normalise_tool)

        self.output("variants", source=self.haplotype_caller.out)
        self.output("out_bam", source=self.haplotype_caller.bam)
        self.output("out", source=self.splitnormalisevcf.out)
    def constructor(self):
        """
        Declare the inputs, steps and outputs of this tumor/normal Mutect2 workflow.

        Inputs: ``normal_bam``, ``tumor_bam``, optional ``normal_name``,
        optional ``intervals``, ``reference``, ``gnomad`` and optional
        ``panel_of_normals``.
        Steps, in order: ``normal_split_bam``, ``tumor_split_bam``, ``mutect2``,
        ``learnorientationmodel``, ``getpileupsummaries``,
        ``calculatecontamination``, ``filtermutect2calls``, ``uncompressvcf``,
        ``splitnormalisevcf`` and ``filterpass``.
        Outputs: ``variants`` (``filtermutect2calls.out``), ``out_bam``
        (``mutect2.bam``) and ``out`` (``filterpass.out``).
        """
        intervals_doc = (
            "This optional intervals file supports processing by regions. If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec"
        )

        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)
        self.input("normal_name", String(optional=True))
        self.input("intervals", Bed(optional=True), doc=intervals_doc)
        self.input("reference", FastaWithDict)
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # split normal and tumor bam
        for split_step_id, bam_input in (
            ("normal_split_bam", self.normal_bam),
            ("tumor_split_bam", self.tumor_bam),
        ):
            self.step(
                split_step_id,
                self.process_subpipeline(bam=bam_input, intervals=self.intervals),
            )

        # variant calling + learn read orientation model
        mutect_tool = gatk4.GatkMutect2_4_1_3(
            normalBams=[self.normal_split_bam.out],
            tumorBams=[self.tumor_split_bam.out],
            normalSample=self.normal_name,
            intervals=self.intervals,
            reference=self.reference,
            germlineResource=self.gnomad,
            panelOfNormals=self.panel_of_normals,
        )
        self.step("mutect2", mutect_tool)

        orientation_tool = gatk4.Gatk4LearnReadOrientationModelLatest(
            f1r2CountsFiles=self.mutect2.f1f2r_out,
        )
        self.step("learnorientationmodel", orientation_tool)

        # calculate contamination and segmentation
        pileup_tool = gatk4.Gatk4GetPileUpSummariesLatest(
            bam=self.tumor_split_bam.out,
            sites=self.gnomad,
            intervals=self.intervals,
        )
        self.step("getpileupsummaries", pileup_tool)

        contamination_tool = gatk4.Gatk4CalculateContaminationLatest(
            pileupTable=self.getpileupsummaries.out,
        )
        self.step("calculatecontamination", contamination_tool)

        # The filter step consumes results of mutect2, the contamination
        # calculation and the orientation model.
        filter_calls_tool = gatk4.Gatk4FilterMutectCallsLatest(
            vcf=self.mutect2.out,
            reference=self.reference,
            segmentationFile=self.calculatecontamination.segOut,
            contaminationTable=self.calculatecontamination.contOut,
            readOrientationModel=self.learnorientationmodel.out,
            statsFile=self.mutect2.stats,
        )
        self.step("filtermutect2calls", filter_calls_tool)

        # normalise and filter "PASS" variants
        self.step(
            "uncompressvcf",
            UncompressArchive(file=self.filtermutect2calls.out),
        )
        normalise_tool = SplitMultiAllele(
            vcf=self.uncompressvcf.out,
            reference=self.reference,
        )
        self.step("splitnormalisevcf", normalise_tool)

        pass_filter_tool = VcfToolsvcftoolsLatest(
            vcf=self.splitnormalisevcf.out,
            removeFileteredAll=True,
            recode=True,
            recodeINFOAll=True,
        )
        self.step("filterpass", pass_filter_tool)

        self.output("variants", source=self.filtermutect2calls.out)
        self.output("out_bam", source=self.mutect2.bam)
        self.output("out", source=self.filterpass.out)
    def add_gatk_variantcaller(self, bam_source):
        """
        Register the germline GATK variant-calling steps and outputs.

        Steps added, in order: ``generate_gatk_intervals`` (conditional on the
        ``gatk_intervals`` input being null), ``bqsr``, ``vc_gatk``,
        ``vc_gatk_merge``, ``vc_gatk_compressvcf``, ``vc_gatk_sort_combined``
        and ``vc_gatk_uncompress``.
        Outputs added: ``out_variants_gatk`` and ``out_variants_gatk_split``.

        :param bam_source: source wired to the ``bam`` input of ``bqsr``
        """
        # VARIANT CALLERS

        supplied_intervals = self.gatk_intervals
        interval_step = self.step(
            "generate_gatk_intervals",
            GenerateIntervalsByChromosome(reference=self.reference),
            when=self.gatk_intervals.is_null(),
        )
        # Supplied intervals are listed ahead of the generated ones.
        intervals = FirstOperator([supplied_intervals, interval_step.out_regions])

        # GATK
        bqsr_tool = GATKBaseRecalBQSRWorkflow_4_1_3(
            bam=bam_source,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
            intervals=intervals,
        )
        self.step(
            "bqsr",
            bqsr_tool,
            scatter="intervals",
            doc="Perform base quality score recalibration",
        )

        germline_caller = GatkGermlineVariantCaller_4_1_3(
            bam=self.bqsr.out,
            intervals=intervals,
            reference=self.reference,
            snps_dbsnp=self.snps_dbsnp,
        )
        self.step("vc_gatk", germline_caller, scatter=["intervals", "bam"])

        # Gather the per-interval calls, then compress -> sort -> uncompress.
        gather_tool = Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out)
        self.step("vc_gatk_merge", gather_tool)

        compress_tool = BGZipLatest(file=self.vc_gatk_merge.out)
        self.step("vc_gatk_compressvcf", compress_tool)

        compressed_vcf = self.vc_gatk_compressvcf.out.as_type(CompressedVcf)
        self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))

        sorted_vcf = self.vc_gatk_sort_combined.out
        self.step("vc_gatk_uncompress", UncompressArchive(file=sorted_vcf))

        self.output(
            "out_variants_gatk",
            source=self.vc_gatk_sort_combined.out,
            output_folder="variants",
            output_name="gatk",
            doc="Merged variants from the GATK caller",
        )
        self.output(
            "out_variants_gatk_split",
            source=self.vc_gatk.out,
            output_folder=["variants", "gatk"],
            doc="Unmerged variants from the GATK caller (by interval)",
        )
    def add_gatk_variantcaller(self, normal_bam_source, tumor_bam_source):
        """
        Register the somatic GATK variant-calling steps and outputs.

        Reimplemented here because the steps are needed for the combine stage.

        Steps added, in order: ``generate_gatk_intervals`` (only if it is not
        already registered), ``bqsr_normal``, ``bqsr_tumor``, ``vc_gatk``,
        ``vc_gatk_merge``, ``vc_gatk_compress_for_sort``,
        ``vc_gatk_sort_combined`` and ``vc_gatk_uncompressvcf``.
        Outputs added: ``out_variants_gatk`` and ``out_variants_split``.

        :param normal_bam_source: source wired to the ``bam`` input of ``bqsr_normal``
        :param tumor_bam_source: source wired to the ``bam`` input of ``bqsr_tumor``
        """
        # Only register the interval-generation step once; it is conditional
        # on the ``gatk_intervals`` input being null.
        if "generate_gatk_intervals" not in self.step_nodes:
            interval_step = self.step(
                "generate_gatk_intervals",
                GenerateIntervalsByChromosome(reference=self.reference),
                when=self.gatk_intervals.is_null(),
            )
        else:
            interval_step = self.generate_gatk_intervals
        generated_regions = interval_step.out_regions

        # Supplied intervals are listed ahead of the generated ones.
        intervals = FirstOperator([self.gatk_intervals, generated_regions])

        # Inputs shared by both recalibration steps (everything except the bam).
        bqsr_shared = dict(
            reference=self.reference,
            intervals=intervals,
            snps_dbsnp=self.snps_dbsnp,
            snps_1000gp=self.snps_1000gp,
            known_indels=self.known_indels,
            mills_indels=self.mills_indels,
        )
        for bqsr_step_id, bam in (
            ("bqsr_normal", normal_bam_source),
            ("bqsr_tumor", tumor_bam_source),
        ):
            self.step(
                bqsr_step_id,
                GATKBaseRecalBQSRWorkflow_4_1_3(bam=bam, **bqsr_shared),
                scatter="intervals",
            )

        somatic_caller = GatkSomaticVariantCaller_4_1_3(
            normal_bam=self.bqsr_normal.out,
            tumor_bam=self.bqsr_tumor.out,
            normal_name=self.normal_name,
            intervals=intervals,
            reference=self.reference,
            gnomad=self.gnomad,
            panel_of_normals=self.panel_of_normals,
        )
        self.step(
            "vc_gatk",
            somatic_caller,
            scatter=["intervals", "normal_bam", "tumor_bam"],
        )

        # Gather the per-interval calls, then compress -> sort -> uncompress.
        gather_tool = Gatk4GatherVcfs_4_1_3(vcfs=self.vc_gatk.out)
        self.step("vc_gatk_merge", gather_tool)

        compress_tool = BGZipLatest(file=self.vc_gatk_merge.out)
        self.step("vc_gatk_compress_for_sort", compress_tool)

        compressed_vcf = self.vc_gatk_compress_for_sort.out.as_type(CompressedVcf)
        self.step("vc_gatk_sort_combined", BcfToolsSort_1_9(vcf=compressed_vcf))

        sorted_vcf = self.vc_gatk_sort_combined.out
        self.step("vc_gatk_uncompressvcf", UncompressArchive(file=sorted_vcf))

        # VCF
        merged_name = StringFormatter(
            "{tumor_name}--{normal_name}_gatk",
            tumor_name=self.tumor_name,
            normal_name=self.normal_name,
        )
        self.output(
            "out_variants_gatk",
            source=self.vc_gatk_sort_combined.out,
            output_folder=["vcf"],
            output_name=merged_name,
            doc="Merged variants from the GATK caller",
        )
        self.output(
            "out_variants_split",
            source=self.vc_gatk.out,
            output_folder=["vcf", "GATKByInterval"],
            doc="Unmerged variants from the GATK caller (by interval)",
        )
    def constructor(self):
        """
        Declare the inputs, steps and outputs of this Pisces workflow.

        Steps, in order: ``primary_only``, ``index_primary_only_bam``,
        ``gemini_read_preprocessing``, ``pisces``, ``vqr``, ``fixSource``,
        ``sort``, ``normalise``, ``uncompress`` and ``filterpass``.
        Outputs: ``variants`` (``sort.out``), ``out`` (``filterpass.out``) and
        ``out_bam`` (``gemini_read_preprocessing.bam``).
        """
        # Values repeated verbatim across the Pisces tool invocations below.
        pisces_version = "5.3.0.0"
        output_dir = "."

        ## INPUTS
        self.input("bam", BamBai())
        self.input("sample_name", String())

        self.input("reference_folder", Directory())
        self.input("intervals", Bed())

        self.input("gemini_chromosomes", String(optional=True))

        self.input("ploidy", String(optional=True), default="somatic")
        # NOTE(review): ``min_dp`` is declared but not wired to any step in
        # this method - confirm whether that is intentional.
        for threshold_id, threshold_type in (
            ("min_bq", Int),
            ("min_mq", Int),
            ("min_dp", Int),
            ("min_vaf", Float),
            ("vc_min_vq", Int),
            ("noise_level", Int),
            ("vqr_min_vq", Int),
        ):
            self.input(threshold_id, threshold_type(optional=True))
        self.input("pisces_awk_script", File())

        ## STEPS
        primary_view = SamToolsView_1_9(
            sam=self.bam,
            doNotOutputAlignmentsWithBitsSet="0x100",
        )
        self.step("primary_only", primary_view)

        self.step(
            "index_primary_only_bam",
            SamToolsIndex_1_9(bam=self.primary_only.out),
        )

        # NOTE(review): the step node itself is passed as ``inputBam`` (no
        # output selector) - confirm this resolves to the intended output.
        gemini_tool = PiscesGemini_5_3_0_0(
            inputBam=self.index_primary_only_bam,
            referenceFolder=self.reference_folder,
            samtoolsExecutable="samtools",
            chromosomeFilter=self.gemini_chromosomes,
            outputDir=output_dir,
            piscesVersion=pisces_version,
        )
        self.step("gemini_read_preprocessing", gemini_tool)

        # ``min_vaf`` feeds both the frequency threshold and its filter.
        caller_tool = PiscesVariantCaller_5_3_0_0(
            inputBam=self.gemini_read_preprocessing.bam,
            referenceFolder=self.reference_folder,
            outputDir=output_dir,
            intervalBedFile=self.intervals,
            ploidy=self.ploidy,
            minimumBaseQuality=self.min_bq,
            minimumMappingQuality=self.min_mq,
            minimumVariantFrequency=self.min_vaf,
            noiseLevelForQModel=self.noise_level,
            minimumVariantFrequencyFilter=self.min_vaf,
            enableSingleStrandFilter="True",
            outputSBFiles="True",
            callMNVs="False",
            maxMNVLength=1,
            RMxNFilter="5,9,0.35",
            variantQualityFilter=self.vc_min_vq,
            crushVCF="False",
            gVCF="False",
            piscesVersion=pisces_version,
        )
        self.step("pisces", caller_tool)

        recalibration_tool = PiscesVariantQualityRecalibration_5_3_0_0(
            inputVcf=self.pisces.vcf,
            outputDir=output_dir,
            baselineNoise=self.noise_level,
            minVariantQuality=self.vqr_min_vq,
            piscesVersion=pisces_version,
        )
        self.step("vqr", recalibration_tool)

        # The recalibrated VCF is listed ahead of the raw Pisces VCF.
        pisces_vcf = FirstOperator([self.vqr.vcf, self.pisces.vcf])

        self.step(
            "fixSource",
            Awk(script=self.pisces_awk_script, input_files=pisces_vcf),
        )

        sort_tool = BcfToolsSort_1_9(vcf=self.fixSource.out)
        self.step("sort", sort_tool)

        normalise_tool = BcfToolsNorm_1_9(vcf=self.sort.out)
        self.step("normalise", normalise_tool)

        uncompress_tool = UncompressArchive(file=self.normalise.out)
        self.step("uncompress", uncompress_tool)

        pass_filter_tool = VcfToolsvcftools_0_1_16(
            vcf=self.uncompress.out.as_type(Vcf),
            removeFileteredAll=True,
            recode=True,
            recodeINFOAll=True,
        )
        self.step("filterpass", pass_filter_tool)

        ## OUTPUTS
        self.output("variants", source=self.sort.out)
        self.output("out", source=self.filterpass.out)
        self.output("out_bam", source=self.gemini_read_preprocessing.bam)
    def constructor(self):

        # Inputs
        self.input("sample_name", String)
        self.input("fastqs", Array(FastqGzPair))
        self.input("seqrun", String, doc="SeqRun Name (for Vcf2Tsv)")
        self.input("reference", FastaWithDict)
        self.input("region_bed", Bed)
        self.input("region_bed_extended", Bed)
        self.input("region_bed_annotated", Bed)
        self.input("genecoverage_bed", Bed)
        self.input("genome_file", TextFile)
        self.input("panel_name", String)
        self.input("vcfcols", TextFile)
        self.input("black_list", Bed(optional=True))
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)
        self.input("mutalyzer_server", String)
        self.input("pathos_db", String)
        self.input("maxRecordsInRam", Int)
        # tumor only
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # fastqc
        self.step(
            "fastqc", FastQC_0_11_5(reads=self.fastqs, threads=4), scatter="reads"
        )
        # get the overrepresentative sequence from fastqc
        self.step(
            "getfastqc_adapters",
            ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile,),
            scatter="fastqc_datafiles",
        )
        # align and generate sorted index bam
        self.step(
            "align_and_sort",
            BwaAligner(
                fastq=self.fastqs,
                reference=self.reference,
                sample_name=self.sample_name,
                sortsam_tmpDir=".",
                cutadapt_adapter=self.getfastqc_adapters,
                cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
            ),
            scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
        )
        # merge into one bam and markdups
        self.step(
            "merge_and_mark",
            MergeAndMarkBams_4_1_3(
                bams=self.align_and_sort.out,
                sampleName=self.sample_name,
                maxRecordsInRam=self.maxRecordsInRam,
            ),
        )
        # performance: doc
        self.step(
            "annotate_doc",
            AnnotateDepthOfCoverage_0_1_0(
                bam=self.merge_and_mark.out,
                bed=self.region_bed_annotated,
                reference=self.reference,
                sample_name=self.sample_name,
            ),
        )

        # performance
        self.step(
            "performance_summary",
            PerformanceSummaryTargeted_0_1_0(
                bam=self.merge_and_mark.out,
                region_bed=self.region_bed,
                genecoverage_bed=self.genecoverage_bed,
                sample_name=self.sample_name,
                genome_file=self.genome_file,
            ),
        )
        # gridss
        self.step(
            "gridss",
            Gridss_2_6_2(
                bams=self.merge_and_mark.out,
                reference=self.reference,
                blacklist=self.black_list,
                tmpdir=".",
            ),
        )
        # post gridss r for tumor only + tumor only mode
        # self.step("gridss_post_r", GRIDSSProcessOutput(inp=self.gridss.out))
        # gatk bqsr bam
        self.step(
            "bqsr",
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=self.merge_and_mark.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
        )
        # mutect2
        self.step(
            "mutect2",
            GatkSomaticVariantCallerTumorOnlyTargeted(
                bam=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                gnomad=self.gnomad,
                panel_of_normals=self.panel_of_normals,
            ),
        )
        # haplotypecaller to do: take base recal away from the
        self.step(
            "haplotype_caller",
            Gatk4HaplotypeCaller_4_1_3(
                inputRead=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedVcf=self.haplotype_caller.out, reference=self.reference
            ),
        )
        # combine variants
        self.step(
            "combinevariants",
            CombineVariants_0_0_8(
                vcfs=[self.splitnormalisevcf.out, self.mutect2.out],
                type="germline",
                columns=["AD", "DP", "AF", "GT"],
            ),
        )
        self.step("compressvcf", BGZip_1_9(file=self.combinevariants.out))
        self.step("sortvcf", BcfToolsSort_1_9(vcf=self.compressvcf.out))
        self.step("uncompressvcf", UncompressArchive(file=self.sortvcf.out))
        # addbamstats
        self.step(
            "addbamstats",
            AddBamStatsGermline_0_1_0(
                bam=self.merge_and_mark.out,
                vcf=self.uncompressvcf.out,
                reference=self.reference,
            ),
        )
        # Molpath specific processes
        self.step("compressvcf2", BGZip_1_9(file=self.addbamstats.out))
        self.step("tabixvcf", TabixLatest(inp=self.compressvcf2.out))
        self.step(
            "calculate_variant_length",
            VcfLength_1_0_1(vcf=self.tabixvcf.out),
            doc="Add the length column for the output of AddBamStats",
        )

        filter_for_variants = self.input("filter_for_vcfs", str, default="length > 150")
        self.step(
            "filter_variants_1_failed",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out, info_filter=filter_for_variants
            ),
        )
        self.step(
            "filter_variants_1",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out,
                info_filter=filter_for_variants,
                invert=True,  # -v param
            ),
        )

        # Jiaan: copy over from the FRCP, can take the block comment out
        # # This one is the in-house molpath step
        # self.step(
        #     "normalise_vcfs",
        #     NormaliseVcf_1_5_4(
        #         pathos_version=self.pathos_db,
        #         mutalyzer=self.mutalyzer_server,  # mutalyzer="https://vmpr-res-mutalyzer1.unix.petermac.org.au",
        #         rdb=self.pathos_db,  # rdb="pa_uat",
        #         inp=self.filter_variants_1.out,
        #     ),
        # )

        # # repeat remove 150bp variants (workaround for normalise_vcf bug)
        # self.step(
        #     "filter_variants_2_failed",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out, info_filter=filter_for_variants
        #     ),
        # )
        # self.step(
        #     "filter_variants_2",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out,
        #         info_filter=filter_for_variants,
        #         invert=True,  # -v param
        #     ),
        # )

        # self.step(
        #     "convert_to_tsv",
        #     Vcf2Tsv_1_5_4(
        #         pathos_version=self.pathos_db,
        #         inp=self.filter_variants_2.out,
        #         sample=self.sample_name,
        #         columns=self.vcfcols,
        #         seqrun=self.seqrun,
        #     ),
        # )

        # self.step(
        #     "index_with_igvtools", IgvIndexFeature_2_5_3(inp=self.filter_variants_2.out)
        # )

        # output
        self.output("fastq_qc", source=self.fastqc.out, output_folder="QC")

        self.output("markdups_bam", source=self.merge_and_mark.out, output_folder="BAM")

        self.output(
            "doc_out", source=self.annotate_doc.out, output_folder="PERFORMANCE"
        )
        self.output(
            "summary", source=self.performance_summary.out, output_folder="PERFORMANCE"
        )
        self.output(
            "gene_summary",
            source=self.performance_summary.geneFileOut,
            output_folder="PERFORMANCE",
        )
        self.output(
            "region_summary",
            source=self.performance_summary.regionFileOut,
            output_folder="PERFORMANCE",
        )

        self.output("gridss_vcf", source=self.gridss.out, output_folder="SV")
        self.output("gridss_bam", source=self.gridss.assembly, output_folder="SV")

        self.output(
            "haplotypecaller_vcf",
            source=self.haplotype_caller.out,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_bam",
            source=self.haplotype_caller.bam,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_norm",
            source=self.splitnormalisevcf.out,
            output_folder="VCF",
        )
        self.output("mutect2_vcf", source=self.mutect2.variants, output_folder="VCF")
        self.output("mutect2_bam", source=self.mutect2.out_bam, output_folder="VCF")
        self.output("mutect2_norm", source=self.mutect2.out, output_folder="VCF")
        self.output("addbamstats_vcf", source=self.addbamstats.out)
# Example #13
# 0
    def constructor(self):
        """Declare the inputs, steps and outputs of the Pisces workflow.

        Steps are registered in dependency order, each consuming the output
        of an earlier one:

        ``primary_only`` -> ``index_primary_only_bam`` -> ``hygea_realignment``
        -> ``stitcher_read_joining`` -> ``stitcher_sort`` -> ``stitcher_index``
        -> ``pisces`` -> ``vqr`` -> ``fixSource`` -> ``sort`` -> ``normalise``
        -> ``uncompress`` -> ``filterpass``

        Workflow outputs:
            variants: output of the ``sort`` step.
            out: output of the ``filterpass`` step.
            out_bam: output of the ``stitcher_index`` step.
            hygea_options, stitcher_options, pisces_options, vqr_options:
                the ``used_options`` output of the corresponding Pisces step.
        """
        # Every Pisces-suite step is pinned to the same tool version.
        pisces_version = "5.2.10.49"

        ## INPUTS
        self.input("bam", BamBai())
        self.input("sample_name", String())
        self.input("reference_folder", Directory())
        self.input("intervals", Bed())
        self.input("ploidy", String(optional=True), default="somatic")
        self.input("min_bq", Int(optional=True))
        self.input("min_mq", Int(optional=True))
        self.input("min_dp", Int(optional=True), default=100)
        self.input("min_vaf", Float(optional=True))
        self.input("vc_min_vq", Int(optional=True))
        self.input("noise_level", Int(optional=True))
        self.input("vqr_min_vq", Int(optional=True))
        self.input("pisces_awk_script", File())

        ## STEPS
        # 0x100 is the SAM FLAG bit for secondary alignments; excluding it
        # is what makes this step "primary only".
        self.step(
            "primary_only",
            SamToolsView_1_9(
                sam=self.bam,
                doNotOutputAlignmentsWithBitsSet="0x100",
            ),
        )

        self.step(
            "index_primary_only_bam",
            SamToolsIndex_1_9(bam=self.primary_only.out),
        )

        # Reference the indexer's output explicitly (".out"), the same way
        # the "stitcher_index" step is consumed further down, rather than
        # passing the bare step node.
        self.step(
            "hygea_realignment",
            PiscesHygeaRealigner_5_2_10_49(
                inputBam=self.index_primary_only_bam.out,
                outputDir=".",
                referenceFolder=self.reference_folder,
                skipAndRemoveDuplicates="true",
                piscesVersion=pisces_version,
            ),
        )

        self.step(
            "stitcher_read_joining",
            PiscesStitcher_5_2_10_49(
                inputBam=self.hygea_realignment.out,
                outputDir=".",
                sampleName=self.sample_name,
                piscesVersion=pisces_version,
            ),
        )

        # The sorted BAM is named "<sample_name>.bam".
        self.step(
            "stitcher_sort",
            SamToolsSort_1_9(
                bam=self.stitcher_read_joining.out,
                outputFilename=self.sample_name + ".bam",
            ),
        )

        self.step(
            "stitcher_index",
            SamToolsIndex_1_9(bam=self.stitcher_sort.out),
        )

        # min_vaf deliberately feeds both the calling threshold and the
        # frequency filter, as in the original wiring.
        self.step(
            "pisces",
            PiscesVariantCaller_5_2_10_49(
                inputBam=self.stitcher_index.out,
                referenceFolder=self.reference_folder,
                outputDir=".",
                intervalBedFile=self.intervals,
                ploidy=self.ploidy,
                minimumBaseQuality=self.min_bq,
                minimumMappingQuality=self.min_mq,
                minimumVariantFrequency=self.min_vaf,
                minimumCoverage=self.min_dp,
                noiseLevelForQModel=self.noise_level,
                minimumVariantFrequencyFilter=self.min_vaf,
                enableSingleStrandFilter="true",
                callMNVs="false",
                maxMNVLength=1,
                RMxNFilter="5,9,0.35",
                variantQualityFilter=self.vc_min_vq,
                crushVCF="false",
                gVCF="false",
                piscesVersion=pisces_version,
            ),
        )

        self.step(
            "vqr",
            PiscesVariantQualityRecalibration_5_2_10_49(
                inputVcf=self.pisces.vcf,
                outputDir=".",
                baselineNoise=self.noise_level,
                minVariantQuality=self.vqr_min_vq,
                piscesVersion=pisces_version,
            ),
        )

        # NOTE(review): presumably FirstOperator yields the first non-null
        # entry, i.e. the recalibrated VCF when "vqr" produced one and the
        # raw Pisces VCF otherwise -- confirm against the janis docs.
        pisces_vcf = FirstOperator([self.vqr.vcf, self.pisces.vcf])

        self.step(
            "fixSource",
            Awk(script=self.pisces_awk_script, input_files=pisces_vcf),
        )

        self.step("sort", BcfToolsSort_1_9(vcf=self.fixSource.out))

        self.step("normalise", BcfToolsNorm_1_9(vcf=self.sort.out))

        self.step("uncompress", UncompressArchive(file=self.normalise.out))

        # "removeFileteredAll" is kept exactly as spelled in the original
        # call; presumably it is the wrapper's declared keyword -- verify
        # before "correcting" it.
        self.step(
            "filterpass",
            VcfToolsvcftools_0_1_16(
                vcf=self.uncompress.out.as_type(Vcf),
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        ## OUTPUTs
        # "variants" is taken after sorting (before normalisation and
        # filtering); "out" is the result of the final "filterpass" step.
        self.output("variants", source=self.sort.out)

        self.output("out", source=self.filterpass.out)

        self.output("out_bam", source=self.stitcher_index.out)
        ## OPTIONAL OUTPUTs
        self.output(
            "hygea_options",
            source=self.hygea_realignment.used_options,
        )
        self.output(
            "stitcher_options",
            source=self.stitcher_read_joining.used_options,
        )
        self.output("pisces_options", source=self.pisces.used_options)
        self.output("vqr_options", source=self.vqr.used_options)
# Example #14
# 0
    def constructor(self):
        """Declare the inputs, steps and outputs of the Varscan2 workflow.

        Steps are registered in dependency order, each consuming the output
        of the previous one:

        ``mpileup`` -> ``varscan2cns`` -> ``VSheader`` -> ``VSheaderContigs``
        -> ``sortVcf`` -> ``normaliseVcf`` -> ``uncompress`` -> ``filterpass``

        Workflow outputs:
            variants: output of the ``sortVcf`` step.
            out: output of the ``filterpass`` step.
        """
        ## INPUTs
        self.input("sample_name", String())
        self.input("reference", Fasta())
        # For mpileup
        self.input("bam", BamBai())
        self.input("pileup_max_depth", Int(optional=True))
        # min_bq is only passed to the "varscan2cns" step below
        # (minBaseQuality); it is not forwarded to mpileup.
        # NOTE(review): confirm that mpileup is meant to run with its own
        # default base-quality threshold.
        self.input("min_bq", Int(optional=True), default=20)
        # For Varscan2
        self.input("min_dp", Int(optional=True))
        self.input("min_ad", Int(optional=True))
        self.input("min_vaf", Float(optional=True))
        self.input("pval", Float(optional=True), default=0.0001)
        # For correction of VCF header
        # Declared with an instance, like every other input in this method.
        self.input("header_lines", File())

        ## STEPS
        # The pileup is written to "./<sample_name>.mpileup".
        self.step(
            "mpileup",
            SamToolsMpileup_1_9(
                bam=self.bam,
                outputFilename="./" + self.sample_name + ".mpileup",
                noBAQ=True,
                maxDepth=self.pileup_max_depth,
                reference=self.reference,
            ),
        )
        self.step(
            "varscan2cns",
            VarscanMpileup2cns_2_4_2(
                mpileup=self.mpileup.out,
                minCoverage=self.min_dp,
                minVariantReads=self.min_ad,
                minBaseQuality=self.min_bq,
                minVariantFrequency=self.min_vaf,
                pValue=self.pval,
                outputVcfFormat=1,
                variantPositionsOnly=1,
            ),
        )

        # The re-headered VCF is written to "./<sample_name>VS.reheader.vcf"
        # (no separator between the sample name and "VS", as in the
        # original).
        self.step(
            "VSheader",
            VarscanHeader(
                inputVcf=self.varscan2cns.out,
                outputVcf="./" + self.sample_name + "VS.reheader.vcf",
            ),
        )

        # Annotate the VCF using the user-supplied header lines file.
        self.step(
            "VSheaderContigs",
            BcfToolsAnnotate_1_5(
                vcf=self.VSheader.out,
                headerLines=self.header_lines,
            ),
        )

        self.step("sortVcf", BcfToolsSort_1_9(vcf=self.VSheaderContigs.out))

        self.step("normaliseVcf", BcfToolsNorm_1_9(vcf=self.sortVcf.out))

        self.step("uncompress", UncompressArchive(file=self.normaliseVcf.out))

        # "removeFileteredAll" is kept exactly as spelled in the original
        # call; presumably it is the wrapper's declared keyword -- verify
        # before "correcting" it.
        self.step(
            "filterpass",
            VcfToolsvcftools_0_1_16(
                vcf=self.uncompress.out.as_type(Vcf),
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        ## OUTPUTS
        # "variants" is taken after sorting (before normalisation and
        # filtering); "out" is the result of the final "filterpass" step.
        self.output("variants", source=self.sortVcf.out)
        self.output("out", source=self.filterpass.out)