def constructor(self):
        """Declare the inputs, steps and output of this tumor/normal workflow.

        Each BAM is run through BaseRecalibrator and ApplyBQSR, the two
        ApplyBQSR outputs are handed to Mutect2, and the Mutect2 VCF is passed
        through SplitMultiAllele, whose result is published as ``out``.
        """
        samples = ("normal", "tumor")
        known_site_inputs = (
            "snps_dbsnp",
            "snps_1000gp",
            "known_indels",
            "mills_indels",
        )

        # Aligned reads first, then the sample names, in that declaration order.
        for sample in samples:
            self.input(sample + "_bam", BamBai)
        for sample in samples:
            self.input(sample + "_name", str)

        intervals_doc = (
            "This optional intervals file supports processing by regions. If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec"
        )
        self.input("intervals", Bed(optional=True), doc=intervals_doc)
        self.input("reference", FastaWithDict)

        # Resources later passed to the BaseRecalibrator steps as knownSites.
        for resource in known_site_inputs:
            self.input(resource, VcfTabix)

        # The recalibration steps are registered with no inputs supplied
        # (ignore_missing=True); their inputs are assigned in the loop below.
        for sample in samples:
            self.step(
                "base_recalibrator_" + sample,
                gatk4.Gatk4BaseRecalibrator_4_1_3(),
                ignore_missing=True,
            )
        for sample in samples:
            self.step(
                "apply_bqsr_" + sample,
                gatk4.Gatk4ApplyBqsr_4_1_3(),
                ignore_missing=True,
            )

        # Pair every BAM with its own BaseRecalibrator / ApplyBQSR step.
        wiring = [
            (
                getattr(self, sample + "_bam"),
                getattr(self, "base_recalibrator_" + sample),
                getattr(self, "apply_bqsr_" + sample),
            )
            for sample in samples
        ]
        for source_bam, recal_step, bqsr_step in wiring:
            recal_step["bam"] = source_bam
            recal_step["intervals"] = self.intervals
            recal_step["reference"] = self.reference
            recal_step["knownSites"] = [
                getattr(self, resource) for resource in known_site_inputs
            ]

            # ApplyBQSR consumes the recalibration output plus the same BAM.
            bqsr_step["recalFile"] = recal_step.out
            bqsr_step["bam"] = source_bam
            bqsr_step["intervals"] = self.intervals
            bqsr_step["reference"] = self.reference

        # NOTE: the ``tumor_name`` input is declared above but is not passed to
        # Mutect2 here; only the normal sample name is supplied.
        mutect2_tool = gatk4.GatkMutect2_4_1_3(
            normalBams=self.apply_bqsr_normal.out,
            tumorBams=self.apply_bqsr_tumor.out,
            normalSample=self.normal_name,
            intervals=self.intervals,
            reference=self.reference,
        )
        self.step("mutect2", mutect2_tool)

        split_tool = SplitMultiAllele(
            reference=self.reference,
            vcf=self.mutect2.out,
        )
        self.step("split_multi_allele", split_tool)

        self.output("out", source=self.split_multi_allele.out)
    def constructor(self):
        """Declare the inputs, steps and outputs of this tumor/normal Mutect2 workflow.

        Both BAMs are passed through ``process_subpipeline`` and the results
        feed Mutect2. The Mutect2 calls are filtered by FilterMutectCalls using
        the learned read-orientation model together with the contamination and
        segmentation tables, then uncompressed, split with SplitMultiAllele and
        run through vcftools.

        Outputs: ``variants`` (FilterMutectCalls result), ``out_bam`` (the
        Mutect2 ``bam`` output) and ``out`` (the vcftools result).
        """
        samples = ("normal", "tumor")

        # ---- inputs -------------------------------------------------------
        for sample in samples:
            self.input(sample + "_bam", BamBai)
        self.input("normal_name", String(optional=True))

        intervals_doc = (
            "This optional intervals file supports processing by regions. If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec"
        )
        self.input("intervals", Bed(optional=True), doc=intervals_doc)
        self.input("reference", FastaWithDict)
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # ---- per-sample BAM sub-pipeline ----------------------------------
        for sample in samples:
            split_tool = self.process_subpipeline(
                bam=getattr(self, sample + "_bam"),
                intervals=self.intervals,
            )
            self.step(sample + "_split_bam", split_tool)

        # ---- variant calling and read-orientation model -------------------
        normal_bams = [self.normal_split_bam.out]
        tumor_bams = [self.tumor_split_bam.out]
        mutect2_tool = gatk4.GatkMutect2_4_1_3(
            normalBams=normal_bams,
            tumorBams=tumor_bams,
            normalSample=self.normal_name,
            intervals=self.intervals,
            reference=self.reference,
            germlineResource=self.gnomad,
            panelOfNormals=self.panel_of_normals,
        )
        self.step("mutect2", mutect2_tool)

        orientation_tool = gatk4.Gatk4LearnReadOrientationModelLatest(
            f1r2CountsFiles=self.mutect2.f1f2r_out
        )
        self.step("learnorientationmodel", orientation_tool)

        # ---- contamination and segmentation -------------------------------
        pileup_tool = gatk4.Gatk4GetPileUpSummariesLatest(
            bam=self.tumor_split_bam.out,
            sites=self.gnomad,
            intervals=self.intervals,
        )
        self.step("getpileupsummaries", pileup_tool)

        contamination_tool = gatk4.Gatk4CalculateContaminationLatest(
            pileupTable=self.getpileupsummaries.out
        )
        self.step("calculatecontamination", contamination_tool)

        filter_calls_tool = gatk4.Gatk4FilterMutectCallsLatest(
            vcf=self.mutect2.out,
            reference=self.reference,
            segmentationFile=self.calculatecontamination.segOut,
            contaminationTable=self.calculatecontamination.contOut,
            readOrientationModel=self.learnorientationmodel.out,
            statsFile=self.mutect2.stats,
        )
        self.step("filtermutect2calls", filter_calls_tool)

        # ---- normalise and keep "PASS" variants ---------------------------
        uncompress_tool = UncompressArchive(file=self.filtermutect2calls.out)
        self.step("uncompressvcf", uncompress_tool)

        split_tool = SplitMultiAllele(
            vcf=self.uncompressvcf.out,
            reference=self.reference,
        )
        self.step("splitnormalisevcf", split_tool)

        # ``removeFileteredAll`` is the keyword exactly as this code passes it
        # to the vcftools wrapper; it is left verbatim.
        pass_filter_tool = VcfToolsvcftoolsLatest(
            vcf=self.splitnormalisevcf.out,
            removeFileteredAll=True,
            recode=True,
            recodeINFOAll=True,
        )
        self.step("filterpass", pass_filter_tool)

        # ---- outputs ------------------------------------------------------
        self.output("variants", source=self.filtermutect2calls.out)
        self.output("out_bam", source=self.mutect2.bam)
        self.output("out", source=self.filterpass.out)
Пример #3
0
    def constructor(self):
        """Declare the inputs, steps and outputs of this tumor/normal Mutect2 workflow.

        Both BAMs are passed through ``process_subpipeline`` with the reference
        and the known-sites resources; the results feed Mutect2. The Mutect2
        calls are filtered by FilterMutectCalls using the learned
        read-orientation model together with the contamination and segmentation
        tables, then normalised, filtered with vcftools and indexed with tabix.

        The regions file is declared as the ``intervals`` input, so every step
        reads it through ``self.intervals``. No ``targeted_bed`` input or step
        is declared in this method.

        Outputs: ``variants`` (the Mutect2 ``out`` output) and ``out`` (the
        tabix result).
        """
        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)

        self.input("normal_name", str)
        self.input("tumor_name", str)

        # NOTE(review): the doc text mentions the file resolving to null, yet
        # the input is declared as a non-optional ``Bed()`` — confirm which of
        # the two is intended.
        self.input(
            "intervals",
            Bed(),
            doc="If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )

        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)
        self.input("gnomad", VcfTabix)

        # base calibration for normal and tumor bam
        self.step(
            "normal",
            self.process_subpipeline(
                bam=self.normal_bam,
                intervals=self.intervals,
                reference=self.reference,
                known_sites=[
                    self.snps_dbsnp,
                    self.snps_1000gp,
                    self.known_indels,
                    self.mills_indels,
                ],
            ),
        )
        self.step(
            "tumor",
            self.process_subpipeline(
                bam=self.tumor_bam,
                intervals=self.intervals,
                reference=self.reference,
                known_sites=[
                    self.snps_dbsnp,
                    self.snps_1000gp,
                    self.known_indels,
                    self.mills_indels,
                ],
            ),
        )

        # variant calling + learn read orientation model
        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_1_3(
                normalBams=self.normal.out,
                tumorBams=self.tumor.out,
                normalSample=self.normal_name,
                intervals=self.intervals,
                reference=self.reference,
                germlineResource=self.gnomad,
            ),
        )
        self.step(
            "learnorientationmodel",
            gatk4.Gatk4LearnReadOrientationModelLatest(
                f1r2CountsFiles=self.mutect2.f1f2r_out,
            ),
        )

        # calculate contamination and segmentation
        # (the pileup step reads the ``tumor_bam`` input directly, not the
        # output of the ``tumor`` step)
        self.step(
            "getpileupsummaries",
            gatk4.Gatk4GetPileUpSummariesLatest(
                bam=self.tumor_bam,
                sites=self.gnomad,
                intervals=self.intervals,
            ),
        )
        self.step(
            "calculatecontamination",
            gatk4.Gatk4CalculateContaminationLatest(
                pileupTable=self.getpileupsummaries.out,
                segmentationFileOut="tumor_segmentation.mutect2_segments",
            ),
        )
        self.step(
            "filtermutect2calls",
            gatk4.Gatk4FilterMutectCallsLatest(
                vcf=self.mutect2.out,
                reference=self.reference,
                segmentationFile=self.calculatecontamination.segOut,
                contaminationTable=self.calculatecontamination.contOut,
                readOrientationModel=self.learnorientationmodel.out,
                statsFile=self.mutect2.stats,
            ),
        )

        # normalise and filter "PASS" variants
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedTabixVcf=self.filtermutect2calls.out,
                reference=self.reference,
            ),
        )

        # ``removeFileteredAll`` is the keyword exactly as this code passes it
        # to the vcftools wrapper; it is left verbatim.
        self.step(
            "filterpass",
            VcfToolsvcftoolsLatest(
                compressedVcf=self.splitnormalisevcf.out,
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        self.step("tabixvcf", TabixLatest(inp=self.filterpass.out))

        self.output("variants", source=self.mutect2.out)
        self.output("out", source=self.tabixvcf.out)