def constructor(self):
        """Declare the inputs, steps and outputs of the manta/strelka workflow.

        The ``manta`` step is declared first; its ``candidateSmallIndels``
        output is handed to the ``strelka`` step as ``indelCandidates``. Each
        of the two strelka outputs used here (``snvs`` and ``indels``) is then
        passed through ``BcfToolsNorm`` followed by ``BcfToolsIndex``.
        """
        # Alignment inputs: the normal and the tumor sample share one type.
        for bam_input in ("normalBam", "tumorBam"):
            self.input(bam_input, CramCrai)

        # Reference plus the optional settings shared by both callers.
        self.input("reference", FastaFai)
        self.input("callRegions", BedTabix(optional=True))
        self.input("exome", Boolean(optional=True), default=False)
        self.input("configStrelka", File(optional=True))

        # Manta receives the normal sample as ``bam`` and the tumor sample
        # as ``tumorBam``.
        manta_tool = Manta(
            bam=self.normalBam,
            tumorBam=self.tumorBam,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
        )
        self.step("manta", manta_tool)

        # Strelka re-uses the same inputs and additionally consumes the
        # candidate indels produced by the manta step.
        strelka_tool = Strelka(
            indelCandidates=self.manta.candidateSmallIndels,
            normalBam=self.normalBam,
            tumorBam=self.tumorBam,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
            config=self.configStrelka,
        )
        self.step("strelka", strelka_tool)

        # SNVs: normalise, then index.
        snv_norm = BcfToolsNorm(vcf=self.strelka.snvs, reference=self.reference)
        self.step("normaliseSNVs", snv_norm)
        snv_index = BcfToolsIndex(vcf=self.normaliseSNVs.out)
        self.step("indexSNVs", snv_index)

        # Indels: same two-step treatment.
        indel_norm = BcfToolsNorm(
            vcf=self.strelka.indels,
            reference=self.reference,
        )
        self.step("normaliseINDELs", indel_norm)
        indel_index = BcfToolsIndex(vcf=self.normaliseINDELs.out)
        self.step("indexINDELs", indel_index)

        # Workflow outputs: three come straight from manta, two from the
        # index steps.
        self.output("diploid", source=self.manta.diploidSV)
        self.output("candIndels", source=self.manta.candidateSmallIndels)
        self.output("indels", source=self.indexINDELs.out)
        self.output("snvs", source=self.indexSNVs.out)
        self.output("somaticSVs", source=self.manta.somaticSVs)
    def constructor(self):
        """Declare the workflow built around the ``strelka2pass`` step.

        The alignment input type comes from ``self.getStrelka2InputType()``
        and the caller class from ``self.getStrelka2Tool()``. The ``snvs`` and
        ``indels`` outputs of the caller are each passed through
        ``BcfToolsNorm`` and ``BcfToolsIndex`` before being exposed as
        workflow outputs.
        """
        # Normal and tumor alignments use whatever type the getter returns.
        for bam_input in ("normalBam", "tumorBam"):
            self.input(bam_input, self.getStrelka2InputType())

        self.input("reference", FastaFai)
        self.input("callRegions", BedTabix(optional=True))
        self.input("exome", Boolean(optional=True), default=False)
        self.input("configStrelka", File(optional=True))

        # VCF arrays fed to the caller. A separate "strelkaIndels" input was
        # tried as the source for ``indelCandidates`` but is currently
        # disabled.
        self.input("indelCandidates", Array(VcfTabix))
        self.input("strelkaSNVs", Array(VcfTabix))

        strelka_cls = self.getStrelka2Tool()
        second_pass = strelka_cls(
            indelCandidates=self.indelCandidates,
            # The SNV array is passed to the ``forcedgt`` input.
            forcedgt=self.strelkaSNVs,
            normalBam=self.normalBam,
            tumorBam=self.tumorBam,
            reference=self.reference,
            callRegions=self.callRegions,
            exome=self.exome,
            config=self.configStrelka,
        )
        self.step("strelka2pass", second_pass)

        # SNVs: normalise, then index.
        snv_norm = BcfToolsNorm(
            vcf=self.strelka2pass.snvs,
            reference=self.reference,
        )
        self.step("normaliseSNVs", snv_norm)
        self.step("indexSNVs", BcfToolsIndex(vcf=self.normaliseSNVs.out))

        # Indels: normalise, then index.
        indel_norm = BcfToolsNorm(
            vcf=self.strelka2pass.indels,
            reference=self.reference,
        )
        self.step("normaliseINDELs", indel_norm)
        self.step("indexINDELs", BcfToolsIndex(vcf=self.normaliseINDELs.out))

        self.output("indels", source=self.indexINDELs.out)
        self.output("snvs", source=self.indexSNVs.out)
# Example #3
# 0
    def constructor(self):
        """Declare the FreeBayes-based workflow: inputs, steps and output.

        Steps in declaration order: ``createCallRegions`` -> ``callVariants``
        (scattered on ``region``) -> ``callSomatic`` (scattered on ``vcf``)
        -> ``combineRegions`` -> ``sortSomatic1`` -> ``normalizeSomatic1``
        -> ``allelicPrimitves`` -> ``fixSplitLines`` -> ``sortSomatic2``
        -> ``normalizeSomatic2`` -> ``uniqueAlleles`` -> ``sortFinal``
        -> ``uniqVcf`` -> ``compressFinal`` -> ``indexFinal``.
        """
        self.input("bams", Array(CramCrai))

        self.input("reference", FastaFai)
        self.input("regionSize", int, default=10000000)

        self.input("normalSample", String)
        # NOTE(review): no step in this constructor consumes sampleNames.
        self.input("sampleNames", Array(String, optional=True))

        # Author's note: this threshold is awkward to choose because the
        # coverage of all supplied bams is summed at each location, so the
        # value has to scale with the number of bams
        # (roughly nBams * per-bam coverage limit = skipCov).
        self.input("skipCov", Int(optional=True), default=500)

        # The same consideration applies to the minimum coverage.
        self.input("minCov", Int(optional=True), default=10)

        # Ideally conditional: use supplied call regions when present and
        # only create them otherwise. For now they are always created.
        region_builder = CreateCallRegions(
            reference=self.reference,
            regionSize=self.regionSize,
            equalize=True,
        )
        self.step("createCallRegions", region_builder)

        freebayes_options = dict(
            bams=self.bams,
            reference=self.reference,
            pooledDiscreteFlag=True,
            gtQuals=True,
            strictFlag=True,
            pooledContinousFlag=True,
            reportMaxGLFlag=True,
            noABPriorsFlag=True,
            maxNumOfAlleles=4,
            noPartObsFlag=True,
            region=self.createCallRegions.regions,
            skipCov=self.skipCov,
            # Author's note: these are the tool defaults, spelled out because
            # janis does not recognise them yet.
            useDupFlag=False,
            minBaseQual=1,
            minSupMQsum=0,
            minSupQsum=0,
            minCov=self.minCov,
            # Detection limits. The fraction is kept very low so that all
            # sites are included in a potential analysis.
            minAltFrac=0.01,
            # Require at least one sample with two high quality variants or
            # several lower quality ones.
            minAltQSum=70,
            # Require at least two supporting reads overall, regardless of
            # whether they come from one sample or from several.
            minAltTotal=2,
        )
        # One FreeBayes job per generated region.
        self.step(
            "callVariants",
            FreeBayes(**freebayes_options),
            scatter="region",
        )

        # Possible future rework: an earlier draft combined, compressed and
        # indexed the raw calls here; instead everything stays scattered and
        # only the somatic calls are combined below.

        # Somatic calling is scattered over the per-region VCFs as well.
        somatic_caller = CallSomaticFreeBayes(
            vcf=self.callVariants.out,
            normalSampleName=self.normalSample,
        )
        self.step("callSomatic", somatic_caller, scatter="vcf")

        self.step("combineRegions", VcfCombine(vcf=self.callSomatic.out))

        # Should not be necessary here, but sorting keeps us on the safe side.
        first_sort = VcfStreamSort(
            vcf=self.combineRegions.out,
            inMemoryFlag=True,
        )
        self.step("sortSomatic1", first_sort)

        # Written as plain VCF: compressing here could cause trouble because
        # there would be no index for the allelic-primitives step.
        first_norm = BcfToolsNorm(
            vcf=self.sortSomatic1.out,
            reference=self.reference,
            outputType="v",
            outputFilename="normalised.vcf",
        )
        self.step("normalizeSomatic1", first_norm)

        primitives = VcfAllelicPrimitives(
            vcf=self.normalizeSomatic1.out,
            tagParsed="DECOMPOSED",
            keepGenoFlag=True,
        )
        self.step("allelicPrimitves", primitives)

        self.step("fixSplitLines", VcfFixUp(vcf=self.allelicPrimitves.out))

        second_sort = VcfStreamSort(
            vcf=self.fixSplitLines.out,
            inMemoryFlag=True,
        )
        self.step("sortSomatic2", second_sort)

        second_norm = BcfToolsNorm(
            vcf=self.sortSomatic2.out,
            reference=self.reference,
            outputType="v",
            outputFilename="normalised.vcf",
        )
        self.step("normalizeSomatic2", second_norm)

        unique_alleles = VcfUniqAlleles(vcf=self.normalizeSomatic2.out)
        self.step("uniqueAlleles", unique_alleles)

        final_sort = VcfStreamSort(
            vcf=self.uniqueAlleles.out,
            inMemoryFlag=True,
        )
        self.step("sortFinal", final_sort)

        self.step("uniqVcf", VcfUniq(vcf=self.sortFinal.out))

        # Compress, then index the final VCF.
        self.step("compressFinal", BGZip(file=self.uniqVcf.out))
        self.step("indexFinal", Tabix(inp=self.compressFinal.out))

        # The step itself (not a named output of it) is given as the source.
        self.output("somaticOutVcf", source=self.indexFinal)
# Example #4
# 0
    def constructor(self):
        """Declare the FreeBayes-based workflow: inputs, steps and output.

        The bam input type comes from ``self.getFreebayesInputType()`` and the
        caller class from ``self.getFreebayesTool()``.

        Steps in declaration order: ``createCallRegions`` -> ``callVariants``
        (scattered on ``region``) -> ``callSomatic`` (scattered on ``vcf``)
        -> ``combineRegions`` -> ``sortSomatic1`` -> ``normalizeSomatic1``
        -> ``allelicPrimitves`` -> ``fixSplitLines`` -> ``sortSomatic2``
        -> ``normalizeSomatic2`` -> ``uniqueAlleles``
        -> ``fixUpFreeBayesMNPs``.
        """
        # Input documentation, kept together so the declarations stay short.
        bams_doc = "All bams to be analysed. Samples can be split over multiple bams as well as multiple samples can be contained in one bam as long as the sample ids are set properly."
        reference_doc = "The reference the bams were aligned to, with a fai index."
        region_size_doc = "the size of the regions, to parallelise the analysis over. This needs to be adjusted if there are lots of samples or very high depth sequencing in the analysis."
        normal_sample_doc = "The sample id of the normal sample, as it is specified in the bam header."
        skip_cov_doc = "The depth per sample, at which the variant calling process will skip a region. This is used to ignore regions with mapping issues, like the centromeres as well as heterochromatin. A good value is 3 times the maximum expected coverage."
        min_cov_doc = "Minimum coverage over all samples, to still call variants."

        self.input("bams", Array(self.getFreebayesInputType()), doc=bams_doc)
        self.input("reference", FastaFai, doc=reference_doc)
        self.input("regionSize", int, default=10000000, doc=region_size_doc)
        self.input("normalSample", String, doc=normal_sample_doc)

        # Maximum coverage per sample; it is multiplied by the number of
        # input bams when handed to the caller below.
        self.input("skipCov", Int(optional=True), default=500, doc=skip_cov_doc)

        # Minimum coverage threshold, passed to the caller unchanged.
        self.input("minCov", Int(optional=True), default=10, doc=min_cov_doc)

        # Could become conditional: use supplied call regions when present
        # and only create them otherwise. For now they are always created.
        region_builder = CreateCallRegions(
            reference=self.reference,
            regionSize=self.regionSize,
            equalize=True,
        )
        self.step("createCallRegions", region_builder)

        freebayes_cls = self.getFreebayesTool()
        variant_caller = freebayes_cls(
            bams=self.bams,
            reference=self.reference,
            pooledDiscreteFlag=True,
            gtQuals=True,
            strictFlag=True,
            pooledContinousFlag=True,
            reportMaxGLFlag=True,
            noABPriorsFlag=True,
            maxNumOfAlleles=4,
            noPartObsFlag=True,
            region=self.createCallRegions.regions,
            # Scale the per-sample skipCov by the number of supplied bams.
            skipCov=(self.skipCov * self.bams.length()),
            # Author's note: these are the tool defaults, spelled out because
            # janis does not recognise them yet.
            useDupFlag=False,
            minBaseQual=1,
            minSupMQsum=0,
            minSupQsum=0,
            minCov=self.minCov,
            # Detection limits. The fraction is kept very low so that all
            # sites are included in a potential analysis.
            minAltFrac=0.01,
            # Require at least one sample with two high quality variants or
            # several lower quality ones.
            minAltQSum=70,
            # Require at least two supporting reads overall, regardless of
            # whether they come from one sample or from several.
            minAltTotal=2,
        )
        # One caller job per generated region.
        self.step("callVariants", variant_caller, scatter="region")

        # Possible future rework: an earlier draft combined, compressed and
        # indexed the raw calls here; instead everything stays scattered and
        # only the somatic calls are combined below.

        # Somatic calling is scattered over the per-region VCFs as well.
        somatic_caller = CallSomaticFreeBayes(
            vcf=self.callVariants.out,
            normalSampleName=self.normalSample,
        )
        self.step("callSomatic", somatic_caller, scatter="vcf")

        self.step("combineRegions", VcfCombine(vcf=self.callSomatic.out))

        # Should not be necessary here, but sorting keeps us on the safe side.
        first_sort = VcfStreamSort(
            vcf=self.combineRegions.out,
            inMemoryFlag=True,
        )
        self.step("sortSomatic1", first_sort)

        # Written as plain VCF: compressing here could cause trouble because
        # there would be no index for the allelic-primitives step.
        first_norm = BcfToolsNorm(
            vcf=self.sortSomatic1.out,
            reference=self.reference,
            outputType="v",
            outputFilename="normalised.vcf",
        )
        self.step("normalizeSomatic1", first_norm)

        primitives = VcfAllelicPrimitives(
            vcf=self.normalizeSomatic1.out,
            tagParsed="DECOMPOSED",
            keepGenoFlag=True,
        )
        self.step("allelicPrimitves", primitives)

        self.step("fixSplitLines", VcfFixUp(vcf=self.allelicPrimitves.out))

        second_sort = VcfStreamSort(
            vcf=self.fixSplitLines.out,
            inMemoryFlag=True,
        )
        self.step("sortSomatic2", second_sort)

        second_norm = BcfToolsNorm(
            vcf=self.sortSomatic2.out,
            reference=self.reference,
            outputType="v",
            outputFilename="normalised.vcf",
        )
        self.step("normalizeSomatic2", second_norm)

        unique_alleles = VcfUniqAlleles(vcf=self.normalizeSomatic2.out)
        self.step("uniqueAlleles", unique_alleles)

        mnp_fixer = FixUpFreeBayesMNPs(
            vcf=self.uniqueAlleles.out,
        )
        self.step("fixUpFreeBayesMNPs", mnp_fixer)

        # The step itself (not a named output of it) is given as the source.
        self.output("somaticOutVcf", source=self.fixUpFreeBayesMNPs)
    def constructor(self):
        """Declare the Mutect2-based workflow: inputs, steps and output.

        The bam input type comes from ``self.getMutect2InputType()``, the
        caller class from ``self.getMutect2Tool()`` and the pileup class from
        ``self.getPileUpTool()``.

        The ``mutect2`` step is scattered over the intervals produced by
        ``createCallRegions``. Its outputs feed three branches (``concat`` /
        ``indexUnfiltered``, ``learn`` and ``mergeMutect2``) which, together
        with ``pileup`` / ``contamination``, supply the ``filtering`` step.
        The filtered calls go through ``normalise`` and ``indexFiltered``.
        """
        # Normal and tumor bams are separate inputs: the normal sample may
        # consist of several bams, and some tools only work on the tumor bams.
        self.input(
            "normalBams",
            Array(self.getMutect2InputType()),
            doc="The bams that make up the normal sample. Generally Mutect will expect one bam per sample, but as long as the sample ids in the bam header are set appropriatly, multiple bams per sample will work",
        )
        self.input(
            "tumorBams",
            Array(self.getMutect2InputType()),
            doc="The bams that contain the tumour samples. Generally Mutect will expect one bam per sample, but as long as the sample ids in the bam header are set appropriatly, multiple bams per sample will work",
        )

        # Name of the normal sample; it has to match the name in the bams.
        self.input(
            "normalName",
            String,
            doc="The sample id of the normal sample. This id will be used to distingiush reads from this sample from all other samples. This id needs to tbe the one set in the bam header",
        )

        self.input(
            "biallelicSites",
            VcfTabix,
            doc="A vcf of common biallalic sites from a population. This will be used to estimate sample contamination.",
        )

        self.input(
            "reference",
            FastaWithDict,
            doc="A fasta and dict indexed reference, which needs to be the reference, the bams were aligned to.",
        )

        self.input(
            "regionSize",
            int,
            default=10000000,
            doc="The size of the regions over which to parallelise the analysis. This should be adjusted, if there are lots of samples or a very high sequencing depth. default: 10M bp",
        )

        self.input(
            "panelOfNormals",
            VcfTabix,
            doc="The panel of normals, which summarises the technical and biological sites of errors. Its usually a good idea to generate this for your own cohort, but GATK suggests around 30 normals, so their panel is usually a good idea.",
        )

        self.input(
            "germlineResource",
            VcfTabix,
            doc="Vcf of germline variants. GATK provides this as well, but it can easily substituted with the newst gnomad etc vcf.",
        )

        region_builder = CreateCallRegions(
            reference=self.reference,
            regionSize=self.regionSize,
            equalize=True,
        )
        self.step("createCallRegions", region_builder)

        # One caller job per generated interval.
        mutect2_cls = self.getMutect2Tool()
        caller = mutect2_cls(
            tumorBams=self.tumorBams,
            normalBams=self.normalBams,
            normalSample=self.normalName,
            intervals=self.createCallRegions.regions,
            reference=self.reference,
            panelOfNormals=self.panelOfNormals,
            germlineResource=self.germlineResource,
        )
        self.step("mutect2", caller, scatter="intervals")

        # Gather the scattered calls and index the combined file.
        self.step("concat", BcfToolsConcat(vcf=self.mutect2.out))
        self.step("indexUnfiltered", BcfToolsIndex(vcf=self.concat.out))

        # Gather the scattered side outputs of the caller.
        orientation_model = LearnReadOrientationModel(
            f1r2CountsFiles=self.mutect2.f1f2r_out,
        )
        self.step("learn", orientation_model)

        merged_stats = MergeMutectStats(statsFiles=self.mutect2.stats)
        self.step("mergeMutect2", merged_stats)

        # Pileup over the tumor bams; the biallelic sites VCF is supplied as
        # both ``sites`` and ``intervals``.
        pileup_cls = self.getPileUpTool()
        pileup_tool = pileup_cls(
            bam=self.tumorBams,
            sites=self.biallelicSites,
            intervals=self.biallelicSites,
            reference=self.reference,
        )
        self.step("pileup", pileup_tool)

        contamination_tool = CalculateContamination(
            pileupTable=self.pileup.out,
        )
        self.step("contamination", contamination_tool)

        # Filtering draws on every branch declared above.
        call_filter = FilterMutectCalls(
            vcf=self.indexUnfiltered.out,
            reference=self.reference,
            segmentationFile=self.contamination.segOut,
            contaminationTable=self.contamination.contOut,
            readOrientationModel=self.learn.out,
            statsFile=self.mergeMutect2.out,
        )
        self.step("filtering", call_filter)

        normaliser = BcfToolsNorm(
            vcf=self.filtering.out,
            reference=self.reference,
        )
        self.step("normalise", normaliser)
        self.step("indexFiltered", BcfToolsIndex(vcf=self.normalise.out))

        self.output("out", source=self.indexFiltered.out)
    def constructor(self):
        """Declare the Mutect2-based workflow: inputs, steps and output.

        The ``mutect2`` step is scattered over the intervals produced by
        ``createCallRegions``. Its outputs feed three branches (``concat`` /
        ``indexUnfiltered``, ``learn`` and ``mergeMutect2``) which, together
        with ``pileup`` / ``contamination``, supply the ``filtering`` step.
        The filtered calls go through ``normalise`` and ``indexFiltered``.
        """
        # Normal and tumor bams are separate inputs: the normal sample may
        # consist of several bams, and some tools only work on the tumor bams.
        for bam_input in ("normalBams", "tumorBams"):
            self.input(bam_input, Array(BamBai))

        # Name of the normal sample; it has to match the name in the bams.
        self.input("normalName", String)

        self.input("biallelicSites", VcfTabix)
        self.input("reference", FastaWithDict)
        self.input("regionSize", int, default=10000000)
        self.input("panelOfNormals", VcfTabix)
        self.input("germlineResource", VcfTabix)

        region_builder = CreateCallRegions(
            reference=self.reference,
            regionSize=self.regionSize,
            equalize=True,
        )
        self.step("createCallRegions", region_builder)

        # One Mutect2 job per generated interval.
        caller = Mutect2(
            tumorBams=self.tumorBams,
            normalBams=self.normalBams,
            normalSample=self.normalName,
            intervals=self.createCallRegions.regions,
            reference=self.reference,
            panelOfNormals=self.panelOfNormals,
            germlineResource=self.germlineResource,
        )
        self.step("mutect2", caller, scatter="intervals")

        # Gather the scattered calls and index the combined file.
        self.step("concat", BcfToolsConcat(vcf=self.mutect2.out))
        self.step("indexUnfiltered", BcfToolsIndex(vcf=self.concat.out))

        # Gather the scattered side outputs of the caller.
        orientation_model = LearnReadOrientationModel(
            f1r2CountsFiles=self.mutect2.f1f2r_out,
        )
        self.step("learn", orientation_model)

        merged_stats = MergeMutectStats(statsFiles=self.mutect2.stats)
        self.step("mergeMutect2", merged_stats)

        # Pileup over the tumor bams; the biallelic sites VCF is supplied as
        # both ``sites`` and ``intervals``.
        pileup_tool = GetPileUpSummaries(
            bam=self.tumorBams,
            sites=self.biallelicSites,
            intervals=self.biallelicSites,
            reference=self.reference,
        )
        self.step("pileup", pileup_tool)

        contamination_tool = CalculateContamination(
            pileupTable=self.pileup.out,
        )
        self.step("contamination", contamination_tool)

        # Filtering draws on every branch declared above.
        call_filter = FilterMutectCalls(
            vcf=self.indexUnfiltered.out,
            reference=self.reference,
            segmentationFile=self.contamination.segOut,
            contaminationTable=self.contamination.contOut,
            readOrientationModel=self.learn.out,
            statsFile=self.mergeMutect2.out,
        )
        self.step("filtering", call_filter)

        normaliser = BcfToolsNorm(
            vcf=self.filtering.out,
            reference=self.reference,
        )
        self.step("normalise", normaliser)
        self.step("indexFiltered", BcfToolsIndex(vcf=self.normalise.out))

        self.output("out", source=self.indexFiltered.out)