def constructor(self): self.input("normalBam", CramCrai) self.input("tumorBam", CramCrai) self.input("reference", FastaFai) self.input("callRegions", BedTabix(optional=True)) self.input("exome", Boolean(optional=True), default=False) self.input("configStrelka", File(optional=True)) self.step( "manta", Manta( bam=self.normalBam, tumorBam=self.tumorBam, reference=self.reference, callRegions=self.callRegions, exome=self.exome, ), ) self.step( "strelka", Strelka( indelCandidates=self.manta.candidateSmallIndels, normalBam=self.normalBam, tumorBam=self.tumorBam, reference=self.reference, callRegions=self.callRegions, exome=self.exome, config=self.configStrelka, ), ) self.step( "normaliseSNVs", BcfToolsNorm(vcf=self.strelka.snvs, reference=self.reference), ) self.step("indexSNVs", BcfToolsIndex(vcf=self.normaliseSNVs.out)) self.step( "normaliseINDELs", BcfToolsNorm(vcf=self.strelka.indels, reference=self.reference), ) self.step("indexINDELs", BcfToolsIndex(vcf=self.normaliseINDELs.out)) self.output("diploid", source=self.manta.diploidSV) self.output("candIndels", source=self.manta.candidateSmallIndels) self.output("indels", source=self.indexINDELs.out) self.output("snvs", source=self.indexSNVs.out) self.output("somaticSVs", source=self.manta.somaticSVs)
def constructor(self): self.input("normalBam", self.getStrelka2InputType()) self.input("tumorBam", self.getStrelka2InputType()) self.input("reference", FastaFai) self.input("callRegions", BedTabix(optional=True)) self.input("exome", Boolean(optional=True), default=False) self.input("configStrelka", File(optional=True)) self.input("indelCandidates", Array(VcfTabix)) self.input("strelkaSNVs", Array(VcfTabix)) # self.input("strelkaIndels", Array(VcfTabix)) self.step( "strelka2pass", self.getStrelka2Tool()( indelCandidates=self.indelCandidates, # indelCandidates=self.strelkaIndels, forcedgt=self.strelkaSNVs, normalBam=self.normalBam, tumorBam=self.tumorBam, reference=self.reference, callRegions=self.callRegions, exome=self.exome, config=self.configStrelka, ), ) self.step( "normaliseSNVs", BcfToolsNorm(vcf=self.strelka2pass.snvs, reference=self.reference), ) self.step("indexSNVs", BcfToolsIndex(vcf=self.normaliseSNVs.out)) self.step( "normaliseINDELs", BcfToolsNorm(vcf=self.strelka2pass.indels, reference=self.reference), ) self.step("indexINDELs", BcfToolsIndex(vcf=self.normaliseINDELs.out)) self.output("indels", source=self.indexINDELs.out) self.output("snvs", source=self.indexSNVs.out)
def constructor(self): self.input("bams", Array(CramCrai)) self.input("reference", FastaFai) self.input("regionSize", int, default=10000000) self.input("normalSample", String) self.input("sampleNames", Array(String, optional=True)) # for the moment this is a bit wonky, because you need to specify something which is # affected by the amount of bams that you specify (bam coverage just gets summed up at this # location) # so the formula at the moment would be nBams * coverage = skipCov # which means for 8 bams with an average coverage of 160 you would probably want # 8 * 400 = 1600 to be on the save side self.input("skipCov", Int(optional=True), default=500) # the same is true for min cov self.input("minCov", Int(optional=True), default=10) # this should be a conditional (if the callregions are supplied we use them, otherwise we # create them) self.step( "createCallRegions", CreateCallRegions(reference=self.reference, regionSize=self.regionSize, equalize=True), ) self.step( "callVariants", FreeBayes( bams=self.bams, reference=self.reference, pooledDiscreteFlag=True, gtQuals=True, strictFlag=True, pooledContinousFlag=True, reportMaxGLFlag=True, noABPriorsFlag=True, maxNumOfAlleles=4, noPartObsFlag=True, region=self.createCallRegions.regions, skipCov=self.skipCov, # things that are actually default, but janis does not recognize yet useDupFlag=False, minBaseQual=1, minSupMQsum=0, minSupQsum=0, minCov=self.minCov, # now here we are trying to play with the detection limits # we set the fraction to be very low, to include ALL of the sites in a potential analysis minAltFrac=0.01, # and we want at least one sample that has two high quality variants OR multiple # lower quality ones minAltQSum=70, # but we also want to have at least two reads overall with that variants # we do not care if they are between samples or if they are in the same sample, but # 2 is better than one minAltTotal=2, ), scatter="region", ) # might actually rewrite this once everything works, to not combine the files here, but do # all of it scattered and then only combine the final output # self.step("combineRegions", VcfCombine(vcf=self.callVariants.out)) # # self.step("compressAll", BGZip(file=self.sortAll.out)) # self.step("indexAll", Tabix(file=self.compressAll.out)) self.step( "callSomatic", CallSomaticFreeBayes(vcf=self.callVariants.out, normalSampleName=self.normalSample), # added for parallel scatter="vcf", ) self.step("combineRegions", VcfCombine(vcf=self.callSomatic.out)) # should not be necessary here, but just to be save self.step( "sortSomatic1", VcfStreamSort(vcf=self.combineRegions.out, inMemoryFlag=True), ) # no need to compress this here if it leads to problems when we dont have an index for the allelic allelicPrimitves self.step( "normalizeSomatic1", BcfToolsNorm( vcf=self.sortSomatic1.out, reference=self.reference, outputType="v", outputFilename="normalised.vcf", ), ) self.step( "allelicPrimitves", VcfAllelicPrimitives( vcf=self.normalizeSomatic1.out, tagParsed="DECOMPOSED", keepGenoFlag=True, ), ) self.step("fixSplitLines", VcfFixUp(vcf=self.allelicPrimitves.out)) self.step("sortSomatic2", VcfStreamSort(vcf=self.fixSplitLines.out, inMemoryFlag=True)) self.step( "normalizeSomatic2", BcfToolsNorm( vcf=self.sortSomatic2.out, reference=self.reference, outputType="v", outputFilename="normalised.vcf", ), ) self.step("uniqueAlleles", VcfUniqAlleles(vcf=self.normalizeSomatic2.out)) self.step("sortFinal", VcfStreamSort(vcf=self.uniqueAlleles.out, inMemoryFlag=True)) self.step("uniqVcf", VcfUniq(vcf=self.sortFinal.out)) 
self.step("compressFinal", BGZip(file=self.uniqVcf.out)) self.step("indexFinal", Tabix(inp=self.compressFinal.out)) self.output("somaticOutVcf", source=self.indexFinal)
def constructor(self): self.input( "bams", Array(self.getFreebayesInputType()), doc="All bams to be analysed. Samples can be split over multiple bams as well as multiple samples can be contained in one bam as long as the sample ids are set properly.", ) self.input( "reference", FastaFai, doc="The reference the bams were aligned to, with a fai index.", ) self.input( "regionSize", int, default=10000000, doc="the size of the regions, to parallelise the analysis over. This needs to be adjusted if there are lots of samples or very high depth sequencing in the analysis.", ) self.input( "normalSample", String, doc="The sample id of the normal sample, as it is specified in the bam header.", ) # this is the coverage per sample that is the max we will analyse. It will automatically # multiplied by the amount of input bams we get self.input( "skipCov", Int(optional=True), default=500, doc="The depth per sample, at which the variant calling process will skip a region. This is used to ignore regions with mapping issues, like the centromeres as well as heterochromatin. A good value is 3 times the maximum expected coverage.", ) # the same is true for min cov self.input( "minCov", Int(optional=True), default=10, doc="Minimum coverage over all samples, to still call variants.", ) # this could be a conditional (if the callregions are supplied we use them, otherwise we # create them) self.step( "createCallRegions", CreateCallRegions( reference=self.reference, regionSize=self.regionSize, equalize=True ), ) self.step( "callVariants", self.getFreebayesTool()( bams=self.bams, reference=self.reference, pooledDiscreteFlag=True, gtQuals=True, strictFlag=True, pooledContinousFlag=True, reportMaxGLFlag=True, noABPriorsFlag=True, maxNumOfAlleles=4, noPartObsFlag=True, region=self.createCallRegions.regions, # here we multiply the skipCov input by the amount of input that we have skipCov=(self.skipCov * self.bams.length()), # things that are actually default, but janis does not recognize yet useDupFlag=False, minBaseQual=1, minSupMQsum=0, minSupQsum=0, minCov=self.minCov, # now here we are trying to play with the detection limits # we set the fraction to be very low, to include ALL of the sites in a potential analysis minAltFrac=0.01, # and we want at least one sample that has two high quality variants OR multiple # lower quality ones minAltQSum=70, # but we also want to have at least two reads overall with that variants # we do not care if they are between samples or if they are in the same sample, but # 2 is better than one minAltTotal=2, ), scatter="region", ) # might actually rewrite this once everything works, to not combine the files here, but do # all of it scattered and then only combine the final output # self.step("combineRegions", VcfCombine(vcf=self.callVariants.out)) # # self.step("compressAll", BGZip(file=self.sortAll.out)) # self.step("indexAll", Tabix(file=self.compressAll.out)) self.step( "callSomatic", CallSomaticFreeBayes( vcf=self.callVariants.out, normalSampleName=self.normalSample ), # added for parallel scatter="vcf", ) self.step("combineRegions", VcfCombine(vcf=self.callSomatic.out)) # should not be necessary here, but just to be save self.step( "sortSomatic1", VcfStreamSort(vcf=self.combineRegions.out, inMemoryFlag=True), ) # no need to compress this here if it leads to problems when we dont have an index for the allelic allelicPrimitves self.step( "normalizeSomatic1", BcfToolsNorm( vcf=self.sortSomatic1.out, reference=self.reference, outputType="v", outputFilename="normalised.vcf", ), ) 
self.step( "allelicPrimitves", VcfAllelicPrimitives( vcf=self.normalizeSomatic1.out, tagParsed="DECOMPOSED", keepGenoFlag=True, ), ) self.step("fixSplitLines", VcfFixUp(vcf=self.allelicPrimitves.out)) self.step( "sortSomatic2", VcfStreamSort(vcf=self.fixSplitLines.out, inMemoryFlag=True) ) self.step( "normalizeSomatic2", BcfToolsNorm( vcf=self.sortSomatic2.out, reference=self.reference, outputType="v", outputFilename="normalised.vcf", ), ) self.step("uniqueAlleles", VcfUniqAlleles(vcf=self.normalizeSomatic2.out)) self.step( "fixUpFreeBayesMNPs", FixUpFreeBayesMNPs( vcf=self.uniqueAlleles.out, ), ) self.output("somaticOutVcf", source=self.fixUpFreeBayesMNPs)
def constructor(self):
    # we have to split the bams into the ones of the normal sample (can be multiple) and the
    # tumor ones, because some tools only work with the tumor bams
    self.input(
        "normalBams",
        Array(self.getMutect2InputType()),
        doc="The bams that make up the normal sample. Generally Mutect will expect one bam per sample, but as long as the sample ids in the bam header are set appropriately, multiple bams per sample will work.",
    )
    self.input(
        "tumorBams",
        Array(self.getMutect2InputType()),
        doc="The bams that contain the tumour samples. Generally Mutect will expect one bam per sample, but as long as the sample ids in the bam header are set appropriately, multiple bams per sample will work.",
    )
    # we also need the name of the normal sample (needs to be the name in the bams as well)
    self.input(
        "normalName",
        String,
        doc="The sample id of the normal sample. This id will be used to distinguish reads from this sample from all other samples. This id needs to be the one set in the bam header.",
    )
    self.input(
        "biallelicSites",
        VcfTabix,
        doc="A vcf of common biallelic sites from a population. This will be used to estimate sample contamination.",
    )
    self.input(
        "reference",
        FastaWithDict,
        doc="A fasta and dict indexed reference, which needs to be the reference the bams were aligned to.",
    )
    self.input(
        "regionSize",
        int,
        default=10000000,
        doc="The size of the regions over which to parallelise the analysis. This should be adjusted if there are lots of samples or a very high sequencing depth. Default: 10 Mbp.",
    )
    self.input(
        "panelOfNormals",
        VcfTabix,
        doc="The panel of normals, which summarises the technical and biological sites of errors. It is usually a good idea to generate this for your own cohort, but GATK suggests around 30 normals, so their panel is usually a good choice.",
    )
    self.input(
        "germlineResource",
        VcfTabix,
        doc="Vcf of germline variants. GATK provides this as well, but it can easily be substituted with the newest gnomAD etc. vcf.",
    )

    self.step(
        "createCallRegions",
        CreateCallRegions(
            reference=self.reference,
            regionSize=self.regionSize,
            equalize=True,
        ),
    )

    self.step(
        "mutect2",
        self.getMutect2Tool()(
            tumorBams=self.tumorBams,
            normalBams=self.normalBams,
            normalSample=self.normalName,
            intervals=self.createCallRegions.regions,
            reference=self.reference,
            panelOfNormals=self.panelOfNormals,
            germlineResource=self.germlineResource,
        ),
        scatter="intervals",
    )
    self.step("concat", BcfToolsConcat(vcf=self.mutect2.out))
    self.step("indexUnfiltered", BcfToolsIndex(vcf=self.concat.out))

    self.step(
        "learn", LearnReadOrientationModel(f1r2CountsFiles=self.mutect2.f1f2r_out)
    )
    self.step("mergeMutect2", MergeMutectStats(statsFiles=self.mutect2.stats))

    self.step(
        "pileup",
        self.getPileUpTool()(
            bam=self.tumorBams,
            sites=self.biallelicSites,
            intervals=self.biallelicSites,
            reference=self.reference,
        ),
    )
    self.step("contamination", CalculateContamination(pileupTable=self.pileup.out))

    self.step(
        "filtering",
        FilterMutectCalls(
            vcf=self.indexUnfiltered.out,
            reference=self.reference,
            segmentationFile=self.contamination.segOut,
            contaminationTable=self.contamination.contOut,
            readOrientationModel=self.learn.out,
            statsFile=self.mergeMutect2.out,
        ),
    )

    self.step(
        "normalise", BcfToolsNorm(vcf=self.filtering.out, reference=self.reference)
    )
    self.step("indexFiltered", BcfToolsIndex(vcf=self.normalise.out))

    self.output("out", source=self.indexFiltered.out)
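# Illustrative only: a conceptual, plain-Python sketch of the scatter pattern used above.
# CreateCallRegions (with equalize=True) splits the reference into chunks of at most roughly
# `regionSize`, Mutect2 runs once per chunk (scatter="intervals") and BcfToolsConcat gathers
# the per-region VCFs. This helper is a hypothetical stand-in, not the actual CreateCallRegions
# implementation.
import math

def equalized_regions(contig: str, contig_length: int, region_size: int = 10_000_000):
    """Split one contig into equally sized regions, each no larger than region_size."""
    n_chunks = max(1, math.ceil(contig_length / region_size))
    chunk = math.ceil(contig_length / n_chunks)
    return [
        (contig, start, min(start + chunk, contig_length))
        for start in range(0, contig_length, chunk)
    ]

# e.g. a ~64.4 Mb contig becomes 7 regions of ~9.2 Mb instead of 6 x 10 Mb plus a 4.4 Mb remainder
regions = equalized_regions("chr20", 64_444_167)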
def constructor(self):
    # we have to split the bams into the ones of the normal sample (can be multiple) and the
    # tumor ones, because some tools only work with the tumor bams
    self.input("normalBams", Array(BamBai))
    self.input("tumorBams", Array(BamBai))
    # we also need the name of the normal sample (needs to be the name in the bams as well)
    self.input("normalName", String)
    self.input("biallelicSites", VcfTabix)
    self.input("reference", FastaWithDict)
    self.input("regionSize", int, default=10000000)
    self.input("panelOfNormals", VcfTabix)
    self.input("germlineResource", VcfTabix)

    self.step(
        "createCallRegions",
        CreateCallRegions(
            reference=self.reference, regionSize=self.regionSize, equalize=True
        ),
    )

    self.step(
        "mutect2",
        Mutect2(
            tumorBams=self.tumorBams,
            normalBams=self.normalBams,
            normalSample=self.normalName,
            intervals=self.createCallRegions.regions,
            reference=self.reference,
            panelOfNormals=self.panelOfNormals,
            germlineResource=self.germlineResource,
        ),
        scatter="intervals",
    )
    self.step("concat", BcfToolsConcat(vcf=self.mutect2.out))
    self.step("indexUnfiltered", BcfToolsIndex(vcf=self.concat.out))

    self.step(
        "learn", LearnReadOrientationModel(f1r2CountsFiles=self.mutect2.f1f2r_out)
    )
    self.step("mergeMutect2", MergeMutectStats(statsFiles=self.mutect2.stats))

    self.step(
        "pileup",
        GetPileUpSummaries(
            bam=self.tumorBams,
            sites=self.biallelicSites,
            intervals=self.biallelicSites,
            reference=self.reference,
        ),
    )
    self.step("contamination", CalculateContamination(pileupTable=self.pileup.out))

    self.step(
        "filtering",
        FilterMutectCalls(
            vcf=self.indexUnfiltered.out,
            reference=self.reference,
            segmentationFile=self.contamination.segOut,
            contaminationTable=self.contamination.contOut,
            readOrientationModel=self.learn.out,
            statsFile=self.mergeMutect2.out,
        ),
    )

    self.step(
        "normalise", BcfToolsNorm(vcf=self.filtering.out, reference=self.reference)
    )
    self.step("indexFiltered", BcfToolsIndex(vcf=self.normalise.out))

    self.output("out", source=self.indexFiltered.out)
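# Illustrative only: the kind of inputs this constructor expects, written as the inputs
# dictionary one might supply when running the translated workflow. Paths and resource names
# are hypothetical placeholders; secondary files (.bai, .dict, .fai, .tbi) are expected alongside.
example_mutect2_inputs = {
    "normalBams": ["normal_L001.bam", "normal_L002.bam"],  # one sample, possibly several bams
    "tumorBams": ["tumour.bam"],
    "normalName": "NORMAL_SAMPLE_ID",            # must match the SM tag in the normal bam header
    "biallelicSites": "common_biallelic.vcf.gz", # population sites for the contamination estimate
    "reference": "GRCh38.fasta",                 # FastaWithDict
    "regionSize": 10000000,
    "panelOfNormals": "pon.vcf.gz",
    "germlineResource": "af-only-gnomad.vcf.gz", # germline allele frequencies (e.g. gnomAD)
}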