def constructor(self):

        self.input("bam", BamBai)
        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)

        self.step(
            "split_bam",
            gatk4.Gatk4SplitReads_4_1_3(bam=self.bam,
                                        intervals=self.intervals),
        )

        self.step(
            "haplotype_caller",
            gatk4.Gatk4HaplotypeCaller_4_1_3(
                inputRead=self.split_bam.out,
                intervals=self.intervals,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step("uncompressvcf",
                  UncompressArchive(file=self.haplotype_caller.out))
        self.step(
            "splitnormalisevcf",
            SplitMultiAllele(vcf=self.uncompressvcf.out,
                             reference=self.reference),
        )

        self.output("variants", source=self.haplotype_caller.out)
        self.output("out_bam", source=self.haplotype_caller.bam)
        self.output("out", source=self.splitnormalisevcf.out)
Example #2
 def inputs(self):
     return [
         *self.additional_inputs,
         ToolInput(
             "inputABed",
             Bed(),
             prefix="-a",
             doc="input file a: only bed is supported. May be followed with multiple databases and/or  wildcard (*) character(s). ",
         ),
         ToolInput(
             "inputBBam",
             Bam(),
             prefix="-b",
             doc="input file b: only bam is supported.",
         ),
         ToolInput(
             "histogram",
             Boolean(optional=True),
             prefix="-hist",
             doc="Report a histogram of coverage for each feature in A as well as a summary histogram for _all_ features in A. Output (tab delimited) after each feature in A: 1) depth 2) # bases at depth 3) size of A 4) % of A at depth.",
         ),
         ToolInput(
             "depth",
             Boolean(optional=True),
             prefix="-d",
             doc="Report the depth at each position in each A feature. Positions reported are one based.  Each position and depth follow the complete A feature.",
         ),
         ToolInput(
             "counts",
             Boolean(optional=True),
             prefix="-counts",
             doc="Only report the count of overlaps, don't compute fraction, etc.",
         ),
         ToolInput(
             "mean",
             Boolean(optional=True),
             prefix="-mean",
             doc="Report the mean depth of all positions in each A feature.",
         ),
     ]
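
A tool wrapper built around the inputs above would normally also declare a base command and capture stdout as its output, since bedtools writes its coverage report to standard output. A minimal hedged sketch; the command name, output tag, and Stdout(TextFile) type are assumptions rather than part of the original:

def base_command(self):
    return ["coverageBed"]  # assumed; equivalent to the modern "bedtools coverage" invocation

def outputs(self):
    # capture the tab-delimited report the tool writes to stdout
    return [ToolOutput("out", Stdout(TextFile), doc="Coverage report (tab-delimited)")]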
Example #3
 def inputs(self):
     return [
         *super(Gatk4HaplotypeCallerBase, self).inputs(),
         *Gatk4HaplotypeCallerBase.optional_args,
         ToolInput(
             "inputRead",
             BamBai(),
             doc="BAM/SAM/CRAM file containing reads",
             prefix="--input",
             secondaries_present_as={".bai": "^.bai"},
         ),
         ToolInput(
             "reference",
             FastaWithDict(),
             position=5,
             prefix="--reference",
             doc="Reference sequence file",
         ),
         ToolInput(
             "outputFilename",
             Filename(extension=".vcf.gz"),
             position=8,
             prefix="--output",
             doc="File to which variants should be written",
         ),
         ToolInput(
             "dbsnp",
             VcfTabix(),
             position=7,
             prefix="--dbsnp",
             doc="(Also: -D) A dbSNP VCF file.",
         ),
         ToolInput(
             "intervals",
             Bed(optional=True),
             prefix="--intervals",
             doc=
             "-L (BASE) One or more genomic intervals over which to operate",
         ),
     ]
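
A brief note on the secondaries_present_as={".bai": "^.bai"} mapping used above: janis stages the BAM index alongside the BAM, and a leading caret in the pattern replaces the primary file's extension instead of appending to it. A tiny illustrative sketch (file names are hypothetical):

bam_on_disk = "sample.bam"
index_default = bam_on_disk + ".bai"                      # ".bai"  -> "sample.bam.bai" (appended)
index_presented = bam_on_disk.rsplit(".", 1)[0] + ".bai"  # "^.bai" -> "sample.bai" (extension replaced)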
    def constructor(self):

        self.input("bam", BamBai)
        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)

        self.step(
            "base_recalibrator",
            Gatk4BaseRecalibrator_4_1_2(
                bam=self.bam,
                intervals=self.intervals,
                reference=self.reference,
                knownSites=[
                    self.snps_dbsnp,
                    self.snps_1000gp,
                    self.known_indels,
                    self.mills_indels,
                ],
            ),
        )
        self.step(
            "apply_bqsr",
            Gatk4ApplyBqsr_4_1_2(
                bam=self.bam,
                intervals=self.intervals,
                recalFile=self.base_recalibrator.out,
                reference=self.reference,
            ),
        )
        self.output("out", source=self.apply_bqsr.out)
Example #5
 def inputs(self):
     return [
         *super(Gatk4ApplyBqsrBase, self).inputs(),
         ToolInput(
             "bam",
             BamBai(),
             prefix="-I",
             doc="The SAM/BAM/CRAM file containing reads.",
             secondaries_present_as={".bai": "^.bai"},
             position=10,
         ),
         ToolInput("reference",
                   FastaWithDict(),
                   prefix="-R",
                   doc="Reference sequence"),
         ToolInput(
             "outputFilename",
             Filename(
                 prefix=InputSelector("bam"),
                 suffix=".recalibrated",
                 extension=".bam",
             ),
             prefix="-O",
             doc="Write output to this file",
         ),
         ToolInput(
             "recalFile",
             Tsv(optional=True),
             prefix="--bqsr-recal-file",
             doc="Input recalibration table for BQSR",
         ),
         ToolInput(
             "intervals",
             Bed(optional=True),
             prefix="--intervals",
             doc=
             "-L (BASE) One or more genomic intervals over which to operate",
         ),
         *self.additional_args,
     ]
Example #6
 def inputs(self) -> List[ToolInput]:
     return [
         ToolInput("tumorBam", BamBai(), doc="The indexed BAM file"),
         ToolInput("normalBam", BamBai(), doc="The indexed BAM file"),
         ToolInput("intervals", Bed(), position=2, shell_quote=False),
         ToolInput(
             "reference",
             FastaFai(),
             prefix="-G",
             position=1,
             shell_quote=False,
             doc="The reference fasta. Should be indexed (.fai). "
             "Defaults to: /ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa",
         ),
         ToolInput(
             "tumorName",
             String(),
             doc="The sample name to be used directly.  Will overwrite -n option",
         ),
         ToolInput(
             "normalName",
             String(),
             doc="The normal sample name to use with the -b option",
         ),
         ToolInput(
             "alleleFreqThreshold",
             Float(optional=True),
             doc="The threshold for allele frequency, default: 0.05 or 5%",
         ),
         ToolInput(
             "outputFilename",
             Filename(extension=".vcf", suffix=".vardict"),
             prefix=">",
             position=6,
             shell_quote=False,
         ),
         *VarDictSomaticBase.vardict_inputs,
         *VarDictSomaticBase.var2vcf_inputs,
     ]
Example #7
 def inputs(self):
     return [
         *super().inputs(),
         *Gatk4GetPileUpSummariesBase.additional_args,
         ToolInput(
             "bam",
             Array(BamBai()),
             prefix="-I",
             prefix_applies_to_all_elements=True,
             doc="The SAM/BAM/CRAM file containing reads.",
             position=0,
         ),
         ToolInput(
             "sites",
             VcfTabix(),
             prefix="-V",
             doc="sites of common biallelic variants",
         ),
         ToolInput(
             "intervals",
             Bed(optional=True),
             prefix="--intervals",
             doc=
             "-L (BASE) One or more genomic intervals over which to operate",
         ),
         ToolInput("pileupTableOut",
                   Filename(extension=".txt"),
                   position=1,
                   prefix="-O"),
         ToolInput(
             "reference",
             FastaWithDict(optional=True),
             prefix="-R",
             doc="reference to use when decoding CRAMS",
         ),
     ]
    def constructor(self):
        ## INPUTS
        self.input("bam", BamBai())
        self.input("sample_name", String())

        self.input("reference_folder", Directory())
        self.input("intervals", Bed())

        self.input("gemini_chromosomes", String(optional=True))

        self.input("ploidy", String(optional=True), default="somatic")
        self.input("min_bq", Int(optional=True))
        self.input("min_mq", Int(optional=True))
        self.input("min_dp", Int(optional=True))
        self.input("min_vaf", Float(optional=True))
        self.input("vc_min_vq", Int(optional=True))
        self.input("noise_level", Int(optional=True))
        self.input("vqr_min_vq", Int(optional=True))
        self.input("pisces_awk_script", File())

        ## STEPS
        self.step(
            "primary_only",
            SamToolsView_1_9(sam=self.bam,
                             doNotOutputAlignmentsWithBitsSet="0x100"),
        )

        self.step(
            "index_primary_only_bam",
            SamToolsIndex_1_9(bam=self.primary_only.out),
        )

        self.step(
            "gemini_read_preprocessing",
            PiscesGemini_5_3_0_0(
                inputBam=self.index_primary_only_bam,
                referenceFolder=self.reference_folder,
                samtoolsExecutable="samtools",
                chromosomeFilter=self.gemini_chromosomes,
                outputDir=".",
                piscesVersion="5.3.0.0",
            ),
        )

        self.step(
            "pisces",
            PiscesVariantCaller_5_3_0_0(
                inputBam=self.gemini_read_preprocessing.bam,
                referenceFolder=self.reference_folder,
                outputDir=".",
                intervalBedFile=self.intervals,
                ploidy=self.ploidy,
                minimumBaseQuality=self.min_bq,
                minimumMappingQuality=self.min_mq,
                minimumVariantFrequency=self.min_vaf,
                noiseLevelForQModel=self.noise_level,
                minimumVariantFrequencyFilter=self.min_vaf,
                enableSingleStrandFilter="True",
                outputSBFiles="True",
                callMNVs="False",
                maxMNVLength=1,
                RMxNFilter="5,9,0.35",
                variantQualityFilter=self.vc_min_vq,
                crushVCF="False",
                gVCF="False",
                piscesVersion="5.3.0.0",
            ),
        )

        self.step(
            "vqr",
            PiscesVariantQualityRecalibration_5_3_0_0(
                inputVcf=self.pisces.vcf,
                outputDir=".",
                baselineNoise=self.noise_level,
                minVariantQuality=self.vqr_min_vq,
                piscesVersion="5.3.0.0",
            ),
        )

        piscesVcf = FirstOperator([self.vqr.vcf, self.pisces.vcf])

        self.step(
            "fixSource",
            Awk(script=self.pisces_awk_script, input_files=piscesVcf),
        )

        self.step("sort", BcfToolsSort_1_9(vcf=self.fixSource.out))

        self.step("normalise", BcfToolsNorm_1_9(vcf=self.sort.out))

        self.step("uncompress", UncompressArchive(file=self.normalise.out))

        self.step(
            "filterpass",
            VcfToolsvcftools_0_1_16(
                vcf=self.uncompress.out.as_type(Vcf),
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        ## OUTPUTS
        self.output("variants", source=self.sort.out)

        self.output("out", source=self.filterpass.out)

        self.output("out_bam", source=self.gemini_read_preprocessing.bam)
Example #9
    def constructor(self):

        self.input("bam", BamBai)
        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)

        # self.step(
        #     "split_bam",
        #     gatk4.Gatk4SplitReads_4_0(bam=self.bam, intervals=self.intervals),
        # )
        self.step(
            "base_recalibrator",
            gatk4.Gatk4BaseRecalibrator_4_0(
                bam=self.bam,
                intervals=self.intervals,
                reference=self.reference,
                knownSites=[
                    self.snps_dbsnp,
                    self.snps_1000gp,
                    self.known_indels,
                    self.mills_indels,
                ],
            ),
        )
        self.step(
            "apply_bqsr",
            gatk4.Gatk4ApplyBqsr_4_0(
                bam=self.bam,
                intervals=self.intervals,
                recalFile=self.base_recalibrator.out,
                reference=self.reference,
            ),
        )
        self.step(
            "haplotype_caller",
            gatk4.Gatk4HaplotypeCaller_4_0(
                inputRead=self.apply_bqsr,
                intervals=self.intervals,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedVcf=self.haplotype_caller.out,
                reference=self.reference),
        )
        self.step(
            "addbamstats",
            AddBamStatsGermline_0_1_0(bam=self.bam,
                                      vcf=self.splitnormalisevcf.out,
                                      reference=self.reference),
        )

        self.output("variants", source=self.haplotype_caller.out)
        self.output("out_bam", source=self.haplotype_caller.bam)
        self.output("out", source=self.addbamstats.out)
Example #10
    def constructor(self):

        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)

        self.input("normal_name", str)
        self.input("tumor_name", str)

        self.input(
            "intervals",
            Bed(),
            doc="If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )

        self.input("reference", FastaWithDict)
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)
        self.input("gnomad", VcfTabix)

        # # base calibration for normal and tumor bam
        self.step(
            "normal",
            self.process_subpipeline(
                bam=self.normal_bam,
                intervals=self.intervals,
                reference=self.reference,
                known_sites=[
                    self.snps_dbsnp,
                    self.snps_1000gp,
                    self.known_indels,
                    self.mills_indels,
                ],
            ),
        )
        self.step(
            "tumor",
            self.process_subpipeline(
                bam=self.tumor_bam,
                intervals=self.intervals,
                reference=self.reference,
                known_sites=[
                    self.snps_dbsnp,
                    self.snps_1000gp,
                    self.known_indels,
                    self.mills_indels,
                ],
            ),
        )

        # variant calling + learn read orientation model
        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_1_3(
                normalBams=self.normal.out,
                tumorBams=self.tumor.out,
                normalSample=self.normal_name,
                intervals=self.intervals,
                reference=self.reference,
                germlineResource=self.gnomad,
            ),
        )
        self.step(
            "learnorientationmodel",
            gatk4.Gatk4LearnReadOrientationModelLatest(
                f1r2CountsFiles=self.mutect2.f1f2r_out,
            ),
        )

        # calculate contamination and segmentation
        self.step(
            "getpileupsummaries",
            gatk4.Gatk4GetPileUpSummariesLatest(bam=self.tumor_bam,
                                                sites=self.gnomad,
                                                intervals=self.intervals),
        )
        self.step(
            "calculatecontamination",
            gatk4.Gatk4CalculateContaminationLatest(
                pileupTable=self.getpileupsummaries.out,
                segmentationFileOut="tumor_segmentation.mutect2_segments",
            ),
        )
        self.step(
            "filtermutect2calls",
            gatk4.Gatk4FilterMutectCallsLatest(
                vcf=self.mutect2.out,
                reference=self.reference,
                segmentationFile=self.calculatecontamination.segOut,
                contaminationTable=self.calculatecontamination.contOut,
                readOrientationModel=self.learnorientationmodel.out,
                statsFile=self.mutect2.stats,
            ),
        )

        # normalise and filter "PASS" variants
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedTabixVcf=self.filtermutect2calls.out,
                reference=self.reference),
        )

        self.step(
            "filterpass",
            VcfToolsvcftoolsLatest(
                compressedVcf=self.splitnormalisevcf.out,
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        self.step("tabixvcf", TabixLatest(inp=self.filterpass.out))

        self.output("variants", source=self.mutect2.out)
        self.output("out", source=self.tabixvcf.out)
    def constructor(self):

        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)

        self.input("normal_name", str)
        self.input("tumor_name", str)

        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)

        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)

        self.step(
            "base_recalibrator_normal",
            gatk4.Gatk4BaseRecalibrator_4_0(),
            ignore_missing=True,
        )
        self.step(
            "base_recalibrator_tumor",
            gatk4.Gatk4BaseRecalibrator_4_0(),
            ignore_missing=True,
        )

        self.step("apply_bqsr_normal",
                  gatk4.Gatk4ApplyBqsr_4_0(),
                  ignore_missing=True)
        self.step("apply_bqsr_tumor",
                  gatk4.Gatk4ApplyBqsr_4_0(),
                  ignore_missing=True)

        # S1: BaseRecalibrator(s)

        for inp, baseRecal, applyBQSR in [
            (self.normal_bam, self.base_recalibrator_normal,
             self.apply_bqsr_normal),
            (self.tumor_bam, self.base_recalibrator_tumor,
             self.apply_bqsr_tumor),
        ]:
            baseRecal["bam"] = inp
            baseRecal["intervals"] = self.intervals
            baseRecal["reference"] = self.reference
            baseRecal["knownSites"] = [
                self.snps_dbsnp,
                self.snps_1000gp,
                self.known_indels,
                self.mills_indels,
            ]

            applyBQSR["recalFile"] = baseRecal.out
            applyBQSR["bam"] = inp
            applyBQSR["intervals"] = self.intervals
            applyBQSR["reference"] = self.reference

        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_0(
                normal=self.apply_bqsr_normal.out,
                tumor=self.apply_bqsr_tumor.out,
                normalName=self.normal_name,
                tumorName=self.tumor_name,
                intervals=self.intervals,
                reference=self.reference,
            ),
        )
        self.step(
            "split_multi_allele",
            SplitMultiAllele(reference=self.reference, vcf=self.mutect2.out),
        )

        self.output("out", source=self.split_multi_allele.out)
Example #12
 def inputs(self) -> List[ToolInput]:
     return [
         ToolInput(
             "outputFilename",
             Filename(extension=".vcf"),
             prefix="OUTPUT=",
             separate_value_from_prefix=False,
             doc="(O=) VCF structural variation calls. Required.",
         ),
         ToolInput(
             "reference",
             FastaWithDict(),
             prefix="REFERENCE_SEQUENCE=",
             separate_value_from_prefix=False,
         ),
         ToolInput(
             "bams",
             Array(BamBai()),
             prefix="INPUT=",
             separate_value_from_prefix=False,
             prefix_applies_to_all_elements=True,
             doc=
             "(I=File Coordinate-sorted input BAM file. Default value: null. "
             "This option may be specified 0 or more times.",
         ),
         ToolInput(
             "assemblyFilename",
             Filename(suffix=".assembled", extension=".bam"),
             prefix="ASSEMBLY=",
             separate_value_from_prefix=False,
             doc=
             "Breakend assemblies which have undergone split read identification Required.",
         ),
         ToolInput(
             "inputLabel",
             String(optional=True),
             prefix="INPUT_LABEL=",
             separate_value_from_prefix=False,
             doc=
             "Input label. Variant calling evidence breakdowns are reported for each label. Default "
             "labels correspond to INPUT filenames. When specifying labels, labels must be provided for "
             "all input files. Default value: null. This option may be specified 0 or more times.",
         ),
         ToolInput(
             "inputMaxFragmentSize",
             Int(optional=True),
             prefix="INPUT_MAX_FRAGMENT_SIZE=",
             separate_value_from_prefix=False,
             doc=
             "Per input maximum concordant fragment size. Default value: null. "
             "This option may be specified 0 or more times.",
         ),
         ToolInput(
             "inputMinFragmentSize",
             Int(optional=True),
             prefix="INPUT_MIN_FRAGMENT_SIZE=",
             separate_value_from_prefix=False,
             doc=
             "Per input minimum concordant fragment size. Default value: null. "
             "This option may be specified 0 or more times.",
         ),
         ToolInput(
             "readPairConcordantPercent",
             Float(optional=True),
             prefix="READ_PAIR_CONCORDANT_PERCENT=",
             separate_value_from_prefix=False,
             doc=
             "Percent of read pairs considered concorant (0.0-1.0). If this is unset, the SAM proper "
             "pair flag is used to determine whether a read is discordantly aligned. Explicit fragment "
             "size specification overrides this setting. Default value: 0.995. "
             "This option can be set to 'null' to clear the default value.",
         ),
         ToolInput(
             "blacklist",
             Bed(optional=True),
             prefix="BLACKLIST=",
             separate_value_from_prefix=False,
             doc=
             "(BL=File) BED blacklist of regions to ignore. Assembly of regions such as high-coverage "
             "centromeric repeats is slow, and if such regions are to be filtered in downstream "
             "analysis anyway, blacklisting those region will improve runtime performance. "
             "For human WGS, the ENCODE DAC blacklist is recommended. Default value: null.",
         ),
         ToolInput(
             "configurationFile",
             File(optional=True),
             prefix="CONFIGURATION_FILE=",
             separate_value_from_prefix=False,
             doc=
             "(C=File) gridss configuration file containing overrides Default value: null.",
         ),
         ToolInput(
             "workerThreads",
             Int(optional=True),
             prefix="WORKER_THREADS=",
             separate_value_from_prefix=False,
             doc=
             "(THREADS=Integer  Number of worker threads to spawn. Defaults to number of cores available. "
             "Note that I/O threads are not included in this worker thread count so CPU usage can be "
             "higher than the number of worker thread. Default value: 6. "
             "This option can be set to 'null' to clear the default value.",
         ),
         ToolInput(
             "workingDir",
             String(optional=True),
             prefix="WORKING_DIR=",
             default=".",
             separate_value_from_prefix=False,
             doc=
             "Directory to place intermediate results directories. Default location is the same "
             "directory as the associated input or output file. Default value: null.",
         ),
         ToolInput(
             "ignoreDuplicates",
             Boolean(optional=True),
             prefix="IGNORE_DUPLICATES=",
             separate_value_from_prefix=False,
             doc=
             "Ignore reads marked as duplicates. Default value: true. This option can be set to 'null' "
             "to clear the default value. Possible values: {true, false}",
         ),
     ]
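
Hedged sketch of the outputs() that would typically accompany the GRIDSS inputs above, selecting the generated VCF and assembly BAM via their Filename inputs; the original example shows only inputs(), so the tags and types here are assumptions:

def outputs(self):
    return [
        ToolOutput(
            "out",
            Vcf(),
            selector=InputSelector("outputFilename"),
            doc="VCF of structural variant calls",
        ),
        ToolOutput(
            "assembly",
            Bam(),
            selector=InputSelector("assemblyFilename"),
            doc="Breakend assembly BAM produced by GRIDSS",
        ),
    ]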
Example #13
    def tool_modifier(self, tool: Tool, inputs: Dict,
                      hints: Dict[str, str]) -> Tool:
        from janis_bioinformatics.data_types import FastaWithDict, Vcf, Bed
        from janis_bioinformatics.tools.illumina import HapPyValidator_0_3_9

        failed_outputs, untyped_outputs = ensure_outputs_are_in_workflow_and_are_compatible(
            tool, self.validation.fields, Vcf())

        if len(failed_outputs) > 0:
            raise Exception(
                f"Some outputs for validation were not found in the tool '{tool.id()}': "
                f"{', '.join(failed_outputs)}")

        if len(untyped_outputs) > 0:
            Logger.critical(
                f"Some outputs for validation from the tool '{tool.id()}' were not "
                f"compatible with VCF: {', '.join(untyped_outputs)}")

        w = WorkflowBuilder(tool.id() + "_validated")

        w.input("validatorReference",
                FastaWithDict,
                value=self.validation.reference)
        w.input("validatorTruthVCF", Vcf, value=self.validation.truthVCF)
        w.input("validatorIntervals",
                Bed(optional=True),
                value=self.validation.intervals)

        inpdict = {
            i.id(): w.input(i.id(), i.intype)
            for i in tool.tool_inputs()
        }
        toolstp = w.step(tool.id(), tool(**inpdict))

        if isinstance(tool, Workflow):
            wf: Workflow = tool
            for o in wf.output_nodes.values():
                w.output(
                    identifier=o.id(),
                    source=toolstp[o.id()],
                    output_folder=o.output_folder,
                    output_name=o.output_name,
                )
        else:
            for o in tool.tool_outputs():
                w.output(identifier=o.id(), source=toolstp[o.id()])

        for o in self.validation.fields:

            sid = "validator_" + o
            valstp = w.step(
                sid,
                HapPyValidator_0_3_9(
                    compareVCF=toolstp[o],
                    reportPrefix=
                    o,  # this will generate an input node with format validator_{o}_reportPrefix
                    reference=w.validatorReference,
                    truthVCF=w.validatorTruthVCF,
                    intervals=w.validatorIntervals,
                ),
            )

            # Connect all the outputs of the validator to an output
            for vo in valstp.tool.outputs():
                w.output(
                    f"validated_{o}_{vo.id()}",
                    source=valstp[vo.id()],
                    output_folder="validated",
                )

        return w
Example #14
 def inputs(self):
     return [
         ToolInput("bams", Array(BamBai()), position=10),
         ToolInput(
             "reference",
             FastaWithDict(),
             prefix="--reference",
             doc="reference genome to use.",
         ),
         ToolInput(
             "outputFilename",
             Filename(extension=".vcf"),
             prefix="--output",
             doc="output VCF.",
         ),
         ToolInput(
             "assemblyFilename",
             Filename(suffix=".assembly", extension=".bam"),
             prefix="--assembly",
             doc=
             "location of the GRIDSS assembly BAM. This file will be created by GRIDSS.",
         ),
         ToolInput(
             "threads",
             Int(optional=True),
             default=CpuSelector(),
             prefix="--threads",
             doc="number of threads to use. (Default: 8)",
         ),
         ToolInput(
             "jarPath",
             String(optional=True),
             prefix="--jar",
             doc="location of GRIDSS jar",
         ),
         ToolInput(
             "workingDir",
             String(optional=True),
             default="./TMP",
             prefix="--workingdir",
             doc=
             "directory to place GRIDSS intermediate and temporary files. .gridss.working subdirectories will be created. (Default: .)",
         ),
         ToolInput(
             "blacklist",
             Bed(optional=True),
             prefix="--blacklist",
             doc="BED file containing regions to ignore",
         ),
         ToolInput(
             "steps",
             Array(String, optional=True),
             prefix="--steps",
             separator=",",
             prefix_applies_to_all_elements=False,
             doc=
             "processing steps to run. Defaults to all steps. Multiple steps are specified using comma separators. Possible steps are: setupreference, preprocess, assemble, call, all. WARNING: multiple instances of GRIDSS generating reference files at the same time will result in file corruption. Make sure these files are generated before runninng parallel GRIDSS jobs.",
         ),
         ToolInput(
             "configuration",
             File(optional=True),
             prefix="--configuration",
             doc=
             "configuration file use to override default GRIDSS settings.",
         ),
         ToolInput(
             "labels",
             Array(String, optional=True),
             prefix="--labels",
             separator=",",
             prefix_applies_to_all_elements=False,
             doc=
             'comma separated labels to use in the output VCF for the input files. Supporting read counts for input files with the same label are aggregated (useful for multiple sequencing runs of the same sample). Labels default to input filenames, unless a single read group with a non-empty sample name exists in which case the read group sample name is used (which can be disabled by "useReadGroupSampleNameCategoryLabel=false" in the configuration file). If labels are specified, they must be specified for all input files.',
         ),
         ToolInput(
             "externalaligner",
             String(optional=True),
             prefix="--externalaligner",
             doc=
             "use the system version of bwa instead of the in-process version packaged with GRIDSS",
         ),
         ToolInput(
             "jvmheap",
             String(optional=True),
             prefix="--jvmheap",
             doc=
             "size of JVM heap for assembly and variant calling. (Default: 30g)",
         ),
         ToolInput(
             "maxcoverage",
             Int(optional=True),
             prefix="--maxcoverage",
             doc=
             "maximum coverage. Regions with coverage in excess of this are ignored. (Default: 50000)",
         ),
         ToolInput(
             "picardoptions",
             String(optional=True),
             prefix="--picardoptions",
             doc=
             "additional standard Picard command line options. Useful options include VALIDATION_STRINGENCY=LENIENT and COMPRESSION_LEVEL=0. See https://broadinstitute.github.io/picard/command-line-overview.html",
         ),
         ToolInput(
             "useproperpair",
             String(optional=True),
             prefix="--useproperpair",
             doc=
             "use SAM 'proper pair' flag to determine whether a read pair is discordant. Default: use library fragment size distribution to determine read pair concordance",
         ),
         ToolInput(
             "concordantreadpairdistribution",
             Float(optional=True),
             prefix="--concordantreadpairdistribution",
             doc=
             "portion of 6 sigma read pairs distribution considered concordantly mapped. (Default: 0.995)",
         ),
         ToolInput(
             "keepTempFiles",
             Boolean(optional=True),
             prefix="--keepTempFiles",
             doc=
             "keep intermediate files. Not recommended except for debugging due to the high disk usage.",
         ),
         ToolInput(
             "nojni",
             Boolean(optional=True),
             prefix="--nojni",
             doc=
             "do not use JNI native code acceleration libraries (snappy, GKL, ssw, bwa).",
         ),
         ToolInput(
             "jobindex",
             Int(optional=True),
             prefix="--jobindex",
             doc=
             "zero-based assembly job index (only required when performing parallel assembly across multiple computers)",
         ),
         ToolInput(
             "jobnodes",
             Int(optional=True),
             prefix="--jobnodes",
             doc=
             "total number of assembly jobs (only required when performing parallel assembly across multiple computers). Note than an assembly job with any --job argument is required to be run after all indexed jobs have been completed to gather the output files together.",
         ),
     ]
Example #15
class RNASeqQCBase(BioinformaticsTool):
    def friendly_name(self) -> str:
        return "RNASeqQC"

    def tool_provider(self):
        return "RNASeqQC"

    def tool(self) -> str:
        return "RNASeqQC"

    def base_command(self):
        return ["rnaseqc"]

    def inputs(self):
        return [
            ToolInput(
                "gtf",
                File,
                position=1,
                doc=
                "The input GTF file containing features to check the bam against",
            ),
            ToolInput(
                "bam",
                BamBai,
                position=2,
                doc="The input SAM/BAM file containing reads to process",
            ),
            ToolInput(
                "output_dir",
                String(optional=True),
                position=3,
                default=".",
                doc="Output directory",
            ),
            ToolInput(
                "sample",
                String(optional=True),
                position=4,
                prefix="--sample",
                doc=
                "The name of the current sample. Default: The bam's filename",
            ),
            *self.additional_args,
        ]

    def outputs(self):
        return [
            ToolOutput(
                "out_gene_fragments",
                File,
                selector=StringFormatter(
                    "{output_dir}/{sample}.gene_fragments.gct",
                    output_dir=InputSelector("output_dir"),
                    sample=InputSelector("sample"),
                ),
            ),
            ToolOutput(
                "out_gene_reads",
                File,
                selector=StringFormatter(
                    "{output_dir}/{sample}.gene_reads.gct",
                    output_dir=InputSelector("output_dir"),
                    sample=InputSelector("sample"),
                ),
            ),
            ToolOutput(
                "out_gene_tpm",
                File,
                selector=StringFormatter(
                    "{output_dir}/{sample}.gene_tpm.gct",
                    output_dir=InputSelector("output_dir"),
                    sample=InputSelector("sample"),
                ),
            ),
            ToolOutput(
                "out_metrics_tsv",
                Tsv,
                selector=StringFormatter(
                    "{output_dir}/{sample}.metrics.tsv",
                    output_dir=InputSelector("output_dir"),
                    sample=InputSelector("sample"),
                ),
            ),
            ToolOutput(
                "out_coverage_tsv",
                Tsv(optional=True),
                selector=StringFormatter(
                    "{output_dir}/{sample}.coverage.tsv",
                    output_dir=InputSelector("output_dir"),
                    sample=InputSelector("sample"),
                ),
            ),
            ToolOutput(
                "out_exon_reads",
                File,
                selector=StringFormatter(
                    "{output_dir}/{sample}.exon_reads.gct",
                    output_dir=InputSelector("output_dir"),
                    sample=InputSelector("sample"),
                ),
            ),
        ]

    def bind_metadata(self):
        return ToolMetadata(
            contributors=["Jiaan Yu"],
            dateCreated=datetime(2021, 9, 10),
            dateUpdated=datetime(2021, 10, 19),
            documentationUrl="https://github.com/getzlab/rnaseqc",
            documentation="""Usage: rnaseqc [gtf] [bam] [output] \{OPTIONS\}
""",
        )

    def memory(self, hints):
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 4

    def cpus(self, hints):
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE)
        if val:
            return val
        return 1

    additional_args = [
        ToolInput(
            "bed",
            Bed(optional=True),
            position=4,
            prefix="--bed",
            doc=
            "Optional input BED file containing non-overlapping exons used for fragment size calculations",
        ),
        ToolInput(
            "fasta",
            Fasta(optional=True),
            position=4,
            prefix="--fasta",
            doc=
            "Optional input FASTA/FASTQ file containing the reference sequence used for parsing CRAM files",
        ),
        ToolInput(
            "chimeric_distance",
            Int(optional=True),
            position=4,
            prefix="--chimeric-distance",
            doc=
            "Set the maximum accepted distance between read mates. Mates beyond this distance will be counted as chimeric pairs. Default: 2000000 [bp]",
        ),
        ToolInput(
            "fragment_samples",
            Int(optional=True),
            position=4,
            prefix="--fragment-samples",
            doc=
            "Set the number of samples to take when computing fragment sizes. Requires the --bed argument. Default: 1000000",
        ),
        ToolInput(
            "mapping_quality",
            Int(optional=True),
            position=4,
            prefix="--mapping-quality",
            doc=
            "Set the lower bound on read quality for exon coverage counting. Reads below this number are excluded from coverage metrics. Default: 255",
        ),
        ToolInput(
            "base_mismatch",
            Int(optional=True),
            position=4,
            prefix="--base-mismatch",
            doc=
            "Set the maximum number of allowed mismatches between a read and the reference sequence. Reads with more than this number of mismatches are excluded from coverage metrics. Default: 6",
        ),
        ToolInput(
            "offset",
            Int(optional=True),
            position=4,
            prefix="--offset",
            doc=
            " Set the offset into the gene for the 3' and 5' windows in bias calculation. A positive value shifts the 3' and 5' windows towards eachother, while a negative value shifts them apart. Default: 150 [bp]",
        ),
        ToolInput(
            "window_size",
            Int(optional=True),
            position=4,
            prefix="--window-size",
            doc=
            "Set the size of the 3' and 5' windows in bias calculation. Default: 100 [bp]",
        ),
        ToolInput(
            "gene_length",
            Int(optional=True),
            position=4,
            prefix="--gene-length",
            doc=
            "Set the minimum size of a gene for bias calculation. Genes below this size are ignored in the calculation. Default: 600 [bp]",
        ),
        ToolInput(
            "legacy",
            Boolean(optional=True),
            position=4,
            prefix="--legacy",
            doc=
            "Use legacy counting rules. Gene and exon counts match output of RNA-SeQC 1.1.9",
        ),
        ToolInput(
            "stranded",
            String(optional=True),
            position=4,
            prefix="--stranded",
            doc=
            "Use strand-specific metrics. Only features on the same strand of a read will be considered. Allowed values are 'RF', 'rf', 'FR', and 'fr'",
        ),
        ToolInput(
            "verbose",
            Boolean(optional=True),
            position=4,
            prefix="--verbose",
            doc=
            "Give some feedback about what's going on. Supply this argument twice for progress updates while parsing the bam",
        ),
        ToolInput(
            "tag",
            String(optional=True),
            position=4,
            prefix="--tag",
            doc="Filter out reads with the specified tag.",
        ),
        ToolInput(
            "chimeric_tag",
            String(optional=True),
            position=4,
            prefix="--chimeric-tag",
            doc=
            "Reads maked with the specified tag will be labeled as Chimeric. Defaults to 'mC' for STAR",
        ),
        ToolInput(
            "exclude_chimeric",
            Boolean(optional=True),
            position=4,
            prefix="--exclude-chimeric",
            doc="Exclude chimeric reads from the read counts",
        ),
        ToolInput(
            "unpaired",
            Boolean(optional=True),
            position=4,
            prefix="--unpaired",
            doc=
            "Allow unpaired reads to be quantified. Required for single-end libraries",
        ),
        ToolInput(
            "rpkm",
            Boolean(optional=True),
            position=4,
            prefix="--rpkm",
            doc="Output gene RPKM values instead of TPMs",
        ),
        ToolInput(
            "coverage",
            Boolean(optional=True),
            position=4,
            prefix="--coverage",
            doc=
            "If this flag is provided, coverage statistics for each transcript will be written to a table. Otherwise, only summary coverage statistics are generated and added to the metrics table",
        ),
        ToolInput(
            "coverage_mask",
            Int(optional=True),
            position=4,
            prefix="--coverage-mask",
            doc=
            "Sets how many bases at both ends of a transcript are masked out when computing per-base exon coverage. Default: 500bp",
        ),
        ToolInput(
            "detection_threshold",
            Int(optional=True),
            position=4,
            prefix="--detection-threshold",
            doc=
            "Number of counts on a gene to consider the gene 'detected'. Additionally, genes below this limit are excluded from 3' bias computation. Default: 5 reads",
        ),
    ]
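
RNASeqQCBase above is an abstract base: concrete janis tools pair such a base with a thin versioned subclass that pins the container image and version string. A hedged sketch; the image tag below is an assumption, not taken from the original:

class RNASeqQC_2_3_5(RNASeqQCBase):
    def container(self):
        # assumed biocontainers image; substitute whichever image is actually used
        return "quay.io/biocontainers/rnaseqc:2.3.5--he23e5b5_0"

    def version(self):
        return "2.3.5"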
Example #16
 def inputs(self):
     return [
         *super(Gatk4BaseRecalibratorBase, self).inputs(),
         *Gatk4BaseRecalibratorBase.additional_args,
         ToolInput(
             "bam",
             BamBai(),
             position=6,
             prefix="-I",
             doc="BAM/SAM/CRAM file containing reads",
             secondaries_present_as={".bai": "^.bai"},
         ),
         ToolInput(
             "knownSites",
             Array(VcfTabix()),
             prefix="--known-sites",
             position=28,
             prefix_applies_to_all_elements=True,
             doc=
             "**One or more databases of known polymorphic sites used to exclude "
             "regions around known polymorphisms from analysis.** "
             "This algorithm treats every reference mismatch as an indication of error. However, real "
             "genetic variation is expected to mismatch the reference, so it is critical that a "
             "database of known polymorphic sites is given to the tool in order to skip over those sites. "
             "This tool accepts any number of Feature-containing files (VCF, BCF, BED, etc.) for use as "
             "this database. For users wishing to exclude an interval list of known variation simply "
             "use -XL my.interval.list to skip over processing those sites. Please note however "
             "that the statistics reported by the tool will not accurately reflected those sites "
             "skipped by the -XL argument.",
         ),
         ToolInput(
             "reference",
             FastaWithDict(),
             position=5,
             prefix="-R",
             doc="Reference sequence file",
         ),
         ToolInput(
             "outputFilename",
             Filename(
                 prefix=InputSelector("bam", remove_file_extension=True),
                 extension=".table",
             ),
             position=8,
             prefix="-O",
             doc="**The output recalibration table filename to create.** "
             "After the header, data records occur one per line until the end of the file. The first "
             "several items on a line are the values of the individual covariates and will change "
             "depending on which covariates were specified at runtime. The last three items are the "
             "data- that is, number of observations for this combination of covariates, number of "
             "reference mismatches, and the raw empirical quality score calculated by phred-scaling "
             "the mismatch rate. Use '/dev/stdout' to print to standard out.",
         ),
         ToolInput(
             "intervals",
             Bed(optional=True),
             prefix="--intervals",
             doc=
             "-L (BASE) One or more genomic intervals over which to operate",
         ),
         ToolInput(
             "intervalStrings",
             Array(String, optional=True),
             prefix="--intervals",
             prefix_applies_to_all_elements=True,
             doc=
             "-L (BASE) One or more genomic intervals over which to operate",
         ),
     ]
    def constructor(self):

        # Inputs
        self.input("sample_name", String)
        self.input("fastqs", Array(FastqGzPair))
        self.input("reference", FastaWithDict)
        self.input("region_bed", Bed)
        self.input("region_bed_extended", Bed)
        self.input("region_bed_annotated", Bed)
        self.input("genecoverage_bed", Bed)
        self.input("genome_file", TextFile)
        self.input("black_list", Bed(optional=True))
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)

        # fastqc
        self.step(
            "fastqc", FastQC_0_11_5(reads=self.fastqs, threads=4), scatter="reads"
        )
        # get the overrepresentative sequence from fastqc
        self.step(
            "getfastqc_adapters",
            ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile,),
            scatter="fastqc_datafiles",
        )
        # align and generate sorted index bam
        self.step(
            "align_and_sort",
            BwaAligner(
                fastq=self.fastqs,
                reference=self.reference,
                sample_name=self.sample_name,
                sortsam_tmpDir=".",
                cutadapt_adapter=self.getfastqc_adapters,
                cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
            ),
            scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
        )
        # merge into one bam and markdups
        self.step(
            "merge_and_mark",
            MergeAndMarkBams_4_1_3(
                bams=self.align_and_sort.out, sampleName=self.sample_name
            ),
        )
        # performance: doc
        self.step(
            "annotate_doc",
            AnnotateDepthOfCoverage_0_1_0(
                bam=self.merge_and_mark.out,
                bed=self.region_bed_annotated,
                reference=self.reference,
                sample_name=self.sample_name,
            ),
        )
        # performance
        self.step(
            "performance_summary",
            PerformanceSummaryTargeted_0_1_0(
                bam=self.merge_and_mark.out,
                region_bed=self.region_bed,
                genecoverage_bed=self.genecoverage_bed,
                sample_name=self.sample_name,
                genome_file=self.genome_file,
            ),
        )
        # gridss
        self.step(
            "gridss",
            Gridss_2_6_2(
                bams=self.merge_and_mark.out,
                reference=self.reference,
                blacklist=self.black_list,
                tmpdir=".",
            ),
        )
        # post gridss r script here
        # self.step("gridss_post_r", )
        # gatk bqsr bam
        self.step(
            "bqsr",
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=self.merge_and_mark.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
        )
        # haploytype caller
        self.step(
            "haplotype_caller",
            Gatk4HaplotypeCaller_4_1_3(
                inputRead=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedVcf=self.haplotype_caller.out, reference=self.reference
            ),
        )
        self.step(
            "addbamstats",
            AddBamStatsGermline_0_1_0(
                bam=self.merge_and_mark.out,
                vcf=self.splitnormalisevcf.out,
                reference=self.reference,
            ),
        )
        # output
        self.output("fastq_qc", source=self.fastqc.out, output_folder="QC")

        self.output("markdups_bam", source=self.merge_and_mark.out, output_folder="BAM")

        self.output("doc", source=self.annotate_doc.out, output_folder="PERFORMANCE")
        self.output(
            "summary", source=self.performance_summary.out, output_folder="PERFORMANCE"
        )
        self.output(
            "gene_summary",
            source=self.performance_summary.geneFileOut,
            output_folder="PERFORMANCE",
        )
        self.output(
            "region_summary",
            source=self.performance_summary.regionFileOut,
            output_folder="PERFORMANCE",
        )

        self.output("gridss_vcf", source=self.gridss.out, output_folder="SV")
        self.output("gridss_bam", source=self.gridss.assembly, output_folder="SV")

        self.output("hap_vcf", source=self.haplotype_caller.out, output_folder="VCF")
        self.output("hap_bam", source=self.haplotype_caller.bam, output_folder="VCF")
        self.output("normalise_vcf", source=self.addbamstats.out, output_folder="VCF")
Example #18
 def inputs(self):
     return [
         ToolInput(
             tag="bams",
             input_type=Array(BamBai),
             prefix="-b",
             prefix_applies_to_all_elements=True,
             doc="Add FILE to the set of BAM files to be analyzed.",
         ),
         ToolInput(
             tag="bamList",
             input_type=TextFile(optional=True),
             prefix="-L",
             doc="A file containing a list of BAM files to be analyzed.",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaFai(),
             prefix="-f",
             doc=
             " Use FILE as the reference sequence for analysis. An index file (FILE.fai) will be created if none exists. If neither --targets nor --region are specified, FreeBayes will analyze every position in this reference.",
         ),
         ToolInput(
             tag="targetsFile",
             prefix="-t",
             input_type=Bed(optional=True),
             doc=" Limit analysis to targets listed in the BED-format FILE.",
         ),
         ToolInput(
             tag="region",
             prefix="-r",
             input_type=String(optional=True),
             doc=
             "<chrom>:<start_position>-<end_position> Limit analysis to the specified region, 0-base coordinates, end_position not included (same as BED format). Either '-' or '..' maybe used as a separator.",
         ),
         ToolInput(
             tag="samplesFile",
             prefix="-s",
             input_type=TextFile(optional=True),
             doc=
             "FILE  Limit analysis to samples listed (one per line) in the FILE. By default FreeBayes will analyze all samples in its input BAM files.",
         ),
         ToolInput(
             tag="popFile",
             prefix="--populations",
             input_type=TextFile(optional=True),
             doc=
             "FILE Each line of FILE should list a sample and a population which it is part of. The population-based bayesian inference model will then be partitioned on the basis of the populations.",
         ),
         ToolInput(
             tag="cnvFile",
             prefix="-A",
             input_type=TextFile(optional=True),
             doc=
             "FILE Read a copy number map from the BED file FILE, which has either a sample-level ploidy: sample name, copy number or a region-specific format: reference sequence, start, end, sample name, copy number ... for each region in each sample which does not have the default copy number as set by --ploidy.",
         ),
         ToolInput(
             tag="outputFilename",
             prefix="-v",
             input_type=Filename(extension=".vcf"),
             doc="FILE Output VCF-format results to FILE. (default: stdout)",
         ),
         ToolInput(
             tag="gvcfFlag",
             prefix="--gvcf",
             input_type=Boolean(optional=True),
             default=False,
             doc=
             "Write gVCF output, which indicates coverage in uncalled regions.",
         ),
         ToolInput(
             tag="gvcfChunkSize",
             prefix="--gvcf-chunk",
             input_type=Int(optional=True),
             doc=
             " When writing gVCF output emit a record for every NUM bases.",
         ),
         ToolInput(
             tag="candidateVcf",
             prefix="-@",
             input_type=File(optional=True),
             doc=
             " Use variants reported in VCF file as input to the algorithm. Variants in this file will included in the output even if there is not enough support in the data to pass input filters.",
         ),
         ToolInput(
             tag="restrictSitesFlag",
             prefix="-l",
             input_type=Boolean(optional=True),
             doc=
             "Only provide variant calls and genotype likelihoods for sites and alleles which are provided in the VCF input, and provide output in the VCF for all input alleles, not just those which have support in the data.",
         ),
         ToolInput(
             tag="candidateHaploVcf",
             prefix="--haplotype-basis-alleles",
             input_type=File(optional=True),
             doc=
             "When specified, only variant alleles provided in this input VCF will be used for the construction of complex or haplotype alleles.",
         ),
         ToolInput(
             tag="reportHapAllelesFlag",
             prefix="--report-all-haplotype-alleles",
             input_type=Boolean(optional=True),
             doc=
             "At sites where genotypes are made over haplotype alleles, provide information about all alleles in output, not only those which are called.",
         ),
         ToolInput(
             tag="monomorphicFlag",
             prefix="--report-monomorphic",
             input_type=Boolean(optional=True),
             doc=
             " Report even loci which appear to be monomorphic, and report all considered alleles, even those which are not in called genotypes. Loci which do not have any potential alternates have '.' for ALT.",
         ),
         ToolInput(
             tag="polyMoprhProbFlag",
             prefix="-P",
             input_type=Float(optional=True),
             default=0.0,
             doc=
             "Report sites if the probability that there is a polymorphism at the site is greater than N. default: 0.0. Note that post-filtering is generally recommended over the use of this parameter.",
         ),
         ToolInput(
             tag="strictFlag",
             prefix="--strict-vcf",
             input_type=Boolean(optional=True),
             doc="Generate strict VCF format (FORMAT/GQ will be an int)",
         ),
         ToolInput(
             tag="theta",
             prefix="-T",
             input_type=Float(),
             default=0.001,
             doc=
             "The expected mutation rate or pairwise nucleotide diversity among the population under analysis. This serves as the single parameter to the Ewens Sampling Formula prior model default: 0.001",
         ),
         ToolInput(
             tag="ploidy",
             prefix="-p",
             input_type=Int(),
             default=2,
             doc="Sets the default ploidy for the analysis to N. default: 2",
         ),
         ToolInput(
             tag="pooledDiscreteFlag",
             prefix="-J",
             input_type=Boolean(optional=True),
             doc=
             "Assume that samples result from pooled sequencing. Model pooled samples using discrete genotypes across pools. When using this flag, set --ploidy to the number of alleles in each sample or use the --cnv-map to define per-sample ploidy.",
         ),
         ToolInput(
             tag="pooledContinousFlag",
             prefix="-K",
             input_type=Boolean(optional=True),
             doc=
             "Output all alleles which pass input filters, regardles of genotyping outcome or model.",
         ),
         ToolInput(
             tag="addRefFlag",
             prefix="-Z",
             input_type=Boolean(optional=True),
             doc=
             "This flag includes the reference allele in the analysis as if it is another sample from the same population.",
         ),
         ToolInput(
             tag="refQual",
             prefix="--reference-quality",
             input_type=String(),
             default="100,60",
             doc=
             "--reference-quality MQ,BQ  Assign mapping quality of MQ to the reference allele at each site and base quality of BQ. default: 100,60",
         ),
         ToolInput(
             tag="ignoreSNPsFlag",
             prefix="-I",
             input_type=Boolean(optional=True),
             doc="Ignore SNP alleles.",
         ),
         ToolInput(
             tag="ignoreINDELsFlag",
             prefix="-i",
             input_type=Boolean(optional=True),
             doc="Ignore insertion and deletion alleles.",
         ),
         ToolInput(
             tag="ignoreMNPsFlag",
             prefix="-X",
             input_type=Boolean(optional=True),
             doc="Ignore multi-nuceotide polymorphisms, MNPs.",
         ),
         ToolInput(
             tag="ignoreComplexVarsFlag",
             prefix="-u",
             input_type=Boolean(optional=True),
             doc="Ignore complex events (composites of other classes).",
         ),
         ToolInput(
             tag="maxNumOfAlleles",
             prefix="-n",
             input_type=Int(),
             default=0,
             doc=
             "Evaluate only the best N SNP alleles, ranked by sum of supporting quality scores. (Set to 0 to use all; default: all)",
         ),
         ToolInput(
             tag="maxNumOfComplexVars",
             prefix="-E",
             input_type=Int(optional=True),
             doc="",
         ),
         ToolInput(
             tag="haplotypeLength",
             prefix="--haplotype-length",
             input_type=Int(),
             default=3,
             doc=
             "Allow haplotype calls with contiguous embedded matches of up to this length. Set N=-1 to disable clumping. (default: 3)",
         ),
         ToolInput(
             tag="minRepSize",
             prefix="--min-repeat-size",
             input_type=Int(),
             default=5,
             doc=
             "When assembling observations across repeats, require the total repeat length at least this many bp. (default: 5)",
         ),
         ToolInput(
             tag="minRepEntropy",
             prefix="--min-repeat-entropy",
             input_type=Int(),
             default=1,
             doc=
             "To detect interrupted repeats, build across sequence until it has  entropy > N bits per bp. Set to 0 to turn off. (default: 1)",
         ),
         ToolInput(
             tag="noPartObsFlag",
             prefix="--no-partial-observations",
             input_type=Boolean(optional=True),
             doc=
             "Exclude observations which do not fully span the dynamically-determined detection window. (default, use all observations, dividing partial support across matching haplotypes when generating haplotypes.)",
         ),
         ToolInput(
             tag="noNormaliseFlag",
             prefix="-O",
             input_type=Boolean(optional=True),
             doc=
             "Turn off left-alignment of indels, which is enabled by default.",
         ),
         ToolInput(
             tag="useDupFlag",
             prefix="-4",
             input_type=Boolean(),
             default=False,
             doc=
             "Include duplicate-marked alignments in the analysis. default: exclude duplicates marked as such in alignments",
         ),
         ToolInput(
             tag="minMappingQual",
             prefix="-m",
             input_type=Int(),
             default=1,
             doc=
             " Exclude alignments from analysis if they have a mapping quality less than Q. default: 1",
         ),
         ToolInput(
             tag="minBaseQual",
             prefix="-q",
             input_type=Int(),
             default=0,
             doc=
             " -q --min-base-quality Q Exclude alleles from analysis if their supporting base quality is less than Q. default: 0",
         ),
         ToolInput(
             tag="minSupQsum",
             prefix="-R",
             input_type=Int(),
             default=0,
             doc=
             " -R --min-supporting-allele-qsum Q Consider any allele in which the sum of qualities of supporting observations is at least Q. default: 0",
         ),
         ToolInput(
             tag="minSupMQsum",
             prefix="-Y",
             input_type=Int(),
             default=0,
             doc=
             " -Y --min-supporting-mapping-qsum Q Consider any allele in which and the sum of mapping qualities of supporting reads is at least Q. default: 0",
         ),
         ToolInput(
             tag="minSupBQthres",
             prefix="-Q",
             input_type=Int(),
             default=10,
             doc=
             " -Q --mismatch-base-quality-threshold Q Count mismatches toward --read-mismatch-limit if the base quality of the mismatch is >= Q. default: 10",
         ),
         ToolInput(
             tag="readMisMatchLim",
             prefix="-U",
             input_type=Int(optional=True),
             doc=
             " -U --read-mismatch-limit N Exclude reads with more than N mismatches where each mismatch has base quality >= mismatch-base-quality-threshold. default: ~unbounded",
         ),
         ToolInput(
             tag="maxMisMatchFrac",
             prefix="-z",
             input_type=Float(),
             default=1.0,
             doc=
             " -z --read-max-mismatch-fraction N Exclude reads with more than N [0,1] fraction of mismatches where each mismatch has base quality >= mismatch-base-quality-threshold default: 1.0",
         ),
         ToolInput(
             tag="readSNPLim",
             prefix="-$",
             input_type=Int(optional=True),
             doc=
             " -$ --read-snp-limit N Exclude reads with more than N base mismatches, ignoring gaps with quality >= mismatch-base-quality-threshold. default: ~unbounded",
         ),
         ToolInput(
             tag="readINDELLim",
             prefix="-e",
             input_type=Int(optional=True),
             doc=
             " -e --read-indel-limit N Exclude reads with more than N separate gaps. default: ~unbounded",
         ),
         ToolInput(
             tag="standardFilterFlag",
             prefix="-0",
             input_type=Boolean(optional=True),
             doc=
             " -0 --standard-filters Use stringent input base and mapping quality filters Equivalent to -m 30 -q 20 -R 0 -S 0",
         ),
         ToolInput(
             tag="minAltFrac",
             prefix="-F",
             input_type=Float(),
             default=0.05,
             doc=
             " -F --min-alternate-fraction N Require at least this fraction of observations supporting an alternate allele within a single individual in the in order to evaluate the position. default: 0.05",
         ),
         ToolInput(
             tag="minAltCount",
             prefix="-C",
             input_type=Int(),
             default=2,
             doc=
             " -C --min-alternate-count N Require at least this count of observations supporting an alternate allele within a single individual in order to evaluate the position. default: 2",
         ),
         ToolInput(
             tag="minAltQSum",
             prefix="-3",
             input_type=Int(),
             default=0,
             doc=
             " -3 --min-alternate-qsum N Require at least this sum of quality of observations supporting an alternate allele within a single individual in order to evaluate the position. default: 0",
         ),
         ToolInput(
             tag="minAltTotal",
             prefix="-G",
             input_type=Int(),
             default=1,
             doc=
             " -G --min-alternate-total N Require at least this count of observations supporting an alternate allele within the total population in order to use the allele in analysis. default: 1",
         ),
         ToolInput(
             tag="minCov",
             prefix="--min-coverage",
             input_type=Int(),
             default=0,
             doc=
             " --min-coverage N Require at least this coverage to process a site. default: 0",
         ),
         ToolInput(
             tag="maxCov",
             prefix="--max-coverage",
             input_type=Int(optional=True),
             doc=
             " --max-coverage N Do not process sites with greater than this coverage. default: no limit",
         ),
         ToolInput(
             tag="noPopPriorsFlag",
             prefix="-k",
             input_type=Boolean(optional=True),
             doc=
             " -k --no-population-priors Equivalent to --pooled-discrete --hwe-priors-off and removal of Ewens Sampling Formula component of priors.",
         ),
         ToolInput(
             tag="noHWEPriorsFlag",
             prefix="-w",
             input_type=Boolean(optional=True),
             doc=
             " -w --hwe-priors-off Disable estimation of the probability of the combination arising under HWE given the allele frequency as estimated by observation frequency.",
         ),
         ToolInput(
             tag="noBinOBSPriorsFlag",
             prefix="-V",
             input_type=Boolean(optional=True),
             doc=
             " -V --binomial-obs-priors-off Disable incorporation of prior expectations about observations. Uses read placement probability, strand balance probability, and read position (5'-3') probability.",
         ),
         ToolInput(
             tag="noABPriorsFlag",
             prefix="-a",
             input_type=Boolean(optional=True),
             doc=
             " -a --allele-balance-priors-off Disable use of aggregate probability of observation balance between alleles as a component of the priors.",
         ),
         ToolInput(
             tag="obsBiasFile",
             prefix="--observation-bias",
             input_type=TextFile(optional=True),
             doc=
             " --observation-bias FILE Read length-dependent allele observation biases from FILE. The format is [length] [alignment efficiency relative to reference] where the efficiency is 1 if there is no relative observation bias.",
         ),
         ToolInput(
             tag="baseQualCap",
             prefix="--base-quality-cap",
             input_type=Int(optional=True),
             doc=
             " --base-quality-cap Q Limit estimated observation quality by capping base quality at Q.",
         ),
         ToolInput(
             tag="probContamin",
             prefix="--prob-contamination",
             input_type=Float(),
             default=0.000000001,
             doc=
             " --prob-contamination F An estimate of contamination to use for all samples. default: 10e-9",
         ),
         ToolInput(
             tag="legGLScalc",
             prefix="--legacy-gls",
             input_type=Boolean(optional=True),
             doc=
             " --legacy-gls Use legacy (polybayes equivalent) genotype likelihood calculations",
         ),
         ToolInput(
             tag="contaminEst",
             prefix="--contamination-estimates",
             input_type=TextFile(optional=True),
             doc=
             " --contamination-estimates FILE A file containing per-sample estimates of contamination, such as those generated by VerifyBamID. The format should be: sample p(read=R|genotype=AR) p(read=A|genotype=AA) Sample '*' can be used to set default contamination estimates.",
         ),
         ToolInput(
             tag="repoprtMaxGLFlag",
             prefix="--report-genotype-likelihood-max",
             input_type=Boolean(optional=True),
             doc=
             " --report-genotype-likelihood-max Report genotypes using the maximum-likelihood estimate provided from genotype likelihoods.",
         ),
         ToolInput(
             tag="genotypingMaxIter",
             prefix="-B",
             input_type=Int(),
             default=1000,
             doc=
             " -B --genotyping-max-iterations N Iterate no more than N times during genotyping step. default: 1000.",
         ),
         ToolInput(
             tag="genotypingMaxBDepth",
             prefix="--genotyping-max-banddepth",
             input_type=Int(),
             default=6,
             doc=
             " --genotyping-max-banddepth N Integrate no deeper than the Nth best genotype by likelihood when genotyping. default: 6.",
         ),
         ToolInput(
             tag="postIntegrationLim",
             prefix="-W",
             input_type=String(),
             default="1,3",
             doc=
             " -W --posterior-integration-limits N,M Integrate all genotype combinations in our posterior space which include no more than N samples with their Mth best data likelihood. default: 1,3.",
         ),
         ToolInput(
             tag="excludeUnObsGT",
             prefix="-N",
             input_type=Boolean(optional=True),
             doc=
             " -N --exclude-unobserved-genotypes Skip sample genotypings for which the sample has no supporting reads.",
         ),
         ToolInput(
             tag="gtVarThres",
             prefix="-S",
             input_type=Int(optional=True),
             doc=
             " -S --genotype-variant-threshold N Limit posterior integration to samples where the second-best genotype likelihood is no more than log(N) from the highest genotype likelihood for the sample. default: ~unbounded",
         ),
         ToolInput(
             tag="useMQFlag",
             prefix="-j",
             input_type=Boolean(optional=True),
             doc=
             " -j --use-mapping-quality Use mapping quality of alleles when calculating data likelihoods.",
         ),
         ToolInput(
             tag="harmIndelQualFlag",
             prefix="-H",
             input_type=Boolean(optional=True),
             doc=
             " -H --harmonic-indel-quality Use a weighted sum of base qualities around an indel, scaled by the distance from the indel. By default use a minimum BQ in flanking sequence.",
         ),
         ToolInput(
             tag="readDepFact",
             prefix="-D",
             input_type=Float(),
             default=0.9,
             doc=
             " -D --read-dependence-factor N Incorporate non-independence of reads by scaling successive observations by this factor during data likelihood calculations. default: 0.9",
         ),
         ToolInput(
             tag="gtQuals",
             prefix="-=",
             input_type=Boolean(optional=True),
             doc=
             " -= --genotype-qualities Calculate the marginal probability of genotypes and report as GQ in each sample field in the VCF output.",
         ),
     ]
Example #19
    def constructor(self):

        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)
        self.input("normal_name", String(optional=True))
        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional intervals file supports processing by regions. If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # split normal and tumor bam
        self.step(
            "normal_split_bam",
            self.process_subpipeline(bam=self.normal_bam,
                                     intervals=self.intervals),
        )
        self.step(
            "tumor_split_bam",
            self.process_subpipeline(bam=self.tumor_bam,
                                     intervals=self.intervals),
        )

        # variant calling + learn read orientation model
        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_1_3(
                normalBams=[self.normal_split_bam.out],
                tumorBams=[self.tumor_split_bam.out],
                normalSample=self.normal_name,
                intervals=self.intervals,
                reference=self.reference,
                germlineResource=self.gnomad,
                panelOfNormals=self.panel_of_normals,
            ),
        )
        self.step(
            "learnorientationmodel",
            gatk4.Gatk4LearnReadOrientationModelLatest(
                f1r2CountsFiles=self.mutect2.f1f2r_out, ),
        )

        # calculate contamination and segmentation
        self.step(
            "getpileupsummaries",
            gatk4.Gatk4GetPileUpSummariesLatest(
                bam=self.tumor_split_bam.out,
                sites=self.gnomad,
                intervals=self.intervals,
            ),
        )
        self.step(
            "calculatecontamination",
            gatk4.Gatk4CalculateContaminationLatest(
                pileupTable=self.getpileupsummaries.out, ),
        )
        self.step(
            "filtermutect2calls",
            gatk4.Gatk4FilterMutectCallsLatest(
                vcf=self.mutect2.out,
                reference=self.reference,
                segmentationFile=self.calculatecontamination.segOut,
                contaminationTable=self.calculatecontamination.contOut,
                readOrientationModel=self.learnorientationmodel.out,
                statsFile=self.mutect2.stats,
            ),
        )

        # normalise and filter "PASS" variants
        self.step("uncompressvcf",
                  UncompressArchive(file=self.filtermutect2calls.out))
        self.step(
            "splitnormalisevcf",
            SplitMultiAllele(vcf=self.uncompressvcf.out,
                             reference=self.reference),
        )
        self.step(
            "filterpass",
            VcfToolsvcftoolsLatest(
                vcf=self.splitnormalisevcf.out,
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

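        # workflow outputs: filtered Mutect2 calls, the Mutect2 assembly BAM, and the normalised PASS-only VCF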
        self.output("variants", source=self.filtermutect2calls.out)
        self.output("out_bam", source=self.mutect2.bam)
        self.output("out", source=self.filterpass.out)
Example #20
    def constructor(self):
        ## INPUTS
        self.input("bam", BamBai())
        self.input("sample_name", String())
        self.input("reference_folder", Directory())
        self.input("intervals", Bed())
        self.input("ploidy", String(optional=True), default="somatic")
        self.input("min_bq", Int(optional=True))
        self.input("min_mq", Int(optional=True))
        self.input("min_dp", Int(optional=True), default=100)
        self.input("min_vaf", Float(optional=True))
        self.input("vc_min_vq", Int(optional=True))
        self.input("noise_level", Int(optional=True))
        self.input("vqr_min_vq", Int(optional=True))
        self.input("pisces_awk_script", File())

        ## STEPS
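        # keep primary alignments only: 0x100 is the SAM flag for secondary alignments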
        self.step(
            "primary_only",
            SamToolsView_1_9(sam=self.bam,
                             doNotOutputAlignmentsWithBitsSet="0x100"),
        )

        self.step(
            "index_primary_only_bam",
            SamToolsIndex_1_9(bam=self.primary_only.out),
        )

        self.step(
            "hygea_realignment",
            PiscesHygeaRealigner_5_2_10_49(
                inputBam=self.index_primary_only_bam.out,
                outputDir=".",
                referenceFolder=self.reference_folder,
                skipAndRemoveDuplicates="true",
                piscesVersion="5.2.10.49",
            ),
        )

        self.step(
            "stitcher_read_joining",
            PiscesStitcher_5_2_10_49(
                inputBam=self.hygea_realignment.out,
                outputDir=".",
                sampleName=self.sample_name,
                piscesVersion="5.2.10.49",
            ),
        )

        self.step(
            "stitcher_sort",
            SamToolsSort_1_9(
                bam=self.stitcher_read_joining.out,
                outputFilename=self.sample_name + ".bam",
            ),
        )

        self.step("stitcher_index",
                  SamToolsIndex_1_9(bam=self.stitcher_sort.out))

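        # call variants with Pisces on the stitched, coordinate-sorted and indexed BAM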
        self.step(
            "pisces",
            PiscesVariantCaller_5_2_10_49(
                inputBam=self.stitcher_index.out,
                referenceFolder=self.reference_folder,
                outputDir=".",
                intervalBedFile=self.intervals,
                ploidy=self.ploidy,
                minimumBaseQuality=self.min_bq,
                minimumMappingQuality=self.min_mq,
                minimumVariantFrequency=self.min_vaf,
                minimumCoverage=self.min_dp,
                noiseLevelForQModel=self.noise_level,
                minimumVariantFrequencyFilter=self.min_vaf,
                enableSingleStrandFilter="true",
                callMNVs="false",
                maxMNVLength=1,
                RMxNFilter="5,9,0.35",
                variantQualityFilter=self.vc_min_vq,
                crushVCF="false",
                gVCF="false",
                piscesVersion="5.2.10.49",
            ),
        )

        self.step(
            "vqr",
            PiscesVariantQualityRecalibration_5_2_10_49(
                inputVcf=self.pisces.vcf,
                outputDir=".",
                baselineNoise=self.noise_level,
                minVariantQuality=self.vqr_min_vq,
                piscesVersion="5.2.10.49",
            ),
        )

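        # FirstOperator resolves to the first non-null value: prefer the recalibrated
        # VCF from the vqr step, otherwise fall back to the raw Pisces VCF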
        piscesVcf = FirstOperator([self.vqr.vcf, self.pisces.vcf])

        self.step(
            "fixSource",
            Awk(script=self.pisces_awk_script, input_files=piscesVcf),
        )

        self.step("sort", BcfToolsSort_1_9(vcf=self.fixSource.out))

        self.step("normalise", BcfToolsNorm_1_9(vcf=self.sort.out))

        self.step("uncompress", UncompressArchive(file=self.normalise.out))

        self.step(
            "filterpass",
            VcfToolsvcftools_0_1_16(
                vcf=self.uncompress.out.as_type(Vcf),
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        ## OUTPUTs
        self.output("variants", source=self.sort.out)

        self.output("out", source=self.filterpass.out)

        self.output("out_bam", source=self.stitcher_index.out)
        ## OPTIONAL OUTPUTs
        self.output("hygea_options",
                    source=self.hygea_realignment.used_options)
        self.output("stitcher_options",
                    source=self.stitcher_read_joining.used_options)
        self.output("pisces_options", source=self.pisces.used_options)
        self.output("vqr_options", source=self.vqr.used_options)
Example #21
 def inputs(self):
     return [
         *super().inputs(),
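         # arguments inherited from the parent GATK tool class come in via super().inputs(); the Mutect2-specific ToolInputs follow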
         ToolInput(
             tag="tumorBams",
             input_type=Array(BamBai),
             prefix="-I",
             prefix_applies_to_all_elements=True,
             doc="(--input) BAM/SAM/CRAM file containing reads This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="normalBams",
             input_type=Array(BamBai, optional=True),
             prefix="-I",
             prefix_applies_to_all_elements=True,
             doc="(--input) Extra BAM/SAM/CRAM file containing reads This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="normalSample",
             input_type=String(optional=True),
             prefix="--normal-sample",
             doc="(--normal-sample, if) May be URL-encoded as output by GetSampleName with",
         ),
         ToolInput(
             "outputPrefix",
             String(optional=True),
             doc="Used as a prefix for the outputFilename if not specified, with format: {outputPrefix}.vcf.gz",
             default="generated",
         ),
         ToolInput(
             "outputFilename",
             Filename(prefix=InputSelector("outputPrefix"), extension=".vcf.gz"),
             position=20,
             prefix="-O",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaWithDict(),
             prefix="--reference",
             doc="(-R) Reference sequence file Required.",
         ),
         ToolInput(
             tag="outputBamName",
             # This is not a FileName because otherwise we can't make this optional
             input_type=String(optional=True),
             prefix="-bamout",
             doc="File to which assembled haplotypes should be written",
         ),
         ToolInput(
             tag="activityProfileOut",
             input_type=String(optional=True),
             prefix="--activity-profile-out",
             doc="Default value: null.",
         ),
         ToolInput(
             tag="addOutputSamProgramRecord",
             input_type=Boolean(optional=True),
             prefix="-add-output-sam-program-record",
             doc="(--add-output-sam-program-record)  If true, adds a PG tag to created SAM/BAM/CRAM files.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="addOutputVcfCommandLine",
             input_type=Boolean(optional=True),
             prefix="-add-output-vcf-command-line",
             doc="(--add-output-vcf-command-line)  If true, adds a command line header line to created VCF files.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="afOfAllelesNotInResource",
             input_type=String(optional=True),
             prefix="--af-of-alleles-not-in-resource",
             doc="(-default-af)  Population allele fraction assigned to alleles not found in germline resource.  Please see docs/mutect/mutect2.pdf fora derivation of the default value.  Default value: -1.0. ",
         ),
         ToolInput(
             tag="alleles",
             input_type=String(optional=True),
             prefix="--alleles",
             doc="The set of alleles for which to force genotyping regardless of evidence Default value: null. ",
         ),
         ToolInput(
             tag="annotation",
             input_type=String(optional=True),
             prefix="--annotation",
             doc="(-A) One or more specific annotations to add to variant calls This argument may be specified 0 or more times. Default value: null. Possible Values: {AlleleFraction, AS_BaseQualityRankSumTest, AS_FisherStrand, AS_InbreedingCoeff, AS_MappingQualityRankSumTest, AS_QualByDepth, AS_ReadPosRankSumTest, AS_RMSMappingQuality, AS_StrandOddsRatio, BaseQuality, BaseQualityRankSumTest, ChromosomeCounts, ClippingRankSumTest, CountNs, Coverage, DepthPerAlleleBySample, DepthPerSampleHC, ExcessHet, FisherStrand, FragmentLength, GenotypeSummaries, InbreedingCoeff, LikelihoodRankSumTest, MappingQuality, MappingQualityRankSumTest, MappingQualityZero, OrientationBiasReadCounts, OriginalAlignment, PossibleDeNovo, QualByDepth, ReadPosition, ReadPosRankSumTest, ReferenceBases, RMSMappingQuality, SampleList, StrandBiasBySample, StrandOddsRatio, TandemRepeat, UniqueAltReadCount}",
         ),
         ToolInput(
             tag="annotationGroup",
             input_type=String(optional=True),
             prefix="--annotation-group",
             doc="(-G) One or more groups of annotations to apply to variant calls This argument may be specified 0 or more times. Default value: null. Possible Values: {AS_StandardAnnotation, ReducibleAnnotation, StandardAnnotation, StandardHCAnnotation, StandardMutectAnnotation}",
         ),
         ToolInput(
             tag="annotationsToExclude",
             input_type=String(optional=True),
             prefix="--annotations-to-exclude",
             doc="(-AX)  One or more specific annotations to exclude from variant calls  This argument may be specified 0 or more times. Default value: null. Possible Values: {BaseQuality, Coverage, DepthPerAlleleBySample, DepthPerSampleHC, FragmentLength, MappingQuality, OrientationBiasReadCounts, ReadPosition, StrandBiasBySample, TandemRepeat}",
         ),
         ToolInput(
             tag="arguments_file",
             input_type=File(optional=True),
             prefix="--arguments_file",
             doc="read one or more arguments files and add them to the command line This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="assemblyRegionOut",
             input_type=String(optional=True),
             prefix="--assembly-region-out",
             doc="Output the assembly region to this IGV formatted file Default value: null.",
         ),
         ToolInput(
             tag="baseQualityScoreThreshold",
             input_type=Int(optional=True),
             prefix="--base-quality-score-threshold",
             doc=" Base qualities below this threshold will be reduced to the minimum (6)  Default value: 18.",
         ),
         ToolInput(
             tag="callableDepth",
             input_type=Int(optional=True),
             prefix="--callable-depth",
             doc="Minimum depth to be considered callable for Mutect stats. Does not affect genotyping. Default value: 10. ",
         ),
         ToolInput(
             tag="cloudIndexPrefetchBuffer",
             input_type=Int(optional=True),
             prefix="--cloud-index-prefetch-buffer",
             doc="(-CIPB)  Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer if unset.  Default value: -1. ",
         ),
         ToolInput(
             tag="cloudPrefetchBuffer",
             input_type=Int(optional=True),
             prefix="--cloud-prefetch-buffer",
             doc="(-CPB)  Size of the cloud-only prefetch buffer (in MB; 0 to disable).  Default value: 40. ",
         ),
         ToolInput(
             tag="createOutputBamIndex",
             input_type=Boolean(optional=True),
             prefix="--create-output-bam-index",
             doc="(-OBI)  If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM file.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="createOutputBamMd5",
             input_type=Boolean(optional=True),
             prefix="--create-output-bam-md5",
             doc="(-OBM)  If true, create a MD5 digest for any BAM/SAM/CRAM file created  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="createOutputVariantIndex",
             input_type=Boolean(optional=True),
             prefix="--create-output-variant-index",
             doc="(-OVI)  If true, create a VCF index when writing a coordinate-sorted VCF file.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="createOutputVariantMd5",
             input_type=Boolean(optional=True),
             prefix="--create-output-variant-md5",
             doc="(-OVM)  If true, create a a MD5 digest any VCF file created.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableBamIndexCaching",
             input_type=Boolean(optional=True),
             prefix="--disable-bam-index-caching",
             doc="(-DBIC)  If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified.  Caching is automatically disabled if there are no intervals specified.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableReadFilter",
             input_type=Boolean(optional=True),
             prefix="--disable-read-filter",
             doc="(-DF)  Read filters to be disabled before analysis  This argument may be specified 0 or more times. Default value: null. Possible Values: {GoodCigarReadFilter, MappedReadFilter, MappingQualityAvailableReadFilter, MappingQualityNotZeroReadFilter, MappingQualityReadFilter, NonChimericOriginalAlignmentReadFilter, NonZeroReferenceLengthAlignmentReadFilter, NotDuplicateReadFilter, NotSecondaryAlignmentReadFilter, PassesVendorQualityCheckReadFilter, ReadLengthReadFilter, WellformedReadFilter}",
         ),
         ToolInput(
             tag="disableSequenceDictionaryValidation",
             input_type=Boolean(optional=True),
             prefix="-disable-sequence-dictionary-validation",
             doc="(--disable-sequence-dictionary-validation)  If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="downsamplingStride",
             input_type=Int(optional=True),
             prefix="--downsampling-stride",
             doc="(-stride)  Downsample a pool of reads starting within a range of one or more bases.  Default value: 1. ",
         ),
         ToolInput(
             tag="excludeIntervals",
             input_type=Boolean(optional=True),
             prefix="--exclude-intervals",
             doc="(-XLOne) This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="f1r2MaxDepth",
             input_type=Int(optional=True),
             prefix="--f1r2-max-depth",
             doc="sites with depth higher than this value will be grouped Default value: 200.",
         ),
         ToolInput(
             tag="f1r2MedianMq",
             input_type=Int(optional=True),
             prefix="--f1r2-median-mq",
             doc="skip sites with median mapping quality below this value Default value: 50.",
         ),
         ToolInput(
             tag="f1r2MinBq",
             input_type=Int(optional=True),
             prefix="--f1r2-min-bq",
             doc="exclude bases below this quality from pileup Default value: 20.",
         ),
         ToolInput(
             tag="f1r2TarGz_outputFilename",
             input_type=Filename(extension=".tar.gz"),
             prefix="--f1r2-tar-gz",
             doc="If specified, collect F1R2 counts and output files into this tar.gz file Default value: null. ",
         ),
         ToolInput(
             tag="founderId",
             input_type=String(optional=True),
             prefix="-founder-id",
             doc="(--founder-id)  Samples representing the population founders This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="gatkConfigFile",
             input_type=String(optional=True),
             prefix="--gatk-config-file",
             doc="A configuration file to use with the GATK. Default value: null.",
         ),
         ToolInput(
             tag="gcsRetries",
             input_type=Int(optional=True),
             prefix="-gcs-retries",
             doc="(--gcs-max-retries)  If the GCS bucket channel errors out, how many times it will attempt to re-initiate the connection  Default value: 20. ",
         ),
         ToolInput(
             tag="gcsProjectForRequesterPays",
             input_type=String(optional=True),
             prefix="--gcs-project-for-requester-pays",
             doc=" Project to bill when accessing requester pays buckets. If unset, these buckets cannot be accessed.  Default value: . ",
         ),
         ToolInput(
             tag="genotypeGermlineSites",
             input_type=Boolean(optional=True),
             prefix="--genotype-germline-sites",
             doc=" (EXPERIMENTAL) Call all apparent germline site even though they will ultimately be filtered.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="genotypePonSites",
             input_type=Boolean(optional=True),
             prefix="--genotype-pon-sites",
             doc="Call sites in the PoN even though they will ultimately be filtered. Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="germlineResource",
             input_type=VcfTabix(optional=True),
             prefix="--germline-resource",
             doc=" Population vcf of germline sequencing containing allele fractions.  Default value: null. ",
         ),
         ToolInput(
             tag="graph",
             input_type=String(optional=True),
             prefix="-graph",
             doc="(--graph-output) Write debug assembly graph information to this file Default value: null.",
         ),
         ToolInput(
             tag="help",
             input_type=Boolean(optional=True),
             prefix="-h",
             doc="(--help) display the help message Default value: false. Possible values: {true, false}",
         ),
         ToolInput(
             tag="ignoreItrArtifacts",
             input_type=String(optional=True),
             prefix="--ignore-itr-artifactsTurn",
             doc=" inverted tandem repeats.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="initialTumorLod",
             input_type=String(optional=True),
             prefix="--initial-tumor-lod",
             doc="(-init-lod)  Log 10 odds threshold to consider pileup active.  Default value: 2.0. ",
         ),
         ToolInput(
             tag="intervalExclusionPadding",
             input_type=String(optional=True),
             prefix="--interval-exclusion-padding",
             doc="(-ixp)  Amount of padding (in bp) to add to each interval you are excluding.  Default value: 0. ",
         ),
         ToolInput(
             tag="imr",
             input_type=String(optional=True),
             prefix="--interval-merging-rule",
             doc="(--interval-merging-rule)  Interval merging rule for abutting intervals  Default value: ALL. Possible values: {ALL, OVERLAPPING_ONLY} ",
         ),
         ToolInput(
             tag="ip",
             input_type=String(optional=True),
             prefix="-ipAmount",
             doc="(--interval-padding) Default value: 0.",
         ),
         ToolInput(
             tag="isr",
             input_type=String(optional=True),
             prefix="--interval-set-rule",
             doc="(--interval-set-rule)  Set merging approach to use for combining interval inputs  Default value: UNION. Possible values: {UNION, INTERSECTION} ",
         ),
         ToolInput(
             tag="intervals",
             input_type=Bed(optional=True),
             prefix="--intervals",
             doc="(-L) One or more genomic intervals over which to operate This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="le",
             input_type=Boolean(optional=True),
             prefix="-LE",
             doc="(--lenient) Lenient processing of VCF files Default value: false. Possible values: {true, false}",
         ),
         ToolInput(
             tag="maxPopulationAf",
             input_type=String(optional=True),
             prefix="--max-population-af",
             doc="(-max-af)  Maximum population allele frequency in tumor-only mode.  Default value: 0.01. ",
         ),
         ToolInput(
             tag="maxReadsPerAlignmentStart",
             input_type=Int(optional=True),
             prefix="--max-reads-per-alignment-start",
             doc=" Maximum number of reads to retain per alignment start position. Reads above this threshold will be downsampled. Set to 0 to disable.  Default value: 50. ",
         ),
         ToolInput(
             tag="minBaseQualityScore",
             input_type=String(optional=True),
             prefix="--min-base-quality-score",
             doc="(-mbq:Byte)  Minimum base quality required to consider a base for calling  Default value: 10. ",
         ),
         ToolInput(
             tag="mitochondriaMode",
             input_type=Boolean(optional=True),
             prefix="--mitochondria-mode",
             doc="Mitochondria mode sets emission and initial LODs to 0. Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="nativePairHmmThreads",
             input_type=Int(optional=True),
             prefix="--native-pair-hmm-threads",
             default=CpuSelector(),
             doc=" How many threads should a native pairHMM implementation use  Default value: 4. ",
         ),
         ToolInput(
             tag="nativePairHmmUseDoublePrecision",
             input_type=Boolean(optional=True),
             prefix="--native-pair-hmm-use-double-precision",
             doc=" use double precision in the native pairHmm. This is slower but matches the java implementation better  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="normalLod",
             input_type=Double(optional=True),
             prefix="--normal-lod",
             doc="Log 10 odds threshold for calling normal variant non-germline. Default value: 2.2.",
         ),
         ToolInput(
             tag="encode",
             input_type=String(optional=True),
             prefix="-encode",
             doc="This argument may be specified 0 or more times. Default value: null.",
         ),
         ToolInput(
             tag="panelOfNormals",
             input_type=VcfTabix(optional=True),
             prefix="--panel-of-normals",
             doc="(--panel-of-normals)  VCF file of sites observed in normal.  Default value: null. ",
         ),
         ToolInput(
             tag="pcrIndelQual",
             input_type=Int(optional=True),
             prefix="--pcr-indel-qual",
             doc="Phred-scaled PCR SNV qual for overlapping fragments Default value: 40.",
         ),
         ToolInput(
             tag="pcrSnvQual",
             input_type=Int(optional=True),
             prefix="--pcr-snv-qual",
             doc="Phred-scaled PCR SNV qual for overlapping fragments Default value: 40.",
         ),
         ToolInput(
             tag="pedigree",
             input_type=String(optional=True),
             prefix="--pedigree",
             doc="(-ped) Pedigree file for determining the population founders. Default value: null.",
         ),
         ToolInput(
             tag="quiet",
             input_type=Boolean(optional=True),
             prefix="--QUIET",
             doc="Whether to suppress job-summary info on System.err. Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="readFilter",
             input_type=String(optional=True),
             prefix="--read-filter",
             doc="(-RF) Read filters to be applied before analysis This argument may be specified 0 or more times. Default value: null. Possible Values: {AlignmentAgreesWithHeaderReadFilter, AllowAllReadsReadFilter, AmbiguousBaseReadFilter, CigarContainsNoNOperator, FirstOfPairReadFilter, FragmentLengthReadFilter, GoodCigarReadFilter, HasReadGroupReadFilter, IntervalOverlapReadFilter, LibraryReadFilter, MappedReadFilter, MappingQualityAvailableReadFilter, MappingQualityNotZeroReadFilter, MappingQualityReadFilter, MatchingBasesAndQualsReadFilter, MateDifferentStrandReadFilter, MateOnSameContigOrNoMappedMateReadFilter, MateUnmappedAndUnmappedReadFilter, MetricsReadFilter, NonChimericOriginalAlignmentReadFilter, NonZeroFragmentLengthReadFilter, NonZeroReferenceLengthAlignmentReadFilter, NotDuplicateReadFilter, NotOpticalDuplicateReadFilter, NotSecondaryAlignmentReadFilter, NotSupplementaryAlignmentReadFilter, OverclippedReadFilter, PairedReadFilter, PassesVendorQualityCheckReadFilter, PlatformReadFilter, PlatformUnitReadFilter, PrimaryLineReadFilter, ProperlyPairedReadFilter, ReadGroupBlackListReadFilter, ReadGroupReadFilter, ReadLengthEqualsCigarLengthReadFilter, ReadLengthReadFilter, ReadNameReadFilter, ReadStrandFilter, SampleReadFilter, SecondOfPairReadFilter, SeqIsStoredReadFilter, ValidAlignmentEndReadFilter, ValidAlignmentStartReadFilter, WellformedReadFilter}",
         ),
         ToolInput(
             tag="readIndex",
             input_type=String(optional=True),
             prefix="-read-index",
             doc="(--read-index)  Indices to use for the read inputs. If specified, an index must be provided for every read input and in the same order as the read inputs. If this argument is not specified, the path to the index for each input will be inferred automatically.  This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="readValidationStringency",
             input_type=String(optional=True),
             prefix="--read-validation-stringency",
             doc="(-VS:ValidationStringency)  Validation stringency for all SAM/BAM/CRAM/SRA files read by this program.  The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.  Default value: SILENT. Possible values: {STRICT, LENIENT, SILENT} ",
         ),
         ToolInput(
             tag="secondsBetweenProgressUpdates",
             input_type=Double(optional=True),
             prefix="-seconds-between-progress-updates",
             doc="(--seconds-between-progress-updates)  Output traversal statistics every time this many seconds elapse  Default value: 10.0. ",
         ),
         ToolInput(
             tag="sequenceDictionary",
             input_type=String(optional=True),
             prefix="-sequence-dictionary",
             doc="(--sequence-dictionary)  Use the given sequence dictionary as the master/canonical sequence dictionary.  Must be a .dict file.  Default value: null. ",
         ),
         ToolInput(
             tag="sitesOnlyVcfOutput",
             input_type=Boolean(optional=True),
             prefix="--sites-only-vcf-output",
             doc=" If true, don't emit genotype fields when writing vcf file output.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="tmpDir",
             input_type=String(optional=True),
             prefix="--tmp-dir",
             doc="Temp directory to use. Default value: null.",
         ),
         ToolInput(
             tag="tumorLodToEmit",
             input_type=String(optional=True),
             prefix="--tumor-lod-to-emit",
             doc="(-emit-lod)  Log 10 odds threshold to emit variant to VCF.  Default value: 3.0. ",
         ),
         ToolInput(
             tag="tumor",
             input_type=String(optional=True),
             prefix="-tumor",
             doc="(--tumor-sample) BAM sample name of tumor. May be URL-encoded as output by GetSampleName with -encode argument.  Default value: null. ",
         ),
         ToolInput(
             tag="jdkDeflater",
             input_type=Boolean(optional=True),
             prefix="-jdk-deflater",
             doc="(--use-jdk-deflater)  Whether to use the JdkDeflater (as opposed to IntelDeflater)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="jdkInflater",
             input_type=Boolean(optional=True),
             prefix="-jdk-inflater",
             doc="(--use-jdk-inflater)  Whether to use the JdkInflater (as opposed to IntelInflater)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="verbosity",
             input_type=String(optional=True),
             prefix="-verbosity",
             doc="(--verbosity)  Control verbosity of logging.  Default value: INFO. Possible values: {ERROR, WARNING, INFO, DEBUG} ",
         ),
         ToolInput(
             tag="version",
             input_type=Boolean(optional=True),
             prefix="--version",
             doc="display the version number for this tool Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="activeProbabilityThreshold",
             input_type=Double(optional=True),
             prefix="--active-probability-threshold",
             doc=" Minimum probability for a locus to be considered active.  Default value: 0.002. ",
         ),
         ToolInput(
             tag="adaptivePruningInitialErrorRate",
             input_type=Double(optional=True),
             prefix="--adaptive-pruning-initial-error-rate",
             doc=" Initial base error rate estimate for adaptive pruning  Default value: 0.001. ",
         ),
         ToolInput(
             tag="allowNonUniqueKmersInRef",
             input_type=Boolean(optional=True),
             prefix="--allow-non-unique-kmers-in-ref",
             doc=" Allow graphs that have non-unique kmers in the reference  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="assemblyRegionPadding",
             input_type=Int(optional=True),
             prefix="--assembly-region-padding",
             doc=" Number of additional bases of context to include around each assembly region  Default value: 100. ",
         ),
         ToolInput(
             tag="bamWriterType",
             input_type=String(optional=True),
             prefix="--bam-writer-type",
             doc="Which haplotypes should be written to the BAM Default value: CALLED_HAPLOTYPES. Possible values: {ALL_POSSIBLE_HAPLOTYPES, CALLED_HAPLOTYPES} ",
         ),
         ToolInput(
             tag="debugAssembly",
             input_type=String(optional=True),
             prefix="--debug-assembly",
             doc="(-debug)  Print out verbose debug information about each assembly region  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableAdaptivePruning",
             input_type=Boolean(optional=True),
             prefix="--disable-adaptive-pruning",
             doc=" Disable the adaptive algorithm for pruning paths in the graph  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableToolDefaultAnnotations",
             input_type=Boolean(optional=True),
             prefix="-disable-tool-default-annotations",
             doc="(--disable-tool-default-annotations)  Disable all tool default annotations  Default value: false. Possible values: {true, false}",
         ),
         ToolInput(
             tag="disableToolDefaultReadFilters",
             input_type=Boolean(optional=True),
             prefix="-disable-tool-default-read-filters",
             doc="(--disable-tool-default-read-filters)  Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="dontIncreaseKmerSizesForCycles",
             input_type=Boolean(optional=True),
             prefix="--dont-increase-kmer-sizes-for-cycles",
             doc=" Disable iterating over kmer sizes when graph cycles are detected  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="dontTrimActiveRegions",
             input_type=Boolean(optional=True),
             prefix="--dont-trim-active-regions",
             doc=" If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="dontUseSoftClippedBases",
             input_type=Boolean(optional=True),
             prefix="--dont-use-soft-clipped-bases",
             doc=" Do not analyze soft clipped bases in the reads  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="erc",
             input_type=String(optional=True),
             prefix="-ERC",
             doc="(--emit-ref-confidence)  (BETA feature) Mode for emitting reference confidence scores  Default value: NONE. Possible values: {NONE, BP_RESOLUTION, GVCF} ",
         ),
         ToolInput(
             tag="enableAllAnnotations",
             input_type=Boolean(optional=True),
             prefix="--enable-all-annotations",
             doc=" Use all possible annotations (not for the faint of heart)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="forceActive",
             input_type=Boolean(optional=True),
             prefix="--force-active",
             doc="If provided, all regions will be marked as active Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="genotypeFilteredAlleles",
             input_type=Boolean(optional=True),
             prefix="--genotype-filtered-alleles",
             doc=" Whether to force genotype even filtered alleles  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="gvcfLodBand",
             input_type=String(optional=True),
             prefix="--gvcf-lod-band",
             doc="(-LODB) Exclusive upper bounds for reference confidence LOD bands (must be specified in increasing order)  This argument may be specified 0 or more times. Default value: [-2.5, -2.0, -1.5,",
         ),
         ToolInput(
             tag="kmerSize",
             input_type=Int(optional=True),
             prefix="--kmer-size",
             doc="Kmer size to use in the read threading assembler This argument may be specified 0 or more times. Default value: [10, 25]. ",
         ),
         ToolInput(
             tag="maxAssemblyRegionSize",
             input_type=Int(optional=True),
             prefix="--max-assembly-region-size",
             doc=" Maximum size of an assembly region  Default value: 300. ",
         ),
         ToolInput(
             tag="mnpDist",
             input_type=Int(optional=True),
             prefix="-mnp-dist",
             doc="(--max-mnp-distance)  Two or more phased substitutions separated by this distance or less are merged into MNPs.  Default value: 1. ",
         ),
         ToolInput(
             tag="maxNumHaplotypesInPopulation",
             input_type=Int(optional=True),
             prefix="--max-num-haplotypes-in-population",
             doc=" Maximum number of haplotypes to consider for your population  Default value: 128. ",
         ),
         ToolInput(
             tag="maxProbPropagationDistance",
             input_type=Int(optional=True),
             prefix="--max-prob-propagation-distance",
             doc=" Upper limit on how many bases away probability mass can be moved around when calculating the boundaries between active and inactive assembly regions  Default value: 50. ",
         ),
         ToolInput(
             tag="maxSuspiciousReadsPerAlignmentStart",
             input_type=Int(optional=True),
             prefix="--max-suspicious-reads-per-alignment-start",
             doc=" Maximum number of suspicious reads (mediocre mapping quality or too many substitutions) allowed in a downsampling stride.  Set to 0 to disable.  Default value: 0. ",
         ),
         ToolInput(
             tag="maxUnprunedVariants",
             input_type=Int(optional=True),
             prefix="--max-unpruned-variants",
             doc=" Maximum number of variants in graph the adaptive pruner will allow  Default value: 100. ",
         ),
         ToolInput(
             tag="minAssemblyRegionSize",
             input_type=Int(optional=True),
             prefix="--min-assembly-region-size",
             doc=" Minimum size of an assembly region  Default value: 50. ",
         ),
         ToolInput(
             tag="minDanglingBranchLength",
             input_type=Int(optional=True),
             prefix="--min-dangling-branch-length",
             doc=" Minimum length of a dangling branch to attempt recovery  Default value: 4. ",
         ),
         ToolInput(
             tag="minPruning",
             input_type=Int(optional=True),
             prefix="--min-pruning",
             doc="Minimum support to not prune paths in the graph Default value: 2.",
         ),
         ToolInput(
             tag="minimumAlleleFraction",
             input_type=Float(optional=True),
             prefix="--minimum-allele-fraction",
             doc="(-min-AF)  Lower bound of variant allele fractions to consider when calculating variant LOD  Default value: 0.0. ",
         ),
         ToolInput(
             tag="numPruningSamples",
             input_type=Int(optional=True),
             prefix="--num-pruning-samples",
             doc="Default value: 1.",
         ),
         ToolInput(
             tag="pairHmmGapContinuationPenalty",
             input_type=Int(optional=True),
             prefix="--pair-hmm-gap-continuation-penalty",
             doc=" Flat gap continuation penalty for use in the Pair HMM  Default value: 10. ",
         ),
         ToolInput(
             tag="pairhmm",
             input_type=String(optional=True),
             prefix="-pairHMM",
             doc="(--pair-hmm-implementation)  The PairHMM implementation to use for genotype likelihood calculations  Default value: FASTEST_AVAILABLE. Possible values: {EXACT, ORIGINAL, LOGLESS_CACHING, AVX_LOGLESS_CACHING, AVX_LOGLESS_CACHING_OMP, EXPERIMENTAL_FPGA_LOGLESS_CACHING, FASTEST_AVAILABLE} ",
         ),
         ToolInput(
             tag="pcrIndelModel",
             input_type=String(optional=True),
             prefix="--pcr-indel-model",
             doc=" The PCR indel model to use  Default value: CONSERVATIVE. Possible values: {NONE, HOSTILE, AGGRESSIVE, CONSERVATIVE} ",
         ),
         ToolInput(
             tag="phredScaledGlobalReadMismappingRate",
             input_type=Int(optional=True),
             prefix="--phred-scaled-global-read-mismapping-rate",
             doc=" The global assumed mismapping rate for reads  Default value: 45. ",
         ),
         ToolInput(
             tag="pruningLodThreshold",
             input_type=Float(optional=True),
             prefix="--pruning-lod-thresholdLn",
             doc="Default value: 2.302585092994046. ",
         ),
         ToolInput(
             tag="recoverAllDanglingBranches",
             input_type=Boolean(optional=True),
             prefix="--recover-all-dangling-branches",
             doc=" Recover all dangling branches  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="showhidden",
             input_type=Boolean(optional=True),
             prefix="-showHidden",
             doc="(--showHidden)  display hidden arguments  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="smithWaterman",
             input_type=String(optional=True),
             prefix="--smith-waterman",
             doc=" Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice  Default value: JAVA. Possible values: {FASTEST_AVAILABLE, AVX_ENABLED, JAVA} ",
         ),
         ToolInput(
             tag="ambigFilterBases",
             input_type=Int(optional=True),
             prefix="--ambig-filter-bases",
             doc="Threshold number of ambiguous bases. If null, uses threshold fraction; otherwise, overrides threshold fraction.  Default value: null.  Cannot be used in conjuction with argument(s) maxAmbiguousBaseFraction",
         ),
         ToolInput(
             tag="ambigFilterFrac",
             input_type=Double(optional=True),
             prefix="--ambig-filter-frac",
             doc="Threshold fraction of ambiguous bases Default value: 0.05. Cannot be used in conjuction with argument(s) maxAmbiguousBases",
         ),
         ToolInput(
             tag="maxFragmentLength",
             input_type=Int(optional=True),
             prefix="--max-fragment-length",
             doc="Default value: 1000000.",
         ),
         ToolInput(
             tag="minFragmentLength",
             input_type=Int(optional=True),
             prefix="--min-fragment-length",
             doc="Default value: 0.",
         ),
         ToolInput(
             tag="keepIntervals",
             input_type=String(optional=True),
             prefix="--keep-intervals",
             doc="One or more genomic intervals to keep This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="library",
             input_type=String(optional=True),
             prefix="-library",
             doc="(--library) Name of the library to keep This argument must be specified at least once. Required.",
         ),
         ToolInput(
             tag="maximumMappingQuality",
             input_type=Int(optional=True),
             prefix="--maximum-mapping-quality",
             doc=" Maximum mapping quality to keep (inclusive)  Default value: null. ",
         ),
         ToolInput(
             tag="minimumMappingQuality",
             input_type=Int(optional=True),
             prefix="--minimum-mapping-quality",
             doc=" Minimum mapping quality to keep (inclusive)  Default value: 20. ",
         ),
         ToolInput(
             tag="dontRequireSoftClipsBothEnds",
             input_type=Boolean(optional=True),
             prefix="--dont-require-soft-clips-both-ends",
             doc=" Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="filterTooShort",
             input_type=Int(optional=True),
             prefix="--filter-too-short",
             doc="Minimum number of aligned bases Default value: 30.",
         ),
         ToolInput(
             tag="platformFilterName",
             input_type=String(optional=True),
             prefix="--platform-filter-name",
             doc="This argument must be specified at least once. Required.",
         ),
         ToolInput(
             tag="blackListedLanes",
             input_type=String(optional=True),
             prefix="--black-listed-lanes",
             doc="Platform unit (PU) to filter out This argument must be specified at least once. Required.",
         ),
         ToolInput(
             tag="readGroupBlackList",
             input_type=String(optional=True),
             prefix="--read-group-black-listThe",
             doc="This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="keepReadGroup",
             input_type=String(optional=True),
             prefix="--keep-read-group",
             doc="The name of the read group to keep Required.",
         ),
         ToolInput(
             tag="maxReadLength",
             input_type=Int(optional=True),
             prefix="--max-read-length",
             doc="Keep only reads with length at most equal to the specified value Default value: 2147483647. ",
         ),
         ToolInput(
             tag="minReadLength",
             input_type=Int(optional=True),
             prefix="--min-read-length",
             doc="Keep only reads with length at least equal to the specified value Default value: 30.",
         ),
         ToolInput(
             tag="readName",
             input_type=String(optional=True),
             prefix="--read-name",
             doc="Keep only reads with this read name Required.",
         ),
         ToolInput(
             tag="keepReverseStrandOnly",
             input_type=Boolean(optional=True),
             prefix="--keep-reverse-strand-only",
             doc=" Keep only reads on the reverse strand  Required. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="sample",
             input_type=String(optional=True),
             prefix="-sample",
             doc="(--sample) The name of the sample(s) to keep, filtering out all others This argument must be specified at least once. Required. ",
         ),
     ]
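
Most of the HaplotypeCaller arguments above are optional, prefixed inputs: they only reach the command line when a value is supplied, and Boolean inputs are emitted as bare flags. As a rough, self-contained illustration of that behaviour (plain Python, not the library's actual rendering code; PREFIXES and render_args are names invented for this sketch):

from typing import Any, Dict, List, Optional

# Hypothetical sketch: a few of the optional HaplotypeCaller inputs above,
# mapped tag -> CLI prefix, rendered only when a value was actually provided.
PREFIXES: Dict[str, str] = {
    "assemblyRegionPadding": "--assembly-region-padding",
    "maxAssemblyRegionSize": "--max-assembly-region-size",
    "dontUseSoftClippedBases": "--dont-use-soft-clipped-bases",
    "pairhmm": "-pairHMM",
}

def render_args(values: Dict[str, Optional[Any]]) -> List[str]:
    """Turn {tag: value} into a flat argument list.

    Booleans are treated as flags (prefix emitted only when True); anything
    else is emitted as 'prefix value'; None means 'not provided', so nothing
    is emitted for it.
    """
    args: List[str] = []
    for tag, value in values.items():
        if value is None:
            continue
        prefix = PREFIXES[tag]
        if isinstance(value, bool):
            if value:
                args.append(prefix)
        else:
            args.extend([prefix, str(value)])
    return args

print(render_args({
    "assemblyRegionPadding": 100,
    "dontUseSoftClippedBases": True,
    "pairhmm": "LOGLESS_CACHING",
    "maxAssemblyRegionSize": None,
}))
# ['--assembly-region-padding', '100', '--dont-use-soft-clipped-bases',
#  '-pairHMM', 'LOGLESS_CACHING']
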
Example #22
 def inputs(self) -> List[j.ToolInput]:
     return [
         j.ToolInput("truthVCF", Vcf(), position=1),
         j.ToolInput("compareVCF", Vcf(), position=2),
         j.ToolInput(
             "reportPrefix",
             j.Filename(),
             prefix="--report-prefix",
             doc="(-o)  Filename prefix for report output.",
         ),
         j.ToolInput(
             "reference",
             FastaWithDict(),
             prefix="--reference",
             doc="(-r)  Specify a reference file.",
         ),
         j.ToolInput(
             "intervals",
             Bed(optional=True),
             prefix="--target-regions",
             doc=
             "(-T)  Restrict analysis to given (dense) regions (using -T in bcftools).",
         ),
         j.ToolInput(
             "version",
             j.Boolean(optional=True),
             prefix="--version",
             doc="(-v) Show version number and exit.",
         ),
         j.ToolInput(
             "scratchPrefix",
             j.String(optional=True),
             prefix="--scratch-prefix",
             doc="Directory for scratch files.",
         ),
         j.ToolInput(
             "keepScratch",
             j.String(optional=True),
             prefix="--keep-scratch",
             doc=
             "Filename prefix for scratch report output. Annotation format in input VCF file.",
         ),
         j.ToolInput(
             "falsePositives",
             Bed(optional=True),
             prefix="--false-positives",
             doc=
             "(-f)  False positive / confident call regions (.bed or .bed.gz). "
             "Calls outside these regions will be labelled as UNK.",
         ),
         j.ToolInput(
             "stratification",
             Tsv(optional=True),
             prefix="--stratification",
             doc=
             " Stratification file list (TSV format -- first column is region name, "
             "second column is file name).",
         ),
         j.ToolInput(
             "stratificationRegion",
             j.String(optional=True),
             prefix="--stratification-region",
             doc=
             "Add single stratification region, e.g. --stratification-region TEST:test.bed",
         ),
         j.ToolInput(
             "stratificationFixchr",
             j.String(optional=True),
             prefix="--stratification-fixchr",
             doc=" Add chr prefix to stratification files if necessary",
         ),
         j.ToolInput(
             "writeVcf",
             j.Boolean(optional=True),
             prefix="--write-vcf",
             doc="(-V) Write an annotated VCF.",
         ),
         j.ToolInput(
             "writeCounts",
             j.Boolean(optional=True),
             prefix="--write-counts",
             doc="(-X) Write advanced counts and metrics.",
         ),
         j.ToolInput(
             "noWriteCounts",
             j.Boolean(optional=True),
             prefix="--no-write-counts",
             doc="Do not write advanced counts and metrics.",
         ),
         j.ToolInput(
             "outputVtc",
             j.Boolean(optional=True),
             prefix="--output-vtc",
             doc=
             "Write VTC field in the final VCF which gives the counts each position has contributed to.",
         ),
         j.ToolInput(
             "preserveInfo",
             j.Boolean(optional=True),
             prefix="--preserve-info",
             doc=
             "When using XCMP, preserve and merge the INFO fields in truth and query. "
             "Useful for ROC computation.",
         ),
         j.ToolInput(
             "roc",
             j.String(optional=True),
             prefix="--roc",
             doc=
             "Select a feature to produce a ROC on (INFO feature, QUAL, GQX, ...).",
         ),
         j.ToolInput(
             "noRoc",
             j.Boolean(optional=True),
             prefix="--no-roc",
             doc=
             "Disable ROC computation and only output summary statistics for more concise output.",
         ),
         j.ToolInput(
             "rocRegions",
             j.String(optional=True),
             prefix="--roc-regions",
             doc=" Select a list of regions to compute ROCs in. By default, "
             "only the '*' region will produce ROC output (aggregate variant counts).",
         ),
         j.ToolInput(
             "rocFilter",
             j.String(optional=True),
             prefix="--roc-filter",
             doc=" Select a filter to ignore when making ROCs.",
         ),
         j.ToolInput(
             "rocDelta",
             j.Int(optional=True),
             prefix="--roc-delta",
             doc=" Minimum spacing between ROC QQ levels.",
         ),
         j.ToolInput(
             "ciAlpha",
             j.Int(optional=True),
             prefix="--ci-alpha",
             doc=
             "Confidence level for Jeffrey's CI for recall, precision and fraction of non-assessed calls.",
         ),
         j.ToolInput(
             "noJson",
             j.Boolean(optional=True),
             prefix="--no-json",
             doc="Disable JSON file output.",
         ),
         # j.ToolInput("location", Array(j.String(), optional=True), prefix="--location", separator=",",
         #           doc="(-l)  Comma-separated list of locations [use naming after preprocessing], "
         #               "when not specified will use whole VCF."),
         j.ToolInput(
             "passOnly",
             j.Boolean(optional=True),
             prefix="--pass-only",
             doc="Keep only PASS variants.",
         ),
         # j.ToolInput("filtersOnly", Array(j.String(), optional=True), prefix="--filters-only", separator=",",
         #           doc=" Specify a comma-separated list of filters to apply "
         #               "(by default all filters are ignored / passed on."),
         j.ToolInput(
             "restrictRegions",
             j.Boolean(optional=True),
             prefix="--restrict-regions",
             doc=
             "(-R)  Restrict analysis to given (sparse) regions (using -R in bcftools).",
         ),
         j.ToolInput(
             "leftshift",
             j.Boolean(optional=True),
             prefix="--leftshift",
             doc="(-L) Left-shift variants safely.",
         ),
         j.ToolInput(
             "noLeftshift",
             j.Boolean(optional=True),
             prefix="--no-leftshift",
             doc="Do not left-shift variants safely.",
         ),
         j.ToolInput(
             "decompose",
             j.Boolean(optional=True),
             prefix="--decompose",
             doc=
             "Decompose variants into primitives. This results in more granular counts.",
         ),
         j.ToolInput(
             "noDecompose",
             j.Boolean(optional=True),
             prefix="--no-decompose",
             doc="(-D) Do not decompose variants into primitives.",
         ),
         j.ToolInput(
             "bcftoolsNorm",
             j.Boolean(optional=True),
             prefix="--bcftools-norm",
             doc="Enable preprocessing through bcftools norm -c x -D "
             "(requires external preprocessing to be switched on).",
         ),
         j.ToolInput(
             "fixchr",
             j.Boolean(optional=True),
             prefix="--fixchr",
             doc=
             "Add chr prefix to VCF records where necessary (default: auto, attempt to match reference).",
         ),
         j.ToolInput(
             "noFixchr",
             j.Boolean(optional=True),
             prefix="--no-fixchr",
             doc=
             "Do not add chr prefix to VCF records (default: auto, attempt to match reference).",
         ),
         j.ToolInput(
             "bcf",
             j.Boolean(optional=True),
             prefix="--bcf",
             doc=
             "Use BCF internally. This is the default when the input file is in BCF format already. "
             "Using BCF can speed up temp file access, but may fail for VCF files that have broken "
             "headers or records that don't comply with the header.",
         ),
         j.ToolInput(
             "somatic",
             j.Boolean(optional=True),
             prefix="--somatic",
             doc=
             "Assume the input file is a somatic call file and squash all columns into one, "
             "putting all FORMATs into INFO + use half genotypes (see also --set-gt). "
             "This will replace all sample columns and replace them with a single one. "
             "This is used to treat Strelka somatic files Possible values for this parameter: "
             "half / hemi / het / hom / half to assign one of the following genotypes to the "
             "resulting sample: 1 | 0/1 | 1/1 | ./1. This will replace all sample columns and "
             "replace them with a single one.",
         ),
         j.ToolInput(
             "setGT",
             j.Boolean(optional=True),
             prefix="--set-gt",
             doc=
             "This is used to treat Strelka somatic files Possible values for this parameter: "
             "half / hemi / het / hom / half to assign one of the following genotypes to the resulting "
             "sample: 1 | 0/1 | 1/1 | ./1. "
             "This will replace all sample columns and replace them with a single one.",
         ),
         j.ToolInput(
             "gender",
             j.String(optional=True),
             prefix="--gender",
             doc=
             "({male,female,auto,none})  Specify gender. This determines how haploid calls on chrX "
             "get treated: for male samples, all non-ref calls (in the truthset only when "
             "running through hap.py) are given a 1/1 genotype.",
         ),
         j.ToolInput(
             "preprocessTruth",
             j.Boolean(optional=True),
             prefix="--preprocess-truth",
             doc="Preprocess truth file with same settings as query "
             "(default is to accept truth in original format).",
         ),
         j.ToolInput(
             "usefilteredTruth",
             j.Boolean(optional=True),
             prefix="--usefiltered-truth",
             doc="Use filtered variant calls in truth file "
             "(by default, only PASS calls in the truth file are used)",
         ),
         j.ToolInput(
             "preprocessingWindowSize",
             j.Int(optional=True),
             prefix="--preprocessing-window-size",
             doc=" Preprocessing window size (variants further apart than "
             "that size are not expected to interfere).",
         ),
         j.ToolInput(
             "adjustConfRegions",
             j.Boolean(optional=True),
             prefix="--adjust-conf-regions",
             doc=
             " Adjust confident regions to include variant locations. Note this will only include "
             "variants that are included in the CONF regions already when viewing with bcftools; "
             "this option only makes sure insertions are padded correctly in the CONF regions (to "
             "capture these, both the base before and after must be contained in the bed file).",
         ),
         j.ToolInput(
             "noAdjustConfRegions",
             j.Boolean(optional=True),
             prefix="--no-adjust-conf-regions",
             doc=" Do not adjust confident regions for insertions.",
         ),
         j.ToolInput(
             "noHaplotypeComparison",
             j.Boolean(optional=True),
             prefix="--no-haplotype-comparison",
             doc=
             "(--unhappy)  Disable haplotype comparison (only count direct GT matches as TP).",
         ),
         j.ToolInput(
             "windowSize",
             j.Int(optional=True),
             prefix="--window-size",
             doc=
             "(-w)  Minimum distance between variants such that they fall into the same superlocus.",
         ),
         j.ToolInput(
             "xcmpEnumerationThreshold",
             j.Int(optional=True),
             prefix="--xcmp-enumeration-threshold",
             doc=
             " Enumeration threshold / maximum number of sequences to enumerate per block.",
         ),
         j.ToolInput(
             "xcmpExpandHapblocks",
             j.String(optional=True),
             prefix="--xcmp-expand-hapblocks",
             doc=
             " Expand haplotype blocks by this many basepairs left and right.",
         ),
         j.ToolInput(
             "threads",
             j.Int(optional=True),
             prefix="--threads",
             default=j.CpuSelector(),
             doc="Number of threads to use. Comparison engine to use.",
         ),
         # j.ToolInput("engineVcfevalPath", j.String(optional=True), prefix="--engine-vcfeval-path",
         #           doc=" This parameter should give the path to the \"rtg\" executable. "
         #               "The default is /opt/hap.py/lib/python27/Haplo/../../../libexec/rtg- tools-install/rtg"),
         j.ToolInput(
             "engine",
             j.String(optional=True),
             prefix="--engine",
             doc=
             " {xcmp,vcfeval,scmp-somatic,scmp-distance} Comparison engine to use.",
         ),
         j.ToolInput(
             "engineVcfevalTemplate",
             j.String(optional=True),
             prefix="--engine-vcfeval-template",
             doc=
             " Vcfeval needs the reference sequence formatted in its own file format (SDF -- run rtg "
             "format -o ref.SDF ref.fa). You can specify this here to save time when running hap.py "
             "with vcfeval. If no SDF folder is specified, hap.py will create a temporary one.",
         ),
         j.ToolInput(
             "scmpDistance",
             j.Int(optional=True),
             prefix="--scmp-distance",
             doc=
             " For distance-based matching, this is the distance between variants to use.",
         ),
         j.ToolInput(
             "logfile",
             j.Filename(suffix="-log", extension=".txt"),
             prefix="--logfile",
             doc="Write logging information into file rather than to stderr",
         ),
         j.ToolInput(
             "verbose",
             j.Boolean(optional=True),
             prefix="--verbose",
             doc="Raise logging level from warning to info.",
         ),
         j.ToolInput(
             "quiet",
             j.Boolean(optional=True),
             prefix="--quiet",
             doc="Set logging level to output errors only.",
         ),
     ]
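
The wrapper above runs a truth-versus-query VCF comparison and reports per-type counts together with recall and precision (hap.py's summary output uses columns along the lines of TRUTH.TP, QUERY.FP and TRUTH.FN). The metrics themselves are just the standard definitions; a minimal sketch, independent of hap.py's own code:

def precision_recall_f1(tp: int, fp: int, fn: int) -> tuple:
    """Standard benchmarking metrics of the kind hap.py summarises.

    tp: truth variants matched by the query
    fp: query variants with no truth match
    fn: truth variants missed by the query
    """
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    return precision, recall, f1

# e.g. 950 matched calls, 30 false positives, 50 missed truth variants
print(precision_recall_f1(950, 30, 50))  # (~0.969, 0.950, ~0.960)
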
Example #23
 def inputs(self) -> List[ToolInput]:
     return [
         ToolInput("piscesVersion", String()),
         ToolInput(
             "inputBam",
             BamBai(),
             prefix="-b",
             position=4,
             shell_quote=False,
             doc="Input BAM file",
         ),
         ToolInput(
             "referenceFolder",
             Directory(),
             prefix="--genomefolders",
             position=5,
             shell_quote=False,
             doc="Folder containing reference genome files",
         ),
         ToolInput(
             "outputDir",
             String(),
             prefix="--outfolder",
             position=4,
             shell_quote=False,
             doc="Output directory",
         ),
         ToolInput(
             "intervalBedFile",
             Bed(optional=True),
             prefix="-i",
             position=5,
             shell_quote=False,
             doc="Bed File denoting regions to call variants.",
         ),
         ToolInput(
             "minimumBaseQuality",
             Int(optional=True),
             prefix="--minbq",
             position=5,
             shell_quote=False,
             default=20,
             doc="Minimum base call quality to use base in read. (Default: 20)",
         ),
         ToolInput(
             "callMNVs",
             String(optional=True),
             prefix="--callmnvs",
             position=5,
             shell_quote=False,
             doc="Call Multi Nucleotide Variants (aka Phased SNPs). (Default: false)",
         ),
         ToolInput(
             "outputSBFiles",
             String(optional=True),
             prefix="--outputsbfiles",
             position=5,
             shell_quote=False,
             doc="Boolean Flag to output strand bias files. (Default: false)",
         ),
         *self.pisces_additional_args,
     ]
    def constructor(self):

        # Inputs
        self.input("sample_name", String)
        self.input("fastqs", Array(FastqGzPair))
        self.input("seqrun", String, doc="SeqRun Name (for Vcf2Tsv)")
        self.input("reference", FastaWithDict)
        self.input("region_bed", Bed)
        self.input("region_bed_extended", Bed)
        self.input("region_bed_annotated", Bed)
        self.input("genecoverage_bed", Bed)
        self.input("genome_file", TextFile)
        self.input("panel_name", String)
        self.input("vcfcols", TextFile)
        self.input("black_list", Bed(optional=True))
        self.input("snps_dbsnp", VcfTabix)
        self.input("snps_1000gp", VcfTabix)
        self.input("known_indels", VcfTabix)
        self.input("mills_indels", VcfTabix)
        self.input("mutalyzer_server", String)
        self.input("pathos_db", String)
        self.input("maxRecordsInRam", Int)
        # tumor only
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))

        # fastqc
        self.step(
            "fastqc", FastQC_0_11_5(reads=self.fastqs, threads=4), scatter="reads"
        )
        # get the overrepresentative sequence from fastqc
        self.step(
            "getfastqc_adapters",
            ParseFastqcAdaptors(fastqc_datafiles=self.fastqc.datafile,),
            scatter="fastqc_datafiles",
        )
        # align and generate sorted index bam
        self.step(
            "align_and_sort",
            BwaAligner(
                fastq=self.fastqs,
                reference=self.reference,
                sample_name=self.sample_name,
                sortsam_tmpDir=".",
                cutadapt_adapter=self.getfastqc_adapters,
                cutadapt_removeMiddle3Adapter=self.getfastqc_adapters,
            ),
            scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
        )
        # merge into one bam and markdups
        self.step(
            "merge_and_mark",
            MergeAndMarkBams_4_1_3(
                bams=self.align_and_sort.out,
                sampleName=self.sample_name,
                maxRecordsInRam=self.maxRecordsInRam,
            ),
        )
        # performance: doc
        self.step(
            "annotate_doc",
            AnnotateDepthOfCoverage_0_1_0(
                bam=self.merge_and_mark.out,
                bed=self.region_bed_annotated,
                reference=self.reference,
                sample_name=self.sample_name,
            ),
        )

        # performance
        self.step(
            "performance_summary",
            PerformanceSummaryTargeted_0_1_0(
                bam=self.merge_and_mark.out,
                region_bed=self.region_bed,
                genecoverage_bed=self.genecoverage_bed,
                sample_name=self.sample_name,
                genome_file=self.genome_file,
            ),
        )
        # gridss
        self.step(
            "gridss",
            Gridss_2_6_2(
                bams=self.merge_and_mark.out,
                reference=self.reference,
                blacklist=self.black_list,
                tmpdir=".",
            ),
        )
        # post gridss r for tumor only + tumor only mode
        # self.step("gridss_post_r", GRIDSSProcessOutput(inp=self.gridss.out))
        # gatk bqsr bam
        self.step(
            "bqsr",
            GATKBaseRecalBQSRWorkflow_4_1_3(
                bam=self.merge_and_mark.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                snps_dbsnp=self.snps_dbsnp,
                snps_1000gp=self.snps_1000gp,
                known_indels=self.known_indels,
                mills_indels=self.mills_indels,
            ),
        )
        # mutect2
        self.step(
            "mutect2",
            GatkSomaticVariantCallerTumorOnlyTargeted(
                bam=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                gnomad=self.gnomad,
                panel_of_normals=self.panel_of_normals,
            ),
        )
        # haplotypecaller to do: take base recal away from the
        self.step(
            "haplotype_caller",
            Gatk4HaplotypeCaller_4_1_3(
                inputRead=self.bqsr.out,
                intervals=self.region_bed_extended,
                reference=self.reference,
                dbsnp=self.snps_dbsnp,
                pairHmmImplementation="LOGLESS_CACHING",
            ),
        )
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedVcf=self.haplotype_caller.out, reference=self.reference
            ),
        )
        # combine variants
        self.step(
            "combinevariants",
            CombineVariants_0_0_8(
                vcfs=[self.splitnormalisevcf.out, self.mutect2.out],
                type="germline",
                columns=["AD", "DP", "AF", "GT"],
            ),
        )
        self.step("compressvcf", BGZip_1_9(file=self.combinevariants.out))
        self.step("sortvcf", BcfToolsSort_1_9(vcf=self.compressvcf.out))
        self.step("uncompressvcf", UncompressArchive(file=self.sortvcf.out))
        # addbamstats
        self.step(
            "addbamstats",
            AddBamStatsGermline_0_1_0(
                bam=self.merge_and_mark.out,
                vcf=self.uncompressvcf.out,
                reference=self.reference,
            ),
        )
        # Molpath specific processes
        self.step("compressvcf2", BGZip_1_9(file=self.addbamstats.out))
        self.step("tabixvcf", TabixLatest(inp=self.compressvcf2.out))
        self.step(
            "calculate_variant_length",
            VcfLength_1_0_1(vcf=self.tabixvcf.out),
            doc="Add the length column for the output of AddBamStats",
        )

        filter_for_variants = self.input("filter_for_vcfs", str, default="length > 150")
        self.step(
            "filter_variants_1_failed",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out, info_filter=filter_for_variants
            ),
        )
        self.step(
            "filter_variants_1",
            VcfFilter_1_0_1(
                vcf=self.calculate_variant_length.out,
                info_filter=filter_for_variants,
                invert=True,  # -v param
            ),
        )

        # Jiaan: copy over from the FRCP, can take the block comment out
        # # This one is the in-house molpath step
        # self.step(
        #     "normalise_vcfs",
        #     NormaliseVcf_1_5_4(
        #         pathos_version=self.pathos_db,
        #         mutalyzer=self.mutalyzer_server,  # mutalyzer="https://vmpr-res-mutalyzer1.unix.petermac.org.au",
        #         rdb=self.pathos_db,  # rdb="pa_uat",
        #         inp=self.filter_variants_1.out,
        #     ),
        # )

        # # repeat remove 150bp variants (workaround for normalise_vcf bug)
        # self.step(
        #     "filter_variants_2_failed",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out, info_filter=filter_for_variants
        #     ),
        # )
        # self.step(
        #     "filter_variants_2",
        #     VcfFilter_1_0_1(
        #         vcf=self.normalise_vcfs.out,
        #         info_filter=filter_for_variants,
        #         invert=True,  # -v param
        #     ),
        # )

        # self.step(
        #     "convert_to_tsv",
        #     Vcf2Tsv_1_5_4(
        #         pathos_version=self.pathos_db,
        #         inp=self.filter_variants_2.out,
        #         sample=self.sample_name,
        #         columns=self.vcfcols,
        #         seqrun=self.seqrun,
        #     ),
        # )

        # self.step(
        #     "index_with_igvtools", IgvIndexFeature_2_5_3(inp=self.filter_variants_2.out)
        # )

        # output
        self.output("fastq_qc", source=self.fastqc.out, output_folder="QC")

        self.output("markdups_bam", source=self.merge_and_mark.out, output_folder="BAM")

        self.output(
            "doc_out", source=self.annotate_doc.out, output_folder="PERFORMANCE"
        )
        self.output(
            "summary", source=self.performance_summary.out, output_folder="PERFORMANCE"
        )
        self.output(
            "gene_summary",
            source=self.performance_summary.geneFileOut,
            output_folder="PERFORMANCE",
        )
        self.output(
            "region_summary",
            source=self.performance_summary.regionFileOut,
            output_folder="PERFORMANCE",
        )

        self.output("gridss_vcf", source=self.gridss.out, output_folder="SV")
        self.output("gridss_bam", source=self.gridss.assembly, output_folder="SV")

        self.output(
            "haplotypecaller_vcf",
            source=self.haplotype_caller.out,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_bam",
            source=self.haplotype_caller.bam,
            output_folder="VCF",
        )
        self.output(
            "haplotypecaller_norm",
            source=self.splitnormalisevcf.out,
            output_folder="VCF",
        )
        self.output("mutect2_vcf", source=self.mutect2.variants, output_folder="VCF")
        self.output("mutect2_bam", source=self.mutect2.out_bam, output_folder="VCF")
        self.output("mutect2_norm", source=self.mutect2.out, output_folder="VCF")
        self.output("addbamstats_vcf", source=self.addbamstats.out)
Example #25
class BwaMem_SamToolsView(BioinformaticsTool):
    def tool(self) -> str:
        return "BwaMemSamtoolsView"

    def tool_provider(self):
        return "common"

    def version(self):
        return "0.7.17|1.9"

    def container(self):
        return "michaelfranklin/bwasamtools:0.7.17-1.9"

    def base_command(self):
        return None

    def arguments(self):
        return [
            ToolArgument("bwa", position=0, shell_quote=False),
            ToolArgument("mem", position=1, shell_quote=False),
            ToolArgument("|", position=5, shell_quote=False),
            ToolArgument("samtools", position=6, shell_quote=False),
            ToolArgument("view", position=7, shell_quote=False),
            ToolArgument(InputSelector("reference"),
                         prefix="-T",
                         position=8,
                         shell_quote=False),
            ToolArgument(
                CpuSelector(),
                position=8,
                shell_quote=False,
                prefix="--threads",
                doc="(-@)  Number of additional threads to use [0]",
            ),
            ToolArgument(
                "-h",
                position=8,
                shell_quote=False,
                doc="Include the header in the output.",
            ),
            ToolArgument("-b",
                         position=8,
                         shell_quote=False,
                         doc="Output in the BAM format."),
            ToolArgument(
                StringFormatter(
                    "@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{pl}",
                    name=InputSelector("sampleName"),
                    pl=InputSelector("platformTechnology"),
                ),
                prefix="-R",
                position=2,
                doc=
                "Complete read group header line. ’\\t’ can be used in STR and will be converted to a TAB"
                "in the output SAM. The read group ID will be attached to every read in the output. "
                "An example is ’@RG\\tID:foo\\tSM:bar’. (Default=null) "
                "https://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups",
            ),
            ToolArgument(
                CpuSelector(),
                prefix="-t",
                position=2,
                shell_quote=False,
                doc="Number of threads. (default = 1)",
            ),
        ]

    def inputs(self) -> List[ToolInput]:
        return [
            ToolInput("reference",
                      FastaWithDict(),
                      position=2,
                      shell_quote=False),
            ToolInput("reads",
                      FastqGzPair,
                      position=3,
                      shell_quote=False,
                      doc=None),
            ToolInput(
                "mates",
                FastqGzPair(optional=True),
                separator=" ",
                position=4,
                shell_quote=False,
                doc=None,
            ),
            ToolInput(
                "outputFilename",
                Filename(prefix=InputSelector("sampleName"), extension=".bam"),
                position=8,
                shell_quote=False,
                prefix="-o",
                doc="output file name [stdout]",
            ),
            # Eventually it would be cool to have like a cascading:
            #   - If readGroupHeaderLine provided, use that,
            #   - If sampleName provided, construct based on that
            #   - Else don't include
            # but this is probably a bit hard to do, and for all our purposes we require a readGroupHeaderLine,
            # so we're always going to construct it:
            ToolInput(
                "sampleName",
                String(),
                doc="Used to construct the readGroupHeaderLine with format: "
                "'@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:ILLUMINA'",
            ),
            ToolInput(
                "platformTechnology",
                String(optional=True),
                doc=
                "(ReadGroup: PL) Used to construct the readGroupHeaderLine, defaults: ILLUMINA",
                default="ILLUMINA",
            ),
            *self.bwa_additional_inputs,
            *self.samtools_additional_args,
        ]

    def outputs(self) -> List[ToolOutput]:
        return [
            ToolOutput("out", Bam(), glob=InputSelector("outputFilename")),
            # ToolOutput("skippedReads", File(optional=True), glob=InputSelector("skippedReadsOutputFilename"))
        ]

    def memory(self, hints: Dict[str, Any]):
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, BWA_MEM_TUPLE)
        if val:
            return val
        return 16

    def cpus(self, hints: Dict[str, Any]):
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, BWA_CORES_TUPLE)
        if val:
            return val
        return 16

    def friendly_name(self) -> str:
        return "Bwa mem + Samtools View"

    bwa_additional_inputs = [
        ToolInput(
            "minimumSeedLength",
            Int(optional=True),
            prefix="-k",
            position=2,
            shell_quote=False,
            doc=
            "Matches shorter than INT will be missed. The alignment speed is usually "
            "insensitive to this value unless it significantly deviates 20. (Default: 19)",
        ),
        ToolInput(
            "bandwidth",
            Int(optional=True),
            prefix="-w",
            position=2,
            shell_quote=False,
            doc=
            "Essentially, gaps longer than ${bandWidth} will not be found. Note that the maximum gap length "
            "is also affected by the scoring matrix and the hit length, not solely determined by this option."
            " (Default: 100)",
        ),
        ToolInput(
            "offDiagonalXDropoff",
            Int(optional=True),
            prefix="-d",
            position=2,
            shell_quote=False,
            doc=
            "(Z-dropoff): Stop extension when the difference between the best and the current extension "
            "score is above |i-j|*A+INT, where i and j are the current positions of the query and reference, "
            "respectively, and A is the matching score. Z-dropoff is similar to BLAST’s X-dropoff except "
            "that it doesn’t penalize gaps in one of the sequences in the alignment. Z-dropoff not only "
            "avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. "
            "(Default: 100)",
        ),
        ToolInput(
            "reseedTrigger",
            Float(optional=True),
            prefix="-r",
            position=2,
            shell_quote=False,
            doc=
            "Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter "
            "for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment "
            "speed but lower accuracy. (Default: 1.5)",
        ),
        ToolInput(
            "occurenceDiscard",
            Int(optional=True),
            prefix="-c",
            position=2,
            shell_quote=False,
            doc="Discard a MEM if it has more than INT occurence in the genome. "
            "This is an insensitive parameter. (Default: 10000)",
        ),
        ToolInput(
            "performSW",
            Boolean(optional=True),
            prefix="-P",
            position=2,
            shell_quote=False,
            doc=
            "In the paired-end mode, perform SW to rescue missing hits only but "
            "do not try to find hits that fit a proper pair.",
        ),
        ToolInput(
            "matchingScore",
            Int(optional=True),
            prefix="-A",
            position=2,
            shell_quote=False,
            doc="Matching score. (Default: 1)",
        ),
        ToolInput(
            "mismatchPenalty",
            Int(optional=True),
            prefix="-B",
            position=2,
            shell_quote=False,
            doc=
            "Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. "
            "(Default: 4)",
        ),
        ToolInput(
            "openGapPenalty",
            Int(optional=True),
            prefix="-O",
            position=2,
            shell_quote=False,
            doc="Gap open penalty. (Default: 6)",
        ),
        ToolInput(
            "gapExtensionPenalty",
            Int(optional=True),
            prefix="-E",
            position=2,
            shell_quote=False,
            doc="Gap extension penalty. A gap of length k costs O + k*E "
            "(i.e. -O is for opening a zero-length gap). (Default: 1)",
        ),
        ToolInput(
            "clippingPenalty",
            Int(optional=True),
            prefix="-L",
            position=2,
            shell_quote=False,
            doc=
            "Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score "
            "reaching the end of query. If this score is larger than the best SW score minus the "
            "clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag "
            "reports the best SW score; clipping penalty is not deducted. (Default: 5)",
        ),
        ToolInput(
            "unpairedReadPenalty",
            Int(optional=True),
            prefix="-U",
            position=2,
            shell_quote=False,
            doc=
            "Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as "
            "scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. "
            "It compares these two scores to determine whether we should force pairing. (Default: 9)",
        ),
        ToolInput(
            "assumeInterleavedFirstInput",
            Boolean(optional=True),
            prefix="-p",
            position=2,
            shell_quote=False,
            doc=
            "Assume the first input query file is interleaved paired-end FASTA/Q. ",
        ),
        ToolInput(
            "outputAlignmentThreshold",
            Int(optional=True),
            prefix="-T",
            position=2,
            shell_quote=False,
            doc=
            "Don’t output alignment with score lower than INT. Only affects output. (Default: 30)",
        ),
        ToolInput(
            "outputAllElements",
            Boolean(optional=True),
            prefix="-a",
            position=2,
            shell_quote=False,
            doc=
            "Output all found alignments for single-end or unpaired paired-end reads. "
            "These alignments will be flagged as secondary alignments.",
        ),
        ToolInput(
            "appendComments",
            Boolean(optional=True),
            prefix="-C",
            position=2,
            shell_quote=False,
            doc=
            "Append append FASTA/Q comment to SAM output. This option can be used to transfer "
            "read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment "
            "(the string after a space in the header line) must conform the SAM spec (e.g. BC:Z:CGTAC). "
            "Malformated comments lead to incorrect SAM output.",
        ),
        ToolInput(
            "hardClipping",
            Boolean(optional=True),
            prefix="-H",
            position=2,
            shell_quote=False,
            doc=
            "Use hard clipping ’H’ in the SAM output. This option may dramatically reduce "
            "the redundancy of output when mapping long contig or BAC sequences.",
        ),
        ToolInput(
            "markShorterSplits",
            Boolean(optional=True),
            prefix="-M",
            position=2,
            shell_quote=False,
            doc=
            "Mark shorter split hits as secondary (for Picard compatibility).",
        ),
        ToolInput(
            "verboseLevel",
            Int(optional=True),
            prefix="-v",
            position=2,
            shell_quote=False,
            doc="Control the verbose level of the output. "
            "This option has not been fully supported throughout BWA. Ideally, a value: "
            "0 for disabling all the output to stderr; "
            "1 for outputting errors only; "
            "2 for warnings and errors; "
            "3 for all normal messages; "
            "4 or higher for debugging. When this option takes value 4, the output is not SAM. (Default: 3)",
        ),
    ]

    samtools_additional_args = [
        ToolInput(
            "skippedReadsOutputFilename",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-U",
            doc="output reads not selected by filters to FILE [null]",
        ),
        ToolInput(
            "referenceIndex",
            File(optional=True),
            position=8,
            shell_quote=False,
            prefix="-t",
            doc=
            "FILE listing reference names and lengths (see long help) [null]",
        ),
        ToolInput(
            "intervals",
            Bed(optional=True),
            position=8,
            shell_quote=False,
            prefix="-L",
            doc="only include reads overlapping this BED FILE [null]",
        ),
        ToolInput(
            "includeReadsInReadGroup",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-r",
            doc="only include reads in read group STR [null]",
        ),
        ToolInput(
            "includeReadsInFile",
            File(optional=True),
            position=8,
            shell_quote=False,
            prefix="-R",
            doc="only include reads with read group listed in FILE [null]",
        ),
        ToolInput(
            "includeReadsWithQuality",
            Int(optional=True),
            position=8,
            shell_quote=False,
            prefix="-q",
            doc="only include reads with mapping quality >= INT [0]",
        ),
        ToolInput(
            "includeReadsInLibrary",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-l",
            doc="only include reads in library STR [null]",
        ),
        ToolInput(
            "includeReadsWithCIGAROps",
            Int(optional=True),
            position=8,
            shell_quote=False,
            prefix="-m",
            doc=
            "only include reads with number of CIGAR operations consuming query sequence >= INT [0]",
        ),
        ToolInput(
            "includeReadsWithAllFLAGs",
            Array(Int(), optional=True),
            position=8,
            shell_quote=False,
            prefix="-f",
            separator=" ",
            doc="only include reads with all of the FLAGs in INT present [0]",
        ),
        ToolInput(
            "includeReadsWithoutFLAGs",
            Array(Int(), optional=True),
            position=8,
            shell_quote=False,
            prefix="-F",
            separator=" ",
            doc="only include reads with none of the FLAGS in INT present [0]",
        ),
        ToolInput(
            "excludeReadsWithAllFLAGs",
            Array(Int(), optional=True),
            position=8,
            shell_quote=False,
            prefix="-G",
            separator=" ",
            doc="only EXCLUDE reads with all of the FLAGs in INT present [0] "
            "fraction of templates/read pairs to keep; INT part sets seed)",
        ),
        ToolInput(
            "useMultiRegionIterator",
            Boolean(optional=True),
            position=8,
            shell_quote=False,
            prefix="-M",
            doc="use the multi-region iterator (increases the speed, removes "
            "duplicates and outputs the reads as they are ordered in the file)",
        ),
        ToolInput(
            "readTagToStrip",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="-x",
            doc="read tag to strip (repeatable) [null]",
        ),
        ToolInput(
            "collapseBackwardCIGAROps",
            Boolean(optional=True),
            position=8,
            shell_quote=False,
            prefix="-B",
            doc="collapse the backward CIGAR operation",
        ),
        ToolInput(
            "outputFmt",
            String(optional=True),
            position=8,
            shell_quote=False,
            prefix="--output-fmt",
            doc=
            "(OPT[, -O)  Specify output format (SAM, BAM, CRAM) Specify a single "
            "output file format option in the form of OPTION or OPTION=VALUE",
        ),
    ]
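
Putting the positional ToolArguments above back together (bwa/mem at positions 0-1, the pipe at position 5, samtools/view at 6-7, with the read group, threads, reference, reads and view flags slotted in around them), the tool is effectively one shell pipeline. A hedged sketch of the command line it aims to produce, with placeholder file names:

# Placeholder values for illustration only.
reference = "reference.fasta"
reads_1, reads_2 = "sample_R1.fastq.gz", "sample_R2.fastq.gz"
sample_name = "sample"
threads = 8

read_group = rf"@RG\tID:{sample_name}\tSM:{sample_name}\tLB:{sample_name}\tPL:ILLUMINA"

# Positions 0-4: bwa mem -R <RG> -t N <ref> <reads>; position 5: '|';
# positions 6-8: samtools view -T <ref> --threads N -h -b -o <out>.
command = (
    f"bwa mem -R '{read_group}' -t {threads} {reference} {reads_1} {reads_2} "
    f"| samtools view -T {reference} --threads {threads} -h -b -o {sample_name}.bam"
)
print(command)
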
    def constructor(self):

        self.input("bam", BamBai)
        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("gnomad", VcfTabix())
        self.input("panel_of_normals", VcfTabix())
        self.input("gatk_bam_str", String(optional=True))

        # variant calling + learn read orientation model
        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_1_2(
                tumorBams=self.bam,
                intervals=self.intervals,
                reference=self.reference,
                panelOfNormals=self.panel_of_normals,
                germlineResource=self.gnomad,
                outputBamName=self.gatk_bam_str,
            ),
        )
        self.step(
            "learnorientationmodel",
            gatk4.Gatk4LearnReadOrientationModel_4_1_2(
                f1r2CountsFiles=self.mutect2.f1f2r_out),
        )

        # calculate contamination and segmentation
        self.step(
            "getpileupsummaries",
            gatk4.Gatk4GetPileUpSummaries_4_1_2(bam=self.bam,
                                                sites=self.gnomad,
                                                intervals=self.intervals),
        )
        self.step(
            "calculatecontamination",
            gatk4.Gatk4CalculateContamination_4_1_2(
                pileupTable=self.getpileupsummaries.out),
        )

        self.step(
            "filtermutect2calls",
            gatk4.Gatk4FilterMutectCalls_4_1_2(
                vcf=self.mutect2.out,
                reference=self.reference,
                segmentationFile=self.calculatecontamination.segOut,
                contaminationTable=self.calculatecontamination.contOut,
                readOrientationModel=self.learnorientationmodel.out,
                statsFile=self.mutect2.stats,
            ),
        )

        # normalise vcf
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedTabixVcf=self.filtermutect2calls.out,
                reference=self.reference,
            ),
        )

        self.output("variants", source=self.mutect2.out)
        self.output("out_bam", source=self.mutect2.bam)
        self.output("out", source=self.splitnormalisevcf.out)