예제 #1
0
 def inputs(self):
     """Tool inputs: the inherited (cache-option) inputs plus VEP database-connection options."""
     db_connection_inputs = [
         ToolInput(
             "database",
             Boolean(optional=True),
             prefix="--database",
             doc="Enable VEP to use local or remote databases.",
         ),
         ToolInput(
             "host",
             String(optional=True),
             prefix="--host",
             doc="Manually define the database host to connect to. Users in the US may find connection and transfer "
             'speeds quicker using our East coast mirror, useastdb.ensembl.org. Default = "ensembldb.ensembl.org"',
         ),
         ToolInput(
             "user",
             String(optional=True),
             prefix="--user",
             doc='(-u) Manually define the database username. Default = "anonymous"',
         ),
         ToolInput(
             "password",
             String(optional=True),
             prefix="--password",
             doc="(--pass) Manually define the database password. Not used by default",
         ),
         ToolInput(
             "port",
             Int(optional=True),
             prefix="--port",
             doc="Manually define the database port. Default = 5306",
         ),
         ToolInput(
             "genomes",
             Boolean(optional=True),
             prefix="--genomes",
             doc="Override the default connection settings with those for the Ensembl Genomes public MySQL server. "
             "Required when using any of the Ensembl Genomes species. Not used by default",
         ),
         ToolInput(
             "isMultispecies",
             Boolean(optional=True),
             prefix="--is_multispecies",
             doc="Some of the Ensembl Genomes databases (mainly bacteria and protists) are composed of a collection "
             "of close species. It updates the database connection settings (i.e. the database name) "
             "if the value is set to 1. Default: 0",
         ),
         ToolInput(
             "lrg",
             Boolean(optional=True),
             prefix="--lrg",
             doc="Map input variants to LRG coordinates (or to chromosome coordinates if given in LRG coordinates), "
             "and provide consequences on both LRG and chromosomal transcripts. Not used by default",
         ),
         ToolInput(
             "dbVersion",
             String(optional=True),
             prefix="--db_version",
             doc="Force VEP to connect to a specific version of the Ensembl databases. Not recommended as there "
             "may be conflicts between software and database versions. Not used by default",
         ),
         ToolInput(
             "registry",
             Filename(),
             prefix="--registry",
             doc="Defining a registry file overwrites other connection settings and uses those found in the "
             "specified registry file to connect. Not used by default",
         ),
     ]
     # Parent inputs (cache options) first, then the database options above.
     return [*super().inputs(), *db_connection_inputs]
예제 #2
0
class Gatk4HaplotypeCallerBase(Gatk4ToolBase, ABC):
    """Abstract base wrapper for GATK4 HaplotypeCaller.

    Declares the command, inputs, outputs, resources, metadata and test cases
    for the tool; concrete (versioned/containerized) subclasses are expected
    to be defined elsewhere — TODO confirm against the rest of the package.
    """

    @classmethod
    def gatk_command(cls):
        """GATK subcommand name appended to the base gatk invocation."""
        return "HaplotypeCaller"

    def tool(self):
        """Unique tool identifier used by the workflow framework."""
        return "Gatk4HaplotypeCaller"

    def friendly_name(self):
        """Human-readable display name."""
        return "GATK4: Haplotype Caller"

    def cpus(self, hints: Dict[str, Any]):
        """CPU count resolved from CORES_TUPLE via hints; defaults to 1 when unresolved."""
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE)
        if val:
            return val
        return 1

    def memory(self, hints: Dict[str, Any]):
        """Memory resolved from MEM_TUPLE via hints; defaults to 8 (GB, presumably — confirm framework units)."""
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 8

    def inputs(self):
        """Inherited GATK4 inputs + optional_args + the core HaplotypeCaller inputs."""
        return [
            *super(Gatk4HaplotypeCallerBase, self).inputs(),
            *Gatk4HaplotypeCallerBase.optional_args,
            ToolInput(
                "inputRead",
                BamBai(),
                doc="BAM/SAM/CRAM file containing reads",
                prefix="--input",
                # Index is expected as <basename>.bai (not <file>.bam.bai).
                secondaries_present_as={".bai": "^.bai"},
            ),
            ToolInput(
                "reference",
                FastaWithDict(),
                position=5,
                prefix="--reference",
                doc="Reference sequence file",
            ),
            ToolInput(
                "outputFilename",
                # Output name derived from the input read's basename + ".vcf.gz".
                Filename(
                    prefix=InputSelector("inputRead",
                                         remove_file_extension=True),
                    extension=".vcf.gz",
                ),
                position=8,
                prefix="--output",
                doc="File to which variants should be written",
            ),
            ToolInput(
                "dbsnp",
                VcfTabix(optional=True),
                position=7,
                prefix="--dbsnp",
                doc="(Also: -D) A dbSNP VCF file.",
            ),
            ToolInput(
                "intervals",
                Bed(optional=True),
                prefix="--intervals",
                doc=
                "-L (BASE) One or more genomic intervals over which to operate",
            ),
            ToolInput(
                "outputBamName",
                Filename(
                    prefix=InputSelector("inputRead",
                                         remove_file_extension=True),
                    extension=".bam",
                ),
                # NOTE(review): same position=8 as outputFilename above — confirm
                # the tie in ordering is intentional.
                position=8,
                prefix="-bamout",
                doc="File to which assembled haplotypes should be written",
            ),
        ]

    def outputs(self):
        """Outputs: the compressed+indexed VCF ('out') and the assembled-haplotypes BAM ('bam')."""
        return [
            ToolOutput(
                "out",
                VcfTabix,
                glob=InputSelector("outputFilename"),
                doc="A raw, unfiltered, highly sensitive callset in VCF format. "
                "File to which variants should be written",
            ),
            ToolOutput(
                "bam",
                BamBai,
                glob=InputSelector("outputBamName"),
                doc="File to which assembled haplotypes should be written",
                secondaries_present_as={".bai": "^.bai"},
            ),
        ]

    def bind_metadata(self):
        """Tool metadata: contributors, dates, citation and long-form documentation."""
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2018, 12, 24),
            dateUpdated=date(2019, 1, 24),
            institution="Broad Institute",
            doi=None,
            citation=
            "See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information",
            keywords=["gatk", "gatk4", "broad", "haplotype"],
            documentationUrl=
            "https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_haplotypecaller_HaplotypeCaller.php#",
            documentation=
            """Call germline SNPs and indels via local re-assembly of haplotypes
    
The HaplotypeCaller is capable of calling SNPs and indels simultaneously via local de-novo assembly of haplotypes 
in an active region. In other words, whenever the program encounters a region showing signs of variation, it 
discards the existing mapping information and completely reassembles the reads in that region. This allows the 
HaplotypeCaller to be more accurate when calling regions that are traditionally difficult to call, for example when 
they contain different types of variants close to each other. It also makes the HaplotypeCaller much better at 
calling indels than position-based callers like UnifiedGenotyper.

In the GVCF workflow used for scalable variant calling in DNA sequence data, HaplotypeCaller runs per-sample to 
generate an intermediate GVCF (not to be used in final analysis), which can then be used in GenotypeGVCFs for joint 
genotyping of multiple samples in a very efficient way. The GVCF workflow enables rapid incremental processing of 
samples as they roll off the sequencer, as well as scaling to very large cohort sizes (e.g. the 92K exomes of ExAC).

In addition, HaplotypeCaller is able to handle non-diploid organisms as well as pooled experiment data. 
Note however that the algorithms used to calculate variant likelihoods is not well suited to extreme allele 
frequencies (relative to ploidy) so its use is not recommended for somatic (cancer) variant discovery. 
For that purpose, use Mutect2 instead.

Finally, HaplotypeCaller is also able to correctly handle the splice junctions that make RNAseq a challenge 
for most variant callers, on the condition that the input read data has previously been processed according 
to our recommendations as documented (https://software.broadinstitute.org/gatk/documentation/article?id=4067).
""".strip(),
        )

    # Optional HaplotypeCaller command-line arguments, spliced into inputs() above.
    # Commented-out entries below are deliberately disabled — do not delete without review.
    optional_args = [
        ToolInput(
            "pairHmmImplementation",
            String(optional=True),
            prefix="--pair-hmm-implementation",
            doc=
            "The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. The --pair-hmm-implementation argument is an enumerated type (Implementation), which can have one of the following values: EXACT;ORIGINAL;LOGLESS_CACHING;AVX_LOGLESS_CACHING;AVX_LOGLESS_CACHING_OMP;EXPERIMENTAL_FPGA_LOGLESS_CACHING;FASTEST_AVAILABLE. Implementation:  FASTEST_AVAILABLE",
        ),
        ToolInput(
            "activityProfileOut",
            String(optional=True),
            prefix="--activity-profile-out",
            doc=
            "Output the raw activity profile results in IGV format (default: null)",
        ),
        ToolInput(
            "alleles",
            File(optional=True),
            prefix="--alleles",
            doc=
            "(default: null) The set of alleles at which to genotype when --genotyping_mode "
            "is GENOTYPE_GIVEN_ALLELES",
        ),
        ToolInput(
            "annotateWithNumDiscoveredAlleles",
            Boolean(optional=True),
            prefix="--annotate-with-num-discovered-alleles",
            doc=
            "If provided, we will annotate records with the number of alternate alleles that were "
            "discovered (but not necessarily genotyped) at a given site",
        ),
        ToolInput(
            "annotation",
            Array(String(), optional=True),
            prefix="--annotation",
            doc="-A: One or more specific annotations to add to variant calls",
        ),
        ToolInput(
            "annotationGroup",
            Array(String(), optional=True),
            prefix="--annotation-group",
            doc=
            "-G	One or more groups of annotations to apply to variant calls",
        ),
        ToolInput(
            "annotationsToExclude",
            Array(String(), optional=True),
            prefix="--annotations-to-exclude",
            doc=
            "-AX	One or more specific annotations to exclude from variant calls",
        ),
        ToolInput(
            "arguments_file",
            Array(File(), optional=True),
            prefix="--arguments_file",
            doc=
            "read one or more arguments files and add them to the command line",
        ),
        ToolInput(
            "assemblyRegionOut",
            String(optional=True),
            prefix="--assembly-region-out",
            doc=
            "(default: null) Output the assembly region to this IGV formatted file. Which annotations to "
            "exclude from output in the variant calls. Note that this argument has higher priority than "
            "the -A or -G arguments, so these annotations will be excluded even if they are explicitly "
            "included with the other options.",
        ),
        ToolInput(
            "baseQualityScoreThreshold",
            Int(optional=True),
            prefix="--base-quality-score-threshold",
            doc=
            "(default: 18) Base qualities below this threshold will be reduced to the minimum (6)",
        ),
        ToolInput(
            "cloudIndexPrefetchBuffer",
            Int(optional=True),
            prefix="--cloud-index-prefetch-buffer",
            doc=
            "-CIPB (default: -1) Size of the cloud-only prefetch buffer (in MB; 0 to disable). "
            "Defaults to cloudPrefetchBuffer if unset.",
        ),
        ToolInput(
            "cloudPrefetchBuffer",
            Int(optional=True),
            prefix="--cloud-prefetch-buffer",
            doc=
            "-CPB (default: 40) Size of the cloud-only prefetch buffer (in MB; 0 to disable).",
        ),
        ToolInput(
            "contaminationFractionToFilter",
            Double(optional=True),
            prefix="--contamination-fraction-to-filter",
            doc=
            "-contamination (default: 0.0) Fraction of contamination in sequencing data "
            "(for all samples) to aggressively remove",
        ),
        ToolInput(
            "correctOverlappingQuality",
            Boolean(optional=True),
            prefix="--correct-overlapping-quality",
            doc="Undocumented option",
        ),
        # ToolInput("dbsnp", VcfIdx(optional=True), prefix="--dbsnp", doc="-D (default: null) dbSNP file"),
        ToolInput(
            "disableBamIndexCaching",
            Boolean(optional=True),
            prefix="--disable-bam-index-caching",
            doc=
            "-DBIC. If true, don't cache bam indexes, this will reduce memory requirements but may harm "
            "performance if many intervals are specified. Caching is automatically disabled if "
            "there are no intervals specified.",
        ),
        # ToolInput("disableSequenceDictionaryValidation", Boolean(optional=True), prefix="--disable-sequence-dictionary-validation",
        #           doc="If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!"),
        ToolInput(
            "founderId",
            Array(String(), optional=True),
            prefix="--founder-id",
            doc='Samples representing the population "founders"',
        ),
        # ToolInput("gcsMaxRetries", Int(optional=True), prefix="--gcs-max-retries",
        #           doc="-gcs-retries (default: 20) If the GCS bucket channel errors out, "
        #               "how many times it will attempt to re-initiate the connection"),
        # ToolInput("gcsProjectForRequesterPays", String(), prefix="--gcs-project-for-requester-pays",
        #           doc="Project to bill when accessing \"requester pays\" buckets. If unset, these buckets cannot be accessed."),
        ToolInput(
            "genotypingMode",
            String(optional=True),
            prefix="--genotyping-mode",
            doc=
            "(default: DISCOVERY) Specifies how to determine the alternate alleles to use for genotyping. "
            "The --genotyping-mode argument is an enumerated type (GenotypingOutputMode), which can have one "
            "of the following values: DISCOVERY (The genotyper will choose the most likely alternate allele) "
            "or GENOTYPE_GIVEN_ALLELES (Only the alleles passed by the user should be considered).",
        ),
        # ToolInput("graphOutput", DataType(optional=True), prefix="--graph-output", doc="-graph	null	Write debug assembly graph information to this file"),
        ToolInput(
            "heterozygosity",
            Double(optional=True),
            prefix="--heterozygosity",
            doc=
            "(default: 0.001) Heterozygosity value used to compute prior likelihoods for any locus. The "
            "expected heterozygosity value used to compute prior probability that a locus is non-reference. "
            "The default priors are for provided for humans: het = 1e-3 which means that the probability "
            "of N samples being hom-ref at a site is: 1 - sum_i_2N (het / i) Note that heterozygosity as "
            "used here is the population genetics concept: "
            "http://en.wikipedia.org/wiki/Zygosity#Heterozygosity_in_population_genetics . "
            "That is, a hets value of 0.01 implies that two randomly chosen chromosomes from the population "
            "of organisms would differ from each other (one being A and the other B) at a rate of 1 in 100 bp. "
            "Note that this quantity has nothing to do with the likelihood of any given sample having a "
            "heterozygous genotype, which in the GATK is purely determined by the probability of the observed "
            "data P(D | AB) under the model that there may be a AB het genotype. The posterior probability "
            "of this AB genotype would use the het prior, but the GATK only uses this posterior probability "
            "in determining the prob. that a site is polymorphic. So changing the het parameters only "
            "increases the chance that a site will be called non-reference across all samples, but doesn't "
            "actually change the output genotype likelihoods at all, as these aren't posterior probabilities "
            "at all. The quantity that changes whether the GATK considers the possibility of a het genotype "
            "at all is the ploidy, which determines how many chromosomes each individual in the species carries.",
        ),
        ToolInput(
            "heterozygosityStdev",
            Double(optional=True),
            prefix="--heterozygosity-stdev",
            doc=
            "(default 0.01) Standard deviation of heterozygosity for SNP and indel calling.",
        ),
        ToolInput(
            "indelHeterozygosity",
            Double(optional=True),
            prefix="--indel-heterozygosity",
            doc=
            "(default: 1.25E-4) Heterozygosity for indel calling. This argument informs the prior "
            "probability of having an indel at a site. (See heterozygosity)",
        ),
        ToolInput(
            "intervalMergingRule",
            String(optional=True),
            prefix="--interval-merging-rule",
            doc=
            "-imr (default: ALL) Interval merging rule for abutting intervals. By default, the program "
            "merges abutting intervals (i.e. intervals that are directly side-by-side but do not actually "
            "overlap) into a single continuous interval. However you can change this behavior if you want "
            "them to be treated as separate intervals instead. The --interval-merging-rule argument is an "
            "enumerated type (IntervalMergingRule), which can have one of the following values:"
            "[ALL, OVERLAPPING]",
        ),
        ToolInput(
            "maxReadsPerAlignmentStart",
            Int(optional=True),
            prefix="--max-reads-per-alignment-start",
            doc=
            "(default: 50) Maximum number of reads to retain per alignment start position. "
            "Reads above this threshold will be downsampled. Set to 0 to disable.",
        ),
        ToolInput(
            "minBaseQualityScore",
            Int(optional=True),
            prefix="--min-base-quality-score",
            doc=
            "-mbq (default: 10) Minimum base quality required to consider a base for calling",
        ),
        ToolInput(
            "nativePairHmmThreads",
            Int(optional=True),
            prefix="--native-pair-hmm-threads",
            doc=
            "(default: 4) How many threads should a native pairHMM implementation use",
        ),
        ToolInput(
            "nativePairHmmUseDoublePrecision",
            Boolean(optional=True),
            prefix="--native-pair-hmm-use-double-precision",
            doc="use double precision in the native pairHmm. "
            "This is slower but matches the java implementation better",
        ),
        ToolInput(
            "numReferenceSamplesIfNoCall",
            Int(optional=True),
            prefix="--num-reference-samples-if-no-call",
            doc=
            "(default: 0) Number of hom-ref genotypes to infer at sites not present in a panel. When a "
            "variant is not seen in any panel, this argument controls whether to infer (and with what "
            "effective strength) that only reference alleles were observed at that site. "
            'E.g. "If not seen in 1000Genomes, treat it as AC=0, AN=2000".',
        ),
        ToolInput(
            "outputMode",
            String(optional=True),
            prefix="--output-mode",
            doc=
            "(default: EMIT_VARIANTS_ONLY) Specifies which type of calls we should output. The --output-mode "
            "argument is an enumerated type (OutputMode), which can have one of the following values: "
            "[EMIT_VARIANTS_ONLY (produces calls only at variant sites), "
            "EMIT_ALL_CONFIDENT_SITES (produces calls at variant sites and confident reference sites), "
            "EMIT_ALL_SITES (produces calls at any callable site regardless of confidence; "
            "this argument is intended only for point mutations (SNPs) in DISCOVERY mode or "
            "generally when running in GENOTYPE_GIVEN_ALLELES mode; it will by no means produce "
            "a comprehensive set of indels in DISCOVERY mode)]",
        ),
        ToolInput(
            "pedigree",
            File(optional=True),
            prefix="--pedigree",
            doc=
            '-ped (default: null) Pedigree file for determining the population "founders"',
        ),
        ToolInput(
            "populationCallset",
            File(optional=True),
            prefix="--population-callset",
            doc=
            "-population (default: null) Callset to use in calculating genotype priors",
        ),
        ToolInput(
            "sampleName",
            String(optional=True),
            prefix="--sample-name",
            doc=
            "-ALIAS (default: null) Name of single sample to use from a multi-sample bam. You can use this "
            "argument to specify that HC should process a single sample out of a multisample BAM file. "
            "This is especially useful if your samples are all in the same file but you need to run them "
            "individually through HC in -ERC GVC mode (which is the recommended usage). "
            "Note that the name is case-sensitive.",
        ),
        ToolInput(
            "samplePloidy",
            Int(optional=True),
            prefix="--sample-ploidy",
            doc=
            "-ploidy (default: 2) Ploidy (number of chromosomes) per sample. "
            "For pooled data, set to (Number of samples in each pool * Sample Ploidy). "
            "Sample ploidy - equivalent to number of chromosomes per pool. In pooled "
            "experiments this should be = # of samples in pool * individual sample ploidy",
        ),
        ToolInput(
            "sitesOnlyVcfOutput",
            Boolean(optional=True),
            prefix="--sites-only-vcf-output",
            doc=
            "(default: false) If true, don't emit genotype fields when writing vcf file output.",
        ),
        ToolInput(
            "standardMinConfidenceThresholdForCalling",
            Double(optional=True),
            prefix="--standard-min-confidence-threshold-for-calling",
            doc=
            "-stand-call-conf (default: 10.0) The minimum phred-scaled confidence "
            "threshold at which variants should be called",
        ),
        ToolInput(
            "useNewQualCalculator",
            Boolean(optional=True),
            prefix="--use-new-qual-calculator",
            doc=
            "-new-qual If provided, we will use the new AF model instead of the so-called exact model",
        ),
        ToolInput(
            "gvcfGqBands",
            Array(Int, optional=True),
            prefix="-GQB",
            # Repeat the -GQB prefix before every band value, not just the first.
            prefix_applies_to_all_elements=True,
            doc=
            "(--gvcf-gq-bands) Exclusive upper bounds for reference confidence GQ"
            " bands (must be in [1, 100] and specified in increasing order)",
        ),
        ToolInput(
            "emitRefConfidence",
            String(optional=True),
            prefix="--emit-ref-confidence",
            doc=
            "(-ERC) Mode for emitting reference confidence scores (For Mutect2, this is a BETA feature)",
        ),
        ToolInput(
            "dontUseSoftClippedBases",
            Boolean(optional=True),
            prefix="--dont-use-soft-clipped-bases",
            doc="Do not analyze soft clipped bases in the reads",
        ),
    ]

    def tests(self):
        """Regression test: call variants on a BRCA1 slice and check VCF + bamout fingerprints."""
        return [
            TTestCase(
                name="basic",
                input={
                    "inputRead":
                    os.path.join(
                        BioinformaticsTool.test_data_path(),
                        "wgsgermline_data",
                        "NA12878-BRCA1.split.bam",
                    ),
                    "reference":
                    os.path.join(
                        BioinformaticsTool.test_data_path(),
                        "wgsgermline_data",
                        "Homo_sapiens_assembly38.chr17.fasta",
                    ),
                    "intervals":
                    os.path.join(
                        BioinformaticsTool.test_data_path(),
                        "wgsgermline_data",
                        "BRCA1.hg38.bed",
                    ),
                    "dbsnp":
                    os.path.join(
                        BioinformaticsTool.test_data_path(),
                        "wgsgermline_data",
                        "Homo_sapiens_assembly38.dbsnp138.BRCA1.vcf.gz",
                    ),
                    "javaOptions": ["-Xmx6G"],
                    # Pinned so the run is deterministic across hardware.
                    "pairHmmImplementation":
                    "LOGLESS_CACHING",
                },
                output=VcfTabix.basic_test(
                    "out",
                    12800,
                    270,
                    214,
                    ["GATKCommandLine"],
                    "0224e24e5fc27286ee90c8d3c63373a7",
                ) + BamBai.basic_test(
                    "bam",
                    596698,
                    21272,
                    os.path.join(
                        BioinformaticsTool.test_data_path(),
                        "wgsgermline_data",
                        "NA12878-BRCA1.haplotyped.flagstat",
                    ),
                    "d83b4c0d8eab24a3be1cc6af4f827753",
                    "b4bb4028b8679a3a635e3ad87126a097",
                ),
            )
        ]
예제 #3
0
 def test_str_str(self):
     """A String output should be connectable to a String input."""
     source = String()
     sink = String()
     self.assertTrue(sink.can_receive_from(source))
예제 #4
0
class Gatk4SplitReadsBase(Gatk4ToolBase):
    """Janis tool definition for GATK4 SplitReads.

    SplitReads writes reads from a SAM/BAM/CRAM into separate files split
    by read group, sample and/or library name (see ``additional_args`` for
    the split flags).
    """

    def friendly_name(self) -> str:
        # Human-readable name used in generated documentation.
        return "GATK4: SplitReads"

    def tool(self) -> str:
        # Unique tool identifier within the Janis registry.
        return "Gatk4SplitReads"

    @classmethod
    def gatk_command(cls):
        # Subcommand passed to the `gatk` wrapper executable.
        return "SplitReads"

    def inputs(self):
        """Tool inputs: output directory, BAM to split, optional intervals,
        plus the shared optional engine arguments from ``additional_args``."""
        return [
            ToolInput(
                "outputFilename",
                String,
                prefix="--output",
                default=".",
                doc=
                "The directory to output SAM/BAM/CRAM files. Default value: '.' ",
            ),
            ToolInput(
                "bam",
                BamBai,
                prefix="--input",
                position=1,
                # NOTE(review): '^.bai' appears to map the index secondary to
                # the basename-swapped form (`x.bai` not `x.bam.bai`) — confirm
                # against the Janis secondaries documentation.
                secondaries_present_as={".bai": "^.bai"},
                doc=
                "(-I:String) BAM/SAM/CRAM file containing reads  This argument must be specified at least once.",
            ),
            ToolInput(
                tag="intervals",
                input_type=Bed(optional=True),
                prefix="--intervals",
                doc=
                "(-L:String) One or more genomic intervals over which to operate This argument may be specified 0 or more times. Default value: null. ",
            ),
            *Gatk4SplitReadsBase.additional_args,
        ]

    def memory(self, hints: Dict[str, Any]):
        """Resolve the memory allocation (presumably GB — TODO confirm
        against the base class convention) for this tool from *hints*.

        Falls back to 4 when the ordered resource tuple yields nothing.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        # Explicit None check: the original `if val:` would silently discard
        # a configured value of 0 and substitute the default.
        if val is not None:
            return val
        return 4

    def outputs(self):
        """Single output: the split BAM (with index) produced by SplitReads.

        SplitReads writes into the ``--output`` directory using the source
        file's name, so the output is globbed by the input BAM's basename.
        """
        return [
            ToolOutput(
                "out",
                BamBai,
                glob=InputSelector("bam", use_basename=True),
                doc="Bam",
                # Index secondary is exposed as `<name>.bai` (basename swap).
                secondaries_present_as={".bai": "^.bai"},
            )
        ]

    def metadata(self):
        """Static tool metadata: creation/update timestamps and usage text."""
        created = datetime.fromisoformat("2019-09-16T15:53:15.813130")
        updated = datetime.fromisoformat("2019-09-16T15:53:15.813131")
        usage = "USAGE: SplitReads [arguments]\nOutputs reads from a SAM/BAM/CRAM by read group, sample and library name\nVersion:4.1.3.0"
        return ToolMetadata(
            dateCreated=created,
            dateUpdated=updated,
            documentation=usage,
        )

    # Optional GATK engine and read-filter arguments shared by SplitReads.
    # Doc strings mirror the GATK 4.1.3.0 `SplitReads --help` output.
    additional_args = [
        ToolInput(
            tag="addOutputSamProgramRecord",
            input_type=Boolean(optional=True),
            prefix="-add-output-sam-program-record",
            doc=
            "(--add-output-sam-program-record)  If true, adds a PG tag to created SAM/BAM/CRAM files.  Default value: true. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="addOutputVcfCommandLine",
            input_type=Boolean(optional=True),
            prefix="-add-output-vcf-command-line",
            doc=
            "(--add-output-vcf-command-line)  If true, adds a command line header line to created VCF files.  Default value: true. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="arguments_file",
            input_type=File(optional=True),
            prefix="--arguments_file:File",
            doc=
            "read one or more arguments files and add them to the command line This argument may be specified 0 or more times. Default value: null. ",
        ),
        ToolInput(
            tag="cloudIndexPrefetchBuffer",
            input_type=String(optional=True),
            prefix="--cloud-index-prefetch-buffer",
            doc=
            "(-CIPB:Integer)  Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer if unset.  Default value: -1. ",
        ),
        ToolInput(
            tag="cloudPrefetchBuffer",
            input_type=String(optional=True),
            prefix="--cloud-prefetch-buffer",
            doc=
            "(-CPB:Integer)  Size of the cloud-only prefetch buffer (in MB; 0 to disable).  Default value: 40. ",
        ),
        ToolInput(
            tag="createOutputBamIndex",
            input_type=String(optional=True),
            prefix="--create-output-bam-index",
            doc=
            "(-OBI:Boolean)  If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM file.  Default value: true. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="createOutputBamMd5",
            input_type=String(optional=True),
            prefix="--create-output-bam-md5",
            doc=
            "(-OBM:Boolean)  If true, create a MD5 digest for any BAM/SAM/CRAM file created  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="createOutputVariantIndex",
            input_type=String(optional=True),
            prefix="--create-output-variant-index",
            doc=
            "(-OVI:Boolean)  If true, create a VCF index when writing a coordinate-sorted VCF file.  Default value: true. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="createOutputVariantMd5",
            input_type=String(optional=True),
            prefix="--create-output-variant-md5",
            doc=
            "(-OVM:Boolean)  If true, create a a MD5 digest any VCF file created.  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="disableBamIndexCaching",
            input_type=String(optional=True),
            prefix="--disable-bam-index-caching",
            doc=
            "(-DBIC:Boolean)  If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified.  Caching is automatically disabled if there are no intervals specified.  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="disableReadFilter",
            input_type=String(optional=True),
            prefix="--disable-read-filter",
            doc=
            "(-DF:String)  Read filters to be disabled before analysis  This argument may be specified 0 or more times. Default value: null. Possible Values: {WellformedReadFilter}",
        ),
        ToolInput(
            tag="disableSequenceDictionaryValidation",
            input_type=Boolean(optional=True),
            prefix="-disable-sequence-dictionary-validation",
            doc=
            "(--disable-sequence-dictionary-validation)  If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="excludeIntervals",
            input_type=String(optional=True),
            prefix="--exclude-intervals",
            doc=
            "(-XL:StringOne) This argument may be specified 0 or more times. Default value: null. ",
        ),
        ToolInput(
            tag="gatkConfigFile",
            input_type=File(optional=True),
            prefix="--gatk-config-file",
            doc=
            "A configuration file to use with the GATK. Default value: null.",
        ),
        ToolInput(
            tag="gcsRetries",
            input_type=Int(optional=True),
            prefix="-gcs-retries",
            doc=
            "(--gcs-max-retries)  If the GCS bucket channel errors out, how many times it will attempt to re-initiate the connection  Default value: 20. ",
        ),
        ToolInput(
            tag="gcsProjectForRequesterPays",
            input_type=String(optional=True),
            prefix="--gcs-project-for-requester-pays",
            doc=
            " Project to bill when accessing requester pays  buckets. If unset, these buckets cannot be accessed.  Default value: . ",
        ),
        ToolInput(
            tag="intervalExclusionPadding",
            input_type=Int(optional=True),
            prefix="--interval-exclusion-padding",
            doc=
            "(-ixp:Integer)  Amount of padding (in bp) to add to each interval you are excluding.  Default value: 0. ",
        ),
        ToolInput(
            tag="imr",
            input_type=String(optional=True),
            prefix="-imr:IntervalMergingRule",
            doc=
            "(--interval-merging-rule)  Interval merging rule for abutting intervals  Default value: ALL. Possible values: {ALL, OVERLAPPING_ONLY} ",
        ),
        ToolInput(
            tag="ip",
            input_type=Int(optional=True),
            prefix="-ip",
            doc="(--interval-padding) Default value: 0.",
        ),
        ToolInput(
            tag="isr",
            input_type=String(optional=True),
            prefix="-isr:IntervalSetRule",
            doc=
            "(--interval-set-rule)  Set merging approach to use for combining interval inputs  Default value: UNION. Possible values: {UNION, INTERSECTION} ",
        ),
        ToolInput(
            tag="le",
            input_type=Boolean(optional=True),
            prefix="--lenient",
            doc=
            "(-LE) Lenient processing of VCF files Default value: false. Possible values: {true, false}",
        ),
        ToolInput(
            tag="quiet",
            input_type=Boolean(optional=True),
            prefix="--QUIET",
            doc=
            "Whether to suppress job-summary info on System.err. Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="readFilter",
            input_type=String(optional=True),
            prefix="--read-filter",
            doc=
            "(-RF:String) Read filters to be applied before analysis This argument may be specified 0 or more times. Default value: null. Possible Values: {AlignmentAgreesWithHeaderReadFilter, AllowAllReadsReadFilter, AmbiguousBaseReadFilter, CigarContainsNoNOperator, FirstOfPairReadFilter, FragmentLengthReadFilter, GoodCigarReadFilter, HasReadGroupReadFilter, IntervalOverlapReadFilter, LibraryReadFilter, MappedReadFilter, MappingQualityAvailableReadFilter, MappingQualityNotZeroReadFilter, MappingQualityReadFilter, MatchingBasesAndQualsReadFilter, MateDifferentStrandReadFilter, MateOnSameContigOrNoMappedMateReadFilter, MateUnmappedAndUnmappedReadFilter, MetricsReadFilter, NonChimericOriginalAlignmentReadFilter, NonZeroFragmentLengthReadFilter, NonZeroReferenceLengthAlignmentReadFilter, NotDuplicateReadFilter, NotOpticalDuplicateReadFilter, NotSecondaryAlignmentReadFilter, NotSupplementaryAlignmentReadFilter, OverclippedReadFilter, PairedReadFilter, PassesVendorQualityCheckReadFilter, PlatformReadFilter, PlatformUnitReadFilter, PrimaryLineReadFilter, ProperlyPairedReadFilter, ReadGroupBlackListReadFilter, ReadGroupReadFilter, ReadLengthEqualsCigarLengthReadFilter, ReadLengthReadFilter, ReadNameReadFilter, ReadStrandFilter, SampleReadFilter, SecondOfPairReadFilter, SeqIsStoredReadFilter, SoftClippedReadFilter, ValidAlignmentEndReadFilter, ValidAlignmentStartReadFilter, WellformedReadFilter}",
        ),
        ToolInput(
            tag="readIndex",
            input_type=String(optional=True),
            prefix="-read-index",
            doc=
            "(--read-index)  Indices to use for the read inputs. If specified, an index must be provided for every read input and in the same order as the read inputs. If this argument is not specified, the path to the index for each input will be inferred automatically.  This argument may be specified 0 or more times. Default value: null. ",
        ),
        ToolInput(
            tag="readValidationStringency",
            input_type=String(optional=True),
            prefix="--read-validation-stringency",
            # Fixed paste corruption: the original doc read
            # "Default value: SITool returned: 0 LENT." — a scraper artifact
            # ("Tool returned: 0") spliced into the word "SILENT".
            doc=
            "(-VS:ValidationStringency)  Validation stringency for all SAM/BAM/CRAM/SRA files read by this program.  The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.  Default value: SILENT. Possible values: {STRICT, LENIENT, SILENT} ",
        ),
        ToolInput(
            tag="reference",
            input_type=FastaWithDict(optional=True),
            prefix="--reference",
            doc="(-R:String) Reference sequence Default value: null.",
        ),
        ToolInput(
            tag="secondsBetweenProgressUpdates",
            input_type=Double(optional=True),
            prefix="-seconds-between-progress-updates",
            doc=
            "(--seconds-between-progress-updates)  Output traversal statistics every time this many seconds elapse  Default value: 10.0. ",
        ),
        ToolInput(
            tag="sequenceDictionary",
            input_type=String(optional=True),
            prefix="-sequence-dictionary",
            doc=
            "(--sequence-dictionary)  Use the given sequence dictionary as the master/canonical sequence dictionary.  Must be a .dict file.  Default value: null. ",
        ),
        ToolInput(
            tag="sitesOnlyVcfOutput",
            input_type=Boolean(optional=True),
            prefix="--sites-only-vcf-output:Boolean",
            doc=
            " If true, don't emit genotype fields when writing vcf file output.  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="splitLibraryName",
            input_type=String(optional=True),
            prefix="--split-library-name",
            doc=
            "(-LB)  Split file by library.  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="rg",
            input_type=String(optional=True),
            prefix="--split-read-group",
            doc=
            "(-RG:BooleanSplit) Default value: false. Possible values: {true, false}",
        ),
        ToolInput(
            tag="splitSample",
            input_type=String(optional=True),
            prefix="--split-sample",
            doc=
            "(-SM:Boolean) Split file by sample. Default value: false. Possible values: {true, false}",
        ),
        ToolInput(
            tag="tmpDir",
            input_type=String(optional=True),
            prefix="--tmp-dir:GATKPathSpecifier",
            doc="Temp directory to use. Default value: null.",
        ),
        ToolInput(
            tag="jdkDeflater",
            input_type=Boolean(optional=True),
            prefix="-jdk-deflater",
            doc=
            "(--use-jdk-deflater)  Whether to use the JdkDeflater (as opposed to IntelDeflater)  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="jdkInflater",
            input_type=Boolean(optional=True),
            prefix="-jdk-inflater",
            doc=
            "(--use-jdk-inflater)  Whether to use the JdkInflater (as opposed to IntelInflater)  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="verbosity",
            input_type=String(optional=True),
            prefix="-verbosity:LogLevel",
            doc=
            "(--verbosity)  Control verbosity of logging.  Default value: INFO. Possible values: {ERROR, WARNING, INFO, DEBUG} ",
        ),
        ToolInput(
            tag="disableToolDefaultReadFilters",
            input_type=Boolean(optional=True),
            prefix="-disable-tool-default-read-filters",
            doc=
            "(--disable-tool-default-read-filters)  Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="ambigFilterBases",
            input_type=Int(optional=True),
            prefix="--ambig-filter-bases",
            doc=
            "Threshold number of ambiguous bases. If null, uses threshold fraction; otherwise, overrides threshold fraction.  Default value: null.  Cannot be used in conjuction with argument(s) maxAmbiguousBaseFraction",
        ),
        ToolInput(
            tag="ambigFilterFrac",
            input_type=Double(optional=True),
            prefix="--ambig-filter-frac",
            doc=
            "Threshold fraction of ambiguous bases Default value: 0.05. Cannot be used in conjuction with argument(s) maxAmbiguousBases",
        ),
        ToolInput(
            tag="maxFragmentLength",
            input_type=Int(optional=True),
            prefix="--max-fragment-length",
            doc="Default value: 1000000.",
        ),
        ToolInput(
            tag="minFragmentLength",
            input_type=Int(optional=True),
            prefix="--min-fragment-length",
            doc="Default value: 0.",
        ),
        ToolInput(
            tag="keepIntervals",
            input_type=String(optional=True),
            prefix="--keep-intervals",
            doc=
            'Valid only if "IntervalOverlapReadFilter" is specified: One or more genomic intervals to keep This argument must be specified at least once. Required. ',
        ),
        ToolInput(
            tag="library",
            input_type=String(optional=True),
            prefix="-library",
            doc=
            '(--library) Valid only if "LibraryReadFilter" is specified: Name of the library to keep This argument must be specified at least once. Required.',
        ),
        ToolInput(
            tag="maximumMappingQuality",
            input_type=Int(optional=True),
            prefix="--maximum-mapping-quality",
            doc=
            " Maximum mapping quality to keep (inclusive)  Default value: null. ",
        ),
        ToolInput(
            tag="minimumMappingQuality",
            input_type=Int(optional=True),
            prefix="--minimum-mapping-quality",
            doc=
            " Minimum mapping quality to keep (inclusive)  Default value: 10. ",
        ),
        ToolInput(
            tag="dontRequireSoftClipsBothEnds",
            input_type=Boolean(optional=True),
            prefix="--dont-require-soft-clips-both-ends",
            doc=
            " Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="filterTooShort",
            input_type=Int(optional=True),
            prefix="--filter-too-short",
            doc="Minimum number of aligned bases Default value: 30.",
        ),
        ToolInput(
            tag="platformFilterName",
            input_type=String(optional=True),
            prefix="--platform-filter-name:String",
            doc="This argument must be specified at least once. Required.",
        ),
        ToolInput(
            tag="blackListedLanes",
            input_type=String(optional=True),
            prefix="--black-listed-lanes:String",
            doc=
            "Platform unit (PU) to filter out This argument must be specified at least once. Required.",
        ),
        ToolInput(
            tag="readGroupBlackList",
            input_type=String(optional=True),
            prefix="--read-group-black-list:StringThe",
            doc="This argument must be specified at least once. Required. ",
        ),
        ToolInput(
            tag="keepReadGroup",
            input_type=String(optional=True),
            prefix="--keep-read-group:String",
            doc="The name of the read group to keep Required.",
        ),
        ToolInput(
            tag="maxReadLength",
            input_type=Int(optional=True),
            prefix="--max-read-length",
            doc=
            "Keep only reads with length at most equal to the specified value Required.",
        ),
        ToolInput(
            tag="minReadLength",
            input_type=Int(optional=True),
            prefix="--min-read-length",
            doc=
            "Keep only reads with length at least equal to the specified value Default value: 1.",
        ),
        ToolInput(
            tag="readName",
            input_type=String(optional=True),
            prefix="--read-name:String",
            doc="Keep only reads with this read name Required.",
        ),
        ToolInput(
            tag="keepReverseStrandOnly",
            input_type=Boolean(optional=True),
            prefix="--keep-reverse-strand-only",
            doc=
            " Keep only reads on the reverse strand  Required. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="sample",
            input_type=String(optional=True),
            prefix="-sample:String",
            doc=
            "(--sample) The name of the sample(s) to keep, filtering out all others This argument must be specified at least once. Required. ",
        ),
        ToolInput(
            tag="invertSoftClipRatioFilter",
            input_type=Boolean(optional=True),
            prefix="--invert-soft-clip-ratio-filter",
            doc=
            " Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.  Default value: false. Possible values: {true, false} ",
        ),
        ToolInput(
            tag="softClippedLeadingTrailingRatio",
            input_type=Double(optional=True),
            prefix="--soft-clipped-leading-trailing-ratio",
            doc=
            " Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.  Default value: null.  Cannot be used in conjuction with argument(s) minimumSoftClippedRatio",
        ),
        ToolInput(
            tag="softClippedRatioThreshold",
            input_type=Double(optional=True),
            prefix="--soft-clipped-ratio-threshold",
            doc=
            " Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.  Default value: null.  Cannot be used in conjuction with argument(s) minimumLeadingTrailingSoftClippedRatio",
        ),
    ]
 def inputs(self):
     """Inputs for the Strelka somatic workflow configure + run step.

     Position-1 inputs are passed to configureStrelkaSomaticWorkflow.py;
     position-3 inputs are the runWorkflow.py scheduler options.
     """
     return [
         # ToolInput(tag="version", input_type=Boolean(), prefix="--version", separate_value_from_prefix=True,
         #           doc="show program's version number and exit"),
         # ToolInput(tag="help", input_type=Boolean(), prefix="--help", separate_value_from_prefix=True,
         #           doc="(-h) show this help message and exit"),
         # ToolInput(tag="allhelp", input_type=Boolean(), prefix="--allHelp", separate_value_from_prefix=True,
         #           doc="show all extended/hidden options"),
         ToolInput(
             tag="normalBam",
             input_type=BamBai(),
             prefix="--normalBam=",
             separate_value_from_prefix=False,
             position=1,
             doc="Normal sample BAM or CRAM file. (no default)",
         ),
         ToolInput(
             tag="tumorBam",
             input_type=BamBai(),
             # NOTE(review): British spelling here vs "--tumorBam" in the doc
             # string — confirm the pinned Strelka version accepts this alias.
             prefix="--tumourBam=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "(--tumorBam)  Tumor sample BAM or CRAM file. [required] (no default)",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaFai(),
             prefix="--referenceFasta=",
             position=1,
             separate_value_from_prefix=False,
             doc=" samtools-indexed reference fasta file [required]",
         ),
         ToolInput(
             tag="rundir",
             input_type=Filename(),
             prefix="--runDir=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Name of directory to be created where all workflow scripts and output will be written. "
             "Each analysis requires a separate directory. (default: StrelkaSomaticWorkflow)",
         ),
         ToolInput(
             tag="region",
             input_type=Array(String, optional=True),
             prefix="--region",
             prefix_applies_to_all_elements=True,
             position=1,
             doc=
             "Limit the analysis to one or more genome region(s) for debugging purposes. If this argument "
             "is provided multiple times the union of all specified regions will be analyzed. All regions "
             "must be non-overlapping to get a meaningful result. Examples: '--region chr20' "
             "(whole chromosome), '--region chr2:100-2000 --region chr3:2500-3000' (two regions)'. "
             "If this option is specified (one or more times) together with the 'callRegions' BED file,"
             "then all region arguments will be intersected with the callRegions BED track.",
         ),
         ToolInput(
             tag="config",
             input_type=File(optional=True),
             prefix="--config=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "provide a configuration file to override defaults in global config file "
             "(/opt/strelka/bin/configureStrelkaSomaticWorkflow.py.ini)",
         ),
         ToolInput(
             tag="outputcallableregions",
             input_type=Boolean(optional=True),
             prefix="--outputCallableRegions",
             position=1,
             separate_value_from_prefix=True,
             doc=
             "Output a bed file describing somatic callable regions of the genome",
         ),
         ToolInput(
             tag="indelCandidates",
             input_type=Array(VcfTabix, optional=True),
             prefix="--indelCandidates=",
             prefix_applies_to_all_elements=True,
             position=1,
             separate_value_from_prefix=False,
             doc=
             "Specify a VCF of candidate indel alleles. These alleles are always evaluated "
             "but only reported in the output when they are inferred to exist in the sample. "
             "The VCF must be tabix indexed. All indel alleles must be left-shifted/normalized, "
             "any unnormalized alleles will be ignored. This option may be specified more than once, "
             "multiple input VCFs will be merged. (default: None)",
         ),
         ToolInput(
             tag="forcedgt",
             input_type=Array(VcfTabix, optional=True),
             prefix="--forcedGT=",
             separate_value_from_prefix=False,
             prefix_applies_to_all_elements=True,
             position=1,
             doc=
             "Specify a VCF of candidate alleles. These alleles are always evaluated and reported even "
             "if they are unlikely to exist in the sample. The VCF must be tabix indexed. All indel "
             "alleles must be left- shifted/normalized, any unnormalized allele will trigger a runtime "
             "error. This option may be specified more than once, multiple input VCFs will be merged. "
             "Note that for any SNVs provided in the VCF, the SNV site will be reported (and for gVCF, "
             "excluded from block compression), but the specific SNV alleles are ignored. (default: None)",
         ),
         ToolInput(
             tag="targeted",
             input_type=Boolean(optional=True),
             prefix="--targeted",
             separate_value_from_prefix=True,
             position=1,
             doc="Set options for other targeted input: "
             "note in particular that this flag turns off high-depth filters",
         ),
         ToolInput(
             tag="exome",
             input_type=Boolean(optional=True),
             prefix="--exome",
             separate_value_from_prefix=True,
             position=1,
             doc=
             "Set options for exome: note in particular that this flag turns off high-depth filters",
         ),
         ToolInput(
             tag="callRegions",
             input_type=BedTabix(optional=True),
             prefix="--callRegions=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Optionally provide a bgzip-compressed/tabix-indexed BED file containing the set of "
             "regions to call. No VCF output will be provided outside of these regions. "
             "The full genome will still be used to estimate statistics from the input "
             "(such as expected depth per chromosome). Only one BED file may be specified. "
             "(default: call the entire genome)",
         ),
         ToolInput(
             tag="noisevcf",
             input_type=VcfTabix(optional=True),
             prefix="--noiseVcf=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Noise vcf file (submit argument multiple times for more than one file)",
         ),
         ToolInput(
             tag="scansizemb",
             input_type=Int(optional=True),
             prefix="--scanSizeMb=",
             separate_value_from_prefix=False,
             position=1,
             doc=
             "Maximum sequence region size (in megabases) scanned by each "
             "task during genome variant calling. (default: 12)",
         ),
         ToolInput(
             tag="callmemmb",
             input_type=Int(optional=True),
             prefix="--callMemMb=",
             position=1,
             separate_value_from_prefix=False,
             doc=
             "Set variant calling task memory limit (in megabytes). It is not recommended to change the "
             "default in most cases, but this might be required for a sample of unusual depth.",
         ),
         ToolInput(
             tag="retaintempfiles",
             input_type=Boolean(optional=True),
             default=False,
             position=1,
             prefix="--retainTempFiles",
             separate_value_from_prefix=True,
             doc="Keep all temporary files (for workflow debugging)",
         ),
         ToolInput(
             tag="disableevs",
             input_type=Boolean(optional=True),
             prefix="--disableEVS",
             position=1,
             separate_value_from_prefix=True,
             doc="Disable empirical variant scoring (EVS).",
         ),
         ToolInput(
             tag="reportevsfeatures",
             input_type=Boolean(optional=True),
             prefix="--reportEVSFeatures",
             position=1,
             separate_value_from_prefix=True,
             doc=
             " Report all empirical variant scoring features in VCF output.",
         ),
         ToolInput(
             tag="snvscoringmodelfile",
             input_type=File(optional=True),
             prefix="--snvScoringModelFile=",
             position=1,
             separate_value_from_prefix=False,
             doc=" Provide a custom empirical scoring model file for SNVs "
             "(default: /opt/strelka/share/config/somaticSNVScoringM odels.json)",
         ),
         ToolInput(
             tag="indelscoringmodelfile",
             input_type=File(optional=True),
             prefix="--indelScoringModelFile=",
             position=1,
             separate_value_from_prefix=False,
             doc=" Provide a custom empirical scoring model file for indels "
             "(default: /opt/strelka/share/config/somaticInde lScoringModels.json)",
         ),
         # Scheduler/runtime options below are forwarded to runWorkflow.py.
         ToolInput(
             "mode",
             String(optional=True),
             default="local",
             prefix="--mode",
             position=3,
             shell_quote=False,
             doc="(-m MODE)  select run mode (local|sge)",
         ),
         ToolInput(
             "queue",
             String(optional=True),
             prefix="--queue",
             position=3,
             shell_quote=False,
             doc="(-q QUEUE) specify scheduler queue name",
         ),
         ToolInput(
             "memGb",
             String(optional=True),
             prefix="--memGb",
             position=3,
             shell_quote=False,
             doc=" (-g MEMGB) gigabytes of memory available to run workflow "
             "-- only meaningful in local mode, must be an integer (default: Estimate the total "
             "memory for this node for local mode, 'unlimited' for sge mode)",
         ),
         ToolInput(
             "quiet",
             Boolean(optional=True),
             prefix="--quiet",
             position=3,
             shell_quote=False,
             doc="Don't write any log output to stderr "
             "(but still write to workspace/pyflow.data/logs/pyflow_log.txt)",
         ),
         # ToolInput("mailTo", String(optional=True), prefix="--mailTo", position=3, shell_quote=False,
         #           doc="(-e) send email notification of job completion status to this address "
         #               "(may be provided multiple times for more than one email address)"),
     ]
예제 #6
0
class FeatureCountsBase(SubreadToolBase, ABC):
    """
    Janis base definition for Subread's ``featureCounts``.

    featureCounts assigns mapped reads (SAM/BAM) to genomic features described
    by an annotation file (GTF/GFF/SAF by default) and writes a tab-delimited
    count table plus a ``<output>.summary`` statistics file. Concrete
    subclasses provide container/version details via ``SubreadToolBase``.
    """

    def tool(self):
        # Janis tool identifier (also the executable name).
        return "featureCounts"

    @classmethod
    def subread_command(cls):
        # Sub-command of the subread suite to invoke.
        return "featureCounts"

    def inputs(self):
        """Required inputs plus all optional flags from ``additional_inputs``."""
        return [
            *self.additional_inputs,
            ToolInput(
                "bam",
                Array(Bam),
                position=10,
                doc=
                "A list of SAM or BAM format files. They can be either name or location sorted. If no files provided, <stdin> input is expected. Location-sorted paired-end reads are automatically sorted by read names.",
            ),
            ToolInput(
                "outputFilename",
                Filename(extension=".txt"),
                prefix="-o",
                doc=
                "Name of output file including read counts. A separate file including summary statistics of counting results is also included in the output ('<string>.summary'). Both files are in tab delimited format.",
            ),
            ToolInput(
                "annotationFile",
                File,
                prefix="-a",
                doc=
                "Name of an annotation file. GTF/GFF format by default. See -F option for more format information. Inbuilt annotations (SAF format) is available in 'annotation' directory of the package. Gzipped file is also accepted.",
            ),
        ]

    def outputs(self):
        """The count table written to the path given by ``outputFilename``."""
        return [
            ToolOutput("out", TextFile, glob=InputSelector("outputFilename"))
        ]

    def friendly_name(self):
        return "featureCounts"

    def bind_metadata(self):
        # Local import keeps `date` out of the module namespace.
        from datetime import date

        return ToolMetadata(
            contributors=["Jiaan Yu"],
            dateCreated=date(2020, 7, 16),
            dateUpdated=date(2020, 7, 16),
            institution="Walter and Eliza Hall Institute of Medical Research",
            doi=None,
            citation=None,
            keywords=["subread", "featureCounts"],
            documentationUrl=
            "https://www.rdocumentation.org/packages/Rsubread/versions/1.22.2/topics/featureCounts",
            documentation=
            """FeatureCounts: A General-Purpose Read Summarization Function
This function assigns mapped sequencing reads to genomic features""".strip(),
        )

    # Optional flags, spliced at the front of inputs(). Tag names are part of
    # the public workflow interface and must not be renamed.
    additional_inputs = [
        ToolInput(
            "format",
            String(optional=True),
            prefix="-F",
            doc=
            "Specify format of the provided annotation file. Acceptable formats include 'GTF' (or compatible GFF format) and 'SAF'. 'GTF' by default.  For SAF format, please refer to Users Guide.",
        ),
        ToolInput(
            "featureType",
            Array(String(), optional=True),
            prefix="-t",
            separator=",",
            doc=
            "Specify feature type(s) in a GTF annotation. If multiple types are provided, they should be separated by ',' with no space in between. 'exon' by default. Rows in the annotation with a matched feature will be extracted and used for read mapping.",
        ),
        ToolInput(
            "attributeType",
            String(optional=True),
            prefix="-g",
            doc=
            "Specify attribute type in GTF annotation. 'gene_id' by default. Meta-features used for read counting will be extracted from annotation using the provided value.",
        ),
        ToolInput(
            "extraAttributes",
            Array(String(), optional=True),
            separator=",",
            prefix="--extraAttributes",
            doc=
            "Extract extra attribute types from the provided GTF annotation and include them in the counting output. These attribute types will not be used to group features. If more than one attribute type is provided they should be separated by comma.",
        ),
        # NOTE(review): tag misspelling ("chromsomeAlias") kept for backward
        # compatibility with existing workflows.
        ToolInput(
            "chromsomeAlias",
            String(optional=True),
            prefix="-A",
            doc=
            "Provide a chromosome name alias file to match chr names inannotation with those in the reads. This should be a two-column comma-delimited text file. Its first column should include chr names in the annotation and its second column should include chr names in the reads. Chr names are case sensitive. No column header should be included in the file.",
        ),
        ToolInput(
            "featureLevel",
            Boolean(optional=True),
            prefix="-f",
            doc=
            "Perform read counting at feature level (eg. counting reads for exons rather than genes).",
        ),
        ToolInput(
            "overlap",
            Boolean(optional=True),
            prefix="-O",
            doc=
            "Assign reads to all their overlapping meta-features (or features if -f is specified).",
        ),
        ToolInput(
            "minOverlap",
            Int(optional=True),
            prefix="--minOverlap",
            doc=
            "Minimum number of overlapping bases in a read that isrequired for read assignment. 1 by default. Number ofoverlapping bases is counted from both reads if pairedend. If a negative value is provided, then a gap of upto specified size will be allowed between read and the feature that the read is assigned to.",
        ),
        ToolInput(
            "fracOverlap",
            Float(optional=True),
            prefix="--fracOverlap",
            doc=
            "Minimum fraction of overlapping bases in a read that isrequired for read assignment. Value should be within range [0,1]. 0 by default. Number of overlapping bases is counted from both reads if paired end. Both this option and '--minOverlap' option need to be satisfied for read assignment.",
        ),
        ToolInput(
            "fracOverlapFeature",
            Float(optional=True),
            prefix="--fracOverlapFeature",
            doc=
            "Minimum fraction of overlapping bases in a feature that is required for read assignment. Value should be within range [0,1]. 0 by default.",
        ),
        ToolInput(
            "largestOverlap",
            Boolean(optional=True),
            prefix="--largestOverlap",
            doc=
            "Assign reads to a meta-feature/feature that has the  largest number of overlapping bases.",
        ),
        ToolInput(
            "nonOverlap",
            Int(optional=True),
            prefix="--nonOverlap",
            doc=
            "Maximum number of non-overlapping bases in a read (or a read pair) that is allowed when being assigned to a feature. No limit is set by default.",
        ),
        ToolInput(
            "nonOverlapFeature",
            Int(optional=True),
            prefix="--nonOverlapFeature",
            doc=
            "Maximum number of non-overlapping bases in a feature that is allowed in read assignment. No limit is set by default.",
        ),
        ToolInput(
            "readExtensionFive",
            Int(optional=True),
            prefix="--readExtension5",
            doc="Reads are extended upstream by <int> bases from their 5' end.",
        ),
        # NOTE(review): String while readExtensionFive is Int — featureCounts
        # expects an integer here; type kept to preserve the existing interface.
        ToolInput(
            "readExtensionThree",
            String(optional=True),
            prefix="--readExtension3",
            doc="Reads are extended upstream by <int> bases from their 3' end.",
        ),
        ToolInput(
            "readToPos",
            String(optional=True),
            prefix="--read2pos",
            doc=
            "Reduce reads to their 5' most base or 3' most base. Read counting is then performed based on the single base the read is reduced to.",
        ),
        ToolInput(
            "multiMapping",
            Boolean(optional=True),
            prefix="-M",
            doc=
            "Multi-mapping reads will also be counted. For a multi-mapping read, all its reported alignments will be counted. The 'NH' tag in BAM/SAM input is used to detect multi-mapping reads.",
        ),
        # NOTE(review): tag misspelling ("fration") kept for backward
        # compatibility with existing workflows; the CLI prefix is correct.
        ToolInput(
            "fration",
            Boolean(optional=True),
            prefix="--fraction",
            doc=
            "Assign fractional counts to features. This option must be used together with '-M' or '-O' or both. When '-M' is specified, each reported alignment from a multi-mapping read (identified via 'NH' tag) will carry a fractional count of 1/x, instead of 1 (one), where x is the total number of alignments reported for the same read. When '-O' is specified, each overlapping feature will receive a fractional count of 1/y, where y is the total number of features overlapping with the read. When both '-M' and '-O' are specified, each alignment will carry a fractional count of 1/(x*y).",
        ),
        ToolInput(
            "quality",
            String(optional=True),
            prefix="-Q",
            doc=
            "The minimum mapping quality score a read must satisfy in order to be counted. For paired-end reads, at least one end should satisfy this criteria. 0 by default.",
        ),
        ToolInput(
            "splitOnly",
            Boolean(optional=True),
            prefix="--splitOnly",
            doc=
            "Count split alignments only (ie. alignments with CIGAR string containing 'N'). An example of split alignments is exon-spanning reads in RNA-seq data.",
        ),
        ToolInput(
            "nonSplitOnly",
            Boolean(optional=True),
            prefix="--nonSplitOnly",
            doc=
            "If specified, only non-split alignments (CIGAR strings do not contain letter 'N') will be counted. All the other alignments will be ignored.",
        ),
        ToolInput(
            "primary",
            Boolean(optional=True),
            prefix="--primary",
            doc=
            "Count primary alignments only. Primary alignments are identified using bit 0x100 in SAM/BAM FLAG field.",
        ),
        ToolInput(
            "ignoreDup",
            Boolean(optional=True),
            prefix="--ignoreDup",
            doc=
            "Ignore duplicate reads in read counting. Duplicate reads are identified using bit Ox400 in BAM/SAM FLAG field. The whole read pair is ignored if one of the reads is a duplicate read for paired end data.",
        ),
        ToolInput(
            "strandness",
            String(optional=True),
            # BUGFIX: prefix was "-", which would emit an invalid bare dash on
            # the command line. featureCounts' strand-specific counting option
            # is '-s' (0 unstranded / 1 stranded / 2 reversely stranded).
            prefix="-s",
            doc=
            "Perform strand-specific read counting. A single integer value (applied to all input files) or a string of comma-separated values (applied to each corresponding input file) should be provided. Possible values include: 0 (unstranded), 1 (stranded) and 2 (reversely stranded). Default value is 0 (ie. unstranded read counting carried out for all input files).",
        ),
        ToolInput(
            "junction",
            String(optional=True),
            prefix="-J",
            doc=
            "Count number of reads supporting each exon-exon junction. Junctions were identified from those exon-spanning reads in the input (containing 'N' in CIGAR string). Counting results are saved to a file named '<output_file>.jcounts'",
        ),
        ToolInput(
            "genome",
            File(optional=True),
            prefix="-G",
            doc=
            "Provide the name of a FASTA-format file that contains thereference sequences used in read mapping that produced the provided SAM/BAM files. This optional argument can be used with '-J' option to improve read counting for junctions.",
        ),
        ToolInput(
            "pairEnd",
            Boolean(optional=True),
            prefix="-p",
            doc=
            "If specified, fragments (or templates) will be counted instead of reads. This option is only applicable for paired-end reads; single-end reads are always counted as reads.",
        ),
        ToolInput(
            "both",
            Boolean(optional=True),
            prefix="-B",
            doc="Only count read pairs that have both ends aligned.",
        ),
        ToolInput(
            "pairEndDistance",
            Boolean(optional=True),
            prefix="-P",
            doc=
            "Check validity of paired-end distance when counting read  pairs. Use -d and -D to set thresholds.",
        ),
        ToolInput(
            "minDistance",
            Int(optional=True),
            prefix="-d",
            doc="Minimum fragment/template length, 50 by default.",
        ),
        ToolInput(
            "maxDistance",
            Int(optional=True),
            prefix="-D",
            doc="Maximum fragment/template length, 600 by default.",
        ),
        ToolInput(
            "countRead",
            Boolean(optional=True),
            prefix="-C",
            doc=
            "Do not count read pairs that have their two ends mapping to different chromosomes or mapping to same chromosome but on different strands.",
        ),
        ToolInput(
            "doNotSort",
            Boolean(optional=True),
            prefix="--donotsort",
            doc=
            "Do not sort reads in BAM/SAM input. Note that reads from the same pair are required to be located next to each other in the input.",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            prefix="-T",
            doc="Number of the threads. 1 by default.",
        ),
        ToolInput(
            "byReadGroup",
            Boolean(optional=True),
            prefix="--byReadGroup",
            doc=
            "Assign reads by read group. 'RG' tag is required to be present in the input BAM/SAM files.",
        ),
        ToolInput(
            "longRead",
            Boolean(optional=True),
            prefix="-L",
            doc=
            "Count long reads such as Nanopore and PacBio reads. Long read counting can only run in one thread and only reads (not read-pairs) can be counted. There is no limitation on the number of 'M' operations allowed in a CIGAR string in long read counting.",
        ),
        ToolInput(
            "outputFormat",
            String(optional=True),
            prefix="-R",
            doc=
            "Output detailed assignment results for each read or read-pair. Results are saved to a file that is in one of the following formats: CORE, SAM and BAM. See Users Guide for more info about these formats.",
        ),
        ToolInput(
            "outputDirectory",
            String(optional=True),
            prefix="--Rpath",
            doc=
            "Specify a directory to save the detailed assignment results. If unspecified, the directory where counting results are saved is used.",
        ),
        ToolInput(
            "tmpDir",
            String(optional=True),
            prefix="--tmpDir",
            doc=
            "Directory under which intermediate files are saved (later removed). By default, intermediate files will be saved to the directory specified in '-o' argument.",
        ),
        ToolInput(
            "maxMOp",
            Int(optional=True),
            prefix="--maxMOp",
            doc=
            "Maximum number of 'M' operations allowed in a CIGAR string. 10 by default. Both 'X' and '=' are treated as 'M' and adjacent 'M' operations are merged in the CIGAR string.",
        ),
    ]
예제 #7
0
    def test_array_of_strings(self):
        """The CWL rendering of Array(String) is an array-of-string schema."""
        cwl_repr = Array(String()).cwl_type().save()
        self.assertEqual({"type": "array", "items": "string"}, cwl_repr)
 def test_validate_string_nooptional_disallowoptional_novalue(self):
     """A non-optional String must reject a missing (None) value."""
     is_valid = String().validate_value(None, False)
     self.assertFalse(is_valid)
 def test_array_valid(self):
     """A list of strings validates against Array(String)."""
     values = ["aa", "bb"]
     self.assertTrue(Array(String()).validate_value(values, True))
예제 #10
0
    def inputs(self) -> List[ToolInput]:
        return [
            ToolInput(
                "inputFile",
                CompressedVcf(),
                prefix="--input_file",
                doc="Input file name. Can use compressed file (gzipped).",
            ),
            ToolInput(
                "outputFilename",
                Filename(prefix=InputSelector("inputFile"), extension=".vcf"),
                prefix="--output_file",
                doc=
                "(-o) Output file name. Results can write to STDOUT by specifying "
                ' as the output file name - this will force quiet mode. Default = "variant_effect_output.txt"',
            ),
            ToolInput(
                "vcf",
                Boolean(),
                default=True,
                prefix="--vcf",
                doc=
                "Writes output in VCF format. Consequences are added in the INFO field of the VCF file, using the "
                'key "CSQ". Data fields are encoded separated by "|"; the order of fields is written in the VCF header.'
                ' Output fields in the "CSQ" INFO field can be selected by using --fields. If the input format was VCF,'
                " the file will remain unchanged save for the addition of the CSQ field (unless using any filtering). "
                "Custom data added with --custom are added as separate fields, using the key specified for each data "
                "file. Commas in fields are replaced with ampersands (&) to preserve VCF format.",
            ),
            # ToolInput('plugin', [PLUGINS](optional=True), prefix='--plugin',
            #           doc='Use named plugin. Plugin modules should be installed in the Plugins subdirectory of the VEP cache directory (defaults to $HOME/.vep/). Multiple plugins can be used by supplying the --plugin flag multiple times. See plugin documentation. Not used by default'),
            ToolInput(
                "help",
                Boolean(optional=True),
                prefix="--help",
                doc="Display help message and quit",
            ),
            ToolInput(
                "quiet",
                Boolean(optional=True),
                prefix="--quiet",
                doc="(-q) Suppress warning messages.Not used by default",
            ),
            ToolInput(
                "verbose",
                Boolean(optional=True),
                prefix="--verbose",
                doc=
                "(-v) Print out a bit more information while running. Not used by default",
            ),
            ToolInput(
                "config",
                File(optional=True),
                prefix="--config",
                doc=
                """Load configuration options from a config file. The config file should consist of whitespace-separated pairs of option names and settings e.g.:

            output_file   my_output.txt
            species       mus_musculus
            format        vcf
            host          useastdb.ensembl.org

            A config file can also be implicitly read; save the file as $HOME/.vep/vep.ini (or equivalent directory if 
            using --dir). Any options in this file will be overridden by those specified in a config file using --config, 
            and in turn by any options specified on the command line. You can create a quick version file of this by 
            setting the flags as normal and running VEP in verbose (-v) mode. This will output lines that can be copied 
            to a config file that can be loaded in on the next run using --config. Not used by default""",
            ),
            ToolInput(
                "everything",
                Boolean(optional=True),
                prefix="--everything",
                doc=
                "(-e) Shortcut flag to switch on all of the following: --sift b, --polyphen b, --ccds, "
                "--uniprot, --hgvs, --symbol, --numbers, --domains, --regulatory, --canonical, --protein, "
                "--biotype, --uniprot, --tsl, --appris, --gene_phenotype --af, --af_1kg, --af_esp, "
                "--af_gnomad, --max_af, --pubmed, --variant_class, --mane",
            ),
            ToolInput(
                "species",
                String(optional=True),
                prefix="--species",
                doc=
                'Species for your data. This can be the latin name e.g. "homo_sapiens" or any Ensembl alias e.g. '
                '"mouse". Specifying the latin name can speed up initial database connection as the registry does '
                'not have to load all available database aliases on the server. Default = "homo_sapiens"',
            ),
            ToolInput(
                "assembly",
                String(optional=True),
                prefix="--assembly",
                doc=
                """(-a) Select the assembly version to use if more than one available. If using the cache, you must 
                have the appropriate assembly's cache file installed. If not specified and you have only 1 assembly 
                version installed, this will be chosen by default. Default = use found assembly version""",
            ),
            ToolInput(
                "inputData",
                String(optional=True),
                prefix="--input_data",
                doc=
                "(--id) Raw input data as a string. May be used, for example, to input a single rsID or HGVS "
                "notation quickly to vep: --input_data rs699",
            ),
            ToolInput(
                "format",
                String(optional=True),
                prefix="--format",
                doc=
                'Input file format - one of "ensembl", "vcf", "hgvs", "id", "region", "spdi". By default, '
                "VEP auto-detects the input file format. Using this option you can specify the input file is "
                "Ensembl, VCF, IDs, HGVS, SPDI or region format. Can use compressed version (gzipped) of any "
                "file format listed above. Auto-detects format by default",
            ),
            ToolInput(
                "forceOverwrite",
                Boolean(optional=True),
                prefix="--force_overwrite",
                doc=
                "(--force) By default, VEP will fail with an error if the output file already exists. You can "
                "force the overwrite of the existing file by using this flag. Not used by default",
            ),
            ToolInput(
                "statsFile",
                String(optional=True),
                default="variant_effect_output.txt_summary.html",
                prefix="--stats_file",
                doc=
                "(--sf) Summary stats file name. This is an HTML file containing a summary of the VEP run - the "
                'file name must end ".htm" or ".html". Default = "variant_effect_output.txt_summary.html"',
            ),
            ToolInput(
                "noStats",
                Boolean(optional=True),
                prefix="--no_stats",
                doc=
                """Don\'t generate a stats file. Provides marginal gains in run time.""",
            ),
            ToolInput(
                "statsText",
                Boolean(optional=True),
                prefix="--stats_text",
                doc="Generate a plain text stats file in place of the HTML.",
            ),
            ToolInput(
                "warningFile",
                Filename(suffix="warning", extension=".txt"),
                prefix="--warning_file",
                doc=
                "File name to write warnings and errors to. Default = STDERR (standard error)",
            ),
            ToolInput(
                "maxSvSize",
                Boolean(optional=True),
                prefix="--max_sv_size",
                doc=
                "Extend the maximum Structural Variant size VEP can process.",
            ),
            ToolInput(
                "noCheckVariantsOrder",
                Boolean(optional=True),
                prefix="--no_check_variants_order",
                doc=
                "Permit the use of unsorted input files. However running VEP on unsorted input files slows down "
                "the tool and requires more memory.",
            ),
            ToolInput(
                "fork",
                Int(optional=True),
                default=CpuSelector(),
                prefix="--fork",
                doc=
                "Enable forking, using the specified number of forks. Forking can dramatically improve runtime. "
                "Not used by default",
            ),
            ToolInput(
                "custom",
                Array(BedTabix, optional=True),
                prefix="--custom",
                prefix_applies_to_all_elements=True,
                doc=
                "Add custom annotation to the output. Files must be tabix indexed or in the bigWig format. "
                "Multiple files can be specified by supplying the --custom flag multiple times. "
                "See https://asia.ensembl.org/info/docs/tools/vep/script/vep_custom.html for full details. "
                "Not used by default",
            ),
            ToolInput(
                "gff",
                File(optional=True),
                prefix="--gff",
                doc=
                "Use GFF transcript annotations in [filename] as an annotation source. "
                "Requires a FASTA file of genomic sequence.Not used by default",
            ),
            ToolInput(
                "gtf",
                File(optional=True),
                prefix="--gtf",
                doc=
                "Use GTF transcript annotations in [filename] as an annotation source. "
                "Requires a FASTA file of genomic sequence.Not used by default",
            ),
            ToolInput(
                "bam",
                Bam(optional=True),
                prefix="--bam",
                doc=
                "ADVANCED Use BAM file of sequence alignments to correct transcript models not derived from "
                "reference genome sequence. Used to correct RefSeq transcript models. "
                "Enables --use_transcript_ref; add --use_given_ref to override this behaviour. Not used by default",
            ),
            ToolInput(
                "useTranscriptRef",
                Boolean(optional=True),
                prefix="--use_transcript_ref",
                doc=
                "By default VEP uses the reference allele provided in the input file to calculate consequences "
                "for the provided alternate allele(s). Use this flag to force VEP to replace the provided "
                "reference allele with sequence derived from the overlapped transcript. This is especially "
                "relevant when using the RefSeq cache, see documentation for more details. The GIVEN_REF and "
                "USED_REF fields are set in the output to indicate any change. Not used by default",
            ),
            ToolInput(
                "useGivenRef",
                Boolean(optional=True),
                prefix="--use_given_ref",
                doc=
                "Using --bam or a BAM-edited RefSeq cache by default enables --use_transcript_ref; add this flag "
                "to override this behaviour and use the provided reference allele from the input. Not used by default",
            ),
            ToolInput(
                "customMultiAllelic",
                Boolean(optional=True),
                prefix="--custom_multi_allelic",
                doc=
                "By default, comma separated lists found within the INFO field of custom annotation VCFs are "
                "assumed to be allele specific. For example, a variant with allele_string A/G/C with associated "
                'custom annotation "single,double,triple" will associate triple with C, double with G and single '
                "with A. This flag instructs VEP to return all annotations for all alleles. Not used by default",
            ),
            ToolInput(
                "tab",
                Boolean(optional=True),
                prefix="--tab",
                doc=
                "Writes output in tab-delimited format. Not used by default",
            ),
            ToolInput(
                "json",
                Boolean(optional=True),
                prefix="--json",
                doc="Writes output in JSON format. Not used by default",
            ),
            ToolInput(
                "compressOutput",
                String(optional=True),
                default="bgzip",
                prefix="--compress_output",
                doc=
                "Writes output compressed using either gzip or bgzip. Not used by default",
            ),
            ToolInput(
                "fields",
                Array(String, optional=True),
                prefix="--fields",
                doc=
                """Configure the output format using a comma separated list of fields.
Can only be used with tab (--tab) or VCF format (--vcf) output.
For the tab format output, the selected fields may be those present in the default output columns, or 
any of those that appear in the Extra column (including those added by plugins or custom annotations). 
Output remains tab-delimited. For the VCF format output, the selected fields are those present within the ""CSQ"" INFO field.

Example of command for the tab output:

--tab --fields ""Uploaded_variation,Location,Allele,Gene""
Example of command for the VCF format output:

--vcf --fields ""Allele,Consequence,Feature_type,Feature""
Not used by default""",
            ),
            ToolInput(
                "minimal",
                Boolean(optional=True),
                prefix="--minimal",
                doc=
                "Convert alleles to their most minimal representation before consequence calculation i.e. "
                "sequence that is identical between each pair of reference and alternate alleles is trimmed "
                "off from both ends, with coordinates adjusted accordingly. Note this may lead to discrepancies "
                "between input coordinates and coordinates reported by VEP relative to transcript sequences; "
                "to avoid issues, use --allele_number and/or ensure that your input variants have unique "
                "identifiers. The MINIMISED flag is set in the VEP output where relevant. Not used by default",
            ),
            ToolInput(
                "variantClass",
                Boolean(optional=True),
                prefix="--variant_class",
                doc=
                "Output the Sequence Ontology variant class. Not used by default",
            ),
            ToolInput(
                "sift",
                String(optional=True),
                prefix="--sift",
                doc=
                "Species limited SIFT predicts whether an amino acid substitution affects protein function based "
                "on sequence homology and the physical properties of amino acids. VEP can output the prediction "
                "term, score or both. Not used by default",
            ),
            ToolInput(
                "polyphen",
                String(optional=True),
                prefix="--polyphen",
                doc=
                "Human only PolyPhen is a tool which predicts possible impact of an amino acid substitution on "
                "the structure and function of a human protein using straightforward physical and comparative "
                "considerations. VEP can output the prediction term, score or both. VEP uses the humVar score "
                "by default - use --humdiv to retrieve the humDiv score. Not used by default",
            ),
            ToolInput(
                "humdiv",
                Boolean(optional=True),
                prefix="--humdiv",
                doc=
                "Human only Retrieve the humDiv PolyPhen prediction instead of the default humVar. "
                "Not used by default",
            ),
            ToolInput(
                "nearest",
                String(optional=True),
                prefix="--nearest",
                doc=
                """Retrieve the transcript or gene with the nearest protein-coding transcription start site 
                (TSS) to each input variant. Use ""transcript"" to retrieve the transcript stable ID, ""gene"" to 
                retrieve the gene stable ID, or ""symbol"" to retrieve the gene symbol. Note that the nearest 
                TSS may not belong to a transcript that overlaps the input variant, and more than one may be 
                reported in the case where two are equidistant from the input coordinates.

            Currently only available when using a cache annotation source, and requires the Set::IntervalTree perl module.
            Not used by default""",
            ),
            ToolInput(
                "distance",
                Array(Int, optional=True),
                separator=",",
                prefix="--distance",
                doc=
                "Modify the distance up and/or downstream between a variant and a transcript for which VEP will assign the upstream_gene_variant or downstream_gene_variant consequences. Giving one distance will modify both up- and downstream distances; prodiving two separated by commas will set the up- (5') and down - (3') stream distances respectively. Default: 5000",
            ),
            ToolInput(
                "overlaps",
                Boolean(optional=True),
                prefix="--overlaps",
                doc=
                "Report the proportion and length of a transcript overlapped by a structural variant in VCF format.",
            ),
            ToolInput(
                "genePhenotype",
                Boolean(optional=True),
                prefix="--gene_phenotype",
                doc=
                "Indicates if the overlapped gene is associated with a phenotype, disease or trait. See list of phenotype sources. Not used by default",
            ),
            ToolInput(
                "regulatory",
                Boolean(optional=True),
                prefix="--regulatory",
                doc=
                "Look for overlaps with regulatory regions. VEP can also report if a variant falls in a high information position within a transcription factor binding site. Output lines have a Feature type of RegulatoryFeature or MotifFeature. Not used by default",
            ),
            ToolInput(
                "cellType",
                Boolean(optional=True),
                prefix="--cell_type",
                doc=
                "Report only regulatory regions that are found in the given cell type(s). Can be a single cell type or a comma-separated list. The functional type in each cell type is reported under CELL_TYPE in the output. To retrieve a list of cell types, use --cell_type list. Not used by default",
            ),
            ToolInput(
                "individual",
                Array(String, optional=True),
                prefix="--individual",
                separator=",",
                doc=
                'Consider only alternate alleles present in the genotypes of the specified individual(s). May be a single individual, a comma-separated list or "all" to assess all individuals separately. Individual variant combinations homozygous for the given reference allele will not be reported. Each individual and variant combination is given on a separate line of output. Only works with VCF files containing individual genotype data; individual IDs are taken from column headers. Not used by default',
            ),
            ToolInput(
                "phased",
                Boolean(optional=True),
                prefix="--phased",
                doc=
                "Force VCF genotypes to be interpreted as phased. For use with plugins that depend on phased data. Not used by default",
            ),
            ToolInput(
                "alleleNumber",
                Boolean(optional=True),
                prefix="--allele_number",
                doc=
                "Identify allele number from VCF input, where 1 = first ALT allele, 2 = second ALT allele etc. Useful when using --minimal Not used by default",
            ),
            ToolInput(
                "showRefAllele",
                Boolean(optional=True),
                prefix="--show_ref_allele",
                doc=
                'Adds the reference allele in the output. Mainly useful for the VEP "default" and tab-delimited output formats. Not used by default',
            ),
            ToolInput(
                "totalLength",
                Boolean(optional=True),
                prefix="--total_length",
                doc=
                "Give cDNA, CDS and protein positions as Position/Length. Not used by default",
            ),
            ToolInput(
                "numbers",
                Boolean(optional=True),
                prefix="--numbers",
                doc=
                "Adds affected exon and intron numbering to to output. Format is Number/Total. Not used by default",
            ),
            ToolInput(
                "noEscape",
                Boolean(optional=True),
                prefix="--no_escape",
                doc="Don't URI escape HGVS strings. Default = escape",
            ),
            ToolInput(
                "keepCsq",
                Boolean(optional=True),
                prefix="--keep_csq",
                doc=
                "Don't overwrite existing CSQ entry in VCF INFO field. Overwrites by default",
            ),
            ToolInput(
                "vcfInfoField",
                String(optional=True),
                prefix="--vcf_info_field",
                doc=
                'Change the name of the INFO key that VEP write the consequences to in its VCF output. Use "ANN" for compatibility with other tools such as snpEff. Default: CSQ',
            ),
            ToolInput(
                "terms",
                String(optional=True),
                prefix="--terms",
                doc=
                '(-t) The type of consequence terms to output. The Ensembl terms are described here. The Sequence Ontology is a joint effort by genome annotation centres to standardise descriptions of biological sequences. Default = "SO"',
            ),
            ToolInput(
                "noHeaders",
                Boolean(optional=True),
                prefix="--no_headers",
                doc=
                "Don't write header lines in output files. Default = add headers",
            ),
            ToolInput(
                "hgvs",
                Boolean(optional=True),
                prefix="--hgvs",
                doc=
                "Add HGVS nomenclature based on Ensembl stable identifiers to the output. Both coding and protein sequence names are added where appropriate. To generate HGVS identifiers when using --cache or --offline you must use a FASTA file and --fasta. HGVS notations given on Ensembl identifiers are versioned. Not used by default",
            ),
            ToolInput(
                "hgvsg",
                Boolean(optional=True),
                prefix="--hgvsg",
                doc=
                "Add genomic HGVS nomenclature based on the input chromosome name. To generate HGVS identifiers when using --cache or --offline you must use a FASTA file and --fasta. Not used by default",
            ),
            ToolInput(
                "shiftHgvs",
                Boolean(optional=True),
                prefix="--shift_hgvs",
                doc=
                """Enable or disable 3\' shifting of HGVS notations. When enabled, this causes ambiguous insertions or deletions (typically in repetetive sequence tracts) to be "shifted" to their most 3' possible coordinates (relative to the transcript sequence and strand) before the HGVS notations are calculated; the flag HGVS_OFFSET is set to the number of bases by which the variant has shifted, relative to the input genomic coordinates. Disabling retains the original input coordinates of the variant. Default: 1 (shift)""",
            ),
            ToolInput(
                "transcriptVersion",
                Boolean(optional=True),
                prefix="--transcript_version",
                doc="Add version numbers to Ensembl transcript identifiers",
            ),
            ToolInput(
                "protein",
                Boolean(optional=True),
                prefix="--protein",
                doc=
                "Add the Ensembl protein identifier to the output where appropriate. Not used by default",
            ),
            ToolInput(
                "symbol",
                Boolean(optional=True),
                prefix="--symbol",
                doc=
                "Adds the gene symbol (e.g. HGNC) (where available) to the output. Not used by default",
            ),
            ToolInput(
                "ccds",
                Boolean(optional=True),
                prefix="--ccds",
                doc=
                "Adds the CCDS transcript identifer (where available) to the output. Not used by default",
            ),
            ToolInput(
                "uniprot",
                Boolean(optional=True),
                prefix="--uniprot",
                doc=
                "Adds best match accessions for translated protein products from three UniProt-related databases (SWISSPROT, TREMBL and UniParc) to the output. Not used by default",
            ),
            ToolInput(
                "tsl",
                Boolean(optional=True),
                prefix="--tsl",
                doc=
                "Adds the transcript support level for this transcript to the output. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "appris",
                Boolean(optional=True),
                prefix="--appris",
                doc=
                "Adds the APPRIS isoform annotation for this transcript to the output. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "canonical",
                Boolean(optional=True),
                prefix="--canonical",
                doc=
                "Adds a flag indicating if the transcript is the canonical transcript for the gene. Not used by default",
            ),
            ToolInput(
                "mane",
                Boolean(optional=True),
                prefix="--mane",
                doc=
                "Adds a flag indicating if the transcript is the MANE Select transcript for the gene. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "biotype",
                Boolean(optional=True),
                prefix="--biotype",
                doc=
                "Adds the biotype of the transcript or regulatory feature. Not used by default",
            ),
            ToolInput(
                "domains",
                Boolean(optional=True),
                prefix="--domains",
                doc=
                "Adds names of overlapping protein domains to output. Not used by default",
            ),
            ToolInput(
                "xrefRefseq",
                Boolean(optional=True),
                prefix="--xref_refseq",
                doc=
                "Output aligned RefSeq mRNA identifier for transcript. Not used by default. Note: The RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND FREQUENTLY WILL NOT, match exactly in sequence, exon structure and protein product",
            ),
            ToolInput(
                "synonyms",
                Tsv(optional=True),
                prefix="--synonyms",
                doc=
                "Load a file of chromosome synonyms. File should be tab-delimited with the primary identifier in column 1 and the synonym in column 2. Synonyms allow different chromosome identifiers to be used in the input file and any annotation source (cache, database, GFF, custom file, FASTA file). Not used by default",
            ),
            ToolInput(
                "checkExisting",
                Boolean(optional=True),
                prefix="--check_existing",
                doc=
                """Checks for the existence of known variants that are co-located with your input. By default the alleles are compared and variants on an allele-specific basis - to compare only coordinates, use --no_check_alleles.

            Some databases may contain variants with unknown (null) alleles and these are included by default; to exclude them use --exclude_null_alleles.

            See this page for more details.

            Not used by default""",
            ),
            ToolInput(
                "checkSvs",
                Boolean(optional=True),
                prefix="--check_svs",
                doc=
                "Checks for the existence of structural variants that overlap your input. Currently requires database access. Not used by default",
            ),
            ToolInput(
                "clinSigAllele",
                Boolean(optional=True),
                prefix="--clin_sig_allele",
                doc=
                "Return allele specific clinical significance. Setting this option to 0 will provide all known clinical significance values at the given locus. Default: 1 (Provide allele-specific annotations)",
            ),
            ToolInput(
                "excludeNullAlleles",
                Boolean(optional=True),
                prefix="--exclude_null_alleles",
                doc=
                "Do not include variants with unknown alleles when checking for co-located variants. Our human database contains variants from HGMD and COSMIC for which the alleles are not publically available; by default these are included when using --check_existing, use this flag to exclude them. Not used by default",
            ),
            ToolInput(
                "noCheckAlleles",
                Boolean(optional=True),
                prefix="--no_check_alleles",
                doc=
                """When checking for existing variants, by default VEP only reports a co-located variant if none of the input alleles are novel. For example, if your input variant has alleles A/G, and an existing co-located variant has alleles A/C, the co-located variant will not be reported.

            Strand is also taken into account - in the same example, if the input variant has alleles T/G but on the negative strand, then the co-located variant will be reported since its alleles match the reverse complement of input variant.

            Use this flag to disable this behaviour and compare using coordinates alone. Not used by default""",
            ),
            ToolInput(
                "af",
                Boolean(optional=True),
                prefix="--af",
                doc=
                "Add the global allele frequency (AF) from 1000 Genomes Phase 3 data for any known co-located variant to the output. For this and all --af_* flags, the frequency reported is for the input allele only, not necessarily the non-reference or derived allele. Not used by default",
            ),
            ToolInput(
                "maxAf",
                Boolean(optional=True),
                prefix="--max_af",
                doc=
                "Report the highest allele frequency observed in any population from 1000 genomes, ESP or gnomAD. Not used by default",
            ),
            ToolInput(
                "af1kg",
                String(optional=True),
                prefix="--af_1kg",
                doc=
                "Add allele frequency from continental populations (AFR,AMR,EAS,EUR,SAS) of 1000 Genomes Phase 3 to the output. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "afEsp",
                Boolean(optional=True),
                prefix="--af_esp",
                doc=
                "Include allele frequency from NHLBI-ESP populations. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "afGnomad",
                Boolean(optional=True),
                prefix="--af_gnomad",
                doc=
                "Include allele frequency from Genome Aggregation Database (gnomAD) exome populations. Note only data from the gnomAD exomes are included; to retrieve data from the additional genomes data set, see this guide. Must be used with --cache Not used by default",
            ),
            ToolInput(
                "afExac",
                Boolean(optional=True),
                prefix="--af_exac",
                doc=
                "Include allele frequency from ExAC project populations. Must be used with --cache. Not used by default. Note: ExAC data has been superceded by gnomAD. This flag remains for those wishing to use older cache versions containing ExAC data.",
            ),
            ToolInput(
                "pubmed",
                Boolean(optional=True),
                prefix="--pubmed",
                doc=
                "Report Pubmed IDs for publications that cite existing variant. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "failed",
                Boolean(optional=True),
                prefix="--failed",
                doc=
                "When checking for co-located variants, by default VEP will exclude variants that have been flagged as failed. Set this flag to include such variants. Default: 0 (exclude)",
            ),
            ToolInput(
                "gencodeBasic",
                Boolean(optional=True),
                prefix="--gencode_basic",
                doc=
                "Limit your analysis to transcripts belonging to the GENCODE basic set. This set has fragmented or problematic transcripts removed. Not used by default",
            ),
            ToolInput(
                "excludePredicted",
                Boolean(optional=True),
                prefix="--exclude_predicted",
                doc=
                'When using the RefSeq or merged cache, exclude predicted transcripts (i.e. those with identifiers beginning with "XM_" or "XR_").',
            ),
            ToolInput(
                "transcriptFilter",
                Boolean(optional=True),
                prefix="--transcript_filter",
                doc=
                '''ADVANCED Filter transcripts according to any arbitrary set of rules. Uses similar notation to filter_vep.

            You may filter on any key defined in the root of the transcript object; most commonly this will be ""stable_id"":

            --transcript_filter ""stable_id match N[MR]_""''',
            ),
            ToolInput(
                "checkRef",
                Boolean(optional=True),
                prefix="--check_ref",
                doc=
                "Force VEP to check the supplied reference allele against the sequence stored in the Ensembl Core database or supplied FASTA file. Lines that do not match are skipped. Not used by default",
            ),
            ToolInput(
                "lookupRef",
                Boolean(optional=True),
                prefix="--lookup_ref",
                doc=
                "Force overwrite the supplied reference allele with the sequence stored in the Ensembl Core database or supplied FASTA file. Not used by default",
            ),
            ToolInput(
                "dontSkip",
                Boolean(optional=True),
                prefix="--dont_skip",
                doc=
                "Don't skip input variants that fail validation, e.g. those that fall on unrecognised sequences. Combining --check_ref with --dont_skip will add a CHECK_REF output field when the given reference does not match the underlying reference sequence.",
            ),
            ToolInput(
                "allowNonVariant",
                Boolean(optional=True),
                prefix="--allow_non_variant",
                doc=
                "When using VCF format as input and output, by default VEP will skip non-variant lines of input (where the ALT allele is null). Enabling this option the lines will be printed in the VCF output with no consequence data added.",
            ),
            ToolInput(
                "chr",
                Array(String, optional=True),
                prefix="--chr",
                separator=",",
                doc=
                'Select a subset of chromosomes to analyse from your file. Any data not on this chromosome in the input will be skipped. The list can be comma separated, with "-" characters representing an interval. For example, to include chromosomes 1, 2, 3, 10 and X you could use --chr 1-3,10,X Not used by default',
            ),
            ToolInput(
                "codingOnly",
                Boolean(optional=True),
                prefix="--coding_only",
                doc=
                "Only return consequences that fall in the coding regions of transcripts. Not used by default",
            ),
            ToolInput(
                "noIntergenic",
                Boolean(optional=True),
                prefix="--no_intergenic",
                doc=
                "Do not include intergenic consequences in the output. Not used by default",
            ),
            ToolInput(
                "pick",
                Boolean(optional=True),
                prefix="--pick",
                doc=
                "Pick once line or block of consequence data per variant, including transcript-specific columns. Consequences are chosen according to the criteria described here, and the order the criteria are applied may be customised with --pick_order. This is the best method to use if you are interested only in one consequence per variant. Not used by default",
            ),
            ToolInput(
                "pickAllele",
                Boolean(optional=True),
                prefix="--pick_allele",
                doc=
                "Like --pick, but chooses one line or block of consequence data per variant allele. Will only differ in behaviour from --pick when the input variant has multiple alternate alleles. Not used by default",
            ),
            ToolInput(
                "perGene",
                Boolean(optional=True),
                prefix="--per_gene",
                doc=
                "Output only the most severe consequence per gene. The transcript selected is arbitrary if more than one has the same predicted consequence. Uses the same ranking system as --pick. Not used by default",
            ),
            ToolInput(
                "pickAlleleGene",
                Boolean(optional=True),
                prefix="--pick_allele_gene",
                doc=
                "Like --pick_allele, but chooses one line or block of consequence data per variant allele and gene combination. Not used by default",
            ),
            ToolInput(
                "flagPick",
                Boolean(optional=True),
                prefix="--flag_pick",
                doc=
                "As per --pick, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "flagPickAllele",
                Boolean(optional=True),
                prefix="--flag_pick_allele",
                doc=
                "As per --pick_allele, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "flagPickAlleleGene",
                Boolean(optional=True),
                prefix="--flag_pick_allele_gene",
                doc=
                "As per --pick_allele_gene, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "pickOrder",
                Array(String, optional=True),
                prefix="--pick_order",
                separator=",",
                doc=
                """Customise the order of criteria (and the list of criteria) applied when choosing a block of annotation data with one of the following options: --pick, --pick_allele, --per_gene, --pick_allele_gene, --flag_pick, --flag_pick_allele, --flag_pick_allele_gene. See this page for the default order.
            Valid criteria are: [ canonical appris tsl biotype ccds rank length mane ]. e.g.:

            --pick --pick_order tsl,appris,rank""",
            ),
            ToolInput(
                "mostSevere",
                Boolean(optional=True),
                prefix="--most_severe",
                doc=
                "Output only the most severe consequence per variant. Transcript-specific columns will be left blank. Consequence ranks are given in this table. To include regulatory consequences, use the --regulatory option in combination with this flag. Not used by default",
            ),
            ToolInput(
                "summary",
                Boolean(optional=True),
                prefix="--summary",
                doc=
                "Output only a comma-separated list of all observed consequences per variant. Transcript-specific columns will be left blank. Not used by default",
            ),
            ToolInput(
                "filterCommon",
                Boolean(optional=True),
                prefix="--filter_common",
                doc=
                "Shortcut flag for the filters below - this will exclude variants that have a co-located existing variant with global AF > 0.01 (1%). May be modified using any of the following freq_* filters. Not used by default",
            ),
            ToolInput(
                "checkFrequency",
                Boolean(optional=True),
                prefix="--check_frequency",
                doc=
                "Turns on frequency filtering. Use this to include or exclude variants based on the frequency of co-located existing variants in the Ensembl Variation database. You must also specify all of the --freq_* flags below. Frequencies used in filtering are added to the output under the FREQS key in the Extra field. Not used by default",
            ),
            ToolInput(
                "freqPop",
                String(optional=True),
                prefix="--freq_pop",
                doc=
                "Name of the population to use in frequency filter. This must be one of the following: (1KG_ALL, 1KG_AFR, 1KG_AMR, 1KG_EAS, 1KG_EUR, 1KG_SAS, AA, EA, gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS)",
            ),
            ToolInput(
                "freqFreq",
                Float(optional=True),
                prefix="--freq_freq",
                doc=
                "Allele frequency to use for filtering. Must be a float value between 0 and 1",
            ),
            ToolInput(
                "freqGtLt",
                String(optional=True),
                prefix="--freq_gt_lt",
                doc=
                "Specify whether the frequency of the co-located variant must be greater than (gt) or less than (lt) the value specified with --freq_freq",
            ),
            ToolInput(
                "freqFilter",
                String(optional=True),
                prefix="--freq_filter",
                doc=
                "Specify whether to exclude or include only variants that pass the frequency filter",
            ),
            # CADD plugin
            ToolInput("caddReference", Array(VcfTabix, optional=True)),
            # Condel
            ToolInput(
                "condelConfig",
                Directory(optional=True),
                doc=
                "Directory containing CondelPlugin config, in format: '<dir>/condel_SP.conf'",
            ),
            # dbNSFP
            ToolInput("dbnspReference", VcfTabix(optional=True), doc=""),
            ToolInput("dbsnpColumns", Array(String, optional=True)),
            # REVEL
            ToolInput("revelReference", VcfTabix(optional=True)),
            # CUSTOM
            ToolInput("custom1Reference", VcfTabix(optional=True)),
            ToolInput("custom1Columns", Array(String, optional=True)),
            ToolInput("custom2Reference", VcfTabix(optional=True)),
            ToolInput("custom2Columns", Array(String, optional=True)),
        ]
 def test_validate_string_nooptional_disallowoptional_value(self):
     """A concrete (non-null) value validates against a non-optional String type."""
     is_valid = String().validate_value("aa", False)
     self.assertTrue(is_valid)
# Example (예제) #12 — upvotes: 0
class GATK3DepthOfCoverageBase(GATK3ToolBase, ABC):
    """Abstract base for the GATK3 DepthOfCoverage walker.

    Declares the walker's inputs (BAM, reference, optional intervals, plus the
    shared GATK3 engine arguments in ``additional_args``) and the family of
    per-sample summary tables that DepthOfCoverage writes next to the path
    given by ``outputPrefix``.
    """

    @classmethod
    def gatk_command(cls):
        # The walker name passed to the GATK3 command line (-T DepthOfCoverage).
        return "DepthOfCoverage"

    def friendly_name(self):
        return "GATK3 DepthOfCoverage: Determine coverage at different levels of partitioning and aggregation."

    def tool(self):
        return "Gatk3DepthOfCoverage"

    def inputs(self):
        """Tool-specific inputs followed by the shared GATK3 engine arguments."""
        return [
            ToolInput(
                "bam",
                BamBai(),
                prefix="-I",
                doc="Input file containing sequence  data (BAM or CRAM)",
                # DepthOfCoverage expects the index next to the BAM as <name>.bai
                # (replacing .bam), hence the secondary-file rename.
                secondaries_present_as={".bai": "^.bai"},
                position=10,
            ),
            ToolInput("reference",
                      FastaWithDict(),
                      prefix="-R",
                      doc="Reference sequence file"),
            ToolInput(
                "outputPrefix",
                String(),
                prefix="-o",
                doc=
                "An output file created by the walker. Will overwrite contents if file exists",
            ),
            ToolInput(
                "intervals",
                File(optional=True),
                prefix="-L",
                doc="One or more genomic intervals over which to operate",
            ),
            ToolInput(
                "excludeIntervals",
                File(optional=True),
                prefix="--excludeIntervals",
                doc="One or more genomic intervals to exclude from processing",
            ),
            *self.additional_args,
        ]

    def outputs(self):
        """Summary tables produced by DepthOfCoverage.

        Each output is the outputPrefix path plus a fixed suffix; the suffix
        meanings are described in the tool documentation below.
        """
        return [
            ToolOutput("sample",
                       TextFile(),
                       glob=InputSelector("outputPrefix"),
                       doc=""),
            ToolOutput(
                "sampleCumulativeCoverageCounts",
                TextFile(),
                glob=InputSelector("outputPrefix") +
                ".sample_cumulative_coverage_counts",
                doc="",
            ),
            ToolOutput(
                "sampleCumulativeCoverageProportions",
                TextFile(),
                glob=InputSelector("outputPrefix") +
                ".sample_cumulative_coverage_proportions",
                doc="",
            ),
            ToolOutput(
                "sampleIntervalStatistics",
                TextFile(),
                glob=InputSelector("outputPrefix") +
                ".sample_interval_statistics",
                doc="",
            ),
            ToolOutput(
                "sampleIntervalSummary",
                TextFile(),
                glob=InputSelector("outputPrefix") +
                ".sample_interval_summary",
                doc="",
            ),
            ToolOutput(
                "sampleStatistics",
                TextFile(),
                glob=InputSelector("outputPrefix") + ".sample_statistics",
                doc="",
            ),
            ToolOutput(
                "sampleSummary",
                TextFile(),
                glob=InputSelector("outputPrefix") + ".sample_summary",
                doc="",
            ),
        ]

    def bind_metadata(self):
        from datetime import date

        return ToolMetadata(
            contributors=["Jiaan Yu"],
            dateCreated=date(2020, 4, 9),
            dateUpdated=date(2020, 4, 9),
            institution="Broad Institute",
            doi=None,
            citation="",
            keywords=["gatk", "gatk3", "DepthOfCoverage"],
            documentationUrl=
            "https://github.com/broadinstitute/gatk-docs/blob/master/gatk3-tooldocs/3.8-0/org_broadinstitute_gatk_engine_CommandLineGATK.html",
            documentation="""Overview
This tool processes a set of bam files to determine coverage at different levels of partitioning and aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles, and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by mapping or base quality score.

Input
One or more bam files (with proper headers) to be analyzed for coverage statistics
(Optional) A REFSEQ file to aggregate coverage to the gene level (for information about creating the REFSEQ Rod, please consult the online documentation)
Output
Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:

no suffix: per locus coverage
_summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
_statistics: coverage histograms (# locus with X coverage), aggregated over all bases
_interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
_interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
_gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
_gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
_cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
_cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases""",
        )

    # Shared GATK3 engine arguments plus the DepthOfCoverage-specific options.
    additional_args = [
        # Engine parameters
        ToolInput(
            "argFile",
            File(optional=True),
            prefix="--arg_file",
            doc="Reads arguments from the specified file",
        ),
        ToolInput(
            "showFullBamList",
            Boolean(optional=True),
            prefix="--showFullBamList",
            doc="Emit list of input BAM/CRAM files to log",
        ),
        ToolInput(
            "read_buffer_size",
            Int(optional=True),
            prefix="--read_buffer_size",
            doc="Number of reads per SAM file to buffer in memory",
        ),
        ToolInput(
            "read_filter",
            Boolean(optional=True),
            prefix="--read_filter",
            doc="Filters to apply to reads before analysis",
        ),
        ToolInput(
            "disable_read_filter",
            Boolean(optional=True),
            prefix="--disable_read_filter",
            doc="Read filters to disable",
        ),
        ToolInput(
            "interval_set_rule",
            String(optional=True),
            prefix="--interval_set_rule",
            doc=
            "Set merging approach to use for combining interval inputs (UNION|INTERSECTION)",
        ),
        ToolInput(
            "interval_merging",
            String(optional=True),
            prefix="--interval_merging",
            # Fixed: this doc was a copy-paste of --interval_set_rule's doc;
            # per GATK3 CommandLineGATK, --interval_merging controls how
            # abutting intervals are treated.
            doc=
            "Interval merging rule for abutting intervals (ALL|OVERLAPPING_ONLY)",
        ),
        ToolInput(
            "interval_padding",
            Int(optional=True),
            prefix="--interval_padding",
            doc="Amount of padding (in bp) to add to each interval",
        ),
        ToolInput(
            "nonDeterministicRandomSeed",
            Boolean(optional=True),
            prefix="--nonDeterministicRandomSeed",
            doc="Use a non-deterministic random seed",
        ),
        ToolInput(
            "maxRuntime",
            String(optional=True),
            prefix="--maxRuntime",
            # Fixed: the previous doc described --maxRuntimeUnits; --maxRuntime
            # itself is the time limit (interpreted in maxRuntimeUnits).
            doc=
            "Stop execution cautiously as soon as possible once this amount of time has elapsed (measured in maxRuntimeUnits)",
        ),
        ToolInput(
            "downsampling_type",
            String(optional=True),
            prefix="--downsampling_type",
            # Fixed enum spelling: BY_SAMPLE (was "BY.sample").
            doc=
            "Type of read downsampling to employ at a given locus (NONE|ALL_READS|BY_SAMPLE)",
        ),
        ToolInput(
            "downsample_to_fraction",
            Float(optional=True),
            prefix="--downsample_to_fraction",
            doc=
            "Fraction of reads to downsample to Target coverage threshold for downsampling to coverage",
        ),
        ToolInput(
            "baq",
            String(optional=True),
            prefix="--baq",
            doc=
            "Type of BAQ calculation to apply in the engine (OFF|CALCULATE_AS_NECESSARY|RECALCULATE)",
        ),
        # ToolInput("baqGapOpenPenalty", Type(?), prefix="--baqGapOpenPenalty", doc="BAQ gap open penalty"),
        ToolInput(
            "refactor_NDN_cigar_string",
            Boolean(optional=True),
            prefix="--refactor_NDN_cigar_string",
            doc="Reduce NDN elements in CIGAR string",
        ),
        ToolInput(
            "fixMisencodedQuals",
            Boolean(optional=True),
            prefix="--fixMisencodedQuals",
            doc="Fix mis-encoded base quality scores",
        ),
        ToolInput(
            "allowPotentiallyMisencodedQuals",
            Boolean(optional=True),
            prefix="--allowPotentiallyMisencodedQuals",
            doc="Ignore warnings about base quality score encoding",
        ),
        ToolInput(
            "useOriginalQualities",
            Boolean(optional=True),
            prefix="--useOriginalQualities",
            doc="Use the base quality scores from the OQ tag",
        ),
        ToolInput(
            "defaultBaseQualities",
            Int(optional=True),
            prefix="--defaultBaseQualities",
            doc="Assign a default base quality",
        ),
        ToolInput(
            "performanceLog",
            Filename(),
            prefix="--performanceLog",
            doc="Write GATK runtime performance log to this file",
        ),
        ToolInput(
            "BQSR",
            File(optional=True),
            prefix="--BQSR",
            doc=
            "Input covariates table file for on-the-fly base quality score recalibration",
        ),
        # ToolInput("quantize_quals", Int(optional=True), prefix="--quantize_quals", doc="Quantize quality scores to a given number of levels (with -BQSR)"),
        # ToolInput("static_quantized_quals", Type(optional=True), prefix="--static_quantized_quals", doc="Use static quantized quality scores to a given number of levels (with -BQSR)"),
        ToolInput(
            "disable_indel_quals",
            Boolean(optional=True),
            prefix="--disable_indel_quals",
            doc=
            "Disable printing of base insertion and deletion tags (with -BQSR)",
        ),
        ToolInput(
            "emit_original_quals",
            Boolean(optional=True),
            prefix="--emit_original_quals",
            doc="Emit the OQ tag with the original base qualities (with -BQSR)",
        ),
        ToolInput(
            "preserve_qscores_less_than",
            Int(optional=True),
            prefix="--preserve_qscores_less_than",
            doc=
            "Don't recalibrate bases with quality scores less than this threshold (with -BQSR)",
        ),
        # ToolInput("globalQScorePrior", Type(optional=True), prefix="--globalQScorePrior", doc="globalQScorePrior")
        # Tool specific parameters
        ToolInput(
            "countType",
            String(optional=True),
            prefix="--countType",
            # Fixed: the doc's leading "How should" had been truncated.
            doc=
            "How should overlapping reads from the same fragment be handled? (COUNT_READS|COUNT_FRAGMENTS|COUNT_FRAGMENTS_REQUIRE_SAME_BASE)",
        ),
        ToolInput(
            "summaryCoverageThreshold",
            Array(Int(), optional=True),
            prefix="-ct",
            doc="Coverage threshold (in percent) for summarizing statistics",
            prefix_applies_to_all_elements=True,
        ),
    ]
예제 #13
0
class UncompressArchive(UnixTool):
    """Wrapper around GNU ``gunzip`` that decompresses a gzipped file.

    With the default ``-c`` flag the decompressed content is written to
    stdout (captured as the ``out`` output) and the input file is left
    unchanged.
    """

    def tool(self):
        # Internal tool identifier.
        return "UncompressArchive"

    def friendly_name(self):
        return "UncompressArchive"

    def tool_provider(self):
        return "GNU Project"

    def base_command(self):
        # The executable invoked by this wrapper.
        return "gunzip"

    def inputs(self):
        # The gzipped file is the sole positional argument; all flags are
        # declared once in `additional_inputs`.
        return [ToolInput("file", Gunzipped(), position=1), *self.additional_inputs]

    def outputs(self):
        # gunzip -c streams the decompressed data to stdout.
        return [ToolOutput("out", Stdout(File))]

    # Optional gzip/gunzip command-line flags, each mapped 1:1 to a ToolInput.
    additional_inputs = [
        ToolInput(
            "stdout",
            Boolean(optional=True),
            prefix="-c",
            # Defaults to True so the `out` Stdout output is always populated.
            default=True,
            doc="write on standard output, keep original files unchanged",
        ),
        ToolInput("decompress", Boolean(optional=True), prefix="-d", doc="decompress"),
        ToolInput(
            "force",
            Boolean(optional=True),
            prefix="-f",
            doc="force overwrite of output file and compress links",
        ),
        ToolInput(
            "keep",
            Boolean(optional=True),
            prefix="-k",
            doc="keep (don't delete) input files",
        ),
        ToolInput(
            "list",
            Boolean(optional=True),
            prefix="-l",
            doc="list compressed file contents",
        ),
        ToolInput(
            "noName",
            Boolean(optional=True),
            prefix="-n",
            doc="do not save or restore the original name and time stamp",
        ),
        ToolInput(
            "name",
            Boolean(optional=True),
            prefix="-N",
            doc="save or restore the original name and time stamp",
        ),
        ToolInput(
            "quiet", Boolean(optional=True), prefix="-q", doc="suppress all warnings"
        ),
        ToolInput(
            "recursive",
            Boolean(optional=True),
            prefix="-r",
            doc="operate recursively on directories",
        ),
        ToolInput(
            "suffix",
            String(optional=True),
            prefix="-s",
            doc="use suffix SUF on compressed files",
        ),
        ToolInput(
            "test",
            Boolean(optional=True),
            prefix="-t",
            doc="test compressed file integrity",
        ),
        ToolInput("fast", Boolean(optional=True), prefix="-1", doc="compress faster"),
        ToolInput("best", Boolean(optional=True), prefix="-9", doc="compress better"),
        ToolInput(
            "rsyncable",
            Boolean(optional=True),
            prefix="--rsyncable",
            doc="Make rsync-friendly archive",
        ),
    ]

    def bind_metadata(self):
        return ToolMetadata(
            contributors=["Jiaan Yu"],
            dateCreated=datetime(2020, 6, 11),
            dateUpdated=datetime(2020, 6, 11),
            documentation="",
        )

    def tests(self):
        # Regression test: decompress a known gzipped VCF and pin its exact
        # decompressed size and line count.
        return [
            TTestCase(
                name="basic",
                input={
                    "file": "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/petermac_testdata/1000G_phase1.snps.high_confidence.hg38.BRCA1.vcf.gz",
                },
                output=[
                    TTestExpectedOutput(
                        tag="out",
                        preprocessor=TTestPreprocessor.FileSize,
                        operator=operator.eq,
                        expected_value=160525,
                    ),
                    TTestExpectedOutput(
                        tag="out",
                        preprocessor=TTestPreprocessor.LineCount,
                        operator=operator.eq,
                        expected_value=625,
                    ),
                ],
            )
        ]
예제 #14
0
    def constructor(self):
        """Build the tumour/normal somatic variant-calling subworkflow.

        Pipeline shape (as wired below): split both BAMs by the optional
        intervals, call variants with GATK4 Mutect2, model read orientation,
        estimate contamination/segmentation, filter the Mutect2 calls, then
        uncompress, split/normalise and PASS-filter the resulting VCF.
        """

        # Workflow inputs: paired BAMs plus GATK resource files.
        self.input("normal_bam", BamBai)
        self.input("tumor_bam", BamBai)
        self.input("normal_name", String(optional=True))
        self.input(
            "intervals",
            Bed(optional=True),
            doc="This optional intervals file supports processing by regions. If this file resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("gnomad", VcfTabix)
        self.input("panel_of_normals", VcfTabix(optional=True))
        self.input("output_bam_name", String(optional=True))

        # split normal and tumor bam
        self.step(
            "normal_split_bam",
            self.process_subpipeline(bam=self.normal_bam, intervals=self.intervals),
        )
        self.step(
            "tumor_split_bam",
            self.process_subpipeline(bam=self.tumor_bam, intervals=self.intervals),
        )

        # variant calling + learn read orientation model
        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_1_3(
                normalBams=[self.normal_split_bam.out],
                tumorBams=[self.tumor_split_bam.out],
                normalSample=self.normal_name,
                intervals=self.intervals,
                reference=self.reference,
                germlineResource=self.gnomad,
                panelOfNormals=self.panel_of_normals,
                outputPrefix=self.normal_name,
                outputBamName=self.output_bam_name,
            ),
        )
        self.step(
            "learnorientationmodel",
            gatk4.Gatk4LearnReadOrientationModelLatest(
                f1r2CountsFiles=self.mutect2.f1f2r_out,
            ),
        )

        # calculate contamination and segmentation
        self.step(
            "getpileupsummaries",
            gatk4.Gatk4GetPileUpSummariesLatest(
                bam=self.tumor_split_bam.out,
                sites=self.gnomad,
                intervals=self.intervals,
            ),
        )
        self.step(
            "calculatecontamination",
            gatk4.Gatk4CalculateContaminationLatest(
                pileupTable=self.getpileupsummaries.out,
            ),
        )
        self.step(
            "filtermutect2calls",
            gatk4.Gatk4FilterMutectCallsLatest(
                vcf=self.mutect2.out,
                reference=self.reference,
                segmentationFile=self.calculatecontamination.segOut,
                contaminationTable=self.calculatecontamination.contOut,
                readOrientationModel=self.learnorientationmodel.out,
                statsFile=self.mutect2.stats,
            ),
        )

        # normalise and filter "PASS" variants
        self.step("uncompressvcf", UncompressArchive(file=self.filtermutect2calls.out))
        self.step(
            "splitnormalisevcf",
            SplitMultiAllele(
                # NOTE(review): the gunzip output is typed as generic stdout,
                # hence the explicit as_type(Vcf) cast for SplitMultiAllele.
                vcf=self.uncompressvcf.out.as_type(Vcf), reference=self.reference
            ),
        )
        self.step(
            "filterpass",
            VcfToolsvcftoolsLatest(
                vcf=self.splitnormalisevcf.out,
                # NOTE(review): "removeFileteredAll" (sic) — presumably matches
                # the parameter spelling in the VcfTools wrapper; confirm before
                # renaming anywhere.
                removeFileteredAll=True,
                recode=True,
                recodeINFOAll=True,
            ),
        )

        # Outputs: "variants" = filtered (but not PASS-only) VCF,
        # "out" = PASS-only, split/normalised, recoded VCF.
        self.output("variants", source=self.filtermutect2calls.out)
        self.output("out_bam", source=self.mutect2.bam)
        self.output("out", source=self.filterpass.out)
    def constructor(self):
        """Build the tumour-only GATK4 Mutect2 variant-calling subworkflow.

        Pipeline shape (as wired below): Mutect2 on a single BAM, learn the
        read-orientation model, estimate contamination, filter the calls, and
        split/normalise the filtered VCF.
        """

        # Workflow inputs: a single BAM plus GATK resource files.
        self.input("bam", BamBai)
        self.input(
            "intervals",
            Bed(optional=True),
            doc=
            "This optional interval supports processing by regions. If this input resolves "
            "to null, then GATK will process the whole genome per each tool's spec",
        )
        self.input("reference", FastaWithDict)
        self.input("gnomad", VcfTabix())
        self.input("panel_of_normals", VcfTabix())
        self.input("gatk_bam_str", String(optional=True))

        # variant calling + learn read orientation model
        self.step(
            "mutect2",
            gatk4.GatkMutect2_4_1_2(
                tumorBams=self.bam,
                intervals=self.intervals,
                reference=self.reference,
                panelOfNormals=self.panel_of_normals,
                germlineResource=self.gnomad,
                outputBamName=self.gatk_bam_str,
            ),
        )
        self.step(
            "learnorientationmodel",
            gatk4.Gatk4LearnReadOrientationModel_4_1_2(
                f1r2CountsFiles=self.mutect2.f1f2r_out),
        )

        # calculate contamination and segmentation
        self.step(
            "getpileupsummaries",
            gatk4.Gatk4GetPileUpSummaries_4_1_2(bam=self.bam,
                                                sites=self.gnomad,
                                                intervals=self.intervals),
        )
        self.step(
            "calculatecontamination",
            gatk4.Gatk4CalculateContamination_4_1_2(
                pileupTable=self.getpileupsummaries.out),
        )

        self.step(
            "filtermutect2calls",
            gatk4.Gatk4FilterMutectCalls_4_1_2(
                vcf=self.mutect2.out,
                reference=self.reference,
                segmentationFile=self.calculatecontamination.segOut,
                contaminationTable=self.calculatecontamination.contOut,
                readOrientationModel=self.learnorientationmodel.out,
                statsFile=self.mutect2.stats,
            ),
        )

        # normalise vcf
        self.step(
            "splitnormalisevcf",
            SplitMultiAlleleNormaliseVcf(
                compressedTabixVcf=self.filtermutect2calls.out,
                reference=self.reference,
            ),
        )

        # NOTE(review): "variants" is sourced from the raw mutect2 output here,
        # not from filtermutect2calls as in the paired workflow — confirm this
        # asymmetry is intended.
        self.output("variants", source=self.mutect2.out)
        self.output("out_bam", source=self.mutect2.bam)
        self.output("out", source=self.splitnormalisevcf.out)
 def test_array_valid_optional_internal(self):
     """An Array of optional Strings accepts a list containing None entries."""
     array_type = Array(String(optional=True))
     self.assertTrue(array_type.validate_value(["aa", None], False))
예제 #17
0
 def inputs(self):
     return [
         ToolInput(
             tag="aligned_inp",
             input_type=Bam(),
             prefix="-x",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "File in SAM/BAM/CRAM format with main alignments as generated by STAR (Aligned.out.sam). "
                 "Arriba extracts candidate reads from this file. This is sometimes /dev/stdin"
             ),
         ),
         ToolInput(
             tag="inp_chimeric",
             input_type=Bam(optional=True),
             prefix="-c",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR (Chimeric.out.sam). "
                 "This parameter is only required, if STAR was run with the parameter "
                 "'--chimOutType SeparateSAMold'. When STAR was run with the parameter "
                 "'--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c can be omitted. "
             ),
         ),
         ToolInput(
             tag="gtf_file",
             input_type=File(optional=True),
             prefix="-g",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "GTF file with gene annotation. The file may be gzip-compressed."
             ),
         ),
         ToolInput(
             tag="gtf_features",
             input_type=Csv(optional=True),
             prefix="-G",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Comma-/space-separated list of names of GTF features. "
                 "Default: gene_name=gene_name|gene_id gene_id=gene_id transcript_id=transcript_id feature_exon=exon feature_CDS=CDS "
             ),
         ),
         ToolInput(
             tag="reference",
             input_type=Fasta(optional=True),
             prefix="-a",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "FastA file with genome sequence (assembly). The file may be gzip-compressed. An index with "
                 "the file extension .fai must exist only if CRAM files are processed. "
             ),
         ),
         ToolInput(
             tag="blacklist",
             input_type=File(optional=True),
             prefix="-b",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "File containing blacklisted events (recurrent artifacts and transcripts observed in healthy tissue). "
             ),
         ),
         ToolInput(
             tag="known_fusions",
             input_type=Tsv(optional=True),
             prefix="-k",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "File containing known/recurrent fusions. Some cancer entities are often characterized by "
                 "fusions between the same pair of genes. In order to boost sensitivity, a list of known "
                 "fusions can be supplied using this parameter. The list must contain two columns with the "
                 "names of the fused genes, separated by tabs. "),
         ),
         ToolInput(
             tag="output_filename",
             input_type=Filename(extension=".tsv"),
             prefix="-o",
             default="fusions.tsv",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Output file with fusions that have passed all filters."
             ),
         ),
         ToolInput(
             tag="discarded_output_filename",
             input_type=Filename(suffix=".discarded", extension=".tsv"),
             prefix="-O",
             separate_value_from_prefix=True,
             default="fusions.discarded.tsv",
             doc=InputDocumentation(
                 doc=
                 "Output file with fusions that were discarded due to filtering."
             ),
         ),
         ToolInput(
             tag="structural_variants_coordinates",
             input_type=Tsv(optional=True),
             prefix="-d",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Tab-separated file with coordinates of structural variants found using whole-genome "
                 "sequencing data. These coordinates serve to increase sensitivity towards weakly expressed "
                 "fusions and to eliminate fusions with low evidence. "),
         ),
         ToolInput(
             tag="max_genomic_breakpoint_distance",
             input_type=Int(optional=True),
             prefix="-D",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When a file with genomic breakpoints obtained via whole-genome sequencing is supplied via "
                 "the -d parameter, this parameter determines how far a genomic breakpoint may be away from a "
                 "transcriptomic breakpoint to consider it as a related event. For events inside genes, the "
                 "distance is added to the end of the gene; for intergenic events, the distance threshold is "
                 "applied as is. Default: 100000 "),
         ),
         ToolInput(
             tag="strandedness",
             input_type=String(optional=True),
             prefix="-s",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Whether a strand-specific protocol was used for library preparation, and if so, the type of "
                 "strandedness (auto/yes/no/reverse). When unstranded data is processed, the strand can "
                 "sometimes be inferred from splice-patterns. But in unclear situations, stranded data helps"
                 " resolve ambiguities. Default: auto "),
         ),
         ToolInput(
             tag="contigs",
             input_type=Array(String(), optional=True),
             prefix="-i",
             doc=InputDocumentation(
                 doc=
                 "Comma-/space-separated list of interesting contigs. Fusions between genes on other contigs "
                 "are ignored. Contigs can be specified with or without the prefix 'chr'. "
                 "Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y "
             ),
         ),
         ToolInput(
             tag="filters",
             input_type=Array(String, optional=True),
             prefix="-f",
             separator=" ",
             doc=InputDocumentation(
                 doc=
                 "Comma-/space-separated list of filters to disable. By default all filters are enabled. "
                 "Valid values: homopolymer, same_gene, inconsistently_clipped, duplicates, low_entropy, "
                 "no_genomic_support, short_anchor, homologs, blacklist, pcr_fusions, isoforms, intronic, "
                 "uninteresting_contigs, read_through, genomic_support, mismatches, no_coverage, spliced, "
                 "mismappers, merge_adjacent, select_best, many_spliced, long_gap, min_support, "
                 "relative_support, end_to_end, known_fusions, non_coding_neighbors, intragenic_exonic, "
                 "hairpin, small_insert_size "),
         ),
         ToolInput(
             tag="max_e_value",
             input_type=Float(optional=True),
             prefix="-E",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Arriba estimates the number of fusions with a given number of supporting reads which one "
                 "would expect to see by random chance. If the expected number of fusions (e-value) is higher "
                 "than this threshold, the fusion is discarded by the 'relative_support' filter. Note: "
                 "Increasing this threshold can dramatically increase the number of false positives and may "
                 "increase the runtime of resource-intensive steps. Fractional values are possible. "
                 "Default: 0.300000 "),
         ),
         ToolInput(
             tag="min_supporting_reads",
             input_type=Int(optional=True),
             prefix="-S",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The 'min_support' filter discards all fusions with fewer than this many supporting reads "
                 "(split reads and discordant mates combined). Default: 2 "
             ),
         ),
         ToolInput(
             tag="max_mismappers",
             input_type=Float(optional=True),
             prefix="-m",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When more than this fraction of supporting reads turns out to be mismappers, the "
                 "'mismappers' filter discards the fusion. Default: 0.800000 "
             ),
         ),
         ToolInput(
             tag="max_homolog_identity",
             input_type=Float(optional=True),
             prefix="-L",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Genes with more than the given fraction of sequence identity are considered homologs and "
                 "removed by the 'homologs' filter. Default: 0.300000 "),
         ),
         ToolInput(
             tag="homopolymer_length",
             input_type=Int(optional=True),
             prefix="-H",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The 'homopolymer' filter removes breakpoints adjacent to homopolymers of the given length "
                 "or more. Default: 6 "),
         ),
         ToolInput(
             tag="read_through_distance",
             input_type=Int(optional=True),
             prefix="-R",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The 'read_through' filter removes read-through fusions where the breakpoints are "
                 "less than the given distance away from each other. Default: 10000 "
             ),
         ),
         ToolInput(
             tag="min_anchor_length",
             input_type=Int(optional=True),
             prefix="-A",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Alignment artifacts are often characterized by split reads coming from only one gene "
                 "and no discordant mates. Moreover, the split reads only align to a short stretch in one "
                 "of the genes. The 'short_anchor' filter removes these fusions. This parameter sets the "
                 "threshold in bp for what the filter considers short. Default: 23 "
             ),
         ),
         ToolInput(
             tag="many_spliced_events",
             input_type=Int(optional=True),
             prefix="-M",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The 'many_spliced' filter recovers fusions between genes that have at least this "
                 "many spliced breakpoints. Default: 4 "),
         ),
         ToolInput(
             tag="max_kmer_content",
             input_type=Float(optional=True),
             prefix="-K",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The 'low_entropy' filter removes reads with repetitive 3-mers. If the 3-mers make up more "
                 "than the given fraction of the sequence, then the read is discarded. Default: 0.600000 "
             ),
         ),
         ToolInput(
             tag="max_mismatch_pvalue",
             input_type=Float(optional=True),
             prefix="-V",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The 'mismatches' filter uses a binomial model to calculate a p-value for observing a given "
                 "number of mismatches in a read. If the number of mismatches is too high, the read is "
                 "discarded. Default: 0.010000 "),
         ),
         ToolInput(
             tag="fragment_length",
             input_type=Int(optional=True),
             prefix="-F",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When paired-end data is given, the fragment length is estimated automatically and this "
                 "parameter has no effect. But when single-end data is given, the mean fragment length "
                 "should be specified to effectively filter fusions that arise from hairpin structures. "
                 "Default: 200 "),
         ),
         ToolInput(
             tag="max_reads",
             input_type=Int(optional=True),
             prefix="-U",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Subsample fusions with more than the given number of supporting reads. This improves "
                 "performance without compromising sensitivity, as long as the threshold is high. Counting "
                 "of supporting reads beyond the threshold is inaccurate, obviously. Default: 300 "
             ),
         ),
         ToolInput(
             tag="quantile",
             input_type=Float(optional=True),
             prefix="-Q",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Highly expressed genes are prone to produce artifacts during library preparation. Genes "
                 "with an expression above the given quantile are eligible for filtering by the 'pcr_fusions' "
                 "filter. Default: 0.998000 "),
         ),
         ToolInput(
             tag="exonic_fraction",
             input_type=Float(optional=True),
             prefix="-e",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "The breakpoints of false-positive predictions of intragenic events are often both in exons. "
                 "True predictions are more likely to have at least one breakpoint in an intron, because "
                 "introns are larger. If the fraction of exonic sequence between two breakpoints is smaller "
                 "than the given fraction, the 'intragenic_exonic' filter discards the event. Default: 0.200000"
             ),
         ),
         ToolInput(
             tag="fusion_transcript",
             input_type=Boolean(optional=True),
             prefix="-T",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When set, the column 'fusion_transcript' is populated with the sequence of the fused genes "
                 "as assembled from the supporting reads. Specify the flag twice to also print the fusion "
                 "transcripts to the file containing discarded fusions (-O). Default: off "
             ),
         ),
         ToolInput(
             tag="peptide_sequence",
             input_type=Boolean(optional=True),
             prefix="-P",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When set, the column 'peptide_sequence' is populated with the sequence of the fused proteins "
                 "as assembled from the supporting reads. Specify the flag twice to also print the peptide "
                 "sequence to the file containing discarded fusions (-O). Default: off "
             ),
         ),
         ToolInput(
             tag="read_identifiers",
             input_type=Boolean(optional=True),
             prefix="-I",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "When set, the column 'read_identifiers' is populated with identifiers of the reads which "
                 "support the fusion. The identifiers are separated by commas. Specify the flag twice to "
                 "also print the read identifiers to the file containing discarded fusions (-O). Default: off "
             ),
         ),
         # ToolInput(
         #   tag="help",
         #   input_type=Boolean(optional=True),
         #   prefix="-h",
         #   separate_value_from_prefix=True,
         #   doc=InputDocumentation(doc="Print help and exit."),
         # ),
     ]
 def test_array_invalid_int_string(self):
     """A list mixing strings and an int still validates as Array(String).

     janis's String type coerces/accepts non-string primitives, so the
     mixed list is reported as valid even with strict checking enabled.
     """
     mixed_values = ["aa", 2]
     is_valid = Array(String()).validate_value(mixed_values, True)
     self.assertTrue(is_valid)
예제 #19
0
 def inputs(self):
     return [
         *super().inputs(),
         ToolInput(
             tag="inp",
             input_type=Array(Bam, optional=True),
             prefix="--input",
             separate_value_from_prefix=True,
             prefix_applies_to_all_elements=True,
             doc=InputDocumentation(
                 doc="(-I) BAM/SAM/CRAM file containing reads."
                 " This argument must be specified at least once. Required. "
             ),
         ),
         ToolInput(
             tag="outputFilename",
             input_type=Filename(extension=".bam"),
             prefix="--output",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-O) Write output to this BAM filename Required."),
         ),
         ToolInput(
             tag="reference",
             input_type=FastaWithIndexes(optional=True),
             prefix="--reference",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="(-R) Reference sequence file Required."),
         ),
         ToolInput(
             tag="addOutputSamProgramRecord",
             input_type=Boolean(optional=True),
             prefix="--add-output-sam-program-record",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-add-output-sam-program-record)  If true, adds a PG tag to created SAM/BAM/CRAM files.  "
                 "Default value: true. Possible values: {true, false} "),
         ),
         ToolInput(
             tag="addOutputVcfCommandLine",
             input_type=Boolean(optional=True),
             prefix="--add-output-vcf-command-line",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-add-output-vcf-command-line)  If true, adds a command line header line to created VCF files."
                 "Default value: true. Possible values: {true, false} "),
         ),
         ToolInput(
             tag="arguments_file",
             input_type=File(optional=True),
             prefix="--arguments_file",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "read one or more arguments files and add them to the command line This argument may be "
                 "specified 0 or more times. Default value: null. "),
         ),
         ToolInput(
             tag="cloudIndexPrefetchBuffer",
             input_type=Int(optional=True),
             prefix="--cloud-index-prefetch-buffer",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-CIPB)  Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer if unset.  Default value: -1. "
             ),
         ),
         ToolInput(
             tag="cloudPrefetchBuffer",
             input_type=Int(optional=True),
             prefix="--cloud-prefetch-buffer",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-CPB)  Size of the cloud-only prefetch buffer (in MB; 0 to disable).  Default value: 40. "
             ),
         ),
         ToolInput(
             tag="createOutputBamIndex",
             input_type=Boolean(optional=True),
             default=True,
             prefix="--create-output-bam-index",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-OBI)  If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM file.  Default value: true. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="createOutputBamMd5",
             input_type=Boolean(optional=True),
             prefix="--create-output-bam-md5",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-OBM)  If true, create a MD5 digest for any BAM/SAM/CRAM file created  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="createOutputVariantIndex",
             input_type=Boolean(optional=True),
             prefix="--create-output-variant-index",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-OVI)  If true, create a VCF index when writing a coordinate-sorted VCF file.  Default value: true. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="createOutputVariantMd5",
             input_type=Boolean(optional=True),
             prefix="--create-output-variant-md5",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-OVM)  If true, create a a MD5 digest any VCF file created.  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="disableBamIndexCaching",
             input_type=Boolean(optional=True),
             prefix="--disable-bam-index-caching",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-DBIC)  If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified.  Caching is automatically disabled if there are no intervals specified.  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="disableReadFilter",
             input_type=String(optional=True),
             prefix="--disable-read-filter",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-DF)  Read filters to be disabled before analysis  This argument may be specified 0 or more times. Default value: null. Possible Values: {AllowAllReadsReadFilter}"
             ),
         ),
         ToolInput(
             tag="disableSequenceDictionaryValidation",
             input_type=Boolean(optional=True),
             prefix="--disable-sequence-dictionary-validation",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-disable-sequence-dictionary-validation)  If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="doNotFixOverhangs",
             input_type=Boolean(optional=True),
             prefix="--do-not-fix-overhangs",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="excludeIntervals",
             input_type=Boolean(optional=True),
             prefix="--exclude-intervals",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-XL) This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         ToolInput(
             tag="gatkConfigFile",
             input_type=String(optional=True),
             prefix="--gatk-config-file",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "A configuration file to use with the GATK. Default value: null."
             ),
         ),
         ToolInput(
             tag="gcsMaxRetries",
             input_type=Int(optional=True),
             prefix="--gcs-max-retries",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-gcs-retries)  If the GCS bucket channel errors out, how many times it will attempt to re-initiate the connection  Default value: 20. "
             ),
         ),
         ToolInput(
             tag="gcsProjectForRequesterPays",
             input_type=String(optional=True),
             prefix="--gcs-project-for-requester-pays",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Project to bill when accessing 'requester pays' buckets. If unset, these buckets cannot be accessed.  Default value: . "
             ),
         ),
         ToolInput(
             tag="help",
             input_type=Boolean(optional=True),
             prefix="--help",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-h) display the help message Default value: false. Possible values: {true, false}"
             ),
         ),
         ToolInput(
             tag="intervalExclusionPadding",
             input_type=Int(optional=True),
             prefix="--interval-exclusion-padding",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-ixp)  Amount of padding (in bp) to add to each interval you are excluding.  Default value: 0. "
             ),
         ),
         ToolInput(
             tag="intervalMergingRule",
             input_type=Boolean(optional=True),
             prefix="--interval-merging-rule",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-imr)  Interval merging rule for abutting intervals  Default value: ALL. Possible values: {ALL, OVERLAPPING_ONLY} "
             ),
         ),
         ToolInput(
             tag="intervalPadding",
             input_type=Boolean(optional=True),
             prefix="--interval-padding",
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc="(-ip) Default value: 0."),
         ),
         ToolInput(
             tag="intervalSetRule",
             input_type=Boolean(optional=True),
             prefix="--interval-set-rule",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-isr)  Set merging approach to use for combining interval inputs  Default value: UNION. Possible values: {UNION, INTERSECTION} "
             ),
         ),
         ToolInput(
             tag="intervals",
             input_type=String(optional=True),
             prefix="--intervals",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-L) One or more genomic intervals over which to operate This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         ToolInput(
             tag="lenient",
             input_type=Boolean(optional=True),
             prefix="--lenient",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-LE) Lenient processing of VCF files Default value: false. Possible values: {true, false}"
             ),
         ),
         ToolInput(
             tag="maxBasesInOverhang",
             input_type=Int(optional=True),
             prefix="--max-bases-in-overhang",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " max number of bases allowed in the overhang  Default value: 40. "
             ),
         ),
         ToolInput(
             tag="maxMismatchesInOverhang",
             input_type=Int(optional=True),
             prefix="--max-mismatches-in-overhang",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " max number of mismatches allowed in the overhang  Default value: 1. "
             ),
         ),
         ToolInput(
             tag="processSecondaryAlignments",
             input_type=Boolean(optional=True),
             prefix="--process-secondary-alignments",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " have the walker split secondary alignments (will still repair MC tag without it)  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="quiet",
             input_type=Boolean(optional=True),
             prefix="--QUIET",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Whether to suppress job-summary info on System.err. Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="readFilter",
             input_type=String(optional=True),
             prefix="--read-filter",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-RF) Read filters to be applied before analysis This argument may be specified 0 or more times. Default value: null. Possible Values: {AlignmentAgreesWithHeaderReadFilter, AllowAllReadsReadFilter, AmbiguousBaseReadFilter, CigarContainsNoNOperator, FirstOfPairReadFilter, FragmentLengthReadFilter, GoodCigarReadFilter, HasReadGroupReadFilter, IntervalOverlapReadFilter, LibraryReadFilter, MappedReadFilter, MappingQualityAvailableReadFilter, MappingQualityNotZeroReadFilter, MappingQualityReadFilter, MatchingBasesAndQualsReadFilter, MateDifferentStrandReadFilter, MateOnSameContigOrNoMappedMateReadFilter, MateUnmappedAndUnmappedReadFilter, MetricsReadFilter, NonChimericOriginalAlignmentReadFilter, NonZeroFragmentLengthReadFilter, NonZeroReferenceLengthAlignmentReadFilter, NotDuplicateReadFilter, NotOpticalDuplicateReadFilter, NotSecondaryAlignmentReadFilter, NotSupplementaryAlignmentReadFilter, OverclippedReadFilter, PairedReadFilter, PassesVendorQualityCheckReadFilter, PlatformReadFilter, PlatformUnitReadFilter, PrimaryLineReadFilter, ProperlyPairedReadFilter, ReadGroupBlackListReadFilter, ReadGroupReadFilter, ReadLengthEqualsCigarLengthReadFilter, ReadLengthReadFilter, ReadNameReadFilter, ReadStrandFilter, SampleReadFilter, SecondOfPairReadFilter, SeqIsStoredReadFilter, SoftClippedReadFilter, ValidAlignmentEndReadFilter, ValidAlignmentStartReadFilter, WellformedReadFilter}"
             ),
         ),
         ToolInput(
             tag="readIndex",
             input_type=String(optional=True),
             prefix="--read-index",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-read-index)  Indices to use for the read inputs. If specified, an index must be provided for every read input and in the same order as the read inputs. If this argument is not specified, the path to the index for each input will be inferred automatically.  This argument may be specified 0 or more times. Default value: null. "
             ),
         ),
         ToolInput(
             tag="readValidationStringency",
             input_type=Boolean(optional=True),
             prefix="--read-validation-stringency",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-VS)  Validation stringency for all SAM/BAM/CRAM/SRA files read by this program.  The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.  Default value: SILENT. Possible values: {STRICT, LENIENT, SILENT} "
             ),
         ),
         ToolInput(
             tag="refactorCigarString",
             input_type=Boolean(optional=True),
             prefix="--refactor-cigar-string",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-fixNDN)  refactor cigar string with NDN elements to one element  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="secondsBetweenProgressUpdates",
             input_type=Double(optional=True),
             prefix="--seconds-between-progress-updates",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-seconds-between-progress-updates)  Output traversal statistics every time this many seconds elapse  Default value: 10.0. "
             ),
         ),
         ToolInput(
             tag="sequenceDictionary",
             input_type=String(optional=True),
             prefix="--sequence-dictionary",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-sequence-dictionary)  Use the given sequence dictionary as the master/canonical sequence dictionary.  Must be a .dict file.  Default value: null. "
             ),
         ),
         ToolInput(
             tag="sitesOnlyVcfOutput",
             input_type=Boolean(optional=True),
             prefix="--sites-only-vcf-output",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " If true, don't emit genotype fields when writing vcf file output.  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="skipMappingQualityTransform",
             input_type=Boolean(optional=True),
             prefix="--skip-mapping-quality-transform",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-skip-mq-transform)  skip the 255 -> 60 MQ read transform  Default value: false. Possible values: {true, false}"
             ),
         ),
         ToolInput(
             tag="tmpDir",
             input_type=String(optional=True),
             prefix="--tmp-dir",
             separate_value_from_prefix=True,
             default="tmp/",
             doc=InputDocumentation(
                 doc="Temp directory to use. Default value: null."),
         ),
         ToolInput(
             tag="useJdkDeflater",
             input_type=Boolean(optional=True),
             prefix="--use-jdk-deflater",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-jdk-deflater)  Whether to use the JdkDeflater (as opposed to IntelDeflater)  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="useJdkInflater",
             input_type=Boolean(optional=True),
             prefix="--use-jdk-inflater",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-jdk-inflater)  Whether to use the JdkInflater (as opposed to IntelInflater)  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="verbosity",
             input_type=Boolean(optional=True),
             prefix="--verbosity",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-verbosity)  Control verbosity of logging.  Default value: INFO. Possible values: {ERROR, WARNING, INFO, DEBUG} "
             ),
         ),
         ToolInput(
             tag="version",
             input_type=Boolean(optional=True),
             prefix="--version",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "display the version number for this tool Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="disableToolDefaultReadFilters",
             input_type=Boolean(optional=True),
             prefix="--disable-tool-default-read-filters",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-disable-tool-default-read-filters)  Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="maxReadsInMemory",
             input_type=Boolean(optional=True),
             prefix="--max-reads-in-memory",
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc="Default value: 150000."),
         ),
         ToolInput(
             tag="showhidden",
             input_type=Boolean(optional=True),
             prefix="--showHidden",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-showHidden)  display hidden arguments  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="ambigFilterBases",
             input_type=Int(optional=True),
             prefix="--ambig-filter-bases",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Threshold number of ambiguous bases. If null, uses threshold fraction; otherwise, overrides threshold fraction.  Default value: null.  Cannot be used in conjuction with argument(s) maxAmbiguousBaseFraction"
             ),
         ),
         ToolInput(
             tag="ambigFilterFrac",
             input_type=Double(optional=True),
             prefix="--ambig-filter-frac",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Threshold fraction of ambiguous bases Default value: 0.05. Cannot be used in conjuction with argument(s) maxAmbiguousBases"
             ),
         ),
         ToolInput(
             tag="maxFragmentLength",
             input_type=Boolean(optional=True),
             prefix="--max-fragment-length",
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc="Default value: 1000000."),
         ),
         ToolInput(
             tag="minFragmentLength",
             input_type=Boolean(optional=True),
             prefix="--min-fragment-length",
             separate_value_from_prefix=True,
             doc=InputDocumentation(doc="Default value: 0."),
         ),
         ToolInput(
             tag="keepIntervals",
             input_type=String(optional=True),
             prefix="--keep-intervals",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "One or more genomic intervals to keep This argument must be specified at least once. Required. "
             ),
         ),
         ToolInput(
             tag="library",
             input_type=String(optional=True),
             prefix="--library",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-library) Name of the library to keep This argument must be specified at least once. Required."
             ),
         ),
         ToolInput(
             tag="maximumMappingQuality",
             input_type=Int(optional=True),
             prefix="--maximum-mapping-quality",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Maximum mapping quality to keep (inclusive)  Default value: null. "
             ),
         ),
         ToolInput(
             tag="minimumMappingQuality",
             input_type=Int(optional=True),
             prefix="--minimum-mapping-quality",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Minimum mapping quality to keep (inclusive)  Default value: 10. "
             ),
         ),
         ToolInput(
             tag="dontRequireSoftClipsBothEnds",
             input_type=Boolean(optional=True),
             prefix="--dont-require-soft-clips-both-ends",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="filterTooShort",
             input_type=Int(optional=True),
             prefix="--filter-too-short",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Minimum number of aligned bases Default value: 30."),
         ),
         ToolInput(
             tag="platformFilterName",
             input_type=Boolean(optional=True),
             prefix="--platform-filter-name",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "This argument must be specified at least once. Required."
             ),
         ),
         ToolInput(
             tag="blackListedLanes",
             input_type=String(optional=True),
             prefix="--black-listed-lanes",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Platform unit (PU) to filter out This argument must be specified at least once. Required."
             ),
         ),
         ToolInput(
             tag="readGroupBlackList",
             input_type=Boolean(optional=True),
             prefix="--read-group-black-list",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "This argument must be specified at least once. Required. "
             ),
         ),
         ToolInput(
             tag="keepReadGroup",
             input_type=String(optional=True),
             prefix="--keep-read-group",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="The name of the read group to keep Required."),
         ),
         ToolInput(
             tag="maxReadLength",
             input_type=Int(optional=True),
             prefix="--max-read-length",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Keep only reads with length at most equal to the specified value Required."
             ),
         ),
         ToolInput(
             tag="minReadLength",
             input_type=Int(optional=True),
             prefix="--min-read-length",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "Keep only reads with length at least equal to the specified value Default value: 1."
             ),
         ),
         ToolInput(
             tag="readName",
             input_type=String(optional=True),
             prefix="--read-name",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc="Keep only reads with this read name Required."),
         ),
         ToolInput(
             tag="keepReverseStrandOnly",
             input_type=Boolean(optional=True),
             prefix="--keep-reverse-strand-only",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Keep only reads on the reverse strand  Required. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="sample",
             input_type=String(optional=True),
             prefix="--sample",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 "(-sample) The name of the sample(s) to keep, filtering out all others This argument must be specified at least once. Required. "
             ),
         ),
         ToolInput(
             tag="invertSoftClipRatioFilter",
             input_type=Boolean(optional=True),
             prefix="--invert-soft-clip-ratio-filter",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Inverts the results from this filter, causing all variants that would pass to fail and visa-versa.  Default value: false. Possible values: {true, false} "
             ),
         ),
         ToolInput(
             tag="softClippedLeadingTrailingRatio",
             input_type=Double(optional=True),
             prefix="--soft-clipped-leading-trailing-ratio",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Threshold ratio of soft clipped bases (leading / trailing the cigar string) to total bases in read for read to be filtered.  Default value: null.  Cannot be used in conjuction with argument(s) minimumSoftClippedRatio"
             ),
         ),
         ToolInput(
             tag="softClippedRatioThreshold",
             input_type=Double(optional=True),
             prefix="--soft-clipped-ratio-threshold",
             separate_value_from_prefix=True,
             doc=InputDocumentation(
                 doc=
                 " Threshold ratio of soft clipped bases (anywhere in the cigar string) to total bases in read for read to be filtered.  Default value: null.  Cannot be used in conjuction with argument(s) minimumLeadingTrailingSoftClippedRatio"
             ),
         ),
     ]
 def test_validate_string_optional_allowoptional_novalue(self):
     """An optional String must accept a missing (None) value when optionals are allowed."""
     outcome = String(optional=True).validate_value(None, True)
     self.assertTrue(outcome)
예제 #21
0
 def test_parse_instantiated_str(self):
     """get_instantiated_type must pass an already-instantiated String straight through."""
     parsed = get_instantiated_type(String())
     self.assertIsInstance(parsed, String)
예제 #22
0
class Gatk4FastqToSamBase(Gatk4ToolBase, ABC):
    """
    Tool base for GATK4 (Picard) FastqToSam: converts a FASTQ file to an
    unaligned BAM or SAM file.
    """

    @classmethod
    def gatk_command(cls):
        # Picard tool name appended to the base `gatk` invocation.
        return "FastqToSam"

    def tool(self):
        return "Gatk4FastqToSam"

    def friendly_name(self):
        return "GATK4: Convert a FASTQ file to an unaligned BAM or SAM file."

    def cpus(self, hints: Dict[str, Any]):
        # Prefer a CPU count resolved from the resource hints; default to 1.
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE)
        if val:
            return val
        return 1

    def memory(self, hints: Dict[str, Any]):
        # Prefer a memory amount resolved from the resource hints; default to 4.
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 4

    def inputs(self):
        return [
            *super().inputs(),
            ToolInput(
                "fastqR1",
                FastqGz(),
                prefix="--FASTQ",
                prefix_applies_to_all_elements=True,
                doc=
                "Input fastq file (optionally gzipped) for single end data, or first read in paired end data.",
                position=10,
            ),
            ToolInput(
                "fastqR2",
                FastqGz(optional=True),
                prefix="--FASTQ2",
                prefix_applies_to_all_elements=True,
                # FIX: doc was copy-pasted from fastqR1; FASTQ2 is the second read.
                doc=
                "Input fastq file (optionally gzipped) for the second read of paired end data.",
                position=10,
            ),
            ToolInput(
                "sampleName",
                String(optional=True),
                prefix="--SAMPLE_NAME",
                prefix_applies_to_all_elements=True,
                # FIX: doc was copy-pasted from fastqR1; SAMPLE_NAME fills the
                # SM field of the read group header.
                doc=
                "Sample name to insert into the read group header.",
                position=10,
            ),
            ToolInput(
                "reference",
                FastaWithDict(optional=True),
                prefix="--REFERENCE_SEQUENCE",
                position=10,
                doc="Reference sequence file.",
            ),
            ToolInput(
                "outputFilename",
                Filename(extension=".bam"),
                position=10,
                prefix="--OUTPUT",
                doc="Merged SAM or BAM file to write to.",
            ),
            *self.additional_args,
        ]

    def outputs(self):
        return [ToolOutput("out", Bam(), glob=InputSelector("outputFilename"))]

    def bind_metadata(self):
        from datetime import date

        return ToolMetadata(
            contributors=[
                "Michael Franklin (@illusional)",
                "Matthias De Smet(@matthdsm)",
            ],
            dateCreated=date(2020, 2, 26),
            dateUpdated=date(2020, 2, 26),
            institution="Broad Institute",
            doi=None,
            citation=
            "See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information",
            keywords=["gatk", "gatk4", "broad", "merge", "sam"],
            documentationUrl=
            "https://gatk.broadinstitute.org/hc/en-us/articles/360037226792-FastqToSam-Picard-",
            documentation=
            "Converts a FASTQ file to an unaligned BAM or SAM file.",
        )

    # Optional Picard arguments shared by every FastqToSam invocation.
    additional_args = [
        ToolInput(
            "allowAndIgnoreEmptyLines",
            Boolean(optional=True),
            prefix="--ALLOW_AND_IGNORE_EMPTY_LINES",
            position=11,
            doc="Allow (and ignore) empty lines",
        ),
        ToolInput(
            "argumentsFile",
            Array(File(), optional=True),
            prefix="--arguments_file",
            position=11,
            doc=
            "read one or more arguments files and add them to the command line",
        ),
        ToolInput(
            "comment",
            Array(String(), optional=True),
            prefix="--COMMENT",
            position=11,
            doc="Comment(s) to include in the merged output file's header.",
        ),
        ToolInput(
            "description",
            Array(String(), optional=True),
            prefix="--DESCRIPTION",
            position=11,
            doc="Inserted into the read group header",
        ),
        ToolInput(
            "libraryName",
            Array(String(), optional=True),
            prefix="--LIBRARY_NAME",
            position=11,
            doc=
            "The library name to place into the LB attribute in the read group header",
        ),
        ToolInput(
            "maxQ",
            Int(optional=True),
            prefix="--MAX_Q",
            position=11,
            doc=
            "Maximum quality allowed in the input fastq. An exception will be thrown if a quality is greater than this value.",
        ),
        ToolInput(
            "minQ",
            Int(optional=True),
            prefix="--MIN_Q",
            position=11,
            doc=
            "Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.",
        ),
        ToolInput(
            "platform",
            String(optional=True),
            prefix="--PLATFORM",
            position=11,
            doc=
            "The platform type (e.g. ILLUMINA, SOLID) to insert into the read group header.",
        ),
        ToolInput(
            "platformModel",
            String(optional=True),
            prefix="--PLATFORM_MODEL",
            position=11,
            doc=
            "Platform model to insert into the group header (free-form text providing further details of the platform/technology used).",
        ),
        ToolInput(
            "platformUnit",
            String(optional=True),
            prefix="--PLATFORM_UNIT",
            position=11,
            # FIX: doc was copy-pasted from an unrelated option; PLATFORM_UNIT
            # fills the PU field of the read group header.
            doc=
            "The platform unit (often run_barcode.lane) to insert into the read group header.",
        ),
        ToolInput(
            "predictedInsertSize",
            Int(optional=True),
            prefix="--PREDICTED_INSERT_SIZE",
            position=11,
            doc=
            "Predicted median insert size, to insert into the read group header.",
        ),
        ToolInput(
            "programGroup",
            String(optional=True),
            prefix="--PROGRAM_GROUP",
            position=11,
            doc="Program group to insert into the read group header.",
        ),
        ToolInput(
            "readGroupName",
            String(optional=True),
            prefix="--READ_GROUP_NAME",
            position=11,
            doc="Read group name.",
        ),
        ToolInput(
            "runDate",
            String(optional=True),
            prefix="--RUN_DATE",
            position=11,
            doc=
            "Date the run was produced, to insert into the read group header",
        ),
        ToolInput(
            "sequencingCenter",
            String(optional=True),
            prefix="--SEQUENCING_CENTER",
            position=11,
            doc="The sequencing center from which the data originated.",
        ),
        ToolInput(
            "sortOrder",
            String(optional=True),
            prefix="-SO",
            position=10,
            doc=
            "The --SORT_ORDER argument is an enumerated type (SortOrder), which can have one of "
            "the following values: [unsorted, queryname, coordinate, duplicate, unknown]",
        ),
        # NOTE: tag keeps the historical "Sequenctial" typo for backward
        # compatibility with existing workflows that reference this input.
        ToolInput(
            "useSequenctialFastqs",
            Boolean(optional=True),
            prefix="--USE_SEQUENTIAL_FASTQS",
            position=11,
            doc=
            "Use sequential fastq files with the suffix _###.fastq or _###.fastq.gz.",
        ),
        ToolInput(
            "compressionLevel",
            Int(optional=True),
            prefix="--COMPRESSION_LEVEL",
            position=11,
            doc=
            "Compression level for all compressed files created (e.g. BAM and GELI).",
        ),
        ToolInput(
            "createIndex",
            Boolean(optional=True),
            prefix="--CREATE_INDEX",
            position=11,
            doc=
            "Whether to create a BAM index when writing a coordinate-sorted BAM file.",
        ),
        ToolInput(
            "createMd5File",
            Boolean(optional=True),
            prefix="--CREATE_MD5_FILE",
            position=11,
            doc=
            "Whether to create an MD5 digest for any BAM or FASTQ files created.",
        ),
        ToolInput(
            "maxRecordsInRam",
            Int(optional=True),
            prefix="--MAX_RECORDS_IN_RAM",
            position=11,
            doc=
            "When writing SAM files that need to be sorted, this will specify the number of "
            "records stored in RAM before spilling to disk. Increasing this number reduces "
            "the number of file handles needed to sort a SAM file, and increases the amount of RAM needed.",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--QUIET",
            position=11,
            doc="Whether to suppress job-summary info on System.err.",
        ),
        ToolInput(
            "tmpDir",
            String(optional=True),
            prefix="--TMP_DIR",
            position=11,
            default="/tmp/",
            doc="Undocumented option",
        ),
        ToolInput(
            "useJdkDeflater",
            Boolean(optional=True),
            prefix="--use_jdk_deflater",
            position=11,
            doc="Whether to use the JdkDeflater (as opposed to IntelDeflater)",
        ),
        ToolInput(
            "useJdkInflater",
            Boolean(optional=True),
            prefix="--use_jdk_inflater",
            position=11,
            doc="Whether to use the JdkInflater (as opposed to IntelInflater)",
        ),
        ToolInput(
            "validationStringency",
            String(optional=True),
            prefix="--VALIDATION_STRINGENCY",
            position=11,
            doc=
            "Validation stringency for all SAM files read by this program. Setting stringency to SILENT "
            "can improve performance when processing a BAM file in which variable-length data "
            "(read, qualities, tags) do not otherwise need to be decoded."
            "The --VALIDATION_STRINGENCY argument is an enumerated type (ValidationStringency), "
            "which can have one of the following values: [STRICT, LENIENT, SILENT]",
        ),
        ToolInput(
            "verbosity",
            String(optional=True),
            prefix="--verbosity",
            position=11,
            doc=
            "The --verbosity argument is an enumerated type (LogLevel), which can have "
            "one of the following values: [ERROR, WARNING, INFO, DEBUG]",
        ),
    ]
예제 #23
0
class SamToolsMpileupBase(SamToolsToolBase, ABC):
    """
    Tool base for ``samtools mpileup``: generates text pileup output for one
    or multiple BAM files.
    """

    def tool(self):
        return "SamToolsMpileup"

    @classmethod
    def samtools_command(cls):
        # Subcommand appended to the base `samtools` invocation.
        return "mpileup"

    def inputs(self):
        return [
            *self.additional_inputs,
            ToolInput("bam", BamBai(), position=10),
        ]

    def outputs(self):
        # FIX: instantiate TextFile() — the bare class was being passed,
        # inconsistent with every other tool definition in this codebase.
        return [ToolOutput("out", TextFile(), glob=InputSelector("outputFilename"))]

    def friendly_name(self):
        return "SamTools: Mpileup"

    def bind_metadata(self):
        from datetime import date

        return ToolMetadata(
            contributors=["Jiaan Yu"],
            dateCreated=date(2020, 5, 19),
            dateUpdated=date(2020, 5, 19),
            institution="Samtools",
            doi=None,
            citation=None,
            keywords=["samtools", "mpileup"],
            documentationUrl="http://www.htslib.org/doc/samtools-mpileup.html",
            documentation="""Generate text pileup output for one or multiple BAM files. Each input file produces a separate group of pileup columns in the output.

Samtools mpileup can still produce VCF and BCF output (with -g or -u), but this feature is deprecated and will be removed in a future release. Please use bcftools mpileup for this instead. (Documentation on the deprecated options has been removed from this manual page, but older versions are available online at <http://www.htslib.org/doc/>.)

Note that there are two orthogonal ways to specify locations in the input file; via -r region and -l file. The former uses (and requires) an index to do random access while the latter streams through the file contents filtering out the specified regions, requiring no index. The two may be used in conjunction. For example a BED file containing locations of genes in chromosome 20 could be specified using -r 20 -l chr20.bed, meaning that the index is used to find chromosome 20 and then it is filtered for the regions listed in the bed file.""".strip(),
        )

    # Optional samtools mpileup arguments shared by every invocation.
    additional_inputs = [
        ToolInput(
            "illuminaEncoding",
            Boolean(optional=True),
            prefix="--illumina1.3+",
            doc="Assume the quality is in the Illumina 1.3+ encoding.",
        ),
        ToolInput(
            "countOrphans",
            Boolean(optional=True),
            prefix="--count-orphans",
            doc="do not discard anomalous read pairs",
        ),
        # Not sure this would load the
        # ToolInput("bamList", File(optional=True), prefix="--bam-list", doc="list of input BAM filenames, one per line")
        ToolInput(
            "noBAQ",
            Boolean(optional=True),
            prefix="--no-BAQ",
            doc="disable BAQ (per-Base Alignment Quality)",
        ),
        ToolInput(
            "adjustMQ",
            Int(optional=True),
            prefix="--adjust-MQ",
            doc="adjust mapping quality; recommended:50, disable:0 [0]",
        ),
        ToolInput(
            "maxDepth",
            Int(optional=True),
            prefix="--max-depth",
            doc="max per-file depth; avoids excessive memory usage [8000]",
        ),
        ToolInput(
            "redoBAQ",
            Boolean(optional=True),
            prefix="--redo-BAQ",
            doc="recalculate BAQ on the fly, ignore existing BQs",
        ),
        ToolInput(
            "fastaRef",
            File(optional=True),
            # FIX: doc was copy-pasted from "positions"; --fasta-ref takes the
            # faidx-indexed reference FASTA.
            prefix="--fasta-ref",
            doc="faidx indexed reference sequence file",
        ),
        ToolInput(
            "excludeRG",
            File(optional=True),
            prefix="--exclude-RG",
            doc="exclude read groups listed in FILE",
        ),
        ToolInput(
            "positions",
            File(optional=True),
            prefix="--positions",
            doc="skip unlisted positions (chr pos) or regions (BED)",
        ),
        ToolInput(
            "minBQ",
            Int(optional=True),
            prefix="--min-BQ",
            doc="Minimum base quality for a base to be considered [13]",
        ),
        ToolInput(
            "minMQ",
            Int(optional=True),
            prefix="--min-MQ",
            doc="skip alignments with mapQ smaller than INT [0]",
        ),
        ToolInput(
            "region",
            String(optional=True),
            prefix="--region",
            doc="region in which pileup is generated",
        ),
        ToolInput(
            "ignoreRG",
            Boolean(optional=True),
            prefix="--ignore-RG",
            doc="ignore RG tags (one BAM = one sample)",
        ),
        ToolInput(
            "inclFlags",
            String(optional=True),
            prefix="--incl-flags",
            doc="required flags: skip reads with mask bits unset []",
        ),
        ToolInput(
            "exclFlags",
            String(optional=True),
            prefix="--excl-flags",
            doc="filter flags: skip reads with mask bits set [UNMAP,SECONDARY,QCFAIL,DUP]",
        ),
        ToolInput(
            "outputFilename",
            Filename(extension=".txt"),
            prefix="--output",
            doc="write output to FILE [standard output]",
        ),
        ToolInput(
            "ignoreOverlaps",
            Boolean(optional=True),
            prefix="--ignore-overlaps",
            doc="disable read-pair overlap detection",
        ),
        ToolInput(
            "outputBP",
            Boolean(optional=True),
            prefix="--output-BP",
            doc="output base positions on reads",
        ),
        ToolInput(
            "outputMQ",
            Boolean(optional=True),
            prefix="--output-MQ",
            doc="output mapping quality",
        ),
        ToolInput(
            "outputQNAME",
            Boolean(optional=True),
            prefix="--output-QNAME",
            doc="output read names",
        ),
        ToolInput(
            "allPositions",
            Boolean(optional=True),
            prefix="-a",
            doc="output all positions (including zero depth)",
        ),
        ToolInput(
            "absolutelyAllPositions",
            Boolean(optional=True),
            # FIX: prefix was missing entirely, so enabling this input could never
            # place the flag on the command line. samtools spells it "-a -a (or -aa)".
            prefix="-aa",
            doc="output absolutely all positions, including unused ref. sequences",
        ),
        ToolInput(
            "reference",
            File(optional=True),
            prefix="--reference",
            doc="Reference sequence FASTA FILE [null]",
        ),
    ]

    def tests(self):
        # Functional test against small remote fixtures; output is checked by
        # line count, a representative pileup line, and an md5 of the file.
        remote_dir = "https://swift.rc.nectar.org.au/v1/AUTH_4df6e734a509497692be237549bbe9af/janis-test-data/bioinformatics/wgsgermline_data"
        return [
            TTestCase(
                name="basic",
                input={
                    "positions": f"{remote_dir}/NA12878-BRCA1.sorted.uncompressed.stdout",
                    "reference": f"{remote_dir}/Homo_sapiens_assembly38.chr17.fasta",
                    "bam": f"{remote_dir}/NA12878-BRCA1.markduped.bam",
                    "countOrphans": True,
                    "noBAQ": True,
                    "maxDepth": 10000,
                    "minBQ": 0,
                },
                output=TextFile.basic_test(
                    "out",
                    19900,
                    "chr17\t43044391\tG\t19\tA,A,,A.a,,A,,A..,,a\tDJCJ:FHDDBJBBJJIDDB",
                    187,
                    "53c3e03c20730ff45411087444379b1b",
                ),
            )
        ]
예제 #24
0
class Bcl2FastqBase(IlluminaToolBase, ABC):
    """
    Tool base for Illumina's bcl2fastq: converts BCL base-call files to FASTQ
    and demultiplexes them according to a sample sheet.
    """

    def tool(self):
        return "bcl2fastq"

    def tool_provider(self):
        return "Illumina"

    def friendly_name(self):
        return "Bcl2Fastq"

    def base_command(self):
        return "bcl2fastq"

    def arguments(self):
        # Demultiplexed output is always written to the working directory.
        return [
            ToolArgument(".",
                         prefix="--output-dir",
                         doc="path to demultiplexed output")
        ]

    # FIX: the method takes `cls` but was missing the @classmethod decorator,
    # so an instance call would have received the instance as `cls`.
    @classmethod
    def skip_test(cls) -> bool:
        return True

    def inputs(self):
        return [
            ToolInput(
                "runFolderDir",
                input_type=Directory(),
                prefix="-R",
                doc="path to runfolder directory",
            ),
            ToolInput(
                "sampleSheet",
                input_type=Csv(),
                prefix="--sample-sheet",
                doc="path to the sample sheet",
            ),
            ToolInput(
                "loadingThreads",
                input_type=Int(),
                prefix="-r",
                default=4,
                doc="number of threads used for loading BCL data",
            ),
            ToolInput(
                "processingThreads",
                input_type=Int(),
                prefix="-p",
                default=4,
                doc="number of threads used for processing demultiplexed data",
            ),
            ToolInput(
                "writingThreads",
                input_type=Int(),
                prefix="-w",
                default=4,
                doc="number of threads used for writing FASTQ data",
            ),
            *Bcl2FastqBase.additional_inputs,
        ]

    def outputs(self):
        return [
            ToolOutput(
                "unalignedReads",
                output_type=Array(FastqGz()),
                glob=WildcardSelector("*/*.fastq.gz"),
            ),
            ToolOutput("stats",
                       output_type=Array(File()),
                       glob=WildcardSelector("Stats/*")),
            ToolOutput("interop",
                       output_type=Array(File()),
                       glob=WildcardSelector("InterOp/*")),
        ]

    def cpus(self, hints: Dict[str, Any]):
        # Prefer a CPU count resolved from the resource hints; default to 4.
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE)
        if val:
            return val
        return 4

    def memory(self, hints: Dict[str, Any]):
        # Prefer a memory amount resolved from the resource hints; default to 4.
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 4

    def bind_metadata(self):
        from datetime import date

        return ToolMetadata(
            contributors=["Matthias De Smet (@mattdsm)"],
            dateCreated=date(2020, 3, 5),
            dateUpdated=date(2020, 3, 5),
            institution=None,
            doi=None,
            keywords=["illumina", "demultiplex"],
            documentationUrl=
            "https://support.illumina.com/downloads/bcl2fastq-conversion-software-v2-20.html",
            documentation="BCL to FASTQ file converter",
        )

    # Optional bcl2fastq arguments shared by every invocation.
    additional_inputs = [
        ToolInput(
            "minimumTrimmedReadLength",
            input_type=Int(optional=True),
            prefix="--minimum-trimmed-read-length",
            doc="minimum read length after adapter trimming",
        ),
        ToolInput(
            "useBasesMask",
            input_type=String(optional=True),
            prefix="--use-bases-mask",
            doc="specifies how to use each cycle",
        ),
        ToolInput(
            "maskShortAdapterReads",
            input_type=Int(optional=True),
            prefix="--mask-short-adapter-reads",
            doc=
            "smallest number of remaining bases (after masking bases below the minimum trimmed read length) below which whole read is masked",
        ),
        ToolInput(
            "adapterStringency",
            input_type=Float(optional=True),
            prefix="--adapter-stringency",
            doc="adapter stringency",
        ),
        ToolInput(
            "ignoreMissingBcls",
            input_type=Boolean(optional=True),
            prefix="--ignore-missing-bcls",
            doc="assume 'N'/'#' for missing calls",
        ),
        ToolInput(
            "ignoreMissingFilter",
            input_type=Boolean(optional=True),
            prefix="--ignore-missing-filter",
            doc="assume 'true' for missing filters",
        ),
        ToolInput(
            "ignoreMissingPositions",
            input_type=Boolean(optional=True),
            prefix="--ignore-missing-positions",
            doc=
            "assume [0,i] for missing positions, where i is incremented starting from 0",
        ),
        ToolInput(
            "writeFastqReverseComplement",
            input_type=Boolean(optional=True),
            prefix="--write-fastq-reverse-complement",
            doc="generate FASTQs containing reverse complements of actual data",
        ),
        ToolInput(
            "withFailedReads",
            input_type=Boolean(optional=True),
            prefix="--with-failed-reads",
            doc="include non-PF clusters",
        ),
        ToolInput(
            "createFastqForIndexReads",
            input_type=Boolean(optional=True),
            prefix="--create-fastq-for-index-reads",
            doc="create FASTQ files also for index reads",
        ),
        ToolInput(
            "findAdaptersWithSlidingWindow",
            input_type=Boolean(optional=True),
            prefix="--find-adapters-with-sliding-window",
            doc="find adapters with simple sliding window algorithm",
        ),
        ToolInput(
            "noBgzfCompression",
            input_type=Boolean(optional=True),
            prefix="--no-bgzf-compression",
            doc="turn off BGZF compression for FASTQ files",
        ),
        ToolInput(
            "barcodeMismatches",
            input_type=Int(optional=True),
            prefix="--barcode-mismatches",
            doc="number of allowed mismatches per index",
        ),
        ToolInput(
            "noLaneSplitting",
            input_type=Boolean(optional=True),
            # FIX: prefix had a stray leading space (" --no-lane-splitting"),
            # which would emit a malformed command-line token.
            prefix="--no-lane-splitting",
            doc="do not split fastq files by lane",
        ),
    ]
예제 #25
0
class BcfToolsConcatBase(BcfToolsToolBase, ABC):
    """Janis tool wrapper for ``bcftools concat`` (concatenate VCF/BCF files)."""
    def tool(self):
        # Unique tool identifier used by the workflow framework.
        return "bcftoolsConcat"

    def friendly_name(self):
        # Human-readable name shown in generated documentation.
        return "BCFTools: Concat"

    def base_command(self):
        # Executable plus subcommand; option inputs are appended by janis.
        return ["bcftools", "concat"]

    def inputs(self):
        """Positional VCF list plus the -o output name and all optional flags."""
        return [
            # position=15 places the input VCFs after every prefixed option.
            ToolInput("vcf", Array(CompressedVcf()), position=15),
            ToolInput(
                "outputFilename",
                Filename(extension=".vcf.gz"),
                prefix="-o",
                doc="--output: When output consists of a single stream, "
                "write it to FILE rather than to standard output, where it is written by default.",
            ),
            *self.additional_args,
        ]

    def outputs(self):
        # The compressed VCF written to -o is captured by globbing the chosen filename.
        return [
            ToolOutput("out",
                       CompressedVcf(),
                       glob=InputSelector("outputFilename"))
        ]

    def bind_metadata(self):
        """Attach citation/documentation metadata for the wrapped tool."""
        from datetime import date

        self.metadata.dateUpdated = date(2019, 9, 9)
        # NOTE(review): this is a PubMed URL, not a DOI — confirm intended value.
        self.metadata.doi = "http://www.ncbi.nlm.nih.gov/pubmed/19505943"
        self.metadata.citation = (
            "Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R, "
            "and 1000 Genome Project Data Processing Subgroup, The Sequence alignment/map (SAM) "
            "format and SAMtools, Bioinformatics (2009) 25(16) 2078-9")
        self.metadata.documentationUrl = (
            "https://samtools.github.io/bcftools/bcftools.html#concat")
        self.metadata.documentation = """
Concatenate or combine VCF/BCF files. All source files must have the same sample
columns appearing in the same order. The program can be used, for example, to
concatenate chromosome VCFs into one VCF, or combine a SNP VCF and an indel
VCF into one. The input files must be sorted by chr and position. The files
must be given in the correct order to produce sorted VCF on output unless
the -a, --allow-overlaps option is specified. With the --naive option, the files
are concatenated without being recompressed, which is very fast but dangerous
if the BCF headers differ.
"""

    # Optional flags forwarded to `bcftools concat`; option letters mirror the CLI.
    # Note -r/--regions and -R/--regions-file are mutually exclusive (see docs below).
    additional_args = [
        ToolInput(
            "allowOverLaps",
            Boolean(optional=True),
            prefix="-a",
            doc=
            "First coordinate of the next file can precede last record of the current file.",
        ),
        ToolInput(
            "compactPS",
            Boolean(optional=True),
            prefix="-c",
            doc=
            "Do not output PS tag at each site, only at the start of a new phase set block.",
        ),
        ToolInput(
            "rmDups",
            String(optional=True),
            prefix="-d",
            doc=
            "Output duplicate records present in multiple files only once: <snps|indels|both|all|none>",
        ),
        ToolInput("rmDupsNone",
                  Boolean(optional=True),
                  prefix="-d",
                  doc="Alias for -d none"),
        ToolInput(
            "fileList",
            File(optional=True),
            prefix="-f",
            doc="Read the list of files from a file.",
        ),
        ToolInput(
            "ligate",
            Boolean(optional=True),
            prefix="-l",
            doc=
            "Ligate phased VCFs by matching phase at overlapping haplotypes",
        ),
        ToolInput(
            "noVersion",
            Boolean(optional=True),
            prefix="--no-version",
            doc=
            "Do not append version and command line information to the output VCF header.",
        ),
        ToolInput(
            "naive",
            Boolean(optional=True),
            prefix="-n",
            doc=
            "Concatenate files without recompression (dangerous, use with caution)",
        ),
        ToolInput(
            "outputType",
            String(optional=True),
            prefix="-O",
            default="z",
            doc=
            "--output-type b|u|z|v: Output compressed BCF (b), uncompressed BCF (u), "
            "compressed VCF (z), uncompressed VCF (v). Use the -Ou option when piping "
            "between bcftools subcommands to speed up performance by removing "
            "unnecessary compression/decompression and VCF←→BCF conversion.",
        ),
        ToolInput(
            "minPG",
            Int(optional=True),
            prefix="-q",
            doc="Break phase set if phasing quality is lower than <int> [30]",
        ),
        ToolInput(
            "regions",
            String(optional=True),
            prefix="-r",
            doc=
            "--regions chr|chr:pos|chr:from-to|chr:from-[,…]: Comma-separated list of regions, "
            "see also -R, --regions-file. Note that -r cannot be used in combination with -R.",
        ),
        ToolInput(
            "regionsFile",
            File(optional=True),
            prefix="-R",
            doc=
            "--regions-file: Regions can be specified either on command line or in a VCF, BED, or "
            "tab-delimited file (the default). The columns of the tab-delimited file are: CHROM, POS, "
            "and, optionally, POS_TO, where positions are 1-based and inclusive. The columns of the "
            "tab-delimited BED file are also CHROM, POS and POS_TO (trailing columns are ignored), "
            "but coordinates are 0-based, half-open. To indicate that a file be treated as BED rather "
            "than the 1-based tab-delimited file, the file must have the '.bed' or '.bed.gz' suffix "
            "(case-insensitive). Uncompressed files are stored in memory, while bgzip-compressed and "
            "tabix-indexed region files are streamed. Note that sequence names must match exactly, 'chr20'"
            " is not the same as '20'. Also note that chromosome ordering in FILE will be respected, "
            "the VCF will be processed in the order in which chromosomes first appear in FILE. "
            "However, within chromosomes, the VCF will always be processed in ascending genomic coordinate "
            "order no matter what order they appear in FILE. Note that overlapping regions in FILE can "
            "result in duplicated out of order positions in the output. This option requires indexed "
            "VCF/BCF files. Note that -R cannot be used in combination with -r.",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            prefix="--threads",
            doc=
            "Number of output compression threads to use in addition to main thread. "
            "Only used when --output-type is b or z. Default: 0.",
        ),
    ]
예제 #26
0
class Gatk4ApplyBqsrBase(Gatk4ToolBase, ABC):
    """Janis base wrapper for GATK4 ApplyBQSR (second pass of BQSR)."""
    @classmethod
    def gatk_command(cls):
        # GATK subcommand appended to the base `gatk` invocation by the base class.
        return "ApplyBQSR"

    def friendly_name(self):
        return "GATK4: Apply base quality score recalibration"

    def tool(self):
        # Unique tool identifier used by the workflow framework.
        return "Gatk4ApplyBQSR"

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU count from resource hints, defaulting to 1."""
        val = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        if val:
            return val
        return 1

    def memory(self, hints: Dict[str, Any]):
        """Return the memory (GB) from resource hints, defaulting to 8."""
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 8

    def inputs(self):
        """Base-class inputs plus the BAM/reference/recal-table options."""
        return [
            *super(Gatk4ApplyBqsrBase, self).inputs(),
            ToolInput(
                "bam",
                BamBai(),
                prefix="-I",
                doc="The SAM/BAM/CRAM file containing reads.",
                # The tool expects the index named <stem>.bai next to the BAM.
                secondaries_present_as={".bai": "^.bai"},
                position=10,
            ),
            ToolInput(
                "reference", FastaWithDict(), prefix="-R", doc="Reference sequence"
            ),
            ToolInput(
                "outputFilename",
                Filename(extension=".bam"),
                prefix="-O",
                doc="Write output to this file",
            ),
            ToolInput(
                "recalFile",
                Tsv(optional=True),
                prefix="--bqsr-recal-file",
                doc="Input recalibration table for BQSR",
            ),
            ToolInput(
                "intervals",
                Bed(optional=True),
                prefix="--intervals",
                doc="-L (BASE) One or more genomic intervals over which to operate",
            ),
            *self.additional_args,
        ]

    def outputs(self):
        # Recalibrated BAM (with its .bai index) globbed from the chosen output name.
        return [
            ToolOutput(
                "out",
                BamBai(),
                glob=InputSelector("outputFilename"),
                secondaries_present_as={".bai": "^.bai"},
            )
        ]

    def bind_metadata(self):
        """Return tool metadata (authors, dates, upstream documentation)."""
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2018, 12, 24),
            dateUpdated=date(2019, 1, 24),
            institution="Broad Institute",
            doi=None,
            citation="See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information",
            keywords=["gatk", "gatk4", "broad"],
            documentationUrl="https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_hellbender_tools_walkers_bqsr_ApplyBQSR.php",
            documentation="""
Apply base quality score recalibration: This tool performs the second pass in a two-stage 
process called Base Quality Score Recalibration (BQSR). Specifically, it recalibrates the 
base qualities of the input reads based on the recalibration table produced by the 
BaseRecalibrator tool, and outputs a recalibrated BAM or CRAM file.

Summary of the BQSR procedure: The goal of this procedure is to correct for systematic bias 
that affect the assignment of base quality scores by the sequencer. The first pass consists 
of calculating error empirically and finding patterns in how error varies with basecall 
features over all bases. The relevant observations are written to a recalibration table. 
The second pass consists of applying numerical corrections to each individual basecall 
based on the patterns identified in the first step (recorded in the recalibration table) 
and write out the recalibrated data to a new BAM or CRAM file.

- This tool replaces the use of PrintReads for the application of base quality score 
    recalibration as practiced in earlier versions of GATK (2.x and 3.x).
- You should only run ApplyBQSR with the covariates table created from the input BAM or CRAM file(s).
- Original qualities can be retained in the output file under the "OQ" tag if desired. 
    See the `--emit-original-quals` argument for details.
""".strip(),
        )

    additional_args = [
        # Put more detail in here from documentation
        ToolInput(
            "tmpDir",
            String(optional=True),
            prefix="--tmp-dir",
            position=11,
            default="/tmp/",
            doc="Temp directory to use.",
        )
    ]
예제 #27
0
class VarDictSomaticBase(BioinformaticsTool, ABC):
    def friendly_name(self) -> str:
        """Human-readable tool name shown in generated documentation."""
        display_name = "Vardict (Somatic)"
        return display_name

    def tool_provider(self):
        """Name of the suite/organisation providing this tool."""
        provider = "VarDict"
        return provider

    def tool(self):
        """Unique tool identifier used by the workflow framework."""
        identifier = "vardict_somatic"
        return identifier

    def base_command(self):
        """Executable invoked for this tool; options are appended by janis."""
        executable = "VarDict"
        return executable

    def cpus(self, hints: Dict[str, Any]):
        """Resolve the CPU allocation from *hints*, falling back to 4."""
        hinted = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        return hinted if hinted else 4

    def memory(self, hints: Dict[str, Any]):
        """Resolve the memory allocation (GB) from *hints*, falling back to 8."""
        hinted = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        return hinted if hinted else 8

    def inputs(self) -> List[ToolInput]:
        """Tumour/normal BAMs, intervals and naming inputs, plus all optional flags."""
        return [
            ToolInput("tumorBam", BamBai(), doc="The indexed BAM file"),
            ToolInput("normalBam", BamBai(), doc="The indexed BAM file"),
            # Positional intervals file follows the -G reference (position 1 vs 2).
            ToolInput("intervals", Bed(), position=2, shell_quote=False),
            ToolInput(
                "reference",
                FastaFai(),
                prefix="-G",
                position=1,
                shell_quote=False,
                doc="The reference fasta. Should be indexed (.fai). "
                "Defaults to: /ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa",
            ),
            ToolInput(
                "tumorName",
                String(),
                doc=
                "The sample name to be used directly.  Will overwrite -n option",
            ),
            ToolInput(
                "normalName",
                String(),
                doc="The normal sample name to use with the -b option",
            ),
            ToolInput(
                "alleleFreqThreshold",
                Float(optional=True),
                doc="The threshold for allele frequency, default: 0.05 or 5%",
            ),
            # Output VCF is produced via shell redirection (">"), so quoting is disabled.
            ToolInput(
                "outputFilename",
                Filename(extension=".vcf", suffix=".vardict"),
                prefix=">",
                position=6,
                shell_quote=False,
            ),
            *VarDictSomaticBase.vardict_inputs,
            *VarDictSomaticBase.var2vcf_inputs,
        ]

    def outputs(self):
        """Single VCF output captured from the redirected output filename."""
        result = ToolOutput("out", Vcf(), glob=InputSelector("outputFilename"))
        return [result]

    def arguments(self):
        """Fixed arguments building the VarDict | testsomatic.R | var2vcf_paired.pl pipeline.

        Positions assemble the shell command: VarDict flags sit at position 1,
        the two pipe stages at positions 3 and 4, and var2vcf_paired.pl options
        at position 5 (hence the repeated -N/-f prefixes at different positions:
        once for VarDict, once for var2vcf_paired.pl).
        """
        return [
            ToolArgument("| testsomatic.R |", position=3, shell_quote=False),
            ToolArgument("var2vcf_paired.pl", position=4, shell_quote=False),
            # VarDict's -b takes "tumor|normal" BAM paths joined by a pipe character.
            ToolArgument(
                JoinOperator(
                    [InputSelector("tumorBam"),
                     InputSelector("normalBam")], "|"),
                prefix="-b",
                position=1,
                shell_quote=True,
            ),
            ToolArgument(InputSelector("tumorName"),
                         prefix="-N",
                         position=1,
                         shell_quote=True),
            # var2vcf_paired.pl's -N takes "tumorName|normalName".
            ToolArgument(
                JoinOperator(
                    [InputSelector("tumorName"),
                     InputSelector("normalName")], "|"),
                prefix="-N",
                position=5,
                shell_quote=True,
            ),
            ToolArgument(
                InputSelector("alleleFreqThreshold"),
                prefix="-f",
                position=5,
                shell_quote=False,
            ),
            ToolArgument(
                InputSelector("alleleFreqThreshold"),
                prefix="-f",
                position=1,
                shell_quote=False,
            ),
        ]

    # Optional flags forwarded to the `VarDict` executable itself (all at
    # position 1, i.e. before the intervals file). Option letters mirror
    # VarDict's CLI; shell quoting is disabled because the whole command is
    # assembled as a shell pipeline.
    vardict_inputs = [
        ToolInput(
            "indels3prime",
            Boolean(optional=True),
            prefix="-3",
            position=1,
            shell_quote=False,
            doc=
            "Indicate to move indels to 3-prime if alternative alignment can be achieved.",
        ),
        ToolInput(
            "amplicon",
            Float(optional=True),
            prefix="-a",
            position=1,
            shell_quote=False,
            doc=
            "Indicate it's amplicon based calling.  Reads that don't map to the amplicon will be skipped.  "
            "A read pair is considered belonging  to the amplicon if the edges are less than int bp to "
            "the amplicon, and overlap fraction is at least float.  Default: 10:0.95",
        ),
        ToolInput(
            "minReads",
            Int(optional=True),
            prefix="-B",
            position=1,
            shell_quote=False,
            doc="The minimum # of reads to determine strand bias, default 2",
        ),
        ToolInput(
            "chromNamesAreNumbers",
            Boolean(optional=True),
            prefix="-C",
            position=1,
            shell_quote=False,
            doc=
            "Indicate the chromosome names are just numbers, such as 1, 2, not chr1, chr2",
        ),
        ToolInput(
            "chromColumn",
            Int(optional=True),
            prefix="-c",
            position=1,
            shell_quote=False,
            doc="The column for chromosome",
        ),
        ToolInput(
            "debug",
            Boolean(optional=True),
            prefix="-D",
            position=1,
            shell_quote=False,
            doc=
            "Debug mode.  Will print some error messages and append full genotype at the end.",
        ),
        ToolInput(
            "splitDelimeter",
            String(optional=True),
            prefix="-d",
            position=1,
            shell_quote=False,
            doc='The delimiter for split region_info, default to tab "\t"',
        ),
        ToolInput(
            "geneEndCol",
            Int(optional=True),
            prefix="-E",
            position=1,
            shell_quote=False,
            doc="The column for region end, e.g. gene end",
        ),
        ToolInput(
            "segEndCol",
            Int(optional=True),
            prefix="-e",
            position=1,
            shell_quote=False,
            doc="The column for segment ends in the region, e.g. exon ends",
        ),
        ToolInput(
            "filter",
            String(optional=True),
            prefix="-F",
            position=1,
            shell_quote=False,
            doc=
            "The hexical to filter reads using samtools. Default: 0x500 (filter 2nd alignments and "
            "duplicates). Use -F 0 to turn it off.",
        ),
        ToolInput(
            "geneNameCol",
            Int(optional=True),
            prefix="-g",
            position=1,
            shell_quote=False,
            doc="The column for gene name, or segment annotation",
        ),
        # ToolInput("help", Boolean(optional=True), prefix="-H", position=1, shell_quote=False,
        #           doc="Print this help page"),
        ToolInput(
            "printHeaderRow",
            Boolean(optional=True),
            prefix="-h",
            position=1,
            shell_quote=False,
            doc="Print a header row describing columns",
        ),
        ToolInput(
            "indelSize",
            Int(optional=True),
            prefix="-I",
            position=1,
            shell_quote=False,
            doc="The indel size.  Default: 120bp",
        ),
        ToolInput(
            "outputSplice",
            Boolean(optional=True),
            prefix="-i",
            position=1,
            shell_quote=False,
            doc="Output splicing read counts",
        ),
        ToolInput(
            "performLocalRealignment",
            Int(optional=True),
            prefix="-k",
            position=1,
            shell_quote=False,
            doc=
            "Indicate whether to perform local realignment.  Default: 1.  Set to 0 to disable it. "
            "For Ion or PacBio, 0 is recommended.",
        ),
        ToolInput(
            "minMatches",
            Int(optional=True),
            prefix="-M",
            position=1,
            shell_quote=False,
            doc=
            "The minimum matches for a read to be considered. If, after soft-clipping, the matched "
            "bp is less than INT, then the read is discarded. It's meant for PCR based targeted sequencing "
            "where there's no insert and the matching is only the primers. Default: 0, or no filtering",
        ),
        ToolInput(
            "maxMismatches",
            Int(optional=True),
            prefix="-m",
            position=1,
            shell_quote=False,
            doc=
            "If set, reads with mismatches more than INT will be filtered and ignored. "
            "Gaps are not counted as mismatches. Valid only for bowtie2/TopHat or BWA aln "
            "followed by sampe. BWA mem is calculated as NM - Indels. "
            "Default: 8, or reads with more than 8 mismatches will not be used.",
        ),
        ToolInput(
            "regexSampleName",
            String(optional=True),
            prefix="-n",
            position=1,
            shell_quote=False,
            doc=
            "The regular expression to extract sample name from BAM filenames. "
            # Raw string: avoids invalid "\/" escape-sequence warnings while
            # keeping the exact same text value.
            r"Default to: /([^\/\._]+?)_[^\/]*.bam/",
        ),
        ToolInput(
            "mapq",
            String(optional=True),
            prefix="-O",
            position=1,
            shell_quote=False,
            doc=
            "The reads should have at least mean MapQ to be considered a valid variant. "
            "Default: no filtering",
        ),
        ToolInput(
            "qratio",
            Float(optional=True),
            prefix="-o",
            position=1,
            shell_quote=False,
            doc="The Qratio of (good_quality_reads)/(bad_quality_reads+0.5). "
            "The quality is defined by -q option.  Default: 1.5",
        ),
        ToolInput(
            "readPosition",
            Float(optional=True),
            prefix="-P",
            position=1,
            shell_quote=False,
            doc=
            "The read position filter. If the mean variants position is less that specified, "
            "it's considered false positive.  Default: 5",
        ),
        ToolInput(
            "pileup",
            Boolean(optional=True),
            prefix="-p",
            position=1,
            shell_quote=False,
            doc="Do pileup regardless of the frequency",
        ),
        ToolInput(
            "minMappingQual",
            Int(optional=True),
            prefix="-Q",
            position=1,
            shell_quote=False,
            doc=
            "If set, reads with mapping quality less than INT will be filtered and ignored",
        ),
        ToolInput(
            "phredScore",
            Int(optional=True),
            prefix="-q",
            position=1,
            shell_quote=False,
            doc="The phred score for a base to be considered a good call.  "
            "Default: 25 (for Illumina) For PGM, set it to ~15, as PGM tends to under estimate base quality.",
        ),
        ToolInput(
            "region",
            String(optional=True),
            prefix="-R",
            position=1,
            shell_quote=False,
            doc=
            "The region of interest.  In the format of chr:start-end.  If end is omitted, "
            "then a single position.  No BED is needed.",
        ),
        ToolInput(
            "minVariantReads",
            Int(optional=True),
            prefix="-r",
            position=1,
            shell_quote=False,
            doc="The minimum # of variant reads, default 2",
        ),
        ToolInput(
            "regStartCol",
            Int(optional=True),
            prefix="-S",
            position=1,
            shell_quote=False,
            doc="The column for region start, e.g. gene start",
        ),
        ToolInput(
            "segStartCol",
            Int(optional=True),
            prefix="-s",
            position=1,
            shell_quote=False,
            doc="The column for segment starts in the region, e.g. exon starts",
        ),
        ToolInput(
            "minReadsBeforeTrim",
            Int(optional=True),
            prefix="-T",
            position=1,
            shell_quote=False,
            doc="Trim bases after [INT] bases in the reads",
        ),
        ToolInput(
            "removeDuplicateReads",
            Boolean(optional=True),
            prefix="-t",
            position=1,
            shell_quote=False,
            doc=
            "Indicate to remove duplicated reads.  Only one pair with same start positions will be kept",
        ),
        # Defaults to the runtime CPU allocation via CpuSelector().
        ToolInput(
            "threads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-th",
            position=1,
            shell_quote=False,
            doc="Threads count.",
        ),
        # NOTE(review): the doc describes a fraction (0.05) but the declared
        # type is Int — confirm against upstream VarDict; kept as-is for
        # interface compatibility.
        ToolInput(
            "freq",
            Int(optional=True),
            prefix="-V",
            position=1,
            shell_quote=False,
            doc=
            "The lowest frequency in the normal sample allowed for a putative somatic mutation. "
            "Defaults to 0.05",
        ),
        ToolInput(
            "vcfFormat",
            Boolean(optional=True),
            prefix="-v",
            position=1,
            shell_quote=False,
            doc="VCF format output",
        ),
        ToolInput(
            "vs",
            String(optional=True),
            prefix="-VS",
            position=1,
            shell_quote=False,
            doc=
            "[STRICT | LENIENT | SILENT] How strict to be when reading a SAM or BAM: "
            "STRICT   - throw an exception if something looks wrong. "
            "LENIENT	- Emit warnings but keep going if possible. "
            "SILENT	- Like LENIENT, only don't emit warning messages. "
            "Default: LENIENT",
        ),
        ToolInput(
            "bp",
            Int(optional=True),
            prefix="-X",
            position=1,
            shell_quote=False,
            doc=
            "Extension of bp to look for mismatches after insersion or deletion.  "
            "Default to 3 bp, or only calls when they're within 3 bp.",
        ),
        ToolInput(
            "extensionNucleotide",
            Int(optional=True),
            prefix="-x",
            position=1,
            shell_quote=False,
            doc=
            "The number of nucleotide to extend for each segment, default: 0",
        ),
        ToolInput(
            "yy",
            Boolean(optional=True),
            prefix="-y",
            position=1,
            shell_quote=False,
            doc="<No content>",
        ),
        # NOTE(review): the doc describes a fraction (e.g. 0.7) but the
        # declared type is Int — confirm against upstream VarDict.
        ToolInput(
            "downsamplingFraction",
            Int(optional=True),
            prefix="-Z",
            position=1,
            shell_quote=False,
            doc=
            "For downsampling fraction.  e.g. 0.7 means roughly 70% downsampling.  "
            "Default: No downsampling.  Use with caution.  "
            "The downsampling will be random and non-reproducible.",
        ),
        ToolInput(
            "zeroBasedCoords",
            Int(optional=True),
            prefix="-z",
            position=1,
            shell_quote=False,
            doc=
            "0/1  Indicate whether coordinates are zero-based, as IGV uses.  "
            "Default: 1 for BED file or amplicon BED file. Use 0 to turn it off. "
            "When using the -R option, it's set to 0",
        ),
    ]

    # No extra configurable inputs for the var2vcf_paired.pl stage in this wrapper;
    # its fixed -N/-f arguments are supplied in arguments() instead.
    var2vcf_inputs = []

    # Was defined without `self`, so `self.docurl()` raised TypeError at runtime;
    # @staticmethod fixes instance-level calls while keeping class-level calls working.
    @staticmethod
    def docurl():
        """URL of the upstream VarDict repository/documentation."""
        return "https://github.com/AstraZeneca-NGS/VarDict"

    def doc(self):
        return """
예제 #28
0
class Gatk4MarkDuplicatesBase(Gatk4ToolBase, ABC):
    """Janis tool definition for GATK4 (Picard) MarkDuplicates.

    Tags duplicate reads in one or more coordinate-sorted SAM/BAM files
    and writes a metrics file summarising duplication levels.
    """

    @classmethod
    def gatk_command(cls):
        # Sub-command appended to the base `gatk` invocation.
        return "MarkDuplicates"

    def tool(self):
        return "Gatk4MarkDuplicates"

    def friendly_name(self):
        return "GATK4: Mark Duplicates"

    def cpus(self, hints: Dict[str, Any]):
        # Resource lookup from hints; falls back to 4 cores.
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE)
        if val:
            return val
        return 4

    def memory(self, hints: Dict[str, Any]):
        # Resource lookup from hints; falls back to 8 (GB).
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 8

    def inputs(self):
        return [
            ToolInput(
                "bam",
                Array(Bam),
                prefix="-I",
                position=10,
                # secondaries_present_as={".bai": "^.bai"},
                doc=
                "One or more input SAM or BAM files to analyze. Must be coordinate sorted.",
            ),
            ToolInput(
                "outputFilename",
                Filename(
                    prefix="generated",
                    suffix=".markduped",
                    extension=".bam",
                ),
                position=10,
                prefix="-O",
                # BUGFIX: the doc strings for -O and -M were swapped.
                # Per GATK documentation, -O is the marked-records output.
                doc="The output file to write marked records to.",
            ),
            ToolInput(
                "metricsFilename",
                Filename(extension=".metrics.txt"),
                position=10,
                prefix="-M",
                # BUGFIX: -M is the duplication-metrics file.
                doc="File to write duplication metrics to",
            ),
            *super().inputs(),
            *self.additional_args,
        ]

    def outputs(self):
        return [
            ToolOutput(
                "out",
                BamBai,
                glob=InputSelector("outputFilename"),
                secondaries_present_as={".bai": "^.bai"},
            ),
            ToolOutput("metrics", Tsv(),
                       glob=InputSelector("metricsFilename")),
        ]

    def bind_metadata(self):
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2018, 12, 24),
            dateUpdated=date(2019, 1, 24),
            institution="Broad Institute",
            doi=None,
            citation=
            "See https://software.broadinstitute.org/gatk/documentation/article?id=11027 for more information",
            keywords=["gatk", "gatk4", "broad", "mark", "duplicates"],
            documentationUrl=
            "https://software.broadinstitute.org/gatk/documentation/tooldocs/current/picard_sam_markduplicates_MarkDuplicates.php",
            documentation="""MarkDuplicates (Picard): Identifies duplicate reads.

This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are 
defined as originating from a single fragment of DNA. Duplicates can arise during sample 
preparation e.g. library construction using PCR. See also EstimateLibraryComplexity for 
additional notes on PCR duplication artifacts. Duplicate reads can also result from a single 
amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the 
sequencing instrument. These duplication artifacts are referred to as optical duplicates.

The MarkDuplicates tool works by comparing sequences in the 5 prime positions of both reads 
and read-pairs in a SAM/BAM file. A BARCODE_TAG option is available to facilitate duplicate
marking using molecular barcodes. After duplicate reads are collected, the tool differentiates 
the primary and duplicate reads using an algorithm that ranks reads by the sums of their 
base-quality scores (default method).

The tool's main output is a new SAM or BAM file, in which duplicates have been identified 
in the SAM flags field for each read. Duplicates are marked with the hexadecimal value of 0x0400, 
which corresponds to a decimal value of 1024. If you are not familiar with this type of annotation, 
please see the following blog post for additional information.

Although the bitwise flag annotation indicates whether a read was marked as a duplicate, 
it does not identify the type of duplicate. To do this, a new tag called the duplicate type (DT) 
tag was recently added as an optional output in the 'optional field' section of a SAM/BAM file. 
Invoking the TAGGING_POLICY option, you can instruct the program to mark all the duplicates (All), 
only the optical duplicates (OpticalOnly), or no duplicates (DontTag). The records within the 
output of a SAM/BAM file will have values for the 'DT' tag (depending on the invoked TAGGING_POLICY), 
as either library/PCR-generated duplicates (LB), or sequencing-platform artifact duplicates (SQ). 
This tool uses the READ_NAME_REGEX and the OPTICAL_DUPLICATE_PIXEL_DISTANCE options as the 
primary methods to identify and differentiate duplicate types. Set READ_NAME_REGEX to null to 
skip optical duplicate detection, e.g. for RNA-seq or other data where duplicate sets are 
extremely large and estimating library complexity is not an aim. Note that without optical 
duplicate counts, library size estimation will be inaccurate.

MarkDuplicates also produces a metrics file indicating the numbers 
of duplicates for both single- and paired-end reads.

The program can take either coordinate-sorted or query-sorted inputs, however the behavior 
is slightly different. When the input is coordinate-sorted, unmapped mates of mapped records 
and supplementary/secondary alignments are not marked as duplicates. However, when the input 
is query-sorted (actually query-grouped), then unmapped mates and secondary/supplementary 
reads are not excluded from the duplication test and can be marked as duplicate reads.

If desired, duplicates can be removed using the REMOVE_DUPLICATE and REMOVE_SEQUENCING_DUPLICATES options."""
            .strip(),
        )

    # Optional passthrough arguments appended after the required inputs.
    additional_args = [
        ToolInput(
            "argumentsFile",
            Array(File(), optional=True),
            prefix="--arguments_file",
            position=10,
            doc=
            "read one or more arguments files and add them to the command line",
        ),
        ToolInput(
            "assumeSortOrder",
            String(optional=True),
            prefix="-ASO",
            doc=
            "If not null, assume that the input file has this order even if the header says otherwise. "
            "Exclusion: This argument cannot be used at the same time as ASSUME_SORTED. "
            "The --ASSUME_SORT_ORDER argument is an enumerated type (SortOrder), which can have one of "
            "the following values: [unsorted, queryname, coordinate, duplicate, unknown]",
        ),
        ToolInput(
            "barcodeTag",
            String(optional=True),
            prefix="--BARCODE_TAG",
            doc="Barcode SAM tag (ex. BC for 10X Genomics)",
        ),
        ToolInput(
            "comment",
            Array(String(), optional=True),
            prefix="-CO",
            doc="Comment(s) to include in the output file's header.",
        ),
        # ToolInput(
        #     "compressionLevel",
        #     Int(optional=True),
        #     prefix="--COMPRESSION_LEVEL",
        #     position=11,
        #     doc="Compression level for all compressed files created (e.g. BAM and GELI).",
        # ),
        ToolInput(
            "createIndex",
            Boolean(optional=True),
            prefix="--CREATE_INDEX",
            default=True,
            position=11,
            doc=
            "Whether to create a BAM index when writing a coordinate-sorted BAM file.",
        ),
        ToolInput(
            "createMd5File",
            Boolean(optional=True),
            prefix="--CREATE_MD5_FILE",
            position=11,
            doc=
            "Whether to create an MD5 digest for any BAM or FASTQ files created.",
        ),
        ToolInput(
            "maxRecordsInRam",
            Int(optional=True),
            prefix="--MAX_RECORDS_IN_RAM",
            position=11,
            doc=
            "When writing SAM files that need to be sorted, this will specify the number of "
            "records stored in RAM before spilling to disk. Increasing this number reduces "
            "the number of file handles needed to sort a SAM file, and increases the amount of RAM needed.",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--QUIET",
            position=11,
            doc="Whether to suppress job-summary info on System.err.",
        ),
        ToolInput(
            "tmpDir",
            String(optional=True),
            prefix="--TMP_DIR",
            position=11,
            default="tmp/",
            doc="Undocumented option",
        ),
        # NOTE(review): GATK4/Picard argument names are normally upper-case
        # (--USE_JDK_DEFLATER / --USE_JDK_INFLATER / --VERBOSITY); confirm
        # these lower-case prefixes are accepted before changing them.
        ToolInput(
            "useJdkDeflater",
            Boolean(optional=True),
            prefix="--use_jdk_deflater",
            position=11,
            doc="Whether to use the JdkDeflater (as opposed to IntelDeflater)",
        ),
        ToolInput(
            "useJdkInflater",
            Boolean(optional=True),
            prefix="--use_jdk_inflater",
            position=11,
            doc="Whether to use the JdkInflater (as opposed to IntelInflater)",
        ),
        ToolInput(
            "validationStringency",
            String(optional=True),
            prefix="--VALIDATION_STRINGENCY",
            position=11,
            doc=
            "Validation stringency for all SAM files read by this program. Setting stringency to SILENT "
            "can improve performance when processing a BAM file in which variable-length data "
            # BUGFIX: added the missing space between the concatenated
            # sentences ("decoded.The" -> "decoded. The").
            "(read, qualities, tags) do not otherwise need to be decoded. "
            "The --VALIDATION_STRINGENCY argument is an enumerated type (ValidationStringency), "
            "which can have one of the following values: [STRICT, LENIENT, SILENT]",
        ),
        ToolInput(
            "verbosity",
            String(optional=True),
            prefix="--verbosity",
            position=11,
            doc=
            "The --verbosity argument is an enumerated type (LogLevel), which can have "
            "one of the following values: [ERROR, WARNING, INFO, DEBUG]",
        ),
        ToolInput(
            "opticalDuplicatePixelDistance",
            Int(optional=True),
            prefix="--OPTICAL_DUPLICATE_PIXEL_DISTANCE",
            doc=
            "The maximum offset between two duplicate clusters in order to consider them optical duplicates. "
            "The default is appropriate for unpatterned versions of the Illumina platform. For the patterned "
            "flowcell models, 2500 is more appropriate. For other platforms and models, users should experiment "
            "to find what works best.",
        ),
    ]
예제 #29
0
 def test_str_optstr(self):
     """A required String source can be connected to an optional String sink."""
     required_str = String(optional=False)
     optional_str = String(optional=True)
     self.assertTrue(optional_str.can_receive_from(required_str))
예제 #30
0
class Gatk4FixMateInformationBase(Gatk4ToolBase):
    """Janis tool definition for GATK4 (Picard) FixMateInformation.

    Verifies and fixes mate-pair information between paired reads in a
    SAM/BAM file, writing a new BAM named from ``outputPrefix``.
    """

    @classmethod
    def gatk_command(cls):
        # Sub-command appended to the base `gatk` invocation.
        return "FixMateInformation"

    def tool(self) -> str:
        return "Gatk4FixMateInformation"

    def friendly_name(self) -> str:
        return "GATK4: FixMateInformation"

    def inputs(self):
        # Output basename falls back to "generated" when outputPrefix is unset.
        prefix = FirstOperator([InputSelector("outputPrefix"), "generated"])
        return [
            *super().inputs(),
            ToolInput(
                "inputBam",
                BamBai(),
                prefix="--INPUT",
                position=2,
                shell_quote=False,
                doc="Input BAM",
            ),
            ToolInput("outputPrefix", String(optional=True)),
            ToolInput(
                "outputBam",
                Filename(prefix=prefix, suffix=".mc", extension=".bam"),
                prefix="--OUTPUT",
                position=2,
                shell_quote=False,
                doc="Output BAM filename",
            ),
            *self.additional_fix_mate_information_args,
        ]

    def outputs(self):
        return [
            ToolOutput(
                "out",
                BamBai(),
                selector=InputSelector("outputBam"),
                secondaries_present_as={".bai": "^.bai"},
            )
        ]

    def bind_metadata(self):
        from datetime import datetime

        return ToolMetadata(
            contributors=["Miriam M Yeung"],
            dateCreated=datetime(2021, 10, 6),
            dateUpdated=datetime(2021, 10, 6),
            documentation="USAGE: ",
        )

    # Optional passthrough arguments appended after the required inputs.
    additional_fix_mate_information_args = [
        ToolInput(
            "addMateCigar",
            Boolean(optional=True),
            prefix="--ADD_MATE_CIGAR",
            default=True,
            position=3,
            shell_quote=False,
            doc="Add the mate CIGAR tag (MC) if true. [Default: true]",
        ),
        ToolInput(
            "assumeSorted",
            Boolean(optional=True),
            prefix="--ASSUME_SORTED",
            position=3,
            shell_quote=False,
            doc="If true, assumes that the input file is QUERYNAME sorted, even if the header says otherwise. [Default: false]",
        ),
        ToolInput(
            "ignoreMissingMates",
            Boolean(optional=True),
            prefix="--IGNORE_MISSING_MATES",
            position=3,
            shell_quote=False,
            # BUGFIX: corrected typos ("other will threow") in the doc text.
            doc="If true, ignore missing mates, otherwise an exception will be thrown when missing mates are found. [Default: true]",
        ),
        ToolInput(
            "sortOrder",
            String(optional=True),
            prefix="--SORT_ORDER",
            position=3,
            shell_quote=False,
            doc="Optional sort order if the output file should be sorted differently than the input file.[Default: null]\nValid values: unsorted | queryname | coordinate | duplicate | unknown.",
        ),
        ToolInput(
            "createIndex",
            Boolean(optional=True),
            prefix="--CREATE_INDEX",
            default=True,
            position=3,
            shell_quote=False,
            doc="Whether to create a BAM index when writing a coordinate-sorted BAM file. [Default: false]",
        ),
        ToolInput(
            "maxRecordsInRam",
            Int(optional=True),
            prefix="--MAX_RECORDS_IN_RAM",
            position=3,
            shell_quote=False,
            doc="When writing files that need to be sorted, this will specify the number of records stored in RAM before spilling to disk. Increasing the number reduces the number of file handles needed to sort the file, and increases the amount of RAM needed.",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--QUIET",
            position=3,
            shell_quote=False,
            doc="Whether to suppress job-summary info on System.err.",
        ),
        ToolInput(
            "referenceSequence",
            FastaWithDict(optional=True),
            prefix="--REFERENCE_SEQUENCE",
            position=3,
            shell_quote=False,
            doc="Reference Sequence File",
        ),
        ToolInput(
            "tmpDir",
            String(optional=True),
            prefix="--TMP_DIR",
            position=3,
            default="/tmp/",
            shell_quote=False,
            doc="One or more directories with space available to be used by this program for temporary storage of working files",
        ),
        ToolInput(
            "validationStringency",
            # BUGFIX: this argument takes an enumerated string value
            # (strict | lenient | silent), not a flag — it was typed Boolean.
            # String also matches the sibling MarkDuplicates definition.
            String(optional=True),
            prefix="--VALIDATION_STRINGENCY",
            position=3,
            shell_quote=False,
            # BUGFIX: corrected typos ("SILEN", "imporve", "porcessing",
            # "fiel", "do no") in the doc text.
            doc="Validation stringency for all SAM files read by this program. SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded. [Default: strict]\nValid values: strict | lenient | silent",
        ),
    ]