Пример #1
0
 def arguments(self):
     """Return the fixed arguments of the alignment pipeline.

     The list spells out three commands chained with literal ``|`` tokens:
     ``bwa mem``, ``k8 /opt/conda/bin/bwa-postalt.js`` and ``samtools view``.
     Position numbers leave gaps between the stages -- presumably reserved
     for inputs declared elsewhere (not visible here).
     """

     def bare(token, position):
         # Literal command word / shell operator: must not be shell-quoted.
         return ToolArgument(token, position=position, shell_quote=False)

     read_group = StringFormatter(
         "@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{pl}",
         name=InputSelector("sampleName"),
         pl=InputSelector("platformTechnology"),
     )

     # Stage 1: bwa mem and its options.
     bwa_mem_stage = [
         bare("bwa", 0),
         bare("mem", 1),
         # shell_quote is deliberately left at its default for the header.
         ToolArgument(
             read_group,
             position=3,
             prefix="-R",
             doc="Complete read group header line.",
         ),
         ToolArgument(
             CpuSelector(),
             position=3,
             prefix="-t",
             doc="Number of threads. (default = 1)",
             shell_quote=False,
         ),
     ]

     # Stage 2: pipe into the alt-aware post-processing script.
     postalt_stage = [
         bare("|", 6),
         bare("k8", 7),
         bare("/opt/conda/bin/bwa-postalt.js", 7),
     ]

     # Stage 3: pipe into samtools view.
     samtools_view_stage = [
         bare("|", 10),
         bare("samtools", 11),
         bare("view", 12),
         ToolArgument(
             InputSelector("reference"),
             position=13,
             prefix="-T",
             shell_quote=False,
         ),
         ToolArgument(
             CpuSelector(),
             prefix="--threads",
             position=13,
             shell_quote=False,
             doc="(@) Number of additional threads to use [0]",
         ),
         ToolArgument(
             "-h",
             doc="Include header in the output",
             position=13,
             shell_quote=False,
         ),
         ToolArgument(
             "-b",
             doc="Output in the BAM format.",
             position=13,
             shell_quote=False,
         ),
     ]

     return bwa_mem_stage + postalt_stage + samtools_view_stage
Пример #2
0
 def arguments(self):
     """Return the fixed arguments of the ``bwa mem | samtools view`` pipeline.

     The ``-R`` and ``-t`` arguments are listed last but carry position 2,
     so they belong to the ``bwa mem`` half of the command line; everything
     at positions 5-8 belongs to ``samtools view``.
     """
     return [
         ToolArgument("bwa", position=0, shell_quote=False),
         ToolArgument("mem", position=1, shell_quote=False),
         ToolArgument("|", position=5, shell_quote=False),
         ToolArgument("samtools", position=6, shell_quote=False),
         ToolArgument("view", position=7, shell_quote=False),
         ToolArgument(
             InputSelector("reference"),
             prefix="-T",
             position=8,
             shell_quote=False,
         ),
         ToolArgument(
             CpuSelector(),
             position=8,
             shell_quote=False,
             prefix="--threads",
             doc="(-@)  Number of additional threads to use [0]",
         ),
         ToolArgument(
             "-h",
             position=8,
             shell_quote=False,
             doc="Include the header in the output.",
         ),
         ToolArgument(
             "-b",
             position=8,
             shell_quote=False,
             doc="Output in the BAM format.",
         ),
         ToolArgument(
             StringFormatter(
                 "@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{pl}",
                 name=InputSelector("sampleName"),
                 pl=InputSelector("platformTechnology"),
             ),
             prefix="-R",
             position=2,
             # Each fragment ends with a space so the implicit concatenation
             # does not glue words together (previously "TABin the output").
             doc="Complete read group header line. ’\\t’ can be used in STR and will be converted to a TAB "
             "in the output SAM. The read group ID will be attached to every read in the output. "
             "An example is ’@RG\\tID:foo\\tSM:bar’. (Default=null) "
             "https://gatkforums.broadinstitute.org/gatk/discussion/6472/read-groups",
         ),
         ToolArgument(
             CpuSelector(),
             prefix="-t",
             position=2,
             shell_quote=False,
             doc="Number of threads. (default = 1)",
         ),
     ]
Пример #3
0
 def inputs(self):
     """Declare the tool inputs.

     Covers the BAM array, the reference, the two generated file names
     (``--output`` and ``--assembly``), and the optional threads, blacklist
     and working-directory settings.
     """
     # Generated names for the two files written by the tool.
     vcf_name = Filename(suffix=".svs", extension=".vcf")
     assembly_name = Filename(suffix=".assembled", extension=".bam")

     bams = ToolInput("bams", Array(Bam()), position=10)
     reference = ToolInput(
         "reference", FastaWithDict(), prefix="--reference", position=1
     )
     output_filename = ToolInput(
         "outputFilename", vcf_name, prefix="--output", position=2
     )
     assembly_filename = ToolInput(
         "assemblyFilename", assembly_name, prefix="--assembly", position=3
     )
     # threads and tmpdir carry no explicit position.
     threads = ToolInput(
         "threads", Int(optional=True), prefix="--threads", default=CpuSelector()
     )
     blacklist = ToolInput(
         "blacklist", Bed(optional=True), prefix="--blacklist", position=4
     )
     tmpdir = ToolInput(
         "tmpdir", String(optional=True), prefix="--workingdir", default="./TMP"
     )

     return [
         bams,
         reference,
         output_filename,
         assembly_filename,
         threads,
         blacklist,
         tmpdir,
     ]
Пример #4
0
 def inputs(self):
     """Declare the STAR inputs exposed by this tool.

     The shared ``StarAlignerBase.additional_inputs`` come first, followed
     by the inputs defined here. Option names inside the ``doc`` strings are
     written with ASCII ``--`` so they can be copied onto a command line.
     """
     return [
         *StarAlignerBase.additional_inputs,
         ToolInput("help", Boolean(optional=True), prefix="--help", doc="help page"),
         ToolInput(
             "runThreadN",
             Int(optional=True),
             default=CpuSelector(),
             prefix="--runThreadN",
             doc="int: number of threads to run STAR. Default: 1.",
         ),
         ToolInput(
             "genomeDir",
             Directory(optional=True),
             prefix="--genomeDir",
             doc="string: path to the directory where genome files are stored "
             "(for --runMode alignReads) or will be generated "
             "(for --runMode genomeGenerate). Default: ./GenomeDir",
         ),
         ToolInput(
             "readFilesIn",
             Array(FastqGz, optional=True),
             prefix="--readFilesIn",
             separator=",",
             doc="string(s): paths to files that contain input read1 (and, if needed, read2). Default: Read1,Read2.",
         ),
         ToolInput(
             "outFileNamePrefix",
             Filename(),
             prefix="--outFileNamePrefix",
             doc="string: output files name prefix (including full or relative path). Can only be defined on the command line.",
         ),
         ToolInput(
             "outSAMtype",
             Array(String(), optional=True),
             prefix="--outSAMtype",
             # The prefix is emitted once, followed by the space-joined words.
             separator=" ",
             prefix_applies_to_all_elements=False,
             doc="strings: type of SAM/BAM output. "
             '1st word: "BAM": output BAM without sorting, '
             '"SAM": output SAM without sorting, '
             '"None": no SAM/BAM output. '
             '2nd,3rd: "Unsorted": standard unsorted. '
             '"SortedByCoordinate": sorted by coordinate. '
             "This option will allocate extra memory for sorting which can be "
             "specified by --limitBAMsortRAM.",
         ),
         ToolInput(
             "outSAMunmapped",
             String(optional=True),
             prefix="--outSAMunmapped",
             doc="string(s): output of unmapped reads in the SAM format",
         ),
         ToolInput(
             "outSAMattributes",
             String(optional=True),
             prefix="--outSAMattributes",
             doc="string: a string of desired SAM attributes, in the order desired for the output SAM",
         ),
         ToolInput(
             "readFilesCommand",
             String(optional=True),
             prefix="--readFilesCommand",
             doc="string(s): command line to execute for each of the input file. This command should generate FASTA or FASTQ text and send it to stdout",
         ),
     ]
Пример #5
0
 def inputs(self):
     """Declare the inputs: the compressed VCF plus the optional settings.

     Most options are optional Booleans with no default; those are built by
     a small local helper. ``tbi`` (default ``True``), ``minShift`` and
     ``threads`` are spelled out because they differ from that pattern.
     """

     def switch(tag, prefix, doc):
         # Optional Boolean input bound to ``prefix``; no default is set.
         return ToolInput(
             tag=tag,
             input_type=Boolean(optional=True),
             prefix=prefix,
             doc=doc,
         )

     compressed_vcf = ToolInput("vcf", CompressedVcf, position=1, localise_file=True)
     min_shift = ToolInput(
         tag="minShift",
         input_type=Int(optional=True),
         prefix="--min-shift",
         doc="(-m) set minimal interval size for CSI indices to 2^INT [14]",
     )
     tbi = ToolInput(
         tag="tbi",
         input_type=Boolean(optional=True),
         default=True,
         prefix="--tbi",
         doc="(-t) generate TBI-format index for VCF files",
     )
     threads = ToolInput(
         tag="threads",
         input_type=Int(optional=True),
         default=CpuSelector(),
         prefix="--threads",
         doc="sets the number of threads [0]",
     )

     return [
         compressed_vcf,
         switch(
             "csi",
             "--csi",
             "(-c) generate CSI-format index for VCF/BCF files [default]",
         ),
         switch("force", "--force", "(-f) overwrite index if it already exists"),
         min_shift,
         # Disabled in the original source and kept for reference:
         # ToolInput(
         #     tag="outputFilename",
         #     input_type=Filename(suffix=".indexed", extension=".vcf.gz"),
         #     prefix="--output-file",
         #     doc="(-o) optional output index file name",
         # ),
         tbi,
         threads,
         switch(
             "nrecords",
             "--nrecords",
             "(-n) print number of records based on existing index file",
         ),
         switch(
             "stats",
             "--stats",
             "(-s) print per contig stats based on existing index file",
         ),
     ]
Пример #6
0
 def inputs(self):
     """Return the parent's inputs, the shared extras, then ``bam`` and ``threads``.

     The result is always a fresh list; the inherited and shared collections
     are copied rather than mutated.
     """
     inherited = list(super(SamToolsIndexBase, self).inputs())
     shared = list(SamToolsIndexBase.additional_inputs)

     # Both inputs defined here sit at position 10.
     bam = ToolInput("bam", Bam, localise_file=True, position=10)
     threads = ToolInput(
         "threads",
         Int(optional=True),
         position=10,
         prefix="-@",
         default=CpuSelector(),
     )
     return inherited + shared + [bam, threads]
Пример #7
0
 def arguments(self) -> List[ToolArgument]:
     """Return the fixed arguments: configure, then run the generated workflow.

     ``configManta.py`` is the first word (position 0). A literal ``;``
     followed by ``<runDir>/runWorkflow.py`` comes at position 2, and the
     ``-j`` job count at position 3. All three are emitted unquoted.
     """
     # ";" ends the configure command; the script path follows it directly.
     run_workflow = StringFormatter(";") + InputSelector("runDir") + "/runWorkflow.py"
     jobs_doc = (
         "(-j) number of jobs, must be an integer or 'unlimited' "
         "(default: Estimate total cores on this node for local mode, 128 for sge mode)"
     )

     configure = ToolArgument("configManta.py", shell_quote=False, position=0)
     execute = ToolArgument(run_workflow, shell_quote=False, position=2)
     jobs = ToolArgument(
         CpuSelector(None),
         prefix="-j",
         position=3,
         doc=jobs_doc,
         shell_quote=False,
     )
     return [configure, execute, jobs]
Пример #8
0
 def arguments(self):
     """Return the fixed arguments: configure, then run the generated workflow.

     ``configureStrelkaSomaticWorkflow.py`` is the first word (position 0,
     default quoting). A literal ``;`` followed by ``<rundir>/runWorkflow.py``
     comes at position 2 and the ``--jobs`` count at position 3, both
     unquoted.
     """
     # ";" ends the configure command; the script path follows it directly.
     run_workflow = StringFormatter(";") + InputSelector("rundir") + "/runWorkflow.py"
     jobs_doc = (
         " (-j JOBS)  number of jobs, must be an integer or 'unlimited' "
         "(default: Estimate total cores on this node for local mode, 128 for sge mode)"
     )

     configure = ToolArgument("configureStrelkaSomaticWorkflow.py", position=0)
     execute = ToolArgument(run_workflow, shell_quote=False, position=2)
     jobs = ToolArgument(
         CpuSelector(None),
         position=3,
         prefix="--jobs",
         doc=jobs_doc,
         shell_quote=False,
     )
     return [configure, execute, jobs]
Пример #9
0
class BwaMemBase(BioinformaticsTool, ABC):
    """Abstract base definition of the ``bwa mem`` tool.

    Declares the command (``bwa mem``), its positional inputs, the optional
    flags in ``additional_inputs``, the SAM output captured from stdout, the
    default resource requests and the descriptive metadata.
    """

    def tool(self) -> str:
        """Return the tool identifier, ``"bwamem"``."""
        return "bwamem"

    def friendly_name(self) -> str:
        """Return the friendly name, ``"BWA-MEM"``."""
        return "BWA-MEM"

    def tool_provider(self) -> str:
        """Return the provider name, ``"BWA"``."""
        return "BWA"

    def base_command(self):
        """Return the command words that start the command line."""
        return ["bwa", "mem"]

    def inputs(self):
        """Return the positional inputs followed by ``additional_inputs``.

        ``outputFilename`` has neither prefix nor position; it is used by
        ``outputs()`` as the name of the captured stdout.
        """
        return [
            ToolInput("reference", FastaBwa(), position=9),
            ToolInput("reads", FastqGzPair, position=10, doc=None),
            ToolInput("mates", FastqGzPair(optional=True), position=11, doc=None),
            ToolInput("outputFilename", Filename(extension=".sam")),
            # Resolved at call time, so the class attribute defined further
            # down is already available.
            *BwaMemBase.additional_inputs,
        ]

    def outputs(self):
        """Return the single output ``out``: stdout captured as a SAM file."""
        return [
            ToolOutput(
                "out",
                Stdout(Sam(), stdoutname=InputSelector("outputFilename")),
            )
        ]

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request for ``hints``, defaulting to 16.

        A falsy lookup result (e.g. ``None``) selects the default.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, BWA_MEM_TUPLE)
        if val:
            return val
        return 16

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request for ``hints``, defaulting to 16.

        A falsy lookup result (e.g. ``None``) selects the default.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, BWA_CORES_TUPLE)
        if val:
            return val
        return 16

    def bind_metadata(self):
        """Return the ``ToolMetadata`` describing this tool."""
        from datetime import date

        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=date(2018, 12, 24),
            dateUpdated=date(2019, 7, 23),
            institution="Sanger Institute",
            doi=None,
            citation="The BWA-MEM algorithm has not been published yet.",
            keywords=["bwa", "mem", "align"],
            documentationUrl="http://bio-bwa.sourceforge.net/bwa.shtml#3",
            documentation="""bwa - Burrows-Wheeler Alignment Tool
BWA is a software package for mapping low-divergent sequences against a large reference genome, such as the human 
genome. It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for 
Illumina sequence reads up to 100bp, while the rest two for longer sequences ranged from 70bp to 1Mbp. 
BWA-MEM and BWA-SW share similar features such as long-read support and split alignment, but BWA-MEM, which is 
the latest, is generally recommended for high-quality queries as it is faster and more accurate. 
BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina reads.

Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the algorithm works by seeding alignments 
with maximal exact matches (MEMs) and then extending seeds with the affine-gap Smith-Waterman algorithm (SW).

If mates.fq file is absent and option -p is not set, this command regards input reads are single-end. If 'mates.fq' 
is present, this command assumes the i-th read in reads.fq and the i-th read in mates.fq constitute a read pair. 
If -p is used, the command assumes the 2i-th and the (2i+1)-th read in reads.fq constitute a read pair (such input 
file is said to be interleaved). In this case, mates.fq is ignored. In the paired-end mode, the mem command will 
infer the read orientation and the insert size distribution from a batch of reads.

The BWA-MEM algorithm performs local alignment. It may produce multiple primary alignments for different part of a 
query sequence. This is a crucial feature for long sequences. However, some tools such as Picard’s markDuplicates 
does not work with split alignments. One may consider to use option -M to flag shorter split hits as secondary.
""".strip(),
        )

    # Optional flags appended to the positional inputs by ``inputs()``.
    # Tags and prefixes are part of the tool's interface and must stay as
    # they are (including the historical spelling "occurenceDiscard").
    additional_inputs = [
        ToolInput(
            "threads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-t",
            doc="Number of threads. (default = 1)",
        ),
        ToolInput(
            "minimumSeedLength",
            Int(optional=True),
            prefix="-k",
            doc="Matches shorter than INT will be missed. The alignment speed is usually "
            "insensitive to this value unless it significantly deviates from 20. (Default: 19)",
        ),
        ToolInput(
            "bandwidth",
            Int(optional=True),
            prefix="-w",
            doc="Essentially, gaps longer than INT will not be found. Note that the maximum gap length "
            "is also affected by the scoring matrix and the hit length, not solely determined by this option."
            " (Default: 100)",
        ),
        ToolInput(
            "offDiagonalXDropoff",
            Int(optional=True),
            prefix="-d",
            doc="(Z-dropoff): Stop extension when the difference between the best and the current extension "
            "score is above |i-j|*A+INT, where i and j are the current positions of the query and reference, "
            "respectively, and A is the matching score. Z-dropoff is similar to BLAST’s X-dropoff except "
            "that it doesn’t penalize gaps in one of the sequences in the alignment. Z-dropoff not only "
            "avoids unnecessary extension, but also reduces poor alignments inside a long good alignment. "
            "(Default: 100)",
        ),
        ToolInput(
            "reseedTrigger",
            Float(optional=True),
            prefix="-r",
            doc="Trigger re-seeding for a MEM longer than minSeedLen*FLOAT. This is a key heuristic parameter "
            "for tuning the performance. Larger value yields fewer seeds, which leads to faster alignment "
            "speed but lower accuracy. (Default: 1.5)",
        ),
        ToolInput(
            "occurenceDiscard",
            Int(optional=True),
            prefix="-c",
            doc="Discard a MEM if it has more than INT occurrences in the genome. "
            "This is an insensitive parameter. (Default: 10000)",
        ),
        ToolInput(
            "performSW",
            Boolean(optional=True),
            prefix="-P",
            doc="In the paired-end mode, perform SW to rescue missing hits only but "
            "do not try to find hits that fit a proper pair.",
        ),
        ToolInput(
            "matchingScore",
            Int(optional=True),
            prefix="-A",
            doc="Matching score. (Default: 1)",
        ),
        ToolInput(
            "mismatchPenalty",
            Int(optional=True),
            prefix="-B",
            doc="Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. "
            "(Default: 4)",
        ),
        ToolInput(
            "openGapPenalty",
            Int(optional=True),
            prefix="-O",
            doc="Gap open penalty. (Default: 6)",
        ),
        ToolInput(
            "gapExtensionPenalty",
            Int(optional=True),
            prefix="-E",
            doc="Gap extension penalty. A gap of length k costs O + k*E "
            "(i.e. -O is for opening a zero-length gap). (Default: 1)",
        ),
        ToolInput(
            "clippingPenalty",
            Int(optional=True),
            prefix="-L",
            doc="Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best score "
            "reaching the end of query. If this score is larger than the best SW score minus the "
            "clipping penalty, clipping will not be applied. Note that in this case, the SAM AS tag "
            "reports the best SW score; clipping penalty is not deducted. (Default: 5)",
        ),
        ToolInput(
            "unpairedReadPenalty",
            Int(optional=True),
            prefix="-U",
            doc="Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as "
            "scoreRead1+scoreRead2-INT and scores a paired as scoreRead1+scoreRead2-insertPenalty. "
            "It compares these two scores to determine whether we should force pairing. (Default: 9)",
        ),
        ToolInput(
            "assumeInterleavedFirstInput",
            Boolean(optional=True),
            prefix="-p",
            doc="Assume the first input query file is interleaved paired-end FASTA/Q. ",
        ),
        ToolInput(
            "readGroupHeaderLine",
            String(optional=True),
            prefix="-R",
            doc="Complete read group header line. ’\\t’ can be used in STR and will be converted to a TAB "
            "in the output SAM. The read group ID will be attached to every read in the output. "
            "An example is ’@RG\\tID:foo\\tSM:bar’. (Default=null)",
        ),
        ToolInput(
            "outputAlignmentThreshold",
            Int(optional=True),
            prefix="-T",
            doc="Don’t output alignment with score lower than INT. Only affects output. (Default: 30)",
        ),
        ToolInput(
            "outputAllElements",
            Boolean(optional=True),
            prefix="-a",
            doc="Output all found alignments for single-end or unpaired paired-end reads. "
            "These alignments will be flagged as secondary alignments.",
        ),
        ToolInput(
            "appendComments",
            Boolean(optional=True),
            prefix="-C",
            doc="Append FASTA/Q comment to SAM output. This option can be used to transfer "
            "read meta information (e.g. barcode) to the SAM output. Note that the FASTA/Q comment "
            "(the string after a space in the header line) must conform to the SAM spec (e.g. BC:Z:CGTAC). "
            "Malformatted comments lead to incorrect SAM output.",
        ),
        ToolInput(
            "hardClipping",
            Boolean(optional=True),
            prefix="-H",
            doc="Use hard clipping ’H’ in the SAM output. This option may dramatically reduce "
            "the redundancy of output when mapping long contig or BAC sequences.",
        ),
        ToolInput(
            "markShorterSplits",
            Boolean(optional=True),
            prefix="-M",
            doc="Mark shorter split hits as secondary (for Picard compatibility).",
        ),
        ToolInput(
            "verboseLevel",
            Int(optional=True),
            prefix="-v",
            doc="Control the verbose level of the output. "
            "This option has not been fully supported throughout BWA. Ideally, a value: "
            "0 for disabling all the output to stderr; "
            "1 for outputting errors only; "
            "2 for warnings and errors; "
            "3 for all normal messages; "
            "4 or higher for debugging. When this option takes value 4, the output is not SAM. (Default: 3)",
        ),
    ]
class VarDictSomaticCompressedBase(BioinformaticsTool, ABC):
    def friendly_name(self) -> str:
        """Return the friendly name, ``"Vardict (Somatic)"``."""
        return "Vardict (Somatic)"

    def tool_provider(self) -> str:
        """Return the provider name, ``"VarDict"``."""
        return "VarDict"

    def tool(self) -> str:
        """Return the tool identifier, ``"vardict_somatic"``."""
        return "vardict_somatic"

    def base_command(self) -> str:
        """Return the command word that starts the command line, ``"VarDict"``."""
        return "VarDict"

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request for ``hints``, defaulting to 4.

        The value is looked up with ``CORES_TUPLE``; a falsy result
        (e.g. ``None``) selects the default.
        """
        from_hints = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        return from_hints or 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request for ``hints``, defaulting to 8.

        The value is looked up with ``MEM_TUPLE``; a falsy result
        (e.g. ``None``) selects the default.
        """
        from_hints = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        return from_hints or 8

    def inputs(self) -> List[ToolInput]:
        """Return the inputs defined here plus the two shared option lists.

        Order: the inputs below, then ``vardict_inputs``, then
        ``var2vcf_inputs`` (both class attributes).
        """
        bam_doc = "The indexed BAM file"

        own_inputs = [
            ToolInput("tumorBam", BamBai(), doc=bam_doc),
            ToolInput("normalBam", BamBai(), doc=bam_doc),
            ToolInput("intervals", Bed(), shell_quote=False, position=2),
            ToolInput(
                "reference",
                FastaFai(),
                position=1,
                prefix="-G",
                doc="The reference fasta. Should be indexed (.fai). "
                "Defaults to: /ngs/reference_data/genomes/Hsapiens/hg19/seq/hg19.fa",
                shell_quote=False,
            ),
            ToolInput(
                "tumorName",
                String(),
                doc="The sample name to be used directly.  Will overwrite -n option",
            ),
            ToolInput(
                "normalName",
                String(),
                doc="The normal sample name to use with the -b option",
            ),
            ToolInput(
                "alleleFreqThreshold",
                Float(optional=True),
                doc="The threshold for allele frequency, default: 0.05 or 5%",
            ),
            # Bound with a ">" prefix at the last position, unquoted, so it
            # acts as a shell redirection of the pipeline's output.
            ToolInput(
                "outputFilename",
                Filename(extension=".vcf", suffix=".vardict"),
                position=10,
                prefix=">",
                shell_quote=False,
            ),
        ]

        return (
            own_inputs
            + list(VarDictSomaticCompressedBase.vardict_inputs)
            + list(VarDictSomaticCompressedBase.var2vcf_inputs)
        )

    def outputs(self):
        """Return the single output ``out``: a compressed VCF.

        The file is located by globbing for the ``outputFilename`` input.
        """
        produced_file = InputSelector("outputFilename")
        return [ToolOutput("out", CompressedVcf, glob=produced_file)]

    def arguments(self):
        """Return the fixed arguments of the VarDict pipeline.

        Position 1 arguments sit before ``| testsomatic.R |`` (position 3)
        and ``var2vcf_paired.pl`` (position 4); position 5 arguments follow
        ``var2vcf_paired.pl``; ``| bcftools view -O z`` closes the pipeline
        at position 6.
        """
        # "tumor|normal" pairs; these are shell-quoted below so the "|"
        # is passed as data rather than acting as a pipe.
        bam_pair = JoinOperator(
            [InputSelector("tumorBam"), InputSelector("normalBam")], "|"
        )
        tumor_name = InputSelector("tumorName")
        name_pair = JoinOperator(
            [InputSelector("tumorName"), InputSelector("normalName")], "|"
        )

        return [
            ToolArgument("| testsomatic.R |", shell_quote=False, position=3),
            ToolArgument("var2vcf_paired.pl", shell_quote=False, position=4),
            ToolArgument(bam_pair, position=1, prefix="-b", shell_quote=True),
            ToolArgument(tumor_name, position=1, prefix="-N", shell_quote=True),
            ToolArgument(name_pair, position=5, prefix="-N", shell_quote=True),
            # The allele-frequency threshold is passed to both stages.
            ToolArgument(
                InputSelector("alleleFreqThreshold"),
                position=5,
                prefix="-f",
                shell_quote=False,
            ),
            ToolArgument(
                InputSelector("alleleFreqThreshold"),
                position=1,
                prefix="-f",
                shell_quote=False,
            ),
            ToolArgument(" | bcftools view -O z", shell_quote=False, position=6),
        ]

    vardict_inputs = [
        ToolInput(
            "indels3prime",
            Boolean(optional=True),
            prefix="-3",
            position=1,
            shell_quote=False,
            doc=
            "Indicate to move indels to 3-prime if alternative alignment can be achieved.",
        ),
        ToolInput(
            "amplicon",
            Float(optional=True),
            prefix="-a",
            position=1,
            shell_quote=False,
            doc=
            "Indicate it's amplicon based calling.  Reads that don't map to the amplicon will be skipped.  "
            "A read pair is considered belonging  to the amplicon if the edges are less than int bp to "
            "the amplicon, and overlap fraction is at least float.  Default: 10:0.95",
        ),
        ToolInput(
            "minReads",
            Int(optional=True),
            prefix="-B",
            position=1,
            shell_quote=False,
            doc="The minimum # of reads to determine strand bias, default 2",
        ),
        ToolInput(
            "chromNamesAreNumbers",
            Boolean(optional=True),
            prefix="-C",
            position=1,
            shell_quote=False,
            doc=
            "Indicate the chromosome names are just numbers, such as 1, 2, not chr1, chr2",
        ),
        ToolInput(
            "chromColumn",
            Int(optional=True),
            prefix="-c",
            position=1,
            shell_quote=False,
            doc="The column for chromosome",
        ),
        ToolInput(
            "debug",
            Boolean(optional=True),
            prefix="-D",
            position=1,
            shell_quote=False,
            doc=
            "Debug mode.  Will print some error messages and append full genotype at the end.",
        ),
        ToolInput(
            "splitDelimeter",
            String(optional=True),
            prefix="-d",
            position=1,
            shell_quote=False,
            doc='The delimiter for split region_info, default to tab "\t"',
        ),
        ToolInput(
            "geneEndCol",
            Int(optional=True),
            prefix="-E",
            position=1,
            shell_quote=False,
            doc="The column for region end, e.g. gene end",
        ),
        ToolInput(
            "segEndCol",
            Int(optional=True),
            prefix="-e",
            position=1,
            shell_quote=False,
            doc="The column for segment ends in the region, e.g. exon ends",
        ),
        ToolInput(
            "filter",
            String(optional=True),
            prefix="-F",
            position=1,
            shell_quote=False,
            doc=
            "The hexical to filter reads using samtools. Default: 0x500 (filter 2nd alignments and "
            "duplicates). Use -F 0 to turn it off.",
        ),
        ToolInput(
            "geneNameCol",
            Int(optional=True),
            prefix="-g",
            position=1,
            shell_quote=False,
            doc="The column for gene name, or segment annotation",
        ),
        # ToolInput("help", Boolean(optional=True), prefix="-H", position=1, shell_quote=False,
        #           doc="Print this help page"),
        ToolInput(
            "printHeaderRow",
            Boolean(optional=True),
            prefix="-h",
            position=1,
            shell_quote=False,
            doc="Print a header row describing columns",
        ),
        ToolInput(
            "indelSize",
            Int(optional=True),
            prefix="-I",
            position=1,
            shell_quote=False,
            doc="The indel size.  Default: 120bp",
        ),
        ToolInput(
            "outputSplice",
            Boolean(optional=True),
            prefix="-i",
            position=1,
            shell_quote=False,
            doc="Output splicing read counts",
        ),
        ToolInput(
            "performLocalRealignment",
            Int(optional=True),
            prefix="-k",
            position=1,
            shell_quote=False,
            doc=
            "Indicate whether to perform local realignment.  Default: 1.  Set to 0 to disable it. "
            "For Ion or PacBio, 0 is recommended.",
        ),
        ToolInput(
            "minMatches",
            Int(optional=True),
            prefix="-M",
            position=1,
            shell_quote=False,
            doc=
            "The minimum matches for a read to be considered. If, after soft-clipping, the matched "
            "bp is less than INT, then the read is discarded. It's meant for PCR based targeted sequencing "
            "where there's no insert and the matching is only the primers. Default: 0, or no filtering",
        ),
        ToolInput(
            "maxMismatches",
            Int(optional=True),
            prefix="-m",
            position=1,
            shell_quote=False,
            doc=
            "If set, reads with mismatches more than INT will be filtered and ignored. "
            "Gaps are not counted as mismatches. Valid only for bowtie2/TopHat or BWA aln "
            "followed by sampe. BWA mem is calculated as NM - Indels. "
            "Default: 8, or reads with more than 8 mismatches will not be used.",
        ),
        ToolInput(
            "regexSampleName",
            String(optional=True),
            prefix="-n",
            position=1,
            shell_quote=False,
            doc=
            "The regular expression to extract sample name from BAM filenames. "
            "Default to: /([^\/\._]+?)_[^\/]*.bam/",
        ),
        ToolInput(
            "mapq",
            String(optional=True),
            prefix="-O",
            position=1,
            shell_quote=False,
            doc=
            "The reads should have at least mean MapQ to be considered a valid variant. "
            "Default: no filtering",
        ),
        ToolInput(
            "qratio",
            Float(optional=True),
            prefix="-o",
            position=1,
            shell_quote=False,
            doc="The Qratio of (good_quality_reads)/(bad_quality_reads+0.5). "
            "The quality is defined by -q option.  Default: 1.5",
        ),
        ToolInput(
            "readPosition",
            Float(optional=True),
            prefix="-P",
            position=1,
            shell_quote=False,
            doc=
            "The read position filter. If the mean variants position is less that specified, "
            "it's considered false positive.  Default: 5",
        ),
        ToolInput(
            "pileup",
            Boolean(optional=True),
            prefix="-p",
            position=1,
            shell_quote=False,
            doc="Do pileup regardless of the frequency",
        ),
        ToolInput(
            "minMappingQual",
            Int(optional=True),
            prefix="-Q",
            position=1,
            shell_quote=False,
            doc=
            "If set, reads with mapping quality less than INT will be filtered and ignored",
        ),
        ToolInput(
            "phredScore",
            Int(optional=True),
            prefix="-q",
            position=1,
            shell_quote=False,
            doc="The phred score for a base to be considered a good call.  "
            "Default: 25 (for Illumina) For PGM, set it to ~15, as PGM tends to under estimate base quality.",
        ),
        ToolInput(
            "region",
            String(optional=True),
            prefix="-R",
            position=1,
            shell_quote=False,
            doc=
            "The region of interest.  In the format of chr:start-end.  If end is omitted, "
            "then a single position.  No BED is needed.",
        ),
        ToolInput(
            "minVariantReads",
            Int(optional=True),
            prefix="-r",
            position=1,
            shell_quote=False,
            doc="The minimum # of variant reads, default 2",
        ),
        ToolInput(
            "regStartCol",
            Int(optional=True),
            prefix="-S",
            position=1,
            shell_quote=False,
            doc="The column for region start, e.g. gene start",
        ),
        ToolInput(
            "segStartCol",
            Int(optional=True),
            prefix="-s",
            position=1,
            shell_quote=False,
            doc="The column for segment starts in the region, e.g. exon starts",
        ),
        ToolInput(
            "minReadsBeforeTrim",
            Int(optional=True),
            prefix="-T",
            position=1,
            shell_quote=False,
            doc="Trim bases after [INT] bases in the reads",
        ),
        ToolInput(
            "removeDuplicateReads",
            Boolean(optional=True),
            prefix="-t",
            position=1,
            shell_quote=False,
            doc=
            "Indicate to remove duplicated reads.  Only one pair with same start positions will be kept",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-th",
            position=1,
            shell_quote=False,
            doc="Threads count.",
        ),
        ToolInput(
            "freq",
            Int(optional=True),
            prefix="-V",
            position=1,
            shell_quote=False,
            doc=
            "The lowest frequency in the normal sample allowed for a putative somatic mutation. "
            "Defaults to 0.05",
        ),
        ToolInput(
            "vcfFormat",
            Boolean(optional=True),
            prefix="-v",
            position=1,
            shell_quote=False,
            doc="VCF format output",
        ),
        ToolInput(
            "vs",
            String(optional=True),
            prefix="-VS",
            position=1,
            shell_quote=False,
            doc=
            "[STRICT | LENIENT | SILENT] How strict to be when reading a SAM or BAM: "
            "STRICT   - throw an exception if something looks wrong. "
            "LENIENT	- Emit warnings but keep going if possible. "
            "SILENT	- Like LENIENT, only don't emit warning messages. "
            "Default: LENIENT",
        ),
        ToolInput(
            "bp",
            Int(optional=True),
            prefix="-X",
            position=1,
            shell_quote=False,
            doc=
            "Extension of bp to look for mismatches after insersion or deletion.  "
            "Default to 3 bp, or only calls when they're within 3 bp.",
        ),
        ToolInput(
            "extensionNucleotide",
            Int(optional=True),
            prefix="-x",
            position=1,
            shell_quote=False,
            doc=
            "The number of nucleotide to extend for each segment, default: 0",
        ),
        ToolInput(
            "yy",
            Boolean(optional=True),
            prefix="-y",
            position=1,
            shell_quote=False,
            doc="<No content>",
        ),
        ToolInput(
            "downsamplingFraction",
            Int(optional=True),
            prefix="-Z",
            position=1,
            shell_quote=False,
            doc=
            "For downsampling fraction.  e.g. 0.7 means roughly 70% downsampling.  "
            "Default: No downsampling.  Use with caution.  "
            "The downsampling will be random and non-reproducible.",
        ),
        ToolInput(
            "zeroBasedCoords",
            Int(optional=True),
            prefix="-z",
            position=1,
            shell_quote=False,
            doc=
            "0/1  Indicate whether coordinates are zero-based, as IGV uses.  "
            "Default: 1 for BED file or amplicon BED file. Use 0 to turn it off. "
            "When using the -R option, it's set to 0",
        ),
    ]

    # NOTE(review): presumably the ToolInput list for the var2vcf stage,
    # kept alongside the input list defined above -- confirm against the
    # code that consumes it. It is deliberately empty here.
    var2vcf_inputs = []

    def docurl():
        return "https://github.com/AstraZeneca-NGS/VarDict"

    def doc(self):
        return """
Пример #11
0
    def inputs(self) -> List[ToolInput]:
        return [
            ToolInput(
                "inputFile",
                CompressedVcf(),
                prefix="--input_file",
                doc="Input file name. Can use compressed file (gzipped).",
            ),
            ToolInput(
                "outputFilename",
                Filename(
                    prefix=InputSelector("inputFile", remove_file_extension=True),
                    extension=".vcf",
                ),
                prefix="--output_file",
                doc="(-o) Output file name. Results can write to STDOUT by specifying "
                ' as the output file name - this will force quiet mode. Default = "variant_effect_output.txt"',
            ),
            ToolInput(
                "vcf",
                Boolean(),
                default=True,
                prefix="--vcf",
                doc="Writes output in VCF format. Consequences are added in the INFO field of the VCF file, using the "
                'key "CSQ". Data fields are encoded separated by "|"; the order of fields is written in the VCF header.'
                ' Output fields in the "CSQ" INFO field can be selected by using --fields. If the input format was VCF,'
                " the file will remain unchanged save for the addition of the CSQ field (unless using any filtering). "
                "Custom data added with --custom are added as separate fields, using the key specified for each data "
                "file. Commas in fields are replaced with ampersands (&) to preserve VCF format.",
            ),
            # ToolInput('plugin', [PLUGINS](optional=True), prefix='--plugin',
            #           doc='Use named plugin. Plugin modules should be installed in the Plugins subdirectory of the VEP cache directory (defaults to $HOME/.vep/). Multiple plugins can be used by supplying the --plugin flag multiple times. See plugin documentation. Not used by default'),
            ToolInput(
                "help",
                Boolean(optional=True),
                prefix="--help",
                doc="Display help message and quit",
            ),
            ToolInput(
                "quiet",
                Boolean(optional=True),
                prefix="--quiet",
                doc="(-q) Suppress warning messages.Not used by default",
            ),
            ToolInput(
                "verbose",
                Boolean(optional=True),
                prefix="--verbose",
                doc="(-v) Print out a bit more information while running. Not used by default",
            ),
            ToolInput(
                "config",
                File(optional=True),
                prefix="--config",
                doc="""Load configuration options from a config file. The config file should consist of whitespace-separated pairs of option names and settings e.g.:

            output_file   my_output.txt
            species       mus_musculus
            format        vcf
            host          useastdb.ensembl.org

            A config file can also be implicitly read; save the file as $HOME/.vep/vep.ini (or equivalent directory if 
            using --dir). Any options in this file will be overridden by those specified in a config file using --config, 
            and in turn by any options specified on the command line. You can create a quick version file of this by 
            setting the flags as normal and running VEP in verbose (-v) mode. This will output lines that can be copied 
            to a config file that can be loaded in on the next run using --config. Not used by default""",
            ),
            ToolInput(
                "everything",
                Boolean(optional=True),
                prefix="--everything",
                doc="(-e) Shortcut flag to switch on all of the following: --sift b, --polyphen b, --ccds, "
                "--uniprot, --hgvs, --symbol, --numbers, --domains, --regulatory, --canonical, --protein, "
                "--biotype, --uniprot, --tsl, --appris, --gene_phenotype --af, --af_1kg, --af_esp, "
                "--af_gnomad, --max_af, --pubmed, --variant_class, --mane",
            ),
            ToolInput(
                "species",
                String(optional=True),
                prefix="--species",
                doc='Species for your data. This can be the latin name e.g. "homo_sapiens" or any Ensembl alias e.g. '
                '"mouse". Specifying the latin name can speed up initial database connection as the registry does '
                'not have to load all available database aliases on the server. Default = "homo_sapiens"',
            ),
            ToolInput(
                "assembly",
                String(optional=True),
                prefix="--assembly",
                doc="""(-a) Select the assembly version to use if more than one available. If using the cache, you must 
                have the appropriate assembly's cache file installed. If not specified and you have only 1 assembly 
                version installed, this will be chosen by default. Default = use found assembly version""",
            ),
            ToolInput(
                "inputData",
                String(optional=True),
                prefix="--input_data",
                doc="(--id) Raw input data as a string. May be used, for example, to input a single rsID or HGVS "
                "notation quickly to vep: --input_data rs699",
            ),
            ToolInput(
                "format",
                String(optional=True),
                prefix="--format",
                doc='Input file format - one of "ensembl", "vcf", "hgvs", "id", "region", "spdi". By default, '
                "VEP auto-detects the input file format. Using this option you can specify the input file is "
                "Ensembl, VCF, IDs, HGVS, SPDI or region format. Can use compressed version (gzipped) of any "
                "file format listed above. Auto-detects format by default",
            ),
            ToolInput(
                "forceOverwrite",
                Boolean(optional=True),
                prefix="--force_overwrite",
                doc="(--force) By default, VEP will fail with an error if the output file already exists. You can "
                "force the overwrite of the existing file by using this flag. Not used by default",
            ),
            ToolInput(
                "statsFile",
                String(optional=True),
                default="variant_effect_output.txt_summary.html",
                prefix="--stats_file",
                doc="(--sf) Summary stats file name. This is an HTML file containing a summary of the VEP run - the "
                'file name must end ".htm" or ".html". Default = "variant_effect_output.txt_summary.html"',
            ),
            ToolInput(
                "noStats",
                Boolean(optional=True),
                prefix="--no_stats",
                doc="""Don\'t generate a stats file. Provides marginal gains in run time.""",
            ),
            ToolInput(
                "statsText",
                Boolean(optional=True),
                prefix="--stats_text",
                doc="Generate a plain text stats file in place of the HTML.",
            ),
            ToolInput(
                "warningFile",
                Filename(suffix="warning", extension=".txt"),
                prefix="--warning_file",
                doc="File name to write warnings and errors to. Default = STDERR (standard error)",
            ),
            ToolInput(
                "maxSvSize",
                Boolean(optional=True),
                prefix="--max_sv_size",
                doc="Extend the maximum Structural Variant size VEP can process.",
            ),
            ToolInput(
                "noCheckVariantsOrder",
                Boolean(optional=True),
                prefix="--no_check_variants_order",
                doc="Permit the use of unsorted input files. However running VEP on unsorted input files slows down "
                "the tool and requires more memory.",
            ),
            ToolInput(
                "fork",
                Int(optional=True),
                default=CpuSelector(),
                prefix="--fork",
                doc="Enable forking, using the specified number of forks. Forking can dramatically improve runtime. "
                "Not used by default",
            ),
            ToolInput(
                "custom",
                Array(BedTabix, optional=True),
                prefix="--custom",
                prefix_applies_to_all_elements=True,
                doc="Add custom annotation to the output. Files must be tabix indexed or in the bigWig format. "
                "Multiple files can be specified by supplying the --custom flag multiple times. "
                "See https://asia.ensembl.org/info/docs/tools/vep/script/vep_custom.html for full details. "
                "Not used by default",
            ),
            ToolInput(
                "gff",
                File(optional=True),
                prefix="--gff",
                doc="Use GFF transcript annotations in [filename] as an annotation source. "
                "Requires a FASTA file of genomic sequence.Not used by default",
            ),
            ToolInput(
                "gtf",
                File(optional=True),
                prefix="--gtf",
                doc="Use GTF transcript annotations in [filename] as an annotation source. "
                "Requires a FASTA file of genomic sequence.Not used by default",
            ),
            ToolInput(
                "bam",
                Bam(optional=True),
                prefix="--bam",
                doc="ADVANCED Use BAM file of sequence alignments to correct transcript models not derived from "
                "reference genome sequence. Used to correct RefSeq transcript models. "
                "Enables --use_transcript_ref; add --use_given_ref to override this behaviour. Not used by default",
            ),
            ToolInput(
                "useTranscriptRef",
                Boolean(optional=True),
                prefix="--use_transcript_ref",
                doc="By default VEP uses the reference allele provided in the input file to calculate consequences "
                "for the provided alternate allele(s). Use this flag to force VEP to replace the provided "
                "reference allele with sequence derived from the overlapped transcript. This is especially "
                "relevant when using the RefSeq cache, see documentation for more details. The GIVEN_REF and "
                "USED_REF fields are set in the output to indicate any change. Not used by default",
            ),
            ToolInput(
                "useGivenRef",
                Boolean(optional=True),
                prefix="--use_given_ref",
                doc="Using --bam or a BAM-edited RefSeq cache by default enables --use_transcript_ref; add this flag "
                "to override this behaviour and use the provided reference allele from the input. Not used by default",
            ),
            ToolInput(
                "customMultiAllelic",
                Boolean(optional=True),
                prefix="--custom_multi_allelic",
                doc="By default, comma separated lists found within the INFO field of custom annotation VCFs are "
                "assumed to be allele specific. For example, a variant with allele_string A/G/C with associated "
                'custom annotation "single,double,triple" will associate triple with C, double with G and single '
                "with A. This flag instructs VEP to return all annotations for all alleles. Not used by default",
            ),
            ToolInput(
                "tab",
                Boolean(optional=True),
                prefix="--tab",
                doc="Writes output in tab-delimited format. Not used by default",
            ),
            ToolInput(
                "json",
                Boolean(optional=True),
                prefix="--json",
                doc="Writes output in JSON format. Not used by default",
            ),
            ToolInput(
                "compressOutput",
                String(optional=True),
                default="bgzip",
                prefix="--compress_output",
                doc="Writes output compressed using either gzip or bgzip. Not used by default",
            ),
            ToolInput(
                "fields",
                Array(String, optional=True),
                prefix="--fields",
                doc="""Configure the output format using a comma separated list of fields.
Can only be used with tab (--tab) or VCF format (--vcf) output.
For the tab format output, the selected fields may be those present in the default output columns, or 
any of those that appear in the Extra column (including those added by plugins or custom annotations). 
Output remains tab-delimited. For the VCF format output, the selected fields are those present within the ""CSQ"" INFO field.

Example of command for the tab output:

--tab --fields ""Uploaded_variation,Location,Allele,Gene""
Example of command for the VCF format output:

--vcf --fields ""Allele,Consequence,Feature_type,Feature""
Not used by default""",
            ),
            ToolInput(
                "minimal",
                Boolean(optional=True),
                prefix="--minimal",
                doc="Convert alleles to their most minimal representation before consequence calculation i.e. "
                "sequence that is identical between each pair of reference and alternate alleles is trimmed "
                "off from both ends, with coordinates adjusted accordingly. Note this may lead to discrepancies "
                "between input coordinates and coordinates reported by VEP relative to transcript sequences; "
                "to avoid issues, use --allele_number and/or ensure that your input variants have unique "
                "identifiers. The MINIMISED flag is set in the VEP output where relevant. Not used by default",
            ),
            ToolInput(
                "variantClass",
                Boolean(optional=True),
                prefix="--variant_class",
                doc="Output the Sequence Ontology variant class. Not used by default",
            ),
            ToolInput(
                "sift",
                String(optional=True),
                prefix="--sift",
                doc="Species limited SIFT predicts whether an amino acid substitution affects protein function based "
                "on sequence homology and the physical properties of amino acids. VEP can output the prediction "
                "term, score or both. Not used by default",
            ),
            ToolInput(
                "polyphen",
                String(optional=True),
                prefix="--polyphen",
                doc="Human only PolyPhen is a tool which predicts possible impact of an amino acid substitution on "
                "the structure and function of a human protein using straightforward physical and comparative "
                "considerations. VEP can output the prediction term, score or both. VEP uses the humVar score "
                "by default - use --humdiv to retrieve the humDiv score. Not used by default",
            ),
            ToolInput(
                "humdiv",
                Boolean(optional=True),
                prefix="--humdiv",
                doc="Human only Retrieve the humDiv PolyPhen prediction instead of the default humVar. "
                "Not used by default",
            ),
            ToolInput(
                "nearest",
                String(optional=True),
                prefix="--nearest",
                doc="""Retrieve the transcript or gene with the nearest protein-coding transcription start site 
                (TSS) to each input variant. Use ""transcript"" to retrieve the transcript stable ID, ""gene"" to 
                retrieve the gene stable ID, or ""symbol"" to retrieve the gene symbol. Note that the nearest 
                TSS may not belong to a transcript that overlaps the input variant, and more than one may be 
                reported in the case where two are equidistant from the input coordinates.

            Currently only available when using a cache annotation source, and requires the Set::IntervalTree perl module.
            Not used by default""",
            ),
            ToolInput(
                "distance",
                Array(Int, optional=True),
                separator=",",
                prefix="--distance",
                doc="Modify the distance up and/or downstream between a variant and a transcript for which VEP will assign the upstream_gene_variant or downstream_gene_variant consequences. Giving one distance will modify both up- and downstream distances; prodiving two separated by commas will set the up- (5') and down - (3') stream distances respectively. Default: 5000",
            ),
            ToolInput(
                "overlaps",
                Boolean(optional=True),
                prefix="--overlaps",
                doc="Report the proportion and length of a transcript overlapped by a structural variant in VCF format.",
            ),
            ToolInput(
                "genePhenotype",
                Boolean(optional=True),
                prefix="--gene_phenotype",
                doc="Indicates if the overlapped gene is associated with a phenotype, disease or trait. See list of phenotype sources. Not used by default",
            ),
            ToolInput(
                "regulatory",
                Boolean(optional=True),
                prefix="--regulatory",
                doc="Look for overlaps with regulatory regions. VEP can also report if a variant falls in a high information position within a transcription factor binding site. Output lines have a Feature type of RegulatoryFeature or MotifFeature. Not used by default",
            ),
            ToolInput(
                "cellType",
                Boolean(optional=True),
                prefix="--cell_type",
                doc="Report only regulatory regions that are found in the given cell type(s). Can be a single cell type or a comma-separated list. The functional type in each cell type is reported under CELL_TYPE in the output. To retrieve a list of cell types, use --cell_type list. Not used by default",
            ),
            ToolInput(
                "individual",
                Array(String, optional=True),
                prefix="--individual",
                separator=",",
                doc='Consider only alternate alleles present in the genotypes of the specified individual(s). May be a single individual, a comma-separated list or "all" to assess all individuals separately. Individual variant combinations homozygous for the given reference allele will not be reported. Each individual and variant combination is given on a separate line of output. Only works with VCF files containing individual genotype data; individual IDs are taken from column headers. Not used by default',
            ),
            ToolInput(
                "phased",
                Boolean(optional=True),
                prefix="--phased",
                doc="Force VCF genotypes to be interpreted as phased. For use with plugins that depend on phased data. Not used by default",
            ),
            ToolInput(
                "alleleNumber",
                Boolean(optional=True),
                prefix="--allele_number",
                doc="Identify allele number from VCF input, where 1 = first ALT allele, 2 = second ALT allele etc. Useful when using --minimal Not used by default",
            ),
            ToolInput(
                "showRefAllele",
                Boolean(optional=True),
                prefix="--show_ref_allele",
                doc='Adds the reference allele in the output. Mainly useful for the VEP "default" and tab-delimited output formats. Not used by default',
            ),
            ToolInput(
                "totalLength",
                Boolean(optional=True),
                prefix="--total_length",
                doc="Give cDNA, CDS and protein positions as Position/Length. Not used by default",
            ),
            ToolInput(
                "numbers",
                Boolean(optional=True),
                prefix="--numbers",
                doc="Adds affected exon and intron numbering to to output. Format is Number/Total. Not used by default",
            ),
            ToolInput(
                "noEscape",
                Boolean(optional=True),
                prefix="--no_escape",
                doc="Don't URI escape HGVS strings. Default = escape",
            ),
            ToolInput(
                "keepCsq",
                Boolean(optional=True),
                prefix="--keep_csq",
                doc="Don't overwrite existing CSQ entry in VCF INFO field. Overwrites by default",
            ),
            ToolInput(
                "vcfInfoField",
                String(optional=True),
                prefix="--vcf_info_field",
                doc='Change the name of the INFO key that VEP write the consequences to in its VCF output. Use "ANN" for compatibility with other tools such as snpEff. Default: CSQ',
            ),
            ToolInput(
                "terms",
                String(optional=True),
                prefix="--terms",
                doc='(-t) The type of consequence terms to output. The Ensembl terms are described here. The Sequence Ontology is a joint effort by genome annotation centres to standardise descriptions of biological sequences. Default = "SO"',
            ),
            ToolInput(
                "noHeaders",
                Boolean(optional=True),
                prefix="--no_headers",
                doc="Don't write header lines in output files. Default = add headers",
            ),
            ToolInput(
                "hgvs",
                Boolean(optional=True),
                prefix="--hgvs",
                doc="Add HGVS nomenclature based on Ensembl stable identifiers to the output. Both coding and protein sequence names are added where appropriate. To generate HGVS identifiers when using --cache or --offline you must use a FASTA file and --fasta. HGVS notations given on Ensembl identifiers are versioned. Not used by default",
            ),
            ToolInput(
                "hgvsg",
                Boolean(optional=True),
                prefix="--hgvsg",
                doc="Add genomic HGVS nomenclature based on the input chromosome name. To generate HGVS identifiers when using --cache or --offline you must use a FASTA file and --fasta. Not used by default",
            ),
            ToolInput(
                "shiftHgvs",
                Boolean(optional=True),
                prefix="--shift_hgvs",
                doc="""Enable or disable 3\' shifting of HGVS notations. When enabled, this causes ambiguous insertions or deletions (typically in repetetive sequence tracts) to be "shifted" to their most 3' possible coordinates (relative to the transcript sequence and strand) before the HGVS notations are calculated; the flag HGVS_OFFSET is set to the number of bases by which the variant has shifted, relative to the input genomic coordinates. Disabling retains the original input coordinates of the variant. Default: 1 (shift)""",
            ),
            ToolInput(
                "transcriptVersion",
                Boolean(optional=True),
                prefix="--transcript_version",
                doc="Add version numbers to Ensembl transcript identifiers",
            ),
            ToolInput(
                "protein",
                Boolean(optional=True),
                prefix="--protein",
                doc="Add the Ensembl protein identifier to the output where appropriate. Not used by default",
            ),
            ToolInput(
                "symbol",
                Boolean(optional=True),
                prefix="--symbol",
                doc="Adds the gene symbol (e.g. HGNC) (where available) to the output. Not used by default",
            ),
            ToolInput(
                "ccds",
                Boolean(optional=True),
                prefix="--ccds",
                doc="Adds the CCDS transcript identifer (where available) to the output. Not used by default",
            ),
            ToolInput(
                "uniprot",
                Boolean(optional=True),
                prefix="--uniprot",
                doc="Adds best match accessions for translated protein products from three UniProt-related databases (SWISSPROT, TREMBL and UniParc) to the output. Not used by default",
            ),
            ToolInput(
                "tsl",
                Boolean(optional=True),
                prefix="--tsl",
                doc="Adds the transcript support level for this transcript to the output. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "appris",
                Boolean(optional=True),
                prefix="--appris",
                doc="Adds the APPRIS isoform annotation for this transcript to the output. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "canonical",
                Boolean(optional=True),
                prefix="--canonical",
                doc="Adds a flag indicating if the transcript is the canonical transcript for the gene. Not used by default",
            ),
            ToolInput(
                "mane",
                Boolean(optional=True),
                prefix="--mane",
                doc="Adds a flag indicating if the transcript is the MANE Select transcript for the gene. Not used by default. Note: Only available for human on the GRCh38 assembly",
            ),
            ToolInput(
                "biotype",
                Boolean(optional=True),
                prefix="--biotype",
                doc="Adds the biotype of the transcript or regulatory feature. Not used by default",
            ),
            ToolInput(
                "domains",
                Boolean(optional=True),
                prefix="--domains",
                doc="Adds names of overlapping protein domains to output. Not used by default",
            ),
            ToolInput(
                "xrefRefseq",
                Boolean(optional=True),
                prefix="--xref_refseq",
                doc="Output aligned RefSeq mRNA identifier for transcript. Not used by default. Note: The RefSeq and Ensembl transcripts aligned in this way MAY NOT, AND FREQUENTLY WILL NOT, match exactly in sequence, exon structure and protein product",
            ),
            ToolInput(
                "synonyms",
                Tsv(optional=True),
                prefix="--synonyms",
                doc="Load a file of chromosome synonyms. File should be tab-delimited with the primary identifier in column 1 and the synonym in column 2. Synonyms allow different chromosome identifiers to be used in the input file and any annotation source (cache, database, GFF, custom file, FASTA file). Not used by default",
            ),
            ToolInput(
                "checkExisting",
                Boolean(optional=True),
                prefix="--check_existing",
                doc="""Checks for the existence of known variants that are co-located with your input. By default the alleles are compared and variants on an allele-specific basis - to compare only coordinates, use --no_check_alleles.

            Some databases may contain variants with unknown (null) alleles and these are included by default; to exclude them use --exclude_null_alleles.

            See this page for more details.

            Not used by default""",
            ),
            ToolInput(
                "checkSvs",
                Boolean(optional=True),
                prefix="--check_svs",
                doc="Checks for the existence of structural variants that overlap your input. Currently requires database access. Not used by default",
            ),
            ToolInput(
                "clinSigAllele",
                Boolean(optional=True),
                prefix="--clin_sig_allele",
                doc="Return allele specific clinical significance. Setting this option to 0 will provide all known clinical significance values at the given locus. Default: 1 (Provide allele-specific annotations)",
            ),
            ToolInput(
                "excludeNullAlleles",
                Boolean(optional=True),
                prefix="--exclude_null_alleles",
                doc="Do not include variants with unknown alleles when checking for co-located variants. Our human database contains variants from HGMD and COSMIC for which the alleles are not publically available; by default these are included when using --check_existing, use this flag to exclude them. Not used by default",
            ),
            ToolInput(
                "noCheckAlleles",
                Boolean(optional=True),
                prefix="--no_check_alleles",
                doc="""When checking for existing variants, by default VEP only reports a co-located variant if none of the input alleles are novel. For example, if your input variant has alleles A/G, and an existing co-located variant has alleles A/C, the co-located variant will not be reported.

            Strand is also taken into account - in the same example, if the input variant has alleles T/G but on the negative strand, then the co-located variant will be reported since its alleles match the reverse complement of input variant.

            Use this flag to disable this behaviour and compare using coordinates alone. Not used by default""",
            ),
            ToolInput(
                "af",
                Boolean(optional=True),
                prefix="--af",
                doc="Add the global allele frequency (AF) from 1000 Genomes Phase 3 data for any known co-located variant to the output. For this and all --af_* flags, the frequency reported is for the input allele only, not necessarily the non-reference or derived allele. Not used by default",
            ),
            ToolInput(
                "maxAf",
                Boolean(optional=True),
                prefix="--max_af",
                doc="Report the highest allele frequency observed in any population from 1000 genomes, ESP or gnomAD. Not used by default",
            ),
            ToolInput(
                "af1kg",
                String(optional=True),
                prefix="--af_1kg",
                doc="Add allele frequency from continental populations (AFR,AMR,EAS,EUR,SAS) of 1000 Genomes Phase 3 to the output. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "afEsp",
                Boolean(optional=True),
                prefix="--af_esp",
                doc="Include allele frequency from NHLBI-ESP populations. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "afGnomad",
                Boolean(optional=True),
                prefix="--af_gnomad",
                doc="Include allele frequency from Genome Aggregation Database (gnomAD) exome populations. Note only data from the gnomAD exomes are included; to retrieve data from the additional genomes data set, see this guide. Must be used with --cache Not used by default",
            ),
            ToolInput(
                "afExac",
                Boolean(optional=True),
                prefix="--af_exac",
                doc="Include allele frequency from ExAC project populations. Must be used with --cache. Not used by default. Note: ExAC data has been superceded by gnomAD. This flag remains for those wishing to use older cache versions containing ExAC data.",
            ),
            ToolInput(
                "pubmed",
                Boolean(optional=True),
                prefix="--pubmed",
                doc="Report Pubmed IDs for publications that cite existing variant. Must be used with --cache. Not used by default",
            ),
            ToolInput(
                "failed",
                Boolean(optional=True),
                prefix="--failed",
                doc="When checking for co-located variants, by default VEP will exclude variants that have been flagged as failed. Set this flag to include such variants. Default: 0 (exclude)",
            ),
            ToolInput(
                "gencodeBasic",
                Boolean(optional=True),
                prefix="--gencode_basic",
                doc="Limit your analysis to transcripts belonging to the GENCODE basic set. This set has fragmented or problematic transcripts removed. Not used by default",
            ),
            ToolInput(
                "excludePredicted",
                Boolean(optional=True),
                prefix="--exclude_predicted",
                doc='When using the RefSeq or merged cache, exclude predicted transcripts (i.e. those with identifiers beginning with "XM_" or "XR_").',
            ),
            ToolInput(
                "transcriptFilter",
                Boolean(optional=True),
                prefix="--transcript_filter",
                doc='''ADVANCED Filter transcripts according to any arbitrary set of rules. Uses similar notation to filter_vep.

            You may filter on any key defined in the root of the transcript object; most commonly this will be ""stable_id"":

            --transcript_filter ""stable_id match N[MR]_""''',
            ),
            ToolInput(
                "checkRef",
                Boolean(optional=True),
                prefix="--check_ref",
                doc="Force VEP to check the supplied reference allele against the sequence stored in the Ensembl Core database or supplied FASTA file. Lines that do not match are skipped. Not used by default",
            ),
            ToolInput(
                "lookupRef",
                Boolean(optional=True),
                prefix="--lookup_ref",
                doc="Force overwrite the supplied reference allele with the sequence stored in the Ensembl Core database or supplied FASTA file. Not used by default",
            ),
            ToolInput(
                "dontSkip",
                Boolean(optional=True),
                prefix="--dont_skip",
                doc="Don't skip input variants that fail validation, e.g. those that fall on unrecognised sequences. Combining --check_ref with --dont_skip will add a CHECK_REF output field when the given reference does not match the underlying reference sequence.",
            ),
            ToolInput(
                "allowNonVariant",
                Boolean(optional=True),
                prefix="--allow_non_variant",
                doc="When using VCF format as input and output, by default VEP will skip non-variant lines of input (where the ALT allele is null). Enabling this option the lines will be printed in the VCF output with no consequence data added.",
            ),
            ToolInput(
                "chr",
                Array(String, optional=True),
                prefix="--chr",
                separator=",",
                doc='Select a subset of chromosomes to analyse from your file. Any data not on this chromosome in the input will be skipped. The list can be comma separated, with "-" characters representing an interval. For example, to include chromosomes 1, 2, 3, 10 and X you could use --chr 1-3,10,X Not used by default',
            ),
            ToolInput(
                "codingOnly",
                Boolean(optional=True),
                prefix="--coding_only",
                doc="Only return consequences that fall in the coding regions of transcripts. Not used by default",
            ),
            ToolInput(
                "noIntergenic",
                Boolean(optional=True),
                prefix="--no_intergenic",
                doc="Do not include intergenic consequences in the output. Not used by default",
            ),
            ToolInput(
                "pick",
                Boolean(optional=True),
                prefix="--pick",
                doc="Pick once line or block of consequence data per variant, including transcript-specific columns. Consequences are chosen according to the criteria described here, and the order the criteria are applied may be customised with --pick_order. This is the best method to use if you are interested only in one consequence per variant. Not used by default",
            ),
            ToolInput(
                "pickAllele",
                Boolean(optional=True),
                prefix="--pick_allele",
                doc="Like --pick, but chooses one line or block of consequence data per variant allele. Will only differ in behaviour from --pick when the input variant has multiple alternate alleles. Not used by default",
            ),
            ToolInput(
                "perGene",
                Boolean(optional=True),
                prefix="--per_gene",
                doc="Output only the most severe consequence per gene. The transcript selected is arbitrary if more than one has the same predicted consequence. Uses the same ranking system as --pick. Not used by default",
            ),
            ToolInput(
                "pickAlleleGene",
                Boolean(optional=True),
                prefix="--pick_allele_gene",
                doc="Like --pick_allele, but chooses one line or block of consequence data per variant allele and gene combination. Not used by default",
            ),
            ToolInput(
                "flagPick",
                Boolean(optional=True),
                prefix="--flag_pick",
                doc="As per --pick, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "flagPickAllele",
                Boolean(optional=True),
                prefix="--flag_pick_allele",
                doc="As per --pick_allele, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "flagPickAlleleGene",
                Boolean(optional=True),
                prefix="--flag_pick_allele_gene",
                doc="As per --pick_allele_gene, but adds the PICK flag to the chosen block of consequence data and retains others. Not used by default",
            ),
            ToolInput(
                "pickOrder",
                Array(String, optional=True),
                prefix="--pick_order",
                separator=",",
                doc="""Customise the order of criteria (and the list of criteria) applied when choosing a block of annotation data with one of the following options: --pick, --pick_allele, --per_gene, --pick_allele_gene, --flag_pick, --flag_pick_allele, --flag_pick_allele_gene. See this page for the default order.
            Valid criteria are: [ canonical appris tsl biotype ccds rank length mane ]. e.g.:

            --pick --pick_order tsl,appris,rank""",
            ),
            ToolInput(
                "mostSevere",
                Boolean(optional=True),
                prefix="--most_severe",
                doc="Output only the most severe consequence per variant. Transcript-specific columns will be left blank. Consequence ranks are given in this table. To include regulatory consequences, use the --regulatory option in combination with this flag. Not used by default",
            ),
            ToolInput(
                "summary",
                Boolean(optional=True),
                prefix="--summary",
                doc="Output only a comma-separated list of all observed consequences per variant. Transcript-specific columns will be left blank. Not used by default",
            ),
            ToolInput(
                "filterCommon",
                Boolean(optional=True),
                prefix="--filter_common",
                doc="Shortcut flag for the filters below - this will exclude variants that have a co-located existing variant with global AF > 0.01 (1%). May be modified using any of the following freq_* filters. Not used by default",
            ),
            ToolInput(
                "checkFrequency",
                Boolean(optional=True),
                prefix="--check_frequency",
                doc="Turns on frequency filtering. Use this to include or exclude variants based on the frequency of co-located existing variants in the Ensembl Variation database. You must also specify all of the --freq_* flags below. Frequencies used in filtering are added to the output under the FREQS key in the Extra field. Not used by default",
            ),
            ToolInput(
                "freqPop",
                String(optional=True),
                prefix="--freq_pop",
                doc="Name of the population to use in frequency filter. This must be one of the following: (1KG_ALL, 1KG_AFR, 1KG_AMR, 1KG_EAS, 1KG_EUR, 1KG_SAS, AA, EA, gnomAD, gnomAD_AFR, gnomAD_AMR, gnomAD_ASJ, gnomAD_EAS, gnomAD_FIN, gnomAD_NFE, gnomAD_OTH, gnomAD_SAS)",
            ),
            ToolInput(
                "freqFreq",
                Float(optional=True),
                prefix="--freq_freq",
                doc="Allele frequency to use for filtering. Must be a float value between 0 and 1",
            ),
            ToolInput(
                "freqGtLt",
                String(optional=True),
                prefix="--freq_gt_lt",
                doc="Specify whether the frequency of the co-located variant must be greater than (gt) or less than (lt) the value specified with --freq_freq",
            ),
            ToolInput(
                "freqFilter",
                String(optional=True),
                prefix="--freq_filter",
                doc="Specify whether to exclude or include only variants that pass the frequency filter",
            ),
            # CADD plugin
            ToolInput("caddReference", Array(VcfTabix, optional=True)),
            # Condel
            ToolInput(
                "condelConfig",
                Directory(optional=True),
                doc="Directory containing CondelPlugin config, in format: '<dir>/condel_SP.conf'",
            ),
            # dbNSFP
            ToolInput("dbnspReference", VcfTabix(optional=True), doc=""),
            ToolInput("dbsnpColumns", Array(String, optional=True)),
            # REVEL
            ToolInput("revelReference", VcfTabix(optional=True)),
            # CUSTOM
            ToolInput("custom1Reference", VcfTabix(optional=True)),
            ToolInput("custom1Columns", Array(String, optional=True)),
            ToolInput("custom2Reference", VcfTabix(optional=True)),
            ToolInput("custom2Columns", Array(String, optional=True)),
        ]
Пример #12
0
class PiscesVariantCallerBase(IlluminaToolBase):
    def tool(self) -> str:
        """Return the tool name string, ``PiscesVariantCaller``."""
        tool_name = "PiscesVariantCaller"
        return tool_name

    def friendly_name(self) -> str:
        """Return the human-readable display name of the tool."""
        display_name = "Pisces: Variant Caller"
        return display_name

    def bind_metadata(self):
        """Build and return the ``ToolMetadata`` describing this tool.

        The record carries the contributor, creation/update dates, institution,
        keywords and a short documentation string; DOI, citation and
        documentation URL are left unset/empty.
        """
        import datetime

        created_on = datetime.date(2021, 8, 19)
        updated_on = datetime.date(2021, 10, 12)

        return ToolMetadata(
            documentation="Calls variants",
            documentationUrl="",
            keywords=["Illumina", "Pisces", "Variant Caller"],
            citation="",
            doi=None,
            institution="Illumina",
            dateUpdated=updated_on,
            dateCreated=created_on,
            contributors=["Miriam M Yeung"],
        )

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU value resolved from *hints*, defaulting to 4.

        The lookup is delegated to
        ``get_value_for_hints_and_ordered_resource_tuple`` with ``CORES_TUPLE``;
        any falsy result (e.g. ``None`` or ``0``) falls back to the default.
        """
        default_cores = 4
        hinted_cores = get_value_for_hints_and_ordered_resource_tuple(
            hints, CORES_TUPLE
        )
        return hinted_cores if hinted_cores else default_cores

    def memory(self, hints: Dict[str, Any]):
        """Return the memory value resolved from *hints*, defaulting to 4.

        The lookup is delegated to
        ``get_value_for_hints_and_ordered_resource_tuple`` with ``MEM_TUPLE``;
        any falsy result (e.g. ``None`` or ``0``) falls back to the default.
        """
        default_memory = 4
        hinted_memory = get_value_for_hints_and_ordered_resource_tuple(
            hints, MEM_TUPLE
        )
        return hinted_memory if hinted_memory else default_memory

    def base_command(self):
        """Return ``None``: this tool declares no base command.

        NOTE(review): presumably the executable is supplied through the
        positional entries returned by ``arguments()`` instead -- confirm.
        """
        no_base_command = None
        return no_base_command

    def arguments(self):
        """Return the three fixed, unquoted leading arguments of the command.

        In position order: the ``TMPDIR`` export statement, ``dotnet``, and the
        path to ``Pisces.dll`` formatted from the ``piscesVersion`` input.
        """
        export_tmpdir = ToolArgument(
            "export TMPDIR=/tmp;", shell_quote=False, position=1
        )
        dotnet_runtime = ToolArgument("dotnet", shell_quote=False, position=2)

        # The DLL location depends on the version chosen via "piscesVersion".
        dll_path = StringFormatter(
            "/app/Pisces_v{PISCES_VERSION}/Pisces.dll",
            PISCES_VERSION=InputSelector("piscesVersion"),
        )
        pisces_dll = ToolArgument(dll_path, shell_quote=False, position=3)

        return [export_tmpdir, dotnet_runtime, pisces_dll]

    def inputs(self) -> List[ToolInput]:
        """Return the tool inputs declared here followed by ``pisces_additional_args``.

        A new list is built on every call: the inputs defined in this method
        come first, in declaration order, and the entries of the class-level
        ``pisces_additional_args`` list are appended after them.
        """
        # No prefix/position: this value is only consumed by the
        # InputSelector("piscesVersion") used in arguments().
        pisces_version = ToolInput("piscesVersion", String())

        input_bam = ToolInput(
            "inputBam",
            BamBai(),
            doc="Input BAM file",
            prefix="-b",
            position=4,
            shell_quote=False,
        )
        reference_folder = ToolInput(
            "referenceFolder",
            Directory(),
            doc="Folder containing reference genome files",
            prefix="--genomefolders",
            position=5,
            shell_quote=False,
        )
        output_dir = ToolInput(
            "outputDir",
            String(),
            doc="Output directory",
            prefix="--outfolder",
            position=4,
            shell_quote=False,
        )
        interval_bed = ToolInput(
            "intervalBedFile",
            Bed(optional=True),
            doc="Bed File denoting regions to call variants.",
            prefix="-i",
            position=5,
            shell_quote=False,
        )
        min_base_quality = ToolInput(
            "minimumBaseQuality",
            Int(optional=True),
            doc="Minimum base call quality to use base in read. (Default: 20)",
            default=20,
            prefix="--minbq",
            position=5,
            shell_quote=False,
        )
        call_mnvs = ToolInput(
            "callMNVs",
            String(optional=True),
            doc="Call Multi Nucleotide Variants (aka Phased SNPs). (Default: false)",
            prefix="--callmnvs",
            position=5,
            shell_quote=False,
        )
        output_sb_files = ToolInput(
            "outputSBFiles",
            String(optional=True),
            doc="Boolean Flag to output strand bias files. (Default: false)",
            prefix="--outputsbfiles",
            position=5,
            shell_quote=False,
        )

        all_inputs = [
            pisces_version,
            input_bam,
            reference_folder,
            output_dir,
            interval_bed,
            min_base_quality,
            call_mnvs,
            output_sb_files,
        ]
        # Shared options declared once on the class are appended last.
        all_inputs.extend(self.pisces_additional_args)
        return all_inputs

    def outputs(self) -> List[ToolOutput]:
        """Return the tool outputs, each collected via a wildcard glob.

        ``vcf`` matches ``*.vcf``; the optional ``used_options`` and
        ``strandmetrics`` files match ``PiscesLogs/*.json`` and
        ``*ReadStrandBias.txt`` respectively.
        """
        called_vcf = ToolOutput("vcf", Vcf(), glob=WildcardSelector("*.vcf"))

        options_log = ToolOutput(
            "used_options",
            File(optional=True),
            glob=WildcardSelector("PiscesLogs/*.json"),
        )

        strand_bias_metrics = ToolOutput(
            "strandmetrics",
            File(optional=True),
            glob=WildcardSelector("*ReadStrandBias.txt"),
        )

        return [called_vcf, options_log, strand_bias_metrics]

    pisces_additional_args = [
        ToolInput(
            "forcedAlleles",
            Vcf(optional=True),
            prefix="--forcedalleles",
            position=5,
            shell_quote=False,
            doc="Path to vcf of alleles where reporting is forced",
        ),
        ToolInput(
            "maxMNVLength",
            Int(optional=True),
            prefix="--maxmnvlength",
            position=5,
            shell_quote=False,
            doc="Maximum lenght of phased SNPs. Must be between 1 - 1000. (Default: 3)",
        ),
        ToolInput(
            "maxGapBetweenMNV",
            Int(optional=True),
            prefix="--maxgapbetweenmnv",
            position=5,
            shell_quote=False,
            doc="Maximum gap allowed between phased SNPs. Must be greater than 0. (Default: 1)",
        ),
        ToolInput(
            "collapseVariants",
            String(optional=True),
            prefix="--collapse",
            position=5,
            shell_quote=False,
            doc="Boolean flag for whether to collapse variants. (Default: true)",
        ),
        ToolInput(
            "collapseFreqThreshold",
            Float(optional=True),
            prefix="--collapsefreqthreshold",
            position=5,
            shell_quote=False,
            doc="when collapsing, minimum frequency of targetted variants. (Default: 0)",
        ),
        ToolInput(
            "collpaseFreqRatioThreshold",
            Float(optional=True),
            prefix="--collapsefreqratiothreshold",
            position=5,
            shell_quote=False,
            doc="When collapsing, minimum ratio requred of target variant frequency to collapsible variant frequency. (Default: 0.5)",
        ),
        ToolInput(
            "priorsPath",
            Vcf(optional=True),
            prefix="--priorspath",
            position=5,
            shell_quote=False,
            doc="Path to vcf file containing known variants, to preferentially reconcile collapsed variants",
        ),
        ToolInput(
            "trimMNVPriors",
            String(optional=True),
            prefix="--trimmnvpriors",
            position=5,
            shell_quote=False,
            doc="Boolean denoting if preceeding bases from the priorsPath VCF shoudl be trimmed. Note: COSMIC convention includeds preceeeding base for a MNV. (Default: false)",
        ),
        ToolInput(
            "coverageMethod",
            String(optional=True),
            prefix="--coveragemethod",
            position=5,
            shell_quote=False,
            doc="'approximate' or 'exact'. Exact is more precise and requires a minimum of 8GB of memory. (Default: approximate)",
        ),
        ToolInput(
            "baseLogName",
            String(optional=True),
            prefix="--baselogname",
            position=5,
            shell_quote=False,
            doc="",
        ),
        ToolInput(
            "debug",
            String(optional=True),
            prefix="-d",
            position=5,
            shell_quote=False,
            doc="Boolean flag for debugging",
        ),
        ToolInput(
            "useStitchedXD",
            String(optional=True),
            prefix="--usestitchedxd",
            position=5,
            shell_quote=False,
            doc="Boolean denoting whether the XD tag (stitched direction) is specified in the bam. ONLY USE IF USING GEMINI TO STITCH BAMS.",
        ),
        ToolInput(
            "trackedAnchorSize",
            Float(optional=True),
            prefix="--trackanchorsize",
            position=5,
            shell_quote=False,
            doc="Max size of anchor tor granularly track, when collecting reference coverage at insertion sites. Higher values == more precise (Default: 5)",
        ),
        ToolInput(
            "chrFilter",
            String(optional=True),
            prefix="--chrfilter",
            position=5,
            shell_quote=False,
            doc="Chromosome to process, will filter out all other chromosomes from output if specified. (Default: None)",
        ),
        ToolInput(
            "outFolder",
            String(optional=True),
            prefix="-o",
            position=4,
            shell_quote=False,
            doc="Ouput folder path",
        ),
        ToolInput(
            "maxThreads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-t",
            position=4,
            shell_quote=False,
            doc="Maximum number of threads. (Default: 20)",
        ),
        ToolInput(
            "threadByChr",
            String(optional=True),
            prefix="--threadbychr",
            position=5,
            shell_quote=False,
            doc="Parallelize by chromosome. (Default: false)",
        ),
        ToolInput(
            "multiProcess",
            String(optional=True),
            prefix="--multiprocess",
            position=5,
            shell_quote=False,
            doc="When thread by chr, launch separate processes to parallelize. (Default: true)",
        ),
        ## Bam Filtering Options
        ToolInput(
            "minimumMappingQuality",
            Int(),
            prefix="--minmq",
            position=5,
            shell_quote=False,
            default=1,
            doc="Minimum mapping quality to use a read. (Default: 1)",
        ),
        ToolInput(
            "filterDuplicates",
            String(optional=True),
            prefix="--filterduplicates",
            position=5,
            shell_quote=False,
            doc="Boolean Flag to filter out reads marked as duplicates. (Default: true)",
        ),
        ToolInput(
            "onlyUseProperPairs",
            String(optional=True),
            prefix="--pp",
            position=5,
            shell_quote=False,
            doc="Boolean Flag to only use proper pairs. (Default: false)",
        ),
        ## Variant Calling Options
        ToolInput(
            "minimumVariantQualityScore",
            Int(optional=True),
            prefix="--minvariantqscore",
            position=5,
            shell_quote=False,
            doc="Minimum Variant Quality Score to report a variant. (Default: 20)",
        ),
        ToolInput(
            "minimumCoverage",
            Int(optional=True),
            prefix="--mindepth",
            position=5,
            shell_quote=False,
            doc="Minimum depth to call a variant. (Default: 10)",
        ),
        ToolInput(
            "minimumVariantFrequency",
            Float(optional=True),
            prefix="--minimumvariantfrequency",
            position=5,
            shell_quote=False,
            doc="Minimum variant frequency to call a variant. Must be between 0 and 1. (Default: 0.01)",
        ),
        ToolInput(
            "targetLODFrequency",
            Float(optional=True),
            prefix="--targetvf",
            position=5,
            shell_quote=False,
            doc="Target Frequency to call a variant (i.e. to target a 5% allele frequency, we must call down to 2.6%, to capture a 5% allelle 95% of the time). Parameter is used by the Somatic Genotyping Model",
        ),
        ToolInput(
            "variantQualityFilter",
            Int(optional=True),
            prefix="--variantqualityfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for variant quality score filter to report a variant as 'FilteredVariantQScore'. (Default: 30)",
        ),
        ToolInput(
            "minimumVariantFrequencyFilter",
            Float(optional=True),
            prefix="--minvariantfrequencyfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for variant frequency to report a variant as 'FilteredVariantFrequency'. (Default: None)",
        ),
        ToolInput(
            "genotypeQualityFilter",
            Int(optional=True),
            prefix="--gqfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for genotype quality, if below the threshold, variant is reported as 'FilteredGenotype'. Should be greater than 0. (Default: None)",
        ),
        ToolInput(
            "minimumDepthFilter",
            Int(optional=True),
            prefix="--mindepthfilter",
            position=5,
            shell_quote=False,
            doc="Threshold for reporting variants as 'FilteredLowDepth', if below the given threshold. (Default: None)",
        ),
        ToolInput(
            "enableSingleStrandFilter",
            String(optional=True),
            prefix="--ssfilter",
            position=5,
            shell_quote=False,
            doc="Filter variants with coverage limited to a single strand with filter flag 'SB'",
        ),
        ToolInput(
            "strandBiasModel",
            String(optional=True),
            prefix="--sbmodel",
            position=5,
            shell_quote=False,
            doc="Strand Bias Mode. Must be 'poisson|extended'. (Default: extended)",
        ),
        ToolInput(
            "noiseLevelForQModel",
            Int(optional=True),
            prefix="--NoiseLevelForQModel",
            position=5,
            shell_quote=False,
            doc="Noise Level to be used in the quality model for a variant quality score. Which is used to determine false positives. Must be >= 0. (Default: minimum base quality)\nNOTE: If this value is greater than the minBQ, it implies that the variant calls have higher confidence than the recorded BQ.",
        ),
        ToolInput(
            "ploidy",
            String(optional=True),
            prefix="--ploidy",
            position=5,
            shell_quote=False,
            doc="Ploidy model to determine the genotype of variant. Select from 'somatic|diploid|DiploidByAdaptiveGT'. (Default: somatic)",
        ),
        ToolInput(
            "diploidSNVGenotypeParameters",
            String(optional=True),
            prefix="--diploidsnvgenotypeparameters",
            position=5,
            shell_quote=False,
            doc="Comma-separated List of 3 floats in the format A,B,C. All must be between 0 and 1. A = Minimum Allelle frequence to be detected as 0/1(heterozygous), B = Maximum Allele frequence to be detected as 0/1, C = Minimum value for the sum of allells 1 and 2 (i.e. if C is not met the sit is flagged as 'Multiallelic'). (Default: 0.20, 0.70, 0.80)",
        ),
        ToolInput(
            "diploidIndelGenotypeParameters",
            String(optional=True),
            prefix="--diploidindelgenotypeparameters",
            position=5,
            shell_quote=False,
            doc="Comma-separated List of 3 floats in the format A,B,C. All must be between 0 and 1. A = Minimum Allelle frequence to be detected as 0/1(heterozygous), B = Maximum Allele frequence to be detected as 0/1, C = Minimum value for the sum of allells 1 and 2 (i.e. if C is not met the sit is flagged as 'Multiallelic'). (Default: 0.20, 0.70, 0.80)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersSNV",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_snvmodel",
            position=5,
            shell_quote=False,
            doc="Comma-separated list of 4 floats in the format A,B,C,D. (Default: 0.034,0.167,0.499,0.998)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersIndel",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_indelmodel",
            position=5,
            shell_quote=False,
            doc="(Default: 0.037,0.443,0.905)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersSNVPrior",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_snvprior",
            position=5,
            shell_quote=False,
            doc="(Default: 0.729,0.044,0.141,0.087)",
        ),
        ToolInput(
            "adaptiveGenotypeParametersIndelPrior",
            String(optional=True),
            prefix="--adaptivegenotypeparameters_indelprior",
            position=5,
            shell_quote=False,
            doc="(Default: 0.962,0.0266,0.0114)",
        ),
        ToolInput(
            "maximumVariantQualityScore",
            Int(optional=True),
            prefix="--maxvq",
            position=5,
            shell_quote=False,
            doc="Maximum variant quality score possible. (Default: 100)",
        ),
        ToolInput(
            "maximumGenotypeQualityScore",
            Int(optional=True),
            prefix="--maxgq",
            position=5,
            shell_quote=False,
            doc="Maximum genotype quality score possible. (Default: 100)",
        ),
        ToolInput(
            "maximumGenotypePosteriorScore",
            Int(optional=True),
            prefix="--maxgp",
            position=5,
            shell_quote=False,
            doc="Maximum Genotype Posterior score. (Default: 300)",
        ),
        ToolInput(
            "minimumGenotypeQualityScore",
            Int(optional=True),
            prefix="--mingq",
            position=5,
            shell_quote=False,
            doc="Minimum genotype quality score. (Default: 0)",
        ),
        ToolInput(
            "RMxNFilter",
            String(optional=True),
            prefix="--rmxnfilter",
            position=5,
            shell_quote=False,
            doc="Comma-separated list in the format M,N,F, indicating the max length of a repeat region(M), the minimum number of repeatitions (N), to be applied if the variant frequency is less than (F). (Default: 5,8,0.20)",
        ),
        ToolInput(
            "noCallFilter",
            Float(optional=True),
            prefix="--ncfilter",
            position=5,
            shell_quote=False,
            doc="No Call rate filter",
        ),
        ## Vcf Writer options
        ToolInput(
            "gVCF",
            String(optional=True),
            prefix="--gvcf",
            position=5,
            shell_quote=False,
            doc="Output as a gVCF. (Default: false)",
        ),
        ToolInput(
            "crushVCF",
            String(optional=True),
            prefix="--crushvcf",
            position=5,
            shell_quote=False,
            doc="Crush vcf output into one line per loci. (Default: false)",
        ),
        ToolInput(
            "reportNoCalls",
            String(optional=True),
            prefix="--reportnocalls",
            position=5,
            shell_quote=False,
            doc="Report the proportion of no-calls in the output. (Default: false)",
        ),
        ToolInput(
            "reportReadCollapsedReadCount",
            String(optional=True),
            prefix="--reportrccounts",
            position=5,
            shell_quote=False,
            doc="Debugging helper, when BAM files contain X1 & X2 tags, reports collapsed read counts for the categories 'duplex-stitched|duplex-nonstitched|simplex-stitched|simplex-nonstitched'. (Default: false)",
        ),
        ToolInput(
            "reportTemplateStrandCounts",
            String(optional=True),
            prefix="--reporttscounts",
            position=5,
            shell_quote=False,
            doc="Debugging helper, conditional on ReportRcCounts. Reports read counts for different template strands for the categories 'duplex-stitched|duplex-nonstitched|simplex-forward-stitched|simplex-forward-nonstitched|simplex-reverse-stitched|simplex-reverse-nonstitched''",
        ),
        ToolInput(
            "reportSuspiciousCoverageFraction",
            String(optional=True),
            prefix="--reportsuspiciouscoveragefraction",
            position=5,
            shell_quote=False,
            doc="Debugging helper, Reports the fraction of total coverage that is 'suspicious'. i.e. unanchored and bearing some resemblance to an insertion at the site. Note that for spanning varaints, this is start + end coverage, therefore up to double the coverage reported. (Default: false)",
        ),
    ]
Пример #13
0
class ScrambleBase(BioinformaticsTool, ABC):
    """Tool definition for io_lib's ``scramble``, configured for BAM -> CRAM.

    The fixed arguments select BAM input (``-I bam``), CRAM output
    (``-O cram``), compression level ``-9`` and CRAM version ``3.0``. The
    result is declared as a ``Cram`` captured from stdout into the file named
    by the ``outputFilename`` input.
    """

    def tool(self):
        """Return the tool identifier."""
        return "scramble"

    def friendly_name(self):
        """Return the display name of the tool."""
        return "scramble"

    def tool_provider(self):
        """Return the name of the package that provides the tool."""
        return "io_lib"

    def base_command(self):
        """Return the executable that every invocation starts with."""
        return ["scramble"]

    def inputs(self):
        """Return the required inputs followed by the shared optional ones."""
        return [
            # position=200 is far above any other position used in this class
            # (presumably so the input file follows every option -- confirm
            # against the framework's ordering rules).
            ToolInput("inputFilename", Bam(), position=200),
            ToolInput(
                "reference",
                FastaFai(),
                prefix="-r",
                doc="Reference sequence file.",
            ),
            # Referenced by outputs() as the stdout file name. The extension is
            # ".cram" because arguments() hard-codes "-O cram" and the output is
            # typed Cram(); the previous ".bam" gave CRAM data a BAM file name.
            ToolInput("outputFilename", Filename(extension=".cram")),
            *ScrambleBase.additional_inputs,
        ]

    def arguments(self):
        """Return the fixed arguments that pin the conversion to BAM -> CRAM."""
        return [
            ToolArgument("bam", prefix="-I", doc="input data format"),
            ToolArgument("cram", prefix="-O", doc="output data format"),
            ToolArgument(
                "-9",
                doc="compression settings for output cram file (-1=fast,-9=best)",
            ),
            ToolArgument("3.0", prefix="-V", doc="Cram version to output"),
        ]

    def outputs(self):
        """Return the single ``out`` output: stdout captured as a ``Cram``."""
        return [
            ToolOutput(
                "out",
                Stdout(Cram(), stdoutname=InputSelector("outputFilename")),
            )
        ]

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request, preferring a value resolved from hints.

        Falls back to 16 when the lookup against ``SCRAMBLE_MEM_TUPLE`` yields
        nothing truthy (unit is presumably gigabytes -- confirm with the
        framework).
        """
        from_hints = get_value_for_hints_and_ordered_resource_tuple(
            hints, SCRAMBLE_MEM_TUPLE
        )
        return from_hints if from_hints else 16

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request, preferring a value resolved from hints.

        Falls back to 4 when the lookup against ``SCRAMBLE_CORES_TUPLE`` yields
        nothing truthy.
        """
        from_hints = get_value_for_hints_and_ordered_resource_tuple(
            hints, SCRAMBLE_CORES_TUPLE
        )
        return from_hints if from_hints else 4

    def bind_metadata(self):
        """Return the static metadata (authorship, dates, docs) for the tool."""
        from datetime import date

        return ToolMetadata(
            contributors=["Matthias De Smet (@mattdsm)"],
            dateCreated=date(2020, 2, 27),
            dateUpdated=date(2020, 2, 27),
            institution="None",
            doi=None,
            keywords=["bam", "cram", "compression"],
            documentationUrl="https://github.com/jkbonfield/io_lib/",
            documentation="scramble: streaming bam to cram compression",
        )

    # Optional inputs appended to inputs(). Defined at class level and looked
    # up as ScrambleBase.additional_inputs, so the name only has to exist by
    # the time inputs() is called, not when the method is defined.
    additional_inputs = [
        ToolInput(
            "range",
            String(optional=True),
            prefix="-R",
            doc="Specifies the refseq:start-end range",
        ),
        ToolInput(
            "maxBases",
            Int(optional=True),
            prefix="-b",
            default=5000000,
            doc="Max. bases per slice, default 5000000.",
        ),
        ToolInput(
            "maxSequences",
            Int(optional=True),
            prefix="-s",
            default=10000,
            doc="Sequences per slice, default 10000.",
        ),
        ToolInput(
            "maxSlicesPerContainer",
            Int(optional=True),
            prefix="-S",
            default=1,
            doc="Slices per container, default 1.",
        ),
        # NOTE(review): the tag is spelled "Seuence" (sic). Kept as-is because
        # renaming an input tag would break existing callers.
        ToolInput(
            "embedReferenceSeuence",
            Boolean(optional=True),
            prefix="-e",
            doc="Embed reference sequence.",
        ),
        ToolInput(
            "nonReferenceBaseEncoding",
            Boolean(optional=True),
            prefix="-x",
            doc="Non-reference based encoding.",
        ),
        ToolInput(
            "multipleReferencesPerSlice",
            Boolean(optional=True),
            prefix="-M",
            doc="Use multiple references per slice.",
        ),
        ToolInput(
            "generateTags",
            Boolean(optional=True),
            prefix="-m",
            doc="Generate MD and NM tags.",
        ),
        ToolInput(
            "lzmaCompression",
            Boolean(optional=True),
            prefix="-Z",
            doc="Also compress using lzma",
        ),
        ToolInput(
            "discardReadNames",
            Boolean(optional=True),
            prefix="-n",
            doc="Discard read names where possible.",
        ),
        ToolInput(
            "preserveAuxTags",
            Boolean(optional=True),
            prefix="-P",
            doc="Preserve all aux tags (incl RG,NM,MD).",
        ),
        ToolInput(
            "preserveAuxTagSizes",
            Boolean(optional=True),
            prefix="-p",
            doc="Preserve aux tag sizes ('i', 's', 'c').",
        ),
        ToolInput(
            "noAddPG",
            Boolean(optional=True),
            prefix="-q",
            doc="Don't add scramble @PG header line.",
        ),
        ToolInput(
            "decodeStop",
            Int(optional=True),
            prefix="-N",
            doc="Stop decoding after 'integer' sequences.",
        ),
        # The effective default comes from CpuSelector(), not the literal 1
        # mentioned in the doc text.
        ToolInput(
            "threads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-t",
            doc="Number of threads. (default = 1)",
        ),
        # NOTE(review): declared as Int, but the doc text reads like an on/off
        # switch rather than a value-taking option. Confirm against scramble's
        # usage whether this should be Boolean; left unchanged here so the
        # input's declared type stays stable for callers.
        ToolInput(
            "enableQualityBinning",
            Int(optional=True),
            prefix="-B",
            doc="Enable Illumina 8 quality-binning system (lossy).",
        ),
    ]
Пример #14
0
 def inputs(self):
     return [
         *super().inputs(),
         ToolInput(
             tag="tumorBams",
             input_type=Array(BamBai),
             prefix="-I",
             prefix_applies_to_all_elements=True,
             doc="(--input) BAM/SAM/CRAM file containing reads This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="normalBams",
             input_type=Array(BamBai, optional=True),
             prefix="-I",
             prefix_applies_to_all_elements=True,
             doc="(--input) Extra BAM/SAM/CRAM file containing reads This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="normalSample",
             input_type=String(optional=True),
             prefix="--normal-sample",
             doc="(--normal-sample, if) May be URL-encoded as output by GetSampleName with",
         ),
         ToolInput(
             "outputPrefix",
             String(optional=True),
             doc="Used as a prefix for the outputFilename if not specified, with format: {outputPrefix}.vcf.gz",
             default="generated",
         ),
         ToolInput(
             "outputFilename",
             Filename(prefix=InputSelector("outputPrefix"), extension=".vcf.gz"),
             position=20,
             prefix="-O",
         ),
         ToolInput(
             tag="reference",
             input_type=FastaWithDict(),
             prefix="--reference",
             doc="(-R) Reference sequence file Required.",
         ),
         ToolInput(
             tag="outputBamName",
             # This is not a FileName because otherwise we cant make this optional
             input_type=String(optional=True),
             prefix="-bamout",
             doc="File to which assembled haplotypes should be written",
         ),
         ToolInput(
             tag="activityProfileOut",
             input_type=String(optional=True),
             prefix="--activity-profile-out",
             doc="Default value: null.",
         ),
         ToolInput(
             tag="addOutputSamProgramRecord",
             input_type=Boolean(optional=True),
             prefix="-add-output-sam-program-record",
             doc="(--add-output-sam-program-record)  If true, adds a PG tag to created SAM/BAM/CRAM files.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="addOutputVcfCommandLine",
             input_type=Boolean(optional=True),
             prefix="-add-output-vcf-command-line",
             doc="(--add-output-vcf-command-line)  If true, adds a command line header line to created VCF files.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="afOfAllelesNotInResource",
             input_type=String(optional=True),
             prefix="--af-of-alleles-not-in-resource",
             doc="(-default-af)  Population allele fraction assigned to alleles not found in germline resource.  Please see docs/mutect/mutect2.pdf fora derivation of the default value.  Default value: -1.0. ",
         ),
         ToolInput(
             tag="alleles",
             input_type=String(optional=True),
             prefix="--alleles",
             doc="The set of alleles for which to force genotyping regardless of evidence Default value: null. ",
         ),
         ToolInput(
             tag="annotation",
             input_type=String(optional=True),
             prefix="--annotation",
             doc="(-A) One or more specific annotations to add to variant calls This argument may be specified 0 or more times. Default value: null. Possible Values: {AlleleFraction, AS_BaseQualityRankSumTest, AS_FisherStrand, AS_InbreedingCoeff, AS_MappingQualityRankSumTest, AS_QualByDepth, AS_ReadPosRankSumTest, AS_RMSMappingQuality, AS_StrandOddsRatio, BaseQuality, BaseQualityRankSumTest, ChromosomeCounts, ClippingRankSumTest, CountNs, Coverage, DepthPerAlleleBySample, DepthPerSampleHC, ExcessHet, FisherStrand, FragmentLength, GenotypeSummaries, InbreedingCoeff, LikelihoodRankSumTest, MappingQuality, MappingQualityRankSumTest, MappingQualityZero, OrientationBiasReadCounts, OriginalAlignment, PossibleDeNovo, QualByDepth, ReadPosition, ReadPosRankSumTest, ReferenceBases, RMSMappingQuality, SampleList, StrandBiasBySample, StrandOddsRatio, TandemRepeat, UniqueAltReadCount}",
         ),
         ToolInput(
             tag="annotationGroup",
             input_type=String(optional=True),
             prefix="--annotation-group",
             doc="(-G) One or more groups of annotations to apply to variant calls This argument may be specified 0 or more times. Default value: null. Possible Values: {AS_StandardAnnotation, ReducibleAnnotation, StandardAnnotation, StandardHCAnnotation, StandardMutectAnnotation}",
         ),
         ToolInput(
             tag="annotationsToExclude",
             input_type=String(optional=True),
             prefix="--annotations-to-exclude",
             doc="(-AX)  One or more specific annotations to exclude from variant calls  This argument may be specified 0 or more times. Default value: null. Possible Values: {BaseQuality, Coverage, DepthPerAlleleBySample, DepthPerSampleHC, FragmentLength, MappingQuality, OrientationBiasReadCounts, ReadPosition, StrandBiasBySample, TandemRepeat}",
         ),
         ToolInput(
             tag="arguments_file",
             input_type=File(optional=True),
             prefix="--arguments_file",
             doc="read one or more arguments files and add them to the command line This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="assemblyRegionOut",
             input_type=String(optional=True),
             prefix="--assembly-region-out",
             doc="Output the assembly region to this IGV formatted file Default value: null.",
         ),
         ToolInput(
             tag="baseQualityScoreThreshold",
             input_type=Int(optional=True),
             prefix="--base-quality-score-threshold",
             doc=" Base qualities below this threshold will be reduced to the minimum (6)  Default value: 18.",
         ),
         ToolInput(
             tag="callableDepth",
             input_type=Int(optional=True),
             prefix="--callable-depth",
             doc="Minimum depth to be considered callable for Mutect stats. Does not affect genotyping. Default value: 10. ",
         ),
         ToolInput(
             tag="cloudIndexPrefetchBuffer",
             input_type=Int(optional=True),
             prefix="--cloud-index-prefetch-buffer",
             doc="(-CIPB)  Size of the cloud-only prefetch buffer (in MB; 0 to disable). Defaults to cloudPrefetchBuffer if unset.  Default value: -1. ",
         ),
         ToolInput(
             tag="cloudPrefetchBuffer",
             input_type=Int(optional=True),
             prefix="--cloud-prefetch-buffer",
             doc="(-CPB)  Size of the cloud-only prefetch buffer (in MB; 0 to disable).  Default value: 40. ",
         ),
         ToolInput(
             tag="createOutputBamIndex",
             input_type=Boolean(optional=True),
             prefix="--create-output-bam-index",
             doc="(-OBI)  If true, create a BAM/CRAM index when writing a coordinate-sorted BAM/CRAM file.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="createOutputBamMd5",
             input_type=Boolean(optional=True),
             prefix="--create-output-bam-md5",
             doc="(-OBM)  If true, create a MD5 digest for any BAM/SAM/CRAM file created  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="createOutputVariantIndex",
             input_type=Boolean(optional=True),
             prefix="--create-output-variant-index",
             doc="(-OVI)  If true, create a VCF index when writing a coordinate-sorted VCF file.  Default value: true. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="createOutputVariantMd5",
             input_type=Boolean(optional=True),
             prefix="--create-output-variant-md5",
             doc="(-OVM)  If true, create a a MD5 digest any VCF file created.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableBamIndexCaching",
             input_type=Boolean(optional=True),
             prefix="--disable-bam-index-caching",
             doc="(-DBIC)  If true, don't cache bam indexes, this will reduce memory requirements but may harm performance if many intervals are specified.  Caching is automatically disabled if there are no intervals specified.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableReadFilter",
             input_type=Boolean(optional=True),
             prefix="--disable-read-filter",
             doc="(-DF)  Read filters to be disabled before analysis  This argument may be specified 0 or more times. Default value: null. Possible Values: {GoodCigarReadFilter, MappedReadFilter, MappingQualityAvailableReadFilter, MappingQualityNotZeroReadFilter, MappingQualityReadFilter, NonChimericOriginalAlignmentReadFilter, NonZeroReferenceLengthAlignmentReadFilter, NotDuplicateReadFilter, NotSecondaryAlignmentReadFilter, PassesVendorQualityCheckReadFilter, ReadLengthReadFilter, WellformedReadFilter}",
         ),
         ToolInput(
             tag="disableSequenceDictionaryValidation",
             input_type=Boolean(optional=True),
             prefix="-disable-sequence-dictionary-validation",
             doc="(--disable-sequence-dictionary-validation)  If specified, do not check the sequence dictionaries from our inputs for compatibility. Use at your own risk!  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="downsamplingStride",
             input_type=Int(optional=True),
             prefix="--downsampling-stride",
             doc="(-stride)  Downsample a pool of reads starting within a range of one or more bases.  Default value: 1. ",
         ),
         ToolInput(
             tag="excludeIntervals",
             input_type=Boolean(optional=True),
             prefix="--exclude-intervals",
             doc="(-XLOne) This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="f1r2MaxDepth",
             input_type=Int(optional=True),
             prefix="--f1r2-max-depth",
             doc="sites with depth higher than this value will be grouped Default value: 200.",
         ),
         ToolInput(
             tag="f1r2MedianMq",
             input_type=Int(optional=True),
             prefix="--f1r2-median-mq",
             doc="skip sites with median mapping quality below this value Default value: 50.",
         ),
         ToolInput(
             tag="f1r2MinBq",
             input_type=Int(optional=True),
             prefix="--f1r2-min-bq",
             doc="exclude bases below this quality from pileup Default value: 20.",
         ),
         ToolInput(
             tag="f1r2TarGz_outputFilename",
             input_type=Filename(extension=".tar.gz"),
             prefix="--f1r2-tar-gz",
             doc="If specified, collect F1R2 counts and output files into this tar.gz file Default value: null. ",
         ),
         ToolInput(
             tag="founderId",
             input_type=String(optional=True),
             prefix="-founder-id",
             doc="(--founder-id)  Samples representing the population founders This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="gatkConfigFile",
             input_type=String(optional=True),
             prefix="--gatk-config-file",
             doc="A configuration file to use with the GATK. Default value: null.",
         ),
         ToolInput(
             tag="gcsRetries",
             input_type=Int(optional=True),
             prefix="-gcs-retries",
             doc="(--gcs-max-retries)  If the GCS bucket channel errors out, how many times it will attempt to re-initiate the connection  Default value: 20. ",
         ),
         ToolInput(
             tag="gcsProjectForRequesterPays",
             input_type=String(optional=True),
             prefix="--gcs-project-for-requester-pays",
             doc=" Project to bill when accessing requester pays buckets. If unset, these buckets cannot be accessed.  Default value: . ",
         ),
         ToolInput(
             tag="genotypeGermlineSites",
             input_type=Boolean(optional=True),
             prefix="--genotype-germline-sites",
             doc=" (EXPERIMENTAL) Call all apparent germline site even though they will ultimately be filtered.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="genotypePonSites",
             input_type=Boolean(optional=True),
             prefix="--genotype-pon-sites",
             doc="Call sites in the PoN even though they will ultimately be filtered. Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="germlineResource",
             input_type=VcfTabix(optional=True),
             prefix="--germline-resource",
             doc=" Population vcf of germline sequencing containing allele fractions.  Default value: null. ",
         ),
         ToolInput(
             tag="graph",
             input_type=String(optional=True),
             prefix="-graph",
             doc="(--graph-output) Write debug assembly graph information to this file Default value: null.",
         ),
         ToolInput(
             tag="help",
             input_type=Boolean(optional=True),
             prefix="-h",
             doc="(--help) display the help message Default value: false. Possible values: {true, false}",
         ),
         ToolInput(
             tag="ignoreItrArtifacts",
             input_type=String(optional=True),
             prefix="--ignore-itr-artifactsTurn",
             doc=" inverted tandem repeats.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="initialTumorLod",
             input_type=String(optional=True),
             prefix="--initial-tumor-lod",
             doc="(-init-lod)  Log 10 odds threshold to consider pileup active.  Default value: 2.0. ",
         ),
         ToolInput(
             tag="intervalExclusionPadding",
             input_type=String(optional=True),
             prefix="--interval-exclusion-padding",
             doc="(-ixp)  Amount of padding (in bp) to add to each interval you are excluding.  Default value: 0. ",
         ),
         ToolInput(
             tag="imr",
             input_type=String(optional=True),
             prefix="--interval-merging-rule",
             doc="(--interval-merging-rule)  Interval merging rule for abutting intervals  Default value: ALL. Possible values: {ALL, OVERLAPPING_ONLY} ",
         ),
         ToolInput(
             tag="ip",
             input_type=String(optional=True),
             prefix="-ipAmount",
             doc="(--interval-padding) Default value: 0.",
         ),
         ToolInput(
             tag="isr",
             input_type=String(optional=True),
             prefix="--interval-set-rule",
             doc="(--interval-set-rule)  Set merging approach to use for combining interval inputs  Default value: UNION. Possible values: {UNION, INTERSECTION} ",
         ),
         ToolInput(
             tag="intervals",
             input_type=Bed(optional=True),
             prefix="--intervals",
             doc="(-L) One or more genomic intervals over which to operate This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="le",
             input_type=Boolean(optional=True),
             prefix="-LE",
             doc="(--lenient) Lenient processing of VCF files Default value: false. Possible values: {true, false}",
         ),
         ToolInput(
             tag="maxPopulationAf",
             input_type=String(optional=True),
             prefix="--max-population-af",
             doc="(-max-af)  Maximum population allele frequency in tumor-only mode.  Default value: 0.01. ",
         ),
         ToolInput(
             tag="maxReadsPerAlignmentStart",
             input_type=Int(optional=True),
             prefix="--max-reads-per-alignment-start",
             doc=" Maximum number of reads to retain per alignment start position. Reads above this threshold will be downsampled. Set to 0 to disable.  Default value: 50. ",
         ),
         ToolInput(
             tag="minBaseQualityScore",
             input_type=String(optional=True),
             prefix="--min-base-quality-score",
             doc="(-mbq:Byte)  Minimum base quality required to consider a base for calling  Default value: 10. ",
         ),
         ToolInput(
             tag="mitochondriaMode",
             input_type=Boolean(optional=True),
             prefix="--mitochondria-mode",
             doc="Mitochondria mode sets emission and initial LODs to 0. Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="nativePairHmmThreads",
             input_type=Int(optional=True),
             prefix="--native-pair-hmm-threads",
             default=CpuSelector(),
             doc=" How many threads should a native pairHMM implementation use  Default value: 4. ",
         ),
         ToolInput(
             tag="nativePairHmmUseDoublePrecision",
             input_type=Boolean(optional=True),
             prefix="--native-pair-hmm-use-double-precision",
             doc=" use double precision in the native pairHmm. This is slower but matches the java implementation better  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="normalLod",
             input_type=Double(optional=True),
             prefix="--normal-lod",
             doc="Log 10 odds threshold for calling normal variant non-germline. Default value: 2.2.",
         ),
         ToolInput(
             tag="encode",
             input_type=String(optional=True),
             prefix="-encode",
             doc="This argument may be specified 0 or more times. Default value: null.",
         ),
         ToolInput(
             tag="panelOfNormals",
             input_type=VcfTabix(optional=True),
             prefix="--panel-of-normals",
             doc="(--panel-of-normals)  VCF file of sites observed in normal.  Default value: null. ",
         ),
         ToolInput(
             tag="pcrIndelQual",
             input_type=Int(optional=True),
             prefix="--pcr-indel-qual",
             doc="Phred-scaled PCR SNV qual for overlapping fragments Default value: 40.",
         ),
         ToolInput(
             tag="pcrSnvQual",
             input_type=Int(optional=True),
             prefix="--pcr-snv-qual",
             doc="Phred-scaled PCR SNV qual for overlapping fragments Default value: 40.",
         ),
         ToolInput(
             tag="pedigree",
             input_type=String(optional=True),
             prefix="--pedigree",
             doc="(-ped) Pedigree file for determining the population founders. Default value: null.",
         ),
         ToolInput(
             tag="quiet",
             input_type=Boolean(optional=True),
             prefix="--QUIET",
             doc="Whether to suppress job-summary info on System.err. Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="readFilter",
             input_type=String(optional=True),
             prefix="--read-filter",
             doc="(-RF) Read filters to be applied before analysis This argument may be specified 0 or more times. Default value: null. Possible Values: {AlignmentAgreesWithHeaderReadFilter, AllowAllReadsReadFilter, AmbiguousBaseReadFilter, CigarContainsNoNOperator, FirstOfPairReadFilter, FragmentLengthReadFilter, GoodCigarReadFilter, HasReadGroupReadFilter, IntervalOverlapReadFilter, LibraryReadFilter, MappedReadFilter, MappingQualityAvailableReadFilter, MappingQualityNotZeroReadFilter, MappingQualityReadFilter, MatchingBasesAndQualsReadFilter, MateDifferentStrandReadFilter, MateOnSameContigOrNoMappedMateReadFilter, MateUnmappedAndUnmappedReadFilter, MetricsReadFilter, NonChimericOriginalAlignmentReadFilter, NonZeroFragmentLengthReadFilter, NonZeroReferenceLengthAlignmentReadFilter, NotDuplicateReadFilter, NotOpticalDuplicateReadFilter, NotSecondaryAlignmentReadFilter, NotSupplementaryAlignmentReadFilter, OverclippedReadFilter, PairedReadFilter, PassesVendorQualityCheckReadFilter, PlatformReadFilter, PlatformUnitReadFilter, PrimaryLineReadFilter, ProperlyPairedReadFilter, ReadGroupBlackListReadFilter, ReadGroupReadFilter, ReadLengthEqualsCigarLengthReadFilter, ReadLengthReadFilter, ReadNameReadFilter, ReadStrandFilter, SampleReadFilter, SecondOfPairReadFilter, SeqIsStoredReadFilter, ValidAlignmentEndReadFilter, ValidAlignmentStartReadFilter, WellformedReadFilter}",
         ),
         ToolInput(
             tag="readIndex",
             input_type=String(optional=True),
             prefix="-read-index",
             doc="(--read-index)  Indices to use for the read inputs. If specified, an index must be provided for every read input and in the same order as the read inputs. If this argument is not specified, the path to the index for each input will be inferred automatically.  This argument may be specified 0 or more times. Default value: null. ",
         ),
         ToolInput(
             tag="readValidationStringency",
             input_type=String(optional=True),
             prefix="--read-validation-stringency",
             doc="(-VS:ValidationStringency)  Validation stringency for all SAM/BAM/CRAM/SRA files read by this program.  The default stringency value SILENT can improve performance when processing a BAM file in which variable-length data (read, qualities, tags) do not otherwise need to be decoded.  Default value: SILENT. Possible values: {STRICT, LENIENT, SILENT} ",
         ),
         ToolInput(
             tag="secondsBetweenProgressUpdates",
             input_type=Double(optional=True),
             prefix="-seconds-between-progress-updates",
             doc="(--seconds-between-progress-updates)  Output traversal statistics every time this many seconds elapse  Default value: 10.0. ",
         ),
         ToolInput(
             tag="sequenceDictionary",
             input_type=String(optional=True),
             prefix="-sequence-dictionary",
             doc="(--sequence-dictionary)  Use the given sequence dictionary as the master/canonical sequence dictionary.  Must be a .dict file.  Default value: null. ",
         ),
         ToolInput(
             tag="sitesOnlyVcfOutput",
             input_type=Boolean(optional=True),
             prefix="--sites-only-vcf-output",
             doc=" If true, don't emit genotype fields when writing vcf file output.  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="tmpDir",
             input_type=String(optional=True),
             prefix="--tmp-dir",
             doc="Temp directory to use. Default value: null.",
         ),
         ToolInput(
             tag="tumorLodToEmit",
             input_type=String(optional=True),
             prefix="--tumor-lod-to-emit",
             doc="(-emit-lod)  Log 10 odds threshold to emit variant to VCF.  Default value: 3.0. ",
         ),
         ToolInput(
             tag="tumor",
             input_type=String(optional=True),
             prefix="-tumor",
             doc="(--tumor-sample) BAM sample name of tumor. May be URL-encoded as output by GetSampleName with -encode argument.  Default value: null. ",
         ),
         ToolInput(
             tag="jdkDeflater",
             input_type=Boolean(optional=True),
             prefix="-jdk-deflater",
             doc="(--use-jdk-deflater)  Whether to use the JdkDeflater (as opposed to IntelDeflater)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="jdkInflater",
             input_type=Boolean(optional=True),
             prefix="-jdk-inflater",
             doc="(--use-jdk-inflater)  Whether to use the JdkInflater (as opposed to IntelInflater)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="verbosity",
             input_type=String(optional=True),
             prefix="-verbosity",
             doc="(--verbosity)  Control verbosity of logging.  Default value: INFO. Possible values: {ERROR, WARNING, INFO, DEBUG} ",
         ),
         ToolInput(
             tag="version",
             input_type=Boolean(optional=True),
             prefix="--version",
             doc="display the version number for this tool Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="activeProbabilityThreshold",
             input_type=Double(optional=True),
             prefix="--active-probability-threshold",
             doc=" Minimum probability for a locus to be considered active.  Default value: 0.002. ",
         ),
         ToolInput(
             tag="adaptivePruningInitialErrorRate",
             input_type=Double(optional=True),
             prefix="--adaptive-pruning-initial-error-rate",
             doc=" Initial base error rate estimate for adaptive pruning  Default value: 0.001. ",
         ),
         ToolInput(
             tag="allowNonUniqueKmersInRef",
             input_type=Boolean(optional=True),
             prefix="--allow-non-unique-kmers-in-ref",
             doc=" Allow graphs that have non-unique kmers in the reference  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="assemblyRegionPadding",
             input_type=Int(optional=True),
             prefix="--assembly-region-padding",
             doc=" Number of additional bases of context to include around each assembly region  Default value: 100. ",
         ),
         ToolInput(
             tag="bamWriterType",
             input_type=String(optional=True),
             prefix="--bam-writer-type",
             doc="Which haplotypes should be written to the BAM Default value: CALLED_HAPLOTYPES. Possible values: {ALL_POSSIBLE_HAPLOTYPES, CALLED_HAPLOTYPES} ",
         ),
         ToolInput(
             tag="debugAssembly",
             input_type=String(optional=True),
             prefix="--debug-assembly",
             doc="(-debug)  Print out verbose debug information about each assembly region  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableAdaptivePruning",
             input_type=Boolean(optional=True),
             prefix="--disable-adaptive-pruning",
             doc=" Disable the adaptive algorithm for pruning paths in the graph  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="disableToolDefaultAnnotations",
             input_type=Boolean(optional=True),
             prefix="-disable-tool-default-annotations",
             doc="(--disable-tool-default-annotations)  Disable all tool default annotations  Default value: false. Possible values: {true, false}",
         ),
         ToolInput(
             tag="disableToolDefaultReadFilters",
             input_type=Boolean(optional=True),
             prefix="-disable-tool-default-read-filters",
             doc="(--disable-tool-default-read-filters)  Disable all tool default read filters (WARNING: many tools will not function correctly without their default read filters on)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="dontIncreaseKmerSizesForCycles",
             input_type=Boolean(optional=True),
             prefix="--dont-increase-kmer-sizes-for-cycles",
             doc=" Disable iterating over kmer sizes when graph cycles are detected  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="dontTrimActiveRegions",
             input_type=Boolean(optional=True),
             prefix="--dont-trim-active-regions",
             doc=" If specified, we will not trim down the active region from the full region (active + extension) to just the active interval for genotyping  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="dontUseSoftClippedBases",
             input_type=Boolean(optional=True),
             prefix="--dont-use-soft-clipped-bases",
             doc=" Do not analyze soft clipped bases in the reads  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="erc",
             input_type=String(optional=True),
             prefix="-ERC",
             doc="(--emit-ref-confidence)  (BETA feature) Mode for emitting reference confidence scores  Default value: NONE. Possible values: {NONE, BP_RESOLUTION, GVCF} ",
         ),
         ToolInput(
             tag="enableAllAnnotations",
             input_type=Boolean(optional=True),
             prefix="--enable-all-annotations",
             doc=" Use all possible annotations (not for the faint of heart)  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="forceActive",
             input_type=Boolean(optional=True),
             prefix="--force-active",
             doc="If provided, all regions will be marked as active Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="genotypeFilteredAlleles",
             input_type=Boolean(optional=True),
             prefix="--genotype-filtered-alleles",
             doc=" Whether to force genotype even filtered alleles  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="gvcfLodBand",
             input_type=String(optional=True),
             prefix="--gvcf-lod-band",
             doc="(-LODB) Exclusive upper bounds for reference confidence LOD bands (must be specified in increasing order)  This argument may be specified 0 or more times. Default value: [-2.5, -2.0, -1.5,",
         ),
         ToolInput(
             tag="kmerSize",
             input_type=Int(optional=True),
             prefix="--kmer-size",
             doc="Kmer size to use in the read threading assembler This argument may be specified 0 or more times. Default value: [10, 25]. ",
         ),
         ToolInput(
             tag="maxAssemblyRegionSize",
             input_type=Int(optional=True),
             prefix="--max-assembly-region-size",
             doc=" Maximum size of an assembly region  Default value: 300. ",
         ),
         ToolInput(
             tag="mnpDist",
             input_type=Int(optional=True),
             prefix="-mnp-dist",
             doc="(--max-mnp-distance)  Two or more phased substitutions separated by this distance or less are merged into MNPs.  Default value: 1. ",
         ),
         ToolInput(
             tag="maxNumHaplotypesInPopulation",
             input_type=Int(optional=True),
             prefix="--max-num-haplotypes-in-population",
             doc=" Maximum number of haplotypes to consider for your population  Default value: 128. ",
         ),
         ToolInput(
             tag="maxProbPropagationDistance",
             input_type=Int(optional=True),
             prefix="--max-prob-propagation-distance",
             doc=" Upper limit on how many bases away probability mass can be moved around when calculating the boundaries between active and inactive assembly regions  Default value: 50. ",
         ),
         ToolInput(
             tag="maxSuspiciousReadsPerAlignmentStart",
             input_type=Int(optional=True),
             prefix="--max-suspicious-reads-per-alignment-start",
             doc=" Maximum number of suspicious reads (mediocre mapping quality or too many substitutions) allowed in a downsampling stride.  Set to 0 to disable.  Default value: 0. ",
         ),
         ToolInput(
             tag="maxUnprunedVariants",
             input_type=Int(optional=True),
             prefix="--max-unpruned-variants",
             doc=" Maximum number of variants in graph the adaptive pruner will allow  Default value: 100. ",
         ),
         ToolInput(
             tag="minAssemblyRegionSize",
             input_type=Int(optional=True),
             prefix="--min-assembly-region-size",
             doc=" Minimum size of an assembly region  Default value: 50. ",
         ),
         ToolInput(
             tag="minDanglingBranchLength",
             input_type=Int(optional=True),
             prefix="--min-dangling-branch-length",
             doc=" Minimum length of a dangling branch to attempt recovery  Default value: 4. ",
         ),
         ToolInput(
             tag="minPruning",
             input_type=Int(optional=True),
             prefix="--min-pruning",
             doc="Minimum support to not prune paths in the graph Default value: 2.",
         ),
         ToolInput(
             tag="minimumAlleleFraction",
             input_type=Float(optional=True),
             prefix="--minimum-allele-fraction",
             doc="(-min-AF)  Lower bound of variant allele fractions to consider when calculating variant LOD  Default value: 0.0. ",
         ),
         ToolInput(
             tag="numPruningSamples",
             input_type=Int(optional=True),
             prefix="--num-pruning-samples",
             doc="Default value: 1.",
         ),
         ToolInput(
             tag="pairHmmGapContinuationPenalty",
             input_type=Int(optional=True),
             prefix="--pair-hmm-gap-continuation-penalty",
             doc=" Flat gap continuation penalty for use in the Pair HMM  Default value: 10. ",
         ),
         ToolInput(
             tag="pairhmm",
             input_type=String(optional=True),
             prefix="-pairHMM",
             doc="(--pair-hmm-implementation)  The PairHMM implementation to use for genotype likelihood calculations  Default value: FASTEST_AVAILABLE. Possible values: {EXACT, ORIGINAL, LOGLESS_CACHING, AVX_LOGLESS_CACHING, AVX_LOGLESS_CACHING_OMP, EXPERIMENTAL_FPGA_LOGLESS_CACHING, FASTEST_AVAILABLE} ",
         ),
         ToolInput(
             tag="pcrIndelModel",
             input_type=String(optional=True),
             prefix="--pcr-indel-model",
             doc=" The PCR indel model to use  Default value: CONSERVATIVE. Possible values: {NONE, HOSTILE, AGGRESSIVE, CONSERVATIVE} ",
         ),
         ToolInput(
             tag="phredScaledGlobalReadMismappingRate",
             input_type=Int(optional=True),
             prefix="--phred-scaled-global-read-mismapping-rate",
             doc=" The global assumed mismapping rate for reads  Default value: 45. ",
         ),
         ToolInput(
             tag="pruningLodThreshold",
             input_type=Float(optional=True),
             prefix="--pruning-lod-thresholdLn",
             doc="Default value: 2.302585092994046. ",
         ),
         ToolInput(
             tag="recoverAllDanglingBranches",
             input_type=Boolean(optional=True),
             prefix="--recover-all-dangling-branches",
             doc=" Recover all dangling branches  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="showhidden",
             input_type=Boolean(optional=True),
             prefix="-showHidden",
             doc="(--showHidden)  display hidden arguments  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="smithWaterman",
             input_type=String(optional=True),
             prefix="--smith-waterman",
             doc=" Which Smith-Waterman implementation to use, generally FASTEST_AVAILABLE is the right choice  Default value: JAVA. Possible values: {FASTEST_AVAILABLE, AVX_ENABLED, JAVA} ",
         ),
         ToolInput(
             tag="ambigFilterBases",
             input_type=Int(optional=True),
             prefix="--ambig-filter-bases",
             doc="Threshold number of ambiguous bases. If null, uses threshold fraction; otherwise, overrides threshold fraction.  Default value: null.  Cannot be used in conjuction with argument(s) maxAmbiguousBaseFraction",
         ),
         ToolInput(
             tag="ambigFilterFrac",
             input_type=Double(optional=True),
             prefix="--ambig-filter-frac",
             doc="Threshold fraction of ambiguous bases Default value: 0.05. Cannot be used in conjuction with argument(s) maxAmbiguousBases",
         ),
         ToolInput(
             tag="maxFragmentLength",
             input_type=Int(optional=True),
             prefix="--max-fragment-length",
             doc="Default value: 1000000.",
         ),
         ToolInput(
             tag="minFragmentLength",
             input_type=Int(optional=True),
             prefix="--min-fragment-length",
             doc="Default value: 0.",
         ),
         ToolInput(
             tag="keepIntervals",
             input_type=String(optional=True),
             prefix="--keep-intervals",
             doc="One or more genomic intervals to keep This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="library",
             input_type=String(optional=True),
             prefix="-library",
             doc="(--library) Name of the library to keep This argument must be specified at least once. Required.",
         ),
         ToolInput(
             tag="maximumMappingQuality",
             input_type=Int(optional=True),
             prefix="--maximum-mapping-quality",
             doc=" Maximum mapping quality to keep (inclusive)  Default value: null. ",
         ),
         ToolInput(
             tag="minimumMappingQuality",
             input_type=Int(optional=True),
             prefix="--minimum-mapping-quality",
             doc=" Minimum mapping quality to keep (inclusive)  Default value: 20. ",
         ),
         ToolInput(
             tag="dontRequireSoftClipsBothEnds",
             input_type=Boolean(optional=True),
             prefix="--dont-require-soft-clips-both-ends",
             doc=" Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must have a soft-clipped block, setting this flag requires only 1 soft-clipped block  Default value: false. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="filterTooShort",
             input_type=Int(optional=True),
             prefix="--filter-too-short",
             doc="Minimum number of aligned bases Default value: 30.",
         ),
         ToolInput(
             tag="platformFilterName",
             input_type=String(optional=True),
             prefix="--platform-filter-name",
             doc="This argument must be specified at least once. Required.",
         ),
         ToolInput(
             tag="blackListedLanes",
             input_type=String(optional=True),
             prefix="--black-listed-lanes",
             doc="Platform unit (PU) to filter out This argument must be specified at least once. Required.",
         ),
         ToolInput(
             tag="readGroupBlackList",
             input_type=String(optional=True),
             prefix="--read-group-black-listThe",
             doc="This argument must be specified at least once. Required. ",
         ),
         ToolInput(
             tag="keepReadGroup",
             input_type=String(optional=True),
             prefix="--keep-read-group",
             doc="The name of the read group to keep Required.",
         ),
         ToolInput(
             tag="maxReadLength",
             input_type=Int(optional=True),
             prefix="--max-read-length",
             doc="Keep only reads with length at most equal to the specified value Default value: 2147483647. ",
         ),
         ToolInput(
             tag="minReadLength",
             input_type=Int(optional=True),
             prefix="--min-read-length",
             doc="Keep only reads with length at least equal to the specified value Default value: 30.",
         ),
         ToolInput(
             tag="readName",
             input_type=String(optional=True),
             prefix="--read-name",
             doc="Keep only reads with this read name Required.",
         ),
         ToolInput(
             tag="keepReverseStrandOnly",
             input_type=Boolean(optional=True),
             prefix="--keep-reverse-strand-only",
             doc=" Keep only reads on the reverse strand  Required. Possible values: {true, false} ",
         ),
         ToolInput(
             tag="sample",
             input_type=String(optional=True),
             prefix="-sample",
             doc="(--sample) The name of the sample(s) to keep, filtering out all others This argument must be specified at least once. Required. ",
         ),
     ]
Пример #15
0
class piscesStitcherBase(IlluminaToolBase):
    """Tool wrapper for the Pisces Stitcher (``Stitcher.dll`` run via ``dotnet``).

    The command line is assembled entirely from ``arguments()`` (positions 1-3)
    followed by the inputs declared in ``inputs()`` (positions 4-5); there is no
    base command.
    """

    def tool(self) -> str:
        """Return the tool identifier."""
        return "piscesStitcher"

    def friendly_name(self) -> str:
        """Return the human-readable tool name."""
        return "Pisces: Stitcher"

    def bind_metadata(self):
        """Return the ``ToolMetadata`` describing this wrapper."""
        from datetime import date

        return ToolMetadata(
            contributors=["Miriam M Yeung"],
            dateCreated=date(2021, 8, 19),
            dateUpdated=date(2021, 10, 12),
            institution="Illumina",
            doi=None,
            citation="",
            keywords=["Illumina", "Pisces", "Stitcher"],
            documentationUrl="",
            documentation="Stitches together overlapping read pairs.",
        )

    def cpus(self, hints: Dict[str, Any]):
        """Return the core count resolved from ``hints``, or 4 if none applies."""
        val = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        if val:
            return val
        # No (truthy) hint-derived value: fall back to the fixed default.
        return 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory resolved from ``hints``, or 32 if none applies.

        NOTE(review): the unit is presumably GB -- confirm against the base class.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        # No (truthy) hint-derived value: fall back to the fixed default.
        return 32

    def base_command(self):
        """Return ``None``; the full command is built in ``arguments()``."""
        return None

    def arguments(self):
        """Return the fixed leading arguments of the command line.

        These set ``TMPDIR``, invoke ``dotnet`` and select the ``Stitcher.dll``
        that lives under the directory named after the ``piscesVersion`` input.
        """
        return [
            ToolArgument("export TMPDIR=/tmp;", position=1, shell_quote=False),
            ToolArgument("dotnet", position=2, shell_quote=False),
            ToolArgument(
                StringFormatter(
                    "/app/Pisces_v{PISCES_VERSION}/Stitcher.dll",
                    PISCES_VERSION=InputSelector("piscesVersion"),
                ),
                position=3,
                shell_quote=False,
            ),
        ]

    def inputs(self) -> List[ToolInput]:
        """Return the required inputs followed by ``additional_stitcher_args``."""
        return [
            # Only used to build the Stitcher.dll path in arguments(); it has no
            # prefix or position of its own.
            ToolInput("piscesVersion", String()),
            ToolInput(
                "inputBam",
                Bam(),
                prefix="--bam",
                position=4,
                shell_quote=False,
                doc="Input Bam to Stitch",
            ),
            ToolInput(
                "outputDir",
                String(),
                prefix="--outfolder",
                position=4,
                shell_quote=False,
                doc="Output file directory",
            ),
            # Only used as the prefix of the generated log filename (see the
            # "logFilename" input below).
            ToolInput(
                "sampleName", String(), doc="Sample name for naming outputs"
            ),
            *self.additional_stitcher_args,
        ]

    def outputs(self) -> List[ToolOutput]:
        """Return the tool outputs: the BAM and the optional options JSON."""
        return [
            ToolOutput("out", Bam(), glob=WildcardSelector("*bam")),
            ToolOutput(
                "used_options",
                File(optional=True),
                glob=WildcardSelector("StitcherLogs/*.json"),
            ),
        ]

    # Optional Stitcher flags, all emitted at position 5 (after the required
    # inputs). Defined at class level and appended to inputs() via ``self``.
    additional_stitcher_args = [
        ToolInput(
            "minimumBaseQuality",
            Int(optional=True),
            prefix="--minbasecallquality",
            position=5,
            shell_quote=False,
            doc="Minimum base quality score when considering stitching conflicts. Bases lower than this threshold are automatically disregarded for the base in its mate. (Default: 20)",
        ),
        ToolInput(
            "minimumMappingQuality",
            Int(optional=True),
            prefix="--minmapquality",
            position=5,
            shell_quote=False,
            doc="Minimum mapping quality required to consider reads for computation. Reads that fail to meet this threshold are filtered out. Value should not be negative. (Defaut: 1)",
        ),
        ToolInput(
            "filterPairLowMappingQuality",
            Boolean(optional=True),
            prefix="--filterpairlowmapq",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether read pairs should be filtered when one or both reads are below the minimum mapping quality. (Default: true)",
        ),
        ToolInput(
            "filterPairUnmapped",
            Boolean(optional=True),
            prefix="--filterpairunmapped",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether read pairs should be filtered if one or both reads are not mapped. (Default: false)",
        ),
        ToolInput(
            "filterDuplicates",
            Boolean(optional=True),
            prefix="--filterduplicates",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether duplicate reads should be filtered. (Default: true)",
        ),
        ToolInput(
            "filterProperPairs",
            Boolean(optional=True),
            prefix="--filterforproperpairs",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether only properly paired reads should be considered. (Default: false)",
        ),
        ToolInput(
            "filterUnstitchablePairs",
            Boolean(optional=True),
            prefix="--filterunstitchablepairs",
            position=5,
            shell_quote=False,
            doc="Boolean indicating if read pairs with incompatiable CIGAR strings will be filtered. (Default: false)",
        ),
        ToolInput(
            "convertUnstitchable2N",
            Boolean(optional=True),
            # Fixed: the flag was previously misspelled with a doubled "y".
            prefix="--nifyunstitchablepairs",
            position=5,
            shell_quote=False,
            doc="Boolean indicating if reads with incompartiable CIGAR strings should be converted to Ns. (Default: true)",
        ),
        ToolInput(
            "stitchGappedPairs",
            Boolean(optional=True),
            # Fixed: the flag was previously misspelled "...paris".
            prefix="--stitchgappedpairs",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether read pairs with no overlap will be conceptually 'stitched', this will contribute to variant phasing downstream. (Default: false)",
        ),
        ToolInput(
            "useSoftclippedBases",
            Boolean(optional=True),
            prefix="--usesoftclippedbases",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether softclipped bases should contribute to stitching. (Default: true)",
        ),
        ToolInput(
            "identifyDuplicates",
            Boolean(optional=True),
            prefix="--identifyduplicates",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether to check for duplicates based on alignment position and sequence, rather than trust the flags. (Default: false)",
        ),
        ToolInput(
            "convertDisagreement2N",
            Boolean(optional=True),
            # NOTE(review): confirm this flag's exact spelling against the
            # Stitcher option list; left unchanged here.
            prefix="--nifydisagreement",
            position=5,
            shell_quote=False,
            doc="Boolean to indiciate whether to turn high-quality but disagreeing overlapping bases to Ns. (Default: false)",
        ),
        ToolInput(
            "debug",
            Boolean(optional=True),
            prefix="--debug",
            position=5,
            shell_quote=False,
            doc="Debug mode",
        ),
        ToolInput(
            "logFilename",
            # Generated name built from the sampleName input plus
            # ".stitcher" and the ".log" extension.
            Filename(
                prefix=InputSelector("sampleName"),
                suffix=".stitcher",
                extension=".log",
            ),
            prefix="--logfilename",
            position=5,
            shell_quote=False,
            doc="name for stitcher log file. (Default: StitcherLog.txt)",
        ),
        ToolInput(
            "verbose",
            Boolean(optional=True),
            prefix="--debugsummary",
            position=5,
            shell_quote=False,
            doc="Boolean inidicating if debugging is in verbose mode. (Default: false)",
        ),
        ToolInput(
            "threadByChromosomes",
            Boolean(optional=True),
            prefix="--threadbychr",
            position=5,
            shell_quote=False,
            doc="Boolean to indicate whether to thread by chromosome [BETA only]. (Default: False)",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            # Defaults to the CPU selector rather than a literal value.
            default=CpuSelector(),
            prefix="--numthreads",
            position=5,
            shell_quote=False,
            doc="Number of threads. (Default: 1)",
        ),
        ToolInput(
            "sortMemoryGb",
            Float(optional=True),
            prefix="--sortmemorygb",
            position=5,
            shell_quote=False,
            doc="Max Memory to use to sort the bam. If value is 0, bam will not be sorted. (Default: 0.0)",
        ),
        ToolInput(
            "stitchProbeSoftclips",
            Boolean(optional=True),
            # Fixed: the flag was previously misspelled "--stichprove...",
            # which did not match this input's tag.
            prefix="--stitchprobesoftclips",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether to allow prove softclips that overlapt the mate to contribute to a stitched direction. (Default: false)",
        ),
        ToolInput(
            "doNotStitchRepeatOverlap",
            Boolean(optional=True),
            prefix="--dontstitchrepeatoverlap",
            position=5,
            shell_quote=False,
            doc="Boolean indicating that read pairs that overlap repeat regions should NOT be stitched. (Default: true)",
        ),
        ToolInput(
            "maxReadLength",
            Int(optional=True),
            prefix="--maxreadlength",
            position=5,
            shell_quote=False,
            doc="The maximum expectd lenght of individual reads, used to determine the maximum expected stitched read length (2*readlen -1). For optimal performance set to the actual read length of a single read. (Default: 1024)",
        ),
        ToolInput(
            "ignoreReadsAboveMaxLength",
            Boolean(optional=True),
            prefix="--ignorereadsabovemaxlength",
            position=5,
            shell_quote=False,
            doc="Boolean indicating whether to ignore read pairs that are longer than the max stitched length (i.e. caused by extremely long deletions). (Default: false)",
        ),
    ]
Пример #16
0
class FastQCBase(BioinformaticsTool, ABC):
    """Abstract tool definition for the ``fastqc`` command.

    The command line is built from the ``reads`` input plus every entry in
    ``additional_inputs``. Two outputs are collected by glob: the ``*.zip``
    archives and the ``*/fastqc_data.txt`` files.
    """

    def friendly_name(self) -> str:
        """Return the human-readable tool name."""
        return "FastQC"

    def tool(self):
        """Return the tool identifier."""
        return "fastqc"

    def base_command(self):
        """Return the executable that starts the command line."""
        return "fastqc"

    def tool_provider(self):
        """Return the name of the tool provider."""
        return "FastQC"

    def bind_metadata(self):
        """Return the ToolMetadata (authorship, dates, keywords, docs) for this tool."""
        return ToolMetadata(
            contributors=["Michael Franklin"],
            dateCreated=datetime(2019, 3, 25),
            dateUpdated=datetime(2019, 3, 25),
            institution="Babraham Bioinformatics",
            doi=None,
            citation=None,
            keywords=["fastqc", "quality", "qa"],
            documentationUrl="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/",
            documentation="FastQC is a program designed to spot potential problems in high througput sequencing datasets. "
            "It runs a set of analyses on one or more raw sequence files in fastq or bam format and produces a "
            "report which summarises the results.\n"
            "FastQC will highlight any areas where this library looks unusual and where you should take a closer look. "
            "The program is not tied to any specific type of sequencing technique and can be used to look at libraries "
            "coming from a large number of different experiment types "
            "(Genomic Sequencing, ChIP-Seq, RNA-Seq, BS-Seq etc etc).",
        )

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request resolved from ``hints`` via CORES_TUPLE.

        Falls back to 1 when the lookup yields a falsy value.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        if val:
            return val
        return 1

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request resolved from ``hints`` via MEM_TUPLE.

        Falls back to 8 when the lookup yields a falsy value.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 8

    def inputs(self) -> List[ToolInput]:
        """Return the ``reads`` input followed by every entry of ``additional_inputs``."""
        return [ToolInput("reads", Array(FastqGz), position=5), *self.additional_inputs]

    def outputs(self) -> List[ToolOutput]:
        """Return the outputs: ``out`` (``*.zip``) and ``datafile`` (``*/fastqc_data.txt``)."""
        return [
            ToolOutput(
                "out", Array(ZipFile()), glob=WildcardSelector(wildcard="*.zip")
            ),
            ToolOutput(
                "datafile",
                Array(File),
                glob=WildcardSelector(wildcard="*/fastqc_data.txt"),
            ),
        ]

    # Optional command-line switches appended after "reads" by inputs().
    # The doc strings are built from adjacent string literals, which Python
    # joins verbatim: each fragment must carry its own trailing space.
    additional_inputs = [
        ToolInput(
            "outdir",
            String(optional=True),
            default=".",
            prefix="--outdir",
            doc="(-o) Create all output files in the specified output directory. Please note that this "
            "directory must exist as the program will not create it.  If this option is not set then "
            "the output file for each sequence file is created in the same directory as the sequence "
            "file which was processed.",
        ),
        ToolInput(
            "casava",
            Boolean(optional=True),
            prefix="--casava",
            doc="Files come from raw casava output. Files in the same sample group "
            "(differing only by the group number) will be analysed as a set rather than individually. "
            "Sequences with the filter flag set in the header will be excluded from the analysis. "
            "Files must have the same names given to them by casava (including being gzipped and "
            "ending with .gz) otherwise they won't be grouped together correctly.",
        ),
        ToolInput(
            "nano",
            Boolean(optional=True),
            prefix="--nano",
            doc="Files come from naopore sequences and are in fast5 format. In this mode you can pass in "
            "directories to process and the program will take in all fast5 files within those "
            "directories and produce a single output file from the sequences found in all files.",
        ),
        ToolInput(
            "nofilter",
            Boolean(optional=True),
            prefix="--nofilter",
            doc="If running with --casava then don't remove read flagged by casava as poor quality when "
            "performing the QC analysis.",
        ),
        # NOTE(review): "extract" defaults to True while "noextract" is also
        # offered below; confirm how the two interact when both are set.
        ToolInput(
            "extract",
            Boolean(optional=True),
            prefix="--extract",
            default=True,
            doc="If set then the zipped output file will be uncompressed in the same directory after it has "
            "been created.  By default this option will be set if fastqc is run in non-interactive mode.",
        ),
        ToolInput(
            "java",
            String(optional=True),
            prefix="--java",
            doc="(-j) Provides the full path to the java binary you want to use to launch fastqc. "
            "If not supplied then java is assumed to be in your path.",
        ),
        ToolInput(
            "noextract",
            Boolean(optional=True),
            prefix="--noextract",
            # Fixed: the first fragment lacked a trailing space ("...if you donot wish...").
            doc="Do not uncompress the output file after creating it.  You should set this option if you do "
            "not wish to uncompress the output when running in non-interactive mode. ",
        ),
        ToolInput(
            "nogroup",
            Boolean(optional=True),
            prefix="--nogroup",
            doc="Disable grouping of bases for reads >50bp. "
            "All reports will show data for every base in the read. "
            "WARNING: Using this option will cause fastqc to crash and burn if you use it on "
            "really long reads, and your plots may end up a ridiculous size. You have been warned! ",
        ),
        ToolInput(
            "format",
            String(optional=True),
            prefix="--format",
            doc="(-f) Bypasses the normal sequence file format detection and forces the program to use the "
            "specified format.  Valid formats are bam,sam,bam_mapped,sam_mapped and fastq ",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            prefix="--threads",
            default=CpuSelector(),
            doc="(-t) Specifies the number of files which can be processed simultaneously. "
            "Each thread will be allocated 250MB of memory so you shouldn't run more threads than your "
            "available memory will cope with, and not more than 6 threads on a 32 bit machine",
        ),
        ToolInput(
            "contaminants",
            File(optional=True),
            prefix="--contaminants",
            doc="(-c) Specifies a non-default file which contains the list of contaminants to screen "
            "overrepresented sequences against. The file must contain sets of named contaminants in "
            "the form name[tab]sequence.  Lines prefixed with a hash will be ignored.",
        ),
        ToolInput(
            "adapters",
            File(optional=True),
            prefix="--adapters",
            doc="(-a) Specifies a non-default file which contains the list of adapter sequences which will "
            "be explicity searched against the library. The file must contain sets of named adapters in "
            "the form name[tab]sequence. Lines prefixed with a hash will be ignored.",
        ),
        ToolInput(
            "limits",
            File(optional=True),
            prefix="--limits",
            doc="(-l) Specifies a non-default file which contains a set of criteria which will be used to "
            "determine the warn/error limits for the various modules.  This file can also be used to "
            "selectively  remove some modules from the output all together. "
            "The format needs to mirror the default limits.txt file found in the Configuration folder.",
        ),
        ToolInput(
            "kmers",
            Int(optional=True),
            prefix="--kmers",
            doc="(-k) Specifies the length of Kmer to look for in the Kmer content module. "
            "Specified Kmer length must be between 2 and 10. Default length is 7 if not specified. ",
        ),
        ToolInput(
            "quiet",
            Boolean(optional=True),
            prefix="--quiet",
            doc="(-q) Supress all progress messages on stdout and only report errors.",
        ),
        ToolInput(
            "dir",
            String(optional=True),
            prefix="--dir",
            # Fixed: the first fragment lacked a trailing space ("...images.Defaults...").
            doc="(-d) Selects a directory to be used for temporary files written when generating report images. "
            "Defaults to system temp directory if not specified.",
        ),
    ]
Пример #17
0
 def inputs(self):
     """Return the tool's command-line inputs in declaration order.

     Value-taking options use a ``--flag=`` prefix with
     ``separate_value_from_prefix=False``; boolean switches use a bare
     ``--flag`` prefix with ``separate_value_from_prefix=True``.
     """
     # Run folder, output naming and sample-sheet selection.
     location_inputs = [
         ToolInput(
             "run",
             Directory(),
             prefix="--run=",
             separate_value_from_prefix=False,
             doc="Path of Illumina BCL run folder.",
         ),
         ToolInput(
             "id",
             String(optional=True),
             prefix="--id=",
             separate_value_from_prefix=False,
             doc="Name of the folder created by mkfastq. If not supplied, will default to the name of the flowcell referred to by the --run argument.",
         ),
         ToolInput(
             "outputFoldername",
             Filename(),
             prefix="--output-dir=",
             separate_value_from_prefix=False,
             doc="Same as in bcl2fastq. Folder where FASTQs, reports and stats will be generated.",
         ),
         ToolInput(
             "csv",
             Csv(optional=True),
             prefix="--csv=",
             separate_value_from_prefix=False,
             doc="Apparently the same as `sampleSheet`. The sample sheet can either be a simple CSV with lane, sample and index columns, or an Illumina Experiment Manager-compatible sample sheet.  Sample sheet indexes can refer to 10x sample index set names (e.g., SI-GA-A12).",
         ),
         ToolInput(
             "sampleSheet",
             File(optional=True),
             prefix="--sample-sheet=",
             separate_value_from_prefix=False,
             doc="(--samplesheet= | --csv=) Path to the sample sheet. The sample sheet can either be a simple CSV with lane, sample and index columns, or an Illumina Experiment Manager-compatible sample sheet.  Sample sheet indexes can refer to 10x sample index set names (e.g., SI-GA-A12).",
         ),
     ]

     # Demultiplexing behaviour.
     demultiplex_inputs = [
         ToolInput(
             "ignoreDualIndex",
             Boolean(optional=True),
             prefix="--ignore-dual-index",
             separate_value_from_prefix=True,
             doc="On a dual-indexed flowcell, ignore the second sample index, if the second sample index was not used for the 10x sample.",
         ),
         ToolInput(
             "qc",
             Boolean(optional=True),
             prefix="--qc",
             separate_value_from_prefix=True,
             doc="Calculate both sequencing and 10x-specific metrics, including per-sample barcode matching rate. Will not be performed unless this flag is specified.",
         ),
         ToolInput(
             "lanes",
             Array(String, optional=True),
             prefix="--lanes=",
             separate_value_from_prefix=False,
             separator=",",
             doc="Comma-delimited series of lanes to demultiplex. Shortcut for the --tiles argument.",
         ),
         ToolInput(
             "useBasesMask",
             String(optional=True),
             prefix="--use-bases-mask=",
             separate_value_from_prefix=False,
             doc="Same as bcl2fastq; override the read lengths as specified in RunInfo.xml. See Illumina bcl2fastq documentation for more information.",
         ),
         ToolInput(
             "deleteUndetermined",
             Boolean(optional=True),
             prefix="--delete-undetermined",
             separate_value_from_prefix=True,
             doc="Delete the Undetermined FASTQ files left by bcl2fastq.  Useful if your sample sheet is only expected to match a subset of the flowcell.",
         ),
         ToolInput(
             "project",
             String(optional=True),
             prefix="--project=",
             separate_value_from_prefix=False,
             doc="Custom project name, to override the samplesheet or to use in conjunction with the --csv argument.",
         ),
     ]

     # mfranklin: These are only supported in cluster modes which I've disabled
     # ToolInput(
     #     "jobmode",
     #     String(optional=True),
     #     prefix="--jobmode=",
     #     separate_value_from_prefix=False,
     #     doc="Job manager to use. Valid options: local (default), sge, lsf, or a .template file",
     # ),
     # ToolInput(
     #     "mempercore",
     #     String(optional=True),
     #     prefix="--mempercore=",
     #     separate_value_from_prefix=False,
     #     doc="Set max GB each job may use at one time. Only applies in cluster jobmodes.",
     # ),
     # ToolInput(
     #     "maxjobs",
     #     String(optional=True),
     #     prefix="--maxjobs=",
     #     separate_value_from_prefix=False,
     #     doc="Set max jobs submitted to cluster at one time. Only applies in cluster jobmodes.",
     # ),
     # ToolInput(
     #     "jobinterval",
     #     String(optional=True),
     #     prefix="--jobinterval=",
     #     separate_value_from_prefix=False,
     #     doc="Set delay between submitting jobs to cluster, in ms. Only applies in cluster jobmodes.",
     # ),
     # ToolInput(
     #     "overrides",
     #     File(optional=True),
     #     prefix="--overrides=",
     #     separate_value_from_prefix=False,
     #     doc="The path to a JSON file that specifies stage-level overrides for cores and memory.  Finer-grained than --localcores, --mempercore and --localmem. Consult the 10x support website for an example override file.",
     # ),

     # Local-mode resources and pipeline switches; cores and memory default
     # to the CpuSelector / MemorySelector values.
     runtime_inputs = [
         ToolInput(
             "localcores",
             Int(optional=True),
             default=CpuSelector(),
             prefix="--localcores=",
             separate_value_from_prefix=False,
             doc="Set max cores the pipeline may request at one time. Only applies when --jobmode=local.",
         ),
         ToolInput(
             "localmem",
             Float(optional=True),
             default=MemorySelector(),
             prefix="--localmem=",
             separate_value_from_prefix=False,
             doc="Set max GB the pipeline may request at one time. Only applies when --jobmode=local.",
         ),
         # ToolInput(
         #     "uiport",
         #     Boolean(optional=True),
         #     prefix="--uiport=",
         #     separate_value_from_prefix=False,
         #     doc="Serve web UI at http://localhost:PORT",
         # ),
         # ToolInput(
         #     "disableUi",
         #     String(optional=True),
         #     prefix="--disable-ui",
         #     separate_value_from_prefix=True,
         #     doc="Do not serve the UI.",
         # ),
         # ToolInput(
         #     "noexit",
         #     String(optional=True),
         #     prefix="--noexit",
         #     separate_value_from_prefix=True,
         #     doc="Keep web UI running after pipestance completes or fails.",
         # ),
         ToolInput(
             "nopreflight",
             Boolean(optional=True),
             prefix="--nopreflight",
             separate_value_from_prefix=True,
             doc="Skip preflight checks.",
         ),
         # ToolInput(
         #     "help",
         #     String(optional=True),
         #     prefix="-h",
         #     separate_value_from_prefix=True,
         #     doc="Show this message.",
         # ),
         # ToolInput(
         #     "version",
         #     String(optional=True),
         #     prefix="--version",
         #     separate_value_from_prefix=True,
         #     doc="Show version.",
         # ),
     ]

     return location_inputs + demultiplex_inputs + runtime_inputs
Пример #18
0
class BamSorMaDupBase(BioinformaticsTool, ABC):
    """Abstract tool definition for the ``bamsormadup`` command.

    Options are passed in ``key=value`` form (prefix ending in ``=`` with
    ``separate_value_from_prefix=False``). The ``out`` output is declared as
    stdout, named by the ``outputFilename`` input; the metrics are written
    to ``metrics.txt`` and collected by glob.
    """

    def tool(self):
        """Return the tool identifier."""
        return "bamsormadup"

    def friendly_name(self):
        """Return the human-readable tool name."""
        return "BamSorMaDup"

    def tool_provider(self):
        """Return the name of the tool provider."""
        return "BioBamBam"

    def base_command(self):
        """Return the base command as a single-element list."""
        return ["bamsormadup"]

    def inputs(self):
        """Return the required inputs followed by ``additional_inputs``."""
        return [
            ToolInput("alignedReads", Bam(), position=200),
            ToolInput("outputFilename", Filename(extension=".bam")),
            *BamSorMaDupBase.additional_inputs,
        ]

    def arguments(self):
        """Return the fixed ``key=value`` arguments that are always supplied.

        These set the metrics file name and pin both input and output
        formats to ``bam``.
        """
        return [
            ToolArgument(
                "metrics.txt",
                prefix="M=",
                separate_value_from_prefix=False,
                doc="file containing metrics from duplicate removal",
            ),
            ToolArgument(
                "bam",
                prefix="inputformat=",
                separate_value_from_prefix=False,
                doc="input data format",
            ),
            ToolArgument(
                "bam",
                # Fixed: was "outputFormat="; the key is lower-case, like
                # its sibling "inputformat=".
                prefix="outputformat=",
                separate_value_from_prefix=False,
                doc="output data format",
            ),
        ]

    def outputs(self):
        """Return the outputs: ``out`` (stdout as BAM) and ``metrics`` (``metrics.txt``)."""
        return [
            ToolOutput(
                "out",
                Stdout(Bam(), stdoutname=InputSelector("outputFilename")),
            ),
            ToolOutput("metrics", File(), glob=WildcardSelector("metrics.txt")),
        ]

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request resolved from ``hints`` via BAMSORMADUP_MEM_TUPLE.

        Falls back to 16 when the lookup yields a falsy value.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, BAMSORMADUP_MEM_TUPLE
        )
        if val:
            return val
        return 16

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request resolved from ``hints`` via BAMSORMADUP_CORES_TUPLE.

        Falls back to 4 when the lookup yields a falsy value.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(
            hints, BAMSORMADUP_CORES_TUPLE
        )
        if val:
            return val
        return 4

    def bind_metadata(self):
        """Return the ToolMetadata (authorship, dates, keywords, docs) for this tool."""
        from datetime import date

        return ToolMetadata(
            contributors=["Matthias De Smet (@mattdsm)"],
            dateCreated=date(2020, 2, 26),
            dateUpdated=date(2020, 2, 26),
            institution="None",
            doi=None,
            keywords=["duplicates", "sort"],
            documentationUrl="https://gitlab.com/german.tischler/biobambam2",
            documentation="bamsormadup: parallel sorting and duplicate marking",
        )

    # Optional key=value settings appended to the required inputs by inputs().
    additional_inputs = [
        ToolInput(
            "level",
            Int(optional=True),
            prefix="level=",
            separate_value_from_prefix=False,
            default=0,
            doc="compression settings for output bam file (-1=zlib default,0=uncompressed,1=fast,9=best)",
        ),
        ToolInput(
            "tempLevel",
            Int(optional=True),
            prefix="templevel=",
            separate_value_from_prefix=False,
            default=0,
            doc="compression settings for temporary bam files (-1=zlib default,0=uncompressed,1=fast,9=best)",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="threads=",
            separate_value_from_prefix=False,
            doc="Number of threads. (default = 1)",
        ),
        ToolInput(
            "sortOrder",
            String(optional=True),
            prefix="SO=",
            separate_value_from_prefix=False,
            default="coordinate",
            doc="output sort order(coordinate by default)",
        ),
        ToolInput(
            "optMinPixelDif",
            Int(optional=True),
            prefix="optminpixeldif=",
            separate_value_from_prefix=False,
            default=2500,
            doc="pixel difference threshold for optical duplicates (patterned flowcell: 12000, unpatterned flowcell: 2500)",
        ),
    ]
Пример #19
0
class PiscesHygeaRealingerBase(IlluminaToolBase):
    """Tool definition for the Pisces Hygea realigner.

    There is no base command; ``arguments()`` builds the invocation
    ``export TMPDIR=/tmp; dotnet /app/Pisces_v<piscesVersion>/Hygea.dll``
    and the inputs append their options after it.
    """

    def tool(self) -> str:
        """Return the tool identifier."""
        return "PiscesHygeaRealigner"

    def friendly_name(self) -> str:
        """Return the human-readable tool name."""
        return "Pisces: Hygea Realigner"

    def bind_metadata(self):
        """Return the ToolMetadata (authorship, dates, keywords, docs) for this tool."""
        from datetime import date

        return ToolMetadata(
            contributors=["Miriam M Yeung"],
            dateCreated=date(2021, 8, 19),
            dateUpdated=date(2021, 10, 12),
            institution="Illumina",
            doi=None,
            citation="",
            keywords=["Illumina", "Pisces", "Hygea", "Realignment"],
            documentationUrl="",
            documentation="Performs realignment for indels and variants",
        )

    def cpus(self, hints: Dict[str, Any]):
        """Return the CPU request resolved from ``hints`` via CORES_TUPLE.

        Falls back to 4 when the lookup yields a falsy value.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, CORES_TUPLE)
        if val:
            return val
        return 4

    def memory(self, hints: Dict[str, Any]):
        """Return the memory request resolved from ``hints`` via MEM_TUPLE.

        Falls back to 4 when the lookup yields a falsy value.
        """
        val = get_value_for_hints_and_ordered_resource_tuple(hints, MEM_TUPLE)
        if val:
            return val
        return 4

    def base_command(self):
        """Return None; the command is assembled entirely by ``arguments()``."""
        return None

    def arguments(self):
        """Return the leading, unquoted command fragments (positions 1-3).

        The Hygea.dll path is formatted from the ``piscesVersion`` input.
        """
        return [
            ToolArgument("export TMPDIR=/tmp;", position=1, shell_quote=False),
            ToolArgument("dotnet", position=2, shell_quote=False),
            ToolArgument(
                StringFormatter(
                    "/app/Pisces_v{PISCES_VERSION}/Hygea.dll",
                    PISCES_VERSION=InputSelector("piscesVersion"),
                ),
                position=3,
                shell_quote=False,
            ),
        ]

    def inputs(self) -> List[ToolInput]:
        """Return the required inputs followed by ``additional_hygea_args``."""
        return [
            # Only consumed by arguments() to select the Hygea.dll path.
            ToolInput("piscesVersion", String()),
            ToolInput(
                "inputBam",
                BamBai(),
                prefix="-b",
                position=4,
                shell_quote=False,
                doc="Input BAM file",
            ),
            ToolInput(
                "outputDir",
                String(),
                prefix="--outfolder",
                position=4,
                shell_quote=False,
                doc="Output Folder",
            ),
            ToolInput(
                "referenceFolder",
                Directory(),
                prefix="--genomefolders",
                position=5,
                shell_quote=False,
                doc="Folder containing reference genome files",
            ),
            *self.additional_hygea_args,
        ]

    def outputs(self) -> List[ToolOutput]:
        """Return the outputs: ``out`` (BAM) and optional ``used_options`` (JSON log)."""
        return [
            # NOTE(review): the "*" glob is not restricted to the outputDir
            # input or to a .bam extension - confirm it resolves to the
            # realigned BAM only.
            ToolOutput("out", Bam(), glob=WildcardSelector("*")),
            ToolOutput(
                "used_options",
                File(optional=True),
                glob=WildcardSelector("HygeaLogs/*.json"),
            ),
        ]

    # Optional Hygea switches, all at position 6 and unquoted; appended to
    # the required inputs by inputs().
    additional_hygea_args = [
        ToolInput(
            "minimumBaseQuality",
            Int(optional=True),
            prefix="--minbasequality",
            position=6,
            shell_quote=False,
            default=10,
            doc="Minimum base call quality to use base in read. (Default: 10)",
        ),
        ToolInput(
            "minimumDenovoFrequency",
            Float(optional=True),
            prefix="--mindenovofreq",
            position=6,
            shell_quote=False,
            doc="Minimum frequence to use a denovo indel as a realignment target. Must be between 0 and 1. (Default: 0.01)",
        ),
        ToolInput(
            "priorsFile",
            Vcf(optional=True),
            prefix="--priorsfile",
            position=6,
            shell_quote=False,
            doc="Vcf file contianing known priors (variants/indels) to be used as realignment targets. (Default: None)",
        ),
        ToolInput(
            "maximumIndelSize",
            Int(optional=True),
            prefix="--maxindelsize",
            position=6,
            shell_quote=False,
            doc="Maximum indel size for realignment. (Default: 50)",
        ),
        ToolInput(
            "tryThree",
            Boolean(optional=True),
            prefix="--trythree",
            position=6,
            shell_quote=False,
            doc="Option to turn on realignment attempts to three indels. Not recommended if there are known priors. (Default: false)",
        ),
        ToolInput(
            "remaskSoftclips",
            Boolean(optional=True),
            # Fixed: was "--remaksoftclips" (missing "s"), which does not
            # match the option the tag names.
            prefix="--remasksoftclips",
            position=6,
            shell_quote=False,
            doc="Option to reaplly softclips to portions of the realingmed read that were previously softclipped but are now matches (M). (Default: false)",
        ),
        ToolInput(
            "skipDuplicates",
            Boolean(optional=True),
            prefix="--skipduplicates",
            position=6,
            shell_quote=False,
            doc="Option to skip realignment of duplicate reads. When true, duplicates will not be realinged but still outputted. (Default: false)",
        ),
        # NOTE(review): typed String although documented as a true/false
        # option - presumably so an explicit "false" can be passed; confirm.
        ToolInput(
            "skipAndRemoveDuplicates",
            String(optional=True),
            prefix="--skipandremoveduplicates",
            position=6,
            shell_quote=False,
            doc="Option to skip realignment of duplicates and remove them from the output. (Default: true)",
        ),
        ToolInput(
            "allowRescoringOriginalZero",
            Boolean(optional=True),
            # Fixed: was "--allowrescorignorigzero" (transposed letters).
            prefix="--allowrescoringorigzero",
            position=6,
            shell_quote=False,
            doc="Option to allow setting mapping quality of perfectly realigned reads (0 mismatches) to 40 even if original mapping quality was 0. If false, perfectly realigned reads with original mapping quality between 1-20 are still assigned mapping quality of 40, but those with 0 are left at 0. (Default: true)",
        ),
        ToolInput(
            "maximumRealignShift",
            Int(optional=True),
            prefix="--maxrealignshift",
            position=6,
            shell_quote=False,
            doc="Maximum lenght of the shift of realigned reads. Realignments with shifts of >= the threshold, relative to the original position, will be discarded and the original position, will be discarded adn the original alignments will be kept. (Default: 250)",
        ),
        ToolInput(
            "tryRealignSoftclippedReads",
            Boolean(optional=True),
            prefix="--tryrealignsoftclippedreads",
            position=6,
            shell_quote=False,
            doc="Boolean as to whether to treat softclips as realignable, making them eligible for realignment of otherwise perfect reads, and counting against alignments when comparing them. (Default: true)",
        ),
        ToolInput(
            "useAlignmentScorer",
            Boolean(optional=True),
            prefix="--usealignmentscorer",
            position=6,
            shell_quote=False,
            # Fixed: the help text named "--softclipcoeeficient"; the actual
            # prefix declared below is "--softclipcoefficient".
            doc="""When comparing alignments, whether it use the alignment scorer rathern than prioritzing alignments that mainimise mismatches, softclips and indels (in that order). Alignment scoring is a simple additive function that sums the product of each feature with its specified coefficient. 
            Default coefficients are as follows, and can be altered with the given flags:
                -2 = mismatch [--mismatchcoefficient]
                -1 = softclip [--softclipcoefficient]
                -1 = indel  [--indelcoefficient]
            (Default: false)""",
        ),
        ToolInput(
            "mismatchCoefficient",
            Int(optional=True),
            prefix="--mismatchcoefficient",
            position=6,
            shell_quote=False,
            doc="Coefficient for a mismatch penalty, for Alignment Scorer. Negative number indicates a penalty. (Default: -2)",
        ),
        ToolInput(
            "indelCoefficient",
            Int(optional=True),
            prefix="--indelcoefficient",
            position=6,
            shell_quote=False,
            doc="Coefficient for indel penalty, for Alignment Scorer. Negative number indicates a penalty. (Default: -1)",
        ),
        ToolInput(
            "indelLengthCoefficient",
            Int(optional=True),
            prefix="--indellengthcoefficient",
            position=6,
            shell_quote=False,
            doc="Coeeficient for the number of indel bases, for Alignment Scorer. Negative number indicates penalty. (Default: 0)",
        ),  ## Is this similar to alignment gap extension penalties??
        ToolInput(
            "softclipCoefficient",
            Int(optional=True),
            prefix="--softclipcoefficient",
            position=6,
            shell_quote=False,
            doc="Coefficient for softclip penalty, for Alignment Scorer. Negative number indiciates penalty. (Default: -1)",
        ),
        ToolInput(
            "anchorLengthCoefficient",
            Int(optional=True),
            prefix="--anchorlengthcoefficient",
            position=6,
            shell_quote=False,
            doc="Coeffient for anchor length, for Alignment Scorer. Positive number indicates a preference for highly anchored reads. (Default: 0)",
        ),
        ToolInput(
            "minimumUnanchoredInsertionLength",
            Int(optional=True),
            prefix="--minimumunanchoredinsertionlength",
            position=6,
            shell_quote=False,
            doc="Minimum length of an unanchored insertion(i.e. no flanking reference base on one side) allowed in a realinged read. Insertions shorter than the specified length will be softclipped. A value of 0 will allow unanchored insertions of any length. (Default: 0)",
        ),
        # NOTE(review): typed Int although the help text describes an on/off
        # option with "Default: false" - confirm the intended type.
        ToolInput(
            "maskPartialInsertion",
            Int(optional=True),
            prefix="--maskpartialinsertion",
            position=6,
            shell_quote=False,
            doc="Option to softclip a partial insertion at the end of a realinged read. Complete bu un-anchored insertions are allowed. (Default: false)",
        ),
        ToolInput(
            "debug",
            Boolean(optional=True),
            prefix="--debug",
            position=6,
            shell_quote=False,
            doc="Debug mode",
        ),
        # ToolInput("insideSubprocess", Boolean(optional=True), prefix = "--insidesubprocess"), # Commented out as there are known issues with multithreading Hygea
        # ToolInput("multiprocess", Boolean(optional=True), prefix = "--multiprocess")
        ToolInput(
            "chromosomeFilter",
            String(optional=True),
            prefix="--chrfilter",
            position=6,
            shell_quote=False,
            doc="Chromosome to process, will filter out all other chromosomes from output if specified. (Default: None)",
        ),
        ToolInput(
            "threads",
            Int(optional=True),
            default=CpuSelector(),
            prefix="-t",
            position=6,
            shell_quote=False,
            doc="Maximum number of threads. (Default: 20)",
        ),
    ]
Пример #20
0
 def inputs(self):
     """Return the ToolInput declarations for the GRIDSS command line.

     The inputs are assembled from small themed groups purely for
     readability; the groups are concatenated in declaration order, so the
     result is one flat list of ToolInput objects.
     """
     # Alignments plus the reference / output / assembly files. "bams" is
     # the only input declared with a position and without a prefix.
     alignment_and_outputs = [
         ToolInput(
             "bams",
             Array(BamBai()),
             position=10,
         ),
         ToolInput(
             "reference",
             FastaWithDict(),
             prefix="--reference",
             doc="reference genome to use.",
         ),
         ToolInput(
             "outputFilename",
             Filename(extension=".vcf"),
             prefix="--output",
             doc="output VCF.",
         ),
         ToolInput(
             "assemblyFilename",
             Filename(suffix=".assembly", extension=".bam"),
             prefix="--assembly",
             doc="location of the GRIDSS assembly BAM. This file will be created by GRIDSS.",
         ),
     ]

     # Execution environment. Note that two doc strings quote a default that
     # differs from the default= actually passed here: "threads" uses
     # CpuSelector() (doc mentions 8) and "workingDir" uses "./TMP" (doc
     # mentions ".").
     execution = [
         ToolInput(
             "threads",
             Int(optional=True),
             default=CpuSelector(),
             prefix="--threads",
             doc="number of threads to use. (Default: 8)",
         ),
         ToolInput(
             "jarPath",
             String(optional=True),
             prefix="--jar",
             doc="location of GRIDSS jar",
         ),
         ToolInput(
             "workingDir",
             String(optional=True),
             default="./TMP",
             prefix="--workingdir",
             doc="directory to place GRIDSS intermediate and temporary files. .gridss.working subdirectories will be created. (Default: .)",
         ),
     ]

     # What to run and how to label it. "steps" and "labels" are arrays with
     # separator="," and prefix_applies_to_all_elements=False; presumably they
     # render as one comma-joined value after a single prefix -- TODO confirm.
     run_scope = [
         ToolInput(
             "blacklist",
             Bed(optional=True),
             prefix="--blacklist",
             doc="BED file containing regions to ignore",
         ),
         ToolInput(
             "steps",
             Array(String, optional=True),
             prefix="--steps",
             separator=",",
             prefix_applies_to_all_elements=False,
             doc="processing steps to run. Defaults to all steps. Multiple steps are specified using comma separators. Possible steps are: setupreference, preprocess, assemble, call, all. WARNING: multiple instances of GRIDSS generating reference files at the same time will result in file corruption. Make sure these files are generated before runninng parallel GRIDSS jobs.",
         ),
         ToolInput(
             "configuration",
             File(optional=True),
             prefix="--configuration",
             doc="configuration file use to override default GRIDSS settings.",
         ),
         ToolInput(
             "labels",
             Array(String, optional=True),
             prefix="--labels",
             separator=",",
             prefix_applies_to_all_elements=False,
             doc='comma separated labels to use in the output VCF for the input files. Supporting read counts for input files with the same label are aggregated (useful for multiple sequencing runs of the same sample). Labels default to input filenames, unless a single read group with a non-empty sample name exists in which case the read group sample name is used (which can be disabled by "useReadGroupSampleNameCategoryLabel=false" in the configuration file). If labels are specified, they must be specified for all input files.',
         ),
     ]

     # Aligner, JVM and calling parameters.
     tuning = [
         ToolInput(
             "externalaligner",
             String(optional=True),
             prefix="--externalaligner",
             doc="use the system version of bwa instead of the in-process version packaged with GRIDSS",
         ),
         ToolInput(
             "jvmheap",
             String(optional=True),
             prefix="--jvmheap",
             doc="size of JVM heap for assembly and variant calling. (Default: 30g)",
         ),
         ToolInput(
             "maxcoverage",
             Int(optional=True),
             prefix="--maxcoverage",
             doc="maximum coverage. Regions with coverage in excess of this are ignored. (Default: 50000)",
         ),
         ToolInput(
             "picardoptions",
             String(optional=True),
             prefix="--picardoptions",
             doc="additional standard Picard command line options. Useful options include VALIDATION_STRINGENCY=LENIENT and COMPRESSION_LEVEL=0. See https://broadinstitute.github.io/picard/command-line-overview.html",
         ),
         ToolInput(
             "useproperpair",
             String(optional=True),
             prefix="--useproperpair",
             doc="use SAM 'proper pair' flag to determine whether a read pair is discordant. Default: use library fragment size distribution to determine read pair concordance",
         ),
         ToolInput(
             "concordantreadpairdistribution",
             Float(optional=True),
             prefix="--concordantreadpairdistribution",
             doc="portion of 6 sigma read pairs distribution considered concordantly mapped. (Default: 0.995)",
         ),
     ]

     # Boolean switches and the job index / node count pair used for
     # parallel assembly.
     housekeeping = [
         ToolInput(
             "keepTempFiles",
             Boolean(optional=True),
             prefix="--keepTempFiles",
             doc="keep intermediate files. Not recommended except for debugging due to the high disk usage.",
         ),
         ToolInput(
             "nojni",
             Boolean(optional=True),
             prefix="--nojni",
             doc="do not use JNI native code acceleration libraries (snappy, GKL, ssw, bwa).",
         ),
         ToolInput(
             "jobindex",
             Int(optional=True),
             prefix="--jobindex",
             doc="zero-based assembly job index (only required when performing parallel assembly across multiple computers)",
         ),
         ToolInput(
             "jobnodes",
             Int(optional=True),
             prefix="--jobnodes",
             doc="total number of assembly jobs (only required when performing parallel assembly across multiple computers). Note than an assembly job with any --job argument is required to be run after all indexed jobs have been completed to gather the output files together.",
         ),
     ]

     return alignment_and_outputs + execution + run_scope + tuning + housekeeping
Пример #21
0
 def inputs(self) -> List[ToolInput]:
     """Return the ToolInput declarations for the VEP command line.

     The explicit declarations below are followed by the shared entries
     unpacked from ``VepBase_96_3.identifiers``.

     Returns:
         A flat list of ToolInput objects in declaration order.
     """
     return [
         ToolInput(
             "inputFile",
             CompressedVcf(),
             prefix="--input_file",
             doc="Input file name. Can use compressed file (gzipped).",
         ),
         ToolInput(
             "vcf",
             Boolean(),
             default=True,
             prefix="--vcf",
             doc="Writes output in VCF format. Consequences are added in the INFO field of the VCF file, using the "
             'key "CSQ". Data fields are encoded separated by "|"; the order of fields is written in the VCF header.'
             ' Output fields in the "CSQ" INFO field can be selected by using --fields. If the input format was VCF,'
             " the file will remain unchanged save for the addition of the CSQ field (unless using any filtering). "
             "Custom data added with --custom are added as separate fields, using the key specified for each data "
             "file. Commas in fields are replaced with ampersands (&) to preserve VCF format.",
         ),
         # NOTE(review): declared without a prefix; presumably consumed
         # elsewhere in the tool definition -- confirm.
         ToolInput(
             "compressOutput",
             String(),
             default="bgzip",
             doc='Compress output: "gzip" or "bgzip"',
         ),
         ToolInput(
             "outputFilename", Filename(), prefix="-o", doc="Output file name"
         ),
         ToolInput(
             "inputData",
             String(optional=True),
             prefix="--input_data",
             doc="Raw input data as a string. May be used, to input a single rsID or HGVS notation quickly to vep:",
         ),
         ToolInput(
             "verbose",
             Boolean(optional=True),
             prefix="--verbose",
             doc="Print out a bit more information while running. ",
         ),
         ToolInput(
             "species",
             String(optional=True),
             prefix="--species",
             doc='Species for your data. This can be the latin name e.g. "homo_sapiens" or any Ensembl alias e.g. '
             '"mouse". Specifying the latin name can speed up initial database connection as the registry does not '
             'have to load all available database aliases on the server. Default = "homo_sapiens"',
         ),
         ToolInput(
             "assembly",
             String(optional=True),
             prefix="--assembly",
             doc="Select the assembly version to use if more than one available. If using the cache, you must have "
             "the appropriate assembly's cache file installed. If not specified and you have only 1 assembly "
             "version installed, this will be chosen by default. Default = use found assembly version",
         ),
         ToolInput(
             "fileFormat",
             String(optional=True),
             prefix="--format",
             doc='one of "ensembl", "vcf", "hgvs", "id", "region", "spdi". By default, VEP auto-detects the input '
             "file format. Using this option you can specify the input file is Ensembl, VCF, IDs, HGVS, SPDI or "
             "region format. Can use compressed version (gzipped) of any file format listed above. "
             "Auto-detects format by default",
         ),
         ToolInput(
             "statsFilename",
             Filename(suffix=".txt_summary", extension=".html"),
             prefix="--stats_file",
             doc="Summary stats file name. This is an HTML file containing a summary of the VEP run - the file "
             'name must end ".htm" or ".html". Default = "variant_effect_output.txt_summary.html"',
         ),
         ToolInput(
             "maxSvSize",
             Int(optional=True),
             prefix="--max_sv_size",
             doc="Extend the maximum Structural Variant size VEP can process.",
         ),
         ToolInput(
             "noCheckVariantsOrder",
             Boolean(optional=True),
             prefix="--no_check_variants_order",
             doc="Permit the use of unsorted input files. However running VEP on unsorted "
             "input files slows down the tool and requires more memory.",
         ),
         ToolInput(
             "fork",
             Int(optional=True),
             default=CpuSelector(),
             prefix="--fork",
             doc="Enable forking, using the specified number of forks. Forking can dramatically improve runtime.",
         ),
         # NOTE(review): separator="," is set here while the doc text says
         # multiple plugins are given by repeating --plugin; confirm the
         # rendered command line matches what VEP expects for several plugins.
         ToolInput(
             "plugin",
             Array(String(), optional=True),
             prefix="--plugin",
             separator=",",
             doc="[plugin name] Use named plugin. Plugin modules should be installed in the Plugins subdirectory "
             "of the VEP cache directory (defaults to $HOME/.vep/). Multiple plugins can be used by supplying "
             "the --plugin flag multiple times. See plugin documentation. .",
         ),
         ToolInput(
             "custom",
             String(optional=True),
             prefix="--custom",
             doc="[filename] Add custom annotation to the output. Files must be tabix indexed or in the bigWig "
             "format. Multiple files can be specified by supplying the --custom flag multiple times. See here "
             "for full details. ",
         ),
         ToolInput(
             "gff",
             String(optional=True),
             prefix="--gff",
             doc="[filename] Use GFF transcript annotations in [filename] as an annotation source. Requires a "
             "FASTA file of genomic sequence.",
         ),
         ToolInput(
             "gtf",
             String(optional=True),
             prefix="--gtf",
             doc="[filename] Use GTF transcript annotations in [filename] as an annotation source. "
             "Requires a FASTA file of genomic sequence.",
         ),
         ToolInput(
             "bam",
             String(optional=True),
             prefix="--bam",
             doc="[filename] ADVANCED Use BAM file of sequence alignments to correct transcript models not derived "
             "from reference genome sequence. Used to correct RefSeq transcript models. "
             "Enables --use_transcript_ref; add --use_given_ref to override this behaviour.",
         ),
         ToolInput(
             "useTranscriptRef",
             String(optional=True),
             prefix="--use_transcript_ref",
             doc="By default VEP uses the reference allele provided in the input file to calculate consequences for "
             "the provided alternate allele(s). Use this flag to force VEP to replace the provided reference "
             "allele with sequence derived from the overlapped transcript. This is especially relevant when "
             "using the RefSeq cache, see documentation for more details. The GIVEN_REF and USED_REF fields "
             "are set in the output to indicate any change.",
         ),
         # Doc text corrected: the flag it refers to is --use_transcript_ref
         # (the prefix of "useTranscriptRef" above).
         ToolInput(
             "useGivenRef",
             String(optional=True),
             prefix="--use_given_ref",
             doc="Using --bam or a BAM-edited RefSeq cache by default enables --use_transcript_ref; add this flag "
             "to override this behaviour and use the provided reference allele from the input. ",
         ),
         # NOTE(review): declared without a prefix; presumably consumed
         # elsewhere in the tool definition -- confirm.
         ToolInput(
             "fields",
             Array(String(), optional=True),
             doc="Configure the output format using a comma separated list of fields. Can only be used with tab "
             "(--tab) or VCF format (--vcf) output. For the tab format output, the selected fields may be those "
             "present in the default output columns, or any of those that appear in the Extra column (including "
             "those added by plugins or custom annotations). Output remains tab-delimited. For the VCF format "
             'output, the selected fields are those present within the "CSQ" INFO field.',
         ),
         ToolInput(
             "variantClass",
             String(optional=True),
             prefix="--variant_class",
             doc="Output the Sequence Ontology variant class. ",
         ),
         ToolInput(
             "sift",
             String(optional=True),
             prefix="--sift",
             doc="[p|s|b] Species limited SIFT predicts whether an amino acid substitution affects protein function "
             "based on sequence homology and the physical properties of amino acids. VEP can output the prediction "
             "term, score or both. ",
         ),
         ToolInput(
             "polyphen",
             String(optional=True),
             prefix="--polyphen",
             doc="[p|s|b] Human only PolyPhen is a tool which predicts possible impact of an amino acid "
             "substitution on the structure and function of a human protein using straightforward physical and "
             "comparative considerations. VEP can output the prediction term, score or both. VEP uses the "
             "humVar score by default - use --humdiv to retrieve the humDiv score. ",
         ),
         ToolInput(
             "humdiv",
             String(optional=True),
             prefix="--humdiv",
             doc="Human only Retrieve the humDiv PolyPhen prediction instead of the default humVar.",
         ),
         # The trailing availability sentence appears once in the doc text.
         ToolInput(
             "nearest",
             String(optional=True),
             prefix="--nearest",
             doc="[transcript|gene|symbol] Retrieve the transcript or gene with the nearest protein-coding "
             'transcription start site (TSS) to each input variant. Use "transcript" to retrieve the transcript '
             'stable ID, "gene" to retrieve the gene stable ID, or "symbol" to retrieve the gene symbol. Note that '
             "the nearest TSS may not belong to a transcript that overlaps the input variant, and more than one may "
             "be reported in the case where two are equidistant from the input coordinates. Currently only available"
             " when using a cache annotation source, and requires the Set::IntervalTree perl module.",
         ),
         ToolInput(
             "distance",
             String(optional=True),
             prefix="--distance",
             doc="[bp_distance(,downstream_distance_if_different)] Modify the distance up and/or downstream between "
             "a variant and a transcript for which VEP will assign the upstream_gene_variant or downstream_gene_"
             "variant consequences. Giving one distance will modify both up- and downstream distances; providing two "
             "separated by commas will set the up- (5') and down- (3') stream distances respectively. Default: 5000",
         ),
         ToolInput(
             "overlaps",
             String(optional=True),
             prefix="--overlaps",
             doc="Report the proportion and length of a transcript overlapped by a structural variant in VCF format.",
         ),
         ToolInput(
             "genePhenotype",
             String(optional=True),
             prefix="--gene_phenotype",
             doc="Indicates if the overlapped gene is associated with a phenotype, disease or trait. "
             "See list of phenotype sources. ",
         ),
         ToolInput(
             "regulatory",
             String(optional=True),
             prefix="--regulatory",
             doc="Look for overlaps with regulatory regions. VEP can also report if a variant falls in a high "
             "information position within a transcription factor binding site. Output lines have a Feature "
             "type of RegulatoryFeature or MotifFeature. ",
         ),
         ToolInput(
             "cellType",
             String(optional=True),
             prefix="--cell_type",
             doc="Report only regulatory regions that are found in the given cell type(s). Can be a single cell "
             "type or a comma-separated list. The functional type in each cell type is reported under CELL_"
             "TYPE in the output. To retrieve a list of cell types, use --cell_type list.",
         ),
         ToolInput(
             "individual",
             String(optional=True),
             prefix="--individual",
             doc="[all|ind list] Consider only alternate alleles present in the genotypes of the specified "
             'individual(s). May be a single individual, a comma-separated list or "all" to assess all individuals '
             "separately. Individual variant combinations homozygous for the given reference allele will not be "
             "reported. Each individual and variant combination is given on a separate line of output. Only works "
             "with VCF files containing individual genotype data; individual IDs are taken from column headers. ",
         ),
         ToolInput(
             "phased",
             String(optional=True),
             prefix="--phased",
             doc="Force VCF genotypes to be interpreted as phased. For use with plugins that depend on phased data.",
         ),
         ToolInput(
             "alleleNumber",
             String(optional=True),
             prefix="--allele_number",
             doc="Identify allele number from VCF input, where 1 = first ALT allele, 2 = second ALT allele etc. "
             "Useful when using --minimal ",
         ),
         ToolInput(
             "showRefAllele",
             String(optional=True),
             prefix="--show_ref_allele",
             doc="Adds the reference allele in the output. Mainly useful for"
             ' the VEP "default" and tab-delimited output formats. ',
         ),
         ToolInput(
             "totalLength",
             String(optional=True),
             prefix="--total_length",
             doc="Give cDNA, CDS and protein positions as Position/Length. ",
         ),
         ToolInput(
             "numbers",
             String(optional=True),
             prefix="--numbers",
             doc="Adds affected exon and intron numbering to output. Format is Number/Total. ",
         ),
         ToolInput(
             "noEscape",
             String(optional=True),
             prefix="--no_escape",
             doc="Don't URI escape HGVS strings. Default = escape",
         ),
         ToolInput(
             "keepCsq",
             String(optional=True),
             prefix="--keep_csq",
             doc="Don't overwrite existing CSQ entry in VCF INFO field. Overwrites by default",
         ),
         ToolInput(
             "vcfInfoField",
             String(optional=True),
             prefix="--vcf_info_field",
             doc="[CSQ|ANN|(other)] Change the name of the INFO key that VEP write the consequences to in its VCF "
             'output. Use "ANN" for compatibility with other tools such as snpEff. Default: CSQ',
         ),
         ToolInput(
             "terms",
             String(optional=True),
             prefix="--terms",
             doc="[SO|display|NCBI] The type of consequence terms to output. The Ensembl terms are described here. "
             "The Sequence Ontology is a joint effort by genome annotation centres to standardise descriptions "
             'of biological sequences. Default = "SO"',
         ),
         ToolInput(
             "noHeaders",
             String(optional=True),
             prefix="--no_headers",
             doc="Don't write header lines in output files. Default = add headers",
         ),
         # Shared identifier inputs defined on the VEP base class.
         *VepBase_96_3.identifiers,
     ]
Пример #22
0
 def inputs(self):
     """Return the ToolInput declarations for this variant-refiltering tool.

     Every input is bound to a command-line prefix. The threshold inputs
     (MQ, DP, EVS, RPRS, minAD) and ``threads`` / ``outputFolder`` carry
     explicit defaults, so a value is always supplied for them.

     Returns:
         A flat list of ToolInput objects in declaration order.
     """
     return [
         ToolInput(
             tag="inputFiles",
             input_type=Array(VcfTabix),
             prefix="-i",
             separator=",",
             doc="comma seperated list of vcfs",
         ),
         ToolInput(
             tag="MQ",
             input_type=Int(),
             default=15,
             prefix="--mq",
             doc="minimum mapping quality for a variant to be accepted (default: 15)",
         ),
         ToolInput(
             tag="DP",
             input_type=Int(),
             default=10,
             prefix="--dp",
             doc="minimum depth of coverage for a variant to be accepted (default: 10)",
         ),
         # NOTE(review): the doc text quotes a default of 20 but default=10 is
         # what is passed; confirm which value is intended.
         ToolInput(
             tag="EVS",
             input_type=Int(),
             default=10,
             prefix="--evs",
             doc="minimum phred scaled evidence for a variant to be accepted (default: 20)",
         ),
         # NOTE(review): this doc text is identical to the EVS one and quotes a
         # default of 20 while default=-10 is passed; it looks copy-pasted --
         # confirm the meaning of --rprs and update the text.
         ToolInput(
             tag="RPRS",
             input_type=Int(),
             default=-10,
             prefix="--rprs",
             doc="minimum phred scaled evidence for a variant to be accepted (default: 20)",
         ),
         ToolInput(
             tag="minAD",
             input_type=Int(),
             default=2,
             prefix="--minAD",
             doc="minimum allelic depth for a variant to be accepted (default: 2)",
         ),
         # default=CpuSelector() is supplied here; the "5" in the doc text is
         # not the value this declaration passes.
         ToolInput(
             tag="threads",
             input_type=Int(),
             default=CpuSelector(),
             prefix="-t",
             doc="amount of threads to use for parallelization (default: 5)",
         ),
         ToolInput(
             tag="interval",
             input_type=String(optional=True),
             prefix="-L",
             doc="interval to call on (default: everything)",
         ),
         ToolInput(
             tag="normalName",
             input_type=String(optional=True),
             prefix="-n",
             doc="Name of the normal sample (default: infered from all sample names)",
         ),
         # Doc text previously duplicated the "normalName" description.
         ToolInput(
             tag="sampleNames",
             input_type=Array(String, optional=True),
             prefix="--sampleNames",
             separator=",",
             doc="comma separated list of sample names",
         ),
         # Doc text previously duplicated the "normalName" description.
         ToolInput(
             tag="outputFolder",
             input_type=String(),
             prefix="-o",
             default="./",
             doc="folder to write the output to (default: ./)",
         ),
     ]