Exemplo n.º 1
0
    def __init__(self,
                 input_file,
                 output_prefix,
                 threads=1,
                 options=None,
                 dependencies=()):
        """Set up the single-end ("SE") adapter-removal command node.

        Arguments:
          input_file -- reads handed to the command through "--file1".
          output_prefix -- prefix of the registered output files
            (".settings", ".truncated.gz" and ".discarded.gz").
          threads -- forwarded to _get_common_parameters and CommandNode.
          options -- optional mapping handed to _get_common_parameters and
            apply_options; None (the default) means "no extra options".
          dependencies -- forwarded unchanged to CommandNode.
        """
        # A literal `{}` default is a single dict shared by every call; use
        # None as the sentinel and create a fresh dict per instance instead.
        if options is None:
            options = {}

        # Parameters in common between SE/PE come from a shared helper
        cmd = _get_common_parameters(threads=threads, options=options)

        # Prefix for output files, ensure that all end up in temp folder
        cmd.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

        # Plain concatenation rather than using the prefix as part of a
        # %-format template, which would fail for prefixes containing '%'.
        cmd.set_kwargs(
            TEMP_OUT_BASENAME=os.path.basename(output_prefix),
            OUT_SETTINGS=output_prefix + ".settings",
            OUT_MATE_1=output_prefix + ".truncated.gz",
            OUT_DISCARDED=output_prefix + ".discarded.gz",
        )

        cmd.set_option("--file1", "%(IN_READS_1)s")
        cmd.set_kwargs(IN_READS_1=input_file)

        apply_options(cmd, options)

        CommandNode.__init__(
            self,
            command=cmd.finalize(),
            threads=threads,
            description="<AdapterRM (SE): %s -> '%s.*'>" % (
                fileutils.describe_files(input_file),
                output_prefix,
            ),
            dependencies=dependencies,
        )
Exemplo n.º 2
0
    def __init__(self, config, reference, infiles, outfile,
                 threads=1, dependencies=()):
        """Set up the GATK "RealignerTargetCreator" command node.

        The command is run from "GenomeAnalysisTK.jar" under
        `config.jar_root`, reads `reference` ("-R") together with the files
        in `infiles`, and writes its result to `outfile` ("-o"). `threads`
        is passed through _get_max_threads before being used for "-nt" and
        for the node itself.
        """
        threads = _get_max_threads(reference, threads)
        input_files = safe_coerce_to_tuple(infiles)

        builder = AtomicJavaCmdBuilder(
            os.path.join(config.jar_root, "GenomeAnalysisTK.jar"),
            jre_options=config.jre_options)
        for flag, value in (("-T", "RealignerTargetCreator"),
                            ("-R", "%(IN_REFERENCE)s"),
                            ("-o", "%(OUT_INTERVALS)s"),
                            ("-nt", threads)):
            builder.set_option(flag, value)

        _set_input_files(builder, input_files)
        builder.set_kwargs(
            IN_REFERENCE=reference,
            # Reference path with its extension swapped for ".dict"
            IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
            OUT_INTERVALS=outfile,
            CHECK_GATK=_get_gatk_version_check(config))

        CommandNode.__init__(
            self,
            threads=threads,
            description="<GATK Indel Realigner (training): %s -> %r>" % (
                describe_files(input_files), outfile),
            command=builder.finalize(),
            dependencies=dependencies)
Exemplo n.º 3
0
 def __init__(self, input_files, output_file, dependencies=()):
     """Create the "Detect Input Duplication" node.

     `input_files`, `output_file` and `dependencies` are forwarded to
     Node.__init__; the description embeds describe_files(input_files).
     """
     summary = describe_files(input_files)
     Node.__init__(
         self,
         description="<Detect Input Duplication: %s>" % summary,
         input_files=input_files,
         output_files=output_file,
         dependencies=dependencies,
     )
Exemplo n.º 4
0
 def __init__(self, parameters):
     """Create the "MarkDuplicates" node from a parameters object.

     Reads `input_bams`, `command` and `dependencies` from `parameters`;
     the command is finalized before being handed to PicardNode.
     """
     summary = describe_files(parameters.input_bams)
     PicardNode.__init__(
         self,
         command=parameters.command.finalize(),
         description="<MarkDuplicates: %s>" % (summary,),
         dependencies=parameters.dependencies,
     )
Exemplo n.º 5
0
 def __init__(self, parameters):
     """Build the "MarkDuplicates" node described by `parameters`.

     `parameters` supplies `input_bams` (used for the description),
     `command` (finalized here) and `dependencies`.
     """
     bam_summary = describe_files(parameters.input_bams)
     node_description = "<MarkDuplicates: %s>" % (bam_summary,)
     PicardNode.__init__(
         self,
         command=parameters.command.finalize(),
         description=node_description,
         dependencies=parameters.dependencies,
     )
Exemplo n.º 6
0
    def __init__(self,
                 config,
                 input_bams,
                 output_bam,
                 keep_dupes=True,
                 dependencies=()):
        """Create the "FilterCollapsedBAM" node running "rmdup_collapsed".

        The command takes MultiBAMInputNode.PIPE_FILE as its input value and
        its standard output is registered as `output_bam`. Unless
        `keep_dupes` is true, "--remove-duplicates" is added.
        """
        bam_files = safe_coerce_to_tuple(input_bams)

        cmd = factory.new("rmdup_collapsed")
        cmd.add_value("%(TEMP_IN_BAM)s")
        cmd.set_kwargs(OUT_STDOUT=output_bam,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
        cmd.add_multiple_kwargs(bam_files)

        if not keep_dupes:
            cmd.set_option("--remove-duplicates")

        summary = describe_files(bam_files)
        MultiBAMInputNode.__init__(
            self,
            config=config,
            input_bams=bam_files,
            command=cmd.finalize(),
            description="<FilterCollapsedBAM: %s>" % (summary,),
            dependencies=dependencies,
        )
Exemplo n.º 7
0
 def __init__(self, input_files, output_file, dependencies=()):
     """Set up the "Detect Input Duplication" node.

     The three arguments are passed on to Node.__init__; the node
     description is built from describe_files(input_files).
     """
     node_description = \
         "<Detect Input Duplication: %s>" % describe_files(input_files)
     Node.__init__(
         self,
         description=node_description,
         input_files=input_files,
         output_files=output_file,
         dependencies=dependencies,
     )
Exemplo n.º 8
0
    def __init__(self, config, reference, intervals, infiles, outfile,
                 dependencies=()):
        """Set up the GATK "IndelRealigner" node together with calmd.

        Two commands are combined using ParallelCmds: the GATK call, which
        writes `outfile` ("-o") using `reference` and `intervals`, and a
        "samtools calmd -b" call whose temporary input is the basename of
        `outfile` and whose standard output is that basename plus ".calmd".
        """
        self._basename = os.path.basename(outfile)
        input_files = safe_coerce_to_tuple(infiles)

        gatk = AtomicJavaCmdBuilder(
            os.path.join(config.jar_root, "GenomeAnalysisTK.jar"),
            jre_options=config.jre_options)
        for flag, value in (("-T", "IndelRealigner"),
                            ("-R", "%(IN_REFERENCE)s"),
                            ("-targetIntervals", "%(IN_INTERVALS)s"),
                            ("-o", "%(OUT_BAMFILE)s"),
                            ("--bam_compression", 0)):
            gatk.set_option(flag, value)
        gatk.set_option("--disable_bam_indexing")
        _set_input_files(gatk, input_files)

        gatk.set_kwargs(
            IN_REFERENCE=reference,
            # Reference path with its extension swapped for ".dict"
            IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
            IN_INTERVALS=intervals,
            OUT_BAMFILE=outfile,
            CHECK_GATK=_get_gatk_version_check(config))

        calmd_call = ["samtools", "calmd", "-b",
                      "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
        calmd = AtomicCmd(calmd_call,
                          TEMP_IN_BAM=self._basename,
                          IN_REF=reference,
                          TEMP_OUT_STDOUT=self._basename + ".calmd",
                          CHECK_VERSION=SAMTOOLS_VERSION)

        CommandNode.__init__(
            self,
            description="<GATK Indel Realigner (aligning): %s -> %r>" % (
                describe_files(input_files), outfile),
            command=ParallelCmds([gatk.finalize(), calmd]),
            dependencies=dependencies)
Exemplo n.º 9
0
 def __init__(self, input_files, output_file, offset, dependencies=()):
     """Create the "Validate FASTQ Files" node.

     `offset` is stored on the instance as `_offset`; the remaining
     arguments are forwarded to Node.__init__.
     """
     self._offset = offset

     summary = describe_files(input_files)
     Node.__init__(
         self,
         description="<Validate FASTQ Files: %s>" % summary,
         input_files=input_files,
         output_files=output_file,
         dependencies=dependencies,
     )
Exemplo n.º 10
0
    def __init__(self, input_files, output_file, dependencies=()):
        """Create the "Validate FASTA Files" node.

        Arguments are forwarded to Node.__init__, after which the node is
        checked to have exactly one entry in `self.output_files`.
        """
        summary = describe_files(input_files)
        Node.__init__(
            self,
            description="<Validate FASTA Files: %s>" % summary,
            input_files=input_files,
            output_files=output_file,
            dependencies=dependencies,
        )

        # Sanity check: this node is expected to own a single output file
        assert len(self.output_files) == 1, self.output_files
Exemplo n.º 11
0
    def __init__(self, input_files, output_file, dependencies=()):
        """Create the "MergeCoverage" node.

        `output_file` is kept on the instance as `_output_file` and is also
        registered as the output of the node; `input_files` and
        `dependencies` are forwarded to Node.__init__.
        """
        self._output_file = output_file

        node_description = "<MergeCoverage: %s -> '%s'>" % (
            describe_files(input_files),
            self._output_file,
        )
        Node.__init__(
            self,
            description=node_description,
            input_files=input_files,
            output_files=self._output_file,
            dependencies=dependencies,
        )
Exemplo n.º 12
0
    def __init__(self, input_files, output_file, dependencies=()):
        """Set up the "Validate FASTA Files" node.

        After delegating to Node.__init__, asserts that the node ended up
        with exactly one entry in `self.output_files`.
        """
        node_description = \
            "<Validate FASTA Files: %s>" % describe_files(input_files)
        Node.__init__(
            self,
            description=node_description,
            input_files=input_files,
            output_files=output_file,
            dependencies=dependencies,
        )

        # Sanity check: exactly one output file is expected
        assert len(self.output_files) == 1, self.output_files
Exemplo n.º 13
0
 def __init__(self, parameters):
     """Create the "mapDamage (plots)" node from a parameters object.

     Reads `input_files`, `output_directory`, `config`, `command` and
     `dependencies` from `parameters`; the command is finalized here.
     """
     node_description = "<mapDamage (plots): %s -> '%s'>" % (
         describe_files(parameters.input_files),
         parameters.output_directory,
     )
     MultiBAMInputNode.__init__(
         self,
         config=parameters.config,
         input_bams=parameters.input_files,
         command=parameters.command.finalize(),
         description=node_description,
         dependencies=parameters.dependencies,
     )
Exemplo n.º 14
0
 def __init__(self, parameters):
     """Build the "mapDamage (plots)" node described by `parameters`.

     `parameters` supplies the input files, output directory, config,
     command (finalized here) and dependencies.
     """
     summary = describe_files(parameters.input_files)
     target = parameters.output_directory
     MultiBAMInputNode.__init__(
         self,
         config=parameters.config,
         input_bams=parameters.input_files,
         command=parameters.command.finalize(),
         description="<mapDamage (plots): %s -> '%s'>" % (summary, target),
         dependencies=parameters.dependencies,
     )
Exemplo n.º 15
0
    def __init__(
        self,
        reference,
        input_files,
        output_directory,
        title="mapDamage",
        options=None,
        dependencies=(),
    ):
        """Set up the "mapDamage (plots)" command node.

        The files in `input_files` are combined by merge_bam_files_command
        and fed to mapDamage on standard input; the expected result files
        are registered under `output_directory`.

        Arguments:
          reference -- passed to mapDamage via "-r".
          input_files -- handed to merge_bam_files_command.
          output_directory -- directory holding the registered outputs.
          title -- passed to mapDamage via "-t".
          options -- optional mapping applied with apply_options; None (the
            default) means "no extra options".
          dependencies -- forwarded unchanged to CommandNode.
        """
        # A literal `{}` default is a single dict shared by every call; use
        # None as the sentinel and create a fresh dict per instance instead.
        if options is None:
            options = {}

        merge = merge_bam_files_command(input_files)
        command = AtomicCmdBuilder(
            [
                "mapDamage",
                "--no-stats",
                # Prevent references with many contigs from using excessive
                # amounts of memory, at the cost of per-contig statistics:
                "--merge-reference-sequences",
                "-t",
                title,
                "-i",
                "-",
                "-d",
                "%(TEMP_DIR)s",
                "-r",
                "%(IN_REFERENCE)s",
            ],
            IN_STDIN=merge,
            IN_REFERENCE=reference,
            OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
            OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
            OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
            OUT_PLOT_FRAG=os.path.join(
                output_directory, "Fragmisincorporation_plot.pdf"
            ),
            OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
            OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
            OUT_MISINCORP=os.path.join(output_directory, "misincorporation.txt"),
            OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",
            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
        )

        apply_options(command, options)

        CommandNode.__init__(
            self,
            command=ParallelCmds([merge, command.finalize()]),
            description="<mapDamage (plots): %s -> '%s'>"
            % (describe_files(merge.input_files), output_directory,),
            dependencies=dependencies,
        )
Exemplo n.º 16
0
    def __init__(self, main_tree_files, support_tree_files, output_file, dependencies = ()):
        """Create the "NewickSupport" node.

        Both file arguments are coerced to tuples and stored on the
        instance; their concatenation (main trees first) forms the input
        files of the node. The description only mentions the main trees.
        """
        self._output_file = output_file
        self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
        self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

        all_inputs = self._main_tree_files + self._support_tree_files
        summary = describe_files(main_tree_files)

        Node.__init__(
            self,
            description="<NewickSupport: %s>" % (summary,),
            input_files=all_inputs,
            output_files=output_file,
            dependencies=dependencies,
        )
Exemplo n.º 17
0
    def __init__(self, parameters):
        """Create the "AdapterRM (SE)" node from a parameters object.

        When `parameters.input_files` holds more than one file, a command
        built by _build_cat_command is run in parallel with the main
        command; `_multi_file_input` records whether that is the case.
        """
        main_cmd = parameters.command.finalize()

        self._multi_file_input = len(parameters.input_files) > 1
        if self._multi_file_input:
            cat_cmd = _build_cat_command(parameters.input_files,
                                         "uncompressed_input")
            main_cmd = ParallelCmds((main_cmd, cat_cmd))

        node_description = "<AdapterRM (SE): %s -> '%s.*'>" % (
            fileutils.describe_files(parameters.input_files),
            parameters.output_prefix,
        )
        CommandNode.__init__(
            self,
            command=main_cmd,
            threads=parameters.threads,
            description=node_description,
            dependencies=parameters.dependencies,
        )
Exemplo n.º 18
0
    def __init__(self, input_files, destination, dependencies=()):
        """Create a node that concatenates and gzips `input_files`.

        A "cat" command over `input_files` writes to a pipe that is the
        standard input of "gzip", whose standard output is `destination`.
        Both commands run together via ParallelCmds.
        """
        cat_builder = factory.new("cat")
        cat_builder.add_multiple_values(input_files)
        cat_builder.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
        cat = cat_builder.finalize()

        gzip = AtomicCmd("gzip", IN_STDIN=cat, OUT_STDOUT=destination)

        summary = fileutils.describe_files(input_files)
        CommandNode.__init__(
            self,
            description="<Cat %s -> %s>" % (summary, destination),
            command=ParallelCmds((cat, gzip)),
            dependencies=dependencies,
        )
Exemplo n.º 19
0
    def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
        """Create the "NewickReroot" node.

        Arguments:
          tree_files -- input trees; coerced to a tuple and stored.
          output_file -- registered as the output of the node.
          taxa -- taxa to reroot on; coerced to a tuple and stored. When
            empty, the description reports rerooting on "midpoint".
          dependencies -- forwarded unchanged to Node.__init__.
        """
        self._output_file = output_file
        self._tree_files = safe_coerce_to_tuple(tree_files)
        self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

        reroot_on = "midpoint"
        if self._reroot_on_taxa:
            # Add the outer quotes explicitly: repr() of the joined string
            # switches to double quotes once it contains a "'", turning
            # several taxa into "a', 'b" rather than 'a', 'b'.
            reroot_on = "'%s'" % ("', '".join(sorted(self._reroot_on_taxa)),)

        description = "<NewickReroot (on %s): %s>" % \
            (reroot_on, describe_files(tree_files),)

        Node.__init__(self,
                      description=description,
                      input_files=self._tree_files,
                      output_files=self._output_file,
                      dependencies=dependencies)
Exemplo n.º 20
0
    def __init__(self, config, input_files, output_file, dependencies=()):
        """Create the "DuplicateHistogram" node running "duphist".

        The command takes MultiBAMInputNode.PIPE_FILE as its input value
        and its standard output is registered as `output_file`.
        """
        bam_files = safe_coerce_to_tuple(input_files)

        cmd = factory.new("duphist")
        cmd.add_value('%(TEMP_IN_BAM)s')
        cmd.set_kwargs(OUT_STDOUT=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
        cmd.add_multiple_kwargs(bam_files)

        node_description = "<DuplicateHistogram: %s -> %r>" % (
            describe_files(bam_files),
            output_file,
        )
        MultiBAMInputNode.__init__(
            self,
            config=config,
            input_bams=bam_files,
            command=cmd.finalize(),
            description=node_description,
            dependencies=dependencies,
        )
Exemplo n.º 21
0
    def __init__(self, input_files, output_file, offset, dependencies=()):
        """Create the "Validate FASTQ Files" node.

        Arguments:
          input_files -- mapping of read type to filename; the filename of
            a "Paired" entry is a template that is expanded with
            `.format(Pair=1)` and `.format(Pair=2)`.
          output_file -- registered as the output of the node.
          offset -- stored on the instance as `_offset`.
          dependencies -- forwarded unchanged to Node.__init__.
        """
        self._offset = offset
        self._files = set()
        # .items() rather than .iteritems(): the latter only exists on
        # Python 2 dicts, while .items() is available on both 2 and 3.
        for (read_type, filename) in input_files.items():
            if read_type == "Paired":
                self._files.add((read_type, filename.format(Pair=1)))
                self._files.add((read_type, filename.format(Pair=2)))
            else:
                self._files.add((read_type, filename))

        # Only the (expanded) filenames are registered as node inputs
        input_files = [filename for _, filename in self._files]
        Node.__init__(self,
                      description="<Validate FASTQ Files: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)
Exemplo n.º 22
0
    def __init__(self, input_files, output_file, offset, dependencies=()):
        """Set up the "Validate FASTQ Files" node.

        Arguments:
          input_files -- mapping of read type to filename; for the "Paired"
            read type the filename is a template expanded once with
            `Pair=1` and once with `Pair=2`.
          output_file -- registered as the output of the node.
          offset -- stored on the instance as `_offset`.
          dependencies -- forwarded unchanged to Node.__init__.
        """
        self._offset = offset
        self._files = set()
        # dict.iteritems() does not exist on Python 3; .items() works on
        # both Python 2 and Python 3.
        for (read_type, filename) in input_files.items():
            if read_type == "Paired":
                self._files.add((read_type, filename.format(Pair=1)))
                self._files.add((read_type, filename.format(Pair=2)))
            else:
                self._files.add((read_type, filename))

        # Only the (expanded) filenames are registered as node inputs
        input_files = [filename for _, filename in self._files]
        Node.__init__(self,
                      description="<Validate FASTQ Files: %s>" %
                      (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)
Exemplo n.º 23
0
    def __init__(self, input_files, destination, dependencies=()):
        """Set up a node that runs "cat" over `input_files` into "gzip".

        The "cat" command writes to a pipe which "gzip" reads as standard
        input; the compressed stream is written to `destination`.
        """
        builder = factory.new("cat")
        builder.add_multiple_values(input_files)
        builder.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
        cat_cmd = builder.finalize()

        gzip_cmd = AtomicCmd("gzip", IN_STDIN=cat_cmd, OUT_STDOUT=destination)

        node_description = "<Cat %s -> %s>" % (
            fileutils.describe_files(input_files),
            destination,
        )
        CommandNode.__init__(
            self,
            description=node_description,
            command=ParallelCmds((cat_cmd, gzip_cmd)),
            dependencies=dependencies,
        )
Exemplo n.º 24
0
    def __init__(self, tree_files, output_file, taxa = (), dependencies = ()):
        """Set up the "NewickReroot" node.

        Arguments:
          tree_files -- input trees; coerced to a tuple and stored.
          output_file -- registered as the output of the node.
          taxa -- taxa to reroot on; coerced to a tuple and stored. When
            empty, the description reports rerooting on "midpoint".
          dependencies -- forwarded unchanged to Node.__init__.
        """
        self._output_file    = output_file
        self._tree_files     = safe_coerce_to_tuple(tree_files)
        self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

        reroot_on = "midpoint"
        if self._reroot_on_taxa:
            # repr() of the joined string picks double quotes as soon as the
            # text contains a "'", so multiple taxa were shown as "a', 'b";
            # adding the outer quotes by hand yields the intended 'a', 'b'.
            reroot_on = "'%s'" % ("', '".join(sorted(self._reroot_on_taxa)),)

        description  = "<NewickReroot (on %s): %s>" % \
          (reroot_on, describe_files(tree_files),)

        Node.__init__(self,
                      description  = description,
                      input_files  = self._tree_files,
                      output_files = self._output_file,
                      dependencies = dependencies)
Exemplo n.º 25
0
    def __init__(self,
                 main_tree_files,
                 support_tree_files,
                 output_file,
                 dependencies=()):
        """Set up the "NewickSupport" node.

        The main and support tree files are each coerced to a tuple and
        stored; together (main trees first) they are the node's input
        files. Only the main trees appear in the description.
        """
        main_trees = safe_coerce_to_tuple(main_tree_files)
        support_trees = safe_coerce_to_tuple(support_tree_files)

        self._output_file = output_file
        self._main_tree_files = main_trees
        self._support_tree_files = support_trees

        node_description = \
            "<NewickSupport: %s>" % (describe_files(main_tree_files),)

        Node.__init__(
            self,
            description=node_description,
            input_files=main_trees + support_trees,
            output_files=output_file,
            dependencies=dependencies,
        )
Exemplo n.º 26
0
    def __init__(
        self,
        reference,
        input_files,
        output_file,
        directory,
        options=None,
        dependencies=(),
    ):
        """Set up the "mapDamage (rescale)" command node.

        The files in `input_files` are combined by merge_bam_files_command
        and fed to "mapDamage --rescale-only" on standard input; the
        rescaled output is written to `output_file` ("--rescale-out").

        Arguments:
          reference -- passed to mapDamage via "-r".
          input_files -- handed to merge_bam_files_command.
          output_file -- registered as the OUT_BAM output of the command.
          directory -- stored on the instance as `_directory`.
          options -- optional mapping applied with apply_options; None (the
            default) means "no extra options".
          dependencies -- forwarded unchanged to CommandNode.
        """
        # A literal `{}` default is a single dict shared by every call; use
        # None as the sentinel and create a fresh dict per instance instead.
        if options is None:
            options = {}

        stats_out_fname = "Stats_out_MCMC_correct_prob.csv"

        merge = merge_bam_files_command(input_files)
        command = AtomicCmdBuilder(
            [
                "mapDamage",
                "--rescale-only",
                "-i",
                "-",
                "-d",
                "%(TEMP_DIR)s",
                "-r",
                "%(IN_REFERENCE)s",
                "--rescale-out",
                "%(OUT_BAM)s",
            ],
            IN_STDIN=merge,
            IN_REFERENCE=reference,
            TEMP_OUT_LOG="Runtime_log.txt",
            TEMP_OUT_CSV=stats_out_fname,
            OUT_BAM=output_file,
            CHECK_VERSION=MAPDAMAGE_VERSION,
        )

        apply_options(command, options)

        self._directory = directory

        CommandNode.__init__(
            self,
            command=ParallelCmds([merge, command.finalize()]),
            description="<mapDamage (rescale): %s -> %r>"
            % (describe_files(merge.input_files), output_file,),
            dependencies=dependencies,
        )
Exemplo n.º 27
0
    def __init__(self,
                 input_files,
                 output_file,
                 offset,
                 collapsed=False,
                 dependencies=()):
        """Create the "Validate FASTQ Files" command node.

        Runs the ":validate_fastq" command with "--offset" set to `offset`
        (plus "--collapsed" when `collapsed` is true) on `input_files`;
        standard output is registered as `output_file`.
        """
        validate = factory.new(":validate_fastq")
        validate.set_option("--offset", offset)
        if collapsed:
            validate.set_option("--collapsed")
        validate.add_multiple_values(input_files)
        validate.set_kwargs(OUT_STDOUT=output_file)

        node_description = \
            "<Validate FASTQ Files: %s>" % describe_files(input_files)
        CommandNode.__init__(
            self,
            description=node_description,
            command=validate.finalize(),
            dependencies=dependencies,
        )
Exemplo n.º 28
0
    def __init__(self,
                 config,
                 input_bams,
                 output_bam,
                 keep_dupes=True,
                 dependencies=()):
        """Create the "FilterCollapsedBAM" node running "rmdup_collapsed".

        The command built by merge_bam_files_command(input_bams) is the
        standard input of "rmdup_collapsed", whose standard output is
        registered as `output_bam`. Unless `keep_dupes` is true,
        "--remove-duplicates" is added. `config` is not used here.
        """
        merge_cmd = merge_bam_files_command(input_bams)

        rmdup = factory.new("rmdup_collapsed")
        rmdup.set_kwargs(IN_STDIN=merge_cmd, OUT_STDOUT=output_bam)
        if not keep_dupes:
            rmdup.set_option("--remove-duplicates")

        summary = describe_files(merge_cmd.input_files)
        CommandNode.__init__(
            self,
            command=ParallelCmds([merge_cmd, rmdup.finalize()]),
            description="<FilterCollapsedBAM: %s>" % (summary,),
            dependencies=dependencies,
        )
Exemplo n.º 29
0
    def __init__(self,
                 config,
                 target_name,
                 input_files,
                 output_file,
                 prefix,
                 regions_file=None,
                 dependencies=()):
        """Create the "DepthHistogram" node running the "depths" command.

        The command reads MultiBAMInputNode.PIPE_FILE and writes
        `output_file`, with "--target-name" set to `target_name`. When
        `regions_file` is given, it is passed via "--regions-file" and an
        index file (PIPE_FILE with its extension swapped for
        `prefix['IndexFormat']`) is registered as a temporary input.
        """
        bam_files = safe_coerce_to_tuple(input_files)
        # Index format is only looked up when a regions file is in use;
        # otherwise the (falsy) `regions_file` value itself is passed on.
        index_format = prefix['IndexFormat'] if regions_file else regions_file

        cmd = factory.new("depths")
        for placeholder in ("%(TEMP_IN_BAM)s", "%(OUT_FILE)s"):
            cmd.add_value(placeholder)
        cmd.set_option("--target-name", target_name)
        cmd.set_kwargs(OUT_FILE=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
        cmd.add_multiple_kwargs(bam_files)

        if regions_file:
            cmd.set_option('--regions-file', '%(IN_REGIONS)s')
            cmd.set_kwargs(
                IN_REGIONS=regions_file,
                TEMP_IN_INDEX=swap_ext(MultiBAMInputNode.PIPE_FILE,
                                       index_format))

        node_description = "<DepthHistogram: %s -> '%s'>" % (
            describe_files(bam_files),
            output_file,
        )

        MultiBAMInputNode.__init__(
            self,
            config=config,
            input_bams=bam_files,
            index_format=index_format,
            command=cmd.finalize(),
            description=node_description,
            dependencies=dependencies,
        )
Exemplo n.º 30
0
    def __init__(
            self,
            config,
            input_bams,
            output_bam,
            output_metrics=None,
            keep_dupes=False,
            dependencies=(),
    ):
        """Create the "MarkDuplicates" node from a picard_command builder.

        Writes `output_bam` and a metrics file; when `output_metrics` is
        not given, it defaults to `output_bam` with its extension swapped
        for ".metrics". Unless `keep_dupes` is true, "REMOVE_DUPLICATES" is
        set to "True".
        """
        picard = picard_command(config, "MarkDuplicates")
        _set_max_open_files(picard, "MAX_FILE_HANDLES")

        for key, value in (
                ("OUTPUT", "%(OUT_BAM)s"),
                ("METRICS_FILE", "%(OUT_METRICS)s"),
                # Validation is mostly left to manual ValidateSamFile runs;
                # required because .csi indexed BAM records can have
                # "invalid" bins.
                ("VALIDATION_STRINGENCY", "LENIENT"),
        ):
            picard.set_option(key, value, sep="=")
        picard.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Remove duplicates from output by default to save disk-space
            picard.set_option("REMOVE_DUPLICATES", "True",
                              sep="=", fixed=False)

        metrics_file = output_metrics or swap_ext(output_bam, ".metrics")
        picard.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=metrics_file)

        summary = describe_files(input_bams)
        PicardNode.__init__(
            self,
            command=picard.finalize(),
            description="<MarkDuplicates: %s>" % (summary,),
            dependencies=dependencies,
        )
Exemplo n.º 31
0
def test_describe_files__no_files():
    # An empty collection of paths is described as "No files"
    observed = describe_files(())
    assert_equal(observed, "No files")
Exemplo n.º 32
0
def test_describe_files__same_path_abs__1_differences():
    # Two absolute paths differing in a single character are collapsed
    # into one quoted path with '?' at the differing position
    observed = describe_files(("/var/foo/faz", "/var/foo/fao"))
    assert_equal(observed, "'/var/foo/fa?'")
Exemplo n.º 33
0
def test_describe_files__same_path_abs__3_differences():
    # Filenames differing in three characters are only counted, together
    # with the shared absolute directory
    observed = describe_files(("/var/foo/bar", "/var/foo/foo"))
    assert_equal(observed, "2 files in '/var/foo'")
Exemplo n.º 34
0
def test_describe_files__different_paths_rel():
    # Relative paths in different directories are only counted
    observed = describe_files(("var/foo/bar", "var/bar/foo"))
    assert_equal(observed, "2 files")
Exemplo n.º 35
0
def test_describe_files__same_path_rel():
    # Relative paths sharing a directory are counted and the shared
    # directory is reported
    observed = describe_files(("var/foo/bar", "var/foo/foo"))
    assert_equal(observed, "2 files in 'var/foo'")
Exemplo n.º 36
0
def test_describe_files__no_files():
    # Describing an empty tuple yields the text "No files"
    no_paths = ()
    assert_equal(describe_files(no_paths), "No files")
Exemplo n.º 37
0
def test_describe_files__iterable():
    # A one-shot iterator is accepted in place of a sequence of paths
    path_iterator = iter(("/var/foo/bar", "/var/foo/foo"))
    observed = describe_files(path_iterator)
    assert_equal(observed, "2 files in '/var/foo'")
Exemplo n.º 38
0
def test_describe_files__same_path_abs__3_differences():
    # Two files in the same absolute directory whose names differ in three
    # characters are summarized by count and directory
    expected = "2 files in '/var/foo'"
    assert_equal(describe_files(("/var/foo/bar", "/var/foo/foo")), expected)
Exemplo n.º 39
0
def test_describe_files__single_file():
    # A single path is described by its repr()
    only_path = "/var/foo/bar"
    observed = describe_files((only_path,))
    assert_equal(observed, repr(only_path))
Exemplo n.º 40
0
def test_describe_files__same_path_abs__1_differences():
    # A single differing character is replaced by '?' in the description
    expected = "'/var/foo/fa?'"
    assert_equal(describe_files(("/var/foo/faz", "/var/foo/fao")), expected)
Exemplo n.º 41
0
def test_describe_files__same_path_rel():
    # Files sharing a relative directory: count plus that directory
    expected = "2 files in 'var/foo'"
    assert_equal(describe_files(("var/foo/bar", "var/foo/foo")), expected)
Exemplo n.º 42
0
def test_describe_files__different_paths_rel():
    # Files in different relative directories: only the count is given
    expected = "2 files"
    assert_equal(describe_files(("var/foo/bar", "var/bar/foo")), expected)
Exemplo n.º 43
0
def test_describe_files__iterable():
    # The paths may be supplied as an iterator rather than a tuple
    expected = "2 files in '/var/foo'"
    lazy_paths = iter(("/var/foo/bar", "/var/foo/foo"))
    assert_equal(describe_files(lazy_paths), expected)
Exemplo n.º 44
0
def test_describe_files__single_file():
    # With exactly one path, the description is the repr() of that path
    single = "/var/foo/bar"
    expected = repr(single)
    assert_equal(describe_files((single, )), expected)