Пример #1
0
def concatenate_input_bams(config, input_bams, out=AtomicCmd.PIPE):
    """Merge several input BAMs on the fly, or pass a lone BAM through.

    Returns a pair (nodes, source). 'nodes' is a list holding zero or one
    command, and 'source' is the object to hand to the IN_STDIN of an
    AtomicCmd (a filename, or the merge command itself). When exactly one
    input file is given no merge command is built, so no overhead is added.
    """
    bam_files = safe_coerce_to_tuple(input_bams)
    if len(bam_files) == 1:
        # Nothing to merge; the file itself serves as the input
        return [], bam_files[0]

    jar_file = os.path.join(config.jar_root, "MergeSamFiles.jar")
    builder = AtomicJavaCmdBuilder(config, jar_file)
    builder.set_kwargs(CHECK_JAR=_picard_version(jar_file))

    # When piping, the merged BAM is written to stdout instead of a file
    if out == AtomicCmd.PIPE:
        builder.set_kwargs(OUT_STDOUT=out)
        destination = "/dev/stdout"
    else:
        destination = out
    builder.set_option("OUTPUT", destination, sep="=")

    builder.set_option("CREATE_INDEX", "False", sep="=")
    builder.set_option("COMPRESSION_LEVEL", 0, sep="=")

    numbered_bams = enumerate(safe_coerce_to_tuple(bam_files), start=1)
    for (number, bam_file) in numbered_bams:
        key = "IN_BAM_%02i" % number
        builder.add_option("I", "%%(%s)s" % key, sep="=")
        builder.set_kwargs(**{key: bam_file})

    builder.set_option("SO", "coordinate", sep="=", fixed=False)

    merge_cmd = builder.finalize()
    return [merge_cmd], merge_cmd
Пример #2
0
def concatenate_input_bams(config, input_bams, out = AtomicCmd.PIPE):
    """Concatenate input BAMs transparently.

    The return value is a tuple of (list of nodes, input object). The list
    contains either no commands or a single merge command; the input object
    is a filename or an AtomicCmd, suitable for the IN_STDIN of another
    AtomicCmd. A single input file is returned as is, without any merging.
    """
    files = safe_coerce_to_tuple(input_bams)
    if len(files) == 1:
        return [], files[0]

    jar_file = os.path.join(config.jar_root, "MergeSamFiles.jar")
    merge = AtomicJavaCmdBuilder(config, jar_file)
    merge.set_kwargs(CHECK_JAR=_picard_version(jar_file))

    write_to_pipe = (out == AtomicCmd.PIPE)
    if write_to_pipe:
        merge.set_kwargs(OUT_STDOUT=out)
    merge.set_option("OUTPUT", "/dev/stdout" if write_to_pipe else out,
                     sep="=")

    merge.set_option("CREATE_INDEX", "False", sep="=")
    merge.set_option("COMPRESSION_LEVEL", 0, sep="=")

    # Inputs are numbered from 1; each gets its own IN_BAM_xx keyword
    for (position, path) in enumerate(safe_coerce_to_tuple(files), start=1):
        kwarg_name = "IN_BAM_%02i" % position
        merge.add_option("I", "%(" + kwarg_name + ")s", sep="=")
        merge.set_kwargs(**{kwarg_name: path})

    merge.set_option("SO", "coordinate", sep="=", fixed=False)

    finalized = merge.finalize()
    return [finalized], finalized
Пример #3
0
 def add_nodes(self, *nodes):
     """Append one or more Node objects to self._nodes.

     Each positional argument may itself be a collection of nodes; both the
     argument list and every argument are passed through
     safe_coerce_to_tuple before being examined.

     Raises TypeError if any item is not a Node instance. All items are
     checked before anything is stored, so a failing call leaves
     self._nodes unchanged instead of partially updated.
     """
     validated = []
     for subnodes in safe_coerce_to_tuple(nodes):
         for node in safe_coerce_to_tuple(subnodes):
             if not isinstance(node, Node):
                 raise TypeError("Node object expected, recieved %s" %
                                 repr(node))
             validated.append(node)

     # Only mutate state once every item is known to be valid
     self._nodes.extend(validated)
Пример #4
0
 def add_nodes(self, *nodes):
     """Add the given Node objects to self._nodes.

     Arguments may be single nodes or collections of nodes; the argument
     list and each argument are normalised with safe_coerce_to_tuple.

     Raises TypeError when an item is not a Node. Validation happens before
     any item is stored, so an invalid item no longer results in the items
     preceding it being added while the rest are dropped.
     """
     accepted = []
     for subnodes in safe_coerce_to_tuple(nodes):
         for node in safe_coerce_to_tuple(subnodes):
             if not isinstance(node, Node):
                 raise TypeError("Node object expected, recieved %s"
                                 % repr(node))
             accepted.append(node)

     # All items passed the type check; commit them in one step
     self._nodes.extend(accepted)
Пример #5
0
    def __init__(self, main_tree_files, support_tree_files, output_file, dependencies = ()):
        """Create a node taking main trees and support trees as input.

        Both file arguments are normalised with safe_coerce_to_tuple; the
        node's input files are the main tree files followed by the support
        tree files, and 'output_file' is its output.
        """
        self._output_file = output_file
        main_trees = safe_coerce_to_tuple(main_tree_files)
        support_trees = safe_coerce_to_tuple(support_tree_files)
        self._main_tree_files = main_trees
        self._support_tree_files = support_trees

        all_inputs = main_trees + support_trees
        # The description only lists the main tree files
        summary = "<NewickSupport: %s>" % (describe_files(main_tree_files),)

        Node.__init__(self,
                      description=summary,
                      input_files=all_inputs,
                      output_files=output_file,
                      dependencies=dependencies)
Пример #6
0
    def customize(cls,
                  config,
                  input_bams,
                  output_bam,
                  output_metrics=None,
                  dependencies=()):
        """Build a customizable MarkDuplicates.jar command.

        Returns a dict with the (unfinalized) command builder under
        "command" and the given 'dependencies' under "dependencies". If
        'output_metrics' is not given, the metrics file is 'output_bam' with
        its extension swapped for ".metrics".
        """
        jar_file = os.path.join(config.jar_root, "MarkDuplicates.jar")
        builder = AtomicJavaCmdBuilder(config, jar_file)

        # A .bai index is always written, as many other programs need one
        builder.set_option("CREATE_INDEX", "True", sep="=")

        builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        builder.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")

        for (number, bam_file) in enumerate(safe_coerce_to_tuple(input_bams)):
            key = "IN_BAM_%02i" % number
            builder.add_option("I", "%%(%s)s" % key, sep="=")
            builder.set_kwargs(**{key: bam_file})

        # Duplicates are removed by default to save disk-space; the option is
        # not fixed, so it can still be overridden
        builder.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

        file_kwargs = {
            "OUT_BAM": output_bam,
            "OUT_BAI": swap_ext(output_bam, ".bai"),
            "OUT_METRICS": output_metrics or swap_ext(output_bam, ".metrics"),
            "CHECK_JAR": _picard_version(jar_file),
        }
        builder.set_kwargs(**file_kwargs)

        return {"command": builder, "dependencies": dependencies}
Пример #7
0
    def __init__(self, config, prefixes, name):
        """Record the name and prefixes, and group the prefix nodes.

        'prefixes' is normalised with safe_coerce_to_tuple; the node of each
        prefix becomes a dependency of the "Alignments:" MetaNode.
        """
        self.name = name
        self.prefixes = safe_coerce_to_tuple(prefixes)

        prefix_nodes = []
        for prefix in self.prefixes:
            prefix_nodes.append(prefix.node)

        self._nodes_alignment = MetaNode(description="Alignments:",
                                         dependencies=prefix_nodes)
        self._nodes_extras = {}
Пример #8
0
    def __init__(self, config, prefix, samples, features, target):
        """Collect the BAMs of all samples for one prefix.

        If "Raw BAM" and/or "Realigned BAM" is listed in 'features', the
        corresponding final BAMs are built from the sample BAMs and exposed
        as self.bams; otherwise self.bams is simply the union of the sample
        BAMs. self.node is a MetaNode depending on every sample node.
        """
        self.name = prefix["Name"]
        self.label = prefix.get("Label") or self.name
        self.reference = prefix["Reference"]
        self.aoi = prefix.get("AreasOfInterest", {})

        self.samples = safe_coerce_to_tuple(samples)
        self.bams = {}
        self.folder = config.destination
        self.target = target

        files_and_nodes = {}
        for sample in self.samples:
            for (filename, node) in sample.bams.iteritems():
                files_and_nodes[filename] = node

        # Feature name -> function building the corresponding final BAM(s)
        builders = (("Raw BAM", self._build_raw_bam),
                    ("Realigned BAM", self._build_realigned_bam))
        for (feature, build_func) in builders:
            if feature in features:
                self.bams.update(build_func(config, prefix, files_and_nodes))

        sample_nodes = [sample.node for sample in self.samples]
        if self.bams:
            self.node = MetaNode(description="Final BAMs: %s" % prefix["Name"],
                                 subnodes=self.bams.values(),
                                 dependencies=sample_nodes)
        else:
            # No final BAMs were requested; expose the sample BAMs directly
            for sample in self.samples:
                self.bams.update(sample.bams)

            self.node = MetaNode(description="Prefix: %s" % prefix["Name"],
                                 dependencies=sample_nodes)
Пример #9
0
def _append_aln_user_parameters(mkfile_params, lst):
    """Append user-supplied command-line options to 'lst' (in place).

    Only keys of 'mkfile_params' starting with "-" are used. Each value is
    passed through safe_coerce_to_tuple; for every resulting item the option
    name is appended, followed by the item itself unless it is None.
    """
    for (option, raw_values) in mkfile_params.iteritems():
        if not option.startswith("-"):
            continue

        for item in safe_coerce_to_tuple(raw_values):
            lst.append(option)
            if item is not None:
                lst.append(item)
Пример #10
0
    def __init__(self, config, target, prefix, lanes, name):
        """Build the nodes for one library from its lanes.

        'lanes' is normalised with safe_coerce_to_tuple. All lanes must share
        the same parent folder and the same options; this is enforced with
        assertions.

        After construction, self.bams and self.mapdamage hold the results of
        _build_mapdamage_nodes, self.duphist the duplication histogram
        nodes, and self.node a MetaNode depending on the BAM nodes plus the
        data-duplication check node.
        """
        self.name = name
        self.lanes = safe_coerce_to_tuple(lanes)
        # Index the coerced tuple, not the raw argument: 'lanes' itself is not
        # guaranteed to be indexable, and 'folder' below already uses the tuple
        first_lane = self.lanes[0]
        self.options = first_lane.options
        self.folder = os.path.dirname(first_lane.folder)
        self.bams = None
        self.mapdamage = None

        assert all((self.folder == os.path.dirname(lane.folder))
                   for lane in self.lanes)
        assert all((self.options == lane.options) for lane in self.lanes)

        lane_bams = self._collect_bams_by_type(self.lanes)
        self.datadup_check = self._build_dataduplication_node(lane_bams)
        self.duphist = \
            self._build_duphist_nodes(config, target, prefix, lane_bams)
        pcr_duplicates = self.options["PCRDuplicates"]
        if pcr_duplicates:
            lane_bams = self._remove_pcr_duplicates(config, prefix, lane_bams,
                                                    pcr_duplicates)

        # At this point we no longer need to differentiate between types of reads
        files_and_nodes = self._collect_files_and_nodes(lane_bams)

        self.bams, self.mapdamage = \
            self._build_mapdamage_nodes(config, target, prefix, files_and_nodes)

        self.node = MetaNode(
            description="Library: %s" % os.path.basename(self.folder),
            dependencies=self.bams.values() + [self.datadup_check])
Пример #11
0
    def __init__(self, config, reference, intervals, infiles, outfile, dependencies = ()):
        """Set up GATK IndelRealigner together with 'samtools calmd'.

        The IndelRealigner command takes 'reference', its ".dict" file and
        'intervals' as inputs, and writes to 'outfile'. The calmd command
        reads a temporary file named after the basename of 'outfile' and
        sends its stdout to a temporary "<basename>.calmd" file. Both
        commands are run as ParallelCmds.
        """
        self._basename = os.path.basename(outfile)
        infiles = safe_coerce_to_tuple(infiles)

        jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        realigner = AtomicJavaCmdBuilder(config, jar_file)

        key_value_options = (
            ("-T", "IndelRealigner"),
            ("-R", "%(IN_REFERENCE)s"),
            ("-targetIntervals", "%(IN_INTERVALS)s"),
            ("-o", "%(OUT_BAMFILE)s"),
            ("--bam_compression", 0),
        )
        for (option, value) in key_value_options:
            realigner.set_option(option, value)
        realigner.set_option("--disable_bam_indexing")
        _set_input_files(realigner, infiles)

        realigner.set_kwargs(IN_REFERENCE=reference,
                             IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                             IN_INTERVALS=intervals,
                             OUT_BAMFILE=outfile)

        calmd_call = ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
        calmd = AtomicCmd(calmd_call,
                          TEMP_IN_BAM=self._basename,
                          IN_REF=reference,
                          TEMP_OUT_STDOUT=self._basename + ".calmd")

        summary = "<Indel Realign: %i file(s) -> '%s'>" % (len(infiles), outfile)
        commands = ParallelCmds([realigner.finalize(), calmd])

        CommandNode.__init__(self,
                             description=summary,
                             command=commands,
                             dependencies=dependencies)
Пример #12
0
def _append_aln_user_parameters(mkfile_params, lst):
    """Extend 'lst' with the command-line options found in 'mkfile_params'.

    Entries whose key does not start with "-" are ignored. Values are
    normalised with safe_coerce_to_tuple, and the option name is repeated
    once per item; an item that is None contributes the name only.
    """
    for (name, values) in mkfile_params.iteritems():
        if name.startswith("-"):
            for entry in safe_coerce_to_tuple(values):
                if entry is None:
                    lst.append(name)
                else:
                    lst.extend((name, entry))
Пример #13
0
    def __init__(self, config, target_name, input_files, output_file, intervals_file = None, print_stats = False, max_contigs = _MAX_CONTIGS, dependencies = ()):
        """Set up a depth histogram node for one or more BAM files.

        The node's input files are the BAMs, their ".bai" counterparts (via
        swap_ext) and, if given, the intervals file. The required
        executable is "coverageBed" when an intervals file is used and
        "genomeCoverageBed" otherwise, plus whatever the commands returned
        by concatenate_input_bams require.
        """
        self._target_name = target_name
        self._input_files = safe_coerce_to_tuple(input_files)
        self._output_file = output_file
        self._intervals = intervals_file
        self._print_stats = print_stats
        self._max_contigs = max_contigs
        self._max_contigs_reached = False

        bam_files = self._input_files
        required_files = list(bam_files)
        required_files += [swap_ext(bam_file, ".bai") for bam_file in bam_files]
        if intervals_file:
            required_files.append(intervals_file)

        if intervals_file:
            executables = ["coverageBed"]
        else:
            executables = ["genomeCoverageBed"]

        # Requirements of the (optional) BAM merge command are inherited
        auxiliary_files = []
        merge_commands = concatenate_input_bams(config, bam_files)[0]
        for merge_command in merge_commands:
            executables.extend(merge_command.executables)
            auxiliary_files.extend(merge_command.auxiliary_files)

        summary = "<DepthHistogram: %s -> '%s'>" \
            % (describe_files(bam_files), self._output_file)

        Node.__init__(self,
                      description=summary,
                      input_files=required_files,
                      output_files=self._output_file,
                      dependencies=dependencies,
                      executables=executables,
                      auxiliary_files=auxiliary_files)
Пример #14
0
    def __init__(self, config, input_bams, pipename="input.bam"):
        """Prepare the input for a command reading one or more BAM files.

        'input_bams' is normalised with safe_coerce_to_tuple and stored as
        self.files. With more than one file, a MergeSamFiles.jar command is
        built that writes the merged BAM to the temporary file 'pipename',
        and is stored in self.commands. With a single file no command is
        built; instead the file and its ".bai" are added to self.kwargs.

        self.kwargs always contains TEMP_IN_BAM, set to 'pipename'.
        """
        self.pipe = pipename
        self.files = safe_coerce_to_tuple(input_bams)

        self.commands = []
        self.kwargs = {"TEMP_IN_BAM": self.pipe}
        if len(self.files) > 1:
            jar_file = os.path.join(config.jar_root, "MergeSamFiles.jar")
            params = AtomicJavaCmdBuilder(jar=jar_file,
                                          temp_root=config.temp_root,
                                          jre_options=config.jre_options)

            params.set_option("SO", "coordinate", sep="=", fixed=False)
            params.set_option("CREATE_INDEX", "False", sep="=")
            params.set_option("COMPRESSION_LEVEL", 0, sep="=")
            params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
            # Use the coerced tuple rather than the raw argument: if
            # 'input_bams' was a one-shot iterable it has already been
            # consumed when building self.files
            params.add_multiple_options("I", self.files, sep="=")

            params.set_kwargs(CHECK_JAR=_picard_version(config, jar_file),
                              TEMP_OUT_BAM=self.pipe)

            self.commands = [params.finalize()]
        else:
            # Ensure that the actual command depends on the input
            self.kwargs["IN_FILE_00"] = self.files[0]
            self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
Пример #15
0
    def add_support(self, bootstraps, fmt = "{Support}"):
        """Adds support values to the current tree, based on a set of trees
        containing the same taxa. The support trees are assumed to be
        unrooted or arbitrarily rooted, and no weight is given to the rooted
        topology of these trees.

        The main tree should itself be rooted; the topology and ordering of
        this tree is preserved, with node-names updated using the formatting
        string 'fmt'.

        Formatting is carried out using str.format, with these fields:
          {Support}    -- The total number of trees in which a clade is supported.
          {Percentage} -- The percentage of trees in which a clade is supported (float).
          {Fraction}   -- The fraction of trees in which a clade is supported (float).

        For example, typical percentage support-values can be realized by
        setting 'fmt' to the value "{Percentage:.0f}" to produce integer values.

        Raises NewickError if this tree contains duplicate leaf names, or if
        a support tree does not have exactly the same set of leaf names.
        """
        all_leaves = list(self.get_leaf_names())
        leaf_names = frozenset(all_leaves)
        # A frozenset drops repeats, so differing sizes imply duplicates
        if len(leaf_names) != len(all_leaves):
            raise NewickError("Cannot add support values to trees with duplicate leaf names")

        bootstraps = safe_coerce_to_tuple(bootstraps)
        clade_counts = {}
        for tree in bootstraps:
            if leaf_names != frozenset(tree.get_leaf_names()):
                raise NewickError("Support tree does not contain same set of leaf nodes")

            # Tally the number of support trees in which each clade occurs
            for clade in _NewickGraph(tree).get_clade_names():
                if clade in clade_counts:
                    clade_counts[clade] += 1
                else:
                    clade_counts[clade] = 1

        return self._add_support(self, len(bootstraps), clade_counts, fmt)
Пример #16
0
    def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
                 dependencies=(), file_dependencies=()):
        """Set up a node writing '<out_prefix>.phy' and '<out_prefix>.partitions'.

        'infiles' must have the following structure:
            {names : {"partitions" : ..., "filenames" : [...]}}

        Raises TypeError if 'infiles' is not a dictionary of dictionaries,
        and ValueError if a sub-dictionary contains keys outside of
        _VALID_KEYS or if its "filenames" value is not a list.
        """
        is_dict_of_dicts = isinstance(infiles, dict) \
            and all(isinstance(dd, dict) for dd in infiles.values())
        if not is_dict_of_dicts:
            raise TypeError("'infiles' must be a dictionary of dictionaries")

        input_filenames = []
        for (name, subdd) in infiles.iteritems():
            unknown_keys = set(subdd) - _VALID_KEYS
            if unknown_keys:
                raise ValueError("Invalid keys found for %r: %s"
                                 % (name, ", ".join(unknown_keys)))

            filenames = subdd["filenames"]
            if not isinstance(filenames, list):
                raise ValueError("filenames must be a list of strings")
            input_filenames.extend(filenames)

        # Optional file dependencies; used to depend on the list of sequences
        input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

        self._reduce = bool(reduce)
        # Deep copy, so that later changes by the caller do not affect us
        self._infiles = copy.deepcopy(infiles)
        self._out_prefix = out_prefix
        self._excluded = safe_coerce_to_frozenset(exclude_groups)

        reducing = " (reducing)" if reduce else ""
        summary = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
            (reducing, len(infiles), out_prefix)
        output_filenames = [out_prefix + ".phy", out_prefix + ".partitions"]

        Node.__init__(self,
                      description=summary,
                      input_files=input_filenames,
                      output_files=output_filenames,
                      dependencies=dependencies)
Пример #17
0
    def __init__(self, config, reference, intervals, infiles, outfile,
                 dependencies=()):
        """Set up GATK IndelRealigner together with 'samtools calmd'.

        IndelRealigner is given 'reference' (plus its ".dict" file) and
        'intervals', and writes to 'outfile'; calmd reads a temporary file
        named after the basename of 'outfile' and writes its stdout to a
        temporary "<basename>.calmd" file. Version checks are attached to
        both commands, which are run as ParallelCmds.
        """
        self._basename = os.path.basename(outfile)
        infiles = safe_coerce_to_tuple(infiles)

        jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        realigner = AtomicJavaCmdBuilder(jar_file,
                                         jre_options=config.jre_options)

        for (option, value) in (("-T", "IndelRealigner"),
                                ("-R", "%(IN_REFERENCE)s"),
                                ("-targetIntervals", "%(IN_INTERVALS)s"),
                                ("-o", "%(OUT_BAMFILE)s"),
                                ("--bam_compression", 0)):
            realigner.set_option(option, value)
        realigner.set_option("--disable_bam_indexing")
        _set_input_files(realigner, infiles)

        realigner.set_kwargs(
            IN_REFERENCE=reference,
            IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
            IN_INTERVALS=intervals,
            OUT_BAMFILE=outfile,
            CHECK_GATK=_get_gatk_version_check(config))

        calmd_call = ["samtools", "calmd", "-b",
                      "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
        calmd = AtomicCmd(calmd_call,
                          TEMP_IN_BAM=self._basename,
                          IN_REF=reference,
                          TEMP_OUT_STDOUT=self._basename + ".calmd",
                          CHECK_VERSION=SAMTOOLS_VERSION)

        summary = "<Indel Realigner (aligning): %s -> %r>" \
            % (describe_files(infiles), outfile)
        commands = ParallelCmds([realigner.finalize(), calmd])

        CommandNode.__init__(self,
                             description=summary,
                             command=commands,
                             dependencies=dependencies)
Пример #18
0
    def customize(cls, config, input_bams, output_bam, output_metrics = None, dependencies = ()):
        """Return a customizable MarkDuplicates.jar command.

        The result is a dict with the unfinalized command builder under
        "command" and 'dependencies' under "dependencies". The metrics file
        defaults to 'output_bam' with its extension replaced by ".metrics".
        """
        jar_file = os.path.join(config.jar_root, "MarkDuplicates.jar")
        markdup = AtomicJavaCmdBuilder(config, jar_file)

        # Always write a .bai index; a lot of other programs require one
        markdup.set_option("CREATE_INDEX", "True", sep="=")

        markdup.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        markdup.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")

        bam_files = safe_coerce_to_tuple(input_bams)
        for (position, path) in enumerate(bam_files):
            kwarg_name = "IN_BAM_%02i" % position
            markdup.add_option("I", "%(" + kwarg_name + ")s", sep="=")
            markdup.set_kwargs(**{kwarg_name: path})

        # Default to removing duplicates to save disk-space (overridable)
        markdup.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

        markdup.set_kwargs(
            OUT_BAM=output_bam,
            OUT_BAI=swap_ext(output_bam, ".bai"),
            OUT_METRICS=output_metrics or swap_ext(output_bam, ".metrics"),
            CHECK_JAR=_picard_version(jar_file))

        result = {}
        result["command"] = markdup
        result["dependencies"] = dependencies
        return result
Пример #19
0
    def __init__(self, config, input_bams, pipename="input.bam"):
        """Prepare the input for a command reading one or more BAM files.

        'input_bams' is normalised with safe_coerce_to_tuple and kept as
        self.files. If there is more than one file, self.commands holds a
        MergeSamFiles.jar command writing the merged BAM to the temporary
        file 'pipename'. Otherwise self.commands is empty, and the single
        file and its ".bai" are registered in self.kwargs instead.

        self.kwargs always maps TEMP_IN_BAM to 'pipename'.
        """
        self.pipe = pipename
        self.files = safe_coerce_to_tuple(input_bams)

        self.commands = []
        self.kwargs = {"TEMP_IN_BAM": self.pipe}
        if len(self.files) > 1:
            jar_file = os.path.join(config.jar_root, "MergeSamFiles.jar")
            params = AtomicJavaCmdBuilder(jar=jar_file,
                                          temp_root=config.temp_root,
                                          jre_options=config.jre_options)

            params.set_option("SO", "coordinate", sep="=", fixed=False)
            params.set_option("CREATE_INDEX", "False", sep="=")
            params.set_option("COMPRESSION_LEVEL", 0, sep="=")
            params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
            # Pass the already coerced files; the raw argument may be a
            # one-shot iterable that was exhausted by the coercion above
            params.add_multiple_options("I", self.files, sep="=")

            params.set_kwargs(CHECK_JAR=_picard_version(config, jar_file),
                              TEMP_OUT_BAM=self.pipe)

            self.commands = [params.finalize()]
        else:
            # Ensure that the actual command depends on the input
            self.kwargs["IN_FILE_00"] = self.files[0]
            self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
Пример #20
0
    def __init__(self, config, prefix, samples, features, target):
        """Gather the BAMs of every sample belonging to one prefix.

        Final BAMs are built for each of "Raw BAM" and "Realigned BAM"
        present in 'features'. If none are built, self.bams falls back to
        the combined BAMs of the samples. self.node is a MetaNode that
        depends on the node of every sample.
        """
        self.name = prefix["Name"]
        self.label = prefix.get("Label") or self.name
        self.reference = prefix["Reference"]
        self.aoi = prefix.get("AreasOfInterest", {})

        self.samples = safe_coerce_to_tuple(samples)
        self.bams = {}
        self.folder = config.destination
        self.target = target

        files_and_nodes = {}
        for sample in self.samples:
            files_and_nodes.update(sample.bams.iteritems())

        if "Raw BAM" in features:
            raw_bams = self._build_raw_bam(config, prefix, files_and_nodes)
            self.bams.update(raw_bams)
        if "Realigned BAM" in features:
            realigned_bams = self._build_realigned_bam(config, prefix,
                                                       files_and_nodes)
            self.bams.update(realigned_bams)

        sample_nodes = []
        for sample in self.samples:
            sample_nodes.append(sample.node)

        if self.bams:
            node_kwargs = {"description": "Final BAMs: %s" % prefix["Name"],
                           "subnodes": self.bams.values(),
                           "dependencies": sample_nodes}
        else:
            # Nothing was built above, so pass the sample BAMs through
            for sample in self.samples:
                self.bams.update(sample.bams)

            node_kwargs = {"description": "Prefix: %s" % prefix["Name"],
                           "dependencies": sample_nodes}

        self.node = MetaNode(**node_kwargs)
Пример #21
0
    def __init__(self, config, target, prefix, lanes, name):
        """Build the nodes for one library from its lanes.

        'lanes' is normalised with safe_coerce_to_tuple. Every lane is
        asserted to share the parent folder and the options of the first
        lane.

        Sets self.datadup_check, self.duphist, self.bams and self.mapdamage
        from the corresponding _build_* helpers, and self.node to a MetaNode
        depending on the BAM nodes and the data-duplication check node.
        """
        self.name = name
        self.lanes = safe_coerce_to_tuple(lanes)
        # Read from the coerced tuple, consistent with 'folder' below; the raw
        # 'lanes' argument is not guaranteed to support indexing
        self.options = self.lanes[0].options
        self.folder = os.path.dirname(self.lanes[0].folder)
        self.bams = None
        self.mapdamage = None

        assert all((self.folder == os.path.dirname(lane.folder)) for lane in self.lanes)
        assert all((self.options == lane.options) for lane in self.lanes)

        lane_bams = self._collect_bams_by_type(self.lanes)
        self.datadup_check = self._build_dataduplication_node(lane_bams)
        self.duphist = \
            self._build_duphist_nodes(config, target, prefix, lane_bams)
        pcr_duplicates = self.options["PCRDuplicates"]
        if pcr_duplicates:
            lane_bams = self._remove_pcr_duplicates(config, prefix, lane_bams, pcr_duplicates)

        # At this point we no longer need to differentiate between types of reads
        files_and_nodes = self._collect_files_and_nodes(lane_bams)

        self.bams, self.mapdamage = \
            self._build_mapdamage_nodes(config, target, prefix, files_and_nodes)

        self.node = MetaNode(description  = "Library: %s" % os.path.basename(self.folder),
                             dependencies = self.bams.values() + [self.datadup_check])
Пример #22
0
    def customize(cls, reference, infiles, outfile, options, dependencies = ()):
        """Build a customizable GATK UnifiedGenotyper command.

        Every input BAM must exist on disk (checked with an assertion) and
        is added with its own "-I" option. The command writes 'outfile' and
        'outfile' + ".idx". Returns a dict with the unfinalized command
        builder stored under ["commands"]["unifiedgenotyper"].

        NOTE(review): 'dependencies' is accepted but not used in this
        method, and the "-L" region is hard-coded -- confirm both are
        intended.
        """
        infiles = safe_coerce_to_tuple(infiles)
        jar_file = os.path.join(options.jar_root, "GenomeAnalysisTK.jar")
        genotyper = AtomicJavaCmdBuilder(options, jar_file)
        genotyper.set_option("-R", "%(IN_REFERENCE)s")
        genotyper.set_option("-T", "UnifiedGenotyper")

        for bam in infiles:
            assert os.path.exists(bam), "Couldn't find input BAM: {}".format(bam)
            genotyper.add_option("-I", bam)

        # NOTE: "-nct 3" used to be listed here but is disabled, and is
        # therefore not passed to the command
        fixed_options = (
            ("-o", "%(OUT_VCFFILES)s"),
            ("-stand_call_conf", "30.0"),
            ("-stand_emit_conf", "10.0"),
            ("-dcov", "200"),
            ("-L", "chrUn2:1-19213991"),
        )
        for (option, value) in fixed_options:
            genotyper.set_option(option, value)

        genotyper.set_kwargs(IN_REFERENCE=reference,
                             OUT_VCFFILES=outfile,
                             OUT_VCF_IDX=outfile + ".idx")

        commands = {"unifiedgenotyper": genotyper}
        return {"commands": commands}
Пример #23
0
    def __init__(self, config, reference, infiles, outfile, intervals=None,
                 dependencies=()):
        """Combine the indel trainer and indel realigner into one MetaNode.

        The trainer node writes the intervals file, which defaults to
        'outfile' + ".intervals"; the realigner node depends on the trainer
        and writes 'outfile'. Both become subnodes of this node.
        """
        intervals = intervals or (outfile + ".intervals")
        infiles = safe_coerce_to_tuple(infiles)

        # Arguments shared by the two sub-nodes
        shared = {"config": config,
                  "reference": reference,
                  "infiles": infiles}

        trainer = _IndelTrainerNode(outfile=intervals,
                                    dependencies=dependencies,
                                    **shared)
        # The realigner must not start before the intervals are written
        aligner = _IndelRealignerNode(intervals=intervals,
                                      outfile=outfile,
                                      dependencies=trainer,
                                      **shared)

        summary = "<GATK Indel Realigner: %i files -> '%s'>" \
            % (len(infiles), outfile)

        MetaNode.__init__(self,
                          description=summary,
                          subnodes=[trainer, aligner],
                          dependencies=dependencies)
Пример #24
0
    def __init__(self, command, set_cwd=False, **kwargs):
        """Takes a command and a set of files.

        The command is expected to be an iterable starting with the name of an
        executable, with each item representing one string on the command line.
        Thus, the command "find /etc -name 'profile*'" might be represented as
        the list ["find", "/etc", "-name", "profile*"].

        If 'set_cwd' is True, the current working directory is set to the
        temporary directory before the command is executed. Input paths are
        automatically turned into absolute paths in this case.

        Each keyword represents a type of file, as determined by the prefix:
           IN_    -- Path to input file transformed/analysed by the executable.
           OUT_   -- Path to output file generated by the executable. During
                     execution of the AtomicCmd, these paths are modified to
                     point to the temporary directory.
           EXEC_  -- Name of / path to executable. The first item in the
                     command is always one of the executables, even if not
                     specified in this manner.
           AUX_   -- Auxiliary files required by the executable(s), which are
                     themselves not executable. Examples include scripts,
                     config files, data-bases, and the like.
           CHECK_ -- A callable, which upon calling carries out version checking,
                     raising an exception in the case of requirements not being
                     met. This may be used to help ensure that prerequisites are
                     met before running the command. The function is not called
                     by AtomicCmd itself.

        Note that files that are not directly invoked may be included above,
        in order to allow the specification of requirements. This could include
        required data files, or executables indirectly executed by a script.

        If the above is prefixed with "TEMP_", the files are read from / written
        to the temporary folder in which the command is executed. Note that all
        TEMP_OUT_ files are deleted when commit is called (if they exist), and
        only filenames (not dirname component) are allowed for TEMP_ values.

        In addition, the following special names may be used with the above:
           STDIN_  -- Takes a filename, or an AtomicCmd, in which case the stdout
                      of that command is piped to the stdin of this instance.
           STDOUT_ -- Takes a filename, or the special value PIPE to allow
                      another AtomicCmd instance to use the output directly.
           STDERR_ -- Takes a filename.

        Each pipe can only be used once, with or without the TEMP_ prefix.

        Raises ValueError if the command is empty, or if its first item is
        an empty string."""
        self._proc = None
        self._temp = None
        # Build a real list: the command is indexed below, which a lazy map()
        # result (as returned on Python 3) would not support
        self._command = [str(field) for field in safe_coerce_to_tuple(command)]
        self._handles = {}
        self._set_cwd = set_cwd
        if not self._command or not self._command[0]:
            raise ValueError("Empty command in AtomicCmd constructor")

        self._files = self._process_arguments(id(self), self._command, kwargs)
        self._file_sets = self._build_files_map(self._command, self._files)

        # Dry-run, to catch errors early
        self._generate_call("")
Пример #25
0
    def __init__(self, command, set_cwd = False, **kwargs):
        """Takes a command and a set of files.

        The command is expected to be an iterable starting with the name of an
        executable, with each item representing one string on the command line.
        Thus, the command "find /etc -name 'profile*'" might be represented as
        the list ["find", "/etc", "-name", "profile*"].

        If 'set_cwd' is True, the current working directory is set to the
        temporary directory before the command is executed. Input paths are
        automatically turned into absolute paths in this case.

        Each keyword represents a type of file, as determined by the prefix:
           IN_    -- Path to input file transformed/analysed by the executable.
           OUT_   -- Path to output file generated by the executable. During
                     execution of the AtomicCmd, these paths are modified to
                     point to the temporary directory.
           EXEC_  -- Name of / path to executable. The first item in the
                     command is always one of the executables, even if not
                     specified in this manner.
           AUX_   -- Auxiliary files required by the executable(s), which are
                     themselves not executable. Examples include scripts,
                     config files, data-bases, and the like.
           CHECK_ -- A callable, which upon calling carries out version checking,
                     raising an exception in the case of requirements not being
                     met. This may be used to help ensure that prerequisites are
                     met before running the command. The function is not called
                     by AtomicCmd itself.

        Note that files that are not directly invoked may be included above,
        in order to allow the specification of requirements. This could include
        required data files, or executables indirectly executed by a script.

        If the above is prefixed with "TEMP_", the files are read from / written
        to the temporary folder in which the command is executed. Note that all
        TEMP_OUT_ files are deleted when commit is called (if they exist), and
        only filenames (not dirname component) are allowed for TEMP_ values.

        In addition, the following special names may be used with the above:
           STDIN_  -- Takes a filename, or an AtomicCmd, in which case the stdout
                      of that command is piped to the stdin of this instance.
           STDOUT_ -- Takes a filename, or the special value PIPE to allow
                      another AtomicCmd instance to use the output directly.
           STDERR_ -- Takes a filename.

        Each pipe can only be used once, with or without the TEMP_ prefix.

        Raises ValueError if the command is empty, or if its first item is
        an empty string."""
        self._proc    = None
        self._temp    = None
        # Materialise the command as a list; it is indexed right below, which
        # would fail on the lazy iterator map() returns on Python 3
        self._command = [str(item) for item in safe_coerce_to_tuple(command)]
        self._handles = {}
        self._set_cwd = set_cwd
        if not self._command or not self._command[0]:
            raise ValueError("Empty command in AtomicCmd constructor")

        self._files     = self._process_arguments(id(self), self._command, kwargs)
        self._file_sets = self._build_files_map(self._command, self._files)

        # Dry-run, to catch errors early
        self._generate_call("")
Пример #26
0
    def __init__(self, tree_files, output_file, taxa = (), dependencies = ()):
        """Create a node that reads 'tree_files' and writes 'output_file'.

        'tree_files' and 'taxa' are normalised with safe_coerce_to_tuple. The
        description names the sorted taxa to reroot on, or "midpoint" when
        no taxa were given.
        """
        self._output_file = output_file
        self._tree_files = safe_coerce_to_tuple(tree_files)
        self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

        if self._reroot_on_taxa:
            sorted_taxa = sorted(self._reroot_on_taxa)
            reroot_on = repr("', '".join(sorted_taxa))
        else:
            reroot_on = "midpoint"

        summary = "<NewickReroot (on %s): %s>" \
            % (reroot_on, describe_files(tree_files))

        Node.__init__(self,
                      description=summary,
                      input_files=self._tree_files,
                      output_files=self._output_file,
                      dependencies=dependencies)
Пример #27
0
    def __init__(self, commands):
        """Wrap 'commands' in a command set, after checking their types.

        'commands' is normalised with safe_coerce_to_tuple. Raises CmdError
        if any item is neither an AtomicCmd nor a _CommandSet.
        """
        self._ready = False

        commands = safe_coerce_to_tuple(commands)
        allowed_types = (AtomicCmd, _CommandSet)
        if not all(isinstance(command, allowed_types) for command in commands):
            raise CmdError("ParallelCmds must only contain AtomicCmds or other ParallelCmds!")

        _CommandSet.__init__(self, commands)
Пример #28
0
def _validate_filenames(filenames):
    """Sanity checks for filenames handled by 'describe_files' and
    'describe_paired_files'.

    The input is coerced to a tuple, which is returned if every item is a
    string; otherwise a ValueError naming the offending type is raised."""
    candidates = safe_coerce_to_tuple(filenames)
    for candidate in candidates:
        if isinstance(candidate, types.StringTypes):
            continue

        type_name = candidate.__class__.__name__
        raise ValueError("Only string types are allowed for filenames, not %s" \
                         % (type_name,))

    return candidates
Пример #29
0
def _validate_filenames(filenames):
    """Verify that only strings are passed to 'describe_files' and
    'describe_paired_files'.

    Returns the filenames as a tuple; raises ValueError for the first
    item that is not a string."""
    checked = safe_coerce_to_tuple(filenames)
    # Lazily walk the non-string items; only the first one is ever needed
    offenders = (item for item in checked
                 if not isinstance(item, types.StringTypes))
    for offender in offenders:
        raise ValueError("Only string types are allowed for filenames, not %s" \
                         % (offender.__class__.__name__,))
    return checked
Пример #30
0
    def _AnyOf(path, value):
        """Check 'value' (one item or several; coerced to a tuple).

        Raises MakefileError if key_func(item) is not found in 'args' for
        any item, or if the number of items is outside the range
        min_items to max_items (both inclusive)."""
        candidates = safe_coerce_to_tuple(value)
        for candidate in candidates:
            if key_func(candidate) in args:
                continue

            location = ":".join(path)
            allowed = ", ".join(map(repr, args))
            raise MakefileError("Value for '%s' must be among %s, not %s!" \
                                % (location, allowed, repr(candidate)))

        count = len(candidates)
        if not (min_items <= count <= max_items):
            raise MakefileError("Expected %s to %s values, found %i!" \
                                % (repr(min_items), repr(max_items), count))
Пример #31
0
def _build_unicat_command(input_files, output_file):
    """Build an AtomicCmd which calls 'unicat' with --output followed by
    one argument per input file.

    The output is registered under the key TEMP_OUT_CAT, and the inputs
    under IN_CAT_00, IN_CAT_01, etc., in the order given."""
    filenames = utilities.safe_coerce_to_tuple(input_files)
    keys = ["IN_CAT_%02i" % position for position in range(len(filenames))]

    call = ["unicat", "--output", "%(TEMP_OUT_CAT)s"]
    call.extend("%%(%s)s" % key for key in keys)

    paths = {"TEMP_OUT_CAT" : output_file}
    paths.update(zip(keys, filenames))

    return AtomicCmd(call, **paths)
Пример #32
0
 def __init__(self, call, search, checks, name=None, priority=0):
     """See function 'Requirement' for a description of parameters.

     'name' falls back to the first item of the call when not given;
     '_done' and '_version' are initialised to None.
     """
     call = safe_coerce_to_tuple(call)

     self._call = call
     self._done = None
     self.name = str(name if name else call[0])
     self.priority = int(priority)
     self.checks = checks
     # Compiled once here; 'search' may be a pattern string
     self._rege = re.compile(search)
     self._version = None
Пример #33
0
 def __init__(self, call, search, checks, name=None, priority=0):
     """See function 'Requirement' for a description of parameters.

     The call is stored as a tuple, the priority as an int, and the
     search expression in compiled form.
     """
     self._call = safe_coerce_to_tuple(call)
     self._done = None

     # Use the first item of the call as the name if none was given
     label = name or self._call[0]
     self.name = str(label)
     self.priority = int(priority)
     self.checks = checks

     pattern = re.compile(search)
     self._rege = pattern
     self._version = None
Пример #34
0
    def __init__(self, call, **kwargs):
        """See AtomicCmd.__init__ for parameters / keyword arguments.

        The call is stored as a tuple; keyword arguments are recorded by
        means of 'set_kwargs'."""

        # Empty containers, to be filled in by later calls
        self._object = None
        self._kwargs = {}
        self._values = []
        self._options = []
        self._call = safe_coerce_to_tuple(call)

        self.set_kwargs(**kwargs)
Пример #35
0
    def __init__(self, call, **kwargs):
        """See AtomicCmd.__init__ for parameters / keyword arguments.

        Options, values and keyword arguments all start out empty; the
        given keyword arguments are then passed to 'set_kwargs'.
        """
        self._call = safe_coerce_to_tuple(call)
        self._options, self._values = [], []
        self._kwargs = dict()
        self._object = None

        self.set_kwargs(**kwargs)
Пример #36
0
    def _AnyOf(path, value):
        """Validate that each item in 'value' is among the allowed values.

        'value' is coerced to a tuple; MakefileError is raised for the
        first item for which key_func(item) is not in 'args', or if the
        number of items does not lie between min_items and max_items."""
        items = safe_coerce_to_tuple(value)
        for item in items:
            if key_func(item) not in args:
                message = "Value for '%s' must be among %s, not %s!" \
                    % (":".join(path), ", ".join(map(repr, args)), repr(item))
                raise MakefileError(message)

        n_items = len(items)
        if min_items <= n_items <= max_items:
            return

        raise MakefileError("Expected %s to %s values, found %i!" \
                            % (repr(min_items), repr(max_items), n_items))
Пример #37
0
    def __init__(self, commands):
        """Initialise the set from 'commands' (coerced to a tuple).

        Every item must be an AtomicCmd or a _CommandSet; CmdError is
        raised otherwise."""
        self._ready = False

        commands = safe_coerce_to_tuple(commands)
        rejected = [
            cmd for cmd in commands
            if not isinstance(cmd, (AtomicCmd, _CommandSet))
        ]
        if rejected:
            raise CmdError(
                "ParallelCmds must only contain AtomicCmds or other ParallelCmds!"
            )

        _CommandSet.__init__(self, commands)
Пример #38
0
    def __init__(self, config, prefixes, name):
        """Collect the nodes of 'prefixes' (coerced to a tuple) under a
        single "Alignments:" MetaNode, and set up the extra nodes for
        mapDamage and duplicate histograms."""
        self.name     = name
        self.prefixes = safe_coerce_to_tuple(prefixes)

        self._nodes_extras    = {}
        alignment_nodes       = [item.node for item in self.prefixes]
        self._nodes_alignment = MetaNode(description  = "Alignments:",
                                          dependencies = alignment_nodes)

        extras = (("mapDamage", "mapdamage"),
                  ("Duplicate Histogram", "duphist"))
        for (label, key) in extras:
            self._setup_extra_nodes(label, key)
Пример #39
0
def _build_unicat_command(input_files, output_file):
    """Return an AtomicCmd running 'unicat --output <output> <inputs...>'.

    The output file is passed as TEMP_OUT_CAT, and each input file as
    IN_CAT_NN, numbered from 00 in the order given."""
    arguments = []
    paths = {"TEMP_OUT_CAT": output_file}
    filenames = utilities.safe_coerce_to_tuple(input_files)
    for (number, filename) in enumerate(filenames):
        key = "IN_CAT_%02i" % number
        paths[key] = filename
        arguments.append("%%(%s)s" % key)

    call = ["unicat", "--output", "%(TEMP_OUT_CAT)s"] + arguments
    return AtomicCmd(call, **paths)
Пример #40
0
    def __init__(self, config, prefix, libraries, name):
        """Merge the BAMs of 'libraries' (coerced to a tuple) into a single
        dictionary, and create a MetaNode depending on every library node.

        The folder is the parent of the folder of the first library."""
        self.name      = name
        self.bams      = {}
        self.libraries = safe_coerce_to_tuple(libraries)

        for lib in self.libraries:
            self.bams.update(lib.bams.iteritems())

        first_library = self.libraries[0]
        self.folder = os.path.dirname(first_library.folder)

        description  = "Sample: %s" % os.path.basename(self.folder)
        dependencies = [lib.node for lib in self.libraries]
        self.node = MetaNode(description  = description,
                             dependencies = dependencies)
Пример #41
0
    def __init__(self, config, prefix, libraries, name):
        """Gather the BAMs and nodes of the given libraries.

        'libraries' is coerced to a tuple; the sample folder is derived
        from the folder of the first library, and its basename is used
        in the description of the resulting MetaNode."""
        self.name = name
        self.bams = {}
        self.libraries = safe_coerce_to_tuple(libraries)

        library_nodes = []
        for entry in self.libraries:
            self.bams.update(entry.bams.iteritems())
        self.folder = os.path.dirname(self.libraries[0].folder)

        sample_label = os.path.basename(self.folder)
        for entry in self.libraries:
            library_nodes.append(entry.node)

        self.node = MetaNode(description="Sample: %s" % sample_label,
                             dependencies=library_nodes)
Пример #42
0
    def __init__(self, input_file, output_file, exclude_groups, dependencies = ()):
        """Set up a node with one input file and one output file.

        'exclude_groups' is coerced to a tuple and stored for later use;
        'dependencies' is passed unchanged to Node.__init__."""
        self._input_file  = input_file
        self._output_file = output_file
        self._excluded    = safe_coerce_to_tuple(exclude_groups)

        node_kwargs = {
            "description"  : "<FastaToPAMLPhy: '%s' -> '%s'>" % (input_file, output_file),
            "input_files"  : [input_file],
            "output_files" : [output_file],
            "dependencies" : dependencies,
        }

        Node.__init__(self, **node_kwargs)
Пример #43
0
def describe_files(files):
    """Return a text description of a set of files.

    The result is "No files" for an empty input, the repr of the
    filename for a single file, and otherwise the number of files,
    including the folder if all files are located in the same one."""
    files = safe_coerce_to_tuple(files)
    count = len(files)
    if count == 0:
        return "No files"
    if count == 1:
        return repr(files[0])

    folders = set(map(os.path.dirname, files))
    if len(folders) != 1:
        return "%i files" % (count,)

    (folder,) = folders
    return "%i files in '%s'" % (count, folder)
Пример #44
0
    def __init__(self, call, threads = 1, **kwargs):
        """Build a command which is run through 'mpirun' if threads > 1.

        Raises TypeError if 'threads' is not an integer, and ValueError
        if it is less than 1. Remaining keyword arguments are passed to
        AtomicCmdBuilder.__init__."""
        if not isinstance(threads, (types.IntType, types.LongType)):
            raise TypeError("'threads' must be an integer value, not %r" % threads.__class__.__name__)
        if threads < 1:
            raise ValueError("'threads' must be 1 or greater, not %i" % threads)

        if threads == 1:
            # The call is used as is, but EXEC_MPI is still passed along
            AtomicCmdBuilder.__init__(self, call, EXEC_MPI = "mpirun", **kwargs)
            return

        main_call = safe_coerce_to_tuple(call)
        mpi_call  = ["mpirun", "-n", threads] + list(main_call)

        AtomicCmdBuilder.__init__(self, mpi_call, EXEC_MAIN = main_call[0], **kwargs)
Пример #45
0
    def __init__(self, description, destination, source_nodes):
        """Set up a node copying the output of 'source_nodes' (coerced to
        a tuple) to 'destination'.

        Every output file of the source nodes is an input file of this
        node; the output paths are derived using 'reroot_path'. The
        (input, output) pairs are stored in 'self._files'."""
        source_nodes = safe_coerce_to_tuple(source_nodes)

        input_files  = [fpath
                        for node in source_nodes
                        for fpath in node.output_files]
        output_files = [reroot_path(destination, fpath) for fpath in input_files]
        self._files  = zip(input_files, output_files)

        text = "<Copy %s output to %r>" % (description, destination)
        Node.__init__(self,
                      description  = text,
                      input_files  = input_files,
                      output_files = output_files,
                      dependencies = source_nodes)
Пример #46
0
    def __init__(self,
                 input_file,
                 output_file,
                 exclude_groups,
                 dependencies=()):
        """Create a node reading 'input_file' and writing 'output_file'.

        The groups to exclude are stored as a tuple; both filenames are
        included in the description of the node."""
        self._excluded = safe_coerce_to_tuple(exclude_groups)
        self._output_file = output_file
        self._input_file = input_file

        summary = "<FastaToPAMLPhy: '%s' -> '%s'>" % (input_file, output_file)
        Node.__init__(self,
                      description=summary,
                      input_files=[input_file],
                      output_files=[output_file],
                      dependencies=dependencies)
Пример #47
0
def Requirement(call, search, checks, name=None, priority=0):
    # Ignore function naming scheme
    # pylint: disable=C0103
    """Returns a singleton Requirement object for the given parameters,
    for checking that the version requirements are met for a program,
    utility, module, etc.

    Parameters:
      call   -- A string, or a tuple of strings, for a system call; or a
                tuple with a function at the first position followed by
                its positional parameters. For system calls, stdout and
                stderr are returned as a single string; for function
                calls, the return value is expected to be a str.
      search -- A regular expression (string or re object) used to search
                the output of the "call". Groups are assumed to represent
                version numbers.
      checks -- A callable that implements the interface described in the
                Check class.
      name   -- Descriptive name for the executable/module/etc. If not
                specified, the first value in 'call' is used. For repeated
                requests, the last name explicitly given is kept.
      priority -- Order in which requirements are checked; for repeated
                  requests, the highest priority given is kept.

    Implementation detail: Objects are cached using (call, search, checks)
    as the key, with 'call' coerced to a tuple. The same call must
    therefore be passed in a manner that allows equality to be
    established, for the same object to be returned.
    """
    call = safe_coerce_to_tuple(call)
    key = (call, search, checks)

    if key not in _REQUIREMENT_CACHE:
        # First request; name and priority are used as given
        created = RequirementObj(call, search, checks,
                                 name=name, priority=priority)
        _REQUIREMENT_CACHE[key] = created
        return created

    cached = _REQUIREMENT_CACHE[key]
    # Highest priority takes precedence
    cached.priority = max(cached.priority, priority)
    # Last explicitly specified name takes precedence
    cached.name = name or cached.name

    return cached
Пример #48
0
def Requirement(call, search, checks, name=None, priority=0):
    # Ignore function naming scheme
    # pylint: disable=C0103
    """Return the shared Requirement object matching the parameters; the
    object may be used to check that version requirements are met for a
    given program/utility/module, etc.

    Parameters:
      call   -- Either a string or a tuple of strings describing a system
                call, or a tuple containing a function followed by its
                positional parameters. System calls yield stdout and
                stderr as a single string; functions are expected to
                return a str.
      search -- Regular expression (string or re object) applied to the
                output of the "call"; groups are assumed to represent
                version numbers.
      checks -- Callable implementing the interface described in the
                Check class.
      name   -- Descriptive name for the executable/module/etc.; defaults
                to the first value in 'call'. When the same requirement
                is requested again, a newly given name replaces the old.
      priority -- Order in which requirements are checked; when the same
                  requirement is requested again, the highest priority
                  is kept.

    Implementation detail: To avoid repeating calls or system-calls, the
    objects are cached with the tuple (call, search, checks) as the key.
    Identical calls must therefore compare equal to share an object.
    """
    key = (safe_coerce_to_tuple(call), search, checks)

    try:
        requirement = _REQUIREMENT_CACHE[key]
    except KeyError:
        requirement = RequirementObj(*key, name=name, priority=priority)
        _REQUIREMENT_CACHE[key] = requirement
    else:
        # Highest priority takes precedence
        if priority > requirement.priority:
            requirement.priority = priority
        else:
            requirement.priority = requirement.priority
        # Last explicitly specified name takes precedence
        requirement.name = name if name else requirement.name

    return requirement
Пример #49
0
    def __init__(self, config, reference, infiles, outfile, dependencies=()):
        """Build the GenomeAnalysisTK.jar command for running the
        RealignerTargetCreator on 'infiles' (coerced to a tuple), writing
        the intervals to 'outfile'.

        The reference and its ".dict" file are both registered as input."""
        infiles = safe_coerce_to_tuple(infiles)
        jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

        command = AtomicJavaCmdBuilder(jar_file, jre_options=config.jre_options)
        fixed_options = (
            ("-T", "RealignerTargetCreator"),
            ("-R", "%(IN_REFERENCE)s"),
            ("-o", "%(OUT_INTERVALS)s"),
        )
        for (flag, value) in fixed_options:
            command.set_option(flag, value)

        _set_input_files(command, infiles)

        reference_dict = fileutils.swap_ext(reference, ".dict")
        version_check = _get_gatk_version_check(config)
        command.set_kwargs(
            IN_REFERENCE=reference,
            IN_REF_DICT=reference_dict,
            OUT_INTERVALS=outfile,
            CHECK_GATK=version_check,
        )

        description = "<Indel Realigner (training): %s -> %r>" % (describe_files(infiles), outfile)
        CommandNode.__init__(self, description=description, command=command.finalize(), dependencies=dependencies)
Пример #50
0
    def __init__(self, call, threads=1, **kwargs):
        """Build a command, prefixed by 'mpirun -n <threads>' if more
        than one thread is requested.

        Raises TypeError for non-integer 'threads', and ValueError if
        'threads' is less than 1."""
        integer_types = (types.IntType, types.LongType)
        if not isinstance(threads, integer_types):
            raise TypeError("'threads' must be an integer value, not %r" %
                            threads.__class__.__name__)
        if threads < 1:
            raise ValueError("'threads' must be 1 or greater, not %i" %
                             threads)

        if threads != 1:
            call = safe_coerce_to_tuple(call)
            # The wrapped program is passed separately as EXEC_MAIN
            kwargs = dict(kwargs, EXEC_MAIN=call[0])
            call = ["mpirun", "-n", threads] + list(call)

        AtomicCmdBuilder.__init__(self, call, **kwargs)
Пример #51
0
    def __init__(self, description, destination, source_nodes):
        """Create a node depending on 'source_nodes' (coerced to a tuple),
        taking their output files as input.

        For each input file, the corresponding output path is obtained
        from 'reroot_path'; the pairs are kept in 'self._files'."""
        source_nodes = safe_coerce_to_tuple(source_nodes)

        input_files, output_files = [], []
        for node in source_nodes:
            input_files.extend(node.output_files)
        for fpath in input_files:
            output_files.append(reroot_path(destination, fpath))

        self._files = zip(input_files, output_files)

        label = "<Copy %s output to %r>" % (description, destination)
        Node.__init__(self,
                      description=label,
                      input_files=input_files,
                      output_files=output_files,
                      dependencies=source_nodes)
Пример #52
0
    def __init__(self, config, reference, infiles, outfile, dependencies = ()):
        """Build the GenomeAnalysisTK.jar command for running the
        RealignerTargetCreator on 'infiles' (coerced to a tuple).

        The intervals are written to 'outfile'; the reference and its
        ".dict" file are both registered as input files."""
        infiles  = safe_coerce_to_tuple(infiles)
        jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

        command  = AtomicJavaCmdBuilder(config, jar_file)
        for (flag, value) in (("-T", "RealignerTargetCreator"),
                              ("-R", "%(IN_REFERENCE)s"),
                              ("-o", "%(OUT_INTERVALS)s")):
            command.set_option(flag, value)

        _set_input_files(command, infiles)

        reference_dict = fileutils.swap_ext(reference, ".dict")
        command.set_kwargs(IN_REFERENCE  = reference,
                           IN_REF_DICT   = reference_dict,
                           OUT_INTERVALS = outfile)

        n_files = len(infiles)
        CommandNode.__init__(self,
                             description  = "<Train Indel Realigner: %i file(s) -> '%s'>" % (n_files, outfile),
                             command      = command.finalize(),
                             dependencies = dependencies)
Пример #53
0
    def __init__(self, config, reference, infiles, outfile, dependencies=()):
        """Set up a RealignerTargetCreator run using GenomeAnalysisTK.jar.

        'infiles' is coerced to a tuple and added using _set_input_files;
        the number of files and 'outfile' are given in the description."""
        infiles = safe_coerce_to_tuple(infiles)

        gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        builder = AtomicJavaCmdBuilder(config, gatk_jar)
        builder.set_option("-T", "RealignerTargetCreator")
        builder.set_option("-R", "%(IN_REFERENCE)s")
        builder.set_option("-o", "%(OUT_INTERVALS)s")
        _set_input_files(builder, infiles)

        file_kwargs = {
            "IN_REFERENCE": reference,
            "IN_REF_DICT": fileutils.swap_ext(reference, ".dict"),
            "OUT_INTERVALS": outfile,
        }
        builder.set_kwargs(**file_kwargs)

        summary = "<Train Indel Realigner: %i file(s) -> '%s'>" \
            % (len(infiles), outfile)
        CommandNode.__init__(self,
                             description=summary,
                             command=builder.finalize(),
                             dependencies=dependencies)
Пример #54
0
    def __init__(self, config, reference, infiles, outfile, dependencies=()):
        """Set up a RealignerTargetCreator run using GenomeAnalysisTK.jar,
        with the JRE options taken from 'config'.

        'infiles' is coerced to a tuple; a GATK version check is attached
        to the command as CHECK_GATK."""
        infiles = safe_coerce_to_tuple(infiles)

        gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        builder = AtomicJavaCmdBuilder(gatk_jar,
                                       jre_options=config.jre_options)
        builder.set_option("-T", "RealignerTargetCreator")
        builder.set_option("-R", "%(IN_REFERENCE)s")
        builder.set_option("-o", "%(OUT_INTERVALS)s")
        _set_input_files(builder, infiles)

        reference_dict = fileutils.swap_ext(reference, ".dict")
        gatk_check = _get_gatk_version_check(config)
        builder.set_kwargs(IN_REFERENCE=reference,
                           IN_REF_DICT=reference_dict,
                           OUT_INTERVALS=outfile,
                           CHECK_GATK=gatk_check)

        files_text = describe_files(infiles)
        summary = "<Indel Realigner (training): %s -> %r>" \
            % (files_text, outfile)
        CommandNode.__init__(self,
                             description=summary,
                             command=builder.finalize(),
                             dependencies=dependencies)
Пример #55
0
    def __init__(self,
                 config,
                 target_name,
                 input_files,
                 output_file,
                 intervals_file=None,
                 print_stats=False,
                 max_contigs=_MAX_CONTIGS,
                 dependencies=()):
        """Set up a depth histogram node for the given input files.

        'input_files' is coerced to a tuple. The node takes as input
        those files, a ".bai" path derived from each of them, and the
        intervals file if one is given. The executable is "coverageBed"
        with an intervals file, and "genomeCoverageBed" without one."""
        self._target_name = target_name
        self._input_files = safe_coerce_to_tuple(input_files)
        self._output_file = output_file
        self._intervals = intervals_file
        self._print_stats = print_stats
        self._max_contigs = max_contigs
        self._max_contigs_reached = False

        bam_files = self._input_files
        input_files = list(bam_files)
        input_files += [swap_ext(bam_file, ".bai") for bam_file in bam_files]
        if intervals_file:
            input_files.append(intervals_file)
            executables = ["coverageBed"]
        else:
            executables = ["genomeCoverageBed"]

        # Include the requirements of any command used to merge the BAMs
        auxiliary_files = []
        merge_commands = concatenate_input_bams(config, bam_files)[0]
        for merge_command in merge_commands:
            executables.extend(merge_command.executables)
            auxiliary_files.extend(merge_command.auxiliary_files)

        description = "<DepthHistogram: %s -> '%s'>" \
            % (describe_files(bam_files), self._output_file)
        Node.__init__(self,
                      description  = description,
                      input_files  = input_files,
                      output_files = self._output_file,
                      dependencies = dependencies,
                      executables  = executables,
                      auxiliary_files = auxiliary_files)
Пример #56
0
    def __init__(self, config, prefix, lanes, name):
        """Collect the BAMs of the lanes belonging to a single library.

        config -- Passed on to the duplicate removal / rescaling steps.
        prefix -- Passed on to the duplicate removal / rescaling steps.
        lanes  -- A single lane or a sequence of lanes; coerced to a tuple.
        name   -- Stored as the name of the library.

        The options are taken from the first lane, and the folder is the
        parent of the folder of the first lane; all lanes are asserted to
        share that parent folder. PCR duplicates are removed and quality
        scores are rescaled if the "PCRDuplicates" and "RescaleQualities"
        options are set, in that order."""
        self.name    = name
        self.lanes   = safe_coerce_to_tuple(lanes)
        # Index the coerced tuple rather than the raw argument, so that a
        # single un-wrapped lane (or a non-indexable iterable) is handled
        # in the same way as for 'folder' below
        self.options = self.lanes[0].options
        self.folder  = os.path.dirname(self.lanes[0].folder)
        self.is_rmdupped = self.options["PCRDuplicates"]
        self.is_rescaled = self.options["RescaleQualities"]
        assert all((self.folder  == os.path.dirname(lane.folder)) for lane in self.lanes)

        bams = self._collect_bams(self.lanes)
        if self.is_rmdupped:
            bams = self._remove_pcr_duplicates(config, prefix, bams)

        if self.is_rescaled:
            bams = self._rescale_quality_scores(config, prefix, bams)

        # Flatten the per-key collections into a single dictionary
        self.bams = {}
        for files_and_nodes in bams.itervalues():
            self.bams.update(files_and_nodes)

        self.node = MetaNode(description  = "Library: %s" % os.path.basename(self.folder),
                             dependencies = self.bams.values())
Пример #57
0
    def __init__(self, config, input_bams, pipename="input.bam"):
        """Prepare the input for a command reading one or more BAM files.

        'input_bams' is coerced to a tuple. For more than one file, a
        Picard MergeSamFiles command writing to 'pipename' is stored in
        'self.commands'; otherwise the file and a ".bai" path derived
        from it are added to 'self.kwargs', and no command is created."""
        self.pipe = pipename
        self.files = safe_coerce_to_tuple(input_bams)

        self.commands = []
        self.kwargs = {"TEMP_IN_BAM": self.pipe}

        if not len(self.files) > 1:
            # Ensure that the actual command depends on the input
            bam_file = self.files[0]
            self.kwargs["IN_FILE_00"] = bam_file
            self.kwargs["IN_FILE_01"] = swap_ext(bam_file, ".bai")
            return

        merge = picard_command(config, "MergeSamFiles")

        merge.set_option("SO", "coordinate", sep="=", fixed=False)
        merge.set_option("CREATE_INDEX", "False", sep="=")
        merge.set_option("COMPRESSION_LEVEL", 0, sep="=")
        merge.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        merge.add_multiple_options("I", input_bams, sep="=")

        merge.set_kwargs(TEMP_OUT_BAM=self.pipe)

        self.commands = [merge.finalize()]
Пример #58
0
    def __init__(self,
                 infiles,
                 out_prefix,
                 exclude_groups=(),
                 reduce=False,
                 dependencies=(),
                 file_dependencies=()):
        """
        infiles = {names : {"partitions" : ..., "filenames" : [...]}}

        Raises TypeError if 'infiles' is not a dictionary of dictionaries,
        and ValueError if a sub-dictionary contains keys not found in
        _VALID_KEYS, or if its "filenames" value is not a list.

        The input files of the node are the listed filenames followed by
        'file_dependencies'; the output files are 'out_prefix' + ".phy"
        and 'out_prefix' + ".partitions".
        """
        type_error = "'infiles' must be a dictionary of dictionaries"
        if not isinstance(infiles, dict):
            raise TypeError(type_error)
        if not all(isinstance(dd, dict) for dd in infiles.values()):
            raise TypeError(type_error)

        input_filenames = []
        for (name, subdd) in infiles.iteritems():
            unknown_keys = set(subdd) - _VALID_KEYS
            if unknown_keys:
                raise ValueError("Invalid keys found for %r: %s" %
                                 (name, ", ".join(unknown_keys)))

            filenames = subdd["filenames"]
            if not isinstance(filenames, list):
                raise ValueError("filenames must be a list of strings")
            input_filenames.extend(filenames)

        # Optional file dependencies; used to depend on the list of sequences
        input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

        self._reduce = bool(reduce)
        # Deep copy, so that later changes by the caller have no effect
        self._infiles = copy.deepcopy(infiles)
        self._out_prefix = out_prefix
        self._excluded = safe_coerce_to_frozenset(exclude_groups)

        reducing = " (reducing)" if reduce else ""
        description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
            (reducing, len(infiles), out_prefix)

        output_files = [out_prefix + ".phy", out_prefix + ".partitions"]
        Node.__init__(
            self,
            description=description,
            input_files=input_filenames,
            output_files=output_files,
            dependencies=dependencies)