Example #1
    def __init__(self, infiles, out_prefix, partition_by = "123", add_flag = False, exclude_groups = (), dependencies = ()):
        if len(partition_by) != 3:
            raise ValueError("Default 'partition_by' must be 3 entries long!")
        elif not isinstance(infiles, dict):
            raise TypeError("'infiles' must be a dictionary")
        elif not all(isinstance(dd, dict) for dd in infiles.values()):
            raise TypeError("'infiles' must be a dictionary of dictionaries")
        elif any(len(dd.get("partition_by", "123")) != 3 for dd in infiles.values()):
            raise ValueError("'partition_by' must be 3 entries long!")
        elif not all(("name" in dd) for dd in infiles.values()):
            raise ValueError("'name' must be specified for all input files")
        for dd in infiles.values():
            invalid_keys = set(dd) - _VALID_KEYS
            if invalid_keys:
                raise ValueError("Invalid keys found: %s" % ", ".join(invalid_keys))

        self._infiles    = infiles
        self._out_prefix = out_prefix
        self._part_by    = partition_by
        self._add_flag   = add_flag
        self._excluded   = exclude_groups

        description  = "<FastaToPartitionedPhy (default: %s): %i file(s) -> '%s.*'>" % \
            (partition_by, len(infiles), out_prefix)

        Node.__init__(self,
                      description  = description,
                      input_files  = infiles,
                      output_files = [out_prefix + ".phy", out_prefix + ".partitions"],
                      dependencies = dependencies)
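
For reference, a minimal usage sketch implied by the validation above; the class name is taken from the description string, the filenames and group name are invented, and it assumes _VALID_KEYS permits the "name" and "partition_by" keys:

    node = FastaToPartitionedPhy(
        infiles={"chr1.fasta": {"name": "chr1"},                        # hypothetical inputs
                 "chr2.fasta": {"name": "chr2", "partition_by": "112"}},
        out_prefix="results/alignment",  # writes results/alignment.phy and .partitions
        exclude_groups=("outgroup",),    # invented group name
    )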
Example #2
    def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
                 dependencies=(), file_dependencies=()):
        """
        infiles = {names : {"partitions" : ..., "filenames" : [...]}}
        """
        if not (isinstance(infiles, dict)
                and all(isinstance(dd, dict) for dd in infiles.values())):
            raise TypeError("'infiles' must be a dictionary of dictionaries")

        input_filenames = []
        for (name, subdd) in infiles.items():
            if set(subdd) - _VALID_KEYS:
                raise ValueError("Invalid keys found for %r: %s"
                                 % (name, ", ".join(set(subdd) - _VALID_KEYS)))
            elif not isinstance(subdd["filenames"], list):
                raise ValueError("filenames must be a list of strings")
            input_filenames.extend(subdd["filenames"])
        # Optional file dependencies; used to depend on the list of sequences
        input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

        self._reduce = bool(reduce)
        self._infiles = copy.deepcopy(infiles)
        self._out_prefix = out_prefix
        self._excluded = safe_coerce_to_frozenset(exclude_groups)

        description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
            (" (reducing)" if reduce else "", len(infiles), out_prefix)

        Node.__init__(self,
                      description=description,
                      input_files=input_filenames,
                      output_files=[out_prefix + ".phy",
                                    out_prefix + ".partitions"],
                      dependencies=dependencies)
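
A minimal sketch matching the docstring's layout, with invented names and filenames; the class name is taken from the description string, and it assumes _VALID_KEYS permits the "partitions" and "filenames" keys:

    node = FastaToPartitionedPhy(
        infiles={"chr1": {"partitions": "123",           # hypothetical entry
                          "filenames": ["chr1.fasta"]}},
        out_prefix="results/alignment",
        reduce=True,   # enables the "(reducing)" variant in the description
    )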
Example #3
    def __init__(self, infiles, out_partitions, partition_by = "123", dependencies = ()):
        if len(partition_by) != 3:
            raise ValueError("Default 'partition_by' must be 3 entries long!")
        elif not isinstance(infiles, dict):
            raise TypeError("'infiles' must be a dictionary")
        elif not all(isinstance(dd, dict) for dd in infiles.values()):
            raise TypeError("'infiles' must be a dictionary of dictionaries")
        elif any(len(dd.get("partition_by", "123")) != 3 for dd in infiles.values()):
            raise ValueError("'partition_by' must be 3 entries long!")
        elif not all(("name" in dd) for dd in infiles.values()):
            raise ValueError("'name' must be specified for all input files")
        for dd in infiles.values():
            invalid_keys = set(dd) - _VALID_KEYS
            if invalid_keys:
                raise ValueError("Invalid keys found: %s" % ", ".join(invalid_keys))

        self._infiles   = infiles
        self._out_part  = out_partitions
        self._part_by   = partition_by

        description  = "<FastaToPartitions (default: %s): %i file(s) -> '%s'>" % \
            (partition_by, len(infiles), out_partitions)

        Node.__init__(self,
                      description  = description,
                      input_files  = infiles.keys(),
                      output_files = out_partitions,
                      dependencies = dependencies)
Example #4
    def __init__(self, config, target_name, input_files, output_file, intervals_file = None, print_stats = False, max_contigs = _MAX_CONTIGS, dependencies = ()):
        self._target_name = target_name
        self._input_files = safe_coerce_to_tuple(input_files)
        self._output_file = output_file
        self._intervals   = intervals_file
        self._print_stats = print_stats
        self._max_contigs = max_contigs
        self._max_contigs_reached = False

        input_files = []
        input_files.extend(self._input_files)
        input_files.extend(swap_ext(input_file, ".bai") for input_file in self._input_files)
        if intervals_file:
            input_files.append(intervals_file)

        executables = ["coverageBed"] if intervals_file else ["genomeCoverageBed"]
        auxiliary_files = []
        for cmd in concatenate_input_bams(config, self._input_files)[0]:
            executables.extend(cmd.executables)
            auxiliary_files.extend(cmd.auxiliary_files)

        Node.__init__(self,
                      description  = "<DepthHistogram: %s -> '%s'>" \
                        % (describe_files(self._input_files),
                           self._output_file),
                      input_files  = input_files,
                      output_files = self._output_file,
                      dependencies = dependencies,
                      executables  = executables,
                      auxiliary_files = auxiliary_files)
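
The expansion above pairs every BAM with a .bai index; a self-contained sketch, assuming swap_ext merely replaces the final extension (the helper below is a hypothetical stand-in):

    import os

    def swap_ext(path, ext):
        # hypothetical stand-in: replace the final extension with `ext`
        return os.path.splitext(path)[0] + ext

    bams = ["sample1.bam", "sample2.bam"]
    inputs = list(bams) + [swap_ext(fname, ".bai") for fname in bams]
    # inputs == ['sample1.bam', 'sample2.bam', 'sample1.bai', 'sample2.bai']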
Example #5
    def __init__(self, input_files, output_file, dependencies=()):
        Node.__init__(self,
                      description="<Detect Input Duplication: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)
Example #6
    def __init__(self, input_files, output_file, offset, dependencies=()):
        self._offset = offset
        Node.__init__(self,
                      description="<Validate FASTQ Files: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)
Example #7
    def __init__(self, input_files, output_file, dependencies=()):
        Node.__init__(self,
                      description="<Validate FASTA Files: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)

        assert len(self.output_files) == 1, self.output_files
Example #8
    def __init__(self, input_files, output_file, dependencies=()):
        self._output_file = output_file

        Node.__init__(
            self,
            description="<MergeCoverage: '%s' -> '%s'>" % (describe_files(input_files), self._output_file),
            input_files=input_files,
            output_files=self._output_file,
            dependencies=dependencies,
        )
Example #9
    def __init__(self, infiles, out_phy, add_flag = False, dependencies = ()):
        self._add_flag  = add_flag
        self._out_phy   = out_phy

        description  = "<FastaToInterleavedPhy: %i file(s) -> '%s'%s>" % \
            (len(infiles), out_phy, (" (w/ flag)" if add_flag else ""))

        Node.__init__(self,
                      description  = description,
                      input_files  = infiles,
                      output_files = [out_phy],
                      dependencies = dependencies)
Example #10
File: paml.py  Project: schae234/pypeline
    def __init__(self, input_file, output_file, exclude_groups, dependencies = ()):
        self._input_file  = input_file
        self._output_file = output_file
        self._excluded = safe_coerce_to_tuple(exclude_groups)
        description  = "<FastaToPAMLPhy: '%s' -> '%s'>" % \
            (input_file, output_file)

        Node.__init__(self,
                      description  = description,
                      input_files  = [input_file],
                      output_files = [output_file],
                      dependencies = dependencies)
Example #11
    def __init__(self, reference, bedfile, outfile, dependencies=()):
        self._reference = reference
        self._bedfile = bedfile
        self._outfile = outfile

        description = "<ExtractReference: '%s' -> '%s'>" \
            % (reference, outfile)
        Node.__init__(self,
                      description=description,
                      input_files=[reference, bedfile],
                      output_files=[outfile],
                      dependencies=dependencies)
Example #12
    def __init__(self, input_alignment, input_partition, output_alignment,
                 seed = None, dependencies = ()):
        self._input_phy  = input_alignment
        self._input_part = input_partition
        self._output_phy = output_alignment
        self._seed       = seed

        Node.__init__(self,
                      description  = "<PHYLIPBootstrap: %r -> %r>" \
                        % (input_alignment, output_alignment),
                      input_files  = (input_alignment, input_partition),
                      output_files = (output_alignment,),
                      dependencies = dependencies)
Example #13
    def __init__(self, main_tree_files, support_tree_files, output_file, dependencies = ()):
        self._output_file        = output_file
        self._main_tree_files    = safe_coerce_to_tuple(main_tree_files)
        self._support_tree_files = safe_coerce_to_tuple(support_tree_files)
        input_files = self._main_tree_files + self._support_tree_files

        description  = "<NewickSupport: %s>" % \
          (describe_files(main_tree_files),)

        Node.__init__(self,
                      description  = description,
                      input_files  = input_files,
                      output_files = output_file,
                      dependencies = dependencies)
Example #14
File: misc.py  Project: CarlesV/paleomix
    def __init__(self, description, destination, source_nodes):
        source_nodes = safe_coerce_to_tuple(source_nodes)

        input_files  = []
        for source_node in source_nodes:
            input_files.extend(source_node.output_files)

        output_files = [reroot_path(destination, fpath) for fpath in input_files]
        self._files  = list(zip(input_files, output_files))

        Node.__init__(self,
                      description  = "<Copy %s output to %r>" % (description, destination),
                      input_files  = input_files,
                      output_files = output_files,
                      dependencies = source_nodes)
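
A sketch of the (input, output) pairing built above, assuming reroot_path re-parents each file under destination while keeping its basename (the helper below is a stand-in for that assumed behavior):

    import os

    def reroot_path(destination, fpath):
        # assumed behavior: keep the basename, re-parent under `destination`
        return os.path.join(destination, os.path.basename(fpath))

    outputs = ["tmp/a.phy", "tmp/b.phy"]
    files = list(zip(outputs, (reroot_path("results", fpath) for fpath in outputs)))
    # files == [('tmp/a.phy', 'results/a.phy'), ('tmp/b.phy', 'results/b.phy')]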
Example #15
    def __init__(self, input_file, output_file, filter_by, dependencies):
        self._input_file      = input_file
        self._output_file     = output_file
        self._filter_by       = dict(filter_by)
        for (to_filter, groups) in self._filter_by.items():
            groups = set(groups) | set([to_filter])
            if len(groups) == 1:
                raise RuntimeError("Singleton filtering must involve at least one other group")
            self._filter_by[to_filter] = groups

        Node.__init__(self,
                      description  = "<FilterSingleton: '%s' -> '%s'>" \
                            % (input_file, output_file),
                      input_files  = [input_file],
                      output_files = [output_file],
                      dependencies = dependencies)
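
A usage sketch for the filter_by mapping, with invented taxon names and the class name taken from the description string; each key must map to at least one other group:

    node = FilterSingleton(
        input_file="alignment.fasta",                # hypothetical filenames
        output_file="alignment.filtered.fasta",
        filter_by={"taxonA": ("taxonB", "taxonC")},  # key is merged into its group
        dependencies=(),
    )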
Example #16
    def __init__(self, fasta_files, sequences, destination, dependencies = ()):
        """
        fasta_files -- { taxon_name_1 : filename_1, ... }
        sequences   -- { interval_name_1 : { taxon_name_1 : interval_name_1.1, ... }, ... }
        """

        self._infiles     = copy.deepcopy(fasta_files)
        self._sequences   = copy.deepcopy(sequences)
        self._destination = copy.copy(destination)
        self._outfiles    = [os.path.join(destination, name + ".fasta") for name in self._sequences]

        Node.__init__(self,
                      description  = "<CollectSequences: %i sequences from %i files -> '%s'>" \
                            % (len(self._sequences), len(self._infiles), self._destination),
                      input_files  = self._infiles.values(),
                      output_files = self._outfiles,
                      dependencies = dependencies)
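
A usage sketch following the docstring's layout, with invented taxon and interval names and the class name taken from the description string:

    node = CollectSequences(
        fasta_files={"taxonA": "taxonA.fasta",       # hypothetical inputs
                     "taxonB": "taxonB.fasta"},
        sequences={"region1": {"taxonA": "region1.1", "taxonB": "region1.2"}},
        destination="results/sequences",  # writes results/sequences/region1.fasta
    )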
Example #17
    def __init__(self, tree_files, output_file, taxa = (), dependencies = ()):
        self._output_file    = output_file
        self._tree_files     = safe_coerce_to_tuple(tree_files)
        self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

        reroot_on = "midpoint"
        if self._reroot_on_taxa:
            reroot_on = repr("', '".join(sorted(self._reroot_on_taxa)))

        description  = "<NewickReroot (on %s): %s>" % \
          (reroot_on, describe_files(tree_files),)

        Node.__init__(self,
                      description  = description,
                      input_files  = self._tree_files,
                      output_files = self._output_file,
                      dependencies = dependencies)
Example #18
    def __init__(self, input_file, output_file, filter_by, dependencies):
        self._input_file = input_file
        self._output_file = output_file
        self._filter_by = dict(filter_by)
        for (to_filter, groups) in self._filter_by.items():
            # The taxon to be filtered is implied to be part of the group,
            # but is not needed when actually carrying out the filtering
            groups = utilities.safe_coerce_to_frozenset(groups) - utilities.safe_coerce_to_frozenset(to_filter)

            if not groups:
                raise RuntimeError("Singleton filtering must involve at least " "one other taxa")
            self._filter_by[to_filter] = groups

        Node.__init__(
            self,
            description="<FilterSingleton: '%s' -> '%s'>" % (input_file, output_file),
            input_files=[input_file],
            output_files=[output_file],
            dependencies=dependencies,
        )
Example #19
    def __init__(self, input_file, output_file, filter_by, dependencies):
        self._input_file = input_file
        self._output_file = output_file
        self._filter_by = dict(filter_by)
        for (to_filter, groups) in self._filter_by.items():
            # The taxon to be filtered is implied to be part of the group,
            # but is not needed when actually carrying out the filtering
            groups = utilities.safe_coerce_to_frozenset(groups) \
                - utilities.safe_coerce_to_frozenset(to_filter)

            if not groups:
                raise RuntimeError("Singleton filtering must involve at least "
                                   "one other taxa")
            self._filter_by[to_filter] = groups

        Node.__init__(self,
                      description="<FilterSingleton: '%s' -> '%s'>"
                      % (input_file, output_file),
                      input_files=[input_file],
                      output_files=[output_file],
                      dependencies=dependencies)
Example #20
    def __init__(self,
                 config,
                 target_name,
                 input_files,
                 output_file,
                 intervals_file=None,
                 print_stats=False,
                 max_contigs=_MAX_CONTIGS,
                 dependencies=()):
        self._target_name = target_name
        self._input_files = safe_coerce_to_tuple(input_files)
        self._output_file = output_file
        self._intervals = intervals_file
        self._print_stats = print_stats
        self._max_contigs = max_contigs
        self._max_contigs_reached = False

        input_files = []
        input_files.extend(self._input_files)
        input_files.extend(
            swap_ext(input_file, ".bai") for input_file in self._input_files)
        if intervals_file:
            input_files.append(intervals_file)

        executables = ["coverageBed"
                       ] if intervals_file else ["genomeCoverageBed"]
        auxiliary_files = []
        for cmd in concatenate_input_bams(config, self._input_files)[0]:
            executables.extend(cmd.executables)
            auxiliary_files.extend(cmd.auxiliary_files)

        Node.__init__(self,
                      description  = "<DepthHistogram: %s -> '%s'>" \
                        % (describe_files(self._input_files),
                           self._output_file),
                      input_files  = input_files,
                      output_files = self._output_file,
                      dependencies = dependencies,
                      executables  = executables,
                      auxiliary_files = auxiliary_files)
Example #21
    def __init__(self,
                 infiles,
                 out_prefix,
                 exclude_groups=(),
                 reduce=False,
                 dependencies=(),
                 file_dependencies=()):
        """
        infiles = {names : {"partitions" : ..., "filenames" : [...]}}
        """
        if not (isinstance(infiles, dict)
                and all(isinstance(dd, dict) for dd in infiles.values())):
            raise TypeError("'infiles' must be a dictionary of dictionaries")

        input_filenames = []
        for (name, subdd) in infiles.items():
            if set(subdd) - _VALID_KEYS:
                raise ValueError("Invalid keys found for %r: %s" %
                                 (name, ", ".join(set(subdd) - _VALID_KEYS)))
            elif not isinstance(subdd["filenames"], list):
                raise ValueError("filenames must be a list of strings")
            input_filenames.extend(subdd["filenames"])
        # Optional file dependencies; used to depend on the list of sequences
        input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

        self._reduce = bool(reduce)
        self._infiles = copy.deepcopy(infiles)
        self._out_prefix = out_prefix
        self._excluded = safe_coerce_to_frozenset(exclude_groups)

        description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
            (" (reducing)" if reduce else "", len(infiles), out_prefix)

        Node.__init__(
            self,
            description=description,
            input_files=input_filenames,
            output_files=[out_prefix + ".phy", out_prefix + ".partitions"],
            dependencies=dependencies)
Example #22
    def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs, dependencies=()):
        self._target = target.name
        self._output_file = os.path.join(config.destination, self._target + ".summary")
        self._prefixes = makefile["Prefixes"]
        self._makefile = makefile["Statistics"]

        self._in_raw_bams = cov_for_lanes
        self._in_lib_bams = cov_for_libs
        input_files = set()
        input_files.update(sum(map(list, self._in_raw_bams.values()), []))
        input_files.update(sum(map(list, self._in_lib_bams.values()), []))

        self._in_raw_read = collections.defaultdict(list)
        for prefix in target.prefixes:
            for sample in prefix.samples:
                for library in sample.libraries:
                    for lane in library.lanes:
                        if lane.reads:
                            if lane.reads.stats:
                                value = lane.reads.stats
                                input_files.add(value)
                            elif set(lane.reads.files) & _PE_READS:
                                value = _PE_READS
                            elif set(lane.reads.files) & _SE_READS:
                                value = _SE_READS
                            else:
                                assert False
                        else:
                            value = _BAMS
                        self._in_raw_read[(sample.name, library.name, lane.name)] = value

        Node.__init__(
            self,
            description="<Summary: %s>" % self._output_file,
            input_files=[fname for fname in input_files if fname],
            output_files=[self._output_file],
            dependencies=dependencies,
        )
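
In isolation, the two update() calls above flatten dict-of-list values into a single collection:

    cov_for_lanes = {("target", "sample"): ["lane1.bam", "lane2.bam"]}  # invented data
    flattened = sum(map(list, cov_for_lanes.values()), [])
    # flattened == ['lane1.bam', 'lane2.bam']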
Example #23
    def __init__(self, fasta_files, sequences, destination, dependencies=()):
        """
        fasta_files -- { taxon_name_1 : filename_1, ... }
        sequences   -- { interval_name_1, ... }
        """

        self._infiles = copy.deepcopy(fasta_files)
        self._sequences = utilities.safe_coerce_to_frozenset(sequences)
        self._destination = copy.copy(destination)
        self._outfiles = [os.path.join(destination, name + ".fasta") for name in self._sequences]

        input_files = list(self._infiles.values())
        for filename in self._infiles.values():
            input_files.append(filename + ".fai")

        desc = "<CollectSequences: %i sequences from %i files -> '%s'>" % (
            len(self._sequences),
            len(self._infiles),
            self._destination,
        )
        Node.__init__(
            self, description=desc, input_files=input_files, output_files=self._outfiles, dependencies=dependencies
        )
Example #24
    def __init__(self, fasta_files, sequences, destination, dependencies=()):
        """
        fasta_files -- { taxon_name_1 : filename_1, ... }
        sequences   -- { interval_name_1, ... }
        """

        self._infiles = copy.deepcopy(fasta_files)
        self._sequences = utilities.safe_coerce_to_frozenset(sequences)
        self._destination = copy.copy(destination)
        self._outfiles = [os.path.join(destination, name + ".fasta")
                          for name in self._sequences]

        input_files = list(self._infiles.values())
        for filename in self._infiles.values():
            input_files.append(filename + ".fai")

        desc = "<CollectSequences: %i sequences from %i files -> '%s'>" \
               % (len(self._sequences), len(self._infiles), self._destination)
        Node.__init__(self,
                      description=desc,
                      input_files=input_files,
                      output_files=self._outfiles,
                      dependencies=dependencies)
Example #25
    def __init__(self,
                 anal,
                 d_bam,
                 bed_name,
                 bed_path,
                 gcnode,
                 splitnode,
                 dependencies=()):
        self.analysis = MODULES[anal]
        self.infile, self.d_bam = d_bam.baminfo['BamPath'], d_bam
        self.bed_path, self.anal = bed_path, anal
        dependencies, analname = self._correct_subnodes(
            gcnode, splitnode, anal)

        out_f_name = d_bam.fmt.format(d_bam.bam_name, analname, bed_name)
        self.dest = os.path.join(d_bam.bam_temp_local, out_f_name)

        description = "<ANALYSIS:'%s', BAM: %s, Bed:'%s'>" % \
                      (analname, d_bam.bam_name, bed_name)
        Node.__init__(self,
                      description=description,
                      input_files=[self.infile, bed_path],
                      output_files=self.dest,
                      dependencies=dependencies)
Example #26
    def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs, dependencies = ()):
        self._target        = target.name
        self._output_file   = os.path.join(config.destination, self._target + ".summary")
        self._prefixes      = makefile["Prefixes"]
        self._makefile      = makefile["Statistics"]

        self._in_raw_bams = cov_for_lanes
        self._in_lib_bams = cov_for_libs
        input_files = set()
        input_files.update(sum(map(list, self._in_raw_bams.values()), []))
        input_files.update(sum(map(list, self._in_lib_bams.values()), []))

        self._in_raw_read = collections.defaultdict(list)
        for prefix in target.prefixes:
            for sample in prefix.samples:
                for library in sample.libraries:
                    for lane in library.lanes:
                        if lane.reads:
                            if lane.reads.stats:
                                value = lane.reads.stats
                                input_files.add(value)
                            elif set(lane.reads.files) & _PE_READS:
                                value = _PE_READS
                            elif set(lane.reads.files) & _SE_READS:
                                value = _SE_READS
                            else:
                                assert False
                        else:
                            value = _BAMS
                        self._in_raw_read[(sample.name, library.name, lane.name)] = value

        Node.__init__(self,
                      description  = "<Summary: %s>" % self._output_file,
                      input_files  = [fname for fname in input_files if fname],
                      output_files = [self._output_file],
                      dependencies = dependencies)
Example #27
    def __init__(self):
        self.a_property = lambda: None  # pragma: no coverage
        Node.__init__(self)