def __init__(self, infiles, out_prefix, partition_by="123", add_flag=False,
             exclude_groups=(), dependencies=()):
    """Validate the input description and register the node.

    Arguments:
      infiles -- dict whose values are dicts of per-file options. Every
                 option dict must contain "name", may contain a
                 "partition_by" value of length 3, and may only use keys
                 present in _VALID_KEYS.
      out_prefix -- prefix of the two registered output files
                    (out_prefix + ".phy" and out_prefix + ".partitions").
      partition_by -- default partitioning scheme; must have length 3.
      add_flag -- stored unchanged on the node.
      exclude_groups -- stored unchanged on the node.
      dependencies -- forwarded to Node.__init__.

    Raises:
      TypeError -- if 'infiles' is not a dict of dicts.
      ValueError -- if a 'partition_by' value does not have length 3, if
                    any entry lacks "name", or if unknown keys are used.
    """
    if len(partition_by) != 3:
        raise ValueError("Default 'partition_by' must be 3 entires long!")
    elif not isinstance(infiles, dict):
        raise TypeError("'infiles' must be a dictionary")

    options = list(infiles.values())
    # The type check must precede any use of dict methods on the entries;
    # otherwise a non-dict entry fails with AttributeError on '.get'.
    if not all(isinstance(dd, dict) for dd in options):
        raise TypeError("'infiles' must be a dictionary of dictionaries")
    elif any(len(dd.get("partition_by", "123")) != 3 for dd in options):
        raise ValueError("'partition_by' must be 3 entires long!")
    elif not all("name" in dd for dd in options):
        # Every entry needs a name, as the message states.
        raise ValueError("'name' must be specified for all input files")

    # Collect the offending keys explicitly; a generator variable is not
    # visible outside its generator expression.
    invalid_keys = set()
    for dd in options:
        invalid_keys.update(set(dd) - _VALID_KEYS)
    if invalid_keys:
        raise ValueError("Invalid keys found: %s"
                         % ", ".join(sorted(invalid_keys)))

    self._infiles = infiles
    self._out_prefix = out_prefix
    self._part_by = partition_by
    self._add_flag = add_flag
    self._excluded = exclude_groups

    description = \
        "<FastaToPartitionedPhy (default: %s): %i file(s) -> '%s.*'>" \
        % (partition_by, len(infiles), out_prefix)

    Node.__init__(self,
                  description=description,
                  input_files=infiles,
                  output_files=[out_prefix + ".phy",
                                out_prefix + ".partitions"],
                  dependencies=dependencies)
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """Check the input description, collect filenames and register the node.

    'infiles' is expected to have the layout
        {name: {"partitions": ..., "filenames": [...]}}

    Raises TypeError if 'infiles' is not a dict of dicts, and ValueError if
    an entry uses keys outside _VALID_KEYS or if its "filenames" value is
    not a list.
    """
    is_dict_of_dicts = isinstance(infiles, dict) \
        and all(isinstance(entry, dict) for entry in infiles.values())
    if not is_dict_of_dicts:
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    for (name, entry) in infiles.iteritems():
        unknown_keys = set(entry) - _VALID_KEYS
        if unknown_keys:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(unknown_keys)))

        filenames = entry["filenames"]
        if not isinstance(filenames, list):
            raise ValueError("filenames must be a list of strings")
        input_filenames.extend(filenames)

    # Extra files the node should depend on, in addition to the inputs
    # listed in 'infiles'.
    input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    reducing = " (reducing)" if self._reduce else ""
    description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" \
        % (reducing, len(infiles), out_prefix)

    output_files = [out_prefix + ".phy", out_prefix + ".partitions"]
    Node.__init__(self,
                  description=description,
                  input_files=input_filenames,
                  output_files=output_files,
                  dependencies=dependencies)
def __init__(self, infiles, out_partitions, partition_by="123",
             dependencies=()):
    """Validate the input description and register the node.

    Arguments:
      infiles -- dict whose keys are registered as the node's input files
                 and whose values are dicts of per-file options. Every
                 option dict must contain "name", may contain a
                 "partition_by" value of length 3, and may only use keys
                 present in _VALID_KEYS.
      out_partitions -- the single output file registered for the node.
      partition_by -- default partitioning scheme; must have length 3.
      dependencies -- forwarded to Node.__init__.

    Raises:
      TypeError -- if 'infiles' is not a dict of dicts.
      ValueError -- if a 'partition_by' value does not have length 3, if
                    any entry lacks "name", or if unknown keys are used.
    """
    if len(partition_by) != 3:
        raise ValueError("Default 'partition_by' must be 3 entires long!")
    elif not isinstance(infiles, dict):
        raise TypeError("'infiles' must be a dictionary")

    options = list(infiles.values())
    # The type check must precede any use of dict methods on the entries;
    # otherwise a non-dict entry fails with AttributeError on '.get'.
    if not all(isinstance(dd, dict) for dd in options):
        raise TypeError("'infiles' must be a dictionary of dictionaries")
    elif any(len(dd.get("partition_by", "123")) != 3 for dd in options):
        raise ValueError("'partition_by' must be 3 entires long!")
    elif not all("name" in dd for dd in options):
        # Every entry needs a name, as the message states.
        raise ValueError("'name' must be specified for all input files")

    # Collect the offending keys explicitly; a generator variable is not
    # visible outside its generator expression.
    invalid_keys = set()
    for dd in options:
        invalid_keys.update(set(dd) - _VALID_KEYS)
    if invalid_keys:
        raise ValueError("Invalid keys found: %s"
                         % ", ".join(sorted(invalid_keys)))

    self._infiles = infiles
    self._out_part = out_partitions
    self._part_by = partition_by

    description = "<FastaToPartitions (default: %s): %i file(s) -> '%s'>" \
        % (partition_by, len(infiles), out_partitions)

    Node.__init__(self,
                  description=description,
                  input_files=infiles.keys(),
                  output_files=out_partitions,
                  dependencies=dependencies)
def __init__(self, config, target_name, input_files, output_file,
             intervals_file=None, print_stats=False,
             max_contigs=_MAX_CONTIGS, dependencies=()):
    """Store the settings and register inputs, outputs and requirements.

    The registered input files are, in order: the input files themselves,
    the result of swap_ext(path, ".bai") for each of them, and the
    intervals file if one was given. The executable requirement is
    "coverageBed" when an intervals file is given and "genomeCoverageBed"
    otherwise, plus whatever the commands returned by
    concatenate_input_bams require.
    """
    bam_files = safe_coerce_to_tuple(input_files)

    self._target_name = target_name
    self._input_files = bam_files
    self._output_file = output_file
    self._intervals = intervals_file
    self._print_stats = print_stats
    self._max_contigs = max_contigs
    self._max_contigs_reached = False

    required_files = list(bam_files)
    required_files += [swap_ext(bam_file, ".bai") for bam_file in bam_files]
    if intervals_file:
        required_files.append(intervals_file)
        executables = ["coverageBed"]
    else:
        executables = ["genomeCoverageBed"]

    # Inherit the requirements of the commands used to merge the inputs.
    auxiliary_files = []
    for command in concatenate_input_bams(config, bam_files)[0]:
        executables.extend(command.executables)
        auxiliary_files.extend(command.auxiliary_files)

    description = "<DepthHistogram: %s -> '%s'>" \
        % (describe_files(bam_files), output_file)

    Node.__init__(self,
                  description=description,
                  input_files=required_files,
                  output_files=output_file,
                  dependencies=dependencies,
                  executables=executables,
                  auxiliary_files=auxiliary_files)
def __init__(self, input_files, output_file, dependencies=()):
    """Register the node with the given input files and output file.

    The description is built from describe_files(input_files).
    """
    description = "<Detect Input Duplication: %s>" \
        % (describe_files(input_files),)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, input_files, output_file, offset, dependencies=()):
    """Store 'offset' and register the node's input and output files.

    The description is built from describe_files(input_files).
    """
    self._offset = offset

    description = "<Validate FASTQ Files: %s>" \
        % (describe_files(input_files),)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, input_files, output_file, dependencies=()):
    """Register the node and check that it has exactly one output file.

    The description is built from describe_files(input_files).
    """
    description = "<Validate FASTA Files: %s>" \
        % (describe_files(input_files),)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)

    # Sanity check on the output files registered by Node.__init__.
    assert len(self.output_files) == 1, self.output_files
def __init__(self, input_files, output_file, dependencies=()):
    """Remember the output file and register the node's files.

    The description combines describe_files(input_files) with the output
    filename.
    """
    self._output_file = output_file

    description = "<MergeCoverage: '%s' -> '%s'>" \
        % (describe_files(input_files), output_file)

    Node.__init__(self,
                  description=description,
                  input_files=input_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, infiles, out_phy, add_flag=False, dependencies=()):
    """Store the settings and register 'infiles' -> [out_phy].

    The description notes the number of input files and whether
    'add_flag' is set.
    """
    self._add_flag = add_flag
    self._out_phy = out_phy

    if add_flag:
        flag_note = " (w/ flag)"
    else:
        flag_note = ""

    Node.__init__(self,
                  description="<FastaToInterleavedPhy: %i file(s) -> '%s'%s>"
                  % (len(infiles), out_phy, flag_note),
                  input_files=infiles,
                  output_files=[out_phy],
                  dependencies=dependencies)
def __init__(self, input_file, output_file, exclude_groups,
             dependencies=()):
    """Store the settings and register [input_file] -> [output_file].

    'exclude_groups' is stored after being passed through
    safe_coerce_to_tuple.
    """
    self._input_file = input_file
    self._output_file = output_file
    self._excluded = safe_coerce_to_tuple(exclude_groups)

    Node.__init__(self,
                  description="<FastaToPAMLPhy: '%s' -> '%s'>"
                  % (input_file, output_file),
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def __init__(self, reference, bedfile, outfile, dependencies=()):
    """Store the three paths and register the node.

    The reference and the BED file are registered as input files, and
    'outfile' as the only output file.
    """
    self._reference = reference
    self._bedfile = bedfile
    self._outfile = outfile

    Node.__init__(self,
                  description="<ExtractReference: '%s' -> '%s'>"
                  % (reference, outfile),
                  input_files=[reference, bedfile],
                  output_files=[outfile],
                  dependencies=dependencies)
def __init__(self, input_alignment, input_partition, output_alignment,
             seed=None, dependencies=()):
    """Store the paths and seed, and register the node.

    The alignment and the partition file are registered as input files,
    and 'output_alignment' as the only output file.
    """
    self._input_phy = input_alignment
    self._input_part = input_partition
    self._output_phy = output_alignment
    self._seed = seed

    description = "<PHYLIPBootstrap: %r -> %r>" \
        % (input_alignment, output_alignment)

    Node.__init__(self,
                  description=description,
                  input_files=(input_alignment, input_partition),
                  output_files=(output_alignment,),
                  dependencies=dependencies)
def __init__(self, main_tree_files, support_tree_files, output_file,
             dependencies=()):
    """Store the tree files and register the node.

    Both sets of tree files are coerced with safe_coerce_to_tuple and
    registered as input files (main trees first); the description only
    mentions the main tree files.
    """
    main_trees = safe_coerce_to_tuple(main_tree_files)
    support_trees = safe_coerce_to_tuple(support_tree_files)

    self._output_file = output_file
    self._main_tree_files = main_trees
    self._support_tree_files = support_trees

    Node.__init__(self,
                  description="<NewickSupport: %s>"
                  % (describe_files(main_tree_files),),
                  input_files=main_trees + support_trees,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, description, destination, source_nodes):
    """Register a node covering the output files of 'source_nodes'.

    Arguments:
      description -- text inserted into the node description.
      destination -- passed to reroot_path together with each source file
                     to obtain the corresponding output filename.
      source_nodes -- node or nodes whose output files become the input
                      files; they are also registered as dependencies.

    The (input, output) filename pairs are kept in self._files.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = []
    for source_node in source_nodes:
        input_files.extend(source_node.output_files)

    output_files = [reroot_path(destination, fpath) for fpath in input_files]

    # Stored as a list so that the pairs can be iterated more than once;
    # on Python 3 a bare zip() is exhausted after the first pass.
    self._files = list(zip(input_files, output_files))

    Node.__init__(self,
                  description="<Copy %s output to %r>"
                  % (description, destination),
                  input_files=input_files,
                  output_files=output_files,
                  dependencies=source_nodes)
def __init__(self, input_file, output_file, filter_by, dependencies):
    """Store the filtering groups and register the node.

    'filter_by' is copied into a dict; each value is replaced by a set
    holding its members plus the key itself.

    Raises RuntimeError if such a set contains nothing but the key.
    """
    self._input_file = input_file
    self._output_file = output_file
    self._filter_by = dict(filter_by)

    for (to_filter, others) in self._filter_by.items():
        members = set(others)
        members.add(to_filter)
        # 'members' always holds the key itself, so fewer than two
        # elements means that no other group was listed.
        if len(members) < 2:
            raise RuntimeError("Singleton filtering must involve at least one other group")

        self._filter_by[to_filter] = members

    description = "<FilterSingleton: '%s' -> '%s'>" \
        % (input_file, output_file)

    Node.__init__(self,
                  description=description,
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def __init__(self, fasta_files, sequences, destination, dependencies=()):
    """Copy the settings and register the node.

    fasta_files -- { taxon_name_1 : filename_1, ... }
    sequences   -- { interval_name_1 : { taxon_name_1 : interval_name_1.1,
                                         ... },
                     ... }

    One output file, "<name>.fasta" inside 'destination', is registered
    for each key in 'sequences'; the values of 'fasta_files' are
    registered as input files.
    """
    self._infiles = copy.deepcopy(fasta_files)
    self._sequences = copy.deepcopy(sequences)
    self._destination = copy.copy(destination)

    self._outfiles = []
    for name in self._sequences:
        self._outfiles.append(os.path.join(destination, name + ".fasta"))

    description = "<CollectSequences: %i sequences from %i files -> '%s'>" \
        % (len(self._sequences), len(self._infiles), self._destination)

    Node.__init__(self,
                  description=description,
                  input_files=self._infiles.values(),
                  output_files=self._outfiles,
                  dependencies=dependencies)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Store the settings and register tree_files -> output_file.

    The description reads "midpoint" when no taxa are given; otherwise it
    lists the sorted taxa.
    """
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    if self._reroot_on_taxa:
        sorted_taxa = sorted(self._reroot_on_taxa)
        reroot_on = repr("', '".join(sorted_taxa))
    else:
        reroot_on = "midpoint"

    Node.__init__(self,
                  description="<NewickReroot (on %s): %s>"
                  % (reroot_on, describe_files(tree_files)),
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, input_file, output_file, filter_by, dependencies):
    """Store the filtering groups and register the node.

    'filter_by' is copied into a dict; each value is replaced by a
    frozenset of its members with the key itself removed.

    Raises RuntimeError if nothing remains once the key is removed.
    """
    coerce = utilities.safe_coerce_to_frozenset

    self._input_file = input_file
    self._output_file = output_file
    self._filter_by = dict(filter_by)

    for (to_filter, members) in self._filter_by.items():
        # The filtered taxa is implicitly a member of its own group, but
        # is not needed for the filtering itself, so it is dropped here.
        others = coerce(members) - coerce(to_filter)
        if not others:
            raise RuntimeError("Singleton filtering must involve at least one other taxa")

        self._filter_by[to_filter] = others

    description = "<FilterSingleton: '%s' -> '%s'>" \
        % (input_file, output_file)

    Node.__init__(self,
                  description=description,
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def __init__(self, input_file, output_file, filter_by, dependencies):
    """Record which taxa to filter against and register the node.

    Each value of 'filter_by' (copied into a dict) becomes a frozenset of
    its members minus the key it is stored under.

    Raises RuntimeError if a key has no members other than itself.
    """
    self._input_file = input_file
    self._output_file = output_file
    self._filter_by = dict(filter_by)

    for (to_filter, members) in self._filter_by.items():
        # The key is implied to belong to its own group; it is removed
        # since it is not needed when carrying out the filtering.
        member_set = utilities.safe_coerce_to_frozenset(members)
        own_set = utilities.safe_coerce_to_frozenset(to_filter)
        remaining = member_set - own_set
        if not remaining:
            raise RuntimeError("Singleton filtering must involve at least one other taxa")

        self._filter_by[to_filter] = remaining

    Node.__init__(self,
                  description="<FilterSingleton: '%s' -> '%s'>"
                  % (input_file, output_file),
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def __init__(self, config, target_name, input_files, output_file,
             intervals_file=None, print_stats=False,
             max_contigs=_MAX_CONTIGS, dependencies=()):
    """Record the settings and register files and requirements.

    Input files are registered in this order: the given input files, the
    result of swap_ext(path, ".bai") for each of them, and finally the
    intervals file when one is given. "coverageBed" is required when an
    intervals file is given and "genomeCoverageBed" otherwise; the
    requirements of the commands returned by concatenate_input_bams are
    added on top of that.
    """
    self._target_name = target_name
    self._input_files = safe_coerce_to_tuple(input_files)
    self._output_file = output_file
    self._intervals = intervals_file
    self._print_stats = print_stats
    self._max_contigs = max_contigs
    self._max_contigs_reached = False

    index_files = [swap_ext(filename, ".bai")
                   for filename in self._input_files]
    all_inputs = list(self._input_files) + index_files

    executables = ["genomeCoverageBed"]
    if intervals_file:
        all_inputs.append(intervals_file)
        executables = ["coverageBed"]

    # Take over the requirements of the commands that merge the inputs.
    auxiliary_files = []
    merge_commands = concatenate_input_bams(config, self._input_files)[0]
    for command in merge_commands:
        executables.extend(command.executables)
        auxiliary_files.extend(command.auxiliary_files)

    Node.__init__(self,
                  description="<DepthHistogram: %s -> '%s'>"
                  % (describe_files(self._input_files), self._output_file),
                  input_files=all_inputs,
                  output_files=self._output_file,
                  dependencies=dependencies,
                  executables=executables,
                  auxiliary_files=auxiliary_files)
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """Validate 'infiles', gather the input filenames and register the node.

    Expected layout of 'infiles':
        {name: {"partitions": ..., "filenames": [...]}}

    Raises TypeError unless 'infiles' is a dict of dicts, and ValueError
    if an entry has keys outside _VALID_KEYS or a "filenames" value that
    is not a list.
    """
    if not isinstance(infiles, dict):
        raise TypeError("'infiles' must be a dictionary of dictionaries")
    for entry in infiles.values():
        if not isinstance(entry, dict):
            raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    for (name, entry) in infiles.iteritems():
        invalid = set(entry) - _VALID_KEYS
        if invalid:
            message = "Invalid keys found for %r: %s" \
                % (name, ", ".join(invalid))
            raise ValueError(message)
        elif not isinstance(entry["filenames"], list):
            raise ValueError("filenames must be a list of strings")

        input_filenames += entry["filenames"]

    # Optional additional files that the node should depend on.
    input_filenames += safe_coerce_to_tuple(file_dependencies)

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    if reduce:
        suffix = " (reducing)"
    else:
        suffix = ""

    Node.__init__(self,
                  description="<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>"
                  % (suffix, len(infiles), out_prefix),
                  input_files=input_filenames,
                  output_files=[out_prefix + ".phy",
                                out_prefix + ".partitions"],
                  dependencies=dependencies)
def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs,
             dependencies=()):
    """Collect the files needed for the summary and register the node.

    Arguments:
      config -- object whose 'destination' attribute is the folder in
                which "<target.name>.summary" is written.
      makefile -- mapping providing the "Prefixes" and "Statistics" keys.
      target -- object with a 'name' and a prefixes -> samples ->
                libraries -> lanes hierarchy.
      cov_for_lanes, cov_for_libs -- dicts whose values are iterables of
                filenames; all of these are registered as input files.
      dependencies -- forwarded to Node.__init__.

    For every lane, self._in_raw_read maps (sample, library, lane) names
    to the lane's reads statistics file (also registered as an input
    file), or to one of _PE_READS, _SE_READS or _BAMS.

    Raises AssertionError if a lane has reads but neither a statistics
    file nor files matching _PE_READS or _SE_READS.
    """
    self._target = target.name
    self._output_file = os.path.join(config.destination,
                                     self._target + ".summary")
    self._prefixes = makefile["Prefixes"]
    self._makefile = makefile["Statistics"]

    self._in_raw_bams = cov_for_lanes
    self._in_lib_bams = cov_for_libs

    # Update the set directly rather than concatenating lists with sum(),
    # which takes quadratic time in the number of files.
    input_files = set()
    for coverage in (self._in_raw_bams, self._in_lib_bams):
        for filenames in coverage.values():
            input_files.update(filenames)

    self._in_raw_read = collections.defaultdict(list)
    for prefix in target.prefixes:
        for sample in prefix.samples:
            for library in sample.libraries:
                for lane in library.lanes:
                    if not lane.reads:
                        value = _BAMS
                    elif lane.reads.stats:
                        value = lane.reads.stats
                        input_files.add(value)
                    elif set(lane.reads.files) & _PE_READS:
                        value = _PE_READS
                    elif set(lane.reads.files) & _SE_READS:
                        value = _SE_READS
                    else:
                        # Raised explicitly: a bare 'assert False' is
                        # stripped under -O, which would silently reuse
                        # the value of the previous lane.
                        raise AssertionError(
                            "Unrecognized read files for lane %r"
                            % (lane.name,))

                    key = (sample.name, library.name, lane.name)
                    self._in_raw_read[key] = value

    Node.__init__(self,
                  description="<Summary: %s>" % self._output_file,
                  # Drop empty entries; built as a list so that the same
                  # type is passed on Python 2 and Python 3.
                  input_files=[fname for fname in input_files if fname],
                  output_files=[self._output_file],
                  dependencies=dependencies)
def __init__(self, fasta_files, sequences, destination, dependencies=()):
    """Copy the settings and register the node.

    fasta_files -- { taxon_name_1 : filename_1, ... }
    sequences   -- { interval_name_1, ... }

    One output file, "<name>.fasta" inside 'destination', is registered
    per sequence name. The input files are the FASTA filenames followed
    by the same filenames with ".fai" appended.
    """
    self._infiles = copy.deepcopy(fasta_files)
    self._sequences = utilities.safe_coerce_to_frozenset(sequences)
    self._destination = copy.copy(destination)

    self._outfiles = []
    for name in self._sequences:
        self._outfiles.append(os.path.join(destination, name + ".fasta"))

    fasta_filenames = list(self._infiles.itervalues())
    index_filenames = [filename + ".fai" for filename in fasta_filenames]

    Node.__init__(self,
                  description="<CollectSequences: %i sequences from %i files -> '%s'>"
                  % (len(self._sequences),
                     len(self._infiles),
                     self._destination),
                  input_files=fasta_filenames + index_filenames,
                  output_files=self._outfiles,
                  dependencies=dependencies)
def __init__(self, fasta_files, sequences, destination, dependencies=()):
    """Store copies of the settings and register the node.

    fasta_files -- { taxon_name_1 : filename_1, ... }
    sequences   -- { interval_name_1, ... }

    Each sequence name yields one output file named "<name>.fasta" in
    'destination'. Every FASTA filename is registered as an input file,
    followed by each filename with ".fai" appended.
    """
    infiles = copy.deepcopy(fasta_files)
    names = utilities.safe_coerce_to_frozenset(sequences)

    self._infiles = infiles
    self._sequences = names
    self._destination = copy.copy(destination)
    self._outfiles = [os.path.join(destination, "%s.fasta" % (name,))
                      for name in names]

    input_files = list(infiles.itervalues())
    input_files.extend(filename + ".fai"
                       for filename in infiles.itervalues())

    summary = (len(names), len(infiles), self._destination)
    desc = "<CollectSequences: %i sequences from %i files -> '%s'>" % summary

    Node.__init__(self,
                  description=desc,
                  input_files=input_files,
                  output_files=self._outfiles,
                  dependencies=dependencies)
def __init__(self, anal, d_bam, bed_name, bed_path, gcnode, splitnode,
             dependencies=()):
    """Look up the analysis, derive the output path and register the node.

    The analysis is taken from MODULES[anal]. The output filename is built
    with d_bam.fmt from the BAM name, the analysis name returned by
    self._correct_subnodes and 'bed_name', and is placed in
    d_bam.bam_temp_local.
    """
    self.analysis = MODULES[anal]
    self.infile = d_bam.baminfo['BamPath']
    self.d_bam = d_bam
    self.bed_path = bed_path
    self.anal = anal

    # NOTE(review): the 'dependencies' argument is never forwarded; the
    # nodes returned by _correct_subnodes are used instead. Confirm that
    # this is intended.
    subnodes, analname = self._correct_subnodes(gcnode, splitnode, anal)

    out_f_name = d_bam.fmt.format(d_bam.bam_name, analname, bed_name)
    self.dest = os.path.join(d_bam.bam_temp_local, out_f_name)

    Node.__init__(self,
                  description="<ANALYSIS:'%s', BAM: %s, Bed:'%s'>"
                  % (analname, d_bam.bam_name, bed_name),
                  input_files=[self.infile, bed_path],
                  output_files=self.dest,
                  dependencies=subnodes)
def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs,
             dependencies=()):
    """Collect the files needed for the summary and register the node.

    Arguments:
      config -- object whose 'destination' attribute is the folder in
                which "<target.name>.summary" is written.
      makefile -- mapping providing the "Prefixes" and "Statistics" keys.
      target -- object with a 'name' and a prefixes -> samples ->
                libraries -> lanes hierarchy.
      cov_for_lanes, cov_for_libs -- dicts whose values are iterables of
                filenames; all of these are registered as input files.
      dependencies -- forwarded to Node.__init__.

    For every lane, self._in_raw_read maps (sample, library, lane) names
    to the lane's reads statistics file (also registered as an input
    file), or to one of _PE_READS, _SE_READS or _BAMS.

    Raises AssertionError if a lane has reads but neither a statistics
    file nor files matching _PE_READS or _SE_READS.
    """
    self._target = target.name
    self._output_file = os.path.join(config.destination,
                                     self._target + ".summary")
    self._prefixes = makefile["Prefixes"]
    self._makefile = makefile["Statistics"]

    self._in_raw_bams = cov_for_lanes
    self._in_lib_bams = cov_for_libs

    # Update the set directly rather than concatenating lists with sum(),
    # which takes quadratic time in the number of files.
    input_files = set()
    for filenames in self._in_raw_bams.values():
        input_files.update(filenames)
    for filenames in self._in_lib_bams.values():
        input_files.update(filenames)

    self._in_raw_read = collections.defaultdict(list)
    for prefix in target.prefixes:
        for sample in prefix.samples:
            for library in sample.libraries:
                for lane in library.lanes:
                    if not lane.reads:
                        value = _BAMS
                    elif lane.reads.stats:
                        value = lane.reads.stats
                        input_files.add(value)
                    elif set(lane.reads.files) & _PE_READS:
                        value = _PE_READS
                    elif set(lane.reads.files) & _SE_READS:
                        value = _SE_READS
                    else:
                        # Raised explicitly: a bare 'assert False' is
                        # stripped under -O, which would silently reuse
                        # the value of the previous lane.
                        raise AssertionError(
                            "Unrecognized read files for lane %r"
                            % (lane.name,))

                    key = (sample.name, library.name, lane.name)
                    self._in_raw_read[key] = value

    Node.__init__(self,
                  description="<Summary: %s>" % self._output_file,
                  # Drop empty entries; built as a list so that the same
                  # type is passed on Python 2 and Python 3.
                  input_files=[fname for fname in input_files if fname],
                  output_files=[self._output_file],
                  dependencies=dependencies)
def __init__(self):
    """Attach a lambda attribute to the instance, then initialise the Node.

    The attribute is assigned before Node.__init__ is called.
    """
    self.a_property = lambda: None  # pragma: no coverage

    Node.__init__(self)