def add_nodes(self, *nodes):
    """Appends one or more Node objects to this object's node list.

    Each positional argument may be a single Node or a (nested) sequence
    of Nodes; both levels are flattened via safe_coerce_to_tuple.

    Raises TypeError if any flattened value is not a Node instance.
    """
    for subnodes in safe_coerce_to_tuple(nodes):
        for node in safe_coerce_to_tuple(subnodes):
            if not isinstance(node, Node):
                # Fixed typo in error message ("recieved" -> "received")
                raise TypeError("Node object expected, received %s"
                                % repr(node))
            self._nodes.append(node)
def customize(cls, input_file_1, input_file_2, output_file, reference,
              prefix, threads=2, log_file=None, dependencies=()):
    """Builds the customizable Bowtie2 alignment command.

    Either 'input_file_1' alone (single-end / unpaired reads) or both
    'input_file_1' and 'input_file_2' (paired-end reads) must be given;
    a NodeError is raised otherwise. Returns a dict with the sub-commands,
    their execution order, the thread count actually used, and the node
    dependencies.
    """
    # Setting IN_FILE_2 to None makes AtomicCmd ignore this key
    aln = _bowtie2_template(("bowtie2", ), prefix,
                            OUT_STDOUT=AtomicCmd.PIPE,
                            CHECK_VERSION=BOWTIE2_VERSION)
    aln.set_option("-x", prefix)

    if log_file is not None:
        aln.set_kwargs(OUT_STDERR=log_file)

    if input_file_1 and not input_file_2:
        # Unpaired reads; each file is registered as IN_FILE_1_nn
        aln.add_multiple_options("-U", safe_coerce_to_tuple(input_file_1),
                                 template="IN_FILE_1_%02i")
    elif input_file_1 and input_file_2:
        # Paired reads; mate 1 and mate 2 files are passed separately
        aln.add_multiple_options("-1", safe_coerce_to_tuple(input_file_1),
                                 template="IN_FILE_1_%02i")
        aln.add_multiple_options("-2", safe_coerce_to_tuple(input_file_2),
                                 template="IN_FILE_2_%02i")
    else:
        raise NodeError("Input 1, OR both input 1 and input 2 must "
                        "be specified for Bowtie2 node")

    # The thread count may be capped depending on the reference
    max_threads = _get_max_threads(reference, threads)
    aln.set_option("--threads", max_threads)

    # Fixmate is only meaningful for paired-end data
    run_fixmate = input_file_1 and input_file_2
    order, commands = _process_output(aln, output_file, reference,
                                      run_fixmate=run_fixmate)

    commands["aln"] = aln
    return {
        "commands": commands,
        "order": ["aln"] + order,
        "threads": max_threads,
        "dependencies": dependencies
    }
def __init__(self, main_tree_files, support_tree_files, output_file,
             dependencies=()):
    """Node adding support values to the trees in 'main_tree_files',
    derived from the trees in 'support_tree_files'; the annotated trees
    are written to 'output_file'.
    """
    self._output_file = output_file
    self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
    self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

    Node.__init__(self,
                  description="<NewickSupport: %s>"
                  % (describe_files(main_tree_files),),
                  input_files=self._main_tree_files
                  + self._support_tree_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, config, reference, infiles, outfile, threads=1,
             dependencies=()):
    """Node running GATK RealignerTargetCreator, which identifies
    candidate intervals for indel realignment in the input BAMs.

    config    -- pipeline configuration (provides jar_root / jre_options).
    reference -- reference FASTA; the matching .dict file is required too.
    infiles   -- one or more input BAM files (coerced to a tuple).
    outfile   -- output intervals file.
    threads   -- requested thread count; may be capped for the reference.
    """
    threads = _get_max_threads(reference, threads)
    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    command = AtomicJavaCmdBuilder(jar_file,
                                   jre_options=config.jre_options)
    command.set_option("-T", "RealignerTargetCreator")
    command.set_option("-R", "%(IN_REFERENCE)s")
    command.set_option("-o", "%(OUT_INTERVALS)s")
    # -nt: number of data threads used by GATK
    command.set_option("-nt", threads)
    _set_input_files(command, infiles)
    command.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                       OUT_INTERVALS=outfile,
                       CHECK_GATK=_get_gatk_version_check(config))

    description = "<GATK Indel Realigner (training): %s -> %r>" \
        % (describe_files(infiles), outfile)
    CommandNode.__init__(self,
                         threads=threads,
                         description=description,
                         command=command.finalize(),
                         dependencies=dependencies)
def __init__(self, config, target, prefix, lanes, name):
    """Library-level node: merges per-lane BAMs, optionally removes or
    marks PCR duplicates, and runs mapDamage / duplicate-histogram steps.
    """
    self.name = name
    self.lanes = safe_coerce_to_tuple(lanes)
    # BUGFIX: read shared values via self.lanes (the coerced tuple); the
    # original indexed the raw 'lanes' argument, which fails if a single
    # Lane object (rather than a sequence) is passed in.
    self.options = self.lanes[0].options
    self.folder = os.path.dirname(self.lanes[0].folder)

    # All lanes in a library must share the same folder and options
    assert all((self.folder == os.path.dirname(lane.folder))
               for lane in self.lanes)
    assert all((self.options == lane.options) for lane in self.lanes)

    lane_bams = self._collect_bams_by_type(self.lanes)

    pcr_duplicates = self.options["PCRDuplicates"]
    if pcr_duplicates:
        # pcr_duplicates may be "mark" or any trueish value
        lane_bams = self._remove_pcr_duplicates(config, prefix,
                                                lane_bams,
                                                pcr_duplicates)

    # At this point we no longer need to differentiate between types of reads
    files_and_nodes = self._collect_files_and_nodes(lane_bams)

    # Collect output bams, possible following rescaling
    self.bams, mapdamage_nodes \
        = self._build_mapdamage_nodes(config, target, prefix,
                                      files_and_nodes)

    nodes = [self._build_dataduplication_node(lane_bams)]
    nodes.extend(mapdamage_nodes)

    histogram_node = self._build_duphist_nodes(config, target, prefix,
                                               lane_bams)
    if histogram_node:
        nodes.append(histogram_node)

    self.nodes = tuple(nodes)
def __init__(self, config, input_bams, pipename="input.bam", indexed=True):
    """Describes how a set of input BAMs is supplied to a command through
    a named pipe: multiple BAMs are merged on the fly using Picard
    MergeSamFiles, while a single BAM is used directly.

    config     -- pipeline configuration used to build the Picard command.
    input_bams -- one or more BAM paths (coerced to a tuple).
    pipename   -- temporary filename used for the BAM pipe.
    indexed    -- if True and there is one input, also depend on its .bai.
    """
    self.pipe = pipename
    self.indexed = indexed
    self.files = safe_coerce_to_tuple(input_bams)
    self.commands = []
    self.kwargs = {"TEMP_IN_BAM": self.pipe}
    if len(self.files) > 1:
        # Merge the inputs into the pipe; compression is disabled since
        # the output is consumed immediately by the downstream command
        params = picard_command(config, "MergeSamFiles")
        params.set_option("SO", "coordinate", sep="=", fixed=False)
        params.set_option("CREATE_INDEX", "False", sep="=")
        params.set_option("COMPRESSION_LEVEL", 0, sep="=")
        params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        params.add_multiple_options("I", input_bams, sep="=")

        params.set_kwargs(TEMP_OUT_BAM=self.pipe)

        self.commands = [params.finalize()]
    else:
        # Ensure that the actual command depends on the input
        self.kwargs["IN_FILE_00"] = self.files[0]

        if indexed:
            self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
def add_support(self, bootstraps, fmt = "{Support}"):
    """Adds support values to the current tree, based on a set of trees
    containing the same taxa.

    It is assumed that the support trees represent unrooted or arbitrarily
    rooted trees, and no weight is given to the rooted topology of these
    trees. The main tree should itself be rooted, and the topology and
    ordering of this tree is preserved, with node-names updated using the
    formatting string 'fmt'. Formatting is carried out using str.format,
    with these fields:

      {Support}    -- The total number of trees in which a clade is
                      supported.
      {Percentage} -- The percentage of trees in which a clade is
                      supported (float).
      {Fraction}   -- The fraction of trees in which a clade is supported
                      (float).

    For example, typical percentage support-values can be realized by
    setting 'fmt' to the value "{Percentage:.0f}" to produce integer
    values.

    Raises NewickError if the main tree contains duplicate leaf names, or
    if a support tree does not contain the same set of leaves.
    """
    clade_counts = {}
    leaf_names_lst = list(self.get_leaf_names())
    leaf_names = frozenset(leaf_names_lst)
    # Duplicate leaves would make clade identities ambiguous
    if len(leaf_names) != len(leaf_names_lst):
        raise NewickError("Cannot add support values to trees with duplicate leaf names")

    bootstraps = safe_coerce_to_tuple(bootstraps)
    for support_tree in bootstraps:
        support_tree_names = frozenset(support_tree.get_leaf_names())
        if leaf_names != support_tree_names:
            raise NewickError("Support tree does not contain same set of leaf nodes")

        # Count every clade (leaf partition) present in this support tree
        support_graph = _NewickGraph(support_tree)
        for clade in support_graph.get_clade_names():
            clade_counts[clade] = clade_counts.get(clade, 0) + 1

    return self._add_support(self, len(bootstraps), clade_counts, fmt)
def __init__(self, config, reference, intervals, infiles, outfile,
             dependencies=()):
    """Node running GATK IndelRealigner over 'infiles', realigning reads
    around the given 'intervals', followed by 'samtools calmd' which
    recomputes per-read tags on the realigned output.
    """
    self._basename = os.path.basename(outfile)

    infiles = safe_coerce_to_tuple(infiles)
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    command = AtomicJavaCmdBuilder(jar_file,
                                   jre_options=config.jre_options)
    command.set_option("-T", "IndelRealigner")
    command.set_option("-R", "%(IN_REFERENCE)s")
    command.set_option("-targetIntervals", "%(IN_INTERVALS)s")
    command.set_option("-o", "%(OUT_BAMFILE)s")
    # Compression and indexing are skipped; calmd rewrites the BAM next
    command.set_option("--bam_compression", 0)
    command.set_option("--disable_bam_indexing")
    _set_input_files(command, infiles)
    command.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                       IN_INTERVALS=intervals,
                       OUT_BAMFILE=outfile,
                       CHECK_GATK=_get_gatk_version_check(config))

    calmd = AtomicCmd(["samtools", "calmd", "-b",
                       "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
                      TEMP_IN_BAM=self._basename,
                      IN_REF=reference,
                      TEMP_OUT_STDOUT=self._basename + ".calmd",
                      CHECK_VERSION=SAMTOOLS_VERSION)

    description = "<GATK Indel Realigner (aligning): %s -> %r>" \
        % (describe_files(infiles), outfile)
    CommandNode.__init__(self,
                         description=description,
                         command=ParallelCmds([command.finalize(), calmd]),
                         dependencies=dependencies)
def __init__(self, config, input_bams, command, index_format=None,
             description=None, threads=1, dependencies=()):
    """Base node for commands that read one or more BAMs through a pipe.

    A single BAM is piped directly (optionally with an index); multiple
    BAMs are merged on the fly using Picard MergeSamFiles, in which case
    an index cannot be required.

    Raises ValueError if no inputs are given, if 'index_format' is set
    for more than one input, or for index formats other than None,
    ".bai", or ".csi".
    """
    self._input_bams = safe_coerce_to_tuple(input_bams)
    self._index_format = index_format

    if not self._input_bams:
        raise ValueError("No input BAM files specified!")
    elif len(self._input_bams) > 1 and index_format:
        raise ValueError("BAM index cannot be required for > 1 file")
    elif index_format not in (None, ".bai", ".csi"):
        raise ValueError("Unknown index format %r" % (index_format,))

    if len(self._input_bams) > 1:
        merge = picard_command(config, "MergeSamFiles")
        merge.set_option("SO", "coordinate", sep="=")
        # Output goes straight into the wrapped command; skip compression
        merge.set_option("COMPRESSION_LEVEL", 0, sep="=")
        merge.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        # Validation is mostly left to manual ValidateSamFile runs; this
        # is because .csi indexed BAM records can have "invalid" bins.
        merge.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        merge.add_multiple_options("I", input_bams, sep="=")

        merge.set_kwargs(TEMP_OUT_BAM=self.PIPE_FILE)

        command = ParallelCmds([merge.finalize(), command])

    CommandNode.__init__(self,
                         command=command,
                         description=description,
                         threads=threads,
                         dependencies=dependencies)
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """
    infiles = {names : {"partitions" : ..., "filenames" : [...]}}

    Node writing 'out_prefix.phy' and 'out_prefix.partitions' from the
    given input files; raises TypeError/ValueError for malformed input.
    """
    if not (isinstance(infiles, dict)
            and all(isinstance(dd, dict) for dd in infiles.values())):
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    for (name, subdd) in infiles.iteritems():
        # Only keys listed in _VALID_KEYS are recognized per input entry
        if set(subdd) - _VALID_KEYS:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(set(subdd) - _VALID_KEYS)))
        elif not isinstance(subdd["filenames"], list):
            raise ValueError("filenames must be a list of strings")
        input_filenames.extend(subdd["filenames"])
    # Optional file dependencies; used to depend on the list of sequences
    input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
        (" (reducing)" if reduce else "", len(infiles), out_prefix)

    Node.__init__(self,
                  description=description,
                  input_files=input_filenames,
                  output_files=[out_prefix + ".phy",
                                out_prefix + ".partitions"],
                  dependencies=dependencies)
def customize(self, config, reference, input_files, output_file,
              directory, dependencies=()):
    """Builds the customizable mapDamage rescaling command, which rewrites
    base qualities in the input BAMs using a previously computed model
    (Stats_out_MCMC_correct_prob.csv) and writes the rescaled BAM to
    'output_file'. Returns a dict of parameters for the node.
    """
    input_files = safe_coerce_to_tuple(input_files)

    stats_out_fname = "Stats_out_MCMC_correct_prob.csv"
    command = AtomicCmdBuilder([
        "mapDamage", "--rescale-only",
        "-i", "%(TEMP_IN_BAM)s",
        "-d", "%(TEMP_DIR)s",
        "-r", "%(IN_REFERENCE)s",
        "--rescale-out", "%(OUT_BAM)s"
    ],
        TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
        IN_REFERENCE=reference,
        TEMP_OUT_LOG="Runtime_log.txt",
        TEMP_OUT_CSV=stats_out_fname,
        OUT_BAM=output_file,
        CHECK_VERSION=MAPDAMAGE_VERSION)
    # Register each input BAM as an auto-numbered input dependency
    command.add_multiple_kwargs(input_files)

    return {
        "command": command,
        "config": config,
        "input_files": input_files,
        "directory": directory,
        "dependencies": dependencies
    }
def customize(self, config, reference, input_files, output_file,
              directory, dependencies=()):
    """Builds the customizable mapDamage rescaling command, which rewrites
    base qualities in the input BAMs using a previously computed model
    (Stats_out_MCMC_correct_prob.csv) and writes the rescaled BAM to
    'output_file'. Returns a dict of parameters for the node.
    """
    input_files = safe_coerce_to_tuple(input_files)

    stats_out_fname = "Stats_out_MCMC_correct_prob.csv"
    command = AtomicCmdBuilder(["mapDamage", "--rescale-only",
                                "-i", "%(TEMP_IN_BAM)s",
                                "-d", "%(TEMP_DIR)s",
                                "-r", "%(IN_REFERENCE)s",
                                "--rescale-out", "%(OUT_BAM)s"],
                               TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
                               IN_REFERENCE=reference,
                               TEMP_OUT_LOG="Runtime_log.txt",
                               TEMP_OUT_CSV=stats_out_fname,
                               OUT_BAM=output_file,
                               CHECK_VERSION=MAPDAMAGE_VERSION)
    # Register each input BAM as an auto-numbered input dependency
    command.add_multiple_kwargs(input_files)

    return {"command": command,
            "config": config,
            "input_files": input_files,
            "directory": directory,
            "dependencies": dependencies}
def __init__(self, config, target, prefix, lanes, name):
    """Library-level node: merges per-lane BAMs, optionally removes or
    marks PCR duplicates, and runs mapDamage-related steps.
    """
    self.name = name
    self.lanes = safe_coerce_to_tuple(lanes)
    # BUGFIX: read options via self.lanes (the coerced tuple); the
    # original indexed the raw 'lanes' argument, which fails if a single
    # Lane object (rather than a sequence) is passed in.
    self.options = self.lanes[0].options
    self.folder = os.path.dirname(os.path.dirname(self.lanes[0].folder))

    # All lanes must share the same library folder and the same options
    assert all(
        (self.folder == os.path.dirname(os.path.dirname(lane.folder)))
        for lane in self.lanes
    )
    assert all((self.options == lane.options) for lane in self.lanes)

    lane_bams = self._collect_bams_by_type(self.lanes)

    pcr_duplicates = self.options["PCRDuplicates"]
    if pcr_duplicates:
        # pcr_duplicates may be "mark" or any trueish value
        lane_bams = self._remove_pcr_duplicates(
            config, prefix, lane_bams, pcr_duplicates
        )

    # At this point we no longer need to differentiate between read types
    files_and_nodes = self._collect_files_and_nodes(lane_bams)

    # Collect output bams, possible following rescaling
    self.bams, mapdamage_nodes = self._build_mapdamage_nodes(
        config, target, prefix, files_and_nodes
    )

    nodes = [self._build_dataduplication_node(lane_bams)]
    nodes.extend(mapdamage_nodes)

    self.nodes = tuple(nodes)
def __init__(self, config, input_bams, output_bam, keep_dupes=True,
             dependencies=()):
    """Node running the 'rmdup_collapsed' tool on the input BAMs; when
    'keep_dupes' is False, the --remove-duplicates flag is passed so that
    duplicates are dropped rather than retained.
    """
    input_bams = safe_coerce_to_tuple(input_bams)
    builder = factory.new("rmdup_collapsed")
    builder.add_value("%(TEMP_IN_BAM)s")
    builder.set_kwargs(OUT_STDOUT=output_bam,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    builder.add_multiple_kwargs(input_bams)

    if not keep_dupes:
        builder.set_option("--remove-duplicates")

    description = "<FilterCollapsedBAM: %s>" \
        % (describe_files(input_bams),)
    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_bams,
                               command=builder.finalize(),
                               description=description,
                               dependencies=dependencies)
def __init__(self, config, prefix, samples, features, target):
    """Prefix-level node aggregating per-sample BAMs for one reference
    prefix; depending on 'features', raw and/or realigned merged BAMs are
    produced, otherwise the per-sample BAMs are exposed directly.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.reference = prefix["Reference"]
    self.roi = prefix.get("RegionsOfInterest", {})

    self.samples = safe_coerce_to_tuple(samples)
    self.folder = config.destination
    self.target = target

    files_and_nodes = {}
    for sample in self.samples:
        files_and_nodes.update(sample.bams.iteritems())

    self.datadup_check = self._build_dataduplication_node(prefix,
                                                          files_and_nodes)

    self.bams = {}
    if features["RawBAM"]:
        self.bams.update(self._build_raw_bam(config, prefix,
                                             files_and_nodes))
    if features["RealignedBAM"]:
        self.bams.update(self._build_realigned_bam(config, prefix,
                                                   files_and_nodes))

    if not self.bams:
        # No merged BAM requested; expose the per-sample BAMs directly
        for sample in self.samples:
            self.bams.update(sample.bams)

    nodes = []
    for sample in self.samples:
        nodes.extend(sample.nodes)

    # NOTE(review): unlike sibling implementations, self.datadup_check is
    # not included in 'nodes' here -- presumably it is scheduled via some
    # other path; confirm this is intentional.
    self.nodes = tuple(nodes)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Node rerooting the trees in 'tree_files' and writing the result to
    'output_file'; roots on the clade containing 'taxa' when given, and
    on the midpoint otherwise.
    """
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    if self._reroot_on_taxa:
        reroot_on = repr("', '".join(sorted(self._reroot_on_taxa)))
    else:
        reroot_on = "midpoint"

    Node.__init__(self,
                  description="<NewickReroot (on %s): %s>"
                  % (reroot_on, describe_files(tree_files),),
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Node rerooting the trees in 'tree_files' and writing the result to
    'output_file'; roots on the clade containing 'taxa' when given, and
    on the midpoint otherwise.
    """
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    if self._reroot_on_taxa:
        reroot_on = repr("', '".join(sorted(self._reroot_on_taxa)))
    else:
        reroot_on = "midpoint"

    Node.__init__(self,
                  description="<NewickReroot (on %s): %s>"
                  % (reroot_on, describe_files(tree_files),),
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, commands):
    """Command-set executing its members in parallel; every member must
    be an AtomicCmd or a _CommandSet, otherwise CmdError is raised.
    """
    self._ready = False

    commands = safe_coerce_to_tuple(commands)
    for item in commands:
        if not isinstance(item, (AtomicCmd, _CommandSet)):
            raise CmdError("ParallelCmds must only contain AtomicCmds or other ParallelCmds!")

    _CommandSet.__init__(self, commands)
def _validate_filenames(filenames):
    """Sanity checks for filenames handled by 'describe_files' and
    'describe_paired_files'; coerces the argument to a tuple, raises
    ValueError on any non-string entry, and returns the tuple.
    """
    result = safe_coerce_to_tuple(filenames)
    for fname in result:
        if not isinstance(fname, types.StringTypes):
            raise ValueError("Only string types are allowed for filenames, not %s"
                             % (fname.__class__.__name__,))

    return result
def __init__(self, config, prefixes, name):
    """Aggregates the BAMs and nodes of a set of prefix objects."""
    self.name = name
    self.prefixes = safe_coerce_to_tuple(prefixes)

    bams = {}
    nodes = []
    for current in self.prefixes:
        nodes.extend(current.nodes)
        bams.update(current.bams.iteritems())

    self.bams = bams
    self.nodes = nodes
def __init__(self, main_tree_files, support_tree_files, output_file,
             dependencies=()):
    """Node adding support values to the trees in 'main_tree_files',
    derived from the trees in 'support_tree_files'; the annotated trees
    are written to 'output_file'.
    """
    self._output_file = output_file
    self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
    self._support_tree_files = safe_coerce_to_tuple(support_tree_files)

    Node.__init__(self,
                  description="<NewickSupport: %s>"
                  % (describe_files(main_tree_files),),
                  input_files=self._main_tree_files
                  + self._support_tree_files,
                  output_files=output_file,
                  dependencies=dependencies)
def __init__(self, commands):
    """Command-set executing its members in parallel; every member must
    be an AtomicCmd or a _CommandSet, otherwise CmdError is raised.
    """
    self._ready = False

    commands = safe_coerce_to_tuple(commands)
    for item in commands:
        if not isinstance(item, (AtomicCmd, _CommandSet)):
            raise CmdError(
                "ParallelCmds must only contain AtomicCmds or other ParallelCmds!"
            )

    _CommandSet.__init__(self, commands)
def __init__(self, call, search, checks, name=None, priority=0):
    """See function 'Requirement' for a description of parameters."""
    self._call = safe_coerce_to_tuple(call)
    self._done = None
    self._rege = re.compile(search)
    self._version = None

    # Default to the executable / function name when no name is given
    self.name = str(name or self._call[0])
    self.priority = int(priority)
    self.checks = checks
def __init__(self, call, **kwargs):
    """Builder collecting options/values for a command; see
    AtomicCmd.__init__ for the meaning of 'call' and keyword arguments.
    """
    self._call = safe_coerce_to_tuple(call)
    self._object = None
    self._options = []
    self._values = []
    self._kwargs = {}

    self.set_kwargs(**kwargs)
def __init__(self, config, prefix, libraries, name):
    """Aggregates the BAMs and nodes of the libraries in a sample."""
    self.name = name
    self.libraries = safe_coerce_to_tuple(libraries)
    self.folder = os.path.dirname(self.libraries[0].folder)

    self.bams = {}
    nodes = []
    for library in self.libraries:
        self.bams.update(library.bams.items())
        nodes.extend(library.nodes)

    self.nodes = tuple(nodes)
def __init__(self, call, threads = 1, **kwargs):
    """Builder for commands optionally run under MPI.

    call    -- the command to invoke (string or sequence of strings).
    threads -- number of MPI processes; with 1 the command is invoked
               directly (EXEC_MPI still registers "mpirun" -- presumably
               as an executable requirement; confirm against
               AtomicCmdBuilder), with > 1 it becomes
               "mpirun -n <threads> <call>".

    Raises TypeError for non-integer threads and ValueError for
    threads < 1.
    """
    if not isinstance(threads, (types.IntType, types.LongType)):
        raise TypeError("'threads' must be an integer value, not %r"
                        % threads.__class__.__name__)
    elif threads < 1:
        raise ValueError("'threads' must be 1 or greater, not %i" % threads)
    elif threads == 1:
        AtomicCmdBuilder.__init__(self, call, EXEC_MPI = "mpirun", **kwargs)
    else:
        call = safe_coerce_to_tuple(call)
        mpi_call = ["mpirun", "-n", threads]
        mpi_call.extend(call)

        AtomicCmdBuilder.__init__(self, mpi_call, EXEC_MAIN = call[0], **kwargs)
def __init__(self, config, prefix, libraries, name):
    """Aggregates the BAMs and nodes of the libraries in a sample."""
    self.name = name
    self.libraries = safe_coerce_to_tuple(libraries)
    self.folder = os.path.dirname(self.libraries[0].folder)

    self.bams = {}
    nodes = []
    for library in self.libraries:
        self.bams.update(library.bams.iteritems())
        nodes.extend(library.nodes)

    self.nodes = tuple(nodes)
def __init__(self, config, prefix, samples, features, target):
    """Prefix-level node merging per-sample BAMs for one reference
    prefix. Depending on 'features', a raw and/or a GATK-realigned
    merged BAM is produced; realignment is disabled (with a one-time
    warning per reference) for .csi-indexed references, since GATK does
    not support .csi index files.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.roi = prefix.get("RegionsOfInterest", {})

    self.samples = safe_coerce_to_tuple(samples)
    self.folder = config.destination
    self.target = target

    files_and_nodes = {}
    for sample in self.samples:
        files_and_nodes.update(sample.bams.iteritems())

    self.datadup_check = self._build_dataduplication_node(
        prefix, files_and_nodes)

    build_raw_bam = features["RawBAM"]
    build_realigned_bam = features["RealignedBAM"]
    if build_realigned_bam and prefix['IndexFormat'] == '.csi':
        # Warn only once per reference path
        if prefix['Path'] not in _CSI_WARNINGS:
            ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
                         "genome %r, but the file contains sequences too "
                         "large for GATK, which does not support .csi "
                         "index files. Raw BAMs will be built instead of "
                         "realigned BAMs, for this reference sequence."
                         % (prefix['Path']))
            # TODO: Add reference to FAQ when written.
            _CSI_WARNINGS.add(prefix['Path'])
        build_realigned_bam = False
        build_raw_bam = True

    self.bams = {}
    if build_raw_bam:
        self.bams.update(
            self._build_raw_bam(config, prefix, files_and_nodes))
    if build_realigned_bam:
        self.bams.update(
            self._build_realigned_bam(config, prefix, files_and_nodes))

    if not self.bams:
        # No merged BAM requested; expose the per-sample BAMs directly
        for sample in self.samples:
            self.bams.update(sample.bams)

    nodes = [self.datadup_check]
    for sample in self.samples:
        nodes.extend(sample.nodes)

    self.nodes = tuple(nodes)
def __init__(self, config, prefix, samples, features, target):
    """Prefix-level node merging per-sample BAMs for one reference
    prefix. Depending on 'features', a raw and/or a GATK-realigned
    merged BAM is produced; realignment is disabled (with a one-time
    warning per reference) for .csi-indexed references, since GATK does
    not support .csi index files.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.roi = prefix.get("RegionsOfInterest", {})

    self.samples = safe_coerce_to_tuple(samples)
    self.folder = config.destination
    self.target = target

    files_and_nodes = {}
    for sample in self.samples:
        files_and_nodes.update(sample.bams.iteritems())

    self.datadup_check = self._build_dataduplication_node(
        prefix, files_and_nodes)

    build_raw_bam = features["RawBAM"]
    build_realigned_bam = features["RealignedBAM"]
    if build_realigned_bam and prefix['IndexFormat'] == '.csi':
        # Warn only once per reference path
        if prefix['Path'] not in _CSI_WARNINGS:
            ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
                         "genome %r, but the file contains sequences too "
                         "large for GATK, which does not support .csi "
                         "index files. Raw BAMs will be built instead of "
                         "realigned BAMs, for this reference sequence."
                         % (prefix['Path']))
            # TODO: Add reference to FAQ when written.
            _CSI_WARNINGS.add(prefix['Path'])
        build_realigned_bam = False
        build_raw_bam = True

    self.bams = {}
    if build_raw_bam:
        self.bams.update(self._build_raw_bam(
            config, prefix, files_and_nodes))
    if build_realigned_bam:
        self.bams.update(self._build_realigned_bam(
            config, prefix, files_and_nodes))

    if not self.bams:
        # No merged BAM requested; expose the per-sample BAMs directly
        for sample in self.samples:
            self.bams.update(sample.bams)

    nodes = [self.datadup_check]
    for sample in self.samples:
        nodes.extend(sample.nodes)

    self.nodes = tuple(nodes)
def customize(self, config, reference, input_files, output_directory,
              title="mapDamage", dependencies=()):
    """Builds the customizable mapDamage plotting command, which computes
    damage statistics for the input BAMs and writes tables and plots into
    'output_directory'. Returns a dict of parameters for the node.
    """
    input_files = safe_coerce_to_tuple(input_files)

    command = AtomicCmdBuilder(
        [
            "mapDamage", "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t", title,
            "-i", "%(TEMP_IN_BAM)s",
            "-d", "%(TEMP_DIR)s",
            "-r", "%(IN_REFERENCE)s"
        ],
        TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
        OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
        OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
        OUT_PLOT_FRAG=os.path.join(output_directory,
                                   "Fragmisincorporation_plot.pdf"),
        OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
        OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
        OUT_MISINCORP=os.path.join(output_directory,
                                   "misincorporation.txt"),
        OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION)
    # Register each input BAM as an auto-numbered input dependency
    command.add_multiple_kwargs(input_files)

    return {
        "command": command,
        "config": config,
        "input_files": input_files,
        "dependencies": dependencies
    }
def customize(cls, input_file_1, input_file_2, output_file, reference,
              prefix, threads=2, log_file=None, dependencies=()):
    """Builds the customizable Bowtie2 alignment command.

    Either 'input_file_1' alone (single-end / unpaired reads) or both
    'input_file_1' and 'input_file_2' (paired-end reads) must be given;
    a NodeError is raised otherwise. Returns a dict with the sub-commands,
    their execution order, the thread count actually used, and the node
    dependencies.
    """
    # Setting IN_FILE_2 to None makes AtomicCmd ignore this key
    aln = _bowtie2_template(("bowtie2",), prefix,
                            OUT_STDOUT=AtomicCmd.PIPE,
                            CHECK_VERSION=BOWTIE2_VERSION)
    aln.set_option("-x", prefix)

    if log_file is not None:
        aln.set_kwargs(OUT_STDERR=log_file)

    if input_file_1 and not input_file_2:
        # Unpaired reads; each file is registered as IN_FILE_1_nn
        aln.add_multiple_options("-U", safe_coerce_to_tuple(input_file_1),
                                 template="IN_FILE_1_%02i")
    elif input_file_1 and input_file_2:
        # Paired reads; mate 1 and mate 2 files are passed separately
        aln.add_multiple_options("-1", safe_coerce_to_tuple(input_file_1),
                                 template="IN_FILE_1_%02i")
        aln.add_multiple_options("-2", safe_coerce_to_tuple(input_file_2),
                                 template="IN_FILE_2_%02i")
    else:
        raise NodeError("Input 1, OR both input 1 and input 2 must "
                        "be specified for Bowtie2 node")

    # The thread count may be capped depending on the reference
    max_threads = _get_max_threads(reference, threads)
    aln.set_option("--threads", max_threads)

    # Fixmate is only meaningful for paired-end data
    run_fixmate = input_file_1 and input_file_2
    order, commands = _process_output(aln, output_file, reference,
                                      run_fixmate=run_fixmate)

    commands["aln"] = aln
    return {"commands": commands,
            "order": ["aln"] + order,
            "threads": max_threads,
            "dependencies": dependencies}
def __init__(self, description, destination, source_nodes):
    """Node copying every output file of 'source_nodes' into the
    'destination' folder.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = []
    for node in source_nodes:
        input_files.extend(node.output_files)

    output_files = [reroot_path(destination, fpath)
                    for fpath in input_files]
    self._files = zip(input_files, output_files)

    Node.__init__(self,
                  description="<Copy %s output to %r>" % (description,
                                                          destination),
                  input_files=input_files,
                  output_files=output_files,
                  dependencies=source_nodes)
def __init__(self, call, threads=1, **kwargs):
    """Builder for commands optionally run under MPI.

    call    -- the command to invoke (string or sequence of strings).
    threads -- number of MPI processes; with 1 the command is invoked
               directly (EXEC_MPI still registers "mpirun" -- presumably
               as an executable requirement; confirm against
               AtomicCmdBuilder), with > 1 it becomes
               "mpirun -n <threads> <call>".

    Raises TypeError for non-integer threads and ValueError for
    threads < 1.
    """
    if not isinstance(threads, (types.IntType, types.LongType)):
        raise TypeError("'threads' must be an integer value, not %r"
                        % threads.__class__.__name__)
    elif threads < 1:
        raise ValueError("'threads' must be 1 or greater, not %i" % threads)
    elif threads == 1:
        AtomicCmdBuilder.__init__(self, call, EXEC_MPI="mpirun", **kwargs)
    else:
        call = safe_coerce_to_tuple(call)
        mpi_call = ["mpirun", "-n", threads]
        mpi_call.extend(call)

        AtomicCmdBuilder.__init__(
            self, mpi_call, EXEC_MAIN=call[0], **kwargs)
def __init__(self, config, input_files, output_file, dependencies=()):
    """Node running the 'duphist' tool over the input BAMs, writing the
    resulting duplicate histogram to 'output_file'.
    """
    input_files = safe_coerce_to_tuple(input_files)

    builder = factory.new("duphist")
    builder.add_value('%(TEMP_IN_BAM)s')
    builder.set_kwargs(OUT_STDOUT=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    builder.add_multiple_kwargs(input_files)

    MultiBAMInputNode.__init__(self,
                               config=config,
                               input_bams=input_files,
                               command=builder.finalize(),
                               description="<DuplicateHistogram: %s -> %r>"
                               % (describe_files(input_files),
                                  output_file),
                               dependencies=dependencies)
def Requirement(call, search, checks, name=None, priority=0):
    # Ignore function naming scheme
    # pylint: disable=C0103
    """Returns a singleton Requirement object, based on the parameters,
    which may be used to check that version requirements are met for a
    given program/utility/module, etc.

    Parameters:
      call     -- A string, or a tuple containing strings for a system
                  call, or a tuple containing a function at the first
                  position, and a set of positional parameters. In the
                  case of system calls, stdout and stderr are returned as
                  a single string, in the case of a function call, the
                  return value is expected to be a str.
      search   -- A regular expression (string or re object), used to
                  search the output of the "call". Groups are assumed to
                  represent version numbers.
      checks   -- A callable that implements the interface described in
                  the Check class.
      name     -- Descriptive name for the executable/module/etc. If not
                  specified, first value in 'call' will be used; if
                  multiple otherwise identical checks are made, the last
                  name that does not equal the first value of 'call' will
                  be used.
      priority -- Order in which requirements are checked; if multiple
                  otherwise identical checks are made with different
                  priority, the highest priority takes precedence.

    Implementation detail: To reduce the need for performing calls or
    system-calls multiple times, caches are implemented using the call
    object as keys. Thus the same calls should be passed in a manner
    which allow equality between the same calls to be established.
    """
    call = safe_coerce_to_tuple(call)
    # The full (call, search, checks) triple identifies a requirement
    key = (call, search, checks)

    try:
        requirement = _REQUIREMENT_CACHE[key]

        # Highest priority takes precedence
        requirement.priority = max(requirement.priority, priority)
        # Last explicitly specified name takes precedence
        requirement.name = name or requirement.name
    except KeyError:
        # First time this requirement is seen; create and cache it
        requirement = RequirementObj(*key, name=name, priority=priority)
        _REQUIREMENT_CACHE[key] = requirement

    return requirement
def __init__(
    self,
    infiles,
    out_prefix,
    exclude_groups=(),
    reduce=False,
    dependencies=(),
    file_dependencies=(),
):
    """
    infiles = {names : {"partitions" : ..., "filenames" : [...]}}

    Node writing 'out_prefix.phy' and 'out_prefix.partitions' from the
    given input files; raises TypeError/ValueError for malformed input.
    """
    if not (isinstance(infiles, dict)
            and all(isinstance(dd, dict) for dd in infiles.values())):
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    for (name, subdd) in infiles.items():
        # Only keys listed in _VALID_KEYS are recognized per input entry
        if set(subdd) - _VALID_KEYS:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(set(subdd) - _VALID_KEYS)))
        elif not isinstance(subdd["filenames"], list):
            raise ValueError("filenames must be a list of strings")
        input_filenames.extend(subdd["filenames"])
    # Optional file dependencies; used to depend on the list of sequences
    input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % (
        " (reducing)" if reduce else "",
        len(infiles),
        out_prefix,
    )

    Node.__init__(
        self,
        description=description,
        input_files=input_filenames,
        output_files=[out_prefix + ".phy", out_prefix + ".partitions"],
        dependencies=dependencies,
    )
def __init__(self, description, destination, source_nodes):
    """Node copying every output file of 'source_nodes' into the
    'destination' folder.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    input_files = []
    for node in source_nodes:
        input_files.extend(node.output_files)

    output_files = [reroot_path(destination, fpath)
                    for fpath in input_files]
    self._files = zip(input_files, output_files)

    Node.__init__(self,
                  description="<Copy %s output to %r>" % (description,
                                                          destination),
                  input_files=input_files,
                  output_files=output_files,
                  dependencies=source_nodes)
def __init__(self, config, prefix, samples, features, target):
    """Prefix-level node aggregating per-sample BAMs mapped against a
    single reference prefix and building the merged BAM(s).
    """
    self.samples = safe_coerce_to_tuple(samples)
    self.name = prefix["Name"]
    self.roi = prefix.get("RegionsOfInterest", {})
    self.folder = config.destination
    self.target = target

    files_and_nodes = {}
    for sample in self.samples:
        files_and_nodes.update(sample.bams.items())

    self.datadup_check = self._build_dataduplication_node(
        prefix, files_and_nodes)
    self.bams = self._build_bam(config, prefix, files_and_nodes)

    node_list = [self.datadup_check]
    for sample in self.samples:
        node_list.extend(sample.nodes)

    self.nodes = tuple(node_list)
def customize(self, config, reference, input_files, output_directory,
              title="mapDamage", dependencies=()):
    """Builds the customizable mapDamage plotting command, which computes
    damage statistics for the input BAMs and writes tables and plots into
    'output_directory'. Returns a dict of parameters for the node.
    """
    input_files = safe_coerce_to_tuple(input_files)

    command = AtomicCmdBuilder(
        ["mapDamage", "--no-stats",
         # Prevent references with many contigs from using excessive
         # amounts of memory, at the cost of per-contig statistics:
         "--merge-reference-sequences",
         "-t", title,
         "-i", "%(TEMP_IN_BAM)s",
         "-d", "%(TEMP_DIR)s",
         "-r", "%(IN_REFERENCE)s"],
        TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE,
        IN_REFERENCE=reference,
        OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
        OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
        OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
        OUT_PLOT_FRAG=os.path.join(output_directory,
                                   "Fragmisincorporation_plot.pdf"),
        OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
        OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
        OUT_MISINCORP=os.path.join(output_directory,
                                   "misincorporation.txt"),
        OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
        TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
        TEMP_OUT_STDERR="pipe_mapDamage.stderr",
        CHECK_RSCRIPT=RSCRIPT_VERSION,
        CHECK_MAPDAMAGE=MAPDAMAGE_VERSION)
    # Register each input BAM as an auto-numbered input dependency
    command.add_multiple_kwargs(input_files)

    return {"command": command,
            "config": config,
            "input_files": input_files,
            "dependencies": dependencies}
def add_support(self, bootstraps, fmt="{Support}"):
    """Adds support values to the current tree, based on a set of trees
    containing the same taxa.

    It is assumed that the support trees represent unrooted or arbitrarily
    rooted trees, and no weight is given to the rooted topology of these
    trees. The main tree should itself be rooted, and the topology and
    ordering of this tree is preserved, with node-names updated using the
    formatting string 'fmt'. Formatting is carried out using str.format,
    with these fields:

      {Support}    -- The total number of trees in which a clade is
                      supported.
      {Percentage} -- The percentage of trees in which a clade is
                      supported (float).
      {Fraction}   -- The fraction of trees in which a clade is supported
                      (float).

    For example, typical percentage support-values can be realized by
    setting 'fmt' to the value "{Percentage:.0f}" to produce integer
    values.

    Raises NewickError if the main tree contains duplicate leaf names, or
    if a support tree does not contain the same set of leaves.
    """
    clade_counts = {}
    leaf_names_lst = list(self.get_leaf_names())
    leaf_names = frozenset(leaf_names_lst)
    # Duplicate leaves would make clade identities ambiguous
    if len(leaf_names) != len(leaf_names_lst):
        raise NewickError(
            "Cannot add support values to trees with duplicate leaf names"
        )

    bootstraps = safe_coerce_to_tuple(bootstraps)
    for support_tree in bootstraps:
        support_tree_names = frozenset(support_tree.get_leaf_names())
        if leaf_names != support_tree_names:
            raise NewickError(
                "Support tree does not contain same set of leaf nodes"
            )

        # Count every clade (leaf partition) present in this support tree
        support_graph = _NewickGraph(support_tree)
        for clade in support_graph.get_clade_names():
            clade_counts[clade] = clade_counts.get(clade, 0) + 1

    return self._add_support(self, len(bootstraps), clade_counts, fmt)
def __init__(self, config, input_bams, command, index_format=None,
             description=None, threads=1, dependencies=()):
    """Node wrapping 'command', reading one or more BAMs via a pipe.

    With multiple input BAMs, a Picard MergeSamFiles process is run in
    parallel with 'command', streaming a coordinate-sorted merge into the
    pipe. An index can only be required for a single input file, and
    'index_format' must be None, ".bai" or ".csi".
    """
    bam_files = safe_coerce_to_tuple(input_bams)
    self._input_bams = bam_files
    self._index_format = index_format

    # Guard clauses; each raises for an unusable configuration.
    if not bam_files:
        raise ValueError("No input BAM files specified!")
    if len(bam_files) > 1 and index_format:
        raise ValueError("BAM index cannot be required for > 1 file")
    if index_format not in (None, ".bai", ".csi"):
        raise ValueError("Unknown index format %r" % (index_format, ))

    if len(bam_files) > 1:
        merge_cmd = picard_command(config, "MergeSamFiles")
        for key, value in (("SO", "coordinate"),
                           ("COMPRESSION_LEVEL", 0),
                           ("OUTPUT", "%(TEMP_OUT_BAM)s"),
                           # Validation is mostly left to manual
                           # ValidateSamFile runs; this is because .csi
                           # indexed BAM records can have "invalid" bins.
                           ("VALIDATION_STRINGENCY", "LENIENT")):
            merge_cmd.set_option(key, value, sep="=")
        merge_cmd.add_multiple_options("I", input_bams, sep="=")
        merge_cmd.set_kwargs(TEMP_OUT_BAM=self.PIPE_FILE)

        # The merge streams into the pipe while 'command' consumes it.
        command = ParallelCmds([merge_cmd.finalize(), command])

    CommandNode.__init__(self,
                         command=command,
                         description=description,
                         threads=threads,
                         dependencies=dependencies)
def __init__(self, config, target_name, input_files, output_file, prefix,
             regions_file=None, dependencies=()):
    """Node computing a depth histogram from one or more BAM files.

    When 'regions_file' is given, the histogram is restricted to those
    regions, which requires a BAM index in the format declared by
    prefix['IndexFormat'].
    """
    bam_files = safe_coerce_to_tuple(input_files)
    # An index is only required when restricting to a regions file.
    index_format = regions_file and prefix['IndexFormat']

    depths = factory.new("depths")
    depths.add_value("%(TEMP_IN_BAM)s")
    depths.add_value("%(OUT_FILE)s")
    depths.set_option("--target-name", target_name)
    depths.set_kwargs(OUT_FILE=output_file,
                      TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
    depths.add_multiple_kwargs(bam_files)

    if regions_file:
        depths.set_option('--regions-file', '%(IN_REGIONS)s')
        depths.set_kwargs(IN_REGIONS=regions_file,
                          TEMP_IN_INDEX=swap_ext(MultiBAMInputNode.PIPE_FILE,
                                                 index_format))

    MultiBAMInputNode.__init__(
        self,
        config=config,
        input_bams=bam_files,
        index_format=index_format,
        command=depths.finalize(),
        description="<DepthHistogram: %s -> '%s'>"
        % (describe_files(bam_files), output_file),
        dependencies=dependencies)
def test_safe_coerce_to_tuple__list():
    # A list is converted element-for-element into a tuple.
    result = utils.safe_coerce_to_tuple([1, 3, 2])
    assert_equal(result, (1, 3, 2))
def __init__(self, command, set_cwd=False, **kwargs):
    """Takes a command and a set of files.

    The command is expected to be an iterable starting with the name of an
    executable, with each item representing one string on the command line.
    Thus, the command "find /etc -name 'profile*'" might be represented as
    the list ["find", "/etc", "-name", "profile*"].

    Commands typically consist of an executable, one or more input files,
    one or more output files, and one or more pipes. In atomic command,
    such files are not specified directly, but instead are specified using
    keywords, which allows easy tracking of requirements and other
    features. Note that only files, and not directories, are supported as
    input/output!

    Each keyword represents a type of file, as determined by the prefix:
       IN_    -- Path to input file transformed/analysed by the executable.
       OUT_   -- Path to output file generated by the executable. During
                 execution of the AtomicCmd, these paths are modified to
                 point to the temporary directory.
       EXEC_  -- Name of / path to executable. The first item in the
                 command is always one of the executables, even if not
                 specified in this manner.
       AUX_   -- Auxiliary files required by the executable(s), which are
                 themselves not executable. Examples include scripts,
                 config files, data-bases, and the like.
       CHECK_ -- A callable, which upon calling does version checking,
                 raising an exception in the case of requirements not
                 being met. This may be used to ensure that prerequisites
                 are met before running the command. The function is not
                 called by AtomicCmd itself.

    EXAMPLE 1: Creating a gzipped tar-archive from two files
    The command "tar cjf output-file input-file-1 input-file-2" could be
    represented using the following AtomicCmd:
    cmd = AtomicCmd(["tar", "cjf", "%(OUT_FILE)s",
                     "%(IN_FILE_1)s", "%(IN_FILE_2)s"],
                    OUT_FILE="output-file",
                    IN_FILE_1="input-file-1",
                    IN_FILE_2="input-file-2")

    Note that files that are not directly invoked may be included above,
    in order to allow the specification of requirements. This could
    include required data files, or executables indirectly executed by a
    script.

    If the above is prefixed with "TEMP_", files are read from / written
    to the temporary folder in which the command is executed. Note that
    all TEMP_OUT_ files are deleted when commit is called (if they exist),
    and only filenames (not dirname component) are allowed for TEMP_
    values.

    In addition, the following special names may be used with the above:
       STDIN_  -- Takes a filename, or an AtomicCmd, in which case stdout
                  of that command is piped to the stdin of this instance.
       STDOUT_ -- Takes a filename, or the special value PIPE to allow
                  another AtomicCmd instance to use the output directly.
       STDERR_ -- Takes a filename.

    Each pipe can only be used once, with or without the TEMP_ prefix.

    EXAMPLE 2: zcat'ing an archive
    The command "zcat input-file > output-file" could be represented
    using the following AtomicCmd:
    cmd = AtomicCmd(["zcat", "%(IN_FILE)s"], OUT_STDOUT="output-file")

    If 'set_cwd' is True, the current working directory is set to the
    temporary directory before the command is executed. Input paths are
    automatically turned into absolute paths in this case."""
    self._proc = None
    self._temp = None
    self._running = False
    # list(...) is required for Python 3 compatibility, where map()
    # returns a lazy iterator; the emptiness check and [0] indexing below
    # need a concrete sequence. Under Python 2 this is a no-op copy.
    self._command = list(map(str, safe_coerce_to_tuple(command)))
    self._set_cwd = set_cwd
    if not self._command or not self._command[0]:
        raise ValueError("Empty command in AtomicCmd constructor")

    arguments = self._process_arguments(id(self), self._command, kwargs)
    self._files = self._build_files_dict(arguments)
    self._file_sets = self._build_files_map(self._command, arguments)

    # Dry-run, to catch errors early
    self._generate_call("/tmp")
def test_safe_coerce_to_tuple__dict():
    # A dict is treated as a single value, not iterated over.
    result = utils.safe_coerce_to_tuple({1: 2, 3: 4})
    assert_equal(result, ({1: 2, 3: 4}, ))
def test_safe_coerce_to_tuple__iterable():
    # A lazy iterable (xrange) is fully consumed into a tuple.
    result = utils.safe_coerce_to_tuple(xrange(3))
    assert_equal(result, (0, 1, 2))
def test_safe_coerce_to_tuple__tuple():
    # A tuple is passed through unchanged.
    result = utils.safe_coerce_to_tuple((1, 3, 2))
    assert_equal(result, (1, 3, 2))
def __init__(self, commands):
    """Takes a non-empty command, or sequence of commands, and validates
    them; raises CmdError if no commands were given."""
    # Coerce first, so a single command and a sequence are handled alike.
    self._commands = safe_coerce_to_tuple(commands)
    if not self._commands:
        raise CmdError("Empty list passed to command set")

    self._validate_commands()