def concatenate_input_bams(config, input_bams, out=AtomicCmd.PIPE):
    """Merge several input BAMs into one stream, but only when needed.

    Returns a pair (commands, source). 'commands' is a list holding zero or
    one commands, and 'source' is the value to hand to a downstream command
    as its input. For a single input file no command is built and the
    filename itself is returned; otherwise a MergeSamFiles command is built
    and returned both inside the list and as 'source'.

    With the default 'out' (AtomicCmd.PIPE) the merged BAM is written to
    stdout; any other value is passed to MergeSamFiles as the OUTPUT path.
    """
    bam_files = safe_coerce_to_tuple(input_bams)
    if len(bam_files) == 1:
        # Nothing to merge; the file can be consumed directly
        return [], bam_files[0]

    picard_jar = os.path.join(config.jar_root, "MergeSamFiles.jar")
    builder = AtomicJavaCmdBuilder(config, picard_jar)
    builder.set_kwargs(CHECK_JAR=_picard_version(picard_jar))

    write_to_pipe = (out == AtomicCmd.PIPE)
    if write_to_pipe:
        builder.set_kwargs(OUT_STDOUT=out)
    builder.set_option("OUTPUT", "/dev/stdout" if write_to_pipe else out,
                       sep="=")

    # The merged BAM is written without an index and with compression level 0
    builder.set_option("CREATE_INDEX", "False", sep="=")
    builder.set_option("COMPRESSION_LEVEL", 0, sep="=")

    for (number, bam_file) in enumerate(bam_files, start=1):
        key = "IN_BAM_%02i" % number
        builder.add_option("I", "%%(%s)s" % key, sep="=")
        builder.set_kwargs(**{key: bam_file})

    builder.set_option("SO", "coordinate", sep="=", fixed=False)

    merge_cmd = builder.finalize()
    return [merge_cmd], merge_cmd
def concatenate_input_bams(config, input_bams, out=AtomicCmd.PIPE):
    """Transparently concatenate input BAMs.

    The return value is a tuple of (list of commands, input source). When
    exactly one file is given, the list is empty and the source is that
    filename, so no merging overhead is incurred. Otherwise a single
    MergeSamFiles command is built; it is returned as the only item of the
    list and also as the source.

    'out' selects the destination of the merged BAM: AtomicCmd.PIPE (the
    default) writes to stdout, anything else is used as the OUTPUT path.
    """
    filenames = safe_coerce_to_tuple(input_bams)
    if len(filenames) == 1:
        return [], filenames[0]

    jar_path = os.path.join(config.jar_root, "MergeSamFiles.jar")
    merger = AtomicJavaCmdBuilder(config, jar_path)
    merger.set_kwargs(CHECK_JAR=_picard_version(jar_path))

    if out == AtomicCmd.PIPE:
        merger.set_kwargs(OUT_STDOUT=out)
        destination = "/dev/stdout"
    else:
        destination = out
    merger.set_option("OUTPUT", destination, sep="=")

    merger.set_option("CREATE_INDEX", "False", sep="=")
    merger.set_option("COMPRESSION_LEVEL", 0, sep="=")

    # Inputs are numbered from 1: IN_BAM_01, IN_BAM_02, ...
    for (position, filename) in enumerate(filenames, start=1):
        kwarg_name = "IN_BAM_%02i" % position
        merger.add_option("I", "%%(%s)s" % kwarg_name, sep="=")
        merger.set_kwargs(**{kwarg_name: filename})

    merger.set_option("SO", "coordinate", sep="=", fixed=False)

    command = merger.finalize()
    return [command], command
def add_nodes(self, *nodes):
    """Append one or more Node objects to this object's list of nodes.

    Each positional argument may be a single Node or a collection of Nodes.
    Every value is validated before any of them is stored, so a call that
    fails leaves the list of nodes unchanged.

    Raises TypeError if any value is not a Node.
    """
    validated = []
    for subnodes in safe_coerce_to_tuple(nodes):
        for node in safe_coerce_to_tuple(subnodes):
            if not isinstance(node, Node):
                raise TypeError("Node object expected, recieved %s"
                                % repr(node))
            validated.append(node)

    # Only commit once every value has been checked
    for node in validated:
        self._nodes.append(node)
def __init__(self, main_tree_files, support_tree_files, output_file,
             dependencies=()):
    """Set up a node taking main trees and support trees as input.

    Both 'main_tree_files' and 'support_tree_files' may be a single
    filename or a collection of filenames; together they form the input
    files of the node, and 'output_file' is its output.
    """
    self._output_file = output_file

    main_trees = safe_coerce_to_tuple(main_tree_files)
    support_trees = safe_coerce_to_tuple(support_tree_files)
    self._main_tree_files = main_trees
    self._support_tree_files = support_trees

    # Only the main trees are named in the description
    summary = describe_files(main_tree_files)

    Node.__init__(self,
                  description="<NewickSupport: %s>" % (summary,),
                  input_files=main_trees + support_trees,
                  output_files=output_file,
                  dependencies=dependencies)
def customize(cls, config, input_bams, output_bam, output_metrics=None,
              dependencies=()):
    """Build a customizable MarkDuplicates command.

    Returns a dict with the (not yet finalized) command builder under
    "command" and the given dependencies under "dependencies". If
    'output_metrics' is not given, the metrics file is named after
    'output_bam' with the extension swapped for ".metrics".
    """
    picard_jar = os.path.join(config.jar_root, "MarkDuplicates.jar")
    builder = AtomicJavaCmdBuilder(config, picard_jar)

    # Create .bai index, since it is required by a lot of other programs
    builder.set_option("CREATE_INDEX", "True", sep="=")
    builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
    builder.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")

    # Inputs are numbered from 0: IN_BAM_00, IN_BAM_01, ...
    for (number, bam_file) in enumerate(safe_coerce_to_tuple(input_bams)):
        key = "IN_BAM_%02i" % number
        builder.add_option("I", "%%(%s)s" % key, sep="=")
        builder.set_kwargs(**{key: bam_file})

    # Remove duplicates from output by default to save disk-space
    builder.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

    metrics_file = output_metrics or swap_ext(output_bam, ".metrics")
    builder.set_kwargs(OUT_BAM=output_bam,
                       OUT_BAI=swap_ext(output_bam, ".bai"),
                       OUT_METRICS=metrics_file,
                       CHECK_JAR=_picard_version(picard_jar))

    return {"command": builder,
            "dependencies": dependencies}
def __init__(self, config, prefixes, name):
    """Store the name and prefixes, and create the alignment meta-node.

    'prefixes' may be a single prefix or a collection of them; the node of
    each prefix becomes a dependency of the "Alignments:" meta-node.
    'config' is accepted but not used here.
    """
    self.name = name
    self.prefixes = safe_coerce_to_tuple(prefixes)

    prefix_nodes = [prefix.node for prefix in self.prefixes]
    self._nodes_alignment = MetaNode(description="Alignments:",
                                     dependencies=prefix_nodes)
    self._nodes_extras = {}
def __init__(self, config, prefix, samples, features, target):
    """Collect the BAMs of all samples for a prefix and build its node.

    'prefix' is a mapping with at least the keys "Name" and "Reference";
    "Label" and "AreasOfInterest" are optional. 'features' selects which
    final BAMs are built ("Raw BAM" and/or "Realigned BAM"). If neither is
    built, the BAMs of the samples are used as-is.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.reference = prefix["Reference"]
    self.aoi = prefix.get("AreasOfInterest", {})
    self.samples = safe_coerce_to_tuple(samples)
    self.bams = {}
    self.folder = config.destination
    self.target = target

    # Pool the BAMs of every sample
    pooled_bams = {}
    for sample in self.samples:
        pooled_bams.update(sample.bams.iteritems())

    if "Raw BAM" in features:
        raw_bams = self._build_raw_bam(config, prefix, pooled_bams)
        self.bams.update(raw_bams)
    if "Realigned BAM" in features:
        realigned_bams = self._build_realigned_bam(config, prefix,
                                                   pooled_bams)
        self.bams.update(realigned_bams)

    dependencies = [sample.node for sample in self.samples]
    if self.bams:
        self.node = MetaNode(description="Final BAMs: %s" % prefix["Name"],
                             subnodes=self.bams.values(),
                             dependencies=dependencies)
    else:
        # No final BAMs were requested; expose the per-sample BAMs instead
        for sample in self.samples:
            self.bams.update(sample.bams)

        self.node = MetaNode(description="Prefix: %s" % prefix["Name"],
                             dependencies=dependencies)
def _append_aln_user_parameters(mkfile_params, lst):
    """Append user-specified command-line options to 'lst', in place.

    Only keys of 'mkfile_params' starting with "-" are considered. A key
    may map to a single value or to several values; the option is appended
    once per value, followed by the value itself unless it is None.
    """
    for (option, values) in mkfile_params.iteritems():
        if not option.startswith("-"):
            continue

        for value in safe_coerce_to_tuple(values):
            lst.append(option)
            if value is not None:
                lst.append(value)
def __init__(self, config, target, prefix, lanes, name):
    """Build the nodes for a library from one or more lanes.

    'lanes' may be a single lane or a collection of lanes. All lanes must
    have the same options and be located in the same parent folder; this
    is checked using assertions.

    The options and folder are read from the first coerced lane
    (self.lanes), rather than by indexing the raw 'lanes' argument, so that
    a single lane object or a one-shot iterable is handled as well.
    """
    self.name = name
    self.lanes = safe_coerce_to_tuple(lanes)
    self.options = self.lanes[0].options
    self.folder = os.path.dirname(self.lanes[0].folder)
    self.bams = None
    self.mapdamage = None

    assert all((self.folder == os.path.dirname(lane.folder))
               for lane in self.lanes)
    assert all((self.options == lane.options) for lane in self.lanes)

    lane_bams = self._collect_bams_by_type(self.lanes)

    self.datadup_check = self._build_dataduplication_node(lane_bams)
    self.duphist = \
        self._build_duphist_nodes(config, target, prefix, lane_bams)

    pcr_duplicates = self.options["PCRDuplicates"]
    if pcr_duplicates:
        lane_bams = self._remove_pcr_duplicates(config, prefix, lane_bams,
                                                pcr_duplicates)

    # At this point we no longer need to differentiate between types of reads
    files_and_nodes = self._collect_files_and_nodes(lane_bams)

    self.bams, self.mapdamage = \
        self._build_mapdamage_nodes(config, target, prefix, files_and_nodes)

    # list() keeps the concatenation valid should values() return a view
    self.node = MetaNode(
        description="Library: %s" % os.path.basename(self.folder),
        dependencies=list(self.bams.values()) + [self.datadup_check])
def __init__(self, config, reference, intervals, infiles, outfile,
             dependencies=()):
    """Set up GATK IndelRealigner together with 'samtools calmd'.

    The two commands are wrapped in a ParallelCmds. The realigner reads
    'reference', 'intervals' and the input BAMs and writes 'outfile'; the
    calmd command reads a temporary file named like the basename of
    'outfile' and writes its stdout to that name plus ".calmd".
    """
    self._basename = os.path.basename(outfile)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    realigner = AtomicJavaCmdBuilder(config, gatk_jar)
    for (option, value) in (("-T", "IndelRealigner"),
                            ("-R", "%(IN_REFERENCE)s"),
                            ("-targetIntervals", "%(IN_INTERVALS)s"),
                            ("-o", "%(OUT_BAMFILE)s"),
                            ("--bam_compression", 0)):
        realigner.set_option(option, value)
    realigner.set_option("--disable_bam_indexing")

    _set_input_files(realigner, infiles)
    realigner.set_kwargs(IN_REFERENCE=reference,
                         IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                         IN_INTERVALS=intervals,
                         OUT_BAMFILE=outfile)

    calmd_call = ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
    calmd = AtomicCmd(calmd_call,
                      TEMP_IN_BAM=self._basename,
                      IN_REF=reference,
                      TEMP_OUT_STDOUT=self._basename + ".calmd")

    description = "<Indel Realign: %i file(s) -> '%s'>" \
        % (len(infiles), outfile)
    commands = ParallelCmds([realigner.finalize(), calmd])

    CommandNode.__init__(self,
                         description=description,
                         command=commands,
                         dependencies=dependencies)
def __init__(self, config, target_name, input_files, output_file,
             intervals_file=None, print_stats=False,
             max_contigs=_MAX_CONTIGS, dependencies=()):
    """Set up a depth-histogram node for one or more BAM files.

    The node requires the BAM files, a ".bai" file for each of them, and
    the intervals file if one is given. "coverageBed" is required when an
    intervals file is used, "genomeCoverageBed" otherwise, in addition to
    the requirements of any command needed to merge the input BAMs.
    """
    self._target_name = target_name
    self._input_files = safe_coerce_to_tuple(input_files)
    self._output_file = output_file
    self._intervals = intervals_file
    self._print_stats = print_stats
    self._max_contigs = max_contigs
    self._max_contigs_reached = False

    required_files = list(self._input_files)
    required_files.extend(swap_ext(filename, ".bai")
                          for filename in self._input_files)

    if intervals_file:
        required_files.append(intervals_file)
        executables = ["coverageBed"]
    else:
        executables = ["genomeCoverageBed"]

    # Requirements of the merge command (if any) are also requirements here
    auxiliary_files = []
    merge_cmds, _ = concatenate_input_bams(config, self._input_files)
    for merge_cmd in merge_cmds:
        executables.extend(merge_cmd.executables)
        auxiliary_files.extend(merge_cmd.auxiliary_files)

    description = "<DepthHistogram: %s -> '%s'>" \
        % (describe_files(self._input_files), self._output_file)

    Node.__init__(self,
                  description=description,
                  input_files=required_files,
                  output_files=self._output_file,
                  dependencies=dependencies,
                  executables=executables,
                  auxiliary_files=auxiliary_files)
def __init__(self, config, input_bams, pipename="input.bam"):
    """Prepare the merging of input BAMs into the temporary file 'pipename'.

    'input_bams' may be a single filename or a collection of filenames.
    With more than one file, a MergeSamFiles command writing to 'pipename'
    is stored in self.commands. Otherwise no command is built, and the
    file and its ".bai" are recorded in self.kwargs instead.

    The coerced tuple (self.files) is what is passed to MergeSamFiles,
    rather than the raw argument, which may be a one-shot iterable that
    was already consumed by the coercion.
    """
    self.pipe = pipename
    self.files = safe_coerce_to_tuple(input_bams)
    self.commands = []
    self.kwargs = {"TEMP_IN_BAM": self.pipe}

    if len(self.files) > 1:
        jar_file = os.path.join(config.jar_root, "MergeSamFiles.jar")
        params = AtomicJavaCmdBuilder(jar=jar_file,
                                      temp_root=config.temp_root,
                                      jre_options=config.jre_options)

        params.set_option("SO", "coordinate", sep="=", fixed=False)
        params.set_option("CREATE_INDEX", "False", sep="=")
        params.set_option("COMPRESSION_LEVEL", 0, sep="=")
        params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        params.add_multiple_options("I", self.files, sep="=")

        params.set_kwargs(CHECK_JAR=_picard_version(config, jar_file),
                          TEMP_OUT_BAM=self.pipe)

        self.commands = [params.finalize()]
    else:
        # Ensure that the actual command depends on the input
        self.kwargs["IN_FILE_00"] = self.files[0]
        self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
def add_support(self, bootstraps, fmt="{Support}"):
    """Add support values to this tree from trees containing the same taxa.

    The support trees are assumed to be unrooted or arbitrarily rooted,
    and no weight is given to their rooted topology. The main tree should
    itself be rooted; its topology and ordering are preserved, and
    node-names are updated using the formatting string 'fmt'.

    Formatting is carried out using str.format, with these fields:
      {Support}    -- The total number of trees in which a clade is
                      supported.
      {Percentage} -- The percentage of trees in which a clade is
                      supported (float).
      {Fraction}   -- The fraction of trees in which a clade is supported
                      (float).

    For example, setting 'fmt' to "{Percentage:.0f}" produces typical
    integer percentage support-values.

    Raises NewickError if this tree contains duplicate leaf names, or if a
    support tree does not have exactly the same set of leaf names.
    """
    own_names = list(self.get_leaf_names())
    expected_names = frozenset(own_names)
    if len(own_names) != len(expected_names):
        raise NewickError("Cannot add support values to trees with duplicate leaf names")

    support_trees = safe_coerce_to_tuple(bootstraps)

    # Number of support trees in which each clade was observed
    clade_counts = {}
    for tree in support_trees:
        if frozenset(tree.get_leaf_names()) != expected_names:
            raise NewickError("Support tree does not contain same set of leaf nodes")

        for clade in _NewickGraph(tree).get_clade_names():
            clade_counts[clade] = clade_counts.get(clade, 0) + 1

    return self._add_support(self, len(support_trees), clade_counts, fmt)
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """
    infiles = {names : {"partitions" : ..., "filenames" : [...]}}

    Raises TypeError if 'infiles' is not a dictionary of dictionaries, and
    ValueError if an inner dictionary contains keys other than those in
    _VALID_KEYS, or if its "filenames" value is not a list. The node
    writes out_prefix + ".phy" and out_prefix + ".partitions".
    """
    is_dict_of_dicts = isinstance(infiles, dict) \
        and all(isinstance(value, dict) for value in infiles.values())
    if not is_dict_of_dicts:
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    for (name, subdd) in infiles.iteritems():
        unknown_keys = set(subdd) - _VALID_KEYS
        if unknown_keys:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(unknown_keys)))

        filenames = subdd["filenames"]
        if not isinstance(filenames, list):
            raise ValueError("filenames must be a list of strings")

        input_filenames.extend(filenames)

    # Optional file dependencies; used to depend on the list of sequences
    input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    reducing = " (reducing)" if reduce else ""
    description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
        (reducing, len(infiles), out_prefix)

    output_filenames = [out_prefix + ".phy", out_prefix + ".partitions"]

    Node.__init__(self,
                  description=description,
                  input_files=input_filenames,
                  output_files=output_filenames,
                  dependencies=dependencies)
def __init__(self, config, reference, intervals, infiles, outfile,
             dependencies=()):
    """Set up GATK IndelRealigner together with 'samtools calmd'.

    The two commands are wrapped in a ParallelCmds, and both carry a
    version check (CHECK_GATK and CHECK_VERSION respectively). The calmd
    command reads a temporary file named like the basename of 'outfile'
    and writes its stdout to that name plus ".calmd".
    """
    self._basename = os.path.basename(outfile)
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    realigner = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)
    for (option, value) in (("-T", "IndelRealigner"),
                            ("-R", "%(IN_REFERENCE)s"),
                            ("-targetIntervals", "%(IN_INTERVALS)s"),
                            ("-o", "%(OUT_BAMFILE)s"),
                            ("--bam_compression", 0)):
        realigner.set_option(option, value)
    realigner.set_option("--disable_bam_indexing")

    _set_input_files(realigner, infiles)
    realigner.set_kwargs(IN_REFERENCE=reference,
                         IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                         IN_INTERVALS=intervals,
                         OUT_BAMFILE=outfile,
                         CHECK_GATK=_get_gatk_version_check(config))

    calmd_call = ["samtools", "calmd", "-b", "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
    calmd = AtomicCmd(calmd_call,
                      TEMP_IN_BAM=self._basename,
                      IN_REF=reference,
                      TEMP_OUT_STDOUT=self._basename + ".calmd",
                      CHECK_VERSION=SAMTOOLS_VERSION)

    description = "<Indel Realigner (aligning): %s -> %r>" \
        % (describe_files(infiles), outfile)
    commands = ParallelCmds([realigner.finalize(), calmd])

    CommandNode.__init__(self,
                         description=description,
                         command=commands,
                         dependencies=dependencies)
def customize(cls, config, input_bams, output_bam, output_metrics=None,
              dependencies=()):
    """Return the parameters for a customizable MarkDuplicates node.

    The result is a dict containing the command builder ("command") and
    the dependencies passed by the caller ("dependencies"). The metrics
    file defaults to 'output_bam' with its extension replaced by
    ".metrics".
    """
    jar_path = os.path.join(config.jar_root, "MarkDuplicates.jar")
    command = AtomicJavaCmdBuilder(config, jar_path)

    # Create .bai index, since it is required by a lot of other programs
    command.set_option("CREATE_INDEX", "True", sep="=")
    command.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
    command.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")

    filenames = safe_coerce_to_tuple(input_bams)
    for (position, filename) in enumerate(filenames):
        kwarg_name = "IN_BAM_%02i" % position
        command.add_option("I", "%%(%s)s" % kwarg_name, sep="=")
        command.set_kwargs(**{kwarg_name: filename})

    # Remove duplicates from output by default to save disk-space
    command.set_option("REMOVE_DUPLICATES", "True", sep="=", fixed=False)

    command.set_kwargs(
        OUT_BAM=output_bam,
        OUT_BAI=swap_ext(output_bam, ".bai"),
        OUT_METRICS=output_metrics or swap_ext(output_bam, ".metrics"),
        CHECK_JAR=_picard_version(jar_path))

    return {"command": command, "dependencies": dependencies}
def __init__(self, config, prefix, samples, features, target):
    """Gather the BAMs of all samples and build the node for a prefix.

    'prefix' must provide the keys "Name" and "Reference", and may provide
    "Label" and "AreasOfInterest". "Raw BAM" and "Realigned BAM" in
    'features' enable the corresponding final BAMs; when no final BAM is
    built, the BAMs of the individual samples are exposed instead.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.reference = prefix["Reference"]
    self.aoi = prefix.get("AreasOfInterest", {})
    self.samples = safe_coerce_to_tuple(samples)
    self.bams = {}
    self.folder = config.destination
    self.target = target

    sample_bams = {}
    for sample in self.samples:
        sample_bams.update(sample.bams.iteritems())

    # Build each of the requested final BAMs from the pooled sample BAMs
    if "Raw BAM" in features:
        self.bams.update(self._build_raw_bam(config, prefix, sample_bams))
    if "Realigned BAM" in features:
        self.bams.update(self._build_realigned_bam(config, prefix,
                                                   sample_bams))

    sample_nodes = [sample.node for sample in self.samples]
    if self.bams:
        node_kwargs = {"description": "Final BAMs: %s" % prefix["Name"],
                       "subnodes": self.bams.values(),
                       "dependencies": sample_nodes}
    else:
        for sample in self.samples:
            self.bams.update(sample.bams)

        node_kwargs = {"description": "Prefix: %s" % prefix["Name"],
                       "dependencies": sample_nodes}

    self.node = MetaNode(**node_kwargs)
def __init__(self, config, target, prefix, lanes, name):
    """Build the nodes for a library from one or more lanes.

    'lanes' may be a single lane or a collection of lanes; all of them
    must share the same options and the same parent folder, which is
    checked using assertions.

    The options are taken from the first coerced lane (self.lanes), the
    same way as the folder, instead of indexing the raw 'lanes' argument;
    indexing the raw argument fails for a single lane object or for an
    iterable that does not support indexing.
    """
    self.name = name
    self.lanes = safe_coerce_to_tuple(lanes)

    first_lane = self.lanes[0]
    self.options = first_lane.options
    self.folder = os.path.dirname(first_lane.folder)
    self.bams = None
    self.mapdamage = None

    assert all((self.folder == os.path.dirname(lane.folder))
               for lane in self.lanes)
    assert all((self.options == lane.options) for lane in self.lanes)

    lane_bams = self._collect_bams_by_type(self.lanes)

    self.datadup_check = self._build_dataduplication_node(lane_bams)
    self.duphist = \
        self._build_duphist_nodes(config, target, prefix, lane_bams)

    pcr_duplicates = self.options["PCRDuplicates"]
    if pcr_duplicates:
        lane_bams = self._remove_pcr_duplicates(config, prefix, lane_bams,
                                                pcr_duplicates)

    # At this point we no longer need to differentiate between types of reads
    files_and_nodes = self._collect_files_and_nodes(lane_bams)

    self.bams, self.mapdamage = \
        self._build_mapdamage_nodes(config, target, prefix, files_and_nodes)

    # list() keeps the concatenation valid should values() return a view
    dependencies = list(self.bams.values()) + [self.datadup_check]
    self.node = MetaNode(
        description="Library: %s" % os.path.basename(self.folder),
        dependencies=dependencies)
def customize(cls, reference, infiles, outfile, options, dependencies=()):
    """Build a customizable GATK UnifiedGenotyper command.

    Returns {"commands": {"unifiedgenotyper": <command builder>}}. Every
    input BAM must exist when this function is called (checked using an
    assertion) and is passed to GATK by path with "-I".

    NOTE(review): the "-L" interval is hard-coded, and 'dependencies' is
    accepted but not used here -- confirm that both are intended.
    """
    infiles = safe_coerce_to_tuple(infiles)

    gatk_jar = os.path.join(options.jar_root, "GenomeAnalysisTK.jar")
    genotyper = AtomicJavaCmdBuilder(options, gatk_jar)
    genotyper.set_option("-R", "%(IN_REFERENCE)s")
    genotyper.set_option("-T", "UnifiedGenotyper")

    for bam in infiles:
        assert os.path.exists(bam), "Couldn't find input BAM: {}".format(bam)
        genotyper.add_option("-I", bam)

    for (option, value) in (("-o", "%(OUT_VCFFILES)s"),
                            ("-stand_call_conf", "30.0"),
                            ("-stand_emit_conf", "10.0"),
                            ("-dcov", "200"),
                            ("-L", "chrUn2:1-19213991")):
        genotyper.set_option(option, value)

    genotyper.set_kwargs(IN_REFERENCE=reference,
                         OUT_VCFFILES=outfile,
                         OUT_VCF_IDX=outfile + ".idx")

    return {"commands": {"unifiedgenotyper": genotyper}}
def __init__(self, config, reference, infiles, outfile, intervals=None,
             dependencies=()):
    """Combine the indel trainer and indel realigner into one meta-node.

    The trainer writes the intervals file that the realigner reads, and
    the realigner depends on the trainer. If 'intervals' is not given,
    the intervals file is named 'outfile' plus ".intervals".
    """
    intervals = intervals or (outfile + ".intervals")
    infiles = safe_coerce_to_tuple(infiles)

    trainer = _IndelTrainerNode(config=config,
                                reference=reference,
                                infiles=infiles,
                                outfile=intervals,
                                dependencies=dependencies)

    # The realigner must not run before the intervals have been written
    aligner = _IndelRealignerNode(config=config,
                                  reference=reference,
                                  intervals=intervals,
                                  infiles=infiles,
                                  outfile=outfile,
                                  dependencies=trainer)

    MetaNode.__init__(self,
                      description="<GATK Indel Realigner: %i files -> '%s'>"
                      % (len(infiles), outfile),
                      subnodes=[trainer, aligner],
                      dependencies=dependencies)
def __init__(self, command, set_cwd=False, **kwargs):
    """Takes a command and a set of files.

    The command is expected to be an iterable starting with the name of an
    executable, with each item representing one string on the command
    line. Thus, the command "find /etc -name 'profile*'" might be
    represented as the list ["find", "/etc", "-name", "profile*"].

    If 'set_cwd' is True, the current working directory is set to the
    temporary directory before the command is executed. Input paths are
    automatically turned into absolute paths in this case.

    Each keyword represents a type of file, as determined by the prefix:
      IN_    -- Path to input file transformed/analysed by the executable.
      OUT_   -- Path to output file generated by the executable. During
                execution of the AtomicCmd, these paths are modified to
                point to the temporary directory.
      EXEC_  -- Name of / path to executable. The first item in the
                command is always one of the executables, even if not
                specified in this manner.
      AUX_   -- Auxiliary files required by the executable(s), which are
                themselves not executable. Examples include scripts,
                config files, data-bases, and the like.
      CHECK_ -- A callable, which upon calling carries out version
                checking, raising an exception in the case of
                requirements not being met. This may be used to help
                ensure that prerequisites are met before running the
                command. The function is not called by AtomicCmd itself.

    Note that files that are not directly invoked may be included above,
    in order to allow the specification of requirements. This could
    include required data files, or executables indirectly executed by a
    script.

    If the above is prefixed with "TEMP_", the files are read from /
    written to the temporary folder in which the command is executed.
    Note that all TEMP_OUT_ files are deleted when commit is called (if
    they exist), and only filenames (not dirname component) are allowed
    for TEMP_ values.

    In addition, the following special names may be used with the above:
      STDIN_  -- Takes a filename, or an AtomicCmd, in which case the
                 stdout of that command is piped to the stdin of this
                 instance.
      STDOUT_ -- Takes a filename, or the special value PIPE to allow
                 another AtomicCmd instance to use the output directly.
      STDERR_ -- Takes a filename.

    Each pipe can only be used once, with or without the TEMP_ prefix.

    Raises ValueError if the command, or its first item, is empty.
    """
    self._proc = None
    self._temp = None
    # Every item of the command is stored as a string
    self._command = [str(field) for field in safe_coerce_to_tuple(command)]
    self._handles = {}
    self._set_cwd = set_cwd

    if not (self._command and self._command[0]):
        raise ValueError("Empty command in AtomicCmd constructor")

    self._files = self._process_arguments(id(self), self._command, kwargs)
    self._file_sets = self._build_files_map(self._command, self._files)

    # Dry-run, to catch errors early
    self._generate_call("")
def __init__(self, command, set_cwd=False, **kwargs):
    """Takes a command and a set of files.

    The command is expected to be an iterable starting with the name of an
    executable, with each item representing one string on the command
    line. Thus, the command "find /etc -name 'profile*'" might be
    represented as the list ["find", "/etc", "-name", "profile*"].

    If 'set_cwd' is True, the current working directory is set to the
    temporary directory before the command is executed. Input paths are
    automatically turned into absolute paths in this case.

    Each keyword represents a type of file, as determined by the prefix:
      IN_    -- Path to input file transformed/analysed by the executable.
      OUT_   -- Path to output file generated by the executable. During
                execution of the AtomicCmd, these paths are modified to
                point to the temporary directory.
      EXEC_  -- Name of / path to executable. The first item in the
                command is always one of the executables, even if not
                specified in this manner.
      AUX_   -- Auxiliary files required by the executable(s), which are
                themselves not executable. Examples include scripts,
                config files, data-bases, and the like.
      CHECK_ -- A callable, which upon calling carries out version
                checking, raising an exception in the case of
                requirements not being met. This may be used to help
                ensure that prerequisites are met before running the
                command. The function is not called by AtomicCmd itself.

    Note that files that are not directly invoked may be included above,
    in order to allow the specification of requirements. This could
    include required data files, or executables indirectly executed by a
    script.

    If the above is prefixed with "TEMP_", the files are read from /
    written to the temporary folder in which the command is executed.
    Note that all TEMP_OUT_ files are deleted when commit is called (if
    they exist), and only filenames (not dirname component) are allowed
    for TEMP_ values.

    In addition, the following special names may be used with the above:
      STDIN_  -- Takes a filename, or an AtomicCmd, in which case the
                 stdout of that command is piped to the stdin of this
                 instance.
      STDOUT_ -- Takes a filename, or the special value PIPE to allow
                 another AtomicCmd instance to use the output directly.
      STDERR_ -- Takes a filename.

    Each pipe can only be used once, with or without the TEMP_ prefix.

    Raises ValueError if the command, or its first item, is empty.
    """
    self._proc = None
    self._temp = None

    call = safe_coerce_to_tuple(command)
    self._command = [str(argument) for argument in call]
    self._handles = {}
    self._set_cwd = set_cwd

    # Reject both an empty command and an empty executable name
    command_is_empty = not self._command or not self._command[0]
    if command_is_empty:
        raise ValueError("Empty command in AtomicCmd constructor")

    self._files = self._process_arguments(id(self), self._command, kwargs)
    self._file_sets = self._build_files_map(self._command, self._files)

    # Dry-run, to catch errors early
    self._generate_call("")
def __init__(self, tree_files, output_file, taxa=(), dependencies=()):
    """Set up a node for rerooting the trees in 'tree_files'.

    'tree_files' and 'taxa' may each be a single value or a collection of
    values. The description names the (sorted) taxa if any are given, and
    "midpoint" otherwise.
    """
    self._output_file = output_file
    self._tree_files = safe_coerce_to_tuple(tree_files)
    self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

    if self._reroot_on_taxa:
        taxa_names = sorted(self._reroot_on_taxa)
        reroot_on = repr("', '".join(taxa_names))
    else:
        reroot_on = "midpoint"

    description = "<NewickReroot (on %s): %s>" % \
        (reroot_on, describe_files(tree_files),)

    Node.__init__(self,
                  description=description,
                  input_files=self._tree_files,
                  output_files=self._output_file,
                  dependencies=dependencies)
def __init__(self, commands):
    """Create a set of commands from one or more commands.

    Raises CmdError if any item is neither an AtomicCmd nor a _CommandSet.
    """
    self._ready = False

    commands = safe_coerce_to_tuple(commands)
    allowed_types = (AtomicCmd, _CommandSet)
    if not all(isinstance(command, allowed_types) for command in commands):
        raise CmdError("ParallelCmds must only contain AtomicCmds or other ParallelCmds!")

    _CommandSet.__init__(self, commands)
def _validate_filenames(filenames):
    """Sanity checks for filenames handled by 'describe_files' and
    'describe_paired_files'.

    Returns the filenames as a tuple; raises ValueError if any of them is
    not a string.
    """
    checked = safe_coerce_to_tuple(filenames)
    for candidate in checked:
        if isinstance(candidate, types.StringTypes):
            continue

        raise ValueError("Only string types are allowed for filenames, not %s" \
                         % (candidate.__class__.__name__,))

    return checked
def _AnyOf(path, value):
    """Check that every value is allowed, and that their number is in range.

    'value' may be a single value or a collection of values. The allowed
    values ('args'), the key function ('key_func') and the bounds
    ('min_items', 'max_items') are taken from the enclosing scope.

    Raises MakefileError on the first value that is not allowed, or, if
    all values are allowed, when the number of values is out of bounds.
    """
    candidates = safe_coerce_to_tuple(value)
    for candidate in candidates:
        if key_func(candidate) in args:
            continue

        allowed = ", ".join(map(repr, args))
        raise MakefileError("Value for '%s' must be among %s, not %s!" \
                            % (":".join(path), allowed, repr(candidate)))

    count = len(candidates)
    if count < min_items or count > max_items:
        raise MakefileError("Expected %s to %s values, found %i!" \
                            % (repr(min_items), repr(max_items), count))
def _build_unicat_command(input_files, output_file):
    """Build an AtomicCmd running 'unicat' on one or more input files.

    The output is written to the temporary file 'output_file'; the input
    files are registered as IN_CAT_00, IN_CAT_01, and so on.
    """
    unicat_call = ["unicat", "--output", "%(TEMP_OUT_CAT)s"]
    unicat_files = {"TEMP_OUT_CAT": output_file}

    filenames = utilities.safe_coerce_to_tuple(input_files)
    for (number, filename) in enumerate(filenames):
        kwarg_name = "IN_CAT_%02i" % number
        unicat_files[kwarg_name] = filename
        unicat_call.append("%%(%s)s" % kwarg_name)

    return AtomicCmd(unicat_call, **unicat_files)
def __init__(self, call, search, checks, name=None, priority=0):
    """See function 'Requirement' for a description of parameters.

    If no name is given, the first item of 'call' is used as the name.
    """
    self._call = safe_coerce_to_tuple(call)
    self._done = None

    display_name = name or self._call[0]
    self.name = str(display_name)
    self.priority = int(priority)
    self.checks = checks

    # The pattern is compiled once, up front
    self._rege = re.compile(search)
    self._version = None
def __init__(self, call, **kwargs):
    """See AtomicCmd.__init__ for parameters / keyword arguments.

    'call' may be a single value or a collection; it is stored as a tuple.
    """
    self._call = safe_coerce_to_tuple(call)

    # Nothing has been set yet; keyword arguments are added below
    self._options, self._values = [], []
    self._kwargs = {}
    self._object = None

    self.set_kwargs(**kwargs)
def __init__(self, call, **kwargs):
    """See AtomicCmd.__init__ for parameters / keyword arguments.

    The call is stored as a tuple, and the keyword arguments are passed on
    to 'set_kwargs' once the (initially empty) state has been set up.
    """
    self._call = safe_coerce_to_tuple(call)
    self._options = []
    self._values = []
    self._kwargs = {}
    self._object = None

    initial_kwargs = kwargs
    self.set_kwargs(**initial_kwargs)
def __init__(self, commands):
    """Create a set of commands from a single command or a collection.

    Raises CmdError unless every item is an AtomicCmd or a _CommandSet.
    """
    self._ready = False

    commands = safe_coerce_to_tuple(commands)
    if any(not isinstance(command, (AtomicCmd, _CommandSet))
           for command in commands):
        raise CmdError(
            "ParallelCmds must only contain AtomicCmds or other ParallelCmds!"
        )

    _CommandSet.__init__(self, commands)
def __init__(self, config, prefixes, name):
    """Store the name and prefixes, and set up the alignment/extra nodes.

    The node of every prefix becomes a dependency of the "Alignments:"
    meta-node. Extra nodes are then set up for "mapDamage" and for
    "Duplicate Histogram". 'config' is accepted but not used here.
    """
    self.name = name
    self.prefixes = safe_coerce_to_tuple(prefixes)
    self._nodes_extras = {}

    prefix_nodes = [prefix.node for prefix in self.prefixes]
    self._nodes_alignment = MetaNode(description="Alignments:",
                                     dependencies=prefix_nodes)

    for (description, attribute) in (("mapDamage", "mapdamage"),
                                     ("Duplicate Histogram", "duphist")):
        self._setup_extra_nodes(description, attribute)
def _build_unicat_command(input_files, output_file):
    """Return an AtomicCmd that runs 'unicat' on the given input files.

    'input_files' may be a single filename or a collection of filenames;
    each is registered under a numbered IN_CAT_ key, starting at 00. The
    output is written to the temporary file 'output_file'.
    """
    filenames = utilities.safe_coerce_to_tuple(input_files)
    keys = ["IN_CAT_%02i" % index for index in range(len(filenames))]

    call = ["unicat", "--output", "%(TEMP_OUT_CAT)s"]
    call.extend("%%(%s)s" % key for key in keys)

    paths = dict(zip(keys, filenames))
    paths["TEMP_OUT_CAT"] = output_file

    return AtomicCmd(call, **paths)
def __init__(self, config, prefix, libraries, name):
    """Collect the BAMs of one or more libraries and build the sample node.

    The folder of the sample is the parent folder of the first library,
    and the sample node depends on the node of every library. 'config'
    and 'prefix' are accepted but not used here.
    """
    self.name = name
    self.bams = {}
    self.libraries = safe_coerce_to_tuple(libraries)

    for library in self.libraries:
        self.bams.update(library.bams.iteritems())

    first_library = self.libraries[0]
    self.folder = os.path.dirname(first_library.folder)

    library_nodes = [library.node for library in self.libraries]
    self.node = MetaNode(
        description="Sample: %s" % os.path.basename(self.folder),
        dependencies=library_nodes)
def __init__(self, config, prefix, libraries, name):
    """Build the node for a sample made up of one or more libraries.

    The BAMs of all libraries are merged into self.bams. The sample folder
    is derived from the folder of the first library, and its basename is
    used in the description of the node.
    """
    self.name = name
    self.bams = {}
    self.libraries = safe_coerce_to_tuple(libraries)

    dependencies = []
    for library in self.libraries:
        self.bams.update(library.bams.iteritems())

    self.folder = os.path.dirname(self.libraries[0].folder)

    # Nodes are collected after the folder has been determined, as before
    for library in self.libraries:
        dependencies.append(library.node)

    sample_name = os.path.basename(self.folder)
    self.node = MetaNode(description="Sample: %s" % sample_name,
                         dependencies=dependencies)
def __init__(self, input_file, output_file, exclude_groups,
             dependencies=()):
    """Set up a node converting a single input file to one output file.

    'exclude_groups' may be a single value or a collection of values; it
    is stored as a tuple.
    """
    self._input_file = input_file
    self._output_file = output_file
    self._excluded = safe_coerce_to_tuple(exclude_groups)

    Node.__init__(self,
                  description="<FastaToPAMLPhy: '%s' -> '%s'>"
                  % (input_file, output_file),
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def describe_files(files):
    """Return a text description of a set of files.

    No files yields "No files" and a single file yields its repr. For
    several files the count is given, along with the folder if all of the
    files are located in the same folder.
    """
    files = safe_coerce_to_tuple(files)
    count = len(files)
    if count == 0:
        return "No files"
    if count == 1:
        return repr(files[0])

    folders = set(os.path.dirname(filename) for filename in files)
    if len(folders) != 1:
        return "%i files" % (count,)

    return "%i files in '%s'" % (count, folders.pop())
def __init__(self, call, threads=1, **kwargs):
    """Build a command that is run using 'mpirun' if threads > 1.

    With a single thread the call is used as-is, with "mpirun" registered
    as an executable (EXEC_MPI). With more threads the call is prefixed
    by "mpirun -n <threads>", and the first item of the original call is
    registered as EXEC_MAIN.

    Raises TypeError if 'threads' is not an integer, and ValueError if it
    is less than 1.
    """
    if not isinstance(threads, (types.IntType, types.LongType)):
        raise TypeError("'threads' must be an integer value, not %r"
                        % threads.__class__.__name__)
    if threads < 1:
        raise ValueError("'threads' must be 1 or greater, not %i" % threads)

    if threads == 1:
        AtomicCmdBuilder.__init__(self, call, EXEC_MPI="mpirun", **kwargs)
        return

    main_call = safe_coerce_to_tuple(call)
    mpi_call = ["mpirun", "-n", threads] + list(main_call)

    AtomicCmdBuilder.__init__(self, mpi_call, EXEC_MAIN=main_call[0],
                              **kwargs)
def __init__(self, description, destination, source_nodes):
    """Set up a node mapping the output of other nodes to 'destination'.

    Every output file of every source node is an input file of this node;
    the matching output path is obtained using 'reroot_path'. The source
    nodes are the dependencies of this node.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    sources = []
    destinations = []
    for node in source_nodes:
        for filename in node.output_files:
            sources.append(filename)

    for filename in sources:
        destinations.append(reroot_path(destination, filename))

    # Pairs of (source, destination) paths
    self._files = zip(sources, destinations)

    Node.__init__(self,
                  description="<Copy %s output to %r>"
                  % (description, destination),
                  input_files=sources,
                  output_files=destinations,
                  dependencies=source_nodes)
def __init__(self, input_file, output_file, exclude_groups,
             dependencies=()):
    """Set up a node with one input file and one output file.

    The groups to exclude are stored as a tuple, whether a single group or
    a collection of groups is given.
    """
    self._input_file, self._output_file = input_file, output_file
    self._excluded = safe_coerce_to_tuple(exclude_groups)

    node_description = "<FastaToPAMLPhy: '%s' -> '%s'>" % (input_file,
                                                           output_file)

    Node.__init__(self,
                  description=node_description,
                  input_files=[input_file],
                  output_files=[output_file],
                  dependencies=dependencies)
def Requirement(call, search, checks, name=None, priority=0):
    # Ignore function naming scheme
    # pylint: disable=C0103
    """Returns a singleton Requirement object, based on the parameters,
    which may be used to check that version requirements are met for a
    given program/utility/module, etc.

    Parameters:
      call -- A string, or a tuple containing strings for a system call,
              or a tuple containing a function at the first position, and
              a set of positional parameters. In the case of system calls,
              stdout and stderr are returned as a single string, in the
              case of a function call, the return value is expected to be
              a str.
      search -- A regular expression (string or re object), used to search
                the output of the "call". Groups are assumed to represent
                version numbers.
      checks -- A callable that implements the interface described in the
                Check class.
      name -- Descriptive name for the executable/module/etc. If not
              specified, the first value in 'call' will be used; if
              multiple otherwise identical checks are made, the last name
              that does not equal the first value of 'call' will be used.
      priority -- Order in which requirements are checked; if multiple
                  otherwise identical checks are made with different
                  priority, the highest priority takes precedence.

    Implementation detail: To reduce the need for performing calls or
    system-calls multiple times, caches are implemented using the call
    object as keys. Thus the same calls should be passed in a manner which
    allows equality between the same calls to be established.
    """
    call = safe_coerce_to_tuple(call)
    key = (call, search, checks)

    if key not in _REQUIREMENT_CACHE:
        requirement = RequirementObj(call, search, checks,
                                     name=name, priority=priority)
        _REQUIREMENT_CACHE[key] = requirement
        return requirement

    requirement = _REQUIREMENT_CACHE[key]
    # Highest priority takes precedence
    requirement.priority = max(requirement.priority, priority)
    # Last explicitly specified name takes precedence
    requirement.name = name or requirement.name

    return requirement
def __init__(self, config, reference, infiles, outfile, dependencies=()):
    """Build the GATK 'RealignerTargetCreator' command for this node.

    Parameters:
      config       -- Object providing 'jar_root' and 'jre_options'.
      reference    -- Path of the reference; the file with its extension
                      swapped for '.dict' is registered as an input too.
      infiles      -- One or more input files (coerced to a tuple).
      outfile      -- Destination of the intervals written via '-o'.
      dependencies -- Passed unchanged to CommandNode.__init__.
    """
    infiles = safe_coerce_to_tuple(infiles)
    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

    builder = AtomicJavaCmdBuilder(gatk_jar, jre_options=config.jre_options)
    for (flag, value) in (("-T", "RealignerTargetCreator"),
                          ("-R", "%(IN_REFERENCE)s"),
                          ("-o", "%(OUT_INTERVALS)s")):
        builder.set_option(flag, value)
    _set_input_files(builder, infiles)

    reference_dict = fileutils.swap_ext(reference, ".dict")
    version_check = _get_gatk_version_check(config)
    builder.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=reference_dict,
                       OUT_INTERVALS=outfile,
                       CHECK_GATK=version_check)

    label = "<Indel Realigner (training): %s -> %r>" \
        % (describe_files(infiles), outfile)
    CommandNode.__init__(self,
                         description=label,
                         command=builder.finalize(),
                         dependencies=dependencies)
def __init__(self, call, threads=1, **kwargs):
    """Create a command builder, wrapping 'call' in 'mpirun' if threads > 1.

    Parameters:
      call    -- The command to run; a single value or a sequence.
      threads -- Number of processes; must be an integer of at least 1.
                 With exactly 1, 'call' is passed on without modification.
      kwargs  -- Forwarded unchanged to AtomicCmdBuilder.__init__.

    Raises:
      TypeError  -- If 'threads' is not an int or long.
      ValueError -- If 'threads' is less than 1.
    """
    integer_types = (types.IntType, types.LongType)
    if not isinstance(threads, integer_types):
        raise TypeError("'threads' must be an integer value, not %r"
                        % threads.__class__.__name__)
    if threads < 1:
        raise ValueError("'threads' must be 1 or greater, not %i" % threads)

    if threads == 1:
        # No need for mpirun when only a single process is requested
        AtomicCmdBuilder.__init__(self, call, **kwargs)
        return

    call = safe_coerce_to_tuple(call)
    mpi_call = ["mpirun", "-n", threads] + list(call)
    # The wrapped executable is passed explicitly via EXEC_MAIN, since
    # 'mpirun' now occupies the first position of the call
    AtomicCmdBuilder.__init__(self, mpi_call, EXEC_MAIN=call[0], **kwargs)
def __init__(self, description, destination, source_nodes):
    """Initialize a node mapping the outputs of 'source_nodes' to 'destination'.

    Parameters:
      description  -- Text inserted into the description of this node.
      destination  -- Passed to reroot_path for each source file in order
                      to obtain the corresponding output path.
      source_nodes -- One or more nodes; their 'output_files' become the
                      input files of this node, and the nodes themselves
                      become its dependencies.
    """
    source_nodes = safe_coerce_to_tuple(source_nodes)

    sources = [fpath
               for node in source_nodes
               for fpath in node.output_files]
    targets = [reroot_path(destination, fpath) for fpath in sources]

    # Pairs of (source path, destination path)
    self._files = zip(sources, targets)

    label = "<Copy %s output to %r>" % (description, destination)
    Node.__init__(self,
                  description=label,
                  input_files=sources,
                  output_files=targets,
                  dependencies=source_nodes)
def __init__(self, config, reference, infiles, outfile, dependencies=()):
    """Build the GATK 'RealignerTargetCreator' command for this node.

    Parameters:
      config       -- Object providing 'jar_root'; also passed on to
                      AtomicJavaCmdBuilder.
      reference    -- Path of the reference; the file with its extension
                      swapped for '.dict' is registered as an input too.
      infiles      -- One or more input files (coerced to a tuple).
      outfile      -- Destination of the intervals written via '-o'.
      dependencies -- Passed unchanged to CommandNode.__init__.
    """
    infiles = safe_coerce_to_tuple(infiles)
    gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")

    builder = AtomicJavaCmdBuilder(config, gatk_jar)
    for (flag, value) in (("-T", "RealignerTargetCreator"),
                          ("-R", "%(IN_REFERENCE)s"),
                          ("-o", "%(OUT_INTERVALS)s")):
        builder.set_option(flag, value)
    _set_input_files(builder, infiles)

    reference_dict = fileutils.swap_ext(reference, ".dict")
    builder.set_kwargs(IN_REFERENCE=reference,
                       IN_REF_DICT=reference_dict,
                       OUT_INTERVALS=outfile)

    label = "<Train Indel Realigner: %i file(s) -> '%s'>" \
        % (len(infiles), outfile)
    CommandNode.__init__(self,
                         description=label,
                         command=builder.finalize(),
                         dependencies=dependencies)
def __init__(self, config, reference, infiles, outfile, dependencies=()):
    """Set up the command running GATK with '-T RealignerTargetCreator'.

    Parameters:
      config       -- Object providing 'jar_root'; also handed to
                      AtomicJavaCmdBuilder.
      reference    -- Reference path given via '-R'; the matching '.dict'
                      file (extension swapped) is registered as an input.
      infiles      -- A single input file or a sequence of input files.
      outfile      -- Path given via '-o'.
      dependencies -- Forwarded to CommandNode.__init__.
    """
    infiles = safe_coerce_to_tuple(infiles)

    cmd_builder = AtomicJavaCmdBuilder(
        config, os.path.join(config.jar_root, "GenomeAnalysisTK.jar"))
    cmd_builder.set_option("-T", "RealignerTargetCreator")
    cmd_builder.set_option("-R", "%(IN_REFERENCE)s")
    cmd_builder.set_option("-o", "%(OUT_INTERVALS)s")
    _set_input_files(cmd_builder, infiles)

    file_kwargs = {
        "IN_REFERENCE": reference,
        "IN_REF_DICT": fileutils.swap_ext(reference, ".dict"),
        "OUT_INTERVALS": outfile,
    }
    cmd_builder.set_kwargs(**file_kwargs)

    summary = "<Train Indel Realigner: %i file(s) -> '%s'>" \
        % (len(infiles), outfile)
    CommandNode.__init__(self,
                         description=summary,
                         command=cmd_builder.finalize(),
                         dependencies=dependencies)
def __init__(self, config, reference, infiles, outfile, dependencies=()):
    """Set up the command running GATK with '-T RealignerTargetCreator'.

    Parameters:
      config       -- Object providing 'jar_root' and 'jre_options'; also
                      used to obtain the GATK version check.
      reference    -- Reference path given via '-R'; the matching '.dict'
                      file (extension swapped) is registered as an input.
      infiles      -- A single input file or a sequence of input files.
      outfile      -- Path given via '-o'.
      dependencies -- Forwarded to CommandNode.__init__.
    """
    infiles = safe_coerce_to_tuple(infiles)

    cmd_builder = AtomicJavaCmdBuilder(
        os.path.join(config.jar_root, "GenomeAnalysisTK.jar"),
        jre_options=config.jre_options)
    cmd_builder.set_option("-T", "RealignerTargetCreator")
    cmd_builder.set_option("-R", "%(IN_REFERENCE)s")
    cmd_builder.set_option("-o", "%(OUT_INTERVALS)s")
    _set_input_files(cmd_builder, infiles)

    ref_dict = fileutils.swap_ext(reference, ".dict")
    gatk_check = _get_gatk_version_check(config)
    cmd_builder.set_kwargs(IN_REFERENCE=reference,
                           IN_REF_DICT=ref_dict,
                           OUT_INTERVALS=outfile,
                           CHECK_GATK=gatk_check)

    summary = "<Indel Realigner (training): %s -> %r>" \
        % (describe_files(infiles), outfile)
    CommandNode.__init__(self,
                         description=summary,
                         command=cmd_builder.finalize(),
                         dependencies=dependencies)
def __init__(self, config, target_name, input_files, output_file,
             intervals_file=None, print_stats=False,
             max_contigs=_MAX_CONTIGS, dependencies=()):
    """Initialize the node labelled 'DepthHistogram'.

    Parameters:
      config         -- Passed to concatenate_input_bams.
      target_name    -- Stored on the node.
      input_files    -- One or more input files; for each, the file with
                        its extension swapped for '.bai' is also required.
      output_file    -- Output file of the node.
      intervals_file -- Optional; if given, it is added to the required
                        files and 'coverageBed' is required instead of
                        'genomeCoverageBed'.
      print_stats    -- Stored on the node.
      max_contigs    -- Stored on the node.
      dependencies   -- Passed unchanged to Node.__init__.
    """
    self._target_name = target_name
    self._input_files = safe_coerce_to_tuple(input_files)
    self._output_file = output_file
    self._intervals = intervals_file
    self._print_stats = print_stats
    self._max_contigs = max_contigs
    self._max_contigs_reached = False

    # Input files, followed by their indexes, followed by the intervals
    required_files = list(self._input_files)
    required_files.extend(swap_ext(filename, ".bai")
                          for filename in self._input_files)
    if intervals_file:
        required_files.append(intervals_file)

    if intervals_file:
        executables = ["coverageBed"]
    else:
        executables = ["genomeCoverageBed"]

    # Requirements of the (optional) command used to merge the input files
    auxiliary_files = []
    merge_commands = concatenate_input_bams(config, self._input_files)[0]
    for merge_command in merge_commands:
        executables.extend(merge_command.executables)
        auxiliary_files.extend(merge_command.auxiliary_files)

    label = "<DepthHistogram: %s -> '%s'>" \
        % (describe_files(self._input_files), self._output_file)
    Node.__init__(self,
                  description=label,
                  input_files=required_files,
                  output_files=self._output_file,
                  dependencies=dependencies,
                  executables=executables,
                  auxiliary_files=auxiliary_files)
def __init__(self, config, prefix, lanes, name):
    """Collect the BAMs of one or more lanes and build the library node.

    Parameters:
      config -- Passed on to the duplicate-removal and rescaling steps.
      prefix -- Passed on to the duplicate-removal and rescaling steps.
      lanes  -- A single lane or a sequence of lanes; every lane must be
                located in a sub-folder of the same parent folder.
      name   -- Name of the library, stored as 'self.name'.

    The options of the first lane decide whether PCR duplicates are
    removed ("PCRDuplicates") and whether quality scores are rescaled
    ("RescaleQualities").
    """
    self.name = name
    self.lanes = safe_coerce_to_tuple(lanes)

    # Read from the coerced tuple rather than the raw argument, so that a
    # single lane (or any non-indexable collection) is handled the same way
    # as everywhere else in this method.
    first_lane = self.lanes[0]
    self.options = first_lane.options
    self.folder = os.path.dirname(first_lane.folder)
    self.is_rmdupped = self.options["PCRDuplicates"]
    self.is_rescaled = self.options["RescaleQualities"]
    assert all((self.folder == os.path.dirname(lane.folder))
               for lane in self.lanes)

    bams = self._collect_bams(self.lanes)
    if self.is_rmdupped:
        bams = self._remove_pcr_duplicates(config, prefix, bams)

    if self.is_rescaled:
        bams = self._rescale_quality_scores(config, prefix, bams)

    # Merge the per-key dictionaries into a single dictionary, the values
    # of which are used as the dependencies of the library node.
    self.bams = {}
    for files_and_nodes in bams.itervalues():
        self.bams.update(files_and_nodes)

    self.node = MetaNode(
        description="Library: %s" % os.path.basename(self.folder),
        dependencies=self.bams.values())
def __init__(self, config, input_bams, pipename="input.bam"):
    """Prepare the commands/kwargs needed to read one or more BAM files.

    Parameters:
      config     -- Passed to picard_command when a merge is required.
      input_bams -- A single filename or an iterable of filenames.
      pipename   -- Name used for the temporary input ("TEMP_IN_BAM") and,
                    when merging, for the merge output ("TEMP_OUT_BAM").

    Attributes set:
      pipe     -- The value of 'pipename'.
      files    -- Tuple of input filenames.
      commands -- List with a single MergeSamFiles command if more than
                  one file was given, otherwise an empty list.
      kwargs   -- Keyword arguments for the command consuming the input.
    """
    self.pipe = pipename
    self.files = safe_coerce_to_tuple(input_bams)
    self.commands = []
    self.kwargs = {"TEMP_IN_BAM": self.pipe}

    if len(self.files) > 1:
        params = picard_command(config, "MergeSamFiles")

        params.set_option("SO", "coordinate", sep="=", fixed=False)
        params.set_option("CREATE_INDEX", "False", sep="=")
        params.set_option("COMPRESSION_LEVEL", 0, sep="=")
        params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        # Use the coerced tuple rather than the raw argument; a one-shot
        # iterable would already have been consumed by the coercion above.
        params.add_multiple_options("I", self.files, sep="=")

        params.set_kwargs(TEMP_OUT_BAM=self.pipe)

        self.commands = [params.finalize()]
    else:
        # Ensure that the actual command depends on the input
        self.kwargs["IN_FILE_00"] = self.files[0]
        self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
             dependencies=(), file_dependencies=()):
    """Initialize the node labelled 'FastaToPartitionedPhy'.

    infiles = {names : {"partitions" : ..., "filenames" : [...]}}

    Parameters:
      infiles           -- Dictionary of dictionaries, as outlined above;
                           a deep copy is stored on the node.
      out_prefix        -- Prefix of the '.phy' and '.partitions' outputs.
      exclude_groups    -- Stored on the node as a frozenset.
      reduce            -- Stored on the node as a bool.
      dependencies      -- Passed unchanged to Node.__init__.
      file_dependencies -- Optional additional input file(s).

    Raises:
      TypeError  -- If 'infiles' is not a dictionary of dictionaries.
      ValueError -- If a sub-dictionary contains keys not in _VALID_KEYS,
                    or if its "filenames" value is not a list.
    """
    is_dict_of_dicts = isinstance(infiles, dict) \
        and all(isinstance(dd, dict) for dd in infiles.values())
    if not is_dict_of_dicts:
        raise TypeError("'infiles' must be a dictionary of dictionaries")

    input_filenames = []
    for (name, subdd) in infiles.iteritems():
        unknown_keys = set(subdd) - _VALID_KEYS
        if unknown_keys:
            raise ValueError("Invalid keys found for %r: %s"
                             % (name, ", ".join(unknown_keys)))
        if not isinstance(subdd["filenames"], list):
            raise ValueError("filenames must be a list of strings")
        input_filenames.extend(subdd["filenames"])

    # Optional file dependencies; used to depend on the list of sequences
    input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

    self._reduce = bool(reduce)
    self._infiles = copy.deepcopy(infiles)
    self._out_prefix = out_prefix
    self._excluded = safe_coerce_to_frozenset(exclude_groups)

    if reduce:
        mode = " (reducing)"
    else:
        mode = ""
    label = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" \
        % (mode, len(infiles), out_prefix)

    Node.__init__(self,
                  description=label,
                  input_files=input_filenames,
                  output_files=[out_prefix + ".phy",
                                out_prefix + ".partitions"],
                  dependencies=dependencies)