def ignore_task(self, infiles, outfiles, params):
    """Return True if this task matches one of the ignore patterns.

    An ignored task still gets its output file(s) created as empty
    placeholders so that downstream tasks remain satisfied.
    """
    if not self._ignore:
        return False

    haystack = str(outfiles)
    for pattern in IOTools.val2list(self._ignore):
        if pattern not in haystack:
            continue
        E.warn("task {} will be ignored".format(self.__name__))
        for filename in IOTools.val2list(outfiles):
            E.info("creating empty file {}".format(filename))
            IOTools.touch_file(filename)
        return True
    return False
def add_collations_to_pipeline(pipeline, map_tool_to_runner, collations,
                               tasks=None, config=None, **kwargs):
    """add collation tasks to a workflow pipeline.

    For each collation section in the configuration a ruffus
    ``collate`` task is created that groups input files by
    ``regex_in`` and writes output according to ``pattern_out``.

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner : dict
        Dictionary mapping runner names to runner classes.
    collations : list
        Names of the collations to add; each must have a matching
        section in `config` containing at least the keywords
        ``runner``, ``regex_in`` and ``pattern_out``.
    tasks : list, optional
        Upstream tasks to use as input. If not given, each collation
        section must provide a ``glob`` expression instead.
    config : dict
        Configuration dictionary.

    Returns
    -------
    list
        The tasks that have been added to the pipeline.

    Raises
    ------
    KeyError
        If a collation has no section in `config`.
    ValueError
        If a required keyword is missing, or neither `tasks` nor a
        ``glob`` expression is available.
    """
    runners = []

    # copy before extending so that the lists stored inside config
    # are not mutated across repeated calls
    ignore = list(config["setup"].get("ignore", []))
    ignore.extend(config["input"].get("ignore", []))

    for coll in collations:
        if coll not in config:
            raise KeyError(
                "configuration file requires a section for '{}'".format(coll))

        coll_info = config[coll]
        for keyword in ("runner", "regex_in", "pattern_out"):
            if keyword not in coll_info:
                raise ValueError(
                    "section {} is missing required keyword '{}'".format(
                        coll, keyword))

        runner_options = config.get(coll_info["runner"], {})
        runner_name = runner_options.get("name", coll_info["runner"]).strip()
        colcc = map_tool_to_runner[runner_name]
        taskf = colcc(**runner_options)

        # automatically set alias through regex (required field)
        taskf._input_regex = coll_info.get("regex", None)
        taskf._input_alias = coll_info.get("alias", None)
        taskf.__name__ = coll

        if tasks is not None:
            input_tasks = tasks
        elif "glob" in coll_info:
            input_tasks = coll_info["glob"]
        else:
            raise ValueError("need either tasks or glob expression "
                             "for collation")

        filter_regex = ruffus.regex(coll_info["regex_in"])

        result_dir = coll + ".dir"
        output_pattern = coll_info["pattern_out"]
        output_prefix = r"{}/{}".format(result_dir, output_pattern)
        output_dir = os.path.dirname(output_prefix)

        if hasattr(taskf, "output"):
            output, multiple_outputs, flexible_outputs, _suffix = \
                build_output(taskf, output_dir)
        else:
            multiple_outputs = False
            output = output_prefix

        # skip collations whose result directory matches an ignore
        # pattern
        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True
        if found:
            continue

        metric_task = pipeline.collate(
            task_func=taskf,
            input=input_tasks,
            filter=filter_regex,
            output=output,
            **kwargs).mkdir(
                input_tasks,
                filter_regex,
                output_dir)

        if multiple_outputs:
            # split the task so that each output file can be
            # processed separately further down the pipeline
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_passthrough"
            # turn back-references in the output patterns into globs
            output = [re.sub(r"\\\d+", "*", x) for x in output]
            metric_task = pipeline.split(
                task_func=f,
                input=metric_task,
                output=output)

        runners.append(metric_task)

    return runners
def add_tools_to_pipeline(pipeline, map_tool_to_runner, config=None,
                          input_files=None, **kwargs):
    """add tools to a workflow pipeline.

    This function adds for each input and tool combination a task to
    the workflow.

    The configuration dictionary should contain the following
    sections:

    input:
        Configuration of input files. Key/value pairs and possibly
        hierarchical. The following keys are optional: regex, alias,
        group_regex, group_alias

    tool:
        A list of tools to apply.

    A typical configuration dictionary might look like this::

        {"input": {"bam": "*.bam"},
         "tool": ["bwa_mem", "isaac"]}

    Arguments
    ---------
    pipeline : object
        The ruffus pipeline that tasks will be added to.
    map_tool_to_runner: dict
        Dictionary mapping tools to functions in the
        :ref:`TaskLibrary`.
    config: dict
        Configuration dictionary.
    input_files: list
        List of (optional) input files.

    Returns
    -------
    tuple
        The common output suffix of all tools and the list of tasks
        added to the pipeline.

    Raises
    ------
    KeyError
        If `config` has no ``input`` section.
    ValueError
        If the ``input`` section is empty, or tools produce output
        files of differing type.
    """
    tool_functions = build_tool_functions(map_tool_to_runner, config)

    if "input" not in config:
        raise KeyError("configuration file requires an 'input' section")
    if config["input"] is None:
        raise ValueError("input section is empty")

    # remove processing metadata from the input section before glob
    # expansion
    input_regex = config["input"].pop("regex", None)
    input_alias = config["input"].pop("alias", None)
    input_group_regex = config["input"].pop("group_regex", None)
    input_group_alias = config["input"].pop("group_alias", "\\1")

    is_test = "is_test" in config

    # update selected fields for testing purposes
    if "test" in config["input"]:
        config["input"].update(config["input"]["test"])
        del config["input"]["test"]

    config_files = expand_globs(config["input"], is_test=is_test)

    if input_group_regex:
        config_files = group_files(config_files,
                                   input_group_regex,
                                   input_group_alias)

    input_combos = build_combinations(config_files)

    tool_runners = []

    # copy before extending so that the lists stored inside config
    # are not mutated across repeated calls
    ignore = list(config["setup"].get("ignore", []))
    ignore.extend(config["input"].get("ignore", []))

    make_unique = check_unique(tool_functions,
                               input_combos=input_combos,
                               input_regex=input_regex,
                               input_alias=input_alias,
                               is_test=is_test)

    suffix = None
    # NOTE: the loop variable deliberately shadows the (unused)
    # ``input_files`` parameter, preserving historical behaviour
    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):

        # create a copy of the task function and give it its unique
        # name by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files,
                             regex=input_regex,
                             alias=input_alias,
                             make_unique=make_unique,
                             is_test=is_test)

        if "name" in input_files:
            # create copy of input_files without name, do not modify
            # original as different tools require the 'name'
            input_files = {x: y for x, y in input_files.items()
                           if x != "name"}

        result_dir = taskf.__name__ + ".dir"

        # skip tasks whose result directory matches an ignore pattern
        found = False
        for i in IOTools.val2list(ignore):
            if i in result_dir:
                P.get_logger().warn(
                    "the following task will be ignored: "
                    "{} matching {}".format(
                        result_dir, i))
                found = True
        if found:
            continue

        output, multiple_outputs, flexible_outputs, _suffix = \
            build_output(taskf, result_dir)

        # all tools must agree on the output suffix
        if suffix is None:
            suffix = _suffix
        elif suffix != _suffix:
            raise ValueError(
                "tools produce output files of different type, "
                "got {}, expected {}".format(_suffix, suffix))

        tool_task = pipeline.merge(
            task_func=taskf,
            input=list(input_files.values()),
            output=output,
            **kwargs).mkdir(result_dir)

        # if there are multiple output files, split the task so that
        # each output file will be processed separately further down
        # the pipeline.
        if multiple_outputs:
            f = EmptyRunner()
            f.__name__ = taskf.__name__ + "_split"
            tool_task = pipeline.split(
                task_func=f,
                input=tool_task,
                output=output)

        tool_runners.append(tool_task)

    # convenience target collecting all tool tasks
    f = EmptyRunner()
    f.__name__ = "tools"
    pipeline.merge(task_func=f, input=tool_runners, output=None)

    return suffix, tool_runners