def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
    self.rule = rule
    self.dag = dag
    self.targetfile = targetfile
    self.wildcards_dict = self.rule.get_wildcards(targetfile)
    self.wildcards = Wildcards(fromdict=self.wildcards_dict)
    self._format_wildcards = (self.wildcards if format_wildcards is None
                              else Wildcards(fromdict=format_wildcards))

    (self.input, self.output, self.params, self.log, self.benchmark,
     self.ruleio, self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

    self.resources_dict = {}
    for name, res in rule.resources.items():
        if callable(res):
            res = res(self.wildcards)
            if not isinstance(res, int):
                raise ValueError("Callable for resources must return int")
        self.resources_dict[name] = min(
            self.rule.workflow.global_resources.get(name, res), res)

    self.threads = self.resources_dict["_cores"]
    self.resources = Resources(fromdict=self.resources_dict)
    self.shadow_dir = None
    self._inputsize = None

    self.dynamic_output, self.dynamic_input = set(), set()
    self.temp_output, self.protected_output = set(), set()
    self.touch_output = set()
    self.subworkflow_input = dict()
    for f in self.output:
        f_ = self.ruleio[f]
        if f_ in self.rule.dynamic_output:
            self.dynamic_output.add(f)
        if f_ in self.rule.temp_output:
            self.temp_output.add(f)
        if f_ in self.rule.protected_output:
            self.protected_output.add(f)
        if f_ in self.rule.touch_output:
            self.touch_output.add(f)
    for f in self.input:
        f_ = self.ruleio[f]
        if f_ in self.rule.dynamic_input:
            self.dynamic_input.add(f)
        if f_ in self.rule.subworkflow_input:
            self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
    self._hash = self.rule.__hash__()
    if True or not self.dynamic_output:
        for o in self.output:
            self._hash ^= o.__hash__()
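# --- Illustration (not from the source) ---
# A minimal standalone sketch, with assumed values, of the capping applied in the
# __init__ above: each requested resource is clamped against the matching entry in
# workflow.global_resources when one exists, otherwise used unchanged.
global_resources = {"_cores": 8, "mem_mb": 16000}        # assumed global limits
requested = {"_cores": 4, "mem_mb": 32000, "disk_mb": 1000}

capped = {
    name: min(global_resources.get(name, value), value)
    for name, value in requested.items()
}
assert capped == {"_cores": 4, "mem_mb": 16000, "disk_mb": 1000}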
def resources(self):
    if self._resources is None:
        self._resources = defaultdict(int)
        self._resources["_nodes"] = 1
        pipe_group = any([job.is_pipe for job in self.jobs])
        # iterate over siblings that can be executed in parallel
        for siblings in self.toposorted:
            sibling_resources = defaultdict(int)
            for job in siblings:
                try:
                    job_resources = job.resources
                except FileNotFoundError:
                    # Skip job if resource evaluation leads to a file not found error.
                    # This will be caused by an inner job, which needs files created by
                    # the same group. All we can do is to ignore such jobs for now.
                    continue
                for res, value in job_resources.items():
                    if res != "_nodes":
                        sibling_resources[res] += value

            for res, value in sibling_resources.items():
                if res != "_nodes":
                    if self.dag.workflow.run_local or pipe_group:
                        # in case of local execution, this must be a
                        # group of jobs that are connected with pipes
                        # and have to run simultaneously
                        self._resources[res] += value
                    else:
                        # take the maximum with previous values
                        self._resources[res] = max(
                            self._resources.get(res, 0), value)
    return Resources(fromdict=self._resources)
def expand_resources(self, wildcards, input, attempt):
    resources = dict()

    def apply(name, res, threads=None):
        if callable(res):
            aux = {"threads": threads} if threads is not None else dict()
            res = self.apply_input_function(res,
                                            wildcards,
                                            input=input,
                                            attempt=attempt,
                                            **aux)
            if not isinstance(res, int):
                raise WorkflowError(
                    "Resources function did not return int.")
        res = min(self.workflow.global_resources.get(name, res), res)
        return res

    threads = apply("_cores", self.resources["_cores"])
    resources["_cores"] = threads

    for name, res in self.resources.items():
        if name != "_cores":
            resources[name] = apply(name, res)

    resources = Resources(fromdict=resources)
    return resources
def resources(self):
    if self._resources is None:
        self._resources = defaultdict(int)
        pipe_group = any([
            any([is_flagged(o, "pipe") for o in job.output])
            for job in self.jobs
        ])
        for job in self.jobs:
            try:
                job_resources = job.resources
            except FileNotFoundError:
                # Skip job if resource evaluation leads to a file not found error.
                # This will be caused by an inner job, which needs files created by
                # the same group. All we can do is to ignore such jobs for now.
                continue
            for res, value in job_resources.items():
                if self.dag.workflow.run_local or pipe_group:
                    # in case of local execution, this must be a
                    # group of jobs that are connected with pipes
                    # and have to run simultaneously
                    self._resources[res] += value
                else:
                    # take the maximum over all jobs
                    self._resources[res] = max(
                        self._resources.get(res, value), value)
    return Resources(fromdict=self._resources)
def expand_resources(self, wildcards, input, attempt):
    resources = dict()

    def apply(name, res, threads=None):
        if callable(res):
            aux = dict(rulename=self.name)
            if threads:
                aux["threads"] = threads
            res = self.apply_input_function(
                res,
                wildcards,
                input=input,
                attempt=attempt,
                incomplete_checkpoint_func=lambda e: 0,
                **aux)
            if not isinstance(res, int):
                raise WorkflowError(
                    "Resources function did not return int.")
        res = min(self.workflow.global_resources.get(name, res), res)
        return res

    threads = apply("_cores", self.resources["_cores"])
    resources["_cores"] = threads

    for name, res in self.resources.items():
        if name != "_cores":
            resources[name] = apply(name, res, threads=threads)

    resources = Resources(fromdict=resources)
    return resources
def expand_resources(self, wildcards, input, attempt):
    resources = dict()

    def apply(name, res, threads=None):
        if callable(res):
            aux = dict(rulename=self.name)
            if threads is not None:
                aux["threads"] = threads
            try:
                res, _ = self.apply_input_function(
                    res,
                    wildcards,
                    input=input,
                    attempt=attempt,
                    incomplete_checkpoint_func=lambda e: 0,
                    raw_exceptions=True,
                    **aux,
                )
            except (Exception, BaseException) as e:
                raise InputFunctionException(e, rule=self, wildcards=wildcards)

        if isinstance(res, float):
            # round to integer
            res = int(round(res))

        if not isinstance(res, int) and not isinstance(res, str):
            raise WorkflowError(
                f"Resource {name} is neither int, float (would be rounded to the nearest int), nor str.",
                rule=self,
            )

        global_res = self.workflow.global_resources.get(name)
        if global_res is not None:
            if not isinstance(res, TBDString) and type(res) != type(global_res):
                global_type = (
                    "an int" if isinstance(global_res, int) else type(global_res)
                )
                raise WorkflowError(
                    f"Resource {name} is of type {type(res).__name__} but global resource constraint "
                    f"defines {global_type} with value {global_res}. "
                    "Resources with the same name need to have the same types (int, float, or str are allowed).",
                    rule=self,
                )
            if isinstance(res, int):
                res = min(global_res, res)
        return res

    threads = apply("_cores", self.resources["_cores"])
    if self.workflow.max_threads is not None:
        threads = min(threads, self.workflow.max_threads)
    resources["_cores"] = threads

    for name, res in self.resources.items():
        if name != "_cores":
            resources[name] = apply(name, res, threads=threads)

    resources = Resources(fromdict=resources)
    return resources
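# --- Illustration (not from the source) ---
# A minimal standalone sketch, with assumed values, of what apply() above does to a
# float returned by a resource callable: round it to the nearest int, then cap it
# against the global constraint when one is defined and the types match.
res = 1536.4                       # e.g. returned by a resource callable
res = int(round(res))              # -> 1536
global_res = 1024                  # assumed global constraint for this resource
if isinstance(res, int) and global_res is not None:
    res = min(global_res, res)     # -> 1024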
def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
    self.rule = rule
    self.dag = dag
    self.targetfile = targetfile
    self.wildcards_dict = self.rule.get_wildcards(targetfile)
    self.wildcards = Wildcards(fromdict=self.wildcards_dict)
    self._format_wildcards = (self.wildcards if format_wildcards is None
                              else Wildcards(fromdict=format_wildcards))

    (self.input, self.output, self.params, self.log, self.benchmark,
     self.ruleio, self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

    self.resources_dict = {
        name: min(self.rule.workflow.global_resources.get(name, res), res)
        for name, res in rule.resources.items()
    }
    self.threads = self.resources_dict["_cores"]
    self.resources = Resources(fromdict=self.resources_dict)
    self._inputsize = None

    self.dynamic_output, self.dynamic_input = set(), set()
    self.temp_output, self.protected_output = set(), set()
    self.touch_output = set()
    self.subworkflow_input = dict()
    for f in self.output:
        f_ = self.ruleio[f]
        if f_ in self.rule.dynamic_output:
            self.dynamic_output.add(f)
        if f_ in self.rule.temp_output:
            self.temp_output.add(f)
        if f_ in self.rule.protected_output:
            self.protected_output.add(f)
        if f_ in self.rule.touch_output:
            self.touch_output.add(f)
    for f in self.input:
        f_ = self.ruleio[f]
        if f_ in self.rule.dynamic_input:
            self.dynamic_input.add(f)
        if f_ in self.rule.subworkflow_input:
            self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
    self._hash = self.rule.__hash__()
    if True or not self.dynamic_output:
        for o in self.output:
            self._hash ^= o.__hash__()
def expand_resources(self, wildcards, input):
    resources = dict()
    for name, res in self.resources.items():
        if callable(res):
            res = self.apply_input_function(res, wildcards, input=input)
            if not isinstance(res, int):
                raise WorkflowError(
                    "Resources function did not return int.")
        res = min(self.workflow.global_resources.get(name, res), res)
        resources[name] = res
    resources = Resources(fromdict=resources)
    return resources
def expand_resources(self, wildcards, input, attempt):
    resources = dict()

    def apply(name, res, threads=None):
        if callable(res):
            aux = dict(rulename=self.name)
            if threads:
                aux["threads"] = threads
            try:
                try:
                    res, _ = self.apply_input_function(
                        res,
                        wildcards,
                        input=input,
                        attempt=attempt,
                        incomplete_checkpoint_func=lambda e: 0,
                        raw_exceptions=True,
                        **aux)
                except FileNotFoundError as e:
                    # Resources can depend on input files. Since expansion can happen
                    # during dryrun, where input files are not yet present, we need to
                    # skip such resources and mark them as [TBD].
                    if e.filename in input:
                        # use zero for resource if it cannot yet be determined
                        res = TBDInt(0)
                    else:
                        raise e
            except (Exception, BaseException) as e:
                raise InputFunctionException(e, rule=self, wildcards=wildcards)

        if not isinstance(res, int) and not isinstance(res, str):
            raise WorkflowError(
                "Resources function did not return int or str.", rule=self)
        if isinstance(res, int):
            global_res = self.workflow.global_resources.get(name, res)
            if global_res is not None:
                res = min(global_res, res)
        return res

    threads = apply("_cores", self.resources["_cores"])
    resources["_cores"] = threads

    for name, res in self.resources.items():
        if name != "_cores":
            resources[name] = apply(name, res, threads=threads)

    resources = Resources(fromdict=resources)
    return resources
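# --- Illustration (not from the source) ---
# A hypothetical resource callable of the shape apply() above evaluates; the name
# mem_mb and the doubling factor are illustrative only. Such a callable may receive
# the keyword arguments passed above (wildcards, input, attempt, and optionally
# threads) and must return an int or a str.
def mem_mb(wildcards, input, attempt, threads):
    # grant more memory on every restart attempt
    return 2000 * attempt
# e.g. on the second attempt this would request 4000 MB before global capping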
def expand_resources(self, wildcards, input, attempt):
    resources = dict()

    def apply(name, res, threads=None):
        if callable(res):
            aux = dict(rulename=self.name)
            if threads is not None:
                aux["threads"] = threads
            try:
                res, _ = self.apply_input_function(
                    res,
                    wildcards,
                    input=input,
                    attempt=attempt,
                    incomplete_checkpoint_func=lambda e: 0,
                    raw_exceptions=True,
                    **aux)
            except (Exception, BaseException) as e:
                raise InputFunctionException(e, rule=self, wildcards=wildcards)

        if isinstance(res, float):
            # round to integer
            res = int(round(res))

        if not isinstance(res, int) and not isinstance(res, str):
            raise WorkflowError(
                "Resources function did not return int, float (floats are "
                "rounded to the nearest integer), or str.",
                rule=self,
            )
        if isinstance(res, int):
            global_res = self.workflow.global_resources.get(name, res)
            if global_res is not None:
                res = min(global_res, res)
        return res

    threads = apply("_cores", self.resources["_cores"])
    if self.workflow.max_threads is not None:
        threads = min(threads, self.workflow.max_threads)
    resources["_cores"] = threads

    for name, res in self.resources.items():
        if name != "_cores":
            resources[name] = apply(name, res, threads=threads)

    resources = Resources(fromdict=resources)
    return resources
def resources(self):
    if self._resources is None:
        self._resources = defaultdict(int)
        # take the maximum over all jobs
        pipe_group = any([
            any([is_flagged(o, "pipe") for o in job.output])
            for job in self.jobs
        ])
        for job in self.jobs:
            for res, value in job.resources.items():
                if self.dag.workflow.run_local or pipe_group:
                    # in case of local execution, this must be a
                    # group of jobs that are connected with pipes
                    # and have to run simultaneously
                    self._resources[res] += value
                else:
                    self._resources[res] = max(
                        self._resources.get(res, value), value)
    return Resources(fromdict=self._resources)
def resources(self):
    if self._resources is None:

        def check_string_resource(res, value1, value2):
            if value1 != value2:
                raise WorkflowError(
                    "Failed to group jobs together. Resource {} "
                    "is a string but not all group jobs require the same value. "
                    "Observed: {} != {}.".format(res, value1, value2))

        self._resources = defaultdict(int)
        self._resources["_nodes"] = 1
        pipe_group = any([job.is_pipe for job in self.jobs])
        # iterate over siblings that can be executed in parallel
        for siblings in self.toposorted:
            sibling_resources = defaultdict(int)
            for job in siblings:
                try:
                    job_resources = job.resources
                except FileNotFoundError:
                    # Skip job if resource evaluation leads to a file not found error.
                    # This will be caused by an inner job, which needs files created by
                    # the same group. All we can do is to ignore such jobs for now.
                    continue
                for res, value in job_resources.items():
                    if isinstance(value, int):
                        if res != "_nodes":
                            sibling_resources[res] += value
                    elif isinstance(value, TBDString):
                        # we omit TBDs
                        continue
                    else:
                        # all string resources must be the same for all group jobs
                        if res in sibling_resources:
                            check_string_resource(res, sibling_resources[res], value)
                        else:
                            sibling_resources[res] = value

            for res, value in sibling_resources.items():
                if isinstance(value, int):
                    if res != "_nodes":
                        if self.dag.workflow.run_local or pipe_group:
                            # in case of local execution, this must be a
                            # group of jobs that are connected with pipes
                            # and have to run simultaneously
                            self._resources[res] += value
                        else:
                            # take the maximum with previous values
                            self._resources[res] = max(
                                self._resources.get(res, 0), value)
                elif isinstance(value, TBDString):
                    # we omit TBDs
                    continue
                else:
                    # all string resources must be the same for all group jobs
                    if res in self._resources:
                        check_string_resource(res, self._resources[res], value)
                    else:
                        self._resources[res] = value
    return Resources(fromdict=self._resources)
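# --- Illustration (not from the source) ---
# A standalone sketch, with made-up numbers, of the integer aggregation performed
# above: resources of jobs in the same toposort level (which can run in parallel)
# add up, while sequential levels only contribute their maximum.
levels = [
    [{"mem_mb": 1000, "runtime": 30}, {"mem_mb": 2000, "runtime": 10}],  # run in parallel
    [{"mem_mb": 1500, "runtime": 20}],                                   # runs afterwards
]
group = {}
for level in levels:
    per_level = {}
    for job_res in level:
        for name, value in job_res.items():
            per_level[name] = per_level.get(name, 0) + value   # parallel jobs add up
    for name, value in per_level.items():
        group[name] = max(group.get(name, 0), value)           # sequential levels take the max
assert group == {"mem_mb": 3000, "runtime": 40}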
class Job:
    HIGHEST_PRIORITY = sys.maxsize

    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile
        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio, self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {
            name: min(self.rule.workflow.global_resources.get(name, res), res)
            for name, res in rule.resources.items()
        }
        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()

    @property
    def priority(self):
        return self.dag.priority(self)

    @property
    def b64id(self):
        return base64.b64encode(
            (self.rule.name + "".join(self.output)).encode("utf-8")).decode("utf-8")

    @property
    def inputsize(self):
        """
        Return the size of the input files.
        Input files need to be present.
        """
        if self._inputsize is None:
            self._inputsize = sum(f.size for f in self.input)
        return self._inputsize

    @property
    def message(self):
        """ Return the message for this job. """
        try:
            return (self.format_wildcards(self.rule.message)
                    if self.rule.message else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable in message "
                                "of shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def shellcmd(self):
        """ Return the shell command. """
        try:
            return (self.format_wildcards(self.rule.shellcmd)
                    if self.rule.shellcmd else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable when printing "
                                "shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def expanded_output(self):
        """ Iterate over output files while dynamic output is expanded. """
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                expansion = self.expand_dynamic(
                    f_,
                    restriction=self.wildcards,
                    omit_value=_IOFile.dynamic_fill)
                if not expansion:
                    yield f_
                for f, _ in expansion:
                    yield IOFile(f, self.rule)
            else:
                yield f

    @property
    def dynamic_wildcards(self):
        """ Return all wildcard values determined from dynamic output. """
        combinations = set()
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                for f, w in self.expand_dynamic(
                        f_,
                        restriction=self.wildcards,
                        omit_value=_IOFile.dynamic_fill):
                    combinations.add(tuple(w.items()))
        wildcards = defaultdict(list)
        for combination in combinations:
            for name, value in combination:
                wildcards[name].append(value)
        return wildcards

    @property
    def missing_input(self):
        """ Return missing input files. """
        # omit file if it comes from a subworkflow
        return set(f for f in self.input
                   if not f.exists and not f in self.subworkflow_input)

    @property
    def output_mintime(self):
        """ Return oldest output file. """
        existing = [f.mtime for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime)
        if existing:
            return min(existing)
        return None

    @property
    def input_maxtime(self):
        """ Return newest input file. """
        existing = [f.mtime for f in self.input if f.exists]
        if existing:
            return max(existing)
        return None

    def missing_output(self, requested=None):
        """ Return missing output files. """
        files = set()
        if self.benchmark and (requested is None
                               or self.benchmark in requested):
            if not self.benchmark.exists:
                files.add(self.benchmark)

        for f, f_ in zip(self.output, self.rule.output):
            if requested is None or f in requested:
                if f in self.dynamic_output:
                    if not self.expand_dynamic(
                            f_,
                            restriction=self.wildcards,
                            omit_value=_IOFile.dynamic_fill):
                        files.add("{} (dynamic)".format(f_))
                elif not f.exists:
                    files.add(f)
        return files

    @property
    def existing_output(self):
        return filter(lambda f: f.exists, self.expanded_output)

    def check_protected_output(self):
        protected = list(filter(lambda f: f.protected, self.expanded_output))
        if protected:
            raise ProtectedOutputException(self.rule, protected)

    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        """
        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        if self.dynamic_output:
            for f, _ in chain(*map(
                    partial(self.expand_dynamic,
                            restriction=self.wildcards,
                            omit_value=_IOFile.dynamic_fill),
                    self.rule.dynamic_output)):
                os.remove(f)
        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()
        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]
        if to_remove:
            logger.info("Removing output files of failed job {}"
                        " since they might be corrupted:\n{}".format(
                            self, ", ".join(to_remove)))
            for f in to_remove:
                f.remove()

    def format_wildcards(self, string, **variables):
        """ Format a string with variables from the job. """
        _variables = dict()
        _variables.update(self.rule.workflow.globals)
        _variables.update(
            dict(
                input=self.input,
                output=self.output,
                params=self.params,
                wildcards=self._format_wildcards,
                threads=self.threads,
                resources=self.resources,
                log=self.log,
                version=self.rule.version,
                rule=self.rule.name,
            ))
        _variables.update(variables)
        try:
            return format(string, **_variables)
        except NameError as ex:
            raise RuleException("NameError: " + str(ex), rule=self.rule)
        except IndexError as ex:
            raise RuleException("IndexError: " + str(ex), rule=self.rule)

    def properties(self, omit_resources="_cores _nodes".split()):
        resources = {
            name: res
            for name, res in self.resources.items()
            if name not in omit_resources
        }
        params = {name: value for name, value in self.params.items()}
        properties = {
            "rule": self.rule.name,
            "local": self.dag.workflow.is_local(self.rule),
            "input": self.input,
            "output": self.output,
            "params": params,
            "threads": self.threads,
            "resources": resources
        }
        return properties

    def json(self):
        return json.dumps(self.properties())

    def __repr__(self):
        return self.rule.name

    def __eq__(self, other):
        if other is None:
            return False
        return self.rule == other.rule and (
            self.dynamic_output
            or self.wildcards_dict == other.wildcards_dict)

    def __lt__(self, other):
        return self.rule.__lt__(other.rule)

    def __gt__(self, other):
        return self.rule.__gt__(other.rule)

    def __hash__(self):
        return self._hash

    @staticmethod
    def expand_dynamic(pattern, restriction=None, omit_value=None):
        """ Expand dynamic files. """
        return list(
            listfiles(pattern, restriction=restriction, omit_value=omit_value))
class Job:
    HIGHEST_PRIORITY = sys.maxsize

    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile
        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio, self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {}
        for name, res in rule.resources.items():
            if callable(res):
                res = res(self.wildcards)
                if not isinstance(res, int):
                    raise ValueError("Callable for resources must return int")
            self.resources_dict[name] = min(
                self.rule.workflow.global_resources.get(name, res), res)

        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self.shadow_dir = None
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()

    @property
    def is_shadow(self):
        return self.rule.shadow_depth is not None

    @property
    def priority(self):
        return self.dag.priority(self)

    @property
    def b64id(self):
        return base64.b64encode(
            (self.rule.name + "".join(self.output)).encode("utf-8")).decode("utf-8")

    @property
    def inputsize(self):
        """
        Return the size of the input files.
        Input files need to be present.
        """
        if self._inputsize is None:
            self._inputsize = sum(f.size for f in self.input)
        return self._inputsize

    @property
    def message(self):
        """ Return the message for this job. """
        try:
            return (self.format_wildcards(self.rule.message)
                    if self.rule.message else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable in message "
                                "of shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def shellcmd(self):
        """ Return the shell command. """
        try:
            return (self.format_wildcards(self.rule.shellcmd)
                    if self.rule.shellcmd else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable when printing "
                                "shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def expanded_output(self):
        """ Iterate over output files while dynamic output is expanded. """
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                expansion = self.expand_dynamic(f_)
                if not expansion:
                    yield f_
                for f, _ in expansion:
                    file_to_yield = IOFile(f, self.rule)
                    file_to_yield.clone_flags(f_)
                    yield file_to_yield
            else:
                yield f

    def shadowed_path(self, f):
        """ Get the shadowed path of IOFile f. """
        if not self.shadow_dir:
            return f
        f_ = IOFile(os.path.join(self.shadow_dir, f), self.rule)
        f_.clone_flags(f)
        return f_

    @property
    def dynamic_wildcards(self):
        """ Return all wildcard values determined from dynamic output. """
        combinations = set()
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                for f, w in self.expand_dynamic(f_):
                    combinations.add(tuple(w.items()))
        wildcards = defaultdict(list)
        for combination in combinations:
            for name, value in combination:
                wildcards[name].append(value)
        return wildcards

    @property
    def missing_input(self):
        """ Return missing input files. """
        # omit file if it comes from a subworkflow
        return set(f for f in self.input
                   if not f.exists and not f in self.subworkflow_input)

    @property
    def existing_remote_input(self):
        files = set()
        for f in self.input:
            if f.is_remote:
                if f.exists_remote:
                    files.add(f)
        return files

    @property
    def existing_remote_output(self):
        files = set()
        for f in self.remote_output:
            if f.exists_remote:
                files.add(f)
        return files

    @property
    def missing_remote_input(self):
        return self.remote_input - self.existing_remote_input

    @property
    def missing_remote_output(self):
        return self.remote_output - self.existing_remote_output

    @property
    def output_mintime(self):
        """ Return oldest output file. """
        existing = [f.mtime for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime)
        if existing:
            return min(existing)
        return None

    @property
    def output_mintime_local(self):
        existing = [f.mtime_local for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime_local)
        if existing:
            return min(existing)
        return None

    @property
    def input_maxtime(self):
        """ Return newest input file. """
        existing = [f.mtime for f in self.input if f.exists]
        if existing:
            return max(existing)
        return None

    def missing_output(self, requested=None):
        """ Return missing output files. """
        files = set()
        if self.benchmark and (requested is None
                               or self.benchmark in requested):
            if not self.benchmark.exists:
                files.add(self.benchmark)

        for f, f_ in zip(self.output, self.rule.output):
            if requested is None or f in requested:
                if f in self.dynamic_output:
                    if not self.expand_dynamic(f_):
                        files.add("{} (dynamic)".format(f_))
                elif not f.exists:
                    files.add(f)
        return files

    @property
    def local_input(self):
        for f in self.input:
            if not f.is_remote:
                yield f

    @property
    def local_output(self):
        for f in self.output:
            if not f.is_remote:
                yield f

    @property
    def remote_input(self):
        for f in self.input:
            if f.is_remote:
                yield f

    @property
    def remote_output(self):
        for f in self.output:
            if f.is_remote:
                yield f

    @property
    def remote_input_newer_than_local(self):
        files = set()
        for f in self.remote_input:
            if (f.exists_remote and f.exists_local) and (f.mtime > f.mtime_local):
                files.add(f)
        return files

    @property
    def remote_input_older_than_local(self):
        files = set()
        for f in self.remote_input:
            if (f.exists_remote and f.exists_local) and (f.mtime < f.mtime_local):
                files.add(f)
        return files

    @property
    def remote_output_newer_than_local(self):
        files = set()
        for f in self.remote_output:
            if (f.exists_remote and f.exists_local) and (f.mtime > f.mtime_local):
                files.add(f)
        return files

    @property
    def remote_output_older_than_local(self):
        files = set()
        for f in self.remote_output:
            if (f.exists_remote and f.exists_local) and (f.mtime < f.mtime_local):
                files.add(f)
        return files

    @property
    def files_to_download(self):
        toDownload = set()
        for f in self.input:
            if f.is_remote:
                if not f.exists_local and f.exists_remote:
                    toDownload.add(f)
        toDownload = toDownload | self.remote_input_newer_than_local
        return toDownload

    @property
    def files_to_upload(self):
        return self.missing_remote_input & self.remote_input_older_than_local

    @property
    def existing_output(self):
        return filter(lambda f: f.exists, self.expanded_output)

    def check_protected_output(self):
        protected = list(filter(lambda f: f.protected, self.expanded_output))
        if protected:
            raise ProtectedOutputException(self.rule, protected)

    def remove_existing_output(self):
        """Clean up both dynamic and regular output before rules actually run """
        if self.dynamic_output:
            for f, _ in chain(*map(self.expand_dynamic,
                                   self.rule.dynamic_output)):
                os.remove(f)

        for f, f_ in zip(self.output, self.rule.output):
            try:
                f.remove(remove_non_empty_dir=False)
            except FileNotFoundError:
                # No file == no problem
                pass

    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        Creates a shadow directory for the job if specified.
        """
        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()

        for f in self.files_to_download:
            f.download_from_remote()

        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

        self.remove_existing_output()

        if not self.is_shadow:
            return
        # Create shadow directory structure
        self.shadow_dir = tempfile.mkdtemp(
            dir=self.rule.workflow.persistence.shadow_path)
        cwd = os.getcwd()
        # Shallow simply symlink everything in the working directory.
        if self.rule.shadow_depth == "shallow":
            for source in os.listdir(cwd):
                link = os.path.join(self.shadow_dir, source)
                os.symlink(os.path.abspath(source), link)
        elif self.rule.shadow_depth == "full":
            snakemake_dir = os.path.join(cwd, ".snakemake")
            for dirpath, dirnames, filenames in os.walk(cwd):
                # Must exclude .snakemake and its children to avoid infinite
                # loop of symlinks.
                if os.path.commonprefix([snakemake_dir, dirpath]) == snakemake_dir:
                    continue
                for dirname in dirnames:
                    if dirname == ".snakemake":
                        continue
                    relative_source = os.path.relpath(
                        os.path.join(dirpath, dirname))
                    shadow = os.path.join(self.shadow_dir, relative_source)
                    os.mkdir(shadow)
                for filename in filenames:
                    source = os.path.join(dirpath, filename)
                    relative_source = os.path.relpath(source)
                    link = os.path.join(self.shadow_dir, relative_source)
                    os.symlink(source, link)

    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]
        to_remove.extend([f for f in self.remote_input if f.exists])
        to_remove.extend([f for f in self.remote_output if f.exists_local])
        if to_remove:
            logger.info("Removing output files of failed job {}"
                        " since they might be corrupted:\n{}".format(
                            self, ", ".join(to_remove)))
            for f in to_remove:
                f.remove()

        self.rmdir_empty_remote_dirs()

    @property
    def empty_remote_dirs(self):
        for f in (set(self.output) | set(self.input)):
            if f.is_remote:
                if os.path.exists(os.path.dirname(f)) and not len(
                        os.listdir(os.path.dirname(f))):
                    yield os.path.dirname(f)

    def rmdir_empty_remote_dirs(self):
        for d in self.empty_remote_dirs:
            try:
                os.removedirs(d)
            except:
                pass  # it's ok if we can't remove the leaf

    def format_wildcards(self, string, **variables):
        """ Format a string with variables from the job. """
        _variables = dict()
        _variables.update(self.rule.workflow.globals)
        _variables.update(dict(input=self.input,
                               output=self.output,
                               params=self.params,
                               wildcards=self._format_wildcards,
                               threads=self.threads,
                               resources=self.resources,
                               log=self.log,
                               version=self.rule.version,
                               rule=self.rule.name, ))
        _variables.update(variables)
        try:
            return format(string, **_variables)
        except NameError as ex:
            raise RuleException("NameError: " + str(ex), rule=self.rule)
        except IndexError as ex:
            raise RuleException("IndexError: " + str(ex), rule=self.rule)

    def properties(self, omit_resources="_cores _nodes".split(),
                   **aux_properties):
        resources = {
            name: res
            for name, res in self.resources.items()
            if name not in omit_resources
        }
        params = {name: value for name, value in self.params.items()}
        properties = {
            "rule": self.rule.name,
            "local": self.dag.workflow.is_local(self.rule),
            "input": self.input,
            "output": self.output,
            "params": params,
            "threads": self.threads,
            "resources": resources,
        }
        properties.update(aux_properties)
        return properties

    def __repr__(self):
        return self.rule.name

    def __eq__(self, other):
        if other is None:
            return False
        return (self.rule == other.rule and
                (self.dynamic_output
                 or self.wildcards_dict == other.wildcards_dict) and
                (self.dynamic_input or self.input == other.input))

    def __lt__(self, other):
        return self.rule.__lt__(other.rule)

    def __gt__(self, other):
        return self.rule.__gt__(other.rule)

    def __hash__(self):
        return self._hash

    def expand_dynamic(self, pattern):
        """ Expand dynamic files. """
        return list(listfiles(pattern,
                              restriction=self.wildcards,
                              omit_value=DYNAMIC_FILL))
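# --- Illustration (not from the source) ---
# A minimal standalone sketch of the "shallow" shadow strategy used in prepare()
# above: every top-level entry of the working directory is symlinked into a freshly
# created temporary shadow directory. The function name and prefix are hypothetical.
import os
import tempfile

def make_shallow_shadow(workdir):
    # create the shadow directory and mirror the working directory via symlinks
    shadow_dir = tempfile.mkdtemp(prefix="shadow_")
    for entry in os.listdir(workdir):
        os.symlink(os.path.abspath(os.path.join(workdir, entry)),
                   os.path.join(shadow_dir, entry))
    return shadow_dir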