def object_from_state(state, parms, frag_only):
    inputobj = {}
    for inp in parms:
        iid = inp["id"]
        if frag_only:
            (_, iid) = urlparse.urldefrag(iid)
            iid = iid.split(".")[-1]
        if "source" in inp:
            connections = aslist(inp["source"])
            for src in connections:
                if src in state and state[src] is not None:
                    if not match_types(
                            inp["type"], state[src], iid, inputobj,
                            inp.get("linkMerge",
                                    ("merge_nested" if len(connections) > 1 else None))):
                        raise WorkflowException(
                            "Type mismatch between source '%s' (%s) and sink '%s' (%s)"
                            % (src, state[src].parameter["type"], inp["id"], inp["type"]))
                elif src not in state:
                    raise WorkflowException(
                        "Connect source '%s' on parameter '%s' does not exist"
                        % (src, inp["id"]))
                else:
                    return None
        elif "default" in inp:
            inputobj[iid] = inp["default"]
        else:
            raise WorkflowException("Value for %s not specified" % (inp["id"]))
    return inputobj
def exeval(ex, jobinput, requirements, docpath, context, pull_image):
    for r in reversed(requirements):
        if r["class"] == "ExpressionEngineRequirement" and r["id"] == ex["engine"]:
            if r["id"][0] != "#":
                with open(os.path.join(docpath, r["id"])) as f:
                    ex_obj = yaml.load(f)
                sch = process.get_schema()
                validate.validate_ex(
                    sch.get_name("ExpressionEngineRequirement", ""), ex_obj)
                r = ex_obj

            runtime = []
            img_id = docker.get_from_requirements(
                r.get("requirements"), r.get("hints"), pull_image)
            if img_id:
                runtime = ["docker", "run", "-i", "--rm", img_id]

            exdefs = []
            for exdef in r.get("expressionDefs", []):
                if isinstance(exdef, dict) and "ref" in exdef:
                    with open(os.path.join(r["_docpath"], exdef["ref"])) as f:
                        exdefs.append(f.read())
                elif isinstance(exdef, basestring):
                    exdefs.append(exdef)

            inp = {
                "script": ex["script"],
                "expressionDefs": exdefs,
                "job": jobinput,
                "context": context
            }

            _logger.debug(json.dumps(inp))

            sp = subprocess.Popen(runtime + aslist(r["engineCommand"]),
                                  shell=False,
                                  close_fds=True,
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE)

            (stdoutdata, stderrdata) = sp.communicate(json.dumps(inp) + "\n\n")
            if sp.returncode != 0:
                raise WorkflowException(
                    "Expression engine returned non-zero exit code.")

            return json.loads(stdoutdata)

    raise WorkflowException("Unknown expression engine '%s'" % ex["engine"])
def _init_job(self, joborder, basedir, **kwargs):
    # Validate job order
    try:
        validate.validate_ex(
            self.names.get_name("input_record_schema", ""), joborder)
    except validate.ValidationException as v:
        _logger.error("Failed to validate %s\n%s" % (pprint.pformat(joborder), v))
        raise

    for r in self.tool.get("requirements", []):
        if r["class"] not in supportedProcessRequirements:
            raise WorkflowException("Unsupported process requirement %s" % (r["class"]))

    self.requirements = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])

    builder = Builder()
    builder.job = copy.deepcopy(joborder)
    builder.jslib = ''
    builder.basedir = basedir
    builder.files = []
    builder.bindings = []
    builder.schemaDefs = self.schemaDefs
    builder.docpath = self.docpath

    builder.bindings.extend(builder.bind_input(self.inputs_record_schema, builder.job))

    return builder
def adjust_for_scatter(self, steps):
    (scatterSpec, _) = self.get_requirement("ScatterFeatureRequirement")
    for step in steps:
        if scatterSpec and "scatter" in step.tool:
            inputparms = copy.deepcopy(step.tool["inputs"])
            outputparms = copy.deepcopy(step.tool["outputs"])
            scatter = aslist(step.tool["scatter"])

            inp_map = {i["id"]: i for i in inputparms}
            for s in scatter:
                if s not in inp_map:
                    raise WorkflowException("Invalid Scatter parameter '%s'" % s)
                inp_map[s]["type"] = {"type": "array", "items": inp_map[s]["type"]}

            if step.tool.get("scatterMethod") == "nested_crossproduct":
                nesting = len(scatter)
            else:
                nesting = 1

            for r in xrange(0, nesting):
                for i in outputparms:
                    i["type"] = {"type": "array", "items": i["type"]}

            step.tool["inputs"] = inputparms
            step.tool["outputs"] = outputparms
def dotproduct_scatter(process, joborder, basedir, scatter_keys, output_callback, **kwargs):
    l = None
    for s in scatter_keys:
        if l is None:
            l = len(joborder[s])
        elif l != len(joborder[s]):
            raise WorkflowException(
                "Length of input arrays must be equal when performing dotproduct scatter.")

    output = {}
    for i in process.tool["outputs"]:
        output[i["id"]] = [None] * l

    rc = ReceiveScatterOutput(output_callback, output)

    for n in range(0, l):
        jo = copy.copy(joborder)
        for s in scatter_keys:
            jo[s] = joborder[s][n]
        for j in process.job(jo, basedir,
                             functools.partial(rc.receive_scatter_output, n),
                             **kwargs):
            yield j

    rc.setTotal(l)
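# Illustrative sketch (hypothetical values, not part of the module): given
# scatter_keys ["#in1", "#in2"] and a joborder of {"#in1": [a0, a1],
# "#in2": [b0, b1]}, dotproduct_scatter pairs entries by index and yields the
# jobs for (a0, b0) and then (a1, b1). ReceiveScatterOutput gathers each
# per-index result back into the parallel output arrays before setTotal(l)
# marks the scatter as complete.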
def job(self, joborder, basedir, output_callback, **kwargs):
    # Validate job order
    validate.validate_ex(self.names.get_name("input_record_schema", ""), joborder)

    requirements = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    hints = kwargs.get("hints", []) + self.tool.get("hints", [])

    steps = [makeTool(step, basedir) for step in self.tool.get("steps", [])]
    random.shuffle(steps)

    self.state = {}
    for i in self.tool["inputs"]:
        iid = idk(i["id"])
        if iid in joborder:
            self.state[iid] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
        elif "default" in i:
            self.state[iid] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
        else:
            raise WorkflowException(
                "Input '%s' not in input object and does not have a default value."
                % (i["id"]))

    for s in steps:
        for out in s.tool["outputs"]:
            self.state[idk(out["id"])] = None
        s.completed = False

    completed = 0
    while completed < len(steps):
        made_progress = False
        completed = 0
        for step in steps:
            if step.completed:
                completed += 1
            else:
                for newjob in self.try_make_job(step, basedir,
                                                requirements=requirements,
                                                hints=hints, **kwargs):
                    if newjob:
                        made_progress = True
                        yield newjob
        if not made_progress and completed < len(steps):
            yield None

    wo = {}
    for i in self.tool["outputs"]:
        if "connect" in i:
            src = idk(i["connect"]["source"])
            wo[idk(i["id"])] = self.state[src].value

    output_callback(wo)
def tostr(self, value):
    if isinstance(value, dict) and value.get("class") == "File":
        if "path" not in value:
            raise WorkflowException("File object must have \"path\": %s" % (value))
        return value["path"]
    else:
        return str(value)
def __init__(self, toolpath_object, docpath):
    self.impl = toolpath_object["impl"]
    try:
        self.embedded_tool = makeTool(
            from_url(os.path.join(docpath, self.impl)), docpath)
    except validate.ValidationException as v:
        raise WorkflowException(
            "Tool definition %s failed validation:\n%s"
            % (os.path.join(docpath, self.impl), validate.indent(str(v))))

    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step_" + str(random.randint(1, 1000000000))

    for i in toolpath_object["inputs"]:
        d = i["def"][len(self.impl):]
        toolid = i.get("id", self.id + "." + idk(d))
        found = False
        for a in self.embedded_tool.tool["inputs"]:
            if a["id"] == d:
                i.update(a)
                found = True
        if not found:
            raise WorkflowException(
                "Did not find input '%s' in external process" % (i["def"]))
        i["id"] = toolid

    for i in toolpath_object["outputs"]:
        d = i["def"][len(self.impl):]
        toolid = i["id"]
        found = False
        for a in self.embedded_tool.tool["outputs"]:
            if a["id"] == d:
                i.update(a)
                found = True
        if not found:
            raise WorkflowException(
                "Did not find output '%s' in external process" % (i["def"]))
        i["id"] = toolid

    super(External, self).__init__(toolpath_object, "Process", docpath)
def receive_output(self, jobout):
    self.output = {}
    for i in self.tool["outputs"]:
        if i["def"][:len(self.impl)] != self.impl:
            raise WorkflowException(
                "'def' is '%s' but must refer to fragment of resource '%s' listed in 'impl'"
                % (i["def"], self.impl))
        d = idk(i["def"][len(self.impl):])
        self.output[idk(i["id"])] = jobout[d]
def __init__(self, toolpath_object, **kwargs):
    try:
        makeTool = kwargs.get("makeTool")
        self.embedded_tool = makeTool(toolpath_object["run"], **kwargs)
    except validate.ValidationException as v:
        raise WorkflowException(
            "Tool definition %s failed validation:\n%s"
            % (toolpath_object["run"]["id"], validate.indent(str(v))))

    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step_" + str(random.randint(1, 1000000000))

    for field in ("inputs", "outputs"):
        for i in toolpath_object[field]:
            inputid = i["id"]
            (_, d) = urlparse.urldefrag(inputid)
            frag = d.split(".")[-1]
            p = urlparse.urljoin(toolpath_object["run"].get("id", self.id), "#" + frag)
            found = False
            for a in self.embedded_tool.tool[field]:
                if a["id"] == p:
                    i.update(a)
                    found = True
            if not found:
                raise WorkflowException(
                    "Did not find %s parameter '%s' in workflow step" % (field, p))
            i["id"] = inputid

    super(WorkflowStep, self).__init__(toolpath_object, "Process",
                                       do_validate=False, **kwargs)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains embedded workflow but SubworkflowFeatureRequirement not declared")
def _init_job(self, joborder, input_basedir, **kwargs):
    builder = Builder()
    builder.job = copy.deepcopy(joborder)

    for i in self.tool["inputs"]:
        (_, d) = urlparse.urldefrag(i["id"])
        if d not in builder.job and "default" in i:
            builder.job[d] = i["default"]

    # Validate job order
    try:
        validate.validate_ex(
            self.names.get_name("input_record_schema", ""), builder.job)
    except validate.ValidationException as e:
        raise WorkflowException("Error validating input record, " + str(e))

    for r in self.requirements:
        if r["class"] not in supportedProcessRequirements:
            raise WorkflowException("Unsupported process requirement %s" % (r["class"]))

    builder.files = []
    builder.bindings = []
    builder.schemaDefs = self.schemaDefs
    builder.names = self.names
    builder.requirements = self.requirements

    dockerReq, _ = self.get_requirement("DockerRequirement")
    if dockerReq and kwargs.get("use_container"):
        builder.outdir = kwargs.get("docker_outdir") or "/tmp/job_output"
        builder.tmpdir = kwargs.get("docker_tmpdir") or "/tmp/job_tmp"
    else:
        builder.outdir = kwargs.get("outdir") or tempfile.mkdtemp()
        builder.tmpdir = kwargs.get("tmpdir") or tempfile.mkdtemp()

    builder.fs_access = kwargs.get("fs_access") or StdFsAccess(input_basedir)

    builder.bindings.extend(builder.bind_input(self.inputs_record_schema, builder.job))

    return builder
def receive_output(self, step, outputparms, jobout):
    _logger.info("Job got output: %s", jobout)
    for i in outputparms:
        if "id" in i:
            if idk(i["id"]) in jobout:
                self.state[idk(i["id"])] = WorkflowStateItem(i, jobout[idk(i["id"])])
            else:
                raise WorkflowException(
                    "Output is missing expected field %s" % idk(i["id"]))
    step.completed = True
def defaultMakeTool(toolpath_object, **kwargs):
    if "class" in toolpath_object:
        if toolpath_object["class"] == "CommandLineTool":
            return draft2tool.CommandLineTool(toolpath_object, **kwargs)
        elif toolpath_object["class"] == "ExpressionTool":
            return draft2tool.ExpressionTool(toolpath_object, **kwargs)
        elif toolpath_object["class"] == "Workflow":
            return Workflow(toolpath_object, **kwargs)

    raise WorkflowException(
        "Missing or invalid 'class' field in %s, expecting one of: CommandLineTool, ExpressionTool, Workflow"
        % toolpath_object["id"])
def try_make_job(self, step, basedir, **kwargs):
    inputparms = step.tool["inputs"]
    outputparms = step.tool["outputs"]

    try:
        inputobj = object_from_state(self.state, inputparms, False)
        if inputobj is None:
            _logger.debug("[workflow %s] job step %s not ready", id(self), step.id)
            return

        _logger.debug("[step %s] starting job step %s of workflow %s",
                      id(step), step.id, id(self))

        if step.submitted:
            return

        callback = functools.partial(self.receive_output, step, outputparms)

        if "scatter" in step.tool:
            scatter = aslist(step.tool["scatter"])
            method = step.tool.get("scatterMethod")
            if method is None and len(scatter) != 1:
                raise WorkflowException(
                    "Must specify scatterMethod when scattering over multiple inputs")

            if method == "dotproduct" or method is None:
                jobs = dotproduct_scatter(step, inputobj, basedir, scatter,
                                          callback, **kwargs)
            elif method == "nested_crossproduct":
                jobs = nested_crossproduct_scatter(step, inputobj, basedir, scatter,
                                                   callback, **kwargs)
            elif method == "flat_crossproduct":
                jobs = flat_crossproduct_scatter(step, inputobj, basedir, scatter,
                                                 callback, 0, **kwargs)
        else:
            jobs = step.job(inputobj, basedir, callback, **kwargs)

        step.submitted = True

        for j in jobs:
            yield j
    except Exception as e:
        _logger.exception("Unhandled exception")
        self.processStatus = "permanentFail"
        step.completed = True
def try_make_job(self, step, basedir, **kwargs):
    _logger.debug("Try to make job %s", step.id)

    inputparms = step.tool["inputs"]
    outputparms = step.tool["outputs"]

    try:
        inputobj = self.object_from_state(inputparms, False)
        if inputobj is None:
            return

        if step.submitted:
            return

        callback = functools.partial(self.receive_output, step, outputparms)

        (scatterSpec, _) = self.get_requirement("ScatterFeatureRequirement")
        if scatterSpec and "scatter" in step.tool:
            scatter = aslist(step.tool["scatter"])
            method = step.tool.get("scatterMethod")
            if method is None and len(scatter) != 1:
                raise WorkflowException(
                    "Must specify scatterMethod when scattering over multiple inputs")

            if method == "dotproduct" or method is None:
                jobs = dotproduct_scatter(step, inputobj, basedir, scatter,
                                          callback, **kwargs)
            elif method == "nested_crossproduct":
                jobs = nested_crossproduct_scatter(step, inputobj, basedir, scatter,
                                                   callback, **kwargs)
            elif method == "flat_crossproduct":
                jobs = flat_crossproduct_scatter(step, inputobj, basedir, scatter,
                                                 callback, 0, **kwargs)
        else:
            jobs = step.job(inputobj, basedir, callback, **kwargs)

        step.submitted = True

        for j in jobs:
            yield j
    except Exception as e:
        _logger.error(e)
        self.processStatus = "permanentFail"
        step.completed = True
def makeTool(toolpath_object, docpath):
    """docpath is the directory in which the tool file is located."""
    if "schema" in toolpath_object:
        return draft1tool.Tool(toolpath_object)
    elif "impl" in toolpath_object and toolpath_object.get("class", "External") == "External":
        return External(toolpath_object, docpath)
    if "class" in toolpath_object:
        if toolpath_object["class"] == "CommandLineTool":
            return draft2tool.CommandLineTool(toolpath_object, docpath)
        elif toolpath_object["class"] == "ExpressionTool":
            return draft2tool.ExpressionTool(toolpath_object, docpath)
        elif toolpath_object["class"] == "Workflow":
            return Workflow(toolpath_object, docpath)
    else:
        raise WorkflowException(
            "Missing 'class' field, expecting one of: Workflow, CommandLineTool, ExpressionTool, External")
def collect_output_ports(self, ports, builder, outdir):
    try:
        custom_output = os.path.join(outdir, "cwl.output.json")
        if builder.fs_access.exists(custom_output):
            # Read the file contents rather than parsing the path string itself.
            with builder.fs_access.open(custom_output, "r") as f:
                outputdoc = yaml.load(f)
            validate.validate_ex(
                self.names.get_name("outputs_record_schema", ""), outputdoc)
            return outputdoc

        ret = {}
        for port in ports:
            doc_url, fragment = urlparse.urldefrag(port['id'])
            ret[fragment] = self.collect_output(port, builder, outdir)
        validate.validate_ex(
            self.names.get_name("outputs_record_schema", ""), ret)
        return ret if ret is not None else {}
    except validate.ValidationException as e:
        raise WorkflowException(
            "Error validating output record, " + str(e) + "\n in " + json.dumps(ret, indent=4))
def match_types(self, sinktype, src, iid, inputobj, linkMerge):
    if isinstance(sinktype, list):
        # Sink is union type
        for st in sinktype:
            if self.match_types(st, src, iid, inputobj, linkMerge):
                return True
    elif isinstance(src.parameter["type"], list):
        # Source is union type
        # Check that every source type is compatible with the sink.
        for st in src.parameter["type"]:
            srccopy = copy.deepcopy(src)
            srccopy.parameter["type"] = st
            if not self.match_types(st, srccopy, iid, inputobj, linkMerge):
                return False
        return True
    else:
        is_array = isinstance(sinktype, dict) and sinktype["type"] == "array"
        if is_array and linkMerge:
            if iid not in inputobj:
                inputobj[iid] = []
            if linkMerge == "merge_nested":
                inputobj[iid].append(src.value)
            elif linkMerge == "merge_flattened":
                if isinstance(src.value, list):
                    inputobj[iid].extend(src.value)
                else:
                    inputobj[iid].append(src.value)
            else:
                raise WorkflowException(
                    "Unrecognized linkMerge enum '%s'" % linkMerge)
            return True
        elif src.parameter["type"] == sinktype:
            # simply assign the value from state to input
            inputobj[iid] = copy.deepcopy(src.value)
            return True
    return False
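# linkMerge behaviour, sketched with hypothetical values: for an array sink
# fed by several sources, object_from_state() above defaults linkMerge to
# "merge_nested" when more than one source is listed, so each source value
# becomes one element of the sink array. With "merge_flattened", a source
# value that is itself a list is extended into the sink array while a single
# value is appended, e.g. sources [f1, f2] and f3 produce [f1, f2, f3].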
def __init__(self, toolpath_object, pos, **kwargs):
    try:
        makeTool = kwargs.get("makeTool")
        self.embedded_tool = makeTool(toolpath_object["run"], **kwargs)
    except validate.ValidationException as v:
        raise WorkflowException(
            "Tool definition %s failed validation:\n%s"
            % (toolpath_object["run"]["id"], validate.indent(str(v))))

    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step" + str(pos)

    for field in ("inputs", "outputs"):
        for i in toolpath_object[field]:
            inputid = i["id"]
            (_, d) = urlparse.urldefrag(inputid)
            frag = d.split(".")[-1]
            p = urlparse.urljoin(toolpath_object["run"].get("id", self.id), "#" + frag)
            found = False
            for a in self.embedded_tool.tool[field]:
                if a["id"] == p:
                    i.update(a)
                    found = True
            if not found:
                raise WorkflowException(
                    "Did not find %s parameter '%s' in workflow step" % (field, p))
            i["id"] = inputid

    super(WorkflowStep, self).__init__(toolpath_object, "Process",
                                       do_validate=False, **kwargs)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains embedded workflow but SubworkflowFeatureRequirement not declared")

    if "scatter" in self.tool:
        (feature, _) = self.get_requirement("ScatterFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains scatter but ScatterFeatureRequirement not declared")

        inputparms = copy.deepcopy(self.tool["inputs"])
        outputparms = copy.deepcopy(self.tool["outputs"])
        scatter = aslist(self.tool["scatter"])

        method = self.tool.get("scatterMethod")
        if method is None and len(scatter) != 1:
            raise WorkflowException(
                "Must specify scatterMethod when scattering over multiple inputs")

        inp_map = {i["id"]: i for i in inputparms}
        for s in scatter:
            if s not in inp_map:
                raise WorkflowException("Invalid Scatter parameter '%s'" % s)
            inp_map[s]["type"] = {"type": "array", "items": inp_map[s]["type"]}

        if self.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(scatter)
        else:
            nesting = 1

        for r in xrange(0, nesting):
            for i in outputparms:
                i["type"] = {"type": "array", "items": i["type"]}

        self.tool["inputs"] = inputparms
        self.tool["outputs"] = outputparms
def job(self, joborder, basedir, output_callback, move_outputs=True, **kwargs):
    self.state = {}
    self.processStatus = "success"

    if "outdir" in kwargs:
        del kwargs["outdir"]

    for i in self.tool["inputs"]:
        (_, iid) = urlparse.urldefrag(i["id"])
        if iid in joborder:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
        elif "default" in i:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
        else:
            raise WorkflowException(
                "Input '%s' not in input object and does not have a default value."
                % (i["id"]))

    for s in self.steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None

    output_dirs = set()

    completed = 0
    while completed < len(self.steps) and self.processStatus == "success":
        made_progress = False
        completed = 0
        for step in self.steps:
            if step.completed:
                completed += 1
            else:
                for newjob in self.try_make_job(step, basedir, **kwargs):
                    if newjob:
                        made_progress = True
                        if newjob.outdir:
                            output_dirs.add(newjob.outdir)
                        yield newjob
        if not made_progress and completed < len(self.steps):
            yield None

    wo = object_from_state(self.state, self.tool["outputs"], True)

    if move_outputs:
        targets = set()
        conflicts = set()

        outfiles = findfiles(wo)

        for f in outfiles:
            for a in output_dirs:
                if f["path"].startswith(a):
                    src = f["path"]
                    dst = os.path.join(self.outdir, src[len(a) + 1:])
                    if dst in targets:
                        conflicts.add(dst)
                    else:
                        targets.add(dst)

        for f in outfiles:
            for a in output_dirs:
                if f["path"].startswith(a):
                    src = f["path"]
                    dst = os.path.join(self.outdir, src[len(a) + 1:])
                    if dst in conflicts:
                        sp = os.path.splitext(dst)
                        dst = "%s-%s%s" % (sp[0], str(random.randint(1, 1000000000)), sp[1])
                    dirname = os.path.dirname(dst)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    _logger.debug("[workflow %s] Moving '%s' to '%s'", id(self), src, dst)
                    shutil.move(src, dst)
                    f["path"] = dst

        for a in output_dirs:
            if os.path.exists(a) and empty_subtree(a):
                _logger.debug("[workflow %s] Removing intermediate output directory %s",
                              id(self), a)
                shutil.rmtree(a, True)

    _logger.info("[workflow %s] outdir is %s", id(self), self.outdir)
    output_callback(wo, self.processStatus)
def run(self, dry_run=False, pull_image=True, rm_container=True,
        rm_tmpdir=True, move_outputs=True, **kwargs):
    if not os.path.exists(self.outdir):
        os.makedirs(self.outdir)

    #with open(os.path.join(outdir, "cwl.input.json"), "w") as fp:
    #    json.dump(self.joborder, fp)

    runtime = []
    env = {"TMPDIR": self.tmpdir}

    (docker_req, docker_is_req) = get_feature(self, "DockerRequirement")

    for f in self.pathmapper.files():
        if not os.path.exists(self.pathmapper.mapper(f)[0]):
            raise WorkflowException(
                "Required input file %s not found" % self.pathmapper.mapper(f)[0])

    img_id = None
    if docker_req and kwargs.get("use_container") is not False:
        env = os.environ
        img_id = docker.get_from_requirements(docker_req, docker_is_req, pull_image)

    if docker_is_req and img_id is None:
        raise WorkflowException("Docker is required for running this tool.")

    if img_id:
        runtime = ["docker", "run", "-i"]
        for src in self.pathmapper.files():
            vol = self.pathmapper.mapper(src)
            runtime.append("--volume=%s:%s:ro" % vol)
        runtime.append("--volume=%s:%s:rw" % (os.path.abspath(self.outdir), "/tmp/job_output"))
        runtime.append("--volume=%s:%s:rw" % (os.path.abspath(self.tmpdir), "/tmp/job_tmp"))
        runtime.append("--workdir=%s" % ("/tmp/job_output"))
        euid = docker_vm_uid() or os.geteuid()
        runtime.append("--user=%s" % (euid))

        if rm_container:
            runtime.append("--rm")

        runtime.append("--env=TMPDIR=/tmp/job_tmp")

        for t, v in self.environment.items():
            runtime.append("--env=%s=%s" % (t, v))

        runtime.append(img_id)
    else:
        env = self.environment
        if not os.path.exists(self.tmpdir):
            os.makedirs(self.tmpdir)
        env["TMPDIR"] = self.tmpdir

    stdin = None
    stdout = None

    _logger.info("[job %s] %s$ %s%s%s",
                 id(self),
                 self.outdir,
                 " ".join([shellescape.quote(arg) if needs_shell_quoting(arg) else arg
                           for arg in (runtime + self.command_line)]),
                 ' < %s' % (self.stdin) if self.stdin else '',
                 ' > %s' % os.path.join(self.outdir, self.stdout) if self.stdout else '')

    if dry_run:
        return (self.outdir, {})

    outputs = {}

    try:
        for t in self.generatefiles:
            if isinstance(self.generatefiles[t], dict):
                os.symlink(self.generatefiles[t]["path"], os.path.join(self.outdir, t))
            else:
                with open(os.path.join(self.outdir, t), "w") as f:
                    f.write(self.generatefiles[t])

        if self.stdin:
            stdin = open(self.pathmapper.mapper(self.stdin)[0], "rb")
        else:
            stdin = subprocess.PIPE

        if self.stdout:
            absout = os.path.join(self.outdir, self.stdout)
            dn = os.path.dirname(absout)
            if dn and not os.path.exists(dn):
                os.makedirs(dn)
            stdout = open(absout, "wb")
        else:
            stdout = sys.stderr

        sp = subprocess.Popen(runtime + self.command_line,
                              shell=False,
                              close_fds=True,
                              stdin=stdin,
                              stdout=stdout,
                              env=env,
                              cwd=self.outdir)

        if stdin == subprocess.PIPE:
            sp.stdin.close()

        rcode = sp.wait()

        if stdin != subprocess.PIPE:
            stdin.close()

        if stdout is not sys.stderr:
            stdout.close()

        if self.successCodes and rcode in self.successCodes:
            processStatus = "success"
        elif self.temporaryFailCodes and rcode in self.temporaryFailCodes:
            processStatus = "temporaryFail"
        elif self.permanentFailCodes and rcode in self.permanentFailCodes:
            processStatus = "permanentFail"
        elif rcode == 0:
            processStatus = "success"
        else:
            processStatus = "permanentFail"

        for t in self.generatefiles:
            if isinstance(self.generatefiles[t], dict):
                os.remove(os.path.join(self.outdir, t))
                os.symlink(self.pathmapper.reversemap(self.generatefiles[t]["path"])[1],
                           os.path.join(self.outdir, t))

        outputs = self.collect_outputs(self.outdir)

    except OSError as e:
        if e.errno == 2:
            if runtime:
                _logger.error("'%s' not found", runtime[0])
            else:
                _logger.error("'%s' not found", self.command_line[0])
        else:
            _logger.exception("Exception while running job")
        processStatus = "permanentFail"
    except WorkflowException as e:
        _logger.error("Error while running job: %s" % e)
        processStatus = "permanentFail"
    except Exception as e:
        _logger.exception("Exception while running job")
        processStatus = "permanentFail"

    if processStatus != "success":
        _logger.warn("[job %s] completed %s", id(self), processStatus)
    else:
        _logger.debug("[job %s] completed %s", id(self), processStatus)
    _logger.debug("[job %s] %s", id(self), json.dumps(outputs, indent=4))

    self.output_callback(outputs, processStatus)

    if rm_tmpdir:
        _logger.debug("[job %s] Removing temporary directory %s", id(self), self.tmpdir)
        shutil.rmtree(self.tmpdir, True)

    if move_outputs and empty_subtree(self.outdir):
        _logger.debug("[job %s] Removing empty output directory %s", id(self), self.outdir)
        shutil.rmtree(self.outdir, True)
def collect_output(self, schema, builder, outdir):
    r = None
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        if "glob" in binding:
            r = []
            bg = builder.do_eval(binding["glob"])
            for gb in aslist(bg):
                r.extend([{"path": g, "class": "File"}
                          for g in builder.fs_access.glob(os.path.join(outdir, gb))])
            for files in r:
                checksum = hashlib.sha1()
                with builder.fs_access.open(files["path"], "rb") as f:
                    contents = f.read(CONTENT_LIMIT)
                    if binding.get("loadContents"):
                        files["contents"] = contents
                    filesize = 0
                    while contents != "":
                        checksum.update(contents)
                        filesize += len(contents)
                        contents = f.read(1024 * 1024)
                    files["checksum"] = "sha1$%s" % checksum.hexdigest()
                    files["size"] = filesize

        if "outputEval" in binding:
            r = builder.do_eval(binding["outputEval"], context=r)
            if schema["type"] == "File" and (not isinstance(r, dict) or "path" not in r):
                raise WorkflowException("Expression must return a file object.")

        if schema["type"] == "File":
            if not r:
                raise WorkflowException(
                    "No matches for output file with glob: {}.".format(binding["glob"]))
            if len(r) > 1:
                raise WorkflowException(
                    "Multiple matches for output item that is a single file.")
            r = r[0]

        if schema["type"] == "File" and "secondaryFiles" in binding:
            r["secondaryFiles"] = []
            for sf in aslist(binding["secondaryFiles"]):
                if isinstance(sf, dict):
                    sfpath = builder.do_eval(sf, context=r["path"])
                else:
                    sfpath = {"path": substitute(r["path"], sf), "class": "File"}
                if isinstance(sfpath, list):
                    r["secondaryFiles"].extend(sfpath)
                else:
                    r["secondaryFiles"].append(sfpath)

            for sf in r["secondaryFiles"]:
                if not builder.fs_access.exists(sf["path"]):
                    raise WorkflowException(
                        "Missing secondary file of '%s' of primary file '%s'"
                        % (sf["path"], r["path"]))

    if not r and schema["type"] == "record":
        r = {}
        for f in schema["fields"]:
            r[f["name"]] = self.collect_output(f, builder, outdir)

    return r
def job(self, joborder, basedir, output_callback, **kwargs):
    # Validate job order
    validate.validate_ex(self.names.get_name("input_record_schema", ""), joborder)

    self.adjust_for_scatter(self.steps)
    random.shuffle(self.steps)

    self.state = {}
    self.processStatus = "success"
    for i in self.tool["inputs"]:
        (_, iid) = urlparse.urldefrag(i["id"])
        if iid in joborder:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
        elif "default" in i:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
        else:
            raise WorkflowException(
                "Input '%s' not in input object and does not have a default value."
                % (i["id"]))

    for s in self.steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None
        s.submitted = False
        s.completed = False

    if "outdir" in kwargs:
        outdir = kwargs["outdir"]
        del kwargs["outdir"]
    else:
        outdir = tempfile.mkdtemp()

    actual_jobs = []

    completed = 0
    while completed < len(self.steps) and self.processStatus == "success":
        made_progress = False
        completed = 0
        for step in self.steps:
            if step.completed:
                completed += 1
            else:
                for newjob in self.try_make_job(step, basedir, **kwargs):
                    if newjob:
                        made_progress = True
                        actual_jobs.append(newjob)
                        yield newjob
        if not made_progress and completed < len(self.steps):
            yield None

    wo = self.object_from_state(self.tool["outputs"], True)

    if kwargs.get("move_outputs", True):
        targets = set()
        conflicts = set()

        for f in findfiles(wo):
            for a in actual_jobs:
                if a.outdir and f["path"].startswith(a.outdir):
                    src = f["path"]
                    dst = os.path.join(outdir, src[len(a.outdir) + 1:])
                    if dst in targets:
                        conflicts.add(dst)
                    else:
                        targets.add(dst)

        for f in findfiles(wo):
            for a in actual_jobs:
                if a.outdir and f["path"].startswith(a.outdir):
                    src = f["path"]
                    dst = os.path.join(outdir, src[len(a.outdir) + 1:])
                    if dst in conflicts:
                        sp = os.path.splitext(dst)
                        dst = "%s-%s%s" % (sp[0], str(random.randint(1, 1000000000)), sp[1])
                    dirname = os.path.dirname(dst)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    _logger.info("Moving '%s' to '%s'", src, dst)
                    shutil.move(src, dst)
                    f["path"] = dst

        for a in actual_jobs:
            if a.outdir:
                _logger.info("Removing intermediate output directory %s", a.outdir)
                shutil.rmtree(a.outdir, True)

    output_callback(wo, self.processStatus)
def idk(key):
    if len(key) <= 1:
        raise WorkflowException("Identifier is too short")
    if key[0] != '#':
        raise WorkflowException("Must start with #")
    return key[1:]
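# Example: idk("#step_1.out") returns "step_1.out"; a key shorter than two
# characters or one that does not start with '#' raises WorkflowException.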
def try_make_job(self, step, basedir, **kwargs):
    inputobj = {}

    if "scatter" in step.tool:
        if not self.check_feature("ScatterFeature", kwargs):
            raise WorkflowException("Must include ScatterFeature in requirements.")
        inputparms = copy.deepcopy(step.tool["inputs"])
        outputparms = copy.deepcopy(step.tool["outputs"])
        scatter = aslist(step.tool["scatter"])

        inp_map = {i["id"]: i for i in inputparms}
        for s in aslist(step.tool["scatter"]):
            if s not in inp_map:
                raise WorkflowException("Invalid Scatter parameter '%s'" % s)
            inp_map[s]["type"] = {"type": "array", "items": inp_map[s]["type"]}

        if step.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(aslist(step.tool["scatter"]))
        else:
            nesting = 1

        for r in xrange(0, nesting):
            for i in outputparms:
                i["type"] = {"type": "array", "items": i["type"]}
    else:
        inputparms = step.tool["inputs"]
        outputparms = step.tool["outputs"]

    for inp in inputparms:
        _logger.debug(inp)
        iid = idk(inp["id"])
        if "connect" in inp:
            connections = inp["connect"]
            is_array = isinstance(inp["type"], dict) and inp["type"]["type"] == "array"
            for connection in aslist(connections):
                src = idk(connection["source"])
                if src in self.state and self.state[src] is not None:
                    if self.state[src].parameter["type"] == inp["type"]:
                        # source and input types are the same
                        if is_array and iid in inputobj:
                            # there's already a value in the input object, so extend the existing array
                            inputobj[iid].extend(self.state[src].value)
                        else:
                            # simply assign the value from state to input
                            inputobj[iid] = copy.deepcopy(self.state[src].value)
                    elif is_array and self.state[src].parameter["type"] == inp["type"]["items"]:
                        # source type is the item type on the input array
                        # promote single item to array entry
                        if iid in inputobj:
                            inputobj[iid].append(self.state[src].value)
                        else:
                            inputobj[iid] = [self.state[src].value]
                    else:
                        raise WorkflowException(
                            "Type mismatch between '%s' (%s) and '%s' (%s)"
                            % (src, self.state[src].parameter["type"],
                               idk(inp["id"]), inp["type"]))
                elif src not in self.state:
                    raise WorkflowException(
                        "Connect source '%s' on parameter '%s' does not exist"
                        % (src, inp["id"]))
                else:
                    return
        elif "default" in inp:
            inputobj[iid] = inp["default"]
        else:
            raise WorkflowException("Value for %s not specified" % (inp["id"]))

    _logger.info("Creating job with input: %s", inputobj)

    callback = functools.partial(self.receive_output, step, outputparms)

    if step.tool.get("scatter"):
        method = step.tool.get("scatterMethod")
        if method is None and len(aslist(step.tool["scatter"])) != 1:
            raise WorkflowException(
                "Must specify scatterMethod when scattering over multiple inputs")

        if method == "dotproduct" or method is None:
            jobs = dotproduct_scatter(step, inputobj, basedir,
                                      aslist(step.tool["scatter"]), callback, **kwargs)
        elif method == "nested_crossproduct":
            jobs = nested_crossproduct_scatter(step, inputobj, basedir,
                                               aslist(step.tool["scatter"]), callback, **kwargs)
        elif method == "flat_crossproduct":
            jobs = flat_crossproduct_scatter(step, inputobj, basedir,
                                             aslist(step.tool["scatter"]), callback, 0, **kwargs)
    else:
        jobs = step.job(inputobj, basedir, callback, **kwargs)

    for j in jobs:
        yield j
def exeval(ex, jobinput, requirements, outdir, tmpdir, context, pull_image):
    if ex["engine"] == "cwl:JsonPointer":
        try:
            obj = {"job": jobinput, "context": context, "outdir": outdir, "tmpdir": tmpdir}
            return avro_ld.ref_resolver.resolve_json_pointer(obj, ex["script"])
        except ValueError as v:
            raise WorkflowException("%s in %s" % (v, obj))

    for r in reversed(requirements):
        if r["class"] == "ExpressionEngineRequirement" and r["id"] == ex["engine"]:
            runtime = []

            class DR(object):
                pass
            dr = DR()
            dr.requirements = r.get("requirements", [])
            dr.hints = r.get("hints", [])

            (docker_req, docker_is_req) = process.get_feature(dr, "DockerRequirement")

            if docker_req:
                img_id = docker.get_from_requirements(docker_req, docker_is_req, pull_image)
                if img_id:
                    runtime = ["docker", "run", "-i", "--rm", img_id]

            exdefs = []
            for exdef in r.get("engineConfig", []):
                if isinstance(exdef, dict) and "ref" in exdef:
                    with open(exdef["ref"][7:]) as f:
                        exdefs.append(f.read())
                elif isinstance(exdef, basestring):
                    exdefs.append(exdef)

            inp = {
                "script": ex["script"],
                "engineConfig": exdefs,
                "job": jobinput,
                "context": context,
                "outdir": outdir,
                "tmpdir": tmpdir,
            }

            _logger.debug("Invoking expression engine %s with %s",
                          runtime + aslist(r["engineCommand"]),
                          json.dumps(inp, indent=4))

            sp = subprocess.Popen(runtime + aslist(r["engineCommand"]),
                                  shell=False,
                                  close_fds=True,
                                  stdin=subprocess.PIPE,
                                  stdout=subprocess.PIPE)

            (stdoutdata, stderrdata) = sp.communicate(json.dumps(inp) + "\n\n")
            if sp.returncode != 0:
                raise WorkflowException(
                    "Expression engine returned non-zero exit code on evaluation of\n%s"
                    % json.dumps(inp, indent=4))

            return json.loads(stdoutdata)

    raise WorkflowException("Unknown expression engine '%s'" % ex["engine"])