def object_from_state(state, parms, frag_only, supportsMultipleInput):
    inputobj = {}
    for inp in parms:
        iid = inp["id"]
        if frag_only:
            iid = shortname(iid)
        if "source" in inp:
            if isinstance(inp["source"], list) and not supportsMultipleInput:
                raise WorkflowException("Workflow contains multiple inbound links to a single parameter but MultipleInputFeatureRequirement is not declared.")
            connections = aslist(inp["source"])
            for src in connections:
                if src in state and state[src] is not None:
                    if not match_types(inp["type"], state[src], iid, inputobj,
                                       inp.get("linkMerge", ("merge_nested" if len(connections) > 1 else None)),
                                       valueFrom=inp.get("valueFrom")):
                        raise WorkflowException("Type mismatch between source '%s' (%s) and sink '%s' (%s)"
                                                % (src, state[src].parameter["type"], inp["id"], inp["type"]))
                elif src not in state:
                    raise WorkflowException("Connect source '%s' on parameter '%s' does not exist"
                                            % (src, inp["id"]))
                else:
                    return None
        elif "default" in inp:
            inputobj[iid] = inp["default"]
        elif "valueFrom" in inp:
            inputobj[iid] = None
        else:
            raise WorkflowException("Value for %s not specified" % (inp["id"]))
    return inputobj
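# Illustrative sketch only (not part of the original source): minimal stand-ins
# for two helpers used throughout these snippets, written from how their call
# sites use them. The real implementations live in cwltool / schema-salad and
# may differ in detail.

def shortname(inputid):
    # Assumed behaviour: reduce a fully qualified id such as
    # "file:///wf.cwl#step1/reads" to its trailing fragment name ("reads").
    fragment = inputid.split("#")[-1]
    return fragment.split("/")[-1]

def aslist(l):
    # Assumed behaviour: wrap a scalar in a list; pass lists through unchanged.
    return l if isinstance(l, list) else [l]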
def __init__(self, step):
    self.step = step
    self.tool = step.tool
    self.id = step.id
    self.submitted = False
    self.completed = False
    self.name = uniquename("step %s" % shortname(self.id))
def generate_parser(toolparser, tool, namemap):
    toolparser.add_argument("job_order", nargs="?", help="Job input json file")
    namemap["job_order"] = "job_order"

    for inp in tool.tool["inputs"]:
        name = shortname(inp["id"])
        if len(name) == 1:
            flag = "-"
        else:
            flag = "--"
        namemap[name.replace("-", "_")] = name

        inptype = inp["type"]

        required = True
        if isinstance(inptype, list):
            if inptype[0] == "null":
                required = False
                if len(inptype) == 2:
                    inptype = inptype[1]
                else:
                    _logger.debug("Can't make command line argument from %s", inptype)
                    return None

        help = inp.get("description", "").replace("%", "%%")
        kwargs = {}

        if inptype == "File":
            kwargs["action"] = FileAction
        elif isinstance(inptype, dict) and inptype["type"] == "array":
            if inptype["items"] == "File":
                kwargs["action"] = FileAppendAction
            else:
                kwargs["action"] = "append"

        if inptype == "string":
            kwargs["type"] = str
        elif inptype == "int":
            kwargs["type"] = int
        elif inptype == "float":
            kwargs["type"] = float
        elif inptype == "boolean":
            kwargs["action"] = "store_true"

        if "default" in inp:
            kwargs["default"] = inp["default"]
            required = False

        if "type" not in kwargs and "action" not in kwargs:
            _logger.debug("Can't make command line argument from %s", inptype)
            return None

        toolparser.add_argument(flag + name, required=required, help=help, **kwargs)

    return toolparser
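# Hypothetical usage sketch (not from the original source): `mytool` stands in
# for a loaded CommandLineTool object with, e.g., a "threads" int input. This
# mirrors how load_job_order() below drives the generated parser.
namemap = {}
toolparser = generate_parser(argparse.ArgumentParser(prog="mytool"), mytool, namemap)
if toolparser:
    # Flags mirror the tool's input ids; "job_order" captures an optional job file.
    cmd_line = vars(toolparser.parse_args(["--threads", "4", "job.json"]))
    # namemap translates argparse-safe names back to the original input names.
    job_order = {namemap[k]: v for k, v in cmd_line.items()}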
def collect_output_ports(self, ports, builder, outdir):
    try:
        ret = {}
        custom_output = os.path.join(outdir, "cwl.output.json")
        if builder.fs_access.exists(custom_output):
            with builder.fs_access.open(custom_output, "r") as f:
                ret = yaml.load(f)
            _logger.debug("Raw output from %s: %s", custom_output, json.dumps(ret, indent=4))
            adjustFileObjs(ret, remove_hostfs)
            adjustFileObjs(ret, functools.partial(revmap_file, builder, outdir))
            adjustFileObjs(ret, remove_hostfs)
            validate.validate_ex(self.names.get_name("outputs_record_schema", ""), ret)
            return ret

        for port in ports:
            fragment = shortname(port["id"])
            try:
                ret[fragment] = self.collect_output(port, builder, outdir)
            except Exception as e:
                raise WorkflowException("Error collecting output for parameter '%s': %s"
                                        % (shortname(port["id"]), e))
        if ret:
            adjustFileObjs(ret, remove_hostfs)
            validate.validate_ex(self.names.get_name("outputs_record_schema", ""), ret)
        return ret if ret is not None else {}
    except validate.ValidationException as e:
        raise WorkflowException("Error validating output record, " + str(e) + "\n in " + json.dumps(ret, indent=4))
def receive_output(self, output_callback, jobout, processStatus):
    #_logger.debug("WorkflowStep output from run is %s", jobout)
    output = {}
    for i in self.tool["outputs"]:
        field = shortname(i["id"])
        if field in jobout:
            output[i["id"]] = jobout[field]
        else:
            processStatus = "permanentFail"
    output_callback(output, processStatus)
def job(self, joborder, basedir, output_callback, **kwargs):
    for i in self.tool["inputs"]:
        p = i["id"]
        field = shortname(p)
        joborder[field] = joborder[i["id"]]
        del joborder[i["id"]]

    kwargs["requirements"] = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    kwargs["hints"] = kwargs.get("hints", []) + self.tool.get("hints", [])

    for t in self.embedded_tool.job(joborder, basedir,
                                    functools.partial(self.receive_output, output_callback),
                                    **kwargs):
        yield t
def __init__(self, workflow, **kwargs):
    self.workflow = workflow
    self.tool = workflow.tool
    self.steps = [WorkflowJobStep(s) for s in workflow.steps]
    self.id = workflow.tool["id"]
    if "outdir" in kwargs:
        self.outdir = kwargs["outdir"]
    elif "tmp_outdir_prefix" in kwargs:
        self.outdir = tempfile.mkdtemp(prefix=kwargs["tmp_outdir_prefix"])
    else:
        # tmp_outdir_prefix defaults to tmp, so this is unlikely to be used
        self.outdir = tempfile.mkdtemp()

    self.name = uniquename(kwargs.get("name", shortname(self.workflow.tool["id"])))

    _logger.debug("[workflow %s] initialized from %s", self.name, self.tool["id"])
def collect_output_ports(self, ports, builder, outdir):
    try:
        custom_output = os.path.join(outdir, "cwl.output.json")
        if builder.fs_access.exists(custom_output):
            outputdoc = yaml.load(custom_output)
            validate.validate_ex(self.names.get_name("outputs_record_schema", ""), outputdoc)
            return outputdoc

        ret = {}
        for port in ports:
            fragment = shortname(port["id"])
            ret[fragment] = self.collect_output(port, builder, outdir)
        validate.validate_ex(self.names.get_name("outputs_record_schema", ""), ret)
        return ret if ret is not None else {}
    except validate.ValidationException as e:
        raise WorkflowException("Error validating output record, " + str(e) + "\n in " + json.dumps(ret, indent=4))
def job(self, joborder, basedir, output_callback, **kwargs):
    for i in self.tool["inputs"]:
        p = i["id"]
        field = shortname(p)
        joborder[field] = joborder[i["id"]]
        del joborder[i["id"]]

    kwargs["requirements"] = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    kwargs["hints"] = kwargs.get("hints", []) + self.tool.get("hints", [])

    try:
        for t in self.embedded_tool.job(joborder, basedir,
                                        functools.partial(self.receive_output, output_callback),
                                        **kwargs):
            yield t
    except WorkflowException:
        raise
    except Exception as e:
        _logger.exception("Unexpected exception")
        raise WorkflowException(str(e))
def job(self, joborder, basedir, output_callback, **kwargs):
    for i in self.tool["inputs"]:
        p = i["id"]
        field = shortname(p)
        joborder[field] = joborder[i["id"]]
        del joborder[i["id"]]

    kwargs["requirements"] = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    kwargs["hints"] = kwargs.get("hints", []) + self.tool.get("hints", [])

    try:
        for t in self.embedded_tool.job(joborder, basedir,
                                        functools.partial(self.receive_output, output_callback),
                                        **kwargs):
            yield t
    except WorkflowException:
        _logger.error("Exception on step '%s'", kwargs.get("name"))
        raise
    except Exception as e:
        _logger.exception("Unexpected exception")
        raise WorkflowException(str(e))
def collect_output_ports(self, ports, builder, outdir):
    try:
        custom_output = os.path.join(outdir, "cwl.output.json")
        if builder.fs_access.exists(custom_output):
            outputdoc = yaml.load(custom_output)
            validate.validate_ex(self.names.get_name("outputs_record_schema", ""), outputdoc)
            return outputdoc

        ret = {}
        for port in ports:
            fragment = shortname(port["id"])
            ret[fragment] = self.collect_output(port, builder, outdir)
        validate.validate_ex(self.names.get_name("outputs_record_schema", ""), ret)
        return ret if ret is not None else {}
    except validate.ValidationException as e:
        raise WorkflowException("Error validating output record, " + str(e) + "\n in " + json.dumps(ret, indent=4))
def __init__(self, toolpath_object, pos, **kwargs):
    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step" + str(pos)

    try:
        makeTool = kwargs.get("makeTool")
        runobj = None
        if isinstance(toolpath_object["run"], basestring):
            runobj, _ = schema_salad.schema.load_and_validate(kwargs["loader"],
                                                              kwargs["avsc_names"],
                                                              toolpath_object["run"],
                                                              True)
        else:
            runobj = toolpath_object["run"]
        self.embedded_tool = makeTool(runobj, **kwargs)
    except validate.ValidationException as v:
        raise WorkflowException("Tool definition %s failed validation:\n%s"
                                % (toolpath_object["run"], validate.indent(str(v))))

    for field in ("inputs", "outputs"):
        for i in toolpath_object[field]:
            inputid = i["id"]
            p = shortname(inputid)
            found = False
            for a in self.embedded_tool.tool[field]:
                frag = shortname(a["id"])
                if frag == p:
                    i.update(a)
                    found = True
            if not found:
                i["type"] = "Any"
                #raise WorkflowException("Parameter '%s' of %s in workflow step %s does not correspond to parameter in %s" % (p, field, self.id, self.embedded_tool.tool.get("id")))
            i["id"] = inputid

    super(WorkflowStep, self).__init__(toolpath_object, **kwargs)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException("Workflow contains embedded workflow but SubworkflowFeatureRequirement not in requirements")

    if "scatter" in self.tool:
        (feature, _) = self.get_requirement("ScatterFeatureRequirement")
        if not feature:
            raise WorkflowException("Workflow contains scatter but ScatterFeatureRequirement not in requirements")

        inputparms = copy.deepcopy(self.tool["inputs"])
        outputparms = copy.deepcopy(self.tool["outputs"])
        scatter = aslist(self.tool["scatter"])

        method = self.tool.get("scatterMethod")
        if method is None and len(scatter) != 1:
            raise WorkflowException("Must specify scatterMethod when scattering over multiple inputs")

        inp_map = {i["id"]: i for i in inputparms}
        for s in scatter:
            if s not in inp_map:
                raise WorkflowException("Invalid Scatter parameter '%s'" % s)

            inp_map[s]["type"] = {"type": "array", "items": inp_map[s]["type"]}

        if self.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(scatter)
        else:
            nesting = 1

        for r in xrange(0, nesting):
            for i in outputparms:
                i["type"] = {"type": "array", "items": i["type"]}
        self.tool["inputs"] = inputparms
        self.tool["outputs"] = outputparms
def __init__(self, toolpath_object, pos, **kwargs):
    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step" + str(pos)

    try:
        makeTool = kwargs.get("makeTool")
        runobj = None
        if isinstance(toolpath_object["run"], basestring):
            runobj, _ = schema_salad.schema.load_and_validate(kwargs["loader"],
                                                              kwargs["avsc_names"],
                                                              toolpath_object["run"],
                                                              True)
        else:
            runobj = toolpath_object["run"]
        self.embedded_tool = makeTool(runobj, **kwargs)
    except validate.ValidationException as v:
        raise WorkflowException("Tool definition %s failed validation:\n%s"
                                % (toolpath_object["run"], validate.indent(str(v))))

    for field in ("inputs", "outputs"):
        for i in toolpath_object[field]:
            inputid = i["id"]
            p = shortname(inputid)
            found = False
            for a in self.embedded_tool.tool[field]:
                frag = shortname(a["id"])
                if frag == p:
                    i.update(a)
                    found = True
            if not found:
                i["type"] = "Any"
                #raise WorkflowException("Parameter '%s' of %s in workflow step %s does not correspond to parameter in %s" % (p, field, self.id, self.embedded_tool.tool.get("id")))
            i["id"] = inputid

    super(WorkflowStep, self).__init__(toolpath_object, **kwargs)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException("Workflow contains embedded workflow but SubworkflowFeatureRequirement not in requirements")

    if "scatter" in self.tool:
        (feature, _) = self.get_requirement("ScatterFeatureRequirement")
        if not feature:
            raise WorkflowException("Workflow contains scatter but ScatterFeatureRequirement not in requirements")

        inputparms = copy.deepcopy(self.tool["inputs"])
        outputparms = copy.deepcopy(self.tool["outputs"])
        scatter = aslist(self.tool["scatter"])

        method = self.tool.get("scatterMethod")
        if method is None and len(scatter) != 1:
            raise WorkflowException("Must specify scatterMethod when scattering over multiple inputs")

        inp_map = {i["id"]: i for i in inputparms}
        for s in scatter:
            if s not in inp_map:
                raise WorkflowException("Invalid Scatter parameter '%s'" % s)

            inp_map[s]["type"] = {"type": "array", "items": inp_map[s]["type"]}

        if self.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(scatter)
        else:
            nesting = 1

        for r in xrange(0, nesting):
            for i in outputparms:
                i["type"] = {"type": "array", "items": i["type"]}
        self.tool["inputs"] = inputparms
        self.tool["outputs"] = outputparms
def collect_output(self, schema, builder, outdir):
    r = None
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        globpatterns = []

        revmap = functools.partial(revmap_file, builder, outdir)

        if "glob" in binding:
            r = []
            for gb in aslist(binding["glob"]):
                gb = builder.do_eval(gb)
                if gb:
                    globpatterns.extend(aslist(gb))

            for gb in globpatterns:
                if gb.startswith("/"):
                    # Was "WorkflowError" (undefined name); WorkflowException is
                    # the exception type used everywhere else in this code.
                    raise WorkflowException("glob patterns must not start with '/'")
                try:
                    r.extend([{"path": g, "class": "File", "hostfs": True}
                              for g in builder.fs_access.glob(os.path.join(outdir, gb))])
                except (OSError, IOError) as e:
                    _logger.warn(str(e))

            for files in r:
                checksum = hashlib.sha1()
                with builder.fs_access.open(files["path"], "rb") as f:
                    contents = f.read(CONTENT_LIMIT)
                    if binding.get("loadContents"):
                        files["contents"] = contents
                    filesize = 0
                    while contents != "":
                        checksum.update(contents)
                        filesize += len(contents)
                        contents = f.read(1024*1024)
                files["checksum"] = "sha1$%s" % checksum.hexdigest()
                files["size"] = filesize
                if "format" in schema:
                    files["format"] = builder.do_eval(schema["format"], context=files)

        optional = False
        singlefile = False
        if isinstance(schema["type"], list):
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"]:
                singlefile = True
        elif schema["type"] == "File":
            singlefile = True

        if "outputEval" in binding:
            r = builder.do_eval(binding["outputEval"], context=r)
            if singlefile:
                # Handle single file outputs not wrapped in a list
                if r is not None and not isinstance(r, (list, tuple)):
                    r = [r]
                if optional and r is None:
                    pass
                elif (r is None or len(r) != 1 or not isinstance(r[0], dict) or "path" not in r[0]):
                    raise WorkflowException("Expression must return a file object for %s." % schema["id"])

        if singlefile:
            if not r and not optional:
                raise WorkflowException("Did not find output file with glob pattern: '{}'".format(globpatterns))
            elif not r and optional:
                pass
            elif isinstance(r, list):
                if len(r) > 1:
                    raise WorkflowException("Multiple matches for output item that is a single file.")
                else:
                    r = r[0]

        # Ensure files point to local references outside of the run environment
        adjustFileObjs(r, revmap)

        if "secondaryFiles" in schema:
            for primary in aslist(r):
                if isinstance(primary, dict):
                    primary["secondaryFiles"] = []
                    for sf in aslist(schema["secondaryFiles"]):
                        if isinstance(sf, dict) or "$(" in sf or "${" in sf:
                            sfpath = builder.do_eval(sf, context=r)
                            if isinstance(sfpath, basestring):
                                sfpath = revmap({"path": sfpath, "class": "File"})
                        else:
                            sfpath = {"path": substitute(primary["path"], sf), "class": "File", "hostfs": True}
                        for sfitem in aslist(sfpath):
                            if builder.fs_access.exists(sfitem["path"]):
                                primary["secondaryFiles"].append(sfitem)

        if not r and optional:
            r = None

    if not r and isinstance(schema["type"], dict) and schema["type"]["type"] == "record":
        r = {}
        for f in schema["type"]["fields"]:
            r[shortname(f["name"])] = self.collect_output(f, builder, outdir)

    return r
def try_make_job(self, step, basedir, **kwargs):
    inputparms = step.tool["inputs"]
    outputparms = step.tool["outputs"]

    supportsMultipleInput = bool(self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])

    try:
        inputobj = object_from_state(self.state, inputparms, False, supportsMultipleInput)
        if inputobj is None:
            _logger.debug("[workflow %s] job step %s not ready", self.name, step.id)
            return

        _logger.debug("[step %s] starting job step %s of workflow %s", id(step), step.id, id(self))

        if step.submitted:
            return

        callback = functools.partial(self.receive_output, step, outputparms)

        valueFrom = {i["id"]: i["valueFrom"] for i in step.tool["inputs"] if "valueFrom" in i}

        if len(valueFrom) > 0 and not bool(self.workflow.get_requirement("StepInputExpressionRequirement")[0]):
            raise WorkflowException("Workflow step contains valueFrom but StepInputExpressionRequirement not in requirements")

        vfinputs = {shortname(k): v for k, v in inputobj.iteritems()}

        def valueFromFunc(k, v):
            if k in valueFrom:
                return expression.do_eval(valueFrom[k], vfinputs, self.workflow.requirements,
                                          None, None, {}, context=v)
            else:
                return v

        if "scatter" in step.tool:
            scatter = aslist(step.tool["scatter"])
            method = step.tool.get("scatterMethod")
            if method is None and len(scatter) != 1:
                raise WorkflowException("Must specify scatterMethod when scattering over multiple inputs")

            if "valueFrom" not in kwargs:
                kwargs["valueFrom"] = valueFromFunc

            if method == "dotproduct" or method is None:
                jobs = dotproduct_scatter(step, inputobj, basedir, scatter, callback, **kwargs)
            elif method == "nested_crossproduct":
                jobs = nested_crossproduct_scatter(step, inputobj, basedir, scatter, callback, **kwargs)
            elif method == "flat_crossproduct":
                jobs = flat_crossproduct_scatter(step, inputobj, basedir, scatter, callback, 0, **kwargs)
        else:
            _logger.debug("[workflow %s] Job is input %s", self.name, json.dumps(inputobj, indent=4))
            inputobj = {k: valueFromFunc(k, v) for k, v in inputobj.items()}
            _logger.debug("[workflow %s] Evaluated job input to %s", self.name, json.dumps(inputobj, indent=4))
            jobs = step.job(inputobj, basedir, callback, **kwargs)

        step.submitted = True

        for j in jobs:
            yield j
    except WorkflowException:
        raise
    except Exception as e:
        _logger.exception("Unhandled exception")
        self.processStatus = "permanentFail"
        step.completed = True
def job(self, joborder, basedir, output_callback, move_outputs=True, **kwargs):
    self.state = {}
    self.processStatus = "success"

    if "outdir" in kwargs:
        del kwargs["outdir"]

    for i in self.tool["inputs"]:
        iid = shortname(i["id"])
        if iid in joborder:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
        elif "default" in i:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
        else:
            raise WorkflowException("Input '%s' not in input object and does not have a default value." % (i["id"]))

    for s in self.steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None

    output_dirs = set()

    completed = 0
    while completed < len(self.steps) and self.processStatus == "success":
        made_progress = False
        completed = 0
        for step in self.steps:
            if step.completed:
                completed += 1
            else:
                for newjob in self.try_make_job(step, basedir, **kwargs):
                    if newjob:
                        made_progress = True
                        if newjob.outdir:
                            output_dirs.add(newjob.outdir)
                    yield newjob
        if not made_progress and completed < len(self.steps):
            yield None

    supportsMultipleInput = bool(self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])

    wo = object_from_state(self.state, self.tool["outputs"], True, supportsMultipleInput)

    if wo is None:
        raise WorkflowException("Output for workflow not available")

    if move_outputs:
        targets = set()
        conflicts = set()

        outfiles = findfiles(wo)

        for f in outfiles:
            for a in output_dirs:
                if f["path"].startswith(a):
                    src = f["path"]
                    dst = os.path.join(self.outdir, src[len(a) + 1:])
                    if dst in targets:
                        conflicts.add(dst)
                    else:
                        targets.add(dst)

        for f in outfiles:
            for a in output_dirs:
                if f["path"].startswith(a):
                    src = f["path"]
                    dst = os.path.join(self.outdir, src[len(a) + 1:])
                    if dst in conflicts:
                        sp = os.path.splitext(dst)
                        dst = "%s-%s%s" % (sp[0], str(random.randint(1, 1000000000)), sp[1])
                    dirname = os.path.dirname(dst)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    _logger.debug("[workflow %s] Moving '%s' to '%s'", self.name, src, dst)
                    shutil.move(src, dst)
                    f["path"] = dst

        for a in output_dirs:
            if os.path.exists(a) and empty_subtree(a):
                if kwargs.get("rm_tmpdir", True):
                    _logger.debug("[workflow %s] Removing intermediate output directory %s", self.name, a)
                    shutil.rmtree(a, True)

    _logger.info("[workflow %s] outdir is %s", self.name, self.outdir)

    output_callback(wo, self.processStatus)
def load_tool(argsworkflow, updateonly, strict, makeTool, debug,
              print_pre=False,
              print_rdf=False,
              print_dot=False,
              print_deps=False,
              relative_deps=False,
              rdf_serializer=None,
              stdout=sys.stdout,
              urifrag=None):
    (document_loader, avsc_names, schema_metadata) = process.get_schema()

    if isinstance(avsc_names, Exception):
        raise avsc_names

    jobobj = None
    if isinstance(argsworkflow, basestring):
        split = urlparse.urlsplit(argsworkflow)
        if split.scheme:
            uri = argsworkflow
        else:
            uri = "file://" + os.path.abspath(argsworkflow)
        fileuri, urifrag = urlparse.urldefrag(uri)
        workflowobj = document_loader.fetch(fileuri)
    elif isinstance(argsworkflow, dict):
        workflowobj = argsworkflow
        uri = urifrag
        fileuri = "#"
    else:
        raise schema_salad.validate.ValidationException("Must be URI or dict")

    if "cwl:tool" in workflowobj:
        jobobj = workflowobj
        uri = urlparse.urljoin(uri, jobobj["cwl:tool"])
        fileuri, urifrag = urlparse.urldefrag(uri)
        workflowobj = document_loader.fetch(fileuri)
        del jobobj["cwl:tool"]

    if isinstance(workflowobj, list):
        # bare list without a version must be treated as draft-2
        workflowobj = {"cwlVersion": "https://w3id.org/cwl/cwl#draft-2",
                       "id": fileuri,
                       "@graph": workflowobj}

    workflowobj = update.update(workflowobj, document_loader, fileuri)
    document_loader.idx.clear()

    if updateonly:
        stdout.write(json.dumps(workflowobj, indent=4))
        return 0

    if print_deps:
        printdeps(workflowobj, document_loader, stdout, relative_deps)
        return 0

    try:
        processobj, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, workflowobj, strict)
    except (schema_salad.validate.ValidationException, RuntimeError) as e:
        _logger.error("Tool definition failed validation:\n%s", e, exc_info=(e if debug else False))
        return 1

    if print_pre:
        stdout.write(json.dumps(processobj, indent=4))
        return 0

    if print_rdf:
        printrdf(argsworkflow, processobj, document_loader.ctx, rdf_serializer, stdout)
        return 0

    if print_dot:
        printdot(argsworkflow, processobj, document_loader.ctx, stdout)
        return 0

    if urifrag:
        processobj, _ = document_loader.resolve_ref(uri)
    elif isinstance(processobj, list):
        if 1 == len(processobj):
            processobj = processobj[0]
        else:
            _logger.error("Tool file contains graph of multiple objects, must specify one of #%s",
                          ", #".join(urlparse.urldefrag(i["id"])[1] for i in processobj if "id" in i))
            return 1

    try:
        t = makeTool(processobj, strict=strict, makeTool=makeTool, loader=document_loader, avsc_names=avsc_names)
    except (schema_salad.validate.ValidationException) as e:
        _logger.error("Tool definition failed validation:\n%s", e, exc_info=(e if debug else False))
        return 1
    except (RuntimeError, workflow.WorkflowException) as e:
        _logger.error("Tool definition failed initialization:\n%s", e, exc_info=(e if debug else False))
        return 1

    if jobobj:
        for inp in t.tool["inputs"]:
            if shortname(inp["id"]) in jobobj:
                inp["default"] = jobobj[shortname(inp["id"])]

    if metadata:
        t.metadata = metadata
    else:
        t.metadata = {"$namespaces": t.tool.get("$namespaces", {}),
                      "$schemas": t.tool.get("$schemas", [])}

    return t
def job(self, joborder, basedir, output_callback, **kwargs):
    kwargs["part_of"] = self.name
    kwargs["name"] = shortname(self.id)
    for j in self.step.job(joborder, basedir, output_callback, **kwargs):
        yield j
def load_tool(argsworkflow, updateonly, strict, makeTool, debug,
              print_pre=False,
              print_rdf=False,
              print_dot=False,
              print_deps=False,
              relative_deps=False,
              rdf_serializer=None,
              stdout=sys.stdout,
              urifrag=None):
    (document_loader, avsc_names, schema_metadata) = process.get_schema()

    if isinstance(avsc_names, Exception):
        raise avsc_names

    jobobj = None
    if isinstance(argsworkflow, basestring):
        split = urlparse.urlsplit(argsworkflow)
        if split.scheme:
            uri = argsworkflow
        else:
            uri = "file://" + os.path.abspath(argsworkflow)
        fileuri, urifrag = urlparse.urldefrag(uri)
        workflowobj = document_loader.fetch(fileuri)
        if isinstance(workflowobj, list):
            # bare list without a version must be treated as draft-2
            workflowobj = {"cwlVersion": "https://w3id.org/cwl/cwl#draft-2",
                           "id": fileuri,
                           "@graph": workflowobj}
    elif isinstance(argsworkflow, dict):
        workflowobj = argsworkflow
        uri = urifrag
        fileuri = ""
    else:
        raise schema_salad.validate.ValidationException("Must be URI or dict")

    if "cwl:tool" in workflowobj:
        jobobj = workflowobj
        workflowobj = document_loader.fetch(urlparse.urljoin(uri, workflowobj["cwl:tool"]))

    workflowobj = update.update(workflowobj, document_loader, fileuri)
    document_loader.idx.clear()

    if updateonly:
        stdout.write(json.dumps(workflowobj, indent=4))
        return 0

    if print_deps:
        printdeps(workflowobj, document_loader, stdout, relative_deps)
        return 0

    try:
        processobj, metadata = schema_salad.schema.load_and_validate(document_loader, avsc_names, workflowobj, strict)
    except (schema_salad.validate.ValidationException, RuntimeError) as e:
        _logger.error("Tool definition failed validation:\n%s", e, exc_info=(e if debug else False))
        return 1

    if print_pre:
        stdout.write(json.dumps(processobj, indent=4))
        return 0

    if print_rdf:
        printrdf(argsworkflow, processobj, document_loader.ctx, rdf_serializer, stdout)
        return 0

    if print_dot:
        printdot(argsworkflow, processobj, document_loader.ctx, stdout)
        return 0

    if urifrag:
        processobj, _ = document_loader.resolve_ref(uri)
    elif isinstance(processobj, list):
        if 1 == len(processobj):
            processobj = processobj[0]
        else:
            _logger.error("Tool file contains graph of multiple objects, must specify one of #%s",
                          ", #".join(urlparse.urldefrag(i["id"])[1] for i in processobj if "id" in i))
            return 1

    try:
        t = makeTool(processobj, strict=strict, makeTool=makeTool, loader=document_loader, avsc_names=avsc_names)
    except (schema_salad.validate.ValidationException) as e:
        _logger.error("Tool definition failed validation:\n%s", e, exc_info=(e if debug else False))
        return 1
    except (RuntimeError, workflow.WorkflowException) as e:
        _logger.error("Tool definition failed initialization:\n%s", e, exc_info=(e if debug else False))
        return 1

    if jobobj:
        for inp in t.tool["inputs"]:
            if shortname(inp["id"]) in jobobj:
                inp["default"] = jobobj[shortname(inp["id"])]

    if metadata:
        t.metadata = metadata
    else:
        t.metadata = {"$namespaces": t.tool.get("$namespaces", {}),
                      "$schemas": t.tool.get("$schemas", [])}

    return t
def load_job_order(args, t, parser, stdin, print_input_deps=False, relative_deps=False, stdout=sys.stdout):
    job_order_object = None

    if args.conformance_test:
        loader = Loader({})
    else:
        jobloaderctx = {"path": {"@type": "@id"},
                        "format": {"@type": "@id"},
                        "id": "@id"}
        jobloaderctx.update(t.metadata.get("$namespaces", {}))
        loader = Loader(jobloaderctx)

    if len(args.job_order) == 1 and args.job_order[0][0] != "-":
        job_order_file = args.job_order[0]
    elif len(args.job_order) == 1 and args.job_order[0] == "-":
        job_order_object = yaml.load(stdin)
        job_order_object, _ = loader.resolve_all(job_order_object, "")
    else:
        job_order_file = None

    if job_order_object:
        input_basedir = args.basedir if args.basedir else os.getcwd()
    elif job_order_file:
        input_basedir = args.basedir if args.basedir else os.path.abspath(os.path.dirname(job_order_file))
        try:
            job_order_object, _ = loader.resolve_ref(job_order_file)
        except Exception as e:
            _logger.error(e, exc_info=(e if args.debug else False))
            return 1
        toolparser = None
    else:
        input_basedir = args.basedir if args.basedir else os.getcwd()
        namemap = {}
        toolparser = generate_parser(argparse.ArgumentParser(prog=args.workflow), t, namemap)
        if toolparser:
            if args.tool_help:
                toolparser.print_help()
                return 0
            cmd_line = vars(toolparser.parse_args(args.job_order))

            if cmd_line["job_order"]:
                try:
                    input_basedir = args.basedir if args.basedir else os.path.abspath(os.path.dirname(cmd_line["job_order"]))
                    job_order_object = loader.resolve_ref(cmd_line["job_order"])
                except Exception as e:
                    _logger.error(e, exc_info=(e if args.debug else False))
                    return 1
            else:
                job_order_object = {"id": args.workflow}

            job_order_object.update({namemap[k]: v for k, v in cmd_line.items()})

            _logger.debug("Parsed job order from command line: %s", json.dumps(job_order_object, indent=4))
        else:
            job_order_object = None

    for inp in t.tool["inputs"]:
        if "default" in inp and (not job_order_object or shortname(inp["id"]) not in job_order_object):
            if not job_order_object:
                job_order_object = {}
            job_order_object[shortname(inp["id"])] = inp["default"]

    if not job_order_object and len(t.tool["inputs"]) > 0:
        parser.print_help()
        if toolparser:
            print "\nOptions for %s " % args.workflow
            toolparser.print_help()
        _logger.error("")
        _logger.error("Input object required")
        return 1

    if print_input_deps:
        printdeps(job_order_object, loader, stdout, relative_deps,
                  basedir="file://%s/" % input_basedir)
        return 0

    return (job_order_object, input_basedir)
def load_job_order(args, t, parser, stdin, print_input_deps=False, relative_deps=False, stdout=sys.stdout):
    job_order_object = None

    if args.conformance_test:
        loader = Loader({})
    else:
        jobloaderctx = {"path": {"@type": "@id"},
                        "format": {"@type": "@id"},
                        "id": "@id"}
        jobloaderctx.update(t.metadata.get("$namespaces", {}))
        loader = Loader(jobloaderctx)

    if len(args.job_order) == 1 and args.job_order[0][0] != "-":
        job_order_file = args.job_order[0]
    elif len(args.job_order) == 1 and args.job_order[0] == "-":
        job_order_object = yaml.load(stdin)
        job_order_object, _ = loader.resolve_all(job_order_object, "")
    else:
        job_order_file = None

    if job_order_object:
        input_basedir = args.basedir if args.basedir else os.getcwd()
    elif job_order_file:
        input_basedir = args.basedir if args.basedir else os.path.abspath(os.path.dirname(job_order_file))
        try:
            job_order_object, _ = loader.resolve_ref(job_order_file)
        except Exception as e:
            _logger.error(e, exc_info=(e if args.debug else False))
            return 1
        toolparser = None
    else:
        input_basedir = args.basedir if args.basedir else os.getcwd()
        namemap = {}
        toolparser = generate_parser(argparse.ArgumentParser(prog=args.workflow), t, namemap)
        if toolparser:
            if args.tool_help:
                toolparser.print_help()
                return 0
            cmd_line = vars(toolparser.parse_args(args.job_order))

            if cmd_line["job_order"]:
                try:
                    input_basedir = args.basedir if args.basedir else os.path.abspath(os.path.dirname(cmd_line["job_order"]))
                    job_order_object = loader.resolve_ref(cmd_line["job_order"])
                except Exception as e:
                    _logger.error(e, exc_info=(e if args.debug else False))
                    return 1
            else:
                job_order_object = {"id": args.workflow}

            job_order_object.update({namemap[k]: v for k, v in cmd_line.items()})

            _logger.debug("Parsed job order from command line: %s", json.dumps(job_order_object, indent=4))
        else:
            job_order_object = None

    for inp in t.tool["inputs"]:
        if "default" in inp and (not job_order_object or shortname(inp["id"]) not in job_order_object):
            if not job_order_object:
                job_order_object = {}
            job_order_object[shortname(inp["id"])] = inp["default"]

    if not job_order_object and len(t.tool["inputs"]) > 0:
        parser.print_help()
        if toolparser:
            print "\nOptions for %s " % args.workflow
            toolparser.print_help()
        _logger.error("")
        _logger.error("Input object required")
        return 1

    if print_input_deps:
        printdeps(job_order_object, loader, stdout, relative_deps,
                  basedir="file://%s/" % input_basedir)
        return 0

    if "cwl:tool" in job_order_object:
        del job_order_object["cwl:tool"]
    if "id" in job_order_object:
        del job_order_object["id"]

    return (job_order_object, input_basedir)
def collect_output(self, schema, builder, outdir):
    r = None
    if "outputBinding" in schema:
        binding = schema["outputBinding"]
        globpatterns = []
        if "glob" in binding:
            r = []
            for gb in aslist(binding["glob"]):
                try:
                    gb = builder.do_eval(gb)
                    globpatterns.append(gb)
                    if gb:
                        r.extend([{"path": g, "class": "File"}
                                  for g in builder.fs_access.glob(os.path.join(outdir, gb))])
                except (OSError, IOError) as e:
                    _logger.warn(str(e))

            for files in r:
                checksum = hashlib.sha1()
                with builder.fs_access.open(files["path"], "rb") as f:
                    contents = f.read(CONTENT_LIMIT)
                    if binding.get("loadContents"):
                        files["contents"] = contents
                    filesize = 0
                    while contents != "":
                        checksum.update(contents)
                        filesize += len(contents)
                        contents = f.read(1024*1024)
                files["checksum"] = "sha1$%s" % checksum.hexdigest()
                files["size"] = filesize
                if "format" in schema:
                    files["format"] = builder.do_eval(schema["format"], context=files)

        optional = False
        singlefile = False
        if isinstance(schema["type"], list):
            if "null" in schema["type"]:
                optional = True
            if "File" in schema["type"]:
                singlefile = True
        elif schema["type"] == "File":
            singlefile = True

        if "outputEval" in binding:
            r = builder.do_eval(binding["outputEval"], context=r)
            if singlefile:
                # Handle single file outputs not wrapped in a list
                if r is not None and not isinstance(r, (list, tuple)):
                    r = [r]
                if optional and r is None:
                    pass
                elif (r is None or len(r) != 1 or not isinstance(r[0], dict) or "path" not in r[0]):
                    raise WorkflowException("Expression must return a file object for %s." % schema["id"])

        if singlefile:
            if not r and not optional:
                raise WorkflowException("Did not find output file with glob pattern: '{}'".format(globpatterns))
            elif not r and optional:
                pass
            elif isinstance(r, list):
                if len(r) > 1:
                    raise WorkflowException("Multiple matches for output item that is a single file.")
                else:
                    r = r[0]

        if "secondaryFiles" in schema:
            for primary in aslist(r):
                if isinstance(primary, dict):
                    primary["secondaryFiles"] = []
                    for sf in aslist(schema["secondaryFiles"]):
                        if isinstance(sf, dict) or "$(" in sf or "${" in sf:
                            sfpath = builder.do_eval(sf, context=r)
                            if isinstance(sfpath, basestring):
                                sfpath = {"path": sfpath, "class": "File"}
                        else:
                            sfpath = {"path": substitute(primary["path"], sf), "class": "File"}
                        for sfitem in aslist(sfpath):
                            if builder.fs_access.exists(sfitem["path"]):
                                primary["secondaryFiles"].append(sfitem)

        if not r and optional:
            r = None

    if not r and isinstance(schema["type"], dict) and schema["type"]["type"] == "record":
        r = {}
        for f in schema["type"]["fields"]:
            r[shortname(f["name"])] = self.collect_output(f, builder, outdir)

    return r
def try_make_job(self, step, basedir, **kwargs):
    inputparms = step.tool["inputs"]
    outputparms = step.tool["outputs"]

    supportsMultipleInput = bool(self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])

    try:
        inputobj = object_from_state(self.state, inputparms, False, supportsMultipleInput)
        if inputobj is None:
            _logger.debug("[workflow %s] job step %s not ready", self.name, step.id)
            return

        _logger.debug("[step %s] starting job step %s of workflow %s", id(step), step.id, id(self))

        if step.submitted:
            return

        callback = functools.partial(self.receive_output, step, outputparms)

        valueFrom = {i["id"]: i["valueFrom"] for i in step.tool["inputs"] if "valueFrom" in i}

        if len(valueFrom) > 0 and not bool(self.workflow.get_requirement("StepInputExpressionRequirement")[0]):
            raise WorkflowException("Workflow step contains valueFrom but StepInputExpressionRequirement not in requirements")

        vfinputs = {shortname(k): v for k, v in inputobj.iteritems()}

        def valueFromFunc(k, v):
            if k in valueFrom:
                return expression.do_eval(valueFrom[k], vfinputs, self.workflow.requirements,
                                          None, None, {}, context=v)
            else:
                return v

        if "scatter" in step.tool:
            scatter = aslist(step.tool["scatter"])
            method = step.tool.get("scatterMethod")
            if method is None and len(scatter) != 1:
                raise WorkflowException("Must specify scatterMethod when scattering over multiple inputs")

            if "valueFrom" not in kwargs:
                kwargs["valueFrom"] = valueFromFunc

            if method == "dotproduct" or method is None:
                jobs = dotproduct_scatter(step, inputobj, basedir, scatter, callback, **kwargs)
            elif method == "nested_crossproduct":
                jobs = nested_crossproduct_scatter(step, inputobj, basedir, scatter, callback, **kwargs)
            elif method == "flat_crossproduct":
                jobs = flat_crossproduct_scatter(step, inputobj, basedir, scatter, callback, 0, **kwargs)
        else:
            _logger.debug("[workflow %s] Job is input %s", self.name, json.dumps(inputobj, indent=4))
            inputobj = {k: valueFromFunc(k, v) for k, v in inputobj.items()}
            _logger.debug("[workflow %s] Evaluated job input to %s", self.name, json.dumps(inputobj, indent=4))
            jobs = step.job(inputobj, basedir, callback, **kwargs)

        step.submitted = True

        for j in jobs:
            yield j
    except WorkflowException:
        raise
    except Exception as e:
        _logger.exception("Unhandled exception")
        self.processStatus = "permanentFail"
        step.completed = True
def job(self, joborder, basedir, output_callback, move_outputs=True, **kwargs):
    self.state = {}
    self.processStatus = "success"

    if "outdir" in kwargs:
        del kwargs["outdir"]

    for i in self.tool["inputs"]:
        iid = shortname(i["id"])
        if iid in joborder:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(joborder[iid]))
        elif "default" in i:
            self.state[i["id"]] = WorkflowStateItem(i, copy.deepcopy(i["default"]))
        else:
            raise WorkflowException("Input '%s' not in input object and does not have a default value." % (i["id"]))

    for s in self.steps:
        for out in s.tool["outputs"]:
            self.state[out["id"]] = None

    output_dirs = set()

    completed = 0
    iterables = []
    while completed < len(self.steps) and self.processStatus == "success":
        made_progress = False

        for step in self.steps:
            if not step.submitted:
                step.iterable = self.try_make_job(step, basedir, **kwargs)

            if step.iterable:
                for newjob in step.iterable:
                    if newjob:
                        made_progress = True
                        if newjob.outdir:
                            output_dirs.add(newjob.outdir)
                        yield newjob
                    else:
                        break

        completed = sum(1 for s in self.steps if s.completed)

        if not made_progress and completed < len(self.steps):
            yield None

    supportsMultipleInput = bool(self.workflow.get_requirement("MultipleInputFeatureRequirement")[0])

    wo = object_from_state(self.state, self.tool["outputs"], True, supportsMultipleInput)

    if wo is None:
        raise WorkflowException("Output for workflow not available")

    if move_outputs:
        targets = set()
        conflicts = set()

        outfiles = findfiles(wo)

        for f in outfiles:
            for a in output_dirs:
                if f["path"].startswith(a):
                    src = f["path"]
                    dst = os.path.join(self.outdir, src[len(a)+1:])
                    if dst in targets:
                        conflicts.add(dst)
                    else:
                        targets.add(dst)

        for f in outfiles:
            for a in output_dirs:
                if f["path"].startswith(a):
                    src = f["path"]
                    dst = os.path.join(self.outdir, src[len(a)+1:])
                    if dst in conflicts:
                        sp = os.path.splitext(dst)
                        dst = "%s-%s%s" % (sp[0], str(random.randint(1, 1000000000)), sp[1])
                    dirname = os.path.dirname(dst)
                    if not os.path.exists(dirname):
                        os.makedirs(dirname)
                    _logger.debug("[workflow %s] Moving '%s' to '%s'", self.name, src, dst)
                    shutil.move(src, dst)
                    f["path"] = dst

        for a in output_dirs:
            if os.path.exists(a) and empty_subtree(a):
                if kwargs.get("rm_tmpdir", True):
                    _logger.debug("[workflow %s] Removing intermediate output directory %s", self.name, a)
                    shutil.rmtree(a, True)

    _logger.info("[workflow %s] outdir is %s", self.name, self.outdir)

    output_callback(wo, self.processStatus)
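# Hedged sketch of the driver loop these job() generators imply (not part of
# the original source): the workflow generator yields runnable job objects, or
# None when no step is ready yet. `wf_job`, `joborder`, `basedir` and
# `collect_wf_output` are placeholders; the assumption that concrete jobs
# expose run(**kwargs) follows the cwltool job model but is not shown above.
for runnable in wf_job.job(joborder, basedir, collect_wf_output, **kwargs):
    if runnable is not None:
        runnable.run(**kwargs)
    else:
        # Nothing ready; a real engine would wait for outstanding callbacks here.
        pass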