def get_overrides(overrides, toolid):  # type: (List[Dict[Text, Any]], Text) -> Dict[Text, Any]
    req = {}  # type: Dict[Text, Any]
    if not isinstance(overrides, list):
        raise validate.ValidationException(
            "Expected overrides to be a list, but was %s" % type(overrides))
    for ov in overrides:
        if ov["overrideTarget"] == toolid:
            req.update(ov)
    return req
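# Usage sketch (hypothetical ids and data), assuming get_overrides() above is in scope:
# each entry whose "overrideTarget" matches the requested tool id is merged into one
# dict, with later matching entries winning on key clashes.
_example_overrides = [
    {"overrideTarget": "#main/step1", "requirements": [{"class": "EnvVarRequirement"}]},
    {"overrideTarget": "#main/step2", "hints": [{"class": "ResourceRequirement"}]},
]
# get_overrides(_example_overrides, "#main/step1")
# -> {"overrideTarget": "#main/step1", "requirements": [{"class": "EnvVarRequirement"}]}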
def run(self, fileStore):
    cwljob = resolve_indirect(self.cwljob)
    if isinstance(self.step.tool["scatter"], basestring):
        scatter = [self.step.tool["scatter"]]
    else:
        scatter = self.step.tool["scatter"]
    scatterMethod = self.step.tool.get("scatterMethod", None)
    if len(scatter) == 1:
        scatterMethod = "dotproduct"
    outputs = []

    self.vfinputs = cwljob
    shortscatter = [shortname(s) for s in scatter]
    cwljob = {k: self.valueFromFunc(k, v) if k not in shortscatter else v
              for k, v in cwljob.items()}

    if scatterMethod == "dotproduct":
        for i in xrange(0, len(cwljob[shortname(scatter[0])])):
            copyjob = copy.copy(cwljob)
            for sc in scatter:
                scatter_key = shortname(sc)
                copyjob[scatter_key] = self.valueFromFunc(
                    scatter_key, cwljob[scatter_key][i])
            (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
    elif scatterMethod == "nested_crossproduct":
        outputs = self.nested_crossproduct_scatter(cwljob, scatter)
    elif scatterMethod == "flat_crossproduct":
        self.flat_crossproduct_scatter(cwljob, scatter, outputs)
    else:
        if scatterMethod:
            raise validate.ValidationException(
                "Unsupported complex scatter type '%s'" % scatterMethod)
        else:
            raise validate.ValidationException(
                "Must provide scatterMethod to scatter over multiple inputs")

    return outputs
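# Illustrative sketch (standalone, not cwltool/Toil code): the three CWL scatter
# methods expressed on plain lists, which is what run() above implements by spawning
# one child job per combination of scattered inputs.
import itertools

def _dotproduct(a, b):
    # pair the i-th element of each scattered input
    return list(zip(a, b))

def _flat_crossproduct(a, b):
    # every combination, flattened into a single list
    return list(itertools.product(a, b))

def _nested_crossproduct(a, b):
    # every combination, nested one level per scattered input
    return [[(x, y) for y in b] for x in a]

# _dotproduct([1, 2], ["a", "b"])          -> [(1, 'a'), (2, 'b')]
# _flat_crossproduct([1, 2], ["a", "b"])   -> [(1, 'a'), (1, 'b'), (2, 'a'), (2, 'b')]
# _nested_crossproduct([1, 2], ["a", "b"]) -> [[(1, 'a'), (1, 'b')], [(2, 'a'), (2, 'b')]]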
def addLocation(d):
    if "location" not in d:
        if d["class"] == "File" and ("contents" not in d):
            raise validate.ValidationException(
                "Anonymous file object must have 'contents' and 'basename' fields.")
        if d["class"] == "Directory" and ("listing" not in d or "basename" not in d):
            raise validate.ValidationException(
                "Anonymous directory object must have 'listing' and 'basename' fields.")
        d["location"] = "_:" + Text(uuid.uuid4())
        if "basename" not in d:
            d["basename"] = d["location"][2:]

    parse = urllib.parse.urlparse(d["location"])
    path = parse.path
    # strip trailing slash
    if path.endswith("/"):
        if d["class"] != "Directory":
            raise validate.ValidationException(
                "location '%s' ends with '/' but is not a Directory" % d["location"])
        path = path.rstrip("/")
        d["location"] = urllib.parse.urlunparse(
            (parse.scheme, parse.netloc, path, parse.params, parse.query, parse.fragment))

    if not d.get("basename"):
        if path.startswith("_:"):
            d["basename"] = Text(path[2:])
        else:
            d["basename"] = Text(os.path.basename(urllib.request.url2pathname(path)))

    if d["class"] == "File":
        nr, ne = os.path.splitext(d["basename"])
        if d.get("nameroot") != nr:
            d["nameroot"] = Text(nr)
        if d.get("nameext") != ne:
            d["nameext"] = Text(ne)
def fillInDefaults(inputs, job):
    # type: (List[Dict[str, str]], Dict[str, str]) -> None
    for inp in inputs:
        if shortname(inp["id"]) in job:
            pass
        elif shortname(inp["id"]) not in job and "default" in inp:
            job[shortname(inp["id"])] = copy.copy(inp["default"])
        elif shortname(inp["id"]) not in job and inp["type"][0] == "null":
            pass
        else:
            raise validate.ValidationException(
                "Missing input parameter `%s`" % shortname(inp["id"]))
def fillInDefaults(inputs, job):
    # type: (List[Dict[unicode, unicode]], Dict[unicode, Union[Dict[unicode, Any], List, unicode]]) -> None
    for inp in inputs:
        if shortname(inp[u"id"]) in job:
            pass
        elif shortname(inp[u"id"]) not in job and u"default" in inp:
            job[shortname(inp[u"id"])] = copy.copy(inp[u"default"])
        elif shortname(inp[u"id"]) not in job and inp[u"type"][0] == u"null":
            pass
        else:
            raise validate.ValidationException(
                "Missing input parameter `%s`" % shortname(inp["id"]))
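# Usage sketch (hypothetical parameter ids), assuming fillInDefaults() above is in
# scope: values already present are kept, missing values with a "default" are copied
# in, optional ("null") parameters are skipped, and anything else raises
# ValidationException.
_example_inputs = [
    {"id": "#main/threads", "type": ["int"], "default": 1},
    {"id": "#main/reference", "type": ["null", "File"]},
]
_example_job = {}
# fillInDefaults(_example_inputs, _example_job)
# -> _example_job == {"threads": 1}   ("reference" stays absent because it is nullable)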
def __init__(self,
             toolpath_object,  # type: Dict[Text, Any]
             loadingContext    # type: LoadingContext
            ):  # type: (...) -> None
    super(Workflow, self).__init__(toolpath_object, loadingContext)
    self.provenance_object = None  # type: Optional[CreateProvProfile]
    if loadingContext.research_obj:
        orcid = loadingContext.orcid
        full_name = loadingContext.cwl_full_name
        self.provenance_object = CreateProvProfile(
            loadingContext.research_obj, full_name, orcid,
            loadingContext.host_provenance, loadingContext.user_provenance)
        self.parent_wf = self.provenance_object
    loadingContext.prov_obj = self.provenance_object
    loadingContext = loadingContext.copy()
    loadingContext.requirements = self.requirements
    loadingContext.hints = self.hints

    self.steps = []  # type: List[WorkflowStep]
    validation_errors = []
    for index, step in enumerate(self.tool.get("steps", [])):
        try:
            self.steps.append(
                WorkflowStep(step, index, loadingContext, loadingContext.prov_obj))
        except validate.ValidationException as vexc:
            if _logger.isEnabledFor(logging.DEBUG):
                _logger.exception("Validation failed at")
            validation_errors.append(vexc)

    if validation_errors:
        raise validate.ValidationException("\n".join(
            str(v) for v in validation_errors))

    random.shuffle(self.steps)

    # statically validate data links instead of doing it at runtime.
    workflow_inputs = self.tool["inputs"]
    workflow_outputs = self.tool["outputs"]

    step_inputs = []  # type: List[Any]
    step_outputs = []  # type: List[Any]
    param_to_step = {}  # type: Dict[Text, Dict[Text, Any]]
    for step in self.steps:
        step_inputs.extend(step.tool["inputs"])
        step_outputs.extend(step.tool["outputs"])
        for s in step.tool["inputs"]:
            param_to_step[s["id"]] = step.tool

    if getdefault(loadingContext.do_validate, True):
        static_checker(workflow_inputs, workflow_outputs,
                       step_inputs, step_outputs, param_to_step)
def checkversion(doc, metadata, enable_dev):
    # type: (Union[CommentedSeq, CommentedMap], CommentedMap, bool) -> Tuple[Dict[Text, Any], Text]
    # pylint: disable=line-too-long
    """Check the validity of the version of the given CWL document.

    Returns the document and the validated version string.
    """
    cdoc = None  # type: Optional[CommentedMap]
    if isinstance(doc, CommentedSeq):
        lc = metadata.lc
        metadata = copy.deepcopy(metadata)
        metadata.lc.data = copy.copy(lc.data)
        metadata.lc.filename = lc.filename
        metadata[u"$graph"] = doc
        cdoc = metadata
    elif isinstance(doc, CommentedMap):
        cdoc = doc
    else:
        raise Exception("Expected CommentedMap or CommentedSeq")

    assert cdoc is not None
    version = cdoc[u"cwlVersion"]

    if version not in UPDATES:
        if version in DEVUPDATES:
            if enable_dev:
                pass
            else:
                raise validate.ValidationException(
                    u"Version '%s' is a development or deprecated version.\n "
                    "Update your document to a stable version (%s) or use "
                    "--enable-dev to enable support for development and "
                    "deprecated versions." % (version, ", ".join(list(UPDATES.keys()))))
        else:
            raise validate.ValidationException(u"Unrecognized version %s" % version)

    return (cdoc, version)
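# Minimal sketch of the version gate implemented by checkversion() above, on plain
# sets (UPDATES/DEVUPDATES stand in for cwltool's update tables; the values below are
# hypothetical).
def _version_allowed(version, stable, dev, enable_dev):
    if version in stable:
        return True
    if version in dev:
        return enable_dev
    return False

# _version_allowed("v1.0", {"v1.0"}, {"v1.1.0-dev1"}, enable_dev=False)        -> True
# _version_allowed("v1.1.0-dev1", {"v1.0"}, {"v1.1.0-dev1"}, enable_dev=False) -> False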
def fetch_document(argsworkflow):
    # type: (Union[str, unicode, Dict[unicode, Any]]) -> Tuple[Loader, Dict[unicode, Any], unicode]
    """Retrieve a CWL document."""
    document_loader = Loader({"cwl": "https://w3id.org/cwl/cwl#", "id": "@id"})

    uri = None  # type: unicode
    workflowobj = None  # type: Dict[unicode, Any]
    if isinstance(argsworkflow, (str, unicode)):
        split = urlparse.urlsplit(argsworkflow)
        if split.scheme:
            uri = argsworkflow
        else:
            uri = "file://" + os.path.abspath(argsworkflow)
        fileuri = urlparse.urldefrag(uri)[0]
        workflowobj = document_loader.fetch(fileuri)
    elif isinstance(argsworkflow, dict):
        workflowobj = argsworkflow
        uri = "#" + str(id(argsworkflow))
    else:
        raise validate.ValidationException(
            "Must be URI or object: '%s'" % argsworkflow)

    return document_loader, workflowobj, uri
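# Standalone sketch of the URI handling in fetch_document() above: a bare filesystem
# path becomes a file:// URI, an already-schemed string is used as-is, and the
# fragment is dropped before fetching.
import os
try:
    from urllib.parse import urlsplit, urldefrag   # Python 3
except ImportError:
    from urlparse import urlsplit, urldefrag        # Python 2

def _to_fetch_uri(workflow_ref):
    if urlsplit(workflow_ref).scheme:
        uri = workflow_ref
    else:
        uri = "file://" + os.path.abspath(workflow_ref)
    return urldefrag(uri)[0]

# _to_fetch_uri("workflows/echo.cwl")              -> "file:///<abs path>/workflows/echo.cwl"
# _to_fetch_uri("https://example.org/wf.cwl#main") -> "https://example.org/wf.cwl"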
def __init__(self, toolpath_object, **kwargs):
    # type: (Dict[Text, Any], **Any) -> None
    super(Workflow, self).__init__(toolpath_object, **kwargs)

    kwargs["requirements"] = self.requirements
    kwargs["hints"] = self.hints

    makeTool = kwargs.get("makeTool")
    self.steps = []  # type: List[WorkflowStep]
    validation_errors = []
    for n, step in enumerate(self.tool.get("steps", [])):
        try:
            self.steps.append(WorkflowStep(step, n, **kwargs))
        except validate.ValidationException as v:
            if _logger.isEnabledFor(logging.DEBUG):
                _logger.exception("Validation failed at")
            validation_errors.append(v)

    if validation_errors:
        raise validate.ValidationException("\n".join(
            str(v) for v in validation_errors))

    random.shuffle(self.steps)

    # statically validate data links instead of doing it at runtime.
    workflow_inputs = self.tool["inputs"]
    workflow_outputs = self.tool["outputs"]

    step_inputs = []  # type: List[Any]
    step_outputs = []  # type: List[Any]
    for step in self.steps:
        step_inputs.extend(step.tool["inputs"])
        step_outputs.extend(step.tool["outputs"])

    static_checker(workflow_inputs, workflow_outputs, step_inputs, step_outputs)
def __init__(self,
             toolpath_object,          # type: Dict[Text, Any]
             pos,                      # type: int
             loadingContext,           # type: LoadingContext
             parentworkflowProv=None   # type: Optional[CreateProvProfile]
            ):  # type: (...) -> None
    if "id" in toolpath_object:
        self.id = toolpath_object["id"]
    else:
        self.id = "#step" + Text(pos)

    loadingContext = loadingContext.copy()

    loadingContext.requirements = (
        getdefault(loadingContext.requirements, [])
        + toolpath_object.get("requirements", [])
        + get_overrides(getdefault(loadingContext.overrides_list, []),
                        self.id).get("requirements", []))
    loadingContext.hints = getdefault(loadingContext.hints, []) + toolpath_object.get("hints", [])

    try:
        if isinstance(toolpath_object["run"], dict):
            self.embedded_tool = loadingContext.construct_tool_object(
                toolpath_object["run"], loadingContext)
        else:
            self.embedded_tool = load_tool(toolpath_object["run"], loadingContext)
    except validate.ValidationException as vexc:
        if loadingContext.debug:
            _logger.exception("Validation exception")
        raise WorkflowException(
            u"Tool definition %s failed validation:\n%s"
            % (toolpath_object["run"], validate.indent(str(vexc))))

    validation_errors = []
    self.tool = toolpath_object = copy.deepcopy(toolpath_object)
    bound = set()
    for stepfield, toolfield in (("in", "inputs"), ("out", "outputs")):
        toolpath_object[toolfield] = []
        for index, step_entry in enumerate(toolpath_object[stepfield]):
            if isinstance(step_entry, string_types):
                param = CommentedMap()  # type: CommentedMap
                inputid = step_entry
            else:
                param = CommentedMap(six.iteritems(step_entry))
                inputid = step_entry["id"]

            shortinputid = shortname(inputid)
            found = False
            for tool_entry in self.embedded_tool.tool[toolfield]:
                frag = shortname(tool_entry["id"])
                if frag == shortinputid:
                    # If the step has a default for a parameter,
                    # we do not want the tool's default to override it.
                    step_default = None
                    if "default" in param and "default" in tool_entry:
                        step_default = param["default"]
                    param.update(tool_entry)
                    param["_tool_entry"] = tool_entry
                    if step_default is not None:
                        param["default"] = step_default
                    found = True
                    bound.add(frag)
                    break
            if not found:
                if stepfield == "in":
                    param["type"] = "Any"
                    param["not_connected"] = True
                else:
                    validation_errors.append(
                        SourceLine(self.tool["out"], index).makeError(
                            "Workflow step output '%s' does not correspond to"
                            % shortname(step_entry))
                        + "\n"
                        + SourceLine(self.embedded_tool.tool, "outputs").makeError(
                            " tool output (expected '%s')" % (
                                "', '".join(
                                    [shortname(tool_entry["id"])
                                     for tool_entry in self.embedded_tool.tool[toolfield]]))))
            param["id"] = inputid
            param.lc.line = toolpath_object[stepfield].lc.data[index][0]
            param.lc.col = toolpath_object[stepfield].lc.data[index][1]
            param.lc.filename = toolpath_object[stepfield].lc.filename
            toolpath_object[toolfield].append(param)

    missing = []
    for i, tool_entry in enumerate(self.embedded_tool.tool["inputs"]):
        if shortname(tool_entry["id"]) not in bound:
            if "null" not in tool_entry["type"] and "default" not in tool_entry:
                missing.append(shortname(tool_entry["id"]))

    if missing:
        validation_errors.append(SourceLine(self.tool, "in").makeError(
            "Step is missing required parameter%s '%s'"
            % ("s" if len(missing) > 1 else "", "', '".join(missing))))

    if validation_errors:
        raise validate.ValidationException("\n".join(validation_errors))

    super(WorkflowStep, self).__init__(toolpath_object, loadingContext)

    if self.embedded_tool.tool["class"] == "Workflow":
        (feature, _) = self.get_requirement("SubworkflowFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains embedded workflow but "
                "SubworkflowFeatureRequirement not in requirements")

    if "scatter" in self.tool:
        (feature, _) = self.get_requirement("ScatterFeatureRequirement")
        if not feature:
            raise WorkflowException(
                "Workflow contains scatter but ScatterFeatureRequirement "
                "not in requirements")

        inputparms = copy.deepcopy(self.tool["inputs"])
        outputparms = copy.deepcopy(self.tool["outputs"])
        scatter = aslist(self.tool["scatter"])

        method = self.tool.get("scatterMethod")
        if method is None and len(scatter) != 1:
            raise validate.ValidationException(
                "Must specify scatterMethod when scattering over multiple inputs")

        inp_map = {i["id"]: i for i in inputparms}
        for inp in scatter:
            if inp not in inp_map:
                raise validate.ValidationException(
                    SourceLine(self.tool, "scatter").makeError(
                        "Scatter parameter '%s' does not correspond to "
                        "an input parameter of this step, expecting '%s'"
                        % (shortname(inp),
                           "', '".join(shortname(k) for k in inp_map.keys()))))

            inp_map[inp]["type"] = {"type": "array", "items": inp_map[inp]["type"]}

        if self.tool.get("scatterMethod") == "nested_crossproduct":
            nesting = len(scatter)
        else:
            nesting = 1

        for index in range(0, nesting):
            for oparam in outputparms:
                oparam["type"] = {"type": "array", "items": oparam["type"]}
        self.tool["inputs"] = inputparms
        self.tool["outputs"] = outputparms
    self.prov_obj = None  # type: Optional[CreateProvProfile]
    if loadingContext.research_obj:
        self.prov_obj = parentworkflowProv
        if self.embedded_tool.tool["class"] == "Workflow":
            self.parent_wf = self.embedded_tool.parent_wf
        else:
            self.parent_wf = self.prov_obj
def bind_input(self,
               schema,                   # type: MutableMapping[Text, Any]
               datum,                    # type: Any
               discover_secondaryFiles,  # type: bool
               lead_pos=None,            # type: Optional[Union[int, List[int]]]
               tail_pos=None,            # type: Optional[List[int]]
              ):  # type: (...) -> List[MutableMapping[Text, Any]]
    if tail_pos is None:
        tail_pos = []
    if lead_pos is None:
        lead_pos = []

    bindings = []  # type: List[MutableMapping[Text, Text]]
    binding = None  # type: Optional[MutableMapping[Text, Any]]
    value_from_expression = False
    if "inputBinding" in schema and isinstance(schema["inputBinding"], MutableMapping):
        binding = CommentedMap(schema["inputBinding"].items())
        assert binding is not None

        bp = list(aslist(lead_pos))
        if "position" in binding:
            bp.extend(aslist(binding["position"]))
        else:
            bp.append(0)
        bp.extend(aslist(tail_pos))
        binding["position"] = bp

        binding["datum"] = datum
        if "valueFrom" in binding:
            value_from_expression = True

    # Handle union types
    if isinstance(schema["type"], MutableSequence):
        bound_input = False
        for t in schema["type"]:
            avsc = None  # type: Optional[Schema]
            if isinstance(t, string_types) and self.names.has_name(t, ""):
                avsc = self.names.get_name(t, "")
            elif isinstance(t, MutableMapping) and "name" in t and self.names.has_name(t["name"], ""):
                avsc = self.names.get_name(t["name"], "")
            if not avsc:
                avsc = make_avsc_object(convert_to_dict(t), self.names)
            assert avsc is not None
            if validate.validate(avsc, datum):
                schema = copy.deepcopy(schema)
                schema["type"] = t
                if not value_from_expression:
                    return self.bind_input(
                        schema, datum, lead_pos=lead_pos, tail_pos=tail_pos,
                        discover_secondaryFiles=discover_secondaryFiles)
                else:
                    self.bind_input(
                        schema, datum, lead_pos=lead_pos, tail_pos=tail_pos,
                        discover_secondaryFiles=discover_secondaryFiles)
                    bound_input = True
        if not bound_input:
            raise validate.ValidationException(
                u"'%s' is not a valid union %s" % (datum, schema["type"]))
    elif isinstance(schema["type"], MutableMapping):
        st = copy.deepcopy(schema["type"])
        if binding is not None \
                and "inputBinding" not in st \
                and "type" in st \
                and st["type"] == "array" \
                and "itemSeparator" not in binding:
            st["inputBinding"] = {}
        for k in ("secondaryFiles", "format", "streamable"):
            if k in schema:
                st[k] = schema[k]
        if value_from_expression:
            self.bind_input(
                st, datum, lead_pos=lead_pos, tail_pos=tail_pos,
                discover_secondaryFiles=discover_secondaryFiles)
        else:
            bindings.extend(
                self.bind_input(
                    st, datum, lead_pos=lead_pos, tail_pos=tail_pos,
                    discover_secondaryFiles=discover_secondaryFiles))
    else:
        if schema["type"] in self.schemaDefs:
            schema = self.schemaDefs[schema["type"]]

        if schema["type"] == "record":
            for f in schema["fields"]:
                if f["name"] in datum and datum[f["name"]] is not None:
                    bindings.extend(self.bind_input(
                        f, datum[f["name"]], lead_pos=lead_pos, tail_pos=f["name"],
                        discover_secondaryFiles=discover_secondaryFiles))
                else:
                    datum[f["name"]] = f.get("default")

        if schema["type"] == "array":
            for n, item in enumerate(datum):
                b2 = None
                if binding is not None:
                    b2 = copy.deepcopy(binding)
                    b2["datum"] = item
                itemschema = {
                    u"type": schema["items"],
                    u"inputBinding": b2
                }
                for k in ("secondaryFiles", "format", "streamable"):
                    if k in schema:
                        itemschema[k] = schema[k]
                bindings.extend(
                    self.bind_input(
                        itemschema, item, lead_pos=n, tail_pos=tail_pos,
                        discover_secondaryFiles=discover_secondaryFiles))
            binding = None

        def _capture_files(f):
            self.files.append(f)
            return f

        if schema["type"] == "File":
            self.files.append(datum)
            if (binding and binding.get("loadContents")) or schema.get("loadContents"):
                with self.fs_access.open(datum["location"], "rb") as f:
                    datum["contents"] = f.read(CONTENT_LIMIT).decode("utf-8")

            if "secondaryFiles" in schema:
                if "secondaryFiles" not in datum:
                    datum["secondaryFiles"] = []
                for sf in aslist(schema["secondaryFiles"]):
                    if 'required' in sf:
                        sf_required = self.do_eval(sf['required'], context=datum)
                    else:
                        sf_required = True
                    if "$(" in sf["pattern"] or "${" in sf["pattern"]:
                        sfpath = self.do_eval(sf["pattern"], context=datum)
                    else:
                        sfpath = substitute(datum["basename"], sf["pattern"])

                    for sfname in aslist(sfpath):
                        if not sfname:
                            continue
                        found = False
                        for d in datum["secondaryFiles"]:
                            if not d.get("basename"):
                                d["basename"] = d["location"][d["location"].rindex("/") + 1:]
                            if d["basename"] == sfname:
                                found = True
                        if not found:
                            sf_location = datum["location"][0:datum["location"].rindex("/") + 1] + sfname
                            if isinstance(sfname, MutableMapping):
                                datum["secondaryFiles"].append(sfname)
                            elif discover_secondaryFiles and self.fs_access.exists(sf_location):
                                datum["secondaryFiles"].append({
                                    "location": sf_location,
                                    "basename": sfname,
                                    "class": "File"})
                            elif sf_required:
                                raise WorkflowException(
                                    "Missing required secondary file '%s' from file object: %s"
                                    % (sfname, json_dumps(datum, indent=4)))

                normalizeFilesDirs(datum["secondaryFiles"])

            if "format" in schema:
                try:
                    check_format(datum, self.do_eval(schema["format"]), self.formatgraph)
                except validate.ValidationException as ve:
                    raise WorkflowException(
                        "Expected value of '%s' to have format %s but\n "
                        " %s" % (schema["name"], schema["format"], ve))

            visit_class(datum.get("secondaryFiles", []), ("File", "Directory"), _capture_files)

        if schema["type"] == "Directory":
            ll = schema.get("loadListing") or self.loadListing
            if ll and ll != "no_listing":
                get_listing(self.fs_access, datum, (ll == "deep_listing"))
            self.files.append(datum)

        if schema["type"] == "Any":
            visit_class(datum, ("File", "Directory"), _capture_files)

    # Position to front of the sort key
    if binding is not None:
        for bi in bindings:
            bi["position"] = binding["position"] + bi["position"]
        bindings.append(binding)

    return bindings
def job(self,
        job_order,         # type: MutableMapping[Text, Text]
        output_callbacks,  # type: Callable[[Any, Any], Any]
        runtimeContext     # RuntimeContext
       ):  # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None]
    require_prefix = ""
    if self.metadata["cwlVersion"] == "v1.0":
        require_prefix = "http://commonwl.org/cwltool#"

    workReuse, _ = self.get_requirement(require_prefix + "WorkReuse")
    enableReuse = workReuse.get("enableReuse", True) if workReuse else True

    jobname = uniquename(runtimeContext.name or shortname(self.tool.get("id", "job")))
    if runtimeContext.cachedir and enableReuse:
        cachecontext = runtimeContext.copy()
        cachecontext.outdir = "/out"
        cachecontext.tmpdir = "/tmp"
        cachecontext.stagedir = "/stage"
        cachebuilder = self._init_job(job_order, cachecontext)
        cachebuilder.pathmapper = PathMapper(cachebuilder.files,
                                             runtimeContext.basedir,
                                             cachebuilder.stagedir,
                                             separateDirs=False)
        _check_adjust = partial(check_adjust, cachebuilder)
        visit_class([cachebuilder.files, cachebuilder.bindings],
                    ("File", "Directory"), _check_adjust)

        cmdline = flatten(list(map(cachebuilder.generate_arg, cachebuilder.bindings)))
        docker_req, _ = self.get_requirement("DockerRequirement")
        if docker_req is not None and runtimeContext.use_container:
            dockerimg = docker_req.get("dockerImageId") or docker_req.get("dockerPull")
        elif runtimeContext.default_container is not None and runtimeContext.use_container:
            dockerimg = runtimeContext.default_container
        else:
            dockerimg = None

        if dockerimg is not None:
            # not really run using docker, just for hashing purposes
            cmdline = ["docker", "run", dockerimg] + cmdline

        keydict = {u"cmdline": cmdline}

        for shortcut in ["stdout", "stderr"]:  # later, add "stdin"
            if shortcut in self.tool:
                keydict[shortcut] = self.tool[shortcut]

        for location, fobj in cachebuilder.pathmapper.items():
            if fobj.type == "File":
                checksum = next(
                    (e['checksum'] for e in cachebuilder.files
                     if 'location' in e and e['location'] == location
                     and 'checksum' in e and e['checksum'] != 'sha1$hash'),
                    None)
                fobj_stat = os.stat(fobj.resolved)
                if checksum is not None:
                    keydict[fobj.resolved] = [fobj_stat.st_size, checksum]
                else:
                    keydict[fobj.resolved] = [
                        fobj_stat.st_size,
                        int(fobj_stat.st_mtime * 1000)
                    ]

        interesting = {
            "DockerRequirement",
            "EnvVarRequirement",
            "CreateFileRequirement",
            "ShellCommandRequirement"
        }
        for rh in (self.original_requirements, self.original_hints):
            for r in reversed(rh):
                if r["class"] in interesting and r["class"] not in keydict:
                    keydict[r["class"]] = r

        keydictstr = json_dumps(keydict, separators=(',', ':'), sort_keys=True)
        cachekey = hashlib.md5(keydictstr.encode('utf-8')).hexdigest()

        _logger.debug("[job %s] keydictstr is %s -> %s", jobname, keydictstr, cachekey)

        jobcache = os.path.join(runtimeContext.cachedir, cachekey)
        jobcachepending = "{}.{}.pending".format(
            jobcache, threading.current_thread().ident)

        if os.path.isdir(jobcache) and not os.path.isfile(jobcachepending):
            if docker_req and runtimeContext.use_container:
                cachebuilder.outdir = runtimeContext.docker_outdir or random_outdir()
            else:
                cachebuilder.outdir = jobcache

            _logger.info("[job %s] Using cached output in %s", jobname, jobcache)
            yield CallbackJob(self, output_callbacks, cachebuilder, jobcache)
            return
        else:
            _logger.info("[job %s] Output of job will be cached in %s", jobname, jobcache)
            shutil.rmtree(jobcache, True)
            os.makedirs(jobcache)
            runtimeContext = runtimeContext.copy()
            runtimeContext.outdir = jobcache
            open(jobcachepending, "w").close()

            def rm_pending_output_callback(output_callbacks, jobcachepending,
                                           outputs, processStatus):
                if processStatus == "success":
                    os.remove(jobcachepending)
                output_callbacks(outputs, processStatus)

            output_callbacks = partial(rm_pending_output_callback,
                                       output_callbacks, jobcachepending)

    builder = self._init_job(job_order, runtimeContext)

    reffiles = copy.deepcopy(builder.files)

    j = self.make_job_runner(runtimeContext)(
        builder, builder.job, self.make_path_mapper,
        self.requirements, self.hints, jobname)
    j.prov_obj = self.prov_obj

    j.successCodes = self.tool.get("successCodes", [])
    j.temporaryFailCodes = self.tool.get("temporaryFailCodes", [])
    j.permanentFailCodes = self.tool.get("permanentFailCodes", [])

    debug = _logger.isEnabledFor(logging.DEBUG)

    if debug:
        _logger.debug(u"[job %s] initializing from %s%s",
                      j.name,
                      self.tool.get("id", ""),
                      u" as part of %s" % runtimeContext.part_of
                      if runtimeContext.part_of else "")
        _logger.debug(u"[job %s] %s", j.name, json_dumps(job_order, indent=4))

    builder.pathmapper = self.make_path_mapper(reffiles, builder.stagedir,
                                               runtimeContext, True)
    builder.requirements = j.requirements

    _check_adjust = partial(check_adjust, builder)

    visit_class([builder.files, builder.bindings], ("File", "Directory"), _check_adjust)

    initialWorkdir, _ = self.get_requirement("InitialWorkDirRequirement")
    if initialWorkdir is not None:
        ls = []  # type: List[Dict[Text, Any]]
        if isinstance(initialWorkdir["listing"], string_types):
            ls = builder.do_eval(initialWorkdir["listing"])
        else:
            for t in initialWorkdir["listing"]:
                if "entry" in t:
                    et = {u"entry": builder.do_eval(t["entry"], strip_whitespace=False)}
                    if "entryname" in t:
                        et["entryname"] = builder.do_eval(t["entryname"])
                    else:
                        et["entryname"] = None
                    et["writable"] = t.get("writable", False)
                    ls.append(et)
                else:
                    ls.append(builder.do_eval(t))
        for i, t in enumerate(ls):
            if "entry" in t:
                if isinstance(t["entry"], string_types):
                    ls[i] = {
                        "class": "File",
                        "basename": t["entryname"],
                        "contents": t["entry"],
                        "writable": t.get("writable")
                    }
                else:
                    if t.get("entryname") or t.get("writable"):
                        t = copy.deepcopy(t)
                        if t.get("entryname"):
                            t["entry"]["basename"] = t["entryname"]
                        t["entry"]["writable"] = t.get("writable")
                    ls[i] = t["entry"]
        j.generatefiles["listing"] = ls
        for l in ls:
            self.updatePathmap(builder.outdir, builder.pathmapper, l)
        visit_class([builder.files, builder.bindings],
                    ("File", "Directory"), _check_adjust)

    if debug:
        _logger.debug(u"[job %s] path mappings is %s", j.name,
                      json_dumps({p: builder.pathmapper.mapper(p)
                                  for p in builder.pathmapper.files()}, indent=4))

    if self.tool.get("stdin"):
        with SourceLine(self.tool, "stdin", validate.ValidationException, debug):
            j.stdin = builder.do_eval(self.tool["stdin"])
            assert j.stdin is not None
            reffiles.append({"class": "File", "path": j.stdin})

    if self.tool.get("stderr"):
        with SourceLine(self.tool, "stderr", validate.ValidationException, debug):
            j.stderr = builder.do_eval(self.tool["stderr"])
            assert j.stderr is not None
            if os.path.isabs(j.stderr) or ".." in j.stderr:
                raise validate.ValidationException(
                    "stderr must be a relative path, got '%s'" % j.stderr)

    if self.tool.get("stdout"):
        with SourceLine(self.tool, "stdout", validate.ValidationException, debug):
            j.stdout = builder.do_eval(self.tool["stdout"])
            assert j.stdout is not None
            if os.path.isabs(j.stdout) or ".." in j.stdout or not j.stdout:
                raise validate.ValidationException(
                    "stdout must be a relative path, got '%s'" % j.stdout)

    if debug:
        _logger.debug(u"[job %s] command line bindings is %s", j.name,
                      json_dumps(builder.bindings, indent=4))

    dockerReq, _ = self.get_requirement("DockerRequirement")
    if dockerReq is not None and runtimeContext.use_container:
        out_prefix = getdefault(runtimeContext.tmp_outdir_prefix, 'tmp')
        j.outdir = runtimeContext.outdir or \
            tempfile.mkdtemp(prefix=out_prefix)  # type: ignore
        tmpdir_prefix = getdefault(runtimeContext.tmpdir_prefix, 'tmp')
        j.tmpdir = runtimeContext.tmpdir or \
            tempfile.mkdtemp(prefix=tmpdir_prefix)  # type: ignore
        j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix)
    else:
        j.outdir = builder.outdir
        j.tmpdir = builder.tmpdir
        j.stagedir = builder.stagedir

    inplaceUpdateReq, _ = self.get_requirement(
        "http://commonwl.org/cwltool#InplaceUpdateRequirement")
    if inplaceUpdateReq is not None:
        j.inplace_update = inplaceUpdateReq["inplaceUpdate"]
    normalizeFilesDirs(j.generatefiles)

    readers = {}  # type: Dict[Text, Any]
    muts = set()  # type: Set[Text]

    if builder.mutation_manager is not None:
        def register_mut(f):
            muts.add(f["location"])
            builder.mutation_manager.register_mutation(j.name, f)

        def register_reader(f):
            if f["location"] not in muts:
                builder.mutation_manager.register_reader(j.name, f)
                readers[f["location"]] = copy.deepcopy(f)

        for li in j.generatefiles["listing"]:
            li = cast(Dict[Text, Any], li)
            if li.get("writable") and j.inplace_update:
                adjustFileObjs(li, register_mut)
                adjustDirObjs(li, register_mut)
            else:
                adjustFileObjs(li, register_reader)
                adjustDirObjs(li, register_reader)

        adjustFileObjs(builder.files, register_reader)
        adjustFileObjs(builder.bindings, register_reader)
        adjustDirObjs(builder.files, register_reader)
        adjustDirObjs(builder.bindings, register_reader)

    timelimit, _ = self.get_requirement(require_prefix + "TimeLimit")
    if timelimit is not None:
        with SourceLine(timelimit, "timelimit", validate.ValidationException, debug):
            j.timelimit = builder.do_eval(timelimit["timelimit"])
            if not isinstance(j.timelimit, int) or j.timelimit < 0:
                raise Exception(
                    "timelimit must be an integer >= 0, got: %s" % j.timelimit)

    if self.metadata["cwlVersion"] == "v1.0":
        j.networkaccess = True
    networkaccess, _ = self.get_requirement(require_prefix + "NetworkAccess")
    if networkaccess is not None:
        with SourceLine(networkaccess, "networkAccess", validate.ValidationException, debug):
            j.networkaccess = builder.do_eval(networkaccess["networkAccess"])
            if not isinstance(j.networkaccess, bool):
                raise Exception(
                    "networkAccess must be a boolean, got: %s" % j.networkaccess)

    j.environment = {}
    evr, _ = self.get_requirement("EnvVarRequirement")
    if evr is not None:
        for t in evr["envDef"]:
            j.environment[t["envName"]] = builder.do_eval(t["envValue"])

    shellcmd, _ = self.get_requirement("ShellCommandRequirement")
    if shellcmd is not None:
        cmd = []  # type: List[Text]
        for b in builder.bindings:
            arg = builder.generate_arg(b)
            if b.get("shellQuote", True):
                arg = [shellescape.quote(a) for a in aslist(arg)]
            cmd.extend(aslist(arg))
        j.command_line = ["/bin/sh", "-c", " ".join(cmd)]
    else:
        j.command_line = flatten(list(map(builder.generate_arg, builder.bindings)))

    j.pathmapper = builder.pathmapper
    j.collect_outputs = partial(
        self.collect_output_ports, self.tool["outputs"], builder,
        compute_checksum=getdefault(runtimeContext.compute_checksum, True),
        jobname=jobname,
        readers=readers)
    j.output_callback = output_callbacks

    yield j
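# Standalone sketch of the job-cache key computed in job() above: canonicalize the
# inputs that influence the command line into sorted JSON and hash them, so identical
# invocations map to the same cache directory. Field names below are illustrative.
import hashlib
import json

def _cache_key(cmdline, file_facts):
    keydict = {"cmdline": cmdline}
    keydict.update(file_facts)
    keydictstr = json.dumps(keydict, separators=(",", ":"), sort_keys=True)
    return hashlib.md5(keydictstr.encode("utf-8")).hexdigest()

# _cache_key(["echo", "hello"], {"/data/in.txt": [1024, "sha1$deadbeef"]})
# always yields the same hex digest for the same inputs.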
def __init__(self, toolpath_object, **kwargs):
    (_, self.names, _) = get_schema()
    self.tool = toolpath_object
    self.requirements = kwargs.get("requirements", []) + self.tool.get("requirements", [])
    self.hints = kwargs.get("hints", []) + self.tool.get("hints", [])
    if "loader" in kwargs:
        self.formatgraph = kwargs["loader"].graph

    self.validate_hints(self.tool.get("hints", []), strict=kwargs.get("strict"))

    self.schemaDefs = {}

    sd, _ = self.get_requirement("SchemaDefRequirement")
    if sd:
        sdtypes = sd["types"]
        av = schema_salad.schema.make_valid_avro(
            sdtypes, {t["name"]: t for t in sdtypes}, set())
        for i in av:
            self.schemaDefs[i["name"]] = i
        avro.schema.make_avsc_object(av, self.names)

    # Build record schema from inputs
    self.inputs_record_schema = {
        "name": "input_record_schema",
        "type": "record",
        "fields": []
    }
    self.outputs_record_schema = {
        "name": "outputs_record_schema",
        "type": "record",
        "fields": []
    }

    for key in ("inputs", "outputs"):
        for i in self.tool[key]:
            c = copy.copy(i)
            doc_url, _ = urlparse.urldefrag(c['id'])
            c["name"] = shortname(c["id"])
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(
                    "Missing `type` in parameter `%s`" % c["name"])

            if "default" in c and "null" not in aslist(c["type"]):
                c["type"] = ["null"] + aslist(c["type"])
            else:
                c["type"] = c["type"]

            if key == "inputs":
                self.inputs_record_schema["fields"].append(c)
            elif key == "outputs":
                self.outputs_record_schema["fields"].append(c)

    try:
        self.inputs_record_schema = schema_salad.schema.make_valid_avro(
            self.inputs_record_schema, {}, set())
        avro.schema.make_avsc_object(self.inputs_record_schema, self.names)
    except avro.schema.SchemaParseException as e:
        raise validate.ValidationException(
            "Got error `%s` while processing inputs of %s:\n%s"
            % (str(e), self.tool["id"],
               json.dumps(self.inputs_record_schema, indent=4)))

    try:
        self.outputs_record_schema = schema_salad.schema.make_valid_avro(
            self.outputs_record_schema, {}, set())
        avro.schema.make_avsc_object(self.outputs_record_schema, self.names)
    except avro.schema.SchemaParseException as e:
        raise validate.ValidationException(
            "Got error `%s` while processing outputs of %s:\n%s"
            % (str(e), self.tool["id"],
               json.dumps(self.outputs_record_schema, indent=4)))
def run(self, fileStore):
    cwljob = resolve_indirect(self.cwljob)
    if isinstance(self.step.tool["scatter"], string_types):
        scatter = [self.step.tool["scatter"]]
    else:
        scatter = self.step.tool["scatter"]
    scatterMethod = self.step.tool.get("scatterMethod", None)
    if len(scatter) == 1:
        scatterMethod = "dotproduct"
    outputs = []

    valueFrom = {
        shortname(i["id"]): i["valueFrom"]
        for i in self.step.tool["inputs"] if "valueFrom" in i
    }

    def postScatterEval(io):
        shortio = {shortname(k): v for k, v in iteritems(io)}
        for k in valueFrom:
            io.setdefault(k, None)

        def valueFromFunc(k, v):
            if k in valueFrom:
                return cwltool.expression.do_eval(
                    valueFrom[k], shortio, self.step.requirements,
                    None, None, {}, context=v)
            else:
                return v

        return {k: valueFromFunc(k, v) for k, v in list(io.items())}

    if scatterMethod == "dotproduct":
        for i in range(0, len(cwljob[shortname(scatter[0])])):
            copyjob = copy.copy(cwljob)
            for sc in [shortname(x) for x in scatter]:
                copyjob[sc] = cwljob[sc][i]
            copyjob = postScatterEval(copyjob)
            (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
    elif scatterMethod == "nested_crossproduct":
        outputs = self.nested_crossproduct_scatter(cwljob, scatter, postScatterEval)
    elif scatterMethod == "flat_crossproduct":
        self.flat_crossproduct_scatter(cwljob, scatter, outputs, postScatterEval)
    else:
        if scatterMethod:
            raise validate.ValidationException(
                "Unsupported complex scatter type '%s'" % scatterMethod)
        else:
            raise validate.ValidationException(
                "Must provide scatterMethod to scatter over multiple inputs")

    return outputs
def bind_input(
        self,
        schema: MutableMapping[str, Any],
        datum: Any,
        discover_secondaryFiles: bool,
        lead_pos: Optional[Union[int, List[int]]] = None,
        tail_pos: Optional[List[int]] = None,
) -> List[MutableMapping[str, Any]]:
    if tail_pos is None:
        tail_pos = []
    if lead_pos is None:
        lead_pos = []

    bindings = []  # type: List[MutableMapping[str, str]]
    binding = {}  # type: Union[MutableMapping[str, str], CommentedMap]
    value_from_expression = False
    if "inputBinding" in schema and isinstance(schema["inputBinding"], MutableMapping):
        binding = CommentedMap(schema["inputBinding"].items())

        bp = list(aslist(lead_pos))
        if "position" in binding:
            position = binding["position"]
            if isinstance(position, str):
                # no need to test the CWL Version;
                # the schema for v1.0 only allows ints
                binding["position"] = self.do_eval(position, context=datum)
                bp.append(binding["position"])
            else:
                bp.extend(aslist(binding["position"]))
        else:
            bp.append(0)
        bp.extend(aslist(tail_pos))
        binding["position"] = bp

        binding["datum"] = datum
        if "valueFrom" in binding:
            value_from_expression = True

    # Handle union types
    if isinstance(schema["type"], MutableSequence):
        bound_input = False
        for t in schema["type"]:
            avsc = None  # type: Optional[Schema]
            if isinstance(t, str) and self.names.has_name(t, ""):
                avsc = self.names.get_name(t, "")
            elif (isinstance(t, MutableMapping) and "name" in t
                  and self.names.has_name(t["name"], "")):
                avsc = self.names.get_name(t["name"], "")
            if not avsc:
                avsc = make_avsc_object(convert_to_dict(t), self.names)
            if validate.validate(avsc, datum):
                schema = copy.deepcopy(schema)
                schema["type"] = t
                if not value_from_expression:
                    return self.bind_input(
                        schema,
                        datum,
                        lead_pos=lead_pos,
                        tail_pos=tail_pos,
                        discover_secondaryFiles=discover_secondaryFiles,
                    )
                else:
                    self.bind_input(
                        schema,
                        datum,
                        lead_pos=lead_pos,
                        tail_pos=tail_pos,
                        discover_secondaryFiles=discover_secondaryFiles,
                    )
                    bound_input = True
        if not bound_input:
            raise validate.ValidationException(
                "'%s' is not a valid union %s" % (datum, schema["type"]))
    elif isinstance(schema["type"], MutableMapping):
        st = copy.deepcopy(schema["type"])
        if (binding and "inputBinding" not in st and "type" in st
                and st["type"] == "array" and "itemSeparator" not in binding):
            st["inputBinding"] = {}
        for k in ("secondaryFiles", "format", "streamable"):
            if k in schema:
                st[k] = schema[k]
        if value_from_expression:
            self.bind_input(
                st,
                datum,
                lead_pos=lead_pos,
                tail_pos=tail_pos,
                discover_secondaryFiles=discover_secondaryFiles,
            )
        else:
            bindings.extend(
                self.bind_input(
                    st,
                    datum,
                    lead_pos=lead_pos,
                    tail_pos=tail_pos,
                    discover_secondaryFiles=discover_secondaryFiles,
                ))
    else:
        if schema["type"] in self.schemaDefs:
            schema = self.schemaDefs[schema["type"]]

        if schema["type"] == "record":
            for f in schema["fields"]:
                if f["name"] in datum and datum[f["name"]] is not None:
                    bindings.extend(
                        self.bind_input(
                            f,
                            datum[f["name"]],
                            lead_pos=lead_pos,
                            tail_pos=f["name"],
                            discover_secondaryFiles=discover_secondaryFiles,
                        ))
                else:
                    datum[f["name"]] = f.get("default")

        if schema["type"] == "array":
            for n, item in enumerate(datum):
                b2 = None
                if binding:
                    b2 = copy.deepcopy(binding)
                    b2["datum"] = item
                itemschema = {"type": schema["items"], "inputBinding": b2}
                for k in ("secondaryFiles", "format", "streamable"):
                    if k in schema:
                        itemschema[k] = schema[k]
                bindings.extend(
                    self.bind_input(
                        itemschema,
                        item,
                        lead_pos=n,
                        tail_pos=tail_pos,
                        discover_secondaryFiles=discover_secondaryFiles,
                    ))
            binding = {}

        def _capture_files(f):  # type: (Dict[str, str]) -> Dict[str, str]
            self.files.append(f)
            return f

        if schema["type"] == "File":
            self.files.append(datum)
            if (binding and binding.get("loadContents")) or schema.get("loadContents"):
                with self.fs_access.open(datum["location"], "rb") as f:
                    datum["contents"] = content_limit_respected_read(f)

            if "secondaryFiles" in schema:
                if "secondaryFiles" not in datum:
                    datum["secondaryFiles"] = []
                for sf in aslist(schema["secondaryFiles"]):
                    if "required" in sf:
                        sf_required = self.do_eval(sf["required"], context=datum)
                    else:
                        sf_required = True
                    if "$(" in sf["pattern"] or "${" in sf["pattern"]:
                        sfpath = self.do_eval(sf["pattern"], context=datum)
                    else:
                        sfpath = substitute(datum["basename"], sf["pattern"])

                    for sfname in aslist(sfpath):
                        if not sfname:
                            continue
                        found = False

                        if isinstance(sfname, str):
                            sf_location = (
                                datum["location"][0:datum["location"].rindex("/") + 1]
                                + sfname)
                            sfbasename = sfname
                        elif isinstance(sfname, MutableMapping):
                            sf_location = sfname["location"]
                            sfbasename = sfname["basename"]
                        else:
                            raise WorkflowException(
                                "Expected secondaryFile expression to return type 'str' "
                                "or 'MutableMapping', received '%s'" % (type(sfname)))

                        for d in datum["secondaryFiles"]:
                            if not d.get("basename"):
                                d["basename"] = d["location"][d["location"].rindex("/") + 1:]
                            if d["basename"] == sfbasename:
                                found = True

                        if not found:

                            def addsf(
                                    files: MutableSequence[MutableMapping[str, Any]],
                                    newsf: MutableMapping[str, Any],
                            ) -> None:
                                for f in files:
                                    if f["location"] == newsf["location"]:
                                        f["basename"] = newsf["basename"]
                                        return
                                files.append(newsf)

                            if isinstance(sfname, MutableMapping):
                                addsf(datum["secondaryFiles"], sfname)
                            elif discover_secondaryFiles and self.fs_access.exists(sf_location):
                                addsf(
                                    datum["secondaryFiles"],
                                    {
                                        "location": sf_location,
                                        "basename": sfname,
                                        "class": "File",
                                    },
                                )
                            elif sf_required:
                                raise WorkflowException(
                                    "Missing required secondary file '%s' from file object: %s"
                                    % (sfname, json_dumps(datum, indent=4)))

                normalizeFilesDirs(datum["secondaryFiles"])

            if "format" in schema:
                try:
                    check_format(datum, self.do_eval(schema["format"]), self.formatgraph)
                except validate.ValidationException as ve:
                    raise WorkflowException(
                        "Expected value of '%s' to have format %s but\n "
                        " %s" % (schema["name"], schema["format"], ve)) from ve

            visit_class(
                datum.get("secondaryFiles", []),
                ("File", "Directory"),
                _capture_files,
            )

        if schema["type"] == "Directory":
            ll = schema.get("loadListing") or self.loadListing
            if ll and ll != "no_listing":
                get_listing(self.fs_access, datum, (ll == "deep_listing"))
            self.files.append(datum)

        if schema["type"] == "Any":
            visit_class(datum, ("File", "Directory"), _capture_files)

    # Position to front of the sort key
    if binding:
        for bi in bindings:
            bi["position"] = binding["position"] + bi["position"]
        bindings.append(binding)

    return bindings
def __init__(self, toolpath_object: MutableMapping[str, Any],
             loadingContext: LoadingContext) -> None:
    """Build a Process object from the provided dictionary."""
    super(Process, self).__init__()
    self.metadata = getdefault(loadingContext.metadata, {})  # type: Dict[str, Any]
    self.provenance_object = None  # type: Optional[ProvenanceProfile]
    self.parent_wf = None  # type: Optional[ProvenanceProfile]
    global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY  # pylint: disable=global-statement
    if SCHEMA_FILE is None or SCHEMA_ANY is None or SCHEMA_DIR is None:
        get_schema("v1.0")
        SCHEMA_ANY = cast(
            Dict[str, Any],
            SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"],
        )
        SCHEMA_FILE = cast(
            Dict[str, Any],
            SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"],
        )
        SCHEMA_DIR = cast(
            Dict[str, Any],
            SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#Directory"],
        )

    self.names = schema.make_avro_schema(
        [SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY], Loader({}))
    self.tool = toolpath_object
    self.requirements = copy.deepcopy(getdefault(loadingContext.requirements, []))
    self.requirements.extend(self.tool.get("requirements", []))
    if "id" not in self.tool:
        self.tool["id"] = "_:" + str(uuid.uuid4())
    self.requirements.extend(
        get_overrides(getdefault(loadingContext.overrides_list, []),
                      self.tool["id"]).get("requirements", []))
    self.hints = copy.deepcopy(getdefault(loadingContext.hints, []))
    self.hints.extend(self.tool.get("hints", []))
    # Versions of requirements and hints which aren't mutated.
    self.original_requirements = copy.deepcopy(self.requirements)
    self.original_hints = copy.deepcopy(self.hints)
    self.doc_loader = loadingContext.loader
    self.doc_schema = loadingContext.avsc_names

    self.formatgraph = None  # type: Optional[Graph]
    if self.doc_loader is not None:
        self.formatgraph = self.doc_loader.graph

    checkRequirements(self.tool, supportedProcessRequirements)
    self.validate_hints(
        loadingContext.avsc_names,
        self.tool.get("hints", []),
        strict=getdefault(loadingContext.strict, False),
    )

    self.schemaDefs = {}  # type: Dict[str, Dict[str, Any]]

    sd, _ = self.get_requirement("SchemaDefRequirement")

    if sd is not None:
        sdtypes = avroize_type(sd["types"])
        av = schema.make_valid_avro(sdtypes, {t["name"]: t for t in sdtypes}, set())
        for i in av:
            self.schemaDefs[i["name"]] = i  # type: ignore
        schema.make_avsc_object(schema.convert_to_dict(av), self.names)

    # Build record schema from inputs
    self.inputs_record_schema = {
        "name": "input_record_schema",
        "type": "record",
        "fields": [],
    }  # type: Dict[str, Any]
    self.outputs_record_schema = {
        "name": "outputs_record_schema",
        "type": "record",
        "fields": [],
    }  # type: Dict[str, Any]

    for key in ("inputs", "outputs"):
        for i in self.tool[key]:
            c = copy.deepcopy(i)
            c["name"] = shortname(c["id"])
            del c["id"]

            if "type" not in c:
                raise validate.ValidationException(
                    "Missing 'type' in parameter '{}'".format(c["name"]))

            if "default" in c and "null" not in aslist(c["type"]):
                nullable = ["null"]
                nullable.extend(aslist(c["type"]))
                c["type"] = nullable
            else:
                c["type"] = c["type"]
            c["type"] = avroize_type(c["type"], c["name"])
            if key == "inputs":
                self.inputs_record_schema["fields"].append(c)
            elif key == "outputs":
                self.outputs_record_schema["fields"].append(c)

    with SourceLine(toolpath_object, "inputs", validate.ValidationException):
        self.inputs_record_schema = cast(
            Dict[str, Any],
            schema.make_valid_avro(self.inputs_record_schema, {}, set()),
        )
        schema.make_avsc_object(
            schema.convert_to_dict(self.inputs_record_schema), self.names)
    with SourceLine(toolpath_object, "outputs", validate.ValidationException):
        self.outputs_record_schema = cast(
            Dict[str, Any],
            schema.make_valid_avro(self.outputs_record_schema, {}, set()),
        )
        schema.make_avsc_object(
            schema.convert_to_dict(self.outputs_record_schema), self.names)

    if toolpath_object.get("class") is not None and not getdefault(
            loadingContext.disable_js_validation, False):
        if loadingContext.js_hint_options_file is not None:
            try:
                with open(loadingContext.js_hint_options_file) as options_file:
                    validate_js_options = json.load(options_file)
            except (OSError, ValueError) as err:
                _logger.error(
                    "Failed to read options file %s",
                    loadingContext.js_hint_options_file,
                )
                raise
        else:
            validate_js_options = None
        if self.doc_schema is not None:
            validate_js_expressions(
                cast(CommentedMap, toolpath_object),
                self.doc_schema.names[toolpath_object["class"]],
                validate_js_options,
            )

    dockerReq, is_req = self.get_requirement("DockerRequirement")

    if (dockerReq is not None and "dockerOutputDirectory" in dockerReq
            and is_req is not None and not is_req):
        _logger.warning(
            SourceLine(item=dockerReq, raise_type=str).makeError(
                "When 'dockerOutputDirectory' is declared, DockerRequirement "
                "should go in the 'requirements' section, not 'hints'."))

    if (dockerReq is not None and is_req is not None
            and dockerReq.get("dockerOutputDirectory") == "/var/spool/cwl"):
        if is_req:
            # In this specific case, it is legal to have /var/spool/cwl, so skip the check.
            pass
        else:
            # Must be a requirement
            var_spool_cwl_detector(self.tool)
    else:
        var_spool_cwl_detector(self.tool)
def __init__(self,
             toolpath_object,  # type: MutableMapping[Text, Any]
             loadingContext    # type: LoadingContext
            ):  # type: (...) -> None
    """Initialize this Workflow."""
    super(Workflow, self).__init__(toolpath_object, loadingContext)
    self.provenance_object = None  # type: Optional[ProvenanceProfile]
    if loadingContext.research_obj is not None:
        run_uuid = None  # type: Optional[UUID]
        is_master = not loadingContext.prov_obj  # Not yet set
        if is_master:
            run_uuid = loadingContext.research_obj.ro_uuid

        self.provenance_object = ProvenanceProfile(
            loadingContext.research_obj,
            full_name=loadingContext.cwl_full_name,
            host_provenance=loadingContext.host_provenance,
            user_provenance=loadingContext.user_provenance,
            orcid=loadingContext.orcid,
            run_uuid=run_uuid,
            fsaccess=loadingContext.research_obj.fsaccess
        )  # inherit RO UUID for master wf run
        # TODO: Is Workflow(..) only called when we are the master workflow?
        self.parent_wf = self.provenance_object

    # FIXME: Won't this overwrite prov_obj for nested workflows?
    loadingContext.prov_obj = self.provenance_object
    loadingContext = loadingContext.copy()
    loadingContext.requirements = self.requirements
    loadingContext.hints = self.hints

    self.steps = []  # type: List[WorkflowStep]
    validation_errors = []
    for index, step in enumerate(self.tool.get("steps", [])):
        try:
            self.steps.append(
                self.make_workflow_step(step, index, loadingContext,
                                        loadingContext.prov_obj))
        except validate.ValidationException as vexc:
            if _logger.isEnabledFor(logging.DEBUG):
                _logger.exception("Validation failed at")
            validation_errors.append(vexc)

    if validation_errors:
        raise validate.ValidationException("\n".join(
            str(v) for v in validation_errors))

    random.shuffle(self.steps)

    # statically validate data links instead of doing it at runtime.
    workflow_inputs = self.tool["inputs"]
    workflow_outputs = self.tool["outputs"]

    step_inputs = []  # type: List[Any]
    step_outputs = []  # type: List[Any]
    param_to_step = {}  # type: Dict[Text, Dict[Text, Any]]
    for step in self.steps:
        step_inputs.extend(step.tool["inputs"])
        step_outputs.extend(step.tool["outputs"])
        for s in step.tool["inputs"]:
            param_to_step[s["id"]] = step.tool

    if getdefault(loadingContext.do_validate, True):
        static_checker(workflow_inputs, workflow_outputs,
                       step_inputs, step_outputs, param_to_step)
def run(self, file_store):
    cwljob = resolve_indirect(self.cwljob)

    # `promises` dict
    # from: each parameter (workflow input or step output)
    #   that may be used as a "source" for a step input or workflow output
    #   parameter
    # to: the job that will produce that value.
    promises = {}

    # `jobs` dict from step id to job that implements that step.
    jobs = {}

    for inp in self.cwlwf.tool["inputs"]:
        promises[inp["id"]] = SelfJob(self, cwljob)

    alloutputs_fufilled = False
    while not alloutputs_fufilled:
        # Iteratively go over the workflow steps, scheduling jobs as their
        # dependencies can be fulfilled by upstream workflow inputs or
        # step outputs. Loop exits when the workflow outputs
        # are satisfied.

        alloutputs_fufilled = True

        for step in self.cwlwf.steps:
            if step.tool["id"] not in jobs:
                stepinputs_fufilled = True
                for inp in step.tool["inputs"]:
                    if "source" in inp:
                        for s in aslist(inp["source"]):
                            if s not in promises:
                                stepinputs_fufilled = False
                if stepinputs_fufilled:
                    jobobj = {}

                    for inp in step.tool["inputs"]:
                        key = shortname(inp["id"])
                        if "source" in inp:
                            if inp.get("linkMerge") or len(aslist(inp["source"])) > 1:
                                linkMerge = inp.get("linkMerge", "merge_nested")
                                if linkMerge == "merge_nested":
                                    jobobj[key] = (
                                        MergeInputsNested(
                                            [(shortname(s), promises[s].rv())
                                             for s in aslist(inp["source"])]))
                                elif linkMerge == "merge_flattened":
                                    jobobj[key] = (
                                        MergeInputsFlattened(
                                            [(shortname(s), promises[s].rv())
                                             for s in aslist(inp["source"])]))
                                else:
                                    raise validate.ValidationException(
                                        "Unsupported linkMerge '%s'" % linkMerge)
                            else:
                                inpSource = inp["source"]
                                if isinstance(inpSource, MutableSequence):
                                    # It seems that an input source with a
                                    # '#' in the name will be returned as a
                                    # CommentedSeq list by the yaml parser.
                                    inpSource = str(inpSource[0])
                                jobobj[key] = (shortname(inpSource),
                                               promises[inpSource].rv())

                        if "default" in inp:
                            if key in jobobj:
                                if isinstance(jobobj[key][1], Promise):
                                    d = copy.copy(inp["default"])
                                    jobobj[key] = DefaultWithSource(d, jobobj[key])
                                else:
                                    if jobobj[key][1][jobobj[key][0]] is None:
                                        d = copy.copy(inp["default"])
                                        jobobj[key] = ("default", {"default": d})
                            else:
                                d = copy.copy(inp["default"])
                                jobobj[key] = ("default", {"default": d})

                        if "valueFrom" in inp and "scatter" not in step.tool:
                            if key in jobobj:
                                jobobj[key] = StepValueFrom(
                                    inp["valueFrom"], jobobj[key],
                                    self.cwlwf.requirements)
                            else:
                                jobobj[key] = StepValueFrom(
                                    inp["valueFrom"], ("None", {"None": None}),
                                    self.cwlwf.requirements)

                    if "scatter" in step.tool:
                        wfjob = CWLScatter(step, IndirectDict(jobobj),
                                           self.runtime_context)
                        followOn = CWLGather(step, wfjob.rv())
                        wfjob.addFollowOn(followOn)
                    else:
                        (wfjob, followOn) = makeJob(
                            step.embedded_tool, IndirectDict(jobobj),
                            step.tool["inputs"], self.runtime_context)

                    jobs[step.tool["id"]] = followOn

                    connected = False
                    for inp in step.tool["inputs"]:
                        for s in aslist(inp.get("source", [])):
                            if (isinstance(promises[s], (CWLJobWrapper, CWLGather))
                                    and not promises[s].hasFollowOn(wfjob)):
                                promises[s].addFollowOn(wfjob)
                                connected = True
                            if (not isinstance(promises[s], (CWLJobWrapper, CWLGather))
                                    and not promises[s].hasChild(wfjob)):
                                promises[s].addChild(wfjob)
                                connected = True
                    if not connected:
                        # the workflow step has default inputs only & isn't
                        # connected to other jobs, so add it as child of
                        # this workflow.
                        self.addChild(wfjob)

                    for out in step.tool["outputs"]:
                        promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for source in aslist(inp.get("source", [])):
                        if source not in promises:
                            alloutputs_fufilled = False

        # may need a test
        for out in self.cwlwf.tool["outputs"]:
            if "source" in out:
                if out["source"] not in promises:
                    alloutputs_fufilled = False

    outobj = {}
    for out in self.cwlwf.tool["outputs"]:
        key = shortname(out["id"])
        if out.get("linkMerge") or len(aslist(out["outputSource"])) > 1:
            link_merge = out.get("linkMerge", "merge_nested")
            if link_merge == "merge_nested":
                outobj[key] = (
                    MergeInputsNested(
                        [(shortname(s), promises[s].rv())
                         for s in aslist(out["outputSource"])]))
            elif link_merge == "merge_flattened":
                outobj[key] = (
                    MergeInputsFlattened(
                        [(shortname(s), promises[s].rv())
                         for s in aslist(out["source"])]))
            else:
                raise validate.ValidationException(
                    "Unsupported linkMerge '{}'".format(link_merge))
        else:
            # A CommentedSeq of length one still appears here rarely -
            # not clear why from the CWL code. When it does, it breaks
            # the execution by causing a non-hashable type exception.
            # We simplify the list into its first (and only) element.
            src = simplify_list(out["outputSource"])
            outobj[key] = (shortname(src), promises[src].rv())

    return IndirectDict(outobj)
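# Standalone sketch of the scheduling loop in run() above: keep sweeping the steps,
# starting any step whose sources all have a producer already, until every workflow
# output has one. Step and parameter names below are hypothetical.
def _schedule(steps, workflow_inputs, workflow_outputs):
    promises = set(workflow_inputs)   # parameters with a known producer
    scheduled = []                    # step ids in the order they became runnable
    progress = True
    while progress and not all(o in promises for o in workflow_outputs):
        progress = False
        for step_id, sources, outputs in steps:
            if step_id not in scheduled and all(s in promises for s in sources):
                scheduled.append(step_id)
                promises.update(outputs)
                progress = True
    return scheduled

# _schedule([("align", ["reads"], ["bam"]), ("index", ["bam"], ["bai"])],
#           workflow_inputs=["reads"], workflow_outputs=["bai"])
# -> ["align", "index"]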
def bind_input(self, schema, datum, lead_pos=None, tail_pos=None):
    # type: (Dict[Text, Any], Any, Union[int, List[int]], List[int]) -> List[Dict[Text, Any]]
    if tail_pos is None:
        tail_pos = []
    if lead_pos is None:
        lead_pos = []
    bindings = []  # type: List[Dict[Text, Text]]
    binding = None  # type: Dict[Text, Any]
    if "inputBinding" in schema and isinstance(schema["inputBinding"], dict):
        binding = copy.copy(schema["inputBinding"])

        if "position" in binding:
            binding["position"] = aslist(lead_pos) + aslist(binding["position"]) + aslist(tail_pos)
        else:
            binding["position"] = aslist(lead_pos) + [0] + aslist(tail_pos)

        binding["datum"] = datum

    # Handle union types
    if isinstance(schema["type"], list):
        for t in schema["type"]:
            if isinstance(t, (str, Text)) and self.names.has_name(t, ""):
                avsc = self.names.get_name(t, "")
            elif isinstance(t, dict) and "name" in t and self.names.has_name(t["name"], ""):
                avsc = self.names.get_name(t["name"], "")
            else:
                avsc = avro.schema.make_avsc_object(t, self.names)
            if validate.validate(avsc, datum):
                schema = copy.deepcopy(schema)
                schema["type"] = t
                return self.bind_input(schema, datum, lead_pos=lead_pos, tail_pos=tail_pos)
        raise validate.ValidationException(
            u"'%s' is not a valid union %s" % (datum, schema["type"]))
    elif isinstance(schema["type"], dict):
        st = copy.deepcopy(schema["type"])
        if binding and "inputBinding" not in st and st["type"] == "array" \
                and "itemSeparator" not in binding:
            st["inputBinding"] = {}
        for k in ("secondaryFiles", "format", "streamable"):
            if k in schema:
                st[k] = schema[k]
        bindings.extend(
            self.bind_input(st, datum, lead_pos=lead_pos, tail_pos=tail_pos))
    else:
        if schema["type"] in self.schemaDefs:
            schema = self.schemaDefs[schema["type"]]

        if schema["type"] == "record":
            for f in schema["fields"]:
                if f["name"] in datum:
                    bindings.extend(
                        self.bind_input(f, datum[f["name"]],
                                        lead_pos=lead_pos, tail_pos=f["name"]))
                else:
                    datum[f["name"]] = f.get("default")

        if schema["type"] == "array":
            for n, item in enumerate(datum):
                b2 = None
                if binding:
                    b2 = copy.deepcopy(binding)
                    b2["datum"] = item
                itemschema = {
                    u"type": schema["items"],
                    u"inputBinding": b2
                }
                for k in ("secondaryFiles", "format", "streamable"):
                    if k in schema:
                        itemschema[k] = schema[k]
                bindings.extend(
                    self.bind_input(itemschema, item, lead_pos=n, tail_pos=tail_pos))
            binding = None

        if schema["type"] == "File":
            self.files.append(datum)
            if binding:
                if binding.get("loadContents"):
                    with self.fs_access.open(datum["location"], "rb") as f:
                        datum["contents"] = f.read(CONTENT_LIMIT)

            if "secondaryFiles" in schema:
                if "secondaryFiles" not in datum:
                    datum["secondaryFiles"] = []
                for sf in aslist(schema["secondaryFiles"]):
                    if isinstance(sf, dict) or "$(" in sf or "${" in sf:
                        secondary_eval = self.do_eval(sf, context=datum)
                        if isinstance(secondary_eval, string_types):
                            sfpath = {"location": secondary_eval, "class": "File"}
                        else:
                            sfpath = secondary_eval
                    else:
                        sfpath = {"location": substitute(datum["location"], sf),
                                  "class": "File"}
                    if isinstance(sfpath, list):
                        datum["secondaryFiles"].extend(sfpath)
                    else:
                        datum["secondaryFiles"].append(sfpath)
                normalizeFilesDirs(datum["secondaryFiles"])

            def _capture_files(f):
                self.files.append(f)
                return f

            visit_class(datum.get("secondaryFiles", []), ("File", "Directory"), _capture_files)

        if schema["type"] == "Directory":
            ll = self.loadListing or (binding and binding.get("loadListing"))
            if ll and ll != "no_listing":
                get_listing(self.fs_access, datum, (ll == "deep_listing"))
            self.files.append(datum)

    # Position to front of the sort key
    if binding:
        for bi in bindings:
            bi["position"] = binding["position"] + bi["position"]
        bindings.append(binding)

    return bindings
def bind_input(self, schema, datum, lead_pos=None, tail_pos=None, discover_secondaryFiles=False): # type: (Dict[Text, Any], Any, Union[int, List[int]], List[int], bool) -> List[Dict[Text, Any]] if tail_pos is None: tail_pos = [] if lead_pos is None: lead_pos = [] bindings = [] # type: List[Dict[Text,Text]] binding = None # type: Dict[Text,Any] value_from_expression = False if "inputBinding" in schema and isinstance(schema["inputBinding"], dict): binding = copy.copy(schema["inputBinding"]) if "position" in binding: binding["position"] = aslist(lead_pos) + aslist( binding["position"]) + aslist(tail_pos) else: binding["position"] = aslist(lead_pos) + [0] + aslist(tail_pos) binding["datum"] = datum if "valueFrom" in binding: value_from_expression = True # Handle union types if isinstance(schema["type"], list): bound_input = False for t in schema["type"]: if isinstance(t, (str, Text)) and self.names.has_name(t, ""): avsc = self.names.get_name(t, "") elif isinstance(t, dict) and "name" in t and self.names.has_name( t["name"], ""): avsc = self.names.get_name(t["name"], "") else: avsc = AvroSchemaFromJSONData(t, self.names) if validate.validate(avsc, datum): schema = copy.deepcopy(schema) schema["type"] = t if not value_from_expression: return self.bind_input( schema, datum, lead_pos=lead_pos, tail_pos=tail_pos, discover_secondaryFiles=discover_secondaryFiles) else: self.bind_input( schema, datum, lead_pos=lead_pos, tail_pos=tail_pos, discover_secondaryFiles=discover_secondaryFiles) bound_input = True if not bound_input: raise validate.ValidationException( u"'%s' is not a valid union %s" % (datum, schema["type"])) elif isinstance(schema["type"], dict): st = copy.deepcopy(schema["type"]) if binding and "inputBinding" not in st and st[ "type"] == "array" and "itemSeparator" not in binding: st["inputBinding"] = {} for k in ("secondaryFiles", "format", "streamable"): if k in schema: st[k] = schema[k] if value_from_expression: self.bind_input( st, datum, lead_pos=lead_pos, tail_pos=tail_pos, discover_secondaryFiles=discover_secondaryFiles) else: bindings.extend( self.bind_input( st, datum, lead_pos=lead_pos, tail_pos=tail_pos, discover_secondaryFiles=discover_secondaryFiles)) else: if schema["type"] in self.schemaDefs: schema = self.schemaDefs[schema["type"]] if schema["type"] == "record": for f in schema["fields"]: if f["name"] in datum: bindings.extend( self.bind_input( f, datum[f["name"]], lead_pos=lead_pos, tail_pos=f["name"], discover_secondaryFiles=discover_secondaryFiles )) else: datum[f["name"]] = f.get("default") if schema["type"] == "array": for n, item in enumerate(datum): b2 = None if binding: b2 = copy.deepcopy(binding) b2["datum"] = item itemschema = { u"type": schema["items"], u"inputBinding": b2 } for k in ("secondaryFiles", "format", "streamable"): if k in schema: itemschema[k] = schema[k] bindings.extend( self.bind_input( itemschema, item, lead_pos=n, tail_pos=tail_pos, discover_secondaryFiles=discover_secondaryFiles)) binding = None if schema["type"] == "File": self.files.append(datum) if (binding and binding.get("loadContents") ) or schema.get("loadContents"): with self.fs_access.open(datum["location"], "rb") as f: datum["contents"] = f.read(CONTENT_LIMIT) if "secondaryFiles" in schema: if "secondaryFiles" not in datum: datum["secondaryFiles"] = [] for sf in aslist(schema["secondaryFiles"]): if isinstance(sf, dict) or "$(" in sf or "${" in sf: sfpath = self.do_eval(sf, context=datum) else: sfpath = substitute(datum["basename"], sf) for sfname in aslist(sfpath): found = False for d 
in datum["secondaryFiles"]: if not d.get("basename"): d["basename"] = d["location"][ d["location"].rindex("/") + 1:] if d["basename"] == sfname: found = True if not found: if isinstance(sfname, dict): datum["secondaryFiles"].append(sfname) elif discover_secondaryFiles: datum["secondaryFiles"].append({ "location": datum["location"] [0:datum["location"].rindex("/") + 1] + sfname, "basename": sfname, "class": "File" }) else: raise WorkflowException( "Missing required secondary file '%s' from file object: %s" % (sfname, json.dumps(datum, indent=4))) normalizeFilesDirs(datum["secondaryFiles"]) if "format" in schema: try: checkFormat(datum, self.do_eval(schema["format"]), self.formatgraph) except validate.ValidationException as ve: raise WorkflowException( "Expected value of '%s' to have format %s but\n %s" % (schema["name"], schema["format"], ve)) def _capture_files(f): self.files.append(f) return f visit_class(datum.get("secondaryFiles", []), ("File", "Directory"), _capture_files) if schema["type"] == "Directory": ll = self.loadListing or (binding and binding.get("loadListing")) if ll and ll != "no_listing": get_listing(self.fs_access, datum, (ll == "deep_listing")) self.files.append(datum) # Position to front of the sort key if binding: for bi in bindings: bi["position"] = binding["position"] + bi["position"] bindings.append(binding) return bindings
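# Note: the secondaryFiles patterns resolved above (via substitute()) follow the
# CWL suffix-substitution rule: a plain suffix is appended to the primary file's
# basename, and each leading "^" strips one extension first. A minimal
# illustrative sketch of that rule (substitute_suffix is a hypothetical helper,
# not cwltool's actual substitute()):

import os


def substitute_suffix(basename, pattern):
    # Each leading "^" removes one extension from the basename before the
    # remainder of the pattern is appended.
    while pattern.startswith("^"):
        basename = os.path.splitext(basename)[0]
        pattern = pattern[1:]
    return basename + pattern


# e.g. "reads.bam" + ".bai"  -> "reads.bam.bai"
#      "reads.bam" + "^.bai" -> "reads.bai"
assert substitute_suffix("reads.bam", ".bai") == "reads.bam.bai"
assert substitute_suffix("reads.bam", "^.bai") == "reads.bai"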
def static_checker(workflow_inputs, workflow_outputs, step_inputs, step_outputs, param_to_step): # type: (List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]], Dict[Text, Dict[Text, Any]]) -> None """Check if all source and sink types of a workflow are compatible before run time.""" # source parameters: workflow_inputs and step_outputs # sink parameters: step_inputs and workflow_outputs # make a dictionary of source parameters, indexed by the "id" field src_parms = workflow_inputs + step_outputs src_dict = {} for parm in src_parms: src_dict[parm["id"]] = parm step_inputs_val = check_all_types(src_dict, step_inputs, "source") workflow_outputs_val = check_all_types(src_dict, workflow_outputs, "outputSource") warnings = step_inputs_val["warning"] + workflow_outputs_val["warning"] exceptions = step_inputs_val["exception"] + workflow_outputs_val[ "exception"] warning_msgs = [] exception_msgs = [] for warning in warnings: src = warning.src sink = warning.sink linkMerge = warning.linkMerge sinksf = sorted([ p["pattern"] for p in sink.get("secondaryFiles", []) if p.get("required", True) ]) srcsf = sorted([p["pattern"] for p in src.get("secondaryFiles", [])]) # Every secondaryFile required by the sink, should be declared # by the source missing = missing_subset(srcsf, sinksf) if missing: msg1 = "Parameter '%s' requires secondaryFiles %s but" % ( shortname(sink["id"]), missing) msg3 = SourceLine(src, "id").makeError( "source '%s' does not provide those secondaryFiles." % (shortname(src["id"]))) msg4 = SourceLine( src.get("_tool_entry", src), "secondaryFiles" ).makeError( "To resolve, add missing secondaryFiles patterns to definition of '%s' or" % (shortname(src["id"]))) msg5 = SourceLine( sink.get("_tool_entry", sink), "secondaryFiles" ).makeError( "mark missing secondaryFiles in definition of '%s' as optional." 
% shortname(sink["id"])) msg = SourceLine(sink).makeError( "%s\n%s" % (msg1, bullets([msg3, msg4, msg5], " "))) elif sink.get("not_connected"): msg = SourceLine(sink, "type").makeError( "'%s' is not an input parameter of %s, expected %s" % (shortname( sink["id"]), param_to_step[sink["id"]]["run"], ", ".join( shortname(s["id"]) for s in param_to_step[sink["id"]]["inputs"] if not s.get("not_connected")))) else: msg = SourceLine(src, "type").makeError( "Source '%s' of type %s may be incompatible" % (shortname(src["id"]), json_dumps(src["type"]))) + "\n" + \ SourceLine(sink, "type").makeError( " with sink '%s' of type %s" % (shortname(sink["id"]), json_dumps(sink["type"]))) if linkMerge is not None: msg += "\n" + SourceLine(sink).makeError( " source has linkMerge method %s" % linkMerge) warning_msgs.append(msg) for exception in exceptions: src = exception.src sink = exception.sink linkMerge = exception.linkMerge msg = SourceLine(src, "type").makeError( "Source '%s' of type %s is incompatible" % (shortname(src["id"]), json_dumps(src["type"]))) + "\n" + \ SourceLine(sink, "type").makeError( " with sink '%s' of type %s" % (shortname(sink["id"]), json_dumps(sink["type"]))) if linkMerge is not None: msg += "\n" + SourceLine(sink).makeError( " source has linkMerge method %s" % linkMerge) exception_msgs.append(msg) for sink in step_inputs: if ('null' != sink["type"] and 'null' not in sink["type"] and "source" not in sink and "default" not in sink and "valueFrom" not in sink): msg = SourceLine(sink).makeError( "Required parameter '%s' does not have source, default, or valueFrom expression" % shortname(sink["id"])) exception_msgs.append(msg) all_warning_msg = strip_dup_lineno("\n".join(warning_msgs)) all_exception_msg = strip_dup_lineno("\n".join(exception_msgs)) if warnings: _logger.warning("Workflow checker warning:\n%s", all_warning_msg) if exceptions: raise validate.ValidationException(all_exception_msg)
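# Note: the secondaryFiles warning above checks that every pattern required by
# the sink is also declared by the source; missing_subset() amounts to a
# difference over the sorted pattern lists. An illustrative stand-in (not the
# actual cwltool helper):

def missing_patterns(source_patterns, sink_patterns):
    # Return the sink-required patterns that the source does not provide.
    return [p for p in sink_patterns if p not in source_patterns]


# e.g. source declares [".bai"], sink requires [".bai", ".crai"] -> [".crai"]
assert missing_patterns([".bai"], [".bai", ".crai"]) == [".crai"]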
def job( self, job_order, # type: Mapping[str, str] output_callbacks, # type: Callable[[Any, Any], Any] runtimeContext, # type: RuntimeContext ): # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None] workReuse, _ = self.get_requirement("WorkReuse") enableReuse = workReuse.get("enableReuse", True) if workReuse else True jobname = uniquename(runtimeContext.name or shortname(self.tool.get("id", "job"))) if runtimeContext.cachedir and enableReuse: cachecontext = runtimeContext.copy() cachecontext.outdir = "/out" cachecontext.tmpdir = "/tmp" # nosec cachecontext.stagedir = "/stage" cachebuilder = self._init_job(job_order, cachecontext) cachebuilder.pathmapper = PathMapper( cachebuilder.files, runtimeContext.basedir, cachebuilder.stagedir, separateDirs=False, ) _check_adjust = partial(check_adjust, cachebuilder) visit_class( [cachebuilder.files, cachebuilder.bindings], ("File", "Directory"), _check_adjust, ) cmdline = flatten( list(map(cachebuilder.generate_arg, cachebuilder.bindings))) docker_req, _ = self.get_requirement("DockerRequirement") if docker_req is not None and runtimeContext.use_container: dockerimg = docker_req.get("dockerImageId") or docker_req.get( "dockerPull") elif (runtimeContext.default_container is not None and runtimeContext.use_container): dockerimg = runtimeContext.default_container else: dockerimg = None if dockerimg is not None: cmdline = ["docker", "run", dockerimg] + cmdline # not really run using docker, just for hashing purposes keydict = { "cmdline": cmdline } # type: Dict[str, Union[Dict[str, Any], List[Any]]] for shortcut in ["stdin", "stdout", "stderr"]: if shortcut in self.tool: keydict[shortcut] = self.tool[shortcut] for location, fobj in cachebuilder.pathmapper.items(): if fobj.type == "File": checksum = next( (e["checksum"] for e in cachebuilder.files if "location" in e and e["location"] == location and "checksum" in e and e["checksum"] != "sha1$hash"), None, ) fobj_stat = os.stat(fobj.resolved) if checksum is not None: keydict[fobj.resolved] = [fobj_stat.st_size, checksum] else: keydict[fobj.resolved] = [ fobj_stat.st_size, int(fobj_stat.st_mtime * 1000), ] interesting = { "DockerRequirement", "EnvVarRequirement", "InitialWorkDirRequirement", "ShellCommandRequirement", "NetworkAccess", } for rh in (self.original_requirements, self.original_hints): for r in reversed(rh): if r["class"] in interesting and r["class"] not in keydict: keydict[r["class"]] = r keydictstr = json_dumps(keydict, separators=(",", ":"), sort_keys=True) cachekey = hashlib.md5( keydictstr.encode("utf-8")).hexdigest() # nosec _logger.debug("[job %s] keydictstr is %s -> %s", jobname, keydictstr, cachekey) jobcache = os.path.join(runtimeContext.cachedir, cachekey) # Create a lockfile to manage cache status. jobcachepending = "{}.status".format(jobcache) jobcachelock = None jobstatus = None # Opens the file for read/write, or creates an empty file. 
jobcachelock = open(jobcachepending, "a+") # get the shared lock to ensure no other process is trying # to write to this cache shared_file_lock(jobcachelock) jobcachelock.seek(0) jobstatus = jobcachelock.read() if os.path.isdir(jobcache) and jobstatus == "success": if docker_req and runtimeContext.use_container: cachebuilder.outdir = (runtimeContext.docker_outdir or random_outdir()) else: cachebuilder.outdir = jobcache _logger.info("[job %s] Using cached output in %s", jobname, jobcache) yield CallbackJob(self, output_callbacks, cachebuilder, jobcache) # we're done with the cache so release lock jobcachelock.close() return else: _logger.info("[job %s] Output of job will be cached in %s", jobname, jobcache) # turn shared lock into an exclusive lock since we'll # be writing the cache directory upgrade_lock(jobcachelock) shutil.rmtree(jobcache, True) os.makedirs(jobcache) runtimeContext = runtimeContext.copy() runtimeContext.outdir = jobcache def update_status_output_callback( output_callbacks: Callable[[List[Dict[str, Any]], str], None], jobcachelock: IO[Any], outputs: List[Dict[str, Any]], processStatus: str, ) -> None: # save status to the lockfile then release the lock jobcachelock.seek(0) jobcachelock.truncate() jobcachelock.write(processStatus) jobcachelock.close() output_callbacks(outputs, processStatus) output_callbacks = partial(update_status_output_callback, output_callbacks, jobcachelock) builder = self._init_job(job_order, runtimeContext) reffiles = copy.deepcopy(builder.files) j = self.make_job_runner(runtimeContext)( builder, builder.job, self.make_path_mapper, self.requirements, self.hints, jobname, ) j.prov_obj = self.prov_obj j.successCodes = self.tool.get("successCodes", []) j.temporaryFailCodes = self.tool.get("temporaryFailCodes", []) j.permanentFailCodes = self.tool.get("permanentFailCodes", []) debug = _logger.isEnabledFor(logging.DEBUG) if debug: _logger.debug( "[job %s] initializing from %s%s", j.name, self.tool.get("id", ""), " as part of %s" % runtimeContext.part_of if runtimeContext.part_of else "", ) _logger.debug("[job %s] %s", j.name, json_dumps(builder.job, indent=4)) builder.pathmapper = self.make_path_mapper(reffiles, builder.stagedir, runtimeContext, True) builder.requirements = j.requirements _check_adjust = partial(check_adjust, builder) visit_class([builder.files, builder.bindings], ("File", "Directory"), _check_adjust) initialWorkdir, _ = self.get_requirement("InitialWorkDirRequirement") if initialWorkdir is not None: ls = [] # type: List[Dict[str, Any]] if isinstance(initialWorkdir["listing"], str): ls = builder.do_eval(initialWorkdir["listing"]) else: for t in initialWorkdir["listing"]: if isinstance(t, Mapping) and "entry" in t: entry_exp = builder.do_eval(t["entry"], strip_whitespace=False) for entry in aslist(entry_exp): et = {"entry": entry} if "entryname" in t: et["entryname"] = builder.do_eval( t["entryname"]) else: et["entryname"] = None et["writable"] = t.get("writable", False) if et["entry"] is not None: ls.append(et) else: initwd_item = builder.do_eval(t) if not initwd_item: continue if isinstance(initwd_item, MutableSequence): ls.extend(initwd_item) else: ls.append(initwd_item) for i, t in enumerate(ls): if "entry" in t: if isinstance(t["entry"], str): ls[i] = { "class": "File", "basename": t["entryname"], "contents": t["entry"], "writable": t.get("writable"), } else: if t.get("entryname") or t.get("writable"): t = copy.deepcopy(t) if t.get("entryname"): t["entry"]["basename"] = t["entryname"] t["entry"]["writable"] = t.get("writable") ls[i] 
= t["entry"] j.generatefiles["listing"] = ls for l in ls: self.updatePathmap(builder.outdir, builder.pathmapper, l) visit_class([builder.files, builder.bindings], ("File", "Directory"), _check_adjust) if debug: _logger.debug( "[job %s] path mappings is %s", j.name, json_dumps( { p: builder.pathmapper.mapper(p) for p in builder.pathmapper.files() }, indent=4, ), ) if self.tool.get("stdin"): with SourceLine(self.tool, "stdin", validate.ValidationException, debug): j.stdin = builder.do_eval(self.tool["stdin"]) if j.stdin: reffiles.append({"class": "File", "path": j.stdin}) if self.tool.get("stderr"): with SourceLine(self.tool, "stderr", validate.ValidationException, debug): j.stderr = builder.do_eval(self.tool["stderr"]) if j.stderr: if os.path.isabs(j.stderr) or ".." in j.stderr: raise validate.ValidationException( "stderr must be a relative path, got '%s'" % j.stderr) if self.tool.get("stdout"): with SourceLine(self.tool, "stdout", validate.ValidationException, debug): j.stdout = builder.do_eval(self.tool["stdout"]) if j.stdout: if os.path.isabs( j.stdout) or ".." in j.stdout or not j.stdout: raise validate.ValidationException( "stdout must be a relative path, got '%s'" % j.stdout) if debug: _logger.debug( "[job %s] command line bindings is %s", j.name, json_dumps(builder.bindings, indent=4), ) dockerReq, _ = self.get_requirement("DockerRequirement") if dockerReq is not None and runtimeContext.use_container: out_dir, out_prefix = os.path.split( runtimeContext.tmp_outdir_prefix) j.outdir = runtimeContext.outdir or tempfile.mkdtemp( prefix=out_prefix, dir=out_dir) tmpdir_dir, tmpdir_prefix = os.path.split( runtimeContext.tmpdir_prefix) j.tmpdir = runtimeContext.tmpdir or tempfile.mkdtemp( prefix=tmpdir_prefix, dir=tmpdir_dir) j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix, dir=tmpdir_dir) else: j.outdir = builder.outdir j.tmpdir = builder.tmpdir j.stagedir = builder.stagedir inplaceUpdateReq, _ = self.get_requirement("InplaceUpdateRequirement") if inplaceUpdateReq is not None: j.inplace_update = inplaceUpdateReq["inplaceUpdate"] normalizeFilesDirs(j.generatefiles) readers = {} # type: Dict[str, Any] muts = set() # type: Set[str] if builder.mutation_manager is not None: def register_mut(f): # type: (Dict[str, Any]) -> None mm = cast(MutationManager, builder.mutation_manager) muts.add(f["location"]) mm.register_mutation(j.name, f) def register_reader(f): # type: (Dict[str, Any]) -> None mm = cast(MutationManager, builder.mutation_manager) if f["location"] not in muts: mm.register_reader(j.name, f) readers[f["location"]] = copy.deepcopy(f) for li in j.generatefiles["listing"]: li = cast(Dict[str, Any], li) if li.get("writable") and j.inplace_update: adjustFileObjs(li, register_mut) adjustDirObjs(li, register_mut) else: adjustFileObjs(li, register_reader) adjustDirObjs(li, register_reader) adjustFileObjs(builder.files, register_reader) adjustFileObjs(builder.bindings, register_reader) adjustDirObjs(builder.files, register_reader) adjustDirObjs(builder.bindings, register_reader) timelimit, _ = self.get_requirement("ToolTimeLimit") if timelimit is not None: with SourceLine(timelimit, "timelimit", validate.ValidationException, debug): j.timelimit = builder.do_eval(timelimit["timelimit"]) if not isinstance(j.timelimit, int) or j.timelimit < 0: raise Exception( "timelimit must be an integer >= 0, got: %s" % j.timelimit) networkaccess, _ = self.get_requirement("NetworkAccess") if networkaccess is not None: with SourceLine(networkaccess, "networkAccess", validate.ValidationException, debug): 
j.networkaccess = builder.do_eval( networkaccess["networkAccess"]) if not isinstance(j.networkaccess, bool): raise Exception( "networkAccess must be a boolean, got: %s" % j.networkaccess) j.environment = {} evr, _ = self.get_requirement("EnvVarRequirement") if evr is not None: for t in evr["envDef"]: j.environment[t["envName"]] = builder.do_eval(t["envValue"]) shellcmd, _ = self.get_requirement("ShellCommandRequirement") if shellcmd is not None: cmd = [] # type: List[str] for b in builder.bindings: arg = builder.generate_arg(b) if b.get("shellQuote", True): arg = [shellescape.quote(a) for a in aslist(arg)] cmd.extend(aslist(arg)) j.command_line = ["/bin/sh", "-c", " ".join(cmd)] else: j.command_line = flatten( list(map(builder.generate_arg, builder.bindings))) j.pathmapper = builder.pathmapper j.collect_outputs = partial( self.collect_output_ports, self.tool["outputs"], builder, compute_checksum=getdefault(runtimeContext.compute_checksum, True), jobname=jobname, readers=readers, ) j.output_callback = output_callbacks yield j
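# Note: the caching branch above derives its reuse key from the would-be command
# line, the size/mtime (or checksum) of each input file, and the requirements
# that can affect the tool's behaviour. A simplified, hedged sketch of that
# keying idea (cache_key is illustrative, not cwltool's API):

import hashlib
import json
import os


def cache_key(cmdline, input_paths, requirements=()):
    # Canonicalize everything that influences the tool's output, then hash it.
    keydict = {"cmdline": cmdline}
    for path in input_paths:
        st = os.stat(path)
        keydict[path] = [st.st_size, int(st.st_mtime * 1000)]
    for req in requirements:
        keydict[req["class"]] = req
    keydictstr = json.dumps(keydict, separators=(",", ":"), sort_keys=True)
    return hashlib.md5(keydictstr.encode("utf-8")).hexdigest()  # nosec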
def run(self, fileStore):
    cwljob = resolve_indirect(self.cwljob)

    # `promises` dict
    # from: each parameter (workflow input or step output) that may be used as
    #   a "source" for a step input or workflow output parameter
    # to: the job that will produce that value.
    promises = {}

    # `jobs` dict from step id to job that implements that step.
    jobs = {}

    for inp in self.cwlwf.tool["inputs"]:
        promises[inp["id"]] = SelfJob(self, cwljob)

    alloutputs_fulfilled = False
    while not alloutputs_fulfilled:
        # Iteratively go over the workflow steps, scheduling jobs as their
        # dependencies can be fulfilled by upstream workflow inputs or
        # step outputs. Loop exits when the workflow outputs are satisfied.
        alloutputs_fulfilled = True

        for step in self.cwlwf.steps:
            if step.tool["id"] not in jobs:
                stepinputs_fulfilled = True
                for inp in step.tool["inputs"]:
                    if "source" in inp:
                        for s in aslist(inp["source"]):
                            if s not in promises:
                                stepinputs_fulfilled = False
                if stepinputs_fulfilled:
                    jobobj = {}

                    for inp in step.tool["inputs"]:
                        key = shortname(inp["id"])
                        if "source" in inp:
                            if inp.get("linkMerge") or len(aslist(inp["source"])) > 1:
                                linkMerge = inp.get("linkMerge", "merge_nested")
                                if linkMerge == "merge_nested":
                                    jobobj[key] = MergeInputsNested(
                                        [(shortname(s), promises[s].rv())
                                         for s in aslist(inp["source"])])
                                elif linkMerge == "merge_flattened":
                                    jobobj[key] = MergeInputsFlattened(
                                        [(shortname(s), promises[s].rv())
                                         for s in aslist(inp["source"])])
                                else:
                                    raise validate.ValidationException(
                                        "Unsupported linkMerge '%s'" % linkMerge)
                            else:
                                jobobj[key] = (shortname(inp["source"]),
                                               promises[inp["source"]].rv())
                        elif "default" in inp:
                            d = copy.copy(inp["default"])
                            jobobj[key] = ("default", {"default": d})

                        if "valueFrom" in inp and "scatter" not in step.tool:
                            if key in jobobj:
                                jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                            jobobj[key],
                                                            self.cwlwf.requirements)
                            else:
                                jobobj[key] = StepValueFrom(inp["valueFrom"],
                                                            ("None", {"None": None}),
                                                            self.cwlwf.requirements)

                    if "scatter" in step.tool:
                        wfjob = CWLScatter(step, IndirectDict(jobobj),
                                           **self.executor_options)
                        followOn = CWLGather(step, wfjob.rv())
                        wfjob.addFollowOn(followOn)
                    else:
                        (wfjob, followOn) = makeJob(step.embedded_tool,
                                                    IndirectDict(jobobj),
                                                    step_inputs=step.tool["inputs"],
                                                    **self.executor_options)

                    jobs[step.tool["id"]] = followOn

                    connected = False
                    for inp in step.tool["inputs"]:
                        for s in aslist(inp.get("source", [])):
                            if not promises[s].hasChild(wfjob):
                                promises[s].addChild(wfjob)
                                connected = True
                    if not connected:
                        # Workflow step has default inputs only and isn't
                        # connected to other jobs, so add it as a child of
                        # the workflow.
                        self.addChild(wfjob)

                    for out in step.tool["outputs"]:
                        promises[out["id"]] = followOn

                for inp in step.tool["inputs"]:
                    for s in aslist(inp.get("source", [])):
                        if s not in promises:
                            alloutputs_fulfilled = False

        # may need a test
        for out in self.cwlwf.tool["outputs"]:
            if "source" in out:
                if out["source"] not in promises:
                    alloutputs_fulfilled = False

    outobj = {}
    for out in self.cwlwf.tool["outputs"]:
        outobj[shortname(out["id"])] = (shortname(out["outputSource"]),
                                        promises[out["outputSource"]].rv())
    return IndirectDict(outobj)
def static_checker(workflow_inputs, workflow_outputs, step_inputs, step_outputs):
    # type: (List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]], List[Dict[Text, Any]]) -> None
    """Check if all source and sink types of a workflow are compatible before run time."""

    # source parameters: workflow_inputs and step_outputs
    # sink parameters: step_inputs and workflow_outputs

    # make a dictionary of source parameters, indexed by the "id" field
    src_parms = workflow_inputs + step_outputs
    src_dict = {}
    for parm in src_parms:
        src_dict[parm["id"]] = parm

    step_inputs_val = check_all_types(src_dict, step_inputs, "source")
    workflow_outputs_val = check_all_types(src_dict, workflow_outputs,
                                           "outputSource")

    warnings = step_inputs_val["warning"] + workflow_outputs_val["warning"]
    exceptions = step_inputs_val["exception"] + workflow_outputs_val["exception"]

    warning_msgs = []
    exception_msgs = []
    for warning in warnings:
        src = warning.src
        sink = warning.sink
        linkMerge = warning.linkMerge
        msg = SourceLine(src, "type").makeError(
            "Source '%s' of type %s is partially incompatible"
            % (shortname(src["id"]), json.dumps(src["type"]))) + "\n" + \
            SourceLine(sink, "type").makeError(
                " with sink '%s' of type %s"
                % (shortname(sink["id"]), json.dumps(sink["type"])))
        if linkMerge:
            msg += "\n" + SourceLine(sink).makeError(
                " sink has linkMerge method %s" % linkMerge)
        warning_msgs.append(msg)
    for exception in exceptions:
        src = exception.src
        sink = exception.sink
        linkMerge = exception.linkMerge
        msg = SourceLine(src, "type").makeError(
            "Source '%s' of type %s is incompatible"
            % (shortname(src["id"]), json.dumps(src["type"]))) + "\n" + \
            SourceLine(sink, "type").makeError(
                " with sink '%s' of type %s"
                % (shortname(sink["id"]), json.dumps(sink["type"])))
        if linkMerge:
            msg += "\n" + SourceLine(sink).makeError(
                " sink has linkMerge method %s" % linkMerge)
        exception_msgs.append(msg)

    for sink in step_inputs:
        if ('null' != sink["type"] and 'null' not in sink["type"]
                and "source" not in sink and "default" not in sink
                and "valueFrom" not in sink):
            msg = SourceLine(sink).makeError(
                "Required parameter '%s' does not have source, default, or valueFrom expression"
                % shortname(sink["id"]))
            exception_msgs.append(msg)

    all_warning_msg = "\n".join(warning_msgs)
    all_exception_msg = "\n".join(exception_msgs)

    if warnings:
        _logger.warning("Workflow checker warning:")
        _logger.warning(all_warning_msg)
    if exceptions:
        raise validate.ValidationException(all_exception_msg)
def __init__(self, toolpath_object, **kwargs): # type: (Dict[Text, Any], **Any) -> None """ kwargs: metadata: tool document metadata requirements: inherited requirements hints: inherited hints loader: schema_salad.ref_resolver.Loader used to load tool document avsc_names: CWL Avro schema object used to validate document strict: flag to determine strict validation (fail on unrecognized fields) """ self.metadata = kwargs.get("metadata", {}) # type: Dict[Text,Any] self.names = None # type: avro.schema.Names global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY # pylint: disable=global-statement if SCHEMA_FILE is None: get_schema("v1.0") SCHEMA_ANY = cast( Dict[Text, Any], SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"]) SCHEMA_FILE = cast( Dict[Text, Any], SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"]) SCHEMA_DIR = cast( Dict[Text, Any], SCHEMA_CACHE["v1.0"] [3].idx["https://w3id.org/cwl/cwl#Directory"]) names = schema_salad.schema.make_avro_schema( [SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY], schema_salad.ref_resolver.Loader({}))[0] if isinstance(names, avro.schema.SchemaParseException): raise names else: self.names = names self.tool = toolpath_object self.requirements = kwargs.get("requirements", []) + self.tool.get( "requirements", []) self.hints = kwargs.get("hints", []) + self.tool.get("hints", []) self.formatgraph = None # type: Graph if "loader" in kwargs: self.formatgraph = kwargs["loader"].graph self.doc_loader = kwargs["loader"] self.doc_schema = kwargs["avsc_names"] checkRequirements(self.tool, supportedProcessRequirements) self.validate_hints(kwargs["avsc_names"], self.tool.get("hints", []), strict=kwargs.get("strict")) self.schemaDefs = {} # type: Dict[Text,Dict[Text, Any]] sd, _ = self.get_requirement("SchemaDefRequirement") if sd: sdtypes = sd["types"] av = schema_salad.schema.make_valid_avro( sdtypes, {t["name"]: t for t in avroize_type(sdtypes)}, set()) for i in av: self.schemaDefs[i["name"]] = i avro.schema.make_avsc_object(av, self.names) # Build record schema from inputs self.inputs_record_schema = { "name": "input_record_schema", "type": "record", "fields": [] } # type: Dict[Text, Any] self.outputs_record_schema = { "name": "outputs_record_schema", "type": "record", "fields": [] } # type: Dict[Text, Any] for key in ("inputs", "outputs"): for i in self.tool[key]: c = copy.copy(i) c["name"] = shortname(c["id"]) del c["id"] if "type" not in c: raise validate.ValidationException( u"Missing `type` in parameter `%s`" % c["name"]) if "default" in c and "null" not in aslist(c["type"]): c["type"] = ["null"] + aslist(c["type"]) else: c["type"] = c["type"] c["type"] = avroize_type(c["type"], c["name"]) if key == "inputs": self.inputs_record_schema["fields"].append(c) elif key == "outputs": self.outputs_record_schema["fields"].append(c) try: self.inputs_record_schema = schema_salad.schema.make_valid_avro( self.inputs_record_schema, {}, set()) avro.schema.make_avsc_object(self.inputs_record_schema, self.names) except avro.schema.SchemaParseException as e: raise validate.ValidationException( u"Got error `%s` while processing inputs of %s:\n%s" % (Text(e), self.tool["id"], json.dumps(self.inputs_record_schema, indent=4))) try: self.outputs_record_schema = schema_salad.schema.make_valid_avro( self.outputs_record_schema, {}, set()) avro.schema.make_avsc_object(self.outputs_record_schema, self.names) except avro.schema.SchemaParseException as e: raise validate.ValidationException( u"Got error `%s` while processing outputs of %s:\n%s" % (Text(e), self.tool["id"], 
json.dumps(self.outputs_record_schema, indent=4)))
def job(self, joborder, output_callback, **kwargs): # type: (Dict[str,str], str, Callable[..., Any], **Any) -> Generator[Union[CommandLineJob, CallbackJob], None, None] jobname = uniquename(kwargs.get("name", shortname(self.tool.get("id", "job")))) if kwargs.get("cachedir"): cacheargs = kwargs.copy() cacheargs["outdir"] = "/out" cacheargs["tmpdir"] = "/tmp" cachebuilder = self._init_job(joborder, **cacheargs) cachebuilder.pathmapper = PathMapper(set((f["path"] for f in cachebuilder.files)), kwargs["basedir"]) cmdline = flatten(map(cachebuilder.generate_arg, cachebuilder.bindings)) (docker_req, docker_is_req) = self.get_requirement("DockerRequirement") if docker_req and kwargs.get("use_container") is not False: dockerimg = docker_req.get("dockerImageId") or docker_req.get("dockerPull") cmdline = ["docker", "run", dockerimg] + cmdline keydict = {"cmdline": cmdline} for _,f in cachebuilder.pathmapper.items(): st = os.stat(f[0]) keydict[f[0]] = [st.st_size, int(st.st_mtime * 1000)] interesting = {"DockerRequirement", "EnvVarRequirement", "CreateFileRequirement", "ShellCommandRequirement"} for rh in (self.requirements, self.hints): for r in reversed(rh): if r["class"] in interesting and r["class"] not in keydict: keydict[r["class"]] = r keydictstr = json.dumps(keydict, separators=(',',':'), sort_keys=True) cachekey = hashlib.md5(keydictstr).hexdigest() _logger.debug("[job %s] keydictstr is %s -> %s", jobname, keydictstr, cachekey) jobcache = os.path.join(kwargs["cachedir"], cachekey) jobcachepending = jobcache + ".pending" if os.path.isdir(jobcache) and not os.path.isfile(jobcachepending): if docker_req and kwargs.get("use_container") is not False: cachebuilder.outdir = kwargs.get("docker_outdir") or "/var/spool/cwl" else: cachebuilder.outdir = jobcache _logger.info("[job %s] Using cached output in %s", jobname, jobcache) yield CallbackJob(self, output_callback, cachebuilder, jobcache) return else: _logger.info("[job %s] Output of job will be cached in %s", jobname, jobcache) shutil.rmtree(jobcache, True) os.makedirs(jobcache) kwargs["outdir"] = jobcache open(jobcachepending, "w").close() def rm_pending_output_callback(output_callback, jobcachepending, outputs, processStatus): if processStatus == "success": os.remove(jobcachepending) output_callback(outputs, processStatus) output_callback = cast( Callable[..., Any], # known bug in mypy # https://github.com/python/mypy/issues/797 partial(rm_pending_output_callback, output_callback, jobcachepending)) builder = self._init_job(joborder, **kwargs) reffiles = set((f["path"] for f in builder.files)) j = self.makeJobRunner() j.builder = builder j.joborder = builder.job j.stdin = None j.stdout = None j.successCodes = self.tool.get("successCodes") j.temporaryFailCodes = self.tool.get("temporaryFailCodes") j.permanentFailCodes = self.tool.get("permanentFailCodes") j.requirements = self.requirements j.hints = self.hints j.name = jobname _logger.debug(u"[job %s] initializing from %s%s", j.name, self.tool.get("id", ""), u" as part of %s" % kwargs["part_of"] if "part_of" in kwargs else "") _logger.debug(u"[job %s] %s", j.name, json.dumps(joborder, indent=4)) builder.pathmapper = None if self.tool.get("stdin"): j.stdin = builder.do_eval(self.tool["stdin"]) reffiles.add(j.stdin) if self.tool.get("stdout"): j.stdout = builder.do_eval(self.tool["stdout"]) if os.path.isabs(j.stdout) or ".." 
in j.stdout: raise validate.ValidationException("stdout must be a relative path") builder.pathmapper = self.makePathMapper(reffiles, **kwargs) builder.requirements = j.requirements # map files to assigned path inside a container. We need to also explicitly # walk over input as implicit reassignment doesn't reach everything in builder.bindings def _check_adjust(f): # type: (Dict[str,Any]) -> Dict[str,Any] if not f.get("containerfs"): f["path"] = builder.pathmapper.mapper(f["path"])[1] f["containerfs"] = True return f _logger.debug(u"[job %s] path mappings is %s", j.name, json.dumps({p: builder.pathmapper.mapper(p) for p in builder.pathmapper.files()}, indent=4)) adjustFileObjs(builder.files, _check_adjust) adjustFileObjs(builder.bindings, _check_adjust) _logger.debug(u"[job %s] command line bindings is %s", j.name, json.dumps(builder.bindings, indent=4)) dockerReq = self.get_requirement("DockerRequirement")[0] if dockerReq and kwargs.get("use_container"): out_prefix = kwargs.get("tmp_outdir_prefix") j.outdir = kwargs.get("outdir") or tempfile.mkdtemp(prefix=out_prefix) tmpdir_prefix = kwargs.get('tmpdir_prefix') j.tmpdir = kwargs.get("tmpdir") or tempfile.mkdtemp(prefix=tmpdir_prefix) else: j.outdir = builder.outdir j.tmpdir = builder.tmpdir createFiles = self.get_requirement("CreateFileRequirement")[0] j.generatefiles = {} if createFiles: for t in createFiles["fileDef"]: j.generatefiles[builder.do_eval(t["filename"])] = copy.deepcopy(builder.do_eval(t["fileContent"])) j.environment = {} evr = self.get_requirement("EnvVarRequirement")[0] if evr: for t in evr["envDef"]: j.environment[t["envName"]] = builder.do_eval(t["envValue"]) shellcmd = self.get_requirement("ShellCommandRequirement")[0] if shellcmd: cmd = [] # type: List[str] for b in builder.bindings: arg = builder.generate_arg(b) if b.get("shellQuote", True): arg = [shellescape.quote(a) for a in aslist(arg)] cmd.extend(aslist(arg)) j.command_line = ["/bin/sh", "-c", " ".join(cmd)] else: j.command_line = flatten(map(builder.generate_arg, builder.bindings)) j.pathmapper = builder.pathmapper j.collect_outputs = partial( self.collect_output_ports, self.tool["outputs"], builder) j.output_callback = output_callback yield j
def job( self, job_order, # type: Dict[Text, Text] output_callbacks, # type: Callable[[Any, Any], Any] **kwargs # type: Any ): # type: (...) -> Generator[Union[CommandLineJob, CallbackJob], None, None] jobname = uniquename( kwargs.get("name", shortname(self.tool.get("id", "job")))) if kwargs.get("cachedir"): cacheargs = kwargs.copy() cacheargs["outdir"] = "/out" cacheargs["tmpdir"] = "/tmp" cacheargs["stagedir"] = "/stage" cachebuilder = self._init_job(job_order, **cacheargs) cachebuilder.pathmapper = PathMapper(cachebuilder.files, kwargs["basedir"], cachebuilder.stagedir, separateDirs=False) _check_adjust = partial(check_adjust, cachebuilder) adjustFileObjs(cachebuilder.files, _check_adjust) adjustFileObjs(cachebuilder.bindings, _check_adjust) adjustDirObjs(cachebuilder.files, _check_adjust) adjustDirObjs(cachebuilder.bindings, _check_adjust) cmdline = flatten( map(cachebuilder.generate_arg, cachebuilder.bindings)) (docker_req, docker_is_req) = self.get_requirement("DockerRequirement") if docker_req and kwargs.get("use_container") is not False: dockerimg = docker_req.get("dockerImageId") or docker_req.get( "dockerPull") cmdline = ["docker", "run", dockerimg] + cmdline keydict = {u"cmdline": cmdline} for _, f in cachebuilder.pathmapper.items(): if f.type == "File": st = os.stat(f.resolved) keydict[f.resolved] = [st.st_size, int(st.st_mtime * 1000)] interesting = { "DockerRequirement", "EnvVarRequirement", "CreateFileRequirement", "ShellCommandRequirement" } for rh in (self.requirements, self.hints): for r in reversed(rh): if r["class"] in interesting and r["class"] not in keydict: keydict[r["class"]] = r keydictstr = json.dumps(keydict, separators=(',', ':'), sort_keys=True) cachekey = hashlib.md5(keydictstr).hexdigest() _logger.debug("[job %s] keydictstr is %s -> %s", jobname, keydictstr, cachekey) jobcache = os.path.join(kwargs["cachedir"], cachekey) jobcachepending = jobcache + ".pending" if os.path.isdir(jobcache) and not os.path.isfile(jobcachepending): if docker_req and kwargs.get("use_container") is not False: cachebuilder.outdir = kwargs.get( "docker_outdir") or "/var/spool/cwl" else: cachebuilder.outdir = jobcache _logger.info("[job %s] Using cached output in %s", jobname, jobcache) yield CallbackJob(self, output_callbacks, cachebuilder, jobcache) return else: _logger.info("[job %s] Output of job will be cached in %s", jobname, jobcache) shutil.rmtree(jobcache, True) os.makedirs(jobcache) kwargs["outdir"] = jobcache open(jobcachepending, "w").close() def rm_pending_output_callback(output_callbacks, jobcachepending, outputs, processStatus): if processStatus == "success": os.remove(jobcachepending) output_callbacks(outputs, processStatus) output_callbacks = cast( Callable[..., Any], # known bug in mypy # https://github.com/python/mypy/issues/797 partial(rm_pending_output_callback, output_callbacks, jobcachepending)) builder = self._init_job(job_order, **kwargs) reffiles = copy.deepcopy(builder.files) j = self.makeJobRunner() j.builder = builder j.joborder = builder.job j.stdin = None j.stderr = None j.stdout = None j.successCodes = self.tool.get("successCodes") j.temporaryFailCodes = self.tool.get("temporaryFailCodes") j.permanentFailCodes = self.tool.get("permanentFailCodes") j.requirements = self.requirements j.hints = self.hints j.name = jobname if _logger.isEnabledFor(logging.DEBUG): _logger.debug( u"[job %s] initializing from %s%s", j.name, self.tool.get("id", ""), u" as part of %s" % kwargs["part_of"] if "part_of" in kwargs else "") _logger.debug(u"[job %s] %s", j.name, 
json.dumps(job_order, indent=4)) builder.pathmapper = None make_path_mapper_kwargs = kwargs if "stagedir" in make_path_mapper_kwargs: make_path_mapper_kwargs = make_path_mapper_kwargs.copy() del make_path_mapper_kwargs["stagedir"] builder.pathmapper = self.makePathMapper(reffiles, builder.stagedir, **make_path_mapper_kwargs) builder.requirements = j.requirements if _logger.isEnabledFor(logging.DEBUG): _logger.debug( u"[job %s] path mappings is %s", j.name, json.dumps( { p: builder.pathmapper.mapper(p) for p in builder.pathmapper.files() }, indent=4)) _check_adjust = partial(check_adjust, builder) adjustFileObjs(builder.files, _check_adjust) adjustFileObjs(builder.bindings, _check_adjust) adjustDirObjs(builder.files, _check_adjust) adjustDirObjs(builder.bindings, _check_adjust) if self.tool.get("stdin"): with SourceLine(self.tool, "stdin", validate.ValidationException): j.stdin = builder.do_eval(self.tool["stdin"]) reffiles.append({"class": "File", "path": j.stdin}) if self.tool.get("stderr"): with SourceLine(self.tool, "stderr", validate.ValidationException): j.stderr = builder.do_eval(self.tool["stderr"]) if os.path.isabs(j.stderr) or ".." in j.stderr: raise validate.ValidationException( "stderr must be a relative path, got '%s'" % j.stderr) if self.tool.get("stdout"): with SourceLine(self.tool, "stdout", validate.ValidationException): j.stdout = builder.do_eval(self.tool["stdout"]) if os.path.isabs(j.stdout) or ".." in j.stdout or not j.stdout: raise validate.ValidationException( "stdout must be a relative path, got '%s'" % j.stdout) if _logger.isEnabledFor(logging.DEBUG): _logger.debug(u"[job %s] command line bindings is %s", j.name, json.dumps(builder.bindings, indent=4)) dockerReq = self.get_requirement("DockerRequirement")[0] if dockerReq and kwargs.get("use_container"): out_prefix = kwargs.get("tmp_outdir_prefix") j.outdir = kwargs.get("outdir") or tempfile.mkdtemp( prefix=out_prefix) tmpdir_prefix = kwargs.get('tmpdir_prefix') j.tmpdir = kwargs.get("tmpdir") or tempfile.mkdtemp( prefix=tmpdir_prefix) j.stagedir = tempfile.mkdtemp(prefix=tmpdir_prefix) else: j.outdir = builder.outdir j.tmpdir = builder.tmpdir j.stagedir = builder.stagedir initialWorkdir = self.get_requirement("InitialWorkDirRequirement")[0] j.generatefiles = {"class": "Directory", "listing": [], "basename": ""} if initialWorkdir: ls = [] # type: List[Dict[Text, Any]] if isinstance(initialWorkdir["listing"], (str, Text)): ls = builder.do_eval(initialWorkdir["listing"]) else: for t in initialWorkdir["listing"]: if "entry" in t: et = {u"entry": builder.do_eval(t["entry"])} if "entryname" in t: et["entryname"] = builder.do_eval(t["entryname"]) else: et["entryname"] = None et["writable"] = t.get("writable", False) ls.append(et) else: ls.append(builder.do_eval(t)) for i, t in enumerate(ls): if "entry" in t: if isinstance(t["entry"], basestring): ls[i] = { "class": "File", "basename": t["entryname"], "contents": t["entry"], "writable": t.get("writable") } else: if t["entryname"] or t["writable"]: t = copy.deepcopy(t) if t["entryname"]: t["entry"]["basename"] = t["entryname"] t["entry"]["writable"] = t.get("writable") ls[i] = t["entry"] j.generatefiles[u"listing"] = ls normalizeFilesDirs(j.generatefiles) j.environment = {} evr = self.get_requirement("EnvVarRequirement")[0] if evr: for t in evr["envDef"]: j.environment[t["envName"]] = builder.do_eval(t["envValue"]) shellcmd = self.get_requirement("ShellCommandRequirement")[0] if shellcmd: cmd = [] # type: List[Text] for b in builder.bindings: arg = builder.generate_arg(b) if 
b.get("shellQuote", True): arg = [shellescape.quote(a) for a in aslist(arg)] cmd.extend(aslist(arg)) j.command_line = ["/bin/sh", "-c", " ".join(cmd)] else: j.command_line = flatten( map(builder.generate_arg, builder.bindings)) j.pathmapper = builder.pathmapper j.collect_outputs = partial(self.collect_output_ports, self.tool["outputs"], builder, compute_checksum=kwargs.get( "compute_checksum", True)) j.output_callback = output_callbacks yield j
def bind_input(self, schema, datum, lead_pos=[], tail_pos=[]): bindings = [] binding = None if "inputBinding" in schema and isinstance(schema["inputBinding"], dict): binding = copy.copy(schema["inputBinding"]) if "position" in binding: binding["position"] = aslist(lead_pos) + aslist( binding["position"]) + aslist(tail_pos) else: binding["position"] = aslist(lead_pos) + [0] + aslist(tail_pos) if "valueFrom" in binding: binding["do_eval"] = binding["valueFrom"] binding["valueFrom"] = datum # Handle union types if isinstance(schema["type"], list): for t in schema["type"]: if isinstance(t, basestring) and self.names.has_name(t, ""): avsc = self.names.get_name(t, "") elif isinstance(t, dict) and "name" in t and self.names.has_name( t["name"], ""): avsc = self.names.get_name(t["name"], "") else: avsc = avro.schema.make_avsc_object(t, self.names) if validate.validate(avsc, datum): schema = copy.deepcopy(schema) schema["type"] = t return self.bind_input(schema, datum, lead_pos=lead_pos, tail_pos=tail_pos) raise validate.ValidationException("'%s' is not a valid union %s" % (datum, schema["type"])) elif isinstance(schema["type"], dict): st = copy.deepcopy(schema["type"]) if binding and "inputBinding" not in st and "itemSeparator" not in binding and st[ "type"] in ("array", "map"): st["inputBinding"] = {} bindings.extend( self.bind_input(st, datum, lead_pos=lead_pos, tail_pos=tail_pos)) else: if schema["type"] in self.schemaDefs: schema = self.schemaDefs[schema["type"]] if schema["type"] == "record": for f in schema["fields"]: if f["name"] in datum: bindings.extend( self.bind_input(f, datum[f["name"]], lead_pos=lead_pos, tail_pos=f["name"])) else: datum[f["name"]] = f.get("default") if schema["type"] == "map": for n, item in datum.items(): b2 = None if binding: b2 = copy.deepcopy(binding) b2["valueFrom"] = [n, item] bindings.extend( self.bind_input( { "type": schema["values"], "inputBinding": b2 }, item, lead_pos=n, tail_pos=tail_pos)) binding = None if schema["type"] == "array": for n, item in enumerate(datum): b2 = None if binding: b2 = copy.deepcopy(binding) b2["valueFrom"] = item bindings.extend( self.bind_input( { "type": schema["items"], "inputBinding": b2 }, item, lead_pos=n, tail_pos=tail_pos)) binding = None if schema["type"] == "File": self.files.append(datum) if binding and binding.get("loadContents"): with self.fs_access.open(datum["path"], "rb") as f: datum["contents"] = f.read(CONTENT_LIMIT) if "secondaryFiles" in schema: if "secondaryFiles" not in datum: datum["secondaryFiles"] = [] for sf in aslist(schema["secondaryFiles"]): if isinstance(sf, dict) or "$(" in sf or "${" in sf: sfpath = self.do_eval(sf, context=datum) if isinstance(sfpath, basestring): sfpath = {"path": sfpath, "class": "File"} else: sfpath = { "path": substitute(datum["path"], sf), "class": "File" } if isinstance(sfpath, list): datum["secondaryFiles"].extend(sfpath) else: datum["secondaryFiles"].append(sfpath) for sf in datum.get("secondaryFiles", []): self.files.append(sf) # Position to front of the sort key if binding: for bi in bindings: bi["position"] = binding["position"] + bi["position"] bindings.append(binding) return bindings
def __init__(self, toolpath_object, pos, **kwargs): # type: (Dict[Text, Any], int, **Any) -> None if "id" in toolpath_object: self.id = toolpath_object["id"] else: self.id = "#step" + Text(pos) kwargs["requirements"] = kwargs.get( "requirements", []) + toolpath_object.get("requirements", []) kwargs["hints"] = kwargs.get("hints", []) + toolpath_object.get( "hints", []) try: if isinstance(toolpath_object["run"], dict): self.embedded_tool = kwargs.get("makeTool")( toolpath_object["run"], **kwargs) else: self.embedded_tool = load_tool( toolpath_object["run"], kwargs.get("makeTool"), kwargs, enable_dev=kwargs.get("enable_dev"), strict=kwargs.get("strict"), fetcher_constructor=kwargs.get("fetcher_constructor")) except validate.ValidationException as v: raise WorkflowException( u"Tool definition %s failed validation:\n%s" % (toolpath_object["run"], validate.indent(str(v)))) validation_errors = [] self.tool = toolpath_object = copy.deepcopy(toolpath_object) bound = set() for stepfield, toolfield in (("in", "inputs"), ("out", "outputs")): toolpath_object[toolfield] = [] for n, step_entry in enumerate(toolpath_object[stepfield]): if isinstance(step_entry, (str, unicode)): param = CommentedMap() # type: CommentedMap inputid = step_entry else: param = CommentedMap(step_entry.iteritems()) inputid = step_entry["id"] shortinputid = shortname(inputid) found = False for tool_entry in self.embedded_tool.tool[toolfield]: frag = shortname(tool_entry["id"]) if frag == shortinputid: param.update(tool_entry) # type: ignore found = True bound.add(frag) break if not found: if stepfield == "in": param["type"] = "Any" else: validation_errors.append( SourceLine(self.tool["out"], n).makeError( "Workflow step output '%s' does not correspond to" % shortname(step_entry)) + "\n" + SourceLine(self.embedded_tool.tool, "outputs"). 
makeError(" tool output (expected '%s')" % ("', '".join([ shortname(tool_entry["id"]) for tool_entry in self.embedded_tool.tool[toolfield] ])))) param["id"] = inputid param.lc.line = toolpath_object[stepfield].lc.data[n][0] param.lc.col = toolpath_object[stepfield].lc.data[n][1] param.lc.filename = toolpath_object[stepfield].lc.filename toolpath_object[toolfield].append(param) missing = [] for i, tool_entry in enumerate(self.embedded_tool.tool["inputs"]): if shortname(tool_entry["id"]) not in bound: if "null" not in tool_entry[ "type"] and "default" not in tool_entry: missing.append(shortname(tool_entry["id"])) if missing: validation_errors.append( SourceLine(self.tool, "in").makeError( "Step is missing required parameter%s '%s'" % ("s" if len(missing) > 1 else "", "', '".join(missing)))) if validation_errors: raise validate.ValidationException("\n".join(validation_errors)) super(WorkflowStep, self).__init__(toolpath_object, **kwargs) if self.embedded_tool.tool["class"] == "Workflow": (feature, _) = self.get_requirement("SubworkflowFeatureRequirement") if not feature: raise WorkflowException( "Workflow contains embedded workflow but SubworkflowFeatureRequirement not in requirements" ) if "scatter" in self.tool: (feature, _) = self.get_requirement("ScatterFeatureRequirement") if not feature: raise WorkflowException( "Workflow contains scatter but ScatterFeatureRequirement not in requirements" ) inputparms = copy.deepcopy(self.tool["inputs"]) outputparms = copy.deepcopy(self.tool["outputs"]) scatter = aslist(self.tool["scatter"]) method = self.tool.get("scatterMethod") if method is None and len(scatter) != 1: raise validate.ValidationException( "Must specify scatterMethod when scattering over multiple inputs" ) inp_map = {i["id"]: i for i in inputparms} for s in scatter: if s not in inp_map: raise validate.ValidationException( SourceLine(self.tool, "scatter").makeError( u"Scatter parameter '%s' does not correspond to an input parameter of this " u"step, expecting '%s'" % (shortname(s), "', '".join( shortname(k) for k in inp_map.keys())))) inp_map[s]["type"] = { "type": "array", "items": inp_map[s]["type"] } if self.tool.get("scatterMethod") == "nested_crossproduct": nesting = len(scatter) else: nesting = 1 for r in xrange(0, nesting): for op in outputparms: op["type"] = {"type": "array", "items": op["type"]} self.tool["inputs"] = inputparms self.tool["outputs"] = outputparms
def __init__(self, toolpath_object, **kwargs): # type: (Dict[Text, Any], **Any) -> None """ kwargs: metadata: tool document metadata requirements: inherited requirements hints: inherited hints loader: schema_salad.ref_resolver.Loader used to load tool document avsc_names: CWL Avro schema object used to validate document strict: flag to determine strict validation (fail on unrecognized fields) """ self.metadata = kwargs.get("metadata", {}) # type: Dict[Text,Any] self.names = None # type: schema.Names global SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY # pylint: disable=global-statement if SCHEMA_FILE is None: get_schema("v1.0") SCHEMA_ANY = cast( Dict[Text, Any], SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/salad#Any"]) SCHEMA_FILE = cast( Dict[Text, Any], SCHEMA_CACHE["v1.0"][3].idx["https://w3id.org/cwl/cwl#File"]) SCHEMA_DIR = cast( Dict[Text, Any], SCHEMA_CACHE["v1.0"] [3].idx["https://w3id.org/cwl/cwl#Directory"]) names = schema.make_avro_schema([SCHEMA_FILE, SCHEMA_DIR, SCHEMA_ANY], Loader({}))[0] if isinstance(names, schema.SchemaParseException): raise names else: self.names = names self.tool = toolpath_object self.requirements = (kwargs.get("requirements", []) + self.tool.get( "requirements", []) + get_overrides(kwargs.get( "overrides", []), self.tool["id"]).get("requirements", [])) self.hints = kwargs.get("hints", []) + self.tool.get("hints", []) self.formatgraph = None # type: Graph if "loader" in kwargs: self.formatgraph = kwargs["loader"].graph self.doc_loader = kwargs["loader"] self.doc_schema = kwargs["avsc_names"] checkRequirements(self.tool, supportedProcessRequirements) self.validate_hints(kwargs["avsc_names"], self.tool.get("hints", []), strict=kwargs.get("strict")) self.schemaDefs = {} # type: Dict[Text,Dict[Text, Any]] sd, _ = self.get_requirement("SchemaDefRequirement") if sd: sdtypes = sd["types"] av = schema.make_valid_avro( sdtypes, {t["name"]: t for t in avroize_type(sdtypes)}, set()) for i in av: self.schemaDefs[i["name"]] = i # type: ignore schema.AvroSchemaFromJSONData(av, self.names) # type: ignore # Build record schema from inputs self.inputs_record_schema = { "name": "input_record_schema", "type": "record", "fields": [] } # type: Dict[Text, Any] self.outputs_record_schema = { "name": "outputs_record_schema", "type": "record", "fields": [] } # type: Dict[Text, Any] for key in ("inputs", "outputs"): for i in self.tool[key]: c = copy.copy(i) c["name"] = shortname(c["id"]) del c["id"] if "type" not in c: raise validate.ValidationException(u"Missing 'type' in " "parameter '%s'" % c["name"]) if "default" in c and "null" not in aslist(c["type"]): c["type"] = ["null"] + aslist(c["type"]) else: c["type"] = c["type"] c["type"] = avroize_type(c["type"], c["name"]) if key == "inputs": self.inputs_record_schema["fields"].append(c) elif key == "outputs": self.outputs_record_schema["fields"].append(c) with SourceLine(toolpath_object, "inputs", validate.ValidationException): self.inputs_record_schema = cast( Dict[six.text_type, Any], schema.make_valid_avro(self.inputs_record_schema, {}, set())) schema.AvroSchemaFromJSONData(self.inputs_record_schema, self.names) with SourceLine(toolpath_object, "outputs", validate.ValidationException): self.outputs_record_schema = cast( Dict[six.text_type, Any], schema.make_valid_avro(self.outputs_record_schema, {}, set())) schema.AvroSchemaFromJSONData(self.outputs_record_schema, self.names) if toolpath_object.get("class") is not None and not kwargs.get( "disable_js_validation", False): if kwargs.get("js_hint_options_file") is not None: try: 
with open(kwargs["js_hint_options_file"]) as options_file: validate_js_options = json.load(options_file) except (OSError, ValueError) as e: _logger.error("Failed to read options file %s" % kwargs["js_hint_options_file"]) raise e else: validate_js_options = None validate_js_expressions( cast(CommentedMap, toolpath_object), self.doc_schema.names[toolpath_object["class"]], validate_js_options) dockerReq, is_req = self.get_requirement("DockerRequirement") if dockerReq and dockerReq.get("dockerOutputDirectory") and not is_req: _logger.warn( SourceLine(item=dockerReq, raise_type=Text).makeError( """When 'dockerOutputDirectory' is declared, DockerRequirement should go in the 'requirements' section, not 'hints'.""")) if dockerReq and dockerReq.get( "dockerOutputDirectory") == "/var/spool/cwl": if is_req: # In this specific case, it is legal to have /var/spool/cwl, so skip the check. pass else: # Must be a requirement var_spool_cwl_detector(self.tool) else: var_spool_cwl_detector(self.tool)