def run(self, fileStore):
    cwljob = resolve_indirect(self.cwljob)
    # Normalize the scatter declaration to a list of input ids.
    if isinstance(self.step.tool["scatter"], string_types):
        scatter = [self.step.tool["scatter"]]
    else:
        scatter = self.step.tool["scatter"]
    scatterMethod = self.step.tool.get("scatterMethod", None)
    if len(scatter) == 1:
        scatterMethod = "dotproduct"
    outputs = []

    valueFrom = {shortname(i["id"]): i["valueFrom"]
                 for i in self.step.tool["inputs"] if "valueFrom" in i}

    def postScatterEval(io):
        # Evaluate valueFrom expressions against the already-sliced inputs.
        shortio = {shortname(k): v for k, v in iteritems(io)}
        for k in valueFrom:
            io.setdefault(k, None)

        def valueFromFunc(k, v):
            if k in valueFrom:
                return cwltool.expression.do_eval(
                    valueFrom[k], shortio, self.step.requirements,
                    None, None, {}, context=v)
            else:
                return v
        return {k: valueFromFunc(k, v) for k, v in list(io.items())}

    if scatterMethod == "dotproduct":
        for i in range(0, len(cwljob[shortname(scatter[0])])):
            copyjob = copy.copy(cwljob)
            for sc in [shortname(x) for x in scatter]:
                copyjob[sc] = cwljob[sc][i]
            copyjob = postScatterEval(copyjob)
            (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
    elif scatterMethod == "nested_crossproduct":
        outputs = self.nested_crossproduct_scatter(cwljob, scatter, postScatterEval)
    elif scatterMethod == "flat_crossproduct":
        self.flat_crossproduct_scatter(cwljob, scatter, outputs, postScatterEval)
    else:
        if scatterMethod:
            raise validate.ValidationException(
                "Unsupported complex scatter type '%s'" % scatterMethod)
        else:
            raise validate.ValidationException(
                "Must provide scatterMethod to scatter over multiple inputs")

    return outputs
def job(self, joborder, output_callback, runtimeContext):
    runtimeContext = runtimeContext.copy()
    runtimeContext.toplevel = True  # Preserve behavior for #13365
    builder = make_builder({shortname(k): v for k, v in viewitems(joborder)},
                           self.hints, self.requirements, runtimeContext)
    runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext)
    return super(ArvadosWorkflowStep, self).job(joborder, output_callback, runtimeContext)
def to_script(ctx, path, job_path, **kwds):
    if schema_salad is None:
        raise Exception("This functionality requires schema_salad and Python 2.7.")
    if cwltool is None:
        raise Exception("This functionality requires cwltool and Python 2.7.")

    uri = "file://" + os.path.abspath(job_path)
    loader = schema_salad.ref_resolver.Loader({
        "@base": uri,
        "path": {
            "@type": "@id"
        }
    })
    job, _ = loader.resolve_ref(uri)

    t = load_tool(path, False, False, cwltool.workflow.defaultMakeTool, True)
    if type(t) == int:
        return t

    process.checkRequirements(t.tool, cwl2script.supportedProcessRequirements)

    for inp in t.tool["inputs"]:
        if process.shortname(inp["id"]) in job:
            pass
        elif process.shortname(inp["id"]) not in job and "default" in inp:
            job[process.shortname(inp["id"])] = copy.copy(inp["default"])
        elif process.shortname(inp["id"]) not in job and inp["type"][0] == "null":
            pass
        else:
            raise Exception("Missing inputs `%s`" % process.shortname(inp["id"]))

    if not kwds.get("basedir", None):
        kwds["basedir"] = os.path.dirname(os.path.abspath(job_path))

    outdir = kwds.get("outdir")

    if t.tool["class"] == "Workflow":
        print(cwl2script.generateScriptForWorkflow(t, job, outdir))
    elif t.tool["class"] == "CommandLineTool":
        print(cwl2script.generateScriptForTool(t, job, outdir))

    return 0
def pipeline_component_spec(self):
    """Return a component that Workbench and a-r-p-i will understand.

    Specifically, translate CWL input specs to Arvados pipeline
    format, like {"dataclass":"File","value":"xyz"}.
    """
    spec = self.job.arvados_job_spec()

    # Most of the component spec is exactly the same as the job
    # spec (script, script_version, etc.).
    # spec['script_parameters'] isn't right, though. A component
    # spec's script_parameters hash is a translation of
    # self.tool.tool['inputs'] with defaults/overrides taken from
    # the job order. So we move the job parameters out of the way
    # and build a new spec['script_parameters'].
    job_params = spec['script_parameters']
    spec['script_parameters'] = {}

    for param in self.tool.tool['inputs']:
        param = copy.deepcopy(param)

        # Data type and "required" flag...
        types = param['type']
        if not isinstance(types, list):
            types = [types]
        param['required'] = 'null' not in types
        non_null_types = set(types) - set(['null'])
        if len(non_null_types) == 1:
            the_type = [c for c in non_null_types][0]
            dataclass = self.type_to_dataclass.get(the_type)
            if dataclass:
                param['dataclass'] = dataclass
        # Note: If we didn't figure out a single appropriate
        # dataclass, we just left that attribute out. We leave
        # the "type" attribute there in any case, which might help
        # downstream.

        # Title and description...
        title = param.pop('label', '')
        descr = param.pop('doc', '').rstrip('\n')
        if title:
            param['title'] = title
        if descr:
            param['description'] = descr

        # Fill in the value from the current job order, if any.
        param_id = shortname(param.pop('id'))
        value = job_params.get(param_id)
        if value is None:
            pass
        elif not isinstance(value, dict):
            param['value'] = value
        elif param.get('dataclass') == 'File' and value.get('location'):
            param['value'] = value['location']

        spec['script_parameters'][param_id] = param

    spec['script_parameters']['cwl:tool'] = job_params['cwl:tool']
    return spec
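# Illustration only: a minimal sketch of the translation performed above,
# using a hypothetical input named "reference", assuming type_to_dataclass
# maps "File" -> "File", and assuming the job order supplies location "xyz".
# None of these names or values come from the source.

# Hypothetical entry from self.tool.tool['inputs']:
cwl_input = {
    "id": "#main/reference",
    "type": ["null", "File"],
    "label": "Reference genome",
    "doc": "FASTA file to align against\n",
}

# With job_params["reference"] == {"class": "File", "location": "xyz"},
# the loop would emit roughly this script parameter:
script_parameter = {
    "type": ["null", "File"],
    "required": False,            # "null" appears in the type list
    "dataclass": "File",          # assumes type_to_dataclass maps "File" -> "File"
    "title": "Reference genome",  # from "label"
    "description": "FASTA file to align against",  # from "doc", newline stripped
    "value": "xyz",               # taken from value["location"]
}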
def postScatterEval(io):
    shortio = {shortname(k): v for k, v in iteritems(io)}

    def valueFromFunc(k, v):
        if k in valueFrom:
            return cwltool.expression.do_eval(
                valueFrom[k], shortio, self.step.requirements,
                None, None, {}, context=v)
        else:
            return v
    return {k: valueFromFunc(k, v) for k, v in io.items()}
def run(self, fileStore):
    cwljob = resolve_indirect(self.cwljob)
    if isinstance(self.step.tool["scatter"], basestring):
        scatter = [self.step.tool["scatter"]]
    else:
        scatter = self.step.tool["scatter"]
    scatterMethod = self.step.tool.get("scatterMethod", None)
    if len(scatter) == 1:
        scatterMethod = "dotproduct"
    outputs = []

    self.vfinputs = cwljob

    shortscatter = [shortname(s) for s in scatter]
    cwljob = {k: self.valueFromFunc(k, v) if k not in shortscatter else v
              for k, v in cwljob.items()}

    if scatterMethod == "dotproduct":
        for i in xrange(0, len(cwljob[shortname(scatter[0])])):
            copyjob = copy.copy(cwljob)
            for sc in scatter:
                scatter_key = shortname(sc)
                copyjob[scatter_key] = self.valueFromFunc(scatter_key, cwljob[scatter_key][i])
            (subjob, followOn) = makeJob(self.step.embedded_tool, copyjob,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
    elif scatterMethod == "nested_crossproduct":
        outputs = self.nested_crossproduct_scatter(cwljob, scatter)
    elif scatterMethod == "flat_crossproduct":
        self.flat_crossproduct_scatter(cwljob, scatter, outputs)
    else:
        if scatterMethod:
            raise validate.ValidationException(
                "Unsupported complex scatter type '%s'" % scatterMethod)
        else:
            raise validate.ValidationException(
                "Must provide scatterMethod to scatter over multiple inputs")

    return outputs
def flat_crossproduct_scatter(self, joborder, scatter_keys, outputs):
    scatter_key = shortname(scatter_keys[0])
    l = len(joborder[scatter_key])
    for n in xrange(0, l):
        jo = copy.copy(joborder)
        jo[scatter_key] = self.valueFromFunc(scatter_key, joborder[scatter_key][n])
        if len(scatter_keys) == 1:
            (subjob, followOn) = makeJob(self.step.embedded_tool, jo,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
        else:
            self.flat_crossproduct_scatter(jo, scatter_keys[1:], outputs)
def nested_crossproduct_scatter(self, joborder, scatter_keys):
    scatter_key = shortname(scatter_keys[0])
    l = len(joborder[scatter_key])
    outputs = []
    for n in xrange(0, l):
        jo = copy.copy(joborder)
        jo[scatter_key] = self.valueFromFunc(scatter_key, joborder[scatter_key][n])
        if len(scatter_keys) == 1:
            (subjob, followOn) = makeJob(self.step.embedded_tool, jo)
            self.addChild(subjob)
            outputs.append(followOn.rv())
        else:
            outputs.append(self.nested_crossproduct_scatter(jo, scatter_keys[1:]))
    return outputs
def nested_crossproduct_scatter(self, joborder, scatter_keys, postScatterEval):
    scatter_key = shortname(scatter_keys[0])
    l = len(joborder[scatter_key])
    outputs = []
    for n in xrange(0, l):
        jo = copy.copy(joborder)
        jo[scatter_key] = joborder[scatter_key][n]
        if len(scatter_keys) == 1:
            jo = postScatterEval(jo)
            (subjob, followOn) = makeJob(self.step.embedded_tool, jo,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
        else:
            outputs.append(self.nested_crossproduct_scatter(jo, scatter_keys[1:], postScatterEval))
    return outputs
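# Rough illustration (hypothetical inputs, promise objects elided) of how the
# two cross-product strategies differ only in the shape of the collected outputs.
# Scattering x = ["a", "b"] and y = [1, 2] over a step whose result is out(x, y):
#
# nested_crossproduct_scatter keeps one list level per scatter key:
#   [[out("a", 1), out("a", 2)],
#    [out("b", 1), out("b", 2)]]
#
# flat_crossproduct_scatter appends every leaf result into the single
# `outputs` list supplied by the caller:
#   [out("a", 1), out("a", 2), out("b", 1), out("b", 2)]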
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None, merged_map=None):

    packed = packed_workflow(arvRunner, tool, merged_map)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    # Use values from the job order as defaults for the workflow inputs.
    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    # Record the runner RAM request as a WorkflowRunnerResources hint on #main.
    if submit_runner_ram:
        hints = main.get("hints", [])
        found = False
        for h in hints:
            if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources":
                h["ramMin"] = submit_runner_ram
                found = True
                break
        if not found:
            hints.append({"class": "http://arvados.org/cwl#WorkflowRunnerResources",
                          "ramMin": submit_runner_ram})
        main["hints"] = hints

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition": json.dumps(packed, sort_keys=True, indent=4, separators=(',',': '))
        }}
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
def upload_tool_deps(deptool):
    if "id" in deptool:
        discovered_secondaryfiles = {}
        pm = upload_dependencies(arvrunner,
                                 "%s dependencies" % (shortname(deptool["id"])),
                                 document_loader,
                                 deptool,
                                 deptool["id"],
                                 False,
                                 include_primary=False,
                                 discovered_secondaryfiles=discovered_secondaryfiles)
        document_loader.idx[deptool["id"]] = deptool
        toolmap = {}
        for k, v in pm.items():
            toolmap[k] = v.resolved
        merged_map[deptool["id"]] = FileUpdates(toolmap, discovered_secondaryfiles)
def postScatterEval(io):
    shortio = {shortname(k): v for k, v in iteritems(io)}
    for k in valueFrom:
        io.setdefault(k, None)

    def valueFromFunc(k, v):
        if k in valueFrom:
            return cwltool.expression.do_eval(valueFrom[k], shortio,
                                              self.step.requirements,
                                              None, None, {}, context=v)
        else:
            return v
    return {k: valueFromFunc(k, v) for k, v in list(io.items())}
def nested_crossproduct_scatter(self, joborder, scatter_keys):
    scatter_key = shortname(scatter_keys[0])
    l = len(joborder[scatter_key])
    outputs = []
    for n in xrange(0, l):
        jo = copy.copy(joborder)
        jo[scatter_key] = self.valueFromFunc(scatter_key, joborder[scatter_key][n])
        if len(scatter_keys) == 1:
            (subjob, followOn) = makeJob(self.step.embedded_tool, jo,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
        else:
            outputs.append(
                self.nested_crossproduct_scatter(jo, scatter_keys[1:]))
    return outputs
def nested_crossproduct_scatter(self, joborder, scatter_keys, postScatterEval):
    scatter_key = shortname(scatter_keys[0])
    outputs = []
    for n in range(0, len(joborder[scatter_key])):
        jo = copy.copy(joborder)
        jo[scatter_key] = joborder[scatter_key][n]
        if len(scatter_keys) == 1:
            jo = postScatterEval(jo)
            (subjob, followOn) = makeJob(self.step.embedded_tool, jo,
                                         **self.executor_options)
            self.addChild(subjob)
            outputs.append(followOn.rv())
        else:
            outputs.append(
                self.nested_crossproduct_scatter(jo, scatter_keys[1:], postScatterEval))
    return outputs
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None,
                    submit_runner_ram=0, name=None):

    packed = packed_workflow(arvRunner, tool)

    adjustDirObjs(job_order, trim_listing)
    adjustFileObjs(job_order, trim_anonymous_location)
    adjustDirObjs(job_order, trim_anonymous_location)

    main = [p for p in packed["$graph"] if p["id"] == "#main"][0]
    for inp in main["inputs"]:
        sn = shortname(inp["id"])
        if sn in job_order:
            inp["default"] = job_order[sn]

    if not name:
        name = tool.tool.get("label", os.path.basename(tool.tool["id"]))

    upload_dependencies(arvRunner, name, tool.doc_loader,
                        packed, tool.tool["id"], False)

    # TODO nowhere for submit_runner_ram to go.

    body = {
        "workflow": {
            "name": name,
            "description": tool.tool.get("doc", ""),
            "definition": yaml.round_trip_dump(packed)
        }
    }
    if project_uuid:
        body["workflow"]["owner_uuid"] = project_uuid

    if uuid:
        call = arvRunner.api.workflows().update(uuid=uuid, body=body)
    else:
        call = arvRunner.api.workflows().create(body=body)
    return call.execute(num_retries=arvRunner.num_retries)["uuid"]
def run(self, *args, **kwargs):
    job_spec = self.arvados_job_spec(*args, **kwargs)
    for k, v in job_spec["script_parameters"].items():
        if v is False or v is None or isinstance(v, dict):
            job_spec["script_parameters"][k] = {"value": v}

    self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().create(
        body={
            "owner_uuid": self.arvrunner.project_uuid,
            "name": shortname(self.tool.tool["id"]),
            "components": {"cwl-runner": job_spec},
            "state": "RunningOnServer"
        }).execute(num_retries=self.arvrunner.num_retries)
    logger.info("Created pipeline %s", self.arvrunner.pipeline["uuid"])

    if kwargs.get("wait") is False:
        self.uuid = self.arvrunner.pipeline["uuid"]
        return

    job = None
    while not job:
        time.sleep(2)
        self.arvrunner.pipeline = self.arvrunner.api.pipeline_instances().get(
            uuid=self.arvrunner.pipeline["uuid"]).execute(
                num_retries=self.arvrunner.num_retries)
        job = self.arvrunner.pipeline["components"]["cwl-runner"].get("job")
        if not job and self.arvrunner.pipeline["state"] != "RunningOnServer":
            raise WorkflowException("Submitted pipeline is %s" %
                                    (self.arvrunner.pipeline["state"]))

    self.uuid = job["uuid"]
    self.arvrunner.processes[self.uuid] = self

    if job["state"] in ("Complete", "Failed", "Cancelled"):
        self.done(job)
def __init__(self,
             builder,            # type: Builder
             joborder,           # type: Dict[Text, Union[Dict[Text, Any], List, Text, None]]
             requirements,       # type: List[Dict[Text, Text]]
             hints,              # type: List[Dict[Text, Text]]
             name,               # type: Text
             wps_process,        # type: WpsProcessInterface
             expected_outputs,   # type: List[CWL_ExpectedOutputs]
             ):                  # type: (...) -> None
    super(WpsWorkflowJob, self).__init__(builder, joborder, None, requirements, hints, name)
    self.wps_process = wps_process
    self.expected_outputs = {}  # type: Dict[str, str]  # {id: file-pattern}
    for output in expected_outputs:
        # TODO Should we support something else?
        if is_cwl_file_type(output):
            # Expecting output to look like this
            # output = {"id": "file:///tmp/random_path/process_name#output_id,
            #           "type": "File",
            #           "outputBinding": {"glob": output_name }
            #           }
            output_id = shortname(output["id"])
            self.expected_outputs[output_id] = output["outputBinding"]["glob"]
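# Example (entirely hypothetical ids and globs, following the output shape
# documented in the comment above) of the id-to-glob mapping the constructor builds:
expected_outputs = [
    {"id": "file:///tmp/random_path/process_name#output_file",  # hypothetical id
     "type": "File",
     "outputBinding": {"glob": "result.txt"}},                  # hypothetical glob
]
# After __init__ runs, self.expected_outputs == {"output_file": "result.txt"}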
def run(self, *args, **kwargs):
    job_spec = self.arvados_job_spec(*args, **kwargs)

    job_spec.setdefault("owner_uuid", self.arvrunner.project_uuid)

    response = self.arvrunner.api.jobs().create(
        body=job_spec,
        find_or_create=self.enable_reuse
    ).execute(num_retries=self.arvrunner.num_retries)

    self.uuid = response["uuid"]
    self.arvrunner.processes[self.uuid] = self

    logger.info("Submitted job %s", response["uuid"])

    if kwargs.get("submit"):
        self.pipeline = self.arvrunner.api.pipeline_instances().create(
            body={
                "owner_uuid": self.arvrunner.project_uuid,
                "name": shortname(self.tool.tool["id"]),
                "components": {"cwl-runner": {"job": {"uuid": self.uuid,
                                                      "state": response["state"]}}},
                "state": "RunningOnClient"}).execute(num_retries=self.arvrunner.num_retries)

    if response["state"] in ("Complete", "Failed", "Cancelled"):
        self.done(response)
def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug updated_tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception("--trash-intermediate is only supported with --api=containers.") self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception("--intermediate-output-ttl is only supported with --api=containers.") if self.intermediate_output_ttl < 0: raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"]) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, updated_tool, job_order) # the last clause means: if it is a command line tool, and we # are going to wait for the result, and always_submit_runner # is false, then we don't submit a runner process. submitting = (runtimeContext.update_workflow or runtimeContext.create_workflow or (runtimeContext.submit and not (updated_tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and not runtimeContext.always_submit_runner))) loadingContext = self.loadingContext.copy() loadingContext.do_validate = False loadingContext.do_update = False if submitting: # Document may have been auto-updated. Reload the original # document with updating disabled because we want to # submit the document with its original CWL version, not # the auto-updated one. tool = load_tool(updated_tool.tool["id"], loadingContext) else: tool = updated_tool # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Recreate process object (ArvadosWorkflow or # ArvadosCommandTool) because tool document may have been # updated by upload_workflow_deps in ways that modify # inheritance of hints or requirements. loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata tool = load_tool(tool.tool, loadingContext) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. 
if self.work_api == "containers": return (upload_workflow(self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.apply_reqs(job_order, tool) self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception("--ignore-docker-for-reuse not supported with containers API.") runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") if self.should_estimate_cache_size: visited = set() estimated_size = [0] def estimate_collection_cache(obj): if obj.get("location", "").startswith("keep:"): m = pdh_size.match(obj["location"][5:]) if m and m.group(1) not in visited: visited.add(m.group(1)) estimated_size[0] += int(m.group(2)) visit_class(job_order, ("File", "Directory"), estimate_collection_cache) runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256) self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024) logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size) runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if submitting: tool = RunnerContainer(self, updated_tool, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext.intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store, collection_cache_size=runtimeContext.collection_cache_size, collection_cache_is_default=self.should_estimate_cache_size) else: runtimeContext.runnerjob = tool.tool["id"] if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) if runtimeContext.submit and not runtimeContext.wait: runnerjob = next(jobiter) runnerjob.run(runtimeContext) return (runnerjob.uuid, "success") current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger) if current_container: logger.info("Running inside container %s", current_container.get("uuid")) self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout) self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. 
loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.") break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") elif isinstance(sys.exc_info()[1], WorkflowException): logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) else: logger.exception("Workflow execution failed") if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) if self.work_api == "containers" and not current_container: # Not running in a crunch container, so cancel any outstanding processes. for p in self.processes: try: self.api.container_requests().update(uuid=p, body={"priority": "0"} ).execute(num_retries=self.num_retries) except Exception: pass finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname(tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def main(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", type=str)
    parser.add_argument("--conformance-test", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    options = parser.parse_args(args)

    uri = "file://" + os.path.abspath(options.cwljob)

    if options.conformance_test:
        loader = schema_salad.ref_resolver.Loader({})
    else:
        loader = schema_salad.ref_resolver.Loader({"@base": uri,
                                                   "path": {"@type": "@id"}})

    job, _ = loader.resolve_ref(uri)
    print "> job looks like ", job
    print "> working with ", options.cwltool

    # returns a workflow/tool object
    t = cwltool.main.load_tool(options.cwltool, False, False,
                               cwltool.workflow.defaultMakeTool, True)
    # print t
    # print "> Need ", supportedProcessRequirements, " for tool ", t

    # check the requirements of the tool, skip for now
    if type(t) == int:
        return t
    try:
        # checkRequirements(t.tool, supportedProcessRequirements)
        print "skipping requirement checking ..."
    except Exception as e:
        logging.error(e)
        return 33

    # ----------------- assign inputs to jobs ------------------------------ #
    for inp in t.tool["inputs"]:
        if shortname(inp["id"]) in job:
            pass
        elif shortname(inp["id"]) not in job and "default" in inp:
            job[shortname(inp["id"])] = copy.copy(inp["default"])
        elif shortname(inp["id"]) not in job and inp["type"][0] == "null":
            pass
        else:
            raise Exception("Missing inputs `%s`" % shortname(inp["id"]))

    print "> job looks like ", job

    if options.conformance_test:
        sys.stdout.write(
            json.dumps(
                cwltool.main.single_job_executor(t, job, options.basedir, options,
                                                 conformance_test=True),
                indent=4))
        return 0

    if not options.basedir:
        options.basedir = os.path.dirname(os.path.abspath(options.cwljob))

    outdir = options.outdir

    print "> the cwl file is for a ", t.tool["class"]
    if t.tool["class"] == "Workflow":
        print generateScriptForWorkflow(t, job, outdir)
    elif t.tool["class"] == "CommandLineTool":
        print generateScriptForTool(t, job, outdir)

    return 0
def pipeline_component_spec(self):
    """Return a component that Workbench and a-r-p-i will understand.

    Specifically, translate CWL input specs to Arvados pipeline
    format, like {"dataclass":"File","value":"xyz"}.
    """
    spec = self.job.arvados_job_spec()

    # Most of the component spec is exactly the same as the job
    # spec (script, script_version, etc.).
    # spec['script_parameters'] isn't right, though. A component
    # spec's script_parameters hash is a translation of
    # self.tool.tool['inputs'] with defaults/overrides taken from
    # the job order. So we move the job parameters out of the way
    # and build a new spec['script_parameters'].
    job_params = spec['script_parameters']
    spec['script_parameters'] = {}

    for param in self.embedded_tool.tool['inputs']:
        param = copy.deepcopy(param)

        # Data type and "required" flag...
        types = param['type']
        if not isinstance(types, list):
            types = [types]
        param['required'] = 'null' not in types
        non_null_types = [t for t in types if t != "null"]
        if len(non_null_types) == 1:
            the_type = [c for c in non_null_types][0]
            dataclass = None
            if isinstance(the_type, basestring):
                dataclass = self.type_to_dataclass.get(the_type)
            if dataclass:
                param['dataclass'] = dataclass
        # Note: If we didn't figure out a single appropriate
        # dataclass, we just left that attribute out. We leave
        # the "type" attribute there in any case, which might help
        # downstream.

        # Title and description...
        title = param.pop('label', '')
        descr = param.pop('doc', '').rstrip('\n')
        if title:
            param['title'] = title
        if descr:
            param['description'] = descr

        # Fill in the value from the current job order, if any.
        param_id = shortname(param.pop('id'))
        value = job_params.get(param_id)
        if value is None:
            pass
        elif not isinstance(value, dict):
            param['value'] = value
        elif param.get('dataclass') in ('File', 'Collection') and value.get('location'):
            param['value'] = value['location'][5:]

        spec['script_parameters'][param_id] = param

    spec['script_parameters']['cwl:tool'] = job_params['cwl:tool']
    return spec
def job(self, joborder, output_callback, **kwargs): kwargs["work_api"] = self.work_api req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer") if req: with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "id" not in self.tool: raise WorkflowException("%s object must have 'id'" % (self.tool["class"])) document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"]) discover_secondary_files(self.tool["inputs"], joborder) with Perf(metrics, "subworkflow upload_deps"): upload_dependencies(self.arvrunner, os.path.basename(joborder.get("id", "#")), document_loader, joborder, joborder.get("id", "#"), False) if self.wf_pdh is None: workflowobj["requirements"] = dedup_reqs(self.requirements) workflowobj["hints"] = dedup_reqs(self.hints) packed = pack(document_loader, workflowobj, uri, self.metadata) upload_dependencies(self.arvrunner, kwargs.get("name", ""), document_loader, packed, uri, False) with Perf(metrics, "subworkflow adjust"): joborder_resolved = copy.deepcopy(joborder) joborder_keepmount = copy.deepcopy(joborder) reffiles = [] visit_class(joborder_keepmount, ("File", "Directory"), lambda x: reffiles.append(x)) mapper = ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"], "/keep/%s", "/keep/%s/%s", **kwargs) def keepmount(obj): remove_redundant_fields(obj) with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "location" not in obj: raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj)) with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).target if "listing" in obj: del obj["listing"] elif obj["location"].startswith("_:"): del obj["location"] else: raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"]) visit_class(joborder_keepmount, ("File", "Directory"), keepmount) def resolved(obj): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).resolved visit_class(joborder_resolved, ("File", "Directory"), resolved) if self.wf_pdh is None: adjustFileObjs(packed, keepmount) adjustDirObjs(packed, keepmount) self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed) wf_runner = cmap({ "class": "CommandLineTool", "baseCommand": "cwltool", "inputs": self.tool["inputs"], "outputs": self.tool["outputs"], "stdout": "cwl.output.json", "requirements": self.requirements+[ { "class": "InitialWorkDirRequirement", "listing": [{ "entryname": "workflow.cwl", "entry": { "class": "File", "location": "keep:%s/workflow.cwl" % self.wf_pdh } }, { "entryname": "cwl.input.yml", "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${') }] }], "hints": self.hints, "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"], "id": "#" }) kwargs["loader"] = self.doc_loader kwargs["avsc_names"] = self.doc_schema return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder_resolved, output_callback, **kwargs) else: return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
def discover_secondary_files(inputs, job_order, discovered=None):
    for t in inputs:
        if shortname(t["id"]) in job_order and t.get("secondaryFiles"):
            setSecondary(t, job_order[shortname(t["id"])], discovered)
def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs):
    events = arvados.events.subscribe(arvados.api('v1'),
                                      [["object_uuid", "is_a", "arvados#job"]],
                                      self.on_message)

    try:
        self.api.collections().get(uuid=crunchrunner_pdh).execute()
    except arvados.errors.ApiError as e:
        import httplib2
        h = httplib2.Http(ca_certs=arvados.util.ca_certs_path())
        resp, content = h.request(crunchrunner_download, "GET")
        resp2, content2 = h.request(certs_download, "GET")
        with arvados.collection.Collection() as col:
            with col.open("crunchrunner", "w") as f:
                f.write(content)
            with col.open("ca-certificates.crt", "w") as f:
                f.write(content2)
            col.save_new("crunchrunner binary", ensure_unique_name=True)

    self.fs_access = CollectionFsAccess(input_basedir)

    kwargs["fs_access"] = self.fs_access
    kwargs["enable_reuse"] = args.enable_reuse

    kwargs["outdir"] = "$(task.outdir)"
    kwargs["tmpdir"] = "$(task.tmpdir)"

    if kwargs.get("conformance_test"):
        return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs)
    else:
        self.pipeline = self.api.pipeline_instances().create(
            body={"name": shortname(tool.tool["id"]),
                  "components": {},
                  "state": "RunningOnClient"}).execute(num_retries=self.num_retries)

        jobiter = tool.job(job_order, input_basedir, self.output_callback,
                           docker_outdir="$(task.outdir)", **kwargs)

        try:
            for runnable in jobiter:
                if runnable:
                    with self.lock:
                        runnable.run(**kwargs)
                else:
                    if self.jobs:
                        try:
                            self.cond.acquire()
                            self.cond.wait(1)
                        except RuntimeError:
                            pass
                        finally:
                            self.cond.release()
                    else:
                        logger.error("Workflow cannot make any more progress.")
                        break

            while self.jobs:
                try:
                    self.cond.acquire()
                    self.cond.wait(1)
                except RuntimeError:
                    pass
                finally:
                    self.cond.release()

            events.close()

            if self.final_output is None:
                raise cwltool.workflow.WorkflowException("Workflow did not return a result.")
        except:
            if sys.exc_info()[0] is not KeyboardInterrupt:
                logger.exception("Caught unhandled exception, marking pipeline as failed")
            self.api.pipeline_instances().update(uuid=self.pipeline["uuid"],
                                                 body={"state": "Failed"}).execute(num_retries=self.num_retries)

    return self.final_output
def arv_executor(self, tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception("--trash-intermediate is only supported with --api=containers.") self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception("--intermediate-output-ttl is only supported with --api=containers.") if self.intermediate_output_ttl < 0: raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Reload tool object which may have been updated by # upload_workflow_deps # Don't validate this time because it will just print redundant errors. loadingContext = self.loadingContext.copy() loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata loadingContext.do_validate = False tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], loadingContext) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, tool, job_order) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate(self, tool, job_order, runtimeContext.enable_reuse, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map, loadingContext=loadingContext) tmpl.save() # cwltool.main will write our return value to stdout. 
return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow(self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception("--ignore-docker-for-reuse not supported with containers API.") runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" elif self.work_api == "jobs": if runtimeContext.priority != DEFAULT_PRIORITY: raise Exception("--priority not implemented for jobs API.") runtimeContext.outdir = "$(task.outdir)" runtimeContext.docker_outdir = "$(task.outdir)" runtimeContext.tmpdir = "$(task.tmpdir)" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") if self.should_estimate_cache_size: visited = set() estimated_size = [0] def estimate_collection_cache(obj): if obj.get("location", "").startswith("keep:"): m = pdh_size.match(obj["location"][5:]) if m and m.group(1) not in visited: visited.add(m.group(1)) estimated_size[0] += int(m.group(2)) visit_class(job_order, ("File", "Directory"), estimate_collection_cache) runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256) self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024) logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size) runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. 
if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner): runtimeContext.runnerjob = tool.tool["id"] else: tool = RunnerContainer(self, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext.intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store, collection_cache_size=runtimeContext.collection_cache_size, collection_cache_is_default=self.should_estimate_cache_size) elif self.work_api == "jobs": tool = RunnerJob(self, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, merged_map=merged_map) elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient"}).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) if runtimeContext.submit and not runtimeContext.wait: runnerjob = next(jobiter) runnerjob.run(runtimeContext) return (runnerjob.uuid, "success") current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger) if current_container: logger.info("Running inside container %s", current_container.get("uuid")) self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout) self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. 
loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.") break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") else: logger.error("Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) if runtimeContext.submit and isinstance(tool, Runner): runnerjob = tool if runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update(uuid=runnerjob.uuid, body={"priority": "0"}).execute(num_retries=self.num_retries) finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname(tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def main(args=None, stdout=sys.stdout): parser = argparse.ArgumentParser() Job.Runner.addToilOptions(parser) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--no-container", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) parser.add_argument( "--preserve-environment", type=str, nargs='+', help= "Preserve specified environment variables when running CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH", ), dest="preserve_environment") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] options = parser.parse_args([workdir] + args) use_container = not options.no_container setLoggingFromOptions(options) if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) fileindex = {} existing = {} with Toil(options) as toil: if options.restart: outobj = toil.restart() else: useStrict = not options.not_strict try: t = cwltool.load_tool.load_tool( options.cwltool, toilMakeTool, kwargs={ "hints": [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }] }, resolver=cwltool.resolver.tool_resolver, strict=useStrict) unsupportedRequirementsCheck(t.requirements) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 if type(t) == int: return t options.workflow = options.cwltool options.job_order = options.cwljob options.tool_help = None options.debug = options.logLevel == "DEBUG" job = cwltool.main.load_job_order(options, t, sys.stdin) if type(job) == int: return job job, options.basedir = job fillInDefaults(t.tool["inputs"], job) def pathToLoc(p): if "location" not in p and "path" in p: p["location"] = p["path"] del p["path"] def importFiles(tool): visit_class(tool, ("File", "Directory"), pathToLoc) normalizeFilesDirs(tool) adjustDirObjs( tool, functools.partial(get_listing, cwltool.stdfsaccess.StdFsAccess(""), recursive=True)) adjustFileObjs( tool, functools.partial(uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) t.visit(importFiles) for inp in t.tool["inputs"]: def setSecondary(fileobj): if isinstance(fileobj, dict) and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [{ "location": cwltool.builder.substitute( fileobj["location"], sf), "class": "File" } for sf in inp["secondaryFiles"]] if isinstance(fileobj, list): for e in fileobj: setSecondary(e) if shortname(inp["id"]) in job and inp.get("secondaryFiles"): setSecondary(job[shortname(inp["id"])]) importFiles(job) visitSteps(t, importFiles) make_fs_access = functools.partial(ToilFsAccess, fileStore=toil) try: (wf1, wf2) = makeJob( t, {}, use_container=use_container, preserve_environment=options.preserve_environment, tmpdir=os.path.realpath(outdir), workdir=options.workDir) except 
cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 wf1.cwljob = job outobj = toil.start(wf1) outobj = resolve_indirect(outobj) toilStageFiles(toil, outobj, outdir, fileindex, existing, True) visit_class( outobj, ("File", ), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) stdout.write(json.dumps(outobj, indent=4)) return 0
def arvExecutor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") if kwargs.get("quiet"): logger.setLevel(logging.WARN) logging.getLogger('arvados.arv-run').setLevel(logging.WARN) useruuid = self.api.users().current().execute()["uuid"] self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid self.pipeline = None self.fs_access = CollectionFsAccess(kwargs["basedir"], api_client=self.api) if kwargs.get("create_template"): tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse")) tmpl.save() # cwltool.main will write our return value to stdout. return tmpl.uuid self.debug = kwargs.get("debug") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["fs_access"] = self.fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["on_error"] = "continue" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" runnerjob = None if kwargs.get("submit"): if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool": runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse")) else: runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse")) if not kwargs.get("submit") and "cwl_runner_job" not in kwargs and not self.work_api == "containers": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient"}).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run() return runnerjob.uuid arvados.config.settings()["ARVADOS_DISABLE_WEBSOCKETS"] = "1" if self.work_api == "containers": events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#container"]], self.on_message) if self.work_api == "jobs": events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message) if runnerjob: jobiter = iter((runnerjob,)) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. for runnable in jobiter: if runnable: runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.") break while self.processes: self.cond.wait(1) events.close() except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error("Caught unhandled exception, marking pipeline as failed. 
Error was: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update(uuid=runnerjob.uuid, body={"priority": "0"}).execute(num_retries=self.num_retries) finally: self.cond.release() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_status != "success": raise WorkflowException("Workflow failed.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("compute_checksum"): def compute_checksums(fileobj): if "checksum" not in fileobj: checksum = hashlib.sha1() with self.fs_access.open(fileobj["location"], "rb") as f: contents = f.read(1024*1024) while contents != "": checksum.update(contents) contents = f.read(1024*1024) fileobj["checksum"] = "sha1$%s" % checksum.hexdigest() adjustFileObjs(self.final_output, compute_checksums) return self.final_output
def arvExecutor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") if kwargs.get("quiet"): logger.setLevel(logging.WARN) logging.getLogger('arvados.arv-run').setLevel(logging.WARN) useruuid = self.api.users().current().execute()["uuid"] self.project_uuid = kwargs.get("project_uuid") if kwargs.get("project_uuid") else useruuid self.pipeline = None if kwargs.get("create_template"): tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse")) tmpl.save() # cwltool.main will write our return value to stdout. return tmpl.uuid if kwargs.get("submit"): runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse")) if not kwargs.get("wait"): runnerjob.run() return runnerjob.uuid events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message) self.debug = kwargs.get("debug") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") self.fs_access = CollectionFsAccess(kwargs["basedir"]) kwargs["fs_access"] = self.fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" if kwargs.get("conformance_test"): return cwltool.main.single_job_executor(tool, job_order, **kwargs) else: if kwargs.get("submit"): jobiter = iter((runnerjob,)) else: components = {} if "cwl_runner_job" in kwargs: components[os.path.basename(tool.tool["id"])] = {"job": kwargs["cwl_runner_job"]} self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": shortname(tool.tool["id"]), "components": components, "state": "RunningOnClient"}).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) jobiter = tool.job(job_order, self.output_callback, docker_outdir="$(task.outdir)", **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. for runnable in jobiter: if runnable: runnable.run(**kwargs) else: if self.jobs: self.cond.wait(1) else: logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.") break while self.jobs: self.cond.wait(1) events.close() except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error("Caught unhandled exception, marking pipeline as failed. Error was: %s", sys.exc_info()[0], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) finally: self.cond.release() if self.final_output is None: raise cwltool.workflow.WorkflowException("Workflow did not return a result.") return self.final_output
def arv_executor(self, tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception( "--trash-intermediate is only supported with --api=containers." ) self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception( "--intermediate-output-ttl is only supported with --api=containers." ) if self.intermediate_output_ttl < 0: raise Exception( "Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception( "--submit-request-uuid requires containers API, but using '{}' api" .format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = tool.tool.get( "label") or tool.metadata.get("label") or os.path.basename( tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Reload tool object which may have been updated by # upload_workflow_deps # Don't validate this time because it will just print redundant errors. loadingContext = self.loadingContext.copy() loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata loadingContext.do_validate = False tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], loadingContext) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, tool, job_order) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, runtimeContext.enable_reuse, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map) tmpl.save() # cwltool.main will write our return value to stdout. return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception( "--ignore-docker-for-reuse not supported with containers API." 
) runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" elif self.work_api == "jobs": if runtimeContext.priority != DEFAULT_PRIORITY: raise Exception("--priority not implemented for jobs API.") runtimeContext.outdir = "$(task.outdir)" runtimeContext.docker_outdir = "$(task.outdir)" runtimeContext.tmpdir = "$(task.tmpdir)" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if tool.tool[ "class"] == "CommandLineTool" and runtimeContext.wait: runtimeContext.runnerjob = tool.tool["id"] runnerjob = tool.job(job_order, self.output_callback, runtimeContext).next() else: runnerjob = RunnerContainer( self, tool, job_order, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext. intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store) elif self.work_api == "jobs": runnerjob = RunnerJob( self, tool, job_order, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, merged_map=merged_map) elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": runtimeContext.name if runtimeContext. name else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not runtimeContext.wait: submitargs = runtimeContext.copy() submitargs.submit = False runnerjob.run(submitargs) return (runnerjob.uuid, "success") self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) if runnerjob: jobiter = iter((runnerjob, )) else: if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error( "Workflow is deadlocked, no runnable processes and not waiting on any pending processes." 
) break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info( )[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") else: logger.error( "Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(runnerjob, Runner): logger.info("Final output collection %s", runnerjob.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def arvExecutor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") tool.visit(self.check_writable) if kwargs.get("quiet"): logger.setLevel(logging.WARN) logging.getLogger('arvados.arv-run').setLevel(logging.WARN) useruuid = self.api.users().current().execute()["uuid"] self.project_uuid = kwargs.get("project_uuid") if kwargs.get( "project_uuid") else useruuid self.pipeline = None make_fs_access = kwargs.get("make_fs_access") or partial( CollectionFsAccess, api_client=self.api) self.fs_access = make_fs_access(kwargs["basedir"]) if kwargs.get("create_template"): tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse")) tmpl.save() # cwltool.main will write our return value to stdout. return tmpl.uuid self.debug = kwargs.get("debug") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["make_fs_access"] = make_fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["on_error"] = "continue" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" kwargs["docker_tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" runnerjob = None if kwargs.get("submit"): if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool": runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer(self, tool, job_order, kwargs.get("enable_reuse")) else: runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse")) if not kwargs.get( "submit" ) and "cwl_runner_job" not in kwargs and not self.work_api == "containers": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run() return runnerjob.uuid self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() if runnerjob: jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. for runnable in jobiter: if runnable: runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." ) break while self.processes: self.cond.wait(1) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Caught unhandled exception, marking pipeline as failed. 
Error was: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.cond.release() self.stop_polling.set() self.polling_thread.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_status != "success": raise WorkflowException("Workflow failed.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("compute_checksum"): adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) return self.final_output
def sn(n): if isinstance(n, dict): return shortname(n["id"]) if isinstance(n, string_types): return shortname(n)
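A small hedged illustration of what sn() is expected to return, assuming cwltool's shortname() keeps only the last path segment of the fragment in a CWL identifier (the example id is made up):

from cwltool.process import shortname

example_port = {"id": "file:///example/wf.cwl#step1/threads"}
print(shortname(example_port["id"]))  # expected: "threads"
print(sn(example_port))               # dict form resolves to the same short name
print(sn(example_port["id"]))         # string form likewise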
def main(args=None, stdout=sys.stdout): config = Config() config.cwl = True parser = argparse.ArgumentParser() addOptions(parser, config) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--no-container", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) parser.add_argument("--user-space-docker-cmd", help="(Linux/OS X only) Specify a user space docker " "command (like udocker or dx-docker) that will be " "used to call 'pull' and 'run'") parser.add_argument("--preserve-environment", type=str, nargs='+', help="Preserve specified environment variables when running CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",), dest="preserve_environment") # help="Dependency resolver configuration file describing how to adapt 'SoftwareRequirement' packages to current system." parser.add_argument("--beta-dependency-resolvers-configuration", default=None) # help="Default root directory used by dependency resolvers configuration." parser.add_argument("--beta-dependencies-directory", default=None) # help="Use biocontainers for tools without an explicitly annotated Docker container." parser.add_argument("--beta-use-biocontainers", default=None, action="store_true") # help="Shortcut to use Conda to resolve 'SoftwareRequirement' packages." parser.add_argument("--beta-conda-dependencies", default=None, action="store_true") parser.add_argument("--tmpdir-prefix", type=Text, help="Path prefix for temporary directories", default="tmp") parser.add_argument("--tmp-outdir-prefix", type=Text, help="Path prefix for intermediate output directories", default="tmp") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) 
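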
workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] options = parser.parse_args([workdir] + args) use_container = not options.no_container if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) fileindex = {} existing = {} make_tool_kwargs = {} conf_file = getattr(options, "beta_dependency_resolvers_configuration", None) # Text use_conda_dependencies = getattr(options, "beta_conda_dependencies", None) # Text job_script_provider = None if conf_file or use_conda_dependencies: dependencies_configuration = DependenciesConfiguration(options) # type: DependenciesConfiguration job_script_provider = dependencies_configuration options.default_container = None make_tool_kwargs["find_default_container"] = functools.partial(find_default_container, options) with Toil(options) as toil: if options.restart: outobj = toil.restart() else: useStrict = not options.not_strict make_tool_kwargs["hints"] = [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }] try: t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool, kwargs=make_tool_kwargs, resolver=cwltool.resolver.tool_resolver, strict=useStrict) unsupportedRequirementsCheck(t.requirements) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 if type(t) == int: return t options.workflow = options.cwltool options.job_order = options.cwljob options.tool_help = None options.debug = options.logLevel == "DEBUG" job, options.basedir, loader = cwltool.main.load_job_order( options, sys.stdin, None, [], options.job_order) job = cwltool.main.init_job_order(job, options, t, loader=loader) fillInDefaults(t.tool["inputs"], job) def pathToLoc(p): if "location" not in p and "path" in p: p["location"] = p["path"] del p["path"] def importFiles(tool): visit_class(tool, ("File", "Directory"), pathToLoc) normalizeFilesDirs(tool) adjustDirObjs(tool, functools.partial(get_listing, cwltool.stdfsaccess.StdFsAccess(""), recursive=True)) adjustFileObjs(tool, functools.partial(uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) t.visit(importFiles) for inp in t.tool["inputs"]: def setSecondary(fileobj): if isinstance(fileobj, dict) and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [{ "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"} for sf in inp["secondaryFiles"]] if isinstance(fileobj, list): for e in fileobj: setSecondary(e) if shortname(inp["id"]) in job and inp.get("secondaryFiles"): setSecondary(job[shortname(inp["id"])]) importFiles(job) visitSteps(t, importFiles) try: make_opts = copy.deepcopy(vars(options)) make_opts.update({'tool': t, 'jobobj': {}, 'use_container': use_container, 'tmpdir': os.path.realpath(outdir), 'job_script_provider': job_script_provider}) (wf1, wf2) = makeJob(**make_opts) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 wf1.cwljob = job outobj = toil.start(wf1) outobj = resolve_indirect(outobj) toilStageFiles(toil, outobj, outdir, fileindex, existing, True) visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) stdout.write(json.dumps(outobj, indent=4)) return 0
def job(self, joborder, output_callback, runtimeContext): builder = make_builder(joborder, self.hints, self.requirements, runtimeContext) runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext) req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer") if not req: return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext) # RunInSingleContainer is true with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "id" not in self.tool: raise WorkflowException("%s object must have 'id'" % (self.tool["class"])) document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"]) discover_secondary_files(self.tool["inputs"], joborder) with Perf(metrics, "subworkflow upload_deps"): upload_dependencies(self.arvrunner, os.path.basename(joborder.get("id", "#")), document_loader, joborder, joborder.get("id", "#"), False) if self.wf_pdh is None: workflowobj["requirements"] = dedup_reqs(self.requirements) workflowobj["hints"] = dedup_reqs(self.hints) packed = pack(document_loader, workflowobj, uri, self.metadata) def visit(item): for t in ("hints", "requirements"): if t not in item: continue for req in item[t]: if req["class"] == "ResourceRequirement": dyn = False for k in max_res_pars + sum_res_pars: if k in req: if isinstance(req[k], basestring): if item["id"] == "#main": # only the top-level requirements/hints may contain expressions self.dynamic_resource_req.append(req) dyn = True break else: with SourceLine(req, k, WorkflowException): raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions") if not dyn: self.static_resource_req.append(req) if req["class"] == "DockerRequirement": if "http://arvados.org/cwl#dockerCollectionPDH" in req: del req["http://arvados.org/cwl#dockerCollectionPDH"] visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit) if self.static_resource_req: self.static_resource_req = [get_overall_res_req(self.static_resource_req)] upload_dependencies(self.arvrunner, runtimeContext.name, document_loader, packed, uri, False) # Discover files/directories referenced by the # workflow (mainly "default" values) visit_class(packed, ("File", "Directory"), self.wf_reffiles.append) if self.dynamic_resource_req: # Evaluate dynamic resource requirements using current builder rs = copy.copy(self.static_resource_req) for dyn_rs in self.dynamic_resource_req: eval_req = {"class": "ResourceRequirement"} for a in max_res_pars + sum_res_pars: if a in dyn_rs: eval_req[a] = builder.do_eval(dyn_rs[a]) rs.append(eval_req) job_res_reqs = [get_overall_res_req(rs)] else: job_res_reqs = self.static_resource_req with Perf(metrics, "subworkflow adjust"): joborder_resolved = copy.deepcopy(joborder) joborder_keepmount = copy.deepcopy(joborder) reffiles = [] visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append) mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir, "/keep/%s", "/keep/%s/%s") # For containers API, we need to make sure any extra # referenced files (ie referenced by the workflow but # not in the inputs) are included in the mounts. 
if self.wf_reffiles: runtimeContext = runtimeContext.copy() runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles) def keepmount(obj): remove_redundant_fields(obj) with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "location" not in obj: raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj)) with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).target if "listing" in obj: del obj["listing"] elif obj["location"].startswith("_:"): del obj["location"] else: raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"]) visit_class(joborder_keepmount, ("File", "Directory"), keepmount) def resolved(obj): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).resolved visit_class(joborder_resolved, ("File", "Directory"), resolved) if self.wf_pdh is None: adjustFileObjs(packed, keepmount) adjustDirObjs(packed, keepmount) self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed) wf_runner = cmap({ "class": "CommandLineTool", "baseCommand": "cwltool", "inputs": self.tool["inputs"], "outputs": self.tool["outputs"], "stdout": "cwl.output.json", "requirements": self.requirements+job_res_reqs+[ {"class": "InlineJavascriptRequirement"}, { "class": "InitialWorkDirRequirement", "listing": [{ "entryname": "workflow.cwl", "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh }, { "entryname": "cwl.input.yml", "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${') }] }], "hints": self.hints, "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"], "id": "#" }) return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
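The keepmount() helper above rewrites "keep:" references into paths under the container's /keep mount via ArvPathMapper (the "/keep/%s" and "/keep/%s/%s" patterns). A hedged sketch of that rewrite in isolation, using a made-up portable data hash:

def keep_location_to_mount(location):
    # Map a "keep:<pdh>/<path>" reference to the corresponding path under the
    # /keep mount inside the container, mirroring the mapper patterns above.
    if not location.startswith("keep:"):
        raise ValueError("not a keep reference: %r" % location)
    return "/keep/" + location[len("keep:"):]

# keep_location_to_mount("keep:99999999999999999999999999999999+2199/input.fastq")
#   -> "/keep/99999999999999999999999999999999+2199/input.fastq"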
def discover_default_secondary_files(obj): discover_secondary_files( obj["inputs"], { shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t }, discovered)
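A hedged example of the mapping that the comprehension above hands to discover_secondary_files(): input defaults keyed by their short names (the ids and keep locator here are made up):

from cwltool.process import shortname

example_inputs = [
    {"id": "#main/reference", "type": "File",
     "default": {"class": "File", "location": "keep:abc123+456/ref.fa"}},
    {"id": "#main/threads", "type": "int"},  # no default, so it is omitted
]
defaults = {shortname(t["id"]): t["default"]
            for t in example_inputs if "default" in t}
# defaults == {"reference": {"class": "File", "location": "keep:abc123+456/ref.fa"}}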
def arvExecutor(self, tool, job_order, input_basedir, args, **kwargs): events = arvados.events.subscribe(arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message) try: self.api.collections().get(uuid=crunchrunner_pdh).execute() except arvados.errors.ApiError as e: import httplib2 h = httplib2.Http(ca_certs=arvados.util.ca_certs_path()) resp, content = h.request(crunchrunner_download, "GET") resp2, content2 = h.request(certs_download, "GET") with arvados.collection.Collection() as col: with col.open("crunchrunner", "w") as f: f.write(content) with col.open("ca-certificates.crt", "w") as f: f.write(content2) col.save_new("crunchrunner binary", ensure_unique_name=True) self.fs_access = CollectionFsAccess(input_basedir) kwargs["fs_access"] = self.fs_access kwargs["enable_reuse"] = args.enable_reuse kwargs["outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" useruuid = self.api.users().current().execute()["uuid"] self.project_uuid = args.project_uuid if args.project_uuid else useruuid if kwargs.get("conformance_test"): return cwltool.main.single_job_executor(tool, job_order, input_basedir, args, **kwargs) else: self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient"}).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) jobiter = tool.job(job_order, input_basedir, self.output_callback, docker_outdir="$(task.outdir)", **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. for runnable in jobiter: if runnable: runnable.run(**kwargs) else: if self.jobs: self.cond.wait(1) else: logger.error("Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs.") break while self.jobs: self.cond.wait(1) events.close() if self.final_output is None: raise cwltool.workflow.WorkflowException("Workflow did not return a result.") # create final output collection except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.exception("Caught unhandled exception, marking pipeline as failed") self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) finally: self.cond.release() return self.final_output
def make_workflow_exception(msg): return WorkflowException( u"Error collecting output for parameter '%s':\n%s" % (shortname(port["id"]), msg))
def __init__(self, step, cwljob, **kwargs): super(CWLScatter, self).__init__() self.step = step self.cwljob = cwljob self.valueFrom = {shortname(i["id"]): i["valueFrom"] for i in step.tool["inputs"] if "valueFrom" in i} self.executor_options = kwargs
def arv_executor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") tool.visit(self.check_features) self.project_uuid = kwargs.get("project_uuid") self.pipeline = None make_fs_access = kwargs.get("make_fs_access") or partial( CollectionFsAccess, api_client=self.api, keep_client=self.keep_client) self.fs_access = make_fs_access(kwargs["basedir"]) if not kwargs.get("name"): kwargs["name"] = self.name = tool.tool.get( "label") or tool.metadata.get("label") or os.path.basename( tool.tool["id"]) # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. upload_workflow_deps(self, tool) # Reload tool object which may have been updated by # upload_workflow_deps tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]], makeTool=self.arv_make_tool, loader=tool.doc_loader, avsc_names=tool.doc_schema, metadata=tool.metadata) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % kwargs["name"], tool, job_order) existing_uuid = kwargs.get("update_workflow") if existing_uuid or kwargs.get("create_workflow"): # Create a pipeline template or workflow record and exit. if self.work_api == "jobs": tmpl = RunnerTemplate( self, tool, job_order, kwargs.get("enable_reuse"), uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs["name"]) tmpl.save() # cwltool.main will write our return value to stdout. return (tmpl.uuid, "success") elif self.work_api == "containers": return (upload_workflow( self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs["name"]), "success") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") kwargs["make_fs_access"] = make_fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["use_container"] = True kwargs["tmpdir_prefix"] = "tmp" kwargs["compute_checksum"] = kwargs.get("compute_checksum") if self.work_api == "containers": kwargs["outdir"] = "/var/spool/cwl" kwargs["docker_outdir"] = "/var/spool/cwl" kwargs["tmpdir"] = "/tmp" kwargs["docker_tmpdir"] = "/tmp" elif self.work_api == "jobs": kwargs["outdir"] = "$(task.outdir)" kwargs["docker_outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" runnerjob = None if kwargs.get("submit"): # Submit a runner job to run the workflow for us. if self.work_api == "containers": if tool.tool["class"] == "CommandLineTool": kwargs["runnerjob"] = tool.tool["id"] upload_dependencies(self, kwargs["name"], tool.doc_loader, tool.tool, tool.tool["id"], False) runnerjob = tool.job(job_order, self.output_callback, **kwargs).next() else: runnerjob = RunnerContainer( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name"), on_error=kwargs.get("on_error"), submit_runner_image=kwargs.get("submit_runner_image")) elif self.work_api == "jobs": runnerjob = RunnerJob( self, tool, job_order, kwargs.get("enable_reuse"), self.output_name, self.output_tags, submit_runner_ram=kwargs.get("submit_runner_ram"), name=kwargs.get("name"), on_error=kwargs.get("on_error"), submit_runner_image=kwargs.get("submit_runner_image")) if not kwargs.get( "submit" ) and "cwl_runner_job" not in kwargs and self.work_api == "jobs": # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": kwargs["name"] if kwargs. 
get("name") else shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if runnerjob and not kwargs.get("wait"): runnerjob.run(wait=kwargs.get("wait")) return (runnerjob.uuid, "success") self.poll_api = arvados.api('v1') self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() if runnerjob: jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if runnable: with Perf(metrics, "run"): runnable.run(**kwargs) else: if self.processes: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." ) break loopperf.__enter__() loopperf.__exit__() while self.processes: self.cond.wait(1) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Execution failed: %s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) if runnerjob and runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update( uuid=runnerjob.uuid, body={ "priority": "0" }).execute(num_retries=self.num_retries) finally: self.cond.release() self.stop_polling.set() self.polling_thread.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if kwargs.get("submit") and isinstance(runnerjob, Runner): logger.info("Final output collection %s", runnerjob.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname( tool.tool["id"])) if self.output_tags is None: self.output_tags = "" self.final_output, self.final_output_collection = self.make_output_collection( self.output_name, self.output_tags, self.final_output) self.set_crunch_output() if kwargs.get("compute_checksum"): adjustDirObjs(self.final_output, partial(getListing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) return (self.final_output, self.final_status)
def main(args=None, stdout=sys.stdout): """Main method for toil-cwl-runner.""" cwllogger.removeHandler(defaultStreamHandler) config = Config() config.cwl = True parser = argparse.ArgumentParser() addOptions(parser, config) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) dockergroup = parser.add_mutually_exclusive_group() dockergroup.add_argument( "--user-space-docker-cmd", help="(Linux/OS X only) Specify a user space docker command (like " "udocker or dx-docker) that will be used to call 'pull' and 'run'") dockergroup.add_argument( "--singularity", action="store_true", default=False, help="[experimental] Use Singularity runtime for running containers. " "Requires Singularity v2.3.2+ and Linux with kernel version v3.18+ or " "with overlayfs support backported.") dockergroup.add_argument( "--no-container", action="store_true", help="Do not execute jobs in a " "Docker container, even when `DockerRequirement` " "is specified under `hints`.") parser.add_argument( "--preserve-environment", type=str, nargs='+', help="Preserve specified environment variables when running" " CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",), dest="preserve_environment") parser.add_argument( "--destBucket", type=str, help="Specify a cloud bucket endpoint for output files.") parser.add_argument( "--beta-dependency-resolvers-configuration", default=None) parser.add_argument("--beta-dependencies-directory", default=None) parser.add_argument( "--beta-use-biocontainers", default=None, action="store_true") parser.add_argument( "--beta-conda-dependencies", default=None, action="store_true") parser.add_argument("--tmpdir-prefix", type=Text, help="Path prefix for temporary directories", default="tmp") parser.add_argument("--tmp-outdir-prefix", type=Text, help="Path prefix for intermediate output directories", default="tmp") parser.add_argument( "--force-docker-pull", action="store_true", default=False, dest="force_docker_pull", help="Pull latest docker image even if it is locally present") parser.add_argument( "--no-match-user", action="store_true", default=False, help="Disable passing the current uid to `docker run --user`") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) 
workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] # we use workdir as jobStore: options = parser.parse_args([workdir] + args) # if tmpdir_prefix is not the default value, set workDir too if options.tmpdir_prefix != 'tmp': options.workDir = options.tmpdir_prefix if options.provisioner and not options.jobStore: raise NoSuchJobStoreException( 'Please specify a jobstore with the --jobStore option when specifying a provisioner.') use_container = not options.no_container if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) tmp_outdir_prefix = os.path.abspath(options.tmp_outdir_prefix) tmpdir_prefix = os.path.abspath(options.tmpdir_prefix) fileindex = {} existing = {} conf_file = getattr(options, "beta_dependency_resolvers_configuration", None) use_conda_dependencies = getattr(options, "beta_conda_dependencies", None) job_script_provider = None if conf_file or use_conda_dependencies: dependencies_configuration = DependenciesConfiguration(options) job_script_provider = dependencies_configuration options.default_container = None runtime_context = cwltool.context.RuntimeContext(vars(options)) runtime_context.find_default_container = functools.partial( find_default_container, options) runtime_context.workdir = workdir runtime_context.move_outputs = "leave" runtime_context.rm_tmpdir = False loading_context = cwltool.context.LoadingContext(vars(options)) with Toil(options) as toil: if options.restart: outobj = toil.restart() else: loading_context.hints = [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }] loading_context.construct_tool_object = toil_make_tool loading_context.resolver = cwltool.resolver.tool_resolver loading_context.strict = not options.not_strict options.workflow = options.cwltool options.job_order = options.cwljob uri, tool_file_uri = cwltool.load_tool.resolve_tool_uri( options.cwltool, loading_context.resolver, loading_context.fetcher_constructor) options.tool_help = None options.debug = options.logLevel == "DEBUG" job_order_object, options.basedir, jobloader = \ cwltool.main.load_job_order( options, sys.stdin, loading_context.fetcher_constructor, loading_context.overrides_list, tool_file_uri) document_loader, workflowobj, uri = \ cwltool.load_tool.fetch_document( uri, loading_context.resolver, loading_context.fetcher_constructor) document_loader, avsc_names, processobj, metadata, uri = \ cwltool.load_tool.validate_document( document_loader, workflowobj, uri, loading_context.enable_dev, loading_context.strict, False, loading_context.fetcher_constructor, False, loading_context.overrides_list, do_validate=loading_context.do_validate) loading_context.overrides_list.extend( metadata.get("cwltool:overrides", [])) try: tool = cwltool.load_tool.make_tool( document_loader, avsc_names, metadata, uri, loading_context) except cwltool.process.UnsupportedRequirement as err: logging.error(err) return 33 runtime_context.secret_store = SecretStore() initialized_job_order = cwltool.main.init_job_order( job_order_object, options, tool, jobloader, sys.stdout, secret_store=runtime_context.secret_store) fs_access = cwltool.stdfsaccess.StdFsAccess(options.basedir) fill_in_defaults( tool.tool["inputs"], initialized_job_order, fs_access) def path_to_loc(obj): if "location" not in obj and "path" in obj: obj["location"] = obj["path"] del obj["path"] def import_files(tool): visit_class(tool, ("File", 
"Directory"), path_to_loc) visit_class(tool, ("File", ), functools.partial( add_sizes, fs_access)) normalizeFilesDirs(tool) adjustDirObjs(tool, functools.partial( get_listing, fs_access, recursive=True)) adjustFileObjs(tool, functools.partial( uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) tool.visit(import_files) for inp in tool.tool["inputs"]: def set_secondary(fileobj): if isinstance(fileobj, Mapping) \ and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [ {"location": cwltool.builder.substitute( fileobj["location"], sf), "class": "File"} for sf in inp["secondaryFiles"]] if isinstance(fileobj, MutableSequence): for entry in fileobj: set_secondary(entry) if shortname(inp["id"]) in initialized_job_order \ and inp.get("secondaryFiles"): set_secondary(initialized_job_order[shortname(inp["id"])]) import_files(initialized_job_order) visitSteps(tool, import_files) try: runtime_context.use_container = use_container runtime_context.tmpdir = os.path.realpath(tmpdir_prefix) runtime_context.tmp_outdir_prefix = os.path.realpath( tmp_outdir_prefix) runtime_context.job_script_provider = job_script_provider runtime_context.force_docker_pull = options.force_docker_pull runtime_context.no_match_user = options.no_match_user (wf1, _) = makeJob(tool, {}, None, runtime_context) except cwltool.process.UnsupportedRequirement as err: logging.error(err) return 33 wf1.cwljob = initialized_job_order if wf1 is CWLJob: # Clean up temporary directories only created with CWLJobs. wf1.addFollowOnFn(cleanTempDirs, wf1) outobj = toil.start(wf1) outobj = resolve_indirect(outobj) # Stage files. Specify destination bucket if specified in CLI # options. If destination bucket not passed in, # options.destBucket's value will be None. toilStageFiles( toil, outobj, outdir, fileindex, existing, export=True, destBucket=options.destBucket) if not options.destBucket: visit_class(outobj, ("File",), functools.partial( compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) visit_class(outobj, ("File", ), MutationManager().unset_generation) stdout.write(json.dumps(outobj, indent=4)) return 0
def __init__(self, step, cwljob): Job.__init__(self) self.step = step self.cwljob = cwljob self.valueFrom = {shortname(i["id"]): i["valueFrom"] for i in step.tool["inputs"] if "valueFrom" in i}
def run(self, file_store): cwljob = resolve_indirect(self.cwljob) # `promises` dict # from: each parameter (workflow input or step output) # that may be used as a "source" for a step input workflow output # parameter # to: the job that will produce that value. promises = {} # `jobs` dict from step id to job that implements that step. jobs = {} for inp in self.cwlwf.tool["inputs"]: promises[inp["id"]] = SelfJob(self, cwljob) alloutputs_fufilled = False while not alloutputs_fufilled: # Iteratively go over the workflow steps, scheduling jobs as their # dependencies can be fufilled by upstream workflow inputs or # step outputs. Loop exits when the workflow outputs # are satisfied. alloutputs_fufilled = True for step in self.cwlwf.steps: if step.tool["id"] not in jobs: stepinputs_fufilled = True for inp in step.tool["inputs"]: if "source" in inp: for s in aslist(inp["source"]): if s not in promises: stepinputs_fufilled = False if stepinputs_fufilled: jobobj = {} for inp in step.tool["inputs"]: key = shortname(inp["id"]) if "source" in inp: if inp.get("linkMerge") \ or len(aslist(inp["source"])) > 1: linkMerge = inp.get( "linkMerge", "merge_nested") if linkMerge == "merge_nested": jobobj[key] = ( MergeInputsNested( [(shortname(s), promises[s].rv()) for s in aslist( inp["source"])])) elif linkMerge == "merge_flattened": jobobj[key] = ( MergeInputsFlattened( [(shortname(s), promises[s].rv()) for s in aslist( inp["source"])])) else: raise validate.ValidationException( "Unsupported linkMerge '%s'" % linkMerge) else: inpSource = inp["source"] if isinstance(inpSource, MutableSequence): # It seems that an input source with a # '#' in the name will be returned as a # CommentedSeq list by the yaml parser. inpSource = str(inpSource[0]) jobobj[key] = (shortname(inpSource), promises[inpSource].rv()) if "default" in inp: if key in jobobj: if isinstance(jobobj[key][1], Promise): d = copy.copy(inp["default"]) jobobj[key] = DefaultWithSource( d, jobobj[key]) else: if jobobj[key][1][ jobobj[key][0]] is None: d = copy.copy(inp["default"]) jobobj[key] = ( "default", {"default": d}) else: d = copy.copy(inp["default"]) jobobj[key] = ("default", {"default": d}) if "valueFrom" in inp \ and "scatter" not in step.tool: if key in jobobj: jobobj[key] = StepValueFrom( inp["valueFrom"], jobobj[key], self.cwlwf.requirements) else: jobobj[key] = StepValueFrom( inp["valueFrom"], ( "None", {"None": None}), self.cwlwf.requirements) if "scatter" in step.tool: wfjob = CWLScatter(step, IndirectDict(jobobj), self.runtime_context) followOn = CWLGather(step, wfjob.rv()) wfjob.addFollowOn(followOn) else: (wfjob, followOn) = makeJob( step.embedded_tool, IndirectDict(jobobj), step.tool["inputs"], self.runtime_context) jobs[step.tool["id"]] = followOn connected = False for inp in step.tool["inputs"]: for s in aslist(inp.get("source", [])): if (isinstance( promises[s], (CWLJobWrapper, CWLGather) ) and not promises[s].hasFollowOn(wfjob)): promises[s].addFollowOn(wfjob) connected = True if (not isinstance( promises[s], (CWLJobWrapper, CWLGather) ) and not promises[s].hasChild(wfjob)): promises[s].addChild(wfjob) connected = True if not connected: # the workflow step has default inputs only & isn't # connected to other jobs, so add it as child of # this workflow. 
self.addChild(wfjob) for out in step.tool["outputs"]: promises[out["id"]] = followOn for inp in step.tool["inputs"]: for source in aslist(inp.get("source", [])): if source not in promises: alloutputs_fufilled = False # may need a test for out in self.cwlwf.tool["outputs"]: if "outputSource" in out: if out["outputSource"] not in promises: alloutputs_fufilled = False outobj = {} for out in self.cwlwf.tool["outputs"]: key = shortname(out["id"]) if out.get("linkMerge") or len(aslist(out["outputSource"])) > 1: link_merge = out.get("linkMerge", "merge_nested") if link_merge == "merge_nested": outobj[key] = ( MergeInputsNested( [(shortname(s), promises[s].rv()) for s in aslist(out["outputSource"])])) elif link_merge == "merge_flattened": outobj[key] = ( MergeInputsFlattened([ (shortname(s), promises[s].rv()) for s in aslist(out["outputSource"])])) else: raise validate.ValidationException( "Unsupported linkMerge '{}'".format(link_merge)) else: # A CommentedSeq of length one still appears here rarely - # not clear why from the CWL code. When it does, it breaks # the execution by causing a non-hashable type exception. # We simplify the list into its first (and only) element. src = simplify_list(out["outputSource"]) outobj[key] = (shortname(src), promises[src].rv()) return IndirectDict(outobj)
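MergeInputsNested and MergeInputsFlattened above implement CWL's two linkMerge behaviours for multi-source links. A hedged sketch of the semantics using plain lists (per the CWL spec, merge_nested keeps one element per source while merge_flattened concatenates list-valued sources):

def merge_nested(source_values):
    # One output element per source, preserving any nesting.
    return list(source_values)

def merge_flattened(source_values):
    # Concatenate sources, splicing list-valued sources in element by element.
    merged = []
    for value in source_values:
        if isinstance(value, list):
            merged.extend(value)
        else:
            merged.append(value)
    return merged

# With source values ["a.txt", ["b.txt", "c.txt"]]:
#   merge_nested(...)    -> ["a.txt", ["b.txt", "c.txt"]]
#   merge_flattened(...) -> ["a.txt", "b.txt", "c.txt"]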
def generateScriptForWorkflow(cwlwf, cwljob, outdir): promises = {} jobs = {} script = [] outdirs = [] print("> inputs for this workflow are: %s" % cwlwf.tool["inputs"]) print("> steps in this workflow are: %s" % cwlwf.steps) # need to confirm why this is necessary for inp in cwlwf.tool["inputs"]: promises[inp["id"]] = (cwlwf, cwljob[shortname(inp["id"])]) alloutputs_fufilled = False while not alloutputs_fufilled: # Iteratively go over the workflow steps, adding jobs to the script as their # dependencies are fulfilled by upstream workflow inputs or # step outputs. Loop exits when the workflow outputs # are satisfied. alloutputs_fufilled = True progress = False # step = cwlwf.steps[1] # testing for step in cwlwf.steps: if step.tool["id"] not in jobs: stepinputs_fufilled = True for inp in step.tool["inputs"]: if "source" in inp and inp["source"] not in promises: stepinputs_fufilled = False if stepinputs_fufilled: jobobj = {} # TODO: Handle multiple inbound links # TODO: Handle scatter/gather # (both are discussed in section 5.1.2 in CWL spec draft-2) # script.append("# Run step %s" % step.tool["id"]) for inp in step.tool["inputs"]: if "source" in inp: jobobj[shortname(inp["id"])] = promises[inp["source"]][1] # script.append("# depends on step %s" % promises[inp["source"]][0].tool["id"]) elif "default" in inp: d = copy.copy(inp["default"]) jobobj[shortname(inp["id"])] = d (wfjob, joboutdir, jobtmpdir) = generateScriptForTool(step.embedded_tool, jobobj, None) outdirs.append(joboutdir) jobs[step.tool["id"]] = True # This line is where it generates the command together with arguments # (from the function generateScriptForTool) script.append(wfjob) for out in step.tool["outputs"]: for toolout in step.embedded_tool.tool["outputs"]: if shortname(toolout["id"]) == shortname(out["id"]): if toolout["type"] != "File": raise Exception("Only supports file outputs") if glob_metacharacters(toolout["outputBinding"]["glob"]): raise Exception("Only supports glob with a concrete filename.") promises[out["id"]] = ( step, { "class": "File", "path": os.path.join(joboutdir, toolout["outputBinding"]["glob"]), }, ) progress = True for out in cwlwf.tool["outputs"]: if "source" in out: if out["source"] not in promises: alloutputs_fufilled = False if not alloutputs_fufilled and not progress: raise Exception("Not making progress") # outobj = {} # script.append("# Move output files to the current directory") # # for out in cwlwf.tool["outputs"]: # f = promises[out["source"]][1] # script.append("mv %s ." % (maybe_quote(f["path"]))) # f["path"] = os.path.basename(f["path"]) # # if f.get("secondaryFiles"): # script.append("mv %s ." % (' '.join([maybe_quote(sf["path"]) for sf in f["secondaryFiles"]]))) # for sf in f["secondaryFiles"]: # sf["path"] = os.path.basename(sf["path"]) # # outobj[shortname(out["id"])] = f # script.append("") # script.append("# Clean up staging output directories") # script.append("rm -r %s" % (' '.join([maybe_quote(od) for od in outdirs]))) # script.append("") # script.append("# Generate final output object") # script.append("echo '%s'" % json.dumps(outobj, indent=4)) return "\n".join(script)
def collect_output( self, schema, # type: Dict[Text, Any] builder, # type: Builder outdir, # type: Text fs_access, # type: StdFsAccess compute_checksum=True # type: bool ): # type: (...) -> Optional[Union[Dict[Text, Any], List[Union[Dict[Text, Any], Text]]]] result = [] # type: List[Any] empty_and_optional = False debug = LOGGER.isEnabledFor(logging.DEBUG) if "outputBinding" in schema: binding = schema["outputBinding"] globpatterns = [] # type: List[Text] revmap = partial(command_line_tool.revmap_file, builder, outdir) if "glob" in binding: with SourceLine(binding, "glob", WorkflowException, debug): for glob in aslist(binding["glob"]): glob = builder.do_eval(glob) if glob: globpatterns.extend(aslist(glob)) for glob in globpatterns: if glob.startswith(outdir): glob = glob[len(outdir) + 1:] elif glob == ".": glob = outdir elif glob.startswith("/"): raise WorkflowException( "glob patterns must not start with '/'") try: prefix = fs_access.glob(outdir) key = cmp_to_key( cast(Callable[[Text, Text], int], locale.strcoll)) # In case of stdout.log or stderr.log file not created if "stdout" in self.tool and "stderr" in self.tool \ and glob in (self.tool["stdout"], self.tool["stderr"]): filepath = Path(fs_access.join(outdir, glob)) if not filepath.is_file(): Path(filepath).touch() result.extend([{ "location": g, "path": fs_access.join(builder.outdir, g[len(prefix[0]) + 1:]), "basename": os.path.basename(g), "nameroot": os.path.splitext(os.path.basename(g))[0], "nameext": os.path.splitext(os.path.basename(g))[1], "class": "File" if fs_access.isfile(g) else "Directory" } for g in sorted(fs_access.glob( fs_access.join(outdir, glob)), key=key)]) except (OSError, IOError) as exc: LOGGER.warning(Text(exc)) except Exception: LOGGER.exception("Unexpected error from fs_access") raise for files in result: rfile = files.copy() # TODO This function raise an exception and seems to be related to docker (which is not used here) # revmap(rfile) if files["class"] == "Directory": load_listing = builder.loadListing or ( binding and binding.get("loadListing")) if load_listing and load_listing != "no_listing": get_listing(fs_access, files, (load_listing == "deep_listing")) else: with fs_access.open(rfile["location"], "rb") as f: contents = b"" if binding.get("loadContents") or compute_checksum: contents = f.read(CONTENT_LIMIT) if binding.get("loadContents"): files["contents"] = contents.decode("utf-8") if compute_checksum: checksum = hashlib.sha1() # nosec: B303 while contents != b"": checksum.update(contents) contents = f.read(1024 * 1024) files[ "checksum"] = "sha1$%s" % checksum.hexdigest( ) f.seek(0, 2) file_size = f.tell() files["size"] = file_size optional = False single = False if isinstance(schema["type"], list): if "null" in schema["type"]: optional = True if "File" in schema["type"] or "Directory" in schema["type"]: single = True elif schema["type"] == "File" or schema["type"] == "Directory": single = True if "outputEval" in binding: with SourceLine(binding, "outputEval", WorkflowException, debug): result = builder.do_eval(binding["outputEval"], context=result) if single: if not result and not optional: with SourceLine(binding, "glob", WorkflowException, debug): raise WorkflowException( "Did not find output file with glob pattern: '{}'". format(globpatterns)) elif not result and optional: pass elif isinstance(result, list): if len(result) > 1: raise WorkflowException( "Multiple matches for output item that is a single file." 
) result = result[0] if "secondaryFiles" in schema: with SourceLine(schema, "secondaryFiles", WorkflowException, debug): for primary in aslist(result): if isinstance(primary, dict): primary.setdefault("secondaryFiles", []) pathprefix = primary["path"][0:primary["path"]. rindex("/") + 1] for file in aslist(schema["secondaryFiles"]): if isinstance( file, dict) or "$(" in file or "${" in file: sfpath = builder.do_eval(file, context=primary) subst = False else: sfpath = file subst = True for sfitem in aslist(sfpath): if isinstance(sfitem, str): if subst: sfitem = { "path": substitute( primary["path"], sfitem) } else: sfitem = { "path": pathprefix + sfitem } if "path" in sfitem and "location" not in sfitem: revmap(sfitem) if fs_access.isfile(sfitem["location"]): sfitem["class"] = "File" primary["secondaryFiles"].append( sfitem) elif fs_access.isdir(sfitem["location"]): sfitem["class"] = "Directory" primary["secondaryFiles"].append( sfitem) if "format" in schema: for primary in aslist(result): primary["format"] = builder.do_eval(schema["format"], context=primary) # Ensure files point to local references outside of the run environment # TODO: Again removing revmap.... # adjustFileObjs(result, revmap) if not result and optional: return None if not empty_and_optional and isinstance( schema["type"], dict) and schema["type"]["type"] == "record": out = {} for f in schema["type"]["fields"]: out[shortname( f["name"])] = self.collect_output( # type: ignore f, builder, outdir, fs_access, compute_checksum=compute_checksum) return out return result
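collect_output() above resolves each outputBinding glob against the job's output directory and wraps every match in a CWL File/Directory object. A hedged standalone sketch of that glob step, using the local filesystem instead of the fs_access abstraction:

import glob
import os

def collect_glob_outputs(outdir, pattern):
    # Resolve the glob relative to the output directory and describe each
    # match the way a CWL File/Directory object is expected to look.
    results = []
    for path in sorted(glob.glob(os.path.join(outdir, pattern))):
        basename = os.path.basename(path)
        nameroot, nameext = os.path.splitext(basename)
        results.append({
            "location": path,
            "basename": basename,
            "nameroot": nameroot,
            "nameext": nameext,
            "class": "File" if os.path.isfile(path) else "Directory",
        })
    return results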
def arvExecutor(self, tool, job_order, **kwargs): self.debug = kwargs.get("debug") if kwargs.get("quiet"): logger.setLevel(logging.WARN) logging.getLogger('arvados.arv-run').setLevel(logging.WARN) useruuid = self.api.users().current().execute()["uuid"] self.project_uuid = kwargs.get("project_uuid") if kwargs.get( "project_uuid") else useruuid self.pipeline = None if kwargs.get("create_template"): tmpl = RunnerTemplate(self, tool, job_order, kwargs.get("enable_reuse")) tmpl.save() # cwltool.main will write our return value to stdout. return tmpl.uuid if kwargs.get("submit"): runnerjob = RunnerJob(self, tool, job_order, kwargs.get("enable_reuse")) if not kwargs.get("submit") and "cwl_runner_job" not in kwargs: # Create pipeline for local run self.pipeline = self.api.pipeline_instances().create( body={ "owner_uuid": self.project_uuid, "name": shortname(tool.tool["id"]), "components": {}, "state": "RunningOnClient" }).execute(num_retries=self.num_retries) logger.info("Pipeline instance %s", self.pipeline["uuid"]) if kwargs.get("submit") and not kwargs.get("wait"): runnerjob.run() return runnerjob.uuid events = arvados.events.subscribe( arvados.api('v1'), [["object_uuid", "is_a", "arvados#job"]], self.on_message) self.debug = kwargs.get("debug") self.ignore_docker_for_reuse = kwargs.get("ignore_docker_for_reuse") self.fs_access = CollectionFsAccess(kwargs["basedir"]) kwargs["fs_access"] = self.fs_access kwargs["enable_reuse"] = kwargs.get("enable_reuse") kwargs["outdir"] = "$(task.outdir)" kwargs["tmpdir"] = "$(task.tmpdir)" if kwargs.get("conformance_test"): return cwltool.main.single_job_executor(tool, job_order, **kwargs) else: if kwargs.get("submit"): jobiter = iter((runnerjob, )) else: if "cwl_runner_job" in kwargs: self.uuid = kwargs.get("cwl_runner_job").get('uuid') jobiter = tool.job(job_order, self.output_callback, docker_outdir="$(task.outdir)", **kwargs) try: self.cond.acquire() # Will continue to hold the lock for the duration of this code # except when in cond.wait(), at which point on_message can update # job state and process output callbacks. for runnable in jobiter: if runnable: runnable.run(**kwargs) else: if self.jobs: self.cond.wait(1) else: logger.error( "Workflow is deadlocked, no runnable jobs and not waiting on any pending jobs." ) break while self.jobs: self.cond.wait(1) events.close() except: if sys.exc_info()[0] is KeyboardInterrupt: logger.error("Interrupted, marking pipeline as failed") else: logger.error( "Caught unhandled exception, marking pipeline as failed. Error was: %s", sys.exc_info()[0], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update( uuid=self.pipeline["uuid"], body={ "state": "Failed" }).execute(num_retries=self.num_retries) finally: self.cond.release() if self.final_output is None: raise cwltool.workflow.WorkflowException( "Workflow did not return a result.") return self.final_output
def set_secondary(fsaccess, builder, inputschema, secondaryspec, primary, discovered): if isinstance(inputschema, Sequence) and not isinstance(inputschema, basestring): # union type, collect all possible secondaryFiles for i in inputschema: set_secondary(fsaccess, builder, i, secondaryspec, primary, discovered) return if isinstance(inputschema, basestring): sd = search_schemadef(inputschema, reversed(builder.hints + builder.requirements)) if sd: inputschema = sd else: return if "secondaryFiles" in inputschema: # set secondaryFiles, may be inherited by compound types. secondaryspec = inputschema["secondaryFiles"] if (isinstance(inputschema["type"], (Mapping, Sequence)) and not isinstance(inputschema["type"], basestring)): # compound type (union, array, record) set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, primary, discovered) elif (inputschema["type"] == "record" and isinstance(primary, Mapping)): # # record type, find secondary files associated with fields. # for f in inputschema["fields"]: p = primary.get(shortname(f["name"])) if p: set_secondary(fsaccess, builder, f, secondaryspec, p, discovered) elif (inputschema["type"] == "array" and isinstance(primary, Sequence)): # # array type, find secondary files of elements # for p in primary: set_secondary(fsaccess, builder, {"type": inputschema["items"]}, secondaryspec, p, discovered) elif (inputschema["type"] == "File" and secondaryspec and isinstance(primary, Mapping) and primary.get("class") == "File" and "secondaryFiles" not in primary): # # Found a file, check for secondaryFiles # specs = [] primary["secondaryFiles"] = secondaryspec for i, sf in enumerate(aslist(secondaryspec)): if builder.cwlVersion == "v1.0": pattern = builder.do_eval(sf, context=primary) else: pattern = builder.do_eval(sf["pattern"], context=primary) if pattern is None: continue if isinstance(pattern, list): specs.extend(pattern) elif isinstance(pattern, dict): specs.append(pattern) elif isinstance(pattern, str): if builder.cwlVersion == "v1.0": specs.append({"pattern": pattern, "required": True}) else: specs.append({ "pattern": pattern, "required": sf.get("required") }) else: raise SourceLine( primary["secondaryFiles"], i, validate.ValidationException).makeError( "Expression must return list, object, string or null") found = [] for i, sf in enumerate(specs): if isinstance(sf, dict): if sf.get("class") == "File": pattern = None if sf.get("location") is None: raise SourceLine( primary["secondaryFiles"], i, validate.ValidationException).makeError( "File object is missing 'location': %s" % sf) sfpath = sf["location"] required = True else: pattern = sf["pattern"] required = sf.get("required") elif isinstance(sf, str): pattern = sf required = True else: raise SourceLine( primary["secondaryFiles"], i, validate.ValidationException).makeError( "Expression must return list, object, string or null") if pattern is not None: sfpath = substitute(primary["location"], pattern) required = builder.do_eval(required, context=primary) if fsaccess.exists(sfpath): if pattern is not None: found.append({"location": sfpath, "class": "File"}) else: found.append(sf) elif required: raise SourceLine( primary["secondaryFiles"], i, validate.ValidationException).makeError( "Required secondary file '%s' does not exist" % sfpath) primary["secondaryFiles"] = cmap(found) if discovered is not None: discovered[primary["location"]] = primary["secondaryFiles"] elif inputschema["type"] not in primitive_types_set: set_secondary(fsaccess, builder, inputschema["type"], secondaryspec, 
primary, discovered)
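# Illustrative-only restatement of the recursion pattern in set_secondary
# above: the schema and the corresponding value are walked together, and work
# happens only when a File schema meets a File value. All names and the
# on_file callback are hypothetical simplifications of the real routine.
def walk(schema, value, on_file):
    if isinstance(schema, list):                      # union: try every branch
        for branch in schema:
            walk(branch, value, on_file)
    elif isinstance(schema, dict):
        t = schema.get("type")
        if t == "record" and isinstance(value, dict):
            for f in schema.get("fields", []):
                name = f["name"].split("/")[-1]       # approximates shortname()
                if name in value:
                    walk(f, value[name], on_file)
        elif t == "array" and isinstance(value, list):
            for item in value:
                walk({"type": schema["items"]}, item, on_file)
        elif t == "File" and isinstance(value, dict) and value.get("class") == "File":
            on_file(schema, value)
        elif isinstance(t, (list, dict)):             # nested compound type
            walk(t, value, on_file)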
def run(self, fileStore): cwljob = resolve_indirect(self.cwljob) # `promises` dict # from: each parameter (workflow input or step output) # that may be used as a "source" for a step input workflow output # parameter # to: the job that will produce that value. promises = {} # `jobs` dict from step id to job that implements that step. jobs = {} for inp in self.cwlwf.tool["inputs"]: promises[inp["id"]] = SelfJob(self, cwljob) alloutputs_fufilled = False while not alloutputs_fufilled: # Iteratively go over the workflow steps, scheduling jobs as their # dependencies can be fufilled by upstream workflow inputs or # step outputs. Loop exits when the workflow outputs # are satisfied. alloutputs_fufilled = True for step in self.cwlwf.steps: if step.tool["id"] not in jobs: stepinputs_fufilled = True for inp in step.tool["inputs"]: if "source" in inp: for s in aslist(inp["source"]): if s not in promises: stepinputs_fufilled = False if stepinputs_fufilled: jobobj = {} for inp in step.tool["inputs"]: key = shortname(inp["id"]) if "source" in inp: if inp.get("linkMerge") or len(aslist(inp["source"])) > 1: linkMerge = inp.get("linkMerge", "merge_nested") if linkMerge == "merge_nested": jobobj[key] = ( MergeInputsNested([(shortname(s), promises[s].rv()) for s in aslist(inp["source"])])) elif linkMerge == "merge_flattened": jobobj[key] = ( MergeInputsFlattened([(shortname(s), promises[s].rv()) for s in aslist(inp["source"])])) else: raise validate.ValidationException( "Unsupported linkMerge '%s'", linkMerge) else: jobobj[key] = (shortname(inp["source"]), promises[inp["source"]].rv()) elif "default" in inp: d = copy.copy(inp["default"]) jobobj[key] = ("default", {"default": d}) if "valueFrom" in inp and "scatter" not in step.tool: if key in jobobj: jobobj[key] = StepValueFrom(inp["valueFrom"], jobobj[key], self.cwlwf.requirements) else: jobobj[key] = StepValueFrom(inp["valueFrom"], ("None", {"None": None}), self.cwlwf.requirements) if "scatter" in step.tool: wfjob = CWLScatter(step, IndirectDict(jobobj), **self.executor_options) followOn = CWLGather(step, wfjob.rv()) wfjob.addFollowOn(followOn) else: (wfjob, followOn) = makeJob(step.embedded_tool, IndirectDict(jobobj), step_inputs=step.tool["inputs"], **self.executor_options) jobs[step.tool["id"]] = followOn connected = False for inp in step.tool["inputs"]: for s in aslist(inp.get("source", [])): if not promises[s].hasChild(wfjob): promises[s].addChild(wfjob) connected = True if not connected: # workflow step has default inputs only, isn't connected to other jobs, # so add it as child of workflow. self.addChild(wfjob) for out in step.tool["outputs"]: promises[out["id"]] = followOn for inp in step.tool["inputs"]: for s in aslist(inp.get("source", [])): if s not in promises: alloutputs_fufilled = False # may need a test for out in self.cwlwf.tool["outputs"]: if "source" in out: if out["source"] not in promises: alloutputs_fufilled = False outobj = {} for out in self.cwlwf.tool["outputs"]: outobj[shortname(out["id"])] = (shortname(out["outputSource"]), promises[out["outputSource"]].rv()) return IndirectDict(outobj)
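# A compact sketch of the fixed-point loop in run() above: keep sweeping the
# steps, scheduling any step whose every "source" already has a promise, until
# all workflow outputs are covered. The step/tool shapes and schedule() are
# simplified stand-ins, not the Toil API.
def schedule_all(steps, workflow_outputs, promises, schedule):
    # steps: dicts with "id", "inputs" (list-valued "source") and "outputs";
    # schedule(step) returns the job whose result satisfies those outputs.
    done = set()
    progress = True
    while progress and not all(o in promises for o in workflow_outputs):
        progress = False
        for step in steps:
            if step["id"] in done:
                continue
            sources = [s for inp in step["inputs"] for s in inp.get("source", [])]
            if all(s in promises for s in sources):
                job = schedule(step)
                for out in step["outputs"]:
                    promises[out] = job
                done.add(step["id"])
                progress = True
    # if the loop stopped with outputs still uncovered, the graph is deadlocked
    return promises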
def collect_output_ports( self, ports, # type: Set[Dict[Text, Any]] builder, # type: Builder outdir, # type: Text compute_checksum=True, # type: bool jobname="", # type: Text readers=None # type: Dict[Text, Any] ): # type: (...) -> OutputPorts ret = {} # type: OutputPorts debug = LOGGER.isEnabledFor(logging.DEBUG) try: fs_access = builder.make_fs_access(outdir) custom_output = fs_access.join(outdir, "cwl.output.json") if fs_access.exists(custom_output): with fs_access.open(custom_output, "r") as f: ret = json.load(f) if debug: LOGGER.debug(u"Raw output from %s: %s", custom_output, json.dumps(ret, indent=4)) else: for i, port in enumerate(ports): def make_workflow_exception(msg): return WorkflowException( u"Error collecting output for parameter '%s':\n%s" % (shortname(port["id"]), msg)) with SourceLine(ports, i, make_workflow_exception, debug): fragment = shortname(port["id"]) ret[fragment] = self.collect_output( port, builder, outdir, fs_access, compute_checksum=compute_checksum) if ret: # revmap = partial(command_line_tool.revmap_file, builder, outdir) adjustDirObjs(ret, trim_listing) # TODO: Attempt to avoid a crash because the revmap fct is not functional # (intend for a docker usage only?) # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap)) visit_class(ret, ("File", "Directory"), command_line_tool.remove_path) normalizeFilesDirs(ret) visit_class( ret, ("File", "Directory"), partial(command_line_tool.check_valid_locations, fs_access)) if compute_checksum: adjustFileObjs(ret, partial(compute_checksums, fs_access)) validate.validate_ex(self.names.get_name("outputs_record_schema", ""), ret, strict=False, logger=LOGGER) if ret is not None and builder.mutation_manager is not None: adjustFileObjs(ret, builder.mutation_manager.set_generation) return ret if ret is not None else {} except validate.ValidationException as exc: raise WorkflowException( "Error validating output record: {!s}\nIn:\n{}".format( exc, json.dumps(ret, indent=4))) finally: if builder.mutation_manager and readers: for reader in readers.values(): builder.mutation_manager.release_reader(jobname, reader)
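# Minimal sketch of the cwl.output.json convention handled above: if the tool
# wrote cwl.output.json into its output directory, that file *is* the output
# object and per-port collection is skipped. Paths and names are illustrative;
# the real code goes through the builder's fs_access rather than os/open.
import json
import os

def collect_outputs(outdir, collect_port, ports):
    custom = os.path.join(outdir, "cwl.output.json")
    if os.path.exists(custom):
        with open(custom) as fh:
            return json.load(fh)
    return {port["id"].split("/")[-1]: collect_port(port) for port in ports}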
def main(args=None): parser = ArgumentParser() Job.Runner.addToilOptions(parser) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", type=str) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--conformance-test", action="store_true") parser.add_argument("--no-container", action="store_true") parser.add_argument("--quiet", action="store_true") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=version) # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] options = parser.parse_args([workdir] + args) if options.quiet: options.logLevel = "WARNING" uri = "file://" + os.path.abspath(options.cwljob) t = cwltool.main.load_tool(options.cwltool, False, True, cwltool.workflow.defaultMakeTool, True) if options.conformance_test: loader = schema_salad.ref_resolver.Loader({}) else: jobloaderctx = {"path": {"@type": "@id"}, "format": {"@type": "@id"}} jobloaderctx.update(t.metadata.get("$namespaces", {})) loader = schema_salad.ref_resolver.Loader(jobloaderctx) job, _ = loader.resolve_ref(uri) if type(t) == int: return t try: checkRequirements(t.tool) except Exception as e: logging.error(e) return 33 jobobj = {} for inp in t.tool["inputs"]: if shortname(inp["id"]) in job: pass elif shortname(inp["id"]) not in job and "default" in inp: job[shortname(inp["id"])] = copy.copy(inp["default"]) elif shortname(inp["id"]) not in job and inp["type"][0] == "null": pass else: raise validate.ValidationException("Missing inputs `%s`" % shortname(inp["id"])) adjustFiles(job, lambda x: x.replace("file://", "")) if options.conformance_test: sys.stdout.write(json.dumps( cwltool.main.single_job_executor(t, job, options.basedir, options, conformance_test=True), indent=4)) return 0 if not options.basedir: options.basedir = os.path.dirname(os.path.abspath(options.cwljob)) outdir = options.outdir staging = StageJob(t, job, os.path.dirname(os.path.abspath(options.cwljob))) (wf1, wf2) = makeJob(t, staging.rv()) staging.addFollowOn(wf1) wf2.addFollowOn(FinalJob(wf2.rv(), outdir)) Job.Runner.startToil(staging, options) with open(os.path.join(outdir, "cwl.output.json"), "r") as f: sys.stdout.write(f.read()) return 0
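# The input-resolution rule applied in main() above, as a stand-alone sketch:
# a value supplied in the job order wins, otherwise the parameter default is
# copied in, otherwise a nullable type may stay unset; anything else is an
# error. Note the sketch accepts "null" anywhere in the type list, a slight
# generalisation of the original's inp["type"][0] check; shortname() is
# approximated by taking the trailing id fragment.
import copy

def fill_in_inputs(tool_inputs, job):
    for inp in tool_inputs:
        name = inp["id"].split("#")[-1].split("/")[-1]
        if name in job:
            continue
        types = inp["type"] if isinstance(inp["type"], list) else [inp["type"]]
        if "default" in inp:
            job[name] = copy.copy(inp["default"])
        elif "null" in types:
            continue
        else:
            raise ValueError("Missing input `%s`" % name)
    return job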
def main(args=None, stdout=sys.stdout): parser = argparse.ArgumentParser() Job.Runner.addToilOptions(parser) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--no-container", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) parser.add_argument("--preserve-environment", type=str, nargs='+', help="Preserve specified environment variables when running CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",), dest="preserve_environment") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] options = parser.parse_args([workdir] + args) use_container = not options.no_container setLoggingFromOptions(options) if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) fileindex = {} existing = {} with Toil(options) as toil: if options.restart: outobj = toil.restart() else: useStrict = not options.not_strict try: t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool, kwargs={ "hints": [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }]}, resolver=cwltool.resolver.tool_resolver, strict=useStrict) unsupportedRequirementsCheck(t.requirements) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 if type(t) == int: return t options.workflow = options.cwltool options.job_order = options.cwljob options.tool_help = None options.debug = options.logLevel == "DEBUG" job = cwltool.main.load_job_order(options, t, sys.stdin) if type(job) == int: return job job, options.basedir = job fillInDefaults(t.tool["inputs"], job) def pathToLoc(p): if "location" not in p and "path" in p: p["location"] = p["path"] del p["path"] def importFiles(tool): visit_class(tool, ("File", "Directory"), pathToLoc) normalizeFilesDirs(tool) adjustDirObjs(tool, functools.partial(get_listing, cwltool.stdfsaccess.StdFsAccess(""), recursive=True)) adjustFileObjs(tool, functools.partial(uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) t.visit(importFiles) for inp in t.tool["inputs"]: def setSecondary(fileobj): if isinstance(fileobj, dict) and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [{ "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"} for sf in inp["secondaryFiles"]] if isinstance(fileobj, list): for e in fileobj: setSecondary(e) if shortname(inp["id"]) in job and inp.get("secondaryFiles"): setSecondary(job[shortname(inp["id"])]) importFiles(job) visitSteps(t, importFiles) make_fs_access = functools.partial(ToilFsAccess, fileStore=toil) try: (wf1, wf2) = makeJob(t, {}, use_container=use_container, preserve_environment=options.preserve_environment, tmpdir=os.path.realpath(outdir), workdir=options.workDir) except 
cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 wf1.cwljob = job outobj = toil.start(wf1) outobj = resolve_indirect(outobj) toilStageFiles(toil, outobj, outdir, fileindex, existing, True) visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) stdout.write(json.dumps(outobj, indent=4)) return 0
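# Why the mkdtemp()/rmdir() dance in both main() variants above: Toil requires
# that the job store directory not yet exist, while mkdtemp() both reserves a
# unique name and creates it, so the directory is created and immediately
# removed before being prepended to the argument list as the positional
# jobStore. The helper name is ours.
import os
import tempfile

def fresh_jobstore_path():
    workdir = tempfile.mkdtemp()   # reserves a unique, collision-free name
    os.rmdir(workdir)              # ...but Toil insists on creating it itself
    return workdir

# options = parser.parse_args([fresh_jobstore_path()] + sys.argv[1:])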
def upload_workflow(arvRunner, tool, job_order, project_uuid, uuid=None, submit_runner_ram=0, name=None, merged_map=None): packed = packed_workflow(arvRunner, tool, merged_map) adjustDirObjs(job_order, trim_listing) adjustFileObjs(job_order, trim_anonymous_location) adjustDirObjs(job_order, trim_anonymous_location) main = [p for p in packed["$graph"] if p["id"] == "#main"][0] for inp in main["inputs"]: sn = shortname(inp["id"]) if sn in job_order: inp["default"] = job_order[sn] if not name: name = tool.tool.get("label", os.path.basename(tool.tool["id"])) upload_dependencies(arvRunner, name, tool.doc_loader, packed, tool.tool["id"], False) if submit_runner_ram: hints = main.get("hints", []) found = False for h in hints: if h["class"] == "http://arvados.org/cwl#WorkflowRunnerResources": h["ramMin"] = submit_runner_ram found = True break if not found: hints.append({ "class": "http://arvados.org/cwl#WorkflowRunnerResources", "ramMin": submit_runner_ram }) main["hints"] = hints body = { "workflow": { "name": name, "description": tool.tool.get("doc", ""), "definition": json.dumps(packed, sort_keys=True, indent=4, separators=(',', ': ')) } } if project_uuid: body["workflow"]["owner_uuid"] = project_uuid if uuid: call = arvRunner.api.workflows().update(uuid=uuid, body=body) else: call = arvRunner.api.workflows().create(body=body) return call.execute(num_retries=arvRunner.num_retries)["uuid"]
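# A small stand-alone restatement of the hint handling above: update an
# existing arv:WorkflowRunnerResources hint in place if present, otherwise
# append one. The class URI is taken from the code above; the helper itself
# is an illustrative sketch.
RUNNER_RESOURCES = "http://arvados.org/cwl#WorkflowRunnerResources"

def set_runner_ram(main, ram_mib):
    hints = main.setdefault("hints", [])
    for h in hints:
        if h.get("class") == RUNNER_RESOURCES:
            h["ramMin"] = ram_mib
            return hints
    hints.append({"class": RUNNER_RESOURCES, "ramMin": ram_mib})
    return hints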
def job(self, joborder, output_callback, **kwargs): kwargs["work_api"] = self.work_api req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer") if req: with SourceLine(self.tool, None, WorkflowException): if "id" not in self.tool: raise WorkflowException("%s object must have 'id'" % (self.tool["class"])) document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"]) with Perf(metrics, "subworkflow upload_deps"): upload_dependencies(self.arvrunner, os.path.basename(joborder.get("id", "#")), document_loader, joborder, joborder.get("id", "#"), False) if self.wf_pdh is None: workflowobj["requirements"] = dedup_reqs(self.requirements) workflowobj["hints"] = dedup_reqs(self.hints) packed = pack(document_loader, workflowobj, uri, self.metadata) upload_dependencies(self.arvrunner, kwargs.get("name", ""), document_loader, packed, uri, False) with Perf(metrics, "subworkflow adjust"): joborder_keepmount = copy.deepcopy(joborder) def keepmount(obj): with SourceLine(obj, None, WorkflowException): if "location" not in obj: raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj)) with SourceLine(obj, "location", WorkflowException): if obj["location"].startswith("keep:"): obj["location"] = "/keep/" + obj["location"][5:] if "listing" in obj: del obj["listing"] elif obj["location"].startswith("_:"): del obj["location"] else: raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"]) adjustFileObjs(joborder_keepmount, keepmount) adjustDirObjs(joborder_keepmount, keepmount) if self.wf_pdh is None: adjustFileObjs(packed, keepmount) adjustDirObjs(packed, keepmount) self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed) wf_runner = cmap({ "class": "CommandLineTool", "baseCommand": "cwltool", "inputs": self.tool["inputs"], "outputs": self.tool["outputs"], "stdout": "cwl.output.json", "requirements": workflowobj["requirements"]+[ { "class": "InitialWorkDirRequirement", "listing": [{ "entryname": "workflow.cwl", "entry": { "class": "File", "location": "keep:%s/workflow.cwl" % self.wf_pdh } }, { "entryname": "cwl.input.yml", "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${') }] }], "hints": workflowobj["hints"], "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"] }) kwargs["loader"] = self.doc_loader kwargs["avsc_names"] = self.doc_schema return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(joborder, output_callback, **kwargs) else: return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
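# Stand-alone sketch of the keepmount() rewrite used above: "keep:" references
# become paths under the /keep/ mount visible inside the container, anonymous
# literals ("_:") drop their location, and anything else is rejected. The
# helper name to_keep_mount is ours.
def to_keep_mount(obj):
    loc = obj.get("location", "")
    if loc.startswith("keep:"):
        obj["location"] = "/keep/" + loc[len("keep:"):]
        obj.pop("listing", None)   # listings are not needed once mounted
    elif loc.startswith("_:"):
        del obj["location"]
    else:
        raise ValueError("Not a keep reference or a literal: %r" % loc)
    return obj

# to_keep_mount({"class": "File", "location": "keep:abc123+456/reads.fastq"})
# -> {"class": "File", "location": "/keep/abc123+456/reads.fastq"}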
def discover_default_secondary_files(obj):
    # Note: `discovered` is not a parameter; in the original source this helper
    # is a closure that captures `discovered` from its enclosing scope. Only
    # inputs carrying a "default" are passed along for secondaryFiles discovery.
    discover_secondary_files(obj["inputs"],
                             {shortname(t["id"]): t["default"]
                              for t in obj["inputs"] if "default" in t},
                             discovered)
def job( self, joborder, # type: Dict[Text, AnyValue] output_callbacks, # type: Callable[[Any, Any], Any] runtime_context, # type: RuntimeContext ): # type: (...) -> Generator[Union[JobBase, CallbackJob], None, None] """ Workflow job generator. :param joborder: inputs of the job submission :param output_callbacks: method to fetch step outputs and corresponding step details :param runtime_context: configs about execution environment :return: """ require_prefix = "" if self.metadata["cwlVersion"] == "v1.0": require_prefix = "http://commonwl.org/cwltool#" jobname = uniquename(runtime_context.name or shortname(self.tool.get("id", "job"))) # outdir must be served by the EMS because downstream step will need access to upstream steps output weaver_out_dir = get_wps_output_dir(get_settings(app)) runtime_context.outdir = tempfile.mkdtemp(prefix=getdefault( runtime_context.tmp_outdir_prefix, DEFAULT_TMP_PREFIX), dir=weaver_out_dir) builder = self._init_job(joborder, runtime_context) # `jobname` is the step name and `joborder` is the actual step inputs wps_workflow_job = WpsWorkflowJob( builder, builder.job, self.requirements, self.hints, jobname, self.get_job_process_definition(jobname, joborder, self.tool), self.tool["outputs"]) wps_workflow_job.prov_obj = self.prov_obj wps_workflow_job.successCodes = self.tool.get("successCodes") wps_workflow_job.temporaryFailCodes = self.tool.get( "temporaryFailCodes") wps_workflow_job.permanentFailCodes = self.tool.get( "permanentFailCodes") # TODO Taken from command_line_tool.py maybe this could let us use the revmap if required at all # reffiles = copy.deepcopy(builder.files) # builder.pathmapper = self.make_path_mapper( # reffiles, builder.stagedir, runtimeContext, True) # builder.requirements = wps_workflow_job.requirements wps_workflow_job.outdir = builder.outdir wps_workflow_job.tmpdir = builder.tmpdir wps_workflow_job.stagedir = builder.stagedir readers = {} # type: Dict[Text, Any] timelimit = self.get_requirement(require_prefix + "TimeLimit")[0] if timelimit: with SourceLine(timelimit, "timelimit", validate.ValidationException): wps_workflow_job.timelimit = builder.do_eval( timelimit["timelimit"]) if not isinstance(wps_workflow_job.timelimit, int) or wps_workflow_job.timelimit < 0: raise Exception( "timelimit must be an integer >= 0, got: %s" % wps_workflow_job.timelimit) wps_workflow_job.collect_outputs = partial( self.collect_output_ports, self.tool["outputs"], builder, compute_checksum=getdefault(runtime_context.compute_checksum, True), jobname=jobname, readers=readers) wps_workflow_job.output_callback = output_callbacks yield wps_workflow_job
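# The wiring used at the end of job() above, in isolation: the output
# collector is pre-bound with functools.partial so the job object can later
# call collect_outputs(outdir) without carrying the ports or builder around.
# collect_output_ports here is a trivial stand-in, not the real method.
from functools import partial

def collect_output_ports(ports, builder, outdir, compute_checksum=True):
    # stand-in body: the real method walks `ports` and the files under `outdir`
    return {p: None for p in ports}

collect_outputs = partial(collect_output_ports, ["report", "log"], None,
                          compute_checksum=True)
outputs = collect_outputs("/tmp/step-outdir")  # only outdir is supplied at call time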
def job(self, joborder, output_callback, runtimeContext): builder = make_builder(joborder, self.hints, self.requirements, runtimeContext) runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext) req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer") if not req: return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext) # RunInSingleContainer is true with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "id" not in self.tool: raise WorkflowException("%s object must have 'id'" % (self.tool["class"])) discover_secondary_files(self.arvrunner.fs_access, builder, self.tool["inputs"], joborder) with Perf(metrics, "subworkflow upload_deps"): upload_dependencies(self.arvrunner, os.path.basename(joborder.get("id", "#")), self.doc_loader, joborder, joborder.get("id", "#"), False) if self.wf_pdh is None: packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader) for p in packed["$graph"]: if p["id"] == "#main": p["requirements"] = dedup_reqs(self.requirements) p["hints"] = dedup_reqs(self.hints) def visit(item): if "requirements" in item: item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"] for t in ("hints", "requirements"): if t not in item: continue for req in item[t]: if req["class"] == "ResourceRequirement": dyn = False for k in max_res_pars + sum_res_pars: if k in req: if isinstance(req[k], basestring): if item["id"] == "#main": # only the top-level requirements/hints may contain expressions self.dynamic_resource_req.append(req) dyn = True break else: with SourceLine(req, k, WorkflowException): raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions") if not dyn: self.static_resource_req.append(req) visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit) if self.static_resource_req: self.static_resource_req = [get_overall_res_req(self.static_resource_req)] upload_dependencies(self.arvrunner, runtimeContext.name, self.doc_loader, packed, self.tool["id"], False) # Discover files/directories referenced by the # workflow (mainly "default" values) visit_class(packed, ("File", "Directory"), self.wf_reffiles.append) if self.dynamic_resource_req: # Evaluate dynamic resource requirements using current builder rs = copy.copy(self.static_resource_req) for dyn_rs in self.dynamic_resource_req: eval_req = {"class": "ResourceRequirement"} for a in max_res_pars + sum_res_pars: if a in dyn_rs: eval_req[a] = builder.do_eval(dyn_rs[a]) rs.append(eval_req) job_res_reqs = [get_overall_res_req(rs)] else: job_res_reqs = self.static_resource_req with Perf(metrics, "subworkflow adjust"): joborder_resolved = copy.deepcopy(joborder) joborder_keepmount = copy.deepcopy(joborder) reffiles = [] visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append) mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir, "/keep/%s", "/keep/%s/%s") # For containers API, we need to make sure any extra # referenced files (ie referenced by the workflow but # not in the inputs) are included in the mounts. 
if self.wf_reffiles: runtimeContext = runtimeContext.copy() runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles) def keepmount(obj): remove_redundant_fields(obj) with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "location" not in obj: raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj)) with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).target if "listing" in obj: del obj["listing"] elif obj["location"].startswith("_:"): del obj["location"] else: raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"]) visit_class(joborder_keepmount, ("File", "Directory"), keepmount) def resolved(obj): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).resolved visit_class(joborder_resolved, ("File", "Directory"), resolved) if self.wf_pdh is None: adjustFileObjs(packed, keepmount) adjustDirObjs(packed, keepmount) self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed) self.loadingContext = self.loadingContext.copy() self.loadingContext.metadata = self.loadingContext.metadata.copy() self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0" if len(job_res_reqs) == 1: # RAM request needs to be at least 128 MiB or the workflow # runner itself won't run reliably. if job_res_reqs[0].get("ramMin", 1024) < 128: job_res_reqs[0]["ramMin"] = 128 arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"] if runtimeContext.debug: arguments.insert(0, '--debug') wf_runner = cmap({ "class": "CommandLineTool", "baseCommand": "cwltool", "inputs": self.tool["inputs"], "outputs": self.tool["outputs"], "stdout": "cwl.output.json", "requirements": self.requirements+job_res_reqs+[ {"class": "InlineJavascriptRequirement"}, { "class": "InitialWorkDirRequirement", "listing": [{ "entryname": "workflow.cwl", "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh }, { "entryname": "cwl.input.yml", "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${') }] }], "hints": self.hints, "arguments": arguments, "id": "#" }) return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
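# A hedged sketch of what "overall" resource merging (get_overall_res_req in
# the code above) has to do when several per-step ResourceRequirements are
# squashed into one container: take the maximum of compute requests such as
# coresMin/ramMin and the sum of scratch-space requests such as
# outdirMin/tmpdirMin. The exact parameter split below is our assumption,
# not a quote of max_res_pars/sum_res_pars.
MAX_PARS = ("coresMin", "coresMax", "ramMin", "ramMax")
SUM_PARS = ("outdirMin", "tmpdirMin")

def overall_res_req(reqs):
    out = {"class": "ResourceRequirement"}
    for k in MAX_PARS:
        vals = [r[k] for r in reqs if k in r]
        if vals:
            out[k] = max(vals)
    for k in SUM_PARS:
        vals = [r[k] for r in reqs if k in r]
        if vals:
            out[k] = sum(vals)
    return out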
def run(self, fileStore): cwljob = resolve_indirect(self.cwljob) # `promises` dict # from: each parameter (workflow input or step output) # that may be used as a "source" for a step input workflow output # parameter # to: the job that will produce that value. promises = {} # `jobs` dict from step id to job that implements that step. jobs = {} for inp in self.cwlwf.tool["inputs"]: promises[inp["id"]] = SelfJob(self, cwljob) alloutputs_fufilled = False while not alloutputs_fufilled: # Iteratively go over the workflow steps, scheduling jobs as their # dependencies can be fufilled by upstream workflow inputs or # step outputs. Loop exits when the workflow outputs # are satisfied. alloutputs_fufilled = True for step in self.cwlwf.steps: if step.tool["id"] not in jobs: stepinputs_fufilled = True for inp in step.tool["inputs"]: if "source" in inp: for s in aslist(inp["source"]): if s not in promises: stepinputs_fufilled = False if stepinputs_fufilled: jobobj = {} for inp in step.tool["inputs"]: key = shortname(inp["id"]) if "source" in inp: if inp.get("linkMerge") or len(aslist(inp["source"])) > 1: linkMerge = inp.get("linkMerge", "merge_nested") if linkMerge == "merge_nested": jobobj[key] = ( MergeInputsNested([(shortname(s), promises[s].rv()) for s in aslist(inp["source"])])) elif linkMerge == "merge_flattened": jobobj[key] = ( MergeInputsFlattened([(shortname(s), promises[s].rv()) for s in aslist(inp["source"])])) else: raise validate.ValidationException( "Unsupported linkMerge '%s'", linkMerge) else: jobobj[key] = ( shortname(inp["source"]), promises[inp["source"]].rv()) elif "default" in inp: d = copy.copy(inp["default"]) jobobj[key] = ("default", {"default": d}) if "valueFrom" in inp and "scatter" not in step.tool: if key in jobobj: jobobj[key] = StepValueFrom(inp["valueFrom"], jobobj[key], self.cwlwf.requirements) else: jobobj[key] = StepValueFrom(inp["valueFrom"], ("None", {"None": None}), self.cwlwf.requirements) if "scatter" in step.tool: wfjob = CWLScatter(step, IndirectDict(jobobj), **self.executor_options) followOn = CWLGather(step, wfjob.rv()) wfjob.addFollowOn(followOn) else: (wfjob, followOn) = makeJob(step.embedded_tool, IndirectDict(jobobj), **self.executor_options) jobs[step.tool["id"]] = followOn connected = False for inp in step.tool["inputs"]: for s in aslist(inp.get("source", [])): if not promises[s].hasChild(wfjob): promises[s].addChild(wfjob) connected = True if not connected: # workflow step has default inputs only, isn't connected to other jobs, # so add it as child of workflow. self.addChild(wfjob) for out in step.tool["outputs"]: promises[out["id"]] = followOn for inp in step.tool["inputs"]: for s in aslist(inp.get("source", [])): if s not in promises: alloutputs_fufilled = False # may need a test for out in self.cwlwf.tool["outputs"]: if "source" in out: if out["source"] not in promises: alloutputs_fufilled = False outobj = {} for out in self.cwlwf.tool["outputs"]: outobj[shortname(out["id"])] = (shortname(out["outputSource"]), promises[out["outputSource"]].rv()) return IndirectDict(outobj)
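# The two linkMerge behaviours branched on above, shown on plain lists of
# (source, value) pairs: merge_nested keeps one element per source (possibly
# itself a list), while merge_flattened concatenates list-valued sources into
# a single flat list. These helpers are simplified stand-ins for
# MergeInputsNested / MergeInputsFlattened.
def merge_nested(sources):
    return [value for _, value in sources]

def merge_flattened(sources):
    out = []
    for _, value in sources:
        out.extend(value if isinstance(value, list) else [value])
    return out

# merge_nested([("a", [1, 2]), ("b", 3)])    -> [[1, 2], 3]
# merge_flattened([("a", [1, 2]), ("b", 3)]) -> [1, 2, 3]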