def toilStageFiles(fileStore, cwljob, outdir, index, existing, export):
    """Copy input files out of the global file store and update location and path."""
    jobfiles = []  # type: List[Dict[Text, Any]]
    collectFilesAndDirs(cwljob, jobfiles)
    pm = ToilPathMapper(jobfiles, "", outdir, separateDirs=False, stage_listing=True)
    for f, p in pm.items():
        if not p.staged:
            continue
        if not os.path.exists(os.path.dirname(p.target)):
            os.makedirs(os.path.dirname(p.target), 0o0755)
        if p.type == "File":
            fileStore.exportFile(p.resolved[7:], "file://" + p.target)
        elif p.type == "Directory" and not os.path.exists(p.target):
            os.makedirs(p.target, 0o0755)
        elif p.type == "CreateFile":
            with open(p.target, "wb") as n:
                n.write(p.resolved.encode("utf-8"))

    def _check_adjust(f):
        f["location"] = schema_salad.ref_resolver.file_uri(pm.mapper(f["location"])[1])
        if "contents" in f:
            del f["contents"]
        return f

    visit_class(cwljob, ("File", "Directory"), _check_adjust)
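The staging code above leans on cwltool's visit_class to walk a CWL JSON structure and apply a callback to every object of a given class. A minimal sketch of that traversal, illustrative only and not the cwltool implementation:

def visit_class_sketch(rec, cls, op):
    # Recursively apply op() to every dict whose "class" is in cls.
    if isinstance(rec, dict):
        if rec.get("class") in cls:
            op(rec)
        for value in rec.values():
            visit_class_sketch(value, cls, op)
    elif isinstance(rec, list):
        for entry in rec:
            visit_class_sketch(entry, cls, op)

job = {"inp": {"class": "File", "path": "/tmp/a.txt"}}
visit_class_sketch(job, ("File", "Directory"),
                   lambda f: f.setdefault("location", f.pop("path")))
print(job)  # {'inp': {'class': 'File', 'location': '/tmp/a.txt'}}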
def returndeps(
        obj,  # type: Optional[Mapping[Text, Any]]
        document_loader,  # type: Loader
        stdout,  # type: Union[TextIO, StreamWriter]
        relative_deps,  # type: Text
        uri,  # type: Text
        basedir=None  # type: Text
):  # type: (...) -> Text
    """Return a JSON representation of the dependencies of the CWL document."""
    deps = {"class": "File", "location": uri}  # type: Dict[Text, Any]

    def loadref(base, uri):
        return document_loader.fetch(document_loader.fetcher.urljoin(base, uri))

    sfs = scandeps(basedir if basedir else uri, obj,
                   {"$import", "run"},
                   {"$include", "$schemas", "location"},
                   loadref)
    if sfs:
        deps["secondaryFiles"] = sfs

    if relative_deps:
        if relative_deps == "primary":
            base = basedir if basedir else os.path.dirname(uri_file_path(str(uri)))
        elif relative_deps == "cwd":
            base = os.getcwd()
        else:
            raise Exception(u"Unknown relative_deps %s" % relative_deps)
        visit_class(deps, ("File", "Directory"),
                    functools.partial(make_relative, base))
    return json_dumps(deps, indent=4)
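When relative_deps is requested, each discovered File or Directory location is rewritten relative to a base directory via make_relative. A rough, self-contained illustration of that rewrite; the helper name is hypothetical, and cwltool's make_relative additionally handles file:// URIs:

import os

def make_relative_sketch(base, obj):
    # Rewrite an absolute location to one relative to `base`.
    loc = obj["location"]
    if os.path.isabs(loc):
        obj["location"] = os.path.relpath(loc, base)

dep = {"class": "File", "location": "/data/project/inputs/reads.fq"}
make_relative_sketch("/data/project", dep)
print(dep["location"])  # inputs/reads.fq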
def importFiles(tool):
    visit_class(tool, ("File", "Directory"), pathToLoc)
    normalizeFilesDirs(tool)
    adjustDirObjs(tool, functools.partial(
        get_listing, cwltool.stdfsaccess.StdFsAccess(""), recursive=True))
    adjustFileObjs(tool, functools.partial(
        uploadFile, toil.importFile, fileindex, existing, skip_broken=True))
def import_files(tool):
    visit_class(tool, ("File", "Directory"), path_to_loc)
    visit_class(tool, ("File",), functools.partial(add_sizes, fs_access))
    normalizeFilesDirs(tool)
    adjustDirObjs(tool, functools.partial(get_listing, fs_access, recursive=True))
    adjustFileObjs(tool, functools.partial(
        uploadFile, toil.importFile, fileindex, existing, skip_broken=True))
def collect_output_ports(self,
                         ports,  # type: Set[Dict[Text, Any]]
                         builder,  # type: Builder
                         outdir,  # type: Text
                         compute_checksum=True,  # type: bool
                         jobname="",  # type: Text
                         readers=None  # type: Dict[Text, Any]
                         ):  # type: (...) -> OutputPorts
    ret = {}  # type: OutputPorts
    debug = LOGGER.isEnabledFor(logging.DEBUG)
    try:
        fs_access = builder.make_fs_access(outdir)
        custom_output = fs_access.join(outdir, "cwl.output.json")
        if fs_access.exists(custom_output):
            with fs_access.open(custom_output, "r") as f:
                ret = json.load(f)
            if debug:
                LOGGER.debug(u"Raw output from %s: %s", custom_output,
                             json.dumps(ret, indent=4))
        else:
            for i, port in enumerate(ports):
                def make_workflow_exception(msg):
                    return WorkflowException(
                        u"Error collecting output for parameter '%s':\n%s"
                        % (shortname(port["id"]), msg))
                with SourceLine(ports, i, make_workflow_exception, debug):
                    fragment = shortname(port["id"])
                    ret[fragment] = self.collect_output(
                        port, builder, outdir, fs_access,
                        compute_checksum=compute_checksum)
        if ret:
            # revmap = partial(command_line_tool.revmap_file, builder, outdir)
            adjustDirObjs(ret, trim_listing)
            # TODO: Attempt to avoid a crash because the revmap fct is not functional
            # (intend for a docker usage only?)
            # visit_class(ret, ("File", "Directory"), cast(Callable[[Any], Any], revmap))
            visit_class(ret, ("File", "Directory"), command_line_tool.remove_path)
            normalizeFilesDirs(ret)
            visit_class(ret, ("File", "Directory"),
                        partial(command_line_tool.check_valid_locations, fs_access))
            if compute_checksum:
                adjustFileObjs(ret, partial(compute_checksums, fs_access))
            validate.validate_ex(
                self.names.get_name("outputs_record_schema", ""), ret,
                strict=False, logger=LOGGER)
        if ret is not None and builder.mutation_manager is not None:
            adjustFileObjs(ret, builder.mutation_manager.set_generation)
        return ret if ret is not None else {}
    except validate.ValidationException as exc:
        raise WorkflowException(
            "Error validating output record: {!s}\nIn:\n{}".format(
                exc, json.dumps(ret, indent=4)))
    finally:
        if builder.mutation_manager and readers:
            for reader in readers.values():
                builder.mutation_manager.release_reader(jobname, reader)
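The keys of ret come from shortname, which strips the document URI and step prefixes from a parameter id. Roughly, as an approximation rather than cwltool's exact implementation:

def shortname_sketch(input_id):
    # Keep only the last fragment segment of a CWL identifier.
    return input_id.split("#")[-1].split("/")[-1]

print(shortname_sketch("file:///tool.cwl#main/output_bam"))  # output_bam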
def toilStageFiles(fileStore, cwljob, outdir, index, existing, export,
                   destBucket=None):
    """Copy input files out of the global file store and update location and path."""
    jobfiles = []  # type: List[Dict[Text, Any]]
    collectFilesAndDirs(cwljob, jobfiles)
    pm = ToilPathMapper(jobfiles, "", outdir, separateDirs=False,
                        stage_listing=True)
    for f, p in pm.items():
        if not p.staged:
            continue

        # Deal with bucket exports
        if destBucket:
            # Directories don't need to be created if we're exporting to
            # a bucket
            if p.type == "File":
                # Remove the staging directory from the filepath and
                # form the destination URL
                unstageTargetPath = p.target[len(outdir):]
                destUrl = '/'.join(
                    s.strip('/') for s in [destBucket, unstageTargetPath])
                fileStore.exportFile(p.resolved[7:], destUrl)
            continue

        if not os.path.exists(os.path.dirname(p.target)):
            os.makedirs(os.path.dirname(p.target), 0o0755)
        if p.type == "File":
            fileStore.exportFile(p.resolved[7:], "file://" + p.target)
        elif p.type == "Directory" and not os.path.exists(p.target):
            os.makedirs(p.target, 0o0755)
        elif p.type == "CreateFile":
            with open(p.target, "wb") as n:
                n.write(p.resolved.encode("utf-8"))

    def _check_adjust(f):
        f["location"] = schema_salad.ref_resolver.file_uri(
            pm.mapper(f["location"])[1])
        if "contents" in f:
            del f["contents"]
        return f

    visit_class(cwljob, ("File", "Directory"), _check_adjust)
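The destination URL for a bucket export is built by stripping the local staging prefix from the mapped target and joining the remainder onto destBucket with single slashes. A standalone sketch of that join, with example values that are illustrative only:

def bucket_url(dest_bucket, outdir, target):
    # Drop the local staging prefix, then join with exactly one '/'.
    unstaged = target[len(outdir):]
    return '/'.join(s.strip('/') for s in [dest_bucket, unstaged])

print(bucket_url("s3://results/run1/", "/tmp/stage",
                 "/tmp/stage/sub/out.txt"))  # s3://results/run1/sub/out.txt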
def visit_default(obj):
    remove = [False]

    def ensure_default_location(f):
        if "location" not in f and "path" in f:
            f["location"] = f["path"]
            del f["path"]
        if "location" in f and not arvrunner.fs_access.exists(f["location"]):
            # Doesn't exist, remove from list of dependencies to upload
            sc[:] = [x for x in sc if x["location"] != f["location"]]
            # Delete "default" from workflowobj
            remove[0] = True

    visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
    if remove[0]:
        del obj["default"]
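The remove = [False] single-element list is a mutable cell that lets the nested callback signal back to the enclosing function. A self-contained demonstration of the same pruning idea against a stubbed exists() check; the stub is hypothetical, where the real code asks arvrunner.fs_access:

def prune_missing_default(step_input, exists):
    # `remove` is a one-element list so the nested callback can mutate it.
    remove = [False]

    def check(fileobj):
        if "location" in fileobj and not exists(fileobj["location"]):
            remove[0] = True

    default = step_input["default"]
    files = default if isinstance(default, list) else [default]
    for fileobj in files:
        if fileobj.get("class") in ("File", "Directory"):
            check(fileobj)
    if remove[0]:
        del step_input["default"]

step = {"id": "reads", "default": {"class": "File", "location": "keep:missing/x.fq"}}
prune_missing_default(step, exists=lambda loc: False)
print(step)  # {'id': 'reads'}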
def capture_default(obj):
    remove = [False]

    def add_default(f):
        if "location" not in f and "path" in f:
            f["location"] = f["path"]
            del f["path"]
        if "location" in f and not arvrunner.fs_access.exists(f["location"]):
            # Remove from sc
            sc[:] = [x for x in sc if x["location"] != f["location"]]
            # Delete "default" from workflowobj
            remove[0] = True

    visit_class(obj["default"], ("File", "Directory"), add_default)
    if remove[0]:
        del obj["default"]
def arvados_job_spec(self, debug=False):
    """Create an Arvados job specification for this workflow.

    The returned dict can be used to create a job (i.e., passed as
    the +body+ argument to jobs().create()), or as a component in
    a pipeline template or pipeline instance.
    """
    if self.embedded_tool.tool["id"].startswith("keep:"):
        self.job_order["cwl:tool"] = self.embedded_tool.tool["id"][5:]
    else:
        packed = packed_workflow(self.arvrunner, self.embedded_tool,
                                 self.merged_map)
        wf_pdh = upload_workflow_collection(self.arvrunner, self.name, packed)
        self.job_order["cwl:tool"] = "%s/workflow.cwl#main" % wf_pdh

    adjustDirObjs(self.job_order, trim_listing)
    visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location)
    visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields)

    if self.output_name:
        self.job_order["arv:output_name"] = self.output_name
    if self.output_tags:
        self.job_order["arv:output_tags"] = self.output_tags
    self.job_order["arv:enable_reuse"] = self.enable_reuse
    if self.on_error:
        self.job_order["arv:on_error"] = self.on_error
    if debug:
        self.job_order["arv:debug"] = True

    return {
        "script": "cwl-runner",
        "script_version": "master",
        "minimum_script_version": "570509ab4d2ef93d870fd2b1f2eab178afb1bad9",
        "repository": "arvados",
        "script_parameters": self.job_order,
        "runtime_constraints": {
            "docker_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
            "min_ram_mb_per_node": self.submit_runner_ram
        }
    }
def visit_default(obj):
    remove = [False]

    def ensure_default_location(fileobj):
        if "location" not in fileobj and "path" in fileobj:
            fileobj["location"] = fileobj["path"]
            del fileobj["path"]
        if "location" in fileobj \
                and not ftp_access.exists(fileobj["location"]):
            # Delete "default" from workflowobj
            remove[0] = True

    visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
    if remove[0]:
        del obj["default"]
def job(self, joborder, output_callback, **kwargs):
    kwargs["work_api"] = self.work_api
    req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer")
    if req:
        with SourceLine(self.tool, None, WorkflowException,
                        logger.isEnabledFor(logging.DEBUG)):
            if "id" not in self.tool:
                raise WorkflowException("%s object must have 'id'" % (self.tool["class"]))
        document_loader, workflowobj, uri = (self.doc_loader,
                                             self.doc_loader.fetch(self.tool["id"]),
                                             self.tool["id"])

        discover_secondary_files(self.tool["inputs"], joborder)

        with Perf(metrics, "subworkflow upload_deps"):
            upload_dependencies(self.arvrunner,
                                os.path.basename(joborder.get("id", "#")),
                                document_loader,
                                joborder,
                                joborder.get("id", "#"),
                                False)

            if self.wf_pdh is None:
                workflowobj["requirements"] = dedup_reqs(self.requirements)
                workflowobj["hints"] = dedup_reqs(self.hints)

                packed = pack(document_loader, workflowobj, uri, self.metadata)

                upload_dependencies(self.arvrunner,
                                    kwargs.get("name", ""),
                                    document_loader,
                                    packed,
                                    uri,
                                    False)

        with Perf(metrics, "subworkflow adjust"):
            joborder_resolved = copy.deepcopy(joborder)
            joborder_keepmount = copy.deepcopy(joborder)

            reffiles = []
            visit_class(joborder_keepmount, ("File", "Directory"),
                        lambda x: reffiles.append(x))

            mapper = ArvPathMapper(self.arvrunner, reffiles, kwargs["basedir"],
                                   "/keep/%s",
                                   "/keep/%s/%s",
                                   **kwargs)

            def keepmount(obj):
                remove_redundant_fields(obj)
                with SourceLine(obj, None, WorkflowException,
                                logger.isEnabledFor(logging.DEBUG)):
                    if "location" not in obj:
                        raise WorkflowException(
                            "%s object is missing required 'location' field: %s"
                            % (obj["class"], obj))
                with SourceLine(obj, "location", WorkflowException,
                                logger.isEnabledFor(logging.DEBUG)):
                    if obj["location"].startswith("keep:"):
                        obj["location"] = mapper.mapper(obj["location"]).target
                        if "listing" in obj:
                            del obj["listing"]
                    elif obj["location"].startswith("_:"):
                        del obj["location"]
                    else:
                        raise WorkflowException(
                            "Location is not a keep reference or a literal: '%s'"
                            % obj["location"])

            visit_class(joborder_keepmount, ("File", "Directory"), keepmount)

            def resolved(obj):
                if obj["location"].startswith("keep:"):
                    obj["location"] = mapper.mapper(obj["location"]).resolved

            visit_class(joborder_resolved, ("File", "Directory"), resolved)

            if self.wf_pdh is None:
                adjustFileObjs(packed, keepmount)
                adjustDirObjs(packed, keepmount)
                self.wf_pdh = upload_workflow_collection(self.arvrunner,
                                                         shortname(self.tool["id"]),
                                                         packed)

        wf_runner = cmap({
            "class": "CommandLineTool",
            "baseCommand": "cwltool",
            "inputs": self.tool["inputs"],
            "outputs": self.tool["outputs"],
            "stdout": "cwl.output.json",
            "requirements": self.requirements + [
                {
                    "class": "InitialWorkDirRequirement",
                    "listing": [{
                        "entryname": "workflow.cwl",
                        "entry": {
                            "class": "File",
                            "location": "keep:%s/workflow.cwl" % self.wf_pdh
                        }
                    }, {
                        "entryname": "cwl.input.yml",
                        "entry": json.dumps(joborder_keepmount,
                                            indent=2,
                                            sort_keys=True,
                                            separators=(',', ': ')
                                            ).replace("\\", "\\\\"
                                            ).replace('$(', '\$('
                                            ).replace('${', '\${')
                    }]
                }],
            "hints": self.hints,
            "arguments": ["--no-container",
                          "--move-outputs",
                          "--preserve-entire-environment",
                          "workflow.cwl#main",
                          "cwl.input.yml"],
            "id": "#"
        })
        kwargs["loader"] = self.doc_loader
        kwargs["avsc_names"] = self.doc_schema
        return ArvadosCommandTool(self.arvrunner, wf_runner, **kwargs).job(
            joborder_resolved, output_callback, **kwargs)
    else:
        return super(ArvadosWorkflow, self).job(joborder, output_callback, **kwargs)
def upload_dependencies_ftp(document_loader, workflowobj, uri, loadref_run,
                            remote_storage_url, ftp_access):
    """
    Upload the dependencies of the workflowobj document to an FTP location.

    Does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.

    Adapted from:
    https://github.com/curoverse/arvados/blob/2b0b06579199967eca3d44d955ad64195d2db3c3/sdk/cwl/arvados_cwl/runner.py#L83
    """
    loaded = set()

    def loadref(base, ref):
        joined = document_loader.fetcher.urljoin(base, ref)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                text_io = StringIO(text.decode('utf-8'))
            else:
                text_io = StringIO(text)
            return yaml.safe_load(text_io)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import", ))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    scandeps(uri, scanobj,
             loadref_fields,
             set(("$include", "$schemas", "location")),
             loadref, urljoin=document_loader.fetcher.urljoin)

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(fileobj):
            if "location" not in fileobj and "path" in fileobj:
                fileobj["location"] = fileobj["path"]
                del fileobj["path"]
            if "location" in fileobj \
                    and not ftp_access.exists(fileobj["location"]):
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"),
                    ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(
            obj["inputs"],
            {shortname(t["id"]): t["default"]
             for t in obj["inputs"] if "default" in t},
            discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"),
                discover_default_secondary_files)

    for entry in list(discovered.keys()):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if not entry.startswith("file:"):
            del discovered[entry]

    visit_class(workflowobj, ("Directory",),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
    visit_class(workflowobj, ("File",),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
    visit_class(discovered, ("Directory",),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
    visit_class(discovered, ("File",),
                functools.partial(ftp_upload, remote_storage_url, ftp_access))
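loadref dedupes documents by the fragment-less form of each reference, so a tool referenced as both tool.cwl#step1 and tool.cwl#step2 is fetched only once. A small standalone sketch of that defrag-and-dedupe step using only the standard library (the example URIs are placeholders):

import urllib.parse

refs = ["file:///wf/tool.cwl#step1", "file:///wf/tool.cwl#step2"]
seen = set()
for ref in refs:
    defrg, _ = urllib.parse.urldefrag(ref)
    if defrg not in seen:
        seen.add(defrg)
        print("fetch", defrg)  # printed once for file:///wf/tool.cwl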
def arvados_job_spec(self, runtimeContext):
    """Create an Arvados container request for this workflow.

    The returned dict can be used to create a container
    passed as the +body+ argument to container_requests().create().
    """
    adjustDirObjs(self.job_order, trim_listing)
    visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location)
    visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields)

    secret_mounts = {}
    for param in sorted(self.job_order.keys()):
        if self.secret_store.has_secret(self.job_order[param]):
            mnt = "/secrets/s%d" % len(secret_mounts)
            secret_mounts[mnt] = {
                "kind": "text",
                "content": self.secret_store.retrieve(self.job_order[param])
            }
            self.job_order[param] = {"$include": mnt}

    container_req = {
        "name": self.name,
        "output_path": "/var/spool/cwl",
        "cwd": "/var/spool/cwl",
        "priority": self.priority,
        "state": "Committed",
        "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image),
        "mounts": {
            "/var/lib/cwl/cwl.input.json": {
                "kind": "json",
                "content": self.job_order
            },
            "stdout": {
                "kind": "file",
                "path": "/var/spool/cwl/cwl.output.json"
            },
            "/var/spool/cwl": {
                "kind": "collection",
                "writable": True
            }
        },
        "secret_mounts": secret_mounts,
        "runtime_constraints": {
            "vcpus": math.ceil(self.submit_runner_cores),
            "ram": 1024 * 1024 * (math.ceil(self.submit_runner_ram) +
                                  math.ceil(self.collection_cache_size)),
            "API": True
        },
        "use_existing": False,  # Never reuse the runner container - see #15497.
        "properties": {}
    }

    if self.embedded_tool.tool.get("id", "").startswith("keep:"):
        sp = self.embedded_tool.tool["id"].split('/')
        workflowcollection = sp[0][5:]
        workflowname = "/".join(sp[1:])
        workflowpath = "/var/lib/cwl/workflow/%s" % workflowname
        container_req["mounts"]["/var/lib/cwl/workflow"] = {
            "kind": "collection",
            "portable_data_hash": "%s" % workflowcollection
        }
    else:
        packed = packed_workflow(self.arvrunner, self.embedded_tool,
                                 self.merged_map)
        workflowpath = "/var/lib/cwl/workflow.json#main"
        container_req["mounts"]["/var/lib/cwl/workflow.json"] = {
            "kind": "json",
            "content": packed
        }
        if self.embedded_tool.tool.get("id", "").startswith("arvwf:"):
            container_req["properties"]["template_uuid"] = \
                self.embedded_tool.tool["id"][6:33]

    # --local means execute the workflow instead of submitting a container request
    # --api=containers means use the containers API
    # --no-log-timestamps means don't add timestamps (the logging infrastructure does this)
    # --disable-validate because we already validated so don't need to do it again
    # --eval-timeout is the timeout for javascript invocation
    # --parallel-task-count is the number of threads to use for job submission
    # --enable/disable-reuse sets desired job reuse
    # --collection-cache-size sets aside memory to store collections
    command = [
        "arvados-cwl-runner",
        "--local",
        "--api=containers",
        "--no-log-timestamps",
        "--disable-validate",
        "--disable-color",
        "--eval-timeout=%s" % self.arvrunner.eval_timeout,
        "--thread-count=%s" % self.arvrunner.thread_count,
        "--enable-reuse" if self.enable_reuse else "--disable-reuse",
        "--collection-cache-size=%s" % self.collection_cache_size
    ]

    if self.output_name:
        command.append("--output-name=" + self.output_name)
        container_req["output_name"] = self.output_name

    if self.output_tags:
        command.append("--output-tags=" + self.output_tags)

    if runtimeContext.debug:
        command.append("--debug")

    if runtimeContext.storage_classes != "default":
        command.append("--storage-classes=" + runtimeContext.storage_classes)

    if self.on_error:
        command.append("--on-error=" + self.on_error)

    if self.intermediate_output_ttl:
        command.append("--intermediate-output-ttl=%d" % self.intermediate_output_ttl)

    if self.arvrunner.trash_intermediate:
        command.append("--trash-intermediate")

    if self.arvrunner.project_uuid:
        command.append("--project-uuid=" + self.arvrunner.project_uuid)

    if self.enable_dev:
        command.append("--enable-dev")

    command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"])

    container_req["command"] = command

    return container_req
def upload_dependencies(arvrunner, name, document_loader,
                        workflowobj, uri, loadref_run,
                        include_primary=True, discovered_secondaryfiles=None):
    """Upload the dependencies of the workflowobj document to Keep.

    Returns a pathmapper object mapping local paths to keep references.  Also
    does an in-place update of references in "workflowobj".

    Use scandeps to find $import, $include, $schemas, run, File and Directory
    fields that represent external references.

    If workflowobj has an "id" field, this will reload the document to ensure
    it is scanning the raw document prior to preprocessing.
    """
    loaded = set()

    def loadref(b, u):
        joined = document_loader.fetcher.urljoin(b, u)
        defrg, _ = urllib.parse.urldefrag(joined)
        if defrg not in loaded:
            loaded.add(defrg)
            # Use fetch_text to get raw file (before preprocessing).
            text = document_loader.fetch_text(defrg)
            if isinstance(text, bytes):
                textIO = StringIO(text.decode('utf-8'))
            else:
                textIO = StringIO(text)
            return yaml.safe_load(textIO)
        else:
            return {}

    if loadref_run:
        loadref_fields = set(("$import", "run"))
    else:
        loadref_fields = set(("$import",))

    scanobj = workflowobj
    if "id" in workflowobj:
        # Need raw file content (before preprocessing) to ensure
        # that external references in $include and $mixin are captured.
        scanobj = loadref("", workflowobj["id"])

    sc_result = scandeps(uri, scanobj,
                         loadref_fields,
                         set(("$include", "$schemas", "location")),
                         loadref, urljoin=document_loader.fetcher.urljoin)

    sc = []
    uuids = {}

    def collect_uuids(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if sp[0] == "keep":
            # Collect collection uuids that need to be resolved to
            # portable data hashes
            gp = collection_uuid_pattern.match(loc)
            if gp:
                uuids[gp.groups()[0]] = obj
            if collectionUUID in obj:
                uuids[obj[collectionUUID]] = obj

    def collect_uploads(obj):
        loc = obj.get("location", "")
        sp = loc.split(":")
        if len(sp) < 1:
            return
        if sp[0] in ("file", "http", "https"):
            # Record local files that need to be uploaded,
            # don't include file literals, keep references, etc.
            sc.append(obj)
        collect_uuids(obj)

    visit_class(workflowobj, ("File", "Directory"), collect_uuids)
    visit_class(sc_result, ("File", "Directory"), collect_uploads)

    # Resolve any collection uuids we found to portable data hashes
    # and assign them to uuid_map
    uuid_map = {}
    fetch_uuids = list(uuids.keys())
    while fetch_uuids:
        # For a large number of fetch_uuids, API server may limit
        # response size, so keep fetching until the API server has
        # nothing more to give us.
        lookups = arvrunner.api.collections().list(
            filters=[["uuid", "in", fetch_uuids]],
            count="none",
            select=["uuid", "portable_data_hash"]).execute(
                num_retries=arvrunner.num_retries)

        if not lookups["items"]:
            break

        for l in lookups["items"]:
            uuid_map[l["uuid"]] = l["portable_data_hash"]

        fetch_uuids = [u for u in fetch_uuids if u not in uuid_map]

    normalizeFilesDirs(sc)

    if include_primary and "id" in workflowobj:
        sc.append({"class": "File", "location": workflowobj["id"]})

    if "$schemas" in workflowobj:
        for s in workflowobj["$schemas"]:
            sc.append({"class": "File", "location": s})

    def visit_default(obj):
        remove = [False]

        def ensure_default_location(f):
            if "location" not in f and "path" in f:
                f["location"] = f["path"]
                del f["path"]
            if "location" in f and not arvrunner.fs_access.exists(f["location"]):
                # Doesn't exist, remove from list of dependencies to upload
                sc[:] = [x for x in sc if x["location"] != f["location"]]
                # Delete "default" from workflowobj
                remove[0] = True

        visit_class(obj["default"], ("File", "Directory"), ensure_default_location)
        if remove[0]:
            del obj["default"]

    find_defaults(workflowobj, visit_default)

    discovered = {}

    def discover_default_secondary_files(obj):
        discover_secondary_files(
            obj["inputs"],
            {shortname(t["id"]): t["default"]
             for t in obj["inputs"] if "default" in t},
            discovered)

    visit_class(workflowobj, ("CommandLineTool", "Workflow"),
                discover_default_secondary_files)

    for d in list(discovered):
        # Only interested in discovered secondaryFiles which are local
        # files that need to be uploaded.
        if d.startswith("file:"):
            sc.extend(discovered[d])
        else:
            del discovered[d]

    mapper = ArvPathMapper(arvrunner, sc, "",
                           "keep:%s",
                           "keep:%s/%s",
                           name=name,
                           single_collection=True)

    def setloc(p):
        loc = p.get("location")
        if loc and (not loc.startswith("_:")) and (not loc.startswith("keep:")):
            p["location"] = mapper.mapper(p["location"]).resolved
            return
        if not loc:
            return
        if collectionUUID in p:
            uuid = p[collectionUUID]
            if uuid not in uuid_map:
                raise SourceLine(p, collectionUUID,
                                 validate.ValidationException).makeError(
                                     "Collection uuid %s not found" % uuid)
            gp = collection_pdh_pattern.match(loc)
            if gp and uuid_map[uuid] != gp.groups()[0]:
                # This file entry has both collectionUUID and a PDH
                # location. If the PDH doesn't match the one returned
                # the API server, raise an error.
                raise SourceLine(p, "location",
                                 validate.ValidationException).makeError(
                                     "Expected collection uuid %s to be %s but API server reported %s"
                                     % (uuid, gp.groups()[0], uuid_map[p[collectionUUID]]))

        gp = collection_uuid_pattern.match(loc)
        if not gp:
            return
        uuid = gp.groups()[0]
        if uuid not in uuid_map:
            raise SourceLine(p, "location",
                             validate.ValidationException).makeError(
                                 "Collection uuid %s not found" % uuid)
        p["location"] = "keep:%s%s" % (uuid_map[uuid],
                                       gp.groups()[1] if gp.groups()[1] else "")
        p[collectionUUID] = uuid

    visit_class(workflowobj, ("File", "Directory"), setloc)
    visit_class(discovered, ("File", "Directory"), setloc)

    if discovered_secondaryfiles is not None:
        for d in discovered:
            discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d]

    if "$schemas" in workflowobj:
        sch = []
        for s in workflowobj["$schemas"]:
            sch.append(mapper.mapper(s).resolved)
        workflowobj["$schemas"] = sch

    return mapper
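The uuid-to-PDH resolution loop above keeps requesting until the API returns nothing new, which tolerates size-limited responses. The same control flow with a fake lookup standing in for arvrunner.api (the store contents and helper names are placeholders):

def resolve_in_batches(uuids, lookup):
    # `lookup` returns a {uuid: pdh} dict for some subset of its argument.
    uuid_map = {}
    fetch = list(uuids)
    while fetch:
        found = lookup(fetch)
        if not found:
            break
        uuid_map.update(found)
        fetch = [u for u in fetch if u not in uuid_map]
    return uuid_map

fake_store = {"zzzzz-4zz18-aaaaaaaaaaaaaaa": "9f26a86b94e...+250"}
print(resolve_in_batches(fake_store.keys(),
                         lambda batch: {u: fake_store[u]
                                        for u in batch if u in fake_store}))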
def cwlmain(
        self,
        argsl=None,  # type: List[str]
        args=None,  # type: argparse.Namespace
        job_order_object=None,  # type: MutableMapping[Text, Any]
        stdin=sys.stdin,  # type: IO[Any]
        stdout=None,  # type: Union[TextIO, codecs.StreamWriter]
        stderr=sys.stderr,  # type: IO[Any]
        versionfunc=versionstring,  # type: Callable[[], Text]
        logger_handler=None,
        custom_schema_callback=None,  # type: Callable[[], None]
        executor=None,  # type: Callable[..., Tuple[Dict[Text, Any], Text]]
        loadingContext=None,  # type: LoadingContext
        runtimeContext=None  # type: RuntimeContext
):  # type: (...) -> int
    if not stdout:
        stdout = codecs.getwriter('utf-8')(sys.stdout)
    _logger.removeHandler(defaultStreamHandler)
    if logger_handler:
        stderr_handler = logger_handler
    else:
        stderr_handler = logging.StreamHandler(stderr)
    _logger.addHandler(stderr_handler)
    try:
        if args is None:
            args = arg_parser().parse_args(argsl)
        if args.workflow and "--outdir" not in argsl:
            outputPath = args.workflow.split('/')[-1].split('.')[0]
            setattr(args, "outdir",
                    os.getcwd() + "/" + outputPath + "/" +
                    datetime.datetime.now().strftime('%Y-%m-%d-%H%M'))
        if runtimeContext is None:
            runtimeContext = RuntimeContext(vars(args))
        else:
            runtimeContext = runtimeContext.copy()

        rdflib_logger = logging.getLogger("rdflib.term")
        rdflib_logger.addHandler(stderr_handler)
        rdflib_logger.setLevel(logging.ERROR)
        if args.quiet:
            _logger.setLevel(logging.WARN)
        if runtimeContext.debug:
            _logger.setLevel(logging.DEBUG)
            rdflib_logger.setLevel(logging.DEBUG)
        if args.timestamps:
            formatter = logging.Formatter("[%(asctime)s] %(message)s",
                                          "%Y-%m-%d %H:%M:%S")
            stderr_handler.setFormatter(formatter)
        # version
        if args.version:
            return versionfunc(), 0
        else:
            _logger.info(versionfunc())

        if args.print_supported_versions:
            return "\n".join(supportedCWLversions(args.enable_dev)), 0

        if not args.workflow:
            if os.path.isfile("CWLFile"):
                setattr(args, "workflow", "CWLFile")
            else:
                _logger.error("")
                _logger.error("CWL document required, no input file was provided")
                arg_parser().print_help()
                return "CWL document required, no input file was provided", 1
        if args.relax_path_checks:
            command_line_tool.ACCEPTLIST_RE = command_line_tool.ACCEPTLIST_EN_RELAXED_RE

        if args.ga4gh_tool_registries:
            ga4gh_tool_registries[:] = args.ga4gh_tool_registries
        if not args.enable_ga4gh_tool_registry:
            del ga4gh_tool_registries[:]

        if custom_schema_callback:
            custom_schema_callback()
        elif args.enable_ext:
            res = pkg_resources.resource_stream(__name__, 'extensions.yml')
            use_custom_schema("v1.0", "http://commonwl.org/cwltool", res.read())
            res.close()
        else:
            use_standard_schema("v1.0")

        if loadingContext is None:
            loadingContext = LoadingContext(vars(args))
        else:
            loadingContext = loadingContext.copy()
        loadingContext.disable_js_validation = \
            args.disable_js_validation or (not args.do_validate)
        loadingContext.construct_tool_object = getdefault(
            loadingContext.construct_tool_object, workflow.default_make_tool)
        loadingContext.resolver = getdefault(loadingContext.resolver, tool_resolver)

        try:
            uri, tool_file_uri = resolve_tool_uri(
                args.workflow,
                resolver=loadingContext.resolver,
                fetcher_constructor=loadingContext.fetcher_constructor)
        except:
            return "Can't find file " + args.workflow, 0

        try_again_msg = "" if args.debug else \
            ", try again with --debug for more information"

        try:
            job_order_object, input_basedir, jobloader = load_job_order(
                args, stdin, loadingContext.fetcher_constructor,
                loadingContext.overrides_list, tool_file_uri)

            if args.overrides:
                loadingContext.overrides_list.extend(
                    load_overrides(
                        file_uri(os.path.abspath(args.overrides)),
                        tool_file_uri))

            document_loader, workflowobj, uri = fetch_document(
                uri, resolver=loadingContext.resolver,
                fetcher_constructor=loadingContext.fetcher_constructor)

            if args.print_deps:
                # printdeps(workflowobj, document_loader, stdout, args.relative_deps, uri)
                result = returndeps(workflowobj, document_loader, stdout,
                                    args.relative_deps, uri)
                return result, 0

            document_loader, avsc_names, processobj, metadata, uri = \
                validate_document(document_loader, workflowobj, uri,
                                  enable_dev=loadingContext.enable_dev,
                                  strict=loadingContext.strict,
                                  preprocess_only=(args.print_pre or args.pack),
                                  fetcher_constructor=loadingContext.fetcher_constructor,
                                  skip_schemas=args.skip_schemas,
                                  overrides=loadingContext.overrides_list,
                                  do_validate=loadingContext.do_validate)

            if args.print_pre:
                # stdout.write(json_dumps(processobj, indent=4))
                return json_dumps(processobj, indent=4), 0

            loadingContext.overrides_list.extend(
                metadata.get("cwltool:overrides", []))

            tool = make_tool(document_loader, avsc_names, metadata, uri,
                             loadingContext)
            if args.make_template:
                yaml.safe_dump(generate_input_template(tool), sys.stdout,
                               default_flow_style=False, indent=4,
                               block_seq_indent=2)
                return yaml.safe_dump(generate_input_template(tool), indent=4), 0

            if args.validate:
                _logger.info("Tool definition is valid")
                return "Tool definition is valid", 0

            if args.pack:
                stdout.write(print_pack(document_loader, processobj, uri, metadata))
                return print_pack(document_loader, processobj, uri, metadata), 0

            if args.print_rdf:
                stdout.write(printrdf(tool, document_loader.ctx, args.rdf_serializer))
                return printrdf(tool, document_loader.ctx, args.rdf_serializer), 0

            if args.print_dot:
                printdot(tool, document_loader.ctx, stdout)
                return "args.print_dot still not solved", 0

        except (validate.ValidationException) as exc:
            _logger.error(u"Tool definition failed validation:\n%s", exc,
                          exc_info=args.debug)
            infor = "Tool definition failed validation:\n%s" + exc + args.debug
            return infor, 1
        except (RuntimeError, WorkflowException) as exc:
            _logger.error(u"Tool definition failed initialization:\n%s", exc,
                          exc_info=args.debug)
            infor = "Tool definition failed initialization:\n%s" + exc + args.debug
            return infor, 1
        except Exception as exc:
            _logger.error(
                u"I'm sorry, I couldn't load this CWL file%s.\nThe error was: %s",
                try_again_msg,
                exc if not args.debug else "",
                exc_info=args.debug)
            return "I'm sorry, I couldn't load this CWL file", 1

        if isinstance(tool, int):
            return tool, 0

        # If on MacOS platform, TMPDIR must be set to be under one of the
        # shared volumes in Docker for Mac
        # More info: https://dockstore.org/docs/faq
        if sys.platform == "darwin":
            default_mac_path = "/private/tmp/docker_tmp"
            if runtimeContext.tmp_outdir_prefix == DEFAULT_TMP_PREFIX:
                runtimeContext.tmp_outdir_prefix = default_mac_path

        for dirprefix in ("tmpdir_prefix", "tmp_outdir_prefix", "cachedir"):
            if getattr(runtimeContext, dirprefix) and \
                    getattr(runtimeContext, dirprefix) != DEFAULT_TMP_PREFIX:
                sl = "/" if getattr(runtimeContext, dirprefix).endswith("/") \
                    or dirprefix == "cachedir" else ""
                setattr(runtimeContext, dirprefix,
                        os.path.abspath(getattr(runtimeContext, dirprefix)) + sl)
                if not os.path.exists(
                        os.path.dirname(getattr(runtimeContext, dirprefix))):
                    try:
                        os.makedirs(
                            os.path.dirname(getattr(runtimeContext, dirprefix)))
                    except Exception as e:
                        _logger.error("Failed to create directory: %s", e)
                        infor = "Failed to create directory: %s" + e + ""
                        return infor, 1

        if args.cachedir:
            if args.move_outputs == "move":
                runtimeContext.move_outputs = "copy"
            runtimeContext.tmp_outdir_prefix = args.cachedir

        runtimeContext.secret_store = getdefault(runtimeContext.secret_store,
                                                 SecretStore())
        try:
            initialized_job_order_object = init_job_order(
                job_order_object, args, tool, jobloader, stdout,
                print_input_deps=args.print_input_deps,
                relative_deps=args.relative_deps,
                input_basedir=input_basedir,
                secret_store=runtimeContext.secret_store)
        except SystemExit as err:
            return err.code

        if not executor:
            if args.parallel:
                executor = MultithreadedJobExecutor()
            else:
                executor = SingleJobExecutor()
        assert executor is not None

        if isinstance(initialized_job_order_object, int):
            return initialized_job_order_object

        try:
            runtimeContext.basedir = input_basedir
            del args.workflow
            del args.job_order

            conf_file = getattr(args,
                                "beta_dependency_resolvers_configuration",
                                None)  # Text
            use_conda_dependencies = getattr(args,
                                             "beta_conda_dependencies",
                                             None)  # Text

            job_script_provider = None  # type: Optional[DependenciesConfiguration]
            if conf_file or use_conda_dependencies:
                runtimeContext.job_script_provider = DependenciesConfiguration(args)

            runtimeContext.find_default_container = \
                functools.partial(find_default_container, args)
            runtimeContext.make_fs_access = getdefault(
                runtimeContext.make_fs_access, StdFsAccess)

            (out, status) = executor(tool,
                                     initialized_job_order_object,
                                     runtimeContext,
                                     logger=_logger)

            # This is the workflow output, it needs to be written
            if out is not None:

                def loc_to_path(obj):
                    for field in ("path", "nameext", "nameroot", "dirname"):
                        if field in obj:
                            del obj[field]
                    if obj["location"].startswith("file://"):
                        obj["path"] = uri_file_path(obj["location"])

                visit_class(out, ("File", "Directory"), loc_to_path)

                # Unsetting the Generation from final output object
                visit_class(out, ("File",), MutationManager().unset_generation)

                if isinstance(out, string_types):
                    stdout.write(out)
                else:
                    stdout.write(json_dumps(out, indent=4,  # type: ignore
                                            ensure_ascii=False))
                stdout.write("\n")
                if hasattr(stdout, "flush"):
                    stdout.flush()  # type: ignore

            if status != "success":
                _logger.warning(u"Final process status is %s", status)
                infor = "Final process status is %s" + status + ""
                return infor, 1
            _logger.info(u"Final process status is %s", status)
            return out, status

        except (validate.ValidationException) as exc:
            _logger.error(u"Input object failed validation:\n%s", exc,
                          exc_info=args.debug)
            infor = "Input object failed validation:\n%s" + exc + args.debug
            return infor, 1
        except UnsupportedRequirement as exc:
            _logger.error(u"Workflow or tool uses unsupported feature:\n%s",
                          exc, exc_info=args.debug)
            infor = "Workflow or tool uses unsupported feature:\n%s" + exc + args.debug
            return infor, 3
        except WorkflowException as exc:
            _logger.error(u"Workflow error%s:\n%s", try_again_msg,
                          strip_dup_lineno(six.text_type(exc)),
                          exc_info=args.debug)
            infor = "Workflow error%s:\n%s" + try_again_msg + \
                strip_dup_lineno(six.text_type(exc)) + args.debug
            return infor, 1
        except Exception as exc:
            _logger.error(u"Unhandled error%s:\n %s", try_again_msg, exc,
                          exc_info=args.debug)
            infor = "Unhandled error%s:\n %s" + try_again_msg + exc + args.debug
            return infor, 1

    finally:
        _logger.removeHandler(stderr_handler)
        _logger.addHandler(defaultStreamHandler)
def main(args=None, stdout=sys.stdout):
    parser = argparse.ArgumentParser()
    Job.Runner.addToilOptions(parser)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--no-container", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const",
                        const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    parser.add_argument("--preserve-environment", type=str, nargs='+',
                        help="Preserve specified environment variables when running CommandLineTools",
                        metavar=("VAR1 VAR2"),
                        default=("PATH",),
                        dest="preserve_environment")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    options = parser.parse_args([workdir] + args)

    use_container = not options.no_container

    setLoggingFromOptions(options)
    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    fileindex = {}
    existing = {}

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            useStrict = not options.not_strict
            try:
                t = cwltool.load_tool.load_tool(
                    options.cwltool, toilMakeTool,
                    kwargs={
                        "hints": [{
                            "class": "ResourceRequirement",
                            "coresMin": toil.config.defaultCores,
                            "ramMin": toil.config.defaultMemory / (2**20),
                            "outdirMin": toil.config.defaultDisk / (2**20),
                            "tmpdirMin": 0
                        }]},
                    resolver=cwltool.resolver.tool_resolver,
                    strict=useStrict)
                unsupportedRequirementsCheck(t.requirements)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            if type(t) == int:
                return t

            options.workflow = options.cwltool
            options.job_order = options.cwljob
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job = cwltool.main.load_job_order(options, t, sys.stdin)

            if type(job) == int:
                return job

            job, options.basedir = job

            fillInDefaults(t.tool["inputs"], job)

            def pathToLoc(p):
                if "location" not in p and "path" in p:
                    p["location"] = p["path"]
                    del p["path"]

            def importFiles(tool):
                visit_class(tool, ("File", "Directory"), pathToLoc)
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(
                    get_listing, cwltool.stdfsaccess.StdFsAccess(""),
                    recursive=True))
                adjustFileObjs(tool, functools.partial(
                    uploadFile, toil.importFile, fileindex, existing,
                    skip_broken=True))

            t.visit(importFiles)

            for inp in t.tool["inputs"]:
                def setSecondary(fileobj):
                    if isinstance(fileobj, dict) and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [
                                {"location": cwltool.builder.substitute(
                                    fileobj["location"], sf),
                                 "class": "File"}
                                for sf in inp["secondaryFiles"]]
                    if isinstance(fileobj, list):
                        for e in fileobj:
                            setSecondary(e)

                if shortname(inp["id"]) in job and inp.get("secondaryFiles"):
                    setSecondary(job[shortname(inp["id"])])

            importFiles(job)
            visitSteps(t, importFiles)

            make_fs_access = functools.partial(ToilFsAccess, fileStore=toil)

            try:
                (wf1, wf2) = makeJob(t, {}, use_container=use_container,
                                     preserve_environment=options.preserve_environment,
                                     tmpdir=os.path.realpath(outdir),
                                     workdir=options.workDir)
            except cwltool.process.UnsupportedRequirement as e:
                logging.error(e)
                return 33

            wf1.cwljob = job
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        toilStageFiles(toil, outobj, outdir, fileindex, existing, True)
        visit_class(outobj, ("File",), functools.partial(
            compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        stdout.write(json.dumps(outobj, indent=4))

    return 0
def main(args=None, stdout=sys.stdout):
    """Main method for toil-cwl-runner."""
    cwllogger.removeHandler(defaultStreamHandler)
    config = Config()
    config.cwl = True
    parser = argparse.ArgumentParser()
    addOptions(parser, config)
    parser.add_argument("cwltool", type=str)
    parser.add_argument("cwljob", nargs=argparse.REMAINDER)

    # Will override the "jobStore" positional argument, enables
    # user to select jobStore or get a default from logic one below.
    parser.add_argument("--jobStore", type=str)
    parser.add_argument("--not-strict", action="store_true")
    parser.add_argument("--quiet", dest="logLevel", action="store_const",
                        const="ERROR")
    parser.add_argument("--basedir", type=str)
    parser.add_argument("--outdir", type=str, default=os.getcwd())
    parser.add_argument("--version", action='version', version=baseVersion)
    dockergroup = parser.add_mutually_exclusive_group()
    dockergroup.add_argument(
        "--user-space-docker-cmd",
        help="(Linux/OS X only) Specify a user space docker command (like "
        "udocker or dx-docker) that will be used to call 'pull' and 'run'")
    dockergroup.add_argument(
        "--singularity", action="store_true", default=False,
        help="[experimental] Use Singularity runtime for running containers. "
        "Requires Singularity v2.3.2+ and Linux with kernel version v3.18+ or "
        "with overlayfs support backported.")
    dockergroup.add_argument(
        "--no-container", action="store_true",
        help="Do not execute jobs in a "
        "Docker container, even when `DockerRequirement` "
        "is specified under `hints`.")
    parser.add_argument(
        "--preserve-environment", type=str, nargs='+',
        help="Preserve specified environment variables when running"
        " CommandLineTools",
        metavar=("VAR1 VAR2"),
        default=("PATH",),
        dest="preserve_environment")
    parser.add_argument(
        "--destBucket", type=str,
        help="Specify a cloud bucket endpoint for output files.")
    parser.add_argument(
        "--beta-dependency-resolvers-configuration", default=None)
    parser.add_argument("--beta-dependencies-directory", default=None)
    parser.add_argument(
        "--beta-use-biocontainers", default=None, action="store_true")
    parser.add_argument(
        "--beta-conda-dependencies", default=None, action="store_true")
    parser.add_argument("--tmpdir-prefix", type=Text,
                        help="Path prefix for temporary directories",
                        default="tmp")
    parser.add_argument("--tmp-outdir-prefix", type=Text,
                        help="Path prefix for intermediate output directories",
                        default="tmp")
    parser.add_argument(
        "--force-docker-pull", action="store_true", default=False,
        dest="force_docker_pull",
        help="Pull latest docker image even if it is locally present")
    parser.add_argument(
        "--no-match-user", action="store_true", default=False,
        help="Disable passing the current uid to `docker run --user`")

    # mkdtemp actually creates the directory, but
    # toil requires that the directory not exist,
    # so make it and delete it and allow
    # toil to create it again (!)
    workdir = tempfile.mkdtemp()
    os.rmdir(workdir)

    if args is None:
        args = sys.argv[1:]

    # we use workdir as jobStore:
    options = parser.parse_args([workdir] + args)

    # if tmpdir_prefix is not the default value, set workDir too
    if options.tmpdir_prefix != 'tmp':
        options.workDir = options.tmpdir_prefix

    if options.provisioner and not options.jobStore:
        raise NoSuchJobStoreException(
            'Please specify a jobstore with the --jobStore option when specifying a provisioner.')

    use_container = not options.no_container

    if options.logLevel:
        cwllogger.setLevel(options.logLevel)

    outdir = os.path.abspath(options.outdir)
    tmp_outdir_prefix = os.path.abspath(options.tmp_outdir_prefix)
    tmpdir_prefix = os.path.abspath(options.tmpdir_prefix)
    fileindex = {}
    existing = {}
    conf_file = getattr(options,
                        "beta_dependency_resolvers_configuration", None)
    use_conda_dependencies = getattr(options, "beta_conda_dependencies", None)
    job_script_provider = None
    if conf_file or use_conda_dependencies:
        dependencies_configuration = DependenciesConfiguration(options)
        job_script_provider = dependencies_configuration

    options.default_container = None
    runtime_context = cwltool.context.RuntimeContext(vars(options))
    runtime_context.find_default_container = functools.partial(
        find_default_container, options)
    runtime_context.workdir = workdir
    runtime_context.move_outputs = "leave"
    runtime_context.rm_tmpdir = False
    loading_context = cwltool.context.LoadingContext(vars(options))

    with Toil(options) as toil:
        if options.restart:
            outobj = toil.restart()
        else:
            loading_context.hints = [{
                "class": "ResourceRequirement",
                "coresMin": toil.config.defaultCores,
                "ramMin": toil.config.defaultMemory / (2**20),
                "outdirMin": toil.config.defaultDisk / (2**20),
                "tmpdirMin": 0
            }]
            loading_context.construct_tool_object = toil_make_tool
            loading_context.resolver = cwltool.resolver.tool_resolver
            loading_context.strict = not options.not_strict
            options.workflow = options.cwltool
            options.job_order = options.cwljob
            uri, tool_file_uri = cwltool.load_tool.resolve_tool_uri(
                options.cwltool, loading_context.resolver,
                loading_context.fetcher_constructor)
            options.tool_help = None
            options.debug = options.logLevel == "DEBUG"
            job_order_object, options.basedir, jobloader = \
                cwltool.main.load_job_order(
                    options, sys.stdin, loading_context.fetcher_constructor,
                    loading_context.overrides_list, tool_file_uri)
            document_loader, workflowobj, uri = \
                cwltool.load_tool.fetch_document(
                    uri, loading_context.resolver,
                    loading_context.fetcher_constructor)
            document_loader, avsc_names, processobj, metadata, uri = \
                cwltool.load_tool.validate_document(
                    document_loader, workflowobj, uri,
                    loading_context.enable_dev, loading_context.strict,
                    False, loading_context.fetcher_constructor,
                    False, loading_context.overrides_list,
                    do_validate=loading_context.do_validate)
            loading_context.overrides_list.extend(
                metadata.get("cwltool:overrides", []))
            try:
                tool = cwltool.load_tool.make_tool(
                    document_loader, avsc_names, metadata, uri,
                    loading_context)
            except cwltool.process.UnsupportedRequirement as err:
                logging.error(err)
                return 33
            runtime_context.secret_store = SecretStore()
            initialized_job_order = cwltool.main.init_job_order(
                job_order_object, options, tool, jobloader, sys.stdout,
                secret_store=runtime_context.secret_store)
            fs_access = cwltool.stdfsaccess.StdFsAccess(options.basedir)
            fill_in_defaults(tool.tool["inputs"], initialized_job_order,
                             fs_access)

            def path_to_loc(obj):
                if "location" not in obj and "path" in obj:
                    obj["location"] = obj["path"]
                    del obj["path"]

            def import_files(tool):
                visit_class(tool, ("File", "Directory"), path_to_loc)
                visit_class(tool, ("File",), functools.partial(
                    add_sizes, fs_access))
                normalizeFilesDirs(tool)
                adjustDirObjs(tool, functools.partial(
                    get_listing, fs_access, recursive=True))
                adjustFileObjs(tool, functools.partial(
                    uploadFile, toil.importFile, fileindex, existing,
                    skip_broken=True))

            tool.visit(import_files)

            for inp in tool.tool["inputs"]:
                def set_secondary(fileobj):
                    if isinstance(fileobj, Mapping) \
                            and fileobj.get("class") == "File":
                        if "secondaryFiles" not in fileobj:
                            fileobj["secondaryFiles"] = [
                                {"location": cwltool.builder.substitute(
                                    fileobj["location"], sf),
                                 "class": "File"}
                                for sf in inp["secondaryFiles"]]
                    if isinstance(fileobj, MutableSequence):
                        for entry in fileobj:
                            set_secondary(entry)

                if shortname(inp["id"]) in initialized_job_order \
                        and inp.get("secondaryFiles"):
                    set_secondary(initialized_job_order[shortname(inp["id"])])

            import_files(initialized_job_order)
            visitSteps(tool, import_files)

            try:
                runtime_context.use_container = use_container
                runtime_context.tmpdir = os.path.realpath(tmpdir_prefix)
                runtime_context.tmp_outdir_prefix = os.path.realpath(
                    tmp_outdir_prefix)
                runtime_context.job_script_provider = job_script_provider
                runtime_context.force_docker_pull = options.force_docker_pull
                runtime_context.no_match_user = options.no_match_user
                (wf1, _) = makeJob(tool, {}, None, runtime_context)
            except cwltool.process.UnsupportedRequirement as err:
                logging.error(err)
                return 33

            wf1.cwljob = initialized_job_order
            if wf1 is CWLJob:
                # Clean up temporary directories only created with CWLJobs.
                wf1.addFollowOnFn(cleanTempDirs, wf1)
            outobj = toil.start(wf1)

        outobj = resolve_indirect(outobj)

        # Stage files. Specify destination bucket if specified in CLI
        # options. If destination bucket not passed in,
        # options.destBucket's value will be None.
        toilStageFiles(
            toil,
            outobj,
            outdir,
            fileindex,
            existing,
            export=True,
            destBucket=options.destBucket)

        if not options.destBucket:
            visit_class(outobj, ("File",), functools.partial(
                compute_checksums, cwltool.stdfsaccess.StdFsAccess("")))

        visit_class(outobj, ("File",), MutationManager().unset_generation)
        stdout.write(json.dumps(outobj, indent=4))

    return 0
def arv_executor(self, tool, job_order, runtimeContext, logger=None):
    self.debug = runtimeContext.debug

    tool.visit(self.check_features)

    self.project_uuid = runtimeContext.project_uuid
    self.pipeline = None
    self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir)
    self.secret_store = runtimeContext.secret_store

    self.trash_intermediate = runtimeContext.trash_intermediate
    if self.trash_intermediate and self.work_api != "containers":
        raise Exception("--trash-intermediate is only supported with --api=containers.")

    self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl
    if self.intermediate_output_ttl and self.work_api != "containers":
        raise Exception("--intermediate-output-ttl is only supported with --api=containers.")
    if self.intermediate_output_ttl < 0:
        raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl)

    if runtimeContext.submit_request_uuid and self.work_api != "containers":
        raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api))

    if not runtimeContext.name:
        runtimeContext.name = self.name = tool.tool.get("label") or tool.metadata.get("label") or os.path.basename(tool.tool["id"])

    # Upload direct dependencies of workflow steps, get back mapping of files to keep references.
    # Also uploads docker images.
    merged_map = upload_workflow_deps(self, tool)

    # Reload tool object which may have been updated by
    # upload_workflow_deps
    # Don't validate this time because it will just print redundant errors.
    loadingContext = self.loadingContext.copy()
    loadingContext.loader = tool.doc_loader
    loadingContext.avsc_names = tool.doc_schema
    loadingContext.metadata = tool.metadata
    loadingContext.do_validate = False
    tool = self.arv_make_tool(tool.doc_loader.idx[tool.tool["id"]],
                              loadingContext)

    # Upload local file references in the job order.
    job_order = upload_job_order(self, "%s input" % runtimeContext.name,
                                 tool, job_order)

    existing_uuid = runtimeContext.update_workflow
    if existing_uuid or runtimeContext.create_workflow:
        # Create a pipeline template or workflow record and exit.
        if self.work_api == "jobs":
            tmpl = RunnerTemplate(self, tool, job_order,
                                  runtimeContext.enable_reuse,
                                  uuid=existing_uuid,
                                  submit_runner_ram=runtimeContext.submit_runner_ram,
                                  name=runtimeContext.name,
                                  merged_map=merged_map,
                                  loadingContext=loadingContext)
            tmpl.save()
            # cwltool.main will write our return value to stdout.
            return (tmpl.uuid, "success")
        elif self.work_api == "containers":
            return (upload_workflow(self, tool, job_order,
                                    self.project_uuid,
                                    uuid=existing_uuid,
                                    submit_runner_ram=runtimeContext.submit_runner_ram,
                                    name=runtimeContext.name,
                                    merged_map=merged_map),
                    "success")

    self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse
    self.eval_timeout = runtimeContext.eval_timeout

    runtimeContext = runtimeContext.copy()
    runtimeContext.use_container = True
    runtimeContext.tmpdir_prefix = "tmp"
    runtimeContext.work_api = self.work_api

    if self.work_api == "containers":
        if self.ignore_docker_for_reuse:
            raise Exception("--ignore-docker-for-reuse not supported with containers API.")
        runtimeContext.outdir = "/var/spool/cwl"
        runtimeContext.docker_outdir = "/var/spool/cwl"
        runtimeContext.tmpdir = "/tmp"
        runtimeContext.docker_tmpdir = "/tmp"
    elif self.work_api == "jobs":
        if runtimeContext.priority != DEFAULT_PRIORITY:
            raise Exception("--priority not implemented for jobs API.")
        runtimeContext.outdir = "$(task.outdir)"
        runtimeContext.docker_outdir = "$(task.outdir)"
        runtimeContext.tmpdir = "$(task.tmpdir)"

    if runtimeContext.priority < 1 or runtimeContext.priority > 1000:
        raise Exception("--priority must be in the range 1..1000.")

    if self.should_estimate_cache_size:
        visited = set()
        estimated_size = [0]

        def estimate_collection_cache(obj):
            if obj.get("location", "").startswith("keep:"):
                m = pdh_size.match(obj["location"][5:])
                if m and m.group(1) not in visited:
                    visited.add(m.group(1))
                    estimated_size[0] += int(m.group(2))

        visit_class(job_order, ("File", "Directory"), estimate_collection_cache)
        runtimeContext.collection_cache_size = max(
            ((estimated_size[0] * 192) // (1024 * 1024)) + 1, 256)
        self.collection_cache.set_cap(
            runtimeContext.collection_cache_size * 1024 * 1024)
        logger.info("Using collection cache size %s MiB",
                    runtimeContext.collection_cache_size)

    runnerjob = None
    if runtimeContext.submit:
        # Submit a runner job to run the workflow for us.
        if self.work_api == "containers":
            if tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and (not runtimeContext.always_submit_runner):
                runtimeContext.runnerjob = tool.tool["id"]
            else:
                tool = RunnerContainer(self, tool, loadingContext,
                                       runtimeContext.enable_reuse,
                                       self.output_name,
                                       self.output_tags,
                                       submit_runner_ram=runtimeContext.submit_runner_ram,
                                       name=runtimeContext.name,
                                       on_error=runtimeContext.on_error,
                                       submit_runner_image=runtimeContext.submit_runner_image,
                                       intermediate_output_ttl=runtimeContext.intermediate_output_ttl,
                                       merged_map=merged_map,
                                       priority=runtimeContext.priority,
                                       secret_store=self.secret_store,
                                       collection_cache_size=runtimeContext.collection_cache_size,
                                       collection_cache_is_default=self.should_estimate_cache_size)
        elif self.work_api == "jobs":
            tool = RunnerJob(self, tool, loadingContext,
                             runtimeContext.enable_reuse,
                             self.output_name,
                             self.output_tags,
                             submit_runner_ram=runtimeContext.submit_runner_ram,
                             name=runtimeContext.name,
                             on_error=runtimeContext.on_error,
                             submit_runner_image=runtimeContext.submit_runner_image,
                             merged_map=merged_map)
    elif runtimeContext.cwl_runner_job is None and self.work_api == "jobs":
        # Create pipeline for local run
        self.pipeline = self.api.pipeline_instances().create(
            body={
                "owner_uuid": self.project_uuid,
                "name": runtimeContext.name if runtimeContext.name else shortname(tool.tool["id"]),
                "components": {},
                "state": "RunningOnClient"}).execute(num_retries=self.num_retries)
        logger.info("Pipeline instance %s", self.pipeline["uuid"])

    if runtimeContext.cwl_runner_job is not None:
        self.uuid = runtimeContext.cwl_runner_job.get('uuid')

    jobiter = tool.job(job_order,
                       self.output_callback,
                       runtimeContext)

    if runtimeContext.submit and not runtimeContext.wait:
        runnerjob = next(jobiter)
        runnerjob.run(runtimeContext)
        return (runnerjob.uuid, "success")

    current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger)
    if current_container:
        logger.info("Running inside container %s", current_container.get("uuid"))

    self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout)
    self.polling_thread = threading.Thread(target=self.poll_states)
    self.polling_thread.start()

    self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count)

    try:
        self.workflow_eval_lock.acquire()
        # Holds the lock while this code runs and releases it when
        # it is safe to do so in self.workflow_eval_lock.wait(),
        # at which point on_message can update job state and
        # process output callbacks.
loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.") break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") else: logger.error("Execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) if runtimeContext.submit and isinstance(tool, Runner): runnerjob = tool if runnerjob.uuid and self.work_api == "containers": self.api.container_requests().update(uuid=runnerjob.uuid, body={"priority": "0"}).execute(num_retries=self.num_retries) finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname(tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def main(args=None, stdout=sys.stdout): parser = argparse.ArgumentParser() Job.Runner.addToilOptions(parser) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enables # user to select jobStore or get a default from logic one below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--no-container", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) parser.add_argument( "--preserve-environment", type=str, nargs='+', help= "Preserve specified environment variables when running CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH", ), dest="preserve_environment") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!) workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] options = parser.parse_args([workdir] + args) use_container = not options.no_container setLoggingFromOptions(options) if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) fileindex = {} existing = {} with Toil(options) as toil: if options.restart: outobj = toil.restart() else: useStrict = not options.not_strict try: t = cwltool.load_tool.load_tool( options.cwltool, toilMakeTool, kwargs={ "hints": [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }] }, resolver=cwltool.resolver.tool_resolver, strict=useStrict) unsupportedRequirementsCheck(t.requirements) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 if type(t) == int: return t options.workflow = options.cwltool options.job_order = options.cwljob options.tool_help = None options.debug = options.logLevel == "DEBUG" job = cwltool.main.load_job_order(options, t, sys.stdin) if type(job) == int: return job job, options.basedir = job fillInDefaults(t.tool["inputs"], job) def pathToLoc(p): if "location" not in p and "path" in p: p["location"] = p["path"] del p["path"] def importFiles(tool): visit_class(tool, ("File", "Directory"), pathToLoc) normalizeFilesDirs(tool) adjustDirObjs( tool, functools.partial(get_listing, cwltool.stdfsaccess.StdFsAccess(""), recursive=True)) adjustFileObjs( tool, functools.partial(uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) t.visit(importFiles) for inp in t.tool["inputs"]: def setSecondary(fileobj): if isinstance(fileobj, dict) and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [{ "location": cwltool.builder.substitute( fileobj["location"], sf), "class": "File" } for sf in inp["secondaryFiles"]] if isinstance(fileobj, list): for e in fileobj: setSecondary(e) if shortname(inp["id"]) in job and inp.get("secondaryFiles"): setSecondary(job[shortname(inp["id"])]) importFiles(job) visitSteps(t, importFiles) make_fs_access = functools.partial(ToilFsAccess, fileStore=toil) try: (wf1, wf2) = makeJob( t, {}, use_container=use_container, preserve_environment=options.preserve_environment, tmpdir=os.path.realpath(outdir), workdir=options.workDir) except 
cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 wf1.cwljob = job outobj = toil.start(wf1) outobj = resolve_indirect(outobj) toilStageFiles(toil, outobj, outdir, fileindex, existing, True) visit_class( outobj, ("File", ), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) stdout.write(json.dumps(outobj, indent=4)) return 0
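setSecondary above derives secondaryFiles locations by pattern substitution on the primary file's location via cwltool.builder.substitute. The usual CWL convention, sketched here with a local helper rather than the cwltool call, is that a plain suffix is appended and each leading "^" strips one extension first; this is an illustrative re-implementation, not the cwltool code itself.

def substitute_suffix(path, pattern):
    # Each leading "^" removes one extension before the rest is appended.
    while pattern.startswith("^"):
        path = path.rsplit(".", 1)[0]
        pattern = pattern[1:]
    return path + pattern

substitute_suffix("reads.bam", ".bai")   # -> "reads.bam.bai"
substitute_suffix("reads.bam", "^.bai")  # -> "reads.bai"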
def job(self, joborder, output_callback, runtimeContext): builder = make_builder(joborder, self.hints, self.requirements, runtimeContext) runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext) req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer") if not req: return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext) # RunInSingleContainer is true with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "id" not in self.tool: raise WorkflowException("%s object must have 'id'" % (self.tool["class"])) document_loader, workflowobj, uri = (self.doc_loader, self.doc_loader.fetch(self.tool["id"]), self.tool["id"]) discover_secondary_files(self.tool["inputs"], joborder) with Perf(metrics, "subworkflow upload_deps"): upload_dependencies(self.arvrunner, os.path.basename(joborder.get("id", "#")), document_loader, joborder, joborder.get("id", "#"), False) if self.wf_pdh is None: workflowobj["requirements"] = dedup_reqs(self.requirements) workflowobj["hints"] = dedup_reqs(self.hints) packed = pack(document_loader, workflowobj, uri, self.metadata) def visit(item): for t in ("hints", "requirements"): if t not in item: continue for req in item[t]: if req["class"] == "ResourceRequirement": dyn = False for k in max_res_pars + sum_res_pars: if k in req: if isinstance(req[k], basestring): if item["id"] == "#main": # only the top-level requirements/hints may contain expressions self.dynamic_resource_req.append(req) dyn = True break else: with SourceLine(req, k, WorkflowException): raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions") if not dyn: self.static_resource_req.append(req) if req["class"] == "DockerRequirement": if "http://arvados.org/cwl#dockerCollectionPDH" in req: del req["http://arvados.org/cwl#dockerCollectionPDH"] visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit) if self.static_resource_req: self.static_resource_req = [get_overall_res_req(self.static_resource_req)] upload_dependencies(self.arvrunner, runtimeContext.name, document_loader, packed, uri, False) # Discover files/directories referenced by the # workflow (mainly "default" values) visit_class(packed, ("File", "Directory"), self.wf_reffiles.append) if self.dynamic_resource_req: # Evaluate dynamic resource requirements using current builder rs = copy.copy(self.static_resource_req) for dyn_rs in self.dynamic_resource_req: eval_req = {"class": "ResourceRequirement"} for a in max_res_pars + sum_res_pars: if a in dyn_rs: eval_req[a] = builder.do_eval(dyn_rs[a]) rs.append(eval_req) job_res_reqs = [get_overall_res_req(rs)] else: job_res_reqs = self.static_resource_req with Perf(metrics, "subworkflow adjust"): joborder_resolved = copy.deepcopy(joborder) joborder_keepmount = copy.deepcopy(joborder) reffiles = [] visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append) mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir, "/keep/%s", "/keep/%s/%s") # For containers API, we need to make sure any extra # referenced files (ie referenced by the workflow but # not in the inputs) are included in the mounts. 
if self.wf_reffiles: runtimeContext = runtimeContext.copy() runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles) def keepmount(obj): remove_redundant_fields(obj) with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "location" not in obj: raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj)) with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).target if "listing" in obj: del obj["listing"] elif obj["location"].startswith("_:"): del obj["location"] else: raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"]) visit_class(joborder_keepmount, ("File", "Directory"), keepmount) def resolved(obj): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).resolved visit_class(joborder_resolved, ("File", "Directory"), resolved) if self.wf_pdh is None: adjustFileObjs(packed, keepmount) adjustDirObjs(packed, keepmount) self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed) wf_runner = cmap({ "class": "CommandLineTool", "baseCommand": "cwltool", "inputs": self.tool["inputs"], "outputs": self.tool["outputs"], "stdout": "cwl.output.json", "requirements": self.requirements+job_res_reqs+[ {"class": "InlineJavascriptRequirement"}, { "class": "InitialWorkDirRequirement", "listing": [{ "entryname": "workflow.cwl", "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh }, { "entryname": "cwl.input.yml", "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${') }] }], "hints": self.hints, "arguments": ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl#main", "cwl.input.yml"], "id": "#" }) return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
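keepmount above swaps each keep: reference for the path where the collection will appear inside the container, while resolved keeps the keep: URI for the job order that is written out. A minimal sketch of the container-side view, following the "/keep/%s" template handed to ArvPathMapper above:

def to_mount_path(location):
    # "keep:<pdh>/file" -> "/keep/<pdh>/file", mirroring the "/keep/%s" and
    # "/keep/%s/%s" templates passed to ArvPathMapper.
    assert location.startswith("keep:")
    return "/keep/" + location[len("keep:"):]

to_mount_path("keep:99999999999999999999999999999999+99/input.fastq")
# -> "/keep/99999999999999999999999999999999+99/input.fastq"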
def arv_executor(self, updated_tool, job_order, runtimeContext, logger=None): self.debug = runtimeContext.debug updated_tool.visit(self.check_features) self.project_uuid = runtimeContext.project_uuid self.pipeline = None self.fs_access = runtimeContext.make_fs_access(runtimeContext.basedir) self.secret_store = runtimeContext.secret_store self.trash_intermediate = runtimeContext.trash_intermediate if self.trash_intermediate and self.work_api != "containers": raise Exception("--trash-intermediate is only supported with --api=containers.") self.intermediate_output_ttl = runtimeContext.intermediate_output_ttl if self.intermediate_output_ttl and self.work_api != "containers": raise Exception("--intermediate-output-ttl is only supported with --api=containers.") if self.intermediate_output_ttl < 0: raise Exception("Invalid value %d for --intermediate-output-ttl, cannot be less than zero" % self.intermediate_output_ttl) if runtimeContext.submit_request_uuid and self.work_api != "containers": raise Exception("--submit-request-uuid requires containers API, but using '{}' api".format(self.work_api)) if not runtimeContext.name: runtimeContext.name = self.name = updated_tool.tool.get("label") or updated_tool.metadata.get("label") or os.path.basename(updated_tool.tool["id"]) # Upload local file references in the job order. job_order = upload_job_order(self, "%s input" % runtimeContext.name, updated_tool, job_order) # the last clause means: if it is a command line tool, and we # are going to wait for the result, and always_submit_runner # is false, then we don't submit a runner process. submitting = (runtimeContext.update_workflow or runtimeContext.create_workflow or (runtimeContext.submit and not (updated_tool.tool["class"] == "CommandLineTool" and runtimeContext.wait and not runtimeContext.always_submit_runner))) loadingContext = self.loadingContext.copy() loadingContext.do_validate = False loadingContext.do_update = False if submitting: # Document may have been auto-updated. Reload the original # document with updating disabled because we want to # submit the document with its original CWL version, not # the auto-updated one. tool = load_tool(updated_tool.tool["id"], loadingContext) else: tool = updated_tool # Upload direct dependencies of workflow steps, get back mapping of files to keep references. # Also uploads docker images. merged_map = upload_workflow_deps(self, tool) # Recreate process object (ArvadosWorkflow or # ArvadosCommandTool) because tool document may have been # updated by upload_workflow_deps in ways that modify # inheritance of hints or requirements. loadingContext.loader = tool.doc_loader loadingContext.avsc_names = tool.doc_schema loadingContext.metadata = tool.metadata tool = load_tool(tool.tool, loadingContext) existing_uuid = runtimeContext.update_workflow if existing_uuid or runtimeContext.create_workflow: # Create a pipeline template or workflow record and exit. 
if self.work_api == "containers": return (upload_workflow(self, tool, job_order, self.project_uuid, uuid=existing_uuid, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, merged_map=merged_map), "success") self.apply_reqs(job_order, tool) self.ignore_docker_for_reuse = runtimeContext.ignore_docker_for_reuse self.eval_timeout = runtimeContext.eval_timeout runtimeContext = runtimeContext.copy() runtimeContext.use_container = True runtimeContext.tmpdir_prefix = "tmp" runtimeContext.work_api = self.work_api if self.work_api == "containers": if self.ignore_docker_for_reuse: raise Exception("--ignore-docker-for-reuse not supported with containers API.") runtimeContext.outdir = "/var/spool/cwl" runtimeContext.docker_outdir = "/var/spool/cwl" runtimeContext.tmpdir = "/tmp" runtimeContext.docker_tmpdir = "/tmp" if runtimeContext.priority < 1 or runtimeContext.priority > 1000: raise Exception("--priority must be in the range 1..1000.") if self.should_estimate_cache_size: visited = set() estimated_size = [0] def estimate_collection_cache(obj): if obj.get("location", "").startswith("keep:"): m = pdh_size.match(obj["location"][5:]) if m and m.group(1) not in visited: visited.add(m.group(1)) estimated_size[0] += int(m.group(2)) visit_class(job_order, ("File", "Directory"), estimate_collection_cache) runtimeContext.collection_cache_size = max(((estimated_size[0]*192) // (1024*1024))+1, 256) self.collection_cache.set_cap(runtimeContext.collection_cache_size*1024*1024) logger.info("Using collection cache size %s MiB", runtimeContext.collection_cache_size) runnerjob = None if runtimeContext.submit: # Submit a runner job to run the workflow for us. if self.work_api == "containers": if submitting: tool = RunnerContainer(self, updated_tool, tool, loadingContext, runtimeContext.enable_reuse, self.output_name, self.output_tags, submit_runner_ram=runtimeContext.submit_runner_ram, name=runtimeContext.name, on_error=runtimeContext.on_error, submit_runner_image=runtimeContext.submit_runner_image, intermediate_output_ttl=runtimeContext.intermediate_output_ttl, merged_map=merged_map, priority=runtimeContext.priority, secret_store=self.secret_store, collection_cache_size=runtimeContext.collection_cache_size, collection_cache_is_default=self.should_estimate_cache_size) else: runtimeContext.runnerjob = tool.tool["id"] if runtimeContext.cwl_runner_job is not None: self.uuid = runtimeContext.cwl_runner_job.get('uuid') jobiter = tool.job(job_order, self.output_callback, runtimeContext) if runtimeContext.submit and not runtimeContext.wait: runnerjob = next(jobiter) runnerjob.run(runtimeContext) return (runnerjob.uuid, "success") current_container = arvados_cwl.util.get_current_container(self.api, self.num_retries, logger) if current_container: logger.info("Running inside container %s", current_container.get("uuid")) self.poll_api = arvados.api('v1', timeout=runtimeContext.http_timeout) self.polling_thread = threading.Thread(target=self.poll_states) self.polling_thread.start() self.task_queue = TaskQueue(self.workflow_eval_lock, self.thread_count) try: self.workflow_eval_lock.acquire() # Holds the lock while this code runs and releases it when # it is safe to do so in self.workflow_eval_lock.wait(), # at which point on_message can update job state and # process output callbacks. 
loopperf = Perf(metrics, "jobiter") loopperf.__enter__() for runnable in jobiter: loopperf.__exit__() if self.stop_polling.is_set(): break if self.task_queue.error is not None: raise self.task_queue.error if runnable: with Perf(metrics, "run"): self.start_run(runnable, runtimeContext) else: if (self.task_queue.in_flight + len(self.processes)) > 0: self.workflow_eval_lock.wait(3) else: logger.error("Workflow is deadlocked, no runnable processes and not waiting on any pending processes.") break if self.stop_polling.is_set(): break loopperf.__enter__() loopperf.__exit__() while (self.task_queue.in_flight + len(self.processes)) > 0: if self.task_queue.error is not None: raise self.task_queue.error self.workflow_eval_lock.wait(3) except UnsupportedRequirement: raise except: if sys.exc_info()[0] is KeyboardInterrupt or sys.exc_info()[0] is SystemExit: logger.error("Interrupted, workflow will be cancelled") elif isinstance(sys.exc_info()[1], WorkflowException): logger.error("Workflow execution failed:\n%s", sys.exc_info()[1], exc_info=(sys.exc_info()[1] if self.debug else False)) else: logger.exception("Workflow execution failed") if self.pipeline: self.api.pipeline_instances().update(uuid=self.pipeline["uuid"], body={"state": "Failed"}).execute(num_retries=self.num_retries) if self.work_api == "containers" and not current_container: # Not running in a crunch container, so cancel any outstanding processes. for p in self.processes: try: self.api.container_requests().update(uuid=p, body={"priority": "0"} ).execute(num_retries=self.num_retries) except Exception: pass finally: self.workflow_eval_lock.release() self.task_queue.drain() self.stop_polling.set() self.polling_thread.join() self.task_queue.join() if self.final_status == "UnsupportedRequirement": raise UnsupportedRequirement("Check log for details.") if self.final_output is None: raise WorkflowException("Workflow did not return a result.") if runtimeContext.submit and isinstance(tool, Runner): logger.info("Final output collection %s", tool.final_output) else: if self.output_name is None: self.output_name = "Output of %s" % (shortname(tool.tool["id"])) if self.output_tags is None: self.output_tags = "" storage_classes = runtimeContext.storage_classes.strip().split(",") self.final_output, self.final_output_collection = self.make_output_collection(self.output_name, storage_classes, self.output_tags, self.final_output) self.set_crunch_output() if runtimeContext.compute_checksum: adjustDirObjs(self.final_output, partial(get_listing, self.fs_access)) adjustFileObjs(self.final_output, partial(compute_checksums, self.fs_access)) if self.trash_intermediate and self.final_status == "success": self.trash_intermediate_output() return (self.final_output, self.final_status)
def upload_dependencies(arvrunner, name, document_loader, workflowobj, uri, loadref_run, include_primary=True, discovered_secondaryfiles=None): """Upload the dependencies of the workflowobj document to Keep. Returns a pathmapper object mapping local paths to keep references. Also does an in-place update of references in "workflowobj". Use scandeps to find $import, $include, $schemas, run, File and Directory fields that represent external references. If workflowobj has an "id" field, this will reload the document to ensure it is scanning the raw document prior to preprocessing. """ loaded = set() def loadref(b, u): joined = document_loader.fetcher.urljoin(b, u) defrg, _ = urlparse.urldefrag(joined) if defrg not in loaded: loaded.add(defrg) # Use fetch_text to get raw file (before preprocessing). text = document_loader.fetch_text(defrg) if isinstance(text, bytes): textIO = StringIO(text.decode('utf-8')) else: textIO = StringIO(text) return yaml.safe_load(textIO) else: return {} if loadref_run: loadref_fields = set(("$import", "run")) else: loadref_fields = set(("$import",)) scanobj = workflowobj if "id" in workflowobj: # Need raw file content (before preprocessing) to ensure # that external references in $include and $mixin are captured. scanobj = loadref("", workflowobj["id"]) sc_result = scandeps(uri, scanobj, loadref_fields, set(("$include", "$schemas", "location")), loadref, urljoin=document_loader.fetcher.urljoin) sc = [] def only_real(obj): # Only interested in local files than need to be uploaded, # don't include file literals, keep references, etc. sp = obj.get("location", "").split(":") if len(sp) > 1 and sp[0] in ("file", "http", "https"): sc.append(obj) visit_class(sc_result, ("File", "Directory"), only_real) normalizeFilesDirs(sc) if include_primary and "id" in workflowobj: sc.append({"class": "File", "location": workflowobj["id"]}) if "$schemas" in workflowobj: for s in workflowobj["$schemas"]: sc.append({"class": "File", "location": s}) def visit_default(obj): remove = [False] def ensure_default_location(f): if "location" not in f and "path" in f: f["location"] = f["path"] del f["path"] if "location" in f and not arvrunner.fs_access.exists(f["location"]): # Doesn't exist, remove from list of dependencies to upload sc[:] = [x for x in sc if x["location"] != f["location"]] # Delete "default" from workflowobj remove[0] = True visit_class(obj["default"], ("File", "Directory"), ensure_default_location) if remove[0]: del obj["default"] find_defaults(workflowobj, visit_default) discovered = {} def discover_default_secondary_files(obj): discover_secondary_files(obj["inputs"], {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t}, discovered) visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files) for d in list(discovered.keys()): # Only interested in discovered secondaryFiles which are local # files that need to be uploaded. 
if d.startswith("file:"): sc.extend(discovered[d]) else: del discovered[d] mapper = ArvPathMapper(arvrunner, sc, "", "keep:%s", "keep:%s/%s", name=name, single_collection=True) def setloc(p): if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")): p["location"] = mapper.mapper(p["location"]).resolved visit_class(workflowobj, ("File", "Directory"), setloc) visit_class(discovered, ("File", "Directory"), setloc) if discovered_secondaryfiles is not None: for d in discovered: discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d] if "$schemas" in workflowobj: sch = [] for s in workflowobj["$schemas"]: sch.append(mapper.mapper(s).resolved) workflowobj["$schemas"] = sch return mapper
def main(args=None, stdout=sys.stdout): config = Config() config.cwl = True parser = argparse.ArgumentParser() addOptions(parser, config) parser.add_argument("cwltool", type=str) parser.add_argument("cwljob", nargs=argparse.REMAINDER) # Will override the "jobStore" positional argument, enabling the # user to select a jobStore or fall back to the default chosen by the logic below. parser.add_argument("--jobStore", type=str) parser.add_argument("--not-strict", action="store_true") parser.add_argument("--no-container", action="store_true") parser.add_argument("--quiet", dest="logLevel", action="store_const", const="ERROR") parser.add_argument("--basedir", type=str) parser.add_argument("--outdir", type=str, default=os.getcwd()) parser.add_argument("--version", action='version', version=baseVersion) parser.add_argument("--user-space-docker-cmd", help="(Linux/OS X only) Specify a user space docker " "command (like udocker or dx-docker) that will be " "used to call 'pull' and 'run'") parser.add_argument("--preserve-environment", type=str, nargs='+', help="Preserve specified environment variables when running CommandLineTools", metavar=("VAR1 VAR2"), default=("PATH",), dest="preserve_environment") # help="Dependency resolver configuration file describing how to adapt 'SoftwareRequirement' packages to the current system." parser.add_argument("--beta-dependency-resolvers-configuration", default=None) # help="Default root directory used by the dependency resolvers configuration." parser.add_argument("--beta-dependencies-directory", default=None) # help="Use biocontainers for tools without an explicitly annotated Docker container." parser.add_argument("--beta-use-biocontainers", default=None, action="store_true") # help="Shortcut to use Conda to resolve 'SoftwareRequirement' packages." parser.add_argument("--beta-conda-dependencies", default=None, action="store_true") parser.add_argument("--tmpdir-prefix", type=Text, help="Path prefix for temporary directories", default="tmp") parser.add_argument("--tmp-outdir-prefix", type=Text, help="Path prefix for intermediate output directories", default="tmp") # mkdtemp actually creates the directory, but # toil requires that the directory not exist, # so make it and delete it and allow # toil to create it again (!)
workdir = tempfile.mkdtemp() os.rmdir(workdir) if args is None: args = sys.argv[1:] options = parser.parse_args([workdir] + args) use_container = not options.no_container if options.logLevel: cwllogger.setLevel(options.logLevel) outdir = os.path.abspath(options.outdir) fileindex = {} existing = {} make_tool_kwargs = {} conf_file = getattr(options, "beta_dependency_resolvers_configuration", None) # Text use_conda_dependencies = getattr(options, "beta_conda_dependencies", None) # Text job_script_provider = None if conf_file or use_conda_dependencies: dependencies_configuration = DependenciesConfiguration(options) # type: DependenciesConfiguration job_script_provider = dependencies_configuration options.default_container = None make_tool_kwargs["find_default_container"] = functools.partial(find_default_container, options) with Toil(options) as toil: if options.restart: outobj = toil.restart() else: useStrict = not options.not_strict make_tool_kwargs["hints"] = [{ "class": "ResourceRequirement", "coresMin": toil.config.defaultCores, "ramMin": toil.config.defaultMemory / (2**20), "outdirMin": toil.config.defaultDisk / (2**20), "tmpdirMin": 0 }] try: t = cwltool.load_tool.load_tool(options.cwltool, toilMakeTool, kwargs=make_tool_kwargs, resolver=cwltool.resolver.tool_resolver, strict=useStrict) unsupportedRequirementsCheck(t.requirements) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 if type(t) == int: return t options.workflow = options.cwltool options.job_order = options.cwljob options.tool_help = None options.debug = options.logLevel == "DEBUG" job, options.basedir, loader = cwltool.main.load_job_order( options, sys.stdin, None, [], options.job_order) job = cwltool.main.init_job_order(job, options, t, loader=loader) fillInDefaults(t.tool["inputs"], job) def pathToLoc(p): if "location" not in p and "path" in p: p["location"] = p["path"] del p["path"] def importFiles(tool): visit_class(tool, ("File", "Directory"), pathToLoc) normalizeFilesDirs(tool) adjustDirObjs(tool, functools.partial(get_listing, cwltool.stdfsaccess.StdFsAccess(""), recursive=True)) adjustFileObjs(tool, functools.partial(uploadFile, toil.importFile, fileindex, existing, skip_broken=True)) t.visit(importFiles) for inp in t.tool["inputs"]: def setSecondary(fileobj): if isinstance(fileobj, dict) and fileobj.get("class") == "File": if "secondaryFiles" not in fileobj: fileobj["secondaryFiles"] = [{ "location": cwltool.builder.substitute(fileobj["location"], sf), "class": "File"} for sf in inp["secondaryFiles"]] if isinstance(fileobj, list): for e in fileobj: setSecondary(e) if shortname(inp["id"]) in job and inp.get("secondaryFiles"): setSecondary(job[shortname(inp["id"])]) importFiles(job) visitSteps(t, importFiles) try: make_opts = copy.deepcopy(vars(options)) make_opts.update({'tool': t, 'jobobj': {}, 'use_container': use_container, 'tmpdir': os.path.realpath(outdir), 'job_script_provider': job_script_provider}) (wf1, wf2) = makeJob(**make_opts) except cwltool.process.UnsupportedRequirement as e: logging.error(e) return 33 wf1.cwljob = job outobj = toil.start(wf1) outobj = resolve_indirect(outobj) toilStageFiles(toil, outobj, outdir, fileindex, existing, True) visit_class(outobj, ("File",), functools.partial(compute_checksums, cwltool.stdfsaccess.StdFsAccess(""))) stdout.write(json.dumps(outobj, indent=4)) return 0
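The injected ResourceRequirement hint above converts Toil's defaults, which are stored in bytes, into the mebibyte-denominated ramMin/outdirMin fields CWL expects, hence the division by 2**20. A short illustration with made-up defaults:

default_memory_bytes = 2 * 1024 ** 3     # 2 GiB, as Toil stores it
default_disk_bytes = 50 * 1024 ** 3      # 50 GiB

hint = {
    "class": "ResourceRequirement",
    "coresMin": 1,
    "ramMin": default_memory_bytes / (2 ** 20),    # 2048.0 MiB
    "outdirMin": default_disk_bytes / (2 ** 20),   # 51200.0 MiB
    "tmpdirMin": 0,
}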
def job(self, joborder, output_callback, runtimeContext): builder = make_builder(joborder, self.hints, self.requirements, runtimeContext) runtimeContext = set_cluster_target(self.tool, self.arvrunner, builder, runtimeContext) req, _ = self.get_requirement("http://arvados.org/cwl#RunInSingleContainer") if not req: return super(ArvadosWorkflow, self).job(joborder, output_callback, runtimeContext) # RunInSingleContainer is true with SourceLine(self.tool, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "id" not in self.tool: raise WorkflowException("%s object must have 'id'" % (self.tool["class"])) discover_secondary_files(self.arvrunner.fs_access, builder, self.tool["inputs"], joborder) with Perf(metrics, "subworkflow upload_deps"): upload_dependencies(self.arvrunner, os.path.basename(joborder.get("id", "#")), self.doc_loader, joborder, joborder.get("id", "#"), False) if self.wf_pdh is None: packed = pack(self.loadingContext, self.tool["id"], loader=self.doc_loader) for p in packed["$graph"]: if p["id"] == "#main": p["requirements"] = dedup_reqs(self.requirements) p["hints"] = dedup_reqs(self.hints) def visit(item): if "requirements" in item: item["requirements"] = [i for i in item["requirements"] if i["class"] != "DockerRequirement"] for t in ("hints", "requirements"): if t not in item: continue for req in item[t]: if req["class"] == "ResourceRequirement": dyn = False for k in max_res_pars + sum_res_pars: if k in req: if isinstance(req[k], basestring): if item["id"] == "#main": # only the top-level requirements/hints may contain expressions self.dynamic_resource_req.append(req) dyn = True break else: with SourceLine(req, k, WorkflowException): raise WorkflowException("Non-top-level ResourceRequirement in single container cannot have expressions") if not dyn: self.static_resource_req.append(req) visit_class(packed["$graph"], ("Workflow", "CommandLineTool"), visit) if self.static_resource_req: self.static_resource_req = [get_overall_res_req(self.static_resource_req)] upload_dependencies(self.arvrunner, runtimeContext.name, self.doc_loader, packed, self.tool["id"], False) # Discover files/directories referenced by the # workflow (mainly "default" values) visit_class(packed, ("File", "Directory"), self.wf_reffiles.append) if self.dynamic_resource_req: # Evaluate dynamic resource requirements using current builder rs = copy.copy(self.static_resource_req) for dyn_rs in self.dynamic_resource_req: eval_req = {"class": "ResourceRequirement"} for a in max_res_pars + sum_res_pars: if a in dyn_rs: eval_req[a] = builder.do_eval(dyn_rs[a]) rs.append(eval_req) job_res_reqs = [get_overall_res_req(rs)] else: job_res_reqs = self.static_resource_req with Perf(metrics, "subworkflow adjust"): joborder_resolved = copy.deepcopy(joborder) joborder_keepmount = copy.deepcopy(joborder) reffiles = [] visit_class(joborder_keepmount, ("File", "Directory"), reffiles.append) mapper = ArvPathMapper(self.arvrunner, reffiles+self.wf_reffiles, runtimeContext.basedir, "/keep/%s", "/keep/%s/%s") # For containers API, we need to make sure any extra # referenced files (ie referenced by the workflow but # not in the inputs) are included in the mounts. 
if self.wf_reffiles: runtimeContext = runtimeContext.copy() runtimeContext.extra_reffiles = copy.deepcopy(self.wf_reffiles) def keepmount(obj): remove_redundant_fields(obj) with SourceLine(obj, None, WorkflowException, logger.isEnabledFor(logging.DEBUG)): if "location" not in obj: raise WorkflowException("%s object is missing required 'location' field: %s" % (obj["class"], obj)) with SourceLine(obj, "location", WorkflowException, logger.isEnabledFor(logging.DEBUG)): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).target if "listing" in obj: del obj["listing"] elif obj["location"].startswith("_:"): del obj["location"] else: raise WorkflowException("Location is not a keep reference or a literal: '%s'" % obj["location"]) visit_class(joborder_keepmount, ("File", "Directory"), keepmount) def resolved(obj): if obj["location"].startswith("keep:"): obj["location"] = mapper.mapper(obj["location"]).resolved visit_class(joborder_resolved, ("File", "Directory"), resolved) if self.wf_pdh is None: adjustFileObjs(packed, keepmount) adjustDirObjs(packed, keepmount) self.wf_pdh = upload_workflow_collection(self.arvrunner, shortname(self.tool["id"]), packed) self.loadingContext = self.loadingContext.copy() self.loadingContext.metadata = self.loadingContext.metadata.copy() self.loadingContext.metadata["http://commonwl.org/cwltool#original_cwlVersion"] = "v1.0" if len(job_res_reqs) == 1: # RAM request needs to be at least 128 MiB or the workflow # runner itself won't run reliably. if job_res_reqs[0].get("ramMin", 1024) < 128: job_res_reqs[0]["ramMin"] = 128 arguments = ["--no-container", "--move-outputs", "--preserve-entire-environment", "workflow.cwl", "cwl.input.yml"] if runtimeContext.debug: arguments.insert(0, '--debug') wf_runner = cmap({ "class": "CommandLineTool", "baseCommand": "cwltool", "inputs": self.tool["inputs"], "outputs": self.tool["outputs"], "stdout": "cwl.output.json", "requirements": self.requirements+job_res_reqs+[ {"class": "InlineJavascriptRequirement"}, { "class": "InitialWorkDirRequirement", "listing": [{ "entryname": "workflow.cwl", "entry": '$({"class": "File", "location": "keep:%s/workflow.cwl"})' % self.wf_pdh }, { "entryname": "cwl.input.yml", "entry": json.dumps(joborder_keepmount, indent=2, sort_keys=True, separators=(',',': ')).replace("\\", "\\\\").replace('$(', '\$(').replace('${', '\${') }] }], "hints": self.hints, "arguments": arguments, "id": "#" }) return ArvadosCommandTool(self.arvrunner, wf_runner, self.loadingContext).job(joborder_resolved, output_callback, runtimeContext)
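The cwl.input.yml entry above is materialized through an InitialWorkDirRequirement, whose entry text is itself subject to CWL parameter-reference interpolation, so literal $( and ${ sequences in the serialized job order are backslash-escaped (after doubling existing backslashes). A small sketch of that escaping step, with a hypothetical helper name:

import json

def escape_for_entry(obj):
    # Mirrors the replace() chain above: protect backslashes first, then
    # neutralize CWL expression openers so they survive interpolation.
    text = json.dumps(obj, indent=2, sort_keys=True, separators=(',', ': '))
    return text.replace("\\", "\\\\").replace('$(', '\\$(').replace('${', '\\${')

escape_for_entry({"msg": "cost is $(5)"})
# the "$(" opener comes back escaped as "\$(" so CWL will not try to evaluate it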
def arvados_job_spec(self, runtimeContext): """Create an Arvados container request for this workflow. The returned dict can be used to create a container passed as the +body+ argument to container_requests().create(). """ adjustDirObjs(self.job_order, trim_listing) visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location) visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields) secret_mounts = {} for param in sorted(self.job_order.keys()): if self.secret_store.has_secret(self.job_order[param]): mnt = "/secrets/s%d" % len(secret_mounts) secret_mounts[mnt] = { "kind": "text", "content": self.secret_store.retrieve(self.job_order[param]) } self.job_order[param] = {"$include": mnt} container_req = { "name": self.name, "output_path": "/var/spool/cwl", "cwd": "/var/spool/cwl", "priority": self.priority, "state": "Committed", "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image), "mounts": { "/var/lib/cwl/cwl.input.json": { "kind": "json", "content": self.job_order }, "stdout": { "kind": "file", "path": "/var/spool/cwl/cwl.output.json" }, "/var/spool/cwl": { "kind": "collection", "writable": True } }, "secret_mounts": secret_mounts, "runtime_constraints": { "vcpus": math.ceil(self.submit_runner_cores), "ram": 1024*1024 * (math.ceil(self.submit_runner_ram) + math.ceil(self.collection_cache_size)), "API": True }, "use_existing": self.enable_reuse, "properties": {} } if self.embedded_tool.tool.get("id", "").startswith("keep:"): sp = self.embedded_tool.tool["id"].split('/') workflowcollection = sp[0][5:] workflowname = "/".join(sp[1:]) workflowpath = "/var/lib/cwl/workflow/%s" % workflowname container_req["mounts"]["/var/lib/cwl/workflow"] = { "kind": "collection", "portable_data_hash": "%s" % workflowcollection } else: packed = packed_workflow(self.arvrunner, self.embedded_tool, self.merged_map) workflowpath = "/var/lib/cwl/workflow.json#main" container_req["mounts"]["/var/lib/cwl/workflow.json"] = { "kind": "json", "content": packed } if self.embedded_tool.tool.get("id", "").startswith("arvwf:"): container_req["properties"]["template_uuid"] = self.embedded_tool.tool["id"][6:33] # --local means execute the workflow instead of submitting a container request # --api=containers means use the containers API # --no-log-timestamps means don't add timestamps (the logging infrastructure does this) # --disable-validate because we already validated so don't need to do it again # --eval-timeout is the timeout for javascript invocation # --parallel-task-count is the number of threads to use for job submission # --enable/disable-reuse sets desired job reuse # --collection-cache-size sets aside memory to store collections command = ["arvados-cwl-runner", "--local", "--api=containers", "--no-log-timestamps", "--disable-validate", "--eval-timeout=%s" % self.arvrunner.eval_timeout, "--thread-count=%s" % self.arvrunner.thread_count, "--enable-reuse" if self.enable_reuse else "--disable-reuse", "--collection-cache-size=%s" % self.collection_cache_size] if self.output_name: command.append("--output-name=" + self.output_name) container_req["output_name"] = self.output_name if self.output_tags: command.append("--output-tags=" + self.output_tags) if runtimeContext.debug: command.append("--debug") if runtimeContext.storage_classes != "default": command.append("--storage-classes=" + runtimeContext.storage_classes) if self.on_error: command.append("--on-error=" + self.on_error) if self.intermediate_output_ttl: command.append("--intermediate-output-ttl=%d" % 
self.intermediate_output_ttl) if self.arvrunner.trash_intermediate: command.append("--trash-intermediate") if self.arvrunner.project_uuid: command.append("--project-uuid="+self.arvrunner.project_uuid) command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"]) container_req["command"] = command return container_req
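The runner container's RAM constraint above is expressed in bytes: submit_runner_ram and the collection cache budget, both in MiB, are summed and scaled by 1024*1024. A quick worked example with illustrative values:

import math

submit_runner_ram = 1024        # MiB requested for the runner itself
collection_cache_size = 256     # MiB reserved for the collection cache

ram_bytes = 1024 * 1024 * (math.ceil(submit_runner_ram) +
                           math.ceil(collection_cache_size))
# 1024 * 1024 * 1280 = 1342177280 bytes (1.25 GiB)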
def arvados_job_spec(self, dry_run=False, pull_image=True, **kwargs): """Create an Arvados container request for this workflow. The returned dict can be used to create a container passed as the +body+ argument to container_requests().create(). """ adjustDirObjs(self.job_order, trim_listing) visit_class(self.job_order, ("File", "Directory"), trim_anonymous_location) visit_class(self.job_order, ("File", "Directory"), remove_redundant_fields) container_req = { "owner_uuid": self.arvrunner.project_uuid, "name": self.name, "output_path": "/var/spool/cwl", "cwd": "/var/spool/cwl", "priority": 1, "state": "Committed", "container_image": arvados_jobs_image(self.arvrunner, self.jobs_image), "mounts": { "/var/lib/cwl/cwl.input.json": { "kind": "json", "content": self.job_order }, "stdout": { "kind": "file", "path": "/var/spool/cwl/cwl.output.json" }, "/var/spool/cwl": { "kind": "collection", "writable": True } }, "runtime_constraints": { "vcpus": 1, "ram": 1024*1024 * self.submit_runner_ram, "API": True }, "properties": {} } if self.tool.tool.get("id", "").startswith("keep:"): sp = self.tool.tool["id"].split('/') workflowcollection = sp[0][5:] workflowname = "/".join(sp[1:]) workflowpath = "/var/lib/cwl/workflow/%s" % workflowname container_req["mounts"]["/var/lib/cwl/workflow"] = { "kind": "collection", "portable_data_hash": "%s" % workflowcollection } else: packed = packed_workflow(self.arvrunner, self.tool) workflowpath = "/var/lib/cwl/workflow.json#main" container_req["mounts"]["/var/lib/cwl/workflow.json"] = { "kind": "json", "content": packed } if self.tool.tool.get("id", "").startswith("arvwf:"): container_req["properties"]["template_uuid"] = self.tool.tool["id"][6:33] command = ["arvados-cwl-runner", "--local", "--api=containers", "--no-log-timestamps"] if self.output_name: command.append("--output-name=" + self.output_name) container_req["output_name"] = self.output_name if self.output_tags: command.append("--output-tags=" + self.output_tags) if kwargs.get("debug"): command.append("--debug") if self.enable_reuse: command.append("--enable-reuse") else: command.append("--disable-reuse") if self.on_error: command.append("--on-error=" + self.on_error) if self.intermediate_output_ttl: command.append("--intermediate-output-ttl=%d" % self.intermediate_output_ttl) if self.arvrunner.trash_intermediate: command.append("--trash-intermediate") if self.arvrunner.project_uuid: command.append("--project-uuid="+self.arvrunner.project_uuid) command.extend([workflowpath, "/var/lib/cwl/cwl.input.json"]) container_req["command"] = command return container_req
def execute(self, context): post_status(context) self.cwlwf, it_is_workflow = load_cwl( self.dag.default_args["cwl_workflow"], self.dag.default_args) self.cwl_step = [ step for step in self.cwlwf.steps if self.task_id == step.id.split("#")[-1] ][0] if it_is_workflow else self.cwlwf _logger.info('{0}: Running!'.format(self.task_id)) upstream_task_ids = [t.task_id for t in self.upstream_list] + \ ([self.reader_task_id] if self.reader_task_id else []) _logger.debug('{0}: Collecting outputs from: \n{1}'.format( self.task_id, json.dumps(upstream_task_ids, indent=4))) upstream_data = self.xcom_pull(context=context, task_ids=upstream_task_ids) _logger.info('{0}: Upstream data: \n {1}'.format( self.task_id, json.dumps(upstream_data, indent=4))) promises = {} for data in upstream_data: # upstream_data is an array with { promises and outdir } promises = merge(promises, data["promises"]) if "outdir" in data: self.outdir = data["outdir"] _d_args = self.dag.default_args if not self.outdir: self.outdir = _d_args['tmp_folder'] _logger.debug('{0}: Step inputs: {1}'.format( self.task_id, json.dumps(self.cwl_step.tool["inputs"], indent=4))) _logger.debug('{0}: Step outputs: {1}'.format( self.task_id, json.dumps(self.cwl_step.tool["outputs"], indent=4))) jobobj = {} for inp in self.cwl_step.tool["inputs"]: jobobj_id = shortname(inp["id"]).split("/")[-1] source_ids = [] promises_outputs = [] try: source_field = inp["source"] if it_is_workflow else inp.get( "id") source_ids = [shortname(s) for s in source_field] if isinstance( source_field, list) else [shortname(source_field)] promises_outputs = [ promises[source_id] for source_id in source_ids if source_id in promises ] except: _logger.warning( "{0}: Couldn't find source field in step input: {1}". format(self.task_id, json.dumps(inp, indent=4))) _logger.info( '{0}: For input {1} with source_ids: {2} found upstream outputs: \n{3}' .format(self.task_id, jobobj_id, source_ids, promises_outputs)) if len(promises_outputs) > 1: if inp.get("linkMerge", "merge_nested") == "merge_flattened": jobobj[jobobj_id] = flatten(promises_outputs) else: jobobj[jobobj_id] = promises_outputs # Should also check if [None], because in this case we need to take default value elif len(promises_outputs) == 1 and (promises_outputs[0] is not None): jobobj[jobobj_id] = promises_outputs[0] elif "valueFrom" in inp: jobobj[jobobj_id] = None elif "default" in inp: d = copy.copy(inp["default"]) jobobj[jobobj_id] = d else: continue _logger.debug('{0}: Collected job object: \n {1}'.format( self.task_id, json.dumps(jobobj, indent=4))) def _post_scatter_eval(shortio, cwl_step): _value_from = { shortname(i["id"]).split("/")[-1]: i["valueFrom"] for i in cwl_step.tool["inputs"] if "valueFrom" in i } _logger.debug('{0}: Step inputs with valueFrom: \n{1}'.format( self.task_id, json.dumps(_value_from, indent=4))) def value_from_func(k, v): if k in _value_from: return expression.do_eval(_value_from[k], shortio, self.cwlwf.tool.get( "requirements", []), None, None, {}, context=v) else: return v return {k: value_from_func(k, v) for k, v in shortio.items()} job = _post_scatter_eval(jobobj, self.cwl_step) _logger.info('{0}: Final job data: \n {1}'.format( self.task_id, json.dumps(job, indent=4))) _d_args['outdir'] = tempfile.mkdtemp( prefix=os.path.join(self.outdir, "step_tmp")) _d_args['tmpdir_prefix'] = os.path.join(_d_args['outdir'], 'cwl_tmp_') _d_args['tmp_outdir_prefix'] = os.path.join(_d_args['outdir'], 'cwl_outdir_') _d_args["record_container_id"] = True _d_args["cidfile_dir"] = _d_args['outdir'] 
_d_args["cidfile_prefix"] = self.task_id _logger.debug('{0}: Runtime context: \n {1}'.format(self, _d_args)) executor = SingleJobExecutor() runtimeContext = RuntimeContext(_d_args) runtimeContext.make_fs_access = getdefault( runtimeContext.make_fs_access, StdFsAccess) for inp in self.cwl_step.tool["inputs"]: if inp.get("not_connected"): del job[shortname(inp["id"].split("/")[-1])] _stderr = sys.stderr sys.stderr = sys.__stderr__ (output, status) = executor( self.cwl_step.embedded_tool if it_is_workflow else self.cwl_step, job, runtimeContext, logger=_logger) sys.stderr = _stderr if not output and status == "permanentFail": raise ValueError _logger.debug('{0}: Embedded tool outputs: \n {1}'.format( self.task_id, json.dumps(output, indent=4))) promises = {} for out in self.cwl_step.tool["outputs"]: out_id = shortname(out["id"]) jobout_id = out_id.split("/")[-1] try: promises[out_id] = output[jobout_id] except: continue # Unsetting the Generation from final output object visit_class(promises, ("File", ), MutationManager().unset_generation) data = {"promises": promises, "outdir": self.outdir} _logger.info('{0}: Output: \n {1}'.format(self.task_id, json.dumps(data, indent=4))) return data
def upload_dependencies(arvrunner, name, document_loader, workflowobj, uri, loadref_run, include_primary=True, discovered_secondaryfiles=None): """Upload the dependencies of the workflowobj document to Keep. Returns a pathmapper object mapping local paths to keep references. Also does an in-place update of references in "workflowobj". Use scandeps to find $import, $include, $schemas, run, File and Directory fields that represent external references. If workflowobj has an "id" field, this will reload the document to ensure it is scanning the raw document prior to preprocessing. """ loaded = set() def loadref(b, u): joined = document_loader.fetcher.urljoin(b, u) defrg, _ = urlparse.urldefrag(joined) if defrg not in loaded: loaded.add(defrg) # Use fetch_text to get raw file (before preprocessing). text = document_loader.fetch_text(defrg) if isinstance(text, bytes): textIO = StringIO(text.decode('utf-8')) else: textIO = StringIO(text) return yaml.safe_load(textIO) else: return {} if loadref_run: loadref_fields = set(("$import", "run")) else: loadref_fields = set(("$import",)) scanobj = workflowobj if "id" in workflowobj: # Need raw file content (before preprocessing) to ensure # that external references in $include and $mixin are captured. scanobj = loadref("", workflowobj["id"]) sc_result = scandeps(uri, scanobj, loadref_fields, set(("$include", "$schemas", "location")), loadref, urljoin=document_loader.fetcher.urljoin) sc = [] def only_real(obj): if obj.get("location", "").startswith("file:"): sc.append(obj) visit_class(sc_result, ("File", "Directory"), only_real) normalizeFilesDirs(sc) if include_primary and "id" in workflowobj: sc.append({"class": "File", "location": workflowobj["id"]}) if "$schemas" in workflowobj: for s in workflowobj["$schemas"]: sc.append({"class": "File", "location": s}) def visit_default(obj): remove = [False] def ensure_default_location(f): if "location" not in f and "path" in f: f["location"] = f["path"] del f["path"] if "location" in f and not arvrunner.fs_access.exists(f["location"]): # Doesn't exist, remove from list of dependencies to upload sc[:] = [x for x in sc if x["location"] != f["location"]] # Delete "default" from workflowobj remove[0] = True visit_class(obj["default"], ("File", "Directory"), ensure_default_location) if remove[0]: del obj["default"] find_defaults(workflowobj, visit_default) discovered = {} def discover_default_secondary_files(obj): discover_secondary_files(obj["inputs"], {shortname(t["id"]): t["default"] for t in obj["inputs"] if "default" in t}, discovered) visit_class(workflowobj, ("CommandLineTool", "Workflow"), discover_default_secondary_files) for d in discovered: sc.extend(discovered[d]) mapper = ArvPathMapper(arvrunner, sc, "", "keep:%s", "keep:%s/%s", name=name, single_collection=True) def setloc(p): if "location" in p and (not p["location"].startswith("_:")) and (not p["location"].startswith("keep:")): p["location"] = mapper.mapper(p["location"]).resolved visit_class(workflowobj, ("File", "Directory"), setloc) visit_class(discovered, ("File", "Directory"), setloc) if discovered_secondaryfiles is not None: for d in discovered: discovered_secondaryfiles[mapper.mapper(d).resolved] = discovered[d] if "$schemas" in workflowobj: sch = [] for s in workflowobj["$schemas"]: sch.append(mapper.mapper(s).resolved) workflowobj["$schemas"] = sch return mapper