def fsx_to_s3(logger, run_dir, files):
    files = [os.path.join(run_dir, fn) for fn in files]
    files = [fn for fn in files if os.path.exists(fn)]
    try:
        logger.debug(_("writing", files=files))
        proc = subprocess.run(
            ["fsx_to_s3"] + files,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            universal_newlines=True,
        )
    except subprocess.CalledProcessError as exn:
        logger.error(_("failed writing output file(s) to S3", stderr=exn.stderr))
        raise WDL.Error.RuntimeError("failed writing output file(s) to S3") from exn
    # read stdout table of local filenames & uploaded URIs
    ans = {}
    for line in proc.stdout.split("\n"):
        line = line.strip()
        if line:
            line = line.split("\t")
            if line:
                assert len(line) == 2 and line[0] not in ans
                logger.info(_("wrote", uri=line[1], size=os.path.getsize(line[0])))
                ans[line[0]] = line[1]
    return ans

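# For reference: the `fsx_to_s3` helper CLI invoked above is expected to print one
# tab-separated "local_path<TAB>s3_uri" pair per line on stdout. A minimal sketch of that
# parsing convention in isolation (the sample paths are hypothetical):
#
#     sample_stdout = "/mnt/fsx/run/out.fastq\ts3://bucket/prefix/out.fastq\n"
#     mapping = dict(line.split("\t") for line in sample_stdout.splitlines() if line.strip())
#     assert mapping == {"/mnt/fsx/run/out.fastq": "s3://bucket/prefix/out.fastq"}
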
def update_status_json(logger, task, run_ids, s3_wd_uri, entries):
    """
    Post short-read-mngs workflow status JSON files to the output S3 bucket. These status files
    were originally created by idseq-dag and are used to display pipeline progress in the IDseq
    webapp. We update them at the beginning and end of each task (carefully, because some tasks
    run concurrently).
    """
    global _status_json, _status_json_lock

    if not s3_wd_uri:
        return

    try:
        # Figure out workflow and step names:
        # e.g. run_ids = ["host_filter", "call-validate_input"]
        workflow_name = run_ids[0]
        if workflow_name in (
            "czid_host_filter",
            "czid_non_host_alignment",
            "czid_postprocess",
            "czid_experimental",
        ):
            workflow_name = "_".join(workflow_name.split("_")[1:])

        # Parse --step-name from the task command template. For historical reasons, the status
        # JSON keys use this name, which is not the same as the WDL task name.
        step_name = task.name  # use WDL task name as default
        step_name_re = re.compile(r"--step-name\s+(\S+)\s")
        for part in task.command.parts:
            m = step_name_re.search(part) if isinstance(part, str) else None
            if m:
                step_name = m.group(1)
        assert step_name, "reading --step-name from task command"

        # Update _status_json, which accumulates over the course of workflow execution.
        with _status_json_lock:
            status = _status_json.setdefault(step_name, {})
            for k, v in entries.items():
                status[k] = v

            # Upload it
            logger.verbose(_("update_status_json", step_name=step_name, status=status))
            status_uri = os.path.join(s3_wd_uri, f"{workflow_name}_status2.json")
            s3_object(status_uri).put(Body=json.dumps(_status_json).encode())
    except Exception as exn:
        logger.error(
            _(
                "update_status_json failed",
                error=str(exn),
                s3_wd_uri=s3_wd_uri,
                run_ids=run_ids,
            )
        )

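# As an illustration of the --step-name extraction above: the regex looks for the flag inside
# the raw string parts of the task's command template. The command fragment below is
# hypothetical.
#
#     step_name_re = re.compile(r"--step-name\s+(\S+)\s")
#     part = "idseq-dag-run-step --workflow-name host_filter --step-name validate_input \\\n"
#     m = step_name_re.search(part)
#     assert m and m.group(1) == "validate_input"
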
def upload_file(abs_fn, s3uri):
    s3cp(logger, abs_fn, s3uri)

    # record in _uploaded_files (keyed by inode, so that it can be found from any
    # symlink or hardlink)
    with _uploaded_files_lock:
        _uploaded_files[inode(abs_fn)] = s3uri
        if inode(abs_fn) in _cached_files:
            cache_put(cfg, logger, *_cached_files[inode(abs_fn)])

    logger.info(_("task output uploaded", file=abs_fn, uri=s3uri))

def rewriter(fd):
    try:
        return _uploaded_files[inode(fd.value)]
    except Exception:
        logger.warning(
            _(
                "output file or directory wasn't uploaded to S3; keeping local path in outputs.s3.json",
                path=fd.value,
            )
        )
        return fd.value

def rewriter(fn):
    try:
        return _uploaded_files[inode(fn)]
    except Exception:
        logger.warning(
            _(
                "output file wasn't uploaded to S3; keeping local path in outputs.s3.json",
                file=fn,
            )
        )
        return fn

def rewriter(fn):
    try:
        return uploaded[fn]
    except KeyError:
        logger.warning(
            _(
                "output file wasn't written to S3; keeping local path in outputs.s3.json",
                file=fn,
            )
        )
        return fn

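# A minimal sketch of how a rewriter like the ones above is applied: miniwdl's
# WDL.Value.rewrite_env_paths() maps every File/Directory value in an output environment
# through the rewriter (as cache_put() below does), and WDL.values_to_json() serializes the
# result for outputs.s3.json. The bindings and the `uploaded` table here are hypothetical.
#
#     uploaded = {"/mnt/run/out.fastq": "s3://bucket/prefix/out.fastq"}
#     outputs = WDL.Env.Bindings().bind("out_fastq", WDL.Value.File("/mnt/run/out.fastq"))
#     remapped = WDL.Value.rewrite_env_paths(outputs, lambda fd: uploaded.get(fd.value, fd.value))
#     print(json.dumps(WDL.values_to_json(remapped, namespace="my_task"), indent=2))
#     # {"my_task.out_fastq": "s3://bucket/prefix/out.fastq"}
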
def s3cp(logger, fn, s3uri):
    cmd = ["s3parcp", fn, s3uri]
    logger.debug(" ".join(cmd))
    rslt = subprocess.run(cmd, stderr=subprocess.PIPE)
    if rslt.returncode != 0:
        logger.error(
            _(
                "failed uploading output file",
                cmd=" ".join(cmd),
                exit_status=rslt.returncode,
                stderr=rslt.stderr.decode("utf-8"),
            )
        )
        raise WDL.Error.RuntimeError("failed: " + " ".join(cmd))

def s3cp(logger, fn, s3uri):
    # Shell out to `aws s3 cp` instead of calling boto3 directly, to minimize contention added
    # to miniwdl's GIL.
    cmd = ["aws", "s3", "cp", fn, s3uri, "--follow-symlinks", "--only-show-errors"]
    logger.debug(" ".join(cmd))
    rslt = subprocess.run(cmd, stderr=subprocess.PIPE)
    if rslt.returncode != 0:
        logger.error(
            _(
                "failed uploading output file",
                cmd=" ".join(cmd),
                exit_status=rslt.returncode,
                stderr=rslt.stderr.decode("utf-8"),
            )
        )
        raise WDL.Error.RuntimeError("failed: " + " ".join(cmd))

def task(cfg, logger, run_id, run_dir, task, **recv):
    """
    On completion of any task, upload its output files to S3, and record the S3 URI
    corresponding to each local file (keyed by inode) in _uploaded_files.
    """
    logger = logger.getChild("s3_progressive_upload")

    # ignore inputs
    recv = yield recv
    # ignore command/runtime/container
    recv = yield recv

    if not cfg.has_option("s3_progressive_upload", "uri_prefix"):
        logger.debug("skipping because MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX is unset")
    elif not run_id[-1].startswith("download-"):
        s3prefix = cfg["s3_progressive_upload"]["uri_prefix"]
        assert s3prefix.startswith("s3://"), "MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX invalid"

        # for each file under output_links
        def _raise(ex):
            raise ex

        links_dir = os.path.join(run_dir, "output_links")
        for (dn, subdirs, files) in os.walk(links_dir, onerror=_raise):
            assert dn == links_dir or dn.startswith(links_dir + "/")
            for fn in files:
                # upload to S3
                abs_fn = os.path.join(dn, fn)
                # s3uri = os.path.join(s3prefix, *run_id[1:], dn[(len(links_dir) + 1) :], fn)
                s3uri = os.path.join(s3prefix, os.path.basename(fn))
                s3cp(logger, abs_fn, s3uri)

                # record in _uploaded_files (keyed by inode, so that it can be found from any
                # symlink or hardlink)
                with _uploaded_files_lock:
                    _uploaded_files[inode(abs_fn)] = s3uri
                logger.info(_("task output uploaded", file=abs_fn, uri=s3uri))

        # write outputs_s3.json using _uploaded_files
        write_outputs_s3_json(
            logger, recv["outputs"], run_dir, os.path.join(s3prefix, *run_id[1:]), task.name
        )

    yield recv

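# For context: miniwdl discovers task plugins such as the coroutine above via setuptools entry
# points in the "miniwdl.plugin.task" group. A minimal registration sketch (the distribution
# and module names here are hypothetical):
#
#     # setup.py
#     from setuptools import setup
#
#     setup(
#         name="miniwdl-s3-progressive-upload",       # hypothetical distribution name
#         py_modules=["s3_progressive_upload"],       # hypothetical module containing task()
#         entry_points={
#             "miniwdl.plugin.task": [
#                 "s3_progressive_upload = s3_progressive_upload:task",
#             ],
#         },
#     )
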
def cache_put(cfg: config.Loader, logger: logging.Logger, key: str, outputs: Env.Bindings[Value.Base]):
    if not (
        cfg["call_cache"].get_bool("put")
        and cfg["call_cache"]["backend"] == "s3_progressive_upload_call_cache_backend"
    ):
        return

    missing = False

    def cache(v: Union[Value.File, Value.Directory]) -> str:
        nonlocal missing
        missing = missing or inode(str(v.value)) not in _uploaded_files
        if missing:
            return ""
        return _uploaded_files[inode(str(v.value))]

    remapped_outputs = Value.rewrite_env_paths(outputs, cache)
    if not missing and cfg.has_option("s3_progressive_upload", "uri_prefix"):
        uri = os.path.join(get_s3_put_prefix(cfg), "cache", f"{key}.json")
        s3_object(uri).put(Body=json.dumps(values_to_json(remapped_outputs)).encode())
        flag_temporary(uri)
        logger.info(_("call cache insert", cache_file=uri))

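# For reference, the object cache_put() writes to .../cache/<key>.json is just the JSON
# serialization of the remapped outputs, with each File/Directory path replaced by its uploaded
# S3 URI. A hypothetical example of its contents:
#
#     {
#         "my_task.out_fastq": "s3://bucket/prefix/out.fastq",
#         "my_task.read_count": 42
#     }
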
def stderr_callback(line):
    nonlocal last_stderr_json
    line2 = line.strip()
    parsed = False
    if line2.startswith("{") and line2.endswith("}"):
        try:
            d = json.loads(line)
            assert isinstance(d, dict)
            msg = ""
            if "message" in d:
                msg = d["message"]
                del d["message"]
            elif "msg" in d:
                msg = d["msg"]
                del d["msg"]
            stderr_logger.verbose(_(msg.strip(), **d))
            last_stderr_json = d
            parsed = True
        except Exception:
            pass
    if not parsed:
        stderr_logger.verbose(line.rstrip())

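# For illustration, a hypothetical structured stderr line and what stderr_callback extracts
# from it: the "message"/"msg" key becomes the log message text, and the remaining keys are
# passed along as structured fields.
#
#     line = '{"message": "aws sts get-caller-identity", "Account": "123456789012"}'
#     d = json.loads(line)          # {"message": ..., "Account": ...}
#     msg = d.pop("message")        # "aws sts get-caller-identity"
#     # stderr_logger.verbose(_(msg, **d)) logs the message with Account as a structured field
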
def task(cfg, logger, run_id, run_dir, task, **recv):
    t_0 = time.time()

    s3_wd_uri = recv["inputs"].get("s3_wd_uri", None)
    if s3_wd_uri and s3_wd_uri.value:
        s3_wd_uri = s3_wd_uri.value
        update_status_json(
            logger,
            task,
            run_id,
            s3_wd_uri,
            {"status": "running", "start_time": time.time()},
        )

    # First yield point -- through which we'll get the task inputs. Also, the 'task' object is a
    # WDL.Task through which we have access to the full AST of the task source code.
    # https://miniwdl.readthedocs.io/en/latest/WDL.html#WDL.Tree.Task
    # Pending proper documentation for this interface, see the detailed comments in this example:
    # https://github.com/chanzuckerberg/miniwdl/blob/main/examples/plugin_task_omnibus/miniwdl_task_omnibus_example.py
    recv = yield recv

    # Provide a callback for stderr log messages that attempts to parse them as JSON and pass
    # them on in structured form.
    stderr_logger = logger.getChild("stderr")
    last_stderr_json = None

    def stderr_callback(line):
        nonlocal last_stderr_json
        line2 = line.strip()
        parsed = False
        if line2.startswith("{") and line2.endswith("}"):
            try:
                d = json.loads(line)
                assert isinstance(d, dict)
                msg = ""
                if "message" in d:
                    msg = d["message"]
                    del d["message"]
                elif "msg" in d:
                    msg = d["msg"]
                    del d["msg"]
                stderr_logger.verbose(_(msg.strip(), **d))
                last_stderr_json = d
                parsed = True
            except Exception:
                pass
        if not parsed:
            stderr_logger.verbose(line.rstrip())

    recv["container"].stderr_callback = stderr_callback

    # Inject a command to log `aws sts get-caller-identity`, to confirm
    # AWS_CONTAINER_CREDENTIALS_RELATIVE_URI is passed through & effective.
    if not run_id[-1].startswith("download-"):
        recv["command"] = (
            """aws sts get-caller-identity | jq -c '. + {message: "aws sts get-caller-identity"}' 1>&2\n\n"""
            + recv["command"]
        )

    try:
        recv = yield recv

        # After task completion -- log elapsed time in structured form, to be picked up by
        # CloudWatch Logs. We also have access to the task outputs in recv.
        t_elapsed = time.time() - t_0
        logger.notice(
            _(
                "SFN-WDL task done",
                run_id=run_id[-1],
                task_name=task.name,
                elapsed_seconds=round(t_elapsed, 3),
            )
        )
    except Exception as exn:
        if s3_wd_uri:
            # Read the error message to determine the status: user_errored or pipeline_errored.
            status = dict(status="pipeline_errored")
            msg = str(exn)
            if last_stderr_json and "wdl_error_message" in last_stderr_json:
                msg = last_stderr_json.get("cause", last_stderr_json["wdl_error_message"])
                if last_stderr_json.get("error", None) == "InvalidInputFileError":
                    status = dict(status="user_errored")
                if "step_description_md" in last_stderr_json:
                    status.update(description=last_stderr_json["step_description_md"])
            status.update(error=msg, end_time=time.time())
            update_status_json(logger, task, run_id, s3_wd_uri, status)
        raise

    if s3_wd_uri:
        status = {
            "status": "uploaded",
            "end_time": time.time(),
        }
        if "step_description_md" in recv["outputs"]:
            # idseq_dag steps may dynamically generate their description to reflect different
            # behaviors based on the input. The WDL tasks output this as a String value.
            status["description"] = recv["outputs"]["step_description_md"].value
        update_status_json(logger, task, run_id, s3_wd_uri, status)

    # do nothing with outputs
    yield recv