Example No. 1
def fsx_to_s3(logger, run_dir, files):
    files = [os.path.join(run_dir, fn) for fn in files]
    files = [fn for fn in files if os.path.exists(fn)]
    try:
        logger.debug(_("writing", files=files))
        proc = subprocess.run(
            ["fsx_to_s3"] + files,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            universal_newlines=True,
        )
    except subprocess.CalledProcessError as exn:
        logger.error(_("failed writing output file(s) to S3", stderr=proc.stderr))
        raise WDL.Error.RuntimeError("failed writing output file(s) to S3") from exn
    # read stdout table of local filenames & uploaded URIs
    ans = {}
    for line in proc.stdout.split("\n"):
        line = line.strip()
        if line:
            fields = line.split("\t")
            assert len(fields) == 2 and fields[0] not in ans
            logger.info(_("wrote", uri=fields[1], size=os.path.getsize(fields[0])))
            ans[fields[0]] = fields[1]
    return ans
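
A note on context: these snippets are excerpts from miniwdl plugin modules, so their import statements are not shown. A typical preamble they assume (the `_` used in the log calls is miniwdl's StructuredLogMessage; the exact set varies per snippet) is roughly:

import json
import logging
import os
import re
import subprocess
import threading
import time
from typing import Union

import WDL
from WDL import Env, Value, values_to_json
from WDL._util import StructuredLogMessage as _  # the _("message", key=value) log records
from WDL.runtime import config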
Example No. 2
def update_status_json(logger, task, run_ids, s3_wd_uri, entries):
    """
    Post short-read-mngs workflow status JSON files to the output S3 bucket. These status files
    were originally created by idseq-dag and are used to display pipeline progress in the IDseq
    webapp. We update the file at the beginning and end of each task (carefully, because some
    tasks run concurrently).
    """
    global _status_json, _status_json_lock

    if not s3_wd_uri:
        return

    try:
        # Figure out workflow and step names:
        # e.g. run_ids = ["host_filter", "call-validate_input"]
        workflow_name = run_ids[0]
        if workflow_name in (
            "czid_host_filter",
            "czid_non_host_alignment",
            "czid_postprocess",
            "czid_experimental",
        ):
            workflow_name = "_".join(workflow_name.split("_")[1:])
            # parse --step-name from the task command template. For historical reasons, the status JSON
            # keys use this name and it's not the same as the WDL task name.
            step_name = task.name  # use WDL task name as default
            step_name_re = re.compile(r"--step-name\s+(\S+)\s")
            for part in task.command.parts:
                m = step_name_re.search(part) if isinstance(part, str) else None
                if m:
                    step_name = m.group(1)
            assert step_name, "reading --step-name from task command"

            # Update _status_json which is accumulating over the course of workflow execution.
            with _status_json_lock:
                status = _status_json.setdefault(step_name, {})
                for k, v in entries.items():
                    status[k] = v

                # Upload it
                logger.verbose(
                    _("update_status_json", step_name=step_name, status=status)
                )
                status_uri = os.path.join(s3_wd_uri, f"{workflow_name}_status2.json")
                s3_object(status_uri).put(Body=json.dumps(_status_json).encode())
    except Exception as exn:
        logger.error(
            _(
                "update_status_json failed",
                error=str(exn),
                s3_wd_uri=s3_wd_uri,
                run_ids=run_ids,
            )
        )
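
update_status_json relies on module-level shared state declared via the global statement above; that state isn't shown on this page, but a minimal sketch of it (names taken from the code, definitions assumed) would be:

# shared across concurrently-running tasks within the miniwdl process;
# maps step_name -> {"status": ..., "start_time": ..., ...}
_status_json: dict = {}
_status_json_lock = threading.Lock()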
def upload_file(abs_fn, s3uri):
    s3cp(logger, abs_fn, s3uri)
    # record in _uploaded_files (keyed by inode, so that it can be found from any
    # symlink or hardlink)
    with _uploaded_files_lock:
        _uploaded_files[inode(abs_fn)] = s3uri
        if inode(abs_fn) in _cached_files:
            cache_put(cfg, logger, *_cached_files[inode(abs_fn)])
    logger.info(_("task output uploaded", file=abs_fn, uri=s3uri))
def rewriter(fd):
    try:
        return _uploaded_files[inode(fd.value)]
    except Exception:
        logger.warning(
            _(
                "output file or directory wasn't uploaded to S3; keeping local path in outputs.s3.json",
                path=fd.value,
            ))
        return fd.value
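
The inode() helper used for the _uploaded_files keys is also not shown. A plausible definition (hypothetical; the real helper may differ) that makes any symlink or hardlink to the same file map to the same key:

def inode(path: str):
    # resolve symlinks, then key on (device, inode) so hardlinks collapse as well
    st = os.stat(os.path.realpath(path))
    return (st.st_dev, st.st_ino)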
Example No. 5
def rewriter(fn):
    try:
        return _uploaded_files[inode(fn)]
    except Exception:
        logger.warning(
            _(
                "output file wasn't uploaded to S3; keeping local path in outputs.s3.json",
                file=fn,
            ))
        return fn
Example No. 6
def rewriter(fn):
    try:
        return uploaded[fn]
    except KeyError:
        logger.warning(
            _(
                "output file wasn't written to S3; keeping local path in outputs.s3.json",
                file=fn,
            )
        )
        return fn
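
Example No. 9 below calls a write_outputs_s3_json helper that isn't shown on this page. A hypothetical sketch of it, applying a rewriter like the ones above through Value.rewrite_env_paths (the actual helper may differ):

def write_outputs_s3_json(logger, outputs, run_dir, s3prefix, namespace):
    def rewriter(v):
        try:
            return _uploaded_files[inode(str(v.value))]
        except Exception:
            logger.warning(
                _("output not uploaded to S3; keeping local path", path=str(v.value))
            )
            return str(v.value)

    with _uploaded_files_lock:
        remapped = Value.rewrite_env_paths(outputs, rewriter)

    # write outputs.s3.json locally, then copy it up alongside the other outputs
    fn = os.path.join(run_dir, "outputs.s3.json")
    with open(fn, "w") as outfile:
        json.dump(values_to_json(remapped, namespace=namespace), outfile, indent=2)
    s3cp(logger, fn, os.path.join(s3prefix, "outputs.s3.json"))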
Example No. 7
def s3cp(logger, fn, s3uri):
    cmd = ["s3parcp", fn, s3uri]
    logger.debug(" ".join(cmd))
    rslt = subprocess.run(cmd, stderr=subprocess.PIPE)
    if rslt.returncode != 0:
        logger.error(
            _(
                "failed uploading output file",
                cmd=" ".join(cmd),
                exit_status=rslt.returncode,
                stderr=rslt.stderr.decode("utf-8"),
            ))
        raise WDL.Error.RuntimeError("failed: " + " ".join(cmd))
def s3cp(logger, fn, s3uri):
    # shell out to `aws s3 cp` instead of calling boto3 directly, to minimize contention added to
    # miniwdl's GIL
    cmd = ["aws", "s3", "cp", fn, s3uri, "--follow-symlinks", "--only-show-errors"]
    logger.debug(" ".join(cmd))
    rslt = subprocess.run(cmd, stderr=subprocess.PIPE)
    if rslt.returncode != 0:
        logger.error(
            _(
                "failed uploading output file",
                cmd=" ".join(cmd),
                exit_status=rslt.returncode,
                stderr=rslt.stderr.decode("utf-8"),
            )
        )
        raise WDL.Error.RuntimeError("failed: " + " ".join(cmd))
Example No. 9
def task(cfg, logger, run_id, run_dir, task, **recv):
    """
    on completion of any task, upload its output files to S3, and record the S3 URI corresponding
    to each local file (keyed by inode) in _uploaded_files
    """
    logger = logger.getChild("s3_progressive_upload")

    # ignore inputs
    recv = yield recv
    # ignore command/runtime/container
    recv = yield recv

    if not cfg.has_option("s3_progressive_upload", "uri_prefix"):
        logger.debug(
            "skipping because MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX is unset"
        )
    elif not run_id[-1].startswith("download-"):
        s3prefix = cfg["s3_progressive_upload"]["uri_prefix"]
        assert s3prefix.startswith(
            "s3://"), "MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX invalid"

        # for each file under output_links
        def _raise(ex):
            raise ex

        links_dir = os.path.join(run_dir, "output_links")
        for (dn, subdirs, files) in os.walk(links_dir, onerror=_raise):
            assert dn == links_dir or dn.startswith(links_dir + "/")
            for fn in files:
                # upload to S3
                abs_fn = os.path.join(dn, fn)
                # s3uri = os.path.join(s3prefix, *run_id[1:], dn[(len(links_dir) + 1) :], fn)
                s3uri = os.path.join(s3prefix, os.path.basename(fn))
                s3cp(logger, abs_fn, s3uri)
                # record in _uploaded_files (keyed by inode, so that it can be found from any
                # symlink or hardlink)
                with _uploaded_files_lock:
                    _uploaded_files[inode(abs_fn)] = s3uri
                logger.info(_("task output uploaded", file=abs_fn, uri=s3uri))

        # write outputs_s3.json using _uploaded_files
        write_outputs_s3_json(logger, recv["outputs"], run_dir,
                              os.path.join(s3prefix, *run_id[1:]), task.name)

    yield recv
def cache_put(cfg: config.Loader, logger: logging.Logger, key: str,
              outputs: Env.Bindings[Value.Base]):
    if not (cfg["call_cache"].get_bool("put") and cfg["call_cache"]["backend"]
            == "s3_progressive_upload_call_cache_backend"):
        return

    missing = False

    def cache(v: Union[Value.File, Value.Directory]) -> str:
        nonlocal missing
        missing = missing or inode(str(v.value)) not in _uploaded_files
        if missing:
            return ""
        return _uploaded_files[inode(str(v.value))]

    remapped_outputs = Value.rewrite_env_paths(outputs, cache)
    if not missing and cfg.has_option("s3_progressive_upload", "uri_prefix"):
        uri = os.path.join(get_s3_put_prefix(cfg), "cache", f"{key}.json")
        s3_object(uri).put(
            Body=json.dumps(values_to_json(remapped_outputs)).encode())
        flag_temporary(uri)
        logger.info(_("call cache insert", cache_file=uri))
Example No. 11
def stderr_callback(line):
    nonlocal last_stderr_json
    line2 = line.strip()
    parsed = False
    if line2.startswith("{") and line2.endswith("}"):
        try:
            d = json.loads(line)
            assert isinstance(d, dict)
            msg = ""
            if "message" in d:
                msg = d["message"]
                del d["message"]
            elif "msg" in d:
                msg = d["msg"]
                del d["msg"]
            stderr_logger.verbose(_(msg.strip(), **d))
            last_stderr_json = d
            parsed = True
        except Exception:
            pass
    if not parsed:
        stderr_logger.verbose(line.rstrip())
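
For illustration, a task that prints a single-line JSON record to stderr (hypothetical content below) is logged in structured form rather than as raw text:

# hypothetical stderr line emitted by a task
line = '{"message": "uploaded chunk", "chunk": 3, "bytes": 1048576}'

# stderr_callback(line) would log roughly
#   stderr_logger.verbose(_("uploaded chunk", chunk=3, bytes=1048576))
# and keep {"chunk": 3, "bytes": 1048576} in last_stderr_json; any line that
# isn't a JSON object falls through to a plain stderr_logger.verbose(line.rstrip()).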
Example No. 12
def task(cfg, logger, run_id, run_dir, task, **recv):
    t_0 = time.time()

    s3_wd_uri = recv["inputs"].get("s3_wd_uri", None)
    if s3_wd_uri and s3_wd_uri.value:
        s3_wd_uri = s3_wd_uri.value
        update_status_json(
            logger,
            task,
            run_id,
            s3_wd_uri,
            {"status": "running", "start_time": time.time()},
        )

    # First yield point -- through which we'll get the task inputs. Also, the 'task' object is a
    # WDL.Task through which we have access to the full AST of the task source code.
    #   https://miniwdl.readthedocs.io/en/latest/WDL.html#WDL.Tree.Task
    # pending proper documentation for this interface, see the detailed comments in this example:
    #   https://github.com/chanzuckerberg/miniwdl/blob/main/examples/plugin_task_omnibus/miniwdl_task_omnibus_example.py
    recv = yield recv

    # provide a callback for stderr log messages that attempts to parse them as JSON and pass them
    # on in structured form
    stderr_logger = logger.getChild("stderr")
    last_stderr_json = None

    def stderr_callback(line):
        nonlocal last_stderr_json
        line2 = line.strip()
        parsed = False
        if line2.startswith("{") and line2.endswith("}"):
            try:
                d = json.loads(line)
                assert isinstance(d, dict)
                msg = ""
                if "message" in d:
                    msg = d["message"]
                    del d["message"]
                elif "msg" in d:
                    msg = d["msg"]
                    del d["msg"]
                stderr_logger.verbose(_(msg.strip(), **d))
                last_stderr_json = d
                parsed = True
            except Exception:
                pass
        if not parsed:
            stderr_logger.verbose(line.rstrip())

    recv["container"].stderr_callback = stderr_callback

    # inject command to log `aws sts get-caller-identity` to confirm AWS_CONTAINER_CREDENTIALS_RELATIVE_URI
    # is passed through & effective
    if not run_id[-1].startswith("download-"):
        recv["command"] = (
            """aws sts get-caller-identity | jq -c '. + {message: "aws sts get-caller-identity"}' 1>&2\n\n"""
            + recv["command"]
        )

    try:
        recv = yield recv

        # After task completion -- logging elapsed time in structured form, to be picked up by
        # CloudWatch Logs. We also have access to the task outputs in recv.
        t_elapsed = time.time() - t_0
        logger.notice(
            _(
                "SFN-WDL task done",
                run_id=run_id[-1],
                task_name=task.name,
                elapsed_seconds=round(t_elapsed, 3),
            )
        )
    except Exception as exn:
        if s3_wd_uri:
            # read the error message to determine status user_errored or pipeline_errored
            status = dict(status="pipeline_errored")
            msg = str(exn)
            if last_stderr_json and "wdl_error_message" in last_stderr_json:
                msg = last_stderr_json.get(
                    "cause", last_stderr_json["wdl_error_message"]
                )
                if last_stderr_json.get("error", None) == "InvalidInputFileError":
                    status = dict(status="user_errored")
                if "step_description_md" in last_stderr_json:
                    status.update(description=last_stderr_json["step_description_md"])
            status.update(error=msg, end_time=time.time())
            update_status_json(logger, task, run_id, s3_wd_uri, status)
        raise

    if s3_wd_uri:
        status = {
            "status": "uploaded",
            "end_time": time.time(),
        }
        if "step_description_md" in recv["outputs"]:
            # idseq_dag steps may dynamically generate their description to reflect different
            # behaviors based on the input. The WDL tasks output this as a String value.
            status["description"] = recv["outputs"]["step_description_md"].value
        update_status_json(logger, task, run_id, s3_wd_uri, status)

    # do nothing with outputs
    yield recv
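
These task() coroutines are miniwdl task plugins. Assuming the standard miniwdl plugin mechanism, they would be registered through a setuptools entry point in the miniwdl.plugin.task group; a sketch (package and module names here are hypothetical):

# setup.py
from setuptools import setup

setup(
    name="sfn-wdl-miniwdl-plugins",
    py_modules=["s3_progressive_upload"],
    entry_points={
        "miniwdl.plugin.task": [
            "s3_progressive_upload = s3_progressive_upload:task",
        ],
    },
)

# the plugin then reads cfg["s3_progressive_upload"]["uri_prefix"], which miniwdl lets
# you supply via the MINIWDL__S3_PROGRESSIVE_UPLOAD__URI_PREFIX environment variable
# referenced in the log message of Example No. 9.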