Example #1
def embed_all_runs(workflow_tool, cwl_args=None, location=None):
    """
    Tries to find and load all "run" fields from the "workflow_tool"
    if it is a Workflow; otherwise nothing is replaced. "cwl_args" can
    be used to update the default arguments used by the loading and
    runtime contexts. If "location" is provided, saves the resulting
    workflow to a JSON file. Returns the workflow tool with all "run"
    fields replaced.
    """
    def __embed(workflow_tool, cwl_args=None):
        if isinstance(workflow_tool, MutableSequence):
            for item in workflow_tool:
                __embed(item, cwl_args)
        elif isinstance(workflow_tool, MutableMapping):
            if "run" in workflow_tool and isinstance(workflow_tool["run"],
                                                     str):
                workflow_tool["run"] = slow_cwl_load(
                    workflow=workflow_tool["run"],
                    cwl_args=cwl_args,
                    only_tool=True)
            for item in workflow_tool.values():
                __embed(item, cwl_args)

    if workflow_tool["class"] == "Workflow":
        workflow_tool_copy = deepcopy(workflow_tool)
        __embed(workflow_tool_copy, cwl_args)
    else:
        workflow_tool_copy = workflow_tool

    if location is not None:
        dump_json(workflow_tool_copy, location)

    return workflow_tool_copy
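
A minimal usage sketch for the example above; the import path
"cwl_airflow.utilities.cwl" and the file names are assumptions, not
something the snippet itself confirms:

from cwl_airflow.utilities.cwl import (  # assumed module path
    embed_all_runs,
    slow_cwl_load
)

# Parse the top-level workflow, then inline every string "run" reference
# so the result can be shipped as one self-contained JSON file.
workflow_tool = slow_cwl_load(workflow="workflow.cwl", only_tool=True)
embedded_tool = embed_all_runs(
    workflow_tool=workflow_tool,
    location="workflow_embedded.json")  # also dumps the result to disk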
Example #2
def execute_workflow_step(workflow,
                          task_id,
                          job_data,
                          cwl_args=None,
                          executor=None):
    """
    Constructs and executes a single-step workflow based on the
    "workflow" and "task_id". "cwl_args" can be used to update the
    default parameters used for the loading and runtime contexts.
    Exports a JSON file with the execution results.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    executor = SingleJobExecutor() if executor is None else executor

    step_tmp_folder, step_cache_folder, step_outputs_folder, step_report = get_temp_folders(
        task_id=task_id, job_data=job_data)

    default_cwl_args = get_default_cwl_args(cwl_args)

    default_cwl_args.update({  # add execution specific parameters
        "tmp_outdir_prefix": step_cache_folder + "/",
        "tmpdir_prefix": step_cache_folder + "/",
        "cidfile_dir": step_tmp_folder,
        "cidfile_prefix": task_id,
        "basedir": os.getcwd(
        ),  # job should already have abs path for inputs, so this is useless
        "outdir": step_outputs_folder
    })

    workflow_step_path = os.path.join(step_tmp_folder,
                                      task_id + "_step_workflow.cwl")

    fast_cwl_step_load(  # will save the new workflow to "workflow_step_path"
        workflow=workflow,
        target_id=task_id,
        cwl_args=default_cwl_args,
        location=workflow_step_path)

    _stderr = sys.stderr  # to trick the logger
    sys.stderr = sys.__stderr__
    step_outputs, step_status = executor(
        slow_cwl_load(workflow=workflow_step_path, cwl_args=default_cwl_args),
        job_data, RuntimeContext(default_cwl_args))
    sys.stderr = _stderr

    if step_status != "success":
        raise ValueError("Failed to run workflow step")

    # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
    visit_class(step_outputs, ("File", ), MutationManager().unset_generation)

    dump_json(step_outputs, step_report)

    return step_outputs, step_report
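
A hypothetical invocation; the import path and the "job_data" content are
assumptions ("job_data" must at least carry whatever get_temp_folders
needs to build the step folders):

from cwl_airflow.utilities.cwl import execute_workflow_step  # assumed module path

job_data = {"tmp_folder": "/tmp/dag_run_1"}  # assumed minimal content
step_outputs, step_report = execute_workflow_step(
    workflow="workflow.cwl",
    task_id="sort_step",  # id of the step to construct and run
    job_data=job_data,
    cwl_args={"use_container": False})  # merged by get_default_cwl_args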
Example #3
def relocate_outputs(workflow, job_data, cwl_args=None):
    """
    Moves or copies filtered outputs to "outputs_folder" depending on
    the "runtime_context.move_outputs" value; "tmp_folder", however, is
    not deleted here, as that happens when the DAG finishes running.
    Saves a report with the relocated outputs as "workflow_report.json"
    in "outputs_folder". Maps outputs from "workflow" back to normal
    (from step_id_step_out to the workflow output) and filters
    "job_data" based on them (combining items from "job_data" into a
    list based on "outputSource" if it was a list). "cwl_args" can be
    used to update the default parameters used for the loading and
    runtime contexts.
    """

    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    # Filter "job_data" to include only items required by workflow outputs.
    # Remap keys to the proper workflow outputs IDs (without step id).
    # If "outputSource" was a list even of len=1, find all correspondent items
    # from the "job_data" and assign them as list of the same size.
    job_data_copy = deepcopy(job_data)
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        collected_job_items = []
        for source_id, _ in get_items(output_data["outputSource"]):
            collected_job_items.append(job_data_copy[source_id.replace(
                "/", "_")])
        if isinstance(output_data["outputSource"], list):
            filtered_job_data[output_id] = collected_job_items
        else:
            filtered_job_data[output_id] = collected_job_items[0]

    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        source_directories=[
            job_data_copy["tmp_folder"]
        ],  # must be tmp_folder, otherwise tmp data can't be deleted when action is "move"
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump report with relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")

    dump_json(relocated_job_data, workflow_report)

    return relocated_job_data, workflow_report
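
A hypothetical call; the import path, ids, and the "move_outputs"
override are assumptions. The body above reads "outputs_folder" and
"tmp_folder" from "job_data" and looks up items by "outputSource" with
"/" replaced by "_":

from cwl_airflow.utilities.cwl import relocate_outputs  # assumed module path

job_data = {
    "outputs_folder": "/data/results/run_1",  # where outputs are relocated
    "tmp_folder": "/tmp/run_1",               # searched when action is "move"
    "sort_step_sorted_file": {"class": "File",
                              "location": "/tmp/run_1/out.txt"}
}
relocated_job_data, workflow_report = relocate_outputs(
    workflow="workflow.cwl",
    job_data=job_data,
    cwl_args={"move_outputs": "copy"})  # assumed override to copy, not move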
Example #4
    def execute(self, context):
        """
        Loads the job object from the context. Sets "tmp_folder" and
        "outputs_folder" if they have not already been set in the job.
        If "tmp_folder" and/or "outputs_folder" were read from the job
        and are relative, resolves them against the "tmp_folder" and/or
        "outputs_folder" from "cwl_args". Dumps the step outputs as a
        JSON file into "tmp_folder". Writes the report file location to
        XCom.
        """

        setup_cwl_logger(context["ti"])
        post_status(context)

        # for easy access
        dag_id = context["dag"].dag_id
        workflow = context["dag"].workflow
        run_id = context["run_id"].replace(":", "_").replace(
            "+", "_")  # to make it dumpable by json
        cwl_args = context["dag"].default_args["cwl"]

        # Loads job from dag_run configuration. Sets defaults from "workflow". Fails on missing input files
        job_data = load_job(workflow=workflow,
                            job=context["dag_run"].conf["job"],
                            cwl_args=cwl_args)

        job_data["tmp_folder"] = get_dir(
            get_absolute_path(
                job_data.get(
                    "tmp_folder",
                    mkdtemp(dir=cwl_args["tmp_folder"],
                            prefix=dag_id + "_" + run_id + "_")),
                cwl_args["tmp_folder"]))

        job_data["outputs_folder"] = get_dir(
            get_absolute_path(
                job_data.get(
                    "outputs_folder",
                    os.path.join(cwl_args["outputs_folder"], dag_id, run_id)),
                cwl_args["outputs_folder"]))

        _, _, _, step_report = get_temp_folders(task_id=self.task_id,
                                                job_data=job_data)

        dump_json(job_data, step_report)

        return step_report
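
A toy illustration of the relative-path resolution the docstring
describes; the values are hypothetical and the real work is done by
get_absolute_path and get_dir:

import os

job_tmp = "run_17"  # relative "tmp_folder" read from the job
base = "/data/tmp"  # cwl_args["tmp_folder"]
resolved = job_tmp if os.path.isabs(job_tmp) else os.path.join(base, job_tmp)
print(resolved)  # -> /data/tmp/run_17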
Example #5
def fast_cwl_step_load(workflow, target_id, cwl_args=None, location=None):
    """
    Returns a workflow (CommentedMap) that includes only the single
    step selected by "target_id" from the parsed "workflow". All other
    steps are removed. Workflow inputs and outputs are updated based on
    the source fields of "in" and "out" of the selected workflow step.
    If the selected step includes a "scatter" field, all output types
    are transformed into a nested or flat array of items of the same
    type. IDs of the updated workflow inputs and outputs, as well as
    the corresponding "source" fields, include the step id separated by
    an underscore. All other fields remain unchanged.

    "cwl_args" can be used to update the default location of
    "pickle_folder" used by "fast_cwl_load", as well as other
    parameters used by "slow_cwl_load" for the loading and runtime
    contexts.

    If "location" is not None, exports the modified workflow.
    """

    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_inputs = []
    workflow_outputs = []
    workflow_steps = []

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    selected_step = list(get_items(workflow_tool["steps"], target_id))[0][1]

    workflow_steps.append(selected_step)

    for _, step_in in get_items(selected_step.get(
            "in", [])):  # step might not have "in"

        updated_sources = []  # to keep track of updated sources

        for step_in_source, _ in get_items(step_in.get(
                "source", [])):  # "in" might not have "source"

            try:

                # try to find workflow input that corresponds to "source"

                workflow_input = list(
                    get_items(workflow_tool["inputs"], step_in_source))[0][1]

                updated_workflow_input = {
                    "id": step_in_source,
                    "type": workflow_input["type"]
                }

                # need to copy:
                #  original inputBinding because it can include loadContents section
                #  loadContents and loadListing sections if present outside of inputBinding
                #  both "default" and "secondaryFiles" if present
                # TODO: Do I need to copy format?
                for key in [
                        "default", "secondaryFiles", "inputBinding",
                        "loadContents", "loadListing"
                ]:
                    if key in workflow_input:
                        updated_workflow_input[key] = workflow_input[key]

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.

                if len(list(get_items(workflow_inputs, step_in_source))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source)

            except (IndexError, KeyError):

                # Need to find upstream step that corresponds to "source"

                upstream_step = list(
                    get_items(
                        workflow_tool["steps"],
                        get_short_id(step_in_source,
                                     only_step_name=True)))[0][1]

                # Need to load tool from "run" of the found upstream step
                # and look for the output that corresponds to "source".
                # We look for correspondence only based on "id"

                upstream_step_tool = fast_cwl_load(
                    workflow=upstream_step["run"], cwl_args=default_cwl_args)

                upstream_step_output = list(
                    get_items(
                        {
                            get_short_id(k, only_id=True): v
                            for k, v in get_items(
                                upstream_step_tool["outputs"])
                        },  # trick
                        get_short_id(step_in_source, only_id=True)))[0][1]

                step_in_source_with_step_id = step_in_source.replace(
                    "/", "_")  # to include both step name and id

                # Check if it should be assumed optional (default field is present)
                # NOTE: consider also checking if upstream step had scatter, so the
                # output type should become array based on the scatter parameters
                if "default" in step_in:
                    upstream_step_output_type = [
                        "null", upstream_step_output["type"]
                    ]
                else:
                    upstream_step_output_type = upstream_step_output["type"]

                updated_workflow_input = {
                    "id": step_in_source_with_step_id,
                    "type": upstream_step_output_type
                }

                # No need to copy "secondaryFiles" for outputs from other steps
                # because they should be already included into the generated json
                # report file
                # # TODO: Do I need to copy format to "workflow_inputs"?

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.

                if len(
                        list(
                            get_items(workflow_inputs,
                                      step_in_source_with_step_id))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source_with_step_id)

        # replace "source" in step's "in" if anything was updated
        if len(updated_sources) > 0:
            if isinstance(step_in["source"], list):
                step_in["source"] = updated_sources
            else:
                step_in["source"] = updated_sources[0]

    # Need to load tool from the "run" field of the selected step
    # and look for the outputs that correspond to the items from "out".
    # We look for correspondence only based on "id"

    selected_step_tool = fast_cwl_load(workflow=selected_step["run"],
                                       cwl_args=default_cwl_args)

    for step_out, _ in get_items(selected_step["out"]):
        selected_step_output = list(
            get_items(
                {
                    get_short_id(k, only_id=True): v
                    for k, v in get_items(selected_step_tool["outputs"])
                },  # trick
                get_short_id(step_out, only_id=True)))[0][1]
        step_out_with_step_id = step_out.replace(
            "/", "_")  # to include both step name and id

        # update output type in case of scatter
        if "scatter" in selected_step:
            selected_step_output = deepcopy(
                selected_step_output
            )  # need to deepcopy, otherwise we change embedded tool's output
            if isinstance(selected_step["scatter"], MutableSequence) \
                and selected_step.get("scatterMethod") == "nested_crossproduct":
                nesting = len(selected_step["scatter"])
            else:
                nesting = 1
            for _ in range(0, nesting):
                selected_step_output["type"] = {
                    "type": "array",
                    "items": selected_step_output["type"]
                }

        workflow_outputs.append({
            "id": step_out_with_step_id,
            "type": selected_step_output["type"],
            "outputSource": step_out
        })

    workflow_tool.update({
        "inputs": workflow_inputs,
        "outputs": workflow_outputs,
        "steps": workflow_steps
    })

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool
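
A short usage sketch plus a toy illustration of the scatter type
transform above; the import path, step id, and file names are
assumptions:

from cwl_airflow.utilities.cwl import fast_cwl_step_load  # assumed module path

step_workflow = fast_cwl_step_load(
    workflow="workflow.cwl",
    target_id="align_reads",          # step to isolate
    location="align_reads_step.cwl")  # also dumps the single-step workflow

# With scatterMethod == "nested_crossproduct" over two scattered inputs,
# an output of type "File" gets wrapped into a two-level array:
out_type = "File"
for _ in range(2):  # nesting == len(selected_step["scatter"]) == 2
    out_type = {"type": "array", "items": out_type}
# out_type == {"type": "array", "items": {"type": "array", "items": "File"}}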
Example #6
def relocate_outputs(workflow,
                     job_data,
                     cwl_args=None,
                     remove_tmp_folder=None):
    """
    Relocates filtered outputs to "outputs_folder" and, by default,
    removes "tmp_folder", unless "remove_tmp_folder" is set to False.
    Saves a report with the relocated outputs as "workflow_report.json"
    in "outputs_folder". Maps outputs from "workflow" back to normal
    (from step_id_step_out to the workflow output) and filters
    "job_data" based on them (combining items from "job_data" into a
    list based on "outputSource" if it was a list). "cwl_args" can be
    used to update the default parameters used for the loading and
    runtime contexts.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    remove_tmp_folder = True if remove_tmp_folder is None else remove_tmp_folder

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    # Filter "job_data" to include only items required by workflow outputs.
    # Remap keys to the proper workflow outputs IDs (without step id).
    # If "outputSource" was a list even of len=1, find all correspondent items
    # from the "job_data" and assign them as list of the same size.
    job_data_copy = deepcopy(job_data)
    filtered_job_data = {}
    for output_id, output_data in get_items(workflow_tool["outputs"]):
        collected_job_items = []
        for source_id, _ in get_items(output_data["outputSource"]):
            collected_job_items.append(job_data_copy[source_id.replace(
                "/", "_")])
        if isinstance(output_data["outputSource"], list):
            filtered_job_data[output_id] = collected_job_items
        else:
            filtered_job_data[output_id] = collected_job_items[0]

    # Outputs will always be copied because source_directories=[]
    runtime_context = RuntimeContext(default_cwl_args)
    relocated_job_data = relocateOutputs(
        outputObj=filtered_job_data,
        destination_path=job_data_copy["outputs_folder"],
        source_directories=[],  # placeholder, shouldn't influence anything
        action=runtime_context.move_outputs,
        fs_access=runtime_context.make_fs_access(""),
        compute_checksum=runtime_context.compute_checksum,
        path_mapper=runtime_context.path_mapper)

    # Dump report with relocated outputs
    workflow_report = os.path.join(job_data_copy["outputs_folder"],
                                   "workflow_report.json")

    dump_json(relocated_job_data, workflow_report)

    # Clean "tmp_folder"
    if remove_tmp_folder:
        shutil.rmtree(job_data_copy["tmp_folder"], ignore_errors=False)

    return relocated_job_data, workflow_report
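
A hypothetical call showing the flag this version adds; the import path
and folder names are assumptions. Pass remove_tmp_folder=False to keep
intermediate data for debugging, otherwise the whole "tmp_folder" is
deleted after relocation:

from cwl_airflow.utilities.cwl import relocate_outputs  # assumed module path

relocated_job_data, workflow_report = relocate_outputs(
    workflow="workflow.cwl",
    job_data={"outputs_folder": "/data/results/run_1",
              "tmp_folder": "/tmp/run_1"},  # assumed minimal content
    remove_tmp_folder=False)  # keep tmp data for inspection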
Example #7
def convert_to_workflow(command_line_tool, location=None):
    """
    Converts "command_line_tool" to Workflow trying to keep all
    important elements. If "command_line_tool" is already Workflow,
    doesn't apply any changes. If "location" is not None, dumps
    results to json file.
    """

    if command_line_tool["class"] == "Workflow":
        workflow_tool = command_line_tool
    else:
        workflow_tool = {
            "class": "Workflow",
            "cwlVersion": command_line_tool["cwlVersion"],
            "inputs": [],
            "outputs": []
        }

        for key in ["requirements"]:
            if key in command_line_tool:
                workflow_tool[key] = command_line_tool[key]

        for input_id, input_data in get_items(command_line_tool["inputs"]):
            workflow_input = {
                "id": input_id,
                "type": remove_field_from_dict(
                    input_data["type"], "inputBinding"
                )  # "type" in WorkflowInputParameter cannot have "inputBinding"
            }
            for key in ["secondaryFiles",
                        "default"]:  # TODO: Do I need to copy format?
                if key in input_data:
                    workflow_input[key] = input_data[key]
            workflow_tool["inputs"].append(workflow_input)

        for output_id, output_data in get_items(command_line_tool["outputs"]):
            workflow_output = {
                "id": output_id,
                "type": output_data["type"],
                "outputSource":
                    get_rootname(command_line_tool["id"]) + "/" + output_id
            }
            # TODO: not sure if I need format here
            # for key in ["format"]:
            #     if key in output_data:
            #         workflow_output[key] = output_data[key]
            workflow_tool["outputs"].append(workflow_output)

        workflow_tool["steps"] = [{
            "id":
            get_rootname(command_line_tool["id"]),
            "run":
            command_line_tool,
            "in": [{
                "id": input_id,
                "source": input_id
            } for input_id, _ in get_items(workflow_tool["inputs"])],
            "out": [
                output_id
                for output_id, _ in get_items(workflow_tool["outputs"])
            ]
        }]

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool
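
A minimal usage sketch; the import path and file names are assumptions:

from cwl_airflow.utilities.cwl import (  # assumed module path
    convert_to_workflow,
    slow_cwl_load
)

# Wrap a CommandLineTool into a single-step Workflow; an input that is
# already a Workflow passes through unchanged.
tool = slow_cwl_load(workflow="sort_tool.cwl", only_tool=True)
workflow_tool = convert_to_workflow(
    command_line_tool=tool,
    location="sort_tool_as_workflow.json")  # also dumps the wrapper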
Example #8
def execute_workflow_step(workflow,
                          task_id,
                          job_data,
                          cwl_args=None,
                          executor=None):
    """
    Constructs and executes a single-step workflow based on the
    "workflow" and "task_id". "cwl_args" can be used to update the
    default parameters used for the loading and runtime contexts.
    Exports a JSON file with the execution results. If the step is
    evaluated as one that needs to be skipped, the returned "skipped"
    flag is set to True and the step_report file will contain nulls.
    This function doesn't remove any temporary data in either the
    success or the failure scenario.
    """

    cwl_args = {} if cwl_args is None else cwl_args
    executor = SingleJobExecutor() if executor is None else executor

    step_tmp_folder, step_cache_folder, step_outputs_folder, step_report = get_temp_folders(
        task_id=task_id, job_data=job_data)

    default_cwl_args = get_default_cwl_args(cwl_args)

    default_cwl_args.update({  # add execution specific parameters
        "tmp_outdir_prefix": step_cache_folder + "/",
        "tmpdir_prefix": step_cache_folder + "/",
        "cidfile_dir": step_tmp_folder,
        "cidfile_prefix": task_id,
        "basedir": os.getcwd(
        ),  # job should already have abs path for inputs, so this is useless
        "outdir": step_outputs_folder
    })

    workflow_step_path = os.path.join(step_tmp_folder,
                                      task_id + "_step_workflow.cwl")

    fast_cwl_step_load(  # will save the new workflow to "workflow_step_path"
        workflow=workflow,
        target_id=task_id,
        cwl_args=default_cwl_args,
        location=workflow_step_path)

    workflow_data = slow_cwl_load(workflow=workflow_step_path,
                                  cwl_args=default_cwl_args)

    skipped = True
    step_outputs = {
        output_id: None
        for output_id, _ in get_items(workflow_data.tool["outputs"])
    }
    if need_to_run(workflow_data, job_data, task_id):
        skipped = False
        _stderr = sys.stderr  # to trick the logger
        sys.stderr = sys.__stderr__
        step_outputs, step_status = executor(workflow_data, job_data,
                                             RuntimeContext(default_cwl_args))
        sys.stderr = _stderr

        if step_status != "success":
            raise ValueError("Failed to run workflow step")

        # To remove "http://commonwl.org/cwltool#generation": 0 (copied from cwltool)
        visit_class(step_outputs, ("File", ),
                    MutationManager().unset_generation)

    dump_json(step_outputs, step_report)

    return step_outputs, step_report, skipped
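
A hypothetical call showing the third return value this version adds;
the import path and ids are assumptions:

from cwl_airflow.utilities.cwl import execute_workflow_step  # assumed module path

step_outputs, step_report, skipped = execute_workflow_step(
    workflow="workflow.cwl",
    task_id="filter_step",
    job_data={"tmp_folder": "/tmp/dag_run_1"})  # assumed minimal content
if skipped:
    # need_to_run() returned False: step_outputs maps every output id to None
    print("step was skipped, report contains nulls:", step_report)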
Example #9
def fast_cwl_step_load(workflow, target_id, cwl_args=None, location=None):
    """
    Returns a workflow (CommentedMap) that includes only the single
    step selected by "target_id" from the parsed "workflow". All other
    steps are removed. Workflow inputs and outputs are updated based on
    the source fields of "in" and "out" of the selected workflow step.
    IDs of the updated workflow inputs and outputs, as well as the
    corresponding "source" fields, include the step id separated by an
    underscore. All other fields remain unchanged.

    "cwl_args" can be used to update the default location of
    "pickle_folder" used by "fast_cwl_load", as well as other
    parameters used by "slow_cwl_load" for the loading and runtime
    contexts.

    If "location" is not None, exports the modified workflow.
    """

    cwl_args = {} if cwl_args is None else cwl_args

    default_cwl_args = get_default_cwl_args(cwl_args)

    workflow_inputs = []
    workflow_outputs = []
    workflow_steps = []

    workflow_tool = fast_cwl_load(workflow=workflow, cwl_args=default_cwl_args)

    selected_step = list(get_items(workflow_tool["steps"], target_id))[0][1]

    workflow_steps.append(selected_step)

    for _, step_in in get_items(selected_step.get(
            "in", [])):  # step might not have "in"

        updated_sources = []  # to keep track of updated sources

        for step_in_source, _ in get_items(step_in.get(
                "source", [])):  # "in" might not have "source"

            try:

                # try to find workflow input that corresponds to "source"

                workflow_input = list(
                    get_items(workflow_tool["inputs"], step_in_source))[0][1]

                updated_workflow_input = {
                    "id": step_in_source,
                    "type": workflow_input["type"]
                }

                # need to copy both "default" and "secondaryFiles" if present
                for key in ["default", "secondaryFiles"
                            ]:  # TODO: Do I need to copy format?
                    if key in workflow_input:
                        updated_workflow_input[key] = workflow_input[key]

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.

                if len(list(get_items(workflow_inputs, step_in_source))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source)

            except (IndexError, KeyError):

                # Need to find upstream step that corresponds to "source"

                upstream_step = list(
                    get_items(
                        workflow_tool["steps"],
                        get_short_id(step_in_source,
                                     only_step_name=True)))[0][1]

                # Need to load tool from "run" of the found upstream step
                # and look for the output that corresponds to "source".
                # We look for correspondence only based on "id"

                upstream_step_tool = fast_cwl_load(
                    workflow=upstream_step["run"], cwl_args=default_cwl_args)

                upstream_step_output = list(
                    get_items(
                        {
                            get_short_id(k, only_id=True): v
                            for k, v in get_items(
                                upstream_step_tool["outputs"])
                        },  # trick
                        get_short_id(step_in_source, only_id=True)))[0][1]

                step_in_source_with_step_id = step_in_source.replace(
                    "/", "_")  # to include both step name and id

                updated_workflow_input = {
                    "id": step_in_source_with_step_id,
                    "type": upstream_step_output["type"]
                }

                # No need to copy "secondaryFiles" for outputs from other steps
                # because they should be already included into the generated json
                # report file
                # # TODO: Do I need to copy format to "workflow_inputs"?

                # Check if we have already added input based on the same "source"
                # from another item from "in". Skip adding the same input twice.

                if len(
                        list(
                            get_items(workflow_inputs,
                                      step_in_source_with_step_id))) == 0:
                    workflow_inputs.append(updated_workflow_input)

                updated_sources.append(step_in_source_with_step_id)

        # replace "source" in step's "in" if anything was updated
        if len(updated_sources) > 0:
            if isinstance(step_in["source"], list):
                step_in["source"] = updated_sources
            else:
                step_in["source"] = updated_sources[0]

    # Need to load tool from the "run" field of the selected step
    # and look for the outputs that correspond to the items from "out".
    # We look for correspondence only based on "id"

    selected_step_tool = fast_cwl_load(workflow=selected_step["run"],
                                       cwl_args=default_cwl_args)

    for step_out, _ in get_items(selected_step["out"]):
        selected_step_output = list(
            get_items(
                {
                    get_short_id(k, only_id=True): v
                    for k, v in get_items(selected_step_tool["outputs"])
                },  # trick
                get_short_id(step_out, only_id=True)))[0][1]
        step_out_with_step_id = step_out.replace(
            "/", "_")  # to include both step name and id
        workflow_outputs.append({
            "id": step_out_with_step_id,
            "type": selected_step_output["type"],
            "outputSource": step_out
        })

    workflow_tool.update({
        "inputs": workflow_inputs,
        "outputs": workflow_outputs,
        "steps": workflow_steps
    })

    if location is not None:
        dump_json(workflow_tool, location)

    return workflow_tool
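
A toy illustration of the id remapping performed above (the ids are
hypothetical): a "source" pointing at an upstream step output such as
"align/sorted_bam" is exposed as the workflow input "align_sorted_bam",
and a step output "align/log" becomes the workflow output "align_log"
whose "outputSource" still points at the original "align/log":

step_in_source = "align/sorted_bam"
print(step_in_source.replace("/", "_"))  # -> align_sorted_bam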