Example #1
def commit_step(
    store: workflow_storage.WorkflowStorage,
    step_id: "StepID",
    ret: Union["Workflow", Any],
    *,
    exception: Optional[Exception],
):
    """Checkpoint the step output.
    Args:
        store: The storage the current workflow is using.
        step_id: The ID of the step.
        ret: The returned object of the workflow step.
        exception: The exception caught by the step.
    """
    from ray.workflow.common import Workflow

    if isinstance(ret, Workflow):
        assert not ret.executed
        tasks = []
        for w in ret._iter_workflows_in_dag():
            # If this is a reference to a workflow, do not checkpoint
            # its input (again).
            if w.ref is None:
                tasks.append(_write_step_inputs(store, w.step_id, w.data))
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))

    context = workflow_context.get_workflow_step_context()
    store.save_step_output(step_id,
                           ret,
                           exception=exception,
                           outer_most_step_id=context.outer_most_step_id)
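For orientation, the sketch below shows roughly how `commit_step` is driven from a step executor (the same pattern appears in the executor examples further down). It assumes this module's imports are in scope; `my_step_body` is a hypothetical placeholder for the user function.

def run_and_checkpoint(step_id, my_step_body, *args, **kwargs):
    # Sketch only: mirrors the commit pattern used by the executors below.
    store = workflow_storage.get_workflow_storage()
    try:
        ret = my_step_body(*args, **kwargs)
    except Exception as e:
        # Checkpoint the failure so recovery can observe it, then re-raise.
        commit_step(store, step_id, None, exception=e)
        raise
    # Checkpoint the successful output (a plain value or a nested Workflow).
    commit_step(store, step_id, ret, exception=None)
    return ret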
Example #2
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Returns:
        An object ref that represents the result.
    """
    if workflow.executed:
        return workflow.result
    workflow_data = workflow.data
    baked_inputs = _BakedWorkflowInputs.from_workflow_inputs(
        workflow_data.inputs)
    persisted_output, volatile_output = _workflow_step_executor.options(
        **workflow_data.ray_options).remote(
            workflow_data.step_type, workflow_data.func_body,
            workflow_context.get_workflow_step_context(), workflow.step_id,
            baked_inputs, workflow_data.catch_exceptions,
            workflow_data.max_retries)

    if not isinstance(persisted_output, WorkflowOutputType):
        raise TypeError("Unexpected return type of the workflow.")

    if workflow_data.step_type != StepType.READONLY_ACTOR_METHOD:
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [volatile_output])

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
Example #3
def execute_workflow(
        workflow: "Workflow",
        outer_most_step_id: Optional[str] = None,
        last_step_of_workflow: bool = False) -> "WorkflowExecutionResult":
    """Execute workflow.

    To fully explain what we are doing, we need to introduce some syntax first.
    The syntax for dependencies between workflow steps
    "B.step(A.step())" is "A - B"; the syntax for nested workflow steps
    "def A(): return B.step()" is "A / B".

    In a chain/DAG of step dependencies, the "output step" is the step of last
    (topological) order. For example, in "A - B - C", C is the output step.

    In a chain of nested workflow steps, the initial "output step" is
    called the "outer most step" for other "output steps". For example, in
    "A / B / C / D", "A" is the outer most step for "B", "C", "D";
    in the hybrid workflow "((A - B) / C / D) - (E / (F - G) / H)",
    "B" is the outer most step for "C", "D"; "E" is the outer most step
    for "G", "H".

    Args:
        workflow: The workflow to be executed.
        outer_most_step_id: The ID of the outer most workflow. None if it
            does not exist. See "step_executor.execute_workflow" for a detailed
            explanation.
        last_step_of_workflow: Whether this step generates the output of the
            workflow (including nested steps).
    Returns:
        An object ref that represents the result.
    """
    if workflow.executed:
        return workflow.result
    workflow_data = workflow.data
    baked_inputs = _BakedWorkflowInputs.from_workflow_inputs(
        workflow_data.inputs)
    persisted_output, volatile_output = _workflow_step_executor.options(
        **workflow_data.ray_options).remote(
            workflow_data.step_type, workflow_data.func_body,
            workflow_context.get_workflow_step_context(), workflow.step_id,
            baked_inputs, outer_most_step_id, workflow_data.catch_exceptions,
            workflow_data.max_retries, last_step_of_workflow)

    if not isinstance(persisted_output, WorkflowOutputType):
        raise TypeError("Unexpected return type of the workflow.")

    if workflow_data.step_type != StepType.READONLY_ACTOR_METHOD:
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [volatile_output])

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
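To make the docstring's notation concrete, here is a small illustrative sketch using the `@workflow.step` API that these executors serve; the step names `A`, `B`, and `outer` are hypothetical.

from ray import workflow

@workflow.step
def A():
    return 1

@workflow.step
def B(x):
    return x + 1

# "A - B": a dependency chain; B is the output step of the chain.
chain = B.step(A.step())

@workflow.step
def outer():
    # "outer / B": returning a nested step makes `outer` the
    # "outer most step" for the chain it returns.
    return B.step(A.step())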
Example #4
def _inherit_checkpoint_option(checkpoint: "Optional[CheckpointModeType]"):
    # If checkpoint option is not specified, inherit checkpoint
    # options from context (i.e. checkpoint options of the outer
    # step). If it is still not specified, it's True by default.
    context = workflow_context.get_workflow_step_context()
    if checkpoint is None:
        if context is not None:
            return context.checkpoint_context.checkpoint
        return True
    return checkpoint
Example #5
def get_workflow_storage(workflow_id: Optional[str] = None) -> WorkflowStorage:
    """Get the storage for the workflow.

    Args:
        workflow_id: The ID of the workflow.

    Returns:
        A workflow storage.
    """
    if workflow_id is None:
        workflow_id = workflow_context.get_workflow_step_context().workflow_id
    return WorkflowStorage(workflow_id)
Example #6
def _resolve_dynamic_workflow_refs(workflow_refs: "List[WorkflowRef]"):
    """Get the output of a workflow step with the step ID at runtime.

    We lookup the output by the following order:
    1. Query cached step output in the workflow manager. Fetch the physical
       output object.
    2. If failed to fetch the physical output object, look into the storage
       to see whether the output is checkpointed. Load the checkpoint.
    3. If failed to load the checkpoint, resume the step and get the output.
    """
    workflow_manager = get_or_create_management_actor()
    context = workflow_context.get_workflow_step_context()
    workflow_id = context.workflow_id
    storage_url = context.storage_url
    workflow_ref_mapping = []
    for workflow_ref in workflow_refs:
        step_ref = ray.get(
            workflow_manager.get_cached_step_output.remote(
                workflow_id, workflow_ref.step_id
            )
        )
        get_cached_step = False
        if step_ref is not None:
            try:
                output, _ = _resolve_object_ref(step_ref)
                get_cached_step = True
            except Exception:
                get_cached_step = False
        if not get_cached_step:
            wf_store = workflow_storage.get_workflow_storage()
            try:
                output = wf_store.load_step_output(workflow_ref.step_id)
            except Exception:
                current_step_id = workflow_context.get_current_step_id()
                logger.warning(
                    "Failed to get the output of step "
                    f"{workflow_ref.step_id}. Trying to resume it. "
                    f"Current step: '{current_step_id}'"
                )
                step_ref = recovery.resume_workflow_step(
                    workflow_id, workflow_ref.step_id, storage_url, None
                ).persisted_output
                output, _ = _resolve_object_ref(step_ref)
        workflow_ref_mapping.append(output)
    return workflow_ref_mapping
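The lookup order described in the docstring is a cache-then-checkpoint-then-recompute fallback. The standalone sketch below illustrates that shape; `fetch_cached`, `load_checkpoint`, and `resume_step` are hypothetical stand-ins for the manager actor, the storage, and step resumption.

def resolve_step_output(step_id, fetch_cached, load_checkpoint, resume_step):
    try:
        return fetch_cached(step_id)      # 1. cached physical output
    except Exception:
        pass
    try:
        return load_checkpoint(step_id)   # 2. checkpointed output from storage
    except Exception:
        pass
    return resume_step(step_id)           # 3. resume the step and recompute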
Example #7
def _workflow_wait_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[WaitResult, None]:
    """Executor of 'workflow.wait' steps.

    It returns a tuple that contains the wait result. The wait result is a
    list of results of the workflows that are ready and a list of the
    workflows that are pending.
    """
    # Part 1: Update the context for the step.
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    assert step_type == StepType.WAIT
    wait_options = runtime_options.ray_options.get("wait_options", {})

    # Part 2: Resolve any ready workflows.
    ready_workflows, remaining_workflows = baked_inputs.wait(**wait_options)
    ready_objects = []
    for w in ready_workflows:
        obj, _ = _resolve_object_ref(w.ref.ref)
        ready_objects.append(obj)
    persisted_output = (ready_objects, remaining_workflows)

    # Part 3: Save the outputs.
    store = workflow_storage.get_workflow_storage()
    commit_step(store, step_id, persisted_output, exception=None)
    if context.last_step_of_workflow:
        # advance the progress of the workflow
        store.advance_progress(step_id)

    _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    return persisted_output, None
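For context, the user-facing pattern this executor backs looks roughly like the sketch below. It assumes the `workflow.wait` API of this codebase (signature may differ across versions) and that workflow storage can be initialized with defaults; `task` and `report` are hypothetical step names.

from ray import workflow

@workflow.step
def task(i):
    return i

@workflow.step
def report(wait_result):
    ready, remaining = wait_result  # the (ready, remaining) pair built above
    return len(ready), len(remaining)

workflow.init()
wait_step = workflow.wait([task.step(i) for i in range(3)], num_returns=2)
print(report.step(wait_step).run())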
Example #8
def _workflow_wait_executor(
    func: Callable,
    context: "WorkflowStepContext",
    job_id: str,
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> WaitResult:
    """Executor of 'workflow.wait' steps.

    It returns the wait result: a list of results of the workflows that are
    ready and a list of the workflows that are pending.
    """
    # Part 1: Update the context for the step.
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    assert step_type == StepType.WAIT
    wait_options = runtime_options.ray_options.get("wait_options", {})

    # Part 2: Resolve any ready workflows.
    ready_workflows, remaining_workflows = baked_inputs.wait(**wait_options)
    ready_objects = [
        _resolve_static_workflow_ref(w.ref) for w in ready_workflows
    ]
    output = (ready_objects, remaining_workflows)

    # Part 3: Save the outputs.
    store = workflow_storage.get_workflow_storage()
    # TODO(suquark): Because the outputs are not generated by "workflow.wait",
    # we do not checkpoint the outputs here. Those steps that generate
    # outputs should checkpoint them.
    commit_step(store, step_id, output, exception=None)
    if context.last_step_of_workflow:
        # advance the progress of the workflow
        store.advance_progress(step_id)

    _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    return output
Example #9
def commit_step(store: workflow_storage.WorkflowStorage, step_id: "StepID",
                ret: Union["Workflow", Any], exception: Optional[Exception]):
    """Checkpoint the step output.
    Args:
        store: The storage the current workflow is using.
        step_id: The ID of the step.
        ret: The returned object of the workflow step.
    """
    from ray.workflow.common import Workflow
    if isinstance(ret, Workflow):
        assert not ret.executed
        tasks = [
            _write_step_inputs(store, w.step_id, w.data)
            for w in ret._iter_workflows_in_dag()
        ]
        asyncio.get_event_loop().run_until_complete(asyncio.gather(*tasks))

    context = workflow_context.get_workflow_step_context()
    store.save_step_output(step_id,
                           ret,
                           exception=exception,
                           outer_most_step_id=context.outer_most_step_id)
Example #10
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Args:
        workflow: The workflow to be executed.

    Returns:
        An object ref that represents the result.
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
            outer_most_step_id=None, last_step_of_workflow=False):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                # The input workflow is not a reference to an executed
                # workflow.
                output = execute_workflow(w).persisted_output
                static_ref = WorkflowStaticRef(step_id=w.step_id, ref=output)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=workflow_data.inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [ray.put(None)])
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            executor = _workflow_step_executor
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(
                num_cpus=0).remote
        else:
            executor = _workflow_step_executor_remote.options(
                **step_options.ray_options).remote

    # Stage 3: execution
    persisted_output, volatile_output = executor(
        workflow_data.func_body,
        workflow_context.get_workflow_step_context(),
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    if not isinstance(persisted_output, WorkflowOutputType):
        persisted_output = ray.put(persisted_output)
    if not isinstance(volatile_output, WorkflowOutputType):
        volatile_output = ray.put(volatile_output)

    if step_options.step_type != StepType.READONLY_ACTOR_METHOD:
        if not step_options.allow_inplace:
            # TODO: [Possible flaky bug] Here the RUNNING state may
            # be recorded earlier than SUCCESSFUL. This caused some
            # confusion during development.
            _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                                [volatile_output])

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
Example #11
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        step_id: ID of the step.
        func: The workflow step function.
        baked_inputs: The processed inputs for the step.
        context: Workflow step context. Used to access correct storage etc.
        runtime_options: Parameters for workflow step execution.

    Returns:
        Workflow step output.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        persisted_output, volatile_output = _wrap_run(func, runtime_options,
                                                      *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, exception=None)
        if isinstance(persisted_output, Workflow):
            outer_most_step_id = context.outer_most_step_id
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(persisted_output)
            # When a virtual actor returns a workflow from a method, the
            # volatile_output and persisted_output are put together.
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
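The `isinstance(persisted_output, Workflow)` branch in Part 4 corresponds to a step that returns another step. Below is a minimal illustrative sketch using the classic step API; `intermediate` and `final` are hypothetical names, and default workflow storage is assumed.

from ray import workflow

@workflow.step
def final(x):
    return x * 2

@workflow.step
def intermediate():
    # Returning a Workflow makes `intermediate` the outer most step for
    # `final`; the executor then runs the nested workflow via
    # execute_workflow and forwards its output.
    return final.step(21)

workflow.init()
assert intermediate.step().run() == 42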
Example #12
def workflow_state_from_dag(dag_node: DAGNode,
                            input_context: Optional[DAGInputData],
                            workflow_id: str):
    """
    Transform a Ray DAG to a workflow. Map FunctionNode to workflow step with
    the workflow decorator.

    Args:
        dag_node: The DAG to be converted to a workflow.
        input_context: The input data that wraps variables for the input node of the DAG.
        workflow_id: The ID of the workflow.
    """
    if not isinstance(dag_node, FunctionNode):
        raise TypeError(
            "Currently workflow does not support classes as DAG inputs.")

    state = WorkflowExecutionState()

    # TODO(suquark): remove this cyclic importing later by changing the way of
    # task ID assignment.
    from ray.workflow.workflow_access import get_management_actor

    mgr = get_management_actor()
    context = workflow_context.get_workflow_step_context()

    def _node_visitor(node: Any) -> Any:
        if isinstance(node, FunctionNode):
            bound_options = node._bound_options.copy()
            num_returns = bound_options.get("num_returns", 1)
            if num_returns is None:  # ray could use `None` as default value
                num_returns = 1
            if num_returns > 1:
                raise ValueError("Workflow steps can only have one return.")

            workflow_options = bound_options.pop("_metadata",
                                                 {}).get(WORKFLOW_OPTIONS, {})

            # If checkpoint option is not specified, inherit checkpoint
            # options from context (i.e. checkpoint options of the outer
            # step). If it is still not specified, it's True by default.
            checkpoint = workflow_options.get("checkpoint", None)
            if checkpoint is None:
                checkpoint = context.checkpoint if context is not None else True
            # When it returns a nested workflow, catch_exception
            # should be passed recursively.
            catch_exceptions = workflow_options.get("catch_exceptions", None)
            if catch_exceptions is None:
                # TODO(suquark): should we also handle exceptions from a "leaf node"
                #   in the continuation? For example, we have a workflow
                #   > @ray.remote
                #   > def A(): pass
                #   > @ray.remote
                #   > def B(x): return x
                #   > @ray.remote
                #   > def C(x): return workflow.continuation(B.bind(A.bind()))
                #   > dag = C.options(**workflow.options(catch_exceptions=True)).bind()
                #   Should C catch exceptions of A?
                if node.get_stable_uuid() == dag_node.get_stable_uuid():
                    # 'catch_exception' context should be passed down to
                    # its direct continuation task.
                    # In this case, the direct continuation is the output node.
                    catch_exceptions = (context.catch_exceptions
                                        if context is not None else False)
                else:
                    catch_exceptions = False

            max_retries = bound_options.get("max_retries", 3)
            if not isinstance(max_retries, int) or max_retries < -1:
                raise ValueError(
                    "'max_retries' only accepts 0, -1 or a positive integer.")

            step_options = WorkflowStepRuntimeOptions(
                step_type=StepType.FUNCTION,
                catch_exceptions=catch_exceptions,
                max_retries=max_retries,
                allow_inplace=False,
                checkpoint=checkpoint,
                ray_options=bound_options,
            )

            workflow_refs: List[WorkflowRef] = []
            with serialization_context.workflow_args_serialization_context(
                    workflow_refs):
                _func_signature = signature.extract_signature(node._body)
                flattened_args = signature.flatten_args(
                    _func_signature, node._bound_args, node._bound_kwargs)
                # NOTE: When calling 'ray.put', we trigger python object
                # serialization. Under our serialization context,
                # Workflows are separated from the arguments,
                # leaving a placeholder object with all other python objects.
                # Then we put the placeholder object to object store,
                # so it won't be mutated later. This guarantees correct
                # semantics. See "tests/test_variable_mutable.py" as
                # an example.
                input_placeholder: ray.ObjectRef = ray.put(flattened_args)

            name = workflow_options.get("name")
            if name is None:
                name = f"{get_module(node._body)}.{slugify(get_qualname(node._body))}"
            task_id = ray.get(mgr.gen_step_id.remote(workflow_id, name))
            state.add_dependencies(task_id, [s.task_id for s in workflow_refs])
            state.task_input_args[task_id] = input_placeholder

            user_metadata = workflow_options.pop("metadata", {})
            validate_user_metadata(user_metadata)
            state.tasks[task_id] = Task(
                name=name,
                options=step_options,
                user_metadata=user_metadata,
                func_body=node._body,
            )
            return WorkflowRef(task_id)

        if isinstance(node, InputAttributeNode):
            return node._execute_impl()  # get data from input node
        if isinstance(node, InputNode):
            return input_context  # replace input node with input data
        if not isinstance(node, DAGNode):
            return node  # return normal objects
        raise TypeError(f"Unsupported DAG node: {node}")

    output_workflow_ref = dag_node.apply_recursive(_node_visitor)
    state.output_task_id = output_workflow_ref.task_id
    return state
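Below is a sketch of the DAG-style input this function consumes, built with the APIs named in its comments (`.bind`, `workflow.continuation`, `workflow.options`). The trailing call to `workflow_state_from_dag` is indicative only, and the workflow ID shown is a hypothetical value.

import ray
from ray import workflow

@ray.remote
def A():
    return 1

@ray.remote
def B(x):
    return x + 1

@ray.remote
def C(x):
    # A continuation: C's result becomes the result of the nested DAG.
    return workflow.continuation(B.bind(A.bind()))

dag = C.options(**workflow.options(catch_exceptions=True)).bind(0)
# The engine would then call something like:
#   state = workflow_state_from_dag(dag, None, "my_workflow_id")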
Example #13
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Args:
        workflow: The workflow to be executed.

    Returns:
        An object ref that represents the result.
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs
    # Here A is the outer workflow step, B & C are the inner steps.
    # C is the output step for A, because C produces the output for A.
    #
    # @workflow.step
    # def A():
    #     b = B.step()
    #     return C.step(b)
    #
    # If the outer workflow step skips checkpointing, it would
    # update the checkpoint context of all inner steps except
    # the output step, marking them "detached" from the DAG.
    # Output step is not detached from the DAG because once
    # completed, it replaces the output of the outer step.
    step_context = workflow_context.get_workflow_step_context()
    checkpoint_context = step_context.checkpoint_context.copy()
    # "detached" could be defined recursively:
    # detached := already detached or the outer step skips checkpointing
    checkpoint_context.detached_from_dag = (
        checkpoint_context.detached_from_dag
        or not step_context.checkpoint_context.checkpoint)
    # Apply checkpoint context to input steps. Since input steps
    # further apply them to their inputs, this would eventually
    # apply to all steps except the output step. This avoids
    # detaching the output step.
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
            outer_most_step_id=None,
            last_step_of_workflow=False,
            checkpoint_context=checkpoint_context,
    ):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                # The input workflow is not a reference to an executed
                # workflow.
                output = execute_workflow(w).persisted_output
                static_ref = WorkflowStaticRef(step_id=w.step_id, ref=output)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [ray.put(None)])
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            executor = _workflow_step_executor
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(
                num_cpus=0).remote
        else:
            executor = _workflow_step_executor_remote.options(
                **step_options.ray_options).remote

    # Stage 3: execution
    persisted_output, volatile_output = executor(
        workflow_data.func_body,
        step_context,
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    if not isinstance(persisted_output, WorkflowOutputType):
        persisted_output = ray.put(persisted_output)
    if not isinstance(volatile_output, WorkflowOutputType):
        volatile_output = ray.put(volatile_output)

    if step_options.step_type != StepType.READONLY_ACTOR_METHOD:
        if not step_options.allow_inplace:
            # TODO: [Possible flaky bug] Here the RUNNING state may
            # be recorded earlier than SUCCESSFUL. This caused some
            # confusion during development.
            _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                                [volatile_output])

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
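The "detached" rule in the comments above can be stated in isolation. The illustrative sketch below uses a plain dataclass standing in for the real checkpoint context; `CheckpointCtx` and `fork_for_input_steps` are hypothetical names.

from dataclasses import dataclass

@dataclass
class CheckpointCtx:
    checkpoint: bool = True
    detached_from_dag: bool = False

def fork_for_input_steps(outer: CheckpointCtx) -> CheckpointCtx:
    # detached := already detached, or the outer step skips checkpointing
    return CheckpointCtx(
        checkpoint=outer.checkpoint,
        detached_from_dag=outer.detached_from_dag or not outer.checkpoint,
    )

assert fork_for_input_steps(CheckpointCtx(checkpoint=False)).detached_from_dag
assert not fork_for_input_steps(CheckpointCtx(checkpoint=True)).detached_from_dag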
Example #14
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
    inplace: bool = False,
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        step_id: ID of the step.
        func: The workflow step function.
        baked_inputs: The processed inputs for the step.
        context: Workflow step context. Used to access correct storage etc.
        runtime_options: Parameters for workflow step execution.
        inplace: Whether to execute the workflow inplace.

    Returns:
        Workflow step output.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    context.checkpoint_context.checkpoint = runtime_options.checkpoint

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        with workflow_context.workflow_execution():
            persisted_output, volatile_output = _wrap_run(
                func, runtime_options, *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        # Always checkpoint the exception.
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor is not allowed."
            )
        assert not isinstance(persisted_output, Workflow)
    else:
        # TODO(suquark): Validate checkpoint options before
        # committing the step.
        store = workflow_storage.get_workflow_storage()
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            commit_step(
                store,
                step_id,
                persisted_output,
                exception=None,
            )
        if isinstance(persisted_output, Workflow):
            sub_workflow = persisted_output
            outer_most_step_id = context.outer_most_step_id
            assert volatile_output is None
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            if inplace:
                _step_options = sub_workflow.data.step_options
                if (_step_options.step_type != StepType.WAIT
                        and runtime_options.ray_options !=
                        _step_options.ray_options):
                    logger.warning(
                        f"Workflow step '{sub_workflow.step_id}' uses "
                        f"a Ray option different from its caller step '{step_id}' "
                        f"and will be executed inplace. Ray assumes it still "
                        f"consumes the same resources as the caller. This may result "
                        f"in oversubscribing resources.")
                return (
                    InplaceReturnedWorkflow(
                        sub_workflow,
                        {"outer_most_step_id": outer_most_step_id}),
                    None,
                )
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(sub_workflow)
            # When a virtual actor returns a workflow from a method, the
            # volatile_output and persisted_output are put together.
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
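The inplace path returns an `InplaceReturnedWorkflow` instead of recursing, so the caller can keep executing in a loop. The generic trampoline below illustrates why returning a continuation instead of making a nested call keeps the Python stack flat; it is not Ray's actual mechanism, and `trampoline` and `countdown` are hypothetical names.

def trampoline(step, *args):
    # Each "step" either returns a final value or a zero-argument
    # continuation to run next; looping here replaces recursion.
    result = step(*args)
    while callable(result):
        result = result()
    return result

def countdown(n):
    if n == 0:
        return "done"
    return lambda: countdown(n - 1)  # tail call expressed as a continuation

assert trampoline(countdown, 100_000) == "done"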
Example #15
def _execute_workflow(job_id,
                      workflow: "Workflow") -> "WorkflowExecutionResult":
    """Internal function of workflow execution."""
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs
    # Here A is the outer workflow step, B & C are the inner steps.
    # C is the output step for A, because C produces the output for A.
    #
    # @workflow.step
    # def A():
    #     b = B.step()
    #     return C.step(b)
    #
    # If the outer workflow step skips checkpointing, it would
    # update the checkpoint context of all inner steps except
    # the output step, marking them "detached" from the DAG.
    # Output step is not detached from the DAG because once
    # completed, it replaces the output of the outer step.
    step_context = workflow_context.get_workflow_step_context()
    checkpoint_context = step_context.checkpoint_context.copy()
    # "detached" could be defined recursively:
    # detached := already detached or the outer step skips checkpointing
    checkpoint_context.detached_from_dag = (
        checkpoint_context.detached_from_dag
        or not step_context.checkpoint_context.checkpoint)
    # Apply checkpoint context to input steps. Since input steps
    # further apply them to their inputs, this would eventually
    # apply to all steps except the output step. This avoids
    # detaching the output step.
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
            outer_most_step_id=None,
            last_step_of_workflow=False,
            checkpoint_context=checkpoint_context,
    ):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                extra_options = w.data.step_options.ray_options
                # The input workflow is not a reference to an executed
                # workflow.
                static_ref = execute_workflow(job_id, w).output
                static_ref._resolve_like_object_ref_in_args = extra_options.get(
                    "_resolve_like_object_ref_in_args", False)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
        job_id=job_id,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [ray.put(None)])
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            # Tell the executor that we are running inplace. This enables
            # tail-recursion optimization.
            executor = functools.partial(_workflow_step_executor, inplace=True)
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(
                num_cpus=0).remote
        else:
            ray_options = step_options.ray_options.copy()
            # Clean up the "_resolve_like_object_ref_in_args" option; it is not for Ray.
            ray_options.pop("_resolve_like_object_ref_in_args", None)
            executor = _workflow_step_executor_remote.options(
                **ray_options).remote

    # Stage 3: execution
    output = executor(
        workflow_data.func_body,
        step_context,
        job_id,
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    if not step_options.allow_inplace:
        # TODO: [Possible flaky bug] Here the RUNNING state may
        # be recorded earlier than SUCCESSFUL. This caused some
        # confusion during development.

        # convert into workflow static ref for step status record.
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING, [None])

    result = WorkflowExecutionResult(output)
    workflow._result = result
    workflow._executed = True
    return result