Exemplo n.º 1
0
def execute_workflow(workflow: Workflow) -> "WorkflowExecutionResult":
    """Execute workflow.

    This function also performs tail-recursion optimization for inplace
    workflow steps.

    Args:
        workflow: The workflow to be executed.
    Returns:
        An object ref that represent the result.
    """
    # Tail recursion optimization.
    context = {}
    while True:
        with workflow_context.fork_workflow_step_context(**context):
            result = _execute_workflow(workflow)
        if not isinstance(result.persisted_output, InplaceReturnedWorkflow):
            break
        workflow = result.persisted_output.workflow
        context = result.persisted_output.context

    # Convert the outputs into WorkflowStaticRef.
    result.persisted_output = WorkflowStaticRef.from_output(
        workflow.step_id, result.persisted_output)
    result.volatile_output = WorkflowStaticRef.from_output(
        workflow.step_id, result.volatile_output)
    return result
Exemplo n.º 2
0
def resume_workflow_step(
    job_id: str,
    workflow_id: str,
    step_id: "StepID",
    current_output: Optional[ray.ObjectRef],
) -> WorkflowExecutionResult:
    """Resume a step of a workflow.

    Args:
        job_id: The ID of the job that submits the workflow execution. The ID
        is used to identify the submitter of the workflow.
        workflow_id: The ID of the workflow job. The ID is used to identify
            the workflow.
        step_id: The step to resume in the workflow.

    Raises:
        WorkflowNotResumableException: fail to resume the workflow.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    if current_output is None:
        current_output = []
    else:
        current_output = [current_output]

    persisted_output, volatile_output = _resume_workflow_step_executor.remote(
        job_id, workflow_id, step_id, current_output)
    persisted_output = WorkflowStaticRef.from_output(step_id, persisted_output)
    volatile_output = WorkflowStaticRef.from_output(step_id, volatile_output)
    return WorkflowExecutionResult(persisted_output, volatile_output)
Exemplo n.º 3
0
    def get_output(self, workflow_id: str,
                   name: Optional[str]) -> WorkflowStaticRef:
        """Get the output of a running workflow.

        Args:
            workflow_id: The ID of a workflow job.

        Returns:
            An object reference that can be used to retrieve the
            workflow result.
        """
        if workflow_id in self._workflow_outputs and name is None:
            return self._workflow_outputs[workflow_id].output
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        status = wf_store.load_workflow_status()
        if status == common.WorkflowStatus.NONE:
            raise ValueError(f"No such workflow {workflow_id}")
        if status == common.WorkflowStatus.CANCELED:
            raise ValueError(f"Workflow {workflow_id} is canceled")
        if name is None:
            # For resumable workflow, the workflow result is not ready.
            # It has to be resumed first.
            if status == common.WorkflowStatus.RESUMABLE:
                raise ValueError(
                    f"Workflow {workflow_id} is in resumable status, "
                    "please resume it")

        if name is None:
            step_id = wf_store.get_entrypoint_step_id()
        else:
            step_id = name
            output = self.get_cached_step_output(workflow_id, step_id)
            if output is not None:
                return WorkflowStaticRef.from_output(step_id, output)

        @ray.remote
        def load(wf_store, workflow_id, step_id):
            result = wf_store.inspect_step(step_id)
            if result.output_object_valid:
                # we already have the output
                return wf_store.load_step_output(step_id)
            if isinstance(result.output_step_id, str):
                actor = get_management_actor()
                return WorkflowStaticRef.from_output(
                    result.output_step_id,
                    actor.get_output.remote(workflow_id,
                                            result.output_step_id),
                )
            raise ValueError(f"Cannot load output from step id {step_id} "
                             f"in workflow {workflow_id}")

        return WorkflowStaticRef.from_output(
            step_id,
            load.remote(wf_store, workflow_id, step_id),
        )
Exemplo n.º 4
0
def _reconstruct_wait_step(
    reader: workflow_storage.WorkflowStorage,
    result: workflow_storage.StepInspectResult,
    input_map: Dict[StepID, Any],
):
    input_workflows = []
    step_options = result.step_options
    wait_options = step_options.ray_options.get("wait_options", {})
    for i, _step_id in enumerate(result.workflows):
        # Check whether the step has been loaded or not to avoid
        # duplication
        if _step_id in input_map:
            r = input_map[_step_id]
        else:
            r = _construct_resume_workflow_from_step(reader, _step_id, input_map)
            input_map[_step_id] = r
        if isinstance(r, Workflow):
            input_workflows.append(r)
        else:
            assert isinstance(r, StepID)
            # TODO (Alex): We should consider caching these outputs too.
            output = reader.load_step_output(r)
            # Simulate a workflow with a workflow reference so it could be
            # used directly by 'workflow.wait'.
            static_ref = WorkflowStaticRef(step_id=r, ref=ray.put(output))
            wf = Workflow.from_ref(static_ref)
            input_workflows.append(wf)

    from ray import workflow

    return workflow.wait(input_workflows, **wait_options)
Exemplo n.º 5
0
 def load(wf_store, workflow_id, step_id):
     result = wf_store.inspect_step(step_id)
     if result.output_object_valid:
         # we already have the output
         return wf_store.load_step_output(step_id)
     if isinstance(result.output_step_id, str):
         actor = get_management_actor()
         return WorkflowStaticRef.from_output(
             result.output_step_id,
             actor.get_output.remote(workflow_id,
                                     result.output_step_id),
         )
     raise ValueError(f"Cannot load output from step id {step_id} "
                      f"in workflow {workflow_id}")
Exemplo n.º 6
0
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Args:
        workflow: The workflow to be executed.

    Returns:
        An object ref that represent the result.
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
            outer_most_step_id=None, last_step_of_workflow=False):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                # The input workflow is not a reference to an executed
                # workflow .
                output = execute_workflow(w).persisted_output
                static_ref = WorkflowStaticRef(step_id=w.step_id, ref=output)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=workflow_data.inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [ray.put(None)])
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            executor = _workflow_step_executor
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(
                num_cpus=0).remote
        else:
            executor = _workflow_step_executor_remote.options(
                **step_options.ray_options).remote

    # Stage 3: execution
    persisted_output, volatile_output = executor(
        workflow_data.func_body,
        workflow_context.get_workflow_step_context(),
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    if not isinstance(persisted_output, WorkflowOutputType):
        persisted_output = ray.put(persisted_output)
    if not isinstance(persisted_output, WorkflowOutputType):
        volatile_output = ray.put(volatile_output)

    if step_options.step_type != StepType.READONLY_ACTOR_METHOD:
        if not step_options.allow_inplace:
            # TODO: [Possible flaky bug] Here the RUNNING state may
            # be recorded earlier than SUCCESSFUL. This caused some
            # confusion during development.
            _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                                [volatile_output])

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
Exemplo n.º 7
0
def _construct_resume_workflow_from_step(
        workflow_id: str, step_id: StepID) -> Union[Workflow, Any]:
    """Try to construct a workflow (step) that recovers the workflow step.
    If the workflow step already has an output checkpointing file, we return
    the workflow step id instead.

    Args:
        workflow_id: The ID of the workflow.
        step_id: The ID of the step we want to recover.

    Returns:
        A workflow that recovers the step, or the output of the step
            if it has been checkpointed.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)

    # Step 1: construct dependency of the DAG (BFS)
    inpsect_results = {}
    dependency_map = defaultdict(list)
    num_in_edges = {}

    dag_visit_queue = deque([step_id])
    while dag_visit_queue:
        s: StepID = dag_visit_queue.popleft()
        if s in inpsect_results:
            continue
        r = reader.inspect_step(s)
        inpsect_results[s] = r
        if not r.is_recoverable():
            raise WorkflowStepNotRecoverableError(s)
        if r.output_object_valid:
            deps = []
        elif isinstance(r.output_step_id, str):
            deps = [r.output_step_id]
        else:
            deps = r.workflows
        for w in deps:
            dependency_map[w].append(s)
        num_in_edges[s] = len(deps)
        dag_visit_queue.extend(deps)

    # Step 2: topological sort to determine the execution order (Kahn's algorithm)
    execution_queue: List[StepID] = []

    start_nodes = deque(k for k, v in num_in_edges.items() if v == 0)
    while start_nodes:
        n = start_nodes.popleft()
        execution_queue.append(n)
        for m in dependency_map[n]:
            num_in_edges[m] -= 1
            assert num_in_edges[m] >= 0, (m, n)
            if num_in_edges[m] == 0:
                start_nodes.append(m)

    # Step 3: recover the workflow by the order of the execution queue
    with serialization.objectref_cache():
        # "input_map" is a context storing the input which has been loaded.
        # This context is important for deduplicate step inputs.
        input_map: Dict[StepID, Any] = {}

        for _step_id in execution_queue:
            result = inpsect_results[_step_id]
            if result.output_object_valid:
                input_map[_step_id] = reader.load_step_output(_step_id)
                continue
            if isinstance(result.output_step_id, str):
                input_map[_step_id] = input_map[result.output_step_id]
                continue

            # Process the wait step as a special case.
            if result.step_options.step_type == StepType.WAIT:
                wait_input_workflows = []
                for w in result.workflows:
                    output = input_map[w]
                    if isinstance(output, Workflow):
                        wait_input_workflows.append(output)
                    else:
                        # Simulate a workflow with a workflow reference so it could be
                        # used directly by 'workflow.wait'.
                        static_ref = WorkflowStaticRef(step_id=w,
                                                       ref=ray.put(output))
                        wait_input_workflows.append(
                            Workflow.from_ref(static_ref))
                recovery_workflow = ray.workflow.wait(
                    wait_input_workflows,
                    **result.step_options.ray_options.get("wait_options", {}),
                )
            else:
                args, kwargs = reader.load_step_args(
                    _step_id,
                    workflows=[input_map[w] for w in result.workflows],
                    workflow_refs=list(map(WorkflowRef, result.workflow_refs)),
                )
                func: Callable = reader.load_step_func_body(_step_id)
                # TODO(suquark): Use an alternative function when "workflow.step"
                # is fully deprecated.
                recovery_workflow = ray.workflow.step(func).step(
                    *args, **kwargs)

            # override step_options
            recovery_workflow._step_id = _step_id
            recovery_workflow.data.step_options = result.step_options

            input_map[_step_id] = recovery_workflow

    # Step 4: return the output of the requested step
    return input_map[step_id]
Exemplo n.º 8
0
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Args:
        workflow: The workflow to be executed.

    Returns:
        An object ref that represent the result.
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs
    # Here A is the outer workflow step, B & C are the inner steps.
    # C is the output step for A, because C produces the output for A.
    #
    # @workflow.step
    # def A():
    #     b = B.step()
    #     return C.step(b)
    #
    # If the outer workflow step skips checkpointing, it would
    # update the checkpoint context of all inner steps except
    # the output step, marking them "detached" from the DAG.
    # Output step is not detached from the DAG because once
    # completed, it replaces the output of the outer step.
    step_context = workflow_context.get_workflow_step_context()
    checkpoint_context = step_context.checkpoint_context.copy()
    # "detached" could be defined recursively:
    # detached := already detached or the outer step skips checkpointing
    checkpoint_context.detached_from_dag = (
        checkpoint_context.detached_from_dag
        or not step_context.checkpoint_context.checkpoint)
    # Apply checkpoint context to input steps. Since input steps
    # further apply them to their inputs, this would eventually
    # apply to all steps except the output step. This avoids
    # detaching the output step.
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
            outer_most_step_id=None,
            last_step_of_workflow=False,
            checkpoint_context=checkpoint_context,
    ):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                # The input workflow is not a reference to an executed
                # workflow .
                output = execute_workflow(w).persisted_output
                static_ref = WorkflowStaticRef(step_id=w.step_id, ref=output)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                            [ray.put(None)])
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            executor = _workflow_step_executor
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(
                num_cpus=0).remote
        else:
            executor = _workflow_step_executor_remote.options(
                **step_options.ray_options).remote

    # Stage 3: execution
    persisted_output, volatile_output = executor(
        workflow_data.func_body,
        step_context,
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    if not isinstance(persisted_output, WorkflowOutputType):
        persisted_output = ray.put(persisted_output)
    if not isinstance(persisted_output, WorkflowOutputType):
        volatile_output = ray.put(volatile_output)

    if step_options.step_type != StepType.READONLY_ACTOR_METHOD:
        if not step_options.allow_inplace:
            # TODO: [Possible flaky bug] Here the RUNNING state may
            # be recorded earlier than SUCCESSFUL. This caused some
            # confusion during development.
            _record_step_status(workflow.step_id, WorkflowStatus.RUNNING,
                                [volatile_output])

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
Exemplo n.º 9
0
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
    inplace: bool = False,
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Args:
        step_id: ID of the step.
        func: The workflow step function.
        baked_inputs: The processed inputs for the step.
        context: Workflow step context. Used to access correct storage etc.
        runtime_options: Parameters for workflow step execution.
        inplace: Execute the workflow inplace.

    Returns:
        Workflow step output.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    context.checkpoint_context.checkpoint = runtime_options.checkpoint

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        with workflow_context.workflow_execution():
            persisted_output, volatile_output = _wrap_run(
                func, runtime_options, *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        # Always checkpoint the exception.
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor is not allowed."
            )
        assert not isinstance(persisted_output, Workflow)
    else:
        # TODO(suquark): Validate checkpoint options before
        # commit the step.
        store = workflow_storage.get_workflow_storage()
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            commit_step(
                store,
                step_id,
                persisted_output,
                exception=None,
            )
        if isinstance(persisted_output, Workflow):
            sub_workflow = persisted_output
            outer_most_step_id = context.outer_most_step_id
            assert volatile_output is None
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            if inplace:
                _step_options = sub_workflow.data.step_options
                if (_step_options.step_type != StepType.WAIT
                        and runtime_options.ray_options !=
                        _step_options.ray_options):
                    logger.warning(
                        f"Workflow step '{sub_workflow.step_id}' uses "
                        f"a Ray option different to its caller step '{step_id}' "
                        f"and will be executed inplace. Ray assumes it still "
                        f"consumes the same resource as the caller. This may result "
                        f"in oversubscribing resources.")
                return (
                    InplaceReturnedWorkflow(
                        sub_workflow,
                        {"outer_most_step_id": outer_most_step_id}),
                    None,
                )
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(sub_workflow)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
        volatile_output = WorkflowStaticRef.from_output(
            step_id, volatile_output)
    return persisted_output, volatile_output