Example #1
def get_output(workflow_id: str, name: Optional[str]) -> ray.ObjectRef:
    """Get the output of a running workflow.
    See "api.get_output()" for details.
    """
    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    try:
        workflow_manager = get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e

    try:
        # check storage first
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        tid = wf_store.inspect_output(name)
        if tid is not None:
            return load_step_output_from_storage.remote(workflow_id, name)
    except ValueError:
        pass

    return workflow_manager.get_output.remote(workflow_id, name)
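For orientation, here is a minimal sketch of how the public wrapper backed by this helper might be called. It assumes the alpha workflow API shown in these examples, where "workflow.get_output" returns a reference rather than the value, and a hypothetical workflow ID "my_workflow" started earlier:

import ray
from ray import workflow

# Hypothetical workflow ID; assumes a workflow was started earlier,
# e.g. via workflow.run_async(..., workflow_id="my_workflow").
ref = workflow.get_output("my_workflow")
result = ray.get(ref)  # blocks until the workflow output is available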
Example #2
    async def save_objectref(
            self, ref_tuple: Tuple[ray.ObjectRef],
            workflow_id: str) -> Tuple[List[str], ray.ObjectRef]:
        """Serialize and upload an object reference exactly once.

        Args:
            ref_tuple: A 1-element tuple which wraps the reference.
            workflow_id: The ID of the workflow that owns the object.

        Returns:
            A pair. The first element is the paths the ref will be uploaded to.
            The second is an object reference to the upload task.
        """
        wf_storage = workflow_storage.WorkflowStorage(workflow_id,
                                                      self._storage)
        ref, = ref_tuple
        # Use the hex as the key to avoid holding a reference to the object.
        key = ref.hex()

        if key not in self._uploads:
            # TODO(Alex): We should probably eventually free these refs.
            identifier_ref = common.calculate_identifier.remote(ref)
            upload_task = _put_helper.remote(identifier_ref, ref, wf_storage)
            self._uploads[key] = Upload(identifier_ref, upload_task)
            self._num_uploads += 1

        info = self._uploads[key]
        identifier = await info.identifier_ref
        paths = wf_storage._key_obj_id(identifier)
        return paths, info.upload_task
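The pattern worth noting here is upload-once memoization keyed on "ref.hex()": the first caller starts the upload task, and every later caller awaits the same cached entry. A self-contained sketch of the same idea, with illustrative names that are not part of the Ray API:

import asyncio
from typing import Any, Awaitable, Callable, Dict

_uploads: Dict[str, asyncio.Future] = {}

async def upload_once(key: str,
                      start_upload: Callable[[], Awaitable[Any]]) -> Any:
    # Only the first caller for a given key starts the upload; all
    # subsequent callers await the same cached task.
    if key not in _uploads:
        _uploads[key] = asyncio.ensure_future(start_upload())
    return await _uploads[key]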
Example #3
    def update_step_status(self, workflow_id: str, step_id: str,
                           status: common.WorkflowStatus,
                           outputs: List[ray.ObjectRef]):
        # Note: For a virtual actor, more steps can be added even after
        # the workflow finishes.

        self._step_status.setdefault(workflow_id, {})
        if status == common.WorkflowStatus.SUCCESSFUL:
            self._step_status[workflow_id].pop(step_id, None)
        else:
            self._step_status[workflow_id][step_id] = status
        remaining = len(self._step_status[workflow_id])
        if status != common.WorkflowStatus.RUNNING:
            self._step_output_cache.pop((workflow_id, step_id), None)

        if status != common.WorkflowStatus.FAILED and remaining != 0:
            return

        wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)

        if status == common.WorkflowStatus.FAILED:
            if workflow_id in self._workflow_outputs:
                cancel_job(self._workflow_outputs.pop(workflow_id).output)
            wf_store.save_workflow_meta(
                common.WorkflowMetaData(common.WorkflowStatus.FAILED))
            self._step_status.pop(workflow_id)
        else:
            wf_store.save_workflow_meta(
                common.WorkflowMetaData(common.WorkflowStatus.SUCCESSFUL))
            self._step_status.pop(workflow_id)
Example #4
    def update_step_status(
        self,
        workflow_id: str,
        step_id: str,
        status: common.WorkflowStatus,
        outputs: List[WorkflowStaticRef],
    ):
        # Note: For a virtual actor, more steps can be added even after
        # the workflow finishes.

        self._step_status.setdefault(workflow_id, {})
        if status == common.WorkflowStatus.SUCCESSFUL:
            self._step_status[workflow_id].pop(step_id, None)
        else:
            self._step_status[workflow_id][step_id] = status
        remaining = len(self._step_status[workflow_id])
        if status != common.WorkflowStatus.RUNNING:
            self._step_output_cache.pop((workflow_id, step_id), None)

        if status != common.WorkflowStatus.FAILED and remaining != 0:
            return

        if status == common.WorkflowStatus.FAILED:
            if workflow_id in self._workflow_outputs:
                cancel_job(self._workflow_outputs.pop(workflow_id).output)
            self._update_workflow_status(workflow_id,
                                         common.WorkflowStatus.FAILED)
            self._step_status.pop(workflow_id)
        else:
            self._update_workflow_status(workflow_id,
                                         common.WorkflowStatus.SUCCESSFUL)
            self._step_status.pop(workflow_id)
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        wf_store.save_workflow_postrun_metadata({"end_time": time.time()})
Example #5
def workflow_state_from_storage(
        workflow_id: str, task_id: Optional[TaskID]) -> WorkflowExecutionState:
    """Construct a workflow execution state that recovers the workflow.

    Tasks that already have a valid checkpointed output are not re-executed;
    their checkpoints are referenced directly.

    Args:
        workflow_id: The ID of the workflow.
        task_id: The ID of the output task. If None, it will be the entrypoint
            of the workflow.

    Returns:
        The workflow execution state that recovers the workflow, reusing
            checkpointed task outputs where available.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)
    if task_id is None:
        task_id = reader.get_entrypoint_step_id()

    # Construct the workflow execution state.
    state = WorkflowExecutionState(output_task_id=task_id)

    visited_tasks = set()
    dag_visit_queue = deque([task_id])
    with serialization.objectref_cache():
        while dag_visit_queue:
            task_id: TaskID = dag_visit_queue.popleft()
            if task_id in visited_tasks:
                continue
            visited_tasks.add(task_id)
            r = reader.inspect_step(task_id)
            if not r.is_recoverable():
                raise WorkflowStepNotRecoverableError(task_id)
            if r.output_object_valid:
                target = state.continuation_root.get(task_id, task_id)
                state.checkpoint_map[target] = WorkflowRef(task_id)
                continue
            if isinstance(r.output_step_id, str):
                # no input dependencies here because the task has already
                # returned a continuation
                state.upstream_dependencies[task_id] = []
                state.append_continuation(task_id, r.output_step_id)
                dag_visit_queue.append(r.output_step_id)
                continue
            # transfer task info to state
            state.add_dependencies(task_id, r.workflow_refs)
            state.task_input_args[task_id] = reader.load_step_args(task_id)
            # TODO(suquark): Though not strictly necessary, for completeness
            #  we may also load the task name and metadata.
            state.tasks[task_id] = Task(
                name="",
                options=r.step_options,
                user_metadata={},
                func_body=reader.load_step_func_body(task_id),
            )

            dag_visit_queue.extend(r.workflow_refs)

    return state
Example #6
def _resume_workflow_step_executor(
        workflow_id: str, step_id: "StepID", store_url: str,
        current_output: List[ray.ObjectRef]
) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    # TODO (yic): We need better dependency management for virtual actors.
    # The current output is always empty for a normal workflow. For a
    # virtual actor, a non-empty output means the previous job is still
    # running, which is a problematic state.
    for ref in current_output:
        try:
            while isinstance(ref, ray.ObjectRef):
                ref = ray.get(ref)
        except Exception:
            pass
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        r = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.workflow.step_executor import execute_workflow
            result = execute_workflow(r, last_step_of_workflow=True)
            return result.persisted_output, result.volatile_output
    assert isinstance(r, StepID)
    return wf_store.load_step_output(r), None
Example #7
    def gen_step_id(self, workflow_id: str, step_name: str) -> str:
        wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
        idx = wf_store.gen_step_id(step_name)
        if idx == 0:
            return step_name
        else:
            return f"{step_name}_{idx}"
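The naming scheme here keeps the bare name for the first step and appends a numeric suffix for duplicates. A self-contained sketch of the same scheme, using an in-memory counter in place of WorkflowStorage (illustrative only):

from collections import defaultdict

_name_counters = defaultdict(int)

def gen_step_id(step_name: str) -> str:
    # First occurrence keeps the bare name; duplicates get "_1", "_2", ...
    idx = _name_counters[step_name]
    _name_counters[step_name] += 1
    return step_name if idx == 0 else f"{step_name}_{idx}"

assert gen_step_id("fetch") == "fetch"
assert gen_step_id("fetch") == "fetch_1"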
Example #8
File: recovery.py Project: tchordia/ray
def get_latest_output(workflow_id: str) -> Any:
    """Get the latest output of a workflow. This function is intended to be
    used by readonly virtual actors. To resume a workflow,
    `resume_workflow_job` should be used instead.

    Args:
        workflow_id: The ID of the workflow.

    Returns:
        The output of the workflow.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)
    try:
        step_id: StepID = reader.get_latest_progress()
        while True:
            result: workflow_storage.StepInspectResult = reader.inspect_step(
                step_id)
            if result.output_object_valid:
                # we already have the output
                return reader.load_step_output(step_id)
            if isinstance(result.output_step_id, str):
                step_id = result.output_step_id
            else:
                raise ValueError(
                    "Workflow output does not exist or is not valid.")
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e
Example #9
def test_wait_recovery_step_id(workflow_start_regular_shared):
    # This test ensures that the workflow reuses the original directory and
    # step id for "workflow.wait" during recovery.

    @workflow.step
    def identity(x: int):
        # block the step by a global mark
        assert utils.check_global_mark()
        return x

    w = workflow.wait([identity.step(42)], num_returns=1, timeout=None)
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = w.run(workflow_id="test_wait_recovery_step_id")
    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("test_wait_recovery_step_id"))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage

    global_storage = storage.get_global_storage()
    wf_storage = workflow_storage.WorkflowStorage("test_wait_recovery_step_id",
                                                  global_storage)
    index = wf_storage.gen_step_id("workflow.wait")
    # no new step id
    assert index <= 1
Example #10
    def get_output(self, workflow_id: str,
                   name: Optional[str]) -> WorkflowStaticRef:
        """Get the output of a running workflow.

        Args:
            workflow_id: The ID of a workflow job.
            name: The name of a specific step whose output is requested.
                If None, the output of the entire workflow is returned.

        Returns:
            An object reference that can be used to retrieve the
            workflow result.
        """
        if workflow_id in self._workflow_outputs and name is None:
            return self._workflow_outputs[workflow_id].output
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        status = wf_store.load_workflow_status()
        if status == common.WorkflowStatus.NONE:
            raise ValueError(f"No such workflow {workflow_id}")
        if status == common.WorkflowStatus.CANCELED:
            raise ValueError(f"Workflow {workflow_id} is canceled")
        if name is None:
            # For a resumable workflow, the workflow result is not ready.
            # It has to be resumed first.
            if status == common.WorkflowStatus.RESUMABLE:
                raise ValueError(
                    f"Workflow {workflow_id} is in resumable status, "
                    "please resume it")
            step_id = wf_store.get_entrypoint_step_id()
        else:
            step_id = name
            output = self.get_cached_step_output(workflow_id, step_id)
            if output is not None:
                return WorkflowStaticRef.from_output(step_id, output)

        @ray.remote
        def load(wf_store, workflow_id, step_id):
            result = wf_store.inspect_step(step_id)
            if result.output_object_valid:
                # we already have the output
                return wf_store.load_step_output(step_id)
            if isinstance(result.output_step_id, str):
                actor = get_management_actor()
                return WorkflowStaticRef.from_output(
                    result.output_step_id,
                    actor.get_output.remote(workflow_id,
                                            result.output_step_id),
                )
            raise ValueError(f"Cannot load output from step id {step_id} "
                             f"in workflow {workflow_id}")

        return WorkflowStaticRef.from_output(
            step_id,
            load.remote(wf_store, workflow_id, step_id),
        )
Example #11
def test_checkpoint_dag_full(workflow_start_regular_shared):
    outputs = workflow.create(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(True)
    ).run(workflow_id="checkpoint_whole")
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #12
def load_step_output_from_storage(workflow_id: str, task_id: Optional[TaskID]):
    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    tid = wf_store.inspect_output(task_id)
    if tid is not None:
        return wf_store.load_step_output(tid)
    # TODO(suquark): Unify the error from "workflow.get_output" & "workflow.run_async".
    # Currently they could be different, because "workflow.get_output" can
    # get the output from a stopped workflow, and it does not make sense to
    # raise "WorkflowExecutionError" when the workflow is not running.
    if task_id is not None:
        raise ValueError(
            f"Cannot load output from task id '{task_id}' in workflow '{workflow_id}'"
        )
    else:
        raise ValueError(f"Cannot load output from workflow '{workflow_id}'")
Example #13
    async def cancel_workflow(self, workflow_id: str) -> None:
        """Cancel workflow execution."""
        if workflow_id in self._workflow_executors:
            executor = self._workflow_executors[workflow_id]
            fut = executor.get_task_output_async(executor.output_task_id)
            executor.cancel()
            try:
                # Wait until cancelled, otherwise workflow status may not
                # get updated after "workflow.cancel()" is called.
                await fut
            except WorkflowCancellationError:
                pass
        else:
            wf_store = workflow_storage.WorkflowStorage(workflow_id)
            wf_store.update_workflow_status(WorkflowStatus.CANCELED)
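The cancel-then-await pattern above generalizes beyond Ray: cancel, then await the output so cleanup (here, the status update) completes before returning, swallowing only the expected cancellation error. A sketch of the same pattern with plain asyncio primitives, not Ray APIs:

import asyncio

async def cancel_and_wait(task: asyncio.Task) -> None:
    task.cancel()
    try:
        # Await the task so its cleanup runs before the caller observes
        # the cancellation.
        await task
    except asyncio.CancelledError:
        pass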
Example #14
def test_checkpoint_dag_skip_partial(workflow_start_regular_shared):
    outputs = workflow.run(
        checkpoint_dag.options(**workflow.options(
            name="checkpoint_dag")).bind(False),
        workflow_id="checkpoint_partial",
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = workflow.resume("checkpoint_partial")
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #15
def test_checkpoint_dag_full(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = utils.run_workflow_dag_with_options(
        checkpoint_dag, (True,), workflow_id="checkpoint_whole", name="checkpoint_dag"
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #16
def _put_helper(identifier: str, obj: Any, workflow_id: str) -> None:
    # TODO (Alex): This check isn't sufficient, it only works for directly
    # nested object refs.
    if isinstance(obj, ray.ObjectRef):
        raise NotImplementedError(
            "Workflow does not support checkpointing nested object references yet."
        )
    key = _obj_id_to_key(identifier)

    dump_to_storage(
        key,
        obj,
        workflow_id,
        workflow_storage.WorkflowStorage(workflow_id),
        update_existing=False,
    )
Example #17
def test_checkpoint_dag_skip_partial(workflow_start_regular):
    global_storage = storage.get_global_storage()

    outputs = (
        checkpoint_dag.options(name="checkpoint_dag")
        .step(False)
        .run(workflow_id="checkpoint_partial")
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_partial"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
Example #18
    def run_or_resume(
            self,
            job_id: str,
            workflow_id: str,
            ignore_existing: bool = False) -> "WorkflowExecutionResult":
        """Run or resume a workflow.

        Args:
            job_id: The ID of the job that submits the workflow execution.
            workflow_id: The ID of the workflow.
            ignore_existing: Whether to ignore an existing output. When set
                to False, raise an exception if an output for this workflow
                ID already exists.

        Returns:
            Workflow execution result that contains the state and output.
        """
        if workflow_id in self._workflow_outputs and not ignore_existing:
            raise RuntimeError(
                f"The output of workflow[id={workflow_id}] already exists.")
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        workflow_prerun_metadata = {"start_time": time.time()}
        wf_store.save_workflow_prerun_metadata(workflow_prerun_metadata)
        step_id = wf_store.get_entrypoint_step_id()
        try:
            current_output = self._workflow_outputs[workflow_id].output
        except KeyError:
            current_output = None
        result = recovery.resume_workflow_step(job_id, workflow_id, step_id,
                                               current_output)
        latest_output = LatestWorkflowOutput(result.persisted_output,
                                             workflow_id, step_id)
        self._workflow_outputs[workflow_id] = latest_output
        logger.info(f"run_or_resume: {workflow_id}, {step_id}, "
                    f"{result.persisted_output.ref}")
        self._step_output_cache[(workflow_id, step_id)] = latest_output

        self._update_workflow_status(workflow_id,
                                     common.WorkflowStatus.RUNNING)

        if workflow_id not in self._step_status:
            self._step_status[workflow_id] = {}
            logger.info(f"Workflow job [id={workflow_id}] started.")
        return result
Example #19
    def run_or_resume(
            self,
            workflow_id: str,
            ignore_existing: bool = False) -> "WorkflowExecutionResult":
        """Run or resume a workflow.

        Args:
            workflow_id: The ID of the workflow.
            ignore_existing: Whether to ignore an existing output. When set
                to False, raise an exception if an output for this workflow
                ID already exists.

        Returns:
            Workflow execution result that contains the state and output.
        """
        if workflow_id in self._workflow_outputs and not ignore_existing:
            raise RuntimeError(f"The output of workflow[id={workflow_id}] "
                               "already exists.")
        wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
        step_id = wf_store.get_entrypoint_step_id()
        try:
            current_output = self._workflow_outputs[workflow_id].output
        except KeyError:
            current_output = None
        result = recovery.resume_workflow_step(workflow_id, step_id,
                                               self._store.storage_url,
                                               current_output)
        latest_output = LatestWorkflowOutput(result.persisted_output,
                                             workflow_id, step_id)
        self._workflow_outputs[workflow_id] = latest_output
        logger.info(f"run_or_resume: {workflow_id}, {step_id}, "
                    f"{result.persisted_output}")
        self._step_output_cache[(workflow_id, step_id)] = latest_output

        wf_store.save_workflow_meta(
            common.WorkflowMetaData(common.WorkflowStatus.RUNNING))

        if workflow_id not in self._step_status:
            self._step_status[workflow_id] = {}
            logger.info(f"Workflow job [id={workflow_id}] started.")
        return result
Example #20
    def submit_workflow(
        self,
        workflow_id: str,
        state: WorkflowExecutionState,
        ignore_existing: bool = False,
    ):
        """Submit workflow. A submitted workflow can be executed later.

        Args:
            workflow_id: ID of the workflow.
            state: The initial state of the workflow.
            ignore_existing: Ignore existing executed workflows.
        """
        if workflow_id in self._workflow_executors:
            raise RuntimeError(
                f"Workflow[id={workflow_id}] is being executed.")
        if workflow_id in self._executed_workflows and not ignore_existing:
            raise RuntimeError(
                f"Workflow[id={workflow_id}] has been executed.")

        if state.output_task_id is None:
            raise ValueError(
                "No root DAG specified that generates output for the workflow."
            )

        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        if (self._max_running_workflows != -1 and
                len(self._running_workflows) >= self._max_running_workflows):
            try:
                self._workflow_queue.put_nowait(workflow_id)
                self._queued_workflows[workflow_id] = asyncio.Future()
                wf_store.update_workflow_status(WorkflowStatus.PENDING)
            except queue.Full:
                # override with our error message
                raise queue.Full("The workflow queue is full.") from None
        else:
            self._running_workflows.add(workflow_id)
            wf_store.update_workflow_status(WorkflowStatus.RUNNING)
        # initialize executor
        self._workflow_executors[workflow_id] = WorkflowExecutor(state)
Example #21
    def actor_ready(self, actor_id: str) -> ray.ObjectRef:
        """Check if a workflow virtual actor is fully initialized.

        Args:
            actor_id: The ID of a workflow virtual actor.

        Returns:
            A future object that represents the state of the actor.
            A successful "ray.get" on the object indicates that the actor
            initialized successfully.
        """
        ws = workflow_storage.WorkflowStorage(actor_id, self._store)
        try:
            step_id = ws.get_entrypoint_step_id()
            output_exists = ws.inspect_step(step_id).output_object_valid
            if output_exists:
                return ray.put(None)
        except Exception:
            pass
        if actor_id not in self._actor_initialized:
            raise ValueError(f"Actor '{actor_id}' has not been created, or "
                             "it has failed before initialization.")
        return self._actor_initialized[actor_id]
Example #22
    async def execute_workflow(
        self,
        job_id: str,
        context: WorkflowStepContext,
    ) -> ray.ObjectRef:
        """Execute a submitted workflow.

        Args:
            job_id: The ID of the job for logging.
            context: The execution context.
        Returns:
            An object ref that represents the result.
        """
        workflow_id = context.workflow_id
        if workflow_id not in self._workflow_executors:
            raise RuntimeError(
                f"Workflow '{workflow_id}' has not been submitted.")

        pending_fut = self._queued_workflows.get(workflow_id)
        if pending_fut is not None:
            await pending_fut  # wait until this workflow is ready to go

        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        executor = self._workflow_executors[workflow_id]
        try:
            await executor.run_until_complete(job_id, context, wf_store)
            return await self.get_output(workflow_id, executor.output_task_id)
        finally:
            self._workflow_executors.pop(workflow_id)
            self._running_workflows.remove(workflow_id)
            self._executed_workflows.add(workflow_id)
            if not self._workflow_queue.empty():
                # schedule another workflow from the pending queue
                next_workflow_id = self._workflow_queue.get_nowait()
                self._running_workflows.add(next_workflow_id)
                fut = self._queued_workflows.pop(next_workflow_id)
                fut.set_result(None)
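Example #20 and Example #22 together implement a simple admission-control gate: submit_workflow parks workflows beyond the concurrency limit on an asyncio.Future, and execute_workflow resolves one parked future whenever a running workflow finishes. A self-contained sketch of that gate, with hypothetical names:

import asyncio
from collections import deque

class AdmissionGate:
    """Limits how many jobs run at once; extra jobs wait in FIFO order."""

    def __init__(self, limit: int):
        self._limit = limit
        self._running = 0
        self._waiters: deque = deque()

    async def acquire(self) -> None:
        if self._running >= self._limit:
            fut = asyncio.get_running_loop().create_future()
            self._waiters.append(fut)
            await fut  # parked until release() resolves this future
        self._running += 1

    def release(self) -> None:
        self._running -= 1
        if self._waiters:
            # Wake exactly one parked job, mirroring execute_workflow.
            self._waiters.popleft().set_result(None)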
Example #23
    async def execute_workflow(
        self,
        job_id: str,
        context: WorkflowStepContext,
    ) -> ray.ObjectRef:
        """Execute a submitted workflow.

        Args:
            job_id: The ID of the job for logging.
            context: The execution context.
        Returns:
            An object ref that represents the result.
        """
        workflow_id = context.workflow_id
        if workflow_id not in self._workflow_executors:
            raise RuntimeError(f"Workflow '{workflow_id}' has not been submitted.")
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        executor = self._workflow_executors[workflow_id]
        try:
            await executor.run_until_complete(job_id, context, wf_store)
            return await self.get_output(workflow_id, executor.output_task_id)
        finally:
            self._workflow_executors.pop(workflow_id)
            self._executed_workflows.add(workflow_id)
Example #24
File: recovery.py Project: tchordia/ray
def _construct_resume_workflow_from_step(
        workflow_id: str, step_id: StepID) -> Union[Workflow, Any]:
    """Try to construct a workflow (step) that recovers the workflow step.
    If the workflow step already has an output checkpointing file, we return
    the workflow step id instead.

    Args:
        workflow_id: The ID of the workflow.
        step_id: The ID of the step we want to recover.

    Returns:
        A workflow that recovers the step, or the output of the step
            if it has been checkpointed.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)

    # Step 1: construct the dependency graph of the DAG (BFS)
    inspect_results = {}
    dependency_map = defaultdict(list)
    num_in_edges = {}

    dag_visit_queue = deque([step_id])
    while dag_visit_queue:
        s: StepID = dag_visit_queue.popleft()
        if s in inspect_results:
            continue
        r = reader.inspect_step(s)
        inspect_results[s] = r
        if not r.is_recoverable():
            raise WorkflowStepNotRecoverableError(s)
        if r.output_object_valid:
            deps = []
        elif isinstance(r.output_step_id, str):
            deps = [r.output_step_id]
        else:
            deps = r.workflows
        for w in deps:
            dependency_map[w].append(s)
        num_in_edges[s] = len(deps)
        dag_visit_queue.extend(deps)

    # Step 2: topological sort to determine the execution order (Kahn's algorithm)
    execution_queue: List[StepID] = []

    start_nodes = deque(k for k, v in num_in_edges.items() if v == 0)
    while start_nodes:
        n = start_nodes.popleft()
        execution_queue.append(n)
        for m in dependency_map[n]:
            num_in_edges[m] -= 1
            assert num_in_edges[m] >= 0, (m, n)
            if num_in_edges[m] == 0:
                start_nodes.append(m)

    # Step 3: recover the workflow by the order of the execution queue
    with serialization.objectref_cache():
        # "input_map" is a context storing inputs that have already been
        # loaded. This context is important for deduplicating step inputs.
        input_map: Dict[StepID, Any] = {}

        for _step_id in execution_queue:
            result = inspect_results[_step_id]
            if result.output_object_valid:
                input_map[_step_id] = reader.load_step_output(_step_id)
                continue
            if isinstance(result.output_step_id, str):
                input_map[_step_id] = input_map[result.output_step_id]
                continue

            # Process the wait step as a special case.
            if result.step_options.step_type == StepType.WAIT:
                wait_input_workflows = []
                for w in result.workflows:
                    output = input_map[w]
                    if isinstance(output, Workflow):
                        wait_input_workflows.append(output)
                    else:
                        # Simulate a workflow with a workflow reference so it could be
                        # used directly by 'workflow.wait'.
                        static_ref = WorkflowStaticRef(step_id=w,
                                                       ref=ray.put(output))
                        wait_input_workflows.append(
                            Workflow.from_ref(static_ref))
                recovery_workflow = ray.workflow.wait(
                    wait_input_workflows,
                    **result.step_options.ray_options.get("wait_options", {}),
                )
            else:
                args, kwargs = reader.load_step_args(
                    _step_id,
                    workflows=[input_map[w] for w in result.workflows],
                    workflow_refs=list(map(WorkflowRef, result.workflow_refs)),
                )
                func: Callable = reader.load_step_func_body(_step_id)
                # TODO(suquark): Use an alternative function when "workflow.step"
                # is fully deprecated.
                recovery_workflow = ray.workflow.step(func).step(
                    *args, **kwargs)

            # override step_options
            recovery_workflow._step_id = _step_id
            recovery_workflow.data.step_options = result.step_options

            input_map[_step_id] = recovery_workflow

    # Step 4: return the output of the requested step
    return input_map[step_id]
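Step 2 above is Kahn's algorithm. A minimal standalone version over a generic dependency map, where deps[node] lists the prerequisites of node (illustrative, not Ray code):

from collections import defaultdict, deque
from typing import Dict, List

def topological_order(deps: Dict[str, List[str]]) -> List[str]:
    # dependents[p] lists the nodes that directly depend on p.
    dependents = defaultdict(list)
    in_degree: Dict[str, int] = {n: len(ps) for n, ps in deps.items()}
    for node, prereqs in deps.items():
        for p in prereqs:
            dependents[p].append(node)
            in_degree.setdefault(p, 0)
    order: List[str] = []
    ready = deque(n for n, d in in_degree.items() if d == 0)
    while ready:
        n = ready.popleft()
        order.append(n)
        for m in dependents[n]:
            in_degree[m] -= 1
            if in_degree[m] == 0:
                ready.append(m)
    if len(order) != len(in_degree):
        raise ValueError("dependency cycle detected")
    return order

assert topological_order({"c": ["a", "b"], "b": ["a"]}) == ["a", "b", "c"]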
Example #25
def test_workflow_storage(workflow_start_regular):
    workflow_id = test_workflow_storage.__name__
    wf_storage = workflow_storage.WorkflowStorage(workflow_id)
    step_id = "some_step"
    step_options = WorkflowStepRuntimeOptions.make(step_type=StepType.FUNCTION)
    input_metadata = {
        "name": "test_basic_workflows.append1",
        "workflows": ["def"],
        "workflow_refs": ["some_ref"],
        "step_options": step_options.to_dict(),
    }
    output_metadata = {
        "output_step_id": "a12423",
        "dynamic_output_step_id": "b1234"
    }
    root_output_metadata = {"output_step_id": "c123"}
    flattened_args = [
        signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "2", "k", b"543"
    ]
    args = signature.recover_args(flattened_args)
    output = ["the_answer"]
    object_resolved = 42
    obj_ref = ray.put(object_resolved)

    # test basics
    wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                    input_metadata, True)

    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    wf_storage._put(wf_storage._key_step_args(step_id), flattened_args)

    wf_storage._put(wf_storage._key_obj_id(obj_ref.hex()), ray.get(obj_ref))
    wf_storage._put(wf_storage._key_step_output_metadata(step_id),
                    output_metadata, True)
    wf_storage._put(wf_storage._key_step_output_metadata(""),
                    root_output_metadata, True)
    wf_storage._put(wf_storage._key_step_output(step_id), output)

    assert wf_storage.load_step_output(step_id) == output
    assert wf_storage.load_step_args(step_id, [], []) == args
    assert wf_storage.load_step_func_body(step_id)(33) == 34
    assert ray.get(wf_storage.load_object_ref(
        obj_ref.hex())) == object_resolved

    # Test the S3 path. We hardcode the path here to make sure the S3
    # path is parsed correctly.
    from ray._private.storage import _storage_uri

    if _storage_uri.startswith("s3://"):
        assert wf_storage._get("steps/outputs.json",
                               True) == root_output_metadata

    # test "inspect_step"
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_object_valid=True)
    assert inspect_result.is_recoverable()

    step_id = "some_step2"
    wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                    input_metadata, True)
    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    wf_storage._put(wf_storage._key_step_args(step_id), args)
    wf_storage._put(wf_storage._key_step_output_metadata(step_id),
                    output_metadata, True)

    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_step_id=output_metadata["dynamic_output_step_id"])
    assert inspect_result.is_recoverable()

    step_id = "some_step3"
    wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                    input_metadata, True)
    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    wf_storage._put(wf_storage._key_step_args(step_id), args)
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        args_valid=True,
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert inspect_result.is_recoverable()

    step_id = "some_step4"
    wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                    input_metadata, True)

    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert not inspect_result.is_recoverable()

    step_id = "some_step5"
    wf_storage._put(wf_storage._key_step_input_metadata(step_id),
                    input_metadata, True)

    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert not inspect_result.is_recoverable()

    step_id = "some_step6"
    inspect_result = wf_storage.inspect_step(step_id)
    print(inspect_result)
    assert inspect_result == workflow_storage.StepInspectResult()
    assert not inspect_result.is_recoverable()
Example #26
    def cancel_workflow(self, workflow_id: str) -> None:
        self._step_status.pop(workflow_id)
        cancel_job(self._workflow_outputs.pop(workflow_id).output)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
        wf_store.save_workflow_meta(
            common.WorkflowMetaData(common.WorkflowStatus.CANCELED))
Example #27
def _load_ref_helper(key: str, workflow_id: str):
    # TODO(Alex): We should stream the data directly into `cloudpickle.load`.
    storage = workflow_storage.WorkflowStorage(workflow_id)
    return storage._get(key)
Example #28
def test_workflow_storage(workflow_start_regular):
    workflow_id = test_workflow_storage.__name__
    wf_storage = workflow_storage.WorkflowStorage(workflow_id,
                                                  storage.get_global_storage())
    step_id = "some_step"
    step_options = WorkflowStepRuntimeOptions(
        step_type=StepType.FUNCTION,
        catch_exceptions=False,
        max_retries=1,
        ray_options={})
    input_metadata = {
        "name": "test_basic_workflows.append1",
        "workflows": ["def"],
        "workflow_refs": ["some_ref"],
        "step_options": step_options.to_dict(),
    }
    output_metadata = {
        "output_step_id": "a12423",
        "dynamic_output_step_id": "b1234"
    }
    flattened_args = [
        signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "2", "k", b"543"
    ]
    args = signature.recover_args(flattened_args)
    output = ["the_answer"]
    object_resolved = 42
    obj_ref = ray.put(object_resolved)

    # test basics
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_args(step_id), flattened_args))

    asyncio_run(
        wf_storage._put(
            wf_storage._key_obj_id(obj_ref.hex()), ray.get(obj_ref)))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_output_metadata(step_id), output_metadata,
            True))
    asyncio_run(wf_storage._put(wf_storage._key_step_output(step_id), output))

    assert wf_storage.load_step_output(step_id) == output
    assert wf_storage.load_step_args(step_id, [], []) == args
    assert wf_storage.load_step_func_body(step_id)(33) == 34
    assert ray.get(wf_storage.load_object_ref(
        obj_ref.hex())) == object_resolved

    # test "inspect_step"
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_object_valid=True)
    assert inspect_result.is_recoverable()

    step_id = "some_step2"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_output_metadata(step_id), output_metadata,
            True))

    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_step_id=output_metadata["dynamic_output_step_id"])
    assert inspect_result.is_recoverable()

    step_id = "some_step3"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))
    inspect_result = wf_storage.inspect_step(step_id)
    step_options = WorkflowStepRuntimeOptions(
        step_type=StepType.FUNCTION,
        catch_exceptions=False,
        max_retries=1,
        ray_options={})
    assert inspect_result == workflow_storage.StepInspectResult(
        args_valid=True,
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert inspect_result.is_recoverable()

    step_id = "some_step4"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_function_body(step_id), some_func))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert not inspect_result.is_recoverable()

    step_id = "some_step5"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata,
            True))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert not inspect_result.is_recoverable()

    step_id = "some_step6"
    inspect_result = wf_storage.inspect_step(step_id)
    print(inspect_result)
    assert inspect_result == workflow_storage.StepInspectResult()
    assert not inspect_result.is_recoverable()
Example #29
    def _update_workflow_status(self, workflow_id: str,
                                status: common.WorkflowStatus):
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        wf_store.update_workflow_status(status)
        self._workflow_status[workflow_id] = status