def get_output(workflow_id: str, name: Optional[str]) -> ray.ObjectRef:
    """Get the output of a running workflow.
    See "api.get_output()" for details.
    """
    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()
    try:
        workflow_manager = get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e

    try:
        # check storage first
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        tid = wf_store.inspect_output(name)
        if tid is not None:
            return load_step_output_from_storage.remote(workflow_id, name)
    except ValueError:
        pass

    return workflow_manager.get_output.remote(workflow_id, name)
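# A minimal usage sketch of the public entry points this helper backs. This is
# illustrative only: it assumes the DAG-based API that appears elsewhere in
# these snippets (workflow.create / run_async / workflow.get_output), assumes a
# Ray cluster with workflow storage is already set up, and the task and
# workflow names ("add", "demo_workflow") are placeholders.
import ray
from ray import workflow


@ray.remote
def add(a: int, b: int) -> int:
    return a + b


# Start the workflow without blocking, then fetch the (possibly still running)
# workflow's output as an object reference.
workflow.create(add.bind(1, 2)).run_async(workflow_id="demo_workflow")
result_ref = workflow.get_output("demo_workflow")
assert ray.get(result_ref) == 3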
async def save_objectref(
        self, ref_tuple: Tuple[ray.ObjectRef],
        workflow_id: "str") -> Tuple[List[str], ray.ObjectRef]:
    """Serialize and upload an object reference exactly once.

    Args:
        ref_tuple: A 1-element tuple which wraps the reference.

    Returns:
        A pair. The first element is the paths the ref will be uploaded to.
        The second is an object reference to the upload task.
    """
    wf_storage = workflow_storage.WorkflowStorage(workflow_id, self._storage)
    ref, = ref_tuple
    # Use the hex as the key to avoid holding a reference to the object.
    key = ref.hex()
    if key not in self._uploads:
        # TODO(Alex): We should probably eventually free these refs.
        identifier_ref = common.calculate_identifier.remote(ref)
        upload_task = _put_helper.remote(identifier_ref, ref, wf_storage)
        self._uploads[key] = Upload(identifier_ref, upload_task)
        self._num_uploads += 1

    info = self._uploads[key]
    identifier = await info.identifier_ref
    paths = wf_storage._key_obj_id(identifier)
    return paths, info.upload_task
def update_step_status(self, workflow_id: str, step_id: str,
                       status: common.WorkflowStatus,
                       outputs: List[ray.ObjectRef]):
    # Note: For virtual actor, we could add more steps even if
    # the workflow finishes.
    self._step_status.setdefault(workflow_id, {})
    if status == common.WorkflowStatus.SUCCESSFUL:
        self._step_status[workflow_id].pop(step_id, None)
    else:
        self._step_status.setdefault(workflow_id, {})[step_id] = status
    remaining = len(self._step_status[workflow_id])
    if status != common.WorkflowStatus.RUNNING:
        self._step_output_cache.pop((workflow_id, step_id), None)

    if status != common.WorkflowStatus.FAILED and remaining != 0:
        return

    wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
    if status == common.WorkflowStatus.FAILED:
        if workflow_id in self._workflow_outputs:
            cancel_job(self._workflow_outputs.pop(workflow_id).output)
        wf_store.save_workflow_meta(
            common.WorkflowMetaData(common.WorkflowStatus.FAILED))
        self._step_status.pop(workflow_id)
    else:
        wf_store.save_workflow_meta(
            common.WorkflowMetaData(common.WorkflowStatus.SUCCESSFUL))
        self._step_status.pop(workflow_id)
def update_step_status(
    self,
    workflow_id: str,
    step_id: str,
    status: common.WorkflowStatus,
    outputs: List[WorkflowStaticRef],
):
    # Note: For virtual actor, we could add more steps even if
    # the workflow finishes.
    self._step_status.setdefault(workflow_id, {})
    if status == common.WorkflowStatus.SUCCESSFUL:
        self._step_status[workflow_id].pop(step_id, None)
    else:
        self._step_status.setdefault(workflow_id, {})[step_id] = status
    remaining = len(self._step_status[workflow_id])
    if status != common.WorkflowStatus.RUNNING:
        self._step_output_cache.pop((workflow_id, step_id), None)

    if status != common.WorkflowStatus.FAILED and remaining != 0:
        return

    if status == common.WorkflowStatus.FAILED:
        if workflow_id in self._workflow_outputs:
            cancel_job(self._workflow_outputs.pop(workflow_id).output)
        self._update_workflow_status(workflow_id, common.WorkflowStatus.FAILED)
        self._step_status.pop(workflow_id)
    else:
        self._update_workflow_status(workflow_id, common.WorkflowStatus.SUCCESSFUL)
        self._step_status.pop(workflow_id)

    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    wf_store.save_workflow_postrun_metadata({"end_time": time.time()})
def workflow_state_from_storage(
    workflow_id: str, task_id: Optional[TaskID]
) -> WorkflowExecutionState:
    """Construct a workflow execution state that recovers the workflow task
    from storage. If a task already has a checkpointed output, the state
    points at the checkpoint instead of re-executing the task.

    Args:
        workflow_id: The ID of the workflow.
        task_id: The ID of the output task. If None, it will be the entrypoint
            of the workflow.

    Returns:
        The workflow execution state used for recovery.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)
    if task_id is None:
        task_id = reader.get_entrypoint_step_id()

    # Construct the workflow execution state.
    state = WorkflowExecutionState(output_task_id=task_id)

    visited_tasks = set()
    dag_visit_queue = deque([task_id])
    with serialization.objectref_cache():
        while dag_visit_queue:
            task_id: TaskID = dag_visit_queue.popleft()
            if task_id in visited_tasks:
                continue
            visited_tasks.add(task_id)
            r = reader.inspect_step(task_id)
            if not r.is_recoverable():
                raise WorkflowStepNotRecoverableError(task_id)

            if r.output_object_valid:
                target = state.continuation_root.get(task_id, task_id)
                state.checkpoint_map[target] = WorkflowRef(task_id)
                continue

            if isinstance(r.output_step_id, str):
                # no input dependencies here because the task has already
                # returned a continuation
                state.upstream_dependencies[task_id] = []
                state.append_continuation(task_id, r.output_step_id)
                dag_visit_queue.append(r.output_step_id)
                continue

            # transfer task info to state
            state.add_dependencies(task_id, r.workflow_refs)
            state.task_input_args[task_id] = reader.load_step_args(task_id)

            # TODO(suquark): although not necessary, for completeness we may
            # also load name and metadata.
            state.tasks[task_id] = Task(
                name="",
                options=r.step_options,
                user_metadata={},
                func_body=reader.load_step_func_body(task_id),
            )

            dag_visit_queue.extend(r.workflow_refs)

    return state
def _resume_workflow_step_executor(
        workflow_id: str, step_id: "StepID", store_url: str,
        current_output: List[ray.ObjectRef]
) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    # TODO (yic): We need better dependency management for virtual actors.
    # The current output will always be empty for a normal workflow.
    # For a virtual actor, if it's not empty, it means the previous job is
    # still running, which is a bad situation.
    for ref in current_output:
        try:
            while isinstance(ref, ray.ObjectRef):
                ref = ray.get(ref)
        except Exception:
            pass
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        r = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.workflow.step_executor import execute_workflow
            result = execute_workflow(r, last_step_of_workflow=True)
            return result.persisted_output, result.volatile_output
    assert isinstance(r, StepID)
    return wf_store.load_step_output(r), None
def gen_step_id(self, workflow_id: str, step_name: str) -> str:
    wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
    idx = wf_store.gen_step_id(step_name)
    if idx == 0:
        return step_name
    else:
        return f"{step_name}_{idx}"
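# Illustrative only: a tiny standalone mock of the naming rule used by
# gen_step_id above. The real counter is persisted by WorkflowStorage; here a
# plain dict stands in for it (an assumption made for this sketch), assuming
# the stored counter starts at 0 and increments per duplicate step name.
from collections import defaultdict

_name_counter = defaultdict(int)


def mock_gen_step_id(step_name: str) -> str:
    idx = _name_counter[step_name]
    _name_counter[step_name] += 1
    return step_name if idx == 0 else f"{step_name}_{idx}"


assert mock_gen_step_id("fetch_data") == "fetch_data"
assert mock_gen_step_id("fetch_data") == "fetch_data_1"
assert mock_gen_step_id("fetch_data") == "fetch_data_2"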
def get_latest_output(workflow_id: str) -> Any:
    """Get the latest output of a workflow.

    This function is intended to be used by readonly virtual actors. To
    resume a workflow, `resume_workflow_job` should be used instead.

    Args:
        workflow_id: The ID of the workflow.

    Returns:
        The output of the workflow.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)
    try:
        step_id: StepID = reader.get_latest_progress()
        while True:
            result: workflow_storage.StepInspectResult = reader.inspect_step(
                step_id)
            if result.output_object_valid:
                # we already have the output
                return reader.load_step_output(step_id)
            if isinstance(result.output_step_id, str):
                step_id = result.output_step_id
            else:
                raise ValueError(
                    "Workflow output does not exist or is not valid.")
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e
def test_wait_recovery_step_id(workflow_start_regular_shared):
    # This test ensures that the workflow reuses the original directory and
    # step id for "workflow.wait" during recovery.

    @workflow.step
    def identity(x: int):
        # block the step by a global mark
        assert utils.check_global_mark()
        return x

    w = workflow.wait([identity.step(42)], num_returns=1, timeout=None)
    utils.unset_global_mark()
    with pytest.raises(RaySystemError):
        _ = w.run(workflow_id="test_wait_recovery_step_id")
    utils.set_global_mark()
    ready, unready = ray.get(workflow.resume("test_wait_recovery_step_id"))
    assert ready == [42]

    from ray.workflow import storage, workflow_storage

    global_storage = storage.get_global_storage()
    wf_storage = workflow_storage.WorkflowStorage("test_wait_recovery_step_id",
                                                  global_storage)
    index = wf_storage.gen_step_id("workflow.wait")
    # no new step id
    assert index <= 1
def get_output(self, workflow_id: str,
               name: Optional[str]) -> WorkflowStaticRef:
    """Get the output of a running workflow.

    Args:
        workflow_id: The ID of a workflow job.

    Returns:
        An object reference that can be used to retrieve the workflow result.
    """
    if workflow_id in self._workflow_outputs and name is None:
        return self._workflow_outputs[workflow_id].output

    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    status = wf_store.load_workflow_status()
    if status == common.WorkflowStatus.NONE:
        raise ValueError(f"No such workflow {workflow_id}")
    if status == common.WorkflowStatus.CANCELED:
        raise ValueError(f"Workflow {workflow_id} is canceled")
    if name is None:
        # For resumable workflow, the workflow result is not ready.
        # It has to be resumed first.
        if status == common.WorkflowStatus.RESUMABLE:
            raise ValueError(
                f"Workflow {workflow_id} is in resumable status, "
                "please resume it")

    if name is None:
        step_id = wf_store.get_entrypoint_step_id()
    else:
        step_id = name

    output = self.get_cached_step_output(workflow_id, step_id)
    if output is not None:
        return WorkflowStaticRef.from_output(step_id, output)

    @ray.remote
    def load(wf_store, workflow_id, step_id):
        result = wf_store.inspect_step(step_id)
        if result.output_object_valid:
            # we already have the output
            return wf_store.load_step_output(step_id)
        if isinstance(result.output_step_id, str):
            actor = get_management_actor()
            return WorkflowStaticRef.from_output(
                result.output_step_id,
                actor.get_output.remote(workflow_id, result.output_step_id),
            )
        raise ValueError(f"Cannot load output from step id {step_id} "
                         f"in workflow {workflow_id}")

    return WorkflowStaticRef.from_output(
        step_id,
        load.remote(wf_store, workflow_id, step_id),
    )
def test_checkpoint_dag_full(workflow_start_regular_shared):
    outputs = workflow.create(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(True)
    ).run(workflow_id="checkpoint_whole")
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def load_step_output_from_storage(workflow_id: str, task_id: Optional[TaskID]):
    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    tid = wf_store.inspect_output(task_id)
    if tid is not None:
        return wf_store.load_step_output(tid)
    # TODO(suquark): Unify the error from "workflow.get_output" &
    # "workflow.run_async". Currently they could be different, because
    # "workflow.get_output" can fetch the output of a stopped workflow, so it
    # does not make sense to raise "WorkflowExecutionError" when the workflow
    # is not running.
    if task_id is not None:
        raise ValueError(
            f"Cannot load output from task id '{task_id}' in workflow '{workflow_id}'"
        )
    else:
        raise ValueError(f"Cannot load output from workflow '{workflow_id}'")
async def cancel_workflow(self, workflow_id: str) -> None:
    """Cancel workflow execution."""
    if workflow_id in self._workflow_executors:
        executor = self._workflow_executors[workflow_id]
        fut = executor.get_task_output_async(executor.output_task_id)
        executor.cancel()
        try:
            # Wait until cancelled, otherwise workflow status may not
            # get updated after "workflow.cancel()" is called.
            await fut
        except WorkflowCancellationError:
            pass
    else:
        wf_store = workflow_storage.WorkflowStorage(workflow_id)
        wf_store.update_workflow_status(WorkflowStatus.CANCELED)
def test_checkpoint_dag_skip_partial(workflow_start_regular_shared):
    outputs = workflow.run(
        checkpoint_dag.options(**workflow.options(name="checkpoint_dag")).bind(False),
        workflow_id="checkpoint_partial",
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = workflow.resume("checkpoint_partial")
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial")
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def test_checkpoint_dag_full(workflow_start_regular):
    global_storage = storage.get_global_storage()
    outputs = utils.run_workflow_dag_with_options(
        checkpoint_dag, (True,), workflow_id="checkpoint_whole", name="checkpoint_dag"
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_whole"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_whole", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "identity", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def _put_helper(identifier: str, obj: Any, workflow_id: str) -> None:
    # TODO (Alex): This check isn't sufficient, it only works for directly
    # nested object refs.
    if isinstance(obj, ray.ObjectRef):
        raise NotImplementedError(
            "Workflow does not support checkpointing nested object references yet."
        )
    key = _obj_id_to_key(identifier)
    dump_to_storage(
        key,
        obj,
        workflow_id,
        workflow_storage.WorkflowStorage(workflow_id),
        update_existing=False,
    )
def test_checkpoint_dag_skip_partial(workflow_start_regular):
    global_storage = storage.get_global_storage()
    outputs = (
        checkpoint_dag.options(name="checkpoint_dag")
        .step(False)
        .run(workflow_id="checkpoint_partial")
    )
    assert np.isclose(outputs, 8388607.5)
    recovered = ray.get(workflow.resume("checkpoint_partial"))
    assert np.isclose(recovered, 8388607.5)

    wf_storage = workflow_storage.WorkflowStorage("checkpoint_partial", global_storage)
    _assert_step_checkpoints(wf_storage, "checkpoint_dag", mode="checkpointed")
    _assert_step_checkpoints(wf_storage, "large_input", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "identity", mode="output_skipped")
    _assert_step_checkpoints(wf_storage, "average", mode="checkpointed")
def run_or_resume(
        self, job_id: str, workflow_id: str,
        ignore_existing: bool = False) -> "WorkflowExecutionResult":
    """Run or resume a workflow.

    Args:
        job_id: The ID of the job that submits the workflow execution.
        workflow_id: The ID of the workflow.
        ignore_existing: If True, ignore an existing output of the workflow.
            If False, raise an exception when a workflow with this ID has
            already been run.

    Returns:
        Workflow execution result that contains the state and output.
    """
    if workflow_id in self._workflow_outputs and not ignore_existing:
        raise RuntimeError(
            f"The output of workflow[id={workflow_id}] already exists.")
    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    workflow_prerun_metadata = {"start_time": time.time()}
    wf_store.save_workflow_prerun_metadata(workflow_prerun_metadata)
    step_id = wf_store.get_entrypoint_step_id()
    try:
        current_output = self._workflow_outputs[workflow_id].output
    except KeyError:
        current_output = None
    result = recovery.resume_workflow_step(job_id, workflow_id, step_id,
                                           current_output)
    latest_output = LatestWorkflowOutput(result.persisted_output, workflow_id,
                                         step_id)
    self._workflow_outputs[workflow_id] = latest_output
    logger.info(f"run_or_resume: {workflow_id}, {step_id}, "
                f"{result.persisted_output.ref}")
    self._step_output_cache[(workflow_id, step_id)] = latest_output

    self._update_workflow_status(workflow_id, common.WorkflowStatus.RUNNING)

    if workflow_id not in self._step_status:
        self._step_status[workflow_id] = {}
        logger.info(f"Workflow job [id={workflow_id}] started.")
    return result
def run_or_resume(self, workflow_id: str,
                  ignore_existing: bool = False) -> "WorkflowExecutionResult":
    """Run or resume a workflow.

    Args:
        workflow_id: The ID of the workflow.
        ignore_existing: If True, ignore an existing output of the workflow.
            If False, raise an exception when a workflow with this ID has
            already been run.

    Returns:
        Workflow execution result that contains the state and output.
    """
    if workflow_id in self._workflow_outputs and not ignore_existing:
        raise RuntimeError(f"The output of workflow[id={workflow_id}] "
                           "already exists.")
    wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
    step_id = wf_store.get_entrypoint_step_id()
    try:
        current_output = self._workflow_outputs[workflow_id].output
    except KeyError:
        current_output = None
    result = recovery.resume_workflow_step(workflow_id, step_id,
                                           self._store.storage_url,
                                           current_output)
    latest_output = LatestWorkflowOutput(result.persisted_output, workflow_id,
                                         step_id)
    self._workflow_outputs[workflow_id] = latest_output
    print("run_or_resume: ", workflow_id, step_id, result.persisted_output)
    self._step_output_cache[(workflow_id, step_id)] = latest_output
    wf_store.save_workflow_meta(
        common.WorkflowMetaData(common.WorkflowStatus.RUNNING))
    if workflow_id not in self._step_status:
        self._step_status[workflow_id] = {}
        logger.info(f"Workflow job [id={workflow_id}] started.")
    return result
def submit_workflow(
    self,
    workflow_id: str,
    state: WorkflowExecutionState,
    ignore_existing: bool = False,
):
    """Submit a workflow. A submitted workflow can be executed later.

    Args:
        workflow_id: ID of the workflow.
        state: The initial state of the workflow.
        ignore_existing: Ignore existing executed workflows.
    """
    if workflow_id in self._workflow_executors:
        raise RuntimeError(f"Workflow[id={workflow_id}] is being executed.")
    if workflow_id in self._executed_workflows and not ignore_existing:
        raise RuntimeError(f"Workflow[id={workflow_id}] has been executed.")

    if state.output_task_id is None:
        raise ValueError(
            "No root DAG specified that generates output for the workflow.")

    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    if (self._max_running_workflows != -1
            and len(self._running_workflows) >= self._max_running_workflows):
        try:
            self._workflow_queue.put_nowait(workflow_id)
            self._queued_workflows[workflow_id] = asyncio.Future()
            wf_store.update_workflow_status(WorkflowStatus.PENDING)
        except queue.Full:
            # override with our error message
            raise queue.Full("Workflow queue is full.") from None
    else:
        self._running_workflows.add(workflow_id)
        wf_store.update_workflow_status(WorkflowStatus.RUNNING)
    # initialize executor
    self._workflow_executors[workflow_id] = WorkflowExecutor(state)
def actor_ready(self, actor_id: str) -> ray.ObjectRef:
    """Check if a workflow virtual actor is fully initialized.

    Args:
        actor_id: The ID of a workflow virtual actor.

    Returns:
        A future object that represents the state of the actor.
        A successful "ray.get" on the object indicates that the actor
        initialized successfully.
    """
    ws = workflow_storage.WorkflowStorage(actor_id, self._store)
    try:
        step_id = ws.get_entrypoint_step_id()
        output_exists = ws.inspect_step(step_id).output_object_valid
        if output_exists:
            return ray.put(None)
    except Exception:
        pass
    if actor_id not in self._actor_initialized:
        raise ValueError(f"Actor '{actor_id}' has not been created, or "
                         "it has failed before initialization.")
    return self._actor_initialized[actor_id]
async def execute_workflow(
    self,
    job_id: str,
    context: WorkflowStepContext,
) -> ray.ObjectRef:
    """Execute a submitted workflow.

    Args:
        job_id: The ID of the job for logging.
        context: The execution context.

    Returns:
        An object ref that represents the result.
    """
    workflow_id = context.workflow_id
    if workflow_id not in self._workflow_executors:
        raise RuntimeError(f"Workflow '{workflow_id}' has not been submitted.")

    pending_fut = self._queued_workflows.get(workflow_id)
    if pending_fut is not None:
        await pending_fut  # wait until this workflow is ready to go

    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    executor = self._workflow_executors[workflow_id]
    try:
        await executor.run_until_complete(job_id, context, wf_store)
        return await self.get_output(workflow_id, executor.output_task_id)
    finally:
        self._workflow_executors.pop(workflow_id)
        self._running_workflows.remove(workflow_id)
        self._executed_workflows.add(workflow_id)
        if not self._workflow_queue.empty():
            # schedule another workflow from the pending queue
            next_workflow_id = self._workflow_queue.get_nowait()
            self._running_workflows.add(next_workflow_id)
            fut = self._queued_workflows.pop(next_workflow_id)
            fut.set_result(None)
async def execute_workflow(
    self,
    job_id: str,
    context: WorkflowStepContext,
) -> ray.ObjectRef:
    """Execute a submitted workflow.

    Args:
        job_id: The ID of the job for logging.
        context: The execution context.

    Returns:
        An object ref that represents the result.
    """
    workflow_id = context.workflow_id
    if workflow_id not in self._workflow_executors:
        raise RuntimeError(f"Workflow '{workflow_id}' has not been submitted.")
    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    executor = self._workflow_executors[workflow_id]
    try:
        await executor.run_until_complete(job_id, context, wf_store)
        return await self.get_output(workflow_id, executor.output_task_id)
    finally:
        self._workflow_executors.pop(workflow_id)
        self._executed_workflows.add(workflow_id)
def _construct_resume_workflow_from_step(
        workflow_id: str, step_id: StepID) -> Union[Workflow, Any]:
    """Try to construct a workflow (step) that recovers the workflow step.
    If the workflow step already has an output checkpointing file, we return
    the workflow step id instead.

    Args:
        workflow_id: The ID of the workflow.
        step_id: The ID of the step we want to recover.

    Returns:
        A workflow that recovers the step, or the output of the step
        if it has been checkpointed.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id)

    # Step 1: construct dependency of the DAG (BFS)
    inspect_results = {}
    dependency_map = defaultdict(list)
    num_in_edges = {}

    dag_visit_queue = deque([step_id])
    while dag_visit_queue:
        s: StepID = dag_visit_queue.popleft()
        if s in inspect_results:
            continue
        r = reader.inspect_step(s)
        inspect_results[s] = r
        if not r.is_recoverable():
            raise WorkflowStepNotRecoverableError(s)
        if r.output_object_valid:
            deps = []
        elif isinstance(r.output_step_id, str):
            deps = [r.output_step_id]
        else:
            deps = r.workflows
        for w in deps:
            dependency_map[w].append(s)
        num_in_edges[s] = len(deps)
        dag_visit_queue.extend(deps)

    # Step 2: topological sort to determine the execution order
    # (Kahn's algorithm)
    execution_queue: List[StepID] = []

    start_nodes = deque(k for k, v in num_in_edges.items() if v == 0)
    while start_nodes:
        n = start_nodes.popleft()
        execution_queue.append(n)
        for m in dependency_map[n]:
            num_in_edges[m] -= 1
            assert num_in_edges[m] >= 0, (m, n)
            if num_in_edges[m] == 0:
                start_nodes.append(m)

    # Step 3: recover the workflow by the order of the execution queue
    with serialization.objectref_cache():
        # "input_map" is a context storing the inputs that have been loaded.
        # This context is important for deduplicating step inputs.
        input_map: Dict[StepID, Any] = {}

        for _step_id in execution_queue:
            result = inspect_results[_step_id]
            if result.output_object_valid:
                input_map[_step_id] = reader.load_step_output(_step_id)
                continue
            if isinstance(result.output_step_id, str):
                input_map[_step_id] = input_map[result.output_step_id]
                continue

            # Process the wait step as a special case.
            if result.step_options.step_type == StepType.WAIT:
                wait_input_workflows = []
                for w in result.workflows:
                    output = input_map[w]
                    if isinstance(output, Workflow):
                        wait_input_workflows.append(output)
                    else:
                        # Simulate a workflow with a workflow reference so it
                        # could be used directly by 'workflow.wait'.
                        static_ref = WorkflowStaticRef(step_id=w,
                                                       ref=ray.put(output))
                        wait_input_workflows.append(
                            Workflow.from_ref(static_ref))
                recovery_workflow = ray.workflow.wait(
                    wait_input_workflows,
                    **result.step_options.ray_options.get("wait_options", {}),
                )
            else:
                args, kwargs = reader.load_step_args(
                    _step_id,
                    workflows=[input_map[w] for w in result.workflows],
                    workflow_refs=list(map(WorkflowRef, result.workflow_refs)),
                )
                func: Callable = reader.load_step_func_body(_step_id)
                # TODO(suquark): Use an alternative function when
                # "workflow.step" is fully deprecated.
                recovery_workflow = ray.workflow.step(func).step(*args, **kwargs)

            # override step_options
            recovery_workflow._step_id = _step_id
            recovery_workflow.data.step_options = result.step_options
            input_map[_step_id] = recovery_workflow

    # Step 4: return the output of the requested step
    return input_map[step_id]
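# The topological ordering in "Step 2" above is Kahn's algorithm. Below is a
# self-contained, generic sketch of the same idea, detached from the workflow
# storage types; the dependency-map shape used here is an assumption made
# purely for illustration.
from collections import deque
from typing import Dict, List


def topological_order(deps: Dict[str, List[str]]) -> List[str]:
    """Order nodes so that every node appears after all of its dependencies."""
    dependents: Dict[str, List[str]] = {node: [] for node in deps}
    in_degree: Dict[str, int] = {node: len(d) for node, d in deps.items()}
    # Reverse the dependency edges so we can decrement in-degrees later.
    for node, node_deps in deps.items():
        for dep in node_deps:
            dependents[dep].append(node)
    ready = deque(node for node, degree in in_degree.items() if degree == 0)
    order: List[str] = []
    while ready:
        node = ready.popleft()
        order.append(node)
        for child in dependents[node]:
            in_degree[child] -= 1
            if in_degree[child] == 0:
                ready.append(child)
    return order


# e.g. "c" depends on "a" and "b"; "b" depends on "a":
assert topological_order({"a": [], "b": ["a"], "c": ["a", "b"]}) == ["a", "b", "c"]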
def test_workflow_storage(workflow_start_regular):
    workflow_id = test_workflow_storage.__name__
    wf_storage = workflow_storage.WorkflowStorage(workflow_id)
    step_id = "some_step"
    step_options = WorkflowStepRuntimeOptions.make(step_type=StepType.FUNCTION)
    input_metadata = {
        "name": "test_basic_workflows.append1",
        "workflows": ["def"],
        "workflow_refs": ["some_ref"],
        "step_options": step_options.to_dict(),
    }
    output_metadata = {
        "output_step_id": "a12423",
        "dynamic_output_step_id": "b1234",
    }
    root_output_metadata = {"output_step_id": "c123"}
    flattened_args = [
        signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "2", "k", b"543"
    ]
    args = signature.recover_args(flattened_args)
    output = ["the_answer"]
    object_resolved = 42
    obj_ref = ray.put(object_resolved)

    # test basics
    wf_storage._put(
        wf_storage._key_step_input_metadata(step_id), input_metadata, True)
    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    wf_storage._put(wf_storage._key_step_args(step_id), flattened_args)
    wf_storage._put(wf_storage._key_obj_id(obj_ref.hex()), ray.get(obj_ref))
    wf_storage._put(
        wf_storage._key_step_output_metadata(step_id), output_metadata, True)
    wf_storage._put(
        wf_storage._key_step_output_metadata(""), root_output_metadata, True)
    wf_storage._put(wf_storage._key_step_output(step_id), output)

    assert wf_storage.load_step_output(step_id) == output
    assert wf_storage.load_step_args(step_id, [], []) == args
    assert wf_storage.load_step_func_body(step_id)(33) == 34
    assert ray.get(wf_storage.load_object_ref(obj_ref.hex())) == object_resolved

    # test s3 path
    # here we hardcode the path to make sure the s3 path is parsed correctly
    from ray._private.storage import _storage_uri

    if _storage_uri.startswith("s3://"):
        assert wf_storage._get("steps/outputs.json", True) == root_output_metadata

    # test "inspect_step"
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_object_valid=True)
    assert inspect_result.is_recoverable()

    step_id = "some_step2"
    wf_storage._put(
        wf_storage._key_step_input_metadata(step_id), input_metadata, True)
    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    wf_storage._put(wf_storage._key_step_args(step_id), args)
    wf_storage._put(
        wf_storage._key_step_output_metadata(step_id), output_metadata, True)
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_step_id=output_metadata["dynamic_output_step_id"])
    assert inspect_result.is_recoverable()

    step_id = "some_step3"
    wf_storage._put(
        wf_storage._key_step_input_metadata(step_id), input_metadata, True)
    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    wf_storage._put(wf_storage._key_step_args(step_id), args)
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        args_valid=True,
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert inspect_result.is_recoverable()

    step_id = "some_step4"
    wf_storage._put(
        wf_storage._key_step_input_metadata(step_id), input_metadata, True)
    wf_storage._put(wf_storage._key_step_function_body(step_id), some_func)
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert not inspect_result.is_recoverable()

    step_id = "some_step5"
    wf_storage._put(
        wf_storage._key_step_input_metadata(step_id), input_metadata, True)
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options,
    )
    assert not inspect_result.is_recoverable()

    step_id = "some_step6"
    inspect_result = wf_storage.inspect_step(step_id)
    print(inspect_result)
    assert inspect_result == workflow_storage.StepInspectResult()
    assert not inspect_result.is_recoverable()
def cancel_workflow(self, workflow_id: str) -> None:
    self._step_status.pop(workflow_id)
    cancel_job(self._workflow_outputs.pop(workflow_id).output)
    wf_store = workflow_storage.WorkflowStorage(workflow_id, self._store)
    wf_store.save_workflow_meta(
        common.WorkflowMetaData(common.WorkflowStatus.CANCELED))
def _load_ref_helper(key: str, workflow_id: str):
    # TODO(Alex): We should stream the data directly into `cloudpickle.load`.
    storage = workflow_storage.WorkflowStorage(workflow_id)
    return storage._get(key)
def test_workflow_storage(workflow_start_regular):
    workflow_id = test_workflow_storage.__name__
    wf_storage = workflow_storage.WorkflowStorage(workflow_id,
                                                  storage.get_global_storage())
    step_id = "some_step"
    step_options = WorkflowStepRuntimeOptions(
        step_type=StepType.FUNCTION,
        catch_exceptions=False,
        max_retries=1,
        ray_options={})
    input_metadata = {
        "name": "test_basic_workflows.append1",
        "workflows": ["def"],
        "workflow_refs": ["some_ref"],
        "step_options": step_options.to_dict(),
    }
    output_metadata = {
        "output_step_id": "a12423",
        "dynamic_output_step_id": "b1234"
    }
    flattened_args = [
        signature.DUMMY_TYPE, 1, signature.DUMMY_TYPE, "2", "k", b"543"
    ]
    args = signature.recover_args(flattened_args)
    output = ["the_answer"]
    object_resolved = 42
    obj_ref = ray.put(object_resolved)

    # test basics
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_args(step_id), flattened_args))
    asyncio_run(
        wf_storage._put(wf_storage._key_obj_id(obj_ref.hex()), ray.get(obj_ref)))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_output_metadata(step_id), output_metadata, True))
    asyncio_run(wf_storage._put(wf_storage._key_step_output(step_id), output))

    assert wf_storage.load_step_output(step_id) == output
    assert wf_storage.load_step_args(step_id, [], []) == args
    assert wf_storage.load_step_func_body(step_id)(33) == 34
    assert ray.get(wf_storage.load_object_ref(obj_ref.hex())) == object_resolved

    # test "inspect_step"
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_object_valid=True)
    assert inspect_result.is_recoverable()

    step_id = "some_step2"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_output_metadata(step_id), output_metadata, True))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        output_step_id=output_metadata["dynamic_output_step_id"])
    assert inspect_result.is_recoverable()

    step_id = "some_step3"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id), some_func))
    asyncio_run(wf_storage._put(wf_storage._key_step_args(step_id), args))
    inspect_result = wf_storage.inspect_step(step_id)
    step_options = WorkflowStepRuntimeOptions(
        step_type=StepType.FUNCTION,
        catch_exceptions=False,
        max_retries=1,
        ray_options={})
    assert inspect_result == workflow_storage.StepInspectResult(
        args_valid=True,
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert inspect_result.is_recoverable()

    step_id = "some_step4"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata, True))
    asyncio_run(
        wf_storage._put(wf_storage._key_step_function_body(step_id), some_func))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        func_body_valid=True,
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert not inspect_result.is_recoverable()

    step_id = "some_step5"
    asyncio_run(
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), input_metadata, True))
    inspect_result = wf_storage.inspect_step(step_id)
    assert inspect_result == workflow_storage.StepInspectResult(
        workflows=input_metadata["workflows"],
        workflow_refs=input_metadata["workflow_refs"],
        step_options=step_options)
    assert not inspect_result.is_recoverable()

    step_id = "some_step6"
    inspect_result = wf_storage.inspect_step(step_id)
    print(inspect_result)
    assert inspect_result == workflow_storage.StepInspectResult()
    assert not inspect_result.is_recoverable()
def _update_workflow_status(self, workflow_id: str,
                            status: common.WorkflowStatus):
    wf_store = workflow_storage.WorkflowStorage(workflow_id)
    wf_store.update_workflow_status(status)
    self._workflow_status[workflow_id] = status