async def _write_step_inputs(wf_storage: workflow_storage.WorkflowStorage,
                             step_id: StepID, inputs: WorkflowData) -> None:
    """Save workflow inputs."""
    metadata = inputs.to_metadata()
    with serialization_context.workflow_args_keeping_context():
        # TODO(suquark): in the future we should write to storage directly
        # with plasma store object in memory.
        args_obj = ray.get(inputs.inputs.args)
    workflow_id = wf_storage._workflow_id
    storage = wf_storage._storage
    save_tasks = [
        # TODO (Alex): Handle the json case better?
        wf_storage._put(
            wf_storage._key_step_input_metadata(step_id), metadata, True),
        wf_storage._put(
            wf_storage._key_step_user_metadata(step_id),
            inputs.user_metadata, True),
        serialization.dump_to_storage(
            wf_storage._key_step_function_body(step_id), inputs.func_body,
            workflow_id, storage),
        serialization.dump_to_storage(
            wf_storage._key_step_args(step_id), args_obj, workflow_id,
            storage),
    ]
    await asyncio.gather(*save_tasks)
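# A minimal usage sketch (not part of the original source) of the coroutine
# above. The workflow ID "my_workflow", the step ID "step_1", and
# `workflow_data` are hypothetical; `get_workflow_storage` and the
# `asyncio_run` helper are the ones used elsewhere in these snippets.
def _example_write_step_inputs(workflow_data: WorkflowData) -> None:
    wf_storage = workflow_storage.get_workflow_storage("my_workflow")
    # `_write_step_inputs` is async; drive it to completion so the step's
    # metadata, function body, and resolved args are all checkpointed.
    asyncio_run(_write_step_inputs(wf_storage, "step_1", workflow_data))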
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    from ray.internal.storage import _storage_uri

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    store = workflow_storage.get_workflow_storage(workflow_id)
    serialization.dump_to_storage("key", wrapped, workflow_id, store)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object
    # store. Shutting down the cluster should guarantee we don't
    # accidentally get the old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    ray.init(storage=_storage_uri)
    workflow.init()
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = storage2._get("key")
    assert ray.get(result.refs) == [1, 2]
def save_step_output(self, step_id: StepID, ret: Union[Workflow, Any], *,
                     exception: Optional[Exception],
                     outer_most_step_id: StepID) -> None:
    """When a workflow step returns,
    1. If the returned object is a workflow, this means we are a nested
       workflow. We save the output metadata that points to the workflow.
    2. Otherwise, checkpoint the output.

    Args:
        step_id: The ID of the workflow step. If it is an empty string, it
            means we are in the workflow job driver process.
        ret: The returned object from a workflow step.
        exception: The exception raised by the step, if any.
        outer_most_step_id: See WorkflowStepContext.
    """
    tasks = []
    dynamic_output_id = None
    if isinstance(ret, Workflow):
        # This workflow step returns a nested workflow.
        assert step_id != ret.step_id
        assert exception is None
        tasks.append(
            self._put(
                self._key_step_output_metadata(step_id),
                {"output_step_id": ret.step_id}, True))
        dynamic_output_id = ret.step_id
    else:
        if exception is None:
            # This workflow step returns an object.
            ret = ray.get(ret) if isinstance(ret, ray.ObjectRef) else ret
            promise = serialization.dump_to_storage(
                self._key_step_output(step_id), ret, self._workflow_id,
                self._storage)
            tasks.append(promise)
            # tasks.append(self._put(self._key_step_output(step_id), ret))
            dynamic_output_id = step_id
            # TODO (yic): Delete exception file
        else:
            assert ret is None
            promise = serialization.dump_to_storage(
                self._key_step_exception(step_id), exception,
                self._workflow_id, self._storage)
            tasks.append(promise)
            # tasks.append(
            #     self._put(self._key_step_exception(step_id), exception))

    # Finish checkpointing.
    asyncio_run(asyncio.gather(*tasks))
    # NOTE: if we update the dynamic output before finishing checkpointing,
    # then during recovery, the dynamic output would point to a checkpoint
    # that does not exist.
    if dynamic_output_id is not None:
        asyncio_run(
            self._update_dynamic_output(outer_most_step_id,
                                        dynamic_output_id))
def save_step_output(self, step_id: StepID, ret: Union[Workflow, Any],
                     exception: Optional[Exception],
                     outer_most_step_id: Optional[StepID]) -> None:
    """When a workflow step returns,
    1. If the returned object is a workflow, this means we are a nested
       workflow. We save the output metadata that points to the workflow.
    2. Otherwise, checkpoint the output.

    Args:
        step_id: The ID of the workflow step. If it is an empty string, it
            means we are in the workflow job driver process.
        ret: The returned object from a workflow step.
        exception: The exception raised by the step, if any.
        outer_most_step_id: See "step_executor.execute_workflow" for
            explanation.
    """
    tasks = []
    if isinstance(ret, Workflow):
        # This workflow step returns a nested workflow.
        assert step_id != ret.step_id
        assert exception is None
        tasks.append(
            self._put(
                self._key_step_output_metadata(step_id),
                {"output_step_id": ret.step_id}, True))
        dynamic_output_id = ret.step_id
    else:
        if exception is None:
            # This workflow step returns an object.
            ret = ray.get(ret) if isinstance(ret, ray.ObjectRef) else ret
            promise = serialization.dump_to_storage(
                self._key_step_output(step_id), ret, self._workflow_id,
                self._storage)
            tasks.append(promise)
            # tasks.append(self._put(self._key_step_output(step_id), ret))
            dynamic_output_id = step_id
            # TODO (yic): Delete exception file
            # outer_most_step_id == "" indicates the root step of a
            # workflow. This would directly update "outputs.json" in
            # the workflow dir, and we want to avoid it.
            if outer_most_step_id is not None and outer_most_step_id != "":
                tasks.append(
                    self._update_dynamic_output(outer_most_step_id,
                                                dynamic_output_id))
        else:
            assert ret is None
            promise = serialization.dump_to_storage(
                self._key_step_exception(step_id), exception,
                self._workflow_id, self._storage)
            tasks.append(promise)
            # tasks.append(
            #     self._put(self._key_step_exception(step_id), exception))
    asyncio_run(asyncio.gather(*tasks))
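# Illustrative call patterns for the variant above (a sketch, not from the
# original source; `wf_storage`, `nested_wf`, and the step IDs are
# hypothetical). A nested `Workflow` return only records output metadata
# pointing at the sub-workflow; a plain return checkpoints the value and,
# unless this is the root step (outer_most_step_id == ""), redirects the
# outer-most step's dynamic output to this step's checkpoint.
wf_storage.save_step_output(
    "step_1", nested_wf, exception=None, outer_most_step_id="step_0")
wf_storage.save_step_output(
    "step_2", 42, exception=None, outer_most_step_id="")  # root step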
def save_workflow_execution_state(self, creator_task_id: TaskID,
                                  state: WorkflowExecutionState) -> None:
    """Save a workflow execution state. Typically, the state is translated
    from a Ray DAG.

    Args:
        creator_task_id: The ID of the task that creates the state.
        state: The state converted from the DAG.
    """
    assert creator_task_id != state.output_task_id

    for task_id, task in state.tasks.items():
        # TODO (Alex): Handle the json case better?
        metadata = {
            **task.to_dict(),
            "workflow_refs": state.upstream_dependencies[task_id],
        }
        self._put(self._key_step_input_metadata(task_id), metadata, True)
        # TODO(suquark): The task user metadata duplicates.
        self._put(
            self._key_step_user_metadata(task_id),
            task.user_metadata,
            True,
        )
        workflow_id = self._workflow_id
        serialization.dump_to_storage(
            self._key_step_function_body(task_id),
            task.func_body,
            workflow_id,
            self,
        )
        with serialization_context.workflow_args_keeping_context():
            # TODO(suquark): in the future we should write to storage
            # directly with plasma store object in memory.
            args_obj = ray.get(state.task_input_args[task_id])
        serialization.dump_to_storage(
            self._key_step_args(task_id),
            args_obj,
            workflow_id,
            self,
        )

    # Finally, point to the output ID of the DAG. The DAG is a continuation
    # of the creator task.
    self._put(
        self._key_step_output_metadata(creator_task_id),
        {"output_step_id": state.output_task_id},
        True,
    )
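# A caller-side sketch (not from the original source): `dag_state` stands
# for a `WorkflowExecutionState` already translated from a Ray DAG, and
# "creator_task" is a hypothetical task ID.
wf_storage = workflow_storage.get_workflow_storage("my_workflow")
wf_storage.save_workflow_execution_state("creator_task", dag_state)
# Afterwards the output metadata of "creator_task" points at
# dag_state.output_task_id, i.e. the DAG continues the creator task.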
def test_embedded_objectrefs(workflow_start_regular):
    workflow_id = test_embedded_objectrefs.__name__
    base_storage = storage.get_global_storage()

    class ObjectRefsWrapper:
        def __init__(self, refs):
            self.refs = refs

    url = base_storage.storage_url

    wrapped = ObjectRefsWrapper([ray.put(1), ray.put(2)])

    promise = serialization.dump_to_storage(["key"], wrapped, workflow_id,
                                            base_storage)
    workflow_storage.asyncio_run(promise)

    # Be extremely explicit about shutting down. We want to make sure the
    # `_get` call deserializes the full object and puts it in the object
    # store. Shutting down the cluster should guarantee we don't
    # accidentally get the old object and pass the test.
    ray.shutdown()
    subprocess.check_output("ray stop --force", shell=True)

    workflow.init(url)
    storage2 = workflow_storage.get_workflow_storage(workflow_id)

    result = workflow_storage.asyncio_run(storage2._get(["key"]))
    assert ray.get(result.refs) == [1, 2]
def _put(self, key: str, data: Any, is_json: bool = False) -> str:
    """Serialize and write an object to the workflow storage.

    Args:
        key: The key of the object.
        data: The data to be stored.
        is_json: If true, json encode the data, otherwise pickle it.
    """
    # TODO(suquark): Currently put to file is not atomic -- you can get a
    # partial file. This could fail workflow recovery.
    try:
        if not is_json:
            serialization.dump_to_storage(
                key, data, self._workflow_id, storage=self)
        else:
            serialized_data = json.dumps(data).encode()
            self._storage.put(key, serialized_data)
    except Exception as e:
        raise DataSaveError from e

    return key
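# Two illustrative calls (a sketch, not from the original source;
# `wf_storage` and "step_1" are hypothetical). With is_json=True the data
# must be JSON-serializable and is written as encoded JSON; otherwise it is
# pickled via `serialization.dump_to_storage`, which also handles embedded
# ObjectRefs (see the tests above).
wf_storage._put(
    wf_storage._key_step_user_metadata("step_1"), {"owner": "alice"},
    is_json=True)
wf_storage._put("custom_key", {"ref": ray.put(1)})  # pickled path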
def save_step_output(
    self,
    task_id: TaskID,
    ret: Any,
    *,
    exception: Optional[Exception],
) -> None:
    """Checkpoint the output of a workflow step, or the exception it
    raised.

    Args:
        task_id: The ID of the workflow step. If it is an empty string, it
            means we are in the workflow job driver process.
        ret: The returned object from a workflow step.
        exception: The exception raised by the step, if any.
    """
    if exception is None:
        # This workflow step returns an object.
        ret = ray.get(ret) if isinstance(ret, ray.ObjectRef) else ret
        serialization.dump_to_storage(
            self._key_step_output(task_id),
            ret,
            self._workflow_id,
            storage=self,
        )
        # tasks.append(self._put(self._key_step_output(task_id), ret))
        # TODO (yic): Delete exception file
    else:
        assert ret is None
        serialization.dump_to_storage(
            self._key_step_exception(task_id),
            exception,
            self._workflow_id,
            storage=self,
        )
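# Illustrative call patterns for the final variant (a sketch, not from the
# original source; `wf_storage` and the task IDs are hypothetical). Note
# that `exception` is keyword-only, so the caller selects exactly one of
# the two checkpoint paths explicitly at the call site.
wf_storage.save_step_output("task_1", 42, exception=None)  # checkpoint value
err = ValueError("step failed")  # hypothetical failure
wf_storage.save_step_output("task_2", None, exception=err)  # checkpoint error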