def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None,
        overwrite: bool = True) -> ray.ObjectRef:
    """Launch a workflow job asynchronously and return a ref to its output.

    # TODO(suquark): "run" currently always overwrites an existing
    # workflow; this needs to be fixed later.
    """
    assert ray.is_initialized()
    store = get_global_storage()
    if workflow_id is None:
        # ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    # Checkpoint the entry workflow before handing it to the manager.
    ws = workflow_storage.get_workflow_storage(workflow_id)
    commit_step(ws, "", entry_workflow)
    workflow_manager = get_or_create_management_actor()
    ignore_existing = entry_workflow.data.step_type != StepType.FUNCTION
    # 'ray.get' here is essential: the caller of 'run()' must hold a
    # reference to the workflow result. Otherwise, if the actor drops its
    # reference to the output, the caller may fail to resolve it.
    result: "WorkflowExecutionResult" = ray.get(
        workflow_manager.run_or_resume.remote(workflow_id, ignore_existing))
    if entry_workflow.data.step_type == StepType.FUNCTION:
        return flatten_workflow_output(workflow_id, result.persisted_output)
    return flatten_workflow_output(workflow_id, result.volatile_output)
def resume(workflow_id: str,
           storage: Optional[Union[str, Storage]] = None) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    # Normalize the 'storage' argument into a Storage instance.
    if storage is None:
        store = get_global_storage()
    elif isinstance(storage, str):
        store = create_storage(storage)
    elif isinstance(storage, Storage):
        store = storage
    else:
        raise TypeError("'storage' should be None, str, or Storage type.")
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    try:
        actor = ray.get_actor(MANAGEMENT_ACTOR_NAME)
    except ValueError:
        # No management actor is running yet; start a detached one.
        actor = WorkflowManagementActor.options(
            name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
    # 'ray.get' is important: it makes the caller of 'run()' hold a
    # reference to the workflow result, so the output stays resolvable
    # even if the actor drops its own reference.
    output = ray.get(
        actor.run_or_resume.remote(workflow_id, store.storage_url))
    direct_output = flatten_workflow_output(workflow_id, output)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return direct_output
def resume_workflow_job(workflow_id: str,
                        store: storage.Storage) -> ray.ObjectRef:
    """Resume a workflow job.

    Args:
        workflow_id: The ID of the workflow job, used to identify the
            workflow.
        store: The storage to access the workflow.

    Raises:
        WorkflowNotResumableException: fail to resume the workflow.

    Returns:
        The execution result of the workflow, represented by a Ray
        ObjectRef.
    """
    reader = workflow_storage.WorkflowStorage(workflow_id, store)
    try:
        entrypoint_step_id: StepID = reader.get_entrypoint_step_id()
        resumed = _construct_resume_workflow_from_step(
            reader, entrypoint_step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if not isinstance(resumed, Workflow):
        # The entrypoint already finished; 'resumed' is a step ID whose
        # stored output we can hand back directly.
        return ray.put(reader.load_step_output(resumed))
    try:
        workflow_context.init_workflow_step_context(workflow_id,
                                                    store.storage_url)
        obj_ref = execute_workflow(resumed)
        return flatten_workflow_output(workflow_id, obj_ref)
    finally:
        # Always clear the step context, even if execution raised.
        workflow_context.set_workflow_step_context(None)
def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details."""
    if workflow_id is None:
        # ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    # Install the requested storage backend globally, if one was given.
    if isinstance(storage, str):
        set_global_storage(create_storage(storage))
    elif isinstance(storage, Storage):
        set_global_storage(storage)
    elif storage is not None:
        raise TypeError("'storage' should be None, str, or Storage type.")
    storage_url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{storage_url}\"].")
    try:
        workflow_context.init_workflow_step_context(workflow_id, storage_url)
        commit_step(entry_workflow)
        try:
            actor = ray.get_actor(MANAGEMENT_ACTOR_NAME)
        except ValueError:
            # No management actor exists yet; start a detached one.
            actor = WorkflowManagementActor.options(
                name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
        # 'ray.get' makes the caller of 'run()' hold a reference to the
        # workflow result, so it stays resolvable even if the actor
        # drops its own reference to the output.
        output = ray.get(actor.run_or_resume.remote(workflow_id, storage_url))
        result_ref = flatten_workflow_output(workflow_id, output)
    finally:
        workflow_context.set_workflow_step_context(None)
    return result_ref
async def _resume_one(wid: str) -> Tuple[str, Optional[ray.ObjectRef]]:
    """Resume one workflow, returning (workflow_id, output ref or None).

    Failures are logged and reported as a ``None`` output instead of
    raising, so one broken workflow does not abort resuming the others.
    """
    try:
        obj = await workflow_manager.run_or_resume.remote(wid)
        return (wid, flatten_workflow_output(wid, obj))
    except Exception:
        # Use logger.exception (not logger.error) so the traceback of
        # the failure is recorded instead of silently discarded.
        logger.exception(f"Failed to resume workflow {wid}")
        return (wid, None)
async def _resume_one(wid: str) -> Tuple[str, Optional[ray.ObjectRef]]:
    """Resume one workflow, returning (workflow_id, output ref or None).

    Failures are logged and reported as a ``None`` output instead of
    raising, so one broken workflow does not abort resuming the others.
    """
    try:
        result: "WorkflowExecutionResult" = (
            await workflow_manager.run_or_resume.remote(wid))
        obj = flatten_workflow_output(wid, result.persisted_output)
        return wid, obj
    except Exception:
        # Use logger.exception (not logger.error) so the traceback of
        # the failure is recorded instead of silently discarded.
        logger.exception(f"Failed to resume workflow {wid}")
        return (wid, None)
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously and return a ref to its output."""
    assert ray.is_initialized()
    store = get_global_storage()
    if workflow_id is None:
        # ID format: {UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        ws = workflow_storage.get_workflow_storage(workflow_id)
        # Probe storage to see whether this workflow was committed before.
        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            wf_exists = False
        # Checkpoint only when needed:
        # - virtual actor tasks are dynamic, so they are always committed;
        # - a brand-new workflow must be committed once.
        # TODO (yic): follow up with force rerun
        if entry_workflow.data.step_type != StepType.FUNCTION or not wf_exists:
            commit_step(ws, "", entry_workflow, None)
        workflow_manager = get_or_create_management_actor()
        ignore_existing = entry_workflow.data.step_type != StepType.FUNCTION
        # 'ray.get' is important: the caller of 'run()' must hold a
        # reference to the workflow result; otherwise the actor dropping
        # its reference could make the output unresolvable.
        result: "WorkflowExecutionResult" = ray.get(
            workflow_manager.run_or_resume.remote(workflow_id,
                                                  ignore_existing))
        if entry_workflow.data.step_type == StepType.FUNCTION:
            return flatten_workflow_output(workflow_id,
                                           result.persisted_output)
        return flatten_workflow_output(workflow_id, result.volatile_output)
def get_output(workflow_id: str) -> ray.ObjectRef:
    """Get the output of a running workflow.

    See "api.get_output()" for details.
    """
    try:
        management_actor = ray.get_actor(MANAGEMENT_ACTOR_NAME)
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e
    output_ref = ray.get(management_actor.get_output.remote(workflow_id))
    return flatten_workflow_output(workflow_id, output_ref)
def get_output(workflow_id: str) -> ray.ObjectRef:
    """Get the output of a running workflow.

    See "api.get_output()" for details.
    """
    assert ray.is_initialized()
    try:
        manager = get_management_actor()
    except ValueError as e:
        raise ValueError(
            "Failed to connect to the workflow management "
            "actor. The workflow could have already failed. You can use "
            "workflow.resume() to resume the workflow.") from e
    output_ref = ray.get(manager.get_output.remote(workflow_id))
    return flatten_workflow_output(workflow_id, output_ref)
def test_workflow_output_resolving(workflow_start_regular_shared):
    """Deeply nested refs should flatten down to the final value (42)."""
    nested_ref = deep_nested.remote(30)
    # Temporarily swap in a resolver that does not depend on the named
    # management actor; restore the real one afterwards.
    saved_resolver = workflow_access._resolve_workflow_output
    workflow_access._resolve_workflow_output = _resolve_workflow_output
    try:
        ref = workflow_access.flatten_workflow_output("fake_workflow_id",
                                                      nested_ref)
    finally:
        workflow_access._resolve_workflow_output = saved_resolver
    assert ray.get(ref) == 42
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    storage = get_global_storage()
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{storage.storage_url}\"].")
    manager = get_or_create_management_actor()
    # 'ray.get' is important: it makes the caller of 'run()' hold a
    # reference to the workflow result, keeping it resolvable even if
    # the actor drops its own reference to the output.
    output = ray.get(manager.run_or_resume.remote(workflow_id))
    resumed_ref = flatten_workflow_output(workflow_id, output)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return resumed_ref
def test_workflow_output_resolving():
    """Deeply nested refs should flatten down to the final value (42)."""
    ray.init(namespace="workflow")
    nested_ref = deep_nested.remote(30)
    # Temporarily swap in a resolver that does not depend on the named
    # management actor; restore the real one afterwards.
    saved_resolver = workflow_access._resolve_workflow_output
    workflow_access._resolve_workflow_output = _resolve_workflow_output
    try:
        ref = workflow_access.flatten_workflow_output("fake_workflow_id",
                                                      nested_ref)
    finally:
        workflow_access._resolve_workflow_output = saved_resolver
    assert ray.get(ref) == 42
    ray.shutdown()
def run(entry_workflow: Workflow,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details."""
    assert ray.is_initialized()
    store = get_global_storage()
    if workflow_id is None:
        # ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    # Checkpoint the entry workflow before handing it to the manager.
    ws = workflow_storage.WorkflowStorage(workflow_id, store)
    commit_step(ws, "", entry_workflow)
    manager = get_or_create_management_actor()
    # 'ray.get' is important: it makes the caller of 'run()' hold a
    # reference to the workflow result, keeping it resolvable even if
    # the actor drops its own reference to the output.
    output = ray.get(manager.run_or_resume.remote(workflow_id))
    return flatten_workflow_output(workflow_id, output)
def test_workflow_output_resolving():
    """A deeply nested remote call should flatten to its final value."""
    ray.init()
    out_ref = flatten_workflow_output("fake_workflow_id",
                                      deep_nested.remote(30))
    assert ray.get(out_ref) == 42
    ray.shutdown()