def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details.

    Args:
        entry_workflow: The workflow to execute.
        storage: A storage URL, a Storage instance, or None to keep the
            current global storage.
        workflow_id: Explicit job ID; autogenerated when omitted.

    Returns:
        An object ref wrapping the workflow's final output.
    """
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"

    # Install the requested storage as the process-wide global storage.
    if isinstance(storage, Storage):
        set_global_storage(storage)
    elif isinstance(storage, str):
        set_global_storage(create_storage(storage))
    elif storage is not None:
        raise TypeError("'storage' should be None, str, or Storage type.")

    storage_url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{storage_url}\"].")
    try:
        workflow_context.init_workflow_step_context(workflow_id, storage_url)
        commit_step(entry_workflow)
        try:
            manager = ray.get_actor(MANAGEMENT_ACTOR_NAME)
        except ValueError:
            # the actor does not exist
            manager = WorkflowManagementActor.options(
                name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
        # NOTE: It is important to 'ray.get' the returned output. This
        # ensures caller of 'run()' holds the reference to the workflow
        # result. Otherwise if the actor removes the reference of the
        # workflow output, the caller may fail to resolve the result.
        nested_ref = ray.get(
            manager.run_or_resume.remote(workflow_id, storage_url))
        result = flatten_workflow_output(workflow_id, nested_ref)
    finally:
        # Always clear the step context, even if committing/starting failed.
        workflow_context.set_workflow_step_context(None)
    return result
def resume_workflow_job(workflow_id: str, store_url: str) -> ray.ObjectRef:
    """Resume a workflow job.

    Args:
        workflow_id: The ID of the workflow job. The ID is used to
            identify the workflow.
        store_url: The url of the storage to access the workflow.

    Raises:
        WorkflowNotResumableException: fail to resume the workflow.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        entry_step: StepID = wf_store.get_entrypoint_step_id()
        resumed = _construct_resume_workflow_from_step(wf_store, entry_step)
    except Exception as e:
        # Any failure while reconstructing means the job is not resumable.
        raise WorkflowNotResumableError(workflow_id) from e

    if not isinstance(resumed, Workflow):
        # Already finished: the stored step output is the final result.
        return wf_store.load_step_output(resumed)
    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # Imported lazily to avoid a circular import.
        from ray.experimental.workflow.step_executor import (
            execute_workflow)
        return execute_workflow(resumed)
def actor_ready(self, actor_id: str, storage_url: str) -> ray.ObjectRef:
    """Check if a workflow virtual actor is fully initialized.

    Args:
        actor_id: The ID of a workflow virtual actor.
        storage_url: A string that represents the storage.

    Returns:
        A future object that represents the state of the actor.
        "ray.get" the object successfully indicates the actor is
        initialized successfully.
    """
    store = storage.create_storage(storage_url)
    wf_store = workflow_storage.WorkflowStorage(actor_id, store)
    try:
        entry_step = wf_store.get_entrypoint_step_id()
        if wf_store.inspect_step(entry_step).output_object_valid:
            # A valid checkpointed output proves initialization finished.
            return ray.put(None)
    except Exception:
        # Best effort only: fall through to the in-memory bookkeeping below.
        pass
    if actor_id not in self._actor_initialized:
        raise ValueError(f"Actor '{actor_id}' has not been created, or "
                         "it has failed before initialization.")
    return self._actor_initialized[actor_id]
def resume(workflow_id: str,
           storage: Optional[Union[str, Storage]] = None) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    if storage is None:
        store = get_global_storage()
    elif isinstance(storage, str):
        store = create_storage(storage)
    elif isinstance(storage, Storage):
        store = storage
    else:
        raise TypeError("'storage' should be None, str, or Storage type.")
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    try:
        manager = ray.get_actor(MANAGEMENT_ACTOR_NAME)
    except ValueError:
        # the actor does not exist
        manager = WorkflowManagementActor.options(
            name=MANAGEMENT_ACTOR_NAME, lifetime="detached").remote()
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller holds the reference to the workflow
    # result. Otherwise if the actor removes the reference of the
    # workflow output, the caller may fail to resolve the result.
    nested_ref = ray.get(
        manager.run_or_resume.remote(workflow_id, store.storage_url))
    result = flatten_workflow_output(workflow_id, nested_ref)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
def _resume_workflow_step_executor(
        workflow_id: str, step_id: "StepID", store_url: str,
        current_output: "List[ray.ObjectRef]"
) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    """Reconstruct a workflow from storage and execute its remaining steps.

    Fix: the 'current_output' annotation was the list literal
    '[ray.ObjectRef]', which is not a valid type hint; it is now the
    (string, so no runtime import is needed) 'List[ray.ObjectRef]'.

    Args:
        workflow_id: The ID of the workflow job to resume.
        step_id: The step to reconstruct the workflow from.
        store_url: URL of the storage backing the workflow.
        current_output: Pending outputs of a previous run of the same
            workflow/virtual actor; they are drained before resuming.

    Raises:
        WorkflowNotResumableError: The workflow could not be reconstructed
            from storage.

    Returns:
        A (persisted_output, volatile_output) pair of object refs; the
        volatile part is None when the workflow had already finished.
    """
    # TODO (yic): We need better dependency management for virtual actor
    # The current output will always be empty for normal workflow
    # For virtual actor, if it's not empty, it means the previous job is
    # running. This is a really bad one.
    for ref in current_output:
        try:
            # Fully resolve nested refs so the previous run has settled
            # before we resume on top of its state.
            while isinstance(ref, ray.ObjectRef):
                ref = ray.get(ref)
        except Exception:
            # Best-effort wait: a failed previous run must not block resume.
            pass
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        r = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if isinstance(r, Workflow):
        with workflow_context.workflow_step_context(workflow_id,
                                                    store.storage_url):
            from ray.experimental.workflow.step_executor import (
                execute_workflow)
            result = execute_workflow(r, last_step_of_workflow=True)
            return result.persisted_output, result.volatile_output
    assert isinstance(r, StepID)
    return wf_store.load_step_output(r), None
def run(entry_workflow: Workflow,
        storage: Optional[Union[str, Storage]] = None,
        workflow_id: Optional[str] = None) -> ray.ObjectRef:
    """Run a workflow asynchronously. See "api.run()" for details."""
    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{entry_workflow.id}.{time.time():.9f}"
    # Install the requested storage as the process-wide global storage.
    if isinstance(storage, Storage):
        set_global_storage(storage)
    elif isinstance(storage, str):
        set_global_storage(create_storage(storage))
    elif storage is not None:
        raise TypeError("'storage' should be None, str, or Storage type.")
    url = get_global_storage().storage_url
    logger.info(f"Workflow job created. [id=\"{workflow_id}\", storage_url="
                f"\"{url}\"].")
    try:
        workflow_context.init_workflow_step_context(workflow_id, url)
        commit_step(entry_workflow)
        # TODO(suquark): Move this to a detached named actor,
        # so the workflow shares fate with the actor.
        # The current plan is resuming the workflow on the detached named
        # actor. This is extremely simple to implement, but I am not sure
        # of its performance.
        output = recovery.resume_workflow_job(workflow_id,
                                              get_global_storage())
        logger.info(f"Workflow job {workflow_id} started.")
    finally:
        # Always clear the step context, even when committing/resuming fails.
        workflow_context.set_workflow_step_context(None)
    return output
def _get_storage(storage: Optional[Union[str, Storage]]) -> Storage:
    """Normalize a storage argument to a Storage instance.

    None maps to the global storage; a string is treated as a storage URL.
    Raises TypeError for any other type.
    """
    if storage is None:
        return get_global_storage()
    if isinstance(storage, str):
        return create_storage(storage)
    if isinstance(storage, Storage):
        return storage
    raise TypeError("'storage' should be None, str, or Storage type.")
def update_workflow_step_context(context: Optional[WorkflowStepContext],
                                 step_id: str):
    """Install 'context' as the global step context and enter 'step_id'.

    NOTE(review): the annotation permits 'context=None', but the body
    dereferences it unconditionally — confirm callers never pass None.
    """
    global _context
    _context = context
    # '_context' and 'context' are the same object at this point; push the
    # new step onto the workflow scope stack.
    context.workflow_scope.append(step_id)
    # avoid cyclic import
    from ray.experimental.workflow import storage
    # TODO(suquark): [optimization] if the original storage has the same URL,
    # skip creating the new one
    storage.set_global_storage(storage.create_storage(context.storage_url))
def s3_storage(aws_credentials, s3_server):
    """Pytest fixture: point the global workflow storage at a mocked S3
    bucket and yield it."""
    with mock_s3():
        s3 = boto3.client(
            "s3", region_name="us-west-2", endpoint_url=s3_server)
        s3.create_bucket(Bucket="test_bucket")
        url = ("s3://test_bucket/workflow"
               f"?region_name=us-west-2&endpoint_url={s3_server}")
        storage.set_global_storage(storage.create_storage(url))
        yield storage.get_global_storage()
def get_actor(actor_id: str,
              storage: "Optional[Union[str, Storage]]" = None
              ) -> "VirtualActor":
    """Get an virtual actor.

    Args:
        actor_id: The ID of the actor.
        storage: The storage of the actor.

    Returns:
        A virtual actor.
    """
    # NOTE(review): unlike sibling entry points, a value that is neither
    # None, str, nor Storage is passed through unchecked here — confirm
    # whether a TypeError should be raised instead.
    if isinstance(storage, str):
        store = storage_base.create_storage(storage)
    elif storage is None:
        store = storage_base.get_global_storage()
    else:
        store = storage
    return virtual_actor_class.get_actor(actor_id, store)
def resume(workflow_id: str,
           storage: Optional[Union[str, Storage]] = None) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    if storage is None:
        store = get_global_storage()
    elif isinstance(storage, str):
        store = create_storage(storage)
    elif isinstance(storage, Storage):
        store = storage
    else:
        raise TypeError("'storage' should be None, str, or Storage type.")
    logger.info(f"Resuming workflow [id=\"{workflow_id}\", storage_url="
                f"\"{store.storage_url}\"].")
    output = recovery.resume_workflow_job(workflow_id, store)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return output
def _resume_workflow_step_executor(workflow_id: str, step_id: "StepID",
                                   store_url: str
                                   ) -> Tuple[ray.ObjectRef, ray.ObjectRef]:
    """Reconstruct a workflow from storage and run its remaining steps.

    Raises:
        WorkflowNotResumableError: The workflow could not be reconstructed
            from storage.

    Returns:
        A (persisted_output, volatile_output) pair; the volatile part is
        None when the workflow had already finished.
    """
    try:
        store = storage.create_storage(store_url)
        wf_store = workflow_storage.WorkflowStorage(workflow_id, store)
        resumed = _construct_resume_workflow_from_step(wf_store, step_id)
    except Exception as e:
        raise WorkflowNotResumableError(workflow_id) from e

    if not isinstance(resumed, Workflow):
        # Already finished: read the checkpointed result back.
        return wf_store.load_step_output(resumed), None
    with workflow_context.workflow_step_context(workflow_id,
                                                store.storage_url):
        # Imported lazily to avoid a circular import.
        from ray.experimental.workflow.step_executor import (
            execute_workflow)
        result = execute_workflow(resumed, last_step_of_workflow=True)
        return result.persisted_output, result.volatile_output
def run_or_resume(self, workflow_id: str,
                  storage_url: str) -> ray.ObjectRef:
    """Run or resume a workflow.

    Args:
        workflow_id: The ID of the workflow.
        storage_url: A string that represents the storage.

    Raises:
        ValueError: An output for this workflow ID is already recorded.

    Returns:
        An object reference that can be used to retrieve the
        workflow result.
    """
    if workflow_id in self._workflow_outputs:
        raise ValueError(f"The output of workflow[id={workflow_id}] "
                         "already exists.")
    store = storage.create_storage(storage_url)
    output = recovery.resume_workflow_job(workflow_id, store)
    # Record the output so this actor keeps a reference to it.
    self._workflow_outputs[workflow_id] = output
    logger.info(f"Workflow job [id={workflow_id}] started.")
    return output
def init(storage: "Optional[Union[str, Storage]]" = None) -> None:
    """Initialize workflow.

    Args:
        storage: The external storage URL or a custom storage class. If not
            specified, ``/tmp/ray/workflow_data`` will be used.
    """
    if storage is None:
        # Fall back to the environment, then to the default local dir.
        storage = os.environ.get("RAY_WORKFLOW_STORAGE")
    if storage is None:
        # We should use get_temp_dir_path, but for ray client, we don't
        # have this one. We need a flag to tell whether it's a client
        # or a driver to use the right dir.
        # For now, just use /tmp/ray/workflow_data
        logger.warning("Using default local dir: `/tmp/ray/workflow_data`. "
                       "This should only be used for testing purposes.")
        storage = "file:///tmp/ray/workflow_data"
    if isinstance(storage, str):
        storage = storage_base.create_storage(storage)
    elif not isinstance(storage, Storage):
        raise TypeError("'storage' should be None, str, or Storage type.")
    try:
        existing = storage_base.get_global_storage()
    except RuntimeError:
        # No global storage configured yet; nothing to compare against.
        pass
    else:
        # Re-initialization is only allowed with the same storage URL.
        if existing.storage_url != storage.storage_url:
            raise RuntimeError("Calling 'workflow.init()' again with a "
                               "different storage")
        logger.warning("Calling 'workflow.init()' again with the same "
                       "storage.")
    storage_base.set_global_storage(storage)
    workflow_access.init_management_actor()
def resume(workflow_id: str, workflow_root_dir=None) -> ray.ObjectRef:
    """Resume a workflow asynchronously. This workflow maybe fail previously.

    Args:
        workflow_id: The ID of the workflow. The ID is used to identify
            the workflow.
        workflow_root_dir: The path of an external storage used for
            checkpointing.

    Returns:
        The execution result of the workflow, represented by Ray ObjectRef.
    """
    assert ray.is_initialized()
    if workflow_root_dir is None:
        store = storage.get_global_storage()
    else:
        store = storage.create_storage(workflow_root_dir)
    resumed = recovery.resume_workflow_job(workflow_id, store)
    if isinstance(resumed, ray.ObjectRef):
        # Already finished: the stored result is returned directly.
        return resumed
    # skip saving the DAG of a recovery workflow
    resumed.skip_saving_workflow_dag = True
    return run(resumed, workflow_root_dir, workflow_id)
def filesystem_storage(tmp_path):
    """Pytest fixture: back the global workflow storage with a temporary
    directory and yield it."""
    url = f"{str(tmp_path)}/workflow_data"
    storage.set_global_storage(storage.create_storage(url))
    yield storage.get_global_storage()