def resume_all(with_failed: bool) -> List[Tuple[str, ray.ObjectRef]]:
    """Resume all resumable workflow jobs. See "api.resume_all()" for details."""
    filter_set = {WorkflowStatus.RESUMABLE}
    if with_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)

    try:
        workflow_manager = get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    job_id = ray.get_runtime_context().job_id.hex()
    reconstructed_refs = []
    reconstructed_workflows = []
    for wid, _ in all_failed:
        context = workflow_context.WorkflowStepContext(workflow_id=wid)
        reconstructed_refs.append(
            (context, workflow_manager.reconstruct_workflow.remote(job_id, context))
        )
    for context, ref in reconstructed_refs:
        try:
            ray.get(ref)  # make sure the workflow is already reconstructed
            reconstructed_workflows.append(
                (
                    context.workflow_id,
                    workflow_manager.execute_workflow.remote(job_id, context),
                )
            )
        except Exception:
            # TODO(suquark): Here some workflows got resumed successfully but
            # some failed, and the user has no idea about this, which is very
            # weird. Maybe we should raise an exception here instead?
            logger.error(f"Failed to resume workflow {context.workflow_id}")

    return reconstructed_workflows
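
# A minimal usage sketch for the function above (hypothetical helper, not part
# of the module): it assumes Ray is initialized and that some workflows were
# left in a RESUMABLE or FAILED state, e.g. after a cluster restart.
def _example_consume_resumed() -> None:
    for workflow_id, output_ref in resume_all(with_failed=True):
        try:
            print(workflow_id, "->", ray.get(output_ref))
        except Exception:
            # A workflow may fail again after being resumed; its object ref
            # then raises when resolved with `ray.get`.
            print(workflow_id, "failed again after resume")
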
def resume(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously. See "api.resume()" for details."""
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = get_or_create_management_actor()
    if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
        raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller of 'run()' holds the reference to the workflow
    # result. Otherwise, if the actor removes the reference to the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()
    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
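
# A minimal sketch of blocking on a resumed workflow (hypothetical; assumes a
# checkpointed workflow with the id "trip1" exists). `resume` only schedules
# execution and returns an object ref; `ray.get` blocks until the workflow
# finishes and yields its result.
def _example_resume_blocking() -> None:
    output_ref = resume("trip1")  # "trip1" is an assumed workflow id
    result = ray.get(output_ref)  # blocks until the workflow completes
    print(result)
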
def run(
    dag: DAGNode,
    dag_inputs: DAGInputData,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict] = None,
) -> ray.ObjectRef:
    """Run a workflow asynchronously."""
    validate_user_metadata(metadata)
    metadata = metadata or {}

    from ray.workflow.api import _ensure_workflow_initialized

    _ensure_workflow_initialized()

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, dag_inputs, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = workflow_storage.get_workflow_storage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = get_or_create_management_actor()
        if ray.get(workflow_manager.is_workflow_running.remote(workflow_id)):
            raise RuntimeError(f"Workflow '{workflow_id}' is already running.")
        if wf_exists:
            return resume(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing
            )
        )
        return workflow_manager.execute_workflow.remote(job_id, context)
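
# A minimal sketch of driving `run` directly (hypothetical; user code would
# normally go through the public workflow API instead of this internal entry
# point). It assumes that a Ray remote function bound with `.bind()` yields
# the `DAGNode` expected here, and that an empty `DAGInputData()` is valid
# when the DAG has no free inputs.
def _example_run_dag() -> None:
    @ray.remote
    def add(a: int, b: int) -> int:
        return a + b

    dag = add.bind(1, 2)  # all arguments bound, so no DAG inputs are needed
    ref = run(dag, DAGInputData(), workflow_id="add_example")
    assert ray.get(ref) == 3
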
def resume_async(workflow_id: str) -> ray.ObjectRef:
    """Resume a workflow asynchronously.

    Resume a workflow and retrieve its output. If the workflow was incomplete,
    it will be re-executed from its checkpointed outputs. If the workflow was
    complete, returns the result immediately.

    Examples:
        >>> from ray import workflow
        >>> start_trip = ... # doctest: +SKIP
        >>> trip = start_trip.step() # doctest: +SKIP
        >>> res1 = trip.run_async(workflow_id="trip1") # doctest: +SKIP
        >>> res2 = workflow.resume_async("trip1") # doctest: +SKIP
        >>> assert ray.get(res1) == ray.get(res2) # doctest: +SKIP

    Args:
        workflow_id: The id of the workflow to resume.

    Returns:
        An object reference that can be used to retrieve the workflow result.
    """
    _ensure_workflow_initialized()
    logger.info(f'Resuming workflow [id="{workflow_id}"].')
    workflow_manager = workflow_access.get_management_actor()
    if ray.get(workflow_manager.is_workflow_non_terminating.remote(workflow_id)):
        raise RuntimeError(f"Workflow '{workflow_id}' is already running or pending.")
    # NOTE: It is important to 'ray.get' the returned output. This
    # ensures the caller of 'run()' holds the reference to the workflow
    # result. Otherwise, if the actor removes the reference to the
    # workflow output, the caller may fail to resolve the result.
    job_id = ray.get_runtime_context().job_id.hex()
    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
    result = workflow_manager.execute_workflow.remote(job_id, context)
    logger.info(f"Workflow job {workflow_id} resumed.")
    return result
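
# A minimal sketch of handling the "already running" guard above
# (hypothetical helper): `resume_async` raises RuntimeError when the workflow
# is in a non-terminating state, so a caller that may race with a live run
# can treat that as "nothing to do".
def _example_resume_if_idle(workflow_id: str) -> None:
    try:
        ref = resume_async(workflow_id)
        print(ray.get(ref))
    except RuntimeError:
        # The workflow is already running or pending; nothing to resume.
        print(f"workflow {workflow_id} is already in progress")
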
def resume_all(include_failed: bool = False) -> List[Tuple[str, ray.ObjectRef]]:
    """Resume all resumable workflow jobs.

    This can be used after cluster restart to resume all tasks.

    Args:
        include_failed: Whether to resume FAILED workflows.

    Examples:
        >>> from ray import workflow
        >>> failed_job = ... # doctest: +SKIP
        >>> workflow_step = failed_job.step() # doctest: +SKIP
        >>> output = workflow_step.run_async(workflow_id="failed_job") # doctest: +SKIP
        >>> try: # doctest: +SKIP
        ...     ray.get(output) # doctest: +SKIP
        ... except Exception: # doctest: +SKIP
        ...     print("JobFailed") # doctest: +SKIP
        >>> jobs = workflow.list_all() # doctest: +SKIP
        >>> assert jobs == [("failed_job", workflow.FAILED)] # doctest: +SKIP
        >>> resumed = dict(workflow.resume_all(include_failed=True)) # doctest: +SKIP
        >>> assert resumed.get("failed_job") is not None # doctest: +SKIP

    Returns:
        A list of (workflow_id, returned_obj_ref) resumed.
    """
    _ensure_workflow_initialized()
    filter_set = {WorkflowStatus.RESUMABLE}
    if include_failed:
        filter_set.add(WorkflowStatus.FAILED)
    all_failed = list_all(filter_set)

    try:
        workflow_manager = workflow_access.get_management_actor()
    except Exception as e:
        raise RuntimeError("Failed to get management actor") from e

    job_id = ray.get_runtime_context().job_id.hex()
    reconstructed_workflows = []
    for wid, _ in all_failed:
        context = workflow_context.WorkflowStepContext(workflow_id=wid)
        # TODO(suquark): This is not very efficient, but it makes sure
        # running workflows have higher priority when getting reconstructed.
        try:
            ray.get(workflow_manager.reconstruct_workflow.remote(job_id, context))
        except Exception as e:
            # TODO(suquark): Here some workflows got resumed successfully but
            # some failed, and the user has no idea about this, which is very
            # weird. Maybe we should raise an exception here instead?
            logger.error(
                f"Failed to resume workflow {context.workflow_id}", exc_info=e
            )
            raise
        reconstructed_workflows.append(context)

    results = []
    for context in reconstructed_workflows:
        results.append(
            (
                context.workflow_id,
                workflow_manager.execute_workflow.remote(job_id, context),
            )
        )
    return results
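
# A minimal post-restart recovery sketch (hypothetical helper): resume
# everything, including FAILED workflows, then resolve all outputs at once.
# `ray.get` on the list of refs blocks until every resumed workflow produces
# a result (or raises if one of them fails again).
def _example_recover_after_restart() -> None:
    resumed = resume_all(include_failed=True)
    ids = [wid for wid, _ in resumed]
    outputs = ray.get([ref for _, ref in resumed])
    for wid, output in zip(ids, outputs):
        print(wid, "->", output)
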
def run_async(
    dag: DAGNode,
    *args,
    workflow_id: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> ray.ObjectRef:
    """Run a workflow asynchronously.

    If the workflow with the given id already exists, it will be resumed.

    Args:
        workflow_id: A unique identifier that can be used to resume the
            workflow. If not specified, a random id will be generated.
        metadata: The metadata to add to the workflow. It has to be able
            to serialize to json.

    Returns:
        The running result as ray.ObjectRef.
    """
    _ensure_workflow_initialized()
    if not isinstance(dag, DAGNode):
        raise TypeError("Input should be a DAG.")
    input_data = DAGInputData(*args, **kwargs)
    validate_user_metadata(metadata)
    metadata = metadata or {}

    if workflow_id is None:
        # Workflow ID format: {Entry workflow UUID}.{Unix time to nanoseconds}
        workflow_id = f"{str(uuid.uuid4())}.{time.time():.9f}"

    state = workflow_state_from_dag(dag, input_data, workflow_id)
    logger.info(f'Workflow job created. [id="{workflow_id}"].')

    context = workflow_context.WorkflowStepContext(workflow_id=workflow_id)
    with workflow_context.workflow_step_context(context):
        # checkpoint the workflow
        ws = WorkflowStorage(workflow_id)
        ws.save_workflow_user_metadata(metadata)
        job_id = ray.get_runtime_context().job_id.hex()

        try:
            ws.get_entrypoint_step_id()
            wf_exists = True
        except Exception:
            # The workflow does not exist. We must checkpoint the entry workflow.
            ws.save_workflow_execution_state("", state)
            wf_exists = False
        workflow_manager = workflow_access.get_management_actor()
        if ray.get(workflow_manager.is_workflow_non_terminating.remote(workflow_id)):
            raise RuntimeError(
                f"Workflow '{workflow_id}' is already running or pending."
            )
        if wf_exists:
            return resume_async(workflow_id)
        ignore_existing = ws.load_workflow_status() == WorkflowStatus.NONE
        ray.get(
            workflow_manager.submit_workflow.remote(
                workflow_id, state, ignore_existing=ignore_existing
            )
        )
        return workflow_manager.execute_workflow.remote(job_id, context)
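
# An end-to-end sketch of `run_async` (hypothetical helper): free DAG
# arguments are captured by `*args`/`**kwargs` and packed into `DAGInputData`.
# Calling again with the same `workflow_id` takes the resume path implemented
# above instead of creating a new workflow. The workflow id and metadata here
# are illustrative assumptions.
def _example_run_async() -> None:
    @ray.remote
    def double(x: int) -> int:
        return 2 * x

    ref = run_async(
        double.bind(21),
        workflow_id="double_21",
        metadata={"owner": "example"},  # metadata must be JSON-serializable
    )
    assert ray.get(ref) == 42
    # The same id resumes the checkpointed workflow rather than re-creating it.
    ref2 = run_async(double.bind(21), workflow_id="double_21")
    assert ray.get(ref2) == 42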