def execute_workflow(workflow: Workflow) -> "WorkflowExecutionResult":
    """Execute workflow.

    This function also performs tail-recursion optimization for inplace
    workflow steps.

    Args:
        workflow: The workflow to be executed.

    Returns:
        The execution result. Its persisted and volatile outputs are each
        passed through ``ray.put`` unless they already are a
        ``WorkflowOutputType``.
    """
    # Tail recursion optimization: while the step hands back an
    # InplaceReturnedWorkflow, run the returned workflow in this loop (under
    # the context it carries) instead of recursing.
    context = {}
    while True:
        with workflow_context.fork_workflow_step_context(**context):
            result = _execute_workflow(workflow)
        if not isinstance(result.persisted_output, InplaceReturnedWorkflow):
            break
        workflow = result.persisted_output.workflow
        context = result.persisted_output.context

    # Convert the outputs into ObjectRefs. Each output is tested on its own;
    # guarding the volatile conversion with the persisted output would skip
    # it whenever the persisted output is already an output type.
    if not isinstance(result.persisted_output, WorkflowOutputType):
        result.persisted_output = ray.put(result.persisted_output)
    if not isinstance(result.volatile_output, WorkflowOutputType):
        result.volatile_output = ray.put(result.volatile_output)
    return result
def from_workflow_inputs(cls, inputs: "WorkflowInputs"):
    """Build an instance from workflow inputs, executing each input workflow.

    Args:
        inputs: Provides the ``workflows`` to execute, plus ``args`` and
            ``workflow_refs`` which are passed through unchanged.

    Returns:
        ``cls(inputs.args, outputs, inputs.workflow_refs)`` where ``outputs``
        holds the persisted output of every executed input workflow.
    """
    persisted_outputs = []
    # Input workflows are executed in a forked step context that has no
    # outer most step and is not flagged as the last step of the workflow.
    with workflow_context.fork_workflow_step_context(
        outer_most_step_id=None, last_step_of_workflow=False
    ):
        for input_workflow in inputs.workflows:
            execution_result = execute_workflow(input_workflow)
            persisted_outputs.append(execution_result.persisted_output)
    return cls(inputs.args, persisted_outputs, inputs.workflow_refs)
def execute_workflow(job_id, workflow: Workflow) -> "WorkflowExecutionResult":
    """Execute a workflow, unrolling inplace-returned workflows in a loop.

    Instead of recursing when a step returns an ``InplaceReturnedWorkflow``,
    the returned workflow is executed in the next loop iteration under the
    context it carries (tail-recursion optimization).

    Args:
        job_id: Forwarded unchanged to ``_execute_workflow``.
        workflow: The workflow to be executed.

    Returns:
        The execution result, whose ``output`` has been converted with
        ``WorkflowStaticRef.from_output`` using the step ID of the workflow
        that was executed last.
    """
    fork_kwargs = {}
    current = workflow
    while True:
        with workflow_context.fork_workflow_step_context(**fork_kwargs):
            result = _execute_workflow(job_id, current)
        returned = result.output
        if not isinstance(returned, InplaceReturnedWorkflow):
            break
        # Continue with the workflow the step handed back.
        current, fork_kwargs = returned.workflow, returned.context

    # Convert the final output into a WorkflowStaticRef.
    result.output = WorkflowStaticRef.from_output(current.step_id, returned)
    return result
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Args:
        workflow: The workflow to be executed.

    Returns:
        The execution result holding the persisted and volatile outputs,
        each passed through ``ray.put`` unless it already is a
        ``WorkflowOutputType``. The result is also stored on the workflow
        (``_result`` / ``_executed``).
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
        outer_most_step_id=None, last_step_of_workflow=False
    ):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                # The input workflow is not a reference to an executed
                # workflow, so execute it now.
                output = execute_workflow(w).persisted_output
                static_ref = WorkflowStaticRef(step_id=w.step_id, ref=output)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(
            workflow.step_id, WorkflowStatus.RUNNING, [ray.put(None)]
        )
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            executor = _workflow_step_executor
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(num_cpus=0).remote
        else:
            executor = _workflow_step_executor_remote.options(
                **step_options.ray_options
            ).remote

    # Stage 3: execution
    persisted_output, volatile_output = executor(
        workflow_data.func_body,
        workflow_context.get_workflow_step_context(),
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    # Each output is tested on its own; guarding the volatile conversion with
    # the persisted output would skip it whenever the persisted output is
    # already an output type.
    if not isinstance(persisted_output, WorkflowOutputType):
        persisted_output = ray.put(persisted_output)
    if not isinstance(volatile_output, WorkflowOutputType):
        volatile_output = ray.put(volatile_output)

    if step_options.step_type != StepType.READONLY_ACTOR_METHOD:
        if not step_options.allow_inplace:
            # TODO: [Possible flaky bug] Here the RUNNING state may
            # be recorded earlier than SUCCESSFUL. This caused some
            # confusion during development.
            _record_step_status(
                workflow.step_id, WorkflowStatus.RUNNING, [volatile_output]
            )

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Runs the step function, saves pre-run and post-run metadata, hands the
    persisted output (or the raised exception) to ``commit_step`` and, when
    the step returned a nested workflow, executes that workflow as well.

    Args:
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: ID of the step.
        baked_inputs: The processed inputs for the step.
        runtime_options: Parameters for workflow step execution.

    Returns:
        A ``(persisted_output, volatile_output)`` tuple.

    Raises:
        TypeError: If a readonly virtual actor method returns a Workflow as
            its volatile output.
        Exception: Any exception raised while running the step is passed to
            ``commit_step`` and then re-raised.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    # From here on "context" is the freshly updated step context.
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        persisted_output, volatile_output = _wrap_run(func, runtime_options,
                                                      *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        # Record the failure for this step before propagating it.
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        # Readonly actor methods commit nothing and record no status here.
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, exception=None)
        if isinstance(persisted_output, Workflow):
            outer_most_step_id = context.outer_most_step_id
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(persisted_output)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
def _workflow_step_executor(step_type: StepType, func: Callable,
                            context: workflow_context.WorkflowStepContext,
                            step_id: "StepID",
                            baked_inputs: "_BakedWorkflowInputs",
                            catch_exceptions: bool, max_retries: int) -> Any:
    """Executor function for workflow step.

    Runs the step function, hands the persisted output (or the raised
    exception) to ``commit_step`` and, when the step returned a nested
    workflow, executes that workflow as well.

    Args:
        step_type: The type of workflow step.
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: The ID of the step.
        baked_inputs: The processed inputs for the step.
        catch_exceptions: If set to be true, return
            (Optional[Result], Optional[Error]) instead of Result.
        max_retries: Max number of retries encounter of a failure.

    Returns:
        A ``(persisted_output, volatile_output)`` tuple.

    Raises:
        TypeError: If a readonly virtual actor method returns a Workflow as
            its volatile output.
        Exception: Any exception raised by ``_wrap_run`` is passed to
            ``commit_step`` and then re-raised.
    """
    workflow_context.update_workflow_step_context(context, step_id)
    args, kwargs = _resolve_step_inputs(baked_inputs)
    store = workflow_storage.get_workflow_storage()
    try:
        persisted_output, volatile_output = _wrap_run(
            func, step_type, step_id, catch_exceptions, max_retries, *args,
            **kwargs)
    except Exception as e:
        # Record the failure for this step before propagating it.
        commit_step(store, step_id, None, e)
        raise e
    if step_type == StepType.READONLY_ACTOR_METHOD:
        # Readonly actor methods commit nothing and record no status here.
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor "
                "is not allowed.")
        assert not isinstance(persisted_output, Workflow)
    else:
        store = workflow_storage.get_workflow_storage()
        commit_step(store, step_id, persisted_output, None)
        outer_most_step_id = context.outer_most_step_id
        if isinstance(persisted_output, Workflow):
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            assert volatile_output is None
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(persisted_output)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
def execute_workflow(workflow: "Workflow") -> "WorkflowExecutionResult":
    """Execute workflow.

    Args:
        workflow: The workflow to be executed.

    Returns:
        The execution result holding the persisted and volatile outputs,
        each passed through ``ray.put`` unless it already is a
        ``WorkflowOutputType``. The result is also stored on the workflow
        (``_result`` / ``_executed``).
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    workflow_data = workflow.data
    inputs = workflow_data.inputs

    # Here A is the outer workflow step, B & C are the inner steps.
    # C is the output step for A, because C produces the output for A.
    #
    # @workflow.step
    # def A():
    #     b = B.step()
    #     return C.step(b)
    #
    # If the outer workflow step skips checkpointing, it would
    # update the checkpoint context of all inner steps except
    # the output step, marking them "detached" from the DAG.
    # Output step is not detached from the DAG because once
    # completed, it replaces the output of the outer step.
    step_context = workflow_context.get_workflow_step_context()
    checkpoint_context = step_context.checkpoint_context.copy()
    # "detached" could be defined recursively:
    # detached := already detached or the outer step skips checkpointing
    checkpoint_context.detached_from_dag = (
        checkpoint_context.detached_from_dag
        or not step_context.checkpoint_context.checkpoint
    )
    # Apply checkpoint context to input steps. Since input steps
    # further apply them to their inputs, this would eventually
    # apply to all steps except the output step. This avoids
    # detaching the output step.
    workflow_outputs = []
    with workflow_context.fork_workflow_step_context(
        outer_most_step_id=None,
        last_step_of_workflow=False,
        checkpoint_context=checkpoint_context,
    ):
        for w in inputs.workflows:
            static_ref = w.ref
            if static_ref is None:
                # The input workflow is not a reference to an executed
                # workflow, so execute it now.
                output = execute_workflow(w).persisted_output
                static_ref = WorkflowStaticRef(step_id=w.step_id, ref=output)
            workflow_outputs.append(static_ref)

    baked_inputs = _BakedWorkflowInputs(
        args=inputs.args,
        workflow_outputs=workflow_outputs,
        workflow_refs=inputs.workflow_refs,
    )

    # Stage 2: match executors
    step_options = workflow_data.step_options
    if step_options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(
            workflow.step_id, WorkflowStatus.RUNNING, [ray.put(None)]
        )
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if step_options.step_type == StepType.WAIT:
            executor = _workflow_wait_executor
        else:
            executor = _workflow_step_executor
    else:
        if step_options.step_type == StepType.WAIT:
            # This is very important to set "num_cpus=0" to
            # ensure "workflow.wait" is not blocked by other
            # tasks.
            executor = _workflow_wait_executor_remote.options(num_cpus=0).remote
        else:
            executor = _workflow_step_executor_remote.options(
                **step_options.ray_options
            ).remote

    # Stage 3: execution
    # The executor receives the step context captured before the input
    # workflows were run in their forked context.
    persisted_output, volatile_output = executor(
        workflow_data.func_body,
        step_context,
        workflow.step_id,
        baked_inputs,
        workflow_data.step_options,
    )

    # Stage 4: post processing outputs
    # Each output is tested on its own; guarding the volatile conversion with
    # the persisted output would skip it whenever the persisted output is
    # already an output type.
    if not isinstance(persisted_output, WorkflowOutputType):
        persisted_output = ray.put(persisted_output)
    if not isinstance(volatile_output, WorkflowOutputType):
        volatile_output = ray.put(volatile_output)

    if step_options.step_type != StepType.READONLY_ACTOR_METHOD:
        if not step_options.allow_inplace:
            # TODO: [Possible flaky bug] Here the RUNNING state may
            # be recorded earlier than SUCCESSFUL. This caused some
            # confusion during development.
            _record_step_status(
                workflow.step_id, WorkflowStatus.RUNNING, [volatile_output]
            )

    result = WorkflowExecutionResult(persisted_output, volatile_output)
    workflow._result = result
    workflow._executed = True
    return result
def _workflow_step_executor(
    func: Callable,
    context: "WorkflowStepContext",
    step_id: "StepID",
    baked_inputs: "_BakedWorkflowInputs",
    runtime_options: "WorkflowStepRuntimeOptions",
    inplace: bool = False,
) -> Tuple[Any, Any]:
    """Executor function for workflow step.

    Runs the step function, saves pre-run and post-run metadata, commits the
    persisted output when the checkpoint mode is SYNC, and handles a nested
    workflow returned by the step: it is either handed back to the caller
    (``inplace=True``) or executed here.

    Args:
        func: The workflow step function.
        context: Workflow step context. Used to access correct storage etc.
        step_id: ID of the step.
        baked_inputs: The processed inputs for the step.
        runtime_options: Parameters for workflow step execution.
        inplace: Execute the workflow inplace. When true and the step returns
            a workflow, ``(InplaceReturnedWorkflow, None)`` is returned
            instead of executing the returned workflow here.

    Returns:
        A ``(persisted_output, volatile_output)`` tuple.

    Raises:
        TypeError: If a readonly virtual actor method returns a Workflow as
            its volatile output.
        Exception: Any exception raised while running the step is passed to
            ``commit_step`` and then re-raised.
    """
    # Part 1: update the context for the step
    workflow_context.update_workflow_step_context(context, step_id)
    # From here on "context" is the freshly updated step context.
    context = workflow_context.get_workflow_step_context()
    step_type = runtime_options.step_type
    context.checkpoint_context.checkpoint = runtime_options.checkpoint

    # Part 2: resolve inputs
    args, kwargs = baked_inputs.resolve()

    # Part 3: execute the step
    store = workflow_storage.get_workflow_storage()
    try:
        step_prerun_metadata = {"start_time": time.time()}
        store.save_step_prerun_metadata(step_id, step_prerun_metadata)
        with workflow_context.workflow_execution():
            persisted_output, volatile_output = _wrap_run(
                func, runtime_options, *args, **kwargs)
        step_postrun_metadata = {"end_time": time.time()}
        store.save_step_postrun_metadata(step_id, step_postrun_metadata)
    except Exception as e:
        # Always checkpoint the exception.
        commit_step(store, step_id, None, exception=e)
        raise e

    # Part 4: save outputs
    if step_type == StepType.READONLY_ACTOR_METHOD:
        # Readonly actor methods commit nothing and record no status here.
        if isinstance(volatile_output, Workflow):
            raise TypeError(
                "Returning a Workflow from a readonly virtual actor is not allowed."
            )
        assert not isinstance(persisted_output, Workflow)
    else:
        # TODO(suquark): Validate checkpoint options before
        # commit the step.
        store = workflow_storage.get_workflow_storage()
        # The output is committed only in SYNC checkpoint mode.
        if CheckpointMode(runtime_options.checkpoint) == CheckpointMode.SYNC:
            commit_step(
                store,
                step_id,
                persisted_output,
                exception=None,
            )
        if isinstance(persisted_output, Workflow):
            sub_workflow = persisted_output
            outer_most_step_id = context.outer_most_step_id
            assert volatile_output is None
            if step_type == StepType.FUNCTION:
                # Passing down outer most step so inner nested steps would
                # access the same outer most step.
                if not context.outer_most_step_id:
                    # The current workflow step returns a nested workflow, and
                    # there is no outer step for the current step. So the
                    # current step is the outer most step for the inner nested
                    # workflow steps.
                    outer_most_step_id = workflow_context.get_current_step_id()
            if inplace:
                # Hand the sub-workflow back to the caller together with the
                # context it should run under, rather than executing it here.
                # Note: this return happens before the SUCCESSFUL status
                # below is recorded.
                _step_options = sub_workflow.data.step_options
                if (_step_options.step_type != StepType.WAIT
                        and runtime_options.ray_options !=
                        _step_options.ray_options):
                    logger.warning(
                        f"Workflow step '{sub_workflow.step_id}' uses "
                        f"a Ray option different to its caller step '{step_id}' "
                        f"and will be executed inplace. Ray assumes it still "
                        f"consumes the same resource as the caller. This may result "
                        f"in oversubscribing resources.")
                return (
                    InplaceReturnedWorkflow(
                        sub_workflow,
                        {"outer_most_step_id": outer_most_step_id}),
                    None,
                )
            # Execute sub-workflow. Pass down "outer_most_step_id".
            with workflow_context.fork_workflow_step_context(
                    outer_most_step_id=outer_most_step_id):
                result = execute_workflow(sub_workflow)
            # When virtual actor returns a workflow in the method,
            # the volatile_output and persisted_output will be put together
            persisted_output = result.persisted_output
            volatile_output = result.volatile_output
        elif context.last_step_of_workflow:
            # advance the progress of the workflow
            store.advance_progress(step_id)
        _record_step_status(step_id, WorkflowStatus.SUCCESSFUL)
    logger.info(get_step_status_info(WorkflowStatus.SUCCESSFUL))
    if isinstance(volatile_output, Workflow):
        # This is the case where a step method is called in the virtual actor.
        # We need to run the method to get the final result.
        assert step_type == StepType.ACTOR_METHOD
        volatile_output = volatile_output.run_async(
            workflow_context.get_current_workflow_id())
    return persisted_output, volatile_output
def _execute_workflow(job_id, workflow: "Workflow") -> "WorkflowExecutionResult":
    """Internal function of workflow execution.

    Executes the not-yet-executed input workflows, picks an inplace or remote
    executor from the step options, runs the step and stores the result on
    the workflow.

    Args:
        job_id: Forwarded to input workflow execution, the baked inputs and
            the executor.
        workflow: The workflow to be executed.

    Returns:
        The execution result wrapping the executor's output. If the workflow
        is already marked as executed, its existing result is returned.
    """
    if workflow.executed:
        return workflow.result

    # Stage 1: prepare inputs
    data = workflow.data
    step_inputs = data.inputs

    # Here A is the outer workflow step, B & C are the inner steps.
    # C is the output step for A, because C produces the output for A.
    #
    # @workflow.step
    # def A():
    #     b = B.step()
    #     return C.step(b)
    #
    # When the outer step skips checkpointing, every inner step except the
    # output step gets a checkpoint context marking it "detached" from the
    # DAG. The output step stays attached because, once completed, it
    # replaces the output of the outer step.
    step_context = workflow_context.get_workflow_step_context()
    input_checkpoint_context = step_context.checkpoint_context.copy()
    # detached := already detached or the outer step skips checkpointing
    input_checkpoint_context.detached_from_dag = (
        input_checkpoint_context.detached_from_dag
        or not step_context.checkpoint_context.checkpoint
    )

    # The forked context only covers the input steps; they pass it on to
    # their own inputs, so it reaches all steps except the output step.
    upstream_refs = []
    with workflow_context.fork_workflow_step_context(
        outer_most_step_id=None,
        last_step_of_workflow=False,
        checkpoint_context=input_checkpoint_context,
    ):
        for upstream in step_inputs.workflows:
            ref = upstream.ref
            if ref is None:
                # The input workflow is not a reference to an executed
                # workflow, so execute it now.
                upstream_ray_options = upstream.data.step_options.ray_options
                ref = execute_workflow(job_id, upstream).output
                ref._resolve_like_object_ref_in_args = upstream_ray_options.get(
                    "_resolve_like_object_ref_in_args", False
                )
            upstream_refs.append(ref)

    baked_inputs = _BakedWorkflowInputs(
        args=step_inputs.args,
        workflow_outputs=upstream_refs,
        workflow_refs=step_inputs.workflow_refs,
        job_id=job_id,
    )

    # Stage 2: match executors
    options = data.step_options
    is_wait_step = options.step_type == StepType.WAIT
    if options.allow_inplace:
        # TODO(suquark): For inplace execution, it is impossible
        # to get the ObjectRef of the output before execution.
        # Here we use a dummy ObjectRef, because _record_step_status does not
        # even use it (?!).
        _record_step_status(
            workflow.step_id, WorkflowStatus.RUNNING, [ray.put(None)]
        )
        # Note: we need to be careful about workflow context when
        # calling the executor directly.
        # TODO(suquark): We still have recursive Python calls.
        # This would cause stack overflow if we have a really
        # deep recursive call. We should fix it later.
        if is_wait_step:
            executor = _workflow_wait_executor
        else:
            # Tell the executor that we are running inplace. This enables
            # tail-recursion optimization.
            executor = functools.partial(_workflow_step_executor, inplace=True)
    elif is_wait_step:
        # This is very important to set "num_cpus=0" to
        # ensure "workflow.wait" is not blocked by other
        # tasks.
        executor = _workflow_wait_executor_remote.options(num_cpus=0).remote
    else:
        # "_resolve_like_object_ref_in_args" is not for Ray, so it is left
        # out of the options handed to the remote executor.
        ray_options = {
            key: value
            for key, value in options.ray_options.items()
            if key != "_resolve_like_object_ref_in_args"
        }
        executor = _workflow_step_executor_remote.options(**ray_options).remote

    # Stage 3: execution
    output = executor(
        data.func_body,
        step_context,
        job_id,
        workflow.step_id,
        baked_inputs,
        options,
    )

    # Stage 4: post processing outputs
    if not options.allow_inplace:
        # TODO: [Possible flaky bug] Here the RUNNING state may
        # be recorded earlier than SUCCESSFUL. This caused some
        # confusion during development.
        _record_step_status(workflow.step_id, WorkflowStatus.RUNNING, [None])

    execution_result = WorkflowExecutionResult(output)
    workflow._result = execution_result
    workflow._executed = True
    return execution_result