def _abort_task(self, error_msg: str) -> task_lib.FinalizePipelineTask:
  """Builds a `FinalizePipelineTask` that aborts the pipeline.

  Args:
    error_msg: Description of the node failure that triggered the abort.

  Returns:
    A `FinalizePipelineTask` carrying an `ABORTED` status for this pipeline.
  """
  full_msg = ('Aborting pipeline execution due to node execution failure; '
              f'error: {error_msg}')
  logging.error(full_msg)
  abort_status = status_lib.Status(
      code=status_lib.Code.ABORTED, message=full_msg)
  return task_lib.FinalizePipelineTask(
      pipeline_uid=self._pipeline_uid, status=abort_status)
def _generate_task(
    self, node: pipeline_pb2.PipelineNode,
    node_executions: Sequence[metadata_store_pb2.Execution]
) -> task_lib.Task:
  """Generates a node execution task.

  A task is always returned: either an `ExecNodeTask` for executing the node
  or, upon error, a `FinalizePipelineTask` to abort the pipeline.

  Args:
    node: The pipeline node for which to generate a task.
    node_executions: Node executions fetched from MLMD.

  Returns:
    Returns an `ExecNodeTask` if node can be executed. If an error occurs,
    a `FinalizePipelineTask` is returned to abort the pipeline execution.
  """
  # If there is already an active execution in MLMD, resume it rather than
  # registering a new one.
  result = task_gen_utils.generate_task_from_active_execution(
      self._mlmd_handle, self._pipeline, node, node_executions)
  if result:
    return result

  node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
  resolved_info = task_gen_utils.generate_resolved_info(
      self._mlmd_handle, node)
  # Input resolution failure is unrecoverable here; abort the pipeline.
  if resolved_info.input_artifacts is None:
    return task_lib.FinalizePipelineTask(
        pipeline_uid=self._pipeline_state.pipeline_uid,
        status=status_lib.Status(
            code=status_lib.Code.ABORTED,
            message=(f'Aborting pipeline execution due to failure to resolve '
                     f'inputs; problematic node uid: {node_uid}')))

  # Register a new execution in MLMD with the resolved inputs / properties.
  execution = execution_publish_utils.register_execution(
      metadata_handler=self._mlmd_handle,
      execution_type=node.node_info.type,
      contexts=resolved_info.contexts,
      input_artifacts=resolved_info.input_artifacts,
      exec_properties=resolved_info.exec_properties)
  outputs_resolver = outputs_utils.OutputsResolver(
      node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
      self._pipeline.execution_mode)
  return task_lib.ExecNodeTask(
      node_uid=node_uid,
      execution=execution,
      contexts=resolved_info.contexts,
      input_artifacts=resolved_info.input_artifacts,
      exec_properties=resolved_info.exec_properties,
      output_artifacts=outputs_resolver.generate_output_artifacts(
          execution.id),
      executor_output_uri=outputs_resolver.get_executor_output_uri(
          execution.id),
      stateful_working_dir=outputs_resolver.get_stateful_working_directory(
          execution.id),
      pipeline=self._pipeline)
def test_handling_finalize_pipeline_task(self, task_gen):
  """Orchestrating a FinalizePipelineTask should initiate pipeline stop."""
  with self._mlmd_connection as mlmd_handle:
    pipeline = _test_pipeline('pipeline1', pipeline_pb2.Pipeline.SYNC)
    pipeline_ops.initiate_pipeline_start(mlmd_handle, pipeline)
    pipeline_uid = task_lib.PipelineUid.from_pipeline(pipeline)

    # Arrange the mocked task generator to emit a single finalize task.
    stop_status = status_lib.Status(
        code=status_lib.Code.ABORTED, message='foo bar')
    finalize_task = task_lib.FinalizePipelineTask(
        pipeline_uid=pipeline_uid, status=stop_status)
    task_gen.return_value.generate.side_effect = [[finalize_task]]

    queue = tq.TaskQueue()
    pipeline_ops.orchestrate(mlmd_handle, queue,
                             service_jobs.DummyServiceJobManager())

    # The finalize task must be consumed by orchestration, not enqueued.
    task_gen.return_value.generate.assert_called_once()
    self.assertTrue(queue.is_empty())

    # Load pipeline state and verify stop initiation.
    with pstate.PipelineState.load(mlmd_handle, pipeline_uid) as pipeline_state:
      self.assertEqual(stop_status, pipeline_state.stop_initiated_reason())
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for executing the next executable nodes in the pipeline.

  The returned tasks must have `exec_task` populated. List may be empty if
  no nodes are ready for execution.

  Returns:
    A `list` of tasks to execute.
  """
  # Sort nodes into topological layers so that a node is only considered
  # after all of its upstream nodes' layers.
  layers = topsort.topsorted_layers(
      [node.pipeline_node for node in self._pipeline.nodes],
      get_node_id_fn=lambda node: node.node_info.id,
      get_parent_nodes=(
          lambda node: [self._node_map[n] for n in node.upstream_nodes]),
      get_child_nodes=(
          lambda node: [self._node_map[n] for n in node.downstream_nodes]))
  result = []
  for layer_num, nodes in enumerate(layers):
    # Ids of nodes in the current layer that have completed successfully.
    # If this remains empty after scanning the layer, no downstream node can
    # be ready, so layer traversal stops.
    completed_node_ids = set()
    for node in nodes:
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_id = node.node_info.id
      # Pure service nodes have no ExecNodeTask; their services are managed
      # directly and only their status is recorded here.
      if self._service_job_manager.is_pure_service_node(
          self._pipeline_state, node.node_info.id):
        if not self._upstream_nodes_executed(node):
          continue
        service_status = self._service_job_manager.ensure_node_services(
            self._pipeline_state, node_id)
        if service_status == service_jobs.ServiceStatus.SUCCESS:
          logging.info('Service node completed successfully: %s', node_uid)
          completed_node_ids.add(node_id)
        elif service_status == service_jobs.ServiceStatus.FAILED:
          # A failed service node aborts the whole pipeline.
          logging.error('Failed service node: %s', node_uid)
          return [
              task_lib.FinalizePipelineTask(
                  pipeline_uid=self._pipeline_state.pipeline_uid,
                  status=status_lib.Status(
                      code=status_lib.Code.ABORTED,
                      message=(f'Aborting pipeline execution due to service '
                               f'node failure; failed node uid: {node_uid}')))
          ]
        else:
          logging.info('Pure service node in progress: %s', node_uid)
        continue
      # If a task for the node is already tracked by the task queue, it need
      # not be considered for generation again.
      if self._is_task_id_tracked_fn(
          task_lib.exec_node_task_id_from_pipeline_node(
              self._pipeline, node)):
        continue
      executions = task_gen_utils.get_executions(self._mlmd_handle, node)
      if (executions and
          task_gen_utils.is_latest_execution_successful(executions)):
        completed_node_ids.add(node_id)
        continue
      # If all upstream nodes are executed but current node is not executed,
      # the node is deemed ready for execution.
      if self._upstream_nodes_executed(node):
        task = self._generate_task(node)
        # A finalize task short-circuits everything else generated so far.
        if task_lib.is_finalize_pipeline_task(task):
          return [task]
        else:
          result.append(task)
    # If there are no completed nodes in the current layer, downstream nodes
    # need not be checked.
    if not completed_node_ids:
      break
    # If all nodes in the final layer are completed successfully, the
    # pipeline can be finalized.
    # TODO(goutham): If there are conditional eval nodes, not all nodes may be
    # executed in the final layer. Handle this case when conditionals are
    # supported.
    if layer_num == len(layers) - 1 and completed_node_ids == set(
        node.node_info.id for node in nodes):
      return [
          task_lib.FinalizePipelineTask(
              pipeline_uid=self._pipeline_state.pipeline_uid,
              status=status_lib.Status(code=status_lib.Code.OK))
      ]
  return result
def __call__(self) -> List[task_lib.Task]:
  """Generates tasks by scanning pipeline nodes in topological order.

  Walks the topologically sorted layers, classifying each node as successful,
  failed, or a candidate for task generation. Emits `UpdateNodeStateTask`s
  and `ExecNodeTask`s, and a `FinalizePipelineTask` when the pipeline should
  stop (all terminal nodes successful, a fail-fast failure, or no runnable
  nodes remain).

  Returns:
    A `list` of tasks for the orchestrator to process.
  """
  layers = _topsorted_layers(self._pipeline)
  terminal_node_ids = _terminal_node_ids(layers)
  exec_node_tasks = []
  update_node_state_tasks = []
  successful_node_ids = set()
  # Maps failed node id -> the failure status recorded for it.
  failed_nodes_dict: Dict[str, status_lib.Status] = {}
  finalize_pipeline_task = None
  for layer_nodes in layers:
    for node in layer_nodes:
      node_id = node.node_info.id
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_state = self._node_states_dict[node_uid]
      if node_state.is_success():
        successful_node_ids.add(node_id)
        continue
      if node_state.is_failure():
        failed_nodes_dict[node_id] = node_state.status
        continue
      # Only consider nodes whose upstream nodes have all succeeded.
      if not self._upstream_nodes_successful(node, successful_node_ids):
        continue
      tasks = self._generate_tasks_for_node(node)
      for task in tasks:
        if task_lib.is_update_node_state_task(task):
          task = typing.cast(task_lib.UpdateNodeStateTask, task)
          # State transitions discovered during generation are folded into
          # the success/failure bookkeeping immediately.
          if pstate.is_node_state_success(task.state):
            successful_node_ids.add(node_id)
          elif pstate.is_node_state_failure(task.state):
            failed_nodes_dict[node_id] = task.status
            # In fail-fast mode the first failure aborts the pipeline.
            if self._fail_fast:
              finalize_pipeline_task = self._abort_task(task.status.message)
          update_node_state_tasks.append(task)
        elif task_lib.is_exec_node_task(task):
          exec_node_tasks.append(task)
      # Double break: stop scanning nodes and layers once an abort decision
      # has been made.
      if finalize_pipeline_task:
        break
    if finalize_pipeline_task:
      break

  if not self._fail_fast and failed_nodes_dict:
    assert not finalize_pipeline_task
    node_by_id = _node_by_id(self._pipeline)
    # Collect nodes that cannot be run because they have a failed ancestor.
    unrunnable_node_ids = set()
    for node_id in failed_nodes_dict:
      unrunnable_node_ids |= _descendants(node_by_id, node_id)
    # Nodes that are still runnable have neither succeeded nor failed, and
    # don't have a failed ancestor.
    runnable_node_ids = node_by_id.keys() - (
        unrunnable_node_ids | successful_node_ids | failed_nodes_dict.keys())
    # If there are no runnable nodes, we can abort the pipeline.
    if not runnable_node_ids:
      finalize_pipeline_task = self._abort_task(
          f'Cannot make progress due to node failures: {failed_nodes_dict}')

  # State-update tasks are always returned; exec tasks are dropped if the
  # pipeline is being finalized.
  result = update_node_state_tasks
  if finalize_pipeline_task:
    result.append(finalize_pipeline_task)
  elif terminal_node_ids <= successful_node_ids:
    # If all terminal nodes are successful, the pipeline can be finalized.
    result.append(
        task_lib.FinalizePipelineTask(
            pipeline_uid=self._pipeline_uid,
            status=status_lib.Status(code=status_lib.Code.OK)))
  else:
    result.extend(exec_node_tasks)
  return result
def generate(self) -> List[task_lib.Task]:
  """Generates tasks for executing the next executable nodes in the pipeline.

  The returned tasks must have `exec_task` populated. List may be empty if
  no nodes are ready for execution.

  Returns:
    A `list` of tasks to execute.
  """
  # Sort nodes into topological layers so that a node is only considered
  # after its upstream nodes.
  layers = topsort.topsorted_layers(
      [node.pipeline_node for node in self._pipeline.nodes],
      get_node_id_fn=lambda node: node.node_info.id,
      get_parent_nodes=(
          lambda node: [self._node_map[n] for n in node.upstream_nodes]),
      get_child_nodes=(
          lambda node: [self._node_map[n] for n in node.downstream_nodes]))
  result = []
  successful_node_ids = set()
  for layer_num, layer_nodes in enumerate(layers):
    for node in layer_nodes:
      node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
      node_id = node.node_info.id
      # Skip MLMD lookups for nodes already known to be successful.
      if self._in_successful_nodes_cache(node_uid):
        successful_node_ids.add(node_id)
        continue
      if not self._upstream_nodes_successful(node, successful_node_ids):
        continue
      # If this is a pure service node, there is no ExecNodeTask to generate
      # but we ensure node services and check service status.
      service_status = self._ensure_node_services_if_pure(node_id)
      if service_status is not None:
        if service_status == service_jobs.ServiceStatus.FAILED:
          return [
              self._abort_task(f'service job failed; node uid: {node_uid}')
          ]
        if service_status == service_jobs.ServiceStatus.SUCCESS:
          logging.info('Service node successful: %s', node_uid)
          successful_node_ids.add(node_id)
        continue
      # If a task for the node is already tracked by the task queue, it need
      # not be considered for generation again but we ensure node services
      # in case of a mixed service node.
      if self._is_task_id_tracked_fn(
          task_lib.exec_node_task_id_from_pipeline_node(
              self._pipeline, node)):
        service_status = self._ensure_node_services_if_mixed(node_id)
        if service_status == service_jobs.ServiceStatus.FAILED:
          return [
              self._abort_task(
                  f'associated service job failed; node uid: {node_uid}')
          ]
        continue
      node_executions = task_gen_utils.get_executions(self._mlmd_handle, node)
      latest_execution = task_gen_utils.get_latest_execution(node_executions)
      # If the latest execution is successful, we're done.
      if latest_execution and execution_lib.is_execution_successful(
          latest_execution):
        logging.info('Node successful: %s', node_uid)
        successful_node_ids.add(node_id)
        continue
      # If the latest execution failed, the pipeline should be aborted.
      if latest_execution and not execution_lib.is_execution_active(
          latest_execution):
        # Surface the error message recorded on the execution, if any.
        error_msg_value = latest_execution.custom_properties.get(
            constants.EXECUTION_ERROR_MSG_KEY)
        error_msg = data_types_utils.get_metadata_value(
            error_msg_value) if error_msg_value else ''
        return [
            self._abort_task(
                f'node failed; node uid: {node_uid}; error: {error_msg}')
        ]
      # Finally, we are ready to generate an ExecNodeTask for the node.
      task = self._maybe_generate_task(node, node_executions,
                                       successful_node_ids)
      if task:
        if task_lib.is_finalize_pipeline_task(task):
          return [task]
        else:
          result.append(task)
    # Record this layer's successful nodes in the cache so future calls can
    # skip them.
    layer_node_ids = set(node.node_info.id for node in layer_nodes)
    successful_layer_node_ids = layer_node_ids & successful_node_ids
    self._update_successful_nodes_cache(successful_layer_node_ids)
    # If all nodes in the final layer are completed successfully, the
    # pipeline can be finalized.
    # TODO(goutham): If there are conditional eval nodes, not all nodes may be
    # executed in the final layer. Handle this case when conditionals are
    # supported.
    if (layer_num == len(layers) - 1 and
        successful_layer_node_ids == layer_node_ids):
      return [
          task_lib.FinalizePipelineTask(
              pipeline_uid=self._pipeline_uid,
              status=status_lib.Status(code=status_lib.Code.OK))
      ]
  return result