def test_executor_failure(self):
    # Register a fake task scheduler whose own status is OK while the
    # executor output it carries reports FAILED_PRECONDITION.
    ok_status = status_lib.Status(code=status_lib.Code.OK)
    failed_executor_output = _make_executor_output(
        self._task,
        code=status_lib.Code.FAILED_PRECONDITION,
        msg='foobar error')
    self._register_task_scheduler(
        ts.TaskSchedulerResult(
            status=ok_status,
            output=ts.ExecutorNodeOutput(
                executor_output=failed_executor_output)))

    manager = self._run_task_manager()
    self.assertTrue(manager.done())
    self.assertIsNone(manager.exception())

    # The task must have been consumed and the MLMD execution marked failed,
    # with the executor's message recorded as the execution error.
    self.assertTrue(self._task_queue.is_empty())
    mlmd_execution = self._get_execution()
    self.assertEqual(metadata_store_pb2.Execution.FAILED,
                     mlmd_execution.last_known_state)
    recorded_error = data_types_utils.get_metadata_value(
        mlmd_execution.custom_properties[constants.EXECUTION_ERROR_MSG_KEY])
    self.assertEqual('foobar error', recorded_error)

    # Stateful working dir, tmp_dir and output artifact URI must all be gone.
    for removed_path in (self._task.stateful_working_dir,
                         self._task.tmp_dir,
                         self._output_artifact_uri):
        self.assertFalse(os.path.exists(removed_path))
def _get_pipeline_from_orchestrator_execution(
        execution: metadata_store_pb2.Execution) -> pipeline_pb2.Pipeline:
    """Rebuilds the pipeline proto stored on an orchestrator execution.

    Args:
      execution: MLMD execution whose `_PIPELINE_IR` property holds the
        base64-encoded, serialized pipeline.

    Returns:
      The `Pipeline` proto parsed from that property.
    """
    encoded_ir = data_types_utils.get_metadata_value(
        execution.properties[_PIPELINE_IR])
    parsed = pipeline_pb2.Pipeline()
    serialized_ir = base64.b64decode(encoded_ir)
    parsed.ParseFromString(serialized_ir)
    return parsed
def _attach_artifact_properties(spec: pipeline_pb2.OutputSpec.ArtifactSpec,
                                artifact: types.Artifact):
    """Copies the properties declared in an ArtifactSpec onto an artifact.

    Entries of `spec.additional_properties` are assigned as attributes of
    `artifact`. Entries of `spec.additional_custom_properties` are stored
    through the artifact's int/string/float custom-property setters, chosen
    by which member of the `value` oneof is populated.

    Args:
      spec: The artifact spec supplying the property maps.
      artifact: The artifact that receives the properties.

    Raises:
      RuntimeError: If an entry has no `field_value`, or if a custom property
        holds a value kind other than int, string or double.
    """
    # Populated oneof member -> name of the artifact setter that stores it.
    setter_name_by_value_type = {
        'int_value': 'set_int_custom_property',
        'string_value': 'set_string_custom_property',
        'double_value': 'set_float_custom_property',
    }

    for name, spec_value in spec.additional_properties.items():
        if not spec_value.HasField('field_value'):
            raise RuntimeError(
                'Property value is not a field_value for %s' % name)
        resolved = data_types_utils.get_metadata_value(spec_value.field_value)
        setattr(artifact, name, resolved)

    for name, spec_value in spec.additional_custom_properties.items():
        if not spec_value.HasField('field_value'):
            raise RuntimeError(
                'Property value is not a field_value for %s' % name)
        field_value = spec_value.field_value
        value_type = field_value.WhichOneof('value')
        setter_name = setter_name_by_value_type.get(value_type)
        if setter_name is None:
            raise RuntimeError(f'Unexpected value_type: {value_type}')
        store = getattr(artifact, setter_name)
        store(name, getattr(field_value, value_type))
def pipeline(self) -> pipeline_pb2.Pipeline:
    """Returns the pipeline proto, decoding it from the execution on demand.

    When `self._pipeline` is already set (truthy) it is returned as is.
    Otherwise the base64-encoded value of the execution's `_PIPELINE_IR`
    property is decoded, parsed into a `Pipeline` proto and stored on
    `self._pipeline` before being returned.
    """
    if self._pipeline:
        return self._pipeline

    encoded_ir = data_types_utils.get_metadata_value(
        self.execution.properties[_PIPELINE_IR])
    decoded = pipeline_pb2.Pipeline()
    decoded.ParseFromString(base64.b64decode(encoded_ir))
    self._pipeline = decoded
    return self._pipeline
def _extract_properties(
        execution: metadata_store_pb2.Execution) -> Dict[Text, types.Property]:
    """Collects an execution's properties and custom properties into a dict.

    Properties are visited first and custom properties second, so a custom
    property replaces a property stored under the same key.

    Args:
      execution: The MLMD execution to read from.

    Returns:
      A dict mapping each key to its unwrapped metadata value.

    Raises:
      ValueError: If any entry unwraps to `None`.
    """
    all_entries = itertools.chain(execution.properties.items(),
                                  execution.custom_properties.items())
    extracted = {}
    for key, mlmd_value in all_entries:
        unwrapped = data_types_utils.get_metadata_value(mlmd_value)
        if unwrapped is None:
            raise ValueError(
                f'Unexpected property with empty value; key: {key}')
        extracted[key] = unwrapped
    return extracted
def from_mlmd_value(
        cls,
        value: Optional[metadata_store_pb2.Value] = None
) -> 'ManualNodeState':
    """Builds a state object from an MLMD value holding its JSON form.

    Args:
      value: MLMD value wrapping the JSON-serialized state; may be omitted.

    Returns:
      The object produced by `json_utils.loads` on the wrapped JSON, or a
      default-constructed `ManualNodeState` when `value` is falsy or wraps
      an empty payload.
    """
    serialized_state = (
        data_types_utils.get_metadata_value(value) if value else None)
    if serialized_state:
        return json_utils.loads(serialized_state)
    return ManualNodeState()
def is_node_stop_initiated(self, node_uid: task_lib.NodeUid) -> bool:
    """Returns `True` if stopping has been initiated for the given node.

    Args:
      node_uid: Uid of the node to check; it must carry this pipeline's uid.

    Returns:
      `True` only when the node's stop-initiated custom property exists on
      the execution and its value equals 1.

    Raises:
      RuntimeError: If `node_uid` belongs to a different pipeline.
    """
    if node_uid.pipeline_uid != self.pipeline_uid:
        raise RuntimeError(
            f'Node given by uid {node_uid} does not belong to pipeline given '
            f'by uid {self.pipeline_uid}')

    property_name = _node_stop_initiated_property(node_uid)
    custom_properties = self.execution.custom_properties
    if property_name not in custom_properties:
        return False
    stop_flag = data_types_utils.get_metadata_value(
        custom_properties[property_name])
    return stop_flag == 1
def test_scheduler_failure(self):
    # Register a fake task scheduler whose result carries an ABORTED status
    # and no executor output.
    aborted_status = status_lib.Status(
        code=status_lib.Code.ABORTED, message='foobar error')
    self._register_task_scheduler(
        ts.TaskSchedulerResult(status=aborted_status, executor_output=None))

    manager = self._run_task_manager()
    self.assertTrue(manager.done())
    self.assertIsNone(manager.exception())

    # The task must have been consumed and the MLMD execution marked failed,
    # with the scheduler's message recorded as the execution error.
    self.assertTrue(self._task_queue.is_empty())
    mlmd_execution = self._get_execution()
    self.assertEqual(metadata_store_pb2.Execution.FAILED,
                     mlmd_execution.last_known_state)
    recorded_error = data_types_utils.get_metadata_value(
        mlmd_execution.custom_properties[constants.EXECUTION_ERROR_MSG_KEY])
    self.assertEqual('foobar error', recorded_error)
def extract_properties(
        execution: metadata_store_pb2.Execution
) -> Dict[str, types.ExecPropertyTypes]:
    """Extracts execution properties from mlmd Execution.

    Keys recognised by `execution_lib.is_schema_key` are skipped. For every
    other key, the matching schema key is looked up in the execution's custom
    properties; when present, its JSON payload is parsed into a
    `pipeline_pb2.Value.Schema` and handed to the value parser.

    Args:
      execution: The MLMD execution to read from.

    Returns:
      A dict mapping each non-schema key to its parsed value.

    Raises:
      ValueError: If any entry parses to `None`.
    """
    custom_properties = execution.custom_properties
    extracted = {}
    for key, prop in itertools.chain(execution.properties.items(),
                                     custom_properties.items()):
        if execution_lib.is_schema_key(key):
            continue

        schema_key = execution_lib.get_schema_key(key)
        if schema_key in custom_properties:
            schema = proto_utils.json_to_proto(
                data_types_utils.get_metadata_value(
                    custom_properties[schema_key]),
                pipeline_pb2.Value.Schema())
        else:
            schema = None

        parsed = data_types_utils.get_parsed_value(prop, schema)
        if parsed is None:
            raise ValueError(
                f'Unexpected property with empty value; key: {key}')
        extracted[key] = parsed
    return extracted
def _get_metadata_value(
        value: Optional[metadata_store_pb2.Value]) -> Optional[types.Property]:
    """Unwraps an optional MLMD value, passing `None` straight through.

    Args:
      value: The MLMD value to unwrap, or `None`.

    Returns:
      `None` when `value` is `None`; otherwise whatever
      `data_types_utils.get_metadata_value` returns for it.
    """
    return (None if value is None
            else data_types_utils.get_metadata_value(value))
def _generate_tasks_for_node(
        self, node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
    """Generates list of tasks for the given node.

    The checks run in a fixed order and the first one that applies decides
    the outcome:

    1. Nodes in state STOPPING or STOPPED produce no tasks.
    2. Pure service nodes produce at most one node-state update derived
       from the service status.
    3. Nodes whose exec task is already tracked produce a FAILED update
       only if the associated service job failed.
    4. A successful latest execution produces a COMPLETE update.
    5. An inactive latest execution on a node that is not STARTING
       produces a FAILED update carrying the recorded error message.
    6. An active execution produces a RUNNING update plus its exec task.
    7. Otherwise inputs are resolved and new tasks are generated.

    Args:
      node: The pipeline node to generate tasks for.

    Returns:
      A new, possibly empty, list of tasks.
    """
    node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
    node_id = node.node_info.id
    node_state = self._node_states_dict[node_uid]

    def failed_update(message):
        # Node-state update marking the node FAILED with an ABORTED status.
        return task_lib.UpdateNodeStateTask(
            node_uid=node_uid,
            state=pstate.NodeState.FAILED,
            status=status_lib.Status(code=status_lib.Code.ABORTED,
                                     message=message))

    if node_state.state in (pstate.NodeState.STOPPING,
                            pstate.NodeState.STOPPED):
        logging.info(
            'Ignoring node in state \'%s\' for task generation: %s',
            node_state.state, node_uid)
        return []

    # If this is a pure service node, there is no ExecNodeTask to generate
    # but we ensure node services and check service status.
    service_status = self._ensure_node_services_if_pure(node_id)
    if service_status is not None:
        if service_status == service_jobs.ServiceStatus.FAILED:
            error_msg = f'service job failed; node uid: {node_uid}'
            return [failed_update(error_msg)]
        if service_status == service_jobs.ServiceStatus.SUCCESS:
            logging.info('Service node successful: %s', node_uid)
            return [
                task_lib.UpdateNodeStateTask(
                    node_uid=node_uid, state=pstate.NodeState.COMPLETE)
            ]
        if service_status == service_jobs.ServiceStatus.RUNNING:
            return [
                task_lib.UpdateNodeStateTask(
                    node_uid=node_uid, state=pstate.NodeState.RUNNING)
            ]
        # Any other status yields no task for a pure service node.
        return []

    # If a task for the node is already tracked by the task queue, it need
    # not be considered for generation again but we ensure node services
    # in case of a mixed service node.
    exec_task_id = task_lib.exec_node_task_id_from_pipeline_node(
        self._pipeline, node)
    if self._is_task_id_tracked_fn(exec_task_id):
        service_status = self._ensure_node_services_if_mixed(node_id)
        if service_status == service_jobs.ServiceStatus.FAILED:
            error_msg = f'associated service job failed; node uid: {node_uid}'
            return [failed_update(error_msg)]
        return []

    node_executions = task_gen_utils.get_executions(self._mlmd_handle, node)
    latest_execution = task_gen_utils.get_latest_execution(node_executions)

    if latest_execution:
        # If the latest execution is successful, we're done.
        if execution_lib.is_execution_successful(latest_execution):
            logging.info('Node successful: %s', node_uid)
            return [
                task_lib.UpdateNodeStateTask(
                    node_uid=node_uid, state=pstate.NodeState.COMPLETE)
            ]

        # If the latest execution failed or cancelled, the pipeline should be
        # aborted if the node is not in state STARTING. For nodes that are
        # in state STARTING, a new execution is created.
        if (not execution_lib.is_execution_active(latest_execution)
                and node_state.state != pstate.NodeState.STARTING):
            error_msg_value = latest_execution.custom_properties.get(
                constants.EXECUTION_ERROR_MSG_KEY)
            if error_msg_value:
                error_msg = data_types_utils.get_metadata_value(
                    error_msg_value)
            else:
                error_msg = ''
            error_msg = f'node failed; node uid: {node_uid}; error: {error_msg}'
            return [failed_update(error_msg)]

    exec_node_task = task_gen_utils.generate_task_from_active_execution(
        self._mlmd_handle, self._pipeline, node, node_executions)
    if exec_node_task:
        running_update = task_lib.UpdateNodeStateTask(
            node_uid=node_uid, state=pstate.NodeState.RUNNING)
        return [running_update, exec_node_task]

    # Finally, we are ready to generate tasks for the node by resolving inputs.
    return list(self._resolve_inputs_and_generate_tasks_for_node(node))
def generate(self) -> List[task_lib.Task]:
    """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Nodes are visited layer by layer, in the order produced by
    `topsort.topsorted_layers`. Several conditions end the walk early and
    return a single-element list instead of the accumulated tasks:

    * a pure or mixed service job reports FAILED (abort task);
    * a node's latest execution is neither successful nor active (abort
      task carrying the recorded error message, if any);
    * `_maybe_generate_task` yields a finalize-pipeline task;
    * every node of the last layer is successful (finalize task with OK
      status).

    Returns:
      A `list` of tasks to execute.
    """
    # Group the pipeline nodes into layers; parents and children are looked
    # up through `self._node_map` by the ids listed on each node.
    layers = topsort.topsorted_layers(
        [node.pipeline_node for node in self._pipeline.nodes],
        get_node_id_fn=lambda node: node.node_info.id,
        get_parent_nodes=(
            lambda node: [self._node_map[n] for n in node.upstream_nodes]),
        get_child_nodes=(
            lambda node: [self._node_map[n] for n in node.downstream_nodes]))
    result = []
    # Ids of nodes found successful during this call (including cache hits).
    successful_node_ids = set()
    for layer_num, layer_nodes in enumerate(layers):
        for node in layer_nodes:
            node_uid = task_lib.NodeUid.from_pipeline_node(
                self._pipeline, node)
            node_id = node.node_info.id
            # Nodes already in the successful-nodes cache need no further
            # checks.
            if self._in_successful_nodes_cache(node_uid):
                successful_node_ids.add(node_id)
                continue
            # Skip nodes whose upstream nodes are not all successful yet.
            if not self._upstream_nodes_successful(node, successful_node_ids):
                continue
            # If this is a pure service node, there is no ExecNodeTask to generate
            # but we ensure node services and check service status.
            service_status = self._ensure_node_services_if_pure(node_id)
            if service_status is not None:
                if service_status == service_jobs.ServiceStatus.FAILED:
                    return [
                        self._abort_task(
                            f'service job failed; node uid: {node_uid}')
                    ]
                if service_status == service_jobs.ServiceStatus.SUCCESS:
                    logging.info('Service node successful: %s', node_uid)
                    successful_node_ids.add(node_id)
                # Any non-None status means a pure service node, so move on
                # to the next node regardless of the status value.
                continue
            # If a task for the node is already tracked by the task queue, it need
            # not be considered for generation again but we ensure node services
            # in case of a mixed service node.
            if self._is_task_id_tracked_fn(
                    task_lib.exec_node_task_id_from_pipeline_node(
                        self._pipeline, node)):
                service_status = self._ensure_node_services_if_mixed(
                    node_id)
                if service_status == service_jobs.ServiceStatus.FAILED:
                    return [
                        self._abort_task(
                            f'associated service job failed; node uid: {node_uid}'
                        )
                    ]
                continue
            node_executions = task_gen_utils.get_executions(
                self._mlmd_handle, node)
            latest_execution = task_gen_utils.get_latest_execution(
                node_executions)
            # If the latest execution is successful, we're done.
            if latest_execution and execution_lib.is_execution_successful(
                    latest_execution):
                logging.info('Node successful: %s', node_uid)
                successful_node_ids.add(node_id)
                continue
            # If the latest execution failed, the pipeline should be aborted.
            if latest_execution and not execution_lib.is_execution_active(
                    latest_execution):
                # The error message property is optional; fall back to an
                # empty string when it was never recorded.
                error_msg_value = latest_execution.custom_properties.get(
                    constants.EXECUTION_ERROR_MSG_KEY)
                error_msg = data_types_utils.get_metadata_value(
                    error_msg_value) if error_msg_value else ''
                return [
                    self._abort_task(
                        f'node failed; node uid: {node_uid}; error: {error_msg}'
                    )
                ]
            # Finally, we are ready to generate an ExecNodeTask for the node.
            task = self._maybe_generate_task(node, node_executions,
                                             successful_node_ids)
            if task:
                # A finalize task replaces everything gathered so far.
                if task_lib.is_finalize_pipeline_task(task):
                    return [task]
                else:
                    result.append(task)
        # Record this layer's successful nodes in the cache once the whole
        # layer has been visited.
        layer_node_ids = set(node.node_info.id for node in layer_nodes)
        successful_layer_node_ids = layer_node_ids & successful_node_ids
        self._update_successful_nodes_cache(successful_layer_node_ids)
        # If all nodes in the final layer are completed successfully, the
        # pipeline can be finalized.
        # TODO(goutham): If there are conditional eval nodes, not all nodes may be
        # executed in the final layer. Handle this case when conditionals are
        # supported.
        if (layer_num == len(layers) - 1 and
                successful_layer_node_ids == layer_node_ids):
            return [
                task_lib.FinalizePipelineTask(
                    pipeline_uid=self._pipeline_uid,
                    status=status_lib.Status(code=status_lib.Code.OK))
            ]
    return result
def _get_pipeline_from_orchestrator_execution(
        execution: metadata_store_pb2.Execution) -> pipeline_pb2.Pipeline:
    """Returns the pipeline decoded from an orchestrator execution.

    Args:
      execution: MLMD execution whose `_PIPELINE_IR` property holds the
        encoded pipeline.

    Returns:
      The result of `_base64_decode_pipeline` applied to that property's
      unwrapped value.
    """
    ir_property = execution.properties[_PIPELINE_IR]
    return _base64_decode_pipeline(
        data_types_utils.get_metadata_value(ir_property))
def is_stop_initiated(self):
    """Returns `True` if pipeline execution stopping has been initiated.

    The answer is `True` only when the `_STOP_INITIATED` custom property is
    present on the execution and its value equals 1.
    """
    custom_properties = self.execution.custom_properties
    if _STOP_INITIATED not in custom_properties:
        return False
    stop_flag = data_types_utils.get_metadata_value(
        custom_properties[_STOP_INITIATED])
    return stop_flag == 1