Example #1
    def _generate_task(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> Optional[task_pb2.Task]:
        """Generates a node execution task.

    If a node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        if not task_gen_utils.is_feasible_node(node):
            return None

        executions = task_gen_utils.get_executions(metadata_handler, node)
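        # If there is already an active execution for this node, resume it by
        # generating a task from it instead of registering a new execution.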
        result = task_gen_utils.generate_task_from_active_execution(
            self._pipeline, node, executions)
        if result:
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        if resolved_info.input_artifacts is None:
            logging.info(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return None

        # If the latest successful execution had the same resolved input
        # artifacts, the component should not be triggered, so no task is
        # generated.
        # TODO(b/170231077): This logic should be handled by the resolver when
        # it's implemented. Also, currently only the artifact ids of the
        # previous execution are checked to decide if a new execution is
        # warranted, but it may also be necessary to factor in differences in
        # execution properties.
        latest_exec = task_gen_utils.get_latest_successful_execution(
            executions)
        if latest_exec:
            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    metadata_handler, latest_exec.id))
            latest_exec_input_artifact_ids = artifact_ids_by_event_type.get(
                metadata_store_pb2.Event.INPUT, set())
            current_exec_input_artifact_ids = set(
                a.id for a in itertools.chain(
                    *resolved_info.input_artifacts.values()))
            if latest_exec_input_artifact_ids == current_exec_input_artifact_ids:
                return None

        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        return task_gen_utils.create_task(self._pipeline, node, execution)
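
A standalone illustration of the dedup check above: the resolved inputs (a dict of key to list of artifacts) are flattened into a set of artifact ids and compared against the ids recorded for the latest successful execution. `Artifact` here is a hypothetical stand-in, not the TFX artifact class.

    import itertools
    from types import SimpleNamespace as Artifact

    resolved_inputs = {'examples': [Artifact(id=1)], 'schema': [Artifact(id=2)]}
    latest_input_ids = {1, 2}  # ids recorded for the latest successful execution
    current_input_ids = {a.id for a in itertools.chain(*resolved_inputs.values())}
    # Unchanged inputs mean no new task is generated.
    print(current_input_ids == latest_input_ids)  # True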
Example #2
    def testGetArtifactIdsForExecutionIdGroupedByEventType(self):
        with metadata.Metadata(connection_config=self._connection_config) as m:
            # Register input and output artifacts in MLMD.
            input_example = standard_artifacts.Examples()
            input_example.uri = 'example'
            input_example.type_id = common_utils.register_type_if_not_exist(
                m, input_example.artifact_type).id
            output_model = standard_artifacts.Model()
            output_model.uri = 'model'
            output_model.type_id = common_utils.register_type_if_not_exist(
                m, output_model.artifact_type).id
            [input_example.id, output_model.id] = m.store.put_artifacts(
                [input_example.mlmd_artifact, output_model.mlmd_artifact])
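            # Prepare a completed execution; put_execution below links the
            # artifacts to it via INPUT and OUTPUT events.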
            execution = execution_lib.prepare_execution(
                m,
                metadata_store_pb2.ExecutionType(name='my_execution_type'),
                exec_properties={
                    'p1': 1,
                    'p2': '2'
                },
                state=metadata_store_pb2.Execution.COMPLETE)
            contexts = self._generate_contexts(m)
            execution = execution_lib.put_execution(
                m,
                execution,
                contexts,
                input_artifacts={'example': [input_example]},
                output_artifacts={'model': [output_model]})

            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    m, execution.id))
            self.assertDictEqual(
                {
                    metadata_store_pb2.Event.INPUT: set([input_example.id]),
                    metadata_store_pb2.Event.OUTPUT: set([output_model.id]),
                }, artifact_ids_by_event_type)
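
The grouping returned by `get_artifact_ids_by_event_type_for_execution_id` can be pictured with the sketch below. This is illustrative only, built directly on the ml-metadata store API rather than the actual `execution_lib` implementation; `store` is assumed to be an `ml_metadata.MetadataStore`.

    import collections

    def artifact_ids_by_event_type(store, execution_id):
        """Maps each event type (e.g. INPUT, OUTPUT) to a set of artifact ids."""
        grouped = collections.defaultdict(set)
        for event in store.get_events_by_execution_ids([execution_id]):
            grouped[event.type].add(event.artifact_id)
        return dict(grouped)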
Example #3
    def _generate_task(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> Optional[task_lib.Task]:
        """Generates a node execution task.

    If a node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        executions = task_gen_utils.get_executions(metadata_handler, node)
        result = task_gen_utils.generate_task_from_active_execution(
            metadata_handler, self._pipeline, node, executions)
        if result:
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        if resolved_info.input_artifacts is None or not any(
                resolved_info.input_artifacts.values()):
            logging.info(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return None

        # If the latest successful execution had the same resolved input
        # artifacts, the component should not be triggered, so no task is
        # generated.
        # TODO(b/170231077): This logic should be handled by the resolver when
        # it's implemented. Also, currently only the artifact ids of the
        # previous execution are checked to decide if a new execution is
        # warranted, but it may also be necessary to factor in differences in
        # execution properties.
        latest_exec = task_gen_utils.get_latest_successful_execution(
            executions)
        if latest_exec:
            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    metadata_handler, latest_exec.id))
            latest_exec_input_artifact_ids = artifact_ids_by_event_type.get(
                metadata_store_pb2.Event.INPUT, set())
            current_exec_input_artifact_ids = set(
                a.id for a in itertools.chain(
                    *resolved_info.input_artifacts.values()))
            if latest_exec_input_artifact_ids == current_exec_input_artifact_ids:
                return None

        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)
        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)

        # For mixed service nodes, we ensure node services and check service
        # status; the node is aborted if its service jobs have failed.
        service_status = self._ensure_node_services_if_mixed(node.node_info.id)
        if service_status is not None:
            if service_status != service_jobs.ServiceStatus.RUNNING:
                return self._abort_node_task(node_uid)

        return task_lib.ExecNodeTask(
            node_uid=node_uid,
            execution=execution,
            contexts=resolved_info.contexts,
            input_artifacts=resolved_info.input_artifacts,
            exec_properties=resolved_info.exec_properties,
            output_artifacts=outputs_resolver.generate_output_artifacts(
                execution.id),
            executor_output_uri=outputs_resolver.get_executor_output_uri(
                execution.id),
            stateful_working_dir=(
                outputs_resolver.get_stateful_working_directory(execution.id)),
            pipeline=self._pipeline)
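
Note the stricter input check in this version: `not any(resolved_info.input_artifacts.values())` also rejects a resolution that structurally succeeded but produced no artifacts. A minimal illustration of why the `None` check alone is not enough:

    # A dict of empty lists is truthy, so checking only for `None` would let
    # an artifact-less resolution through; `any` catches it.
    resolved = {'examples': []}
    print(bool(resolved))          # True: the dict itself is non-empty
    print(any(resolved.values()))  # False: no artifacts were actually resolved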
Example #4
    def _generate_tasks_for_node(
            self, metadata_handler: metadata.Metadata,
            node: pipeline_pb2.PipelineNode) -> List[task_lib.Task]:
        """Generates a node execution task.

    If a node execution is not feasible, `None` is returned.

    Args:
      metadata_handler: A handler to access MLMD db.
      node: The pipeline node for which to generate a task.

    Returns:
      Returns a `Task` or `None` if task generation is deemed infeasible.
    """
        result = []
        node_uid = task_lib.NodeUid.from_pipeline_node(self._pipeline, node)

        executions = task_gen_utils.get_executions(metadata_handler, node)
        exec_node_task = task_gen_utils.generate_task_from_active_execution(
            metadata_handler, self._pipeline, node, executions)
        if exec_node_task:
            result.append(
                task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                             state=pstate.NodeState.RUNNING))
            result.append(exec_node_task)
            return result

        resolved_info = task_gen_utils.generate_resolved_info(
            metadata_handler, node)
        # TODO(b/207038460): Update async pipeline to support ForEach.
        if (resolved_info is None or not resolved_info.input_artifacts
                or resolved_info.input_artifacts[0] is None
                or not any(resolved_info.input_artifacts[0].values())):
            logging.info(
                'Task cannot be generated for node %s since no input artifacts '
                'are resolved.', node.node_info.id)
            return result
        input_artifact = resolved_info.input_artifacts[0]

        # Compute a fingerprint of the node's executor spec so that changes to
        # the spec can trigger a new execution even when inputs are unchanged.
        executor_spec_fingerprint = hashlib.sha256()
        executor_spec = task_gen_utils.get_executor_spec(
            self._pipeline_state.pipeline, node.node_info.id)
        if executor_spec is not None:
            executor_spec_fingerprint.update(
                executor_spec.SerializeToString(deterministic=True))
        resolved_info.exec_properties[
            constants.EXECUTOR_SPEC_FINGERPRINT_KEY] = (
                executor_spec_fingerprint.hexdigest())

        # If the latest execution had the same resolved input artifacts,
        # execution properties, and executor spec, we should not trigger a new
        # execution.
        latest_exec = task_gen_utils.get_latest_execution(executions)
        if latest_exec:
            artifact_ids_by_event_type = (
                execution_lib.get_artifact_ids_by_event_type_for_execution_id(
                    metadata_handler, latest_exec.id))
            latest_exec_input_artifact_ids = artifact_ids_by_event_type.get(
                metadata_store_pb2.Event.INPUT, set())
            current_exec_input_artifact_ids = set(
                a.id for a in itertools.chain(*input_artifact.values()))
            latest_exec_properties = task_gen_utils.extract_properties(
                latest_exec)
            current_exec_properties = resolved_info.exec_properties
            latest_exec_executor_spec_fp = latest_exec_properties[
                constants.EXECUTOR_SPEC_FINGERPRINT_KEY]
            current_exec_executor_spec_fp = resolved_info.exec_properties[
                constants.EXECUTOR_SPEC_FINGERPRINT_KEY]
            if (latest_exec_input_artifact_ids
                    == current_exec_input_artifact_ids
                    and _exec_properties_match(latest_exec_properties,
                                               current_exec_properties)
                    and latest_exec_executor_spec_fp
                    == current_exec_executor_spec_fp):
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid, state=pstate.NodeState.STARTED))
                return result

        execution = execution_publish_utils.register_execution(
            metadata_handler=metadata_handler,
            execution_type=node.node_info.type,
            contexts=resolved_info.contexts,
            input_artifacts=input_artifact,
            exec_properties=resolved_info.exec_properties)
        outputs_resolver = outputs_utils.OutputsResolver(
            node, self._pipeline.pipeline_info, self._pipeline.runtime_spec,
            self._pipeline.execution_mode)

        # For mixed service nodes, we ensure node services and check service
        # status; the node is aborted if its service jobs have failed.
        service_status = self._ensure_node_services_if_mixed(node.node_info.id)
        if service_status is not None:
            if service_status != service_jobs.ServiceStatus.RUNNING:
                error_msg = f'associated service job failed; node uid: {node_uid}'
                result.append(
                    task_lib.UpdateNodeStateTask(
                        node_uid=node_uid,
                        state=pstate.NodeState.FAILED,
                        status=status_lib.Status(code=status_lib.Code.ABORTED,
                                                 message=error_msg)))
                return result

        output_artifacts = outputs_resolver.generate_output_artifacts(
            execution.id)
        # Pre-create output directories so the executor can write to them.
        outputs_utils.make_output_dirs(output_artifacts)
        result.append(
            task_lib.UpdateNodeStateTask(node_uid=node_uid,
                                         state=pstate.NodeState.RUNNING))
        result.append(
            task_lib.ExecNodeTask(
                node_uid=node_uid,
                execution_id=execution.id,
                contexts=resolved_info.contexts,
                input_artifacts=input_artifact,
                exec_properties=resolved_info.exec_properties,
                output_artifacts=output_artifacts,
                executor_output_uri=outputs_resolver.get_executor_output_uri(
                    execution.id),
                stateful_working_dir=(
                    outputs_resolver.get_stateful_working_directory(
                        execution.id)),
                tmp_dir=outputs_resolver.make_tmp_dir(execution.id),
                pipeline=self._pipeline))
        return result
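
The executor-spec fingerprint used above is a SHA-256 digest of the deterministically serialized spec proto. A minimal sketch, using `any_pb2.Any` as a hypothetical stand-in for the actual executor spec message:

    import hashlib
    from google.protobuf import any_pb2

    spec = any_pb2.Any(type_url='type.googleapis.com/example.ExecutorSpec')
    # Deterministic serialization keeps the digest stable across processes.
    fingerprint = hashlib.sha256(
        spec.SerializeToString(deterministic=True)).hexdigest()
    print(fingerprint)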