Example #1
    def testRegisterExecutionBackwardCompatibility(self):
        with metadata.Metadata(connection_config=self._connection_config) as m:
            context_id = m.register_run_context_if_not_exists(
                self._pipeline_info)

            # Registers an execution that needs more columns in the MLMD schema
            # first, then one that needs fewer columns. Verifies that the
            # schema update does not break backward compatibility.
            exec_properties_one = {'arg_one': 1}
            exec_properties_two = {'arg_one': 1, 'arg_two': 2}
            eid_two = m.register_execution(exec_properties=exec_properties_two,
                                           pipeline_info=self._pipeline_info,
                                           component_info=self._component_info,
                                           run_context_id=context_id)
            eid_one = m.register_execution(exec_properties=exec_properties_one,
                                           pipeline_info=self._pipeline_info,
                                           component_info=self._component_info,
                                           run_context_id=context_id)
            [execution_one,
             execution_two] = m.store.get_executions_by_id([eid_one, eid_two])
            self.assertProtoEquals(
                """
        id: 2
        type_id: 2
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }""", execution_one)
            self.assertProtoEquals(
                """
        id: 1
        type_id: 2
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }
        properties {
          key: "arg_two"
          value {
            string_value: "2"
          }
        }""", execution_two)
Example #2
    def testIrisPipelineSklearnGcp(self, mock_pusher, mock_trainer, _):
        mock_pusher.get_service_name_and_api_version.return_value = ('ml',
                                                                     'v1')
        mock_trainer.get_service_name_and_api_version.return_value = ('ml',
                                                                      'v1')
        BeamDagRunner().run(
            iris_pipeline_sklearn_gcp._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                ai_platform_training_args=self._ai_platform_training_args,
                ai_platform_serving_args=self._ai_platform_serving_args,
                beam_pipeline_args=[]))

        self.assertTrue(tf.io.gfile.exists(self._metadata_path))
        mock_trainer.start_aip_training.assert_called_once()
        mock_pusher.deploy_model_for_aip_prediction.assert_called_once()
        expected_execution_count = 6  # 6 components
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(metadata_config) as m:
            artifact_count = len(m.store.get_artifacts())
            execution_count = len(m.store.get_executions())
            self.assertGreaterEqual(artifact_count, execution_count)
            self.assertEqual(expected_execution_count, execution_count)

        self.assertPipelineExecution()

        # Runs pipeline the second time.
        BeamDagRunner().run(
            iris_pipeline_sklearn_gcp._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                ai_platform_training_args=self._ai_platform_training_args,
                ai_platform_serving_args=self._ai_platform_serving_args,
                beam_pipeline_args=[]))

        # All executions are cached, so the artifact count is unchanged while
        # the execution count doubles.
        with metadata.Metadata(metadata_config) as m:
            self.assertEqual(artifact_count, len(m.store.get_artifacts()))
            artifact_count = len(m.store.get_artifacts())
            self.assertEqual(expected_execution_count * 2,
                             len(m.store.get_executions()))

        # Runs pipeline the third time.
        BeamDagRunner().run(
            iris_pipeline_sklearn_gcp._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                ai_platform_training_args=self._ai_platform_training_args,
                ai_platform_serving_args=self._ai_platform_serving_args,
                beam_pipeline_args=[]))

        # Asserts cache execution.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is unchanged.
            self.assertEqual(artifact_count, len(m.store.get_artifacts()))
            self.assertEqual(expected_execution_count * 3,
                             len(m.store.get_executions()))
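
The count-after-each-run checks in this test (and in Examples #9, #12, #13, and #16 below) all reduce to one pattern; a hedged sketch, with the helper name made up for illustration:

from tfx.orchestration import metadata

# Hypothetical helper distilling the counting pattern above; not part of TFX.
def get_mlmd_counts(metadata_config):
    """Returns (artifact_count, execution_count) for the given MLMD config."""
    with metadata.Metadata(metadata_config) as m:
        return len(m.store.get_artifacts()), len(m.store.get_executions())

# Usage mirroring the assertions above:
#   artifact_count, execution_count = get_mlmd_counts(metadata_config)
#   self.assertGreaterEqual(artifact_count, execution_count)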
Example #3
    def testExecution(self):
        with metadata.Metadata(connection_config=self._connection_config) as m:
            context_id = m.register_run_context_if_not_exists(
                self._pipeline_info)

            # Test register_execution.
            exec_properties = {'arg_one': 1}
            eid = m.register_execution(exec_properties=exec_properties,
                                       pipeline_info=self._pipeline_info,
                                       component_info=self._component_info,
                                       run_context_id=context_id)
            [execution] = m.store.get_executions_by_context(context_id)
            self.assertProtoEquals(
                """
        id: 1
        type_id: 2
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }""", execution)

            # Test publish_execution.
            input_artifact = standard_artifacts.Examples()
            m.publish_artifacts([input_artifact])
            output_artifact = standard_artifacts.Examples()
            input_dict = {'input': [input_artifact]}
            output_dict = {'output': [output_artifact]}
            m.publish_execution(eid, input_dict, output_dict)
            # Make sure artifacts in output_dict are published.
            self.assertEqual(ArtifactState.PUBLISHED, output_artifact.state)
            # Make sure the execution state is changed.
            [execution] = m.store.get_executions_by_id([eid])
            self.assertEqual(metadata.EXECUTION_STATE_COMPLETE,
                             execution.properties['state'].string_value)
            # Make sure events are published.
            events = m.store.get_events_by_execution_ids([eid])
            self.assertEqual(2, len(events))
            self.assertEqual(input_artifact.id, events[0].artifact_id)
            self.assertEqual(metadata_store_pb2.Event.INPUT, events[0].type)
            self.assertProtoEquals(
                """
          steps {
            key: "input"
          }
          steps {
            index: 0
          }""", events[0].path)
            self.assertEqual(output_artifact.id, events[1].artifact_id)
            self.assertEqual(metadata_store_pb2.Event.OUTPUT, events[1].type)
            self.assertProtoEquals(
                """
          steps {
            key: "output"
          }
          steps {
            index: 0
          }""", events[1].path)
Example #4
  def run_with_ir(
      self,
      pipeline: pipeline_pb2.Pipeline,
      run_options: Optional[pipeline_pb2.RunOptions] = None,
  ) -> None:
    """Runs given pipeline locally.

    Args:
      pipeline: Pipeline IR containing pipeline args and components.
      run_options: Optional args for the run.

    Raises:
      ValueError: If run_options is provided, and partial_run_options.from_nodes
        and partial_run_options.to_nodes are both empty.
    """
    # Substitute the runtime parameter to be a concrete run_id
    runtime_parameter_utils.substitute_runtime_parameter(
        pipeline, {
            constants.PIPELINE_RUN_ID_PARAMETER_NAME:
                datetime.datetime.now().isoformat(),
        })

    deployment_config = runner_utils.extract_local_deployment_config(pipeline)
    connection_config = getattr(
        deployment_config.metadata_connection_config,
        deployment_config.metadata_connection_config.WhichOneof(
            'connection_config'))

    logging.info('Using deployment config:\n %s', deployment_config)
    logging.info('Using connection config:\n %s', connection_config)

    if run_options:
      logging.info('Using run_options:\n %s', run_options)
      pr_opts = run_options.partial_run
      partial_run_utils.mark_pipeline(
          pipeline,
          from_nodes=pr_opts.from_nodes or None,
          to_nodes=pr_opts.to_nodes or None,
          snapshot_settings=pr_opts.snapshot_settings)

    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_RUNNER: 'local'}):
      # Run each component. Note that the pipeline.components list is in
      # topological order.
      #
      # TODO(b/171319478): After IR-based execution is used, use multi-threaded
      # execution so that independent components can be run in parallel.
      for node in pipeline.nodes:
        pipeline_node = node.pipeline_node
        node_id = pipeline_node.node_info.id
        if pipeline_node.execution_options.HasField('skip'):
          logging.info('Skipping component %s.', node_id)
          continue
        executor_spec = runner_utils.extract_executor_spec(
            deployment_config, node_id)
        custom_driver_spec = runner_utils.extract_custom_driver_spec(
            deployment_config, node_id)

        component_launcher = launcher.Launcher(
            pipeline_node=pipeline_node,
            mlmd_connection=metadata.Metadata(connection_config),
            pipeline_info=pipeline.pipeline_info,
            pipeline_runtime_spec=pipeline.runtime_spec,
            executor_spec=executor_spec,
            custom_driver_spec=custom_driver_spec)
        logging.info('Component %s is running.', node_id)
        if pipeline_node.execution_options.run.perform_snapshot:
          with metadata.Metadata(connection_config) as mlmd_handle:
            partial_run_utils.snapshot(mlmd_handle, pipeline)
        component_launcher.launch()
        logging.info('Component %s is finished.', node_id)
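
Per the docstring and the mark_pipeline call above, a partial run is requested through RunOptions.partial_run. A sketch of a caller, assuming a compiled pipeline IR and a placeholder node id:

from tfx.proto.orchestration import pipeline_pb2

# Sketch: requesting a partial run via RunOptions, per the handling above.
# `runner` is an instance of the class defining run_with_ir; `pipeline_ir`
# is a compiled pipeline_pb2.Pipeline; 'Trainer' is a placeholder node id.
run_options = pipeline_pb2.RunOptions()
run_options.partial_run.from_nodes.append('Trainer')
runner.run_with_ir(pipeline_ir, run_options=run_options)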
Example #5
    def testStubbedTaxiPipelineBeam(self):
        pipeline_ir = compiler.Compiler().compile(self.taxi_pipeline)

        logging.info('Replacing with test_data_dir:%s',
                     self._recorded_output_dir)
        pipeline_mock.replace_executor_with_stub(pipeline_ir,
                                                 self._recorded_output_dir, [])

        BeamDagRunner().run(pipeline_ir)

        self.assertTrue(fileio.exists(self._metadata_path))

        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(metadata_config) as m:
            artifacts = m.store.get_artifacts()
            artifact_count = len(artifacts)
            executions = m.store.get_executions()
            execution_count = len(executions)
            # Artifact count is greater by 7 due to extra artifacts produced by
            # Evaluator (blessing and evaluation), Trainer (model and model_run)
            # and Transform (example, graph, cache, pre_transform_statistics,
            # pre_transform_schema, post_transform_statistics,
            # post_transform_schema, post_transform_anomalies), minus Resolver,
            # which doesn't generate a new artifact.
            self.assertEqual(artifact_count, execution_count + 7)
            self.assertLen(self.taxi_pipeline.components, execution_count)

            for execution in executions:
                component_id = pipeline_recorder_utils.get_component_id_from_execution(
                    m, execution)
                if component_id.startswith('Resolver'):
                    continue
                eid = [execution.id]
                events = m.store.get_events_by_execution_ids(eid)
                output_events = [
                    x for x in events
                    if x.type == metadata_store_pb2.Event.OUTPUT
                ]
                for event in output_events:
                    steps = event.path.steps
                    self.assertTrue(steps[0].HasField('key'))
                    name = steps[0].key
                    artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(artifacts):
                        self.assertDirectoryEqual(
                            artifact.uri,
                            os.path.join(self._recorded_output_dir,
                                         component_id, name, str(idx)))

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        BeamDagRunner().run(self.taxi_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.taxi_pipeline.metadata_connection_config, self._pipeline_name)

        verifier_map = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation,
            # A subdirectory of updated_analyzer_cache has changing name.
            'updated_analyzer_cache': self._veryify_root_dir,
        }

        # List of components to verify. Resolver is ignored because it
        # doesn't have an executor.
        verify_component_ids = [
            component.id for component in self.taxi_pipeline.components
            if not component.id.startswith('Resolver')
        ]

        for component_id in verify_component_ids:
            logging.info('Verifying %s', component_id)
            for key, artifact_dict in pipeline_outputs[component_id].items():
                for idx, artifact in artifact_dict.items():
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                                  recorded_uri)
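
The event-path walk in the inner loop generalizes: every OUTPUT event path in these examples alternates a key step and an index step (see the proto dumps in Example #3). A small hypothetical helper:

# Hypothetical helper for the event-path pattern above. Assumes the two-step
# path layout shown in these examples: steps[0] holds the output key and
# steps[1] holds the index within that output list.
def output_key_and_index(event):
    steps = event.path.steps
    assert steps[0].HasField('key')
    return steps[0].key, steps[1].index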
Example #6
    def metadata(self):
        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()
        return metadata.Metadata(connection_config=connection_config)
Example #7
    def testStubbedTaxiPipelineBeam(self):
        # Run pipeline with stub executors.
        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self._recorded_output_dir, test_component_ids=[])

        stub_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                stub_component_launcher.StubComponentLauncher,
            ])
        BeamDagRunner(config=stub_pipeline_config).run(self.taxi_pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))

        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(metadata_config) as m:
            artifacts = m.store.get_artifacts()
            artifact_count = len(artifacts)
            executions = m.store.get_executions()
            execution_count = len(executions)
            # Artifact count is greater by 3 due to extra artifacts produced by
            # Evaluator (blessing and evaluation), Trainer (model and model_run)
            # and Transform (example, graph, cache), minus Resolver, which
            # doesn't generate a new artifact.
            self.assertEqual(artifact_count, execution_count + 3)
            self.assertLen(self.taxi_pipeline.components, execution_count)

            for execution in executions:
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                if component_id.startswith('ResolverNode'):
                    continue
                eid = [execution.id]
                events = m.store.get_events_by_execution_ids(eid)
                output_events = [
                    x for x in events
                    if x.type == metadata_store_pb2.Event.OUTPUT
                ]
                for event in output_events:
                    steps = event.path.steps
                    self.assertTrue(steps[0].HasField('key'))
                    name = steps[0].key
                    artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(artifacts):
                        self.assertDirectoryEqual(
                            artifact.uri,
                            os.path.join(self._recorded_output_dir,
                                         component_id, name, str(idx)))

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        BeamDagRunner().run(self.taxi_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.taxi_pipeline.metadata_connection_config,
            self.taxi_pipeline.pipeline_info)

        verifier_map = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation
        }

        # List of components to verify. ResolverNode is ignored because it
        # doesn't have an executor.
        verify_component_ids = [
            component.id for component in self.taxi_pipeline.components
            if not component.id.startswith('ResolverNode')
        ]

        for component_id in verify_component_ids:
            logging.info('Verifying %s', component_id)
            for key, artifact_dict in pipeline_outputs[component_id].items():
                for idx, artifact in artifact_dict.items():
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                                  recorded_uri)
Example #8
    def _publish_execution_to_metadata(self):
        with metadata.Metadata(self._metadata_connection_config,
                               self._logger) as m:
            return m.publish_execution(self._execution_id, self._input_dict,
                                       self._output_dict)
Example #9
    def testPenguinPipelineSklearnLocal(self):
        LocalDagRunner().run(
            penguin_pipeline_sklearn_local._create_pipeline(
                pipeline_name=self._pipeline_name,
                pipeline_root=self._pipeline_root,
                data_root=self._data_root,
                trainer_module_file=self._trainer_module_file,
                evaluator_module_file=self._evaluator_module_file,
                serving_model_dir=self._serving_model_dir,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 8  # 7 components + 1 resolver
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(metadata_config) as m:
            artifact_count = len(m.store.get_artifacts())
            execution_count = len(m.store.get_executions())
            self.assertGreaterEqual(artifact_count, execution_count)
            self.assertEqual(expected_execution_count, execution_count)

        self.assertPipelineExecution()

        # Runs pipeline the second time.
        LocalDagRunner().run(
            penguin_pipeline_sklearn_local._create_pipeline(
                pipeline_name=self._pipeline_name,
                pipeline_root=self._pipeline_root,
                data_root=self._data_root,
                trainer_module_file=self._trainer_module_file,
                evaluator_module_file=self._evaluator_module_file,
                serving_model_dir=self._serving_model_dir,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        with metadata.Metadata(metadata_config) as m:
            # Artifact count is increased by 3 caused by Evaluator and Pusher.
            self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
            artifact_count = len(m.store.get_artifacts())
            self.assertEqual(expected_execution_count * 2,
                             len(m.store.get_executions()))

        # Runs pipeline the third time.
        LocalDagRunner().run(
            penguin_pipeline_sklearn_local._create_pipeline(
                pipeline_name=self._pipeline_name,
                pipeline_root=self._pipeline_root,
                data_root=self._data_root,
                trainer_module_file=self._trainer_module_file,
                evaluator_module_file=self._evaluator_module_file,
                serving_model_dir=self._serving_model_dir,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        # Asserts cache execution.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is unchanged.
            self.assertEqual(artifact_count, len(m.store.get_artifacts()))
            self.assertEqual(expected_execution_count * 3,
                             len(m.store.get_executions()))
Example #10
    def testStubbedImdbPipelineBeam(self):
        # Runs the pipeline with stub executors that replay the outputs
        # recorded in self._recorded_output_dir.
        stub_component_launcher.StubComponentLauncher.initialize(
            test_data_dir=self._recorded_output_dir, test_component_ids=[])

        stub_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                stub_component_launcher.StubComponentLauncher,
            ])
        local_dag_runner.LocalDagRunner(config=stub_pipeline_config).run(
            self.imdb_pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))

        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(metadata_config) as m:
            for execution in m.store.get_executions():
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                if component_id.startswith('ResolverNode'):
                    continue
                eid = [execution.id]
                events = m.store.get_events_by_execution_ids(eid)
                output_events = [
                    x for x in events
                    if x.type == metadata_store_pb2.Event.OUTPUT
                ]
                for event in output_events:
                    steps = event.path.steps
                    assert steps[0].HasField('key')
                    name = steps[0].key
                    artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(artifacts):
                        self.assertDirectoryEqual(
                            artifact.uri,
                            os.path.join(self._recorded_output_dir,
                                         component_id, name, str(idx)))

        # Calls verifier for pipeline output artifacts, excluding the resolver node.
        local_dag_runner.LocalDagRunner().run(self.imdb_pipeline)
        pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
            self.imdb_pipeline.metadata_connection_config,
            self.imdb_pipeline.pipeline_info)

        verifier_map = {
            'model': self._verify_model,
            'model_run': self._verify_model,
            'examples': self._verify_examples,
            'schema': self._verify_schema,
            'anomalies': self._verify_anomalies,
            'evaluation': self._verify_evaluation
        }

        # List of components to verify. ResolverNode is ignored because it
        # doesn't have an executor.
        verify_component_ids = [
            component.id for component in self.imdb_pipeline.components
            if not component.id.startswith('ResolverNode')
        ]

        for component_id in verify_component_ids:
            for key, artifact_dict in pipeline_outputs[component_id].items():
                for idx, artifact in artifact_dict.items():
                    logging.info('Verifying %s', component_id)
                    recorded_uri = os.path.join(self._recorded_output_dir,
                                                component_id, key, str(idx))
                    verifier_map.get(key, self._verify_file_path)(artifact.uri,
                                                                  recorded_uri)
Example #11
    def testRegisterExecutionBackwardCompatibility(self):
        with metadata.Metadata(connection_config=self._connection_config) as m:
            contexts = m.register_pipeline_contexts_if_not_exists(
                self._pipeline_info)

            # Registers an execution that needs more columns in the MLMD schema
            # first, then one that needs fewer columns. Verifies that the
            # schema update does not break backward compatibility.
            exec_properties_one = {'arg_one': 1, 'arg_two': 2}
            exec_properties_two = {'arg_one': 1}
            execution_one = m.register_execution(
                input_artifacts={},
                exec_properties=exec_properties_one,
                pipeline_info=self._pipeline_info,
                component_info=self._component_info,
                contexts=contexts)
            execution_two = m.register_execution(
                input_artifacts={},
                exec_properties=exec_properties_two,
                pipeline_info=self._pipeline_info,
                component_info=self._component_info3,
                contexts=contexts)
            [execution_one, execution_two] = m.store.get_executions_by_id(
                [execution_one.id, execution_two.id])
            # Skip verifying time sensitive fields.
            execution_one.ClearField('create_time_since_epoch')
            execution_one.ClearField('last_update_time_since_epoch')
            self.assertProtoEquals(
                """
        id: 1
        type_id: 3
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }
        properties {
          key: "arg_two"
          value {
            string_value: "2"
          }
        }""", execution_one)
            # Skip verifying time sensitive fields.
            execution_two.ClearField('create_time_since_epoch')
            execution_two.ClearField('last_update_time_since_epoch')
            self.assertProtoEquals(
                """
        id: 2
        type_id: 3
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }""", execution_two)
Example #12
    def testTaxiPipelineBeam(self):
        num_components = 10

        BeamDagRunner().run(
            taxi_pipeline_infraval_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(metadata_config) as m:
            artifact_count = len(m.store.get_artifacts())
            execution_count = len(m.store.get_executions())
            self.assertGreaterEqual(artifact_count, execution_count)
            self.assertEqual(num_components, execution_count)

        self.assertPipelineExecution()
        self.assertInfraValidatorPassed()

        # Runs pipeline the second time.
        BeamDagRunner().run(
            taxi_pipeline_infraval_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        # All executions but Evaluator and Pusher are cached.
        # Note that Resolver will always execute.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is increased by 3 caused by Evaluator and Pusher.
            self.assertLen(m.store.get_artifacts(), artifact_count + 3)
            artifact_count = len(m.store.get_artifacts())
            # 10 more cached executions.
            self.assertLen(m.store.get_executions(), num_components * 2)

        # Runs pipeline the third time.
        BeamDagRunner().run(
            taxi_pipeline_infraval_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        # Asserts cache execution.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is unchanged.
            self.assertLen(m.store.get_artifacts(), artifact_count)
            # 10 more cached executions.
            self.assertLen(m.store.get_executions(), num_components * 3)
Example #13
    def testTaxiPipelineWithImporter(self):
        BeamDagRunner().run(
            taxi_pipeline_importer._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                user_schema_path=self._user_schema_path,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(metadata_config) as m:
            artifact_count = len(m.store.get_artifacts())
            execution_count = len(m.store.get_executions())
            self.assertGreaterEqual(artifact_count, execution_count)
            self.assertEqual(10, execution_count)

        self.assertPipelineExecution()

        # Runs the pipeline again.
        BeamDagRunner().run(
            taxi_pipeline_importer._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                user_schema_path=self._user_schema_path,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        # All executions but Evaluator and Pusher are cached.
        # Note that Resolver will always execute.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is increased by 3 caused by Evaluator and Pusher.
            self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
            artifact_count = len(m.store.get_artifacts())
            self.assertEqual(20, len(m.store.get_executions()))

        # Runs the pipeline the third time.
        BeamDagRunner().run(
            taxi_pipeline_importer._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                user_schema_path=self._user_schema_path,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                beam_pipeline_args=[]))

        # Asserts cache execution.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is unchanged.
            self.assertEqual(artifact_count, len(m.store.get_artifacts()))
            self.assertEqual(30, len(m.store.get_executions()))
Example #14
    def testRegisterExecutionUpdatedExecutionType(self):
        with metadata.Metadata(connection_config=self._connection_config) as m:
            contexts_one = m.register_pipeline_contexts_if_not_exists(
                self._pipeline_info)
            contexts_two = m.register_pipeline_contexts_if_not_exists(
                self._pipeline_info3)

            # Registers an execution that needs fewer columns in the MLMD
            # schema first, then one that needs more columns. Verifies that the
            # schema update is not a breaking change.
            exec_properties_one = {'arg_one': 1}
            exec_properties_two = {'arg_one': 1, 'arg_two': 2}
            execution_one = m.register_execution(
                input_artifacts={},
                exec_properties=exec_properties_one,
                pipeline_info=self._pipeline_info,
                component_info=self._component_info,
                contexts=contexts_one)
            execution_two = m.register_execution(
                input_artifacts={},
                exec_properties=exec_properties_two,
                pipeline_info=self._pipeline_info3,
                component_info=self._component_info3,
                contexts=contexts_two)
            [execution_one, execution_two] = m.store.get_executions_by_id(
                [execution_one.id, execution_two.id])
            self.assertProtoEquals(
                """
        id: 1
        type_id: 3
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }""", execution_one)
            self.assertProtoEquals(
                """
        id: 2
        type_id: 3
        properties {
          key: "state"
          value {
            string_value: "new"
          }
        }
        properties {
          key: "pipeline_name"
          value {
            string_value: "my_pipeline2"
          }
        }
        properties {
          key: "pipeline_root"
          value {
            string_value: "/tmp"
          }
        }
        properties {
          key: "run_id"
          value {
            string_value: "my_run_id"
          }
        }
        properties {
          key: "component_id"
          value {
            string_value: "my_component"
          }
        }
        properties {
          key: "arg_one"
          value {
            string_value: "1"
          }
        }
        properties {
          key: "arg_two"
          value {
            string_value: "2"
          }
        }""", execution_two)
Example #15
 def testPublishSuccessfulExecution(self):
   with metadata.Metadata(connection_config=self._connection_config) as m:
     contexts = self._generate_contexts(m)
     execution_id = execution_publish_utils.register_execution(
         m, self._execution_type, contexts).id
     output_key = 'examples'
     output_example = standard_artifacts.Examples()
     output_example.uri = '/examples_uri'
     executor_output = execution_result_pb2.ExecutorOutput()
     text_format.Parse(
         """
         uri: '/examples_uri'
         custom_properties {
           key: 'prop'
           value {int_value: 1}
         }
         """, executor_output.output_artifacts[output_key].artifacts.add())
     output_dict = execution_publish_utils.publish_succeeded_execution(
         m, execution_id, contexts, {output_key: [output_example]},
         executor_output)
     [execution] = m.store.get_executions()
     self.assertProtoPartiallyEquals(
         """
         id: 1
         type_id: 3
         last_known_state: COMPLETE
         """,
         execution,
         ignored_fields=[
             'create_time_since_epoch', 'last_update_time_since_epoch'
         ])
     [artifact] = m.store.get_artifacts()
     self.assertProtoPartiallyEquals(
         """
         id: 1
         type_id: 4
         state: LIVE
         uri: '/examples_uri'
         custom_properties {
           key: 'prop'
           value {int_value: 1}
         }""",
         artifact,
         ignored_fields=[
             'create_time_since_epoch', 'last_update_time_since_epoch'
         ])
     [event] = m.store.get_events_by_execution_ids([execution.id])
     self.assertProtoPartiallyEquals(
         """
         artifact_id: 1
         execution_id: 1
         path {
           steps {
             key: 'examples'
           }
           steps {
             index: 0
           }
         }
         type: OUTPUT
         """,
         event,
         ignored_fields=['milliseconds_since_epoch'])
     # Verifies the context-execution edges are set up.
     self.assertCountEqual(
         [c.id for c in contexts],
         [c.id for c in m.store.get_contexts_by_execution(execution.id)])
     for artifact_list in output_dict.values():
       for output_example in artifact_list:
         self.assertCountEqual([c.id for c in contexts], [
             c.id for c in m.store.get_contexts_by_artifact(output_example.id)
         ])
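
The ExecutorOutput parsed from text above can just as well be built field by field with the same proto API; a sketch:

from tfx.proto.orchestration import execution_result_pb2

# Sketch: building the same ExecutorOutput without text_format.
executor_output = execution_result_pb2.ExecutorOutput()
new_artifact = executor_output.output_artifacts['examples'].artifacts.add()
new_artifact.uri = '/examples_uri'
new_artifact.custom_properties['prop'].int_value = 1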
Example #16
  def testIrisPipelineNativeKeras(self):
    BeamDagRunner().run(
        iris_pipeline_native_keras_infraval._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=[]))

    self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
    self.assertTrue(tf.io.gfile.exists(self._metadata_path))
    expected_execution_count = 10  # 9 components + 1 resolver
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    with metadata.Metadata(metadata_config) as m:
      artifact_count = len(m.store.get_artifacts())
      execution_count = len(m.store.get_executions())
      self.assertGreaterEqual(artifact_count, execution_count)
      self.assertEqual(expected_execution_count, execution_count)

    self.assertPipelineExecution()
    self.assertInfraValidatorPassed()

    # Runs pipeline the second time.
    BeamDagRunner().run(
        iris_pipeline_native_keras_infraval._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=[]))

    # All executions but Evaluator and Pusher are cached.
    with metadata.Metadata(metadata_config) as m:
      # Artifact count is increased by 3 caused by Evaluator and Pusher.
      self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
      artifact_count = len(m.store.get_artifacts())
      self.assertEqual(expected_execution_count * 2,
                       len(m.store.get_executions()))

    # Runs pipeline the third time.
    BeamDagRunner().run(
        iris_pipeline_native_keras_infraval._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=[]))

    # Asserts cache execution.
    with metadata.Metadata(metadata_config) as m:
      # Artifact count is unchanged.
      self.assertEqual(artifact_count, len(m.store.get_artifacts()))
      self.assertEqual(expected_execution_count * 3,
                       len(m.store.get_executions()))
Example #17
  def testPublishSuccessExecutionExecutorEditedOutputDict(self):
    # There is one artifact in the system-provided output_dict, while there
    # are two artifacts in the executor output. We expect that both artifacts
    # are published.
    with metadata.Metadata(connection_config=self._connection_config) as m:
      contexts = self._generate_contexts(m)
      execution_id = execution_publish_utils.register_execution(
          m, self._execution_type, contexts).id

      output_example = standard_artifacts.Examples()
      output_example.uri = '/original_path'

      executor_output = execution_result_pb2.ExecutorOutput()
      output_key = 'examples'
      text_format.Parse(
          """
          uri: '/original_path/subdir_1'
          custom_properties {
            key: 'prop'
            value {int_value: 1}
          }
          """, executor_output.output_artifacts[output_key].artifacts.add())
      text_format.Parse(
          """
          uri: '/original_path/subdir_2'
          custom_properties {
            key: 'prop'
            value {int_value: 2}
          }
          """, executor_output.output_artifacts[output_key].artifacts.add())

      output_dict = execution_publish_utils.publish_succeeded_execution(
          m, execution_id, contexts, {output_key: [output_example]},
          executor_output)
      [execution] = m.store.get_executions()
      self.assertProtoPartiallyEquals(
          """
          id: 1
          type_id: 3
          last_known_state: COMPLETE
          """,
          execution,
          ignored_fields=[
              'create_time_since_epoch', 'last_update_time_since_epoch'
          ])
      artifacts = m.store.get_artifacts()
      self.assertLen(artifacts, 2)
      self.assertProtoPartiallyEquals(
          """
          id: 1
          type_id: 4
          state: LIVE
          uri: '/original_path/subdir_1'
          custom_properties {
            key: 'prop'
            value {int_value: 1}
          }""",
          artifacts[0],
          ignored_fields=[
              'create_time_since_epoch', 'last_update_time_since_epoch'
          ])
      self.assertProtoPartiallyEquals(
          """
          id: 2
          type_id: 4
          state: LIVE
          uri: '/original_path/subdir_2'
          custom_properties {
            key: 'prop'
            value {int_value: 2}
          }""",
          artifacts[1],
          ignored_fields=[
              'create_time_since_epoch', 'last_update_time_since_epoch'
          ])
      events = m.store.get_events_by_execution_ids([execution.id])
      self.assertLen(events, 2)
      self.assertProtoPartiallyEquals(
          """
          artifact_id: 1
          execution_id: 1
          path {
            steps {
              key: 'examples'
            }
            steps {
              index: 0
            }
          }
          type: OUTPUT
          """,
          events[0],
          ignored_fields=['milliseconds_since_epoch'])
      self.assertProtoPartiallyEquals(
          """
          artifact_id: 2
          execution_id: 1
          path {
            steps {
              key: 'examples'
            }
            steps {
              index: 1
            }
          }
          type: OUTPUT
          """,
          events[1],
          ignored_fields=['milliseconds_since_epoch'])
      # Verifies the context-execution edges are set up.
      self.assertCountEqual(
          [c.id for c in contexts],
          [c.id for c in m.store.get_contexts_by_execution(execution.id)])
      for artifact_list in output_dict.values():
        for output_example in artifact_list:
          self.assertCountEqual([c.id for c in contexts], [
              c.id for c in m.store.get_contexts_by_artifact(output_example.id)
          ])
Example #18
    def setUp(self):
        super(TaskManagerE2ETest, self).setUp()
        pipeline_root = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self.id())

        # Makes sure multiple connections within a test always connect to the same
        # MLMD instance.
        metadata_path = os.path.join(pipeline_root, 'metadata', 'metadata.db')
        self._metadata_path = metadata_path
        connection_config = metadata.sqlite_metadata_connection_config(
            metadata_path)
        connection_config.sqlite.SetInParent()
        self._mlmd_connection = metadata.Metadata(
            connection_config=connection_config)

        # Sets up the pipeline.
        pipeline = pipeline_pb2.Pipeline()
        self.load_proto_from_text(
            os.path.join(os.path.dirname(__file__), 'testdata',
                         'async_pipeline.pbtxt'), pipeline)

        # Extracts components.
        self._example_gen = pipeline.nodes[0].pipeline_node
        self._transform = pipeline.nodes[1].pipeline_node
        self._trainer = pipeline.nodes[2].pipeline_node

        # Pack deployment config for testing.
        deployment_config = pipeline_pb2.IntermediateDeploymentConfig()
        executor_spec = pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec(
            class_path='fake.ClassPath')
        deployment_config.executor_specs[self._trainer.node_info.id].Pack(
            executor_spec)
        deployment_config.executor_specs[self._transform.node_info.id].Pack(
            executor_spec)
        self._type_url = deployment_config.executor_specs[
            self._trainer.node_info.id].type_url
        pipeline.deployment_config.Pack(deployment_config)
        self._pipeline = pipeline
        self._pipeline_info = pipeline.pipeline_info
        self._pipeline_runtime_spec = pipeline.runtime_spec
        self._pipeline_runtime_spec.pipeline_root.field_value.string_value = (
            pipeline_root)

        ts.TaskSchedulerRegistry.clear()
        self._task_queue = tq.TaskQueue()

        # Run fake example-gen to prepare downstream component triggers.
        test_utils.fake_example_gen_run(self._mlmd_connection,
                                        self._example_gen, 1, 1)

        # Task generator should produce a task to run transform.
        with self._mlmd_connection as m:
            tasks = asptg.AsyncPipelineTaskGenerator(
                m, self._pipeline,
                self._task_queue.contains_task_id).generate()
        self.assertLen(tasks, 1)
        self._task = tasks[0]
        self.assertEqual('my_transform', self._task.node_uid.node_id)
        self._task_queue.enqueue(self._task)

        # There should be 1 active execution in MLMD.
        with self._mlmd_connection as m:
            executions = m.store.get_executions()
        active_executions = [
            e for e in executions
            if e.last_known_state == metadata_store_pb2.Execution.RUNNING
        ]
        self.assertLen(active_executions, 1)

        # Active execution id.
        self._execution_id = active_executions[0].id
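
The executor specs packed into deployment_config above are google.protobuf.Any values, which is why the test captures type_url. Reading one back is symmetric; a sketch using the names from this setUp:

# Sketch: unpacking the executor spec that setUp packed above (a standard
# google.protobuf.Any round-trip); `deployment_config` and `self._trainer`
# are the objects from setUp.
unpacked = pipeline_pb2.ExecutorSpec.PythonClassExecutorSpec()
deployment_config.executor_specs[self._trainer.node_info.id].Unpack(unpacked)
assert unpacked.class_path == 'fake.ClassPath'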