def testPenguinPipelineLocalWithImporter(self, model_framework):
    module_file = self._module_file_name(model_framework)
    LocalDagRunner().run(
        penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=module_file,
            accuracy_threshold=0.1,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=self._schema_path,
            enable_tuning=False,
            enable_bulk_inferrer=False,
            examplegen_input_config=None,
            examplegen_range_config=None,
            resolver_range_config=None,
            beam_pipeline_args=[]))

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    expected_execution_count = 9  # 7 components + 1 resolver + 1 importer
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    store = mlmd.MetadataStore(metadata_config)
    artifact_count = len(store.get_artifacts())
    execution_count = len(store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

    self._assertPipelineExecution(has_schema_gen=False)

  def testPenguinPipelineLocalWithTuner(self):
    # TODO(b/180723394): Parameterize this test when Flax supports tuning.
    module_file = self._module_file_name('keras')
    LocalDagRunner().run(
        penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=module_file,
            accuracy_threshold=0.1,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=None,
            enable_tuning=True,
            enable_bulk_inferrer=False,
            examplegen_input_config=None,
            examplegen_range_config=None,
            resolver_range_config=None,
            beam_pipeline_args=self._make_beam_pipeline_args()))

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    expected_execution_count = 10  # 9 components + 1 resolver
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    store = mlmd.MetadataStore(metadata_config)
    artifact_count = len(store.get_artifacts())
    execution_count = len(store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

    self._assertPipelineExecution(has_tuner=True)

    def testPenguinPipelineLocal(self, model_framework):
        if model_framework == 'tfdf_experimental':
            # Skip if TFDF is not available or incompatible.
            try:
                importlib.import_module('tensorflow_decision_forests')
            except (ImportError, tf.errors.NotFoundError):
                self.skipTest('TensorflowDecisionForests is not available')
        module_file = self._module_file_name(model_framework)
        pipeline = penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=module_file,
            accuracy_threshold=0.1,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=None,
            enable_tuning=False,
            enable_bulk_inferrer=False,
            examplegen_input_config=None,
            examplegen_range_config=None,
            resolver_range_config=None,
            beam_pipeline_args=self._make_beam_pipeline_args(),
            enable_transform_input_cache=False)

        logging.info('Starting the first pipeline run.')
        LocalDagRunner().run(pipeline)

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 8  # 7 components + 1 resolver
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution()

        logging.info('Starting the second pipeline run. All components except '
                     'Evaluator and Pusher will use cached results.')
        LocalDagRunner().run(pipeline)

        # Artifact count increases by 3: Evaluator and Pusher run again and
        # produce new artifacts.
        self.assertLen(store.get_artifacts(), artifact_count + 3)
        artifact_count = len(store.get_artifacts())
        self.assertLen(store.get_executions(), expected_execution_count * 2)

        logging.info('Starting the third pipeline run. '
                     'All components will use cached results.')
        LocalDagRunner().run(pipeline)

        # Asserts cache execution.
        # Artifact count is unchanged.
        self.assertLen(store.get_artifacts(), artifact_count)
        self.assertLen(store.get_executions(), expected_execution_count * 3)

    def testPenguinPipelineLocalConditionalWithoutPusher(self):
        module_file = self._module_file_name('keras')
        pipeline = penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=module_file,
            # Model evaluation will fail with 1.0 threshold.
            accuracy_threshold=1.0,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=None,
            enable_tuning=False,
            enable_bulk_inferrer=False,
            examplegen_input_config=None,
            examplegen_range_config=None,
            resolver_range_config=None,
            beam_pipeline_args=self._make_beam_pipeline_args(),
            enable_transform_input_cache=False)

        logging.info('Starting the first pipeline run.')
        LocalDagRunner().run(pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 7  # Without pusher because evaluation fails
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution(has_pusher=False)

        logging.info('Starting the second pipeline run. All components except '
                     'Evaluator will use cached results. Pusher will not run.')
        LocalDagRunner().run(pipeline)

        # Artifact count stays the same, because no new blessed model,
        # hence no new evaluation and no new pushed model.
        self.assertLen(store.get_artifacts(), artifact_count)
        self.assertLen(store.get_executions(), expected_execution_count * 2)

        logging.info('Starting the third pipeline run. '
                     'All components will use cached results.')
        LocalDagRunner().run(pipeline)

        # Asserts cache execution.
        # Artifact count is unchanged.
        self.assertLen(store.get_artifacts(), artifact_count)
        self.assertLen(store.get_executions(), expected_execution_count * 3)

  def testPenguinPipelineLocal(self, model_framework):
    module_file = self._module_file_name(model_framework)
    pipeline = penguin_pipeline_local._create_pipeline(
        pipeline_name=self._pipeline_name,
        data_root=self._data_root,
        module_file=module_file,
        accuracy_threshold=0.1,
        serving_model_dir=self._serving_model_dir,
        pipeline_root=self._pipeline_root,
        metadata_path=self._metadata_path,
        user_provided_schema_path=None,
        enable_tuning=False,
        enable_bulk_inferrer=False,
        examplegen_input_config=None,
        examplegen_range_config=None,
        resolver_range_config=None,
        beam_pipeline_args=self._make_beam_pipeline_args())

    logging.info('Starting the first pipeline run.')
    LocalDagRunner().run(pipeline)

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    expected_execution_count = 9  # 8 components + 1 resolver
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    store = mlmd.MetadataStore(metadata_config)
    artifact_count = len(store.get_artifacts())
    execution_count = len(store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

    self._assertPipelineExecution()

    logging.info('Starting the second pipeline run. All components except '
                 'Evaluator and Pusher will use cached results.')
    LocalDagRunner().run(pipeline)

    # Artifact count increases by 3: Evaluator and Pusher run again and
    # produce new artifacts.
    self.assertLen(store.get_artifacts(), artifact_count + 3)
    artifact_count = len(store.get_artifacts())
    self.assertLen(store.get_executions(), expected_execution_count * 2)

    logging.info('Starting the third pipeline run. '
                 'All components will use cached results.')
    LocalDagRunner().run(pipeline)

    # Asserts cache execution.
    # Artifact count is unchanged.
    self.assertLen(store.get_artifacts(), artifact_count)
    self.assertLen(store.get_executions(), expected_execution_count * 3)

    def __enter__(self) -> 'Metadata':
        # TODO(ruoyu): Establish a connection pool instead of creating a new
        # connection every time. Until then, check self._store before usage
        # in every method.
        connection_error = None
        for _ in range(_MAX_INIT_RETRY):
            try:
                self._store = mlmd.MetadataStore(self._connection_config)
            except RuntimeError as err:
                # MetadataStore could raise an Aborted error if multiple
                # concurrent connections try to execute the initialization DDL
                # in the database. This is safe to retry.
                connection_error = err
                time.sleep(random.random())
                continue
            else:
                return self

        raise RuntimeError(
            'Failed to establish connection to Metadata storage with error: %s'
            % connection_error)
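
# Usage sketch, not from the original source: the Metadata class above is a
# context manager, so callers typically wrap store access in a `with` block.
# This assumes the class lives in tfx.orchestration.metadata and exposes the
# underlying mlmd store via a `store` property.
def _count_artifacts(metadata_path: str) -> int:
    """Counts artifacts using the Metadata context manager."""
    connection_config = metadata.sqlite_metadata_connection_config(
        metadata_path)
    with metadata.Metadata(connection_config) as m:
        # __enter__ above retries MetadataStore construction before returning.
        return len(m.store.get_artifacts())
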

    def testPenguinPipelineLocalWithBulkInferrer(self, model_framework):
        if model_framework == 'tfdf_experimental':
            # Skip if TFDF is not available or incompatible.
            try:
                importlib.import_module('tensorflow_decision_forests')
            except (ImportError, tf.errors.NotFoundError):
                self.skipTest('TensorflowDecisionForests is not available')
        module_file = self._module_file_name(model_framework)
        LocalDagRunner().run(
            penguin_pipeline_local._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=module_file,
                accuracy_threshold=0.1,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                user_provided_schema_path=None,
                enable_tuning=False,
                enable_bulk_inferrer=True,
                examplegen_input_config=None,
                examplegen_range_config=None,
                resolver_range_config=None,
                beam_pipeline_args=[],
                enable_transform_input_cache=False))

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 10  # 9 components + 1 resolver
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution(has_bulk_inferrer=True)


def get_tfx_pipeline_metadata_store(tmp_db_path: str) -> mlmd.MetadataStore:
    """Copies and opens a metadata_store from the testdata tfx pipeline db.

  It migrates the db to the compatible schema at the head. In addition, it
  updates the stored artifacts' uri to the test data db path, so that the test
  code can open the testdata files mentioned in the database.

  Args:
    tmp_db_path: a temp path for copying the pipeline database.

  Returns:
    A ml-metadata store for the copied pipeline db.
  """
    testdata_db_path = os.path.join(_TEST_DATA_DIR, _TFX_0_21_DB_FILE)
    shutil.copyfile(testdata_db_path, tmp_db_path)

    connection_config = metadata_store_pb2.ConnectionConfig(
        sqlite=metadata_store_pb2.SqliteMetadataSourceConfig(
            filename_uri=tmp_db_path,
            connection_mode=(
                metadata_store_pb2.SqliteMetadataSourceConfig.READWRITE),
        ))
    # The pipeline db was created with mlmd 0.21, while a test run from head
    # may use a newer mlmd schema version, so we migrate the db to the newer
    # schema if needed.
    store = mlmd.MetadataStore(connection_config,
                               enable_upgrade_migration=True)
    # The pipeline db was generated by real pipeline runs, which stored the
    # artifact payloads in the file system at run time. We fix each uri to
    # point to the testdata payloads generated by the pipeline.
    fixed_artifacts = []
    for artifact in store.get_artifacts():
        artifact.uri = artifact.uri.replace(_TFX_0_21_PAYLOAD_DIR,
                                            _TEST_DATA_DIR)
        fixed_artifacts.append(artifact)
    store.put_artifacts(fixed_artifacts)
    return store
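
# Usage sketch, not from the original source: a test would typically call the
# helper above with a temp path so the copied db can be mutated freely. The
# tempfile-based path below is an illustrative assumption.
import tempfile

def _open_copied_pipeline_store() -> mlmd.MetadataStore:
    tmp_db_path = os.path.join(tempfile.mkdtemp(), 'copied_pipeline.sqlite')
    # Artifacts in the returned store have uris rewritten to _TEST_DATA_DIR.
    return get_tfx_pipeline_metadata_store(tmp_db_path)
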
  def testPenguinPipelineLocalWithRollingWindow(self, model_framework):
    module_file = self._module_file_name('keras')
    examplegen_input_config = proto.Input(splits=[
        proto.Input.Split(name='test', pattern='day{SPAN}/*'),
    ])
    resolver_range_config = proto.RangeConfig(
        rolling_range=proto.RollingRange(num_spans=2))

    def run_pipeline(examplegen_range_config):
      LocalDagRunner().run(
          penguin_pipeline_local._create_pipeline(
              pipeline_name=self._pipeline_name,
              data_root=self._data_root_span,
              module_file=module_file,
              accuracy_threshold=0.1,
              serving_model_dir=self._serving_model_dir,
              pipeline_root=self._pipeline_root,
              metadata_path=self._metadata_path,
              user_provided_schema_path=None,
              enable_tuning=False,
              enable_bulk_inferrer=False,
              examplegen_input_config=examplegen_input_config,
              examplegen_range_config=examplegen_range_config,
              resolver_range_config=resolver_range_config,
              beam_pipeline_args=self._make_beam_pipeline_args()))

    # Trigger the pipeline for the first span.
    examplegen_range_config = proto.RangeConfig(
        static_range=proto.StaticRange(
            start_span_number=1, end_span_number=1))
    run_pipeline(examplegen_range_config)

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    self._assertPipelineExecution()
    transform_execution_type = 'tfx.components.transform.component.Transform'
    trainer_execution_type = 'tfx.components.trainer.component.Trainer'
    expected_execution_count = 10  # 8 components + 2 resolvers
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    store = mlmd.MetadataStore(metadata_config)
    artifact_count = len(store.get_artifacts())
    execution_count = len(store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)
    # Verify Transform's input examples artifacts.
    tft_input_examples_artifacts = self._get_input_examples_artifacts(
        store, transform_execution_type)
    self.assertLen(tft_input_examples_artifacts, 1)
    # SpansResolver (controlled by resolver_range_config) returns span 1.
    self.assertEqual(
        1, tft_input_examples_artifacts[0].custom_properties[
            _SPAN_PROPERTY_NAME].int_value)

    # Trigger the pipeline for the second span.
    examplegen_range_config = proto.RangeConfig(
        static_range=proto.StaticRange(
            start_span_number=2, end_span_number=2))
    run_pipeline(examplegen_range_config)

    execution_count = len(store.get_executions())
    self.assertEqual(expected_execution_count * 2, execution_count)
    # Verify Transform's input examples artifacts.
    tft_input_examples_artifacts = self._get_input_examples_artifacts(
        store, transform_execution_type)
    self.assertLen(tft_input_examples_artifacts, 2)
    spans = {
        tft_input_examples_artifacts[0].custom_properties[
            _SPAN_PROPERTY_NAME].int_value,
        tft_input_examples_artifacts[1].custom_properties[
            _SPAN_PROPERTY_NAME].int_value
    }
    # SpansResolver (controlled by resolver_range_config) returns span 1 & 2.
    self.assertSetEqual({1, 2}, spans)
    # Verify Trainer's input examples artifacts.
    self.assertLen(
        self._get_input_examples_artifacts(store, trainer_execution_type),
        2)

    # Trigger the pipeline for the third span.
    examplegen_range_config = proto.RangeConfig(
        static_range=proto.StaticRange(
            start_span_number=3, end_span_number=3))
    run_pipeline(examplegen_range_config)

    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    execution_count = len(store.get_executions())
    self.assertEqual(expected_execution_count * 3, execution_count)
    # Verify Transform's input examples artifacts.
    tft_input_examples_artifacts = self._get_input_examples_artifacts(
        store, transform_execution_type)
    self.assertLen(tft_input_examples_artifacts, 2)
    spans = {
        tft_input_examples_artifacts[0].custom_properties[
            _SPAN_PROPERTY_NAME].int_value,
        tft_input_examples_artifacts[1].custom_properties[
            _SPAN_PROPERTY_NAME].int_value
    }
    # SpansResolver (controlled by resolver_range_config) returns span 2 & 3.
    self.assertSetEqual({2, 3}, spans)
    # Verify Trainer's input examples artifacts.
    self.assertLen(
        self._get_input_examples_artifacts(store, trainer_execution_type),
        2)
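
  # Illustrative sketch, not the original implementation: a helper like
  # _get_input_examples_artifacts used above could query mlmd roughly as
  # follows, returning the Examples artifacts consumed by executions of the
  # given type. The 'Examples' type name and the INPUT-event filter are
  # assumptions; the store calls are standard mlmd.MetadataStore APIs.
  def _get_input_examples_artifacts_sketch(self, store, execution_type):
    executions = store.get_executions_by_type(execution_type)
    events = store.get_events_by_execution_ids([e.id for e in executions])
    input_ids = {
        event.artifact_id
        for event in events
        if event.type == metadata_store_pb2.Event.INPUT
    }
    examples_type = store.get_artifact_type('Examples')
    return [
        artifact for artifact in store.get_artifacts_by_id(list(input_ids))
        if artifact.type_id == examples_type.id
    ]
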
    def test_scaffold_assets_with_source(self, output_file_format: str,
                                         artifacts: bool):
        if artifacts:
            connection_config = metadata_store_pb2.ConnectionConfig()
            connection_config.fake_database.SetInParent()
            mlmd_store = mlmd.MetadataStore(connection_config)
        else:
            mlmd_store = None

        train_dataset_name = 'Dataset-Split-train'
        train_features = ['feature_name1']
        eval_dataset_name = 'Dataset-Split-eval'
        eval_features = ['feature_name2']

        tfma_path = os.path.join(self.tmpdir, 'tfma')
        tfdv_path = os.path.join(self.tmpdir, 'tfdv')
        pushed_model_path = os.path.join(self.tmpdir, 'pushed_model')
        self._write_tfma(tfma_path, output_file_format, mlmd_store)
        self._write_tfdv(tfdv_path, train_dataset_name, train_features,
                         eval_dataset_name, eval_features, mlmd_store)

        if artifacts:
            model_evaluation_artifacts = mlmd_store.get_artifacts_by_type(
                standard_artifacts.ModelEvaluation.TYPE_NAME)
            example_statistics_artifacts = mlmd_store.get_artifacts_by_type(
                standard_artifacts.ExampleStatistics.TYPE_NAME)
            pushed_model_artifact = standard_artifacts.PushedModel()
            pushed_model_artifact.uri = pushed_model_path
            tfma_src = src.TfmaSource(
                model_evaluation_artifacts=model_evaluation_artifacts,
                metrics_exclude=['average_loss'])
            tfdv_src = src.TfdvSource(
                example_statistics_artifacts=example_statistics_artifacts,
                features_include=['feature_name1'])
            model_src = src.ModelSource(
                pushed_model_artifact=pushed_model_artifact)
        else:
            tfma_src = src.TfmaSource(eval_result_paths=[tfma_path],
                                      metrics_exclude=['average_loss'])
            tfdv_src = src.TfdvSource(dataset_statistics_paths=[tfdv_path],
                                      features_include=['feature_name1'])
            model_src = src.ModelSource(pushed_model_path=pushed_model_path)

        mc = model_card_toolkit.ModelCardToolkit(source=src.Source(
            tfma=tfma_src, tfdv=tfdv_src, model=model_src)).scaffold_assets()

        with self.subTest(name='quantitative_analysis'):
            list_to_proto = lambda lst: [x.to_proto() for x in lst]
            expected_performance_metrics = [
                model_card.PerformanceMetric(
                    type='post_export_metrics/example_count', value='2.0')
            ]
            self.assertCountEqual(
                list_to_proto(mc.quantitative_analysis.performance_metrics),
                list_to_proto(expected_performance_metrics))
            self.assertLen(mc.quantitative_analysis.graphics.collection, 1)

        with self.subTest(name='model_parameters.data'):
            self.assertLen(mc.model_parameters.data, 2)  # train and eval
            for dataset in mc.model_parameters.data:
                for graphic in dataset.graphics.collection:
                    self.assertIsNotNone(
                        graphic.image,
                        msg='No image found for graphic: '
                        f'{dataset.name} {graphic.name}')
                    # Ignore graphic.image for the assertions below.
                    graphic.image = None
            self.assertIn(
                model_card.Dataset(
                    name=train_dataset_name,
                    graphics=model_card.GraphicsCollection(collection=[
                        model_card.Graphic(name='counts | feature_name1')
                    ])), mc.model_parameters.data)
            self.assertNotIn(
                model_card.Dataset(
                    name=eval_dataset_name,
                    graphics=model_card.GraphicsCollection(collection=[
                        model_card.Graphic(name='counts | feature_name2')
                    ])), mc.model_parameters.data)

        with self.subTest(name='model_details.path'):
            self.assertEqual(mc.model_details.path, pushed_model_path)

  def _get_empty_metadata_store(self):
    """Returns an empty in-memory mlmd store."""
    empty_db_config = metadata_store_pb2.ConnectionConfig()
    empty_db_config.fake_database.SetInParent()
    return mlmd.MetadataStore(empty_db_config)
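
  # Usage sketch, not from the original source: an in-memory fake_database
  # store like the one returned above starts empty, so a test would typically
  # register an artifact type and a few artifacts before asserting on it. The
  # 'Examples' type name and uri below are illustrative.
  def _populate_empty_store(self):
    store = self._get_empty_metadata_store()
    examples_type = metadata_store_pb2.ArtifactType(name='Examples')
    type_id = store.put_artifact_type(examples_type)
    artifact = metadata_store_pb2.Artifact(
        type_id=type_id, uri='/tmp/examples')
    store.put_artifacts([artifact])
    return store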