Example #1
    def testPenguinPipelineLocalWithImporter(self):
        module_file = self._module_file_name('keras')
        LocalDagRunner().run(
            penguin_pipeline_local._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=module_file,
                accuracy_threshold=0.1,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                user_provided_schema_path=self._schema_path,
                enable_tuning=False,
                enable_bulk_inferrer=False,
                examplegen_input_config=None,
                examplegen_range_config=None,
                resolver_range_config=None,
                beam_pipeline_args=[],
                enable_transform_input_cache=False))

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 9  # 7 components + 1 resolver + 1 importer
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution(has_schema_gen=False)
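
Every example on this page reads fixture attributes such as `self._pipeline_name`, `self._data_root`, and `self._metadata_path` without showing where they come from. A minimal `setUp` sketch that would satisfy them, assuming a temp-dir layout, per-framework module-file naming, and plain DirectRunner flags (all guesses, not the test's actual code):

import os
import tempfile
import unittest


class PenguinPipelineLocalEndToEndTest(unittest.TestCase):

    def setUp(self):
        super().setUp()
        test_dir = tempfile.mkdtemp()
        example_dir = os.path.dirname(__file__)
        self._pipeline_name = 'penguin_local_test'
        self._data_root = os.path.join(example_dir, 'data')
        self._data_root_span = os.path.join(example_dir, 'data_span')
        self._schema_path = os.path.join(example_dir, 'schema')
        self._pipeline_root = os.path.join(test_dir, 'pipelines')
        self._serving_model_dir = os.path.join(test_dir, 'serving_model')
        self._metadata_path = os.path.join(test_dir, 'metadata', 'metadata.db')

    def _module_file_name(self, model_framework):
        # Assumption: one training module file per supported framework.
        return os.path.join(
            os.path.dirname(__file__), f'penguin_utils_{model_framework}.py')

    def _make_beam_pipeline_args(self):
        # Assumption: DirectRunner flags for a local, multi-process run.
        return [
            '--direct_running_mode=multi_processing',
            '--direct_num_workers=2',
        ]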
Example #2
    def testPenguinPipelineLocal(self, model_framework):
        if model_framework == 'tfdf_experimental':
            # Skip if TFDF is not available or incompatible.
            try:
                importlib.import_module('tensorflow_decision_forests')
            except (ImportError, tf.errors.NotFoundError):
                self.skipTest('TensorflowDecisionForests is not available')
        module_file = self._module_file_name(model_framework)
        pipeline = penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=module_file,
            accuracy_threshold=0.1,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=None,
            enable_tuning=False,
            enable_bulk_inferrer=False,
            examplegen_input_config=None,
            examplegen_range_config=None,
            resolver_range_config=None,
            beam_pipeline_args=self._make_beam_pipeline_args(),
            enable_transform_input_cache=False)

        logging.info('Starting the first pipeline run.')
        LocalDagRunner().run(pipeline)

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 8  # 7 components + 1 resolver
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution()

        logging.info('Starting the second pipeline run. All components except '
                     'Evaluator and Pusher will use cached results.')
        LocalDagRunner().run(pipeline)

        # Artifact count increases by 3, caused by Evaluator and Pusher.
        self.assertLen(store.get_artifacts(), artifact_count + 3)
        artifact_count = len(store.get_artifacts())
        self.assertLen(store.get_executions(), expected_execution_count * 2)

        logging.info('Starting the third pipeline run. '
                     'All components will use cached results.')
        LocalDagRunner().run(pipeline)

        # Asserts cache execution.
        # Artifact count is unchanged.
        self.assertLen(store.get_artifacts(), artifact_count)
        self.assertLen(store.get_executions(), expected_execution_count * 3)
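
This test takes a `model_framework` argument, so the test class is evidently parameterized. A sketch of the usual wiring with `absl.testing.parameterized` (the exact framework list is an assumption):

from absl.testing import parameterized


class PenguinPipelineLocalEndToEndTest(parameterized.TestCase):

    # Hypothetical parameter list; the real test may cover more frameworks.
    @parameterized.parameters('keras', 'tfdf_experimental')
    def testPenguinPipelineLocal(self, model_framework):
        ...

Each parameter value becomes its own test case, which is why the TFDF variant can skip itself independently of the Keras run.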
Example #3
    def testPenguinPipelineLocalConditionalWithoutPusher(self):
        module_file = self._module_file_name('keras')
        pipeline = penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=module_file,
            accuracy_threshold=1.0,  # Model evaluation will fail with 1.0 threshold
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=None,
            enable_tuning=False,
            enable_bulk_inferrer=False,
            examplegen_input_config=None,
            examplegen_range_config=None,
            resolver_range_config=None,
            beam_pipeline_args=self._make_beam_pipeline_args(),
            enable_transform_input_cache=False)

        logging.info('Starting the first pipeline run.')
        LocalDagRunner().run(pipeline)

        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 7  # Without pusher because evaluation fails
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution(has_pusher=False)

        logging.info('Starting the second pipeline run. All components except '
                     'Evaluator will use cached results. Pusher will not run.')
        LocalDagRunner().run(pipeline)

        # Artifact count stays the same, because no new blessed model,
        # hence no new evaluation and no new pushed model.
        self.assertLen(store.get_artifacts(), artifact_count)
        self.assertLen(store.get_executions(), expected_execution_count * 2)

        logging.info('Starting the third pipeline run. '
                     'All components will use cached results.')
        LocalDagRunner().run(pipeline)

        # Asserts cache execution.
        # Artifact count is unchanged.
        self.assertLen(store.get_artifacts(), artifact_count)
        self.assertLen(store.get_executions(), expected_execution_count * 3)
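
The conditional behavior above hinges on `accuracy_threshold=1.0` being wired into the Evaluator's metric threshold: the model can never clear it, no blessing artifact is produced, and Pusher is skipped. A sketch of the kind of TFMA config presumably behind the parameter (metric and label names are assumptions):

import tensorflow_model_analysis as tfma


def _make_eval_config(accuracy_threshold):
    # Blessing is granted only when accuracy clears the lower bound, so a
    # threshold of 1.0 effectively keeps Pusher from ever running.
    return tfma.EvalConfig(
        model_specs=[tfma.ModelSpec(label_key='species')],
        slicing_specs=[tfma.SlicingSpec()],
        metrics_specs=[
            tfma.MetricsSpec(metrics=[
                tfma.MetricConfig(
                    class_name='SparseCategoricalAccuracy',
                    threshold=tfma.MetricThreshold(
                        value_threshold=tfma.GenericValueThreshold(
                            lower_bound={'value': accuracy_threshold})))
            ])
        ])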
Example #4
  def testPenguinPipelineLocal(self, model_framework):
    module_file = self._module_file_name(model_framework)
    pipeline = penguin_pipeline_local._create_pipeline(
        pipeline_name=self._pipeline_name,
        data_root=self._data_root,
        module_file=module_file,
        accuracy_threshold=0.1,
        serving_model_dir=self._serving_model_dir,
        pipeline_root=self._pipeline_root,
        metadata_path=self._metadata_path,
        user_provided_schema_path=None,
        enable_tuning=False,
        enable_bulk_inferrer=False,
        examplegen_input_config=None,
        examplegen_range_config=None,
        resolver_range_config=None,
        beam_pipeline_args=self._make_beam_pipeline_args())

    logging.info('Starting the first pipeline run.')
    LocalDagRunner().run(pipeline)

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    expected_execution_count = 9  # 8 components + 1 resolver
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    store = mlmd.MetadataStore(metadata_config)
    artifact_count = len(store.get_artifacts())
    execution_count = len(store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)

    self._assertPipelineExecution()

    logging.info('Starting the second pipeline run. All components except '
                 'Evaluator and Pusher will use cached results.')
    LocalDagRunner().run(pipeline)

    # Artifact count increases by 3, caused by Evaluator and Pusher.
    self.assertLen(store.get_artifacts(), artifact_count + 3)
    artifact_count = len(store.get_artifacts())
    self.assertLen(store.get_executions(), expected_execution_count * 2)

    logging.info('Starting the third pipeline run. '
                 'All components will use cached results.')
    LocalDagRunner().run(pipeline)

    # Asserts cache execution.
    # Artifact count is unchanged.
    self.assertLen(store.get_artifacts(), artifact_count)
    self.assertLen(store.get_executions(), expected_execution_count * 3)
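
The three runs above infer caching from artifact and execution counts alone. MLMD also records a per-execution state, so cache hits can be asserted directly; a sketch, assuming the local runner marks reused executions as CACHED:

from ml_metadata.proto import metadata_store_pb2


def _count_cached_executions(store):
    # Executions satisfied from cache carry last_known_state == CACHED
    # rather than COMPLETE.
    return sum(
        1 for e in store.get_executions()
        if e.last_known_state == metadata_store_pb2.Execution.CACHED)

For instance, after the third run nearly all executions from the second and third runs should be in this state.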
Example #5
    def testPenguinPipelineLocalWithBulkInferrer(self, model_framework):
        if model_framework == 'tfdf_experimental':
            # Skip if TFDF is not available or incompatible.
            try:
                importlib.import_module('tensorflow_decision_forests')
            except (ImportError, tf.errors.NotFoundError):
                self.skipTest('TensorflowDecisionForests is not available')
        module_file = self._module_file_name(model_framework)
        LocalDagRunner().run(
            penguin_pipeline_local._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=module_file,
                accuracy_threshold=0.1,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                user_provided_schema_path=None,
                enable_tuning=False,
                enable_bulk_inferrer=True,
                examplegen_input_config=None,
                examplegen_range_config=None,
                resolver_range_config=None,
                beam_pipeline_args=[],
                enable_transform_input_cache=False))

        self.assertTrue(fileio.exists(self._serving_model_dir))
        self.assertTrue(fileio.exists(self._metadata_path))
        expected_execution_count = 10  # 9 components + 1 resolver
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        store = mlmd.MetadataStore(metadata_config)
        artifact_count = len(store.get_artifacts())
        execution_count = len(store.get_executions())
        self.assertGreaterEqual(artifact_count, execution_count)
        self.assertEqual(expected_execution_count, execution_count)

        self._assertPipelineExecution(has_bulk_inferrer=True)
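
With `enable_bulk_inferrer=True`, the pipeline gains a BulkInferrer fed by the blessed model, which accounts for the execution count rising to 10. A sketch of how the component is typically attached (the upstream handles and the spec details are assumptions):

from tfx import v1 as tfx


def _make_bulk_inferrer(example_gen, trainer, evaluator):
    # Runs batch inference over examples with the newly trained model,
    # gated on the Evaluator's blessing.
    return tfx.components.BulkInferrer(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'],
        model_blessing=evaluator.outputs['blessing'],
        data_spec=tfx.proto.DataSpec(),
        model_spec=tfx.proto.ModelSpec())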
Example #6
  def testPenguinPipelineLocalWithRollingWindow(self, model_framework):
    module_file = self._module_file_name('keras')
    examplegen_input_config = proto.Input(splits=[
        proto.Input.Split(name='test', pattern='day{SPAN}/*'),
    ])
    resolver_range_config = proto.RangeConfig(
        rolling_range=proto.RollingRange(num_spans=2))

    def run_pipeline(examplegen_range_config):
      LocalDagRunner().run(
          penguin_pipeline_local._create_pipeline(
              pipeline_name=self._pipeline_name,
              data_root=self._data_root_span,
              module_file=module_file,
              accuracy_threshold=0.1,
              serving_model_dir=self._serving_model_dir,
              pipeline_root=self._pipeline_root,
              metadata_path=self._metadata_path,
              user_provided_schema_path=None,
              enable_tuning=False,
              enable_bulk_inferrer=False,
              examplegen_input_config=examplegen_input_config,
              examplegen_range_config=examplegen_range_config,
              resolver_range_config=resolver_range_config,
              beam_pipeline_args=self._make_beam_pipeline_args()))

    # Trigger the pipeline for the first span.
    examplegen_range_config = proto.RangeConfig(
        static_range=proto.StaticRange(
            start_span_number=1, end_span_number=1))
    run_pipeline(examplegen_range_config)

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    self._assertPipelineExecution()
    transform_execution_type = 'tfx.components.transform.component.Transform'
    trainer_execution_type = 'tfx.components.trainer.component.Trainer'
    expected_execution_count = 10  # 8 components + 2 resolvers
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    store = mlmd.MetadataStore(metadata_config)
    artifact_count = len(store.get_artifacts())
    execution_count = len(store.get_executions())
    self.assertGreaterEqual(artifact_count, execution_count)
    self.assertEqual(expected_execution_count, execution_count)
    # Verify Transform's input examples artifacts.
    tft_input_examples_artifacts = self._get_input_examples_artifacts(
        store, transform_execution_type)
    self.assertLen(tft_input_examples_artifacts, 1)
    # SpansResolver (controlled by resolver_range_config) returns span 1.
    self.assertEqual(
        1, tft_input_examples_artifacts[0].custom_properties[
            _SPAN_PROPERTY_NAME].int_value)

    # Trigger the pipeline for the second span.
    examplegen_range_config = proto.RangeConfig(
        static_range=proto.StaticRange(
            start_span_number=2, end_span_number=2))
    run_pipeline(examplegen_range_config)

    execution_count = len(store.get_executions())
    self.assertEqual(expected_execution_count * 2, execution_count)
    # Verify Transform's input examples artifacts.
    tft_input_examples_artifacts = self._get_input_examples_artifacts(
        store, transform_execution_type)
    self.assertLen(tft_input_examples_artifacts, 2)
    spans = {
        tft_input_examples_artifacts[0].custom_properties[
            _SPAN_PROPERTY_NAME].int_value,
        tft_input_examples_artifacts[1].custom_properties[
            _SPAN_PROPERTY_NAME].int_value
    }
    # SpansResolver (controlled by resolver_range_config) returns span 1 & 2.
    self.assertSetEqual({1, 2}, spans)
    # Verify Trainer's input examples artifacts.
    self.assertLen(
        self._get_input_examples_artifacts(store, trainer_execution_type),
        2)

    # Trigger the pipeline for the third span.
    examplegen_range_config = proto.RangeConfig(
        static_range=proto.StaticRange(
            start_span_number=3, end_span_number=3))
    run_pipeline(examplegen_range_config)

    execution_count = len(store.get_executions())
    self.assertEqual(expected_execution_count * 3, execution_count)
    # Verify Transform's input examples artifacts.
    tft_input_examples_artifacts = self._get_input_examples_artifacts(
        store, transform_execution_type)
    self.assertLen(tft_input_examples_artifacts, 2)
    spans = {
        tft_input_examples_artifacts[0].custom_properties[
            _SPAN_PROPERTY_NAME].int_value,
        tft_input_examples_artifacts[1].custom_properties[
            _SPAN_PROPERTY_NAME].int_value
    }
    # SpansResolver (controlled by resolver_range_config) returns span 2 & 3.
    self.assertSetEqual({2, 3}, spans)
    # Verify Trainer's input examples artifacts.
    self.assertLen(
        self._get_input_examples_artifacts(store, trainer_execution_type),
        2)
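
The helper `_get_input_examples_artifacts` is referenced but not shown. It plausibly collects the Examples artifacts that executions of a given component type consumed, which is what the span assertions above need. A sketch against the public MLMD store API (the INPUT-event filtering and the 'span' constant value are assumptions):

from ml_metadata.proto import metadata_store_pb2

_SPAN_PROPERTY_NAME = 'span'  # assumed value of the constant used above


def _get_input_examples_artifacts(store, execution_type):
    # Gather artifact ids that appeared as inputs to any execution of
    # the given component type.
    executions = store.get_executions_by_type(execution_type)
    events = store.get_events_by_execution_ids([e.id for e in executions])
    input_ids = {
        event.artifact_id
        for event in events
        if event.type == metadata_store_pb2.Event.INPUT
    }
    # Intersect with Examples artifacts so non-example inputs drop out.
    return [
        artifact for artifact in store.get_artifacts_by_type('Examples')
        if artifact.id in input_ids
    ]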