def testPenguinPipelineLocalWithImporter(self):
  module_file = self._module_file_name('keras')
  LocalDagRunner().run(
      penguin_pipeline_local._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=module_file,
          accuracy_threshold=0.1,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          user_provided_schema_path=self._schema_path,
          enable_tuning=False,
          enable_bulk_inferrer=False,
          examplegen_input_config=None,
          examplegen_range_config=None,
          resolver_range_config=None,
          beam_pipeline_args=[],
          enable_transform_input_cache=False))

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  expected_execution_count = 9  # 7 components + 1 resolver + 1 importer
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  store = mlmd.MetadataStore(metadata_config)
  artifact_count = len(store.get_artifacts())
  execution_count = len(store.get_executions())
  self.assertGreaterEqual(artifact_count, execution_count)
  self.assertEqual(expected_execution_count, execution_count)

  self._assertPipelineExecution(has_schema_gen=False)
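# A minimal sketch of the kind of importer node that user_provided_schema_path
# likely adds to the pipeline above (the extra execution counted in this test).
# Names and wiring are illustrative assumptions, not the exact
# penguin_pipeline_local code.
from tfx import v1 as tfx  # Assumed available alongside the test's imports.


def _make_schema_importer_sketch(schema_path):
  """Imports a pre-existing schema file as a Schema artifact (sketch)."""
  return tfx.dsl.Importer(
      source_uri=schema_path,
      artifact_type=tfx.types.standard_artifacts.Schema).with_id(
          'schema_importer')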
def testPenguinPipelineLocal(self, model_framework):
  if model_framework == 'tfdf_experimental':
    # Skip if TFDF is not available or incompatible.
    try:
      importlib.import_module('tensorflow_decision_forests')
    except (ImportError, tf.errors.NotFoundError):
      self.skipTest('TensorflowDecisionForests is not available')
  module_file = self._module_file_name(model_framework)
  pipeline = penguin_pipeline_local._create_pipeline(
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=module_file,
      accuracy_threshold=0.1,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._metadata_path,
      user_provided_schema_path=None,
      enable_tuning=False,
      enable_bulk_inferrer=False,
      examplegen_input_config=None,
      examplegen_range_config=None,
      resolver_range_config=None,
      beam_pipeline_args=self._make_beam_pipeline_args(),
      enable_transform_input_cache=False)

  logging.info('Starting the first pipeline run.')
  LocalDagRunner().run(pipeline)

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  expected_execution_count = 8  # 7 components + 1 resolver
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  store = mlmd.MetadataStore(metadata_config)
  artifact_count = len(store.get_artifacts())
  execution_count = len(store.get_executions())
  self.assertGreaterEqual(artifact_count, execution_count)
  self.assertEqual(expected_execution_count, execution_count)

  self._assertPipelineExecution()

  logging.info('Starting the second pipeline run. All components except '
               'Evaluator and Pusher will use cached results.')
  LocalDagRunner().run(pipeline)

  # Artifact count is increased by 3 due to the new Evaluator and Pusher
  # outputs.
  self.assertLen(store.get_artifacts(), artifact_count + 3)
  artifact_count = len(store.get_artifacts())
  self.assertLen(store.get_executions(), expected_execution_count * 2)

  logging.info('Starting the third pipeline run. '
               'All components will use cached results.')
  LocalDagRunner().run(pipeline)

  # Asserts cached execution: artifact count is unchanged.
  self.assertLen(store.get_artifacts(), artifact_count)
  self.assertLen(store.get_executions(), expected_execution_count * 3)
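# The cache assertions above only compare totals. A minimal sketch of how
# cached runs could instead be checked directly via MLMD execution states;
# this helper is an illustration, not part of the test class.
from ml_metadata.proto import metadata_store_pb2


def _count_cached_executions_sketch(store):
  """Returns how many executions MLMD recorded with state CACHED."""
  return sum(
      1 for execution in store.get_executions()
      if execution.last_known_state == metadata_store_pb2.Execution.CACHED)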
def testPenguinPipelineLocalConditionalWithoutPusher(self):
  module_file = self._module_file_name('keras')
  pipeline = penguin_pipeline_local._create_pipeline(
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=module_file,
      accuracy_threshold=1.0,  # Model evaluation will fail with 1.0 threshold.
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._metadata_path,
      user_provided_schema_path=None,
      enable_tuning=False,
      enable_bulk_inferrer=False,
      examplegen_input_config=None,
      examplegen_range_config=None,
      resolver_range_config=None,
      beam_pipeline_args=self._make_beam_pipeline_args(),
      enable_transform_input_cache=False)

  logging.info('Starting the first pipeline run.')
  LocalDagRunner().run(pipeline)

  self.assertTrue(fileio.exists(self._metadata_path))
  expected_execution_count = 7  # Without Pusher, because evaluation fails.
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  store = mlmd.MetadataStore(metadata_config)
  artifact_count = len(store.get_artifacts())
  execution_count = len(store.get_executions())
  self.assertGreaterEqual(artifact_count, execution_count)
  self.assertEqual(expected_execution_count, execution_count)

  self._assertPipelineExecution(has_pusher=False)

  logging.info('Starting the second pipeline run. All components except '
               'Evaluator will use cached results. Pusher will not run.')
  LocalDagRunner().run(pipeline)

  # Artifact count stays the same: there is no new blessed model, hence no
  # new evaluation output and no new pushed model.
  self.assertLen(store.get_artifacts(), artifact_count)
  self.assertLen(store.get_executions(), expected_execution_count * 2)

  logging.info('Starting the third pipeline run. '
               'All components will use cached results.')
  LocalDagRunner().run(pipeline)

  # Asserts cached execution: artifact count is unchanged.
  self.assertLen(store.get_artifacts(), artifact_count)
  self.assertLen(store.get_executions(), expected_execution_count * 3)
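# The missing Pusher execution is expected because the pipeline guards Pusher
# with a conditional on the Evaluator blessing. A rough sketch of that pattern
# follows; the exact predicate and wiring in penguin_pipeline_local may differ.
from tfx import v1 as tfx
from tfx.dsl.experimental.conditionals import conditional


def _conditional_pusher_sketch(evaluator, trainer, serving_model_dir):
  """Builds a Pusher that only runs when the model is blessed (sketch)."""
  with conditional.Cond(evaluator.outputs['blessing'].future()[0]
                        .custom_property('blessed') == 1):
    return tfx.components.Pusher(
        model=trainer.outputs['model'],
        push_destination=tfx.proto.PushDestination(
            filesystem=tfx.proto.PushDestination.Filesystem(
                base_directory=serving_model_dir)))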
def testPenguinPipelineLocal(self, model_framework):
  module_file = self._module_file_name(model_framework)
  pipeline = penguin_pipeline_local._create_pipeline(
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=module_file,
      accuracy_threshold=0.1,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root,
      metadata_path=self._metadata_path,
      user_provided_schema_path=None,
      enable_tuning=False,
      enable_bulk_inferrer=False,
      examplegen_input_config=None,
      examplegen_range_config=None,
      resolver_range_config=None,
      beam_pipeline_args=self._make_beam_pipeline_args())

  logging.info('Starting the first pipeline run.')
  LocalDagRunner().run(pipeline)

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  expected_execution_count = 9  # 8 components + 1 resolver
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  store = mlmd.MetadataStore(metadata_config)
  artifact_count = len(store.get_artifacts())
  execution_count = len(store.get_executions())
  self.assertGreaterEqual(artifact_count, execution_count)
  self.assertEqual(expected_execution_count, execution_count)

  self._assertPipelineExecution()

  logging.info('Starting the second pipeline run. All components except '
               'Evaluator and Pusher will use cached results.')
  LocalDagRunner().run(pipeline)

  # Artifact count is increased by 3 due to the new Evaluator and Pusher
  # outputs.
  self.assertLen(store.get_artifacts(), artifact_count + 3)
  artifact_count = len(store.get_artifacts())
  self.assertLen(store.get_executions(), expected_execution_count * 2)

  logging.info('Starting the third pipeline run. '
               'All components will use cached results.')
  LocalDagRunner().run(pipeline)

  # Asserts cached execution: artifact count is unchanged.
  self.assertLen(store.get_artifacts(), artifact_count)
  self.assertLen(store.get_executions(), expected_execution_count * 3)
def testPenguinPipelineLocalWithBulkInferrer(self, model_framework):
  if model_framework == 'tfdf_experimental':
    # Skip if TFDF is not available or incompatible.
    try:
      importlib.import_module('tensorflow_decision_forests')
    except (ImportError, tf.errors.NotFoundError):
      self.skipTest('TensorflowDecisionForests is not available')
  module_file = self._module_file_name(model_framework)
  LocalDagRunner().run(
      penguin_pipeline_local._create_pipeline(
          pipeline_name=self._pipeline_name,
          data_root=self._data_root,
          module_file=module_file,
          accuracy_threshold=0.1,
          serving_model_dir=self._serving_model_dir,
          pipeline_root=self._pipeline_root,
          metadata_path=self._metadata_path,
          user_provided_schema_path=None,
          enable_tuning=False,
          enable_bulk_inferrer=True,
          examplegen_input_config=None,
          examplegen_range_config=None,
          resolver_range_config=None,
          beam_pipeline_args=[],
          enable_transform_input_cache=False))

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  expected_execution_count = 10  # 9 components + 1 resolver
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  store = mlmd.MetadataStore(metadata_config)
  artifact_count = len(store.get_artifacts())
  execution_count = len(store.get_executions())
  self.assertGreaterEqual(artifact_count, execution_count)
  self.assertEqual(expected_execution_count, execution_count)

  self._assertPipelineExecution(has_bulk_inferrer=True)
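# enable_bulk_inferrer=True adds batch-inference components on top of the base
# pipeline, which is why the expected execution count is higher here. A rough
# sketch of how a BulkInferrer might be wired; argument values are illustrative.
from tfx import v1 as tfx


def _make_bulk_inferrer_sketch(unlabelled_examples, trainer, evaluator):
  """Runs batch inference over examples with the trained, blessed model."""
  return tfx.components.BulkInferrer(
      examples=unlabelled_examples,
      model=trainer.outputs['model'],
      model_blessing=evaluator.outputs['blessing'],
      data_spec=tfx.proto.DataSpec(),
      model_spec=tfx.proto.ModelSpec())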
def testPenguinPipelineLocalWithRollingWindow(self, model_framework):
  module_file = self._module_file_name('keras')
  examplegen_input_config = proto.Input(splits=[
      proto.Input.Split(name='test', pattern='day{SPAN}/*'),
  ])
  resolver_range_config = proto.RangeConfig(
      rolling_range=proto.RollingRange(num_spans=2))

  def run_pipeline(examplegen_range_config):
    LocalDagRunner().run(
        penguin_pipeline_local._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root_span,
            module_file=module_file,
            accuracy_threshold=0.1,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            user_provided_schema_path=None,
            enable_tuning=False,
            enable_bulk_inferrer=False,
            examplegen_input_config=examplegen_input_config,
            examplegen_range_config=examplegen_range_config,
            resolver_range_config=resolver_range_config,
            beam_pipeline_args=self._make_beam_pipeline_args()))

  # Trigger the pipeline for the first span.
  examplegen_range_config = proto.RangeConfig(
      static_range=proto.StaticRange(
          start_span_number=1, end_span_number=1))
  run_pipeline(examplegen_range_config)

  self.assertTrue(fileio.exists(self._serving_model_dir))
  self.assertTrue(fileio.exists(self._metadata_path))
  self._assertPipelineExecution()
  transform_execution_type = 'tfx.components.transform.component.Transform'
  trainer_execution_type = 'tfx.components.trainer.component.Trainer'
  expected_execution_count = 10  # 8 components + 2 resolvers
  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)
  store = mlmd.MetadataStore(metadata_config)
  artifact_count = len(store.get_artifacts())
  execution_count = len(store.get_executions())
  self.assertGreaterEqual(artifact_count, execution_count)
  self.assertEqual(expected_execution_count, execution_count)

  # Verify Transform's input examples artifacts.
  tft_input_examples_artifacts = self._get_input_examples_artifacts(
      store, transform_execution_type)
  self.assertLen(tft_input_examples_artifacts, 1)
  # SpansResolver (controlled by resolver_range_config) returns span 1.
  self.assertEqual(
      1, tft_input_examples_artifacts[0].custom_properties[
          _SPAN_PROPERTY_NAME].int_value)

  # Trigger the pipeline for the second span.
  examplegen_range_config = proto.RangeConfig(
      static_range=proto.StaticRange(
          start_span_number=2, end_span_number=2))
  run_pipeline(examplegen_range_config)

  execution_count = len(store.get_executions())
  self.assertEqual(expected_execution_count * 2, execution_count)

  # Verify Transform's input examples artifacts.
  tft_input_examples_artifacts = self._get_input_examples_artifacts(
      store, transform_execution_type)
  self.assertLen(tft_input_examples_artifacts, 2)
  spans = {
      tft_input_examples_artifacts[0].custom_properties[
          _SPAN_PROPERTY_NAME].int_value,
      tft_input_examples_artifacts[1].custom_properties[
          _SPAN_PROPERTY_NAME].int_value
  }
  # SpansResolver (controlled by resolver_range_config) returns spans 1 & 2.
  self.assertSetEqual({1, 2}, spans)
  # Verify Trainer's input examples artifacts.
  self.assertLen(
      self._get_input_examples_artifacts(store, trainer_execution_type), 2)

  # Trigger the pipeline for the third span.
  examplegen_range_config = proto.RangeConfig(
      static_range=proto.StaticRange(
          start_span_number=3, end_span_number=3))
  run_pipeline(examplegen_range_config)

  execution_count = len(store.get_executions())
  self.assertEqual(expected_execution_count * 3, execution_count)

  # Verify Transform's input examples artifacts.
  tft_input_examples_artifacts = self._get_input_examples_artifacts(
      store, transform_execution_type)
  self.assertLen(tft_input_examples_artifacts, 2)
  spans = {
      tft_input_examples_artifacts[0].custom_properties[
          _SPAN_PROPERTY_NAME].int_value,
      tft_input_examples_artifacts[1].custom_properties[
          _SPAN_PROPERTY_NAME].int_value
  }
  # SpansResolver (controlled by resolver_range_config) returns spans 2 & 3.
  self.assertSetEqual({2, 3}, spans)
  # Verify Trainer's input examples artifacts.
  self.assertLen(
      self._get_input_examples_artifacts(store, trainer_execution_type), 2)
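# A minimal sketch of how the _get_input_examples_artifacts helper used above
# could be implemented with plain MLMD queries; the real helper in this test
# class may differ.
from ml_metadata.proto import metadata_store_pb2


def _get_input_examples_artifacts_sketch(store, execution_type_name):
  """Returns Examples artifacts consumed by executions of the given type."""
  executions = store.get_executions_by_type(execution_type_name)
  events = store.get_events_by_execution_ids([e.id for e in executions])
  input_artifact_ids = {
      event.artifact_id
      for event in events
      if event.type == metadata_store_pb2.Event.INPUT
  }
  return [
      artifact
      for artifact in store.get_artifacts_by_id(list(input_artifact_ids))
      if store.get_artifact_types_by_id([artifact.type_id])[0].name ==
      'Examples'
  ]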