def testSimplePipelineRun(self):
  """Runs the Load -> Train -> Validate pipeline with LocalDagRunner.

  Asserts that `self.RAN_COMPONENTS` is empty before the run and equals
  ['Load', 'Train', 'Validate'] afterwards. The pipeline root and the SQLite
  metadata file live in a temporary directory that is removed when the test
  finishes.
  """
  import shutil  # Local import: only needed for the temp-dir cleanup below.

  self.assertEqual(self.RAN_COMPONENTS, [])

  # Construct component instances.
  dummy_load_component = LoadDummyDatasetComponent()
  dummy_train_component = DummyTrainComponent(
      training_data=dummy_load_component.outputs['dataset'],
      num_iterations=5)
  dummy_validate_component = DummyValidateComponent(
      model=dummy_train_component.outputs['model'],
      loss=dummy_train_component.outputs['loss'],
      accuracy=dummy_train_component.outputs['accuracy'])

  # Construct and run pipeline
  temp_path = tempfile.mkdtemp()
  # mkdtemp() leaves deletion to the caller; register a best-effort cleanup
  # so repeated test runs do not pile up pipeline roots and metadata DBs.
  self.addCleanup(shutil.rmtree, temp_path, ignore_errors=True)
  pipeline_root_path = os.path.join(temp_path, 'pipeline_root')
  metadata_path = os.path.join(temp_path, 'metadata.db')
  test_pipeline = pipeline.Pipeline(
      pipeline_name='test_pipeline',
      pipeline_root=pipeline_root_path,
      metadata_connection_config=sqlite_metadata_connection_config(
          metadata_path),
      components=[
          dummy_load_component,
          dummy_train_component,
          dummy_validate_component,
      ])
  local_dag_runner.LocalDagRunner().run(test_pipeline)

  self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train', 'Validate'])
def testPatcher(self, mock_run):
  """Runs a pipeline under LocalDagRunnerPatcher.

  Checks that `mock_run` is never called and that the patch context stores
  the pipeline name under the patcher's PIPELINE_NAME key.
  """
  dag_patcher = local_dag_runner_patcher.LocalDagRunnerPatcher()
  with dag_patcher.patch() as patch_context:
    runner = local_dag_runner.LocalDagRunner()
    runner.run(tfx_pipeline.Pipeline(_PIPELINE_NAME, ''))
    mock_run.assert_not_called()
    recorded_name = patch_context[dag_patcher.PIPELINE_NAME]
    self.assertEqual(recorded_name, _PIPELINE_NAME)
def testRunWithIR(self):
  """Runs the test pipeline IR and checks components a..e ran in order."""
  runner = local_dag_runner.LocalDagRunner()
  pipeline_ir = self._getTestPipelineIR()
  runner.run_with_ir(pipeline_ir)

  expected_order = [
      '_FakeComponent.a',
      '_FakeComponent.b',
      '_FakeComponent.c',
      '_FakeComponent.d',
      '_FakeComponent.e',
  ]
  self.assertEqual(_executed_components, expected_order)
def testPartialRun(self):
  """Runs the test pipeline up to node 'c' and checks only a, b, c ran."""
  runner = local_dag_runner.LocalDagRunner()
  test_pipeline = self._getTestPipeline()
  partial_options = pipeline_py.RunOptions(to_nodes=['c'])
  runner.run(test_pipeline, run_options=partial_options)

  expected_order = [
      '_FakeComponent.a',
      '_FakeComponent.b',
      '_FakeComponent.c',
  ]
  self.assertEqual(_executed_components, expected_order)
def testNoSupportedLaunchers(self):
  """Expects a RuntimeError when only the Docker launcher is configured."""
  docker_only_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          docker_component_launcher.DockerComponentLauncher
      ])
  dag_runner = local_dag_runner.LocalDagRunner(config=docker_only_config)

  with self.assertRaisesRegex(RuntimeError, 'No launcher info can be found'):
    dag_runner.run(self._getTestPipeline())
def testSimplePipelinePartialRun(self):
  """Runs the pipeline up to 'Train' and checks only Load and Train ran."""
  self.assertEqual(self.RAN_COMPONENTS, [])

  runner = local_dag_runner.LocalDagRunner()
  test_pipeline = self._getTestPipeline()
  partial_options = pipeline_py.RunOptions(to_nodes=['Train'])
  runner.run(test_pipeline, run_options=partial_options)

  self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train'])
def testPartialRunWithIR(self):
  """Runs the pipeline IR with a PartialRun ending at 'c'; expects a, b, c."""
  partial_run = pipeline_pb2.PartialRun()
  partial_run.to_nodes.append('c')
  # SetInParent() marks the latest-pipeline-run strategy as the chosen
  # snapshot setting even though none of its fields are populated.
  partial_run.snapshot_settings.latest_pipeline_run_strategy.SetInParent()

  runner = local_dag_runner.LocalDagRunner()
  pipeline_ir = self._getTestPipelineIR()
  ir_run_options = pipeline_pb2.RunOptions(partial_run=partial_run)
  runner.run_with_ir(pipeline_ir, run_options=ir_run_options)

  expected_order = [
      '_FakeComponent.a',
      '_FakeComponent.b',
      '_FakeComponent.c',
  ]
  self.assertEqual(_executed_components, expected_order)
def testSimplePipelinePartialRunWithIR(self):
  """Runs the pipeline IR with a PartialRun ending at 'Train'.

  Expects only the Load and Train components to be recorded.
  """
  self.assertEqual(self.RAN_COMPONENTS, [])

  partial_run = pipeline_pb2.PartialRun()
  partial_run.to_nodes.append('Train')
  # SetInParent() marks the latest-pipeline-run strategy as the chosen
  # snapshot setting even though none of its fields are populated.
  partial_run.snapshot_settings.latest_pipeline_run_strategy.SetInParent()

  runner = local_dag_runner.LocalDagRunner()
  pipeline_ir = self._getTestPipelineIR()
  ir_run_options = pipeline_pb2.RunOptions(partial_run=partial_run)
  runner.run_with_ir(pipeline_ir, run_options=ir_run_options)

  self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train'])
def setUp(self):
  """Records one real taxi pipeline run and builds the pipeline under test.

  Sets up all paths under a per-test directory, runs the taxi pipeline once
  with LocalDagRunner against the "record" metadata DB, records its outputs
  into `self._recorded_output_dir`, and finally creates `self.taxi_pipeline`
  pointing at the separate metadata DB used by the test itself.
  """
  super(TaxiPipelineRegressionEndToEndTest, self).setUp()

  base_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  self._test_dir = os.path.join(base_dir, self._testMethodName)
  self._pipeline_name = 'beam_stub_test'

  # This example assumes that the taxi data and taxi utility function are
  # stored in tfx/examples/chicago_taxi_pipeline. Feel free to customize this
  # as needed.
  taxi_root = os.path.dirname(taxi_pipeline_beam.__file__)
  self._data_root = os.path.join(taxi_root, 'data', 'simple')
  self._module_file = os.path.join(taxi_root, 'taxi_utils.py')
  self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')

  tfx_dir = os.path.join(self._test_dir, 'tfx')
  self._pipeline_root = os.path.join(tfx_dir, 'pipelines',
                                     self._pipeline_name)
  # Metadata path for recording successful pipeline run.
  self._recorded_mlmd_path = os.path.join(tfx_dir, 'record', 'metadata.db')
  # Metadata path for stub pipeline runs.
  self._metadata_path = os.path.join(tfx_dir, 'metadata',
                                     self._pipeline_name, 'metadata.db')
  self._recorded_output_dir = os.path.join(self._test_dir, 'testdata')

  # Arguments shared by the recording pipeline and the pipeline under test;
  # only the metadata DB differs between the two.
  shared_pipeline_args = dict(
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=self._module_file,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root)

  # Runs the pipeline and record to self._recorded_output_dir
  record_taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
      metadata_path=self._recorded_mlmd_path,
      beam_pipeline_args=[],
      **shared_pipeline_args)
  local_dag_runner.LocalDagRunner().run(record_taxi_pipeline)
  pipeline_recorder_utils.record_pipeline(
      output_dir=self._recorded_output_dir,
      metadata_db_uri=self._recorded_mlmd_path,
      pipeline_name=self._pipeline_name)

  self.taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
      metadata_path=self._metadata_path,
      beam_pipeline_args=[],
      **shared_pipeline_args)
def setUp(self):
  """Records one real IMDB pipeline run and builds the pipeline under test.

  Sets up all paths under a per-test directory, runs the IMDB pipeline once
  with LocalDagRunner against the "record" metadata DB, records its outputs
  into `self._recorded_output_dir`, and finally creates `self.imdb_pipeline`
  pointing at the separate metadata DB used by the test itself.
  """
  super(ImdbStubPipelineRegressionEndToEndTest, self).setUp()

  base_dir = os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir())
  self._test_dir = os.path.join(base_dir, self._testMethodName)
  self._pipeline_name = 'imdb_stub_test'

  # This example assumes that the imdb data and imdb utility function are
  # stored in tfx/examples/imdb. Feel free to customize this as needed.
  imdb_root = os.path.dirname(imdb_pipeline_native_keras.__file__)
  self._data_root = os.path.join(imdb_root, 'data')
  self._module_file = os.path.join(imdb_root, 'imdb_utils_native_keras.py')
  self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')
  self._pipeline_root = os.path.join(self._test_dir, 'pipelines',
                                     self._pipeline_name)
  # Metadata path for recording successful pipeline run.
  self._recorded_mlmd_path = os.path.join(self._test_dir, 'record',
                                          'metadata.db')
  # Metadata path for stub pipeline
  self._metadata_path = os.path.join(self._test_dir, 'metadata',
                                     self._pipeline_name, 'metadata.db')
  self._recorded_output_dir = os.path.join(self._test_dir, 'testdata')

  # Arguments shared by the recording pipeline and the pipeline under test;
  # only the metadata DB differs between the two.
  shared_pipeline_args = dict(
      pipeline_name=self._pipeline_name,
      data_root=self._data_root,
      module_file=self._module_file,
      serving_model_dir=self._serving_model_dir,
      pipeline_root=self._pipeline_root)

  record_imdb_pipeline = imdb_pipeline_native_keras._create_pipeline(  # pylint:disable=protected-access
      metadata_path=self._recorded_mlmd_path,
      beam_pipeline_args=[],
      **shared_pipeline_args)
  local_dag_runner.LocalDagRunner().run(record_imdb_pipeline)
  pipeline_recorder_utils.record_pipeline(
      output_dir=self._recorded_output_dir,
      metadata_db_uri=self._recorded_mlmd_path,
      pipeline_name=self._pipeline_name)

  # Run pipeline with stub executors.
  self.imdb_pipeline = imdb_pipeline_native_keras._create_pipeline(  # pylint:disable=protected-access
      metadata_path=self._metadata_path,
      beam_pipeline_args=[],
      **shared_pipeline_args)
def run():
  """Builds the pipeline from `configs` and runs it with LocalDagRunner."""
  runner = local_dag_runner.LocalDagRunner()
  local_pipeline = pipeline.create_pipeline(
      pipeline_name=configs.PIPELINE_NAME,
      pipeline_root=PIPELINE_ROOT,
      data_path=DATA_PATH,
      # NOTE: Use `query` instead of `data_path` to use BigQueryExampleGen.
      # query=configs.BIG_QUERY_QUERY,
      preprocessing_fn=configs.PREPROCESSING_FN,
      run_fn=configs.RUN_FN,
      train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS),
      eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS),
      eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
      serving_model_dir=SERVING_MODEL_DIR,
      # NOTE: Provide GCP configs to use BigQuery with Beam DirectRunner.
      # beam_pipeline_args=configs.
      # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
      metadata_connection_config=(
          metadata.sqlite_metadata_connection_config(METADATA_PATH)))
  runner.run(local_pipeline)
def testStubbedTaxiPipelineBeam(self):
  """Runs the taxi pipeline with stub executors, then verifies a normal run.

  First the pipeline is run through StubComponentLauncher and every output
  artifact of each non-resolver execution is compared against the recorded
  directory. Then the pipeline is run with the default LocalDagRunner and
  each output artifact is checked with a verifier chosen by output key,
  falling back to `self._verify_file_path`.
  """
  # Run pipeline with stub executors.
  stub_component_launcher.StubComponentLauncher.initialize(
      test_data_dir=self._recorded_output_dir, test_component_ids=[])
  stub_pipeline_config = pipeline_config.PipelineConfig(
      supported_launcher_classes=[
          stub_component_launcher.StubComponentLauncher,
      ])
  stub_runner = local_dag_runner.LocalDagRunner(config=stub_pipeline_config)
  stub_runner.run(self.taxi_pipeline)

  self.assertTrue(fileio.exists(self._metadata_path))

  metadata_config = metadata.sqlite_metadata_connection_config(
      self._metadata_path)

  # Verify that recorded files are successfully copied to the output uris.
  with metadata.Metadata(metadata_config) as m:
    artifact_count = len(m.store.get_artifacts())
    executions = m.store.get_executions()
    execution_count = len(executions)
    # Artifact count is greater by 3 due to extra artifacts produced by
    # Evaluator(blessing and evaluation), Trainer(model and model_run) and
    # Transform(example, graph, cache) minus Resolver which doesn't generate
    # new artifact.
    self.assertEqual(artifact_count, execution_count + 3)
    self.assertLen(self.taxi_pipeline.components, execution_count)

    for execution in executions:
      component_id = execution.properties[
          metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
      if component_id.startswith('ResolverNode'):
        continue

      events = m.store.get_events_by_execution_ids([execution.id])
      for event in events:
        # Only output events point at artifacts produced by this execution.
        if event.type != metadata_store_pb2.Event.OUTPUT:
          continue
        first_step = event.path.steps[0]
        self.assertTrue(first_step.HasField('key'))
        output_key = first_step.key
        produced = m.store.get_artifacts_by_id([event.artifact_id])
        for idx, artifact in enumerate(produced):
          expected_dir = os.path.join(self._recorded_output_dir,
                                      component_id, output_key, str(idx))
          self.assertDirectoryEqual(artifact.uri, expected_dir)

  # Calls verifier for pipeline output artifacts, excluding the resolver node.
  local_dag_runner.LocalDagRunner().run(self.taxi_pipeline)
  pipeline_outputs = executor_verifier_utils.get_pipeline_outputs(
      self.taxi_pipeline.metadata_connection_config,
      self.taxi_pipeline.pipeline_info)

  verifier_map = {
      'model': self._verify_model,
      'model_run': self._verify_model,
      'examples': self._verify_examples,
      'schema': self._verify_schema,
      'anomalies': self._verify_anomalies,
      'evaluation': self._verify_evaluation
  }

  # Components to verify. ResolverNode is ignored because it doesn't have an
  # executor.
  for component in self.taxi_pipeline.components:
    component_id = component.id
    if component_id.startswith('ResolverNode'):
      continue
    logging.info('Verifying %s', component_id)
    for key, artifact_dict in pipeline_outputs[component_id].items():
      for idx, artifact in artifact_dict.items():
        recorded_uri = os.path.join(self._recorded_output_dir, component_id,
                                    key, str(idx))
        verify_fn = verifier_map.get(key, self._verify_file_path)
        verify_fn(artifact.uri, recorded_uri)
f"--direct_num_workers={direct_num_workers}", f"--direct_running_mode=multi_processing", ] tfx_pipeline = pipeline.Pipeline( pipeline_name=config.PIPELINE_NAME, pipeline_root=config.PIPELINE_ROOT, components=components, enable_cache=True, metadata_connection_config=metadata.sqlite_metadata_connection_config( config.METADATA_PATH), beam_pipeline_args=beam_arg, ) return tfx_pipeline # %% if __name__ == "__main__": tfx_components = init_components( config.DATA_DIR_PATH, config.MODULE_FILE_PATH, config.SERVING_MODEL_DIR, ) # %% tfx_pipeline = init_pipeline(tfx_components, config.PIPELINE_ROOT, 4) # %% #the localDagRunner() doesn't work in ipykernel, so you would have to run # this in terminal #or you have to run context.run(component) within ipykernel local_dag_runner.LocalDagRunner().run(tfx_pipeline)
_metadata_path = os.path.join(_tfx_root, 'metadata', _pipeline_name,
                              'metadata.db')


def _create_pipeline(pipeline_name: Text, pipeline_root: Text, data_root: Text,
                     metadata_path: Text) -> pipeline.Pipeline:
  """Builds a TFX pipeline holding a single CsvExampleGen over `data_root`.

  Args:
    pipeline_name: Name given to the pipeline.
    pipeline_root: Root directory passed to the pipeline.
    data_root: Location handed to `external_input` as the CSV source.
    metadata_path: Path of the SQLite file used for the metadata connection.

  Returns:
    The constructed `pipeline.Pipeline`, with caching enabled.
  """
  # Brings data into the pipeline or otherwise joins/converts training data.
  csv_example_gen = CsvExampleGen(input=external_input(data_root))
  mlmd_config = metadata.sqlite_metadata_connection_config(metadata_path)

  return pipeline.Pipeline(
      pipeline_name=pipeline_name,
      pipeline_root=pipeline_root,
      components=[csv_example_gen],
      enable_cache=True,
      metadata_connection_config=mlmd_config,
      additional_pipeline_args={},
  )


if __name__ == '__main__':
  absl.logging.set_verbosity(absl.logging.INFO)
  _runner = local_dag_runner.LocalDagRunner()
  _runner.run(
      _create_pipeline(
          pipeline_name=_pipeline_name,
          pipeline_root=_pipeline_root,
          data_root=_data_root,
          metadata_path=_metadata_path))
# Path to a SQLite DB file to use as an MLMD storage. METADATA_PATH = os.path.join('metadata', PIPELINE_NAME, 'metadata.db') # Output directory where created models from the pipeline will be exported. SERVING_MODEL_DIR = os.path.join('serving_model', PIPELINE_NAME) from absl import logging logging.set_verbosity(logging.INFO) # Set default logging level. import urllib.request import tempfile DATA_ROOT = tempfile.mkdtemp( prefix='tfx-data') # Create a temporary directory. _data_url = 'https://raw.githubusercontent.com/tensorflow/tfx/master/tfx/examples/penguin/data/penguins_processed.csv' _data_filepath = os.path.join(DATA_ROOT, "data.csv") urllib.request.urlretrieve(_data_url, _data_filepath) _trainer_module_file = 'penguin_trainer.py' from tfx.orchestration.local import local_dag_runner from pipeline import _create_pipeline local_dag_runner.LocalDagRunner().run( _create_pipeline(pipeline_name=PIPELINE_NAME, pipeline_root=PIPELINE_ROOT, data_root=DATA_ROOT, module_file=_trainer_module_file, serving_model_dir=SERVING_MODEL_DIR, metadata_path=METADATA_PATH))
def testSimplePipelineRunWithIR(self):
  """Runs the test pipeline IR and checks Load, Train, Validate all ran."""
  self.assertEqual(self.RAN_COMPONENTS, [])

  runner = local_dag_runner.LocalDagRunner()
  pipeline_ir = self._getTestPipelineIR()
  runner.run_with_ir(pipeline_ir)

  self.assertEqual(self.RAN_COMPONENTS, ['Load', 'Train', 'Validate'])