Example #1
  def testTaxiPipelineBeam(self):
    beam_pipeline_args = self._make_beam_pipeline_args()
    BeamDagRunner().run(
        taxi_pipeline_beam._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=beam_pipeline_args))

    self.assertTrue(fileio.exists(self._serving_model_dir))
    self.assertTrue(fileio.exists(self._metadata_path))
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    with metadata.Metadata(metadata_config) as m:
      artifact_count = len(m.store.get_artifacts())
      execution_count = len(m.store.get_executions())
      self.assertGreaterEqual(artifact_count, execution_count)
      self.assertEqual(9, execution_count)

    self.assertPipelineExecution()

    # Runs pipeline the second time.
    BeamDagRunner().run(
        taxi_pipeline_beam._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=beam_pipeline_args))

    # All executions but Evaluator and Pusher are cached.
    # Note that Resolver will always execute.
    with metadata.Metadata(metadata_config) as m:
      # Artifact count increases by 3 for the new outputs of Evaluator and Pusher.
      self.assertLen(m.store.get_artifacts(), artifact_count + 3)
      artifact_count = len(m.store.get_artifacts())
      self.assertLen(m.store.get_executions(), 18)

    # Runs pipeline the third time.
    BeamDagRunner().run(
        taxi_pipeline_beam._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=beam_pipeline_args))

    # Asserts cached executions.
    with metadata.Metadata(metadata_config) as m:
      # Artifact count is unchanged.
      self.assertLen(m.store.get_artifacts(), artifact_count)
      self.assertLen(m.store.get_executions(), 27)
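
Example #1 relies on a _make_beam_pipeline_args helper that is defined elsewhere on the test class. A minimal sketch of what such a helper might return, assuming Beam's DirectRunner options (the exact flag values below are an assumption, not taken from this test):

  def _make_beam_pipeline_args(self):
    # Hypothetical sketch: constrain Beam's DirectRunner to a single
    # in-process worker so the end-to-end test stays deterministic.
    return [
        '--direct_running_mode=multi_processing',
        '--direct_num_workers=1',
    ]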
Example #2
    def testTaxiPipelineBeam(self):
        BeamDagRunner().run(
            taxi_pipeline_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                direct_num_workers=1))

        self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
        self.assertTrue(tf.io.gfile.exists(self._metadata_path))
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(metadata_config) as m:
            artifact_count = len(m.store.get_artifacts())
            execution_count = len(m.store.get_executions())
            self.assertGreaterEqual(artifact_count, execution_count)
            self.assertEqual(9, execution_count)

        self.assertPipelineExecution()

        # Runs pipeline the second time.
        BeamDagRunner().run(
            taxi_pipeline_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                direct_num_workers=1))

        # All executions but ModelValidator and Pusher are cached.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count increases by 2 for the new outputs of ModelValidator and Pusher.
            self.assertEqual(artifact_count + 2, len(m.store.get_artifacts()))
            artifact_count = len(m.store.get_artifacts())
            # 9 more cached executions.
            self.assertEqual(18, len(m.store.get_executions()))

        # Runs pipeline the third time.
        BeamDagRunner().run(
            taxi_pipeline_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                direct_num_workers=1))

        # Asserts cached executions.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is unchanged.
            self.assertEqual(artifact_count, len(m.store.get_artifacts()))
            # 9 more cached executions.
            self.assertEqual(27, len(m.store.get_executions()))
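
Both examples above follow the same pattern for inspecting ML Metadata: open the SQLite-backed store, then count artifacts and executions. The same pattern works standalone outside a test; a minimal sketch (the database path is illustrative):

  from tfx.orchestration import metadata

  # Open the MLMD store behind a finished pipeline run and count its records.
  connection_config = metadata.sqlite_metadata_connection_config(
      '/tmp/tfx/metadata/beam_test/metadata.db')  # illustrative path
  with metadata.Metadata(connection_config) as m:
    print('artifacts:', len(m.store.get_artifacts()))
    print('executions:', len(m.store.get_executions()))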
Example #3
    def setUp(self):
        super(TaxiPipelineRegressionEndToEndTest, self).setUp()
        self._test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        self._pipeline_name = 'beam_stub_test'
        # This example assumes that the taxi data and taxi utility function are
        # stored in tfx/examples/chicago_taxi_pipeline. Feel free to customize this
        # as needed.
        taxi_root = os.path.dirname(taxi_pipeline_beam.__file__)
        self._data_root = os.path.join(taxi_root, 'data', 'simple')
        self._module_file = os.path.join(taxi_root, 'taxi_utils.py')
        self._serving_model_dir = os.path.join(self._test_dir, 'serving_model')
        self._pipeline_root = os.path.join(self._test_dir, 'tfx', 'pipelines',
                                           self._pipeline_name)
        # Metadata path for recording successful pipeline run.
        self._recorded_mlmd_path = os.path.join(self._test_dir, 'tfx',
                                                'record', 'metadata.db')
        # Metadata path for stub pipeline runs.
        self._metadata_path = os.path.join(self._test_dir, 'tfx', 'metadata',
                                           self._pipeline_name, 'metadata.db')
        self._recorded_output_dir = os.path.join(self._test_dir, 'testdata')

        # Runs the pipeline and records its outputs to self._recorded_output_dir.
        record_taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._recorded_mlmd_path,
            beam_pipeline_args=[])

        local_dag_runner.LocalDagRunner().run(record_taxi_pipeline)

        pipeline_recorder_utils.record_pipeline(
            output_dir=self._recorded_output_dir,
            metadata_db_uri=self._recorded_mlmd_path,
            pipeline_name=self._pipeline_name)

        self.taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=[])
Example #4
  def testTaxiPipelineCheckDagConstruction(self):
    logical_pipeline = taxi_pipeline_beam._create_pipeline(
        pipeline_name='Test',
        pipeline_root=self._test_dir,
        data_root=self._test_dir,
        module_file=self._test_dir,
        serving_model_dir=self._test_dir,
        metadata_path=self._test_dir)
    self.assertEqual(9, len(logical_pipeline.components))
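
The assertion above only checks the component count. To see which components make up the DAG, the logical pipeline can be inspected directly; a short sketch (the expected ids in the comment are an assumption inferred from the execution counts in the other examples):

  # Hypothetical sketch: list the ids of the components in the logical DAG.
  for component in logical_pipeline.components:
    print(component.id)
  # Expected ids (assumption): CsvExampleGen, StatisticsGen, SchemaGen,
  # ExampleValidator, Transform, Trainer, a latest-blessed-model Resolver,
  # Evaluator, and Pusher.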
Example #5
  def testTaxiPipelineBeam(self):
    BeamDagRunner().run(
        taxi_pipeline_beam._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            direct_num_workers=1))

    self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
    self.assertTrue(tf.io.gfile.exists(self._metadata_path))
    metadata_config = metadata.sqlite_metadata_connection_config(
        self._metadata_path)
    with metadata.Metadata(metadata_config) as m:
      artifact_count = len(m.store.get_artifacts())
      execution_count = len(m.store.get_executions())
      self.assertGreaterEqual(artifact_count, execution_count)
      self.assertEqual(9, execution_count)

    self.assertPipelineExecution()

    # Run pipeline again.
    BeamDagRunner().run(
        taxi_pipeline_beam._create_pipeline(
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            direct_num_workers=1))

    # Asserts cached executions.
    with metadata.Metadata(metadata_config) as m:
      # Artifact count is unchanged.
      self.assertEqual(artifact_count, len(m.store.get_artifacts()))
      # 9 more cached executions.
      self.assertEqual(18, len(m.store.get_executions()))

    self.assertPipelineExecution()
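
assertPipelineExecution is a helper defined elsewhere on these test classes. A plausible sketch, assuming it verifies that every component materialized output under the pipeline root (the component ids listed are an assumption):

  def assertPipelineExecution(self):
    # Hypothetical sketch: every executed component is assumed to leave an
    # output directory named after itself under the pipeline root.
    for component_id in ('CsvExampleGen', 'StatisticsGen', 'SchemaGen',
                         'ExampleValidator', 'Transform', 'Trainer',
                         'Evaluator', 'Pusher'):
      component_path = os.path.join(self._pipeline_root, component_id)
      self.assertTrue(tf.io.gfile.exists(component_path))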
Example #6
    def testTaxiPipelineBeam(self):
        # Runs the pipeline and records its outputs to self._recorded_output_dir.
        record_taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._recorded_mlmd_path,
            beam_pipeline_args=[])
        BeamDagRunner().run(record_taxi_pipeline)
        pipeline_recorder_utils.record_pipeline(
            output_dir=self._recorded_output_dir,
            metadata_db_uri=self._recorded_mlmd_path,
            host=None,
            port=None,
            pipeline_name=self._pipeline_name,
            run_id=None)

        # Run pipeline with stub executors.
        taxi_pipeline = taxi_pipeline_beam._create_pipeline(  # pylint:disable=protected-access
            pipeline_name=self._pipeline_name,
            data_root=self._data_root,
            module_file=self._module_file,
            serving_model_dir=self._serving_model_dir,
            pipeline_root=self._pipeline_root,
            metadata_path=self._metadata_path,
            beam_pipeline_args=[])

        model_resolver_id = 'ResolverNode.latest_blessed_model_resolver'
        stubbed_component_ids = [
            component.id for component in taxi_pipeline.components
            if component.id != model_resolver_id
        ]

        stub_launcher = stub_component_launcher.get_stub_launcher_class(
            test_data_dir=self._recorded_output_dir,
            stubbed_component_ids=stubbed_component_ids,
            stubbed_component_map={})
        stub_pipeline_config = pipeline_config.PipelineConfig(
            supported_launcher_classes=[
                stub_launcher,
            ])
        BeamDagRunner(config=stub_pipeline_config).run(taxi_pipeline)

        self.assertTrue(tf.io.gfile.exists(self._metadata_path))

        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)

        # Verify that recorded files are successfully copied to the output uris.
        with metadata.Metadata(metadata_config) as m:
            artifacts = m.store.get_artifacts()
            artifact_count = len(artifacts)
            executions = m.store.get_executions()
            execution_count = len(executions)
            # Artifact count is greater than execution count by 3: Evaluator
            # (blessing and evaluation), Trainer (model and model_run), and
            # Transform (examples, graph, cache) each produce extra artifacts,
            # while Resolver generates no new artifact.
            self.assertEqual(artifact_count, execution_count + 3)
            self.assertLen(taxi_pipeline.components, execution_count)

            for execution in executions:
                component_id = execution.properties[
                    metadata._EXECUTION_TYPE_KEY_COMPONENT_ID].string_value  # pylint: disable=protected-access
                if component_id == 'ResolverNode.latest_blessed_model_resolver':
                    continue
                eid = [execution.id]
                events = m.store.get_events_by_execution_ids(eid)
                output_events = [
                    x for x in events
                    if x.type == metadata_store_pb2.Event.OUTPUT
                ]
                for event in output_events:
                    steps = event.path.steps
                    self.assertTrue(steps[0].HasField('key'))
                    name = steps[0].key
                    artifacts = m.store.get_artifacts_by_id(
                        [event.artifact_id])
                    for idx, artifact in enumerate(artifacts):
                        self.assertDirectoryEqual(
                            artifact.uri,
                            os.path.join(self._recorded_output_dir,
                                         component_id, name, str(idx)))
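
assertDirectoryEqual, used in the loop above, is another helper that is not shown. A minimal sketch built on filecmp, assuming a file-by-file comparison (the real helper may recurse; this version only checks the top level of each tree):

    import filecmp

    def assertDirectoryEqual(self, dir1, dir2):
        # Hypothetical sketch: the two trees must contain the same entries,
        # and every common file must compare equal.
        diff = filecmp.dircmp(dir1, dir2)
        self.assertEqual([], diff.left_only)
        self.assertEqual([], diff.right_only)
        self.assertEqual([], diff.diff_files)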
Example #7
    def testTaxiPipelineBeam(self):
        num_components = 10

        BeamDagRunner().run(
            taxi_pipeline_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                direct_num_workers=1))

        self.assertTrue(tf.io.gfile.exists(self._serving_model_dir))
        self.assertTrue(tf.io.gfile.exists(self._metadata_path))
        metadata_config = metadata.sqlite_metadata_connection_config(
            self._metadata_path)
        with metadata.Metadata(metadata_config) as m:
            artifact_count = len(m.store.get_artifacts())
            execution_count = len(m.store.get_executions())
            self.assertGreaterEqual(artifact_count, execution_count)
            self.assertEqual(num_components, execution_count)

        self.assertPipelineExecution()
        self.assertInfraValidatorPassed()

        # Runs pipeline the second time.
        BeamDagRunner().run(
            taxi_pipeline_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                direct_num_workers=1))

        # All executions but Evaluator and Pusher are cached.
        # Note that Resolver will always execute.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count increases by 3 for the new outputs of Evaluator and Pusher.
            self.assertEqual(artifact_count + 3, len(m.store.get_artifacts()))
            artifact_count = len(m.store.get_artifacts())
            # 10 more cached executions.
            self.assertEqual(num_components * 2, len(m.store.get_executions()))

        # Runs pipeline the third time.
        BeamDagRunner().run(
            taxi_pipeline_beam._create_pipeline(
                pipeline_name=self._pipeline_name,
                data_root=self._data_root,
                module_file=self._module_file,
                serving_model_dir=self._serving_model_dir,
                pipeline_root=self._pipeline_root,
                metadata_path=self._metadata_path,
                direct_num_workers=1))

        # Asserts cached executions.
        with metadata.Metadata(metadata_config) as m:
            # Artifact count is unchanged.
            self.assertEqual(artifact_count, len(m.store.get_artifacts()))
            # 10 more cached executions.
            self.assertEqual(num_components * 3, len(m.store.get_executions()))
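
Example #7 additionally calls assertInfraValidatorPassed, which is also defined outside these snippets. A plausible sketch, assuming InfraValidator writes an INFRA_BLESSED marker file into its blessing artifact when the model passes (the directory layout here is an assumption):

    def assertInfraValidatorPassed(self):
        # Hypothetical sketch: find the blessing artifacts InfraValidator wrote
        # under the pipeline root and check each carries the INFRA_BLESSED marker.
        blessing_dirs = tf.io.gfile.glob(
            os.path.join(self._pipeline_root, 'InfraValidator', 'blessing', '*'))
        self.assertTrue(blessing_dirs)
        for blessing_dir in blessing_dirs:
            self.assertTrue(
                tf.io.gfile.exists(os.path.join(blessing_dir, 'INFRA_BLESSED')))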