Example #1
 def test_channel_as_channel_success(self):
     instance_a = types.Artifact('MyTypeName')
     instance_b = types.Artifact('MyTypeName')
     chnl_original = channel.Channel('MyTypeName',
                                     artifacts=[instance_a, instance_b])
     chnl_result = channel.as_channel(chnl_original)
     self.assertEqual(chnl_original, chnl_result)
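
For contrast, as_channel elsewhere in this suite is handed a plain list of artifacts (e.g. channel.as_channel([examples]) in the component tests below). A minimal sketch of that case, assuming as_channel infers type_name from the artifacts it wraps:

 def test_as_channel_from_artifact_list(self):
     instance_a = types.Artifact('MyTypeName')
     instance_b = types.Artifact('MyTypeName')
     # Assumption: as_channel builds a new Channel around the list.
     chnl = channel.as_channel([instance_a, instance_b])
     self.assertEqual(chnl.type_name, 'MyTypeName')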
Example #2
 def test_valid_channel(self):
     instance_a = types.Artifact('MyTypeName')
     instance_b = types.Artifact('MyTypeName')
     chnl = channel.Channel('MyTypeName',
                            artifacts=[instance_a, instance_b])
     self.assertEqual(chnl.type_name, 'MyTypeName')
     self.assertItemsEqual(chnl.get(), [instance_a, instance_b])
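
The invalid counterpart, sketched under the assumption that Channel validates each artifact's type_name against its own and raises ValueError on a mismatch:

 def test_invalid_channel_type(self):
     mismatched = types.Artifact('OtherTypeName')
     # Assumption: a mismatched artifact type is rejected at construction.
     with self.assertRaises(ValueError):
         channel.Channel('MyTypeName', artifacts=[mismatched])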
Example #3
    def testCsvExampleGenWrapper(self):
        input_base = types.Artifact(type_name='ExternalPath', split='')
        input_base.uri = '/path/to/dataset'

        with patch.object(executor, 'Executor', autospec=True) as _:
            wrapper = executor_wrappers.CsvExampleGenWrapper(
                argparse.Namespace(
                    exec_properties=json.dumps(self.exec_properties),
                    outputs=artifact_utils.jsonify_artifact_dict(
                        {'examples': self.examples}),
                    executor_class_path=(
                        'tfx.components.example_gen.csv_example_gen.executor.Executor'
                    ),
                    input_base=json.dumps([input_base.json_dict()])))
            wrapper.run(output_basedir=self.output_basedir)

            # TODO(b/133011207): Validate arguments for executor and Do() method.

            metadata_file = os.path.join(self.output_basedir,
                                         'output/ml_metadata/examples')

            expected_output_examples = types.Artifact(type_name='ExamplesPath',
                                                      split='dummy')
            # Expect that span and path are resolved.
            expected_output_examples.span = 1
            expected_output_examples.uri = (
                '/path/to/output/csv_example_gen/examples/mock_workflow_id/dummy/'
            )

            with tf.gfile.GFile(metadata_file) as f:
                self.assertEqual([expected_output_examples.json_dict()],
                                 json.loads(f.read()))
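
The outputs flag above is built with artifact_utils.jsonify_artifact_dict. A hedged sketch of the round-trip this implies, assuming the matching artifact_utils.parse_artifact_dict helper exists in this vintage:

            # Sketch: serialize an artifact dict to JSON, then restore it.
            serialized = artifact_utils.jsonify_artifact_dict(
                {'examples': self.examples})
            restored = artifact_utils.parse_artifact_dict(serialized)
            assert list(restored.keys()) == ['examples']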
Example #4
  def test_do(self):
    source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    tf.gfile.MakeDirs(output_data_dir)

    # Create input dict.
    train_examples = types.Artifact(type_name='ExamplesPath', split='train')
    train_examples.uri = os.path.join(source_data_dir, 'csv_example_gen/train/')
    eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
    eval_examples.uri = os.path.join(source_data_dir, 'csv_example_gen/eval/')

    train_stats = types.Artifact(
        type_name='ExampleStatisticsPath', split='train')
    train_stats.uri = os.path.join(output_data_dir, 'train', '')
    eval_stats = types.Artifact(type_name='ExampleStatisticsPath', split='eval')
    eval_stats.uri = os.path.join(output_data_dir, 'eval', '')
    input_dict = {
        'input_data': [train_examples, eval_examples],
    }

    output_dict = {
        'output': [train_stats, eval_stats],
    }

    # Run executor.
    evaluator = executor.Executor()
    evaluator.Do(input_dict, output_dict, exec_properties={})

    # Check statistics_gen outputs.
    self._validate_stats_output(os.path.join(train_stats.uri, 'stats_tfrecord'))
    self._validate_stats_output(os.path.join(eval_stats.uri, 'stats_tfrecord'))
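
A hedged sketch of the _validate_stats_output helper called on the last two lines; the real helper lives elsewhere in the test class, so this body is an assumption:

  def _validate_stats_output(self, stats_path):
    # Sketch: treat the output as valid if the stats file was written and
    # is non-empty.
    self.assertTrue(tf.gfile.Exists(stats_path))
    self.assertGreater(tf.gfile.GFile(stats_path).size(), 0)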
Example #5
    def setUp(self):
        self._source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        self._output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.gfile.MakeDirs(self._output_data_dir)
        self._model_export = types.Artifact(type_name='ModelExportPath')
        self._model_export.uri = os.path.join(self._source_data_dir,
                                              'trainer/current/')
        self._model_blessing = types.Artifact(type_name='ModelBlessingPath')
        self._input_dict = {
            'model_export': [self._model_export],
            'model_blessing': [self._model_blessing],
        }

        self._model_push = types.Artifact(type_name='ModelPushPath')
        self._model_push.uri = os.path.join(self._output_data_dir,
                                            'model_push')
        tf.gfile.MakeDirs(self._model_push.uri)
        self._output_dict = {
            'model_push': [self._model_push],
        }
        self._serving_model_dir = os.path.join(self._output_data_dir,
                                               'serving_model_dir')
        tf.gfile.MakeDirs(self._serving_model_dir)
        self._exec_properties = {
            'push_destination':
            json_format.MessageToJson(
                pusher_pb2.PushDestination(
                    filesystem=pusher_pb2.PushDestination.Filesystem(
                        base_directory=self._serving_model_dir))),
        }
        self._executor = executor.Executor()
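
A hedged sketch of a test built on this setUp; the method name and assertion are assumptions, not from the source:

    def testDo(self):
        # Sketch: run the pusher, then expect the model_push output
        # directory to be populated.
        self._executor.Do(self._input_dict, self._output_dict,
                          self._exec_properties)
        self.assertNotEqual(
            0, len(tf.gfile.ListDirectory(self._model_push.uri)))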
Example #6
    def testPipelineWithArtifactInfo(self):
        artifacts_collection = [types.Artifact('channel_one')]
        channel_one = types.Channel(type_name='channel_one',
                                    artifacts=artifacts_collection)
        component_a = _make_fake_component_instance(
            name='component_a', inputs={}, outputs={'one': channel_one})
        component_b = _make_fake_component_instance(
            name='component_b',
            inputs={
                'a': component_a.outputs.one,
            },
            outputs={})

        my_pipeline = pipeline.Pipeline(
            pipeline_name='a',
            pipeline_root='b',
            components=[component_b, component_a],
            metadata_connection_config=self._metadata_connection_config)
        expected_artifact = types.Artifact('channel_one')
        expected_artifact.name = 'one'
        expected_artifact.pipeline_name = 'a'
        expected_artifact.pipeline_timestamp_ms = 0
        expected_artifact.producer_component = 'component_a'
        self.assertItemsEqual(my_pipeline.components,
                              [component_a, component_b])
        self.assertEqual(component_a.outputs.one._artifacts[0].pipeline_name,
                         'a')
        self.assertEqual(
            component_a.outputs.one._artifacts[0].producer_component,
            component_a.component_id)
        self.assertEqual(component_a.outputs.one._artifacts[0].name, 'one')
        self.assertEqual(component_b.inputs.a._artifacts[0].pipeline_name, 'a')
        self.assertEqual(component_b.inputs.a._artifacts[0].producer_component,
                         component_a.component_id)
        self.assertEqual(component_b.inputs.a._artifacts[0].name, 'one')
Example #7
  def test_fetch_previous_result(self):
    with metadata.Metadata(connection_config=self._connection_config) as m:

      # Create a 'previous' execution.
      exec_properties = {'log_root': 'path'}
      eid = m.register_execution(
          exec_properties=exec_properties,
          pipeline_info=self._pipeline_info,
          component_info=self._component_info)
      input_artifact = types.Artifact(type_name='ExamplesPath')
      m.publish_artifacts([input_artifact])
      output_artifact = types.Artifact(type_name='ExamplesPath')
      input_artifacts = {'input': [input_artifact]}
      output_artifacts = {'output': [output_artifact]}
      m.publish_execution(eid, input_artifacts, output_artifacts)

      # Test previous_execution.
      self.assertEqual(
          None,
          m.previous_execution(
              input_artifacts=input_artifacts,
              exec_properties={},
              pipeline_info=self._pipeline_info,
              component_info=self._component_info))
      self.assertEqual(
          None,
          m.previous_execution(
              input_artifacts={},
              exec_properties=exec_properties,
              pipeline_info=self._pipeline_info,
              component_info=self._component_info))
      self.assertEqual(
          None,
          m.previous_execution(
              input_artifacts=input_artifacts,
              exec_properties=exec_properties,
              pipeline_info=self._pipeline_info,
              component_info=data_types.ComponentInfo(
                  component_id='unique', component_type='a.b.c')))
      self.assertEqual(
          eid,
          m.previous_execution(
              input_artifacts=input_artifacts,
              exec_properties=exec_properties,
              pipeline_info=self._pipeline_info,
              component_info=self._component_info))

      # Test fetch_previous_result_artifacts.
      new_output_artifact = types.Artifact(type_name='ExamplesPath')
      self.assertNotEqual(ArtifactState.PUBLISHED,
                          new_output_artifact.state)
      new_output_dict = {'output': [new_output_artifact]}
      updated_output_dict = m.fetch_previous_result_artifacts(
          new_output_dict, eid)
      previous_artifact = output_artifacts['output'][-1].artifact
      current_artifact = updated_output_dict['output'][-1].artifact
      self.assertEqual(ArtifactState.PUBLISHED,
                       current_artifact.properties['state'].string_value)
      self.assertEqual(previous_artifact.id, current_artifact.id)
      self.assertEqual(previous_artifact.type_id, current_artifact.type_id)
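
The three None assertions above spell out the cache key: previous_execution matches only when input artifacts, exec_properties, pipeline_info, and component_info all agree with the registered execution. One more miss case, sketched under that same assumption:

      # Sketch: changing the input key alone should also miss the cache.
      self.assertEqual(
          None,
          m.previous_execution(
              input_artifacts={'other_key': [input_artifact]},
              exec_properties=exec_properties,
              pipeline_info=self._pipeline_info,
              component_info=self._component_info))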
Example #8
    def test_do(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')

        train_stats_artifact = types.Artifact('ExampleStatsPath',
                                              split='train')
        train_stats_artifact.uri = os.path.join(source_data_dir,
                                                'statistics_gen/train/')

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        schema_output = types.Artifact('SchemaPath')
        schema_output.uri = os.path.join(output_data_dir, 'schema_output')

        input_dict = {
            'stats': [train_stats_artifact],
        }
        output_dict = {
            'output': [schema_output],
        }

        exec_properties = {}

        schema_gen_executor = executor.Executor()
        schema_gen_executor.Do(input_dict, output_dict, exec_properties)
        self.assertNotEqual(0, len(tf.gfile.ListDirectory(schema_output.uri)))
Example #9
  def setUp(self):
    self._source_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'testdata')
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    self.component_name = 'test_component'

    # Create input dict.
    eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
    eval_examples.uri = os.path.join(self._source_data_dir,
                                     'csv_example_gen/eval/')
    model = types.Artifact(type_name='ModelExportPath')
    model.uri = os.path.join(self._source_data_dir, 'trainer/current/')
    self._input_dict = {
        'examples': [eval_examples],
        'model': [model],
    }

    # Create output dict.
    self._blessing = types.Artifact('ModelBlessingPath')
    self._blessing.uri = os.path.join(output_data_dir, 'blessing')
    self._output_dict = {
        'blessing': [self._blessing]
    }

    # Create context
    self._tmp_dir = os.path.join(output_data_dir, '.temp')
    self._context = executor.Executor.Context(tmp_dir=self._tmp_dir,
                                              unique_id='2')
Example #10
 def setUp(self):
     self._mock_metadata = tf.test.mock.Mock()
     self._input_dict = {
         'input_data':
         channel.Channel(type_name='input_data',
                         artifacts=[types.Artifact(type_name='input_data')])
     }
     input_dir = os.path.join(
         os.environ.get('TEST_TMP_DIR', self.get_temp_dir()),
         self._testMethodName, 'input_dir')
     # Valid input artifacts must have a URI pointing to an existing directory.
     for key, input_channel in self._input_dict.items():
         for index, artifact in enumerate(input_channel.get()):
             artifact.id = index + 1
             uri = os.path.join(input_dir, key, str(artifact.id), '')
             artifact.uri = uri
             tf.gfile.MakeDirs(uri)
     self._output_dict = {
         'output_data':
         channel.Channel(type_name='output_data',
                         artifacts=[
                             types.Artifact(type_name='output_data',
                                            split='split')
                         ])
     }
     self._input_artifacts = channel.unwrap_channel_dict(self._input_dict)
     self._output_artifacts = {
         'output_data': [types.Artifact(type_name='OutputType')],
     }
     self._exec_properties = {
         'key': 'value',
     }
     self._execution_id = 100
Example #11
 def test_construct(self):
     examples = types.Artifact(type_name='ExamplesPath')
     model_exports = types.Artifact(type_name='ModelExportPath')
     evaluator = component.Evaluator(
         examples=channel.as_channel([examples]),
         model_exports=channel.as_channel([model_exports]))
     self.assertEqual('ModelEvalPath', evaluator.outputs.output.type_name)
Example #12
    def fakeUpstreamOutputs(mlmd_connection: metadata.Metadata,
                            example_gen: pipeline_pb2.PipelineNode,
                            transform: pipeline_pb2.PipelineNode):

        with mlmd_connection as m:
            if example_gen:
                # Publishes ExampleGen output.
                output_example = types.Artifact(
                    example_gen.outputs.outputs['output_examples'].
                    artifact_spec.type)
                output_example.uri = 'my_examples_uri'
                contexts = context_lib.register_contexts_if_not_exists(
                    m, example_gen.contexts)
                execution = execution_publish_utils.register_execution(
                    m, example_gen.node_info.type, contexts)
                execution_publish_utils.publish_succeeded_execution(
                    m, execution.id, contexts, {
                        'output_examples': [output_example],
                    })

            if transform:
                # Publishes Transform output.
                output_transform_graph = types.Artifact(
                    transform.outputs.outputs['transform_graph']
                    .artifact_spec.type)
                output_transform_graph.uri = 'my_transform_graph_uri'
                contexts = context_lib.register_contexts_if_not_exists(
                    m, transform.contexts)
                execution = execution_publish_utils.register_execution(
                    m, transform.node_info.type, contexts)
                execution_publish_utils.publish_succeeded_execution(
                    m, execution.id, contexts, {
                        'transform_graph': [output_transform_graph],
                    })
Example #13
    def setUp(self):
        self._source_data_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
            'components', 'testdata')
        self._output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        tf.gfile.MakeDirs(self._output_data_dir)
        self._model_export = types.Artifact(type_name='ModelExportPath')
        self._model_export.uri = os.path.join(self._source_data_dir,
                                              'trainer/current/')
        self._model_blessing = types.Artifact(type_name='ModelBlessingPath')
        self._input_dict = {
            'model_export': [self._model_export],
            'model_blessing': [self._model_blessing],
        }

        self._model_push = types.Artifact(type_name='ModelPushPath')
        self._model_push.uri = os.path.join(self._output_data_dir,
                                            'model_push')
        tf.gfile.MakeDirs(self._model_push.uri)
        self._output_dict = {
            'model_push': [self._model_push],
        }
        self._exec_properties = {
            'custom_config': {
                'ai_platform_serving_args': {
                    'model_name': 'model_name',
                    'project_id': 'project_id'
                },
            },
        }
        self._executor = Executor()
Example #14
 def testMainEmptyInputs(self):
     """Test executor class import under empty inputs/outputs."""
     inputs = {
         'x':
         [types.Artifact(type_name='X'),
          types.Artifact(type_name='X')]
     }
     outputs = {'y': [types.Artifact(type_name='Y')]}
     exec_properties = {'a': 'b'}
     args = [
         '--executor_class_path=%s.%s' %
         (FakeExecutor.__module__, FakeExecutor.__name__),
         '--inputs=%s' % artifact_utils.jsonify_artifact_dict(inputs),
         '--outputs=%s' % artifact_utils.jsonify_artifact_dict(outputs),
         '--exec-properties=%s' % json.dumps(exec_properties),
     ]
     with ArgsCapture() as args_capture:
         run_executor.main(args)
         # TODO(b/131417512): Add equal comparison to types.Artifact class so we
         # can use asserters.
         self.assertSetEqual(set(args_capture.input_dict.keys()),
                             set(inputs.keys()))
         self.assertSetEqual(set(args_capture.output_dict.keys()),
                             set(outputs.keys()))
         self.assertDictEqual(args_capture.exec_properties, exec_properties)
Example #15
File: executor_test.py, Project: two8g/tfx
  def testDoWithCache(self):
    # First run that creates cache.
    output_cache_artifact = types.Artifact('OutputCache')
    output_cache_artifact.uri = os.path.join(self._output_data_dir, 'CACHE/')

    self._output_dict['cache_output_path'] = [output_cache_artifact]

    self._exec_properties['module_file'] = self._module_file
    self._transform_executor.Do(self._input_dict, self._output_dict,
                                self._exec_properties)
    self._verify_transform_outputs()
    self.assertNotEqual(0,
                        len(tf.gfile.ListDirectory(output_cache_artifact.uri)))

    # Second run from cache.
    self._output_data_dir = self._get_output_data_dir('2nd_run')
    input_cache_artifact = types.Artifact('InputCache')
    input_cache_artifact.uri = output_cache_artifact.uri

    output_cache_artifact = types.Artifact('OutputCache')
    output_cache_artifact.uri = os.path.join(self._output_data_dir, 'CACHE/')

    self._make_base_do_params(self._source_data_dir, self._output_data_dir)

    self._input_dict['cache_input_path'] = [input_cache_artifact]
    self._output_dict['cache_output_path'] = [output_cache_artifact]

    self._exec_properties['module_file'] = self._module_file
    self._transform_executor.Do(self._input_dict, self._output_dict,
                                self._exec_properties)

    self._verify_transform_outputs()
    self.assertNotEqual(0,
                        len(tf.gfile.ListDirectory(output_cache_artifact.uri)))
Example #16
 def test_construct(self):
   examples = types.Artifact(type_name='ExamplesPath')
   model = types.Artifact(type_name='ModelExportPath')
   model_validator = component.ModelValidator(
       examples=channel.as_channel([examples]),
       model=channel.as_channel([model]))
   self.assertEqual('ModelBlessingPath',
                    model_validator.outputs.blessing.type_name)
Example #17
 def test_construct(self):
     train_examples = types.Artifact(type_name='ExamplesPath',
                                     split='train')
     eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
     statistics_gen = component.StatisticsGen(
         input_data=channel.as_channel([train_examples, eval_examples]))
     self.assertEqual('ExampleStatisticsPath',
                      statistics_gen.outputs.output.type_name)
Example #18
    def test_do(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create input dict.
        train_examples = types.Artifact(type_name='ExamplesPath',
                                        split='train')
        train_examples.uri = os.path.join(
            source_data_dir, 'transform/transformed_examples/train/')
        eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
        eval_examples.uri = os.path.join(
            source_data_dir, 'transform/transformed_examples/eval/')
        transform_output = types.Artifact(type_name='TransformPath')
        transform_output.uri = os.path.join(source_data_dir,
                                            'transform/transform_output/')
        schema = types.Artifact(type_name='SchemaPath')
        schema.uri = os.path.join(source_data_dir, 'schema_gen/')

        input_dict = {
            'examples': [train_examples, eval_examples],
            'transform_output': [transform_output],
            'schema': [schema],
        }

        # Create output dict.
        model_exports = types.Artifact(type_name='ModelExportPath')
        model_exports.uri = os.path.join(output_data_dir, 'model_export_path')
        output_dict = {'output': [model_exports]}

        # Create exec properties.
        module_file_path = os.path.join(source_data_dir, 'module_file',
                                        'trainer_module.py')

        exec_properties = {
            'train_args':
            json_format.MessageToJson(trainer_pb2.TrainArgs(num_steps=1000)),
            'eval_args':
            json_format.MessageToJson(trainer_pb2.EvalArgs(num_steps=500)),
            'module_file':
            module_file_path,
            'warm_starting':
            False,
        }

        trainer_executor = executor.Executor()
        trainer_executor.Do(input_dict=input_dict,
                            output_dict=output_dict,
                            exec_properties=exec_properties)

        # Check outputs.
        self.assertTrue(
            tf.gfile.Exists(os.path.join(model_exports.uri, 'eval_model_dir')))
        self.assertTrue(
            tf.gfile.Exists(
                os.path.join(model_exports.uri, 'serving_model_dir')))
Example #19
 def test_unwrap_channel_dict(self):
     instance_a = types.Artifact('MyTypeName')
     instance_b = types.Artifact('MyTypeName')
     channel_dict = {
         'id': channel.Channel('MyTypeName',
                               artifacts=[instance_a, instance_b])
     }
     result = channel.unwrap_channel_dict(channel_dict)
     self.assertDictEqual(result, {'id': [instance_a, instance_b]})
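
The same helper over several keys, as a sketch (assuming unwrap_channel_dict simply maps each key's channel to its artifact list):

 def test_unwrap_channel_dict_multiple_keys(self):
     instance_a = types.Artifact('TypeA')
     instance_b = types.Artifact('TypeB')
     channel_dict = {
         'a': channel.Channel('TypeA', artifacts=[instance_a]),
         'b': channel.Channel('TypeB', artifacts=[instance_b]),
     }
     result = channel.unwrap_channel_dict(channel_dict)
     self.assertDictEqual(result, {'a': [instance_a], 'b': [instance_b]})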
Example #20
    def test_run(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        example_gen = FileBasedExampleGen(
            executor_class=parquet_executor.Executor,
            input_base=external_input(self.parquet_dir_path),
            input_config=self.input_config,
            output_config=self.output_config,
            name='parquetExampleGenComponent')

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        pipeline_root = os.path.join(output_data_dir, 'Test')
        tf.gfile.MakeDirs(pipeline_root)
        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()

        launcher = component_launcher.ComponentLauncher(
            component=example_gen,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection_config=connection_config,
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type, '.'.join(
                [FileBasedExampleGen.__module__,
                 FileBasedExampleGen.__name__]))

        launcher.launch()
        mock_publisher.return_value.publish_execution.assert_called_once()

        # Get output paths.
        component_id = '.'.join([example_gen.component_name, example_gen.name])
        output_path = os.path.join(pipeline_root, component_id, 'examples/1')
        train_examples = types.Artifact(type_name='ExamplesPath',
                                        split='train')
        train_examples.uri = os.path.join(output_path, 'train')
        eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
        eval_examples.uri = os.path.join(output_path, 'eval')

        # Check parquet example gen outputs.
        train_output_file = os.path.join(train_examples.uri,
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(eval_examples.uri,
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.gfile.Exists(train_output_file))
        self.assertTrue(tf.gfile.Exists(eval_output_file))
        self.assertGreater(
            tf.gfile.GFile(train_output_file).size(),
            tf.gfile.GFile(eval_output_file).size())
Example #21
 def setUp(self):
     self._mock_metadata = tf.test.mock.Mock()
     self._mock_metadata.publish_execution = tf.test.mock.Mock()
     self._input_dict = {
         'input_data': [types.Artifact(type_name='InputType')],
     }
     self._output_dict = {
         'output_data': [types.Artifact(type_name='OutputType')],
     }
     self._execution_id = 100
Example #22
 def test_construct_without_transform_output(self):
     transformed_examples = types.Artifact(type_name='ExamplesPath')
     schema = types.Artifact(type_name='SchemaPath')
     trainer = component.Trainer(
         module_file='/path/to/module/file',
         examples=channel.as_channel([transformed_examples]),
         schema=channel.as_channel([schema]),
         train_args=trainer_pb2.TrainArgs(num_steps=100),
         eval_args=trainer_pb2.EvalArgs(num_steps=50))
     self.assertEqual('ModelExportPath', trainer.outputs.output.type_name)
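
The companion construction that does pass a transform graph, sketched on the assumption that this Trainer vintage also accepts a transform_output channel:

 def test_construct_with_transform_output(self):
     transformed_examples = types.Artifact(type_name='ExamplesPath')
     transform_output = types.Artifact(type_name='TransformPath')
     schema = types.Artifact(type_name='SchemaPath')
     trainer = component.Trainer(
         module_file='/path/to/module/file',
         examples=channel.as_channel([transformed_examples]),
         transform_output=channel.as_channel([transform_output]),
         schema=channel.as_channel([schema]),
         train_args=trainer_pb2.TrainArgs(num_steps=100),
         eval_args=trainer_pb2.EvalArgs(num_steps=50))
     self.assertEqual('ModelExportPath', trainer.outputs.output.type_name)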
Example #23
 def test_construct_with_slice_spec(self):
     examples = types.Artifact(type_name='ExamplesPath')
     model_exports = types.Artifact(type_name='ModelExportPath')
     evaluator = component.Evaluator(
         examples=channel.as_channel([examples]),
         model_exports=channel.as_channel([model_exports]),
         feature_slicing_spec=evaluator_pb2.FeatureSlicingSpec(specs=[
             evaluator_pb2.SingleSlicingSpec(
                 column_for_slicing=['trip_start_hour'])
         ]))
     self.assertEqual('ModelEvalPath', evaluator.outputs.output.type_name)
Example #24
    def testDo(self, mock_client):
        # Mock query result schema for _BigQueryConverter.
        mock_client.return_value.query.return_value.result.return_value.schema = self._schema

        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create output dict.
        train_examples = types.Artifact(type_name='ExamplesPath',
                                        split='train')
        train_examples.uri = os.path.join(output_data_dir, 'train')
        eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
        eval_examples.uri = os.path.join(output_data_dir, 'eval')
        output_dict = {'examples': [train_examples, eval_examples]}

        # Create exec properties.
        exec_properties = {
            'input_config':
            json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(
                        name='bq', pattern='SELECT i, f, s FROM `fake`'),
                ])),
            'output_config':
            json_format.MessageToJson(
                example_gen_pb2.Output(
                    split_config=example_gen_pb2.SplitConfig(splits=[
                        example_gen_pb2.SplitConfig.Split(name='train',
                                                          hash_buckets=2),
                        example_gen_pb2.SplitConfig.Split(name='eval',
                                                          hash_buckets=1)
                    ])))
        }

        # Run executor.
        big_query_example_gen = executor.Executor()
        big_query_example_gen.Do({}, output_dict, exec_properties)

        # Check BigQuery example gen outputs.
        train_output_file = os.path.join(train_examples.uri,
                                         'data_tfrecord-00000-of-00001.gz')
        eval_output_file = os.path.join(eval_examples.uri,
                                        'data_tfrecord-00000-of-00001.gz')
        self.assertTrue(tf.gfile.Exists(train_output_file))
        self.assertTrue(tf.gfile.Exists(eval_output_file))
        self.assertGreater(
            tf.gfile.GFile(train_output_file).size(),
            tf.gfile.GFile(eval_output_file).size())
Example #25
    def testPreExecutionNewExecution(self, mock_verify_input_artifacts_fn):
        input_dict = {
            'input_a':
            types.Channel(type_name='input_a',
                          artifacts=[types.Artifact(type_name='input_a')])
        }
        output_dict = {
            'output_a':
            types.Channel(type_name='output_a',
                          artifacts=[
                              types.Artifact(type_name='output_a',
                                             split='split')
                          ])
        }
        execution_id = 1
        context_id = 123
        exec_properties = copy.deepcopy(self._exec_properties)
        driver_args = data_types.DriverArgs(enable_cache=True)
        pipeline_info = data_types.PipelineInfo(
            pipeline_name='my_pipeline_name',
            pipeline_root=os.environ.get('TEST_TMP_DIR', self.get_temp_dir()),
            run_id='my_run_id')
        component_info = data_types.ComponentInfo(
            component_type='a.b.c', component_id='my_component_id')
        self._mock_metadata.get_artifacts_by_info.side_effect = list(
            input_dict['input_a'].get())
        self._mock_metadata.register_execution.side_effect = [execution_id]
        self._mock_metadata.previous_execution.side_effect = [None]
        self._mock_metadata.register_run_context_if_not_exists.side_effect = [
            context_id
        ]

        driver = base_driver.BaseDriver(metadata_handler=self._mock_metadata)
        execution_decision = driver.pre_execution(
            input_dict=input_dict,
            output_dict=output_dict,
            exec_properties=exec_properties,
            driver_args=driver_args,
            pipeline_info=pipeline_info,
            component_info=component_info)
        self.assertFalse(execution_decision.use_cached_results)
        self.assertEqual(execution_decision.execution_id, 1)
        self.assertItemsEqual(execution_decision.exec_properties,
                              exec_properties)
        self.assertEqual(
            execution_decision.output_dict['output_a'][0].uri,
            os.path.join(pipeline_info.pipeline_root,
                         component_info.component_id, 'output_a',
                         str(execution_id), 'split', ''))
Example #26
    def test_do(self):
        source_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'testdata')
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        # Create input dict.
        train_examples = types.Artifact(type_name='ExamplesPath',
                                        split='train')
        train_examples.uri = os.path.join(source_data_dir,
                                          'csv_example_gen/train/')
        eval_examples = types.Artifact(type_name='ExamplesPath', split='eval')
        eval_examples.uri = os.path.join(source_data_dir,
                                         'csv_example_gen/eval/')
        model_exports = types.Artifact(type_name='ModelExportPath')
        model_exports.uri = os.path.join(source_data_dir, 'trainer/current/')
        input_dict = {
            'examples': [train_examples, eval_examples],
            'model_exports': [model_exports],
        }

        # Create output dict.
        eval_output = types.Artifact('ModelEvalPath')
        eval_output.uri = os.path.join(output_data_dir, 'eval_output')
        output_dict = {'output': [eval_output]}

        # Create exec properties.
        exec_properties = {
            'feature_slicing_spec':
            json_format.MessageToJson(
                evaluator_pb2.FeatureSlicingSpec(specs=[
                    evaluator_pb2.SingleSlicingSpec(
                        column_for_slicing=['trip_start_hour']),
                    evaluator_pb2.SingleSlicingSpec(
                        column_for_slicing=['trip_start_day', 'trip_miles']),
                ]))
        }

        # Run executor.
        evaluator = executor.Executor()
        evaluator.Do(input_dict, output_dict, exec_properties)

        # Check evaluator outputs.
        self.assertTrue(
            tf.gfile.Exists(os.path.join(eval_output.uri, 'eval_config')))
        self.assertTrue(
            tf.gfile.Exists(os.path.join(eval_output.uri, 'metrics')))
        self.assertTrue(tf.gfile.Exists(os.path.join(eval_output.uri,
                                                     'plots')))
Example #27
    def _create_launcher_context(self, component_config=None):
        test_dir = self.get_temp_dir()

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()

        pipeline_root = os.path.join(test_dir, 'Test')

        input_artifact = types.Artifact(type_name='InputPath')
        input_artifact.uri = os.path.join(test_dir, 'input')

        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel_utils.as_channel([input_artifact]),
            custom_executor_spec=executor_spec.ExecutorContainerSpec(
                image='gcr://test', args=['{{input_dict["input"][0].uri}}']))

        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        launcher = kubernetes_component_launcher.KubernetesComponentLauncher.create(
            component=component,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection_config=connection_config,
            beam_pipeline_args=[],
            additional_pipeline_args={},
            component_config=component_config)

        return {'launcher': launcher, 'input_artifact': input_artifact}
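
A hedged sketch of a test built on this helper; the test name and assertions are assumptions only:

    def testCreateLauncherContext(self):
        context = self._create_launcher_context()
        # Sketch: the helper hands back a usable launcher plus the input
        # artifact it wired into the fake component.
        self.assertIsNotNone(context['launcher'])
        self.assertTrue(context['input_artifact'].uri)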
Example #28
    def testUnresolvedChannel(self):
        class _FakeComponentSpec(types.ComponentSpec):
            PARAMETERS = {}
            INPUTS = {
                'input': component_spec.ChannelParameter(type_name='Foo')
            }
            OUTPUTS = {}

        class _FakeExecutor(base_executor.BaseExecutor):
            CALLED = False

            def Do(self, input_dict: Dict[Text, List[types.Artifact]],
                   output_dict: Dict[Text, List[types.Artifact]],
                   exec_properties: Dict[Text, Any]) -> None:
                _FakeExecutor.CALLED = True

        class _FakeComponent(base_component.BaseComponent):
            SPEC_CLASS = _FakeComponentSpec
            EXECUTOR_SPEC = executor_spec.ExecutorClassSpec(_FakeExecutor)

            def __init__(self, spec: types.ComponentSpec):
                super(_FakeComponent, self).__init__(spec=spec)

        c = interactive_context.InteractiveContext()
        foo = types.Channel(type_name='Foo', artifacts=[types.Artifact('Foo')])
        component = _FakeComponent(_FakeComponentSpec(input=foo))
        with self.assertRaisesRegexp(ValueError, 'Unresolved input channel'):
            c.run(component)
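
For contrast, a hedged sketch of how an input channel normally becomes resolvable in an InteractiveContext: it is taken from the outputs of a component the context has already run. The producer component below is purely hypothetical:

        # Hypothetical producer; any component the context has already run.
        producer = _FakeProducerComponent()
        c.run(producer)
        # Assumption: its output channel is now resolved and can feed
        # _FakeComponent without raising.
        c.run(_FakeComponent(_FakeComponentSpec(input=producer.outputs.out)))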
Example #29
  def __init__(self,
               examples: channel.Channel,
               model: channel.Channel,
               blessing: Optional[channel.Channel] = None,
               name: Optional[Text] = None):
    """Construct a ModelValidator component.

    Args:
      examples: A Channel of 'ExamplesPath' type, usually produced by ExampleGen
        component.
      model: A Channel of 'ModelExportPath' type, usually produced by Trainer
        component.
      blessing: Optional output channel of 'ModelBlessingPath' for the
        blessing result.
      name: Optional unique name. Necessary if multiple ModelValidator
        components are declared in the same pipeline.
    """
    blessing = blessing or channel.Channel(
        type_name='ModelBlessingPath',
        artifacts=[types.Artifact('ModelBlessingPath')])
    name = name or ''
    spec = ModelValidatorSpec(
        examples=channel.as_channel(examples),
        model=channel.as_channel(model),
        component_unique_name=name,
        blessing=blessing)
    super(ModelValidator, self).__init__(spec=spec, name=name)
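
A hedged usage sketch for the constructor above; the upstream output attribute names are assumptions for this TFX vintage:

    # Sketch: wire the validator to upstream outputs when defining a pipeline.
    model_validator = ModelValidator(
        examples=example_gen.outputs.examples,
        model=trainer.outputs.output)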
Example #30
    def schedule(self) -> task_scheduler.TaskSchedulerResult:
        def _as_dict(proto_map) -> Dict[str, types.Property]:
            return {
                k: data_types_utils.get_value(v)
                for k, v in proto_map.items()
            }

        pipeline_node = self.task.get_pipeline_node()
        output_spec = pipeline_node.outputs.outputs[importer.IMPORT_RESULT_KEY]
        properties = _as_dict(output_spec.artifact_spec.additional_properties)
        custom_properties = _as_dict(
            output_spec.artifact_spec.additional_custom_properties)

        output_artifacts = importer.generate_output_dict(
            metadata_handler=self.mlmd_handle,
            uri=str(self.task.exec_properties[importer.SOURCE_URI_KEY]),
            properties=properties,
            custom_properties=custom_properties,
            reimport=bool(
                self.task.exec_properties[importer.REIMPORT_OPTION_KEY]),
            output_artifact_class=types.Artifact(
                output_spec.artifact_spec.type).type,
            mlmd_artifact_type=output_spec.artifact_spec.type)

        return task_scheduler.TaskSchedulerResult(
            status=status_lib.Status(code=status_lib.Code.OK),
            output=task_scheduler.ImporterNodeOutput(
                output_artifacts=output_artifacts))
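
A hedged sketch of how a caller might consume the result above; the scheduler variable and the output key are assumptions:

        # Sketch: schedule() reports OK and carries the imported artifacts.
        result = scheduler.schedule()
        assert result.status.code == status_lib.Code.OK
        imported = result.output.output_artifacts[importer.IMPORT_RESULT_KEY]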