Example #1
File: types_test.py Project: luvneries/tfx
 def test_get_from_single_list(self):
   """Test various retrieval utilities on a single list of TfxType."""
   single_list = [types.TfxType('MyTypeName', split='eval')]
   single_list[0].uri = '/tmp/evaluri'
   self.assertEqual(single_list[0], types.get_single_instance(single_list))
   self.assertEqual('/tmp/evaluri', types.get_single_uri(single_list))
   self.assertEqual(single_list[0],
                    types._get_split_instance(single_list, 'eval'))
   self.assertEqual('/tmp/evaluri', types.get_split_uri(single_list, 'eval'))
   with self.assertRaises(ValueError):
     types._get_split_instance(single_list, 'train')
   with self.assertRaises(ValueError):
     types.get_split_uri(single_list, 'train')
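Note: the assertions above pin down the helper contract: on a single-element list, get_single_instance and get_single_uri return the lone artifact and its uri, while the split-aware variants succeed only for a split that is actually present and raise ValueError otherwise. A minimal sketch consistent with these tests (not the actual TFX implementation) could be:

def get_split_uri(artifact_list, split):
  # Return the uri of the single artifact whose split matches; raise
  # ValueError when the split is absent or ambiguous, as the tests expect.
  matches = [a for a in artifact_list if a.split == split]
  if len(matches) != 1:
    raise ValueError('Expected exactly one artifact with split %r.' % split)
  return matches[0].uri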
Example #2
File: executor.py Project: marcromeyn/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Take BigQuery sql and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties.
        - query: BigQuery sql string.

    Returns:
      None

    Raises:
      RuntimeError: if query is missing in exec_properties.
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    training_tfrecord = types.get_split_uri(output_dict['examples'], 'train')
    eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval')

    if 'query' not in exec_properties:
      raise RuntimeError('Missing query.')
    query = exec_properties['query']

    tf.logging.info('Generating examples from BigQuery.')
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      converter = _BigQueryConverter(query)
      example_splits = (
          pipeline
          | 'QueryTable' >> self._big_query_ptransform(query)
          | 'ToSerializedTFExample' >> beam.Map(
              converter.row_to_serialized_example)
          | 'SplitData' >> beam.Partition(_partition_fn, 2))
      # TODO(jyzhao): make shuffle optional.
      # pylint: disable=expression-not-assigned
      (example_splits[0]
       | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle()
       | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
           os.path.join(training_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      (example_splits[1]
       | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle()
       | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
           os.path.join(eval_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      # pylint: enable=expression-not-assigned
    tf.logging.info('Examples generated.')
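Note: _partition_fn is referenced but not defined in this snippet. Assuming the usual hash-bucket scheme for a train/eval split (the 2:1 ratio below is illustrative, not taken from the source), one plausible implementation is:

import hashlib

def _partition_fn(serialized_example, num_partitions):
  # Use a stable digest rather than Python's built-in hash(), which is
  # salted per process and would misbehave on distributed Beam runners.
  bucket = int(hashlib.sha256(serialized_example).hexdigest(), 16) % 3
  return 0 if bucket < 2 else 1  # partitions: 0 = train, 1 = eval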
Example #3
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Take input csv data and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input-base: input dir that contains CSV data. CSV files must have a
          header line.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    training_tfrecord = types.get_split_uri(output_dict['examples'], 'train')
    eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval')

    input_base = types.get_single_instance(input_dict['input-base'])
    input_base_uri = input_base.uri

    tf.logging.info('Generating examples.')

    raw_data = io_utils.get_only_uri_in_dir(input_base_uri)
    tf.logging.info('No split {}.'.format(raw_data))

    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      example_splits = (
          pipeline
          # pylint: disable=no-value-for-parameter
          | 'CsvToSerializedExample' >> _CsvToSerializedExample(raw_data)
          | 'SplitData' >> beam.Partition(_partition_fn, 2))
      # TODO(jyzhao): make shuffle optional.
      # pylint: disable=expression-not-assigned
      (example_splits[0]
       | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle()
       | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
           os.path.join(training_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      (example_splits[1]
       | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle()
       | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
           os.path.join(eval_tfrecord, DEFAULT_FILE_NAME),
           file_name_suffix='.gz'))
      # pylint: enable=expression-not-assigned

    tf.logging.info('Examples generated.')
Example #4
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow SchemaGen executor entrypoint.

    This infers the schema using tensorflow_data_validation on the precomputed
    stats of the 'train' split.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'train'. Stats on other splits are ignored.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'SchemaPath' artifact of size one.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    # TODO(zhitaoli): Move constants between this file and component.py to a
    # constants.py.
    train_stats_uri = io_utils.get_only_uri_in_dir(
        types.get_split_uri(input_dict['stats'], 'train'))
    output_uri = os.path.join(
        types.get_single_uri(output_dict['output']), _DEFAULT_FILE_NAME)

    infer_feature_shape = False
    tf.logging.info('Inferring schema from statistics.')
    schema = tfdv.infer_schema(
        tfdv.load_statistics(train_stats_uri), infer_feature_shape)
    io_utils.write_pbtxt_file(output_uri, schema)
    tf.logging.info('Schema written to {}.'.format(output_uri))
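Note: io_utils.write_pbtxt_file is a TFX-internal helper; judging from its use here, it writes the proto in text format. A hypothetical stand-in with the same observable behavior:

import os
import tensorflow as tf
from google.protobuf import text_format

def write_pbtxt_file(file_name, proto):
  # Serialize the proto to text format and write it via tf.gfile so that
  # GCS and other TensorFlow-supported filesystems work like local disk.
  tf.gfile.MakeDirs(os.path.dirname(file_name))
  with tf.gfile.GFile(file_name, 'w') as f:
    f.write(text_format.MessageToString(proto))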
Example #5
    def Do(self, input_dict, output_dict, exec_properties):
        """Take input data source and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts. Depends on
        detailed example gen implementation.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties. Depends on detailed
        example gen implementation.
        - output: JSON string of example_gen_pb2.Output instance, providing
          output configuration.

    Returns:
      None

    Raises:
      RuntimeError: if output split config is not specified.
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Get output split information.
        output_config = example_gen_pb2.Output()
        json_format.Parse(exec_properties['output'], output_config)
        self._check_split_config(output_config.split_config)
        splits = output_config.split_config.splits
        # Calculate split buckets.
        buckets = []
        total_buckets = 0
        for split in splits:
            total_buckets += split.hash_buckets
            buckets.append(total_buckets)

        tf.logging.info('Generating examples.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            input_to_example = self.GetInputSourceToExamplePTransform()
            example_splits = (
                pipeline
                | 'InputSourceToExample' >> input_to_example(
                    input_dict, exec_properties)
                # Serialize deterministically; the partition function hashes these bytes.
                | 'SerializeDeterministically' >>
                beam.Map(lambda x: x.SerializeToString(deterministic=True))
                | 'SplitData' >> beam.Partition(_partition_fn, len(buckets),
                                                buckets))
            # TODO(jyzhao): make shuffle optional.
            # pylint: disable=expression-not-assigned
            for index, example_split in enumerate(example_splits):
                (example_split
                 | 'ShuffleSplit' + splits[index].name >>
                 beam.transforms.Reshuffle()
                 | 'OutputSplit' + splits[index].name >>
                 beam.io.WriteToTFRecord(os.path.join(
                     types.get_split_uri(output_dict['examples'],
                                         splits[index].name),
                     DEFAULT_FILE_NAME),
                                         file_name_suffix='.gz'))
            # pylint: enable=expression-not-assigned

        tf.logging.info('Examples generated.')
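Note: in this variant _partition_fn receives the cumulative buckets list as an extra argument (beam.Partition forwards it above). Assuming the same hash-bucket scheme as the two-way examples, a matching sketch:

import bisect
import hashlib

def _partition_fn(serialized_example, num_partitions, buckets):
  # `buckets` holds cumulative boundaries, e.g. [2, 3] for a 2:1 split
  # config; its last entry is the total number of hash buckets.
  bucket = int(hashlib.sha256(serialized_example).hexdigest(), 16) % buckets[-1]
  # bisect maps buckets 0..1 to split 0 and bucket 2 to split 1 in that example.
  return bisect.bisect(buckets, bucket)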
Example #6
    def Do(self, input_dict: Dict[Text, List[types.TfxType]],
           output_dict: Dict[Text, List[types.TfxType]],
           exec_properties: Dict[Text, Any]) -> None:
        """Take input data source and generates TF Example splits.

    Args:
      input_dict: Input dict from input key to a list of Artifacts. Depends on
        detailed example gen implementation.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: splits of tf examples.
      exec_properties: A dict of execution properties. Depends on detailed
        example gen implementation.
        - input: JSON string of example_gen_pb2.Input instance, providing input
          configuration.
        - output: JSON string of example_gen_pb2.Output instance, providing
          output configuration.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        tf.logging.info('Generating examples.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            example_splits = self.GenerateExamplesByBeam(
                pipeline, input_dict, exec_properties)

            # pylint: disable=expression-not-assigned, no-value-for-parameter
            for split_name, example_split in example_splits.items():
                (example_split
                 | 'WriteSplit' + split_name >> _WriteSplit(
                     types.get_split_uri(output_dict['examples'], split_name)))
            # pylint: enable=expression-not-assigned, no-value-for-parameter

        tf.logging.info('Examples generated.')
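Note: _WriteSplit is not shown here, but Examples #2 and #3 expand the same step inline, so a ptransform sketch consistent with them would be (DEFAULT_FILE_NAME is assumed to come from the same module):

import os
import apache_beam as beam

@beam.ptransform_fn
def _WriteSplit(example_split, output_split_uri):
  # Shuffle for shard uniformity, then write gzipped TFRecords under the
  # split uri, mirroring the inline Reshuffle + WriteToTFRecord pattern.
  return (example_split
          | 'Shuffle' >> beam.transforms.Reshuffle()
          | 'Write' >> beam.io.WriteToTFRecord(
              os.path.join(output_split_uri, DEFAULT_FILE_NAME),
              file_name_suffix='.gz'))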
Example #7
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        tf.logging.info('Validating schema against the computed statistics.')
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                types.get_single_uri(input_dict['schema'])))
        stats = tfdv.load_statistics(
            io_utils.get_only_uri_in_dir(
                types.get_split_uri(input_dict['stats'], 'eval')))
        output_uri = types.get_single_uri(output_dict['output'])
        anomalies = tfdv.validate_statistics(stats, schema)
        io_utils.write_pbtxt_file(os.path.join(output_uri, DEFAULT_FILE_NAME),
                                  anomalies)
        tf.logging.info(
            'Validation complete. Anomalies written to {}.'.format(output_uri))
Example #8
    def Do(self, input_dict, output_dict, exec_properties):
        """Take input data source and generates train and eval tf examples.

    Args:
      input_dict: Input dict from input key to a list of Artifacts. Depends on
        detailed example gen implementation.
      output_dict: Output dict from output key to a list of Artifacts.
        - examples: train and eval split of tf examples.
      exec_properties: A dict of execution properties. Depends on detailed
        example gen implementation.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        training_tfrecord = types.get_split_uri(output_dict['examples'],
                                                'train')
        eval_tfrecord = types.get_split_uri(output_dict['examples'], 'eval')

        tf.logging.info('Generating examples.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            input_to_example = self.GetInputSourceToExamplePTransform()
            example_splits = (
                pipeline
                | 'InputSourceToExample' >> input_to_example(
                    input_dict, exec_properties)
                # Serialize deterministically; the partition function hashes these bytes.
                | 'SerializeDeterministically' >>
                beam.Map(lambda x: x.SerializeToString(deterministic=True))
                | 'SplitData' >> beam.Partition(_partition_fn, 2))
            # TODO(jyzhao): make shuffle optional.
            # pylint: disable=expression-not-assigned
            (example_splits[0]
             | 'ShuffleTrainSplit' >> beam.transforms.Reshuffle()
             | 'OutputTrainSplit' >> beam.io.WriteToTFRecord(
                 os.path.join(training_tfrecord, DEFAULT_FILE_NAME),
                 file_name_suffix='.gz'))
            (example_splits[1]
             | 'ShuffleEvalSplit' >> beam.transforms.Reshuffle()
             | 'OutputEvalSplit' >> beam.io.WriteToTFRecord(
                 os.path.join(eval_tfrecord, DEFAULT_FILE_NAME),
                 file_name_suffix='.gz'))
            # pylint: enable=expression-not-assigned

        tf.logging.info('Examples generated.')
Example #9
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples for evaluating the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
    if 'model_exports' not in input_dict:
      raise ValueError('\'model_exports\' is missing in input dict.')
    if 'examples' not in input_dict:
      raise ValueError('\'examples\' is missing in input dict.')
    if 'output' not in output_dict:
      raise ValueError('\'output\' is missing in output dict.')

    self._log_startup(input_dict, output_dict, exec_properties)

    # Extract input artifacts
    model_exports_uri = types.get_single_uri(input_dict['model_exports'])

    feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
    json_format.Parse(exec_properties['feature_slicing_spec'],
                      feature_slicing_spec)
    slice_spec = self._get_slice_spec_from_feature_slicing_spec(
        feature_slicing_spec)

    output_uri = types.get_single_uri(output_dict['output'])

    eval_model_path = path_utils.eval_model_path(model_exports_uri)

    tf.logging.info('Using {} for model eval.'.format(eval_model_path))
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=eval_model_path)

    tf.logging.info('Evaluating model.')
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
      # pylint: disable=expression-not-assigned
      (pipeline
       | 'ReadData' >> beam.io.ReadFromTFRecord(
           file_pattern=io_utils.all_files_pattern(
               types.get_split_uri(input_dict['examples'], 'eval')))
       |
       'ExtractEvaluateAndWriteResults' >> tfma.ExtractEvaluateAndWriteResults(
           eval_shared_model=eval_shared_model,
           slice_spec=slice_spec,
           output_path=output_uri))
    tf.logging.info(
        'Evaluation complete. Results written to {}.'.format(output_uri))
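Note: io_utils.all_files_pattern turns a split directory uri into a file glob for ReadFromTFRecord; given how it is used throughout these examples, an equivalent helper is presumably just:

import os

def all_files_pattern(file_dir):
  # Glob matching every file directly under `file_dir`.
  return os.path.join(file_dir, '*')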
Example #10
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples for evaluating the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data.

    Returns:
      None
    """
        if 'model_exports' not in input_dict:
            raise ValueError('\'model_exports\' is missing in input dict.')
        if 'examples' not in input_dict:
            raise ValueError('\'examples\' is missing in input dict.')
        if 'output' not in output_dict:
            raise ValueError('\'output\' is missing in output dict.')

        self._log_startup(input_dict, output_dict, exec_properties)

        # Extract input artifacts
        model_exports_uri = types.get_single_uri(input_dict['model_exports'])

        feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
        json_format.Parse(exec_properties['feature_slicing_spec'],
                          feature_slicing_spec)
        slice_spec = self._get_slice_spec_from_feature_slicing_spec(
            feature_slicing_spec)

        output_uri = types.get_single_uri(output_dict['output'])

        eval_model_path = path_utils.eval_model_path(model_exports_uri)

        tf.logging.info('Using {} for model eval.'.format(eval_model_path))
        eval_shared_model = tfma.default_eval_shared_model(
            eval_saved_model_path=eval_model_path)

        tf.logging.info('Evaluating model.')
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as pipeline:
            # pylint: disable=expression-not-assigned
            (pipeline
             | 'ReadData' >>
             beam.io.ReadFromTFRecord(file_pattern=io_utils.all_files_pattern(
                 types.get_split_uri(input_dict['examples'], 'eval')))
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_shared_model=eval_shared_model,
                 slice_spec=slice_spec,
                 output_path=output_uri))
        tf.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))
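Note: the feature_slicing_spec exec property is a JSON-serialized evaluator_pb2.FeatureSlicingSpec. Assuming the standard proto JSON mapping, an upstream pipeline could construct it like this (the slicing column is illustrative):

from google.protobuf import json_format
from tfx.proto import evaluator_pb2

spec = evaluator_pb2.FeatureSlicingSpec(specs=[
    evaluator_pb2.SingleSlicingSpec(column_for_slicing=['trip_start_hour'])
])
exec_properties = {'feature_slicing_spec': json_format.MessageToJson(spec)}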
Example #11
File: types_test.py Project: luvneries/tfx
  def test_get_from_split_list(self):
    """Test various retrieval utilities on a list of split TfxTypes."""
    split_list = []
    for split in ['train', 'eval']:
      instance = types.TfxType('MyTypeName', split=split)
      instance.uri = '/tmp/' + split
      split_list.append(instance)

    with self.assertRaises(ValueError):
      types.get_single_instance(split_list)

    with self.assertRaises(ValueError):
      types.get_single_uri(split_list)

    self.assertEqual(split_list[0],
                     types._get_split_instance(split_list, 'train'))
    self.assertEqual('/tmp/train', types.get_split_uri(split_list, 'train'))
    self.assertEqual(split_list[1], types._get_split_instance(
        split_list, 'eval'))
    self.assertEqual('/tmp/eval', types.get_split_uri(split_list, 'eval'))
Example #12
 def testGetSplitUriDeprecated(self):
     with mock.patch.object(tf_logging, 'warning'):
         warn_mock = mock.MagicMock()
         tf_logging.warning = warn_mock
         my_artifact = artifact.Artifact('TestType')
         my_artifact.uri = '123'
         my_artifact.split = 'train'
         self.assertEqual('123', types.get_split_uri([my_artifact],
                                                     'train'))
         warn_mock.assert_called_once()
         self.assertIn('tfx.utils.types.get_split_uri has been renamed to',
                       warn_mock.call_args[0][5])
Example #13
    def Do(self, input_dict: Dict[Text, List[types.TfxType]],
           output_dict: Dict[Text, List[types.TfxType]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of 'ExamplesPath' type. This should contain both
          'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of 'ExampleStatisticsPath' type. This should contain
          both 'train' and 'eval' split.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        split_to_instance = {x.split: x for x in input_dict['input_data']}
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, instance in split_to_instance.items():
                tf.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(instance.uri)
                output_uri = types.get_split_uri(output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
            tf.logging.info('Statistics written to {}.'.format(output_uri))
Example #14
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of 'ExamplesPath' type. This should contain both
          'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of 'ExampleStatisticsPath' type. This should contain
          both 'train' and 'eval' split.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    split_to_instance = {x.split: x for x in input_dict['input_data']}
    with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
      # TODO(b/126263006): Support more stats_options through config.
      stats_options = options.StatsOptions()
      for split, instance in split_to_instance.items():
        tf.logging.info('Generating statistics for split {}'.format(split))
        input_uri = io_utils.all_files_pattern(instance.uri)
        output_uri = types.get_split_uri(output_dict['output'], split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        _ = (
            p
            | 'ReadData.' + split >>
            beam.io.ReadFromTFRecord(file_pattern=input_uri)
            | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
            | 'GenerateStatistics.' + split >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
      tf.logging.info('Statistics written to {}.'.format(output_uri))
Example #15
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow ExampleValidator executor entrypoint.

    This validates the statistics on the 'eval' split against the schema.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - stats: A list of 'ExampleStatisticsPath' type which should contain
          split 'eval'. Stats on other splits are ignored.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - output: A list of 'ExampleValidationPath' artifact of size one. It
          will include a single pbtxt file which contains all anomalies found.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    tf.logging.info('Validating schema against the computed statistics.')
    schema = io_utils.SchemaReader().read(
        io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema'])))
    stats = tfdv.load_statistics(
        io_utils.get_only_uri_in_dir(
            types.get_split_uri(input_dict['stats'], 'eval')))
    output_uri = types.get_single_uri(output_dict['output'])
    anomalies = tfdv.validate_statistics(stats, schema)
    io_utils.write_pbtxt_file(
        os.path.join(output_uri, DEFAULT_FILE_NAME), anomalies)
    tf.logging.info(
        'Validation complete. Anomalies written to {}.'.format(output_uri))
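Note: the anomalies file written above is a text-format proto; a hypothetical downstream check could parse it back and inspect the per-feature anomaly map (output_uri and DEFAULT_FILE_NAME as in the snippet):

import os
import tensorflow as tf
from google.protobuf import text_format
from tensorflow_metadata.proto.v0 import anomalies_pb2

anomalies = anomalies_pb2.Anomalies()
with tf.gfile.GFile(os.path.join(output_uri, DEFAULT_FILE_NAME)) as f:
  text_format.Parse(f.read(), anomalies)
if anomalies.anomaly_info:
  tf.logging.warn('Anomalous features: %s', list(anomalies.anomaly_info.keys()))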
Example #16
File: executor.py Project: socar-kyle/tfx
    def Do(self, input_dict: Dict[Text, List[types.TfxArtifact]],
           output_dict: Dict[Text, List[types.TfxArtifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Uses a user-supplied tf.estimator to train a TensorFlow model locally.

    The Trainer Executor invokes a training_fn callback function provided by
    the user via the module_file parameter.  The Trainer Executor then builds
    and trains a TensorFlow model using the tf.estimator returned by that
    callback.

    Args:
      input_dict: Input dict from input key to a list of ML-Metadata Artifacts.
        - transformed_examples: Transformed example.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None

    Raises:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(zhitaoli): Deprecate this in a future version.
        if exec_properties.get('custom_config', None):
            cmle_args = exec_properties.get('custom_config',
                                            {}).get('cmle_training_args')
            if cmle_args:
                executor_class_path = '.'.join(
                    [Executor.__module__, Executor.__name__])
                tf.logging.warn(
                    'Passing \'cmle_training_args\' to trainer directly is deprecated, '
                    'please use extension executor at '
                    'tfx.extensions.google_cloud_ai_platform.trainer.executor instead'
                )

                return cmle_runner.start_cmle_training(input_dict, output_dict,
                                                       exec_properties,
                                                       executor_class_path,
                                                       cmle_args)

        trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                          'trainer_fn')

        # Set up training parameters
        train_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'train'))
        ]
        transform_output = types.get_single_uri(input_dict['transform_output'])
        eval_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'eval'))
        ]
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        train_args = trainer_pb2.TrainArgs()
        eval_args = trainer_pb2.EvalArgs()
        json_format.Parse(exec_properties['train_args'], train_args)
        json_format.Parse(exec_properties['eval_args'], eval_args)

        # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
        # num_steps=None.  Conversion of the proto to python will set the default
        # value of an int as 0 so modify the value here.  Tensorflow will raise an
        # error if num_steps <= 0.
        train_steps = train_args.num_steps or None
        eval_steps = eval_args.num_steps or None

        output_path = types.get_single_uri(output_dict['output'])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        # Assemble warm start path if needed.
        warm_start_from = None
        if exec_properties.get('warm_starting') and exec_properties.get(
                'warm_start_from'):
            previous_model_dir = os.path.join(
                exec_properties['warm_start_from'],
                path_utils.SERVING_MODEL_DIR)
            if previous_model_dir and tf.gfile.Exists(
                    os.path.join(previous_model_dir,
                                 self._CHECKPOINT_FILE_NAME)):
                warm_start_from = previous_model_dir

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        hparams = tf.contrib.training.HParams(
            # A list of uris for train files.
            train_files=train_files,
            # A single uri for transform graph produced by TFT.
            transform_output=transform_output,
            # A single uri for the output directory of the serving model.
            serving_model_dir=serving_model_dir,
            # A list of uris for eval files.
            eval_files=eval_files,
            # A single uri for schema file.
            schema_file=schema_file,
            # Number of train steps.
            train_steps=train_steps,
            # Number of eval steps.
            eval_steps=eval_steps,
            # A single uri for the model directory to warm start from.
            warm_start_from=warm_start_from)

        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

        training_spec = trainer_fn(hparams, schema)

        # Train the model
        tf.logging.info('Training model.')
        tf.estimator.train_and_evaluate(training_spec['estimator'],
                                        training_spec['train_spec'],
                                        training_spec['eval_spec'])
        tf.logging.info('Training complete.  Model written to %s',
                        serving_model_dir)

        # Export an eval savedmodel for TFMA
        tf.logging.info('Exporting eval_savedmodel for TFMA.')
        tfma.export.export_eval_savedmodel(
            estimator=training_spec['estimator'],
            export_dir_base=eval_model_dir,
            eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

        tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
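Note: the trainer_fn loaded from module_file must return a dict with exactly the four keys consumed above. A skeletal user module satisfying that contract is sketched below; the toy feature, estimator, and input_fn are placeholders, not a real model definition:

import tensorflow as tf
import tensorflow_model_analysis as tfma

def trainer_fn(hparams, schema):
  feature_columns = [tf.feature_column.numeric_column('x')]
  estimator = tf.estimator.LinearRegressor(feature_columns=feature_columns)

  def input_fn():
    # Constant toy data; a real module would read hparams.train_files etc.
    return tf.data.Dataset.from_tensors(({'x': [[1.0]]}, [[1.0]])).repeat()

  def eval_input_receiver_fn():
    # Minimal receiver for tfma.export.export_eval_savedmodel.
    serialized = tf.placeholder(dtype=tf.string, shape=[None])
    features = {'x': tf.constant([[1.0]])}
    return tfma.export.EvalInputReceiver(
        features=features,
        labels=tf.constant([[1.0]]),
        receiver_tensors={'examples': serialized})

  return {
      'estimator': estimator,
      'train_spec': tf.estimator.TrainSpec(
          input_fn=input_fn, max_steps=hparams.train_steps),
      'eval_spec': tf.estimator.EvalSpec(
          input_fn=input_fn, steps=hparams.eval_steps),
      'eval_input_receiver_fn': eval_input_receiver_fn,
  }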
Example #17
File: executor.py Project: luvneries/tfx
  def Do(self, input_dict,
         output_dict,
         exec_properties):
    """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)
    train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
    eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
    schema_file = io_utils.get_only_uri_in_dir(
        types.get_single_uri(input_dict['schema']))

    transform_output = types.get_single_uri(output_dict['transform_output'])
    if tf.gfile.Exists(transform_output):
      io_utils.delete_dir(transform_output)

    transformed_train_output = types.get_split_uri(
        output_dict['transformed_examples'], 'train')
    if tf.gfile.Exists(transformed_train_output):
      io_utils.delete_dir(transformed_train_output)

    transformed_eval_output = types.get_split_uri(
        output_dict['transformed_examples'], 'eval')
    if tf.gfile.Exists(transformed_eval_output):
      io_utils.delete_dir(transformed_eval_output)

    temp_path = os.path.join(transform_output, _TEMP_DIR_IN_TRANSFORM_OUTPUT)
    tf.logging.debug('Using temp path %s for tft.beam', temp_path)

    label_inputs = {
        labels.COMPUTE_STATISTICS_LABEL:
            False,
        labels.SCHEMA_PATH_LABEL:
            schema_file,
        labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
        labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
        labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
        labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
        labels.PREPROCESSING_FN:
            exec_properties['module_file'],
    }

    label_outputs = {
        labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL: transform_output,
        labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
            os.path.join(transformed_train_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            os.path.join(transformed_eval_output,
                         _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
        ],
        labels.TEMP_OUTPUT_LABEL: str(temp_path),
    }
    status_file = 'status_file'  # Unused
    self.Transform(label_inputs, label_outputs, status_file)
    tf.logging.info('Cleaning up temp path %s on executor success', temp_path)
    io_utils.delete_dir(temp_path)
Example #18
    def Do(self, input_dict, output_dict, exec_properties):
        """TensorFlow Transform executor entrypoint.

    This implements BaseExecutor.Do() and is invoked by orchestration systems.
    This is not intended for manual usage or further customization. Please use
    the Transform() function which takes an input format with no artifact
    dependency.

    Args:
      input_dict: Input dict from input key to a list of artifacts, including:
        - input_data: A list of 'ExamplesPath' type which should contain two
          splits 'train' and 'eval'.
        - schema: A list of 'SchemaPath' type which should contain a single
          schema artifact.
      output_dict: Output dict from key to a list of artifacts, including:
        - transform_output: Output of 'tf.Transform', which includes an exported
          Tensorflow graph suitable for both training and serving;
        - transformed_examples: Materialized transformed examples, which
          includes both 'train' and 'eval' splits.
      exec_properties: A dict of execution properties, including:
        - module_file: The file path to a python module file, from which the
          'preprocessing_fn' function will be loaded.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)
        train_data_uri = types.get_split_uri(input_dict['input_data'], 'train')
        eval_data_uri = types.get_split_uri(input_dict['input_data'], 'eval')
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        transform_output = types.get_single_uri(
            output_dict['transform_output'])
        if tf.gfile.Exists(transform_output):
            io_utils.delete_dir(transform_output)

        transformed_train_output = types.get_split_uri(
            output_dict['transformed_examples'], 'train')
        if tf.gfile.Exists(transformed_train_output):
            io_utils.delete_dir(transformed_train_output)

        transformed_eval_output = types.get_split_uri(
            output_dict['transformed_examples'], 'eval')
        if tf.gfile.Exists(transformed_eval_output):
            io_utils.delete_dir(transformed_eval_output)

        temp_path = os.path.join(transform_output,
                                 _TEMP_DIR_IN_TRANSFORM_OUTPUT)
        tf.logging.debug('Using temp path %s for tft.beam', temp_path)

        label_inputs = {
            labels.COMPUTE_STATISTICS_LABEL:
            False,
            labels.SCHEMA_PATH_LABEL:
            schema_file,
            labels.EXAMPLES_DATA_FORMAT_LABEL:
            labels.FORMAT_TF_EXAMPLE,
            labels.ANALYZE_AND_TRANSFORM_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(train_data_uri),
            labels.TRANSFORM_ONLY_DATA_PATHS_LABEL:
            io_utils.all_files_pattern(eval_data_uri),
            labels.TFT_STATISTICS_USE_TFDV_LABEL:
            True,
            labels.PREPROCESSING_FN:
            exec_properties['module_file'],
        }

        label_outputs = {
            labels.TRANSFORM_METADATA_OUTPUT_PATH_LABEL:
            transform_output,
            labels.TRANSFORM_MATERIALIZE_OUTPUT_PATHS_LABEL: [
                os.path.join(transformed_train_output,
                             _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
                os.path.join(transformed_eval_output,
                             _DEFAULT_TRANSFORMED_EXAMPLES_PREFIX),
            ],
            labels.TEMP_OUTPUT_LABEL:
            str(temp_path),
        }
        status_file = 'status_file'  # Unused
        self.Transform(label_inputs, label_outputs, status_file)
        tf.logging.info('Cleaning up temp path %s on executor success',
                        temp_path)
        io_utils.delete_dir(temp_path)
Example #19
File: executor.py Project: rohithreddy/tfx
    def Do(self, input_dict, output_dict, exec_properties):
        """Runs trainer job the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - transformed_examples: Transformed example.
        - transform_output: Input transform graph.
        - schema: Schema of the data.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: Exported model.
      exec_properties: A dict of execution properties.
        - train_args: JSON string of trainer_pb2.TrainArgs instance, providing
          args for training.
        - eval_args: JSON string of trainer_pb2.EvalArgs instance, providing
          args for eval.
        - module_file: Python module file containing UDF model definition.
        - warm_starting: Whether or not we need to do warm starting.
        - warm_start_from: Optional. If warm_starting is True, this is the
          directory to find previous model to warm start on.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(khaas): Move this to tfx/extensions.
        if exec_properties.get('custom_config', None):
            cmle_args = exec_properties.get('custom_config',
                                            {}).get('cmle_training_args')
            if cmle_args:
                return cmle_runner.start_cmle_training(input_dict, output_dict,
                                                       exec_properties,
                                                       cmle_args)

        trainer_fn = io_utils.import_func(exec_properties['module_file'],
                                          'trainer_fn')

        # Set up training parameters
        train_files = [
            _all_files_pattern(
                types.get_split_uri(input_dict['transformed_examples'],
                                    'train'))
        ]
        transform_output = types.get_single_uri(input_dict['transform_output'])
        eval_files = _all_files_pattern(
            types.get_split_uri(input_dict['transformed_examples'], 'eval'))
        schema_file = io_utils.get_only_uri_in_dir(
            types.get_single_uri(input_dict['schema']))

        train_args = trainer_pb2.TrainArgs()
        eval_args = trainer_pb2.EvalArgs()
        json_format.Parse(exec_properties['train_args'], train_args)
        json_format.Parse(exec_properties['eval_args'], eval_args)

        # https://github.com/tensorflow/tfx/issues/45: Replace num_steps=0 with
        # num_steps=None.  Conversion of the proto to python will set the default
        # value of an int as 0 so modify the value here.  Tensorflow will raise an
        # error if num_steps <= 0.
        train_steps = train_args.num_steps or None
        eval_steps = eval_args.num_steps or None

        output_path = types.get_single_uri(output_dict['output'])
        serving_model_dir = path_utils.serving_model_dir(output_path)
        eval_model_dir = path_utils.eval_model_dir(output_path)

        # Assemble warm start path if needed.
        warm_start_from = None
        if exec_properties.get('warm_starting') and exec_properties.get(
                'warm_start_from'):
            previous_model_dir = os.path.join(
                exec_properties['warm_start_from'],
                path_utils.SERVING_MODEL_DIR)
            if previous_model_dir and tf.gfile.Exists(
                    os.path.join(previous_model_dir,
                                 self._CHECKPOINT_FILE_NAME)):
                warm_start_from = previous_model_dir

        # TODO(b/126242806) Use PipelineInputs when it is available in third_party.
        hparams = tf.contrib.training.HParams(
            train_files=train_files,
            transform_output=transform_output,
            output_dir=output_path,
            serving_model_dir=serving_model_dir,
            eval_files=eval_files,
            schema_file=schema_file,
            train_steps=train_steps,
            eval_steps=eval_steps,
            warm_start_from=warm_start_from)

        schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())

        training_spec = trainer_fn(hparams, schema)

        # Train the model
        tf.logging.info('Training model.')
        tf.estimator.train_and_evaluate(training_spec['estimator'],
                                        training_spec['train_spec'],
                                        training_spec['eval_spec'])
        tf.logging.info('Training complete.  Model written to %s',
                        serving_model_dir)

        # Export an eval savedmodel for TFMA
        tf.logging.info('Exporting eval_savedmodel for TFMA.')
        tfma.export.export_eval_savedmodel(
            estimator=training_spec['estimator'],
            export_dir_base=eval_model_dir,
            eval_input_receiver_fn=training_spec['eval_input_receiver_fn'])

        tf.logging.info('Exported eval_savedmodel to %s.', eval_model_dir)
Example #20
    def Do(self, input_dict, output_dict, exec_properties):
        """Validate current model against last blessed model.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: examples for evaluating the model.
        - model: current model for validation.
      output_dict: Output dict from output key to a list of Artifacts.
        - blessing: model blessing result.
        - results: model validation results.
      exec_properties: A dict of execution properties.
        - blessed_model: last blessed model for validation.
        - blessed_model_id: last blessed model id.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # TODO(b/125451545): Provide a safe temp path from base executor instead.
        self._temp_path = os.path.join(
            types.get_single_uri(output_dict['results']), '.temp')
        tf.logging.info('Using temp path {} for tft.beam'.format(
            self._temp_path))

        eval_examples_uri = types.get_split_uri(input_dict['examples'], 'eval')
        blessing = types.get_single_instance(output_dict['blessing'])

        # Current model.
        current_model = types.get_single_instance(input_dict['model'])
        tf.logging.info('Using {} as current model.'.format(current_model.uri))
        blessing.set_string_custom_property('current_model', current_model.uri)
        blessing.set_int_custom_property('current_model_id', current_model.id)

        # Blessed model.
        blessed_model_dir = exec_properties['blessed_model']
        blessed_model_id = exec_properties['blessed_model_id']
        tf.logging.info('Using {} as blessed model.'.format(blessed_model_dir))
        if blessed_model_dir:
            blessing.set_string_custom_property('blessed_model',
                                                blessed_model_dir)
            blessing.set_int_custom_property('blessed_model_id',
                                             blessed_model_id)

        tf.logging.info('Validating model.')
        # TODO(b/125853306): support customized slice spec.
        blessed = self._generate_blessing_result(
            eval_examples_uri=eval_examples_uri,
            slice_spec=[tfma.slicer.slicer.SingleSliceSpec()],
            current_model_dir=current_model.uri,
            blessed_model_dir=blessed_model_dir)

        if blessed:
            io_utils.write_string_file(os.path.join(blessing.uri, 'BLESSED'),
                                       '')
            blessing.set_int_custom_property('blessed', 1)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, 'NOT_BLESSED'), '')
            blessing.set_int_custom_property('blessed', 0)
        tf.logging.info('Blessing result {} written to {}.'.format(
            blessed, blessing.uri))

        io_utils.delete_dir(self._temp_path)
        tf.logging.info('Cleaned up temp path {} on executor success.'.format(
            self._temp_path))
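Note: downstream components can gate on the marker convention established here; a hypothetical check:

import os
import tensorflow as tf

def is_model_blessed(blessing_uri):
  # The validator writes an empty 'BLESSED' file on success (see above).
  return tf.gfile.Exists(os.path.join(blessing_uri, 'BLESSED'))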