Example #1
  def testPreprocessingFn(self):
    schema_file = os.path.join(self._testdata_path, 'schema_gen/schema.pbtxt')
    schema = io_utils.parse_pbtxt_file(schema_file, schema_pb2.Schema())
    feature_spec = taxi_utils._get_raw_feature_spec(schema)
    working_dir = self.get_temp_dir()
    transform_graph_path = os.path.join(working_dir, 'transform_graph')
    transformed_examples_path = os.path.join(
        working_dir, 'transformed_examples')

    # Run a very simplified version of the executor logic.
    # TODO(kestert): Replace with tft_unit.assertAnalyzeAndTransformResults.
    # Generate a legacy `DatasetMetadata` object. Future versions of Transform
    # will accept the `Schema` proto directly.
    legacy_metadata = dataset_metadata.DatasetMetadata(
        schema_utils.schema_from_feature_spec(feature_spec))
    tfxio = tf_example_record.TFExampleRecord(
        file_pattern=os.path.join(self._testdata_path,
                                  'csv_example_gen/Split-train/*'),
        telemetry_descriptors=['Tests'],
        schema=legacy_metadata.schema)
    with beam.Pipeline() as p:
      with tft_beam.Context(temp_dir=os.path.join(working_dir, 'tmp')):
        examples = p | 'ReadTrainData' >> tfxio.BeamSource()
        (transformed_examples, transformed_metadata), transform_fn = (
            (examples, tfxio.TensorAdapterConfig())
            | 'AnalyzeAndTransform' >> tft_beam.AnalyzeAndTransformDataset(
                taxi_utils.preprocessing_fn))

        # WriteTransformFn writes transform_fn and metadata to subdirectories
        # tensorflow_transform.SAVED_MODEL_DIR and
        # tensorflow_transform.TRANSFORMED_METADATA_DIR respectively.
        # pylint: disable=expression-not-assigned
        (transform_fn
         | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
             transform_graph_path))

        encoder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
        (transformed_examples
         | 'EncodeTrainData' >> beam.Map(encoder.encode)
         | 'WriteTrainData' >> beam.io.WriteToTFRecord(
             os.path.join(transformed_examples_path,
                          'Split-train/transformed_examples.gz'),
             coder=beam.coders.BytesCoder()))
        # pylint: enable=expression-not-assigned

    # Verify the output matches golden output.
    # NOTE: we don't verify that transformed examples match golden output.
    expected_transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(
            self._testdata_path,
            'transform/transform_graph/transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    transformed_schema = io_utils.parse_pbtxt_file(
        os.path.join(transform_graph_path, 'transformed_metadata/schema.pbtxt'),
        schema_pb2.Schema())
    # Clear annotations so we only have to test main schema.
    transformed_schema.ClearField('annotation')
    for feature in transformed_schema.feature:
      feature.ClearField('annotation')
    self.assertEqual(transformed_schema, expected_transformed_schema)
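
A short follow-up sketch (not part of the original test): the artifacts written by 'WriteTransformFn' above can be loaded back with tft.TFTransformOutput.

# Assumes tensorflow_transform is installed and `transform_graph_path`
# points at the directory written by WriteTransformFn above.
import tensorflow_transform as tft

tft_output = tft.TFTransformOutput(transform_graph_path)
# Feature spec of the transformed data, reconstructed from the written
# transformed_metadata.
transformed_feature_spec = tft_output.transformed_feature_spec()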
Example #2
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[Text] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to which to write the data statistics result.
      If None, a temporary directory is used. The output will be a TFRecord
      file containing a single data statistics proto, and can be read with the
      'load_statistics' API. If you run this function on Google Cloud, you
      must specify an output_path; passing None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = stats_options.desired_batch_size
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto-detect the TFRecord compression format based on the input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> (tf_example_record.TFExampleRecord(
            file_pattern=data_location,
            schema=None,
            telemetry_descriptors=['tfdv', 'generate_statistics_from_tfrecord'])
                         .BeamSource(batch_size))
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >> stats_api.WriteStatisticsToTFRecord(
            output_path))
  return stats_util.load_statistics(output_path)
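
A minimal usage sketch for the convenience function above (paths are hypothetical; assumes tensorflow_data_validation is installed):

import tensorflow_data_validation as tfdv

stats = tfdv.generate_statistics_from_tfrecord(
    data_location='/path/to/examples*.tfrecord',  # hypothetical input glob
    output_path='/tmp/stats/data_stats.tfrecord')  # hypothetical output path
# The returned DatasetFeatureStatisticsList proto can be inspected directly.
print(stats.datasets[0].num_examples)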
Example #3
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both the 'train' and 'eval' splits.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.

    Raises:
      ValueError: If a schema is provided both as an input and as part of the
        StatsOptions exec_property.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                tfxio_kwargs = {'file_pattern': input_uri}
                # TODO(b/151624179): clean this up after tfx_bsl is released with the
                # below flag.
                if getattr(tfxio, 'TFXIO_HAS_TELEMETRY', False):
                    tfxio_kwargs[
                        'telemetry_descriptors'] = _TELEMETRY_DESCRIPTORS
                input_tfxio = tf_example_record.TFExampleRecord(**tfxio_kwargs)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                data = (p | 'TFXIORead[{}]'.format(split) >>
                        input_tfxio.BeamSource())
                # TODO(b/153368237): Clean this up after a release post tfx 0.21.
                if not getattr(tfdv, 'TFDV_ACCEPT_RECORD_BATCH', False):
                    data |= 'RecordBatchToTable[{}]'.format(split) >> beam.Map(
                        lambda rb: pa.Table.from_batches([rb]))
                _ = (data
                     | 'GenerateStatistics[{}]'.format(split) >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput[{}]'.format(split) >>
                     beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
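
For context on the compatibility shim above: before TFDV accepted Arrow RecordBatches directly, GenerateStatistics expected Arrow Tables, so each RecordBatch is wrapped into a single-batch Table. A standalone sketch of the same conversion (assumes pyarrow is installed):

import pyarrow as pa

# Build a one-column RecordBatch and wrap it the same way the beam.Map
# above does.
batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=['f'])
table = pa.Table.from_batches([batch])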
Example #4
def make_tfxio(
        file_pattern: OneOrMorePatterns,
        telemetry_descriptors: List[str],
        payload_format: Optional[Union[str, int]],
        data_view_uri: Optional[str] = None,
        schema: Optional[schema_pb2.Schema] = None,
        read_as_raw_records: bool = False,
        raw_record_column_name: Optional[str] = None,
        file_format: Optional[Union[str, List[str]]] = None) -> tfxio.TFXIO:
    """Creates a TFXIO instance that reads `file_pattern`.

  Args:
    file_pattern: the file pattern for the TFXIO to access.
    telemetry_descriptors: A set of descriptors that identify the component that
      is instantiating the TFXIO. These will be used to construct the namespace
      to contain metrics for profiling and are therefore expected to be
      identifiers of the component itself and not individual instances of source
      use.
    payload_format: one of the enums from example_gen_pb2.PayloadFormat (may be
      in string or int form). If None, defaults to FORMAT_TF_EXAMPLE.
    data_view_uri: uri to a DataView artifact. A DataView is needed in order to
      create a TFXIO for certain payload formats.
    schema: TFMD schema. Note: although optional, some payload formats need a
      schema in order for all TFXIO interfaces (e.g. TensorAdapter()) to work.
      Unless you know what you are doing, always supply a schema.
    read_as_raw_records: If True, ignore the payload type of `examples`. Always
      use RawTfRecord TFXIO.
    raw_record_column_name: If provided, the arrow RecordBatch produced by the
      TFXIO will contain a string column of the given name, and the contents of
      that column will be the raw records. Note that not all TFXIO
      implementations support this option; an error will be raised in that
      case. Required if read_as_raw_records == True.
    file_format: file format string for each file_pattern. Only 'tfrecords_gzip'
      is supported for now.

  Returns:
    a TFXIO instance.
  """
    # Default to FORMAT_TF_EXAMPLE when unspecified, as the docstring promises.
    if payload_format is None:
        payload_format = example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE
    elif not isinstance(payload_format, int):
        payload_format = example_gen_pb2.PayloadFormat.Value(payload_format)

    if file_format is not None:
        if type(file_format) is not type(file_pattern):
            raise ValueError(
                f'The type of file_pattern and file_format should be the same. '
                f'Given: file_pattern={file_pattern}, file_format={file_format}'
            )
        if isinstance(file_format, list):
            if len(file_format) != len(file_pattern):
                raise ValueError(
                    f'The length of file_pattern and file_format should be '
                    f'the same. Given: file_pattern={file_pattern}, '
                    f'file_format={file_format}')
            else:
                if any(item != 'tfrecords_gzip' for item in file_format):
                    raise NotImplementedError(
                        f'{file_format} is not supported yet.')
        else:  # file_format is str type.
            if file_format != 'tfrecords_gzip':
                raise NotImplementedError(
                    f'{file_format} is not supported yet.')

    if read_as_raw_records:
        assert raw_record_column_name is not None, (
            'read_as_raw_records is specified - '
            'must provide raw_record_column_name')
        return raw_tf_record.RawTfRecordTFXIO(
            file_pattern=file_pattern,
            raw_record_column_name=raw_record_column_name,
            telemetry_descriptors=telemetry_descriptors)

    if payload_format == example_gen_pb2.PayloadFormat.FORMAT_TF_EXAMPLE:
        return tf_example_record.TFExampleRecord(
            file_pattern=file_pattern,
            schema=schema,
            raw_record_column_name=raw_record_column_name,
            telemetry_descriptors=telemetry_descriptors)

    if (payload_format ==
            example_gen_pb2.PayloadFormat.FORMAT_TF_SEQUENCE_EXAMPLE):
        return tf_sequence_example_record.TFSequenceExampleRecord(
            file_pattern=file_pattern,
            schema=schema,
            raw_record_column_name=raw_record_column_name,
            telemetry_descriptors=telemetry_descriptors)

    if payload_format == example_gen_pb2.PayloadFormat.FORMAT_PROTO:
        assert data_view_uri is not None, (
            'Accessing FORMAT_PROTO requires a DataView to parse the proto.')
        return record_to_tensor_tfxio.TFRecordToTensorTFXIO(
            file_pattern=file_pattern,
            saved_decoder_path=data_view_uri,
            telemetry_descriptors=telemetry_descriptors,
            raw_record_column_name=raw_record_column_name)

    raise NotImplementedError(
        'Unsupported payload format: {}'.format(payload_format))
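
A minimal usage sketch for make_tfxio (hypothetical paths; assumes Apache Beam is available and a schema_pb2.Schema has been loaded elsewhere as `schema`):

import apache_beam as beam

tfxio = make_tfxio(
    file_pattern='/path/to/Split-train/*',  # hypothetical
    telemetry_descriptors=['my_component'],  # identifies the calling component
    payload_format='FORMAT_TF_EXAMPLE',
    schema=schema)  # assumed loaded elsewhere; needed for TensorAdapter()
with beam.Pipeline() as p:
    # Yields Arrow RecordBatches decoded from the input records.
    _ = p | 'Read' >> tfxio.BeamSource()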
Example #5
  def _MakeTFXIO(self, schema, raw_record_column_name=None):
    return tf_example_record.TFExampleRecord(
        self._example_file,
        schema=schema,
        raw_record_column_name=raw_record_column_name,
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
Example #6
  def _MakeTFXIO(self, schema):
    return tf_example_record.TFExampleRecord(
        self._example_file, schema=schema)
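
For context, a sketch of how a TFXIO built by these test helpers is typically consumed (hypothetical file path; assumes tfx_bsl and Apache Beam are installed):

import apache_beam as beam
from tfx_bsl.tfxio import tf_example_record

tfxio = tf_example_record.TFExampleRecord(
    '/path/to/examples.tfrecord',  # hypothetical
    schema=None,  # without a schema, TensorAdapter-based access is limited
    telemetry_descriptors=['Tests'])
with beam.Pipeline() as p:
    # Each element is an Arrow RecordBatch of decoded tf.Examples.
    _ = p | 'Read' >> tfxio.BeamSource(batch_size=100)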
Example #7
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - model_exports: exported model.
        - examples: examples for evaluating the model.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: model evaluation results.
      exec_properties: A dict of execution properties.
        - eval_config: JSON string of tfma.EvalConfig.
        - feature_slicing_spec: JSON string of evaluator_pb2.FeatureSlicingSpec
          instance, providing the way to slice the data. Deprecated, use
          eval_config.slicing_specs instead.

    Returns:
      None
    """
        if constants.EXAMPLES_KEY not in input_dict:
            raise ValueError('EXAMPLES_KEY is missing from input dict.')
        if constants.MODEL_KEY not in input_dict:
            raise ValueError('MODEL_KEY is missing from input dict.')
        if constants.EVALUATION_KEY not in output_dict:
            raise ValueError('EVALUATION_KEY is missing from output dict.')
        if len(input_dict[constants.MODEL_KEY]) > 1:
            raise ValueError(
                'There can be only one candidate model; there are {}.'.format(
                    len(input_dict[constants.MODEL_KEY])))
        if constants.BASELINE_MODEL_KEY in input_dict and len(
                input_dict[constants.BASELINE_MODEL_KEY]) > 1:
            raise ValueError(
                'There can be only one baseline model; there are {}.'.format(
                    len(input_dict[constants.BASELINE_MODEL_KEY])))

        self._log_startup(input_dict, output_dict, exec_properties)

        # Add fairness indicator metric callback if necessary.
        fairness_indicator_thresholds = exec_properties.get(
            'fairness_indicator_thresholds', None)
        add_metrics_callbacks = None
        if fairness_indicator_thresholds:
            # Need to import the following module so that the fairness indicator
            # post-export metric is registered.
            import tensorflow_model_analysis.addons.fairness.post_export_metrics.fairness_indicators  # pylint: disable=g-import-not-at-top, unused-variable
            add_metrics_callbacks = [
                tfma.post_export_metrics.fairness_indicators(  # pytype: disable=module-attr
                    thresholds=fairness_indicator_thresholds),
            ]

        output_uri = artifact_utils.get_single_uri(
            output_dict[constants.EVALUATION_KEY])

        run_validation = False
        models = []
        if 'eval_config' in exec_properties and exec_properties['eval_config']:
            slice_spec = None
            has_baseline = bool(input_dict.get(constants.BASELINE_MODEL_KEY))
            eval_config = tfma.EvalConfig()
            json_format.Parse(exec_properties['eval_config'], eval_config)
            eval_config = tfma.update_eval_config_with_defaults(
                eval_config,
                maybe_add_baseline=has_baseline,
                maybe_remove_baseline=not has_baseline)
            tfma.verify_eval_config(eval_config)
            # Do not validate the model when no thresholds are configured. This
            # is to avoid accidentally blessing models when users forget to set
            # thresholds.
            run_validation = bool(
                tfma.metrics.metric_thresholds_from_metrics_specs(
                    eval_config.metrics_specs))
            if len(eval_config.model_specs) > 2:
                raise ValueError(
                    'Cannot support more than two models. There are {} models '
                    'in this eval_config.'.format(
                        len(eval_config.model_specs)))
            # Extract model artifacts.
            for model_spec in eval_config.model_specs:
                if model_spec.is_baseline:
                    model_uri = artifact_utils.get_single_uri(
                        input_dict[constants.BASELINE_MODEL_KEY])
                else:
                    model_uri = artifact_utils.get_single_uri(
                        input_dict[constants.MODEL_KEY])
                if tfma.get_model_type(model_spec) == tfma.TF_ESTIMATOR:
                    model_path = path_utils.eval_model_path(model_uri)
                else:
                    model_path = path_utils.serving_model_path(model_uri)
                absl.logging.info('Using {} as {} model.'.format(
                    model_path, model_spec.name))
                models.append(
                    tfma.default_eval_shared_model(
                        model_name=model_spec.name,
                        eval_saved_model_path=model_path,
                        add_metrics_callbacks=add_metrics_callbacks,
                        eval_config=eval_config))
        else:
            eval_config = None
            assert ('feature_slicing_spec' in exec_properties
                    and exec_properties['feature_slicing_spec']
                    ), 'Neither eval_config nor feature_slicing_spec is set.'
            feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
            json_format.Parse(exec_properties['feature_slicing_spec'],
                              feature_slicing_spec)
            slice_spec = self._get_slice_spec_from_feature_slicing_spec(
                feature_slicing_spec)
            model_uri = artifact_utils.get_single_uri(
                input_dict[constants.MODEL_KEY])
            model_path = path_utils.eval_model_path(model_uri)
            absl.logging.info('Using {} for model eval.'.format(model_path))
            models.append(
                tfma.default_eval_shared_model(
                    eval_saved_model_path=model_path,
                    add_metrics_callbacks=add_metrics_callbacks))

        file_pattern = io_utils.all_files_pattern(
            artifact_utils.get_split_uri(input_dict[constants.EXAMPLES_KEY],
                                         'eval'))
        eval_shared_model = models[0] if len(models) == 1 else models
        schema = None
        if constants.SCHEMA_KEY in input_dict:
            schema = io_utils.SchemaReader().read(
                io_utils.get_only_uri_in_dir(
                    artifact_utils.get_single_uri(
                        input_dict[constants.SCHEMA_KEY])))

        absl.logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            # pylint: disable=expression-not-assigned
            if _USE_TFXIO:
                tensor_adapter_config = None
                if tfma.is_batched_input(eval_shared_model, eval_config):
                    tfxio = tf_example_record.TFExampleRecord(
                        file_pattern=file_pattern,
                        schema=schema,
                        raw_record_column_name=tfma.BATCHED_INPUT_KEY)
                    if schema is not None:
                        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                            arrow_schema=tfxio.ArrowSchema(),
                            tensor_representations=tfxio.TensorRepresentations())
                    data = (pipeline
                            | 'ReadFromTFRecordToArrow' >> tfxio.BeamSource())
                else:
                    data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
                        file_pattern=file_pattern)
                (data
                 | 'ExtractEvaluateAndWriteResults' >>
                 tfma.ExtractEvaluateAndWriteResults(
                     eval_shared_model=eval_shared_model,
                     eval_config=eval_config,
                     output_path=output_uri,
                     slice_spec=slice_spec,
                     tensor_adapter_config=tensor_adapter_config))
            else:
                data = pipeline | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
                    file_pattern=file_pattern)
                (data
                 | 'ExtractEvaluateAndWriteResults' >>
                 tfma.ExtractEvaluateAndWriteResults(
                     eval_shared_model=eval_shared_model,
                     eval_config=eval_config,
                     output_path=output_uri,
                     slice_spec=slice_spec))
        absl.logging.info(
            'Evaluation complete. Results written to {}.'.format(output_uri))

        if not run_validation:
            # TODO(jinhuang): delete the BLESSING_KEY from output_dict when supported.
            absl.logging.info(
                'No threshold configured, will not validate model.')
            return
        # Set up blessing artifact
        blessing = artifact_utils.get_single_instance(
            output_dict[constants.BLESSING_KEY])
        blessing.set_string_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_URI_KEY,
            artifact_utils.get_single_uri(input_dict[constants.MODEL_KEY]))
        blessing.set_int_custom_property(
            constants.ARTIFACT_PROPERTY_CURRENT_MODEL_ID_KEY,
            input_dict[constants.MODEL_KEY][0].id)
        if input_dict.get(constants.BASELINE_MODEL_KEY):
            baseline_model = input_dict[constants.BASELINE_MODEL_KEY][0]
            blessing.set_string_custom_property(
                constants.ARTIFACT_PROPERTY_BASELINE_MODEL_URI_KEY,
                baseline_model.uri)
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BASELINE_MODEL_ID_KEY,
                baseline_model.id)
        if 'current_component_id' in exec_properties:
            blessing.set_string_custom_property(
                'component_id', exec_properties['current_component_id'])
        # Check validation result and write BLESSED file accordingly.
        absl.logging.info('Checking validation results.')
        validation_result = tfma.load_validation_result(output_uri)
        if validation_result.validation_ok:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.BLESSED_FILE_NAME), '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.BLESSED_VALUE)
        else:
            io_utils.write_string_file(
                os.path.join(blessing.uri, constants.NOT_BLESSED_FILE_NAME),
                '')
            blessing.set_int_custom_property(
                constants.ARTIFACT_PROPERTY_BLESSED_KEY,
                constants.NOT_BLESSED_VALUE)
        absl.logging.info('Blessing result {} written to {}.'.format(
            validation_result.validation_ok, blessing.uri))
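
For reference, a sketch of how the 'eval_config' exec_property parsed above might be produced (the label key is hypothetical; note that without metric thresholds the executor skips validation, as the code above shows):

import tensorflow_model_analysis as tfma
from google.protobuf import json_format

eval_config = tfma.EvalConfig(
    model_specs=[tfma.ModelSpec(label_key='tips')],  # 'tips' is hypothetical
    slicing_specs=[tfma.SlicingSpec()])  # an empty spec means the overall slice
# Serialized form suitable for exec_properties['eval_config'].
eval_config_json = json_format.MessageToJson(eval_config)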
Example #8
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples for
    # each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the
  'IdentifyAnomalousExamples' PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    output_path: The file path to which to write the data statistics result.
      If None, the function uses a temporary directory. The output will be a
      TFRecord file containing a single data statistics list proto, and can be
      read with the 'load_statistics' function. If you run this function on
      Google Cloud, you must specify an output_path; passing None may cause an
      error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of the
      set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
    if stats_options.schema is None:
        raise ValueError('The specified stats_options must include a schema.')
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(),
                                   'anomaly_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    with beam.Pipeline(options=pipeline_options) as p:
        _ = (p
             | 'ReadData' >> (tf_example_record.TFExampleRecord(
                 file_pattern=data_location,
                 schema=None,
                 telemetry_descriptors=[
                     'tfdv', 'validate_examples_in_tfrecord'
                 ]).BeamSource(batch_size=1))
             | 'DetectAnomalies' >>
             validation_api.IdentifyAnomalousExamples(stats_options)
             | 'GenerateSummaryStatistics' >>
             stats_impl.GenerateSlicedStatisticsImpl(stats_options,
                                                     is_slicing_enabled=True)
             | 'WriteStatsOutput' >>
             stats_api.WriteStatisticsToTFRecord(output_path))

    return stats_util.load_statistics(output_path)
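
A minimal usage sketch (hypothetical path; assumes tensorflow_data_validation is installed and a schema is available as `schema`):

import tensorflow_data_validation as tfdv

stats_options = tfdv.StatsOptions(schema=schema)  # schema assumed available
anomaly_stats = tfdv.validate_examples_in_tfrecord(
    data_location='/path/to/examples*.tfrecord',  # hypothetical
    stats_options=stats_options)
# Each dataset in the result corresponds to one anomaly reason.
for dataset in anomaly_stats.datasets:
    print(dataset.name, dataset.num_examples)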
Example #9
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both the 'train' and 'eval' splits.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.

    Raises:
      ValueError: If a schema is provided both as an input and as part of the
        StatsOptions exec_property.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                input_tfxio = tf_example_record.TFExampleRecord(
                    file_pattern=input_uri,
                    telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                data = (p | 'TFXIORead[{}]'.format(split) >>
                        input_tfxio.BeamSource())
                _ = (data
                     | 'GenerateStatistics[{}]'.format(split) >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput[{}]'.format(split) >>
                     stats_api.WriteStatisticsToTFRecord(output_path))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
Example #10
  def _MakeTFXIO(self, schema, raw_record_column_name=None):
    return tf_example_record.TFExampleRecord(
        self._example_file,
        schema=schema,
        raw_record_column_name=raw_record_column_name)
Example #11
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    pipeline_options: Optional[PipelineOptions] = None,
    num_sampled_examples: int = 0,
) -> Union[statistics_pb2.DatasetFeatureStatisticsList, Tuple[
        statistics_pb2.DatasetFeatureStatisticsList, Mapping[
            str, List[tf.train.Example]]]]:
    """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the
  'IdentifyAnomalousExamples' PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    output_path: The file path to which to write the data statistics result.
      If None, the function uses a temporary directory. The output will be a
      TFRecord file containing a single data statistics list proto, and can be
      read with the 'load_statistics' function. If you run this function on
      Google Cloud, you must specify an output_path; passing None may cause an
      error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.
    num_sampled_examples: If set, returns up to this many examples
      of each anomaly type as a map from anomaly reason string to a list of
      tf.Examples.

  Returns:
    If num_sampled_examples is zero, returns a single
    DatasetFeatureStatisticsList proto in which each dataset consists of the
    set of examples that exhibit a particular anomaly. If
    num_sampled_examples is nonzero, returns the same statistics
    proto as well as a mapping from anomaly to a list of tf.Examples that
    exhibited that anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
    if stats_options.schema is None:
        raise ValueError('The specified stats_options must include a schema.')
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(),
                                   'anomaly_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)
    with io_util.Materializer(output_dir_path) as sample_materializer:
        with beam.Pipeline(options=pipeline_options) as p:
            anomalous_examples = (
                p
                | 'ReadData' >> (tf_example_record.TFExampleRecord(
                    file_pattern=data_location,
                    schema=None,
                    telemetry_descriptors=[
                        'tfdv', 'validate_examples_in_tfrecord'
                    ]).BeamSource(batch_size=1))
                | 'DetectAnomalies' >>
                validation_api.IdentifyAnomalousExamples(stats_options))
            _ = (anomalous_examples | 'GenerateSummaryStatistics' >>
                 stats_impl.GenerateSlicedStatisticsImpl(
                     stats_options, is_slicing_enabled=True)
                 | 'WriteStatsOutput' >>
                 stats_api.WriteStatisticsToTFRecord(output_path))
            if num_sampled_examples:
                # TODO(b/68154497): Relint
                # pylint: disable=no-value-for-parameter
                _ = (
                    anomalous_examples
                    | 'Sample' >>
                    beam.combiners.Sample.FixedSizePerKey(num_sampled_examples)
                    | 'ToExample' >> _record_batch_to_example_fn(
                        example_coder.RecordBatchToExamplesEncoder(
                            stats_options.schema))
                    | 'WriteSamples' >> sample_materializer.writer())
                # pylint: enable=no-value-for-parameter
        if num_sampled_examples:
            samples_per_reason = collections.defaultdict(list)
            for reason, serialized_example in sample_materializer.reader():
                samples_per_reason[reason].append(
                    tf.train.Example.FromString(serialized_example))
            return stats_util.load_statistics(output_path), samples_per_reason
    return stats_util.load_statistics(output_path)
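
A usage sketch of this sampled variant (hypothetical path; assumes a tensorflow_data_validation version whose validate_examples_in_tfrecord accepts num_sampled_examples):

import tensorflow_data_validation as tfdv

stats_options = tfdv.StatsOptions(schema=schema)  # schema assumed available
anomaly_stats, samples = tfdv.validate_examples_in_tfrecord(
    data_location='/path/to/examples*.tfrecord',  # hypothetical
    stats_options=stats_options,
    num_sampled_examples=5)
for reason, examples in samples.items():
    print(reason, len(examples))  # up to 5 tf.train.Examples per anomaly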