def test_identify_anomalous_examples_options_without_schema(self):
    examples = [{'annotated_enum': np.array(['D'])}]
    options = stats_options.StatsOptions()
    with self.assertRaisesRegexp(ValueError,
                                 'options must include a schema'):
        with beam.Pipeline() as p:
            _ = (p | beam.Create(examples)
                 | validation_api.IdentifyAnomalousExamples(options))

def test_identify_anomalous_examples(self, examples, schema_text,
                                     expected_result):
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    options = stats_options.StatsOptions(schema=schema)
    with beam.Pipeline() as p:
        result = (p | beam.Create(examples)
                  | validation_api.IdentifyAnomalousExamples(options))
        util.assert_that(result, util.equal_to(expected_result))

def test_identify_anomalous_examples_options_of_wrong_type(self):
    examples = [{'annotated_enum': np.array(['D'])}]
    options = 1
    with self.assertRaisesRegexp(
            ValueError, 'options must be a `StatsOptions` object.'):
        with beam.Pipeline() as p:
            _ = (p | beam.Create(examples)
                 | validation_api.IdentifyAnomalousExamples(options))

def test_identify_anomalous_examples_invalid_max_examples_type(self):
    examples = [{'annotated_enum': np.array(['D'])}]
    options = stats_options.StatsOptions(schema=schema_pb2.Schema())
    max_examples_per_anomaly = 1.5
    with self.assertRaisesRegexp(
            TypeError, 'max_examples_per_anomaly must be an integer.'):
        with beam.Pipeline() as p:
            _ = (p | beam.Create(examples)
                 | validation_api.IdentifyAnomalousExamples(
                     options, max_examples_per_anomaly))
Example #5
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples for
    # each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    output_path: The file path to output data statistics result to. If None, the
      function uses a temporary directory. The output will be a TFRecord file
      containing a single data statistics list proto, and can be read with the
      'load_statistics' function.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of the
      set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
    if stats_options.schema is None:
        raise ValueError('The specified stats_options must include a schema.')
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(),
                                   'anomaly_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    with beam.Pipeline(options=pipeline_options) as p:
        _ = (
            p
            | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=data_location)
            | 'DecodeData' >> tf_example_decoder.DecodeTFExample(
                desired_batch_size=1)
            | 'DetectAnomalies' >> validation_api.IdentifyAnomalousExamples(
                stats_options)
            | 'GenerateSummaryStatistics' >>
            stats_impl.GenerateSlicedStatisticsImpl(
                stats_options, is_slicing_enabled=True)
            # TODO(b/112014711) Implement a custom sink to write the stats proto.
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))

    return stats_gen_lib.load_statistics(output_path)
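A minimal usage sketch for the function above. The schema path and data glob are hypothetical, and the public TFDV aliases `tfdv.load_schema_text` and `tfdv.StatsOptions` are assumed to be available:

# Usage sketch (hypothetical paths; a schema is required because validation
# is driven by schema constraints).
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')  # hypothetical path
anomaly_stats = validate_examples_in_tfrecord(
    data_location='/tmp/data/examples-*.tfrecord',   # hypothetical glob
    stats_options=tfdv.StatsOptions(schema=schema))

# Each dataset in the result holds statistics over the examples that
# exhibited one particular anomaly reason.
for dataset in anomaly_stats.datasets:
    print(dataset.name, dataset.num_examples)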
Example #6
def validate_examples_in_csv(
    data_location: Text,
    stats_options: options.StatsOptions,
    column_names: Optional[List[types.FeatureName]] = None,
    delimiter: Text = ',',
    output_path: Optional[Text] = None,
    # TODO(b/131719250): Add option to output a sample of anomalous examples for
    # each anomaly reason.
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Validates examples in csv files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in CSV format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    column_names: A list of column names to be treated as the CSV header. Order
      must match the order in the input CSV files. If this argument is not
      specified, the first line in the input CSV files is treated as the
      header. Note that this option is valid only for the 'csv' input file
      format.
    delimiter: A one-character string used to separate fields in a CSV file.
    output_path: The file path to output data statistics result to. If None, the
      function uses a temporary directory. The output will be a TFRecord file
      containing a single data statistics list proto, and can be read with the
      'load_statistics' function.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.

  Returns:
    A DatasetFeatureStatisticsList proto in which each dataset consists of the
      set of examples that exhibit a particular anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
  if stats_options.schema is None:
    raise ValueError('The specified stats_options must include a schema.')
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'anomaly_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  # If a header is not provided, assume the first line in a file
  # to be the header.
  skip_header_lines = 1 if column_names is None else 0
  if column_names is None:
    column_names = stats_gen_lib.get_csv_header(data_location, delimiter)

  with beam.Pipeline(options=pipeline_options) as p:
    _ = (
        p
        | 'ReadData' >> beam.io.textio.ReadFromText(
            file_pattern=data_location, skip_header_lines=skip_header_lines)
        | 'DecodeData' >> csv_decoder.DecodeCSV(
            column_names=column_names, delimiter=delimiter,
            schema=stats_options.schema,
            infer_type_from_schema=stats_options.infer_type_from_schema,
            desired_batch_size=1)
        | 'DetectAnomalies' >> validation_api.IdentifyAnomalousExamples(
            stats_options)
        | 'GenerateSummaryStatistics' >>
        stats_impl.GenerateSlicedStatisticsImpl(
            stats_options, is_slicing_enabled=True)
        # TODO(b/112014711) Implement a custom sink to write the stats proto.
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))

  return stats_gen_lib.load_statistics(output_path)
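A minimal usage sketch for the CSV variant above. The file glob and column names are hypothetical, and `tfdv.StatsOptions` is assumed to be the public alias of `options.StatsOptions`:

# Usage sketch (hypothetical glob and header; when column_names is given the
# function does not skip or infer a header line from the files).
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')  # hypothetical path
anomaly_stats = validate_examples_in_csv(
    data_location='/tmp/data/*.csv',                  # hypothetical glob
    stats_options=tfdv.StatsOptions(schema=schema,
                                    infer_type_from_schema=True),
    column_names=['annotated_enum', 'other_feature'],  # hypothetical header
    delimiter=',')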
 def test_identify_anomalous_examples_with_max_examples_per_anomaly(self):
     examples = [{
         'annotated_enum': np.array(['D'])
     }, {
         'annotated_enum': np.array(['D'])
     }, {
         'annotated_enum': np.array(['C'])
     }, {
         'feature_not_in_schema': np.array([1])
     }, {
         'feature_not_in_schema': np.array([1])
     }]
     schema = text_format.Parse(
         """
     string_domain {
       name: "MyAloneEnum"
       value: "A"
       value: "B"
       value: "C"
     }
     feature {
       name: "annotated_enum"
       value_count {
         min:1
         max:1
       }
       presence {
         min_count: 0
       }
       type: BYTES
       domain: "MyAloneEnum"
     }
     feature {
       name: "ignore_this"
       lifecycle_stage: DEPRECATED
       value_count {
         min:1
       }
       presence {
         min_count: 1
       }
       type: BYTES
     }
     """, schema_pb2.Schema())
     options = stats_options.StatsOptions(schema=schema)
     max_examples_per_anomaly = 1
     expected_result = [
         ('annotated_enum_ENUM_TYPE_UNEXPECTED_STRING_VALUES', [{
             'annotated_enum':
             np.array(['D'])
         }]),
         ('feature_not_in_schema_SCHEMA_NEW_COLUMN', [{
             'feature_not_in_schema':
             np.array([1])
         }])
     ]
     with beam.Pipeline() as p:
         result = (p | beam.Create(examples)
                   | validation_api.IdentifyAnomalousExamples(
                       options, max_examples_per_anomaly))
         util.assert_that(result, util.equal_to(expected_result))
Example #8
def validate_examples_in_tfrecord(
    data_location: Text,
    stats_options: options.StatsOptions,
    output_path: Optional[Text] = None,
    pipeline_options: Optional[PipelineOptions] = None,
    num_sampled_examples=0,
) -> Union[statistics_pb2.DatasetFeatureStatisticsList, Tuple[
        statistics_pb2.DatasetFeatureStatisticsList, Mapping[
            str, List[tf.train.Example]]]]:
    """Validates TFExamples in TFRecord files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    output_path: The file path to output data statistics result to. If None, the
      function uses a temporary directory. The output will be a TFRecord file
      containing a single data statistics list proto, and can be read with the
      'load_statistics' function.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.
    num_sampled_examples: If set, returns up to this many examples
      of each anomaly type as a map from anomaly reason string to a list of
      tf.Examples.

  Returns:
    If num_sampled_examples is zero, returns a single
    DatasetFeatureStatisticsList proto in which each dataset consists of the
    set of examples that exhibit a particular anomaly. If
    num_sampled_examples is nonzero, returns the same statistics
    proto as well as a mapping from anomaly to a list of tf.Examples that
    exhibited that anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
    if stats_options.schema is None:
        raise ValueError('The specified stats_options must include a schema.')
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(),
                                   'anomaly_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)
    with io_util.Materializer(output_dir_path) as sample_materializer:
        with beam.Pipeline(options=pipeline_options) as p:
            anomalous_examples = (
                p
                | 'ReadData' >> (tf_example_record.TFExampleRecord(
                    file_pattern=data_location,
                    schema=None,
                    telemetry_descriptors=[
                        'tfdv', 'validate_examples_in_tfrecord'
                    ]).BeamSource(batch_size=1))
                | 'DetectAnomalies' >>
                validation_api.IdentifyAnomalousExamples(stats_options))
            _ = (anomalous_examples | 'GenerateSummaryStatistics' >>
                 stats_impl.GenerateSlicedStatisticsImpl(
                     stats_options, is_slicing_enabled=True)
                 | 'WriteStatsOutput' >>
                 stats_api.WriteStatisticsToTFRecord(output_path))
            if num_sampled_examples:
                # TODO(b/68154497): Relint
                # pylint: disable=no-value-for-parameter
                _ = (
                    anomalous_examples
                    | 'Sample' >>
                    beam.combiners.Sample.FixedSizePerKey(num_sampled_examples)
                    | 'ToExample' >> _record_batch_to_example_fn(
                        example_coder.RecordBatchToExamplesEncoder(
                            stats_options.schema))
                    | 'WriteSamples' >> sample_materializer.writer())
                # pylint: enable=no-value-for-parameter
        if num_sampled_examples:
            samples_per_reason = collections.defaultdict(list)
            for reason, serialized_example in sample_materializer.reader():
                samples_per_reason[reason].append(
                    tf.train.Example.FromString(serialized_example))
            return stats_util.load_statistics(output_path), samples_per_reason
    return stats_util.load_statistics(output_path)
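A usage sketch for the sampled-examples variant above. Paths are hypothetical; `PipelineOptions()` from Apache Beam defaults to the local DirectRunner:

# Usage sketch: with num_sampled_examples set, the function returns a
# (statistics, samples) tuple, where samples maps each anomaly reason string
# to a list of tf.train.Example protos (hypothetical paths).
import tensorflow_data_validation as tfdv
from apache_beam.options.pipeline_options import PipelineOptions

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')  # hypothetical path
stats, samples = validate_examples_in_tfrecord(
    data_location='/tmp/data/examples-*.tfrecord',   # hypothetical glob
    stats_options=tfdv.StatsOptions(schema=schema),
    pipeline_options=PipelineOptions(),              # DirectRunner by default
    num_sampled_examples=5)

for reason, tf_examples in samples.items():
    print(reason, len(tf_examples))  # at most 5 examples per anomaly reason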
Example #9
def validate_examples_in_csv(
    data_location: Text,
    stats_options: options.StatsOptions,
    column_names: Optional[List[types.FeatureName]] = None,
    delimiter: Text = ',',
    output_path: Optional[Text] = None,
    pipeline_options: Optional[PipelineOptions] = None,
    num_sampled_examples=0,
) -> Union[statistics_pb2.DatasetFeatureStatisticsList, Tuple[
        statistics_pb2.DatasetFeatureStatisticsList, Mapping[str,
                                                             pd.DataFrame]]]:
    """Validates examples in csv files.

  Runs a Beam pipeline to detect anomalies on a per-example basis. If this
  function detects anomalous examples, it generates summary statistics regarding
  the set of examples that exhibit each anomaly.

  This is a convenience function for users with data in CSV format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines, need to use the 'IdentifyAnomalousExamples'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    stats_options: `tfdv.StatsOptions` for generating data statistics. This must
      contain a schema.
    column_names: A list of column names to be treated as the CSV header. Order
      must match the order in the input CSV files. If this argument is not
      specified, the first line in the input CSV files is treated as the
      header. Note that this option is valid only for the 'csv' input file
      format.
    delimiter: A one-character string used to separate fields in a CSV file.
    output_path: The file path to output data statistics result to. If None, the
      function uses a temporary directory. The output will be a TFRecord file
      containing a single data statistics list proto, and can be read with the
      'load_statistics' function. If you run this function on Google Cloud, you
      must specify an output_path. Specifying None may cause an error.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.
    num_sampled_examples: If set, returns up to this many examples of each
      anomaly type as a map from anomaly reason string to pd.DataFrame.

  Returns:
    If num_sampled_examples is zero, returns a single
    DatasetFeatureStatisticsList proto in which each dataset consists of the
    set of examples that exhibit a particular anomaly. If
    num_sampled_examples is nonzero, returns the same statistics
    proto as well as a mapping from anomaly to a pd.DataFrame of CSV rows
    exhibiting that anomaly.

  Raises:
    ValueError: If the specified stats_options does not include a schema.
  """
    if stats_options.schema is None:
        raise ValueError('The specified stats_options must include a schema.')
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(),
                                   'anomaly_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)
    if num_sampled_examples:
        sample_materializer = io_util.Materializer(output_dir_path)

    # If a header is not provided, assume the first line in a file
    # to be the header.
    skip_header_lines = 1 if column_names is None else 0
    if column_names is None:
        column_names = stats_gen_lib.get_csv_header(data_location, delimiter)

    with beam.Pipeline(options=pipeline_options) as p:

        anomalous_examples = (
            p
            | 'ReadData' >> beam.io.textio.ReadFromText(
                file_pattern=data_location,
                skip_header_lines=skip_header_lines)
            | 'DecodeData' >> csv_decoder.DecodeCSV(
                column_names=column_names,
                delimiter=delimiter,
                schema=stats_options.schema
                if stats_options.infer_type_from_schema else None,
                desired_batch_size=1)
            | 'DetectAnomalies' >>
            validation_api.IdentifyAnomalousExamples(stats_options))
        _ = (anomalous_examples
             | 'GenerateSummaryStatistics' >>
             stats_impl.GenerateSlicedStatisticsImpl(stats_options,
                                                     is_slicing_enabled=True)
             | 'WriteStatsOutput' >>
             stats_api.WriteStatisticsToTFRecord(output_path))
        if num_sampled_examples:
            _ = (anomalous_examples
                 | 'Sample' >>
                 beam.combiners.Sample.FixedSizePerKey(num_sampled_examples)
                 | 'ToPandas' >> beam.FlatMap(_encode_pandas_and_key)
                 | 'WriteSamples' >> sample_materializer.writer())

    if num_sampled_examples:
        samples_per_reason_acc = collections.defaultdict(list)
        for reason, pandas_dataframe in sample_materializer.reader():
            samples_per_reason_acc[reason].append(pandas_dataframe)
        samples_per_reason = {}
        for reason, dataframes in samples_per_reason_acc.items():
            samples_per_reason[reason] = pd.concat(dataframes)
        sample_materializer.cleanup()
        return stats_util.load_statistics(output_path), samples_per_reason
    return stats_util.load_statistics(output_path)
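A usage sketch for the CSV variant with sampling. The file glob and schema path are hypothetical; the second return value holds one pandas DataFrame of sampled offending rows per anomaly reason:

# Usage sketch (hypothetical paths): samples maps each anomaly reason string
# to a pd.DataFrame of the CSV rows that exhibited that anomaly.
import tensorflow_data_validation as tfdv

schema = tfdv.load_schema_text('/tmp/schema.pbtxt')  # hypothetical path
stats, samples = validate_examples_in_csv(
    data_location='/tmp/data/*.csv',                 # hypothetical glob
    stats_options=tfdv.StatsOptions(schema=schema),
    num_sampled_examples=3)

for reason, df in samples.items():
    print(reason)
    print(df.head())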