예제 #1
0
    def test_generate_statistics_in_memory_invalid_custom_generator(self):
        """In-memory stats generation must reject TransformStatsGenerators.

        Only combiner-style generators can run in memory; wrapping a beam
        PTransform is expected to raise a TypeError.
        """

        # No-op PTransform, used only so a TransformStatsGenerator can be built.
        class NoOpPTransform(beam.PTransform):
            def expand(self, pcoll):
                pass

        bad_generator = stats_generator.TransformStatsGenerator(
            name='CustomStatsGenerator', ptransform=NoOpPTransform())
        opts = stats_options.StatsOptions(generators=[bad_generator])
        input_examples = [{'a': np.array([1.0])}]
        with self.assertRaisesRegexp(
                TypeError, 'Statistics generator.* found object of type '
                'TransformStatsGenerator.'):
            stats_impl.generate_statistics_in_memory(input_examples, opts)
예제 #2
0
  def test_generate_statistics_in_memory_empty_examples(self):
    """An empty example list yields a dataset proto with num_examples: 0."""
    expected = text_format.Parse(
        """
        datasets {
          num_examples: 0
        }""", statistics_pb2.DatasetFeatureStatisticsList())
    actual = stats_impl.generate_statistics_in_memory([])
    compare.assertProtoEqual(
        self, actual, expected, normalize_numbers=True)
예제 #3
0
 def test_generate_statistics_in_memory(
     self, examples, options, expected_result_proto_text, schema=None):
    """Parameterized check of in-memory stats against an expected proto."""
    if schema is not None:
      options.schema = schema
    expected = text_format.Parse(
        expected_result_proto_text,
        statistics_pb2.DatasetFeatureStatisticsList())
    actual = stats_impl.generate_statistics_in_memory(examples, options)
    # Feature order inside a DatasetFeatureStatistics proto is not
    # deterministic, so compare.assertProtoEqual (which demands identical
    # repeated-field ordering) cannot be used; use the order-insensitive
    # helper instead.
    test_util.assert_dataset_feature_stats_proto_equal(
        self, actual.datasets[0], expected.datasets[0])
예제 #4
0
def validate_instance(
    instance,
    options,
    environment = None
):
  """Validates one example against the schema carried in `options`.

  When `environment` is given, the schema is first filtered down to that
  environment and the example is checked against the filtered schema.

  Args:
    instance: One example, as a dict mapping feature name to numpy array.
    options: A `tfdv.StatsOptions` instance; its `schema` field is required.
    environment: Optional validation environment name. It must be one of the
      default environments declared in the schema. Environments let a single
      schema express small per-context variations — e.g. a 'LABEL' feature
      that is required while training but absent when serving can be handled
      by declaring environments ["SERVING", "TRAINING"] and attaching 'LABEL'
      only to "TRAINING".

  Returns:
    An Anomalies protocol buffer.

  Raises:
    ValueError: If `options` is not a StatsOptions object.
    ValueError: If `options` does not contain a schema.
  """
  if not isinstance(options, stats_options.StatsOptions):
    raise ValueError('options must be a StatsOptions object.')
  if options.schema is None:
    raise ValueError('options must include a schema.')
  # Compute statistics for just this one example, then validate those stats.
  single_example_stats = stats_impl.generate_statistics_in_memory(
      [instance], options)
  anomalies = validate_statistics(
      single_example_stats, options.schema, environment)
  if anomalies.anomaly_info:
    # Drop anomaly types that only make sense across a whole dataset; they
    # are meaningless for a single example.
    anomalies_util.remove_anomaly_types(anomalies, _GLOBAL_ONLY_ANOMALY_TYPES)
  return anomalies
예제 #5
0
def generate_statistics_from_dataframe(
        dataframe,
        stats_options=None,
):
    """Compute data statistics for the input pandas DataFrame.

    This is a utility method for users with in-memory data represented
    as a pandas DataFrame.

    Args:
      dataframe: Input pandas DataFrame.
      stats_options: Options for generating data statistics. Defaults to a
        fresh `options.StatsOptions()` per call (a `None` sentinel is used so
        the default object is not shared across invocations).

    Returns:
      A DatasetFeatureStatisticsList proto.

    Raises:
      TypeError: If `dataframe` is not a pandas DataFrame.
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError('dataframe argument is of type {}. Must be a '
                        'pandas DataFrame.'.format(type(dataframe).__name__))
    if stats_options is None:
        stats_options = options.StatsOptions()

    column_names = list(dataframe.columns)
    column_types = list(dataframe.dtypes)

    # Convert each row into the dict-of-single-element-arrays format expected
    # by generate_statistics_in_memory. Nulls map to None; object-dtype cells
    # are stringified so the stats generators see them as STRING features.
    inmemory_dicts = []
    for row in dataframe.itertuples(index=False):
        row_dict = {}
        for name, dtype, value in zip(column_names, column_types, row):
            if pd.isnull(value):
                row_dict[name] = None
            elif dtype == object:
                # `object`, not `np.object`: the numpy alias was deprecated in
                # NumPy 1.20 and removed in 1.24; the builtin is equivalent.
                row_dict[name] = np.array([str(value)], dtype=object)
            else:
                row_dict[name] = np.array([value], dtype=dtype)
        inmemory_dicts.append(row_dict)

    return stats_impl.generate_statistics_in_memory(inmemory_dicts,
                                                    stats_options)
예제 #6
0
  def test_generate_statistics_in_memory_valid_custom_generator(
      self):
    """A user-supplied CombinerStatsGenerator's custom stats are merged into
    the output alongside the default statistics."""

    # CombinerStatsGenerator that returns a DatasetFeatureStatistic proto with
    # custom stat.
    class CustomCombinerStatsGenerator(stats_generator.CombinerStatsGenerator):

      # The accumulator carries no state; only extract_output matters here.
      def create_accumulator(self):
        return 0

      def add_input(self, accumulator,
                    input_batch):
        return 0

      def merge_accumulators(self, accumulators):
        return 0

      # Emits a fixed custom stat for feature 'a' regardless of the input.
      def extract_output(
          self, accumulator):
        stats_proto = statistics_pb2.DatasetFeatureStatistics()
        proto_feature = stats_proto.features.add()
        proto_feature.name = 'a'
        custom_stat = proto_feature.custom_stats.add()
        custom_stat.name = 'custom_stat'
        custom_stat.str = 'custom_stat_value'
        return stats_proto

    examples = [
        {'a': np.array(['xyz', 'qwe'])},
        {'a': np.array(['qwe'])},
        {'a': np.array(['qwe'])},
    ]

    # Expected output: the custom_stats entry from the generator above, plus
    # the default string statistics for feature 'a' computed over `examples`
    # (shaped by the num_top_values / histogram options set below).
    expected_result = text_format.Parse(
        """
        datasets {
          num_examples: 3
          features {
            name: 'a'
            type: STRING
            custom_stats {
              name: 'custom_stat'
              str: 'custom_stat_value'
            }
            string_stats {
              avg_length: 3
              unique: 2
              common_stats {
                num_non_missing: 3
                min_num_values: 1
                max_num_values: 2
                avg_num_values: 1.333333
                tot_num_values: 4
                num_values_histogram {
                  buckets {
                    low_value: 1.0
                    high_value: 1.0
                    sample_count: 1.0
                  }
                  buckets {
                    low_value: 1.0
                    high_value: 2.0
                    sample_count: 1.0
                  }
                  buckets {
                    low_value: 2.0
                    high_value: 2.0
                    sample_count: 1.0
                  }
                  type: QUANTILES
                }
              }
              top_values {
                value: 'qwe'
                frequency: 3
              }
              top_values {
                value: 'xyz'
                frequency: 1
              }
              rank_histogram {
                buckets {
                  low_rank: 0
                  high_rank: 0
                  label: "qwe"
                  sample_count: 3.0
                }
                buckets {
                  low_rank: 1
                  high_rank: 1
                  label: "xyz"
                  sample_count: 1.0
                }
              }
            }
          }
        }""", statistics_pb2.DatasetFeatureStatisticsList())

    options = stats_options.StatsOptions(
        generators=[CustomCombinerStatsGenerator('CustomStatsGenerator')],
        num_top_values=4,
        num_rank_histogram_buckets=3,
        num_values_histogram_buckets=3)
    result = stats_impl.generate_statistics_in_memory(
        examples, options)
    compare.assertProtoEqual(
        self, result, expected_result, normalize_numbers=True)