def test_stats_pipeline_with_sample_count(self):
  # Input with three examples.
  examples = [{
      'c': np.linspace(1, 3000, 3000, dtype=np.int32)
  }, {
      'c': np.linspace(1, 3000, 3000, dtype=np.int32)
  }, {
      'c': np.linspace(1, 3000, 3000, dtype=np.int32)
  }]

  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        sample_count=1,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001)
    result = (p | beam.Create(examples)
              | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result))
def test_stats_options_with_slice_fns_to_json(self):
  slice_functions = [slicing_util.get_feature_value_slicer({'b': None})]
  options = stats_options.StatsOptions(
      experimental_slice_functions=slice_functions)
  with self.assertRaisesRegex(ValueError, 'StatsOptions cannot be converted'):
    options.to_json()
def test_stats_options_from_json(self):
  options_json = """{
    "_generators": null,
    "_feature_whitelist": null,
    "_schema": null,
    "weight_feature": null,
    "label_feature": null,
    "_slice_functions": null,
    "_sample_rate": null,
    "num_top_values": 20,
    "frequency_threshold": 1,
    "weighted_frequency_threshold": 1.0,
    "num_rank_histogram_buckets": 1000,
    "_num_values_histogram_buckets": 10,
    "_num_histogram_buckets": 10,
    "_num_quantiles_histogram_buckets": 10,
    "epsilon": 0.01,
    "infer_type_from_schema": false,
    "_desired_batch_size": null,
    "enable_semantic_domain_stats": false,
    "_semantic_domain_stats_sample_rate": null
  }"""
  actual_options = stats_options.StatsOptions.from_json(options_json)
  expected_options_dict = stats_options.StatsOptions().__dict__
  self.assertEqual(expected_options_dict, actual_options.__dict__)
def test_validate_instance_stats_options_without_schema(self):
  instance = {'feature': np.array(['A'])}
  # This instance of StatsOptions has no schema.
  options = stats_options.StatsOptions()
  with self.assertRaisesRegex(ValueError, 'options must include a schema.'):
    _ = validation_api.validate_instance(instance, options)
def test_validate_instance_invalid_environment(self):
  instance = {'feature': np.array(['A'])}
  schema = text_format.Parse(
      """
      default_environment: "TRAINING"
      default_environment: "SERVING"
      feature {
        name: "label"
        not_in_environment: "SERVING"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      feature {
        name: "feature"
        value_count { min: 1 max: 1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  options = stats_options.StatsOptions(schema=schema)
  with self.assertRaisesRegex(ValueError,
                              'Environment.*not found in the schema.*'):
    _ = validation_api.validate_instance(
        instance, options, environment='INVALID')
def setUp(self):
  self._default_stats_options = stats_options.StatsOptions(
      num_top_values=2,
      num_rank_histogram_buckets=2,
      num_values_histogram_buckets=2,
      num_histogram_buckets=2,
      num_quantiles_histogram_buckets=2)
def test_stats_options_from_json(self):
  options_json = """{
    "_generators": null,
    "_feature_allowlist": null,
    "_schema": null,
    "_vocab_paths": null,
    "weight_feature": null,
    "label_feature": null,
    "_slice_functions": null,
    "_sample_rate": null,
    "num_top_values": 20,
    "frequency_threshold": 1,
    "weighted_frequency_threshold": 1.0,
    "num_rank_histogram_buckets": 1000,
    "_num_values_histogram_buckets": 10,
    "_num_histogram_buckets": 10,
    "_num_quantiles_histogram_buckets": 10,
    "epsilon": 0.01,
    "infer_type_from_schema": false,
    "_desired_batch_size": null,
    "enable_semantic_domain_stats": false,
    "_semantic_domain_stats_sample_rate": null,
    "_per_feature_weight_override": null,
    "_add_default_generators": true,
    "_use_sketch_based_topk_uniques": false,
    "_slice_sqls": null,
    "_experimental_result_partitions": 1,
    "_experimental_num_feature_partitions": 1
  }"""
  actual_options = stats_options.StatsOptions.from_json(options_json)
  expected_options_dict = stats_options.StatsOptions().__dict__
  self.assertEqual(expected_options_dict, actual_options.__dict__)
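# Illustrative sketch, not part of the original test suite: StatsOptions built
# from plain scalar fields is expected to round-trip through to_json() /
# from_json(), while options holding callables (custom generators, slice
# functions) raise ValueError on to_json(), as other tests in this section
# exercise. The helper name below is hypothetical, and the sketch assumes the
# module-level stats_options import used by the surrounding tests.
def _example_stats_options_json_round_trip():
  options = stats_options.StatsOptions(num_top_values=5)
  restored = stats_options.StatsOptions.from_json(options.to_json())
  assert restored.num_top_values == 5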
def test_stats_pipeline_with_sample_count(self):
  record_batches = [
      pa.RecordBatch.from_arrays(
          [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
      pa.RecordBatch.from_arrays(
          [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
      pa.RecordBatch.from_arrays(
          [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])], ['c']),
  ]

  with beam.Pipeline() as p:
    options = stats_options.StatsOptions(
        sample_count=3000,
        num_top_values=2,
        num_rank_histogram_buckets=2,
        num_values_histogram_buckets=2,
        num_histogram_buckets=2,
        num_quantiles_histogram_buckets=2,
        epsilon=0.001,
        desired_batch_size=3000)
    result = (p | beam.Create(record_batches)
              | stats_api.GenerateStatistics(options))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, self._sampling_test_expected_result))
def test_invalid_sample_rate_negative(self):
  examples = [{}]
  with self.assertRaises(ValueError):
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(sample_rate=-1)
      _ = (p | beam.Create(examples)
           | stats_api.GenerateStatistics(options))
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
  """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_generators = _get_generators(options, in_memory=True)
  batch = batch_util.merge_single_batch(examples)

  # If whitelist features are provided, keep only those features.
  if options.feature_whitelist:
    batch = {
        feature_name: batch[feature_name]
        for feature_name in options.feature_whitelist
    }

  outputs = [
      generator.extract_output(
          generator.add_input(generator.create_accumulator(), batch))
      # The type checker raises a false positive here because the type hint
      # for the return value of _get_generators (which created the list of
      # stats_generators) is StatsGenerator, but add_input,
      # create_accumulator, and extract_output can be called only on
      # CombinerStatsGenerators.
      for generator in stats_generators  # pytype: disable=attribute-error
  ]

  return _make_dataset_feature_statistics_list_proto(
      [_merge_dataset_feature_stats_protos(outputs)])
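# Illustrative usage sketch for the in-memory API above, not from the original
# source: the feature names and values are made up, and the sketch assumes the
# module-level imports (numpy as np, stats_options) already used by this file.
def _example_generate_statistics_in_memory():
  examples = [
      {'a': np.array([1.0, 2.0]), 'b': np.array(['x'])},
      {'a': np.array([3.0]), 'b': np.array(['y'])},
  ]
  # Restrict statistics to feature 'a' via the whitelist option.
  opts = stats_options.StatsOptions(feature_whitelist=['a'])
  return generate_statistics_in_memory(examples, opts)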
def test_invalid_feature_whitelist(self):
  examples = [{'a': np.array([1.0, 2.0])}]
  with self.assertRaises(TypeError):
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(feature_whitelist={})
      _ = (p | beam.Create(examples)
           | stats_api.GenerateStatistics(options))
def test_identify_anomalous_examples(self, examples, schema_text,
                                     expected_result):
  schema = text_format.Parse(schema_text, schema_pb2.Schema())
  options = stats_options.StatsOptions(schema=schema)

  with beam.Pipeline() as p:
    result = (p | beam.Create(examples)
              | validation_api.IdentifyAnomalousExamples(options))
    util.assert_that(result, util.equal_to(expected_result))
def test_identify_anomalous_examples_options_without_schema(self):
  examples = [{'annotated_enum': np.array(['D'])}]
  options = stats_options.StatsOptions()
  with self.assertRaisesRegex(ValueError, 'options must include a schema'):
    with beam.Pipeline() as p:
      _ = (p | beam.Create(examples)
           | validation_api.IdentifyAnomalousExamples(options))
def test_invalid_both_sample_count_and_sample_rate(self):
  examples = [{}]
  with self.assertRaises(ValueError):
    with beam.Pipeline() as p:
      options = stats_options.StatsOptions(sample_count=100, sample_rate=0.5)
      _ = (p | beam.Create(examples)
           | stats_api.GenerateStatistics(options))
def test_validate_instance_global_only_anomaly_type(self):
  instance = {'annotated_enum': np.array(['D'])}
  # This schema has a presence.min_count > 1, which will generate an anomaly
  # of type FEATURE_TYPE_LOW_NUMBER_PRESENT when any single example is
  # validated using this schema. This test checks that this anomaly type
  # (which is not meaningful in per-example validation) is not included in
  # the Anomalies proto that validate_instance returns.
  schema = text_format.Parse(
      """
      string_domain {
        name: "MyAloneEnum"
        value: "A"
        value: "B"
        value: "C"
      }
      feature {
        name: "annotated_enum"
        value_count { min:1 max:1 }
        presence { min_count: 5 }
        type: BYTES
        domain: "MyAloneEnum"
      }
      feature {
        name: "ignore_this"
        lifecycle_stage: DEPRECATED
        value_count { min:1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(
              """
              description: "Examples contain values missing from the schema: D "
                           "(~100%). "
              severity: ERROR
              short_description: "Unexpected string values"
              reason {
                type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
                short_description: "Unexpected string values"
                description: "Examples contain values missing from the schema: D "
                             "(~100%). "
              }
              """, anomalies_pb2.AnomalyInfo())
  }
  options = stats_options.StatsOptions(schema=schema)
  anomalies = validation_api.validate_instance(instance, options)
  self._assert_equal_anomalies(anomalies, expected_anomalies)
def generate_statistics_from_tfrecord(
    data_location,
    output_path=None,
    stats_options=options.StatsOptions(),
    pipeline_options=None,
):
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and returns the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None,
      we use a temporary directory. It will be a TFRecord file containing a
      single data statistics proto, and can be read with the
      'load_statistics' API. If you run this function on Google Cloud, you
      must specify an output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline
      runner (DirectRunner or DataflowRunner), cloud dataflow service
      project id, etc. See
      https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.gfile.Exists(output_dir_path):
    tf.gfile.MakeDirs(output_dir_path)

  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
        | 'DecodeData' >> tf_example_decoder.DecodeTFExample()
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        # TODO(b/112014711) Implement a custom sink to write the stats proto.
        | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
            output_path,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(
                statistics_pb2.DatasetFeatureStatisticsList)))
  return load_statistics(output_path)
def test_validate_examples_in_tfrecord_no_schema(self):
  temp_dir_path = self.create_tempdir().full_path
  input_data_path = os.path.join(temp_dir_path, 'input_data.tfrecord')
  # By default, StatsOptions does not include a schema.
  options = stats_options.StatsOptions()
  with self.assertRaisesRegex(
      ValueError, 'The specified stats_options must include a schema.'):
    validation_lib.validate_examples_in_tfrecord(
        data_location=input_data_path, stats_options=options)
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and returns the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish to
  create their own Beam pipelines, need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None,
      we use a temporary directory. It will be a TFRecord file containing a
      single data statistics proto, and can be read with the
      'load_statistics' API. If you run this function on Google Cloud, you
      must specify an output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline
      runner (DirectRunner or DataflowRunner), cloud dataflow service
      project id, etc. See
      https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  batch_size = stats_options.desired_batch_size
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> (tf_example_record.TFExampleRecord(
            file_pattern=data_location,
            schema=None,
            telemetry_descriptors=['tfdv', 'generate_statistics_from_tfrecord'])
                         .BeamSource(batch_size))
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >>
        (stats_api.WriteStatisticsToTFRecord(output_path)))
  return stats_util.load_statistics(output_path)
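# Illustrative usage sketch for the convenience entry point above, not from
# the original source: the TFRecord path is a placeholder, and the sketch
# assumes the module-level imports used by the function above. With
# output_path left as None, the statistics are written to a temporary
# directory and then loaded and returned.
def _example_generate_statistics_from_tfrecord():
  stats = generate_statistics_from_tfrecord(
      data_location='/tmp/examples.tfrecord')  # placeholder path
  return stats  # A DatasetFeatureStatisticsList proto.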
def test_identify_anomalous_examples_invalid_max_examples_type(self):
  examples = [{'annotated_enum': np.array(['D'])}]
  options = stats_options.StatsOptions(schema=schema_pb2.Schema())
  max_examples_per_anomaly = 1.5
  with self.assertRaisesRegex(
      TypeError, 'max_examples_per_anomaly must be an integer.'):
    with beam.Pipeline() as p:
      _ = (p | beam.Create(examples)
           | validation_api.IdentifyAnomalousExamples(
               options, max_examples_per_anomaly))
def test_stats_options_with_generators_to_json(self):
  generators = [
      lift_stats_generator.LiftStatsGenerator(
          schema=None,
          y_path=types.FeaturePath(['label']),
          x_paths=[types.FeaturePath(['feature'])])
  ]
  options = stats_options.StatsOptions(generators=generators)
  with self.assertRaisesRegex(ValueError, 'StatsOptions cannot be converted'):
    options.to_json()
def test_example_weight_map(self):
  options = stats_options.StatsOptions()
  self.assertIsNone(options.example_weight_map.get(types.FeaturePath(['f'])))
  self.assertEqual(frozenset([]),
                   options.example_weight_map.all_weight_features())

  options = stats_options.StatsOptions(weight_feature='w')
  self.assertEqual('w',
                   options.example_weight_map.get(types.FeaturePath(['f'])))
  self.assertEqual(frozenset(['w']),
                   options.example_weight_map.all_weight_features())

  options = stats_options.StatsOptions(
      per_feature_weight_override={types.FeaturePath(['x']): 'w'})
  self.assertIsNone(options.example_weight_map.get(types.FeaturePath(['f'])))
  self.assertEqual('w',
                   options.example_weight_map.get(types.FeaturePath(['x'])))
  self.assertEqual(frozenset(['w']),
                   options.example_weight_map.all_weight_features())
def test_validate_instance(self):
  instance = {'annotated_enum': np.array(['D'])}
  schema = text_format.Parse(
      """
      string_domain {
        name: "MyAloneEnum"
        value: "A"
        value: "B"
        value: "C"
      }
      feature {
        name: "annotated_enum"
        value_count { min:1 max:1 }
        presence { min_count: 1 }
        type: BYTES
        domain: "MyAloneEnum"
      }
      feature {
        name: "ignore_this"
        lifecycle_stage: DEPRECATED
        value_count { min:1 }
        presence { min_count: 1 }
        type: BYTES
      }
      """, schema_pb2.Schema())
  expected_anomalies = {
      'annotated_enum':
          text_format.Parse(
              """
              description: "Examples contain values missing from the schema: D "
                           "(~100%). "
              severity: ERROR
              short_description: "Unexpected string values"
              reason {
                type: ENUM_TYPE_UNEXPECTED_STRING_VALUES
                short_description: "Unexpected string values"
                description: "Examples contain values missing from the schema: D "
                             "(~100%). "
              }
              """, anomalies_pb2.AnomalyInfo())
  }
  options = stats_options.StatsOptions(schema=schema)
  anomalies = validation_api.validate_instance(instance, options)
  self._assert_equal_anomalies(anomalies, expected_anomalies)
def __init__(self, options=stats_options.StatsOptions()):
  """Initializes the transform.

  Args:
    options: Options for generating data statistics.

  Raises:
    TypeError: If any of the input options is not of the expected type.
    ValueError: If any of the input options is invalid.
  """
  self._check_options(options)
  self._options = options
def __init__(self, options=stats_options.StatsOptions()):
  """Initializes the transform.

  Args:
    options: `tfdv.StatsOptions` for generating data statistics.

  Raises:
    TypeError: If options is not of the expected type.
  """
  if not isinstance(options, stats_options.StatsOptions):
    raise TypeError('options is of type %s, should be a StatsOptions.' %
                    type(options).__name__)
  self._options = options
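# Illustrative pipeline sketch, not from the original source: applies the
# transform whose initializer is shown above, mirroring the tests in this
# section. `inputs` is a placeholder for data in whatever format this version
# of the transform consumes (e.g. the dict examples or Arrow RecordBatches
# used elsewhere in this file), and `stats_api` is assumed to be the module
# exposing GenerateStatistics, as in the tests.
def _example_generate_statistics_pipeline(inputs):
  with beam.Pipeline() as p:
    _ = (p
         | beam.Create(inputs)
         | stats_api.GenerateStatistics(stats_options.StatsOptions()))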
def generate_statistics_from_dataframe(
    dataframe: DataFrame,
    stats_options: options.StatsOptions = options.StatsOptions(),
    n_jobs: int = 1) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics for the input pandas DataFrame.

  This is a utility method for users with in-memory data represented as a
  pandas DataFrame.

  Args:
    dataframe: Input pandas DataFrame.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    n_jobs: Number of processes to run (defaults to 1). If -1 is provided,
      uses the same number of processes as the number of CPU cores.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if not isinstance(dataframe, DataFrame):
    raise TypeError('dataframe argument is of type {}. Must be a '
                    'pandas DataFrame.'.format(type(dataframe).__name__))

  stats_generators = cast(
      List[stats_generator.CombinerStatsGenerator],
      stats_impl.get_generators(stats_options, in_memory=True))
  if n_jobs < -1 or n_jobs == 0:
    raise ValueError('Invalid n_jobs parameter {}. Should be either '
                     '-1 or >= 1.'.format(n_jobs))

  if n_jobs == -1:
    n_jobs = multiprocessing.cpu_count()
  n_jobs = max(min(n_jobs, multiprocessing.cpu_count()), 1)
  if n_jobs == 1:
    merged_partial_stats = _generate_partial_statistics_from_df(
        dataframe, stats_options, stats_generators)
  else:
    # TODO(pachristopher): Investigate why we don't observe linear speedup
    # after a certain number of processes.
    splits = np.array_split(dataframe, n_jobs)
    partial_stats = Parallel(n_jobs=n_jobs)(
        delayed(_generate_partial_statistics_from_df)(
            splits[i], stats_options, stats_generators)
        for i in range(n_jobs))
    merged_partial_stats = [
        gen.merge_accumulators(stats)
        for gen, stats in zip(stats_generators, zip(*partial_stats))
    ]
  return stats_impl.extract_statistics_output(merged_partial_stats,
                                              stats_generators)
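# Illustrative usage sketch for the DataFrame API above, not from the original
# source: the column names and values are made up, and the sketch assumes the
# pandas DataFrame import already used by the function above. n_jobs=2 splits
# the frame and merges the partial statistics, as implemented above.
def _example_generate_statistics_from_dataframe():
  df = DataFrame({'age': [29, 41, 33], 'city': ['SF', 'NYC', 'SF']})
  return generate_statistics_from_dataframe(df, n_jobs=2)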
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
  """Computes stats for each split of input using tensorflow_data_validation.

  Args:
    input_dict: Input dict from input key to a list of Artifacts.
      - input_data: A list of type `standard_artifacts.Examples`. This
        should contain both 'train' and 'eval' splits.
    output_dict: Output dict from output key to a list of Artifacts.
      - output: A list of type `standard_artifacts.ExampleStatistics`. This
        should contain both the 'train' and 'eval' splits.
    exec_properties: A dict of execution properties. Not used yet.

  Returns:
    None
  """
  self._log_startup(input_dict, output_dict, exec_properties)

  split_uris = []
  for artifact in input_dict['input_data']:
    for split in artifact_utils.decode_split_names(artifact.split_names):
      uri = os.path.join(artifact.uri, split)
      split_uris.append((split, uri))
  with self._make_beam_pipeline() as p:
    # TODO(b/126263006): Support more stats_options through config.
    stats_options = options.StatsOptions()
    for split, uri in split_uris:
      absl.logging.info('Generating statistics for split {}'.format(split))
      input_uri = io_utils.all_files_pattern(uri)
      output_uri = artifact_utils.get_split_uri(output_dict['output'], split)
      output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
      _ = (
          p
          | 'ReadData.' + split >>
          beam.io.ReadFromTFRecord(file_pattern=input_uri)
          | 'DecodeData.' + split >> tf_example_decoder.DecodeTFExample()
          | 'GenerateStatistics.' + split >>
          stats_api.GenerateStatistics(stats_options)
          | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
              output_path,
              shard_name_template='',
              coder=beam.coders.ProtoCoder(
                  statistics_pb2.DatasetFeatureStatisticsList)))
      absl.logging.info('Statistics for split {} written to {}.'.format(
          split, output_uri))
def test_generate_statistics_in_memory_invalid_custom_generator(self):

  # Dummy PTransform that does nothing.
  class CustomPTransform(beam.PTransform):

    def expand(self, pcoll):
      pass

  examples = [{'a': np.array([1.0])}]
  custom_generator = stats_generator.TransformStatsGenerator(
      name='CustomStatsGenerator', ptransform=CustomPTransform())
  options = stats_options.StatsOptions(generators=[custom_generator])
  with self.assertRaisesRegex(
      TypeError, 'Statistics generator.* found object of type '
      'TransformStatsGenerator.'):
    stats_impl.generate_statistics_in_memory(examples, options)
def test_empty_input(self):
  examples = []
  expected_result = text_format.Parse(
      """
      datasets {
        num_examples: 0
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  with beam.Pipeline() as p:
    result = (p | beam.Create(examples)
              | stats_api.GenerateStatistics(stats_options.StatsOptions()))
    util.assert_that(
        result,
        test_util.make_dataset_feature_stats_list_proto_equal_fn(
            self, expected_result))
def generate_statistics_in_memory(examples,
                                  options=stats_options.StatsOptions()):
  """Generates statistics for an in-memory list of examples.

  Args:
    examples: A list of input examples.
    options: Options for generating data statistics.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  stats_generators = get_generators(options, in_memory=True)
  partial_stats = generate_partial_statistics_in_memory(
      examples, options, stats_generators)
  return extract_statistics_output(partial_stats, stats_generators)
def test_stats_options_invalid_slicing_sql_query(self):
  schema = schema_pb2.Schema(
      feature=[
          schema_pb2.Feature(name='feat1', type=schema_pb2.BYTES),
          schema_pb2.Feature(name='feat3', type=schema_pb2.INT),
      ])
  experimental_slice_sqls = [
      """
      SELECT
        STRUCT(feat1, feat2)
      FROM example.feat1, example.feat2
      """
  ]
  with self.assertRaisesRegex(ValueError, 'One of the slice SQL query'):
    stats_options.StatsOptions(
        experimental_slice_sqls=experimental_slice_sqls, schema=schema)