def assert_on_two_protos_with_different_num_examples(self):
  """Runs the stats-proto equality assertion on protos whose num_examples differ.

  NOTE(review): the two protos below are intentionally unequal (num_examples
  1 vs 2), so assert_dataset_feature_stats_proto_equal is expected to fail
  here — presumably the caller wraps this in assertRaises; verify at call site.
  """
  expected = text_format.Parse(
      """
      num_examples: 1
      features {
        path { step: 'fa' }
        type: STRING
        string_stats { unique: 4 }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  actual = text_format.Parse(
      """
      num_examples: 2
      features {
        path { step: 'fa' }
        type: STRING
        string_stats { unique: 4 }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  test_util.assert_dataset_feature_stats_proto_equal(self, actual, expected)
def assert_on_two_protos_with_different_numbers_of_features(self):
  """Runs the stats-proto equality assertion on protos with different feature counts.

  NOTE(review): `expected` has two features, `actual` only one, so the
  equality assertion is expected to fail — presumably invoked under
  assertRaises by the caller; verify at call site. Also note these protos
  use the `name:` feature field while sibling tests use `path { step }`;
  confirm that is intentional.
  """
  expected = text_format.Parse(
      """
      features {
        name: 'fa'
        type: STRING
        string_stats { unique: 4 }
      }
      features {
        name: 'fb'
        type: STRING
        string_stats { unique: 5 }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  actual = text_format.Parse(
      """
      features {
        name: 'fa'
        type: STRING
        string_stats { unique: 4 }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  test_util.assert_dataset_feature_stats_proto_equal(self, actual, expected)
def test_make_dataset_feature_stats_proto_topk_single(self):
  """Checks top-k stats for one categorical feature: top values + rank histogram."""
  expected_result = text_format.Parse(
      """
      features {
        string_stats {
          top_values { value: "e" frequency: 20.0 }
          top_values { value: "d" frequency: 20.0 }
          top_values { value: "a" frequency: 15.0 }
          rank_histogram {
            buckets { label: "e" sample_count: 20.0 }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "d"
              sample_count: 20.0
            }
          }
        }
        path { step: "fa" }
      }
      """, statistics_pb2.DatasetFeatureStatistics())
  # Five distinct values; only the top 3 survive num_top_values and only the
  # top 2 make it into the rank histogram.
  value_count_list = [
      top_k_uniques_stats_util.FeatureValueCount(value, count)
      for value, count in [('e', 20), ('d', 20), ('a', 15), ('c', 10),
                           ('b', 5)]
  ]
  result = top_k_uniques_stats_util.make_dataset_feature_stats_proto_topk_single(
      types.FeaturePath(['fa']).steps(),
      value_count_list=value_count_list,
      categorical_features=frozenset(
          [types.FeaturePath(['fa']),
           types.FeaturePath(['fb'])]),
      is_weighted_stats=False,
      num_top_values=3,
      frequency_threshold=1,
      num_rank_histogram_buckets=2)
  test_util.assert_dataset_feature_stats_proto_equal(self, result,
                                                     expected_result)
def test_stats_gen_with_dataframe(self):
  """Generates statistics from a pandas DataFrame loaded from a CSV file."""
  records, _, expected_result = self._get_csv_test(
      delimiter=',', with_header=True)
  input_data_path = self._write_records_to_csv(records, self._get_temp_dir(),
                                               'input_data.csv')
  dataframe = pd.read_csv(input_data_path)

  result = stats_gen_lib.generate_statistics_from_dataframe(
      dataframe=dataframe, stats_options=self._default_stats_options)

  self.assertLen(result.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, result.datasets[0], expected_result.datasets[0])
def test_generate_statistics_in_memory(self,
                                       examples,
                                       options,
                                       expected_result_proto_text,
                                       schema=None):
  """Parameterized check of in-memory stats generation against expected protos."""
  expected_result = text_format.Parse(
      expected_result_proto_text,
      statistics_pb2.DatasetFeatureStatisticsList())
  if schema is not None:
    options.schema = schema
  result = stats_impl.generate_statistics_in_memory(examples, options)
  # generate_statistics_in_memory does not deterministically order multiple
  # features within a DatasetFeatureStatistics proto, so
  # compare.assertProtoEqual (which requires identical repeated-field
  # ordering) cannot be used; the order-insensitive helper is used instead.
  test_util.assert_dataset_feature_stats_proto_equal(
      self, result.datasets[0], expected_result.datasets[0])
def test_write_stats_to_tfrecrod(self):
  """Writes statistics through the Beam TFRecord sink and reads them back.

  NOTE(review): method name has a typo ('tfrecrod' vs 'tfrecord'); the
  `test_` prefix is intact so discovery still works — renaming could break
  explicit test filters, so flagging rather than fixing here.
  """
  stats = text_format.Parse(
      """
      datasets { name: 'x' num_examples: 100 }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  output_path = os.path.join(self._get_temp_dir(), 'stats')

  # Run a trivial pipeline that writes the single stats proto to disk.
  with beam.Pipeline() as p:
    _ = (p
         | beam.Create([stats])
         | stats_api.WriteStatisticsToTFRecord(output_path))

  # Round-trip: what was written must equal what went in.
  stats_from_file = stats_util.load_statistics(output_path)
  self.assertLen(stats_from_file.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_file.datasets[0], stats.datasets[0])
def test_stats_gen_with_dataframe_feature_allowlist(self):
  """Checks that feature_allowlist excludes non-listed DataFrame columns.

  A column not in the allowlist ('to_be_removed_column') is appended to the
  DataFrame after the allowlist is captured; the resulting statistics must
  match the expected result computed without that column.
  """
  import copy  # local: only this test needs a defensive copy

  records, _, expected_result = self._get_csv_test(
      delimiter=',', with_header=True)
  input_data_path = self._write_records_to_csv(records, self._get_temp_dir(),
                                               'input_data.csv')
  dataframe = pd.read_csv(input_data_path)

  # Fix: copy the shared default options instead of aliasing them. The
  # original mutated self._default_stats_options in place (setting
  # feature_allowlist on it), which could leak into other tests that reuse
  # the same options object.
  stats_options_allowlist = copy.copy(self._default_stats_options)
  stats_options_allowlist.feature_allowlist = list(dataframe.columns)

  # Add a column that is NOT in the allowlist; it must be ignored.
  dataframe['to_be_removed_column'] = [
      [1, 2], [], None, [1], None, [3, 4], [], None
  ]

  result = stats_gen_lib.generate_statistics_from_dataframe(
      dataframe=dataframe,
      stats_options=stats_options_allowlist,
      n_jobs=1)

  self.assertLen(result.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, result.datasets[0], expected_result.datasets[0])
def test_get_combined_statistics_allowlist_features(self):
  """Verifies _get_combined_statistics keeps only the allowlisted features."""
  statistics = text_format.Parse(
      """
      datasets {
        name: 'test'
        features { path { step: 'a' } type: FLOAT }
        features { path { step: 'c' } type: INT }
        features { path { step: 'b' } type: STRING }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  # Feature 'c' is absent from the allowlist and must be dropped.
  expected_output = text_format.Parse(
      """
      datasets {
        name: 'test'
        features { path { step: 'a' } type: FLOAT }
        features { path { step: 'b' } type: STRING }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  allowlist = [types.FeaturePath(['a']), types.FeaturePath(['b'])]
  actual_output = display_util._get_combined_statistics(
      statistics, allowlist_features=allowlist)
  self.assertLen(actual_output.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, actual_output.datasets[0], expected_output.datasets[0])
def assert_on_two_protos_with_same_features_in_same_order(self):
  """Asserts equality of two identical protos (same features, same order)."""
  proto_text = """
      features {
        path { step: 'fa' }
        type: STRING
        string_stats { unique: 4 }
      }
      features {
        path { step: 'fb' }
        type: STRING
        string_stats { unique: 5 }
      }
      """
  expected = text_format.Parse(proto_text,
                               statistics_pb2.DatasetFeatureStatistics())
  actual = text_format.Parse(proto_text,
                             statistics_pb2.DatasetFeatureStatistics())
  test_util.assert_dataset_feature_stats_proto_equal(self, actual, expected)
def test_write_stats_to_binary_file(self):
  """Writes statistics via the Beam binary-file sink and parses them back."""
  stats = text_format.Parse(
      """
      datasets { name: 'x' num_examples: 100 }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  output_path = os.path.join(self._get_temp_dir(), 'stats')

  with beam.Pipeline() as p:
    _ = (p
         | beam.Create([stats])
         | stats_api.WriteStatisticsToBinaryFile(output_path))

  # Read the raw serialized proto back and verify a lossless round trip.
  serialized_stats = io_util.read_file_to_string(
      output_path, binary_mode=True)
  stats_from_file = statistics_pb2.DatasetFeatureStatisticsList()
  stats_from_file.ParseFromString(serialized_stats)
  self.assertLen(stats_from_file.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_file.datasets[0], stats.datasets[0])
def test_write_stats_to_tfrecord_and_binary(self):
  """Writes two stats protos to both sinks and checks the merged round trip.

  WriteStatisticsToRecordsAndBinaryFile receives two single-feature
  statistics lists; both the binary file and the sharded records must
  contain the combined dataset with both features.
  """
  stats1 = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
        features: { path: { step: "f1" } }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  stats2 = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
        features: { path: { step: "f2" } }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())
  stats_combined = text_format.Parse(
      """
      datasets {
        name: 'x'
        num_examples: 100
        features: { path: { step: "f1" } }
        features: { path: { step: "f2" } }
      }
      """, statistics_pb2.DatasetFeatureStatisticsList())

  output_path_binary = os.path.join(self._get_temp_dir(), 'stats.pb')
  output_path_prefix = os.path.join(self._get_temp_dir(), 'stats_shards')

  with beam.Pipeline() as p:
    _ = (p
         | beam.Create([stats1, stats2])
         | stats_api.WriteStatisticsToRecordsAndBinaryFile(
             output_path_binary, output_path_prefix))

  # The binary file must hold the single combined dataset.
  serialized_stats = io_util.read_file_to_string(
      output_path_binary, binary_mode=True)
  stats_from_pb = statistics_pb2.DatasetFeatureStatisticsList()
  stats_from_pb.ParseFromString(serialized_stats)
  self.assertLen(stats_from_pb.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_pb.datasets[0], stats_combined.datasets[0])

  # The sharded records, once loaded, must agree with the same combined view.
  stats_from_shards = stats_util.load_sharded_statistics(
      output_path_prefix + '*').proto()
  self.assertLen(stats_from_shards.datasets, 1)
  test_util.assert_dataset_feature_stats_proto_equal(
      self, stats_from_shards.datasets[0], stats_combined.datasets[0])