def ProtoFromTfRecordFiles(files,
                           max_entries=10000,
                           features=None,
                           is_sequence=False,
                           iterator_options=None):
    """Builds a feature statistics proto from a set of TFRecord files.

    Args:
      files: A list of dicts, one per dataset to include in the proto. Each
          dict has a 'path' field with the location of the TFRecord file on
          disk and a 'name' field used to label the dataset in the proto.
      max_entries: The maximum number of examples to load from each dataset
          when building the proto. Defaults to 10000.
      features: A list of strings acting as a whitelist of feature names to
          compute statistics for. If None, every feature in the dataset is
          analyzed. Defaults to None.
      is_sequence: True if the records in 'files' are tf.SequenceExamples,
          False if they are tf.Examples. Defaults to False.
      iterator_options: Forwarded unchanged to _GetTfRecordEntries as its
          last positional argument; its meaning is defined there. Defaults
          to None.

    Returns:
      The feature statistics proto for the provided files.
    """
    datasets = []
    for file_spec in files:
        # Each file is read independently; the helper hands back the parsed
        # entries together with the number of examples it consumed.
        loaded = _GetTfRecordEntries(
            file_spec['path'], max_entries, is_sequence, iterator_options)
        parsed_entries, num_examples = loaded
        datasets.append({
            'entries': parsed_entries,
            'size': num_examples,
            'name': file_spec['name'],
        })
    return gfsg.GetDatasetsProto(datasets, features)
def testQuantiles(self):
    # Checks the quantiles histogram for a feature whose values are the
    # integers 0..49 followed by fifty copies of 100.
    values = list(range(50)) + [100] * 50
    examples = []
    for value in values:
        ex = tf.train.Example()
        ex.features.feature['num'].int64_list.value.append(value)
        examples.append(ex)

    entries = {}
    for index, ex in enumerate(examples):
        fs._ParseExample(ex.features.feature, [], entries, index)

    proto = gfsg.GetDatasetsProto([{
        'entries': entries,
        'size': len(examples),
        'name': 'test',
    }])
    num_feature = proto.datasets[0].features[0]
    histograms = num_feature.num_stats.histograms
    self.assertEqual(2, len(histograms))

    # The second histogram is expected to be the quantiles one.
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                     histograms[1].type)
    buckets = histograms[1].buckets
    self.assertEqual(10, len(buckets))

    first_bucket = buckets[0]
    self.assertEqual(0, first_bucket.low_value)
    self.assertEqual(9.9, first_bucket.high_value)
    self.assertEqual(10, first_bucket.sample_count)

    last_bucket = buckets[9]
    self.assertEqual(100, last_bucket.low_value)
    self.assertEqual(100, last_bucket.high_value)
    self.assertEqual(10, last_bucket.sample_count)
def testInfinitysOnly(self):
    # Checks the standard histogram when the only values of the feature are
    # +infinity and -infinity.
    examples = []
    for value in (float('inf'), float('-inf')):
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(value)
        examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    numfeat = p.datasets[0].features[0]
    hist = numfeat.num_stats.histograms[0]
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD, hist.type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(float('-inf'), buckets[0].low_value)
    self.assertEqual(0.1, buckets[0].high_value)
    self.assertEqual(1, buckets[0].sample_count)
    # assertEqual rather than the assertEquals alias: the alias was
    # deprecated and is removed in Python 3.12.
    self.assertEqual(0.9, buckets[9].low_value)
    self.assertEqual(float('inf'), buckets[9].high_value)
    self.assertEqual(1, buckets[9].sample_count)
def testGetDatasetsProtoFromEntriesLists(self):
    # Feeds GetDatasetsProto a hand-built entries dict (no example parsing)
    # and checks the resulting dataset, feature and histogram fields.
    entries = {
        'testFeature': {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': gfsg.GetFeatureStatsProtoDef().INT,
        },
    }
    proto = gfsg.GetDatasetsProto(
        [{'entries': entries, 'size': 3, 'name': 'testDataset'}])

    self.assertEqual(1, len(proto.datasets))
    dataset = proto.datasets[0]
    self.assertEqual('testDataset', dataset.name)
    self.assertEqual(3, dataset.num_examples)
    self.assertEqual(1, len(dataset.features))

    feature = dataset.features[0]
    self.assertEqual('testFeature', feature.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, feature.type)
    self.assertEqual(1, feature.num_stats.min)
    self.assertEqual(3, feature.num_stats.max)

    num_values_hist = feature.num_stats.common_stats.num_values_histogram
    buckets = num_values_hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                     num_values_hist.type)
    self.assertEqual(10, len(buckets))
    # The first and last buckets carry identical expectations.
    for bucket_index in (0, 9):
        bucket = buckets[bucket_index]
        self.assertEqual(1, bucket.low_value)
        self.assertEqual(1, bucket.high_value)
        self.assertEqual(.3, bucket.sample_count)
def testGetDatasetsProtoWithWhitelist(self):
    # Passes a features whitelist naming only 'testFeature' and checks that
    # it is the single feature present in the resulting proto.
    int_type = gfsg.GetFeatureStatsProtoDef().INT
    entries = {
        'testFeature': {
            'vals': [1, 2, 3],
            'counts': [1, 1, 1],
            'missing': 0,
            'type': int_type,
        },
        'ignoreFeature': {
            'vals': [5, 6],
            'counts': [1, 1],
            'missing': 1,
            'type': int_type,
        },
    }
    proto = gfsg.GetDatasetsProto(
        [{'entries': entries, 'size': 3, 'name': 'testDataset'}],
        features=['testFeature'])

    self.assertEqual(1, len(proto.datasets))
    dataset = proto.datasets[0]
    self.assertEqual('testDataset', dataset.name)
    self.assertEqual(3, dataset.num_examples)
    self.assertEqual(1, len(dataset.features))

    kept_feature = dataset.features[0]
    self.assertEqual('testFeature', kept_feature.name)
    self.assertEqual(1, kept_feature.num_stats.min)
def testInfinityAndNan(self):
    # Checks numeric stats and the standard histogram for a float feature
    # holding 0..49 plus +infinity, -infinity and NaN.
    values = list(range(50))
    values.extend([float('inf'), float('-inf'), float('nan')])
    examples = []
    for value in values:
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(value)
        examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    numfeat = p.datasets[0].features[0]

    self.assertEqual('num', numfeat.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT, numfeat.type)
    # The aggregate statistics are all expected to be NaN for this input.
    self.assertTrue(np.isnan(numfeat.num_stats.min))
    self.assertTrue(np.isnan(numfeat.num_stats.max))
    self.assertTrue(np.isnan(numfeat.num_stats.mean))
    self.assertTrue(np.isnan(numfeat.num_stats.median))
    self.assertEqual(1, numfeat.num_stats.num_zeros)
    self.assertTrue(np.isnan(numfeat.num_stats.std_dev))
    self.assertEqual(53, numfeat.num_stats.common_stats.num_non_missing)

    hist = numfeat.num_stats.histograms[0]
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD, hist.type)
    self.assertEqual(1, hist.num_nan)
    self.assertEqual(10, len(buckets))
    self.assertEqual(float('-inf'), buckets[0].low_value)
    self.assertEqual(4.9, buckets[0].high_value)
    self.assertEqual(6, buckets[0].sample_count)
    # assertEqual rather than the assertEquals alias: the alias was
    # deprecated and is removed in Python 3.12.
    self.assertEqual(44.1, buckets[9].low_value)
    self.assertEqual(float('inf'), buckets[9].high_value)
    self.assertEqual(6, buckets[9].sample_count)
def testGetProtoMultipleDatasets(self):
    # Tests converting multiple datasets into the feature stats proto,
    # including ensuring feature order is consistent in the protos.

    # First dataset: two examples, each adding 'str' before 'num'.
    first_examples = []
    for _ in range(2):
        ex = tf.train.Example()
        ex.features.feature['str'].bytes_list.value.append(b'one')
        ex.features.feature['num'].int64_list.value.append(0)
        first_examples.append(ex)

    # Second dataset: one example, adding the features in the other order.
    second_example = tf.train.Example()
    second_example.features.feature['num'].int64_list.value.append(1)
    second_example.features.feature['str'].bytes_list.value.append(b'two')
    second_examples = [second_example]

    first_entries = {}
    for index, ex in enumerate(first_examples):
        fs._ParseExample(ex.features.feature, [], first_entries, index)
    second_entries = {}
    for index, ex in enumerate(second_examples):
        fs._ParseExample(ex.features.feature, [], second_entries, index)

    proto = gfsg.GetDatasetsProto([
        {
            'entries': first_entries,
            'size': len(first_examples),
            'name': 'test1',
        },
        {
            'entries': second_entries,
            'size': len(second_examples),
            'name': 'test2',
        },
    ])

    self.assertEqual(2, len(proto.datasets))
    first_dataset = proto.datasets[0]
    self.assertEqual('test1', first_dataset.name)
    self.assertEqual(2, first_dataset.num_examples)

    # Locate 'num' in the first dataset; the same index is then reused for
    # the second dataset, which is what verifies the consistent ordering.
    if first_dataset.features[0].name == 'num':
        num_feat_index = 0
    else:
        num_feat_index = 1
    self.assertEqual(0, first_dataset.features[num_feat_index].num_stats.max)

    second_dataset = proto.datasets[1]
    self.assertEqual('test2', second_dataset.name)
    self.assertEqual(1, second_dataset.num_examples)
    self.assertEqual(1, second_dataset.features[num_feat_index].num_stats.max)
def testGetDatasetsProtoSequenceExampleHistogram(self):
    # Supplies 'feat_lens' in a hand-built entry and checks the resulting
    # feature_list_length_histogram in the common stats.
    feature_entry = {
        'vals': [1, 2, 2, 3],
        'counts': [1, 2, 1],
        'feat_lens': [1, 2, 1],
        'missing': 0,
        'type': gfsg.GetFeatureStatsProtoDef().INT,
    }
    entries = {'testFeature': feature_entry}
    proto = gfsg.GetDatasetsProto(
        [{'entries': entries, 'size': 3, 'name': 'testDataset'}])

    common_stats = proto.datasets[0].features[0].num_stats.common_stats
    length_hist = common_stats.feature_list_length_histogram
    buckets = length_hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, length_hist.type)
    self.assertEqual(10, len(buckets))

    first_bucket = buckets[0]
    self.assertEqual(1, first_bucket.low_value)
    self.assertEqual(1, first_bucket.high_value)
    self.assertEqual(.3, first_bucket.sample_count)

    last_bucket = buckets[9]
    self.assertEqual(1.8, last_bucket.low_value)
    self.assertEqual(2, last_bucket.high_value)
    self.assertEqual(.3, last_bucket.sample_count)
def testGetProtoStrings(self):
    # Tests converting string examples into the feature stats proto.
    # Input: 'hello' twice, 'hi' three times, 'hey' once (6 examples,
    # 19 bytes in total, hence the expected average length of 19 / 6).
    examples = []
    for value in [b'hello'] * 2 + [b'hi'] * 3 + [b'hey']:
        example = tf.train.Example()
        example.features.feature['str'].bytes_list.value.append(value)
        examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    self.assertEqual(1, len(p.datasets))
    test_data = p.datasets[0]
    self.assertEqual('test', test_data.name)
    self.assertEqual(6, test_data.num_examples)

    strfeat = test_data.features[0]
    self.assertEqual('str', strfeat.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING, strfeat.type)
    self.assertEqual(3, strfeat.string_stats.unique)
    self.assertAlmostEqual(19 / 6.0, strfeat.string_stats.avg_length, 4)
    self.assertEqual(0, strfeat.string_stats.common_stats.num_missing)
    self.assertEqual(6, strfeat.string_stats.common_stats.num_non_missing)
    self.assertEqual(1, strfeat.string_stats.common_stats.min_num_values)
    self.assertEqual(1, strfeat.string_stats.common_stats.max_num_values)
    self.assertEqual(1, strfeat.string_stats.common_stats.avg_num_values)

    hist = strfeat.string_stats.common_stats.num_values_histogram
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(1, buckets[0].low_value)
    self.assertEqual(1, buckets[0].high_value)
    self.assertEqual(.6, buckets[0].sample_count)
    # assertEqual rather than the assertEquals alias: the alias was
    # deprecated and is removed in Python 3.12.
    self.assertEqual(1, buckets[9].low_value)
    self.assertEqual(1, buckets[9].high_value)
    self.assertEqual(.6, buckets[9].sample_count)

    self.assertEqual(2, len(strfeat.string_stats.top_values))
    self.assertEqual(3, strfeat.string_stats.top_values[0].frequency)
    self.assertEqual('hi', strfeat.string_stats.top_values[0].value)
    self.assertEqual(2, strfeat.string_stats.top_values[1].frequency)
    self.assertEqual('hello', strfeat.string_stats.top_values[1].value)

    buckets = strfeat.string_stats.rank_histogram.buckets
    self.assertEqual(3, len(buckets))
    self.assertEqual(0, buckets[0].low_rank)
    self.assertEqual(0, buckets[0].high_rank)
    self.assertEqual(3, buckets[0].sample_count)
    self.assertEqual('hi', buckets[0].label)
    self.assertEqual(2, buckets[2].low_rank)
    self.assertEqual(2, buckets[2].high_rank)
    self.assertEqual(1, buckets[2].sample_count)
    self.assertEqual('hey', buckets[2].label)
def testGetProtoNums(self):
    # Tests converting int examples into the feature stats proto.
    # Input: 50 examples with 'num' = 0..49, plus one example that only has
    # the feature 'other', so 'num' is missing from exactly one example.
    examples = []
    for i in range(50):
        example = tf.train.Example()
        example.features.feature['num'].int64_list.value.append(i)
        examples.append(example)
    example = tf.train.Example()
    example.features.feature['other'].int64_list.value.append(0)
    examples.append(example)

    entries = {}
    for i, example in enumerate(examples):
        fs._ParseExample(example.features.feature, [], entries, i)

    datasets = [{
        'entries': entries,
        'size': len(examples),
        'name': 'test'
    }]
    p = gfsg.GetDatasetsProto(datasets)

    self.assertEqual(1, len(p.datasets))
    test_data = p.datasets[0]
    self.assertEqual('test', test_data.name)
    self.assertEqual(51, test_data.num_examples)

    # Pick whichever of the two features is 'num'; the test does not rely
    # on the order in which features appear in the proto.
    numfeat = test_data.features[0] if (
        test_data.features[0].name == 'num') else test_data.features[1]
    self.assertEqual('num', numfeat.name)
    self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, numfeat.type)
    self.assertEqual(0, numfeat.num_stats.min)
    self.assertEqual(49, numfeat.num_stats.max)
    self.assertEqual(24.5, numfeat.num_stats.mean)
    self.assertEqual(24.5, numfeat.num_stats.median)
    self.assertEqual(1, numfeat.num_stats.num_zeros)
    self.assertAlmostEqual(14.430869689, numfeat.num_stats.std_dev, 4)
    self.assertEqual(1, numfeat.num_stats.common_stats.num_missing)
    self.assertEqual(50, numfeat.num_stats.common_stats.num_non_missing)
    self.assertEqual(1, numfeat.num_stats.common_stats.min_num_values)
    self.assertEqual(1, numfeat.num_stats.common_stats.max_num_values)
    self.assertAlmostEqual(1, numfeat.num_stats.common_stats.avg_num_values, 4)

    hist = numfeat.num_stats.common_stats.num_values_histogram
    buckets = hist.buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(1, buckets[0].low_value)
    self.assertEqual(1, buckets[0].high_value)
    self.assertEqual(5, buckets[0].sample_count)
    # assertEqual rather than the assertEquals alias: the alias was
    # deprecated and is removed in Python 3.12.
    self.assertEqual(1, buckets[9].low_value)
    self.assertEqual(1, buckets[9].high_value)
    self.assertEqual(5, buckets[9].sample_count)

    self.assertEqual(2, len(numfeat.num_stats.histograms))

    # First value histogram: standard type.
    buckets = numfeat.num_stats.histograms[0].buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD,
                     numfeat.num_stats.histograms[0].type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(0, buckets[0].low_value)
    self.assertEqual(4.9, buckets[0].high_value)
    self.assertEqual(5, buckets[0].sample_count)
    self.assertAlmostEqual(44.1, buckets[9].low_value)
    self.assertEqual(49, buckets[9].high_value)
    self.assertEqual(5, buckets[9].sample_count)

    # Second value histogram: quantiles type.
    buckets = numfeat.num_stats.histograms[1].buckets
    self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                     numfeat.num_stats.histograms[1].type)
    self.assertEqual(10, len(buckets))
    self.assertEqual(0, buckets[0].low_value)
    self.assertEqual(4.9, buckets[0].high_value)
    self.assertEqual(5, buckets[0].sample_count)
    self.assertAlmostEqual(44.1, buckets[9].low_value)
    self.assertEqual(49, buckets[9].high_value)
    self.assertEqual(5, buckets[9].sample_count)