示例#1
0
    def testInfinitysOnly(self):
        # Tests the standard histogram of a float feature whose only values
        # are +inf and -inf: the first and last buckets are expected to be
        # open-ended and to hold one sample each.
        examples = []
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(float('inf'))
        examples.append(example)
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(float('-inf'))
        examples.append(example)

        entries = {}
        for i, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, i)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        numfeat = p.datasets[0].features[0]
        hist = numfeat.num_stats.histograms[0]
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD, hist.type)
        self.assertEqual(10, len(buckets))
        # Lowest bucket starts at -inf and contains the -inf sample.
        self.assertEqual(float('-inf'), buckets[0].low_value)
        self.assertEqual(0.1, buckets[0].high_value)
        self.assertEqual(1, buckets[0].sample_count)
        # Highest bucket ends at +inf and contains the +inf sample.
        self.assertEqual(0.9, buckets[9].low_value)
        self.assertEqual(float('inf'), buckets[9].high_value)
        self.assertEqual(1, buckets[9].sample_count)
示例#2
0
    def testQuantiles(self):
        # Tests the quantiles histogram of an int feature where half of the
        # 100 values are 0..49 and the other half are all 100.
        values = list(range(50)) + [100] * 50
        examples = []
        for value in values:
            example = tf.train.Example()
            example.features.feature['num'].int64_list.value.append(value)
            examples.append(example)

        entries = {}
        for index, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, index)

        p = gfsg.GetDatasetsProto([{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }])

        histograms = p.datasets[0].features[0].num_stats.histograms
        self.assertEqual(2, len(histograms))
        # The second histogram is expected to be the quantiles one.
        quantiles = histograms[1]
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                         quantiles.type)
        buckets = quantiles.buckets
        self.assertEqual(10, len(buckets))

        first_bucket = buckets[0]
        self.assertEqual(0, first_bucket.low_value)
        self.assertEqual(9.9, first_bucket.high_value)
        self.assertEqual(10, first_bucket.sample_count)

        last_bucket = buckets[9]
        self.assertEqual(100, last_bucket.low_value)
        self.assertEqual(100, last_bucket.high_value)
        self.assertEqual(10, last_bucket.sample_count)
示例#3
0
    def testParseExampleSequenceFeatureList(self):
        # Tests parsing sequence examples that each carry one int value in a
        # single entry of the 'num' feature list; the context is left unset.
        examples = []
        for value in range(50):
            seq = tf.train.SequenceExample()
            step = seq.feature_lists.feature_list['num'].feature.add()
            step.int64_list.value.append(value)
            examples.append(seq)

        entries = {}
        for index, seq in enumerate(examples):
            context_features = seq.context.feature
            feature_lists = seq.feature_lists.feature_list
            fs._ParseExample(context_features, feature_lists, entries, index)
        self._check_sequence_example_entries(entries, 50, 1, 1)
示例#4
0
    def testParseExampleSequenceContext(self):
        # Tests parsing sequence examples whose single int value lives in the
        # context field rather than in a feature list.
        examples = []
        for value in range(50):
            seq = tf.train.SequenceExample()
            seq.context.feature['num'].int64_list.value.append(value)
            examples.append(seq)

        entries = {}
        for index, seq in enumerate(examples):
            context_features = seq.context.feature
            feature_lists = seq.feature_lists.feature_list
            fs._ParseExample(context_features, feature_lists, entries, index)
        self._check_sequence_example_entries(entries, 50, 1)
        # Only the 'num' feature was ever set, so one entry is expected.
        self.assertEqual(1, len(entries))
示例#5
0
    def testParseExamplesTypeMismatch(self):
        # Tests that parsing a feature first seen as int64 and then, in a
        # later example, as bytes raises TypeError.
        int_example = tf.train.Example()
        int_example.features.feature['feat'].int64_list.value.append(0)
        bytes_example = tf.train.Example()
        bytes_example.features.feature['feat'].bytes_list.value.append(b'str')
        examples = [int_example, bytes_example]

        entries = {}
        # The first parse establishes 'feat' in entries with the int example.
        fs._ParseExample(examples[0].features.feature, [], entries, 0)

        with self.assertRaises(TypeError):
            fs._ParseExample(examples[1].features.feature, [], entries, 1)
示例#6
0
    def testInfinityAndNan(self):
        # Tests stats of a float feature holding 0..49 plus +inf, -inf and
        # NaN: min/max/mean/median/std_dev are expected to be NaN, and the
        # standard histogram is expected to have open-ended outer buckets
        # and to report the single NaN separately.
        examples = []
        for i in range(50):
            example = tf.train.Example()
            example.features.feature['num'].float_list.value.append(i)
            examples.append(example)
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(float('inf'))
        examples.append(example)
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(float('-inf'))
        examples.append(example)
        example = tf.train.Example()
        example.features.feature['num'].float_list.value.append(float('nan'))
        examples.append(example)

        entries = {}
        for i, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, i)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        numfeat = p.datasets[0].features[0]

        self.assertEqual('num', numfeat.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT, numfeat.type)
        self.assertTrue(np.isnan(numfeat.num_stats.min))
        self.assertTrue(np.isnan(numfeat.num_stats.max))
        self.assertTrue(np.isnan(numfeat.num_stats.mean))
        self.assertTrue(np.isnan(numfeat.num_stats.median))
        self.assertEqual(1, numfeat.num_stats.num_zeros)
        self.assertTrue(np.isnan(numfeat.num_stats.std_dev))
        # 50 finite values + inf + -inf + nan.
        self.assertEqual(53, numfeat.num_stats.common_stats.num_non_missing)
        hist = numfeat.num_stats.histograms[0]
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD, hist.type)
        self.assertEqual(1, hist.num_nan)
        self.assertEqual(10, len(buckets))
        # Outer buckets each hold five finite values plus one infinity.
        self.assertEqual(float('-inf'), buckets[0].low_value)
        self.assertEqual(4.9, buckets[0].high_value)
        self.assertEqual(6, buckets[0].sample_count)
        self.assertEqual(44.1, buckets[9].low_value)
        self.assertEqual(float('inf'), buckets[9].high_value)
        self.assertEqual(6, buckets[9].sample_count)
示例#7
0
    def testParseExampleSequenceFeatureListMultipleEntriesOuter(self):
        # Tests parsing sequence examples whose 'num' feature list holds
        # multiple entries (25 per example), each with a single int value.
        examples = []
        for outer in range(2):
            seq = tf.train.SequenceExample()
            for inner in range(25):
                step = seq.feature_lists.feature_list['num'].feature.add()
                step.int64_list.value.append(outer * 25 + inner)
            examples.append(seq)

        entries = {}
        for index, seq in enumerate(examples):
            context_features = seq.context.feature
            feature_lists = seq.feature_lists.feature_list
            fs._ParseExample(context_features, feature_lists, entries, index)
        self._check_sequence_example_entries(entries, 2, 25, 25)
示例#8
0
    def testGetProtoMultipleDatasets(self):
        # Tests converting multiple datasets into the feature stats proto,
        # including ensuring feature order is consistent in the protos.
        examples1 = []
        for _ in range(2):
            first = tf.train.Example()
            first.features.feature['str'].bytes_list.value.append(b'one')
            first.features.feature['num'].int64_list.value.append(0)
            examples1.append(first)
        # Note: 'num' and 'str' are set in the opposite order from above.
        second = tf.train.Example()
        second.features.feature['num'].int64_list.value.append(1)
        second.features.feature['str'].bytes_list.value.append(b'two')
        examples2 = [second]

        entries1 = {}
        for index, parsed in enumerate(examples1):
            fs._ParseExample(parsed.features.feature, [], entries1, index)
        entries2 = {}
        for index, parsed in enumerate(examples2):
            fs._ParseExample(parsed.features.feature, [], entries2, index)

        dataset1 = {
            'entries': entries1,
            'size': len(examples1),
            'name': 'test1'
        }
        dataset2 = {
            'entries': entries2,
            'size': len(examples2),
            'name': 'test2'
        }
        p = gfsg.GetDatasetsProto([dataset1, dataset2])

        self.assertEqual(2, len(p.datasets))
        test_data_1 = p.datasets[0]
        self.assertEqual('test1', test_data_1.name)
        self.assertEqual(2, test_data_1.num_examples)
        # Locate 'num' in the first dataset; the same index is reused for
        # the second dataset, which is what checks the consistent ordering.
        if test_data_1.features[0].name == 'num':
            num_feat_index = 0
        else:
            num_feat_index = 1
        num_feat_1 = test_data_1.features[num_feat_index]
        self.assertEqual(0, num_feat_1.num_stats.max)
        test_data_2 = p.datasets[1]
        self.assertEqual('test2', test_data_2.name)
        self.assertEqual(1, test_data_2.num_examples)
        num_feat_2 = test_data_2.features[num_feat_index]
        self.assertEqual(1, num_feat_2.num_stats.max)
示例#9
0
    def testParseExampleInt(self):
        # Tests parsing examples that each hold one int value under 'num'.
        examples = []
        for value in range(50):
            example = tf.train.Example()
            example.features.feature['num'].int64_list.value.append(value)
            examples.append(example)

        entries = {}
        for index, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, index)

        self.assertEqual(1, len(entries))
        self.assertIn('num', entries)
        num_info = entries['num']
        self.assertEqual(0, num_info['missing'])
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                         num_info['type'])
        # Every example contributes exactly one value, equal to its index.
        for index, _ in enumerate(examples):
            self.assertEqual(1, num_info['counts'][index])
            self.assertEqual(index, num_info['vals'][index])
示例#10
0
    def testParseExampleMissingValueList(self):
        # Tests parsing a 'str' feature that is touched in one example
        # without any value list being set, and given a bytes value in
        # another example.
        empty_example = tf.train.Example()
        # Presumably the bare map access creates an empty 'str' feature
        # entry (protobuf map semantics) -- the 'missing' count asserted
        # below relies on it. TODO confirm.
        # pylint: disable=pointless-statement
        empty_example.features.feature['str']
        # pylint: enable=pointless-statement
        filled_example = tf.train.Example()
        filled_example.features.feature['str'].bytes_list.value.append(
            b'test')
        examples = [empty_example, filled_example]

        entries = {}
        for index, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, index)

        self.assertEqual(1, len(entries))
        self.assertIn('str', entries)
        str_info = entries['str']
        self.assertEqual(1, str_info['missing'])
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                         str_info['type'])
        counts = str_info['counts']
        self.assertEqual(0, counts[0])
        self.assertEqual(1, counts[1])
示例#11
0
    def testVaryingCountsAndMissing(self):
        # Tests parsing examples where the 'num' feature has a varying
        # number of values per example and is absent from some examples.
        examples = []
        for i in range(5):
            example = tf.train.Example()
            example.features.feature['other'].int64_list.value.append(0)
            # Example i gets i copies of the value i, so the first example
            # (i == 0) ends up with no 'num' feature at all.
            for _ in range(i):
                example.features.feature['num'].int64_list.value.append(i)
            examples.append(example)
        # One more example that only has the 'other' feature.
        example = tf.train.Example()
        example.features.feature['other'].int64_list.value.append(0)
        examples.append(example)

        entries = {}
        for i, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, i)

        info = entries['num']
        self.assertEqual(2, info['missing'])
        self.assertEqual(4, len(info['counts']))
        for i in range(4):
            self.assertEqual(i + 1, info['counts'][i])
        # 1 + 2 + 3 + 4 values in total.
        self.assertEqual(10, len(info['vals']))
示例#12
0
    def testParseExampleStringsAndFloats(self):
        # Tests parsing examples that each hold one bytes value under 'str'
        # and one float value under 'float'.
        examples = []
        for value in range(50):
            example = tf.train.Example()
            feature_map = example.features.feature
            feature_map['str'].bytes_list.value.append(b'hi')
            feature_map['float'].float_list.value.append(value)
            examples.append(example)

        entries = {}
        for index, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, index)

        self.assertEqual(2, len(entries))
        float_type = gfsg.GetFeatureStatsProtoDef().FLOAT
        self.assertEqual(float_type, entries['float']['type'])
        string_type = gfsg.GetFeatureStatsProtoDef().STRING
        self.assertEqual(string_type, entries['str']['type'])
        for index, _ in enumerate(examples):
            self.assertEqual(1, entries['str']['counts'][index])
            self.assertEqual(1, entries['float']['counts'][index])
            self.assertEqual(index, entries['float']['vals'][index])
            # String values are stored as bytes, hence the decode.
            decoded = entries['str']['vals'][index].decode('UTF-8', 'strict')
            self.assertEqual('hi', decoded)
示例#13
0
    def testGetProtoStrings(self):
        # Tests converting string examples into the feature stats proto.
        # The 'str' feature holds 2x 'hello', 3x 'hi' and 1x 'hey'.
        examples = []
        for _ in range(2):
            example = tf.train.Example()
            example.features.feature['str'].bytes_list.value.append(b'hello')
            examples.append(example)
        for _ in range(3):
            example = tf.train.Example()
            example.features.feature['str'].bytes_list.value.append(b'hi')
            examples.append(example)
        example = tf.train.Example()
        example.features.feature['str'].bytes_list.value.append(b'hey')
        examples.append(example)

        entries = {}
        for i, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, i)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('test', test_data.name)
        self.assertEqual(6, test_data.num_examples)

        strfeat = test_data.features[0]
        self.assertEqual('str', strfeat.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING, strfeat.type)
        self.assertEqual(3, strfeat.string_stats.unique)
        # Total length 2*5 + 3*2 + 3 = 19 characters over 6 values.
        self.assertAlmostEqual(19 / 6.0, strfeat.string_stats.avg_length, 4)
        self.assertEqual(0, strfeat.string_stats.common_stats.num_missing)
        self.assertEqual(6, strfeat.string_stats.common_stats.num_non_missing)
        self.assertEqual(1, strfeat.string_stats.common_stats.min_num_values)
        self.assertEqual(1, strfeat.string_stats.common_stats.max_num_values)
        self.assertEqual(1, strfeat.string_stats.common_stats.avg_num_values)
        hist = strfeat.string_stats.common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.6, buckets[0].sample_count)
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(.6, buckets[9].sample_count)

        # Top values are expected in descending frequency order.
        self.assertEqual(2, len(strfeat.string_stats.top_values))
        self.assertEqual(3, strfeat.string_stats.top_values[0].frequency)
        self.assertEqual('hi', strfeat.string_stats.top_values[0].value)
        self.assertEqual(2, strfeat.string_stats.top_values[1].frequency)
        self.assertEqual('hello', strfeat.string_stats.top_values[1].value)

        buckets = strfeat.string_stats.rank_histogram.buckets
        self.assertEqual(3, len(buckets))
        self.assertEqual(0, buckets[0].low_rank)
        self.assertEqual(0, buckets[0].high_rank)
        self.assertEqual(3, buckets[0].sample_count)
        self.assertEqual('hi', buckets[0].label)
        self.assertEqual(2, buckets[2].low_rank)
        self.assertEqual(2, buckets[2].high_rank)
        self.assertEqual(1, buckets[2].sample_count)
        self.assertEqual('hey', buckets[2].label)
示例#14
0
    def testGetProtoNums(self):
        # Tests converting int examples into the feature stats proto.
        # 50 examples hold 'num' values 0..49; one extra example only has
        # the 'other' feature, so 'num' is missing from exactly one example.
        examples = []
        for i in range(50):
            example = tf.train.Example()
            example.features.feature['num'].int64_list.value.append(i)
            examples.append(example)
        example = tf.train.Example()
        example.features.feature['other'].int64_list.value.append(0)
        examples.append(example)

        entries = {}
        for i, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, i)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('test', test_data.name)
        self.assertEqual(51, test_data.num_examples)

        # The feature order is not assumed; pick whichever one is 'num'.
        numfeat = test_data.features[0] if (
            test_data.features[0].name == 'num') else test_data.features[1]
        self.assertEqual('num', numfeat.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, numfeat.type)
        self.assertEqual(0, numfeat.num_stats.min)
        self.assertEqual(49, numfeat.num_stats.max)
        self.assertEqual(24.5, numfeat.num_stats.mean)
        self.assertEqual(24.5, numfeat.num_stats.median)
        self.assertEqual(1, numfeat.num_stats.num_zeros)
        self.assertAlmostEqual(14.430869689, numfeat.num_stats.std_dev, 4)
        self.assertEqual(1, numfeat.num_stats.common_stats.num_missing)
        self.assertEqual(50, numfeat.num_stats.common_stats.num_non_missing)
        self.assertEqual(1, numfeat.num_stats.common_stats.min_num_values)
        self.assertEqual(1, numfeat.num_stats.common_stats.max_num_values)
        self.assertAlmostEqual(1,
                               numfeat.num_stats.common_stats.avg_num_values,
                               4)
        hist = numfeat.num_stats.common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(5, buckets[0].sample_count)
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(5, buckets[9].sample_count)

        # First value histogram: standard buckets.
        self.assertEqual(2, len(numfeat.num_stats.histograms))
        buckets = numfeat.num_stats.histograms[0].buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD,
                         numfeat.num_stats.histograms[0].type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(0, buckets[0].low_value)
        self.assertEqual(4.9, buckets[0].high_value)
        self.assertEqual(5, buckets[0].sample_count)
        self.assertAlmostEqual(44.1, buckets[9].low_value)
        self.assertEqual(49, buckets[9].high_value)
        self.assertEqual(5, buckets[9].sample_count)

        # Second value histogram: quantile buckets.
        buckets = numfeat.num_stats.histograms[1].buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                         numfeat.num_stats.histograms[1].type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(0, buckets[0].low_value)
        self.assertEqual(4.9, buckets[0].high_value)
        self.assertEqual(5, buckets[0].sample_count)
        self.assertAlmostEqual(44.1, buckets[9].low_value)
        self.assertEqual(49, buckets[9].high_value)
        self.assertEqual(5, buckets[9].sample_count)