def testGetDatasetsProtoFromEntriesLists(self):
        """Checks GetDatasetsProto on hand-built entries holding plain lists.

        A single INT feature is described directly as an entries dict; the
        test then verifies the dataset metadata, the min/max stats and the
        first and last buckets of the num-values quantile histogram.
        """
        feature_entries = {
            'testFeature': {
                'vals': [1, 2, 3],
                'counts': [1, 1, 1],
                'missing': 0,
                'type': gfsg.GetFeatureStatsProtoDef().INT
            }
        }
        proto = gfsg.GetDatasetsProto([{
            'entries': feature_entries,
            'size': 3,
            'name': 'testDataset'
        }])

        # Dataset-level metadata.
        self.assertEqual(1, len(proto.datasets))
        dataset = proto.datasets[0]
        self.assertEqual('testDataset', dataset.name)
        self.assertEqual(3, dataset.num_examples)
        self.assertEqual(1, len(dataset.features))

        # Feature-level numeric stats.
        feature = dataset.features[0]
        self.assertEqual('testFeature', feature.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, feature.type)
        self.assertEqual(1, feature.num_stats.min)
        self.assertEqual(3, feature.num_stats.max)

        # Every example carries exactly one value, so the first and last
        # quantile buckets are expected to look the same.
        count_hist = feature.num_stats.common_stats.num_values_histogram
        count_buckets = count_hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                         count_hist.type)
        self.assertEqual(10, len(count_buckets))
        for bucket_index in (0, 9):
            bucket = count_buckets[bucket_index]
            self.assertEqual(1, bucket.low_value)
            self.assertEqual(1, bucket.high_value)
            self.assertEqual(.3, bucket.sample_count)
    def testProtoFromDataFrames(self):
        """Checks ProtoFromDataFrames on a small int/string DataFrame.

        The resulting proto must hold one dataset with two features; the
        int column is checked for type and min/max, the string column for
        type and number of unique values.
        """
        frame = pd.DataFrame(
            [[1, 'hi'], [2, 'hello'], [3, 'hi']],
            columns=['testFeatureInt', 'testFeatureString'])
        proto = gfsg.ProtoFromDataFrames([{
            'table': frame,
            'name': 'testDataset'
        }])

        self.assertEqual(1, len(proto.datasets))
        dataset = proto.datasets[0]
        self.assertEqual('testDataset', dataset.name)
        self.assertEqual(3, dataset.num_examples)
        self.assertEqual(2, len(dataset.features))

        # The test does not depend on feature order: locate the int feature
        # by name and treat the other slot as the string feature.
        int_slot = 0 if dataset.features[0].name == 'testFeatureInt' else 1
        int_feature = dataset.features[int_slot]
        str_feature = dataset.features[1 - int_slot]

        self.assertEqual('testFeatureInt', int_feature.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                         int_feature.type)
        self.assertEqual(1, int_feature.num_stats.min)
        self.assertEqual(3, int_feature.num_stats.max)
        self.assertEqual('testFeatureString', str_feature.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                         str_feature.type)
        self.assertEqual(2, str_feature.string_stats.unique)
    def testGetDatasetsProtoWithWhitelist(self):
        """Checks that the `features` argument limits the emitted features.

        Two INT features are supplied but only 'testFeature' is requested,
        so the dataset proto is expected to contain just that one.
        """
        feature_entries = {
            'testFeature': {
                'vals': [1, 2, 3],
                'counts': [1, 1, 1],
                'missing': 0,
                'type': gfsg.GetFeatureStatsProtoDef().INT
            },
            'ignoreFeature': {
                'vals': [5, 6],
                'counts': [1, 1],
                'missing': 1,
                'type': gfsg.GetFeatureStatsProtoDef().INT
            }
        }
        dataset_specs = [{
            'entries': feature_entries,
            'size': 3,
            'name': 'testDataset'
        }]
        proto = gfsg.GetDatasetsProto(dataset_specs,
                                      features=['testFeature'])

        self.assertEqual(1, len(proto.datasets))
        dataset = proto.datasets[0]
        self.assertEqual('testDataset', dataset.name)
        self.assertEqual(3, dataset.num_examples)
        # Only the whitelisted feature should be present.
        self.assertEqual(1, len(dataset.features))
        kept_feature = dataset.features[0]
        self.assertEqual('testFeature', kept_feature.name)
        self.assertEqual(1, kept_feature.num_stats.min)
示例#4
0
    def testInfinityAndNan(self):
        """Checks numeric stats when a float feature holds +/-inf and NaN.

        Parses 50 finite values plus inf, -inf and nan for feature 'num',
        then verifies that min/max/mean/median/std_dev are NaN and inspects
        the first and last buckets of the standard histogram.
        """
        feature_values = list(range(50))
        feature_values += [float('inf'), float('-inf'), float('nan')]

        examples = []
        for value in feature_values:
            example = tf.train.Example()
            example.features.feature['num'].float_list.value.append(value)
            examples.append(example)

        entries = {}
        for index, parsed_example in enumerate(examples):
            fs._ParseExample(parsed_example.features.feature, [], entries,
                             index)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        numfeat = p.datasets[0].features[0]

        self.assertEqual('num', numfeat.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT, numfeat.type)
        self.assertTrue(np.isnan(numfeat.num_stats.min))
        self.assertTrue(np.isnan(numfeat.num_stats.max))
        self.assertTrue(np.isnan(numfeat.num_stats.mean))
        self.assertTrue(np.isnan(numfeat.num_stats.median))
        self.assertEqual(1, numfeat.num_stats.num_zeros)
        self.assertTrue(np.isnan(numfeat.num_stats.std_dev))
        self.assertEqual(53, numfeat.num_stats.common_stats.num_non_missing)

        hist = numfeat.num_stats.histograms[0]
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().STANDARD, hist.type)
        self.assertEqual(1, hist.num_nan)
        self.assertEqual(10, len(buckets))
        # The infinite values are expected in the outermost buckets, whose
        # outer edges are -inf and +inf.
        self.assertEqual(float('-inf'), buckets[0].low_value)
        self.assertEqual(4.9, buckets[0].high_value)
        self.assertEqual(6, buckets[0].sample_count)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(44.1, buckets[9].low_value)
        self.assertEqual(float('inf'), buckets[9].high_value)
        self.assertEqual(6, buckets[9].sample_count)
示例#5
0
    def testParseExampleStringsAndFloats(self):
        """Checks _ParseExample on examples mixing a string and a float.

        Each of the 50 examples carries the bytes value b'hi' under 'str'
        and its own position under 'float'.
        """
        examples = []
        for value in range(50):
            ex = tf.train.Example()
            feature_map = ex.features.feature
            feature_map['str'].bytes_list.value.append(b'hi')
            feature_map['float'].float_list.value.append(value)
            examples.append(ex)

        parsed = {}
        for position, ex in enumerate(examples):
            fs._ParseExample(ex.features.feature, [], parsed, position)

        self.assertEqual(2, len(parsed))
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT,
                         parsed['float']['type'])
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                         parsed['str']['type'])

        str_entry = parsed['str']
        float_entry = parsed['float']
        for position, _ in enumerate(examples):
            # One value per example for both features.
            self.assertEqual(1, str_entry['counts'][position])
            self.assertEqual(1, float_entry['counts'][position])
            self.assertEqual(position, float_entry['vals'][position])
            self.assertEqual(
                'hi', str_entry['vals'][position].decode('UTF-8', 'strict'))
示例#6
0
 def _check_sequence_example_entries(self, entries, n_examples, n_features,
                                     feat_len=None):
     """Asserts the parsed 'num' entry matches the expected sequence layout.

     Args:
       entries: Dictionary of parsed features; must contain key 'num'.
       n_examples: Number of examples whose per-example counts are checked.
       n_features: Expected value count recorded for every example.
       feat_len: Expected feature-list length per example. When None, the
           entry's 'feat_lens' list is expected to be empty instead.
     """
     self.assertIn('num', entries)
     num_entry = entries['num']
     self.assertEqual(0, num_entry['missing'])
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, num_entry['type'])

     expects_lengths = feat_len is not None
     for example_idx in range(n_examples):
         self.assertEqual(n_features, num_entry['counts'][example_idx])
         if expects_lengths:
             self.assertEqual(feat_len, num_entry['feat_lens'][example_idx])

     # Values are expected to be the running sequence 0, 1, 2, ...
     total_values = n_examples * n_features
     for value_idx in range(total_values):
         self.assertEqual(value_idx, num_entry['vals'][value_idx])

     if not expects_lengths:
         self.assertEqual(0, len(num_entry['feat_lens']))
 def testDTypeToType(self):
     """Checks the mapping from numpy dtypes to feature-stats types.

     Uses np.bool_ and np.str_ rather than the np.bool / np.str aliases,
     which were deprecated in NumPy 1.20 and removed in 1.24; the resulting
     dtypes are the same.
     """
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                      gfsg.DtypeToType(np.dtype(np.int32)))
     # Boolean and time types treated as int
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                      gfsg.DtypeToType(np.dtype(np.bool_)))
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                      gfsg.DtypeToType(np.dtype(np.datetime64)))
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                      gfsg.DtypeToType(np.dtype(np.timedelta64)))
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().FLOAT,
                      gfsg.DtypeToType(np.dtype(np.float32)))
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                      gfsg.DtypeToType(np.dtype(np.str_)))
     # Unsupported types treated as string for now
     self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                      gfsg.DtypeToType(np.dtype(np.void)))
示例#8
0
    def testParseExampleInt(self):
        """Checks _ParseExample on examples holding one int64 value each.

        Example number k stores the value k under feature 'num'.
        """
        examples = []
        for value in range(50):
            ex = tf.train.Example()
            ex.features.feature['num'].int64_list.value.append(value)
            examples.append(ex)

        parsed = {}
        for position, ex in enumerate(examples):
            fs._ParseExample(ex.features.feature, [], parsed, position)

        self.assertEqual(1, len(parsed))
        self.assertIn('num', parsed)
        num_entry = parsed['num']
        self.assertEqual(0, num_entry['missing'])
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT,
                         num_entry['type'])
        for position, _ in enumerate(examples):
            self.assertEqual(1, num_entry['counts'][position])
            self.assertEqual(position, num_entry['vals'][position])
 def testGetDatasetsProtoSequenceExampleHistogram(self):
     """Checks the feature-list-length histogram built from 'feat_lens'.

     One INT feature with feature-list lengths [1, 2, 1] is supplied; the
     first and last buckets of the resulting quantile histogram are
     compared against fixed expectations.
     """
     feature_entries = {
         'testFeature': {
             'vals': [1, 2, 2, 3],
             'counts': [1, 2, 1],
             'feat_lens': [1, 2, 1],
             'missing': 0,
             'type': gfsg.GetFeatureStatsProtoDef().INT
         }
     }
     proto = gfsg.GetDatasetsProto([{
         'entries': feature_entries,
         'size': 3,
         'name': 'testDataset'
     }])

     common_stats = proto.datasets[0].features[0].num_stats.common_stats
     length_hist = common_stats.feature_list_length_histogram
     length_buckets = length_hist.buckets
     self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES,
                      length_hist.type)
     self.assertEqual(10, len(length_buckets))

     # (bucket index, expected low_value, expected high_value)
     expected_edges = ((0, 1, 1), (9, 1.8, 2))
     for bucket_index, low, high in expected_edges:
         bucket = length_buckets[bucket_index]
         self.assertEqual(low, bucket.low_value)
         self.assertEqual(high, bucket.high_value)
         self.assertEqual(.3, bucket.sample_count)
示例#10
0
    def testParseExampleMissingValueList(self):
        """Checks that a feature without a value list is counted as missing.

        The first example only touches the 'str' feature without giving it
        any value list; the second supplies a single bytes value.
        """
        bare_example = tf.train.Example()
        # The bare map access is deliberate. Presumably it inserts an empty
        # 'str' Feature into the example (protobuf map semantics) -- TODO
        # confirm.
        # pylint: disable=pointless-statement
        bare_example.features.feature['str']
        # pylint: enable=pointless-statement

        filled_example = tf.train.Example()
        filled_example.features.feature['str'].bytes_list.value.append(
            b'test')

        examples = [bare_example, filled_example]

        parsed = {}
        for position, ex in enumerate(examples):
            fs._ParseExample(ex.features.feature, [], parsed, position)

        self.assertEqual(1, len(parsed))
        self.assertIn('str', parsed)
        str_entry = parsed['str']
        self.assertEqual(1, str_entry['missing'])
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING,
                         str_entry['type'])
        # No values recorded for the bare example, one for the filled one.
        self.assertEqual(0, str_entry['counts'][0])
        self.assertEqual(1, str_entry['counts'][1])
示例#11
0
def _ParseExample(example_features, example_feature_lists, entries, index):
    """Folds one example's features into the running `entries` dictionary.

    Every feature name met in either input gets an entry with the keys
    'vals', 'counts', 'feat_lens' and 'missing' (plus 'type' once a value
    list has been recognised). A new entry starts with 'missing' equal to
    `index`. After both inputs are processed, 'missing' is incremented for
    every entry that contributed no values in this example, including
    features that were present but had an empty value list.

    Args:
      example_features: A map of strings to tf.Features from the example.
      example_feature_lists: A map of strings to tf.FeatureLists from the
          example. It is only iterated and indexed by name.
      entries: A dictionary of all features parsed thus far and arrays of
          their values. This is mutated by the function.
      index: The index of the example to parse from a list of examples.
    Raises:
      TypeError: When a feature's value type differs from the type recorded
          for it by an earlier example.
    """
    # Value-list fields probed on a tf.Feature, in probing order, paired
    # with the name of the matching FeatureStats type constant.
    list_kinds = (
        ('float_list', 'FLOAT'),
        ('bytes_list', 'STRING'),
        ('int64_list', 'INT'),
    )
    populated = set()

    sources = ((example_features, True), (example_feature_lists, False))
    for source, is_feature in sources:
        seq_len = None
        # Iterate names and index back into `source` rather than calling
        # .items(), so any iterable container (even an empty list) works.
        for name in source:
            # First sighting of this feature: create its entry.
            record = entries.setdefault(name, {
                'vals': [],
                'counts': [],
                'feat_lens': [],
                'missing': index
            })
            feature = source[name]

            kind = None
            values = []
            if is_feature:
                # A tf.Feature: take the values of whichever list is set.
                for field, type_attr in list_kinds:
                    if feature.HasField(field):
                        values = getattr(feature, field).value
                        kind = getattr(gfsg.GetFeatureStatsProtoDef(),
                                       type_attr)
                        break
            else:
                # A tf.FeatureList: the first Feature decides which list
                # is read from all Features, whose values are concatenated.
                seq_len = len(feature.feature)
                if seq_len != 0:
                    head = feature.feature[0]
                    for field, type_attr in list_kinds:
                        if head.HasField(field):
                            for member in feature.feature:
                                values.extend(getattr(member, field).value)
                            kind = getattr(gfsg.GetFeatureStatsProtoDef(),
                                           type_attr)
                            break

            if kind is not None:
                # Record the type on first use; later examples must agree.
                if record.setdefault('type', kind) != kind:
                    raise TypeError('type mismatch for feature ' + name)

            record['counts'].append(len(values))
            record['vals'].extend(values)
            if seq_len is not None:
                record['feat_lens'].append(seq_len)
            if values:
                populated.add(name)

    # Any entry that gained no values from this example counts as missing.
    for name, record in entries.items():
        if name not in populated:
            record['missing'] += 1
示例#12
0
    def testGetProtoStrings(self):
        """Tests converting string examples into the feature stats proto.

        Six examples are built for feature 'str': two b'hello', three b'hi'
        and one b'hey'. The test checks the common stats, the num-values
        histogram, the top values and the rank histogram.
        """
        examples = []
        for text, copies in ((b'hello', 2), (b'hi', 3), (b'hey', 1)):
            for _ in range(copies):
                example = tf.train.Example()
                example.features.feature['str'].bytes_list.value.append(text)
                examples.append(example)

        entries = {}
        for index, example in enumerate(examples):
            fs._ParseExample(example.features.feature, [], entries, index)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('test', test_data.name)
        self.assertEqual(6, test_data.num_examples)

        strfeat = test_data.features[0]
        self.assertEqual('str', strfeat.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().STRING, strfeat.type)
        self.assertEqual(3, strfeat.string_stats.unique)
        # 2 * len('hello') + 3 * len('hi') + len('hey') = 19 characters.
        self.assertAlmostEqual(19 / 6.0, strfeat.string_stats.avg_length, 4)
        common_stats = strfeat.string_stats.common_stats
        self.assertEqual(0, common_stats.num_missing)
        self.assertEqual(6, common_stats.num_non_missing)
        self.assertEqual(1, common_stats.min_num_values)
        self.assertEqual(1, common_stats.max_num_values)
        self.assertEqual(1, common_stats.avg_num_values)

        hist = common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(.6, buckets[0].sample_count)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(.6, buckets[9].sample_count)

        self.assertEqual(2, len(strfeat.string_stats.top_values))
        self.assertEqual(3, strfeat.string_stats.top_values[0].frequency)
        self.assertEqual('hi', strfeat.string_stats.top_values[0].value)
        self.assertEqual(2, strfeat.string_stats.top_values[1].frequency)
        self.assertEqual('hello', strfeat.string_stats.top_values[1].value)

        buckets = strfeat.string_stats.rank_histogram.buckets
        self.assertEqual(3, len(buckets))
        self.assertEqual(0, buckets[0].low_rank)
        self.assertEqual(0, buckets[0].high_rank)
        self.assertEqual(3, buckets[0].sample_count)
        self.assertEqual('hi', buckets[0].label)
        self.assertEqual(2, buckets[2].low_rank)
        self.assertEqual(2, buckets[2].high_rank)
        self.assertEqual(1, buckets[2].sample_count)
        self.assertEqual('hey', buckets[2].label)
示例#13
0
    def testGetProtoNums(self):
        """Tests converting int examples into the feature stats proto.

        Fifty examples carry feature 'num' with values 0..49; one extra
        example carries only feature 'other', so 'num' is missing exactly
        once. The test checks the numeric stats, the num-values histogram
        and both value histograms (standard and quantiles) of 'num'.
        """
        examples = []
        for value in range(50):
            example = tf.train.Example()
            example.features.feature['num'].int64_list.value.append(value)
            examples.append(example)
        example = tf.train.Example()
        example.features.feature['other'].int64_list.value.append(0)
        examples.append(example)

        entries = {}
        for index, parsed_example in enumerate(examples):
            fs._ParseExample(parsed_example.features.feature, [], entries,
                             index)

        datasets = [{
            'entries': entries,
            'size': len(examples),
            'name': 'test'
        }]
        p = gfsg.GetDatasetsProto(datasets)

        self.assertEqual(1, len(p.datasets))
        test_data = p.datasets[0]
        self.assertEqual('test', test_data.name)
        self.assertEqual(51, test_data.num_examples)

        # Feature order is not relied upon; pick 'num' by name.
        numfeat = test_data.features[0] if (
            test_data.features[0].name == 'num') else test_data.features[1]
        self.assertEqual('num', numfeat.name)
        self.assertEqual(gfsg.GetFeatureStatsProtoDef().INT, numfeat.type)
        self.assertEqual(0, numfeat.num_stats.min)
        self.assertEqual(49, numfeat.num_stats.max)
        self.assertEqual(24.5, numfeat.num_stats.mean)
        self.assertEqual(24.5, numfeat.num_stats.median)
        self.assertEqual(1, numfeat.num_stats.num_zeros)
        self.assertAlmostEqual(14.430869689, numfeat.num_stats.std_dev, 4)
        common_stats = numfeat.num_stats.common_stats
        self.assertEqual(1, common_stats.num_missing)
        self.assertEqual(50, common_stats.num_non_missing)
        self.assertEqual(1, common_stats.min_num_values)
        self.assertEqual(1, common_stats.max_num_values)
        self.assertAlmostEqual(1, common_stats.avg_num_values, 4)

        hist = common_stats.num_values_histogram
        buckets = hist.buckets
        self.assertEqual(gfsg.GetHistogramProtoDef().QUANTILES, hist.type)
        self.assertEqual(10, len(buckets))
        self.assertEqual(1, buckets[0].low_value)
        self.assertEqual(1, buckets[0].high_value)
        self.assertEqual(5, buckets[0].sample_count)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed in Python 3.12.
        self.assertEqual(1, buckets[9].low_value)
        self.assertEqual(1, buckets[9].high_value)
        self.assertEqual(5, buckets[9].sample_count)

        # Both value histograms are expected to have the same first and
        # last buckets; only the histogram type differs.
        self.assertEqual(2, len(numfeat.num_stats.histograms))
        expected_types = (gfsg.GetHistogramProtoDef().STANDARD,
                          gfsg.GetHistogramProtoDef().QUANTILES)
        for hist_index, expected_type in enumerate(expected_types):
            value_hist = numfeat.num_stats.histograms[hist_index]
            buckets = value_hist.buckets
            self.assertEqual(expected_type, value_hist.type)
            self.assertEqual(10, len(buckets))
            self.assertEqual(0, buckets[0].low_value)
            self.assertEqual(4.9, buckets[0].high_value)
            self.assertEqual(5, buckets[0].sample_count)
            self.assertAlmostEqual(44.1, buckets[9].low_value)
            self.assertEqual(49, buckets[9].high_value)
            self.assertEqual(5, buckets[9].sample_count)