def test_make_feature_type_float(self):
  """All numpy floating-point dtypes map to the FLOAT statistics type."""
  for dtype_name in ('float16', 'float32', 'float64'):
    self.assertEqual(
        stats_util.make_feature_type(np.dtype(dtype_name)),
        statistics_pb2.FeatureNameStatistics.FLOAT)
def test_make_feature_type_int(self):
  """All numpy signed-integer dtypes map to the INT statistics type."""
  for dtype_name in ('int8', 'int16', 'int32', 'int64'):
    self.assertEqual(
        stats_util.make_feature_type(np.dtype(dtype_name)),
        statistics_pb2.FeatureNameStatistics.INT)
def _update_common_stats(common_stats, value, feature_name):
  """Update the partial common statistics using the input value.

  Args:
    common_stats: The partial common statistics to update in place.
    value: A numpy array of feature values, or None for a missing feature.
    feature_name: Name of the feature, used in error messages.

  Raises:
    TypeError: If the value is neither a numpy array nor None, if the
      array's dtype is unsupported, or if the feature's type conflicts
      with a previously observed type.
  """
  # A missing feature value is represented by None.
  if value is None:
    common_stats.num_missing += 1
    return
  if not isinstance(value, np.ndarray):
    raise TypeError('Feature %s has value of type %s, '
                    'should be numpy.ndarray or None' %
                    (feature_name, type(value).__name__))
  # Non-missing value: fold the example's value count into the
  # running min/max/total statistics.
  num_values = value.size
  common_stats.num_non_missing += 1
  common_stats.min_num_values = min(common_stats.min_num_values, num_values)
  common_stats.max_num_values = max(common_stats.max_num_values, num_values)
  common_stats.total_num_values += num_values
  feature_type = stats_util.make_feature_type(value.dtype)
  if feature_type is None:
    raise TypeError(
        'Feature %s has value which is a numpy array of type %s, '
        'should be int, float or str types.' % (feature_name,
                                                value.dtype.name))
  # Record the feature's type on first sight; afterwards every example
  # must agree with it.
  if common_stats.type is None:
    common_stats.type = feature_type
  elif common_stats.type != feature_type:
    raise TypeError('Cannot determine the type of feature %s. '
                    'Found values of types %s and %s.' %
                    (feature_name, common_stats.type, feature_type))
def add_input(self, accumulator, input_batch):
  """Folds one batch of examples into the partial string statistics.

  Args:
    accumulator: Dict mapping feature name to _PartialStringStats.
    input_batch: Dict mapping feature name to a list of example values.

  Returns:
    The updated accumulator.
  """
  for feature_name, batch_values in six.iteritems(input_batch):
    is_categorical = feature_name in self._categorical_features
    for example_value in batch_values:
      # Only non-empty numpy arrays contribute to the statistics.
      if not isinstance(example_value, np.ndarray):
        continue
      if example_value.size == 0:
        continue
      # Skip features that are neither categorical nor string-typed.
      if not is_categorical and (
          stats_util.make_feature_type(example_value.dtype) !=
          statistics_pb2.FeatureNameStatistics.STRING):
        continue
      # Lazily create the partial stats on first sight of the feature.
      stats = accumulator.get(feature_name)
      if stats is None:
        stats = _PartialStringStats()
        accumulator[feature_name] = stats
      # Categorical features are measured through their string form.
      if is_categorical:
        example_value = example_value.astype(str)
      stats.total_bytes_length += sum(len(v) for v in example_value)
      stats.total_num_values += len(example_value)
  return accumulator
def add_input(self, accumulator, input_batch):
  """Updates the partial common statistics with a new batch of examples.

  Args:
    accumulator: Dict mapping feature name to _PartialCommonStats.
    input_batch: Dict mapping feature name to a list of example values;
      when a weight feature is configured, the batch must contain it.

  Returns:
    The updated accumulator.

  Raises:
    ValueError: If the weight feature is absent from the batch, missing
      in an example, of string type, or not a single value.
  """
  if self._weight_feature:
    if self._weight_feature not in input_batch:
      raise ValueError(
          'Weight feature "{}" not present in the input '
          'batch.'.format(self._weight_feature))
    weights = input_batch[self._weight_feature]
  # Iterate through each feature and update the partial common stats.
  for feature_name, values in six.iteritems(input_batch):
    # Skip the weight feature itself; it gets no common stats.
    if feature_name == self._weight_feature:
      continue
    # If we encounter this feature for the first time, create a
    # new partial common stats.
    if feature_name not in accumulator:
      partial_stats = _PartialCommonStats(
          self._weight_feature is not None)
      # Store an empty quantiles summary for the num-values histogram.
      partial_stats.num_values_summary = (
          self._quantiles_combiner.create_accumulator())
      accumulator[feature_name] = partial_stats
    # Update the common statistics for every example in the batch.
    num_values = []
    for i, value in enumerate(values):
      if self._weight_feature:
        # Validate this example's weight: it must be present, numeric,
        # and a single value.
        if weights[i] is None:
          raise ValueError('Weight feature "{}" missing in an '
                           'example.'.format(
                               self._weight_feature))
        elif (stats_util.make_feature_type(weights[i].dtype) ==
              statistics_pb2.FeatureNameStatistics.STRING):
          raise ValueError(
              'Weight feature "{}" must be of numeric type. '
              'Found {}.'.format(self._weight_feature, weights[i]))
        elif weights[i].size != 1:
          raise ValueError(
              'Weight feature "{}" must have a single value. '
              'Found {}.'.format(self._weight_feature, weights[i]))
      # Fold the value (and its scalar weight, when weighted) into the
      # running stats for this feature.
      _update_common_stats(
          accumulator[feature_name], value, feature_name,
          weights[i][0] if self._weight_feature else None)
      # Keep track of the number of values in non-missing examples.
      if isinstance(value, np.ndarray):
        num_values.append(value.size)
    # Update the num_vals_histogram summary for the feature based on the
    # current batch.
    if num_values:
      accumulator[feature_name].num_values_summary = (
          self._quantiles_combiner.add_input(
              accumulator[feature_name].num_values_summary,
              [num_values]))
  return accumulator
def add_input(self, accumulator, input_batch):
  """Folds one batch of examples into the partial numeric statistics.

  Args:
    accumulator: Dict mapping feature name to _PartialNumericStats.
    input_batch: Dict mapping feature name to a list of example values.

  Returns:
    The updated accumulator.
  """
  numeric_types = (statistics_pb2.FeatureNameStatistics.INT,
                   statistics_pb2.FeatureNameStatistics.FLOAT)
  for feature_name, batch_values in six.iteritems(input_batch):
    # Categorical features never get numeric statistics.
    if feature_name in self._categorical_features:
      continue
    batch_sample = []
    for example_value in batch_values:
      # Only non-empty numpy arrays of numeric dtype are considered.
      if not isinstance(example_value, np.ndarray):
        continue
      if example_value.size == 0:
        continue
      feature_type = stats_util.make_feature_type(example_value.dtype)
      if feature_type not in numeric_types:
        continue
      # Lazily create the partial stats (with empty histogram summaries)
      # on first sight of the feature.
      if feature_name not in accumulator:
        stats = _PartialNumericStats()
        stats.std_hist_summary = self._std_hist_combiner.create_accumulator()
        stats.quantiles_hist_summary = (
            self._quantiles_hist_combiner.create_accumulator())
        accumulator[feature_name] = stats
      # Update the running stats; the helper also appends the example's
      # values to batch_sample.
      _update_numeric_stats(accumulator[feature_name], example_value,
                            feature_name, feature_type, batch_sample)
    # Feed the values collected from this batch into both the equi-width
    # and the quantiles histogram summaries.
    if batch_sample:
      stats = accumulator[feature_name]
      stats.std_hist_summary = self._std_hist_combiner.add_input(
          stats.std_hist_summary, [batch_sample])
      stats.quantiles_hist_summary = self._quantiles_hist_combiner.add_input(
          stats.quantiles_hist_summary, [batch_sample])
  return accumulator
def process(self, element):
  """Counts feature values by inferred type and passes the element through."""
  self._num_instances.inc(1)
  # Dispatch each recognized feature type to its dedicated counter;
  # anything else falls back to the unknown counter.
  counters_by_type = {
      statistics_pb2.FeatureNameStatistics.INT:
          self._int_feature_values_count,
      statistics_pb2.FeatureNameStatistics.FLOAT:
          self._float_feature_values_count,
      statistics_pb2.FeatureNameStatistics.STRING:
          self._string_feature_values_count,
  }
  for value in six.itervalues(element):
    if not isinstance(value, np.ndarray):
      # A non-array value cannot be typed; record a single unknown value.
      self._unknown_feature_values_count.update(1)
      continue
    feature_type = stats_util.make_feature_type(value.dtype)
    counter = counters_by_type.get(feature_type,
                                   self._unknown_feature_values_count)
    counter.update(len(value))
  yield element
def _filter_irrelevant_features(self, input_batch):
  """Yields (feature_name, values) pairs for string/categorical features.

  Non-string, non-categorical features are dropped; categorical features
  are emitted with their values converted to string type.
  """
  for feature_name, values_batch in six.iteritems(input_batch):
    is_categorical = feature_name in self._categorical_features
    for values in values_batch:
      # Only non-empty numpy arrays are considered.
      if not isinstance(values, np.ndarray) or values.size == 0:
        continue
      if is_categorical:
        # Categorical features are emitted in string form.
        yield (feature_name, values.astype(str))
        continue
      if (stats_util.make_feature_type(values.dtype) ==
          statistics_pb2.FeatureNameStatistics.STRING):
        yield (feature_name, values)
def test_make_feature_type_invalid_dtype(self):
  """An argument that is not a numpy dtype raises TypeError."""
  self.assertRaises(TypeError, stats_util.make_feature_type, int)
def test_make_feature_type_none(self):
  """An unsupported (complex) dtype yields None instead of raising."""
  actual = stats_util.make_feature_type(np.dtype('complex64'))
  self.assertIsNone(actual)
def test_make_feature_type_string(self):
  """Both byte-string and unicode dtypes map to the STRING type."""
  for dtype_char in ('S', 'U'):
    self.assertEqual(
        stats_util.make_feature_type(np.dtype(dtype_char)),
        statistics_pb2.FeatureNameStatistics.STRING)