def test_get_weight_feature_with_weight_feature_multiple_values(self):
  """A weight feature with more than one value per example is rejected."""
  weight_name = 'w'
  examples = {}
  examples['a'] = np.array([np.array([1])])
  # The only example carries two weight values instead of exactly one.
  examples[weight_name] = np.array([np.array([2, 3])])
  expected_message = 'Weight feature.*single value'
  with self.assertRaisesRegexp(ValueError, expected_message):
    stats_util.get_weight_feature(examples, weight_name)
def test_get_weight_feature_with_weight_feature_string_type(self):
  """A weight feature holding string values is rejected."""
  weight_name = 'w'
  examples = {}
  examples['a'] = np.array([np.array([1])])
  # The weight of the only example is a string rather than a number.
  examples[weight_name] = np.array([np.array(['a'])])
  expected_message = 'Weight feature.*numeric type'
  with self.assertRaisesRegexp(ValueError, expected_message):
    stats_util.get_weight_feature(examples, weight_name)
def test_get_weight_feature_with_valid_weight_feature(self):
  """A well-formed weight feature is returned as-is.

  The batch holds two examples, each with exactly one numeric weight, so
  `stats_util.get_weight_feature` is expected to return the 'w' column.
  """
  batch = {
      # The two examples hold a different number of values, so the outer
      # array has to be created with dtype=object: NumPy >= 1.24 raises
      # ValueError when asked to build such a ragged array implicitly, and
      # older releases produced this same object array anyway.
      'a': np.array([np.array([1, 2]), np.array([3])], dtype=object),
      'w': np.array([np.array([10]), np.array([20])])
  }
  actual = stats_util.get_weight_feature(batch, 'w')
  np.testing.assert_equal(actual, batch['w'])
def add_input(self, accumulator, input_batch):
  """Folds one batch of examples into the per-feature common stats.

  Args:
    accumulator: Mapping from feature name to its partial common stats.
      Entries are created on first sight of a feature and updated in place.
    input_batch: Mapping from feature name to the per-example values of that
      feature for the current batch.

  Returns:
    The same `accumulator` object, updated with the batch.
  """
  weight_name = self._weight_feature
  use_weights = bool(weight_name)
  if use_weights:
    example_weights = stats_util.get_weight_feature(input_batch, weight_name)

  for name, examples in six.iteritems(input_batch):
    # The weight feature itself gets no statistics.
    if name == weight_name:
      continue

    # First occurrence of the feature: start from an empty quantiles summary.
    if name not in accumulator:
      fresh_stats = _PartialCommonStats(weight_name is not None)
      fresh_stats.num_values_summary = (
          self._quantiles_combiner.create_accumulator())
      accumulator[name] = fresh_stats
    feature_stats = accumulator[name]

    # Sizes of the non-missing examples seen in this batch.
    sizes = []
    for index, example in enumerate(examples):
      if use_weights:
        example_weight = example_weights[index][0]
      else:
        example_weight = None
      _update_common_stats(feature_stats, example, name, example_weight)
      if isinstance(example, np.ndarray):
        sizes.append(example.size)

    # Feed all sizes to the quantiles combiner in a single call per feature.
    if not sizes:
      continue
    feature_stats.num_values_summary = self._quantiles_combiner.add_input(
        feature_stats.num_values_summary, [sizes])
  return accumulator
def _unbatch_input_to_feature_values_with_weights(input_batch,
                                                  categorical_features,
                                                  weight_feature=None):
  """Unbatches the input to output tuples containing feature values and weights.

  Specifically, iterates over all the categorical and STRING features in the
  input batch and outputs tuples containing feature name, feature values and
  the weight associated with the example (if a weight feature is provided).
  Missing or empty examples, and examples of features that are neither
  categorical nor of string type, are skipped. Values of categorical features
  are converted to strings before being emitted.

  Args:
    input_batch: Current batch of examples, as a mapping from feature name to
      the per-example values of that feature.
    categorical_features: Set of names of categorical features.
    weight_feature: Name of the weight feature. None if there is no weight
      feature.

  Yields:
    A tuple (feature_name, feature_value_list, optional weight).
  """
  # Use one predicate both for fetching the weights and for attaching them.
  # Mixing `is not None` with plain truthiness would fetch and validate the
  # weights for a falsy feature name (e.g. '') and then silently drop them.
  has_weights = weight_feature is not None
  if has_weights:
    weights = stats_util.get_weight_feature(input_batch, weight_feature)

  for feature_name, values_batch in six.iteritems(input_batch):
    # The weight feature itself is never emitted.
    if feature_name == weight_feature:
      continue
    is_categorical = feature_name in categorical_features
    for i, values in enumerate(values_batch):
      # Check if we have a numpy array with at least one value.
      if not isinstance(values, np.ndarray) or values.size == 0:
        continue
      # If the feature is neither categorical nor of string type, then
      # skip the feature.
      if not (is_categorical or
              get_feature_type(values.dtype) ==
              statistics_pb2.FeatureNameStatistics.STRING):
        continue
      yield _FeatureNameAndValueListWithWeight(
          feature_name,
          values.astype(str) if is_categorical else values,
          # The weight of example i is the first entry of its weight value.
          weights[i][0] if has_weights else None)
def add_input(self, accumulator, input_batch):
  """Folds the value counts of one batch into the accumulator.

  Only categorical features and features whose values are of string type
  contribute counts; values of categorical features are converted to strings
  first. Every non-weight feature in the batch still gets an accumulator
  entry, possibly with empty counters.

  Args:
    accumulator: Mapping from feature name to its `_ValueCounts`, updated in
      place.
    input_batch: Mapping from feature name to the per-example values of that
      feature for the current batch.

  Returns:
    The same `accumulator` object, updated with the batch.
  """
  weight_name = self._weight_feature
  has_weights = weight_name is not None
  if has_weights:
    example_weights = stats_util.get_weight_feature(input_batch, weight_name)

  for name, examples in six.iteritems(input_batch):
    # The weight feature itself is not counted.
    if name == weight_name:
      continue

    is_categorical = name in self._categorical_features
    batch_counts = collections.Counter()
    batch_weighted_counts = _WeightedCounter()
    for index, example in enumerate(examples):
      # Only non-empty numpy arrays carry values to count.
      if not isinstance(example, np.ndarray):
        continue
      if example.size == 0:
        continue
      # Non-categorical features are counted only when of string type.
      if not is_categorical:
        example_type = get_feature_type(example.dtype)
        if example_type != statistics_pb2.FeatureNameStatistics.STRING:
          continue
      else:
        example = example.astype(str)

      batch_counts.update(example)
      if has_weights:
        batch_weighted_counts.weighted_update(
            example, example_weights[index][0])

    # Merge this batch's counters into the running totals of the feature.
    if name in accumulator:
      existing = accumulator[name]
      existing.unweighted_counts.update(batch_counts)
      existing.weighted_counts.update(batch_weighted_counts)
    else:
      accumulator[name] = _ValueCounts(
          unweighted_counts=batch_counts,
          weighted_counts=batch_weighted_counts)
  return accumulator
def add_input(self, accumulator, input_batch):
  """Folds one batch of examples into the per-feature basic stats.

  For every feature other than the weight feature, this updates the common
  stats with each example, routes non-empty values to either the string stats
  (categorical or string-typed features) or the numeric stats, and finally
  feeds the batch to the quantiles combiners once per feature.

  Args:
    accumulator: Mapping from feature name to its partial basic stats.
      Entries are created on first sight of a feature and updated in place.
    input_batch: Mapping from feature name to the per-example values of that
      feature for the current batch. Each value must be a numpy array or None
      (a missing value).

  Returns:
    The same `accumulator` object, updated with the batch.

  Raises:
    TypeError: If a value is neither a numpy array nor None, or is a numpy
      array whose dtype maps to no known feature type.
  """
  weight_name = self._weight_feature
  weighted = bool(weight_name)
  if weighted:
    # TODO(b/118489848): This method also validates the weight feature.
    # Consider moving these validation checks outside of the generators.
    example_weights = stats_util.get_weight_feature(input_batch, weight_name)

  string_type = statistics_pb2.FeatureNameStatistics.STRING

  for name, examples in six.iteritems(input_batch):
    # The weight feature itself gets no statistics.
    if name == weight_name:
      continue
    categorical = name in self._categorical_features

    # First occurrence of the feature: start from empty quantiles summaries.
    if name not in accumulator:
      fresh_stats = _PartialBasicStats(weight_name is not None)
      fresh_stats.common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.create_accumulator())
      fresh_stats.numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.create_accumulator())
      accumulator[name] = fresh_stats
    feature_stats = accumulator[name]

    # Sizes of the non-missing examples in this batch.
    sizes = []
    # [values, weights] gathered for numeric features, so that the quantiles
    # combiner is invoked only once per feature for the input batch. The list
    # is handed to numeric_stats.update, which is expected to fill it.
    numeric_batch = [[], []]

    for index, example in enumerate(examples):
      # TODO(b/79685042): Currently we infer the type for each example, which
      # is expensive. Consider doing the type inference only once per batch.
      if example is None:
        # We have a missing value.
        example_type = None
      elif isinstance(example, np.ndarray):
        example_type = get_feature_type(example.dtype)
        if example_type is None:
          raise TypeError('Feature {} has value {} which is a numpy array '
                          'of type {}, should be int, float or str '
                          'types.'.format(name, example, example.dtype.name))
      else:
        raise TypeError('Feature %s has value of type %s, '
                        'should be numpy.ndarray or None' %
                        (name, type(example).__name__))

      # Looked up only after the type checks so that an invalid value is
      # reported before any problem with the weights.
      example_weight = example_weights[index][0] if weighted else None

      feature_stats.common_stats.update(
          example, name, example_type, example_weight)
      if example is None:
        continue

      # Keep track of the number of values in non-missing examples.
      sizes.append(example.size)
      if example.size == 0:
        continue

      if not (categorical or example_type == string_type):
        # Update the partial numeric stats and append values to the current
        # batch of values and weights.
        feature_stats.numeric_stats.update(
            example, numeric_batch, example_weight)
        continue

      # A categorical feature is treated as strings whatever its dtype.
      if categorical:
        example = example.astype(str)
      feature_stats.string_stats.update(example)

    # Update the num_vals_histogram summary based on the current batch.
    if sizes:
      common_stats = feature_stats.common_stats
      common_stats.num_values_summary = (
          self._num_values_quantiles_combiner.add_input(
              common_stats.num_values_summary, [sizes]))

    # Update the quantiles summaries of the numeric values in the batch.
    if numeric_batch[0]:
      numeric_stats = feature_stats.numeric_stats
      # For the unweighted case, explicitly set the weights to be 1 so that
      # the same weighted quantiles combiner serves both scenarios.
      unit_weights = [1] * len(numeric_batch[0])
      numeric_stats.quantiles_summary = (
          self._values_quantiles_combiner.add_input(
              numeric_stats.quantiles_summary,
              [numeric_batch[0], unit_weights]))
      if weighted:
        numeric_stats.weighted_quantiles_summary = (
            self._values_quantiles_combiner.add_input(
                numeric_stats.weighted_quantiles_summary, numeric_batch))
  return accumulator
def test_get_weight_feature_with_weight_feature_missing(self):
  """A weight feature whose value is missing (None) is rejected."""
  weight_name = 'w'
  examples = {}
  examples['a'] = np.array([np.array([1])])
  # The only example has no weight at all.
  examples[weight_name] = np.array([None])
  expected_message = 'Weight feature.*missing'
  with self.assertRaisesRegexp(ValueError, expected_message):
    stats_util.get_weight_feature(examples, weight_name)
def test_get_weight_feature_invalid_weight_feature(self):
  """Asking for a weight feature that is absent from the batch is rejected."""
  # The batch contains only feature 'a'; the requested 'w' does not exist.
  examples = {}
  examples['a'] = np.array([np.array([1])])
  expected_message = 'Weight feature.*not present'
  with self.assertRaisesRegexp(ValueError, expected_message):
    stats_util.get_weight_feature(examples, 'w')
def add_input(self, accumulator, input_batch):
  """Folds one batch of examples into the per-feature numeric stats.

  Only non-empty numpy arrays of INT or FLOAT type contribute. The weight
  feature and categorical features are skipped entirely. An accumulator entry
  is created the first time a feature contributes a numeric value.

  Args:
    accumulator: Mapping from feature name to its partial numeric stats,
      updated in place.
    input_batch: Mapping from feature name to the per-example values of that
      feature for the current batch.

  Returns:
    The same `accumulator` object, updated with the batch.
  """
  weight_name = self._weight_feature
  weighted = bool(weight_name)
  if weighted:
    example_weights = stats_util.get_weight_feature(input_batch, weight_name)

  numeric_types = (statistics_pb2.FeatureNameStatistics.INT,
                   statistics_pb2.FeatureNameStatistics.FLOAT)

  for name, examples in six.iteritems(input_batch):
    # No numeric stats for the weight feature or for categorical features.
    if name == weight_name or name in self._categorical_features:
      continue

    # [values, weights] of the current batch, gathered so that the quantiles
    # combiner is invoked only once per feature for the input batch. The list
    # is handed to _update_numeric_stats, which is expected to fill it.
    batch_values_and_weights = [[], []]
    for index, example in enumerate(examples):
      # Only non-empty numpy arrays carry values.
      if not isinstance(example, np.ndarray):
        continue
      if example.size == 0:
        continue
      # Only numeric arrays are of interest here.
      example_type = get_feature_type(example.dtype)
      if example_type not in numeric_types:
        continue

      # First numeric value of the feature: start from an empty summary.
      if name not in accumulator:
        fresh_stats = _PartialNumericStats(weight_name is not None)
        fresh_stats.quantiles_summary = (
            self._quantiles_combiner.create_accumulator())
        accumulator[name] = fresh_stats

      example_weight = example_weights[index][0] if weighted else None
      _update_numeric_stats(
          accumulator[name], example, name, example_type,
          batch_values_and_weights, example_weight)

    # Nothing numeric was seen for this feature in the batch.
    if not batch_values_and_weights[0]:
      continue

    feature_stats = accumulator[name]
    # For the unweighted case, explicitly set the weights to be 1 so that the
    # same weighted quantiles combiner serves both scenarios.
    unit_weights = [1] * len(batch_values_and_weights[0])
    feature_stats.quantiles_summary = self._quantiles_combiner.add_input(
        feature_stats.quantiles_summary,
        [batch_values_and_weights[0], unit_weights])
    if weighted:
      feature_stats.weighted_quantiles_summary = (
          self._quantiles_combiner.add_input(
              feature_stats.weighted_quantiles_summary,
              batch_values_and_weights))
  return accumulator