def test_get_weight_feature_with_weight_feature_multiple_values(self):
     """Expects a ValueError when the weight feature holds several values."""
     feature_values = np.array([np.array([1])])
     # The only example carries two weights ([2, 3]) rather than exactly one.
     weight_values = np.array([np.array([2, 3])])
     examples = {'a': feature_values, 'w': weight_values}
     expected_error = 'Weight feature.*single value'
     # NOTE(review): assertRaisesRegexp is the deprecated alias of
     # assertRaisesRegex (removed in Python 3.12); kept unchanged here.
     with self.assertRaisesRegexp(ValueError, expected_error):
         stats_util.get_weight_feature(examples, 'w')
 def test_get_weight_feature_with_weight_feature_string_type(self):
     """Expects a ValueError when the weight feature holds string values."""
     feature_values = np.array([np.array([1])])
     # The weight of the only example is the string 'a' instead of a number.
     weight_values = np.array([np.array(['a'])])
     examples = {'a': feature_values, 'w': weight_values}
     expected_error = 'Weight feature.*numeric type'
     # NOTE(review): assertRaisesRegexp is the deprecated alias of
     # assertRaisesRegex (removed in Python 3.12); kept unchanged here.
     with self.assertRaisesRegexp(ValueError, expected_error):
         stats_util.get_weight_feature(examples, 'w')
 def test_get_weight_feature_with_valid_weight_feature(self):
     """Checks the happy path of get_weight_feature.

     Every example carries exactly one numeric weight, so the call must
     succeed and return weights equal to the batch's 'w' entry.
     """
     batch = {
         # The two examples have different lengths, so the outer array has to
         # be created with dtype=object: NumPy >= 1.24 raises ValueError for
         # ragged nested sequences otherwise, while older versions built the
         # same object array implicitly.
         'a': np.array([np.array([1, 2]), np.array([3])], dtype=object),
         'w': np.array([np.array([10]), np.array([20])])
     }
     actual = stats_util.get_weight_feature(batch, 'w')
     np.testing.assert_equal(actual, batch['w'])
# Example #4
0
    def add_input(self, accumulator, input_batch):
        """Folds one batch of examples into the per-feature common stats.

        Args:
          accumulator: Mapping from feature name to its partial common stats.
            Entries are created on first sight of a feature and updated in
            place.
          input_batch: Mapping from feature name to the batch of per-example
            values of that feature.

        Returns:
          The updated accumulator.
        """
        weighted = bool(self._weight_feature)
        if weighted:
            example_weights = stats_util.get_weight_feature(
                input_batch, self._weight_feature)

        for feature_name, examples in six.iteritems(input_batch):
            # The weight feature itself gets no statistics.
            if feature_name == self._weight_feature:
                continue

            # First time this feature shows up: register fresh partial stats
            # holding an empty quantiles summary.
            if feature_name not in accumulator:
                fresh_stats = _PartialCommonStats(
                    self._weight_feature is not None)
                fresh_stats.num_values_summary = (
                    self._quantiles_combiner.create_accumulator())
                accumulator[feature_name] = fresh_stats
            feature_stats = accumulator[feature_name]

            # Value counts of the examples that are numpy arrays; anything
            # else does not contribute to the num-values summary.
            value_counts = []
            for index, example in enumerate(examples):
                weight = example_weights[index][0] if weighted else None
                _update_common_stats(
                    feature_stats, example, feature_name, weight)
                if isinstance(example, np.ndarray):
                    value_counts.append(example.size)

            # Feed the counts of this batch to the quantiles combiner in a
            # single call.
            if value_counts:
                feature_stats.num_values_summary = (
                    self._quantiles_combiner.add_input(
                        feature_stats.num_values_summary, [value_counts]))

        return accumulator
def _unbatch_input_to_feature_values_with_weights(input_batch,
                                                  categorical_features,
                                                  weight_feature=None):
    """Unbatches the input into per-example feature values and weights.

    Walks over every feature of the batch except the weight feature. For each
    example that is a non-empty numpy array and whose feature is either
    categorical or of STRING type, yields the feature name, the values and the
    weight of that example (None when no weight feature is given). Values of
    categorical features are converted to str.

    Args:
      input_batch: Current batch of examples, as a mapping from feature name
        to the per-example values of that feature.
      categorical_features: Set of names of categorical features.
      weight_feature: Name of the weight feature. None if there is no
        weight feature.

    Yields:
      A _FeatureNameAndValueListWithWeight built from
      (feature_name, feature_values, optional weight).
    """
    # Use one and the same `is not None` test for loading and for reading the
    # weights, so the two decisions can never disagree (e.g. for a falsy but
    # non-None feature name).
    has_weights = weight_feature is not None
    if has_weights:
        weights = stats_util.get_weight_feature(input_batch, weight_feature)

    for feature_name, values_batch in six.iteritems(input_batch):
        if feature_name == weight_feature:
            continue

        is_categorical = feature_name in categorical_features
        for i, values in enumerate(values_batch):
            # Only numpy arrays with at least one value are emitted.
            if not isinstance(values, np.ndarray) or values.size == 0:
                continue
            # Skip values that are neither categorical nor of string type.
            if not (is_categorical or get_feature_type(values.dtype)
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue

            yield _FeatureNameAndValueListWithWeight(
                feature_name,
                values.astype(str) if is_categorical else values,
                weights[i][0] if has_weights else None)
# Example #6
0
    def add_input(self, accumulator, input_batch):
        """Adds the value counts of one batch to the per-feature counters.

        Only non-empty numpy arrays of categorical or STRING-typed features
        are counted; values of categorical features are converted to str
        first. Every feature other than the weight feature gets an accumulator
        entry, even when nothing was counted for it in this batch.

        Args:
          accumulator: Mapping from feature name to its _ValueCounts; updated
            in place.
          input_batch: Mapping from feature name to the batch of per-example
            values of that feature.

        Returns:
          The updated accumulator.
        """
        weighted = self._weight_feature is not None
        if weighted:
            example_weights = stats_util.get_weight_feature(
                input_batch, self._weight_feature)

        for feature_name, examples in six.iteritems(input_batch):
            # The weight feature itself is never counted.
            if feature_name == self._weight_feature:
                continue

            # Counts collected over this batch only; merged afterwards.
            batch_counts = collections.Counter()
            batch_weighted_counts = _WeightedCounter()

            for index, example in enumerate(examples):
                # Only numpy arrays with at least one value are counted.
                if not (isinstance(example, np.ndarray)
                        and example.size != 0):
                    continue
                if feature_name in self._categorical_features:
                    # Categorical values are always counted as strings.
                    example = example.astype(str)
                elif (get_feature_type(example.dtype)
                      != statistics_pb2.FeatureNameStatistics.STRING):
                    # Neither categorical nor of string type.
                    continue

                batch_counts.update(example)
                if weighted:
                    batch_weighted_counts.weighted_update(
                        example, example_weights[index][0])

            if feature_name in accumulator:
                feature_counts = accumulator[feature_name]
                feature_counts.unweighted_counts.update(batch_counts)
                feature_counts.weighted_counts.update(batch_weighted_counts)
            else:
                accumulator[feature_name] = _ValueCounts(
                    unweighted_counts=batch_counts,
                    weighted_counts=batch_weighted_counts)
        return accumulator
  def add_input(self, accumulator,
                input_batch
               ):
    """Updates the per-feature partial basic stats with a batch of examples.

    For every feature except the weight feature, the common stats are updated
    for each example; non-empty values additionally update either the string
    stats (categorical or STRING-typed features) or the numeric stats. The
    quantiles combiners are invoked at most once per feature and batch.

    Args:
      accumulator: Mapping from feature name to its partial basic stats
        (common, numeric and string stats). Entries are created on first sight
        of a feature and updated in place.
      input_batch: Mapping from feature name to the batch of per-example
        values. Each value must be a numpy.ndarray or None (missing value).

    Returns:
      The updated accumulator.

    Raises:
      TypeError: If a value is neither a numpy.ndarray nor None, or if it is a
        numpy array for whose dtype get_feature_type returns None.
    """
    if self._weight_feature:
      # TODO(b/118489848): This method also validates the weight feature.
      # Consider moving these validation checks outside of the generators.
      weights = stats_util.get_weight_feature(input_batch, self._weight_feature)

    # Iterate through each feature and update the partial basic stats.
    for feature_name, values in six.iteritems(input_batch):
      # Skip the weight feature.
      if feature_name == self._weight_feature:
        continue
      is_categorical_feature = feature_name in self._categorical_features

      # If we encounter this feature for the first time, create a
      # new partial basic stats.
      if feature_name not in accumulator:
        # NOTE(review): the flag passed here uses `is not None`, whereas the
        # weights above are only fetched when the name is truthy. The two
        # differ for a falsy, non-None weight feature name (e.g. '') --
        # confirm that such a value cannot occur.
        partial_stats = _PartialBasicStats(self._weight_feature is not None)
        # Store empty summary.
        partial_stats.common_stats.num_values_summary = (
            self._num_values_quantiles_combiner.create_accumulator())
        partial_stats.numeric_stats.quantiles_summary = (
            self._values_quantiles_combiner.create_accumulator())
        accumulator[feature_name] = partial_stats

      # Keep track of the number of values in each example in order to update
      # the common statistics.
      num_values = []

      # Keep track of the values and the weights in the current batch for
      # numeric feature. Note that we store the values in the current batch
      # so that we invoke the quantiles combiner only once per feature for
      # the input batch.
      values_and_weights_numeric_feat = [[], []]

      for i, value in enumerate(values):
        # TODO(b/79685042): Currently we infer the type for each example, which
        # is expensive. Consider doing the type inference only once per batch.
        if isinstance(value, np.ndarray):
          feature_type = get_feature_type(value.dtype)
          if feature_type is None:
            raise TypeError('Feature {} has value {} which is a numpy array '
                            'of type {}, should be int, float or str '
                            'types.'.format(feature_name, value,
                                            value.dtype.name))
        elif value is None:
          # We have a missing value.
          feature_type = None
        else:
          raise TypeError('Feature %s has value of type %s, '
                          'should be numpy.ndarray or None' %
                          (feature_name, type(value).__name__))

        # The common stats are updated for every example, including missing
        # ones (value is None and feature_type is None in that case).
        accumulator[feature_name].common_stats.update(
            value, feature_name, feature_type,
            weights[i][0] if self._weight_feature else None)
        if value is None:
          continue
        # Keep track of the number of values in non-missing examples.
        num_values.append(value.size)
        # Empty arrays count towards num_values but carry no values for the
        # string/numeric stats below.
        if value.size == 0:
          continue

        if (is_categorical_feature or
            feature_type == statistics_pb2.FeatureNameStatistics.STRING):
          # If we have a categorical feature, convert the value to string type.
          if is_categorical_feature:
            value = value.astype(str)

          # Update the partial string stats.
          accumulator[feature_name].string_stats.update(value)
        else:
          # Update the partial numeric stats and append values
          # to the current batch of values and weights.
          accumulator[feature_name].numeric_stats.update(
              value, values_and_weights_numeric_feat,
              weights[i][0] if self._weight_feature else None)

      # Update the num_vals_histogram summary for the feature based on the
      # current batch.
      if num_values:
        accumulator[feature_name].common_stats.num_values_summary = (
            self._num_values_quantiles_combiner.add_input(
                accumulator[feature_name].common_stats.num_values_summary,
                [num_values]))

      # Update the quantiles summary of the numeric feature values based on the
      # current batch of values and weights.
      if values_and_weights_numeric_feat[0]:
        numeric_stats = accumulator[feature_name].numeric_stats
        # For the unweighted case, explicitly set the weights to be 1. We do
        # this so that we can use the same weighted quantiles combiner for both
        # scenarios.
        numeric_stats.quantiles_summary = (
            self._values_quantiles_combiner.add_input(
                numeric_stats.quantiles_summary,
                [values_and_weights_numeric_feat[0],
                 # Set weights to be 1.
                 [1] * len(values_and_weights_numeric_feat[0])]))

        # With a weight feature, the same combiner additionally maintains a
        # summary fed with the collected [values, weights] pair.
        if self._weight_feature:
          numeric_stats.weighted_quantiles_summary = (
              self._values_quantiles_combiner.add_input(
                  numeric_stats.weighted_quantiles_summary,
                  values_and_weights_numeric_feat))

    return accumulator
 def test_get_weight_feature_with_weight_feature_missing(self):
     """Expects a ValueError when an example has no weight value."""
     feature_values = np.array([np.array([1])])
     # The weight of the only example is None, i.e. missing.
     weight_values = np.array([None])
     examples = {'a': feature_values, 'w': weight_values}
     expected_error = 'Weight feature.*missing'
     # NOTE(review): assertRaisesRegexp is the deprecated alias of
     # assertRaisesRegex (removed in Python 3.12); kept unchanged here.
     with self.assertRaisesRegexp(ValueError, expected_error):
         stats_util.get_weight_feature(examples, 'w')
 def test_get_weight_feature_invalid_weight_feature(self):
     """Expects a ValueError when the weight feature is not in the batch."""
     # The batch only contains feature 'a'; 'w' is requested below.
     examples = {'a': np.array([np.array([1])])}
     expected_error = 'Weight feature.*not present'
     # NOTE(review): assertRaisesRegexp is the deprecated alias of
     # assertRaisesRegex (removed in Python 3.12); kept unchanged here.
     with self.assertRaisesRegexp(ValueError, expected_error):
         stats_util.get_weight_feature(examples, 'w')
# Example #10
0
    def add_input(self, accumulator, input_batch):
        """Folds one batch of examples into the per-feature numeric stats.

        The weight feature and categorical features are skipped. For the
        remaining features only non-empty numpy arrays whose dtype maps to INT
        or FLOAT contribute.

        Args:
          accumulator: Mapping from feature name to its partial numeric stats.
            Entries are created when the first numeric value of a feature is
            seen and are updated in place.
          input_batch: Mapping from feature name to the batch of per-example
            values of that feature.

        Returns:
          The updated accumulator.
        """
        weighted = bool(self._weight_feature)
        if weighted:
            example_weights = stats_util.get_weight_feature(
                input_batch, self._weight_feature)

        for feature_name, examples in six.iteritems(input_batch):
            # Neither the weight feature nor categorical features get numeric
            # statistics.
            if (feature_name == self._weight_feature
                    or feature_name in self._categorical_features):
                continue

            # [values, weights] gathered over this batch, so that the
            # quantiles combiner is invoked once per feature and batch rather
            # than once per example.
            values_and_weights = [[], []]
            for index, example in enumerate(examples):
                # Only numpy arrays with at least one value carry data.
                if not (isinstance(example, np.ndarray)
                        and example.size != 0):
                    continue

                # Non-numeric arrays are ignored.
                feature_type = get_feature_type(example.dtype)
                if feature_type not in (
                        statistics_pb2.FeatureNameStatistics.INT,
                        statistics_pb2.FeatureNameStatistics.FLOAT):
                    continue

                # First numeric value of this feature: register fresh partial
                # stats holding an empty quantiles summary.
                if feature_name not in accumulator:
                    fresh_stats = _PartialNumericStats(
                        self._weight_feature is not None)
                    fresh_stats.quantiles_summary = (
                        self._quantiles_combiner.create_accumulator())
                    accumulator[feature_name] = fresh_stats

                weight = example_weights[index][0] if weighted else None
                _update_numeric_stats(
                    accumulator[feature_name], example, feature_name,
                    feature_type, values_and_weights, weight)

            batch_values = values_and_weights[0]
            if not batch_values:
                continue

            feature_stats = accumulator[feature_name]
            # The unweighted summary uses a weight of 1 for every value so
            # that the same weighted quantiles combiner serves both cases.
            unit_weights = [1] * len(batch_values)
            feature_stats.quantiles_summary = (
                self._quantiles_combiner.add_input(
                    feature_stats.quantiles_summary,
                    [batch_values, unit_weights]))

            if weighted:
                feature_stats.weighted_quantiles_summary = (
                    self._quantiles_combiner.add_input(
                        feature_stats.weighted_quantiles_summary,
                        values_and_weights))

        return accumulator