예제 #1
0
def _get_unicode_value(value: Union[Text, bytes], path: types.FeaturePath):
  """Returns `value` decoded as UTF-8, or a placeholder if decoding fails.

  Args:
    value: The feature value; it is handed to stats_util.maybe_get_utf8.
    path: The path of the feature. Only used in the warning message.

  Returns:
    The result of stats_util.maybe_get_utf8(value), or
    constants.NON_UTF8_PLACEHOLDER when that result is None.
  """
  decoded_value = stats_util.maybe_get_utf8(value)
  # Check if we have a valid utf-8 string. If not, assign a placeholder.
  if decoded_value is None:
    # Keep the decode result in its own name so the warning reports the
    # original bytes; rebinding `value` first would always log "None".
    logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                    'decoded as a UTF-8 string.', path, value)
    return constants.NON_UTF8_PLACEHOLDER
  return decoded_value
예제 #2
0
def _get_unicode_value(value: Union[Text, bytes]) -> Text:
    """Return `value` as UTF-8 text, or a placeholder if it is not decodable.

    Args:
        value: The feature value; it is handed to stats_util.maybe_get_utf8.

    Returns:
        The result of stats_util.maybe_get_utf8(value) when it is not None,
        otherwise constants.NON_UTF8_PLACEHOLDER.
    """
    utf8_text = stats_util.maybe_get_utf8(value)
    if utf8_text is not None:
        return utf8_text
    # Not a valid utf-8 string: bump the counter and use the placeholder.
    _NON_UTF8_VALUES_COUNTER.inc()
    return constants.NON_UTF8_PLACEHOLDER
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount], is_categorical: bool,
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto holding the top-k stats.

  Args:
    feature_path: The path of the feature.
    top_k_values_pairs: The (feature_value, count) pairs to report.
    is_categorical: If true the proto type is INT, otherwise STRING.
    is_weighted_stats: If true the stats are written to
      string_stats.weighted_string_stats instead of string_stats.
    num_top_values: Only ranks below this are added to top_values.
    frequency_threshold: Pairs whose count is below this are not reported.
    num_rank_histogram_buckets: Only ranks below this get a rank histogram
      bucket.

  Returns:
    The populated FeatureNameStatistics proto.
  """
  # Work on a sorted copy, highest count first. Ties on count are broken by
  # the feature value, with the 'larger' value ranked first.
  ranked_pairs = sorted(
      top_k_values_pairs,
      key=lambda entry: (entry.count, entry.feature_value),
      reverse=True)

  stats_proto = statistics_pb2.FeatureNameStatistics()
  stats_proto.path.CopyFrom(feature_path.to_proto())
  # Categorical features keep the original INT type.
  if is_categorical:
    stats_proto.type = statistics_pb2.FeatureNameStatistics.INT
  else:
    stats_proto.type = statistics_pb2.FeatureNameStatistics.STRING

  string_stats = (stats_proto.string_stats.weighted_string_stats
                  if is_weighted_stats else stats_proto.string_stats)

  for rank, (value, count) in enumerate(ranked_pairs):
    # The pairs are in descending count order, so nothing after the first
    # below-threshold pair can qualify either.
    if count < frequency_threshold:
      break

    # The proto fields need text: decode bytes (using a placeholder when the
    # bytes are not valid utf-8) and stringify any other non-text value.
    if isinstance(value, six.binary_type):
      label = stats_util.maybe_get_utf8(value)
      if label is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        label = constants.NON_UTF8_PLACEHOLDER
    elif isinstance(value, six.text_type):
      label = value
    else:
      label = str(value)

    if rank < num_top_values:
      top_value = string_stats.top_values.add()
      top_value.value = label
      top_value.frequency = count
    if rank < num_rank_histogram_buckets:
      rank_bucket = string_stats.rank_histogram.buckets.add()
      rank_bucket.low_rank = rank
      rank_bucket.high_rank = rank
      rank_bucket.sample_count = count
      rank_bucket.label = label
  return stats_proto
예제 #4
0
def _to_slice_key(feature_value: Any):
  """Converts a feature value into a text slice key.

  Bytes values are decoded as UTF-8, because the slice name (dataset name) in
  the stats proto is a string field that only accepts valid unicode. Every
  other value is converted with str().

  Args:
    feature_value: The feature value to turn into a slice key.

  Returns:
    The decoded text for bytes input, str(feature_value) otherwise.

  Raises:
    ValueError: If a bytes value cannot be decoded as UTF-8.
  """
  if not isinstance(feature_value, six.binary_type):
    return str(feature_value)

  slice_key = stats_util.maybe_get_utf8(feature_value)
  if slice_key is not None:
    return slice_key
  raise ValueError('Feature names and slicing feature values must be valid'
                   ' UTF-8. Found value {}.'.format(feature_value))
  def update(self, value_list):
    """Folds the values of one example into the partial Time statistics.

    Falsy values are skipped. A bytes value that cannot be decoded as UTF-8
    sets `self.invalidated` and stops processing the rest of the list.

    Args:
      value_list: A list of the values in an example.
    """
    for raw_value in value_list:
      if not raw_value:
        continue

      text = raw_value
      if isinstance(raw_value, bytes):
        text = stats_util.maybe_get_utf8(raw_value)
        if text is None:
          self.invalidated = True
          return

      self.considered += 1
      # A value may match several formats; each matching one is counted.
      for strptime_format, time_regex in _TIME_RE_LIST:
        if time_regex.match(text):
          self.matching_formats[strptime_format] += 1
    def add_input(self, accumulator, input_batch):
        """Fold a batch of inputs into the accumulator and return it.

        An accumulator that is already invalidated is returned untouched.

        Args:
            accumulator: The current accumulator.
            input_batch: A list representing a batch of feature value_lists
                (one per example) which should be added to the accumulator.

        Returns:
            The accumulator after updating the statistics for the batch of
            inputs.
        """
        if accumulator.invalidate:
            return accumulator

        for value_list in input_batch:
            # Missing or empty value lists contribute nothing.
            if value_list is None or value_list.size == 0:
                continue

            # Only arrays whose dtype maps to the STRING feature type are
            # eligible; any other type invalidates the stats.
            feature_type = stats_util.get_feature_type(value_list.dtype)
            if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
                accumulator.invalidate = True
                return accumulator

            # Run the classifier on every non-empty value.
            for value in value_list:
                if not value:
                    continue
                if isinstance(value, bytes):
                    # Bytes that are not valid UTF-8 invalidate the stats.
                    if stats_util.maybe_get_utf8(value) is None:
                        accumulator.invalidate = True
                        return accumulator
                accumulator.considered += 1
                accumulator.matched += self._classifier.classify(value)
        return accumulator
 def _maybe_get_utf8(val):
     """Decode bytes via stats_util.maybe_get_utf8; return other values as is."""
     if isinstance(val, bytes):
         return stats_util.maybe_get_utf8(val)
     return val
예제 #8
0
 def test_get_utf8(self):
     """Checks maybe_get_utf8 on a decodable and an undecodable input."""
     decoded = stats_util.maybe_get_utf8(b'This is valid.')
     self.assertEqual(u'This is valid.', decoded)

     # The single byte 0xF0 is expected to yield None.
     undecodable = stats_util.maybe_get_utf8(b'\xF0')
     self.assertIsNone(undecodable)
예제 #9
0
def make_feature_stats_proto_with_topk_stats(feature_name,
                                             top_k_value_count_list,
                                             is_categorical, is_weighted_stats,
                                             num_top_values,
                                             frequency_threshold,
                                             num_rank_histogram_buckets):
    """Makes a FeatureNameStatistics proto containing the top-k stats.

    The input list is not modified; a sorted copy is used internally.

    Args:
        feature_name: The feature name.
        top_k_value_count_list: A list of FeatureValueCount tuples.
        is_categorical: Whether the feature is categorical.
        is_weighted_stats: Whether top_k_value_count_list incorporates
            weights.
        num_top_values: The number of most frequent feature values to keep
            for string features.
        frequency_threshold: The minimum number of examples (possibly
            weighted) the most frequent values must be present in.
        num_rank_histogram_buckets: The number of buckets in the rank
            histogram for string features.

    Returns:
        A FeatureNameStatistics proto containing the top-k stats.
    """
    # Sort a copy of the list in descending order by count, so the caller's
    # list is left untouched. Where multiple feature values have the same
    # count, consider the feature with the 'larger' feature value to be
    # larger for purposes of breaking the tie.
    sorted_value_counts = sorted(
        top_k_value_count_list,
        key=lambda counts: (counts[1], counts[0]),
        reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.name = feature_name
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)

    if is_weighted_stats:
        string_stats = result.string_stats.weighted_string_stats
    else:
        string_stats = result.string_stats

    for rank, (value, count) in enumerate(sorted_value_counts):
        # Counts are in descending order, so every remaining entry is below
        # the threshold as well.
        if count < frequency_threshold:
            break
        # Convert to string if integer.
        if isinstance(value, numbers.Integral):
            value = str(value)
        # Check if we have a valid utf-8 string. If not, assign a default invalid
        # string value.
        elif isinstance(value, bytes) and maybe_get_utf8(value) is None:
            logging.warning(
                'Feature "%s" has bytes value "%s" which cannot be '
                'decoded as a UTF-8 string.', feature_name, value)
            value = _INVALID_STRING

        if rank < num_top_values:
            freq_and_value = string_stats.top_values.add()
            freq_and_value.value = value
            freq_and_value.frequency = count
        if rank < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = rank
            bucket.high_rank = rank
            bucket.sample_count = count
            bucket.label = value
    return result
예제 #10
0
 def _is_non_utf8(value):
     """Return True only for bytes that maybe_get_utf8 cannot decode."""
     if not isinstance(value, bytes):
         return False
     return stats_util.maybe_get_utf8(value) is None
def make_feature_stats_proto_with_topk_stats(
    feature_path: types.FeaturePath,
    top_k_value_count_list: List[FeatureValueCount], is_categorical: bool,
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Makes a FeatureNameStatistics proto containing the top-k stats.

  Args:
    feature_path: The path of the feature.
    top_k_value_count_list: A list of FeatureValueCount tuples.
    is_categorical: Whether the feature is categorical.
    is_weighted_stats: Whether top_k_value_count_list incorporates weights.
    num_top_values: The number of most frequent feature values to keep for
      string features.
    frequency_threshold: The minimum number of examples (possibly weighted) the
      most frequent values must be present in.
    num_rank_histogram_buckets: The number of buckets in the rank histogram for
      string features.

  Returns:
    A FeatureNameStatistics proto containing the top-k stats.
  """
  # Sort (a copy of) the top_k_value_count_list in descending order by count.
  # Where multiple feature values have the same count, consider the feature with
  # the 'larger' feature value to be larger for purposes of breaking the tie.
  sorted_value_counts = sorted(
      top_k_value_count_list,
      key=lambda counts: (counts[1], counts[0]),
      reverse=True)

  result = statistics_pb2.FeatureNameStatistics()
  result.path.CopyFrom(feature_path.to_proto())
  # If we have a categorical feature, we preserve the type to be the original
  # INT type.
  result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                 else statistics_pb2.FeatureNameStatistics.STRING)

  if is_weighted_stats:
    string_stats = result.string_stats.weighted_string_stats
  else:
    string_stats = result.string_stats

  for rank, (value, count) in enumerate(sorted_value_counts):
    # Counts are in descending order, so every remaining entry is below the
    # threshold as well.
    if count < frequency_threshold:
      break
    # Check if we have a valid utf-8 string. If not, assign a default invalid
    # string value.
    if isinstance(value, six.binary_type):
      # Keep the decode result in its own name so the warning below reports
      # the original bytes; rebinding `value` first would always log "None".
      decoded_value = stats_util.maybe_get_utf8(value)
      if decoded_value is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, value)
        value = constants.NON_UTF8_PLACEHOLDER
      else:
        value = decoded_value
    elif not isinstance(value, six.text_type):
      value = str(value)

    if rank < num_top_values:
      freq_and_value = string_stats.top_values.add()
      freq_and_value.value = value
      freq_and_value.frequency = count
    if rank < num_rank_histogram_buckets:
      bucket = string_stats.rank_histogram.buckets.add()
      bucket.low_rank = rank
      bucket.high_rank = rank
      bucket.sample_count = count
      bucket.label = value
  return result