def _get_sparse_feature(
        schema: schema_pb2.Schema,
        feature_path: types.FeaturePath) -> schema_pb2.SparseFeature:
    """Returns the sparse feature located at `feature_path` in the schema.

    Args:
      schema: A Schema protocol buffer.
      feature_path: The path of the sparse feature. Every step except the last
        must name a STRUCT feature; the last step names the sparse feature.

    Returns:
      The SparseFeature found at `feature_path`.

    Raises:
      TypeError: If `schema` is not a Schema proto.
      ValueError: If a parent step is not found, if a parent step is not a
        STRUCT feature, or if the sparse feature itself is not found.
    """
    if not isinstance(schema, schema_pb2.Schema):
        raise TypeError('schema is of type %s, should be a Schema proto.' %
                        type(schema).__name__)

    # With no parent steps, the sparse feature is looked up at the top level.
    sparse_feature_container = schema.sparse_feature
    parent = feature_path.parent()
    if parent:
        # Sparse features do not have a struct_domain and so can be only leaves.
        # Thus, we can assume that all parent steps are features, not sparse
        # features.
        struct_feature_container = schema.feature
        for step in parent.steps():
            f = schema_util.look_up_feature(step, struct_feature_container)
            if f is None:
                raise ValueError('Feature %s not found in the schema.' %
                                 feature_path)
            if f.type != schema_pb2.STRUCT:
                raise ValueError(
                    'Step %s in feature %s does not refer to a valid STRUCT feature'
                    % (step, feature_path))
            # Keep descending through regular (struct) features; only the
            # final lookup below is performed among sparse features. Tracking
            # the two containers separately keeps paths with more than one
            # parent step working.
            struct_feature_container = f.struct_domain.feature
            sparse_feature_container = f.struct_domain.sparse_feature

    feature = _look_up_sparse_feature(feature_path.steps()[-1],
                                      sparse_feature_container)
    if feature is None:
        raise ValueError('Sparse Feature %s not found in the schema.' %
                         feature_path)
    return feature
Пример #2
0
  def _recursion_helper(
      query_path: types.FeaturePath, array: pa.Array,
      example_indices: Optional[np.ndarray]
  ) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Resolves `query_path` against `array`, consuming one step per call.

    Args:
      query_path: The steps that still have to be resolved, relative to
        `array`.
      array: The array reached so far.
      example_indices: Example indices for the elements of `array`, or None
        when they are not being tracked.

    Returns:
      A tuple of the array found at the end of the path and the example
      indices carried along with it (None if `example_indices` was None).

    Raises:
      KeyError: If steps remain but the innermost nested type of `array` is
        not a struct, or if a step is not a field of the struct.
    """
    current_type = array.type

    # Base case: no steps left, `array` is the result.
    if not query_path:
      if pa.types.is_struct(current_type) and wrap_flat_struct_in_list:
        # A bare struct array is passed through ToSingletonListArray when the
        # enclosing function's option requests it.
        array = array_util.ToSingletonListArray(array)
      return array, example_indices

    innermost_type = get_innermost_nested_type(current_type)
    if not pa.types.is_struct(innermost_type):
      raise KeyError('Cannot process query_path "{}" inside an array of type '
                     '{}. Expecting a struct<...> or '
                     '(large_)list...<struct<...>>.'.format(
                         query_path, current_type))

    track_indices = example_indices is not None
    structs, parent_indices = flatten_nested(array, track_indices)
    child_indices = example_indices[parent_indices] if track_indices else None

    path_steps = query_path.steps()
    head = path_steps[0]
    try:
      child = structs.field(head)
    except KeyError:
      raise KeyError('query_path step "{}" not in struct.'.format(head))
    return _recursion_helper(
        types.FeaturePath(path_steps[1:]), child, child_indices)
Пример #3
0
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Resolves `query_path` against `array`, consuming one step per call.

        Args:
          query_path: The steps that still have to be resolved, relative to
            `array`.
          array: The array reached so far.
          example_indices: Example indices for the elements of `array`, or
            None when they are not being tracked.

        Returns:
          A tuple of the array found at the end of the path and the example
          indices carried along with it (None if `example_indices` was None).

        Raises:
          KeyError: If steps remain but `array` is not a list-like array of
            structs, or if a step is not a field of the struct.
        """
        # Base case: no steps left, `array` is the result.
        if not query_path:
            return array, example_indices

        current_type = array.type
        is_list_of_structs = (
            is_list_like(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    query_path, current_type))

        structs = array.flatten()
        child_indices = None
        if example_indices is not None:
            # Map every flattened element back to the example it came from.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                array).to_numpy()
            child_indices = example_indices[parent_indices]

        path_steps = query_path.steps()
        head = path_steps[0]
        try:
            child = structs.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))
        return _recursion_helper(
            types.FeaturePath(path_steps[1:]), child, child_indices)
Пример #4
0
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     all_weights: Dict[types.FeatureName, np.ndarray],
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
   """Yields (path, array, weights) for `array` and the arrays nested in it.

   Args:
     feature_path: The path that leads to `array`.
     array: The array to enumerate.
     all_weights: Weight arrays keyed by weight feature name, each indexable
       by the parent indices of `array`'s flattened elements.

   Yields:
     Tuples of (feature path, array, weights). The weights are looked up in
     `all_weights` under the name that `example_weight_map` gives for the
     path, and are None when there is no such entry.
   """
   outer_type = array.type
   if not pa.types.is_struct(get_innermost_nested_type(outer_type)):
     # Not a struct at the innermost level: emit it and stop descending.
     leaf_weights = all_weights.get(example_weight_map.get(feature_path))
     yield (feature_path, array, leaf_weights)
     return

   if not enumerate_leaves_only:
     struct_weights = all_weights.get(example_weight_map.get(feature_path))
     # A flat struct array gets special handling: it is wrapped in a ListArray
     # whose elements are singleton lists, so that downstream can keep
     # assuming the enumerated arrays are list<*>.
     emitted = array
     if pa.types.is_struct(outer_type) and wrap_flat_struct_in_list:
       emitted = array_util.ToSingletonListArray(array)
     yield (feature_path, emitted, struct_weights)

   structs, parent_indices = flatten_nested(array, bool(all_weights))
   # Potential optimization:
   # Only flatten weights that we know will be used in the recursion.
   child_weights = {
       name: weight_array[parent_indices]
       for name, weight_array in all_weights.items()
   }
   for struct_field in structs.type:
     child_name = struct_field.name
     yield from _recursion_helper(
         feature_path.child(child_name), structs.field(child_name),
         child_weights)
Пример #5
0
def _make_feature_stats_proto(
        stats_values: Dict[Text, float], feature_path: types.FeaturePath
) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto holding custom stats for a feature.

    Args:
      stats_values: Maps the name of each custom statistic to its numeric
        value for the feature, e.g.
        {'Mutual Information': 0.5, 'Correlation': 0.1}.
      feature_path: The path of the feature.

    Returns:
      A FeatureNameStatistics proto whose path is `feature_path` and whose
      custom_stats has one entry per statistic, ordered by statistic name.
    """
    feature_stats = statistics_pb2.FeatureNameStatistics()
    feature_stats.path.CopyFrom(feature_path.to_proto())

    # Iterate in sorted name order so the output ordering is deterministic.
    for custom_stat_name in sorted(stats_values):
        feature_stats.custom_stats.add(
            name=custom_stat_name, num=stats_values[custom_stat_name])
    return feature_stats
Пример #6
0
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Yields (path, array, weights) for `array` and the arrays nested in it.

     Args:
       feature_path: The path that leads to `array`.
       array: The array to enumerate.
       weights: Weights for the elements of `array`, or None when weights are
         not being tracked.

     Yields:
       Tuples of (feature path, array, weights).
     """
     outer_type = array.type
     if not pa.types.is_struct(get_innermost_nested_type(outer_type)):
         # Not a struct at the innermost level: emit it and stop descending.
         yield (feature_path, array, weights)
         return

     if not enumerate_leaves_only:
         # A flat struct array gets special handling: it is wrapped in a
         # ListArray whose elements are singleton lists, so that downstream
         # can keep assuming the enumerated arrays are list<*>.
         emitted = array
         if pa.types.is_struct(outer_type) and wrap_flat_struct_in_list:
             emitted = array_util.ToSingletonListArray(array)
         yield (feature_path, emitted, weights)

     track_weights = weights is not None
     structs, parent_indices = flatten_nested(array, track_weights)
     child_weights = weights[parent_indices] if track_weights else None
     for struct_field in structs.type:
         child_name = struct_field.name
         yield from _recursion_helper(feature_path.child(child_name),
                                      structs.field(child_name),
                                      child_weights)
Пример #7
0
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Yields (path, array, weights) for `array` and the arrays nested in it.

     Args:
       feature_path: The path that leads to `array`.
       array: The array to enumerate.
       weights: Weights for the elements of `array`, or None when weights are
         not being tracked.

     Yields:
       Tuples of (feature path, array, weights).
     """
     outer_type = array.type
     is_list_of_structs = (
         is_list_like(outer_type)
         and pa.types.is_struct(outer_type.value_type))
     if not is_list_of_structs:
         # Anything other than a list-like of structs is emitted as is.
         yield (feature_path, array, weights)
         return

     if not enumerate_leaves_only:
         yield (feature_path, array, weights)

     structs = array.flatten()
     child_weights = None
     if weights is not None:
         # Give every flattened element the weight of its parent element.
         parent_indices = array_util.GetFlattenedArrayParentIndices(
             array).to_numpy()
         child_weights = weights[parent_indices]
     for struct_field in structs.type:
         child_name = struct_field.name
         yield from _recursion_helper(feature_path.child(child_name),
                                      structs.field(child_name),
                                      child_weights)
 def _recursion_helper(
     parent_path: types.FeaturePath,
     container: Union[schema_pb2.Schema, schema_pb2.StructDomain]
 ) -> List[Tuple[types.FeaturePath, schema_pb2.SparseFeature]]:
     """Collects every sparse feature found under `container`.

     Args:
       parent_path: The path that leads to `container`.
       container: A Schema or StructDomain whose `sparse_feature` and
         `feature` fields are searched.

     Returns:
       A list of (path, sparse feature) tuples: first the sparse features
       directly in `container`, then those found under its STRUCT features.
     """
     # Sparse features do not have a struct_domain, so they cannot be parent
     # features; each one found here goes straight into the result.
     collected = [(parent_path.child(sparse.name), sparse)
                  for sparse in container.sparse_feature]
     for feature in container.feature:
         if feature.type != schema_pb2.STRUCT:
             continue
         nested = _recursion_helper(parent_path.child(feature.name),
                                    feature.struct_domain)
         collected.extend(nested)
     return collected
Пример #9
0
def _make_feature_stats_proto_uniques(
    feature_path: types.FeaturePath, num_uniques: int,
) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto carrying the unique-value count.

  Args:
    feature_path: The path of the feature.
    num_uniques: The value stored in `string_stats.unique`.

  Returns:
    A FeatureNameStatistics proto with its path and `string_stats.unique` set.
  """
  uniques_stats = statistics_pb2.FeatureNameStatistics()
  uniques_stats.string_stats.unique = num_uniques
  uniques_stats.path.CopyFrom(feature_path.to_proto())
  return uniques_stats
def _make_feature_stats_proto_topk(
    feature_path: types.FeaturePath,
    top_k_values_pairs: List[FeatureValueCount], is_categorical: bool,
    is_weighted_stats: bool, num_top_values: int,
    frequency_threshold: Union[float, int],
    num_rank_histogram_buckets: int) -> statistics_pb2.FeatureNameStatistics:
  """Builds a FeatureNameStatistics proto carrying the top-k stats.

  Args:
    feature_path: The path of the feature.
    top_k_values_pairs: (feature value, count) pairs. The list itself is not
      modified; a sorted copy is used.
    is_categorical: If True the proto type is INT, otherwise STRING.
    is_weighted_stats: If True the stats are written to
      `string_stats.weighted_string_stats`, otherwise to `string_stats`.
    num_top_values: Maximum number of entries added to `top_values`.
    frequency_threshold: Processing stops at the first pair whose count is
      below this value.
    num_rank_histogram_buckets: Maximum number of rank histogram buckets.

  Returns:
    The populated FeatureNameStatistics proto.
  """
  # Order by count, descending. Ties are broken by treating the pair with the
  # 'larger' feature value as the larger one.
  ordered_pairs = sorted(
      top_k_values_pairs,
      key=lambda pair: (pair.count, pair.feature_value),
      reverse=True)

  feature_stats = statistics_pb2.FeatureNameStatistics()
  feature_stats.path.CopyFrom(feature_path.to_proto())
  # A categorical feature keeps its original INT type.
  if is_categorical:
    feature_stats.type = statistics_pb2.FeatureNameStatistics.INT
  else:
    feature_stats.type = statistics_pb2.FeatureNameStatistics.STRING

  target_stats = feature_stats.string_stats
  if is_weighted_stats:
    target_stats = target_stats.weighted_string_stats

  for rank, (raw_value, count) in enumerate(ordered_pairs):
    if count < frequency_threshold:
      break

    # Bytes must be valid UTF-8; otherwise a placeholder label is used.
    # Anything that is neither bytes nor text is stringified.
    label = raw_value
    if isinstance(raw_value, six.binary_type):
      label = stats_util.maybe_get_utf8(raw_value)
      if label is None:
        logging.warning('Feature "%s" has bytes value "%s" which cannot be '
                        'decoded as a UTF-8 string.', feature_path, raw_value)
        label = constants.NON_UTF8_PLACEHOLDER
    elif not isinstance(raw_value, six.text_type):
      label = str(raw_value)

    if rank < num_top_values:
      top_value = target_stats.top_values.add()
      top_value.value = label
      top_value.frequency = count
    if rank < num_rank_histogram_buckets:
      histogram_bucket = target_stats.rank_histogram.buckets.add()
      histogram_bucket.low_rank = rank
      histogram_bucket.high_rank = rank
      histogram_bucket.sample_count = count
      histogram_bucket.label = label
  return feature_stats
Пример #11
0
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, _LiftSeries], y_path: types.FeaturePath,
    y_boundaries: Optional[np.ndarray]
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Builds a DatasetFeatureStatistics proto for one (x_path, y_path) pair.

    Args:
      lifts: The result of two successive group bys of lift values. The inner
        grouping collects all lift values for a given (slice, x_path, y_value)
        tuple (one LiftSeries message). The outer grouping collects all
        LiftSeries that share the same (slice, x_path). The full structure is:
          (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
      y_path: The path used as Y in the lift expression:
        lift = P(Y=y|X=x) / P(Y=y).
      y_boundaries: Optionally, bin boundaries used for binning y_path values.

    Returns:
      A tuple of the slice key taken from `lifts` and the populated
      DatasetFeatureStatistics proto.
    """
    sliced_key, series_list = lifts
    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = dataset_stats.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())
    series_container = cross_feature.categorical_cross_stats.lift.lift_series

    for series in sorted(series_list):
        series_proto = series_container.add(y_count=series.y_count)

        # Record y as a bucket, a string or an int, in that order of priority.
        y_value = series.y
        if y_boundaries is not None:
            bucket_low, bucket_high = bin_util.get_boundaries(
                y_value, y_boundaries)
            series_proto.y_bucket.low_value = bucket_low
            series_proto.y_bucket.high_value = bucket_high
        elif isinstance(y_value, six.string_types):
            series_proto.y_string = y_value
        else:
            series_proto.y_int = y_value

        # Dedupe possibly overlapping top_k and bottom_k x values; for a
        # repeated x the last entry wins.
        unique_by_x = {}
        for entry in series.lift_values:
            unique_by_x[entry.x] = entry

        # Sort by lift DESC, x ASC.
        ordered_entries = sorted(unique_by_x.values(),
                                 key=lambda entry: (-entry.lift, entry.x))
        for entry in ordered_entries:
            entry_proto = series_proto.lift_values.add(
                lift=entry.lift,
                x_count=entry.x_count,
                x_and_y_count=entry.xy_count)
            x_value = entry.x
            if isinstance(x_value, six.string_types):
                entry_proto.x_string = x_value
            else:
                entry_proto.x_int = x_value
    return sliced_key.slice_key, dataset_stats
Пример #12
0
def _PartitionTransform(pcol, row_partitions: int, column_partitions: int,
                        label_feature: types.FeaturePath, seed: int):
    """Applies a ParDo of `_PartitionFn` to `pcol`.

    Args:
      pcol: The input collection the ParDo is applied to.
      row_partitions: Forwarded to `_PartitionFn`.
      column_partitions: Forwarded to `_PartitionFn`.
      label_feature: Path of the label; only its first step is used, as the
        column name associated with the label.
      seed: Forwarded to `_PartitionFn`.

    Returns:
      The result of applying the "PartitionRowsCols" step to `pcol`.

    Raises:
      ValueError: If `label_feature` has no steps.
    """
    label_steps = label_feature.steps()
    if not label_steps:
        raise ValueError("Empty label feature")
    # The first step of the label path is the column name.
    label_column = label_steps[0]
    partition_fn = _PartitionFn(
        row_partitions, column_partitions, label_column, seed)
    return pcol | "PartitionRowsCols" >> beam.ParDo(partition_fn)
Пример #13
0
 def _recursion_helper(
     parent_path: types.FeaturePath,
     feature_container: Iterable[schema_pb2.Feature],
     result: List[Tuple[types.FeaturePath, schema_pb2.Feature]]):
   """Appends every non-STRUCT feature under `feature_container` to `result`.

   Args:
     parent_path: The path that leads to `feature_container`.
     feature_container: The features to visit.
     result: Output list, extended in place with (path, feature) tuples.
   """
   for feature in feature_container:
     path = parent_path.child(feature.name)
     if feature.type == schema_pb2.STRUCT:
       # Descend into the struct instead of recording the struct itself.
       _recursion_helper(path, feature.struct_domain.feature, result)
       continue
     result.append((path, feature))
def _make_feature_stats_proto_uniques(
        feature_path: types.FeaturePath, num_uniques: int,
        is_categorical: bool) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto carrying the unique-value count.

    Args:
      feature_path: The path of the feature.
      num_uniques: The value stored in `string_stats.unique`.
      is_categorical: If True the proto type is INT, otherwise STRING.

    Returns:
      A FeatureNameStatistics proto with its path, type and
      `string_stats.unique` set.
    """
    uniques_stats = statistics_pb2.FeatureNameStatistics()
    uniques_stats.path.CopyFrom(feature_path.to_proto())
    # A categorical feature keeps its original INT type.
    if is_categorical:
        uniques_stats.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        uniques_stats.type = statistics_pb2.FeatureNameStatistics.STRING
    uniques_stats.string_stats.unique = num_uniques
    return uniques_stats
Пример #15
0
    def test_access_attributes(self):
        """Checks that ValidationOptions exposes the values it was given."""
        reasons = [
            validation_options.ReasonFeatureNeeded(comment='reason1'),
            validation_options.ReasonFeatureNeeded(comment='reason2'),
        ]
        expected_features_needed = {FeaturePath(['a', 'b']): reasons}
        expected_new_features_are_warnings = True

        options = validation_options.ValidationOptions(
            expected_features_needed, expected_new_features_are_warnings)

        # Both getters must return what was passed to the constructor.
        self.assertEqual(expected_features_needed, options.features_needed)
        self.assertEqual(expected_new_features_are_warnings,
                         options.new_features_are_warnings)
Пример #16
0
def get_array(
        table: pa.Table, query_path: types.FeaturePath,
        return_example_indices: bool) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally example indices) from a table.

    Only chunk 0 of the queried column is read, so every column of `table` is
    assumed to have a single chunk. `table` is assumed to contain only arrays
    of these supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, each of which is itself one of the supported types
        (recursion intended).

    If the path refers to a leaf in the table, a ListArray with a primitive
    element type is returned; otherwise a ListArray with a StructArray element
    type is returned.

    Args:
      table: The Table whose arrays are visited.
      query_path: The FeaturePath to look up in the table.
      return_example_indices: Whether to also return an array with the example
        indices of the elements of the array found at `query_path`.

    Returns:
      A tuple. The first term is the feature array. The second term is the
      example indices array for it (array[i] came from the example at row
      example_indices[i] in the table), or None if `return_example_indices`
      is False.

    Raises:
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Resolves `query_path` against `array`, one step per call."""
        # Base case: no steps left, `array` is the result.
        if not query_path:
            return array, example_indices

        current_type = array.type
        is_list_of_structs = (
            is_list_like(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a (large_)list<struct<...>>.'.format(
                    query_path, current_type))

        structs = array.flatten()
        child_indices = None
        if example_indices is not None:
            # Map every flattened element back to the example it came from.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                array).to_numpy()
            child_indices = example_indices[parent_indices]

        path_steps = query_path.steps()
        head = path_steps[0]
        try:
            child = structs.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))
        return _recursion_helper(
            types.FeaturePath(path_steps[1:]), child, child_indices)

    if not query_path:
        raise KeyError('query_path must be non-empty.')

    all_steps = query_path.steps()
    column_name = all_steps[0]
    try:
        column_array = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))
    # The remaining steps are resolved inside the column.
    path_within_column = types.FeaturePath(all_steps[1:])

    if return_example_indices:
        row_indices = np.arange(table.num_rows)
    else:
        row_indices = None
    return _recursion_helper(path_within_column, column_array, row_indices)
Пример #17
0
def get_array(
    table: pa.Table,
    query_path: types.FeaturePath,
    broadcast_column_name: Optional[Text] = None
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Retrieves a nested array (and optionally a broadcast column) from a table.

    Only chunk 0 of the queried column is read, so every column of `table` is
    assumed to have a single chunk. `table` is assumed to contain only arrays
    of these supported types:
      - list<primitive>
      - list<struct<[Ts]>> where Ts are the types of the fields in the struct
        type, each of which is itself one of the supported types
        (recursion intended).

    If the path refers to a leaf in the table, a ListArray with a primitive
    element type is returned; otherwise a ListArray with a StructArray element
    type is returned.

    Args:
      table: The Table whose arrays are visited.
      query_path: The FeaturePath to look up in the table.
      broadcast_column_name: The name of a column to broadcast, or None. Each
        list in that column should contain exactly one value.

    Returns:
      A tuple. The first term is the feature array. The second term is the
      broadcast column array for it (broadcast_column[i] is the value that
      corresponds to array[i]), or None if `broadcast_column_name` is None.

    Raises:
      ValueError: May be raised by `get_broadcastable_column` (not shown here),
        presumably when the broadcast column does not have the expected form.
      KeyError: When `query_path` is empty, or cannot be found in the table
        and its nested struct arrays.
    """
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        weights: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Resolves `query_path` against `array`, one step per call."""
        # Base case: no steps left, `array` is the result.
        if not query_path:
            return array, weights

        current_type = array.type
        is_list_of_structs = (
            pa.types.is_list(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_structs:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    query_path, current_type))

        structs = array.flatten()
        child_weights = None
        if weights is not None:
            # Give every flattened element the value of its parent element.
            parent_indices = array_util.GetFlattenedArrayParentIndices(
                array).to_numpy()
            child_weights = weights[parent_indices]

        path_steps = query_path.steps()
        head = path_steps[0]
        try:
            child = structs.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))
        return _recursion_helper(
            types.FeaturePath(path_steps[1:]), child, child_weights)

    if not query_path:
        raise KeyError('query_path must be non-empty.')

    all_steps = query_path.steps()
    column_name = all_steps[0]
    try:
        column_array = table.column(column_name).data.chunk(0)
    except KeyError:
        raise KeyError(
            'query_path step 0 "{}" not in table.'.format(column_name))
    # The remaining steps are resolved inside the column.
    path_within_column = types.FeaturePath(all_steps[1:])

    if broadcast_column_name is None:
        broadcast_values = None
    else:
        broadcast_values = np.asarray(
            get_broadcastable_column(table, broadcast_column_name))
    return _recursion_helper(path_within_column, column_array,
                             broadcast_values)
Пример #18
0
def make_feature_stats_proto_topk_uniques_custom_stats(
    feature_path: types.FeaturePath,
    is_categorical: bool,
    num_top_values: int,
    num_rank_histogram_buckets: int,
    num_unique: int,
    value_count_list: List[FeatureValueCount],
    weighted_value_count_list: Optional[List[FeatureValueCount]] = None,
    frequency_threshold: int = 1,
    weighted_frequency_threshold: Optional[float] = None
) -> statistics_pb2.FeatureNameStatistics:
    """Builds a FeatureNameStatistics proto with top-k and uniques custom stats.

    Args:
      feature_path: The path of the feature.
      is_categorical: Whether the feature is categorical.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.
      num_unique: The number of unique values in the feature.
      value_count_list: A list of FeatureValueCount tuples.
      weighted_value_count_list: An optional list of FeatureValueCount tuples
        for weighted features. When it is non-empty,
        `weighted_frequency_threshold` must not be None.
      frequency_threshold: The minimum number of examples the most frequent
        values must be present in.
      weighted_frequency_threshold: The minimum weighted number of examples
        the most frequent weighted values must be present in. Optional.

    Returns:
      A FeatureNameStatistics proto whose custom_stats hold, in order, the
      top-k rank histogram, the weighted top-k rank histogram (only when
      weighted counts were given) and the number of uniques.
    """
    custom_stats_proto = statistics_pb2.FeatureNameStatistics()
    custom_stats_proto.path.CopyFrom(feature_path.to_proto())
    # A categorical feature keeps its original INT type.
    if is_categorical:
        custom_stats_proto.type = statistics_pb2.FeatureNameStatistics.INT
    else:
        custom_stats_proto.type = statistics_pb2.FeatureNameStatistics.STRING

    # Unweighted top-k: build a throwaway proto and copy its rank histogram
    # into a custom stat.
    unweighted_proto = _make_feature_stats_proto_topk(
        feature_path, value_count_list, is_categorical, False,
        num_top_values, frequency_threshold, num_rank_histogram_buckets)
    unweighted_histogram = unweighted_proto.string_stats.rank_histogram
    unweighted_custom_stat = custom_stats_proto.custom_stats.add(
        name=_TOPK_SKETCH_CUSTOM_STATS_NAME)
    unweighted_custom_stat.rank_histogram.CopyFrom(unweighted_histogram)

    # Weighted top-k: same procedure, only when weighted counts were given.
    if weighted_value_count_list:
        assert weighted_frequency_threshold is not None
        weighted_proto = _make_feature_stats_proto_topk(
            feature_path, weighted_value_count_list, is_categorical, True,
            num_top_values, weighted_frequency_threshold,
            num_rank_histogram_buckets)
        weighted_string_stats = (
            weighted_proto.string_stats.weighted_string_stats)
        weighted_custom_stat = custom_stats_proto.custom_stats.add(
            name=_WEIGHTED_TOPK_SKETCH_CUSTOM_STATS_NAME)
        weighted_custom_stat.rank_histogram.CopyFrom(
            weighted_string_stats.rank_histogram)

    # Finally, record the number of uniques.
    custom_stats_proto.custom_stats.add(
        name=_UNIQUES_SKETCH_CUSTOM_STATS_NAME, num=num_unique)
    return custom_stats_proto
Пример #19
0
def make_feature_stats_proto_with_topk_stats(
        feature_path: types.FeaturePath,
        top_k_value_count_list: List[FeatureValueCount], is_categorical: bool,
        is_weighted_stats: bool, num_top_values: int,
        frequency_threshold: Union[float, int], num_rank_histogram_buckets: int
) -> statistics_pb2.FeatureNameStatistics:
    """Makes a FeatureNameStatistics proto containing the top-k stats.

    Args:
      feature_path: The path of the feature.
      top_k_value_count_list: A list of FeatureValueCount tuples. The list is
        not modified; a sorted copy is used.
      is_categorical: Whether the feature is categorical.
      is_weighted_stats: Whether top_k_value_count_list incorporates weights.
      num_top_values: The number of most frequent feature values to keep for
        string features.
      frequency_threshold: The minimum number of examples (possibly weighted)
        the most frequent values must be present in.
      num_rank_histogram_buckets: The number of buckets in the rank histogram
        for string features.

    Returns:
      A FeatureNameStatistics proto containing the top-k stats.
    """
    # Sort a copy of top_k_value_count_list in descending order by count, so
    # the caller's list is left untouched. Where multiple feature values have
    # the same count, consider the feature with the 'larger' feature value to
    # be larger for purposes of breaking the tie.
    sorted_value_counts = sorted(
        top_k_value_count_list,
        key=lambda counts: (counts[1], counts[0]),
        reverse=True)

    result = statistics_pb2.FeatureNameStatistics()
    result.path.CopyFrom(feature_path.to_proto())
    # If we have a categorical feature, we preserve the type to be the original
    # INT type.
    result.type = (statistics_pb2.FeatureNameStatistics.INT if is_categorical
                   else statistics_pb2.FeatureNameStatistics.STRING)

    if is_weighted_stats:
        string_stats = result.string_stats.weighted_string_stats
    else:
        string_stats = result.string_stats

    for i, (value, count) in enumerate(sorted_value_counts):
        if count < frequency_threshold:
            break
        # Check if we have a valid utf-8 string. If not, assign a default
        # invalid string value.
        if isinstance(value, six.binary_type):
            # Keep the raw bytes in `value` until after the warning, so the
            # log shows the offending value rather than None.
            decoded_value = stats_util.maybe_get_utf8(value)
            if decoded_value is None:
                logging.warning(
                    'Feature "%s" has bytes value "%s" which cannot be '
                    'decoded as a UTF-8 string.', feature_path, value)
                value = _INVALID_STRING
            else:
                value = decoded_value
        elif not isinstance(value, six.text_type):
            value = str(value)

        if i < num_top_values:
            freq_and_value = string_stats.top_values.add()
            freq_and_value.value = value
            freq_and_value.frequency = count
        if i < num_rank_histogram_buckets:
            bucket = string_stats.rank_histogram.buckets.add()
            bucket.low_rank = i
            bucket.high_rank = i
            bucket.sample_count = count
            bucket.label = value
    return result
Пример #20
0
def _prepend_slice_path(slice_name: str,
                        path: types.FeaturePath) -> types.FeaturePath:
    """Returns a new path whose first step is prefixed with the slice name.

    The first step becomes 'slice(<slice_name>)::<original first step>'; the
    remaining steps are carried over unchanged.

    Args:
      slice_name: The name inserted into the prefix.
      path: The path whose first step is prefixed.

    Returns:
      A new FeaturePath built from the prefixed first step and the rest of the
      original steps.
    """
    original_steps = path.steps()
    slice_prefix = 'slice(%s)::' % slice_name
    prefixed_first_step = slice_prefix + original_steps[0]
    return types.FeaturePath((prefixed_first_step, ) + original_steps[1:])
Пример #21
0
def get_array(
    record_batch: pa.RecordBatch,
    query_path: types.FeaturePath,
    return_example_indices: bool,
    wrap_flat_struct_in_list: bool = True,
) -> Tuple[pa.Array, Optional[np.ndarray]]:
  """Retrieve a nested array (and optionally example indices) from RecordBatch.

  This function has the same assumption over `record_batch` as
  `enumerate_arrays()` does.

  If the provided path refers to a leaf in the `record_batch`, then a
  "nested_list" will be returned. If the provided path does not refer to a leaf,
  a "struct" will be returned.

  See `enumerate_arrays()` for definition of "nested_list" and "struct".

  Args:
    record_batch: The RecordBatch whose arrays to be visited.
    query_path: The FeaturePath to lookup in the record_batch.
    return_example_indices: Whether to return an additional array containing the
      example indices of the elements in the array corresponding to the
      query_path.
    wrap_flat_struct_in_list: if True, and if the query_path leads to a
      struct<[Ts]> array, it will be wrapped in a list array, where each
      sub-list contains one element. Caller can make use of this option to
      assume this function always returns a list<inner_type>.

  Returns:
    A tuple. The first term is the feature array and the second term is the
    example_indices array for the feature array (i.e. array[i] came from the
    example at row example_indices[i] in the record_batch.). The second term
    is None when `return_example_indices` is False.

  Raises:
    KeyError: When the query_path is empty, or cannot be found in the
      record_batch and its nested struct arrays.
  """

  def _recursion_helper(
      query_path: types.FeaturePath, array: pa.Array,
      example_indices: Optional[np.ndarray]
  ) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Resolves the remaining `query_path` steps inside `array`.

    Each call consumes one step: the array is flattened to its innermost
    struct array, the child field named by the step is selected, and the
    helper recurses on that child with the rest of the path.

    Args:
      query_path: The steps that still have to be resolved, relative to
        `array`.
      array: The array reached so far.
      example_indices: For each element of `array`, the row of the record
        batch it came from, or None if indices are not being tracked.

    Returns:
      The array the path resolves to and its example indices (or None).

    Raises:
      KeyError: If `array` does not contain structs, or the next step is not a
        field of the struct.
    """
    array_type = array.type
    if not query_path:
      # Path exhausted: `array` is the result.
      if pa.types.is_struct(array_type) and wrap_flat_struct_in_list:
        array = array_util.ToSingletonListArray(array)
      return array, example_indices
    if not pa.types.is_struct(get_innermost_nested_type(array_type)):
      raise KeyError('Cannot process query_path "{}" inside an array of type '
                     '{}. Expecting a struct<...> or '
                     '(large_)list...<struct<...>>.'.format(
                         query_path, array_type))
    flat_struct_array, parent_indices = flatten_nested(
        array, example_indices is not None)
    flat_indices = None
    if example_indices is not None:
      # Map every flattened element back to the example it originated from.
      flat_indices = example_indices[parent_indices]

    remaining_steps = query_path.steps()
    step = remaining_steps[0]
    try:
      child_array = flat_struct_array.field(step)
    except KeyError as field_error:
      # Chain explicitly so the traceback reports the failed field lookup as
      # the direct cause of this error.
      raise KeyError(
          'query_path step "{}" not in struct.'.format(step)) from field_error
    relative_path = types.FeaturePath(remaining_steps[1:])
    return _recursion_helper(relative_path, child_array, flat_indices)

  if not query_path:
    raise KeyError('query_path must be non-empty.')
  top_level_steps = query_path.steps()
  column_name = top_level_steps[0]
  field_index = record_batch.schema.get_field_index(column_name)
  if field_index < 0:
    raise KeyError('query_path step 0 "{}" not in record batch.'
                   .format(column_name))
  array = record_batch.column(field_index)
  array_path = types.FeaturePath(top_level_steps[1:])

  # Top-level columns have one element per row, so the indices start out as
  # the row numbers themselves.
  example_indices = np.arange(
      record_batch.num_rows) if return_example_indices else None
  return _recursion_helper(array_path, array, example_indices)
def _make_feature_stats_proto(
        feature_path: types.FeaturePath, basic_stats: _PartialBasicStats,
        parent_basic_stats: Optional[_PartialBasicStats],
        num_values_q_combiner: quantiles_util.QuantilesCombiner,
        values_q_combiner: quantiles_util.QuantilesCombiner,
        num_values_histogram_buckets: int, num_histogram_buckets: int,
        num_quantiles_histogram_buckets: int, is_categorical: bool,
        has_weights: bool) -> statistics_pb2.FeatureNameStatistics:
    """Builds the FeatureNameStatistics proto for a single feature.

    Args:
      feature_path: The path of the feature.
      basic_stats: The partial basic stats associated with the feature.
      parent_basic_stats: The partial basic stats of the parent of the
        feature, or None.
      num_values_q_combiner: The quantiles combiner used to construct the
        quantiles histogram for the number of values in the feature.
      values_q_combiner: The quantiles combiner used to construct the
        histogram for the values in the feature.
      num_values_histogram_buckets: Number of buckets in the quantiles
        histogram for the number of values per feature.
      num_histogram_buckets: Number of buckets in a standard
        NumericStatistics.histogram with equal-width buckets.
      num_quantiles_histogram_buckets: Number of buckets in a quantiles
        NumericStatistics.histogram.
      is_categorical: A boolean indicating whether the feature is categorical.
      has_weights: A boolean indicating whether a weight feature is specified.

    Returns:
      A statistics_pb2.FeatureNameStatistics proto carrying the path, the
      type and, depending on the type, string, struct or numeric statistics.
    """
    stats_cls = statistics_pb2.FeatureNameStatistics
    feature_stats = stats_cls()
    feature_stats.path.CopyFrom(feature_path.to_proto())

    partial_common = basic_stats.common_stats

    # Pick the feature type. Categorical features keep the original INT type.
    if is_categorical:
        feature_stats.type = stats_cls.INT
    else:
        inferred_type = partial_common.type
        # No type could be inferred (e.g. the feature is completely
        # missing): fall back to STRING.
        feature_stats.type = (stats_cls.STRING
                              if inferred_type is None else inferred_type)

    # Common statistics are shared by every kind of type-specific stats.
    parent_common = (None if parent_basic_stats is None else
                     parent_basic_stats.common_stats)
    common_proto = _make_common_stats_proto(partial_common, parent_common,
                                            num_values_q_combiner,
                                            num_values_histogram_buckets,
                                            has_weights)

    # Attach the common stats to the stats message matching the type.
    feature_type = feature_stats.type
    if is_categorical or feature_type == stats_cls.STRING:
        string_proto = _make_string_stats_proto(
            basic_stats.string_stats, partial_common.total_num_values)
        string_proto.common_stats.CopyFrom(common_proto)
        feature_stats.string_stats.CopyFrom(string_proto)
    elif feature_type == stats_cls.STRUCT:
        feature_stats.struct_stats.common_stats.CopyFrom(common_proto)
    elif feature_type in (stats_cls.INT, stats_cls.FLOAT):
        numeric_proto = _make_numeric_stats_proto(
            basic_stats.numeric_stats, partial_common.total_num_values,
            values_q_combiner, num_histogram_buckets,
            num_quantiles_histogram_buckets, has_weights)
        numeric_proto.common_stats.CopyFrom(common_proto)
        feature_stats.num_stats.CopyFrom(numeric_proto)

    return feature_stats
Пример #23
0
def _make_dataset_feature_stats_proto(
    lifts: Tuple[_SlicedFeatureKey, Iterable[_LiftSeries]],
    y_path: types.FeaturePath, y_boundaries: Optional[np.ndarray],
    weighted_examples: bool, output_custom_stats: bool
) -> Tuple[types.SliceKey, statistics_pb2.DatasetFeatureStatistics]:
    """Generates DatasetFeatureStatistics proto for a given x_path, y_path pair.

    Args:
      lifts: The result of two successive group bys of lift values. The
        innermost grouping collects all the lift values for a given (slice,
        x_path and y_value) tuple (corresponding to a single LiftSeries
        message). The outermost grouping collects all the lift values for the
        same (slice, x_path) tuple (corresponding to the set of the LiftSeries
        which share the same value of y_path). The full structure of lifts is
        described by:
          (slice, x_path), [(y, y_count, [(x, lift, xy_count, x_count)])]
      y_path: The path used as Y in the lift expression: lift = P(Y=y|X=x) /
        P(Y=y).
      y_boundaries: Optionally, a set of bin boundaries used for binning
        y_path values.
      weighted_examples: Whether lift is computed over weighted examples, in
        which case the proto will output weighted counts (as floats) rather
        than simple counts (as ints).
      output_custom_stats: Whether to output custom stats for use with Facets.

    Returns:
      A tuple of the slice key taken from `lifts` and the populated
      DatasetFeatureStatistics proto.
    """
    sliced_key, all_series = lifts

    # Weighted and unweighted counts live in differently named proto fields;
    # choose the field names once.
    if weighted_examples:
        y_count_field = 'weighted_y_count'
        x_count_field = 'weighted_x_count'
        xy_count_field = 'weighted_x_and_y_count'
    else:
        y_count_field = 'y_count'
        x_count_field = 'x_count'
        xy_count_field = 'x_and_y_count'

    dataset_stats = statistics_pb2.DatasetFeatureStatistics()
    cross_feature = dataset_stats.cross_features.add(
        path_x=sliced_key.x_path.to_proto(), path_y=y_path.to_proto())
    if output_custom_stats:
        custom_feature = dataset_stats.features.add(
            path=sliced_key.x_path.to_proto())

    for series in sorted(all_series):
        series_proto = (
            cross_feature.categorical_cross_stats.lift.lift_series.add())
        setattr(series_proto, y_count_field, series.y_count)

        # Record y in the proto field matching its kind and derive the label
        # used for the custom-stats histogram name.
        y_value = series.y
        if y_boundaries is not None and isinstance(y_value, int):
            bucket_low, bucket_high = bin_util.get_boundaries(
                y_value, y_boundaries)
            series_proto.y_bucket.low_value = bucket_low
            series_proto.y_bucket.high_value = bucket_high
            # A bucket ending at infinity is displayed as closed on the right.
            if bucket_high == float('inf'):
                y_label = '[{},{}]'.format(bucket_low, bucket_high)
            else:
                y_label = '[{},{})'.format(bucket_low, bucket_high)
        elif isinstance(y_value, six.text_type):
            y_label = y_value
            series_proto.y_string = y_label
        elif isinstance(y_value, six.binary_type):
            y_label = _get_unicode_value(y_value, y_path)
            series_proto.y_string = y_label
        else:
            series_proto.y_int = y_value
            y_label = str(y_value)

        if output_custom_stats:
            rank_hist = custom_feature.custom_stats.add(
                name='Lift (Y={})'.format(y_label)).rank_histogram

        # Top-k and bottom-k x values may overlap; keep one entry per x.
        unique_by_x = {}
        for candidate in series.lift_values:
            unique_by_x[candidate.x] = candidate
        # Order by lift descending, then by x ascending.
        ranked_values = list(unique_by_x.values())
        ranked_values.sort(key=lambda entry: (-entry.lift, entry.x))

        for entry in ranked_values:
            value_proto = series_proto.lift_values.add(lift=entry.lift)
            setattr(value_proto, x_count_field, entry.x_count)
            setattr(value_proto, xy_count_field, entry.xy_count)

            x_value = entry.x
            if isinstance(x_value, six.text_type):
                x_label = x_value
                value_proto.x_string = x_label
            elif isinstance(x_value, six.binary_type):
                x_label = _get_unicode_value(x_value, sliced_key.x_path)
                value_proto.x_string = x_label
            else:
                value_proto.x_int = x_value
                x_label = str(x_value)

            if output_custom_stats:
                # The bucket's sample_count field is filled with the lift.
                rank_hist.buckets.add(label=x_label, sample_count=entry.lift)

    return sliced_key.slice_key, dataset_stats
def _make_feature_stats_proto(
    feature_path: types.FeaturePath,
    basic_stats: _PartialBasicStats,
    parent_basic_stats: Optional[_PartialBasicStats],
    make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
    num_values_histogram_buckets: int,
    num_histogram_buckets: int,
    num_quantiles_histogram_buckets: int,
    is_bytes: bool, is_categorical: bool, has_weights: bool
) -> statistics_pb2.FeatureNameStatistics:
  """Builds the FeatureNameStatistics proto for a single feature.

  Args:
    feature_path: The path of the feature.
    basic_stats: The partial basic stats associated with the feature.
    parent_basic_stats: The partial basic stats of the parent of the feature,
      or None.
    make_quantiles_sketch_fn: A callable to create a quantiles sketch.
    num_values_histogram_buckets: Number of buckets in the quantiles
      histogram for the number of values per feature.
    num_histogram_buckets: Number of buckets in a standard
      NumericStatistics.histogram with equal-width buckets.
    num_quantiles_histogram_buckets: Number of buckets in a
      quantiles NumericStatistics.histogram.
    is_bytes: A boolean indicating whether the feature is bytes.
    is_categorical: A boolean indicating whether the feature is categorical.
    has_weights: A boolean indicating whether a weight feature is specified.

  Returns:
    A statistics_pb2.FeatureNameStatistics proto carrying the path, the type,
    the type-specific statistics and the num-values custom stats.
  """
  stats_cls = statistics_pb2.FeatureNameStatistics
  feature_stats = stats_cls()
  feature_stats.path.CopyFrom(feature_path.to_proto())

  partial_common = basic_stats.common_stats

  # Decide the feature type.
  inferred_type = partial_common.type
  if inferred_type is None:
    # No value was seen for this feature, so the caller's claim is trusted.
    if is_bytes:
      feature_stats.type = stats_cls.BYTES
    elif is_categorical:
      feature_stats.type = stats_cls.INT
    else:
      # There is no "unknown" type, so STRING is used instead.
      feature_stats.type = stats_cls.STRING
  elif is_bytes and inferred_type == stats_cls.STRING:
    # A BYTES claim is honoured only when the data was inferred as STRING
    # (strings/bytes); BYTES itself is never inferred.
    feature_stats.type = stats_cls.BYTES
  else:
    feature_stats.type = inferred_type

  # Common statistics are shared by every kind of type-specific stats.
  parent_common = (None if parent_basic_stats is None else
                   parent_basic_stats.common_stats)
  common_proto = _make_common_stats_proto(
      partial_common, parent_common, make_quantiles_sketch_fn,
      num_values_histogram_buckets, has_weights)

  # Total number of values at the leaf level (last nest level entry).
  valency_stats = partial_common.presence_and_valency_stats
  if valency_stats is None:
    leaf_total_num_values = 0
  else:
    leaf_total_num_values = valency_stats[-1].total_num_values

  # Attach the common stats to the stats message matching the type.
  feature_type = feature_stats.type
  if feature_type == stats_cls.BYTES:
    bytes_proto = _make_bytes_stats_proto(basic_stats.bytes_stats,
                                          common_proto.tot_num_values)
    bytes_proto.common_stats.CopyFrom(common_proto)
    feature_stats.bytes_stats.CopyFrom(bytes_proto)
  if (feature_type == stats_cls.STRING or
      (is_categorical and feature_type == stats_cls.INT)):
    string_proto = _make_string_stats_proto(basic_stats.string_stats,
                                            leaf_total_num_values)
    string_proto.common_stats.CopyFrom(common_proto)
    feature_stats.string_stats.CopyFrom(string_proto)
  elif feature_type == stats_cls.STRUCT:
    feature_stats.struct_stats.common_stats.CopyFrom(common_proto)
  elif feature_type in (stats_cls.INT, stats_cls.FLOAT):
    numeric_proto = _make_numeric_stats_proto(
        basic_stats.numeric_stats, leaf_total_num_values,
        num_histogram_buckets, num_quantiles_histogram_buckets, has_weights)
    numeric_proto.common_stats.CopyFrom(common_proto)
    feature_stats.num_stats.CopyFrom(numeric_proto)

  num_values_custom_stats = _make_num_values_custom_stats_proto(
      partial_common, num_values_histogram_buckets)
  feature_stats.custom_stats.extend(num_values_custom_stats)
  return feature_stats