Example #1
  def testFlattenNested(self):
    input_array = pa.array([[[1, 2]], None, [None, [3]]])
    flattened, parent_indices = arrow_util.flatten_nested(
        input_array, return_parent_indices=False)
    expected = pa.array([1, 2, 3])
    expected_parent_indices = [0, 0, 2]
    self.assertIs(parent_indices, None)
    self.assertTrue(flattened.equals(expected))

    flattened, parent_indices = arrow_util.flatten_nested(
        input_array, return_parent_indices=True)
    self.assertTrue(flattened.equals(expected))
    np.testing.assert_array_equal(parent_indices, expected_parent_indices)
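The test above pins down flatten_nested's contract: nested list arrays are flattened down to their primitive values, and the optional parent indices map each flattened value back to its outermost row (null rows are skipped). A minimal sketch of the most common downstream use, broadcasting per-example weights to per-value weights, assuming TFDV's arrow_util is importable from tensorflow_data_validation.arrow:

import numpy as np
import pyarrow as pa
from tensorflow_data_validation.arrow import arrow_util  # assumed import path

feature_array = pa.array([[1.0, 2.0], None, [3.0]])
weights = np.array([0.5, 1.0, 2.0])  # one weight per example (row)

flattened, parent_indices = arrow_util.flatten_nested(
    feature_array, return_parent_indices=True)
values = np.asarray(flattened)            # [1.0, 2.0, 3.0]
flat_weights = weights[parent_indices]    # [0.5, 0.5, 2.0]; the null row is skipped
weighted_sum = float(np.sum(values * flat_weights))  # 7.5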
Example #2
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        flattened_value_array, value_parent_indices = arrow_util.flatten_nested(
            feature_array, weights is not None)
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = np.asarray(flattened_value_array)
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]

        # We do this check to avoid failing in np.min/max with empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        curr_min = np.min(values_no_nan)
        curr_max = np.max(values_no_nan)
        self.min = min(self.min, curr_min)
        self.max = max(self.max, curr_max)
        if curr_min == float('-inf') or curr_max == float('inf'):
            finite_values = values_no_nan[np.isfinite(values_no_nan)]
            if finite_values.size > 0:
                self.finite_min = min(self.finite_min, np.min(finite_values))
                self.finite_max = max(self.finite_max, np.max(finite_values))

        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
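To see the masking arithmetic above on concrete numbers, here is a small self-contained sketch (plain NumPy, no TFDV) of how the NaN mask, the float64 cast, and the zero count interact:

import numpy as np

values = np.array([1.0, np.nan, 3.0, 0.0])
nan_mask = np.isnan(values)
num_nan = int(np.sum(nan_mask))                     # 1
values_no_nan = values[~nan_mask]                   # [1.0, 3.0, 0.0]
values_no_nan_as_double = values_no_nan.astype(np.float64)
total = np.sum(values_no_nan_as_double)             # 4.0
sum_of_squares = np.sum(values_no_nan_as_double *
                        values_no_nan_as_double)    # 10.0
num_zeros = values_no_nan.size - np.count_nonzero(values_no_nan)  # 1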
Example #3
    def add_input(self, accumulator: _PartialImageStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialImageStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        # Consider using memoryview to avoid copying after upgrading to
        # arrow 0.12. Note that this would involve modifying the subsequent logic
        # to iterate over the values in a loop.
        values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
        accumulator.total_num_values += values.size
        image_formats = self._image_decoder.get_formats(values)
        valid_mask = ~pd.isnull(image_formats)
        valid_formats = image_formats[valid_mask]
        format_counts = np.unique(valid_formats, return_counts=True)
        for (image_format, count) in zip(*format_counts):
            accumulator.counter_by_format[image_format] += count
        unknown_count = image_formats.size - valid_formats.size
        if unknown_count > 0:
            accumulator.counter_by_format[''] += unknown_count

        if self._enable_size_stats:
            # Get image height and width.
            image_sizes = self._image_decoder.get_sizes(values[valid_mask])
            if image_sizes.any():
                max_sizes = np.max(image_sizes, axis=0)
                # Update the max image height/width with all image values.
                accumulator.max_height = max(accumulator.max_height,
                                             max_sizes[0])
                accumulator.max_width = max(accumulator.max_width,
                                            max_sizes[1])

        return accumulator
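The format counting above is plain NumPy/pandas; a toy run with hypothetical decoder output makes the unknown-format bookkeeping explicit:

import numpy as np
import pandas as pd

# Hypothetical decoder output: one format string (or None) per value.
image_formats = np.array(['png', 'jpeg', None, 'png'], dtype=object)
valid_mask = ~pd.isnull(image_formats)
valid_formats = image_formats[valid_mask]
formats, counts = np.unique(valid_formats, return_counts=True)
# formats == ['jpeg', 'png'], counts == [1, 2]
unknown_count = image_formats.size - valid_formats.size  # 1, counted under ''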
Example #4
    def add_input(self, accumulator: _PartialTimeStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialTimeStats:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidated:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

            def _maybe_get_utf8(val):
                return stats_util.maybe_get_utf8(val) if isinstance(
                    val, bytes) else val

            values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
            maybe_utf8 = np.vectorize(_maybe_get_utf8, otypes=[object])(values)
            if not maybe_utf8.all():
                accumulator.invalidated = True
                return accumulator
            accumulator.update(maybe_utf8, feature_type)
        elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
            values = np.asarray(arrow_util.flatten_nested(feature_array)[0])
            accumulator.update(values, feature_type)
        else:
            accumulator.invalidated = True

        return accumulator
Example #5
def _get_flattened_feature_values_without_nulls(
        feature_array: pa.Array) -> List[Any]:
    """Flattens the feature array into a List and removes null values.

  Args:
    feature_array: Arrow Array.

  Returns:
    A list containing the flattened feature values with nulls removed.
  """
    non_missing_values = np.asarray(
        arrow_util.flatten_nested(feature_array)[0])
    return list(non_missing_values[~pd.isnull(non_missing_values)])
Example #6
 def update(self, feature_array: pa.Array) -> None:
   """Update the partial bytes statistics using the input value."""
   if pa.types.is_null(feature_array.type):
     return
   # Iterate through the value array and update the partial stats.
   flattened_values_array, _ = arrow_util.flatten_nested(feature_array)
   if (pa.types.is_floating(flattened_values_array.type) or
       pa.types.is_integer(flattened_values_array.type)):
     raise ValueError('Bytes stats cannot be computed on INT/FLOAT features.')
   if flattened_values_array:
     num_bytes = array_util.GetElementLengths(
         flattened_values_array).to_numpy()
     self.min_num_bytes = min(self.min_num_bytes, np.min(num_bytes))
     self.max_num_bytes = max(self.max_num_bytes, np.max(num_bytes))
     self.total_num_bytes += np.sum(num_bytes)
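array_util.GetElementLengths comes from tfx_bsl; to reproduce just the byte-length arithmetic outside TFDV, pyarrow's compute module offers a similar building block. A sketch, assuming pyarrow.compute.binary_length is available in your pyarrow version:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc  # binary_length assumed available

flattened_values_array = pa.array([b'ab', b'', b'abcd'])
num_bytes = np.asarray(pc.binary_length(flattened_values_array))
min_num_bytes = int(np.min(num_bytes))    # 0
max_num_bytes = int(np.max(num_bytes))    # 4
total_num_bytes = int(np.sum(num_bytes))  # 6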
Example #7
def _apply_numerical_encoding_to_feature_array(
        feature_array: pa.Array, histogram_bin_boundaries: np.ndarray,
        encoding_length: int) -> List[int]:
    """Determines encoding of numeric feature array from histogram bins.

  Using the provided histogram_bin_boundaries, a histogram is constructed for
  each example to obtain an encoding for a feature value.

  Args:
    feature_array: Arrow Array.
    histogram_bin_boundaries: A monotonically increasing np.ndarray representing
      the boundaries of each bin in the histogram.
    encoding_length: The length of the list containing the encoded feature
      values.

  Returns:
    A list containing the encoded feature values for each example.
  """
    if pa.types.is_null(feature_array.type):
        return []
    result = [None for _ in range(len(feature_array))]  # type: List
    flattened, non_missing_parent_indices = arrow_util.flatten_nested(
        feature_array, True)
    assert non_missing_parent_indices is not None
    non_missing_values = np.asarray(flattened)
    non_missing_parent_indices = non_missing_parent_indices.astype(np.int32)
    values_indices = np.stack((non_missing_values, non_missing_parent_indices),
                              axis=-1)
    nan_mask = pd.isnull(non_missing_values)
    for (value, index) in values_indices[~nan_mask]:
        index = int(index)
        if result[index] is None:
            result[index] = []
        result[index].append(value)
    for (value, index) in values_indices[nan_mask]:
        index = int(index)
        if result[index] is None:
            result[index] = []
    for i in range(len(result)):
        if result[i] is None:
            result[i] = [None] * encoding_length
        else:
            result[i] = np.bincount(
                np.digitize(result[i], histogram_bin_boundaries) - 1,
                minlength=encoding_length).tolist()
    return result
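The per-example encoding at the end is just np.digitize followed by np.bincount; a worked example with hypothetical bin boundaries shows how one example's values become a fixed-length count vector:

import numpy as np

histogram_bin_boundaries = np.array([0.0, 1.0, 2.0, 3.0])  # hypothetical
encoding_length = 3  # one slot per bin
example_values = [0.5, 1.5, 1.7, 2.5]
# np.digitize returns 1-based bin ids for these boundaries, hence the -1.
bin_ids = np.digitize(example_values, histogram_bin_boundaries) - 1  # [0, 1, 1, 2]
encoding = np.bincount(bin_ids, minlength=encoding_length).tolist()  # [1, 2, 1]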
Example #8
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    example_weight_map: ExampleWeightMap,
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
        int, Tuple[int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from the input."""
    slice_key, record_batch = sliced_record_batch

    has_any_weight = bool(example_weight_map.all_weight_features())
    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            record_batch,
            example_weight_map=example_weight_map,
            enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array_type)
        if feature_path in bytes_features:
            continue
        if ((feature_type == statistics_pb2.FeatureNameStatistics.INT
             and feature_path in categorical_features) or feature_type
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values, parent_indices = arrow_util.flatten_nested(
                feature_array, weights is not None)
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = np.asarray(flattened_values)
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = flattened_values.value_counts()
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                if has_any_weight:
                    for value, count in zip(values, counts):
                        yield ((slice_key, feature_path.steps(), value),
                               (count, 1))
                else:
                    for value, count in zip(values, counts):
                        yield ((slice_key, feature_path.steps(), value), count)
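The unweighted branch relies on pyarrow's Array.value_counts(), which returns a struct array with 'values' and 'counts' fields; a tiny standalone run of that call:

import pyarrow as pa

flattened_values = pa.array(['a', 'b', 'a'])
value_counts = flattened_values.value_counts()       # struct array
values = value_counts.field('values').to_pylist()    # e.g. ['a', 'b']
counts = value_counts.field('counts').to_pylist()    # e.g. [2, 1] (order may vary)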
Example #9
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _ValueCounts],
        input_record_batch: pa.RecordBatch
    ) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_record_batch,
                example_weight_map=self._example_weight_map,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            # If it's neither a categorical int feature nor a string feature, we
            # don't bother with top-k stats.
            if ((feature_type == statistics_pb2.FeatureNameStatistics.INT
                 and feature_path in self._categorical_features)
                    or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values, parent_indices = arrow_util.flatten_nested(
                    leaf_array, weights is not None)
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = flattened_values.value_counts()
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = np.asarray(flattened_values)
                    weighted_counts.weighted_update(flattened_values_np,
                                                    weights[parent_indices])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
Example #10
def _apply_categorical_encoding_to_feature_array(
        feature_array: pa.Array, categorical_encoding: Dict[Any, int],
        encoding_length: int) -> List[Any]:
    """Applies the provided encoding to the feature array.

  For each example, the frequency of each category is computed. Using the
  categorical_encoding dict, an encoding is created for the example by storing
  these counts in the appropriate index of the encoding.

  Args:
    feature_array: Arrow Array.
    categorical_encoding: A dict where the key is the category and the value is
      the index in the encoding to which the category corresponds.
    encoding_length: The length of the list containing the encoded feature
      values.

  Returns:
    A list containing the encoded feature values for each example.
  """
    if pa.types.is_null(feature_array.type):
        return []
    result = [None for _ in range(len(feature_array))]
    flattened, non_missing_parent_indices = arrow_util.flatten_nested(
        feature_array, True)
    non_missing_values = flattened.to_pylist()
    non_missing_parent_indices = list(non_missing_parent_indices)
    for (value, index) in zip(non_missing_values, non_missing_parent_indices):
        if result[index] is None:
            result[index] = []
        result[index].append(value)
    for i in range(len(result)):
        if result[i] is None:
            result[i] = [None] * encoding_length
        else:
            category_frequencies = collections.Counter(result[i])
            encoded_values = [0] * encoding_length
            for category in category_frequencies:
                if category in categorical_encoding:
                    encoded_values[categorical_encoding[category]] = (
                        category_frequencies[category])
                elif not pd.isnull(category):
                    encoded_values[-1] += category_frequencies[category]
            result[i] = encoded_values
    return result
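A short worked example of the counting loop above, with a hypothetical category-to-index map, clarifies how unmapped categories fall into the final slot of the encoding:

import collections

categorical_encoding = {'a': 0, 'b': 1}  # hypothetical map: category -> index
encoding_length = 3                      # last slot collects unmapped categories
example_values = ['a', 'a', 'c', 'b']
category_frequencies = collections.Counter(example_values)
encoded_values = [0] * encoding_length
for category, count in category_frequencies.items():
    if category in categorical_encoding:
        encoded_values[categorical_encoding[category]] = count
    else:  # no nulls in this toy input, so no pd.isnull check here
        encoded_values[-1] += count
# encoded_values == [2, 1, 1]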
Example #11
    def _get_univalent_values_with_parent_indices(
            self,
            examples: pa.RecordBatch) -> Dict[types.FeatureName, DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_name, feat_arr in zip(examples.schema.names,
                                          examples.columns):
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feat_arr.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (None,
                                statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            value_lengths = np.asarray(
                array_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            flattened, value_parent_indices = arrow_util.flatten_nested(
                feat_arr, True)
            non_missing_values = np.asarray(flattened)
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
Example #12
 def update(self, feature_array: pa.Array) -> None:
   """Update the partial string statistics using the input value."""
   if pa.types.is_null(feature_array.type):
     return
   # Iterate through the value array and update the partial stats.
   flattened_values_array, _ = arrow_util.flatten_nested(feature_array)
   if arrow_util.is_binary_like(flattened_values_array.type):
     # GetBinaryArrayTotalByteSize returns a Python long (to be compatible
     # with Python3). To make sure we do cheaper integer arithmetic in
     # Python2, we first convert it to int.
     self.total_bytes_length += int(array_util.GetBinaryArrayTotalByteSize(
         flattened_values_array))
   elif flattened_values_array:
     # We can only do flattened_values_array.to_numpy() when it's not empty.
     # This could be computed faster by taking log10 of the integer.
     def _len_after_conv(s):
       return len(str(s))
     self.total_bytes_length += np.sum(
         np.vectorize(_len_after_conv,
                      otypes=[np.int32])(np.asarray(flattened_values_array)))
Example #13
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        def _is_non_utf8(value):
            return (isinstance(value, bytes)
                    and stats_util.maybe_get_utf8(value) is None)

        is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
        classify_vec = np.vectorize(self._classifier.classify, otypes=[bool])
        values = np.asarray(
            arrow_util.flatten_nested(feature_array)[0].slice(
                0, _CROP_AT_VALUES))
        if np.any(is_non_utf_vec(values)):
            accumulator.invalidate = True
            return accumulator
        accumulator.considered += values.size
        accumulator.matched += np.sum(classify_vec(values))
        return accumulator
Example #14
  def _update_combined_sketch_for_feature(
      self, feature_name: tfdv_types.FeaturePath, values: pa.Array,
      weights: Optional[np.ndarray],
      accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch]):
    """Updates combined sketch with values (and weights if provided)."""
    flattened_values, parent_indices = arrow_util.flatten_nested(
        values, weights is not None)

    combined_sketch = accumulator.get(feature_name, None)
    if combined_sketch is None:
      combined_sketch = _CombinedSketch(
          distinct=KmvSketch(self._num_kmv_buckets),
          topk_unweighted=MisraGriesSketch(self._num_misragries_buckets),
          topk_weighted=MisraGriesSketch(self._num_misragries_buckets),
      )
    weight_array = None
    if weights is not None:
      flattened_weights = weights[parent_indices]
      weight_array = pa.array(flattened_weights, type=pa.float32())
    combined_sketch.add(flattened_values, weight_array)
    accumulator[feature_name] = combined_sketch
Example #15
def _encode_univalent_feature(feature_array: pa.Array) -> List[Any]:
    """Encodes univalent feature values into a fixed length representation.

  Univalent features are cast into a Python list. They are not affected by the
  encoding with the exception of null values which are replaced by None.

  Args:
    feature_array: Arrow Array.

  Returns:
    A list containing the feature values where null values are replaced by None.
  """
    result = [[None] for _ in range(len(feature_array))]
    flattened, non_missing_parent_indices = arrow_util.flatten_nested(
        feature_array, True)
    non_missing_values = np.asarray(flattened)
    nan_mask = pd.isnull(non_missing_values)
    non_nan_pairs = np.stack((non_missing_values, non_missing_parent_indices),
                             axis=-1)[~nan_mask]
    for (value, index) in non_nan_pairs:
        result[int(index)] = [value]
    return result
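The stack-and-mask trick above can be traced with the values flatten_nested would produce for the rows [[1.0], None, [nan], [2.0]] (a hand-built toy, not a call into TFDV):

import numpy as np
import pandas as pd

non_missing_values = np.array([1.0, np.nan, 2.0])
non_missing_parent_indices = np.array([0, 2, 3])
result = [[None] for _ in range(4)]
nan_mask = pd.isnull(non_missing_values)
non_nan_pairs = np.stack((non_missing_values, non_missing_parent_indices),
                         axis=-1)[~nan_mask]
for value, index in non_nan_pairs:
    result[int(index)] = [value]
# result == [[1.0], [None], [None], [2.0]]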
Example #16
def _to_topk_tuples(
    sliced_record_batch: Tuple[types.SliceKey, pa.RecordBatch],
    bytes_features: FrozenSet[types.FeaturePath],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[types.SliceKey, types.FeaturePathTuple, Any], Union[
    int, Tuple[int, Union[int, float]]]]]:
  """Generates tuples for computing top-k and uniques from the input."""
  slice_key, record_batch = sliced_record_batch

  for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
      record_batch,
      weight_column=weight_feature,
      enumerate_leaves_only=True):
    feature_array_type = feature_array.type
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array_type)
    # Skip null columns.
    if feature_type is None:
      continue
    if feature_path in bytes_features:
      continue
    if (feature_path in categorical_features or
        feature_type == statistics_pb2.FeatureNameStatistics.STRING):
      flattened_values, parent_indices = arrow_util.flatten_nested(
          feature_array, weights is not None)
      if weights is not None and flattened_values:
        # Slow path: weighted uniques.
        flattened_values_np = np.asarray(flattened_values)
        weights_ndarray = weights[parent_indices]
        for value, count, weight in _weighted_unique(
            flattened_values_np, weights_ndarray):
          yield (slice_key, feature_path.steps(), value), (count, weight)
      else:
        value_counts = array_util.ValueCounts(flattened_values)
        values = value_counts.field('values').to_pylist()
        counts = value_counts.field('counts').to_pylist()
        for value, count in six.moves.zip(values, counts):
          yield ((slice_key, feature_path.steps(), value), count)
Example #17
    def _update_combined_sketch_for_feature(
            self, feature_name: tfdv_types.FeaturePath, values: pa.Array,
            weights: Optional[np.ndarray],
            accumulator: Dict[tfdv_types.FeaturePath, _CombinedSketch]):
        """Updates combined sketch with values (and weights if provided)."""
        flattened_values, parent_indices = arrow_util.flatten_nested(
            values, weights is not None)

        combined_sketch = accumulator.get(feature_name, None)
        if combined_sketch is None:
            self._num_kmv_buckets_gauge.update(self._num_kmv_buckets)

            def make_mg_sketch():
                num_buckets = max(self._num_misragries_buckets,
                                  self._num_top_values,
                                  self._num_rank_histogram_buckets)
                self._num_mg_buckets_gauge.update(num_buckets)
                self._num_top_values_gauge.update(self._num_top_values)
                self._num_rank_histogram_buckets_gauge.update(
                    self._num_rank_histogram_buckets)
                return MisraGriesSketch(
                    num_buckets=num_buckets,
                    invalid_utf8_placeholder=constants.NON_UTF8_PLACEHOLDER,
                    # Maximum sketch size:
                    # _LARGE_STRING_THRESHOLD * num_buckets * constant_factor.
                    large_string_threshold=_LARGE_STRING_THRESHOLD,
                    large_string_placeholder=constants.LARGE_BYTES_PLACEHOLDER)

            self._num_top_values_gauge.update(self._num_top_values)
            combined_sketch = _CombinedSketch(distinct=KmvSketch(
                self._num_kmv_buckets),
                                              topk_unweighted=make_mg_sketch(),
                                              topk_weighted=make_mg_sketch())
        weight_array = None
        if weights is not None:
            flattened_weights = weights[parent_indices]
            weight_array = pa.array(flattened_weights, type=pa.float32())
        combined_sketch.add(flattened_values, weight_array)
        accumulator[feature_name] = combined_sketch
Example #18
def _get_example_value_presence(
        record_batch: pa.RecordBatch, path: types.FeaturePath,
        boundaries: Optional[Sequence[float]],
        weight_column_name: Optional[Text]) -> Optional[pd.DataFrame]:
    """Returns information about which examples contained which values.

  This function treats all values for a given path within a single example
  as a set and returns a mapping between each example index and the distinct
  values which are present in that example.

  The result of calling this function for path 'p' on an arrow record batch with
  the two records [{'p': ['a', 'a', 'b']}, {'p': ['a']}] will be a DataFrame
  with 'values' ['a', 'b', 'a'] indexed by example indices [0, 0, 1].

  If the array retrieved from get_array is null, this function returns None.

  Args:
    record_batch: The RecordBatch in which to look up the path.
    path: The FeaturePath for which to fetch values.
    boundaries: Optionally, a set of bin boundaries to use for binning the array
      values.
    weight_column_name: Optionally, a weight column to return in addition to the
      value and example index.

  Returns:
    A Pandas DataFrame containing distinct pairs of array values and example
    indices, along with the corresponding flattened example weights. The index
    will be the example indices and the values will be stored in a column named
    'values'. If weight_column_name is provided, a second column named
    'weights' will be returned containing the weights for the example from
    which each value came.
  """
    arr, example_indices = arrow_util.get_array(record_batch,
                                                path,
                                                return_example_indices=True)
    if stats_util.get_feature_type_from_arrow_type(path, arr.type) is None:
        return None

    arr_flat, parent_indices = arrow_util.flatten_nested(
        arr, return_parent_indices=True)
    is_binary_like = arrow_util.is_binary_like(arr_flat.type)
    assert boundaries is None or not is_binary_like, (
        'Boundaries can only be applied to numeric columns')
    if is_binary_like:
        # use dictionary_encode so we can use np.unique on object arrays
        dict_array = arr_flat.dictionary_encode()
        arr_flat = dict_array.indices
        arr_flat_dict = np.asarray(dict_array.dictionary)
    example_indices_flat = example_indices[parent_indices]
    if boundaries is not None:
        element_indices, bins = bin_util.bin_array(arr_flat, boundaries)
        rows = np.vstack([example_indices_flat[element_indices], bins])
    else:
        rows = np.vstack([example_indices_flat, np.asarray(arr_flat)])
    if not rows.size:
        return None
    # Deduplicate values which show up more than once in the same example. This
    # makes P(X=x|Y=y) in the standard lift definition behave as
    # P(x \in Xs | y \in Ys) if examples contain more than one value of X and Y.
    unique_rows = np.unique(rows, axis=1)
    example_indices = unique_rows[0, :]
    values = unique_rows[1, :]
    if is_binary_like:
        # Return binary-like values as a pd.Categorical wrapped in a Series. This
        # makes subsequent operations like pd.merge cheaper.
        values = pd.Categorical.from_codes(values, categories=arr_flat_dict)
    columns = {'example_indices': example_indices, 'values': values}
    if weight_column_name:
        weights = arrow_util.get_weight_feature(record_batch,
                                                weight_column_name)
        columns['weights'] = np.asarray(weights)[example_indices]
    df = pd.DataFrame(columns)
    return df.set_index('example_indices')
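The deduplication step, np.unique over stacked (example index, value) rows, is what collapses repeated values within one example into a single pair. Here it is on the toy records from the docstring, with the strings dictionary-encoded to integer codes by hand:

import numpy as np

# Records [{'p': ['a', 'a', 'b']}, {'p': ['a']}], with 'a' -> 0 and 'b' -> 1.
example_indices_flat = np.array([0, 0, 0, 1])
codes = np.array([0, 0, 1, 0])
rows = np.vstack([example_indices_flat, codes])
unique_rows = np.unique(rows, axis=1)
example_indices = unique_rows[0, :]  # [0, 0, 1]
values = unique_rows[1, :]           # [0, 1, 0], i.e. 'a', 'b', 'a'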
Example #19
    def feature_value_slicer(
            record_batch: pa.RecordBatch) -> Iterable[types.SlicedRecordBatch]:
        """A function that generates sliced record batches.

    The naive approach would be to iterate over each row, identify the slice
    keys for that row, and keep track of index ranges for each slice key, then
    generate an Arrow record batch for each slice key based on those index
    ranges. This would be expensive, because the slice keys would be identified
    for each row individually and we would have to loop over the feature
    values, including crossing them when slicing on multiple features. The
    current approach instead generates the slice keys for a batch by performing
    joins over the indices of individual features, and then groups the joined
    record batch by slice key to get the row indices corresponding to a slice.

    Args:
      record_batch: Arrow RecordBatch.

    Yields:
      Sliced record batch (slice_key, record_batch) where record_batch contains
      the rows corresponding to a slice.
    """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            feature_array = record_batch.column(
                record_batch.schema.get_field_index(feature_name))
            flattened, value_parent_indices = arrow_util.flatten_nested(
                feature_array, True)
            non_missing_values = np.asarray(flattened)
            # Create dataframe with feature value and parent index.
            df = DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   table_util.RecordBatchTake(
                       record_batch, pa.array(parent_indices.to_numpy())))
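The heart of the slicer is the inner join over parent indices; a compact sketch with two hypothetical slice features shows why only rows present in every per-feature frame survive and pick up a combined slice key:

import functools
import pandas as pd

# Hypothetical per-feature (value, parent index) frames after filtering.
df_country = pd.DataFrame({'country': ['US', 'CA'], 'parent_index': [0, 1]})
df_lang = pd.DataFrame({'lang': ['en', 'en'], 'parent_index': [0, 2]})
merged_df = functools.reduce(
    lambda base, update: pd.merge(base, update, how='inner', on='parent_index'),
    [df_country, df_lang])
# Only parent_index 0 appears in both frames, so only row 0 survives and would
# receive a combined slice key along the lines of 'country_US_lang_en'.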
Example #20
def _flatten_and_impute(
    examples: pa.RecordBatch, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow RecordBatch.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors.

  Args:
    examples: Arrow RecordBatch containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
    num_rows = examples.num_rows
    result = {}
    for column_name, feature_array in zip(examples.schema.names,
                                          examples.columns):
        feature_path = types.FeaturePath([column_name])
        imputation_fill_value = (CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
                                 if feature_path in categorical_features else
                                 sys.maxsize)
        if pa.types.is_null(feature_array.type):
            # If null array, impute all values.
            imputed_values_array = np.full(shape=num_rows,
                                           fill_value=imputation_fill_value)
            result[feature_path] = imputed_values_array
        else:
            # np.asarray on an Arrow array may return a read-only array. Create a
            # copy as we will be imputing the NaN values.
            flattened_array, non_missing_parent_indices = arrow_util.flatten_nested(
                feature_array, return_parent_indices=True)
            assert non_missing_parent_indices is not None
            non_missing_values = np.copy(np.asarray(flattened_array))
            is_categorical_feature = feature_path in categorical_features
            result_dtype = non_missing_values.dtype
            if non_missing_parent_indices.size < num_rows and is_categorical_feature:
                result_dtype = np.object
            flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
            num_values = np.asarray(
                array_util.ListLengthsFromListArray(feature_array))
            missing_parent_indices = np.where(num_values == 0)[0]
            if feature_path not in categorical_features:
                # Also impute any NaN values.
                nan_mask = np.isnan(non_missing_values)
                if not np.all(nan_mask):
                    imputation_fill_value = non_missing_values[~nan_mask].max(
                    ) * 10
                non_missing_values[nan_mask.nonzero()
                                   [0]] = imputation_fill_value
            flattened_array[non_missing_parent_indices] = non_missing_values
            if missing_parent_indices.any():
                flattened_array[missing_parent_indices] = imputation_fill_value
            result[feature_path] = flattened_array
    return result
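To follow the imputation path for a numeric column, here is the same sequence of steps on a hand-built toy column with rows [2.0], [], [NaN] (no Arrow objects involved):

import numpy as np

num_rows = 3
non_missing_values = np.array([2.0, np.nan])       # flattened values
non_missing_parent_indices = np.array([0, 2])      # rows they came from
num_values = np.array([1, 0, 1])                   # list length per row
missing_parent_indices = np.where(num_values == 0)[0]             # [1]
nan_mask = np.isnan(non_missing_values)
imputation_fill_value = non_missing_values[~nan_mask].max() * 10  # 20.0
non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
flattened_array = np.ndarray(shape=num_rows, dtype=non_missing_values.dtype)
flattened_array[non_missing_parent_indices] = non_missing_values
flattened_array[missing_parent_indices] = imputation_fill_value
# flattened_array == [2.0, 20.0, 20.0]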
Example #21
 def testFlattenNestedNonList(self):
   input_array = pa.array([1, 2])
   flattened, parent_indices = arrow_util.flatten_nested(
       input_array, return_parent_indices=True)
   self.assertTrue(flattened.equals(pa.array([1, 2])))
   np.testing.assert_array_equal(parent_indices, [0, 1])