Example #1
  def test_list_lengths(self, list_type_factory):
   list_lengths = array_util.ListLengthsFromListArray(
       pa.array([], type=list_type_factory(pa.int64())))
   self.assertTrue(list_lengths.equals(pa.array([], type=pa.int64())))
   list_lengths = array_util.ListLengthsFromListArray(
       pa.array([[1., 2.], [], [3.]], type=list_type_factory(pa.float32())))
   self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
   list_lengths = array_util.ListLengthsFromListArray(
       pa.array([[1., 2.], None, [3.]], type=list_type_factory(pa.float64())))
   self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int64())))
Example #2
  def test_list_lengths(self):
     list_lengths = array_util.ListLengthsFromListArray(
         pa.array([], type=pa.list_(pa.int64())))
     self.assertTrue(list_lengths.equals(pa.array([], type=pa.int32())))
     list_lengths = array_util.ListLengthsFromListArray(
         pa.array([[1., 2.], [], [3.]]))
     self.assertTrue(
         list_lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
     list_lengths = array_util.ListLengthsFromListArray(
         pa.array([[1., 2.], None, [3.]]))
     self.assertTrue(
         list_lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
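The two tests above appear to target different builds of array_util: the older one expects int32 lengths, the newer one int64. Their shared semantics, that a null list counts as length 0, can be reproduced with stock pyarrow. A minimal sketch, assuming a pyarrow recent enough to ship pyarrow.compute.list_value_length:

import pyarrow as pa
import pyarrow.compute as pc

# pc.list_value_length yields a null length for a null list; filling with
# 0 matches the [2, 0, 1] expectation asserted in both tests.
arr = pa.array([[1., 2.], None, [3.]], type=pa.list_(pa.float64()))
lengths = pc.fill_null(pc.list_value_length(arr), 0)
print(lengths.to_pylist())  # [2, 0, 1]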
Example #3
def get_broadcastable_column(input_table: pa.Table,
                             column_name: Text) -> pa.Array:
    """Gets a column from the input table, validating that it can be broadcast.

  Args:
    input_table: Input table.
    column_name: Name of the column to be retrieved and validated.
      This column must refer to a ListArray in which each list has length 1.

  Returns:
    An arrow array containing a flattened view of the broadcast column.

  Raises:
    ValueError: If the broadcast feature is not present in the input table or is
        not a valid column. A valid column must have exactly one value per
        example and be of a numeric type.
  """
    try:
        column = input_table.column(column_name).data.chunk(0)
    except KeyError:
        raise ValueError(
            'Column "{}" not present in the input table.'.format(column_name))

    # Before flattening, check that there is a single value for each example.
    column_lengths = array_util.ListLengthsFromListArray(column).to_numpy()
    if not np.all(column_lengths == 1):
        raise ValueError(
            'Column "{}" must have exactly one value in each example.'.format(
                column_name))
    return column.flatten()
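A hypothetical call against the function above. Note the snippet accesses columns through the pre-0.15 pyarrow Column API (.data.chunk(0)), so this only runs under that legacy version; the column name 'w' is made up for illustration.

import pyarrow as pa

# Hypothetical input: exactly one value per example, so validation passes
# and the flattened view comes back (assumes pyarrow < 0.15, matching the
# .data.chunk(0) access in the function itself).
table = pa.Table.from_arrays(
    [pa.array([[1.0], [2.5], [0.5]], type=pa.list_(pa.float64()))],
    names=['w'])
print(get_broadcastable_column(table, 'w'))  # [1.0, 2.5, 0.5]

# A multivalent column fails the per-example length check:
# get_broadcastable_column(
#     pa.Table.from_arrays([pa.array([[1.0, 2.0], [3.0]])], names=['w']),
#     'w')  # raises ValueError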
Example #4
  def list_lengths(self, path: types.FeaturePath) -> np.ndarray:
    """Returns a numpy array containing the length of each feature list.

    If the requested path is not present in the table wrapped by the InputBatch,
    the returned array will consist of zeros, and be of length equal to the
    number of rows in the table.

    Args:
      path: The path for which to return list lengths.

    Returns:
      An ndarray containing the lengths of each nested list. The returned
      ndarray will be of shape (N,) where N is the number of rows in the
      referenced array (or in the table, if the path cannot be found).

    Raises:
      ValueError: When the referenced array is neither a ListArray nor null.
    """
    key = ('list_lengths({})', path)
    if key in self._cache:
      return self._cache[key]
    try:
      array, _ = arrow_util.get_array(
          self._table, path, broadcast_column_name=None)
      if pa.types.is_null(array.type):
        lengths = np.full(self._table.num_rows, 0)
      elif not pa.types.is_list(array.type):
        raise ValueError('Can only compute list lengths on list arrays, found '
                         '{}'.format(array.type))
      else:
        lengths = np.asarray(array_util.ListLengthsFromListArray(array))
    except KeyError:
      lengths = np.full(self._table.num_rows, 0)
    self._cache[key] = lengths
    return lengths
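The three branches above (null column gets zeros, non-list raises, list gets its lengths) can be sketched without the InputBatch wrapper or its cache. A standalone sketch on stock pyarrow; the helper name sketch_list_lengths is made up here:

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

def sketch_list_lengths(array: pa.Array, num_rows: int) -> np.ndarray:
  """Standalone sketch of the branching above (no caching or path lookup)."""
  if pa.types.is_null(array.type):
    return np.full(num_rows, 0)  # all-null column: one zero per row
  if not pa.types.is_list(array.type):
    raise ValueError('Can only compute list lengths on list arrays, found '
                     '{}'.format(array.type))
  return np.asarray(pc.fill_null(pc.list_value_length(array), 0))

print(sketch_list_lengths(pa.array([[1], None, [2, 3]]), 3))  # [1 0 2]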
Example #5
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # to_pandas returns a readonly array. Create a copy as we will be imputing
        # the NaN values.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            array_util.GetFlattenedArrayParentIndices(feature_array))
        is_categorical_feature = feature_path in categorical_features
        result_dtype = non_missing_values.dtype
        if non_missing_parent_indices.size < num_rows and is_categorical_feature:
            result_dtype = object
        flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
        num_values = arrow_util.primitive_array_to_numpy(
            array_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]
        if feature_path in categorical_features:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            imputation_fill_value = sys.maxsize
            if not np.all(nan_mask):
                imputation_fill_value = (
                    non_missing_values[~nan_mask].max() * 10)
            non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
        flattened_array[non_missing_parent_indices] = non_missing_values
        if missing_parent_indices.any():
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
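A toy run of the numeric fill-value rule described in the docstring: 10 * max(observed) when any real value exists, falling back to sys.maxsize only for an all-NaN column.

import sys

import numpy as np

values = np.array([0.5, np.nan, 3.0])
nan_mask = np.isnan(values)
fill = sys.maxsize if np.all(nan_mask) else values[~nan_mask].max() * 10
values[nan_mask] = fill
print(values)  # [ 0.5 30.   3. ]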
Example #6
    def update(self,
               feature_path: types.FeaturePath,
               feature_array: pa.Array,
               feature_type: types.FeatureNameStatisticsType,
               num_values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial common statistics using the input value."""
        # All the values in this column are null and we cannot deduce the type of
        # the feature. This is not an error as this feature might have some values
        # in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_path, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        num_values = arrow_util.primitive_array_to_numpy(
            array_util.ListLengthsFromListArray(feature_array))
        none_mask = arrow_util.primitive_array_to_numpy(
            array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
                np.bool_)

        self.num_non_missing += len(feature_array) - feature_array.null_count
        num_values_not_none = num_values[~none_mask]
        # We do this check to avoid failing in np.min/max with empty array.
        if num_values_not_none.size == 0:
            return
        # Use np.maximum.reduce(num_values_not_none, initial=self.max_num_values)
        # once we upgrade to numpy 1.16
        self.max_num_values = max(np.max(num_values_not_none),
                                  self.max_num_values)
        self.min_num_values = min(np.min(num_values_not_none),
                                  self.min_num_values)
        self.total_num_values += np.sum(num_values_not_none)
        self.num_values_summary = num_values_quantiles_combiner.add_input(
            self.num_values_summary, [num_values_not_none])

        if weights is not None:
            if weights.size != num_values.size:
                raise ValueError('Weight feature must not be missing.')
            self.weighted_total_num_values += np.sum(num_values * weights)
            self.weighted_num_non_missing += np.sum(weights[~none_mask])
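A toy run of the masking above, rebuilt on stock pyarrow (recent enough for pyarrow.compute) as a stand-in for array_util: null examples drop out of the valency stats via the null bitmap rather than by value.

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([[1, 2], None, [3]])
num_values = np.asarray(pc.fill_null(pc.list_value_length(arr), 0))
none_mask = np.asarray(arr.is_null())
print(num_values[~none_mask])     # [2 1], the per-example valencies
print(len(arr) - arr.null_count)  # 2 non-missing examples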
Example #7
  def update(self,
             feature_path: types.FeaturePath,
             feature_array: pa.Array,
             feature_type: types.FeatureNameStatisticsType,
             make_quantiles_sketch_fn: Callable[[], sketches.QuantilesSketch],
             weights: Optional[np.ndarray] = None) -> None:
    """Update the partial common statistics using the input value."""
    if self.type is None:
      self.type = feature_type  # pytype: disable=annotation-type-mismatch
    elif feature_type is not None and self.type != feature_type:
      raise TypeError('Cannot determine the type of feature %s. '
                      'Found values of types %s and %s.' %
                      (feature_path, self.type, feature_type))

    nest_level = arrow_util.get_nest_level(feature_array.type)
    if self.presence_and_valency_stats is None:
      self.presence_and_valency_stats = [
          _PresenceAndValencyStats(make_quantiles_sketch_fn)
          for _ in range(nest_level)
      ]
    elif nest_level != len(self.presence_and_valency_stats):
      raise ValueError('Inconsistent nestedness in feature {}: {} vs {}'.format(
          feature_path, nest_level, len(self.presence_and_valency_stats)))

    # And there's nothing we can collect in this case.
    if not feature_array:
      return

    level = 0
    while arrow_util.is_list_like(feature_array.type):
      presence_mask = ~np.asarray(
          array_util.GetArrayNullBitmapAsByteArray(feature_array)).view(np.bool_)
      num_values = np.asarray(
          array_util.ListLengthsFromListArray(feature_array))
      num_values_not_none = num_values[presence_mask]
      self.presence_and_valency_stats[level].update(feature_array,
                                                    presence_mask, num_values,
                                                    num_values_not_none,
                                                    weights)
      flattened = feature_array.flatten()
      if weights is not None:
        parent_indices = array_util.GetFlattenedArrayParentIndices(
            feature_array).to_numpy()
        weights = weights[parent_indices]
      feature_array = flattened
      level += 1
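A toy illustration of the weight propagation at the end of the loop: flattening one nesting level repeats each example's weight once per child element via the flattened values' parent indices. pyarrow.compute.list_parent_indices stands in for array_util.GetFlattenedArrayParentIndices, assuming a pyarrow that provides it.

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([[[1], [2, 3]], [[4]]])  # two examples, one extra nest level
weights = np.array([0.1, 0.9])
parent_indices = np.asarray(pc.list_parent_indices(arr))
print(parent_indices)           # [0 0 1]
print(weights[parent_indices])  # [0.1 0.1 0.9]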
Example #8
    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                array_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                array_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
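A toy run of the univalence filter above, with pyarrow.compute standing in for array_util: only examples whose value list has exactly one element survive into the per-feature DataFrame.

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

feat_arr = pa.array([[1.0], [2.0, 3.0], None, [4.0]])
value_lengths = np.asarray(pc.fill_null(pc.list_value_length(feat_arr), 0))
univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
print(sorted(univalent_parent_indices))  # [0, 3]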
Example #9
def get_weight_feature(input_record_batch: pa.RecordBatch,
                       weight_column: Text) -> np.ndarray:
    """Gets the weight column from the input record batch.

  Args:
    input_record_batch: Input record batch.
    weight_column: Name of the column containing the weight.

  Returns:
    A numpy array containing the weights of the examples in the input
    record_batch.

  Raises:
    ValueError: If the weight feature is not present in the input record_batch
    or is not a valid weight feature (must be of numeric type and have a
    single value for each example).
  """
    weights_field_index = input_record_batch.schema.get_field_index(
        weight_column)
    if weights_field_index < 0:
        raise ValueError('Weight column "{}" not present in the input '
                         'record batch.'.format(weight_column))
    weights = input_record_batch.column(weights_field_index)

    if pa.types.is_null(weights.type):
        raise ValueError(
            'Weight column "{}" cannot be null.'.format(weight_column))
    # Before flattening, check that there is a single value for each example.
    weight_lengths = array_util.ListLengthsFromListArray(weights).to_numpy()
    if not np.all(weight_lengths == 1):
        raise ValueError(
            'Weight column "{}" must have exactly one value in each example.'.
            format(weight_column))
    flat_weights = weights.flatten()
    # Before converting to numpy view, check the type (cannot convert string and
    # binary arrays to numpy view).
    flat_weights_type = flat_weights.type
    if (not pa.types.is_floating(flat_weights_type)
            and not pa.types.is_integer(flat_weights_type)):
        raise ValueError(
            'Weight column "{}" must be of numeric type. Found {}.'.format(
                weight_column, flat_weights_type))
    return np.asarray(flat_weights)
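An example call against the function above, assuming its module imports (np, pa, and tfx_bsl's array_util) are in scope; the column name 'w' is made up.

import pyarrow as pa

batch = pa.RecordBatch.from_arrays([pa.array([[0.5], [1.0], [2.0]])], ['w'])
print(get_weight_feature(batch, 'w'))  # [0.5 1.  2. ]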
Example #10
    def _get_univalent_values_with_parent_indices(
            self,
            examples: pa.RecordBatch) -> Dict[types.FeatureName, DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_name, feat_arr in zip(examples.schema.names,
                                          examples.columns):
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feat_arr.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (None,
                                statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            value_lengths = np.asarray(
                array_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            flattened, value_parent_indices = arrow_util.flatten_nested(
                feat_arr, True)
            non_missing_values = np.asarray(flattened)
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
Example #11
def _get_weight_feature(input_table: pa.Table,
                        weight_feature: Text) -> np.ndarray:
    """Gets the weight column from the input table.

  Args:
    input_table: Input table.
    weight_feature: Name of the weight feature.

  Returns:
    A numpy array containing the weights of the examples in the input table.

  Raises:
    ValueError: If the weight feature is not present in the input table or is
        not a valid weight feature (must be of numeric type and have a
        single value for each example).
  """
    try:
        weights = input_table.column(weight_feature).data.chunk(0)
    except KeyError:
        raise ValueError('Weight feature "{}" not present in the input '
                         'table.'.format(weight_feature))

    # Before flattening, check that there is a single value for each example.
    weight_lengths = array_util.ListLengthsFromListArray(weights).to_numpy()
    if not np.all(weight_lengths == 1):
        raise ValueError(
            'Weight feature "{}" must have exactly one value in each example.'.
            format(weight_feature))
    weights = weights.flatten()
    # Before converting to numpy view, check the type (cannot convert string and
    # binary arrays to numpy view).
    weights_type = weights.type
    if pa.types.is_string(weights_type) or pa.types.is_binary(weights_type):
        raise ValueError(
            'Weight feature "{}" must be of numeric type. Found {}.'.format(
                weight_feature, weights_type))
    return weights.to_numpy()
Example #12
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
        input_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
        """Returns result of folding a batch of inputs into the current accumulator.

    Args:
      accumulator: The current accumulator.
      input_table: An Arrow Table whose columns are features and rows are
        examples.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
        feature_value_list_lengths = dict()
        feature_is_missing = dict()
        batch_example_count = input_table.num_rows
        # Do a single pass through the input table to determine the value list
        # lengths and whether the feature is missing for every feature
        # that is an index or value feature in any sparse feature in the schema.
        for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
                input_table, weight_column=None, enumerate_leaves_only=True):
            if (feature_path in self._all_index_feature_paths
                    or feature_path in self._all_value_feature_paths):
                if pa.types.is_null(leaf_array.type):
                    # If the column is a NullArray, it is missing from the entire batch
                    # (missing features have value list lengths of 0).
                    feature_value_list_lengths[feature_path] = np.full(
                        batch_example_count, 0)
                    feature_is_missing[feature_path] = np.full(
                        batch_example_count, True)
                else:
                    feature_value_list_lengths[feature_path] = np.asarray(
                        array_util.ListLengthsFromListArray(leaf_array))
                    feature_is_missing[feature_path] = np.asarray(
                        array_util.GetArrayNullBitmapAsByteArray(leaf_array))

        # Now create a partial sparse feature stats object for each sparse feature
        # using the value list lengths and feature missing information collected
        # above.
        for feature_path in self._sparse_feature_component_paths:
            value_feature_path = self._sparse_feature_component_paths[
                feature_path].value_feature
            index_feature_paths = self._sparse_feature_component_paths[
                feature_path].index_features

            # Create a filter identifying examples in which the entire sparse feature
            # is missing since those examples should not be included in counting
            # missing counts or length differences.
            component_features_missing = np.array([
                feature_is_missing.get(path, np.full(batch_example_count,
                                                     True)) for path in
                itertools.chain([value_feature_path], index_feature_paths)
            ])
            entire_sparse_feature_missing = np.all(component_features_missing,
                                                   axis=0)
            num_examples_missing_sparse_feature = np.sum(
                entire_sparse_feature_missing)

            # If all examples in the batch are missing the sparse feature, do not
            # update the accumulator with the partial stats for that sparse feature.
            if num_examples_missing_sparse_feature == batch_example_count:
                continue

            is_missing_value_feature = feature_is_missing.get(
                value_feature_path)
            # If this batch does not have the value feature at all,
            # missing_value_count is the number of examples in the batch.
            # Also populate the value list lengths for the value feature with all 0s
            # since a missing feature is considered to have a value list length of 0.
            if is_missing_value_feature is None:
                missing_value_count = batch_example_count
                feature_value_list_lengths[value_feature_path] = np.full(
                    batch_example_count, 0)
            else:
                missing_value_count = np.sum(is_missing_value_feature)
            # Do not include examples that are entirely missing the sparse feature in
            # the missing value count.
            missing_value_count -= num_examples_missing_sparse_feature

            missing_index_counts = collections.Counter()
            min_length_diff = dict()
            max_length_diff = dict()
            for index_feature_path in index_feature_paths:
                is_missing_index_feature = feature_is_missing.get(
                    index_feature_path)
                if is_missing_index_feature is None:
                    # If this batch does not have this index feature at all,
                    # missing_index_count for that index feature is the number of
                    # examples in the batch.
                    missing_index_count = batch_example_count
                    # Populate the value list lengths for the index feature with all 0s
                    # since a missing feature is considered to have a value list length of
                    # 0.
                    feature_value_list_lengths[index_feature_path] = np.full(
                        batch_example_count, 0)
                else:
                    missing_index_count = np.sum(is_missing_index_feature)
                # Do not include examples that are entirely missing the sparse feature
                # in the missing value count.
                missing_index_counts[index_feature_path] = (
                    missing_index_count - num_examples_missing_sparse_feature)

                length_differences = np.subtract(
                    feature_value_list_lengths[index_feature_path],
                    feature_value_list_lengths[value_feature_path])

                # Do not include examples that are entirely missing the sparse feature
                # in determining the min and max length differences.
                filtered_length_differences = length_differences[
                    ~entire_sparse_feature_missing]
                # This generator should not get to this point if the current sparse
                # feature is missing from all examples in the batch (which would cause
                # filtered_length_differences to be empty).
                assert filtered_length_differences.size != 0
                min_length_diff[index_feature_path] = np.min(
                    filtered_length_differences)
                max_length_diff[index_feature_path] = np.max(
                    filtered_length_differences)

            stats_for_feature = _PartialSparseFeatureStats(
                missing_value_count, missing_index_counts, min_length_diff,
                max_length_diff)
            existing_stats_for_feature = accumulator.get(feature_path)
            if existing_stats_for_feature is None:
                accumulator[feature_path] = stats_for_feature
            else:
                accumulator[feature_path] += stats_for_feature
        return accumulator
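A toy illustration of the length-difference bookkeeping near the end of add_input: index versus value list lengths, restricted to examples where the sparse feature is present at all.

import numpy as np

index_lengths = np.array([2, 0, 3])
value_lengths = np.array([2, 0, 1])
entire_sparse_feature_missing = np.array([False, True, False])
diffs = (index_lengths - value_lengths)[~entire_sparse_feature_missing]
print(diffs.min(), diffs.max())  # 0 2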
Example #13
def _flatten_and_impute(
    examples: pa.RecordBatch, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow RecordBatch.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.max_float because max_float is large enough to cause
  unexpected float arithmetic errors.

  Args:
    examples: Arrow RecordBatch containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
    num_rows = examples.num_rows
    result = {}
    for column_name, feature_array in zip(examples.schema.names,
                                          examples.columns):
        feature_path = types.FeaturePath([column_name])
        imputation_fill_value = (CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
                                 if feature_path in categorical_features else
                                 sys.maxsize)
        if pa.types.is_null(feature_array.type):
            # If null array, impute all values.
            imputed_values_array = np.full(shape=num_rows,
                                           fill_value=imputation_fill_value)
            result[feature_path] = imputed_values_array
        else:
            # to_pandas returns a readonly array. Create a copy as we will be imputing
            # the NaN values.
            flattened_array, non_missing_parent_indices = arrow_util.flatten_nested(
                feature_array, return_parent_indices=True)
            assert non_missing_parent_indices is not None
            non_missing_values = np.copy(np.asarray(flattened_array))
            is_categorical_feature = feature_path in categorical_features
            result_dtype = non_missing_values.dtype
            if non_missing_parent_indices.size < num_rows and is_categorical_feature:
                result_dtype = object
            flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
            num_values = np.asarray(
                array_util.ListLengthsFromListArray(feature_array))
            missing_parent_indices = np.where(num_values == 0)[0]
            if feature_path not in categorical_features:
                # Also impute any NaN values.
                nan_mask = np.isnan(non_missing_values)
                if not np.all(nan_mask):
                    imputation_fill_value = (
                        non_missing_values[~nan_mask].max() * 10)
                non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
            flattened_array[non_missing_parent_indices] = non_missing_values
            if missing_parent_indices.any():
                flattened_array[missing_parent_indices] = imputation_fill_value
            result[feature_path] = flattened_array
    return result
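A standalone sketch of the scatter-and-fill pattern above, rebuilt on stock pyarrow so it runs without tfx_bsl: place the present values at their parent rows, then fill the rows whose list was empty or null with the imputation value.

import numpy as np
import pyarrow as pa
import pyarrow.compute as pc

feature_array = pa.array([[1.0], None, [3.0]])
num_rows = len(feature_array)
non_missing = np.asarray(feature_array.flatten())
parents = np.asarray(pc.list_parent_indices(feature_array))
fill = non_missing.max() * 10  # the numeric rule from the docstring
out = np.full(num_rows, fill)
out[parents] = non_missing
print(out)  # [ 1. 30.  3.]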