def test_list_lengths(self):
    list_lengths = arrow_util.ListLengthsFromListArray(
        pa.array([], type=pa.list_(pa.int64())))
   self.assertTrue(list_lengths.equals(pa.array([], type=pa.int32())))
   list_lengths = arrow_util.ListLengthsFromListArray(
       pa.array([[1., 2.], [], [3.]]))
   self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
   list_lengths = arrow_util.ListLengthsFromListArray(
       pa.array([[1., 2.], None, [3.]]))
   self.assertTrue(list_lengths.equals(pa.array([2, 0, 1], type=pa.int32())))
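A minimal sketch, assuming a plain PyArrow ListArray: the lengths asserted above can also be recovered from the list offsets, since a null sublist's start and end offsets coincide and therefore contribute a length of 0. This is only an illustration, not TFDV's implementation of ListLengthsFromListArray.

import numpy as np
import pyarrow as pa

arr = pa.array([[1., 2.], None, [3.]])
# Offsets are [0, 2, 2, 3]; consecutive differences give per-row list lengths.
lengths = np.diff(np.asarray(arr.offsets))
print(lengths)  # [2 0 1], matching the assertion above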
    def update(self,
               feature_column,
               feature_type,
               num_values_quantiles_combiner,
               weight_column=None):
        """Update the partial common statistics using the input value."""
        # All the values in this column are null and we cannot deduce the type of
        # the feature. This is not an error, as this feature might have some values
        # in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_column.name, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_column:
            return

        if weight_column and (feature_column.data.num_chunks !=
                              weight_column.data.num_chunks):
            raise ValueError(
                'Expected the feature column {} and weight column {} to have the '
                'same number of chunks.'.format(feature_column.name,
                                                weight_column.name))

        weight_chunks = (
            weight_column.data.iterchunks() if weight_column else [])
        for feature_array, weight_array in six.moves.zip_longest(
                feature_column.data.iterchunks(), weight_chunks,
                fillvalue=None):
            num_values = arrow_util.ListLengthsFromListArray(
                feature_array).to_numpy()
            none_mask = arrow_util.GetArrayNullBitmapAsByteArray(
                feature_array).to_numpy().view(np.bool)

            num_values_not_none = num_values[~none_mask]
            self.num_non_missing += (
                len(feature_array) - feature_array.null_count)
            self.max_num_values = max(np.max(num_values_not_none),
                                      self.max_num_values)
            self.min_num_values = min(np.min(num_values_not_none),
                                      self.min_num_values)
            self.total_num_values += np.sum(num_values_not_none)
            self.num_values_summary = num_values_quantiles_combiner.add_input(
                self.num_values_summary, [num_values_not_none])

            if weight_array:
                weights = (arrow_util.FlattenListArray(
                    weight_array).to_numpy().astype(np.float32, copy=False))
                if weights.size != num_values.size:
                    raise ValueError('Weight feature must not be missing.')
                self.weighted_total_num_values += np.sum(num_values * weights)
                self.weighted_num_non_missing += np.sum(weights[~none_mask])
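A hedged sketch of the null-mask handling above, using only public PyArrow calls: GetArrayNullBitmapAsByteArray yields one byte per row (1 where the row is null), which the snippet reinterprets as a boolean mask; Array.is_null() produces an equivalent mask directly. Names and values here are illustrative only.

import numpy as np
import pyarrow as pa

feature_array = pa.array([[1., 2.], None, [3.]])
# Boolean mask of null rows, analogous to the byte-bitmap-as-bool view above.
none_mask = feature_array.is_null().to_numpy(zero_copy_only=False)
num_values = np.diff(np.asarray(feature_array.offsets))
print(none_mask)               # [False  True False]
print(num_values[~none_mask])  # lengths of the non-null rows: [2 1]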
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

    Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
    for categorical features and 10*max(feature_values) for numeric features.
    We impute missing values with an extreme value that is far from observed
    values so it does not incorrectly impact KNN results. 10*max(feature_values)
    is used instead of sys.max_float because max_float is large enough to cause
    unexpected float arithmetic errors.

    Args:
      examples_table: Arrow table containing a batch of examples where all
        features are univalent.
      categorical_features: Set of categorical feature names.

    Returns:
      A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
      value is a 1D numpy array corresponding to the feature values.
    """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # The Arrow-to-numpy conversion returns a read-only array. Create a copy
        # as we will be imputing the NaN values.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        is_categorical_feature = feature_path in categorical_features
        result_dtype = non_missing_values.dtype
        if non_missing_parent_indices.size < num_rows and is_categorical_feature:
            result_dtype = np.object
        flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]
        if feature_path in categorical_features:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            imputation_fill_value = sys.maxsize
            if not np.all(nan_mask):
                imputation_fill_value = (
                    non_missing_values[~nan_mask].max() * 10)
            non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
        flattened_array[non_missing_parent_indices] = non_missing_values
        if missing_parent_indices.any():
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
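A minimal sketch of the numeric imputation rule implemented above, not the TFDV API itself: NaN and missing entries are replaced with 10 * max(observed values), an extreme but finite sentinel chosen so KNN distances stay well behaved; sys.maxsize is only the fallback when every value is NaN.

import sys
import numpy as np

non_missing_values = np.array([0.5, np.nan, 3.0])
nan_mask = np.isnan(non_missing_values)
imputation_fill_value = sys.maxsize
if not np.all(nan_mask):
    imputation_fill_value = non_missing_values[~nan_mask].max() * 10
non_missing_values[nan_mask] = imputation_fill_value
print(non_missing_values)  # [ 0.5 30.   3. ]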
    def update(self,
               feature_path: types.FeaturePath,
               feature_array: pa.Array,
               feature_type: types.FeatureNameStatisticsType,
               num_values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial common statistics using the input value."""
        # All the values in this column are null and we cannot deduce the type of
        # the feature. This is not an error, as this feature might have some values
        # in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_path, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        none_mask = arrow_util.primitive_array_to_numpy(
            arrow_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
                np.bool)

        self.num_non_missing += len(feature_array) - feature_array.null_count
        num_values_not_none = num_values[~none_mask]
        # We do this check to avoid failing in np.min/max with empty array.
        if num_values_not_none.size == 0:
            return
        # Use np.maximum.reduce(num_values_not_none, initial=self.max_num_values)
        # once we upgrade to numpy 1.16
        self.max_num_values = max(np.max(num_values_not_none),
                                  self.max_num_values)
        self.min_num_values = min(np.min(num_values_not_none),
                                  self.min_num_values)
        self.total_num_values += np.sum(num_values_not_none)
        self.num_values_summary = num_values_quantiles_combiner.add_input(
            self.num_values_summary, [num_values_not_none])

        if weights is not None:
            if weights.size != num_values.size:
                raise ValueError('Weight feature must not be missing.')
            self.weighted_total_num_values += np.sum(num_values * weights)
            self.weighted_num_non_missing += np.sum(weights[~none_mask])
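A hedged worked example of the weighted statistics above: each example's weight multiplies its value-list length for weighted_total_num_values, and the weights of non-missing examples are summed for weighted_num_non_missing. The arrays below are illustrative only.

import numpy as np

num_values = np.array([2, 0, 1])             # per-example value counts
none_mask = np.array([False, True, False])   # the second example is missing
weights = np.array([0.5, 1.0, 2.0], dtype=np.float32)

weighted_total_num_values = np.sum(num_values * weights)  # 0.5*2 + 1.0*0 + 2.0*1 = 3.0
weighted_num_non_missing = np.sum(weights[~none_mask])    # 0.5 + 2.0 = 2.5
print(weighted_total_num_values, weighted_num_non_missing)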
    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                arrow_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
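A hedged sketch of the univalence filter above, reproduced with plain PyArrow and NumPy: rows whose value list has exactly one element are kept, while multi-valued and missing rows are dropped. GetFlattenedArrayParentIndices is emulated here with np.repeat; this is an illustration, not the library's implementation.

import numpy as np
import pyarrow as pa

feat_arr = pa.array([[1.0], [2.0, 3.0], None, [4.0]])
value_lengths = np.diff(np.asarray(feat_arr.offsets))          # [1 2 0 1]
univalent_parent_indices = (value_lengths == 1).nonzero()[0]   # rows 0 and 3
non_missing_values = np.asarray(feat_arr.flatten())            # [1. 2. 3. 4.]
value_parent_indices = np.repeat(np.arange(len(feat_arr)), value_lengths)
keep = np.isin(value_parent_indices, univalent_parent_indices)
print(non_missing_values[keep], value_parent_indices[keep])    # [1. 4.] [0 3]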
Example #6
    def add_input(
        self, accumulator: Dict[types.FeaturePath, _PartialSparseFeatureStats],
        input_table: pa.Table
    ) -> Dict[types.FeaturePath, _PartialSparseFeatureStats]:
        """Returns result of folding a batch of inputs into the current accumulator.

        Args:
          accumulator: The current accumulator.
          input_table: An Arrow Table whose columns are features and rows are
            examples.

        Returns:
          The accumulator after updating the statistics for the batch of inputs.
        """
        component_feature_value_list_lengths = dict()
        component_feature_num_missing = dict()
        batch_example_count = input_table.num_rows
        # Do a single pass through the input table to determine the value list
        # lengths and number missing for every feature that is an index or value
        # feature in any sparse feature in the schema.
        for feature_path, leaf_array, _ in arrow_util.enumerate_arrays(
                input_table, weight_column=None, enumerate_leaves_only=True):
            if (feature_path in self._all_index_feature_paths
                    or feature_path in self._all_value_feature_paths):
                # If the column is a NullArray, skip it when populating the
                # component_feature_ dicts. Features that are missing from those dicts
                # are treated as entirely missing for the batch.
                if not pa.types.is_null(leaf_array.type):
                    component_feature_value_list_lengths[
                        feature_path] = arrow_util.primitive_array_to_numpy(
                            arrow_util.ListLengthsFromListArray(leaf_array))
                    component_feature_num_missing[
                        feature_path] = leaf_array.null_count

        # Now create a partial sparse feature stats object for each sparse feature
        # using the value list lengths and numbers missing information collected
        # above.
        for feature_path in self._sparse_feature_component_paths:
            value_feature_path = self._sparse_feature_component_paths[
                feature_path].value_feature
            index_feature_paths = self._sparse_feature_component_paths[
                feature_path].index_features
            missing_value_count = component_feature_num_missing.get(
                value_feature_path)
            # If this batch does not have the value feature at all,
            # missing_value_count is the number of examples in the batch.
            # Also populate the value list lengths for the value feature with all 0s
            # since a missing feature is considered to have a value list length of 0.
            if missing_value_count is None:
                missing_value_count = batch_example_count
                component_feature_value_list_lengths[
                    value_feature_path] = np.full(batch_example_count, 0)
            missing_index_counts = collections.Counter()
            min_length_diff = dict()
            max_length_diff = dict()
            for index_feature_path in index_feature_paths:
                missing_index_count = component_feature_num_missing.get(
                    index_feature_path)
                # If this batch does not have this index feature at all,
                # missing_index_count for that index feature is the number of
                # examples in the batch.
                # Also populate the value list lengths for the index feature with all 0s
                # since a missing feature is considered to have a value list length of
                # 0.
                if missing_index_count is None:
                    missing_index_counts[
                        index_feature_path] = batch_example_count
                    component_feature_value_list_lengths[
                        index_feature_path] = np.full(batch_example_count, 0)
                else:
                    missing_index_counts[
                        index_feature_path] = missing_index_count
                length_differences = np.subtract(
                    component_feature_value_list_lengths[index_feature_path],
                    component_feature_value_list_lengths[value_feature_path])
                min_length_diff[index_feature_path] = np.min(
                    length_differences)
                max_length_diff[index_feature_path] = np.max(
                    length_differences)

            stats_for_feature = _PartialSparseFeatureStats(
                missing_value_count, missing_index_counts, min_length_diff,
                max_length_diff)
            existing_stats_for_feature = accumulator.get(feature_path)
            if existing_stats_for_feature is None:
                accumulator[feature_path] = stats_for_feature
            else:
                accumulator[feature_path] += stats_for_feature
        return accumulator
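A hedged toy example of the length-difference bookkeeping above: for each index feature, the per-example difference between its value-list length and the value feature's value-list length is reduced to a min and a max, which later feed the sparse feature stats. The arrays are illustrative only.

import numpy as np

value_feature_lengths = np.array([2, 0, 3])   # value feature, one entry per example
index_feature_lengths = np.array([2, 1, 3])   # one index feature, per example
length_differences = np.subtract(index_feature_lengths, value_feature_lengths)
print(length_differences.min(), length_differences.max())  # 0 1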