    def test_get_flattened_array_parent_indices(self):
        indices = arrow_util.GetFlattenedArrayParentIndices(
            pa.array([], type=pa.list_(pa.int32())))
        self.assertTrue(indices.equals(pa.array([], type=pa.int32())))

        indices = arrow_util.GetFlattenedArrayParentIndices(
            pa.array([[1.], [2.], [], [3.]]))
        self.assertTrue(indices.equals(pa.array([0, 1, 3], type=pa.int32())))
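# For reference, the parent-index computation exercised above can be reproduced
# from a ListArray's offsets with numpy. This is a hedged sketch (it assumes a
# pyarrow version that exposes ListArray.offsets and lists without nulls), not
# the arrow_util implementation itself.
import numpy as np
import pyarrow as pa


def parent_indices_sketch(list_array):
    """Maps each flattened value to the index of the list it came from."""
    # Offsets delimit the sub-lists; consecutive differences are list lengths.
    offsets = np.asarray(list_array.offsets)
    lengths = offsets[1:] - offsets[:-1]
    return np.repeat(np.arange(len(list_array)), lengths)


print(parent_indices_sketch(pa.array([[1.], [2.], [], [3.]])))  # [0 1 3]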
Example #2
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any], Union[int, Tuple[
        int, Union[int, float]]]]]:
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table

    for feature_path, feature_array, weights in arrow_util.enumerate_arrays(
            table, weight_column=weight_feature, enumerate_leaves_only=True):
        feature_array_type = feature_array.type
        if (feature_path in categorical_features
                or stats_util.get_feature_type_from_arrow_type(
                    feature_path, feature_array_type)
                == statistics_pb2.FeatureNameStatistics.STRING):
            flattened_values = feature_array.flatten()
            if weights is not None and flattened_values:
                # Slow path: weighted uniques.
                flattened_values_np = arrow_util.primitive_array_to_numpy(
                    flattened_values)
                parent_indices = (arrow_util.primitive_array_to_numpy(
                    arrow_util.GetFlattenedArrayParentIndices(feature_array)))
                weights_ndarray = weights[parent_indices]
                for value, count, weight in _weighted_unique(
                        flattened_values_np, weights_ndarray):
                    yield (slice_key, feature_path.steps(), value), (count,
                                                                     weight)
            else:
                value_counts = arrow_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    yield ((slice_key, feature_path.steps(), value), count)
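# The _weighted_unique helper used above is not included in this excerpt. The
# following is a minimal numpy sketch of the same idea (illustrative only, not
# the TFDV implementation): for each distinct value, yield its occurrence count
# and the total weight attached to it. Assumes non-empty inputs.
import numpy as np


def _weighted_unique_sketch(values, weights):
    """Yields (value, count, total_weight) per distinct value."""
    order = np.argsort(values, kind='stable')
    sorted_values, sorted_weights = values[order], weights[order]
    # True wherever a run of equal values starts.
    starts_mask = np.concatenate(
        [[True], sorted_values[1:] != sorted_values[:-1]])
    starts = np.nonzero(starts_mask)[0]
    counts = np.diff(np.append(starts, len(sorted_values)))
    weight_sums = np.add.reduceat(sorted_weights, starts)
    return zip(sorted_values[starts], counts, weight_sums)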
    def add_input(self, accumulator, input_table):

        weight_ndarrays = []
        if self._weight_feature is not None:
            for a in input_table.column(
                    self._weight_feature).data.iterchunks():
                weight_array = arrow_util.FlattenListArray(a)
                if len(weight_array) != len(a):
                    raise ValueError(
                        'If weight is specified, then each example must have a weight '
                        'feature of length 1.')
                # to_numpy() can only be called against a non-empty arrow array.
                if weight_array:
                    weight_ndarrays.append(weight_array.to_numpy())
                else:
                    weight_ndarrays.append(
                        np.array([], dtype=weight_array.to_pandas_dtype()))

        for column in input_table.columns:
            feature_name = column.name
            if feature_name == self._weight_feature:
                continue
            unweighted_counts = collections.Counter()
            weighted_counts = _WeightedCounter()
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, column.type)
            if not (feature_name in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue

            for feature_array, weight_ndarray in six.moves.zip_longest(
                    column.data.iterchunks(), weight_ndarrays, fillvalue=None):
                flattened_values_array = arrow_util.FlattenListArray(
                    feature_array)
                # to_numpy() cannot be called if the array is empty.
                if not flattened_values_array:
                    continue
                if feature_type == statistics_pb2.FeatureNameStatistics.STRING:
                    values_ndarray = flattened_values_array.to_pandas()
                else:
                    values_ndarray = flattened_values_array.to_numpy()
                value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                    feature_array).to_numpy()
                unweighted_counts.update(values_ndarray)
                if weight_ndarray is not None:
                    weight_per_value = weight_ndarray[value_parent_indices]
                    weighted_counts.weighted_update(values_ndarray,
                                                    weight_per_value)

            if feature_name not in accumulator:
                accumulator[feature_name] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_name].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_name].weighted_counts.update(
                    weighted_counts)
        return accumulator
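# _WeightedCounter is not defined in this excerpt. A minimal dict-based sketch
# of the interface used above (weighted_update / update) could look like this
# (hypothetical, for illustration only):
class _WeightedCounterSketch(dict):
    """Accumulates a total weight per distinct value."""

    def weighted_update(self, values, weights):
        # Add each value's weight to its running total.
        for value, weight in zip(values, weights):
            self[value] = self.get(value, 0.0) + float(weight)

    def update(self, other):
        # Merge another counter by summing the per-value weights.
        for value, weight in other.items():
            self[value] = self.get(value, 0.0) + weight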
def _flatten_and_impute(
    examples_table: pa.Table, categorical_features: Set[types.FeaturePath]
) -> Dict[types.FeaturePath, np.ndarray]:
    """Flattens and imputes the values in the input Arrow table.

  Replaces missing values with CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
  for categorical features and 10*max(feature_values) for numeric features.
  We impute missing values with an extreme value that is far from observed
  values so it does not incorrectly impact KNN results. 10*max(feature_values)
  is used instead of sys.float_info.max because the latter is large enough to
  cause unexpected float arithmetic errors.

  Args:
    examples_table: Arrow table containing a batch of examples where all
      features are univalent.
    categorical_features: Set of categorical feature names.

  Returns:
    A Dict[FeaturePath, np.ndarray] where the key is the feature path and the
    value is a 1D numpy array corresponding to the feature values.
  """
    num_rows = examples_table.num_rows
    result = {}
    for feature_column in examples_table.itercolumns():
        feature_path = types.FeaturePath([feature_column.name])
        # Assume we have only a single chunk.
        feature_array = feature_column.data.chunk(0)
        # The flattened values may come back as a read-only array. Create a copy
        # as we will be imputing the NaN values in place.
        non_missing_values = np.copy(
            arrow_util.primitive_array_to_numpy(feature_array.flatten()))
        non_missing_parent_indices = arrow_util.primitive_array_to_numpy(
            arrow_util.GetFlattenedArrayParentIndices(feature_array))
        is_categorical_feature = feature_path in categorical_features
        result_dtype = non_missing_values.dtype
        if non_missing_parent_indices.size < num_rows and is_categorical_feature:
            result_dtype = object
        flattened_array = np.ndarray(shape=num_rows, dtype=result_dtype)
        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        missing_parent_indices = np.where(num_values == 0)[0]
        if feature_path in categorical_features:
            imputation_fill_value = CATEGORICAL_FEATURE_IMPUTATION_FILL_VALUE
        else:
            # Also impute any NaN values.
            nan_mask = np.isnan(non_missing_values)
            imputation_fill_value = sys.maxsize
            if not np.all(nan_mask):
                imputation_fill_value = non_missing_values[~nan_mask].max() * 10
            non_missing_values[nan_mask.nonzero()[0]] = imputation_fill_value
        flattened_array[non_missing_parent_indices] = non_missing_values
        if missing_parent_indices.any():
            flattened_array[missing_parent_indices] = imputation_fill_value
        result[feature_path] = flattened_array
    return result
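# A toy illustration of the scatter-style imputation performed above, using
# made-up values: present values land at their parent row indices and rows
# with no value receive the numeric fill value (10 * max of observed values).
import numpy as np

num_rows = 5
non_missing_values = np.array([2.0, 7.0, 1.0])
non_missing_parent_indices = np.array([0, 2, 4])  # rows 1 and 3 have no value
fill_value = non_missing_values.max() * 10
flattened = np.full(num_rows, fill_value)
flattened[non_missing_parent_indices] = non_missing_values
print(flattened)  # [ 2. 70.  7. 70.  1.]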
    def update(self,
               feature_column,
               values_quantiles_combiner,
               weight_column=None):
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_column:
            return

        weight_chunks = (
            weight_column.data.iterchunks() if weight_column else [])
        for feature_array, weight_array in six.moves.zip_longest(
                feature_column.data.iterchunks(), weight_chunks,
                fillvalue=None):
            flattened_value_array = arrow_util.FlattenListArray(feature_array)
            # Note: to_numpy will fail if flattened_value_array is empty.
            if not flattened_value_array:
                continue
            values = flattened_value_array.to_numpy()
            nan_mask = np.isnan(values)
            non_nan_mask = ~nan_mask
            values_no_nan = values[non_nan_mask]
            # This is to avoid integer overflow when computing sum or sum of squares.
            values_no_nan_as_double = values_no_nan.astype(np.float64)
            self.num_nan += np.sum(nan_mask)
            self.sum += np.sum(values_no_nan_as_double)
            self.sum_of_squares += np.sum(values_no_nan_as_double *
                                          values_no_nan_as_double)
            self.min = min(self.min, np.min(values_no_nan))
            self.max = max(self.max, np.max(values_no_nan))
            self.num_zeros += values_no_nan.size - np.count_nonzero(
                values_no_nan)
            self.quantiles_summary = values_quantiles_combiner.add_input(
                self.quantiles_summary,
                [values_no_nan, np.ones_like(values_no_nan)])

            if weight_array:
                example_weights = arrow_util.FlattenListArray(
                    weight_array).to_numpy().astype(np.float32, copy=False)

                if example_weights.size != len(weight_array):
                    raise ValueError('Weight feature must not be missing.')
                value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                    feature_array).to_numpy()
                weights = example_weights[value_parent_indices]
                weights_no_nan = weights[non_nan_mask]
                weighted_values = weights_no_nan * values_no_nan
                self.weighted_sum += np.sum(weighted_values)
                self.weighted_sum_of_squares += np.sum(weighted_values *
                                                       values_no_nan)
                self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                    self.weighted_quantiles_summary,
                    [values_no_nan, weights_no_nan])
                self.weighted_total_num_values += np.sum(weights_no_nan)
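# The sums accumulated above are sufficient to recover weighted moments at
# finalization time. A small numpy check of the relationships, with made-up
# values and weights:
import numpy as np

values = np.array([4.0, 5.0, 6.0])
weights = np.array([1.0, 2.0, 3.0])
total_weight = np.sum(weights)
weighted_sum = np.sum(weights * values)
weighted_sum_of_squares = np.sum(weights * values * values)
weighted_mean = weighted_sum / total_weight
weighted_variance = weighted_sum_of_squares / total_weight - weighted_mean ** 2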
Example #6
    def add_input(self, accumulator, input_table):
        weight_column = (input_table.column(self._weight_feature)
                         if self._weight_feature else None)
        weight_array = weight_column.data.chunk(0) if weight_column else []
        if weight_array:
            flattened_weights = arrow_util.FlattenListArray(
                weight_array).to_numpy()

        for column in input_table.columns:
            feature_name = column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            feature_path = types.FeaturePath([feature_name])
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, column.type)
            # If it's neither a categorical feature nor a string feature, we don't
            # bother with top-k stats.
            if not (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue
            value_array = column.data.chunk(0)
            flattened_values = arrow_util.FlattenListArray(value_array)
            unweighted_counts = collections.Counter()
            # Compute unweighted counts.
            value_counts = arrow_util.ValueCounts(flattened_values)
            for value_count in value_counts:
                value_count = value_count.as_py()
                unweighted_counts[
                    value_count['values']] = value_count['counts']

            # Compute weighted counts if a weight feature is specified.
            weighted_counts = _WeightedCounter()
            if weight_array:
                if (pa.types.is_binary(flattened_values.type)
                        or pa.types.is_string(flattened_values.type)):
                    # No zero-copy conversion to numpy for binary/string values;
                    # fall back to to_pandas().
                    flattened_values_np = flattened_values.to_pandas()
                else:
                    flattened_values_np = flattened_values.to_numpy()
                indices = arrow_util.GetFlattenedArrayParentIndices(
                    value_array)
                weighted_counts.weighted_update(
                    flattened_values_np, flattened_weights[indices.to_numpy()])

            if feature_path not in accumulator:
                accumulator[feature_path] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_path].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_path].weighted_counts.update(
                    weighted_counts)
        return accumulator
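# arrow_util.ValueCounts above returns a struct array of distinct values and
# their counts. With a recent pyarrow, the equivalent can be sketched with
# pyarrow.compute.value_counts (assuming that kernel is available in your
# pyarrow version):
import pyarrow as pa
import pyarrow.compute as pc

flattened = pa.array([b'a', b'b', b'a', b'a'])
value_counts = pc.value_counts(flattened)  # StructArray with 'values'/'counts'
counts = dict(zip(value_counts.field('values').to_pylist(),
                  value_counts.field('counts').to_pylist()))
print(counts)  # {b'a': 3, b'b': 1}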
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial numeric statistics using the input value."""

        # np.max / np.min below cannot handle empty arrays. And there's nothing
        # we can collect in this case.
        if not feature_array:
            return

        flattened_value_array = feature_array.flatten()
        # Note: to_numpy will fail if flattened_value_array is empty.
        if not flattened_value_array:
            return
        values = arrow_util.primitive_array_to_numpy(flattened_value_array)
        nan_mask = np.isnan(values)
        self.num_nan += np.sum(nan_mask)
        non_nan_mask = ~nan_mask
        values_no_nan = values[non_nan_mask]
        # We do this check to avoid failing in np.min/max with empty array.
        if values_no_nan.size == 0:
            return
        # This is to avoid integer overflow when computing sum or sum of squares.
        values_no_nan_as_double = values_no_nan.astype(np.float64)
        self.sum += np.sum(values_no_nan_as_double)
        self.sum_of_squares += np.sum(values_no_nan_as_double *
                                      values_no_nan_as_double)
        # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
        # to numpy 1.16
        self.min = min(self.min, np.min(values_no_nan))
        self.max = max(self.max, np.max(values_no_nan))
        self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary,
            [values_no_nan, np.ones_like(values_no_nan)])
        if weights is not None:
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feature_array))
            flat_weights = weights[value_parent_indices]
            flat_weights_no_nan = flat_weights[non_nan_mask]
            weighted_values = flat_weights_no_nan * values_no_nan
            self.weighted_sum += np.sum(weighted_values)
            self.weighted_sum_of_squares += np.sum(weighted_values *
                                                   values_no_nan)
            self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
                self.weighted_quantiles_summary,
                [values_no_nan, flat_weights_no_nan])
            self.weighted_total_num_values += np.sum(flat_weights_no_nan)
    def _get_univalent_values_with_parent_indices(
            self,
            examples_table: pa.Table) -> Dict[types.FeatureName, pd.DataFrame]:
        """Extracts univalent values for each feature along with parent indices."""
        result = {}
        for feature_column in examples_table.itercolumns():
            feature_name = feature_column.name
            if (self._features_needed is not None
                    and feature_name not in self._features_needed):
                continue
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_name, feature_column.type)
            # Only consider crosses of numeric features.
            # TODO(zhuo): Support numeric features nested under structs.
            if feature_type in (statistics_pb2.FeatureNameStatistics.STRING,
                                statistics_pb2.FeatureNameStatistics.STRUCT):
                continue
            # Assume we have only a single chunk.
            assert feature_column.data.num_chunks == 1
            feat_arr = feature_column.data.chunk(0)
            value_lengths = arrow_util.primitive_array_to_numpy(
                arrow_util.ListLengthsFromListArray(feat_arr))
            univalent_parent_indices = set((value_lengths == 1).nonzero()[0])
            # If there are no univalent values, continue to the next feature.
            if not univalent_parent_indices:
                continue
            non_missing_values = arrow_util.primitive_array_to_numpy(
                feat_arr.flatten())
            value_parent_indices = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(feat_arr))
            if feature_type == statistics_pb2.FeatureNameStatistics.FLOAT:
                # Remove any NaN values if present.
                non_nan_mask = ~np.isnan(non_missing_values)
                non_missing_values = non_missing_values[non_nan_mask]
                value_parent_indices = value_parent_indices[non_nan_mask]
            df = pd.DataFrame({
                feature_name: non_missing_values,
                'parent_index': value_parent_indices
            })
            # Only keep the univalent feature values.
            df = df[df['parent_index'].isin(univalent_parent_indices)]

            result[feature_name] = df

        return result
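# A toy check of the univalent-row filter used above (assuming a pyarrow
# version that exposes ListArray.offsets): only rows whose value list has
# exactly one element are kept.
import numpy as np
import pyarrow as pa

feat_arr = pa.array([[1.0], [2.0, 3.0], [], [4.0]])
offsets = np.asarray(feat_arr.offsets)
value_lengths = offsets[1:] - offsets[:-1]
univalent_parent_indices = set((value_lengths == 1).nonzero()[0].tolist())
print(univalent_parent_indices)  # {0, 3}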
    def add_input(
            self, accumulator: Dict[types.FeaturePath, _ValueCounts],
            input_table: pa.Table) -> Dict[types.FeaturePath, _ValueCounts]:
        for feature_path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_table,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, leaf_array.type)
            # If it's neither a categorical feature nor a string feature, we don't
            # bother with top-k stats.
            if (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                flattened_values = leaf_array.flatten()
                unweighted_counts = collections.Counter()
                # Compute unweighted counts.
                value_counts = arrow_util.ValueCounts(flattened_values)
                values = value_counts.field('values').to_pylist()
                counts = value_counts.field('counts').to_pylist()
                for value, count in six.moves.zip(values, counts):
                    unweighted_counts[value] = count

                # Compute weighted counts if a weight feature is specified.
                weighted_counts = _WeightedCounter()
                if weights is not None:
                    flattened_values_np = arrow_util.primitive_array_to_numpy(
                        flattened_values)
                    parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                        leaf_array)
                    weighted_counts.weighted_update(
                        flattened_values_np,
                        weights[arrow_util.primitive_array_to_numpy(
                            parent_indices)])

                if feature_path not in accumulator:
                    accumulator[feature_path] = _ValueCounts(
                        unweighted_counts=unweighted_counts,
                        weighted_counts=weighted_counts)
                else:
                    accumulator[feature_path].unweighted_counts.update(
                        unweighted_counts)
                    accumulator[feature_path].weighted_counts.update(
                        weighted_counts)

        return accumulator
Example #10
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
    """Generates tuples for computing top-k and uniques from input tables."""
    slice_key, table = sliced_table
    weight_column = table.column(weight_feature) if weight_feature else None
    weight_array = weight_column.data.chunk(0) if weight_column else []
    if weight_array:
        flattened_weights = arrow_util.FlattenListArray(
            weight_array).to_numpy()

    for feature_column in table.columns:
        feature_name = feature_column.name
        # Skip the weight feature.
        if feature_name == weight_feature:
            continue
        feature_path = types.FeaturePath([feature_name])
        # If it's neither a categorical feature nor a string feature, we don't
        # bother with top-k stats.
        if not (feature_path in categorical_features
                or feature_column.type.equals(pa.list_(pa.binary()))
                or feature_column.type.equals(pa.list_(pa.string()))):
            continue
        value_array = feature_column.data.chunk(0)
        flattened_values = arrow_util.FlattenListArray(value_array)

        if weight_array and flattened_values:
            if (pa.types.is_binary(flattened_values.type)
                    or pa.types.is_string(flattened_values.type)):
                # No zero-copy conversion to numpy for binary/string values;
                # fall back to to_pandas().
                flattened_values_np = flattened_values.to_pandas()
            else:
                flattened_values_np = flattened_values.to_numpy()
            indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
            weights_ndarray = flattened_weights[indices.to_numpy()]
            for value, count, weight in _weighted_unique(
                    flattened_values_np, weights_ndarray):
                yield (slice_key, feature_path.steps(), value), (count, weight)
        else:
            value_counts = arrow_util.ValueCounts(flattened_values)
            values = value_counts.field('values').to_pylist()
            counts = value_counts.field('counts').to_pylist()
            for value, count in six.moves.zip(values, counts):
                yield ((slice_key, feature_path.steps(), value), count)
Example #11
    def feature_value_slicer(table):
        """A function that generates sliced tables.

    The naive approach of doing this would be to iterate each row, identify
    slice keys for the row and keep track of index ranges for each slice key.
    And then generate an arrow table for each slice key based on the index
    ranges. This would be expensive as we are identifying the slice keys for
    each row individually and we would have to loop over the feature values
    including crossing them when we have to slice on multiple features. The
    current approach generates the slice keys for a batch by performing joins
    over indices of individual features. And then groups the joined table by
    slice key to get the row indices corresponding to a slice.

    Args:
      table: Arrow table.

    Yields:
      Sliced table (slice_key, Arrow table) where the table contains the rows
      corresponding to a slice.
    """
        per_feature_parent_indices = []
        for feature_name, values in six.iteritems(features):
            column = table.column(feature_name)
            # Assume we have a single chunk.
            feature_array = column.data.chunk(0)
            non_missing_values = arrow_util.FlattenListArray(
                feature_array).to_pandas()
            value_parent_indices = arrow_util.GetFlattenedArrayParentIndices(
                feature_array).to_numpy()
            # Create dataframe with feature value and parent index.
            df = pd.DataFrame({
                feature_name: non_missing_values,
                _PARENT_INDEX_COLUMN: value_parent_indices
            })
            df.drop_duplicates(inplace=True)
            # Filter based on slice values
            if values is not None:
                df = df.loc[df[feature_name].isin(values)]
            per_feature_parent_indices.append(df)

        # Join dataframes based on parent indices.
        # Note that we want the parent indices per slice key to be sorted in the
        # merged dataframe. The individual dataframes have the parent indices in
        # sorted order. We use "inner" join type to preserve the order of the left
        # keys (also note that same parent index rows would be consecutive). Hence
        # we expect the merged dataframe to have sorted parent indices per
        # slice key.
        merged_df = functools.reduce(
            lambda base, update: pd.merge(
                base,
                update,
                how='inner',  # pylint: disable=g-long-lambda
                on=_PARENT_INDEX_COLUMN),
            per_feature_parent_indices)

        # Construct a new column in the merged dataframe with the slice keys.
        merged_df[_SLICE_KEY_COLUMN] = ''
        index = 0
        for col_name in sorted(merged_df.columns):
            if col_name in [_PARENT_INDEX_COLUMN, _SLICE_KEY_COLUMN]:
                continue
            slice_key_col = (_to_slice_key(col_name) + '_' +
                             merged_df[col_name].apply(_to_slice_key))
            if index == 0:
                merged_df[_SLICE_KEY_COLUMN] = slice_key_col
                index += 1
            else:
                merged_df[_SLICE_KEY_COLUMN] += ('_' + slice_key_col)

        # Since the parent indices are sorted per slice key, the groupby would
        # preserve the sorted order within each group.
        per_slice_parent_indices = merged_df.groupby(
            _SLICE_KEY_COLUMN, sort=False)[_PARENT_INDEX_COLUMN]
        for slice_key, parent_indices in per_slice_parent_indices:
            yield (slice_key,
                   merge.SliceTableByRowIndices(table,
                                                parent_indices.to_numpy()))
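# A toy illustration of the join-based slicing described in the docstring
# above, with made-up feature values (not the TFDV implementation): per-feature
# (value, parent_index) frames are inner-joined on parent_index, and the
# composed slice key groups the surviving row indices.
import functools
import pandas as pd

dfs = [
    pd.DataFrame({'country': ['US', 'CA', 'US'], 'parent_index': [0, 1, 2]}),
    pd.DataFrame({'language': ['en', 'en'], 'parent_index': [0, 2]}),
]
merged = functools.reduce(
    lambda base, update: pd.merge(base, update, how='inner', on='parent_index'),
    dfs)
merged['slice_key'] = ('country_' + merged['country'] + '_language_' +
                       merged['language'])
for slice_key, parent_indices in merged.groupby('slice_key',
                                                sort=False)['parent_index']:
    print(slice_key, parent_indices.to_numpy())
# Prints: country_US_language_en [0 2]; row 1 (CA) has no language row and
# therefore drops out of the inner join.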