def test_value_counts_binary(self):
    """Checks ValueCounts on a binary array against known frequencies."""
    arr = pa.array([b"abc", b"ghi", b"def", b"ghi", b"ghi", b"def"])
    self.assertDictEqual(
        self._value_counts_struct_array_to_dict(arrow_util.ValueCounts(arr)),
        {b"abc": 1, b"ghi": 3, b"def": 2})
# Example 2
def _to_topk_tuples(
    sliced_table: Tuple[Text, pa.Table],
    categorical_features: FrozenSet[types.FeaturePath],
    weight_feature: Optional[Text]
) -> Iterable[Tuple[Tuple[Text, FeaturePathTuple, Any], Union[int, Tuple[
        int, Union[int, float]]]]]:
    """Yields top-k/uniques tuples for every string/categorical leaf array.

    Each yielded item is ((slice_key, feature_path_steps, value), count) for
    unweighted features, or ((...), (count, weight)) when weights are present.
    """
    slice_key, table = sliced_table

    for path, leaf_array, weights in arrow_util.enumerate_arrays(
            table, weight_column=weight_feature, enumerate_leaves_only=True):
        # Only categorical and string features participate in top-k stats.
        if not (path in categorical_features
                or stats_util.get_feature_type_from_arrow_type(
                    path, leaf_array.type)
                == statistics_pb2.FeatureNameStatistics.STRING):
            continue
        flat_values = leaf_array.flatten()
        if weights is None or not flat_values:
            # Fast path: let Arrow compute unweighted value counts.
            struct = arrow_util.ValueCounts(flat_values)
            for value, count in six.moves.zip(
                    struct.field('values').to_pylist(),
                    struct.field('counts').to_pylist()):
                yield ((slice_key, path.steps(), value), count)
        else:
            # Slow path: weighted uniques computed value-by-value in numpy.
            flat_values_np = arrow_util.primitive_array_to_numpy(flat_values)
            parent_indices_np = arrow_util.primitive_array_to_numpy(
                arrow_util.GetFlattenedArrayParentIndices(leaf_array))
            # Each flattened value inherits the weight of its parent row.
            for value, count, weight in _weighted_unique(
                    flat_values_np, weights[parent_indices_np]):
                yield ((slice_key, path.steps(), value), (count, weight))
# Example 3
    def add_input(
            self, accumulator: Dict[types.FeaturePath, _ValueCounts],
            input_table: pa.Table) -> Dict[types.FeaturePath, _ValueCounts]:
        """Accumulates per-feature value counts from one input table.

        Only categorical features and string-typed features are counted; all
        other columns are skipped. Unweighted counts are always computed;
        weighted counts are computed only when a weight feature is configured.

        NOTE(review): only the first chunk of each column is read
        (`.data.chunk(0)`) — presumably inputs are single-chunk tables;
        confirm against the caller.

        Args:
          accumulator: mapping from feature path to its running _ValueCounts.
          input_table: an Arrow table of feature columns.

        Returns:
          The updated accumulator.
        """
        weight_column = (input_table.column(self._weight_feature)
                         if self._weight_feature else None)
        # [] is falsy, so the `if weight_array:` guards below behave the same
        # whether no weight feature is configured or the array is empty.
        weight_array = weight_column.data.chunk(0) if weight_column else []
        if weight_array:
            # One weight per row; indexed via parent indices further down.
            flattened_weights = arrow_util.FlattenListArray(
                weight_array).to_numpy()

        for column in input_table.columns:
            feature_name = column.name
            # Skip the weight feature.
            if feature_name == self._weight_feature:
                continue
            feature_path = types.FeaturePath([feature_name])
            feature_type = stats_util.get_feature_type_from_arrow_type(
                feature_path, column.type)
            # if it's not a categorical feature nor a string feature, we don't bother
            # with topk stats.
            if not (feature_path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue
            value_array = column.data.chunk(0)
            flattened_values = arrow_util.FlattenListArray(value_array)
            unweighted_counts = collections.Counter()
            # Compute unweighted counts.
            value_counts = arrow_util.ValueCounts(flattened_values)
            for value_count in value_counts:
                value_count = value_count.as_py()
                unweighted_counts[
                    value_count['values']] = value_count['counts']

            # Compute weighted counts if a weight feature is specified.
            weighted_counts = _WeightedCounter()
            if weight_array:
                if (pa.types.is_binary(flattened_values.type)
                        or pa.types.is_string(flattened_values.type)):
                    # no free conversion.
                    flattened_values_np = flattened_values.to_pandas()
                else:
                    flattened_values_np = flattened_values.to_numpy()
                indices = arrow_util.GetFlattenedArrayParentIndices(
                    value_array)
                # Each flattened value inherits the weight of its parent row.
                weighted_counts.weighted_update(
                    flattened_values_np, flattened_weights[indices.to_numpy()])

            # Merge into the accumulator, creating an entry on first sight.
            if feature_path not in accumulator:
                accumulator[feature_path] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                accumulator[feature_path].unweighted_counts.update(
                    unweighted_counts)
                accumulator[feature_path].weighted_counts.update(
                    weighted_counts)
        return accumulator
    def add_input(
            self, accumulator: Dict[types.FeaturePath, _ValueCounts],
            input_table: pa.Table) -> Dict[types.FeaturePath, _ValueCounts]:
        """Folds per-feature value counts from `input_table` into the accumulator."""
        for path, leaf_array, weights in arrow_util.enumerate_arrays(
                input_table,
                weight_column=self._weight_feature,
                enumerate_leaves_only=True):
            feature_type = stats_util.get_feature_type_from_arrow_type(
                path, leaf_array.type)
            # Top-k stats are only tracked for categorical/string features.
            if not (path in self._categorical_features or feature_type
                    == statistics_pb2.FeatureNameStatistics.STRING):
                continue
            flat_values = leaf_array.flatten()

            # Unweighted counts come straight from Arrow's value counts.
            struct = arrow_util.ValueCounts(flat_values)
            unweighted_counts = collections.Counter(dict(six.moves.zip(
                struct.field('values').to_pylist(),
                struct.field('counts').to_pylist())))

            # Weighted counts are only computed when a weight feature exists.
            weighted_counts = _WeightedCounter()
            if weights is not None:
                flat_values_np = arrow_util.primitive_array_to_numpy(
                    flat_values)
                parent_indices_np = arrow_util.primitive_array_to_numpy(
                    arrow_util.GetFlattenedArrayParentIndices(leaf_array))
                # Each flattened value inherits its parent row's weight.
                weighted_counts.weighted_update(
                    flat_values_np, weights[parent_indices_np])

            existing = accumulator.get(path)
            if existing is None:
                accumulator[path] = _ValueCounts(
                    unweighted_counts=unweighted_counts,
                    weighted_counts=weighted_counts)
            else:
                existing.unweighted_counts.update(unweighted_counts)
                existing.weighted_counts.update(weighted_counts)

        return accumulator
# Example 5
def _to_topk_tuples(sliced_table, categorical_features, weight_feature=None):
    """Generates tuples for computing top-k and uniques from input tables.

    Yields ((slice_key, feature_path_steps, value), count) for unweighted
    features, or ((...), (count, weight)) when a weight feature is given.

    NOTE(review): only the first chunk of each column is read
    (`.data.chunk(0)`) — presumably inputs are single-chunk tables; confirm
    against the caller.
    """
    slice_key, table = sliced_table
    weight_column = table.column(weight_feature) if weight_feature else None
    # [] is falsy, so the guards below behave the same whether no weight
    # feature is configured or the weight array is empty.
    weight_array = weight_column.data.chunk(0) if weight_column else []
    if weight_array:
        # One weight per row; indexed via parent indices further down.
        flattened_weights = arrow_util.FlattenListArray(
            weight_array).to_numpy()

    for feature_column in table.columns:
        feature_name = feature_column.name
        # Skip the weight feature.
        if feature_name == weight_feature:
            continue
        feature_path = types.FeaturePath([feature_name])
        # if it's not a categorical feature nor a string feature, we don't bother
        # with topk stats.
        if not (feature_path in categorical_features
                or feature_column.type.equals(pa.list_(pa.binary()))
                or feature_column.type.equals(pa.list_(pa.string()))):
            continue
        value_array = feature_column.data.chunk(0)
        flattened_values = arrow_util.FlattenListArray(value_array)

        if weight_array and flattened_values:
            if (pa.types.is_binary(flattened_values.type)
                    or pa.types.is_string(flattened_values.type)):
                # no free conversion.
                flattened_values_np = flattened_values.to_pandas()
            else:
                flattened_values_np = flattened_values.to_numpy()
            indices = arrow_util.GetFlattenedArrayParentIndices(value_array)
            # Each flattened value inherits the weight of its parent row.
            weights_ndarray = flattened_weights[indices.to_numpy()]
            for value, count, weight in _weighted_unique(
                    flattened_values_np, weights_ndarray):
                yield (slice_key, feature_path.steps(), value), (count, weight)
        else:
            # Fast path: unweighted value counts computed by Arrow.
            value_counts = arrow_util.ValueCounts(flattened_values)
            values = value_counts.field('values').to_pylist()
            counts = value_counts.field('counts').to_pylist()
            for value, count in six.moves.zip(values, counts):
                yield ((slice_key, feature_path.steps(), value), count)
 def test_value_counts_empty(self):
     """Checks that ValueCounts of an empty array is an empty mapping."""
     arr = pa.array([])
     self.assertDictEqual(
         self._value_counts_struct_array_to_dict(arrow_util.ValueCounts(arr)),
         {})
 def test_value_counts_integer(self):
     """Checks ValueCounts on an integer array against known frequencies."""
     arr = pa.array([1, 4, 1, 3, 1, 4])
     self.assertDictEqual(
         self._value_counts_struct_array_to_dict(arrow_util.ValueCounts(arr)),
         {1: 3, 4: 2, 3: 1})