def update(self,
               feature_column,
               feature_type,
               num_values_quantiles_combiner,
               weight_column=None):
        """Update the partial common statistics using the input value."""
        # All the values in this column are null and we cannot deduce the type
        # of the feature. This is not an error, as this feature might have some
        # values in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_column.name, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays, and there's nothing
        # to collect in this case.
        if not feature_column:
            return

        if weight_column and (feature_column.data.num_chunks !=
                              weight_column.data.num_chunks):
            raise ValueError(
                'Expected the feature column {} and weight column {} to have the '
                'same number of chunks.'.format(feature_column.name,
                                                weight_column.name))

        weight_chunks = (
            weight_column.data.iterchunks() if weight_column else [])
        for feature_array, weight_array in six.moves.zip_longest(
                feature_column.data.iterchunks(), weight_chunks,
                fillvalue=None):
            num_values = arrow_util.ListLengthsFromListArray(
                feature_array).to_numpy()
            none_mask = arrow_util.GetArrayNullBitmapAsByteArray(
                feature_array).to_numpy().view(np.bool_)

            num_values_not_none = num_values[~none_mask]
            self.num_non_missing += (
                len(feature_array) - feature_array.null_count)
            # Guard against chunks in which every value is null; np.min and
            # np.max fail on empty arrays.
            if num_values_not_none.size:
                self.max_num_values = max(np.max(num_values_not_none),
                                          self.max_num_values)
                self.min_num_values = min(np.min(num_values_not_none),
                                          self.min_num_values)
                self.total_num_values += np.sum(num_values_not_none)
                self.num_values_summary = num_values_quantiles_combiner.add_input(
                    self.num_values_summary, [num_values_not_none])

            if weight_array is not None:
                weights = (arrow_util.FlattenListArray(
                    weight_array).to_numpy().astype(np.float32, copy=False))
                if weights.size != num_values.size:
                    raise ValueError('Weight feature must not be missing.')
                self.weighted_total_num_values += np.sum(num_values * weights)
                self.weighted_num_non_missing += np.sum(weights[~none_mask])
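
For reference, the per-chunk bookkeeping above (per-entry list lengths plus a null mask) can be reproduced with only public pyarrow and numpy calls. The sketch below is an illustration of that computation, not the arrow_util implementation, and the helper name chunk_value_counts is made up for this example.

import numpy as np
import pyarrow as pa


def chunk_value_counts(feature_array):
    """Returns (num_values, none_mask) for a list-typed Arrow array.

    Illustrative stand-in for arrow_util.ListLengthsFromListArray and
    arrow_util.GetArrayNullBitmapAsByteArray; not the library code.
    """
    # True where the list entry itself is missing.
    none_mask = feature_array.is_null().to_numpy(zero_copy_only=False)
    # Per-entry value counts from the list offsets; null entries count as 0.
    num_values = np.diff(feature_array.offsets.to_numpy())
    return num_values, none_mask


chunk = pa.array([[1.0, 2.0], None, [3.0]])
num_values, none_mask = chunk_value_counts(chunk)
print(num_values[~none_mask])  # [2 1] -- value counts of the present entries
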
Example #2
  def test_get_array_null_bitmap_as_byte_array(self):
    array = pa.array([], type=pa.int32())
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([], type=pa.uint8())))

    array = pa.array([1, 2, None, 3, None], type=pa.int32())
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(
        null_masks.equals(pa.array([0, 0, 1, 0, 1], type=pa.uint8())))

    array = pa.array([1, 2, 3])
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([0, 0, 0], type=pa.uint8())))

    array = pa.array([None, None, None], type=pa.int32())
    null_masks = arrow_util.GetArrayNullBitmapAsByteArray(array)
    self.assertTrue(null_masks.equals(pa.array([1, 1, 1], type=pa.uint8())))
    # Demonstrate that the returned array can be converted to a numpy boolean
    # array without copying.
    np.testing.assert_equal(
        np.array([True, True, True]), null_masks.to_numpy().view(np.bool_))
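
The same null mask can also be obtained through the public pyarrow API. A minimal sketch for comparison (an equivalent illustration, not necessarily how GetArrayNullBitmapAsByteArray is implemented):

import numpy as np
import pyarrow as pa

array = pa.array([1, 2, None, 3, None], type=pa.int32())
# Array.is_null() yields a boolean Arrow array; converting it to numpy
# requires a copy here, unlike the uint8 byte-array view used above.
null_mask = array.is_null().to_numpy(zero_copy_only=False)
np.testing.assert_array_equal(
    null_mask, np.array([False, False, True, False, True]))
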
    def update(self,
               feature_path: types.FeaturePath,
               feature_array: pa.Array,
               feature_type: types.FeatureNameStatisticsType,
               num_values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Update the partial common statistics using the input value."""
        # All the values in this column are null and we cannot deduce the type
        # of the feature. This is not an error, as this feature might have some
        # values in other batches.
        if feature_type is None:
            return

        if self.type is None:
            self.type = feature_type
        elif self.type != feature_type:
            raise TypeError('Cannot determine the type of feature %s. '
                            'Found values of types %s and %s.' %
                            (feature_path, self.type, feature_type))

        # np.max / np.min below cannot handle empty arrays, and there's nothing
        # to collect in this case.
        if not feature_array:
            return

        num_values = arrow_util.primitive_array_to_numpy(
            arrow_util.ListLengthsFromListArray(feature_array))
        none_mask = arrow_util.primitive_array_to_numpy(
            arrow_util.GetArrayNullBitmapAsByteArray(feature_array)).view(
                np.bool_)

        self.num_non_missing += len(feature_array) - feature_array.null_count
        num_values_not_none = num_values[~none_mask]
        # We do this check to avoid failing in np.min / np.max with an empty array.
        if num_values_not_none.size == 0:
            return
        # Use np.maximum.reduce(num_values_not_none, initial=self.max_num_values)
        # once we upgrade to numpy 1.16
        self.max_num_values = max(np.max(num_values_not_none),
                                  self.max_num_values)
        self.min_num_values = min(np.min(num_values_not_none),
                                  self.min_num_values)
        self.total_num_values += np.sum(num_values_not_none)
        self.num_values_summary = num_values_quantiles_combiner.add_input(
            self.num_values_summary, [num_values_not_none])

        if weights is not None:
            if weights.size != num_values.size:
                raise ValueError('Weight feature must not be missing.')
            self.weighted_total_num_values += np.sum(num_values * weights)
            self.weighted_num_non_missing += np.sum(weights[~none_mask])
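
To make the weighted bookkeeping at the end of update() concrete, here is a small self-contained example of the same arithmetic: each example's weight counts once toward weighted_num_non_missing and is multiplied by that example's value count for weighted_total_num_values. The input arrays below are assumptions for illustration; update() receives them as arguments.

import numpy as np
import pyarrow as pa

feature_array = pa.array([[1, 2, 3], None, [4]])
weights = np.array([0.5, 2.0, 1.0], dtype=np.float32)

num_values = np.diff(feature_array.offsets.to_numpy())               # [3, 0, 1]
none_mask = feature_array.is_null().to_numpy(zero_copy_only=False)   # [F, T, F]

weighted_total_num_values = np.sum(num_values * weights)   # 0.5*3 + 2.0*0 + 1.0*1 = 2.5
weighted_num_non_missing = np.sum(weights[~none_mask])     # 0.5 + 1.0 = 1.5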