示例#1
0
 def _recursion_helper(
     feature_path: types.FeaturePath, array: pa.Array,
     weights: Optional[np.ndarray]
 ) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
     """Yields (path, array, weights) triples, descending into struct fields.

     An array of type list<struct<...>> is flattened and every field of the
     struct is visited recursively under `feature_path.child(field_name)`.
     Any other array is yielded unchanged. The list<struct<...>> array itself
     is yielded as well unless `enumerate_leaves_only` (a name read from the
     enclosing scope) is truthy.

     Args:
       feature_path: the path under which `array` is reported.
       array: the array to enumerate.
       weights: optional ndarray that is indexed with the parent indices of
         the flattened `array` (presumably one weight per element of `array`
         -- confirm against the caller), or None.

     Yields:
       Tuples of (feature path, array, weights), where weights is None
       whenever the `weights` argument was None.
     """
     array_type = array.type
     if (pa.types.is_list(array_type)
             and pa.types.is_struct(array_type.value_type)):
         if not enumerate_leaves_only:
             yield (feature_path, array, weights)
         flat_struct_array = array.flatten()
         flat_weights = None
         if weights is not None:
             # Re-index the weights so that they line up with the flattened
             # struct array instead of with `array`.
             flat_weights = weights[
                 array_util.GetFlattenedArrayParentIndices(
                     array).to_numpy()]
         for field in flat_struct_array.type:
             field_name = field.name
             yield from _recursion_helper(
                 feature_path.child(field_name),
                 flat_struct_array.field(field_name),
                 flat_weights)
     else:
         yield (feature_path, array, weights)
示例#2
0
    def _recursion_helper(
        query_path: types.FeaturePath, array: pa.Array,
        example_indices: Optional[np.ndarray]
    ) -> Tuple[pa.Array, Optional[np.ndarray]]:
        """Follows `query_path` one step at a time through nested structs.

        Args:
          query_path: the remaining path to resolve, relative to `array`.
          array: the array the path is resolved against.
          example_indices: optional ndarray that is re-indexed with the
            parent indices of each flattened level, or None.

        Returns:
          A tuple (array reached by the path, matching indices or None).

        Raises:
          KeyError: if steps remain but `array` is not a list<struct<...>>,
            or if the next step does not name a field of the struct.
        """
        # An exhausted path means `array` is what the caller asked for.
        if not query_path:
            return array, example_indices

        current_type = array.type
        is_list_of_struct = (
            pa.types.is_list(current_type)
            and pa.types.is_struct(current_type.value_type))
        if not is_list_of_struct:
            raise KeyError(
                'Cannot process query_path "{}" inside an array of type '
                '{}. Expecting a list<struct<...>>.'.format(
                    query_path, current_type))

        structs = array.flatten()
        if example_indices is None:
            child_indices = None
        else:
            # Carry the indices down so they line up with the flattened
            # structs.
            parent_positions = array_util.GetFlattenedArrayParentIndices(
                array).to_numpy()
            child_indices = example_indices[parent_positions]

        remaining_steps = query_path.steps()
        head = remaining_steps[0]
        try:
            child = structs.field(head)
        except KeyError:
            raise KeyError('query_path step "{}" not in struct.'.format(head))
        tail_path = types.FeaturePath(remaining_steps[1:])
        return _recursion_helper(tail_path, child, child_indices)
    def update(self,
               feature_array: pa.Array,
               values_quantiles_combiner: Any,
               weights: Optional[np.ndarray] = None) -> None:
        """Folds one batch of numeric values into the partial statistics.

        Args:
          feature_array: the batch of values; it is flattened before use.
          values_quantiles_combiner: object whose `add_input(summary, inputs)`
            returns the updated quantiles summary.
          weights: optional ndarray indexed with the parent indices of the
            flattened values. When given, the weighted statistics are
            updated too.
        """
        # An empty batch contributes nothing, and np.min / np.max further
        # down cannot handle empty arrays.
        if not feature_array:
            return

        flat_array = feature_array.flatten()
        # Converting an empty flattened array to numpy would fail.
        if not flat_array:
            return

        all_values = np.asarray(flat_array)
        is_nan = np.isnan(all_values)
        is_valid = ~is_nan
        self.num_nan += np.sum(is_nan)

        valid_values = all_values[is_valid]
        # Every remaining statistic needs at least one non-NaN value.
        if valid_values.size == 0:
            return

        # Accumulate in float64 so that integer inputs cannot overflow in the
        # sum or the sum of squares.
        as_float64 = valid_values.astype(np.float64)
        self.sum += np.sum(as_float64)
        self.sum_of_squares += np.sum(as_float64 * as_float64)

        # TODO: switch to np.minimum.reduce(valid_values, initial=self.min)
        # once numpy >= 1.16 can be assumed.
        batch_min = np.min(valid_values)
        batch_max = np.max(valid_values)
        self.min = min(self.min, batch_min)
        self.max = max(self.max, batch_max)

        # NOTE(review): the finite range is only refreshed for batches that
        # contain an infinity -- confirm that consumers expect this.
        batch_has_infinity = (batch_min == float('-inf')
                              or batch_max == float('inf'))
        if batch_has_infinity:
            finite_only = valid_values[np.isfinite(valid_values)]
            if finite_only.size > 0:
                self.finite_min = min(self.finite_min, np.min(finite_only))
                self.finite_max = max(self.finite_max, np.max(finite_only))

        zero_count = valid_values.size - np.count_nonzero(valid_values)
        self.num_zeros += zero_count

        # The unweighted summary treats every value as having weight one.
        unit_weights = np.ones_like(valid_values)
        self.quantiles_summary = values_quantiles_combiner.add_input(
            self.quantiles_summary, [valid_values, unit_weights])

        if weights is None:
            return

        # Expand the weights to one entry per flattened value, then drop the
        # entries that belong to NaN values.
        parent_positions = np.asarray(
            array_util.GetFlattenedArrayParentIndices(feature_array))
        value_weights = weights[parent_positions][is_valid]
        weighted = value_weights * valid_values
        self.weighted_sum += np.sum(weighted)
        self.weighted_sum_of_squares += np.sum(weighted * valid_values)
        self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
            self.weighted_quantiles_summary, [valid_values, value_weights])
        self.weighted_total_num_values += np.sum(value_weights)
示例#4
0
def primitive_array_to_numpy(primitive_array: pa.Array) -> np.ndarray:
    """Converts a primitive Arrow array to a numpy 1-D ndarray.

    Copying is avoided as much as possible.

    Args:
      primitive_array: a primitive Arrow array.

    Returns:
      A 1-D ndarray of corresponding type (for string/binary arrays, the
      result has object dtype and holds str or bytes values).
    """
    array_type = primitive_array.type
    needs_copy = (pa.types.is_binary(array_type)
                  or pa.types.is_string(array_type)
                  or primitive_array.null_count > 0)
    if needs_copy:
        # No free conversion. np.asarray is used instead of to_pandas()
        # because the latter returns a pandas Series on newer pyarrow
        # releases, which would contradict the declared ndarray return type.
        return np.asarray(primitive_array)
    return primitive_array.to_numpy()
示例#5
0
    def add_input(self, accumulator: _PartialTimeStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialTimeStats:
        """Returns result of folding a batch of inputs into the current accumulator.

        STRING and INT batches are flattened and passed to
        `accumulator.update`. A batch of any other (non-null) feature type
        invalidates the accumulator, as does a STRING batch containing a
        falsy element after UTF-8 conversion. An already invalidated
        accumulator is returned untouched.

        Args:
          accumulator: The current accumulator.
          feature_path: The path of the feature.
          feature_array: An arrow Array representing a batch of feature values
            which should be added to the accumulator.

        Returns:
          The accumulator after updating the statistics for the batch of inputs.
        """
        if accumulator.invalidated:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        if feature_type == statistics_pb2.FeatureNameStatistics.STRING:

            def _maybe_get_utf8(val):
                # Only bytes need converting; other values pass through.
                return stats_util.maybe_get_utf8(val) if isinstance(
                    val, bytes) else val

            values = np.asarray(feature_array.flatten())
            # Use the builtin `object`: the `np.object` alias was deprecated
            # in NumPy 1.20 and removed in 1.24.
            maybe_utf8 = np.vectorize(_maybe_get_utf8,
                                      otypes=[object])(values)
            # NOTE(review): any falsy element invalidates the accumulator,
            # which presumably targets failed conversions but also trips on
            # empty strings -- confirm that this is intended.
            if not maybe_utf8.all():
                accumulator.invalidated = True
                return accumulator
            accumulator.update(maybe_utf8, feature_type)
        elif feature_type == statistics_pb2.FeatureNameStatistics.INT:
            values = np.asarray(feature_array.flatten())
            accumulator.update(values, feature_type)
        else:
            accumulator.invalidated = True

        return accumulator
示例#6
0
    def add_input(self, accumulator: _PartialImageStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialImageStats:
        """Folds a batch of values into the image statistics accumulator.

        Args:
          accumulator: The current accumulator.
          feature_path: The path of the feature.
          feature_array: An arrow array representing a batch of feature values
            which should be added to the accumulator.

        Returns:
          The accumulator after updating the statistics for the batch of
          inputs. It is returned unchanged if it was already invalidated or
          if the batch is a null array, and it is invalidated if the batch
          is not of STRING type.
        """
        if accumulator.invalidate:
            return accumulator

        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # A null array has nothing to contribute.
        if feature_type is None:
            return accumulator
        # Only STRING features are supported; any other type invalidates.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        # TODO: consider a memoryview to avoid the copy once arrow >= 0.12 is
        # available. That would require iterating over the values in a loop
        # in the logic below.
        flat_values = feature_array.flatten()
        values = arrow_util.primitive_array_to_numpy(flat_values)
        accumulator.total_num_values += values.size

        image_formats = self._image_decoder.get_formats(values)
        has_format = ~pd.isnull(image_formats)
        known_formats = image_formats[has_format]

        # Count each recognised format once per occurrence.
        unique_formats, occurrences = np.unique(known_formats,
                                                return_counts=True)
        for image_format, count in zip(unique_formats, occurrences):
            accumulator.counter_by_format[image_format] += count

        # Values without a format are counted under the empty-string key.
        unknown_count = image_formats.size - known_formats.size
        if unknown_count > 0:
            accumulator.counter_by_format[''] += unknown_count

        if not self._enable_size_stats:
            return accumulator

        # Sizes are only requested for values whose format was recognised.
        image_sizes = self._image_decoder.get_sizes(values[has_format])
        if image_sizes.any():
            # Column 0 feeds max_height and column 1 feeds max_width.
            largest = np.max(image_sizes, axis=0)
            accumulator.max_height = max(accumulator.max_height, largest[0])
            accumulator.max_width = max(accumulator.max_width, largest[1])

        return accumulator
 def update(self, feature_array: pa.Array) -> None:
     """Folds one batch of values into the partial bytes statistics.

     Args:
       feature_array: the batch of values; null-typed arrays are ignored.

     Raises:
       ValueError: if the flattened values are of integer or floating type.
     """
     if pa.types.is_null(feature_array.type):
         return

     flat_array = feature_array.flatten()
     value_type = flat_array.type
     is_numeric = (pa.types.is_floating(value_type)
                   or pa.types.is_integer(value_type))
     if is_numeric:
         raise ValueError(
             'Bytes stats cannot be computed on INT/FLOAT features.')

     # np.min / np.max below need at least one element.
     if not flat_array:
         return

     element_lengths = array_util.GetElementLengths(flat_array).to_numpy()
     self.min_num_bytes = min(self.min_num_bytes, np.min(element_lengths))
     self.max_num_bytes = max(self.max_num_bytes, np.max(element_lengths))
     self.total_num_bytes += np.sum(element_lengths)
    def update(self, feature_array: pa.Array) -> None:
        """Adds the byte length of one batch to the partial string statistics.

        Binary and unicode values contribute their total byte size. Values
        of any other type contribute the length of their `str()` form.

        Args:
          feature_array: the batch of values; it is flattened before use.
        """
        flat_array = feature_array.flatten()
        value_type = flat_array.type

        if pa.types.is_binary(value_type) or pa.types.is_unicode(value_type):
            # GetBinaryArrayTotalByteSize returns a Python long (for Python 3
            # compatibility); converting to int keeps the arithmetic cheap
            # on Python 2.
            total_size = arrow_util.GetBinaryArrayTotalByteSize(flat_array)
            self.total_bytes_length += int(total_size)
            return

        # The numpy conversion below is only possible for non-empty arrays.
        if not flat_array:
            return

        # This could be computed faster by taking log10 of the integer.
        def _rendered_length(value):
            return len(str(value))

        length_of = np.vectorize(_rendered_length, otypes=[np.int32])
        numpy_values = arrow_util.primitive_array_to_numpy(flat_array)
        self.total_bytes_length += np.sum(length_of(numpy_values))
    def add_input(self, accumulator: _PartialNLStats,
                  feature_path: types.FeaturePath,
                  feature_array: pa.Array) -> _PartialNLStats:
        """Return result of folding a batch of inputs into accumulator.

        The accumulator is invalidated if the batch is not of STRING type or
        if it contains a bytes value for which `stats_util.maybe_get_utf8`
        returns None. Otherwise every value is counted as considered, and
        each value for which `self._classifier.classify` returns a truthy
        result is counted as matched.

        Args:
          accumulator: The current accumulator.
          feature_path: The path of the feature.
          feature_array: An arrow Array representing a batch of feature values
            which should be added to the accumulator.

        Returns:
          The accumulator after updating the statistics for the batch of inputs.
        """
        if accumulator.invalidate:
            return accumulator
        feature_type = stats_util.get_feature_type_from_arrow_type(
            feature_path, feature_array.type)
        # Ignore null array.
        if feature_type is None:
            return accumulator
        # If we see a different type, invalidate.
        if feature_type != statistics_pb2.FeatureNameStatistics.STRING:
            accumulator.invalidate = True
            return accumulator

        def _is_non_utf8(value):
            return (isinstance(value, bytes)
                    and stats_util.maybe_get_utf8(value) is None)

        # Use the builtin `bool`: the `np.bool` alias was deprecated in
        # NumPy 1.20 and removed in 1.24.
        is_non_utf_vec = np.vectorize(_is_non_utf8, otypes=[bool])
        classify_vec = np.vectorize(self._classifier.classify,
                                    otypes=[bool])
        values = np.asarray(feature_array.flatten())
        # Check the whole batch first so that a bad value leaves the
        # counters untouched.
        if np.any(is_non_utf_vec(values)):
            accumulator.invalidate = True
            return accumulator
        accumulator.considered += values.size
        accumulator.matched += np.sum(classify_vec(values))
        return accumulator