def or_array_nparray(a: pa.Array, b: np.ndarray) -> pa.Array:
    """Perform ``pa.Array | np.ndarray``."""
    output_length = len(a) // 8
    if len(a) % 8 != 0:
        output_length += 1

    if a.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned_with_numpy_nonnull(
            len(a), a.buffers()[1], a.offset, b, result
        )
        return pa.Array.from_buffers(
            pa.bool_(), len(a), [None, pa.py_buffer(result)], 0
        )
    else:
        result = np.zeros(output_length, dtype=np.uint8)
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        null_count = bitmap_or_unaligned_with_numpy(
            len(a), a.buffers()[0], a.buffers()[1], a.offset, b, result, valid_bits
        )
        return pa.Array.from_buffers(
            pa.bool_(),
            len(a),
            [pa.py_buffer(valid_bits), pa.py_buffer(result)],
            null_count,
        )
def _arrow_array_to_numpy(self, pa_array: Union[pa.Array, pa.ChunkedArray]) -> np.ndarray:
    zero_copy_only = _is_zero_copy_only(pa_array.type)
    if isinstance(pa_array, pa.ChunkedArray):
        # don't call to_numpy() directly or we end up with an ndarray with dtype object;
        # call to_numpy on the chunks instead
        # for ArrayExtensionArray call to_pylist directly to support dynamic dimensions
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            array: List = [row for chunk in pa_array.chunks for row in chunk.to_pylist()]
        else:
            array: List = [
                row
                for chunk in pa_array.chunks
                for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
            ]
    else:
        # cast to list of arrays or we end up with an ndarray with dtype object
        # for ArrayExtensionArray call to_pylist directly to support dynamic dimensions
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            array: List = pa_array.to_pylist()
        else:
            array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    if len(array) > 0:
        if any(
            isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape)
            for x in array
        ):
            # np.object was removed in NumPy 1.24; plain `object` is the correct dtype
            return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
    return np.array(array, copy=False, **self.np_array_kwargs)
def _text_contains_case_sensitive(data: pa.Array, pat: str) -> pa.Array:
    """
    Check for each element in the data whether it contains the pattern ``pat``.

    This implementation does basic byte-by-byte comparison and is independent
    of any locales or encodings.
    """
    # Convert to UTF-8 bytes
    pat_bytes: bytes = pat.encode()

    offsets_buffer, data_buffer = _extract_string_buffers(data)

    if data.null_count == 0:
        valid_buffer = np.empty(0, dtype=np.uint8)
    else:
        valid_buffer = _buffer_to_view(data.buffers()[0])

    output = _text_contains_case_sensitive_numba(
        len(data), valid_buffer, data.offset, offsets_buffer, data_buffer, pat_bytes
    )
    if data.null_count == 0:
        output_valid = None
    else:
        output_valid = data.buffers()[0].slice(data.offset // 8)
        if data.offset % 8 != 0:
            output_valid = shift_unaligned_bitmap(
                output_valid, data.offset % 8, len(data)
            )

    buffers = [output_valid, pa.py_buffer(output)]
    return pa.Array.from_buffers(pa.bool_(), len(data), buffers, data.null_count)
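# Hedged usage sketch (assumes the numba kernels referenced above are
# importable from their defining module). Matching is byte-wise on UTF-8;
# nulls propagate to the result:
_text_contains_case_sensitive(pa.array(["hello", None, "world"]), "lo")
# -> [True, null, False]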
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 cannot be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        # we have to coerce before combining chunks, because pyarrow panics if
        # offsets overflow
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pyarrow does not seem to support casting from list to large_list,
            # so we convert to large list ourselves and do the re-alloc on the
            # polars/arrow side
            chunks = []
            for arr in array.iterchunks():
                chunks.append(pl.from_arrow(arr).to_arrow())
            array = pa.chunked_array(chunks)
        array = array.combine_chunks()
    return array
def arrow_array_to_array_of_proto(
    arrow_type: pa.DataType, arrow_array: pa.Array
) -> List[Value_pb2.Value]:
    values = []
    if isinstance(arrow_type, pa.ListType):
        proto_list_class = ARROW_LIST_TYPE_TO_PROTO_LIST_CLASS[arrow_type.value_type]
        proto_field_name = ARROW_LIST_TYPE_TO_PROTO_FIELD[arrow_type.value_type]
        if arrow_type.value_type == PA_TIMESTAMP_TYPE:
            arrow_array = arrow_array.cast(pa.list_(pa.int64()))
        for v in arrow_array.tolist():
            values.append(
                Value_pb2.Value(**{proto_field_name: proto_list_class(val=v)})
            )
    else:
        proto_field_name = ARROW_TYPE_TO_PROTO_FIELD[arrow_type]
        if arrow_type == PA_TIMESTAMP_TYPE:
            arrow_array = arrow_array.cast(pa.int64())
        for v in arrow_array.tolist():
            values.append(Value_pb2.Value(**{proto_field_name: v}))
    return values
def first(*, array: pa.Array, group_splits: np.ndarray, **kwargs) -> pa.Array:
    nonnull_values = array.filter(array.is_valid())
    nonnull_splits = nonnull_group_splits(array, group_splits)
    starts = np.insert(nonnull_splits, 0, 0)
    ends = np.append(nonnull_splits, len(nonnull_values))
    nulls = starts == ends
    indices = pa.array(starts, pa.int64(), mask=nulls)
    return nonnull_values.take(indices)  # taking index NULL gives NULL
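# Hedged usage sketch (relies on nonnull_group_splits, defined later in this
# collection). Groups here are [10], [None], [None, 30]:
first(array=pa.array([10, None, None, 30]), group_splits=np.array([1, 2]))
# -> [10, null, 30]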
def _extract_string_buffers(arr: pa.Array) -> Tuple[np.ndarray, np.ndarray]:
    start = arr.offset
    end = arr.offset + len(arr)

    offsets = np.asanyarray(arr.buffers()[1]).view(np.int32)[start : end + 1]
    data = np.asanyarray(arr.buffers()[2]).view(np.uint8)

    return offsets, data
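# Hedged sketch of what the helper returns for a small string array: the
# offsets are int32 positions into the flat UTF-8 data buffer, and a null
# element has equal consecutive offsets.
_offsets, _data = _extract_string_buffers(pa.array(["ab", None, "cde"]))
# _offsets -> [0, 2, 2, 5]; _data holds b"abcde" as uint8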
def _downcast_array(array: pa.Array) -> pa.Array:
    if array.type in (pa.float64(),):
        array = array.cast(pa.float32())
    elif array.type in (pa.int64(),):
        array = array.cast(pa.uint16())
    elif array.type in (pa.string(), pa.bool_()):
        pass
    else:
        raise Exception(f"Did not downcast array with type '{array.type}'.")
    return array
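# Hedged usage sketch: note that int64 -> uint16 is aggressive; pyarrow's
# cast is safe by default, so values outside the uint16 range raise instead
# of wrapping around.
_downcast_array(pa.array([1.5, 2.5]))  # float64 -> float32
_downcast_array(pa.array([1, 2]))      # int64 -> uint16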
def nunique(*, array: pa.Array, group_splits: np.ndarray, **kwargs) -> pa.Array:
    nonnull_splits = nonnull_group_splits(array, group_splits)
    nonnull_values = array.filter(array.is_valid()).to_numpy(zero_copy_only=False)
    counts = np.fromiter(
        (
            np.unique(subarr).size
            for subarr in np.split(nonnull_values, nonnull_splits)
        ),
        dtype=np.int64,
        count=len(nonnull_splits) + 1,
    )
    return pa.array(counts)
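# Hedged usage sketch: nulls are dropped before counting distinct values,
# so a group containing only nulls counts 0.
nunique(array=pa.array([1, 1, None, 2, 3]), group_splits=np.array([2, 3]))
# groups [1, 1], [None], [2, 3] -> [1, 0, 2]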
def _extract_data_buffer_as_np_array(array: pa.Array) -> np.ndarray:
    """Extract the data buffer of a numeric-typed pyarrow.Array as an np.ndarray."""
    dtype = array.type.to_pandas_dtype()
    start = array.offset
    end = array.offset + len(array)
    if pa.types.is_boolean(array.type):
        return np.unpackbits(
            _buffer_to_view(array.buffers()[1]).view(np.uint8), bitorder="little"
        )[start:end].astype(bool)
    else:
        return _buffer_to_view(array.buffers()[1]).view(dtype)[start:end]
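# Hedged usage sketch (assumes the _buffer_to_view helper used above is
# available; see _extract_isnull_bitmap at the end of this collection for
# the same dependency). Booleans are stored bit-packed, hence the
# unpackbits path:
_extract_data_buffer_as_np_array(pa.array([1, 2, 3]))      # -> array([1, 2, 3])
_extract_data_buffer_as_np_array(pa.array([True, False]))  # -> array([ True, False])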
def reencode_dictionary_array(array: pa.Array) -> pa.Array:
    if len(array.indices) <= len(array.dictionary):
        # Groupby often reduces the number of values considerably. Let's shy
        # away from dictionary when it gives us literally nothing.
        return array.cast(pa.utf8())

    used = np.zeros(len(array.dictionary), np.bool_)
    used[array.indices] = True
    if np.all(used):
        return array  # no edit

    return array.cast(pa.utf8()).dictionary_encode()  # TODO optimize
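# Hedged usage sketch (assumes a pyarrow version where DictionaryArray.take
# keeps the full dictionary): after a selection, fewer indices than
# dictionary entries means the encoding buys nothing.
_dict_arr = pa.array(["a", "b", "b", "c"]).dictionary_encode()
reencode_dictionary_array(_dict_arr)                      # all entries used -> unchanged
reencode_dictionary_array(_dict_arr.take(pa.array([0])))  # 1 index, 3 entries -> plain utf8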
def _arrow_array_to_numpy(self, pa_array: Union[pa.Array, pa.ChunkedArray]) -> np.ndarray:
    if isinstance(pa_array, pa.ChunkedArray):
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            # don't call to_pylist() to preserve dtype of the fixed-size array
            zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
            if pa_array.type.shape[0] is None:
                array: List = [
                    row
                    for chunk in pa_array.chunks
                    for row in chunk.to_list_of_numpy(zero_copy_only=zero_copy_only)
                ]
            else:
                array: List = [
                    row
                    for chunk in pa_array.chunks
                    for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
                ]
        else:
            zero_copy_only = _is_zero_copy_only(pa_array.type) and all(
                not _is_array_with_nulls(chunk) for chunk in pa_array.chunks
            )
            array: List = [
                row
                for chunk in pa_array.chunks
                for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
            ]
    else:
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            # don't call to_pylist() to preserve dtype of the fixed-size array
            zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
            if pa_array.type.shape[0] is None:
                array: List = pa_array.to_list_of_numpy(zero_copy_only=zero_copy_only)
            else:
                array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only)
        else:
            zero_copy_only = _is_zero_copy_only(pa_array.type) and not _is_array_with_nulls(
                pa_array
            )
            array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()

    if len(array) > 0:
        if any(
            (isinstance(x, np.ndarray) and (x.dtype == object or x.shape != array[0].shape))
            or (isinstance(x, float) and np.isnan(x))
            for x in array
        ):
            # np.object was removed in NumPy 1.24; plain `object` is the correct dtype
            return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": object})
    return np.array(array, copy=False, **self.np_array_kwargs)
def from_arrow(cls, data: pa.Array):
    dtype = Decimal64Dtype.from_arrow(data.type)
    mask_buf = data.buffers()[0]
    mask = mask_buf if mask_buf is None else pa_mask_buffer_to_mask(mask_buf, len(data))
    # The decimal128 data buffer holds 16 bytes per value. Viewed as int64
    # words on a little-endian machine, each value is a (low, high) pair;
    # data_128[::2] keeps the low 64-bit word of every value.
    data_128 = cp.array(np.frombuffer(data.buffers()[1]).view("int64"))
    data_64 = data_128[::2].copy()
    return cls(
        data=Buffer(data_64.view("uint8")),
        size=len(data),
        dtype=dtype,
        mask=mask,
    )
def flatten_nested(
    array: pa.Array, return_parent_indices: bool = False
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Flattens all the list arrays nesting an array.

    If `array` is not list-like, itself will be returned.

    Args:
        array: pa.Array to flatten.
        return_parent_indices: If True, also returns the parent indices array.

    Returns:
        A tuple. The first term is the flattened array. The second term is None
        if `return_parent_indices` is False; otherwise it's a parent indices
        array parallel to the flattened array: if parent_indices[i] = j, then
        flattened_array[i] belongs to the j-th element of the input array.
    """
    parent_indices = None

    while is_list_like(array.type):
        if return_parent_indices:
            cur_parent_indices = array_util.GetFlattenedArrayParentIndices(
                array
            ).to_numpy()
            if parent_indices is None:
                parent_indices = cur_parent_indices
            else:
                parent_indices = parent_indices[cur_parent_indices]
        array = array.flatten()

    # the array was not nested in the first place.
    if return_parent_indices and parent_indices is None:
        parent_indices = np.arange(len(array))
    return array, parent_indices
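# Hedged usage sketch (assumes tfx_bsl's array_util extension module is
# available): one level of nesting is flattened, and each value maps back
# to the row it came from.
flatten_nested(pa.array([[1, 2], [], [3]]), return_parent_indices=True)
# -> (<pa.Array [1, 2, 3]>, array([0, 0, 2]))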
def _get_validity_buffer(
    self, arr: pa.Array
) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]]:
    """
    Get column's validity buffer.

    Parameters
    ----------
    arr : pa.Array
        PyArrow array holding column's data.

    Returns
    -------
    tuple or None
        Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the
        buffer's underlying data. None if column is non-nullable
        (``self.describe_null == ColumnNullType.NON_NULLABLE``).
    """
    # According to Arrow's memory layout, the validity buffer is always at
    # position zero:
    # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
    validity_buffer = arr.buffers()[0]
    if validity_buffer is None:
        return None

    # If present, the validity buffer is always a bitmask.
    data_size = self._get_buffer_size(bit_width=1)
    return (
        OmnisciProtocolBuffer(validity_buffer, data_size),
        (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE),
    )
def _get_data_buffer(
    self, arr: pa.Array
) -> Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]:
    """
    Get column's data buffer.

    Parameters
    ----------
    arr : pa.Array
        PyArrow array holding column's data.

    Returns
    -------
    tuple
        Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the
        buffer's underlying data.
    """
    if self.dtype[0] == DTypeKind.CATEGORICAL:
        # For dictionary data the buffer has to return category codes
        arr = arr.indices

    arrow_type = self._dtype_from_pyarrow(arr.type)
    buff_size = (
        self._get_buffer_size(bit_width=arrow_type[1])
        if self.dtype[0] != DTypeKind.STRING
        # We don't chunk string buffers as it would require modifying offset values,
        # so just return the whole data buffer for every chunk.
        else None
    )

    return (
        # According to Arrow's memory layout, the data buffer is always at
        # the last position of `.buffers()`:
        # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
        OmnisciProtocolBuffer(arr.buffers()[-1], buff_size),
        arrow_type,
    )
def _recursion_helper(
    query_path: types.FeaturePath,
    array: pa.Array,
    example_indices: Optional[np.ndarray],
) -> Tuple[pa.Array, Optional[np.ndarray]]:
    """Recursion helper."""
    if not query_path:
        return array, example_indices
    array_type = array.type
    if not is_list_like(array_type) or not pa.types.is_struct(array_type.value_type):
        raise KeyError(
            'Cannot process query_path "{}" inside an array of type {}. '
            'Expecting a (large_)list<struct<...>>.'.format(query_path, array_type)
        )
    flat_struct_array = array.flatten()
    flat_indices = None
    if example_indices is not None:
        flat_indices = example_indices[
            array_util.GetFlattenedArrayParentIndices(array).to_numpy()
        ]

    step = query_path.steps()[0]
    try:
        child_array = flat_struct_array.field(step)
    except KeyError:
        raise KeyError('query_path step "{}" not in struct.'.format(step))
    relative_path = types.FeaturePath(query_path.steps()[1:])
    return _recursion_helper(relative_path, child_array, flat_indices)
def _text_strip(data: pa.Array, to_strip) -> pa.Array:
    """
    Strip the characters of ``to_strip`` from start and end of each element
    in the data.
    """
    if len(data) == 0:
        return data

    offsets, data_buffer = _extract_string_buffers(data)

    valid_buffer = data.buffers()[0]
    valid_offset = data.offset
    builder = StringArrayBuilder(max(len(data_buffer), len(data)))

    _do_strip(
        valid_buffer,
        valid_offset,
        offsets,
        data_buffer,
        len(data),
        to_strip,
        inout_builder=builder,
    )

    result_array = finalize_string_array(builder, pa.string())
    return result_array
def _ListArrayToTensor(
    self, list_array: pa.Array, produce_eager_tensors: bool
) -> Union[np.ndarray, tf.Tensor]:
    """Converts a ListArray to a dense tensor."""
    values = list_array.flatten()
    batch_size = len(list_array)
    expected_num_elements = batch_size * self._unbatched_flat_len
    if len(values) != expected_num_elements:
        raise ValueError(
            "Unable to convert a {} to a tensor of type spec {}: size mismatch. "
            "Expected {} elements but got {}. "
            "If your data type is tf.Example, make sure that the feature "
            "is always present, and has the same length in all the examples. "
            "TFX users should make sure there is no data anomaly for the feature."
            .format(
                type(list_array), self.type_spec, expected_num_elements, len(values)
            )
        )
    actual_shape = list(self._shape)
    actual_shape[0] = batch_size
    if self._convert_to_binary_fn is not None:
        values = self._convert_to_binary_fn(values)
    values_np = np.asarray(values).reshape(actual_shape)
    if produce_eager_tensors:
        return tf.convert_to_tensor(values_np)
    return values_np
def _recursion_helper(
    feature_path: types.FeaturePath,
    array: pa.Array,
    weights: Optional[np.ndarray],
) -> Iterable[Tuple[types.FeaturePath, pa.Array, Optional[np.ndarray]]]:
    """Recursion helper."""
    array_type = array.type
    if is_list_like(array_type) and pa.types.is_struct(array_type.value_type):
        if not enumerate_leaves_only:
            yield (feature_path, array, weights)
        flat_struct_array = array.flatten()
        flat_weights = None
        if weights is not None:
            flat_weights = weights[
                array_util.GetFlattenedArrayParentIndices(array).to_numpy()
            ]
        for field in flat_struct_array.type:
            field_name = field.name
            yield from _recursion_helper(
                feature_path.child(field_name),
                flat_struct_array.field(field_name),
                flat_weights,
            )
    else:
        yield (feature_path, array, weights)
def ufunc_caller(*, array: pa.Array, group_splits: np.ndarray, **kwargs) -> pa.Array:
    # force_otype and call_ufunc come from the enclosing factory's scope
    nonnull_splits = nonnull_group_splits(array, group_splits)
    nonnull_values = array.filter(array.is_valid()).to_numpy(zero_copy_only=False)
    if force_otype:
        otype = force_otype
    else:
        otype = nonnull_values.dtype
    if pa.types.is_unicode(array.type):
        zero = ""
    else:
        zero = otype.type()
    np_result, np_empty_indices = call_ufunc(nonnull_values, nonnull_splits, otype, zero)
    return pa.array(np_result, mask=np_empty_indices)
def coerce_arrow(array: pa.Array) -> pa.Array:
    # also coerces timezone to naive representation
    # units are accounted for by pyarrow
    if "timestamp" in str(array.type):
        warnings.warn(
            "Conversion of (potentially) timezone aware to naive datetimes. TZ information may be lost",
        )
        ts_ms = pa.compute.cast(array, pa.timestamp("ms"), safe=False)
        ms = pa.compute.cast(ts_ms, pa.int64())
        del ts_ms
        array = pa.compute.cast(ms, pa.date64())
        del ms
    # note: Decimal256 cannot be cast to float
    elif isinstance(array.type, pa.Decimal128Type):
        array = pa.compute.cast(array, pa.float64())
    # simplest solution is to cast to (large)-string arrays;
    # this is a copy and expensive
    elif isinstance(array.type, pa.DictionaryType):
        if pa.types.is_string(array.type.value_type):
            array = pa.compute.cast(array, pa.large_utf8())
        else:
            raise ValueError(
                "polars does not support dictionary encoded types other than strings"
            )

    if hasattr(array, "num_chunks") and array.num_chunks > 1:
        if pa.types.is_string(array.type):
            array = pa.compute.cast(array, pa.large_utf8())
        elif pa.types.is_list(array.type):
            # pa.large_list requires the value type; calling it without
            # arguments raises a TypeError
            array = pa.compute.cast(array, pa.large_list(array.type.value_type))
        array = array.combine_chunks()
    return array
def ToSingletonListArray(array: pa.Array):
    """Converts an array of `type` to a `ListArray<type>`.

    Where result[i] is null if array[i] is null; [array[i]] otherwise.

    Args:
        array: an arrow Array.

    Returns:
        a ListArray.
    """
    array_size = len(array)
    # fast path: values are not copied.
    if array.null_count == 0:
        return pa.ListArray.from_arrays(
            pa.array(np.arange(0, array_size + 1, dtype=np.int32)), array
        )

    # null_mask[i] = 1 iff array[i] is null.
    null_mask = np.asarray(GetArrayNullBitmapAsByteArray(array))
    # presence_mask[i] = 0 iff array[i] is null
    presence_mask = np.subtract(1, null_mask, dtype=np.uint8)
    offsets_np = np.zeros((array_size + 1,), np.int32)
    np.cumsum(presence_mask, out=offsets_np[1:])

    # This is the null mask over offsets (but ListArray.from_arrays() uses it as
    # the null mask for the ListArray), so its length is array_size + 1, but the
    # last element is always False.
    # np.bool was removed in NumPy 1.24; np.bool_ is the actual scalar type.
    list_array_null_mask = np.zeros((array_size + 1,), np.bool_)
    list_array_null_mask[:array_size] = null_mask.view(np.bool_)
    values_non_null = array.take(pa.array(np.flatnonzero(presence_mask)))
    return pa.ListArray.from_arrays(
        pa.array(offsets_np, mask=list_array_null_mask), values_non_null
    )
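# Hedged usage sketch (assumes tfx_bsl's GetArrayNullBitmapAsByteArray
# helper is available); this exercises the slow path with a null present:
ToSingletonListArray(pa.array([1, None, 3])).to_pylist()
# -> [[1], None, [3]]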
def update(self,
           feature_array: pa.Array,
           values_quantiles_combiner: Any,
           weights: Optional[np.ndarray] = None) -> None:
    """Update the partial numeric statistics using the input value."""

    # np.max / np.min below cannot handle empty arrays. And there's nothing
    # we can collect in this case.
    if not feature_array:
        return

    flattened_value_array = feature_array.flatten()
    # Note: to_numpy will fail if flattened_value_array is empty.
    if not flattened_value_array:
        return
    values = np.asarray(flattened_value_array)
    nan_mask = np.isnan(values)
    self.num_nan += np.sum(nan_mask)
    non_nan_mask = ~nan_mask
    values_no_nan = values[non_nan_mask]

    # We do this check to avoid failing in np.min/max with empty array.
    if values_no_nan.size == 0:
        return
    # This is to avoid integer overflow when computing sum or sum of squares.
    values_no_nan_as_double = values_no_nan.astype(np.float64)
    self.sum += np.sum(values_no_nan_as_double)
    self.sum_of_squares += np.sum(
        values_no_nan_as_double * values_no_nan_as_double)
    # Use np.minimum.reduce(values_no_nan, initial=self.min) once we upgrade
    # to numpy 1.16
    curr_min = np.min(values_no_nan)
    curr_max = np.max(values_no_nan)
    self.min = min(self.min, curr_min)
    self.max = max(self.max, curr_max)
    if curr_min == float('-inf') or curr_max == float('inf'):
        finite_values = values_no_nan[np.isfinite(values_no_nan)]
        if finite_values.size > 0:
            self.finite_min = min(self.finite_min, np.min(finite_values))
            self.finite_max = max(self.finite_max, np.max(finite_values))
    self.num_zeros += values_no_nan.size - np.count_nonzero(values_no_nan)
    self.quantiles_summary = values_quantiles_combiner.add_input(
        self.quantiles_summary, [values_no_nan, np.ones_like(values_no_nan)])

    if weights is not None:
        value_parent_indices = np.asarray(
            array_util.GetFlattenedArrayParentIndices(feature_array))
        flat_weights = weights[value_parent_indices]
        flat_weights_no_nan = flat_weights[non_nan_mask]
        weighted_values = flat_weights_no_nan * values_no_nan
        self.weighted_sum += np.sum(weighted_values)
        self.weighted_sum_of_squares += np.sum(weighted_values * values_no_nan)
        self.weighted_quantiles_summary = values_quantiles_combiner.add_input(
            self.weighted_quantiles_summary,
            [values_no_nan, flat_weights_no_nan])
        self.weighted_total_num_values += np.sum(flat_weights_no_nan)
def make_groupable_array(
    array: pa.Array, date_granularity: Optional[DateGranularity]
) -> pa.Array:
    """Given an input array, return the array we will group by.

    This is for handling DEPRECATED date conversions. The idea is: with input
    value "2021-03-01T21:12:21.231212312Z", a "year" group should be
    "2021-01-01Z".
    """
    if date_granularity is None:
        return array

    if date_granularity == DateGranularity.QUARTER:
        np_datetime_ns = array.to_numpy(zero_copy_only=False)
        np_datetime_m = np_datetime_ns.astype("datetime64[M]").astype(int)
        rounded_month_numbers = np.floor_divide(np_datetime_m, 3) * 3
        np_rounded_ns = rounded_month_numbers.astype("datetime64[M]").astype(
            "datetime64[ns]"
        )
        # converting to int made nulls into ... not-null. Make them null again
        np_rounded_ns[np.isnan(np_datetime_ns)] = "NaT"
        return pa.array(np_rounded_ns)

    if date_granularity == DateGranularity.WEEK:
        # numpy "week" is counted from the Epoch -- which happens to be a
        # Thursday. But ISO weeks start Monday, not Thursday -- and so Numpy's
        # "W" type is useless.
        #
        # We do integer math: add 3 to each date and then floor-divide by 7.
        # That makes "1970-01-01 [Thursday] + 3" => Sunday -- so when we
        # floor-divide, everything from Monday to Sunday falls in the same
        # bucket. We could group by this ... but we convert back to day and
        # subtract the 3, so the group can be formatted.
        np_datetime_ns = array.to_numpy(zero_copy_only=False)
        np_datetime_d = np_datetime_ns.astype("datetime64[D]").astype(int)
        rounded_day_numbers = np.floor_divide(np_datetime_d + 3, 7) * 7 - 3
        np_rounded_ns = rounded_day_numbers.astype("datetime64[D]").astype(
            "datetime64[ns]"
        )
        # converting to int made nulls into ... not-null. Make them null again
        np_rounded_ns[np.isnan(np_datetime_ns)] = "NaT"
        return pa.array(np_rounded_ns)

    freq = date_granularity.numpy_unit
    np_rounded_ns = (
        array.to_numpy(zero_copy_only=False)
        .astype(f"datetime64[{freq}]")
        .astype("datetime64[ns]")
    )
    return pa.array(np_rounded_ns)
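# Hedged sketch of the week-rounding arithmetic above, in plain numpy:
# 2021-03-04 is a Thursday, so it should land on Monday 2021-03-01.
_days = np.array(["2021-03-04"], dtype="datetime64[D]").astype(int)
(np.floor_divide(_days + 3, 7) * 7 - 3).astype("datetime64[D]")
# -> ['2021-03-01']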
def add_input(self,
              accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
    """Return result of folding a batch of inputs into accumulator.

    Args:
        accumulator: The current accumulator.
        feature_path: The path of the feature.
        feature_array: An arrow Array representing a batch of feature values
            which should be added to the accumulator.

    Returns:
        The accumulator after updating the statistics for the batch of inputs.
    """
    if feature_path not in self._valid_feature_paths:
        accumulator.invalidate = True
        return accumulator
    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    # Ignore null array.
    if feature_type is None:
        return accumulator

    if feature_type not in self._feature_type_fns:
        accumulator.invalidate = True
        return accumulator

    feature_type_fn = self._feature_type_fns[feature_type]

    vocab = None
    rvocab = None
    if self._nld_vocabularies[feature_path]:
        vocab_name = self._nld_vocabularies[feature_path]
        vocab = self._vocabs[vocab_name]
        rvocab = self._rvocabs[vocab_name]

    excluded_string_tokens = self._nld_excluded_string_tokens[feature_path]
    excluded_int_tokens = self._nld_excluded_int_tokens[feature_path]
    oov_string_tokens = self._nld_oov_string_tokens[feature_path]
    int_tokens = self._nld_specified_int_tokens[feature_path]
    string_tokens = self._nld_specified_str_tokens[feature_path]
    sequence_length_excluded_int_tokens = (
        self._nld_sequence_length_excluded_int_tokens[feature_path])
    sequence_length_excluded_string_tokens = (
        self._nld_sequence_length_excluded_string_tokens[feature_path])

    # TODO(b/175875824): Benchmark and optimize performance.
    for row in feature_array.to_pylist():
        if row is not None:
            feature_type_fn(row, accumulator, excluded_string_tokens,
                            excluded_int_tokens, oov_string_tokens,
                            vocab, rvocab, int_tokens, string_tokens,
                            sequence_length_excluded_int_tokens,
                            sequence_length_excluded_string_tokens,
                            self._num_histogram_buckets)
    return accumulator
def _replace_with_indices(
    cls,
    chunk: pa.Array,
    indices: npt.NDArray[np.intp],
    value: npt.NDArray[Any],
) -> pa.Array:
    """
    Replace items selected with a set of positional indices.

    Analogous to pyarrow.compute.replace_with_mask, except that replacement
    positions are identified via indices rather than a mask.

    Parameters
    ----------
    chunk : pa.Array
    indices : npt.NDArray[np.intp]
    value : npt.NDArray[Any]
        Replacement value(s).

    Returns
    -------
    pa.Array
    """
    n = len(indices)

    if n == 0:
        return chunk

    start, stop = indices[[0, -1]]

    if (stop - start) == (n - 1):
        # fast path for a contiguous set of indices
        arrays = [
            chunk[:start],
            pa.array(value, type=chunk.type, from_pandas=True),
            chunk[stop + 1:],
        ]
        arrays = [arr for arr in arrays if len(arr)]
        if len(arrays) == 1:
            return arrays[0]
        return pa.concat_arrays(arrays)

    mask = np.zeros(len(chunk), dtype=np.bool_)
    mask[indices] = True

    if pa_version_under5p0:
        arr = chunk.to_numpy(zero_copy_only=False)
        arr[mask] = value
        return pa.array(arr, type=chunk.type)

    if isna(value).all():
        return pc.if_else(mask, None, chunk)

    return pc.replace_with_mask(chunk, mask, value)
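# Hedged sketch of the contiguous fast path. In pandas this is a classmethod;
# here we call the bare function with a placeholder for cls. Indices [1, 2]
# are contiguous, so the slice-and-concat branch runs instead of a mask:
_replace_with_indices(None, pa.array([1, 2, 3, 4]), np.array([1, 2]), np.array([20, 30]))
# -> [1, 20, 30, 4]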
def or_array_array(a: pa.Array, b: pa.Array) -> pa.Array:
    """Perform ``pyarrow.Array | pyarrow.Array``."""
    output_length = len(a) // 8
    if len(a) % 8 != 0:
        output_length += 1

    if a.null_count == 0 and b.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(
            len(a), a.buffers()[1], a.offset, b.buffers()[1], b.offset, result
        )
        return pa.Array.from_buffers(
            pa.bool_(), len(a), [None, pa.py_buffer(result)], 0
        )
    elif a.null_count == 0:
        result = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(
            len(a), a.buffers()[1], a.offset, b.buffers()[1], b.offset, result
        )
        # b has nulls; a null in b only matters where a is False, so the
        # result is valid wherever a is True or b is valid:
        # valid_bits = a.data | b.valid_bits
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        bitmap_or_unaligned(
            len(a), a.buffers()[1], a.offset, b.buffers()[0], b.offset, valid_bits
        )
        return pa.Array.from_buffers(
            pa.bool_(), len(a), [pa.py_buffer(valid_bits), pa.py_buffer(result)]
        )
    elif b.null_count == 0:
        return or_array_array(b, a)
    else:
        result = np.zeros(output_length, dtype=np.uint8)
        valid_bits = np.zeros(output_length, dtype=np.uint8)
        null_count = masked_bitmap_or_unaligned(
            len(a),
            a.buffers()[0],
            a.buffers()[1],
            a.offset,
            b.buffers()[0],
            b.buffers()[1],
            b.offset,
            result,
            valid_bits,
        )
        return pa.Array.from_buffers(
            pa.bool_(),
            len(a),
            [pa.py_buffer(valid_bits), pa.py_buffer(result)],
            null_count,
        )
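# The null handling above follows Kleene logic: True | null == True, while
# False | null == null. For comparison, newer pyarrow versions ship the same
# semantics as a compute kernel:
pc.or_kleene(pa.array([True, False, None]), pa.array([None, None, True]))
# -> [true, null, true]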
def nonnull_group_splits(array: pa.Array, group_splits: np.ndarray) -> np.ndarray:
    # in an array [null, 1, null, 2, null]
    # with group_splits [1, 2, 3], groups are [null], [1], [null], [2, null]
    # n_nulls_by_index will be [1, 1, 2, 2, 3]
    n_nulls_by_index = np.cumsum(
        array.is_null().to_numpy(zero_copy_only=False),
        dtype=np.min_scalar_type(-len(array)),
    )
    # non-null array is [1, 2]
    # we want groups [], [1], [], [2]
    # we want nonnull_group_splits [0, 1, 1]
    return group_splits - n_nulls_by_index[group_splits - 1]
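# The worked example from the comments above, runnable:
nonnull_group_splits(pa.array([None, 1, None, 2, None]), np.array([1, 2, 3]))
# -> array([0, 1, 1])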
def _extract_isnull_bitmap(arr: pa.Array, offset: int, length: int):
    """
    Extract isnull bitmap with offset and padding.

    Ensures that a filled bitmap is returned even when pyarrow returns an
    empty one.
    """
    buf = _buffer_to_view(arr.buffers()[0])
    if len(buf) > 0:
        return buf[offset:offset + length]
    else:
        return np.full(length, fill_value=255, dtype=np.uint8)