def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
    """Convert a pyarrow Array or ChunkedArray to a numpy ndarray.

    Chunked arrays are flattened chunk by chunk (calling ``to_numpy`` on the
    whole ChunkedArray would yield a dtype=object array). For
    ``_ArrayXDExtensionType`` columns, ``to_pylist`` is used instead so that
    dynamic (per-row) dimensions are supported. If the collected rows are
    ragged (differing shapes) or already object-dtype, the result falls back
    to ``dtype=object``.

    Args:
        pa_array: The pyarrow Array (or ChunkedArray) to convert.

    Returns:
        A numpy ndarray built with ``self.np_array_kwargs``.
    """
    zero_copy_only = _is_zero_copy_only(pa_array.type)
    if isinstance(pa_array, pa.ChunkedArray):
        # Don't call to_numpy() on the ChunkedArray directly or we end up
        # with a np.array with dtype object; call to_numpy on the chunks.
        # For ArrayExtensionArray, call to_pylist directly to support
        # dynamic dimensions.
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            array: List = [
                row for chunk in pa_array.chunks for row in chunk.to_pylist()
            ]
        else:
            array: List = [
                row
                for chunk in pa_array.chunks
                for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
            ]
    else:
        # Cast to a list of rows or we end up with a np.array with dtype
        # object. Same ArrayExtensionArray special case as above.
        if isinstance(pa_array.type, _ArrayXDExtensionType):
            array: List = pa_array.to_pylist()
        else:
            array: List = pa_array.to_numpy(
                zero_copy_only=zero_copy_only).tolist()
    if len(array) > 0:
        # FIX: `np.object` was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin `object` is the documented replacement and behaves
        # identically on older NumPy versions.
        # NOTE(review): under NumPy >= 2.0, `copy=False` raises when a copy
        # is unavoidable (as with ragged object arrays) — confirm the
        # supported NumPy range before bumping it.
        if any(
            isinstance(x, np.ndarray)
            and (x.dtype == object or x.shape != array[0].shape)
            for x in array
        ):
            return np.array(
                array,
                copy=False,
                **{**self.np_array_kwargs, "dtype": object},
            )
    return np.array(array, copy=False, **self.np_array_kwargs)
def add_input(self, accumulator: _PartialNLStats,
              feature_path: types.FeaturePath,
              feature_array: pa.Array) -> _PartialNLStats:
    """Fold a batch of feature values into the partial-stats accumulator.

    Args:
      accumulator: The current accumulator.
      feature_path: The path of the feature.
      feature_array: An arrow Array representing a batch of feature values
        which should be added to the accumulator.

    Returns:
      The accumulator after updating the statistics for the batch of inputs.
    """
    # Features outside the configured set invalidate the accumulator.
    if feature_path not in self._valid_feature_paths:
        accumulator.invalidate = True
        return accumulator

    feature_type = stats_util.get_feature_type_from_arrow_type(
        feature_path, feature_array.type)
    # A null array carries no information; leave the accumulator as-is.
    if feature_type is None:
        return accumulator
    # Unsupported feature types also invalidate the accumulator.
    if feature_type not in self._feature_type_fns:
        accumulator.invalidate = True
        return accumulator
    update_fn = self._feature_type_fns[feature_type]

    # Resolve the optional vocabulary pair configured for this feature.
    vocab = None
    rvocab = None
    vocab_name = self._nld_vocabularies[feature_path]
    if vocab_name:
        vocab = self._vocabs[vocab_name]
        rvocab = self._rvocabs[vocab_name]

    # Per-feature token configuration.
    excluded_string_tokens = self._nld_excluded_string_tokens[feature_path]
    excluded_int_tokens = self._nld_excluded_int_tokens[feature_path]
    oov_string_tokens = self._nld_oov_string_tokens[feature_path]
    int_tokens = self._nld_specified_int_tokens[feature_path]
    string_tokens = self._nld_specified_str_tokens[feature_path]
    sequence_length_excluded_int_tokens = (
        self._nld_sequence_length_excluded_int_tokens[feature_path])
    sequence_length_excluded_string_tokens = (
        self._nld_sequence_length_excluded_string_tokens[feature_path])

    # TODO(b/175875824): Benchmark and optimize performance.
    for row in feature_array.to_pylist():
        if row is None:
            continue
        update_fn(row, accumulator, excluded_string_tokens,
                  excluded_int_tokens, oov_string_tokens, vocab, rvocab,
                  int_tokens, string_tokens,
                  sequence_length_excluded_int_tokens,
                  sequence_length_excluded_string_tokens,
                  self._num_histogram_buckets)
    return accumulator
def _Normalize(array: pa.Array) -> pa.Array:
    """Round-trips `array` through Python objects.

    Comparing nested arrays containing slices is buggy in Arrow 2.0, so this
    helper is useful for checking two such arrays for logical equality. The
    bug appears to be fixed as of Arrow 5.0; this should be removable once
    that becomes the minimum version.

    Args:
      array: The array to normalize.

    Returns:
      An array with no remaining zero-copy slices in itself or its children.
      Note the schema might be slightly different for all-null arrays.
    """
    as_python_objects = array.to_pylist()
    return pa.array(as_python_objects)