def result_ilocs(self) -> npt.NDArray[np.intp]: """ Get the original integer locations of result_index in the input. """ # Original indices are where group_index would go via sorting. # But when dropna is true, we need to remove null values while accounting for # any gaps that then occur because of them. group_index = get_group_index( self.codes, self.shape, sort=self._sort, xnull=True ) group_index, _ = compress_group_index(group_index, sort=self._sort) if self.has_dropped_na: mask = np.where(group_index >= 0) # Count how many gaps are caused by previous null values for each position null_gaps = np.cumsum(group_index == -1)[mask] group_index = group_index[mask] result = get_group_index_sorter(group_index, self.ngroups) if self.has_dropped_na: # Shift by the number of prior null gaps result += np.take(null_gaps, result) return result
def _indexer_and_to_sort(self):
    """Build the counting-sort indexer with level ``self.level`` moved last."""
    lvl = self.level
    all_codes = list(self.index.codes)
    all_levels = list(self.index.levels)

    # Rotate the unstacked level's codes and levels to the end of the key list.
    to_sort = all_codes[:lvl] + all_codes[lvl + 1:] + [all_codes[lvl]]
    reordered_levels = all_levels[:lvl] + all_levels[lvl + 1:] + [all_levels[lvl]]
    sizes = [len(level) for level in reordered_levels]

    comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
    indexer = get_group_index_sorter(comp_index, len(obs_ids))
    return ensure_platform_int(indexer), to_sort
def _aggregate_series_fast(self, obj, func):
    # At this point we have already checked that obj.index is not a MultiIndex
    # and that obj is backed by an ndarray, not ExtensionArray
    func = self._is_builtin_func(func)
    ids, _, ngroups = self.group_info

    # avoids object / Series creation overhead
    dummy = obj._get_values(slice(None, 0))

    # Sort values into contiguous group order before handing off to Cython.
    sorter = get_group_index_sorter(ids, ngroups)
    obj = obj.take(sorter)
    ids = algorithms.take_nd(ids, sorter, allow_fill=False)

    sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups, dummy)
    return sgrouper.get_result()
def _aggregate_series_fast(self, obj, func):
    func = self._is_builtin_func(func)

    # Indexes with complex internals cannot go through the Cython path.
    if obj.index._has_complex_internals:
        raise TypeError("Incompatible index for Cython grouper")

    ids, _, ngroups = self.group_info

    # avoids object / Series creation overhead
    dummy = obj._get_values(slice(None, 0))

    # Reorder the values so each group occupies a contiguous run.
    sorter = get_group_index_sorter(ids, ngroups)
    obj = obj.take(sorter)
    ids = algorithms.take_nd(ids, sorter, allow_fill=False)

    sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups, dummy)
    return sgrouper.get_result()
def _aggregate_series_fast(self, obj: Series, func: F) -> npt.NDArray[np.object_]:
    # At this point we have already checked that
    # - obj.index is not a MultiIndex
    # - obj is backed by an ndarray, not ExtensionArray
    # - len(obj) > 0
    func = com.is_builtin_func(func)
    group_ids, _, ngroups = self.group_info

    # avoids object / Series creation overhead
    # Sort so that each group's values are contiguous.
    sorter = get_group_index_sorter(group_ids, ngroups)
    obj = obj.take(sorter)
    group_ids = group_ids.take(sorter)

    sgrouper = libreduction.SeriesGrouper(obj, func, group_ids, ngroups)
    result, _ = sgrouper.get_result()
    return result
def _aggregate_series_fast(self, obj: Series, func: F):
    # Preconditions already verified by the caller:
    # - obj.index is not a MultiIndex
    # - obj is backed by an ndarray, not ExtensionArray
    # - len(obj) > 0
    # - ngroups != 0
    func = self._is_builtin_func(func)
    ids, _, ngroups = self.group_info

    # avoids object / Series creation overhead
    # Reorder values into contiguous group order for the Cython grouper.
    sorter = get_group_index_sorter(ids, ngroups)
    obj = obj.take(sorter)
    ids = ids.take(sorter)

    sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups)
    return sgrouper.get_result()
def _indexer_and_to_sort(
    self,
) -> tuple[
    npt.NDArray[np.intp],
    list[np.ndarray],  # each has _some_ signed integer dtype
]:
    """Counting-sort indexer with level ``self.level`` rotated to the end."""
    lvl = self.level
    all_codes = list(self.index.codes)
    all_levels = list(self.index.levels)

    # Move the unstacked level's codes/levels to the end of the key list.
    to_sort = all_codes[:lvl] + all_codes[lvl + 1:] + [all_codes[lvl]]
    reordered_levels = all_levels[:lvl] + all_levels[lvl + 1:] + [all_levels[lvl]]
    sizes = tuple(len(level) for level in reordered_levels)

    comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
    indexer = get_group_index_sorter(comp_index, len(obs_ids))
    return indexer, to_sort
def _aggregate_series_fast(self, obj, func):
    func = self._is_builtin_func(func)

    # Indexes with complex internals cannot use the Cython grouper.
    if obj.index._has_complex_internals:
        raise TypeError('Incompatible index for Cython grouper')

    ids, _, ngroups = self.group_info

    # avoids object / Series creation overhead; densify sparse values first
    dummy = obj._get_values(slice(None, 0)).to_dense()

    # Sort values into contiguous group order.
    sorter = get_group_index_sorter(ids, ngroups)
    obj = obj._take(sorter).to_dense()
    ids = algorithms.take_nd(ids, sorter, allow_fill=False)

    sgrouper = reduction.SeriesGrouper(obj, func, ids, ngroups, dummy)
    return sgrouper.get_result()
def _aggregate_series_fast(self, obj, func):
    func = self._is_builtin_func(func)

    # TODO: pre-empt this, also pre-empt get_result raising TypeError
    #  if we pass an EA; for EAs backed by ndarray we may have a
    #  performant workaround
    if obj.index._has_complex_internals:
        raise TypeError("Incompatible index for Cython grouper")

    ids, _, ngroups = self.group_info

    # avoids object / Series creation overhead
    dummy = obj._get_values(slice(None, 0))

    # Reorder values so each group is a contiguous run.
    sorter = get_group_index_sorter(ids, ngroups)
    obj = obj.take(sorter)
    ids = algorithms.take_nd(ids, sorter, allow_fill=False)

    sgrouper = libreduction.SeriesGrouper(obj, func, ids, ngroups, dummy)
    return sgrouper.get_result()
def _sort_idx(self) -> npt.NDArray[np.intp]:
    """Return the counting-sort indexer over ``self.labels``."""
    # Counting sort indexer
    return get_group_index_sorter(self.labels, self.ngroups)
def sort_idx(self):
    """Indexer produced by a counting sort of the group labels."""
    indexer = get_group_index_sorter(self.labels, self.ngroups)
    return indexer
def sort_idx(self):
    # Counting-sort indexer over the group labels.
    labels, ngroups = self.labels, self.ngroups
    return get_group_index_sorter(labels, ngroups)
def take(self, indices, allow_fill=False, fill_value=None):
    # type: (Sequence[int], bool, Optional[Any]) -> FletcherArray
    """
    Take elements from an array.

    Parameters
    ----------
    indices : sequence of integers
        Indices to be taken.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to
          :func:`numpy.take`.
        * True: negative values in `indices` indicate missing values. These
          values are set to `fill_value`. Any other negative values raise
          a ``ValueError``.
    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type, ``self.dtype.na_value``, is used.

        For many FletcherArrays, there will be two representations of
        `fill_value`: a user-facing "boxed" scalar, and a low-level physical
        NA value. `fill_value` should be the user-facing version, and the
        implementation should handle translating that to the physical
        version for processing the take if necessary.

    Returns
    -------
    FletcherArray

    Raises
    ------
    IndexError
        When the indices are out of bounds for the array.
    ValueError
        When `indices` contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    FletcherArray.take is called by ``Series.__getitem__``, ``.loc``,
    ``iloc``, when `indices` is a sequence of values. Additionally,
    it's called by :meth:`Series.reindex`, or any other method
    that causes realignment, with a `fill_value`.

    See Also
    --------
    numpy.take
    pandas.api.extensions.take
    """
    # Threshold deciding whether to concatenate all chunks first.
    # Benchmarks on string, int32, int64, float32 and float64 showed 0.3 to
    # be the point where concatenating everything first wins, both
    # time-wise and memory-wise.
    threshold_ratio = 0.3

    length = len(self)
    indices = np.asarray(indices, dtype=self._indices_dtype)

    has_negative_indices = np.any(indices < 0)  # type: ignore
    allow_fill &= has_negative_indices
    if allow_fill:
        validate_indices(indices, length)

    if (has_negative_indices and not allow_fill) or np.any(
        indices >= length  # type: ignore
    ):
        # this will raise IndexError expected by pandas in all needed cases
        indices = np.arange(length, dtype=self._indices_dtype).take(indices)

    # here we guarantee that indices is numpy array of ints
    # and we have checked that all indices are between -1/0 and len(self)

    if not allow_fill:
        if self._has_single_chunk:
            if (
                self.dtype.is_list
                and self.data.chunk(0).flatten().null_count == 0
                and self.data.chunk(0).null_count == 0
                and self.flatten().dtype._is_numeric
            ):
                return FletcherArray(
                    take_indices_on_pyarrow_list(self.data.chunk(0), indices)
                )
            else:
                return FletcherArray(self.data.chunk(0).take(pa.array(indices)))

        # BUGFIX: the removed NumPy alias ``np.int`` (AttributeError on
        # NumPy >= 1.24) is replaced with the builtin ``int``, which is
        # exactly what the alias resolved to.
        lengths = np.fromiter(map(len, self.data.iterchunks()), dtype=int)
        cum_lengths = lengths.cumsum()

        bins = self._get_chunk_indexer(indices)
        cum_lengths -= lengths
        limits_idx = np.concatenate(
            [[0], np.bincount(bins, minlength=self.data.num_chunks).cumsum()]
        )

        # BUGFIX: ``Series.is_monotonic`` was removed in pandas 2.0; its
        # long-standing alias target ``is_monotonic_increasing`` is used.
        if pd.Series(bins).is_monotonic_increasing:
            del bins
            return self._take_on_chunks(
                indices, limits_idx=limits_idx, cum_lengths=cum_lengths
            )
        elif len(indices) / len(self) > threshold_ratio:
            # check which method is going to take less memory
            return self._take_on_concatenated_chunks(indices)
        else:
            sort_idx = get_group_index_sorter(bins, self.data.num_chunks)
            del bins
            indices = indices.take(sort_idx, out=indices)  # type: ignore
            sort_idx = np.argsort(sort_idx, kind="merge")  # inverse sort indices
            return self._take_on_chunks(
                indices,
                sort_idx=sort_idx,
                limits_idx=limits_idx,
                cum_lengths=cum_lengths,
            )
    else:
        if pd.isnull(fill_value):
            fill_value = None
        return self._concat_same_type(
            [self, FletcherArray([fill_value], dtype=self.data.type)]
        ).take(indices)