def group_index(self) -> Index:
    """
    Return the index holding one entry per group.

    When a categorical grouper was passed, a ``CategoricalIndex`` is built
    from the grouper's codes so that its categories and ``ordered`` flag
    are preserved. Otherwise the cached ``_group_index`` is returned,
    calling ``_make_codes()`` first if the cache is still empty.

    Returns
    -------
    Index
    """
    if not self._passed_categorical:
        if self._group_index is None:
            self._make_codes()
        assert self._group_index is not None
        return self._group_index

    # we make a CategoricalIndex out of the cat grouper
    # preserving the categories / ordered attributes
    grouper_cat = self.grouper
    all_categories = grouper_cat.categories

    if not self.observed:
        # every category gets a group, whether or not it occurs in the data
        level_codes = np.arange(len(all_categories))
    else:
        level_codes = algorithms.unique1d(grouper_cat.codes)
        # code -1 is filtered out so it never becomes a group
        level_codes = level_codes[level_codes != -1]
        if self.sort or grouper_cat.ordered:
            level_codes = np.sort(level_codes)

    group_values = Categorical.from_codes(
        codes=level_codes,
        categories=all_categories,
        ordered=grouper_cat.ordered,
    )
    return CategoricalIndex(group_values, name=self.name)
def _codes_and_uniques(self) -> tuple[np.ndarray, ArrayLike]:
    """
    Compute the per-row group codes together with the unique group values.

    Three kinds of grouper are handled:

    * a categorical grouper: the grouper's own codes are returned, with a
      ``Categorical`` of the group values that keeps the original
      categories and ``ordered`` flag;
    * an ``ops.BaseGrouper``: its ``codes_info`` and ``result_arraylike``
      are returned as-is;
    * anything else: the grouper is factorized.

    Returns
    -------
    tuple of (codes, uniques)
    """
    if self._passed_categorical:
        # we make a CategoricalIndex out of the cat grouper
        # preserving the categories / ordered attributes
        cat_grouper = self.grouper
        all_categories = cat_grouper.categories

        if not self.observed:
            kept_codes = np.arange(len(all_categories))
        else:
            kept_codes = algorithms.unique1d(cat_grouper.codes)
            # code -1 is filtered out so it never becomes a group
            kept_codes = kept_codes[kept_codes != -1]
            if self.sort or cat_grouper.ordered:
                kept_codes = np.sort(kept_codes)

        group_values = Categorical.from_codes(
            codes=kept_codes,
            categories=all_categories,
            ordered=cat_grouper.ordered,
        )
        return cat_grouper.codes, group_values

    if isinstance(self.grouper, ops.BaseGrouper):
        # we have a list of groupers
        codes = self.grouper.codes_info
        uniques = self.grouper.result_arraylike
        return codes, uniques

    # GH35667, replace dropna=False with na_sentinel=None
    na_sentinel = -1 if self.dropna else None
    codes, uniques = algorithms.factorize(
        self.grouper, sort=self.sort, na_sentinel=na_sentinel
    )
    return codes, uniques
def unique(self):
    """
    Return the distinct entries of ``self._values``.

    The values' own ``unique`` method is used when they provide one;
    otherwise the work is delegated to ``unique1d``.
    """
    data = self._values
    if not hasattr(data, 'unique'):
        # imported only on this fallback path, as in the original code
        from pandas.core.algorithms import unique1d
        return unique1d(data)
    return data.unique()
def unique(self):
    """
    Return the distinct entries of ``self._values``.

    Values exposing a ``unique`` method are asked directly; any other
    container is passed to ``unique1d``.
    """
    vals = self._values
    return vals.unique() if hasattr(vals, "unique") else unique1d(vals)
def unique(self, level=None):
    """
    Return the distinct values wrapped via ``self._shallow_copy``.

    Parameters
    ----------
    level : optional
        Only validated through ``self._validate_index_level``; it does not
        influence the result. Kept so the signature matches
        ``Index.unique``.
    """
    # override the Index.unique method for performance GH#23083
    if level is not None:
        # this should never occur, but is retained to make the signature
        # match Index.unique
        self._validate_index_level(level)

    distinct = unique1d(self._ndarray_values)
    return self._shallow_copy(distinct)
def unique(self):
    """
    Return the distinct entries of ``self._values``.

    NumPy arrays go through ``unique1d``. Any other container is asked
    for its own ``unique()``; when ``self`` is a Series with a
    timezone-naive datetime/timedelta dtype, that result is additionally
    converted to a NumPy array.
    """
    vals = self._values
    if isinstance(vals, np.ndarray):
        return unique1d(vals)

    distinct = vals.unique()
    datetimelike_series = self.dtype.kind in ["m", "M"] and isinstance(
        self, ABCSeries
    )
    # GH#31182 Series._values returns EA, unpack for backward-compat
    if datetimelike_series and getattr(self.dtype, "tz", None) is None:
        distinct = np.asarray(distinct)
    return distinct
def unique(self):
    """
    Return the distinct entries of ``self._values``.

    NumPy arrays are de-duplicated with ``unique1d``; other containers
    use their own ``unique()``. For a Series whose dtype is a NumPy
    datetime64/timedelta64 dtype, the result is converted to a NumPy
    array.
    """
    result: ArrayLike
    arr = self._values
    if isinstance(arr, np.ndarray):
        result = unique1d(arr)
    else:
        result = arr.unique()
        numpy_datetimelike = isinstance(
            self.dtype, np.dtype
        ) and self.dtype.kind in ["m", "M"]
        if numpy_datetimelike and isinstance(self, ABCSeries):
            # GH#31182 Series._values returns EA
            # unpack numpy datetime for backward-compat
            result = np.asarray(result)
    return result
def _frame_arith_method_with_reindex(
    left: "DataFrame", right: "DataFrame", op
) -> "DataFrame":
    """
    Apply ``op`` to the columns two DataFrames share, then reindex.

    Only the columns present in both frames are passed to ``op``; the
    result is afterwards reindexed to the outer join of both column
    indexes.

    Parameters
    ----------
    left : DataFrame
    right : DataFrame
    op : binary operator

    Returns
    -------
    DataFrame
    """
    left_columns = left.columns
    right_columns = right.columns

    # GH#31623, only operate on shared columns
    _, left_pos, right_pos = left_columns.join(
        right_columns, how="inner", level=None, return_indexers=True
    )
    partial = op(left.iloc[:, left_pos], right.iloc[:, right_pos])

    # Do the join on the columns instead of using align_method_FRAME
    # to avoid constructing two potentially large/sparse DataFrames
    all_columns, _, _ = left_columns.join(
        right_columns, how="outer", level=None, return_indexers=True
    )

    if not partial.columns.has_duplicates:
        return partial.reindex(all_columns, axis=1)

    # Avoid reindexing with a duplicate axis.
    # https://github.com/pandas-dev/pandas/issues/35194
    positions, _ = partial.columns.get_indexer_non_unique(all_columns)
    positions = algorithms.unique1d(positions)
    return partial._reindex_with_indexers(
        {1: [all_columns, positions]}, allow_dups=True
    )
def _codes_and_uniques(
    self,
) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
    """
    Compute the per-row group codes together with the unique group values.

    * Categorical grouping vector: its own codes are returned alongside a
      ``Categorical`` of the group values that keeps the original
      categories and ``ordered`` flag.
    * ``ops.BaseGrouper`` grouping vector: ``codes_info`` and the values
      of ``result_index`` are returned.
    * Anything else: the grouping vector is factorized, with
      ``use_na_sentinel`` taken from ``self._dropna``.

    Returns
    -------
    tuple of (codes, uniques)
    """
    if self._passed_categorical:
        # we make a CategoricalIndex out of the cat grouper
        # preserving the categories / ordered attributes
        cat = self.grouping_vector
        all_categories = cat.categories

        if not self._observed:
            kept_codes = np.arange(len(all_categories))
        else:
            kept_codes = algorithms.unique1d(cat.codes)
            # code -1 is filtered out so it never becomes a group
            kept_codes = kept_codes[kept_codes != -1]
            if self._sort or cat.ordered:
                kept_codes = np.sort(kept_codes)

        uniques = Categorical.from_codes(
            codes=kept_codes, categories=all_categories, ordered=cat.ordered
        )
        return cat.codes, uniques

    if isinstance(self.grouping_vector, ops.BaseGrouper):
        # we have a list of groupers
        codes = self.grouping_vector.codes_info
        # error: Incompatible types in assignment (expression has type "Union
        # [ExtensionArray, ndarray[Any, Any]]", variable has type "Categorical")
        uniques = (
            self.grouping_vector.result_index._values  # type: ignore[assignment]
        )
        return codes, uniques

    # GH35667, replace dropna=False with use_na_sentinel=False
    # error: Incompatible types in assignment (expression has type "Union[
    # ndarray[Any, Any], Index]", variable has type "Categorical")
    codes, uniques = algorithms.factorize(  # type: ignore[assignment]
        self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
    )
    return codes, uniques
def _is_unique(self):
    """Return True when ``self.asi8`` contains no repeated value."""
    distinct = unique1d(self.asi8)
    return len(distinct) == len(self)
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units
    corresponding to the frequency: 366 for daily data, 366 * 24 = 8784
    for hourly data and 12 for monthly data. Cells that receive no
    observation are NaN.

    For daily data, column 59 always holds Feb. 28th and column 61 always
    holds Mar. 1st; column 60 (Feb. 29th) stays NaN in non-leap years.
    For hourly data the same is done with the 24 columns of Feb. 29th.

    Parameters
    ----------
    series : Series
        Series whose index exposes ``year``, ``dayofyear`` and ``month``.
    freq : string or None, default None
        One of 'D', 'M', 'BM' or 'H' (case-insensitive). When None,
        ``series.index.freq`` is used.

    Returns
    -------
    annual : DataFrame
        Indexed by the distinct years, with columns numbered from 1.

    Raises
    ------
    NotImplementedError
        If the frequency is not one of the supported values.
    """
    msg = "pivot_annual is deprecated. Use pivot_table instead"
    # stacklevel=2 attributes the warning to the caller's line, which is the
    # code that actually needs to migrate, rather than to this module.
    warnings.warn(msg, FutureWarning, stacklevel=2)

    index = series.index
    year = index.year
    years = algorithms.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        # np.asarray gives a plain array that can be adjusted in place below
        offset = np.asarray(index.dayofyear) - 1

        # adjust for leap year: in non-leap years everything from Mar. 1st
        # on moves one slot right, leaving the Feb. 29th column empty
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        offset = np.asarray(index.month) - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        # The offset is the position of each observation within its year.
        # NOTE(review): this appears to assume every year starts at
        # Jan. 1st 00:00 and has no gaps -- confirm against callers.
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        # 1416 == 59 * 24: skip the 24 Feb. 29th slots in non-leap years
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    # Position of every observation in the flattened (years x width) grid
    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
def unique(self):
    """
    Return a new object of the same type holding the distinct values.

    De-duplication is done on ``self.asi8``; the result is rebuilt with
    ``type(self)`` using the original dtype.
    """
    distinct_i8 = unique1d(self.asi8)
    constructor = type(self)
    return constructor(distinct_i8, dtype=self.dtype)
def __init__(
    self,
    index: Index,
    grouper=None,
    obj: FrameOrSeries | None = None,
    name: Hashable = None,
    level=None,
    sort: bool = True,
    observed: bool = False,
    in_axis: bool = False,
    dropna: bool = True,
):
    """
    Store the grouping specification and resolve ``grouper`` into values.

    Parameters
    ----------
    index : Index
        Index being grouped; its ``names``, ``map`` and
        ``_get_grouper_for_level`` are used below.
    grouper : optional
        Raw grouping key. It is first passed through ``_convert_grouper``
        and then resolved depending on its type (see the branches below).
    obj : FrameOrSeries or None
        Object looked up as ``obj[name]`` when no grouper is given but a
        name is.
    name : Hashable, optional
        Name of the grouping. When None it is taken from the grouper, the
        index level or the resolved Grouper's ``result_index``.
    level : int or level name, optional
        Index level to group by; a name is translated to its position in
        ``index.names``.
    sort : bool, default True
    observed : bool, default False
        ``sort`` and ``observed`` are forwarded to ``recode_for_groupby``
        and decide which category codes form the group index.
    in_axis : bool, default False
    dropna : bool, default True
        ``in_axis`` and ``dropna`` are only stored here.

    Raises
    ------
    AssertionError
        If ``level`` is a name that is not in ``index.names``, or if
        mapping the grouper over the index gives a result whose length
        differs from the index.
    ValueError
        If the grouper reports an ``ndim`` other than 1.
    """
    self.name = name
    self.level = level
    self.grouper = _convert_grouper(index, grouper)
    self.all_grouper = None
    self.index = index
    self.sort = sort
    self.obj = obj
    self.observed = observed
    self.in_axis = in_axis
    self.dropna = dropna

    # right place for this?
    if isinstance(grouper, (Series, Index)) and name is None:
        self.name = grouper.name

    if isinstance(grouper, MultiIndex):
        self.grouper = grouper._values

    # we have a single grouper which may be a myriad of things,
    # some of which are dependent on the passing in level

    if level is not None:
        if not isinstance(level, int):
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            # translate the level name into its integer position
            level = index.names.index(level)

        if self.name is None:
            self.name = index.names[level]

        # the index supplies grouper, codes and group index for this level
        (
            self.grouper,
            self._codes,
            self._group_index,
        ) = index._get_grouper_for_level(self.grouper, level)

    # a passed Grouper like, directly get the grouper in the same way
    # as single grouper groupby, use the group_info to get codes
    elif isinstance(self.grouper, Grouper):
        # get the new grouper; we already have disambiguated
        # what key/level refer to exactly, don't need to
        # check again as we have by this point converted these
        # to an actual value (rather than a pd.Grouper)
        _, grouper, _ = self.grouper._get_grouper(
            # error: Value of type variable "FrameOrSeries" of "_get_grouper"
            # of "Grouper" cannot be "Optional[FrameOrSeries]"
            self.obj,  # type: ignore[type-var]
            validate=False,
        )
        if self.name is None:
            self.name = grouper.result_index.name
        # obj is read from the Grouper before self.grouper is overwritten
        self.obj = self.grouper.obj
        self.grouper = grouper._get_grouper()

    else:
        if self.grouper is None and self.name is not None and self.obj is not None:
            # no explicit grouper: fall back to looking the name up in obj
            self.grouper = self.obj[self.name]

        elif isinstance(self.grouper, (list, tuple)):
            self.grouper = com.asarray_tuplesafe(self.grouper)

        # a passed Categorical
        elif is_categorical_dtype(self.grouper):
            self.grouper, self.all_grouper = recode_for_groupby(
                self.grouper, self.sort, observed)
            categories = self.grouper.categories

            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes
            self._codes = self.grouper.codes
            if observed:
                codes = algorithms.unique1d(self.grouper.codes)
                # code -1 is filtered out so it never becomes a group
                codes = codes[codes != -1]
                if sort or self.grouper.ordered:
                    codes = np.sort(codes)
            else:
                # every category becomes a group
                codes = np.arange(len(categories))

            self._group_index = CategoricalIndex(
                Categorical.from_codes(codes=codes,
                                       categories=categories,
                                       ordered=self.grouper.ordered),
                name=self.name,
            )

        # NOTE: this is a fresh if/elif chain; it runs after whichever
        # branch of the chain above was taken.
        # we are done
        if isinstance(self.grouper, Grouping):
            self.grouper = self.grouper.grouper

        # no level passed
        elif not isinstance(self.grouper,
                            (Series, Index, ExtensionArray, np.ndarray)):
            if getattr(self.grouper, "ndim", 1) != 1:
                t = self.name or str(type(self.grouper))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")
            # any other grouper is applied to the index through Index.map
            self.grouper = self.index.map(self.grouper)
            if not (hasattr(self.grouper, "__len__")
                    and len(self.grouper) == len(self.index)):
                # the message is built before the grouper is cleared
                grper = pprint_thing(self.grouper)
                errmsg = ("Grouper result violates len(labels) == "
                          f"len(data)\nresult: {grper}")
                self.grouper = None  # Try for sanity
                raise AssertionError(errmsg)

    # if we have a date/time-like grouper, make sure that we have
    # Timestamps like
    if getattr(self.grouper, "dtype", None) is not None:
        if is_datetime64_dtype(self.grouper):
            self.grouper = self.grouper.astype("datetime64[ns]")
        elif is_timedelta64_dtype(self.grouper):
            self.grouper = self.grouper.astype("timedelta64[ns]")
def pivot_annual(series, freq=None):
    """
    Deprecated. Use ``pivot_table`` instead.

    Group a series by years, taking leap years into account.

    The output has as many rows as distinct years in the original series,
    and as many columns as the length of a leap year in the units
    corresponding to the frequency: 366 for daily data, 366 * 24 = 8784
    for hourly data and 12 for monthly data. Cells that receive no
    observation are NaN.

    For daily data, column 59 always holds Feb. 28th and column 61 always
    holds Mar. 1st; column 60 (Feb. 29th) stays NaN in non-leap years.
    For hourly data the same is done with the 24 columns of Feb. 29th.

    Parameters
    ----------
    series : Series
        Series whose index exposes ``year``, ``dayofyear`` and ``month``.
    freq : string or None, default None
        One of 'D', 'M', 'BM' or 'H' (case-insensitive). When None,
        ``series.index.freq`` is used.

    Returns
    -------
    annual : DataFrame
        Indexed by the distinct years, with columns numbered from 1.

    Raises
    ------
    NotImplementedError
        If the frequency is not one of the supported values.
    """
    msg = "pivot_annual is deprecated. Use pivot_table instead"
    # stacklevel=2 attributes the warning to the caller's line, which is the
    # code that actually needs to migrate, rather than to this module.
    warnings.warn(msg, FutureWarning, stacklevel=2)

    index = series.index
    year = index.year
    years = algorithms.unique1d(year)

    if freq is not None:
        freq = freq.upper()
    else:
        freq = series.index.freq

    if freq == 'D':
        width = 366
        # ``dayofyear`` may be an immutable Index; convert it to a plain
        # array so the in-place leap-year adjustment below cannot fail.
        offset = np.asarray(index.dayofyear) - 1

        # adjust for leap year: in non-leap years everything from Mar. 1st
        # on moves one slot right, leaving the Feb. 29th column empty
        offset[(~isleapyear(year)) & (offset >= 59)] += 1

        columns = lrange(1, 367)
        # todo: strings like 1/1, 1/25, etc.?
    elif freq in ('M', 'BM'):
        width = 12
        # converted for consistency with the daily branch
        offset = np.asarray(index.month) - 1
        columns = lrange(1, 13)
    elif freq == 'H':
        width = 8784
        # The offset is the position of each observation within its year.
        # NOTE(review): this appears to assume every year starts at
        # Jan. 1st 00:00 and has no gaps -- confirm against callers.
        grouped = series.groupby(series.index.year)
        defaulted = grouped.apply(lambda x: x.reset_index(drop=True))
        defaulted.index = defaulted.index.droplevel(0)
        offset = np.asarray(defaulted.index)
        # 1416 == 59 * 24: skip the 24 Feb. 29th slots in non-leap years
        offset[~isleapyear(year) & (offset >= 1416)] += 24
        columns = lrange(1, 8785)
    else:
        raise NotImplementedError(freq)

    # Position of every observation in the flattened (years x width) grid
    flat_index = (year - years.min()) * width + offset
    flat_index = _ensure_platform_int(flat_index)

    values = np.empty((len(years), width))
    values.fill(np.nan)
    values.put(flat_index, series.values)

    return DataFrame(values, index=years, columns=columns)
def __init__(self, index, grouper=None, obj=None, name=None, level=None,
             sort=True, observed=False, in_axis=False):
    """
    Store the grouping specification and resolve ``grouper`` into values.

    Parameters
    ----------
    index : Index
        Index being grouped; its ``names``, ``map`` and
        ``_get_grouper_for_level`` are used below.
    grouper : optional
        Raw grouping key. It is first passed through ``_convert_grouper``
        and then resolved depending on its type (see the branches below).
    obj : optional
        Object looked up as ``obj[name]`` when no grouper is given but a
        name is.
    name : optional
        Name of the grouping. When None it is taken from the grouper, the
        index level or the resolved Grouper's ``result_index``.
    level : int or level name, optional
        Index level to group by; a name is translated to its position in
        ``index.names``.
    sort : bool, default True
    observed : bool, default False
        ``sort`` and ``observed`` are forwarded to ``recode_for_groupby``
        and decide which category codes form the group index.
    in_axis : bool, default False
        Only stored here.

    Raises
    ------
    AssertionError
        If ``level`` is a name that is not in ``index.names``, or if
        mapping the grouper over the index gives a result whose length
        differs from the index.
    ValueError
        If the grouper reports an ``ndim`` other than 1.
    """
    self.name = name
    self.level = level
    self.grouper = _convert_grouper(index, grouper)
    self.all_grouper = None
    self.index = index
    self.sort = sort
    self.obj = obj
    self.observed = observed
    self.in_axis = in_axis

    # right place for this?
    if isinstance(grouper, (Series, Index)) and name is None:
        self.name = grouper.name

    if isinstance(grouper, MultiIndex):
        self.grouper = grouper.values

    # we have a single grouper which may be a myriad of things,
    # some of which are dependent on the passing in level

    if level is not None:
        if not isinstance(level, int):
            if level not in index.names:
                raise AssertionError('Level {} not in index'.format(level))
            # translate the level name into its integer position
            level = index.names.index(level)

        if self.name is None:
            self.name = index.names[level]

        # the index supplies grouper, labels and group index for this level
        self.grouper, self._labels, self._group_index = \
            index._get_grouper_for_level(self.grouper, level)

    # a passed Grouper like, directly get the grouper in the same way
    # as single grouper groupby, use the group_info to get labels
    elif isinstance(self.grouper, Grouper):
        # get the new grouper; we already have disambiguated
        # what key/level refer to exactly, don't need to
        # check again as we have by this point converted these
        # to an actual value (rather than a pd.Grouper)
        _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False)
        if self.name is None:
            self.name = grouper.result_index.name
        # obj is read from the Grouper before self.grouper is overwritten
        self.obj = self.grouper.obj
        self.grouper = grouper._get_grouper()

    else:
        if self.grouper is None and self.name is not None:
            # no explicit grouper: fall back to looking the name up in obj
            # NOTE(review): obj is indexed without a None check here --
            # confirm callers always pass obj together with a bare name.
            self.grouper = self.obj[self.name]

        elif isinstance(self.grouper, (list, tuple)):
            self.grouper = com.asarray_tuplesafe(self.grouper)

        # a passed Categorical
        elif is_categorical_dtype(self.grouper):

            from pandas.core.groupby.categorical import recode_for_groupby
            self.grouper, self.all_grouper = recode_for_groupby(
                self.grouper, self.sort, observed)
            categories = self.grouper.categories

            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes
            self._labels = self.grouper.codes
            if observed:
                codes = algorithms.unique1d(self.grouper.codes)
                # code -1 is filtered out so it never becomes a group
                codes = codes[codes != -1]
                if sort or self.grouper.ordered:
                    codes = np.sort(codes)
            else:
                # every category becomes a group
                codes = np.arange(len(categories))

            # NOTE(review): the CategoricalIndex is created without a
            # name -- confirm the name is attached elsewhere if needed.
            self._group_index = CategoricalIndex(
                Categorical.from_codes(
                    codes=codes,
                    categories=categories,
                    ordered=self.grouper.ordered))

        # NOTE: this is a fresh if/elif chain; it runs after whichever
        # branch of the chain above was taken.
        # we are done
        if isinstance(self.grouper, Grouping):
            self.grouper = self.grouper.grouper

        # no level passed
        elif not isinstance(self.grouper,
                            (Series, Index, ExtensionArray, np.ndarray)):
            if getattr(self.grouper, 'ndim', 1) != 1:
                t = self.name or str(type(self.grouper))
                raise ValueError(
                    "Grouper for '{}' not 1-dimensional".format(t))
            # any other grouper is applied to the index through Index.map
            self.grouper = self.index.map(self.grouper)
            if not (hasattr(self.grouper, "__len__") and
                    len(self.grouper) == len(self.index)):
                # the message is built before the grouper is cleared
                errmsg = ('Grouper result violates len(labels) == '
                          'len(data)\nresult: %s' % pprint_thing(
                              self.grouper))
                self.grouper = None  # Try for sanity
                raise AssertionError(errmsg)

    # if we have a date/time-like grouper, make sure that we have
    # Timestamps like
    if getattr(self.grouper, 'dtype', None) is not None:
        if is_datetime64_dtype(self.grouper):
            from pandas import to_datetime
            self.grouper = to_datetime(self.grouper)
        elif is_timedelta64_dtype(self.grouper):
            from pandas import to_timedelta
            self.grouper = to_timedelta(self.grouper)
def recode_for_groupby(c, sort, observed):
    """
    Prepare a Categorical so that it can be used as a groupby key.

    * ``observed=True``: a new Categorical restricted to the categories
      that occur in the data is returned, together with the original.
    * ``sort=True`` (and not observed): ``c`` is returned unchanged.
    * ``sort=False`` (and not observed): ``c`` is returned with its
      categories reordered according to ``c.unique()``, followed by any
      categories that result does not contain (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    Categorical
        The categorical to group by, as described above.
    Categorical or None
        The original ``c`` when ``observed`` is true, otherwise None.
    """
    # we only care about observed values
    if observed:
        seen_codes = unique1d(c.codes)
        # code -1 is excluded so it is never taken as a category
        seen_codes = seen_codes[seen_codes != -1]
        if c.ordered:
            seen_codes = np.sort(seen_codes)

        # we recode according to the uniques
        observed_categories = c.categories.take(seen_codes)
        new_codes = _recode_for_categories(
            c.codes, c.categories, observed_categories
        )

        # return a new categorical that maps our new codes
        # and categories
        new_dtype = CategoricalDtype(observed_categories, ordered=c.ordered)
        recoded = Categorical(new_codes, dtype=new_dtype, fastpath=True)
        return recoded, c

    if not sort:
        # sort=False should order groups in as-encountered order (GH-8868)
        appeared = c.unique()

        # But for groupby to work, all categories should be present,
        # including those missing from the data (GH-13179)
        missing = c.categories[~c.categories.isin(appeared.categories)]
        full_order = appeared.add_categories(missing).categories
        return c.reorder_categories(full_order), None

    # Already sorted according to c.categories; all is fine
    return c, None
def nunique(s):
    """Return the number of distinct values left in ``s`` after ``dropna()``."""
    non_missing = s.dropna()
    distinct = algorithms.unique1d(non_missing)
    return len(distinct)
def recode_for_groupby(c, sort, observed):
    """
    Code the categories to ensure we can groupby for categoricals.

    With ``observed=True`` the result only carries the categories that
    occur in ``c``. Otherwise ``c`` itself is returned when ``sort`` is
    true, and a version of ``c`` whose categories follow the order of
    ``c.unique()`` (with any remaining categories appended) when ``sort``
    is false (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    Categorical
        The categorical to use as the grouping key.
    Categorical or None
        If we are observed, the original categorical, otherwise None.
    """
    if observed:
        # we only care about observed values; code -1 is excluded
        present = unique1d(c.codes)
        take_codes = present[present != -1]
        if c.ordered:
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        kept = c.categories.take(take_codes)
        recoded_codes = _recode_for_categories(c.codes, c.categories, kept)

        # return a new categorical that maps our new codes
        # and categories
        return (
            Categorical(
                recoded_codes,
                dtype=CategoricalDtype(kept, ordered=c.ordered),
                fastpath=True,
            ),
            c,
        )
    elif sort:
        # Already sorted according to c.categories; all is fine
        return c, None

    # sort=False should order groups in as-encountered order (GH-8868)
    encountered = c.unique()

    # But for groupby to work, all categories should be present,
    # including those missing from the data (GH-13179)
    absent_mask = ~c.categories.isin(encountered.categories)
    encountered = encountered.add_categories(c.categories[absent_mask])

    return c.reorder_categories(encountered.categories), None