def get_indexer(self, target, method=None, limit=None, tolerance=None):
    # Translate ``target`` into this index's category codes, then look those
    # codes up in the engine.  ``limit`` and ``tolerance`` are accepted but
    # never read in this implementation.
    method = missing.clean_reindex_fill_method(method)
    target = ibase.ensure_index(target)

    # Shortcut: a unique index that equals the target maps position for
    # position, so no code translation is needed.
    if self.is_unique and self.equals(target):
        return np.arange(len(self), dtype="intp")

    # None of the fill methods are supported for this index type.
    if method in ("pad", "backfill"):
        raise NotImplementedError(
            "method='pad' and method='backfill' not "
            "implemented yet for CategoricalIndex"
        )
    if method == "nearest":
        raise NotImplementedError(
            "method='nearest' not implemented yet for CategoricalIndex"
        )

    if not isinstance(target, CategoricalIndex):
        # Plain labels: find each label's position among our categories.
        target_codes = self.categories.get_indexer(target)
    elif not self._values.is_dtype_equal(target):
        # Categorical target with a different dtype: map its categories onto
        # ours, then push its codes through that mapping (-1 stays -1).
        category_positions = self.categories.get_indexer(target.categories)
        target_codes = take_1d(category_positions, target.codes, fill_value=-1)
    elif self._values.equals(target._values):
        # we have the same codes
        target_codes = target.codes
    else:
        # Equal dtype but the values differ: re-express the target's codes
        # against our categories.
        target_codes = recode_for_categories(
            target.codes, target.categories, self._values.categories
        )

    # The engine returns a pair; only the first element is used here.
    positions, _ = self._engine.get_indexer_non_unique(target_codes)
    return ensure_platform_int(positions)
def test_recode_to_categories(self, codes, old, new, expected):
    """Recoding ``codes`` from ``old`` to ``new`` categories gives ``expected``."""
    # Both the input codes and the expected codes are compared as int8.
    input_codes = np.asanyarray(codes, dtype=np.int8)
    wanted = np.asanyarray(expected, dtype=np.int8)

    old_categories = Index(old)
    new_categories = Index(new)

    recoded = recode_for_categories(input_codes, old_categories, new_categories)
    tm.assert_numpy_array_equal(recoded, wanted)
def test_recode_to_categories_large(self):
    """Recode 1000 codes onto the same categories listed in reverse order."""
    size = 1000
    ascending = np.arange(size)
    # The reversed categories double as the expected codes; they are built
    # as int16 and the result is compared against them directly.
    descending = np.arange(size - 1, -1, -1, dtype=np.int16)

    recoded = recode_for_categories(
        ascending, Index(ascending), Index(descending)
    )
    tm.assert_numpy_array_equal(recoded, descending)
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
):
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - an input is not a Categorical, CategoricalIndex, or Series
          holding categorical data
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Series and CategoricalIndex wrap their data; pull out the backing
        # array first.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            x = x._values
        # Validate *after* unwrapping: a Series holding non-categorical data
        # must be rejected here rather than failing later with an
        # AttributeError on ``.categories``.
        if isinstance(x, Categorical):
            return x
        raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(
        first._categories_match_up_to_permutation(other) for other in to_union[1:]
    ):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        # Express every input in terms of the first input's categories so
        # the raw codes can simply be concatenated.
        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError(
                "Cannot use sort_categories=True with ordered Categoricals"
            )

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            # Position of each original category within the sorted ones;
            # used to remap the already-concatenated codes.
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_1d

            new_codes = take_1d(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories)
            for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        else:
            raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def recode_for_groupby(
    c: Categorical, sort: bool, observed: bool
) -> Tuple[Categorical, Optional[Categorical]]:
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories as
    returned by .unique(), followed by any categories not appearing in
    the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    New Categorical
        If sort=False, the new categories are set to the order of
        appearance in codes (unless ordered=True, in which case the
        original order is preserved), followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """
    if not observed:
        # Already sorted according to c.categories; all is fine
        if sort:
            return c, None

        # sort=False should order groups in as-encountered order (GH-8868)
        by_appearance = c.unique()

        # But for groupby to work, all categories should be present,
        # including those missing from the data (GH-13179), which .unique()
        # above dropped
        unseen = ~c.categories.isin(by_appearance.categories)
        by_appearance = by_appearance.add_categories(c.categories[unseen])

        return c.reorder_categories(by_appearance.categories), None

    # we only care about observed values
    # In cases with c.ordered, this is equivalent to
    #  return c.remove_unused_categories(), c
    present_codes = unique1d(c.codes)
    present_codes = present_codes[present_codes != -1]
    if c.ordered:
        present_codes = np.sort(present_codes)

    # we recode according to the uniques
    observed_categories = c.categories.take(present_codes)
    recoded = recode_for_categories(c.codes, c.categories, observed_categories)

    # return a new categorical that maps our new codes
    # and categories
    observed_dtype = CategoricalDtype(observed_categories, ordered=c.ordered)
    return Categorical(recoded, dtype=observed_dtype, fastpath=True), c