def test_recode_to_categories(self, codes, old, new, expected): codes = np.asanyarray(codes, dtype=np.int8) expected = np.asanyarray(expected, dtype=np.int8) old = Index(old) new = Index(new) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories(self, codes, old, new, expected): codes = np.asanyarray(codes, dtype=np.int8) expected = np.asanyarray(expected, dtype=np.int8) old = Index(old) new = Index(new) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self): N = 1000 codes = np.arange(N) old = Index(codes) expected = np.arange(N - 1, -1, -1, dtype=np.int16) new = Index(expected) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def test_recode_to_categories_large(self): N = 1000 codes = np.arange(N) old = Index(codes) expected = np.arange(N - 1, -1, -1, dtype=np.int16) new = Index(expected) result = _recode_for_categories(codes, old, new) tm.assert_numpy_array_equal(result, expected)
def union_categoricals(to_union, sort_categories=False, ignore_order=False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. .. versionadded:: 0.19.0 Parameters ---------- to_union : list-like of Categorical, CategoricalIndex, or Series with dtype='category' sort_categories : boolean, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order: boolean, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. .. versionadded:: 0.20.0 Returns ------- result : Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <http://pandas.pydata.org/pandas-docs/stable/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) [b, c, a, b] Categories (3, object): [a, b, c] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) [a, b, a, b, a] Categories (2, object): [a < b] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) [a, b, c, c, b, a] Categories (3, object): [a, b, c] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) [b, c, a, b] Categories (3, object): [b, c, a] """ from pandas import Index, Categorical, CategoricalIndex, Series from pandas.core.categorical import _recode_for_categories if len(to_union) == 0: raise ValueError('No Categoricals to union') def _maybe_unwrap(x): if isinstance(x, (CategoricalIndex, Series)): return x.values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all(is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all(first.is_dtype_equal(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered new_codes = np.concatenate([c.codes for c in to_union]) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with " "ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_1d new_codes = take_1d(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = Index(cats.unique()) if sort_categories: categories = categories.sort_values() new_codes = [] for c in to_union: new_codes.append(_recode_for_categories(c.codes, c.categories, categories)) new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = ("to union ordered Categoricals, " "all categories must be the same") raise TypeError(msg) else: raise TypeError('Categorical.ordered must be the same') if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)