def test_from_codes(self):
    """Exercise ``Categorical.from_codes`` with invalid and valid input."""
    # Every (codes, categories) pair below is expected to raise ValueError.
    rejected = [
        ([1, 2], [1, 2]),  # too few categories
        (["a"], [1, 2]),  # no int codes
        ([0, 1, 2], ["a", "a", "b"]),  # no unique categories
        ([0, 1, 2], ["a", "b", np.nan]),  # NaN categories included
        ([-2, 1, 2], ["a", "b", "c"]),  # too negative
    ]
    for bad_codes, bad_categories in rejected:
        with pytest.raises(ValueError):
            Categorical.from_codes(bad_codes, bad_categories)

    expected = Categorical(["a", "b", "c"], ordered=False)
    built = Categorical.from_codes([0, 1, 2], ["a", "b", "c"])
    tm.assert_categorical_equal(expected, built)

    # Not available in earlier numpy versions
    if not hasattr(np.random, "choice"):
        return
    random_codes = np.random.choice([0, 1], 5, p=[0.9, 0.1])
    Categorical.from_codes(random_codes, categories=["train", "test"])
def test_constructor_interval(self): result = Categorical([Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True) ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)]) exp = Categorical(ii, ordered=True) tm.assert_categorical_equal(result, exp) tm.assert_index_equal(result.categories, ii)
def test_union_categoricals_nan(self): # GH 13759 res = union_categoricals([pd.Categorical([1, 2, np.nan]), pd.Categorical([3, 2, np.nan])]) exp = Categorical([1, 2, np.nan, 3, 2, np.nan]) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical(['A', 'B']), pd.Categorical(['B', 'B', np.nan])]) exp = Categorical(['A', 'B', 'B', 'B', np.nan]) tm.assert_categorical_equal(res, exp) val1 = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.NaT] val2 = [pd.NaT, pd.Timestamp('2011-01-01'), pd.Timestamp('2011-02-01')] res = union_categoricals([pd.Categorical(val1), pd.Categorical(val2)]) exp = Categorical(val1 + val2, categories=[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-03-01'), pd.Timestamp('2011-02-01')]) tm.assert_categorical_equal(res, exp) # all NaN res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical(['X'])]) exp = Categorical([np.nan, np.nan, 'X']) tm.assert_categorical_equal(res, exp) res = union_categoricals([pd.Categorical([np.nan, np.nan]), pd.Categorical([np.nan, np.nan])]) exp = Categorical([np.nan, np.nan, np.nan, np.nan]) tm.assert_categorical_equal(res, exp)
def test_cut_pass_labels(get_labels, get_expected): bins = [0, 25, 50, 100] arr = [50, 5, 10, 15, 20, 30, 70] labels = ["Small", "Medium", "Large"] result = cut(arr, bins, labels=get_labels(labels)) tm.assert_categorical_equal(result, get_expected(labels))
def test_union_categoricals_ordered(self): c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], ordered=False) msg = 'Categorical.ordered must be the same' with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2]) res = union_categoricals([c1, c1]) exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3, np.nan], ordered=True) c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True) res = union_categoricals([c1, c2]) exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True) tm.assert_categorical_equal(res, exp) c1 = Categorical([1, 2, 3], ordered=True) c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True) msg = "to union ordered Categoricals, all categories must be the same" with tm.assertRaisesRegexp(TypeError, msg): union_categoricals([c1, c2])
def test_from_inferred_categories_coerces(self): cats = ['1', '2', 'bad'] codes = np.array([0, 0, 1, 2], dtype='i8') dtype = CategoricalDtype([1, 2]) result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected)
def test_unique():
    """Check ``Series.unique`` on series holding missing values."""
    # GH714 also, dtype=float; NAs in object arrays #714
    half_missing = [
        Series([1.2345] * 100),
        Series([1.2345] * 100, dtype='f4'),
        Series(['foo'] * 100, dtype='O'),
    ]
    for ser in half_missing:
        # Blank out every other element: one value plus NaN remain.
        ser[::2] = np.nan
        assert len(ser.unique()) == 2

    # decision about None
    with_none = Series([1, 2, 3, None, None, None], dtype=object)
    tm.assert_numpy_array_equal(
        with_none.unique(), np.array([1, 2, 3, None], dtype=object))

    # GH 18051
    empty = Series(Categorical([]))
    tm.assert_categorical_equal(empty.unique(), Categorical([]),
                                check_dtype=False)
    only_nan = Series(Categorical([np.nan]))
    tm.assert_categorical_equal(only_nan.unique(), Categorical([np.nan]),
                                check_dtype=False)
def test_take_allow_fill(self): # https://github.com/pandas-dev/pandas/issues/23296 cat = pd.Categorical(['a', 'a', 'b']) result = cat.take([0, -1, -1], allow_fill=True) expected = pd.Categorical(['a', np.nan, np.nan], categories=['a', 'b']) tm.assert_categorical_equal(result, expected)
def test_setitem_same_but_unordered(self, other): # GH-24142 target = pd.Categorical(['a', 'b'], categories=['a', 'b']) mask = np.array([True, False]) target[mask] = other[mask] expected = pd.Categorical(['b', 'b'], categories=['a', 'b']) tm.assert_categorical_equal(target, expected)
def test_positional_take(self, ordered): cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'], ordered=ordered) result = cat.take([0, 1, 2], allow_fill=False) expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories, ordered=ordered) tm.assert_categorical_equal(result, expected)
def test_rename_categories(self):
    """Exercise ``rename_categories`` with a list, a callable and
    ``inplace=True``, plus new category lists of the wrong length."""
    cat = Categorical(["a", "b", "c", "a"])

    # inplace=False: the old one must not be changed
    renamed = cat.rename_categories([1, 2, 3])
    tm.assert_numpy_array_equal(renamed.__array__(),
                                np.array([1, 2, 3, 1], dtype=np.int64))
    tm.assert_index_equal(renamed.categories, Index([1, 2, 3]))

    untouched_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
    tm.assert_numpy_array_equal(cat.__array__(), untouched_values)
    tm.assert_index_equal(cat.categories, Index(["a", "b", "c"]))

    # GH18862 (let rename_categories take callables)
    upper_cased = cat.rename_categories(lambda label: label.upper())
    tm.assert_categorical_equal(upper_cased,
                                Categorical(["A", "B", "C", "A"]))

    # and now inplace
    returned = cat.rename_categories([1, 2, 3], inplace=True)
    assert returned is None
    tm.assert_numpy_array_equal(cat.__array__(),
                                np.array([1, 2, 3, 1], dtype=np.int64))
    tm.assert_index_equal(cat.categories, Index([1, 2, 3]))

    # Lengthen, then shorten: both must be rejected
    for wrong_length in ([1, 2, 3, 4], [1, 2]):
        with pytest.raises(ValueError):
            cat.rename_categories(wrong_length)
def test_numpy_repeat(self): cat = Categorical(["a", "b"], categories=["a", "b"]) exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"]) tm.assert_categorical_equal(np.repeat(cat, 2), exp) msg = "the 'axis' parameter is not supported" tm.assert_raises_regex(ValueError, msg, np.repeat, cat, 2, axis=1)
def test_remove_categories(self):
    """Remove a category with and without ``inplace`` and check that
    removing it a second time raises ValueError."""
    cat = Categorical(["a", "b", "c", "a"], ordered=True)
    before = cat.copy()
    after = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
                        ordered=True)

    # first inplace == False: scalar and list-like removals
    for removal in ("c", ["c"]):
        res = cat.remove_categories(removal)
        tm.assert_categorical_equal(cat, before)
        tm.assert_categorical_equal(res, after)

    # inplace == True
    res = cat.remove_categories("c", inplace=True)
    tm.assert_categorical_equal(cat, after)
    assert res is None

    # removal is not in categories
    with pytest.raises(ValueError):
        cat.remove_categories(["c"])
def test_basic(self): # run multiple times here for n in range(10): for s, i in self.d.items(): i_rec = self.encode_decode(i) assert_categorical_equal(i, i_rec)
def check_arbitrary(a, b):
    """Assert that ``a`` and ``b`` are equal, dispatching on the type of
    ``a``.

    Lists and tuples are compared element-wise (recursively) after a length
    check; DataFrame, Series, Index and Categorical use their dedicated
    assertion helpers; anything else falls back to ``==``.
    """
    sequence_types = (list, tuple)
    if isinstance(a, sequence_types) and isinstance(b, sequence_types):
        assert len(a) == len(b)
        for left, right in zip(a, b):
            check_arbitrary(left, right)
        return

    if isinstance(a, DataFrame):
        assert_frame_equal(a, b)
        return

    if isinstance(a, Series):
        assert_series_equal(a, b)
        return

    if isinstance(a, Index):
        assert_index_equal(a, b)
        return

    if isinstance(a, Categorical):
        # Temp,
        # Categorical.categories is changed from str to bytes in PY3
        # maybe the same as GH 13591
        # String-typed categories are therefore not compared at all.
        if b.categories.inferred_type != 'string':
            tm.assert_categorical_equal(a, b)
        return

    if a is NaT:
        assert b is NaT
        return

    if isinstance(a, Timestamp):
        assert a == b
        assert a.freq == b.freq
        return

    assert a == b
def test_arraylike(self):
    """Cut a plain list into three bins and check both the categories and
    the returned bin edges."""
    values = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
    binned, edges = cut(values, 3, retbins=True)

    # Expected categories are built from the rounded bin edges.
    breaks = IntervalIndex.from_breaks(edges.round(3))
    wanted = breaks.take([0, 0, 0, 1, 2, 0]).astype('category')
    tm.assert_categorical_equal(binned, wanted)

    tm.assert_almost_equal(
        edges, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
def test_factorized_sort():
    """Factorize a categorical with ``sort=True`` and check the labels
    and uniques it returns."""
    source = pd.Categorical(['b', 'b', None, 'a'])
    factorized = pd.factorize(source, sort=True)
    codes, distinct = factorized

    tm.assert_numpy_array_equal(codes,
                                np.array([1, 1, -1, 0], dtype=np.intp))
    tm.assert_categorical_equal(distinct, pd.Categorical(['a', 'b']))
def test_noright(self):
    """Cut into four bins with ``right=False`` and check the categories
    and the returned bin edges."""
    values = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
    binned, edges = cut(values, 4, right=False, retbins=True)

    # Expected categories are left-closed intervals over the rounded edges.
    breaks = IntervalIndex.from_breaks(edges.round(3), closed='left')
    wanted = breaks.take([0, 0, 0, 2, 3, 0, 1]).astype('category')
    tm.assert_categorical_equal(binned, wanted)

    tm.assert_almost_equal(
        edges, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
def test_constructor_with_dtype(self, ordered): categories = ['b', 'a', 'c'] dtype = CategoricalDtype(categories, ordered=ordered) result = Categorical(['a', 'b', 'a', 'c'], dtype=dtype) expected = Categorical(['a', 'b', 'a', 'c'], categories=categories, ordered=ordered) tm.assert_categorical_equal(result, expected) assert result.ordered is ordered
def test_numpy_repeat(self):
    """``np.repeat`` on a Categorical repeats each element; passing
    ``axis`` is expected to raise ValueError."""
    source = Categorical(["a", "b"], categories=["a", "b"])
    doubled = Categorical(["a", "a", "b", "b"], categories=["a", "b"])
    tm.assert_categorical_equal(np.repeat(source, 2), doubled)

    with pytest.raises(ValueError,
                       match="the 'axis' parameter is not supported"):
        np.repeat(source, 2, axis=1)
def test_union_categorical_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19096 c1 = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c']) c2 = Categorical(['a', 'b', 'c'], categories=['b', 'a', 'c']) result = union_categoricals([c1, c2]) expected = Categorical(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c']) tm.assert_categorical_equal(result, expected)
def test_ensure_categorical():
    """``_ensure_categorical`` yields category dtype for an int32 array and
    an equal Categorical when given a Categorical."""
    int_values = np.arange(10, dtype=np.int32)
    converted = _ensure_categorical(int_values)
    assert converted.dtype == 'category'

    already_categorical = Categorical(int_values)
    converted = _ensure_categorical(already_categorical)
    tm.assert_categorical_equal(converted, already_categorical)
def test_constructor_with_index(self): ci = CategoricalIndex(list('aabbca'), categories=list('cab')) tm.assert_categorical_equal(ci.values, Categorical(ci)) ci = CategoricalIndex(list('aabbca'), categories=list('cab')) tm.assert_categorical_equal(ci.values, Categorical(ci.astype(object), categories=ci.categories))
def test_groupby_describe_categorical_columns(self):
    """``groupby(...).describe()`` on a frame with categorical columns is
    expected to return those same columns."""
    # GH 11558
    column_index = pd.CategoricalIndex(
        ["qux", "foo", "baz", "bar"],
        categories=["foo", "bar", "baz", "qux"],
        ordered=True)
    frame = DataFrame(np.random.randn(20, 4), columns=column_index)

    described = frame.groupby([1, 2, 3, 4] * 5).describe()

    tm.assert_index_equal(described.columns, column_index)
    tm.assert_categorical_equal(described.columns.values,
                                column_index.values)
def test_constructor_from_categorical_with_unknown_dtype(self): dtype = CategoricalDtype(None, ordered=True) values = Categorical(['a', 'b', 'd']) result = Categorical(values, dtype=dtype) # We use values.categories, not dtype.categories expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'], ordered=True) tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_dtype(self): cats = ['a', 'b', 'd'] codes = np.array([0, 1, 0, 2], dtype='i8') dtype = CategoricalDtype(['c', 'b', 'a'], ordered=True) result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical(['a', 'b', 'a', 'd'], categories=['c', 'b', 'a'], ordered=True) tm.assert_categorical_equal(result, expected)
def test_reshape_categorical_numpy(self):
    """``np.reshape`` on a Categorical is expected to emit FutureWarning,
    and to raise ValueError when ``order`` is passed."""
    # Local import keeps this block self-contained; ``pytest.raises`` with
    # ``match`` replaces the deprecated ``tm.assert_raises_regex`` helper.
    import pytest

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        cat = Categorical(["a", "b"], categories=["a", "b"])
        tm.assert_categorical_equal(np.reshape(cat, cat.shape), cat)

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        msg = "the 'order' parameter is not supported"
        with pytest.raises(ValueError, match=msg):
            np.reshape(cat, cat.shape, order='F')
def test_create_categorical(self):
    """Call ``CategoricalIndex._create_categorical`` directly with a
    CategoricalIndex as the data."""
    # https://github.com/pandas-dev/pandas/pull/17513
    # The public CI constructor doesn't hit this code path with
    # instances of CategoricalIndex, but we still want to test the code
    index = CategoricalIndex(['a', 'b', 'c'])

    # First ci is self, second ci is data.
    created = CategoricalIndex._create_categorical(index, index)

    tm.assert_categorical_equal(created, Categorical(['a', 'b', 'c']))
def test_map_with_nan(self, data, f):
    """Map ``f`` over a categorical built from ``data``.

    The expected type depends on ``data[1]``: a Categorical when it equals
    1, otherwise an Index.
    """
    # GH 24241
    mapped = pd.Categorical(data).map(f)

    if data[1] != 1:
        tm.assert_index_equal(mapped, pd.Index([False, False, np.nan]))
        return

    tm.assert_categorical_equal(
        mapped, pd.Categorical([False, False, np.nan]))
def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) expected = Categorical(intervals, ordered=True) expected = expected.take([0, 0, 0, 2, 3, 0, 0]) tm.assert_categorical_equal(result, expected) tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
def test_from_inferred_categories_sorts(self, dtype): cats = ['b', 'a'] codes = np.array([0, 1, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) tm.assert_categorical_equal(result, expected)
def test_constructor(self):
    """Walk through the main ``Categorical`` constructor paths: explicit
    categories, Categorical/Series input, category dtype inference,
    single-element corner cases and warning-free construction."""
    letters = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)

    # Values round-trip whether or not (and however) categories are given.
    tm.assert_numpy_array_equal(Categorical(letters).__array__(), letters)
    for given_categories in (["a", "b", "c"], ["c", "b", "a"]):
        built = Categorical(letters, categories=given_categories)
        tm.assert_numpy_array_equal(built.__array__(), letters)

    # categories must be unique
    with pytest.raises(ValueError):
        Categorical([1, 2], [1, 2, 2])

    with pytest.raises(ValueError):
        Categorical(["a", "b"], ["a", "b", "b"])

    # The default should be unordered
    assert not Categorical(["a", "b", "c", "a"]).ordered

    # Categorical as input
    for cats in (None, ["a", "b", "c", "d"], ["a", "c", "b"]):
        source = Categorical(["a", "b", "c", "a"], categories=cats)
        tm.assert_categorical_equal(source, Categorical(source))

    source = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
    recoded = Categorical(source, categories=["a", "b", "c"])
    tm.assert_numpy_array_equal(source.__array__(), recoded.__array__())
    tm.assert_index_equal(recoded.categories, Index(["a", "b", "c"]))

    # Series of dtype category
    for cats in (["a", "b", "c", "d"], ["a", "c", "b"]):
        source = Categorical(["a", "b", "c", "a"], categories=cats)
        tm.assert_categorical_equal(source, Categorical(Series(source)))

    # Series
    for cats in (None, ["a", "b", "c", "d"]):
        direct = Categorical(["a", "b", "c", "a"], categories=cats)
        via_series = Categorical(Series(["a", "b", "c", "a"]),
                                 categories=cats)
        tm.assert_categorical_equal(direct, via_series)

    # This should result in integer categories, not float!
    cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
    assert is_integer_dtype(cat.categories)

    # https://github.com/pandas-dev/pandas/issues/3678
    cat = Categorical([np.nan, 1, 2, 3])
    assert is_integer_dtype(cat.categories)

    # this should result in floats
    for float_values in ([np.nan, 1, 2., 3], [np.nan, 1., 2., 3.]):
        cat = Categorical(float_values)
        assert is_float_dtype(cat.categories)

    # This doesn't work -> this would probably need some kind of "remember
    # the original type" feature to try to cast the array interface result
    # to...

    # vals = np.asarray(cat[cat.notna()])
    # assert is_integer_dtype(vals)

    # corner cases; the last entry checks that scalars are converted to
    # lists
    for raw, only_category in (([1], 1), (["a"], "a"), (1, 1)):
        cat = Categorical(raw)
        assert len(cat.categories) == 1
        assert cat.categories[0] == only_category
        assert len(cat.codes) == 1
        assert cat.codes[0] == 0

    # two arrays
    #  - when the first is an integer dtype and the second is not
    #  - when the resulting codes are all -1/NaN
    with tm.assert_produces_warning(None):
        Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])

    with tm.assert_produces_warning(None):
        Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])

    # the next one are from the old docs
    with tm.assert_produces_warning(None):
        Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
        Categorical([1, 2], categories=[1, 2, 3])

    # this is a legitimate constructor
    with tm.assert_produces_warning(None):
        Categorical(np.array([], dtype='int64'),
                    categories=[3, 2, 1], ordered=True)
def test_take_fill_value(self): # GH 12631 # numeric category idx = pd.CategoricalIndex([1, 2, 3], name='xxx') result = idx.take(np.array([1, 0, -1])) expected = pd.CategoricalIndex([2, 1, 3], name='xxx') tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name='xxx') tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = pd.CategoricalIndex([2, 1, 3], name='xxx') tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # object category idx = pd.CategoricalIndex(list('CBA'), categories=list('ABC'), ordered=True, name='xxx') result = idx.take(np.array([1, 0, -1])) expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'), ordered=True, name='xxx') tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = pd.CategoricalIndex(['B', 'C', np.nan], categories=list('ABC'), ordered=True, name='xxx') tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = pd.CategoricalIndex(list('BCA'), categories=list('ABC'), ordered=True, name='xxx') tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) msg = ('When allow_fill=True and fill_value is not None, ' 'all indices must be >= -1') with tm.assertRaisesRegexp(ValueError, msg): idx.take(np.array([1, 0, -2]), fill_value=True) with tm.assertRaisesRegexp(ValueError, msg): idx.take(np.array([1, 0, -5]), fill_value=True) with 
tm.assertRaises(IndexError): idx.take(np.array([1, -5]))
def test_set_item_nan(self): cat = Categorical([1, 2, 3]) cat[1] = np.nan exp = Categorical([1, np.nan, 3], categories=[1, 2, 3]) tm.assert_categorical_equal(cat, exp)
def test_categories_none_comparisons(self): factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"], ordered=True) tm.assert_categorical_equal(factor, self.factor)
def test_union_categoricals_sort(self):
    """Union categoricals with ``sort_categories=True``; ordered input is
    expected to raise TypeError."""
    # GH 13846

    def check_sorted_union(left, right, wanted):
        # Union the pair with sorting enabled and compare with ``wanted``.
        combined = union_categoricals([left, right], sort_categories=True)
        tm.assert_categorical_equal(combined, wanted)

    check_sorted_union(
        Categorical(['x', 'y', 'z']),
        Categorical(['a', 'b', 'c']),
        Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                    categories=['a', 'b', 'c', 'x', 'y', 'z']))

    # fastpath
    check_sorted_union(
        Categorical(['a', 'b'], categories=['b', 'a', 'c']),
        Categorical(['b', 'c'], categories=['b', 'a', 'c']),
        Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']))

    check_sorted_union(
        Categorical(['a', 'b'], categories=['c', 'a', 'b']),
        Categorical(['b', 'c'], categories=['c', 'a', 'b']),
        Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']))

    # fastpath - skip resort
    check_sorted_union(
        Categorical(['a', 'b'], categories=['a', 'b', 'c']),
        Categorical(['b', 'c'], categories=['a', 'b', 'c']),
        Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']))

    check_sorted_union(
        Categorical(['x', np.nan]),
        Categorical([np.nan, 'b']),
        Categorical(['x', np.nan, np.nan, 'b'], categories=['b', 'x']))

    check_sorted_union(
        Categorical([np.nan]),
        Categorical([np.nan]),
        Categorical([np.nan, np.nan]))

    check_sorted_union(
        Categorical([]),
        Categorical([]),
        Categorical([]))

    ordered_left = Categorical(['b', 'a'], categories=['b', 'a', 'c'],
                               ordered=True)
    ordered_right = Categorical(['a', 'c'], categories=['b', 'a', 'c'],
                                ordered=True)
    with pytest.raises(TypeError):
        union_categoricals([ordered_left, ordered_right],
                           sort_categories=True)
def test_comparisons(self):
    """Check comparison operators on Categoricals: scalar comparisons used
    as masks, comparisons between categoricals, and the cases that are
    expected to raise TypeError."""
    factor = self.factor

    # Masking by a comparison on the Categorical should match masking by
    # the same comparison on its ndarray form.
    scalar_checks = [
        lambda values: values == 'a',
        lambda values: values != 'a',
        lambda values: values < 'c',
        lambda values: values > 'a',
        lambda values: values >= 'b',
        lambda values: values <= 'b',
    ]
    for check in scalar_checks:
        result = factor[check(factor)]
        expected = factor[check(np.asarray(factor))]
        tm.assert_categorical_equal(result, expected)

    shuffled = factor[np.random.permutation(len(factor))]
    tm.assert_numpy_array_equal(
        factor == shuffled,
        np.asarray(factor) == np.asarray(shuffled))

    tm.assert_numpy_array_equal(factor == 'd',
                                np.repeat(False, len(factor)))

    # comparisons with categoricals
    cat_rev = Categorical(["a", "b", "c"], categories=["c", "b", "a"],
                          ordered=True)
    cat_rev_base = Categorical(["b", "b", "b"],
                               categories=["c", "b", "a"], ordered=True)
    cat = Categorical(["a", "b", "c"], ordered=True)
    cat_base = Categorical(["b", "b", "b"], categories=cat.categories,
                           ordered=True)

    # comparisons need to take categories ordering into account
    tm.assert_numpy_array_equal(cat_rev > cat_rev_base,
                                np.array([True, False, False]))
    tm.assert_numpy_array_equal(cat_rev < cat_rev_base,
                                np.array([False, False, True]))
    tm.assert_numpy_array_equal(cat > cat_base,
                                np.array([False, False, True]))

    # Only categories with same categories can be compared
    with pytest.raises(TypeError):
        cat > cat_rev

    cat_rev_base2 = Categorical(["b", "b", "b"],
                                categories=["c", "b", "a", "d"])

    with pytest.raises(TypeError):
        cat_rev > cat_rev_base2

    # Only categories with same ordering information can be compared
    cat_unordered = cat.set_ordered(False)
    assert not (cat > cat).any()

    with pytest.raises(TypeError):
        cat > cat_unordered

    # comparison (in both directions) with Series will raise
    series = Series(["b", "b", "b"])
    msg = ("Cannot compare a Categorical for op __gt__ with type"
           r" <class 'numpy\.ndarray'>")
    for left in (cat, cat_rev):
        with pytest.raises(TypeError, match=msg):
            left > series
    for right in (cat, cat_rev):
        with pytest.raises(TypeError, match=msg):
            series < right

    # comparison with numpy.array will raise in both direction, but only on
    # newer numpy versions
    array = np.array(["b", "b", "b"])
    for left in (cat, cat_rev):
        with pytest.raises(TypeError, match=msg):
            left > array

    # Make sure that unequal comparison take the categories order in
    # account
    cat_rev = Categorical(list("abc"), categories=list("cba"),
                          ordered=True)
    wanted = np.array([True, False, False])
    tm.assert_numpy_array_equal(cat_rev > "b", wanted)

    # check that zero-dim array gets unboxed
    tm.assert_numpy_array_equal(cat_rev > np.array("b"), wanted)
def test_set_categories_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) result = c.set_categories(new_categories, ordered=ordered) tm.assert_categorical_equal(result, expected)
def test_set_categories_rename_less(self): # GH 24675 cat = Categorical(["A", "B"]) result = cat.set_categories(["A"], rename=True) expected = Categorical(["A", np.nan]) tm.assert_categorical_equal(result, expected)
def test_rename_categories_series(self): # https://github.com/pandas-dev/pandas/issues/17981 c = Categorical(["a", "b"]) result = c.rename_categories(Series([0, 1], index=["a", "b"])) expected = Categorical([0, 1]) tm.assert_categorical_equal(result, expected)
def test_categories_none_comparisons(self): factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], ordered=True) tm.assert_categorical_equal(factor, self.factor)
def test_set_dtype_many(self, values, categories, new_categories, ordered): c = Categorical(values, categories) expected = Categorical(values, new_categories, ordered) result = c._set_dtype(expected.dtype) tm.assert_categorical_equal(result, expected)
def test_repeat(self): # GH10183 cat = Categorical(["a", "b"], categories=["a", "b"]) exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"]) res = cat.repeat(2) tm.assert_categorical_equal(res, exp)
def test_union_categoricals_sort_false(self):
    """Union categoricals with ``sort_categories=False``; category order
    is expected to follow the inputs."""
    # GH 13846

    def check_unsorted_union(left, right, wanted):
        # Union the pair with sorting disabled and compare with ``wanted``.
        combined = union_categoricals([left, right], sort_categories=False)
        tm.assert_categorical_equal(combined, wanted)

    check_unsorted_union(
        Categorical(['x', 'y', 'z']),
        Categorical(['a', 'b', 'c']),
        Categorical(['x', 'y', 'z', 'a', 'b', 'c'],
                    categories=['x', 'y', 'z', 'a', 'b', 'c']))

    # fastpath
    check_unsorted_union(
        Categorical(['a', 'b'], categories=['b', 'a', 'c']),
        Categorical(['b', 'c'], categories=['b', 'a', 'c']),
        Categorical(['a', 'b', 'b', 'c'], categories=['b', 'a', 'c']))

    # fastpath - skip resort
    check_unsorted_union(
        Categorical(['a', 'b'], categories=['a', 'b', 'c']),
        Categorical(['b', 'c'], categories=['a', 'b', 'c']),
        Categorical(['a', 'b', 'b', 'c'], categories=['a', 'b', 'c']))

    check_unsorted_union(
        Categorical(['x', np.nan]),
        Categorical([np.nan, 'b']),
        Categorical(['x', np.nan, np.nan, 'b'], categories=['x', 'b']))

    check_unsorted_union(
        Categorical([np.nan]),
        Categorical([np.nan]),
        Categorical([np.nan, np.nan], categories=[]))

    check_unsorted_union(
        Categorical([]),
        Categorical([]),
        Categorical([]))

    # Ordered inputs with identical categories are allowed here.
    check_unsorted_union(
        Categorical(['b', 'a'], categories=['b', 'a', 'c'], ordered=True),
        Categorical(['a', 'c'], categories=['b', 'a', 'c'], ordered=True),
        Categorical(['b', 'a', 'a', 'c'], categories=['b', 'a', 'c'],
                    ordered=True))
def test_add_categories(self):
    """Add categories with and without ``inplace``, reject a category that
    already exists, and accept several list-like containers."""
    cat = Categorical(["a", "b", "c", "a"], ordered=True)
    before = cat.copy()
    after = Categorical(["a", "b", "c", "a"],
                        categories=["a", "b", "c", "d"], ordered=True)

    # first inplace == False: scalar and list-like additions
    for addition in ("d", ["d"]):
        res = cat.add_categories(addition)
        tm.assert_categorical_equal(cat, before)
        tm.assert_categorical_equal(res, after)

    # inplace == True
    res = cat.add_categories("d", inplace=True)
    tm.assert_categorical_equal(cat, after)
    assert res is None

    # new is in old categories
    with pytest.raises(ValueError):
        cat.add_categories(["d"])

    # GH 9927
    cat = Categorical(list("abc"), ordered=True)
    expected = Categorical(list("abc"), categories=list("abcde"),
                           ordered=True)

    # test with Series, np.array, index, list
    for container in (Series, np.array, Index, list):
        res = cat.add_categories(container(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
def test_take_fill_value(self):
    """Check ``CategoricalIndex.take`` with and without ``fill_value`` on
    a numeric and an ordered object-category index, plus the invalid
    index cases."""
    # GH 12631
    positions = np.array([1, 0, -1])

    def check_take(index, wanted, **take_kwargs):
        # Take ``positions`` and compare both the index and its values.
        taken = index.take(positions, **take_kwargs)
        tm.assert_index_equal(taken, wanted)
        tm.assert_categorical_equal(taken.values, wanted.values)

    # numeric category
    idx = pd.CategoricalIndex([1, 2, 3], name="xxx")
    check_take(idx, pd.CategoricalIndex([2, 1, 3], name="xxx"))

    # fill_value
    check_take(idx,
               pd.CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3],
                                   name="xxx"),
               fill_value=True)

    # allow_fill=False
    check_take(idx, pd.CategoricalIndex([2, 1, 3], name="xxx"),
               allow_fill=False, fill_value=True)

    # object category
    idx = pd.CategoricalIndex(list("CBA"), categories=list("ABC"),
                              ordered=True, name="xxx")
    check_take(idx,
               pd.CategoricalIndex(list("BCA"), categories=list("ABC"),
                                   ordered=True, name="xxx"))

    # fill_value
    check_take(idx,
               pd.CategoricalIndex(["B", "C", np.nan],
                                   categories=list("ABC"), ordered=True,
                                   name="xxx"),
               fill_value=True)

    # allow_fill=False
    check_take(idx,
               pd.CategoricalIndex(list("BCA"), categories=list("ABC"),
                                   ordered=True, name="xxx"),
               allow_fill=False, fill_value=True)

    msg = ("When allow_fill=True and fill_value is not None, "
           "all indices must be >= -1")
    for too_negative in ([1, 0, -2], [1, 0, -5]):
        with pytest.raises(ValueError, match=msg):
            idx.take(np.array(too_negative), fill_value=True)

    with pytest.raises(IndexError):
        idx.take(np.array([1, -5]))
def test_mode(self, values, categories, exp_mode):
    """``mode()`` of an ordered Categorical should equal an ordered
    Categorical of ``exp_mode`` with the same categories."""
    observed = Categorical(values, categories=categories,
                           ordered=True).mode()
    wanted = Categorical(exp_mode, categories=categories, ordered=True)
    tm.assert_categorical_equal(observed, wanted)
def test_unique_index_series(self):
    """Check that ``unique`` agrees for Categorical, Index and Series."""

    def _check_unique(cat, expected):
        # The same expectation must hold through all three containers.
        tm.assert_categorical_equal(cat.unique(), expected)
        tm.assert_index_equal(Index(cat).unique(), Index(expected))
        tm.assert_categorical_equal(Series(cat).unique(), expected)

    # Categorical.unique sorts categories by appearance order
    # if ordered=False
    _check_unique(
        Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]),
        Categorical([3, 1, 2], categories=[3, 1, 2]),
    )

    _check_unique(
        Categorical([1, 1, 2, 2], categories=[3, 2, 1]),
        Categorical([1, 2], categories=[1, 2]),
    )

    # Categorical.unique keeps categories order if ordered=True
    _check_unique(
        Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True),
        Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True),
    )
def test_union_categoricals_ignore_order(self):
    """Check ``union_categoricals`` with the ``ignore_order`` flag (GH 15219).

    With ``ignore_order=True`` the inputs may differ in ``ordered`` or in
    their categories, and the expected result is built without
    ``ordered=True``.  With ``ignore_order=False`` (also the behaviour when
    the flag is omitted, per the final check) mismatching inputs are
    expected to raise ``TypeError``.
    """
    # GH 15219
    c1 = Categorical([1, 2, 3], ordered=True)
    c2 = Categorical([1, 2, 3], ordered=False)

    res = union_categoricals([c1, c2], ignore_order=True)
    exp = Categorical([1, 2, 3, 1, 2, 3])
    tm.assert_categorical_equal(res, exp)

    # pytest.raises(match=...) performs the same regex search on the error
    # message as the legacy tm.assert_raises_regex helper did.
    msg = 'Categorical.ordered must be the same'
    with pytest.raises(TypeError, match=msg):
        union_categoricals([c1, c2], ignore_order=False)

    res = union_categoricals([c1, c1], ignore_order=True)
    exp = Categorical([1, 2, 3, 1, 2, 3])
    tm.assert_categorical_equal(res, exp)

    # Identical ordered inputs keep their ordering when it is not ignored.
    res = union_categoricals([c1, c1], ignore_order=False)
    exp = Categorical([1, 2, 3, 1, 2, 3],
                      categories=[1, 2, 3], ordered=True)
    tm.assert_categorical_equal(res, exp)

    c1 = Categorical([1, 2, 3, np.nan], ordered=True)
    c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)

    res = union_categoricals([c1, c2], ignore_order=True)
    exp = Categorical([1, 2, 3, np.nan, 3, 2])
    tm.assert_categorical_equal(res, exp)

    c1 = Categorical([1, 2, 3], ordered=True)
    c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)

    res = union_categoricals([c1, c2], ignore_order=True)
    exp = Categorical([1, 2, 3, 1, 2, 3])
    tm.assert_categorical_equal(res, exp)

    res = union_categoricals([c2, c1], ignore_order=True,
                             sort_categories=True)
    exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
    tm.assert_categorical_equal(res, exp)

    c1 = Categorical([1, 2, 3], ordered=True)
    c2 = Categorical([4, 5, 6], ordered=True)
    result = union_categoricals([c1, c2], ignore_order=True)
    expected = Categorical([1, 2, 3, 4, 5, 6])
    tm.assert_categorical_equal(result, expected)

    msg = "to union ordered Categoricals, all categories must be the same"
    with pytest.raises(TypeError, match=msg):
        union_categoricals([c1, c2], ignore_order=False)

    with pytest.raises(TypeError, match=msg):
        union_categoricals([c1, c2])
def categories_equals(left, right):
    """Assert that two categoricals are equal.

    Both inputs must agree on ``ordered`` (compared by truthiness).  The
    ``ordered`` flag of ``left`` is then forwarded as
    ``check_category_order`` to ``assert_categorical_equal``.
    """
    orderedness_matches = bool(left.ordered) == bool(right.ordered)
    assert orderedness_matches
    assert_categorical_equal(left, right, check_category_order=left.ordered)
def test_categorical_equal(c):
    """Check that ``c`` compares equal to itself.

    ``c`` is supplied from outside this function (presumably a fixture or
    parametrization -- confirm where the test is declared).
    """
    assert_categorical_equal(c, c)
def test_qcut_index(self):
    """Check that ``qcut`` bins ``[0, 2]`` into two ordered interval categories."""
    result = qcut([0, 2], 2)

    # Build the expected ordered Categorical explicitly.  Routing through
    # ``Index(...).astype('category')`` hands a CategoricalIndex (with no
    # ``ordered=True``) to ``assert_categorical_equal`` on current pandas,
    # which is not a like-for-like comparison with the result of ``qcut``.
    intervals = [Interval(-0.001, 1), Interval(1, 2)]
    expected = Categorical(intervals, ordered=True)
    tm.assert_categorical_equal(result, expected)
def test_from_inferred_categories_sorts(self, dtype):
    """Check that unsorted inferred categories come back sorted.

    Categories ``["b", "a"]`` with codes ``[0, 1, 1, 1]`` are expected to
    equal categories ``["a", "b"]`` with the codes remapped to
    ``[1, 0, 0, 0]``.  ``dtype`` is supplied from outside this function
    (presumably a fixture or parametrization -- confirm).
    """
    expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])

    unsorted_cats = ["b", "a"]
    inferred_codes = np.array([0, 1, 1, 1], dtype="i8")
    result = Categorical._from_inferred_categories(
        unsorted_cats, inferred_codes, dtype
    )

    tm.assert_categorical_equal(result, expected)
def test_qcut_specify_quantiles(self):
    """Check that explicit quartile edges match ``qcut(arr, 4)``."""
    data = np.random.randn(100)
    quartile_edges = [0, .25, .5, .75, 1.]

    from_edges = qcut(data, quartile_edges)
    from_count = qcut(data, 4)

    tm.assert_categorical_equal(from_edges, from_count)
def test_take_fill_value(self):
    """Check ``Categorical.take`` when the fill value is an existing category.

    See https://github.com/pandas-dev/pandas/issues/23296
    """
    source = pd.Categorical(['a', 'b', 'c'])
    expected = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b', 'c'])

    # With allow_fill=True the -1 position is expected to be filled with 'a'.
    result = source.take([0, 1, -1], fill_value='a', allow_fill=True)

    tm.assert_categorical_equal(result, expected)
def test_astype_categorical():
    """Check casting a daily period array (with a missing entry) to category."""
    periods = period_array(['2000', '2001', '2001', None], freq='D')

    # The missing entry is expected to map to code -1.
    expected = pd.Categorical.from_codes(
        [0, 1, 1, -1],
        categories=pd.PeriodIndex(['2000', '2001'], freq='D'),
    )

    tm.assert_categorical_equal(periods.astype('category'), expected)
def test_set_dtype_no_overlap(self):
    """Check ``_set_dtype`` when the new dtype's categories match no value.

    Every element is expected to become missing, with the categories taken
    from the new dtype.
    """
    original = Categorical(['a', 'b', 'c'], ['d', 'e'])
    new_dtype = CategoricalDtype(['a', 'b'])

    expected = Categorical([None, None, None], categories=['a', 'b'])
    tm.assert_categorical_equal(original._set_dtype(new_dtype), expected)
def test_qcut_index(self):
    """Check that ``qcut`` bins ``[0, 2]`` into two ordered interval categories."""
    expected = Categorical(
        [Interval(-0.001, 1), Interval(1, 2)],
        ordered=True,
    )
    binned = qcut([0, 2], 2)
    tm.assert_categorical_equal(binned, expected)
def test_set_dtype_same(self):
    """Check that ``_set_dtype`` with the same categories changes nothing."""
    cat = Categorical(['a', 'b', 'c'])
    same_dtype = CategoricalDtype(['a', 'b', 'c'])
    tm.assert_categorical_equal(cat._set_dtype(same_dtype), cat)
def test_take_fill_with_negative_one(self):
    """Check ``Categorical.take`` when ``-1`` is itself one of the categories."""
    # -1 was a category
    source = pd.Categorical([-1, 0, 1])
    expected = pd.Categorical([-1, -1, 0], categories=[-1, 0, 1])

    # With allow_fill=True the -1 position is expected to be filled with
    # the category value -1.
    filled = source.take([0, -1, 1], allow_fill=True, fill_value=-1)

    tm.assert_categorical_equal(filled, expected)