def test_loc_with_non_string_categories(self, idx_values, ordered_fixture): # GH-17569 cat_idx = CategoricalIndex(idx_values, ordered=ordered_fixture) df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx) sl = slice(idx_values[0], idx_values[1]) # scalar selection result = df.loc[idx_values[0]] expected = Series(["foo"], index=["A"], name=idx_values[0]) tm.assert_series_equal(result, expected) # list selection result = df.loc[idx_values[:2]] expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) tm.assert_frame_equal(result, expected) # slice selection result = df.loc[sl] expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"]) tm.assert_frame_equal(result, expected) # scalar assignment result = df.copy() result.loc[idx_values[0]] = "qux" expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) # list assignment result = df.copy() result.loc[idx_values[:2], "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) # slice assignment result = df.copy() result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected)
def test_map(self): ci = CategoricalIndex(list("ABABC"), categories=list("CBA"), ordered=True) result = ci.map(lambda x: x.lower()) exp = CategoricalIndex(list("ababc"), categories=list("cba"), ordered=True) tm.assert_index_equal(result, exp) ci = CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False, name="XXX") result = ci.map(lambda x: x.lower()) exp = CategoricalIndex(list("ababc"), categories=list("bac"), ordered=False, name="XXX") tm.assert_index_equal(result, exp) # GH 12766: Return an index not an array tm.assert_index_equal( ci.map(lambda x: 1), Index(np.array([1] * 5, dtype=np.int64), name="XXX")) # change categories dtype ci = CategoricalIndex(list("ABABC"), categories=list("BAC"), ordered=False) def f(x): return {"A": 10, "B": 20, "C": 30}.get(x) result = ci.map(f) exp = CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False) tm.assert_index_equal(result, exp) result = ci.map(Series([10, 20, 30], index=["A", "B", "C"])) tm.assert_index_equal(result, exp) result = ci.map({"A": 10, "B": 20, "C": 30}) tm.assert_index_equal(result, exp)
def test_series_groupby_value_counts_on_categorical(): # GH38672 s = Series(Categorical(["a"], categories=["a", "b"])) result = s.groupby([0]).value_counts() expected = Series( data=[1, 0], index=MultiIndex.from_arrays([ [0, 0], CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=False, dtype="category"), ]), ) # Expected: # 0 a 1 # b 0 # dtype: int64 tm.assert_series_equal(result, expected)
def test_get_indexer_non_unique(self): np.random.seed(123456789) ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) oidx = Index(np.array(ci)) for n in [1, 2, 5, len(ci)]: finder = oidx[np.random.randint(0, len(ci), size=n)] expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual) # see gh-17323 # # Even when indexer is equal to the # members in the index, we should # respect duplicates instead of taking # the fast-track path. for finder in [list("aabbca"), list("aababca")]: expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual)
def test_empty_prod():
    """Check ``prod`` over a categorical grouper with an unobserved category.

    https://github.com/pandas-dev/pandas/issues/18678
    """
    df = DataFrame(
        {
            "A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]),
            "B": [1, 2, 1],
        }
    )
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # (keyword arguments for prod, expected values per category)
    scenarios = [
        ({}, [2, 1, 1]),  # 1 by default
        ({"min_count": 0}, [2, 1, 1]),  # min_count=0
        ({"min_count": 1}, [2, 1, np.nan]),  # min_count=1
    ]
    for prod_kwargs, values in scenarios:
        result = df.groupby("A", observed=False).B.prod(**prod_kwargs)
        expected = Series(values, expected_idx, name="B")
        tm.assert_series_equal(result, expected)
def test_get_indexer_requires_unique(self): np.random.seed(123456789) ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) oidx = Index(np.array(ci)) msg = "Reindexing only valid with uniquely valued Index objects" for n in [1, 2, 5, len(ci)]: finder = oidx[np.random.randint(0, len(ci), size=n)] with pytest.raises(InvalidIndexError, match=msg): ci.get_indexer(finder) # see gh-17323 # # Even when indexer is equal to the # members in the index, we should # respect duplicates instead of taking # the fast-track path. for finder in [list("aabbca"), list("aababca")]: with pytest.raises(InvalidIndexError, match=msg): ci.get_indexer(finder)
# Compares repr(CategoricalIndex(Categorical(period_range(...)))) with literal
# expected strings: hourly ranges of 1, 2, 3 and 5 periods, the 5-period hourly
# range appended to itself, and a 5-period monthly range.
# NOTE(review): this function is collapsed onto two physical lines, so each `#`
# swallows the remainder of its line. The triple-quoted `exp` literals for the
# longer ranges presumably spanned several lines originally; their line breaks
# and indentation cannot be recovered from this text. The code is therefore
# left byte-identical -- TODO restore the layout from the original file instead
# of re-wrapping the literals by hand.
def test_categorical_index_repr_period(self): # test all length idx = period_range('2011-01-01 09:00', freq='H', periods=1) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp idx = period_range('2011-01-01 09:00', freq='H', periods=2) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp idx = period_range('2011-01-01 09:00', freq='H', periods=3) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp idx = period_range('2011-01-01 09:00', freq='H', periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp i = CategoricalIndex(Categorical(idx.append(idx))) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa assert repr(i) == exp idx = period_range('2011-01', freq='M', periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, 
dtype='category')""" # noqa assert repr(i) == exp
def test_reindex_dtype(self): c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(["a", "c"]) tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(["a", "c"]) exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
def test_reindex_dtype(self): # GH#11586 ci = CategoricalIndex(["a", "b", "c", "a"]) with tm.assert_produces_warning(FutureWarning, match="non-unique"): res, indexer = ci.reindex(["a", "c"]) tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) ci = CategoricalIndex(["a", "b", "c", "a"]) with tm.assert_produces_warning(FutureWarning, match="non-unique"): res, indexer = ci.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) ci = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) with tm.assert_produces_warning(FutureWarning, match="non-unique"): res, indexer = ci.reindex(["a", "c"]) exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) ci = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) with tm.assert_produces_warning(FutureWarning, match="non-unique"): res, indexer = ci.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
def test_groupby_sort_categorical_datetimelike(self): # GH10505 # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month df = DataFrame( { 'dt': [ datetime(2011, 7, 1), datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 2, 1), datetime(2011, 1, 1), datetime(2011, 5, 1) ], 'foo': [10, 8, 5, 6, 4, 1, 7], 'bar': [10, 20, 30, 40, 50, 60, 70] }, columns=['dt', 'foo', 'bar']) # ordered=True df['dt'] = Categorical(df['dt'], ordered=True) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1) ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) result_sort.index = CategoricalIndex(index, name='dt', ordered=True) index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1) ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=['foo', 'bar']) result_nosort.index = CategoricalIndex(index, categories=index, name='dt', ordered=True) col = 'dt' assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) # when categories is ordered, group is ordered by category's order assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) # ordered = False df['dt'] = Categorical(df['dt'], ordered=False) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1) ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) result_sort.index = CategoricalIndex(index, name='dt') index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1) ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=['foo', 'bar']) result_nosort.index = CategoricalIndex(index, categories=index, name='dt') col = 'dt' assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
def test_categorical_categories(self): # GH17884 c1 = CategoricalDtype(Categorical(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
def test_basic(): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical(x.person_name) g = x.groupby(['person_id'], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[['person_name']]) result = x.drop_duplicates('person_name') expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): return x.drop_duplicates('person_name').iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name='person_id') expected['person_name'] = expected['person_name'].astype('object') tm.assert_frame_equal(result, expected) # GH 9921 # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c, 
observed=False).transform(lambda xs: np.max(xs)), df[['a']]) # Filter tm.assert_series_equal( df.a.groupby(c, observed=False).filter(np.all), df['a']) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[['a']]) # GH 9603 df = DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) expected.index.name = 'a' tm.assert_series_equal(result, expected) # more basic levels = ['foo', 'bar', 'baz', 'qux'] codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) grouped = data.groupby(cats, observed=False) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) exp_cats = Categorical(ord_labels, ordered=True, categories=['foo', 'bar', 'baz', 'qux']) expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = 
CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
def test_constructor_interval_values_mismatched_dtype(self): dti = date_range("2016-01-01", periods=3) ii = IntervalIndex.from_breaks(dti) result = Index(ii, dtype="category") expected = CategoricalIndex(ii) tm.assert_index_equal(result, expected)
def test_constructor_period_values_mismatched_dtype(self): pi = period_range("2016-01-01", periods=3, freq="D") result = Index(pi, dtype="category") expected = CategoricalIndex(pi) tm.assert_index_equal(result, expected)
def test_get_indexer_same_categories_same_order(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) expected = np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected)
def test_construction(self): ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False) categories = ci.categories result = Index(ci) tm.assert_index_equal(result, ci, exact=True) assert not result.ordered result = Index(ci.values) tm.assert_index_equal(result, ci, exact=True) assert not result.ordered # empty result = CategoricalIndex(categories=categories) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) assert not result.ordered # passing categories result = CategoricalIndex(list("aabbca"), categories=categories) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal(result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")) c = Categorical(list("aabbca")) result = CategoricalIndex(c) tm.assert_index_equal(result.categories, Index(list("abc"))) tm.assert_numpy_array_equal(result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")) assert not result.ordered result = CategoricalIndex(c, categories=categories) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal(result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")) assert not result.ordered ci = CategoricalIndex(c, categories=list("abcd")) result = CategoricalIndex(ci) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal(result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8")) assert not result.ordered result = CategoricalIndex(ci, categories=list("ab")) tm.assert_index_equal(result.categories, Index(list("ab"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8")) assert not result.ordered result = CategoricalIndex(ci, categories=list("ab"), ordered=True) tm.assert_index_equal(result.categories, Index(list("ab"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8")) assert result.ordered result = CategoricalIndex(ci, categories=list("ab"), 
ordered=True) expected = CategoricalIndex(ci, categories=list("ab"), ordered=True, dtype="category") tm.assert_index_equal(result, expected, exact=True) # turn me to an Index result = Index(np.array(ci)) assert isinstance(result, Index) assert not isinstance(result, CategoricalIndex)
idx = index idx_non_unique = idx[[0, 0, 1, 2]] check_intersection_commutative(idx, idx_non_unique) assert idx.intersection(idx_non_unique).is_unique @pytest.mark.parametrize( "cls", [ Int64Index, Float64Index, DatetimeIndex, CategoricalIndex, lambda x: CategoricalIndex(x, categories=set(x)), TimedeltaIndex, lambda x: Index(x, dtype=object), UInt64Index, ], ) def test_union_duplicate_index_subsets_of_each_other(cls): # GH#31326 a = cls([1, 2, 2, 3]) b = cls([3, 3, 4]) expected = cls([1, 2, 2, 3, 3, 4]) if isinstance(a, CategoricalIndex): expected = Index([1, 2, 2, 3, 3, 4]) result = a.union(b) tm.assert_index_equal(result, expected) result = a.union(b, sort=False)
def test_take_fill_value(self): # GH 12631 # numeric category idx = CategoricalIndex([1, 2, 3], name="xxx") result = idx.take(np.array([1, 0, -1])) expected = CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = CategoricalIndex([2, 1, 3], name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # object category idx = CategoricalIndex(list("CBA"), categories=list("ABC"), ordered=True, name="xxx") result = idx.take(np.array([1, 0, -1])) expected = CategoricalIndex(list("BCA"), categories=list("ABC"), ordered=True, name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # fill_value result = idx.take(np.array([1, 0, -1]), fill_value=True) expected = CategoricalIndex(["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) # allow_fill=False result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True) expected = CategoricalIndex(list("BCA"), categories=list("ABC"), ordered=True, name="xxx") tm.assert_index_equal(result, expected) tm.assert_categorical_equal(result.values, expected.values) msg = ("When allow_fill=True and fill_value is not None, " "all indices must be >= -1") with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -2]), fill_value=True) with pytest.raises(ValueError, match=msg): idx.take(np.array([1, 0, -5]), fill_value=True) msg = "index -5 is out of bounds for (axis 0 with 
)?size 3" with pytest.raises(IndexError, match=msg): idx.take(np.array([1, -5]))
def test_constructor_timedelta64_values_mismatched_dtype(self): # check we don't silently ignore the dtype keyword tdi = timedelta_range("4 Days", periods=5) result = Index(tdi, dtype="category") expected = CategoricalIndex(tdi) tm.assert_index_equal(result, expected)
def test_get_loc_unique(self): cidx = CategoricalIndex(list("abc")) result = cidx.get_loc("b") assert result == 1
def test_constructor_categorical_to_object(self): # GH#32167 Categorical data and dtype=object should return object-dtype ci = CategoricalIndex(range(5)) result = Index(ci, dtype=object) assert not isinstance(result, CategoricalIndex)
def test_get_loc_monotonic_nonunique(self): cidx = CategoricalIndex(list("abbc")) result = cidx.get_loc("b") expected = slice(1, 3, None) assert result == expected
# Compares repr(CategoricalIndex(...)) with literal expected strings for ASCII
# and Japanese labels in four shapes each (short, 10x repeated, 100x repeated,
# and 15 distinct labels), then repeats the Japanese cases inside
# cf.option_context("display.unicode.east_asian_width", True).
# NOTE(review): this function is collapsed onto two physical lines, so each `#`
# swallows the remainder of its line. The triple-quoted `expected` literals for
# the longer indexes presumably spanned several lines originally; their line
# breaks and indentation cannot be recovered from this text. The code is
# therefore left byte-identical -- TODO restore the layout from the original
# file instead of re-wrapping the literals by hand.
def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines idx = CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" assert repr(idx) == expected # truncated idx = CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa assert repr(idx) == expected # larger categories idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" assert repr(idx) == expected # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa assert repr(idx) == expected # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # Emable Unicode option ----------------------------------------- with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" assert repr(idx) == expected # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa assert repr(idx) == expected # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa assert repr(idx) == expected
def test_get_loc_nonmonotonic_nonunique(self): cidx = CategoricalIndex(list("abcb")) result = cidx.get_loc("b") expected = np.array([False, True, False, True], dtype=bool) tm.assert_numpy_array_equal(result, expected)
def test_describe(self): # string type desc = self.factor.describe() assert self.factor.ordered exp_index = CategoricalIndex(["a", "b", "c"], name="categories", ordered=self.factor.ordered) expected = DataFrame( { "counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0] }, index=exp_index) tm.assert_frame_equal(desc, expected) # check unused categories cat = self.factor.copy() cat.set_categories(["a", "b", "c", "d"], inplace=True) desc = cat.describe() exp_index = CategoricalIndex(list("abcd"), ordered=self.factor.ordered, name="categories") expected = DataFrame( { "counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0] }, index=exp_index, ) tm.assert_frame_equal(desc, expected) # check an integer one cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) desc = cat.describe() exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories") expected = DataFrame( { "counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0] }, index=exp_index, ) tm.assert_frame_equal(desc, expected) # https://github.com/pandas-dev/pandas/issues/3678 # describe should work with NaN cat = Categorical([np.nan, 1, 2, 2]) desc = cat.describe() expected = DataFrame( { "counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0] }, index=CategoricalIndex([1, 2, np.nan], categories=[1, 2], name="categories"), ) tm.assert_frame_equal(desc, expected)
def test_contains_nan(self): ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef")) assert np.nan in ci
def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes index = CategoricalIndex(['a', 'b', 'c']) df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) # unique slice res = df.loc[['a', 'b']] exp_index = CategoricalIndex(['a', 'b'], categories=index.categories) exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp_index = CategoricalIndex(['a', 'a', 'b'], categories=index.categories) exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assert_raises_regex( KeyError, 'a list-indexer must only include values that are ' 'in the categories'): df.loc[['a', 'x']] # duplicated categories and codes index = CategoricalIndex(['a', 'b', 'a']) df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) # unique slice res = df.loc[['a', 'b']] exp = DataFrame({ 'A': [1, 3, 2], 'B': [4, 6, 5] }, index=CategoricalIndex(['a', 'a', 'b'])) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp = DataFrame({ 'A': [1, 3, 1, 3, 2], 'B': [4, 6, 4, 6, 5] }, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assert_raises_regex( KeyError, 'a list-indexer must only include values ' 'that are in the categories'): df.loc[['a', 'x']] # contains unused category index = CategoricalIndex(['a', 'b', 'a', 'c'], categories=list('abcde')) df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index) res = df.loc[['a', 'b']] exp = DataFrame({ 'A': [1, 3, 2], 'B': [5, 7, 6] }, index=CategoricalIndex(['a', 'a', 'b'], categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) res = df.loc[['a', 'e']] exp = DataFrame({ 'A': [1, 3, np.nan], 'B': [5, 7, np.nan] }, index=CategoricalIndex(['a', 'a', 'e'], categories=list('abcde'))) 
tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp = DataFrame({ 'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6] }, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'], categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) with tm.assert_raises_regex( KeyError, 'a list-indexer must only include values ' 'that are in the categories'): df.loc[['a', 'x']]
def test_contains_interval(self, item, expected): # GH 23705 ci = CategoricalIndex(IntervalIndex.from_breaks(range(3))) result = item in ci assert result is expected
def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes index = CategoricalIndex(["a", "b", "c"]) df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice res = df.loc[["a", "b"]] exp_index = CategoricalIndex(["a", "b"], categories=index.categories) exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[["a", "a", "b"]] exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories) exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] # duplicated categories and codes index = CategoricalIndex(["a", "b", "a"]) df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice res = df.loc[["a", "b"]] exp = DataFrame( {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"]) ) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[["a", "a", "b"]] exp = DataFrame( {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]}, index=CategoricalIndex(["a", "a", "a", "a", "b"]), ) tm.assert_frame_equal(res, exp, check_index_type=True) msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] # contains unused category index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) res = df.loc[["a", "b"]] exp = DataFrame( {"A": [1, 3, 2], "B": [5, 7, 6]}, index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")), ) tm.assert_frame_equal(res, exp, check_index_type=True) res = df.loc[["a", "e"]] exp = DataFrame( {"A": [1, 3, np.nan], "B": [5, 7, np.nan]}, index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), ) 
tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[["a", "a", "b"]] exp = DataFrame( {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]}, index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")), ) tm.assert_frame_equal(res, exp, check_index_type=True) msg = "a list-indexer must only include values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]]
def test_construction_empty_with_bool_categories(self): # see GH#22702 cat = CategoricalIndex([], categories=[True, False]) categories = sorted(cat.categories.tolist()) assert categories == [False, True]