def test_get_indexer_same_categories_same_order(self): ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) result = ci.get_indexer(CategoricalIndex(['b', 'b'], categories=['a', 'b'])) expected = np.array([1, 1], dtype='intp') tm.assert_numpy_array_equal(result, expected)
def test_get_indexer_same_categories_different_order(self): # https://github.com/pandas-dev/pandas/issues/19551 ci = CategoricalIndex(['a', 'b'], categories=['a', 'b']) result = ci.get_indexer(CategoricalIndex(['b', 'b'], categories=['b', 'a'])) expected = np.array([1, 1], dtype='intp') tm.assert_numpy_array_equal(result, expected)
def test_constructor_with_index(self): ci = CategoricalIndex(list('aabbca'), categories=list('cab')) tm.assert_categorical_equal(ci.values, Categorical(ci)) ci = CategoricalIndex(list('aabbca'), categories=list('cab')) tm.assert_categorical_equal(ci.values, Categorical(ci.astype(object), categories=ci.categories))
def test_get_indexer_array(self): arr = np.array([Timestamp('1999-12-31 00:00:00'), Timestamp('2000-12-31 00:00:00')], dtype=object) cats = [Timestamp('1999-12-31 00:00:00'), Timestamp('2000-12-31 00:00:00')] ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype='category') result = ci.get_indexer(arr) expected = np.array([0, 1], dtype='intp') tm.assert_numpy_array_equal(result, expected)
def test_get_indexer_same_categories_same_order(self): ci = CategoricalIndex(["a", "b"], categories=["a", "b"]) result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"])) expected = np.array([1, 1], dtype="intp") tm.assert_numpy_array_equal(result, expected)
def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) # gh-10132 (back-compat) # gh-8138 (back-compat) # gh-8869 cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) df["C"] = ["foo", "bar"] * 2 # multiple groupers with a non-cat gb = df.groupby(["A", "B", "C"], observed=observed) exp_index = MultiIndex.from_arrays([cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]) expected = DataFrame({ "values": Series([1, 2, 3, 4], index=exp_index) }).sort_index() result = gb.sum() if not observed: expected = cartesian_product_for_groupers(expected, [cat1, cat2, ["foo", "bar"]], list("ABC")) tm.assert_frame_equal(result, expected) gb = df.groupby(["A", "B"], observed=observed) exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"]) expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) tm.assert_frame_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/8138 d = { "cat": Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True), "ints": [1, 1, 2, 2], "val": [10, 20, 30, 40], } df = DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() exp_index = CategoricalIndex(list("ab"), name="cat", categories=list("abc"), ordered=True) expected = DataFrame({ "ints": [1.5, 1.5], "val": [20.0, 30] }, index=exp_index) if not observed: index = CategoricalIndex(list("abc"), name="cat", categories=list("abc"), ordered=True) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) # Grouping on two columns groups_double_key = df.groupby(["cat", "ints"], observed=observed) result = groups_double_key.agg("mean") expected = DataFrame({ "val": [10, 30, 20, 40], "cat": Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True), "ints": [1, 2, 1, 2], }).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers(expected, [df.cat.values, [1, 2]], ["cat", "ints"]) tm.assert_frame_equal(result, expected) # GH 10132 for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]: c, i = key result = groups_double_key.get_group(key) expected = df[(df.cat == c) & (df.ints == i)] assert_frame_equal(result, expected) # gh-8869 # with as_index d = { "foo": [10, 8, 4, 8, 4, 1, 1], "bar": [10, 20, 30, 40, 50, 60, 70], "baz": ["d", "c", "e", "a", "a", "d", "c"], } df = DataFrame(d) cat = pd.cut(df["foo"], np.linspace(0, 10, 3)) df["range"] = cat groups = df.groupby(["range", "baz"], as_index=False, observed=observed) result = groups.agg("mean") groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed) expected = groups2.agg("mean").reset_index() tm.assert_frame_equal(result, expected)
def test_sort_datetimelike(): # GH10505 # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month df = DataFrame( { "dt": [ datetime(2011, 7, 1), datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 2, 1), datetime(2011, 1, 1), datetime(2011, 5, 1), ], "foo": [10, 8, 5, 6, 4, 1, 7], "bar": [10, 20, 30, 40, 50, 60, 70], }, columns=["dt", "foo", "bar"], ) # ordered=True df["dt"] = Categorical(df["dt"], ordered=True) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1), ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]) result_sort.index = CategoricalIndex(index, name="dt", ordered=True) index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1), ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]) result_nosort.index = CategoricalIndex(index, categories=index, name="dt", ordered=True) col = "dt" assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) # when categories is ordered, group is ordered by category's order assert_frame_equal(result_sort, df.groupby(col, sort=False, observed=False).first()) # ordered = False df["dt"] = Categorical(df["dt"], ordered=False) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1), ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=["foo", "bar"]) result_sort.index = CategoricalIndex(index, name="dt") index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1), ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=["foo", "bar"]) result_nosort.index = CategoricalIndex(index, categories=index, name="dt") col = "dt" assert_frame_equal(result_sort, df.groupby(col, sort=True, observed=False).first()) assert_frame_equal(result_nosort, df.groupby(col, sort=False, observed=False).first())
def test_reindex_empty_index(self): # See GH16770 c = CategoricalIndex([]) res, indexer = c.reindex(["a", "b"]) tm.assert_index_equal(res, Index(["a", "b"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([-1, -1], dtype=np.intp))
def test_categorial_datetimelike(self, method): i = CategoricalIndex([Timestamp("1999-12-31"), Timestamp("2000-12-31")]) result = method(i)[0] assert isinstance(result, Timestamp)
def test_constructor_timedelta64_values_mismatched_dtype(self): # check we don't silently ignore the dtype keyword tdi = timedelta_range("4 Days", periods=5) result = Index(tdi, dtype="category") expected = CategoricalIndex(tdi) tm.assert_index_equal(result, expected)
def test_constructor_categorical_to_object(self): # GH#32167 Categorical data and dtype=object should return object-dtype ci = CategoricalIndex(range(5)) result = Index(ci, dtype=object) assert not isinstance(result, CategoricalIndex)
def test_getitem_bool_mask_categorical_index(self): df3 = DataFrame( { "A": np.arange(6, dtype="int64"), }, index=CategoricalIndex( [1, 1, 2, 1, 3, 2], dtype=CategoricalDtype([3, 2, 1], ordered=True), name="B", ), ) df4 = DataFrame( { "A": np.arange(6, dtype="int64"), }, index=CategoricalIndex( [1, 1, 2, 1, 3, 2], dtype=CategoricalDtype([3, 2, 1], ordered=False), name="B", ), ) result = df3[df3.index == "a"] expected = df3.iloc[[]] tm.assert_frame_equal(result, expected) result = df4[df4.index == "a"] expected = df4.iloc[[]] tm.assert_frame_equal(result, expected) result = df3[df3.index == 1] expected = df3.iloc[[0, 1, 3]] tm.assert_frame_equal(result, expected) result = df4[df4.index == 1] expected = df4.iloc[[0, 1, 3]] tm.assert_frame_equal(result, expected) # since we have an ordered categorical # CategoricalIndex([1, 1, 2, 1, 3, 2], # categories=[3, 2, 1], # ordered=True, # name='B') result = df3[df3.index < 2] expected = df3.iloc[[4]] tm.assert_frame_equal(result, expected) result = df3[df3.index > 1] expected = df3.iloc[[]] tm.assert_frame_equal(result, expected) # unordered # cannot be compared # CategoricalIndex([1, 1, 2, 1, 3, 2], # categories=[3, 2, 1], # ordered=False, # name='B') msg = "Unordered Categoricals can only compare equality or not" with pytest.raises(TypeError, match=msg): df4[df4.index < 2] with pytest.raises(TypeError, match=msg): df4[df4.index > 1]
def test_format_different_scalar_lengths(self): # GH#35439 idx = CategoricalIndex(["aaaaaaaaa", "b"]) expected = ["aaaaaaaaa", "b"] assert idx.format() == expected
def test_invalid_complib(setup_path): df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")) with tm.ensure_clean(setup_path) as path: msg = r"complib only supports \[.*\] compression." with pytest.raises(ValueError, match=msg): df.to_hdf(path, "df", complib="foolib") @pytest.mark.parametrize( "idx", [ date_range("2019", freq="D", periods=3, tz="UTC"), CategoricalIndex(list("abc")), ], ) def test_to_hdf_multiindex_extension_dtype(idx, setup_path): # GH 7775 mi = MultiIndex.from_arrays([idx, idx]) df = DataFrame(0, index=mi, columns=["a"]) with ensure_clean_path(setup_path) as path: with pytest.raises(NotImplementedError, match="Saving a MultiIndex"): df.to_hdf(path, "df") def test_unsuppored_hdf_file_error(datapath): # GH 9539 data_path = datapath("io", "data", "legacy_hdf/incompatible_dataset.h5") message = (r"Dataset\(s\) incompatible with Pandas data types, "
def test_equals_categorical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) ci2 = CategoricalIndex(["a", "b"], categories=["a", "b", "c"], ordered=True) assert ci1.equals(ci1) assert not ci1.equals(ci2) assert ci1.equals(ci1.astype(object)) assert ci1.astype(object).equals(ci1) assert (ci1 == ci1).all() assert not (ci1 != ci1).all() assert not (ci1 > ci1).all() assert not (ci1 < ci1).all() assert (ci1 <= ci1).all() assert (ci1 >= ci1).all() assert not (ci1 == 1).all() assert (ci1 == Index(["a", "b"])).all() assert (ci1 == ci1.values).all() # invalid comparisons with pytest.raises(ValueError, match="Lengths must match"): ci1 == Index(["a", "b", "c"]) msg = "Categoricals can only be compared if 'categories' are the same" with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, ordered=False) with pytest.raises(TypeError, match=msg): ci1 == Categorical(ci1.values, categories=list("abc")) # tests # make sure that we are testing for category inclusion properly ci = CategoricalIndex(list("aabca"), categories=["c", "a", "b"]) assert not ci.equals(list("aabca")) # Same categories, but different order # Unordered assert ci.equals(CategoricalIndex(list("aabca"))) # Ordered assert not ci.equals(CategoricalIndex(list("aabca"), ordered=True)) assert ci.equals(ci.copy()) ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) assert not ci.equals(list("aabca")) assert not ci.equals(CategoricalIndex(list("aabca"))) assert ci.equals(ci.copy()) ci = CategoricalIndex(list("aabca") + [np.nan], categories=["c", "a", "b"]) assert not ci.equals(list("aabca") + [np.nan]) assert ci.equals(CategoricalIndex(list("aabca") + [np.nan])) assert not ci.equals( CategoricalIndex(list("aabca") + [np.nan], ordered=True)) assert ci.equals(ci.copy())
def test_reindex_with_categoricalindex(self): df = DataFrame( { "A": np.arange(3, dtype="int64"), }, index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), ) # reindexing # convert to a regular index result = df.reindex(["a", "b", "e"]) expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( "B" ) tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["a", "b"]) expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) # since we are actually reindexing with a Categorical # then return a Categorical cats = list("cabe") result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["a", "b", "e"]) expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( "B" ) tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["a", "b"]) expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) df2 = DataFrame( { "A": np.arange(6, dtype="int64"), }, index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), ) # passed duplicate indexers are not allowed msg = "cannot reindex on an axis with duplicate labels" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match="non-unique"): df2.reindex(["a", "b"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): df.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): df.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): df.reindex(["a"], limit=2)
class TestDataFrameSelectReindex: # These are specific reindex-based tests; other indexing tests should go in # test_indexing def test_reindex_copies(self): # based on asv time_reindex_axis1 N = 10 df = DataFrame(np.random.randn(N * 10, N)) cols = np.arange(N) np.random.shuffle(cols) result = df.reindex(columns=cols, copy=True) assert not np.shares_memory(result[0]._values, df[0]._values) # pass both columns and index result2 = df.reindex(columns=cols, index=df.index, copy=True) assert not np.shares_memory(result2[0]._values, df[0]._values) def test_reindex_date_fill_value(self): # passing date to dt64 is deprecated arr = date_range("2016-01-01", periods=6).values.reshape(3, 2) df = DataFrame(arr, columns=["A", "B"], index=range(3)) ts = df.iloc[0, 0] fv = ts.date() with tm.assert_produces_warning(FutureWarning): res = df.reindex(index=range(4), columns=["A", "B", "C"], fill_value=fv) expected = DataFrame( {"A": df["A"].tolist() + [ts], "B": df["B"].tolist() + [ts], "C": [ts] * 4} ) tm.assert_frame_equal(res, expected) # same with a datetime-castable str res = df.reindex( index=range(4), columns=["A", "B", "C"], fill_value="2016-01-01" ) tm.assert_frame_equal(res, expected) def test_reindex_with_multi_index(self): # https://github.com/pandas-dev/pandas/issues/29896 # tests for reindexing a multi-indexed DataFrame with a new MultiIndex # # confirms that we can reindex a multi-indexed DataFrame with a new # MultiIndex object correctly when using no filling, backfilling, and # padding # # The DataFrame, `df`, used in this test is: # c # a b # -1 0 A # 1 B # 2 C # 3 D # 4 E # 5 F # 6 G # 0 0 A # 1 B # 2 C # 3 D # 4 E # 5 F # 6 G # 1 0 A # 1 B # 2 C # 3 D # 4 E # 5 F # 6 G # # and the other MultiIndex, `new_multi_index`, is: # 0: 0 0.5 # 1: 2.0 # 2: 5.0 # 3: 5.8 df = DataFrame( { "a": [-1] * 7 + [0] * 7 + [1] * 7, "b": list(range(7)) * 3, "c": ["A", "B", "C", "D", "E", "F", "G"] * 3, } ).set_index(["a", "b"]) new_index = [0.5, 2.0, 5.0, 5.8] new_multi_index = MultiIndex.from_product([[0], new_index], names=["a", "b"]) # reindexing w/o a `method` value reindexed = df.reindex(new_multi_index) expected = DataFrame( {"a": [0] * 4, "b": new_index, "c": [np.nan, "C", "F", np.nan]} ).set_index(["a", "b"]) tm.assert_frame_equal(expected, reindexed) # reindexing with backfilling expected = DataFrame( {"a": [0] * 4, "b": new_index, "c": ["B", "C", "F", "G"]} ).set_index(["a", "b"]) reindexed_with_backfilling = df.reindex(new_multi_index, method="bfill") tm.assert_frame_equal(expected, reindexed_with_backfilling) reindexed_with_backfilling = df.reindex(new_multi_index, method="backfill") tm.assert_frame_equal(expected, reindexed_with_backfilling) # reindexing with padding expected = DataFrame( {"a": [0] * 4, "b": new_index, "c": ["A", "C", "F", "F"]} ).set_index(["a", "b"]) reindexed_with_padding = df.reindex(new_multi_index, method="pad") tm.assert_frame_equal(expected, reindexed_with_padding) reindexed_with_padding = df.reindex(new_multi_index, method="ffill") tm.assert_frame_equal(expected, reindexed_with_padding) @pytest.mark.parametrize( "method,expected_values", [ ("nearest", [0, 1, 1, 2]), ("pad", [np.nan, 0, 1, 1]), ("backfill", [0, 1, 2, 2]), ], ) def test_reindex_methods(self, method, expected_values): df = DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) expected = DataFrame({"x": expected_values}, index=target) actual = df.reindex(target, method=method) tm.assert_frame_equal(expected, actual) actual = df.reindex(target, method=method, tolerance=1) tm.assert_frame_equal(expected, actual) actual = df.reindex(target, method=method, tolerance=[1, 1, 1, 1]) tm.assert_frame_equal(expected, actual) e2 = expected[::-1] actual = df.reindex(target[::-1], method=method) tm.assert_frame_equal(e2, actual) new_order = [3, 0, 2, 1] e2 = expected.iloc[new_order] actual = df.reindex(target[new_order], method=method) tm.assert_frame_equal(e2, actual) switched_method = ( "pad" if method == "backfill" else "backfill" if method == "pad" else method ) actual = df[::-1].reindex(target, method=switched_method) tm.assert_frame_equal(expected, actual) def test_reindex_methods_nearest_special(self): df = DataFrame({"x": list(range(5))}) target = np.array([-0.1, 0.9, 1.1, 1.5]) expected = DataFrame({"x": [0, 1, 1, np.nan]}, index=target) actual = df.reindex(target, method="nearest", tolerance=0.2) tm.assert_frame_equal(expected, actual) expected = DataFrame({"x": [0, np.nan, 1, np.nan]}, index=target) actual = df.reindex(target, method="nearest", tolerance=[0.5, 0.01, 0.4, 0.1]) tm.assert_frame_equal(expected, actual) def test_reindex_nearest_tz(self, tz_aware_fixture): # GH26683 tz = tz_aware_fixture idx = date_range("2019-01-01", periods=5, tz=tz) df = DataFrame({"x": list(range(5))}, index=idx) expected = df.head(3) actual = df.reindex(idx[:3], method="nearest") tm.assert_frame_equal(expected, actual) def test_reindex_nearest_tz_empty_frame(self): # https://github.com/pandas-dev/pandas/issues/31964 dti = pd.DatetimeIndex(["2016-06-26 14:27:26+00:00"]) df = DataFrame(index=pd.DatetimeIndex(["2016-07-04 14:00:59+00:00"])) expected = DataFrame(index=dti) result = df.reindex(dti, method="nearest") tm.assert_frame_equal(result, expected) def test_reindex_frame_add_nat(self): rng = date_range("1/1/2000 00:00:00", periods=10, freq="10s") df = DataFrame({"A": np.random.randn(len(rng)), "B": rng}) result = df.reindex(range(15)) assert np.issubdtype(result["B"].dtype, np.dtype("M8[ns]")) mask = com.isna(result)["B"] assert mask[-5:].all() assert not mask[:-5].any() @pytest.mark.parametrize( "method, exp_values", [("ffill", [0, 1, 2, 3]), ("bfill", [1.0, 2.0, 3.0, np.nan])], ) def test_reindex_frame_tz_ffill_bfill(self, frame_or_series, method, exp_values): # GH#38566 obj = frame_or_series( [0, 1, 2, 3], index=date_range("2020-01-01 00:00:00", periods=4, freq="H", tz="UTC"), ) new_index = date_range("2020-01-01 00:01:00", periods=4, freq="H", tz="UTC") result = obj.reindex(new_index, method=method, tolerance=pd.Timedelta("1 hour")) expected = frame_or_series(exp_values, index=new_index) tm.assert_equal(result, expected) def test_reindex_limit(self): # GH 28631 data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] exp_data = [ ["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"], ["D", "D", "D"], [np.nan, np.nan, np.nan], ] df = DataFrame(data) result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) expected = DataFrame(exp_data) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "idx, check_index_type", [ [["C", "B", "A"], True], [["F", "C", "A", "D"], True], [["A"], True], [["A", "B", "C"], True], [["C", "A", "B"], True], [["C", "B"], True], [["C", "A"], True], [["A", "B"], True], [["B", "A", "C"], True], # reindex by these causes different MultiIndex levels [["D", "F"], False], [["A", "C", "B"], False], ], ) def test_reindex_level_verify_first_level(self, idx, check_index_type): df = DataFrame( { "jim": list("B" * 4 + "A" * 2 + "C" * 3), "joe": list("abcdeabcd")[::-1], "jolie": [10, 20, 30] * 3, "joline": np.random.randint(0, 1000, 9), } ) icol = ["jim", "joe", "jolie"] def f(val): return np.nonzero((df["jim"] == val).to_numpy())[0] i = np.concatenate(list(map(f, idx))) left = df.set_index(icol).reindex(idx, level="jim") right = df.iloc[i].set_index(icol) tm.assert_frame_equal(left, right, check_index_type=check_index_type) @pytest.mark.parametrize( "idx", [ ("mid",), ("mid", "btm"), ("mid", "btm", "top"), ("mid",), ("mid", "top"), ("mid", "top", "btm"), ("btm",), ("btm", "mid"), ("btm", "mid", "top"), ("btm",), ("btm", "top"), ("btm", "top", "mid"), ("top",), ("top", "mid"), ("top", "mid", "btm"), ("top",), ("top", "btm"), ("top", "btm", "mid"), ], ) def test_reindex_level_verify_first_level_repeats(self, idx): df = DataFrame( { "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, "joe": ["3rd"] * 2 + ["1st"] * 3 + ["2nd"] * 3 + ["1st"] * 2 + ["3rd"] * 3 + ["1st"] * 2 + ["3rd"] * 3 + ["2nd"] * 2, # this needs to be jointly unique with jim and joe or # reindexing will fail ~1.5% of the time, this works # out to needing unique groups of same size as joe "jolie": np.concatenate( [ np.random.choice(1000, x, replace=False) for x in [2, 3, 3, 2, 3, 2, 3, 2] ] ), "joline": np.random.randn(20).round(3) * 10, } ) icol = ["jim", "joe", "jolie"] def f(val): return np.nonzero((df["jim"] == val).to_numpy())[0] i = np.concatenate(list(map(f, idx))) left = df.set_index(icol).reindex(idx, level="jim") right = df.iloc[i].set_index(icol) tm.assert_frame_equal(left, right) @pytest.mark.parametrize( "idx, indexer", [ [ ["1st", "2nd", "3rd"], [2, 3, 4, 0, 1, 8, 9, 5, 6, 7, 10, 11, 12, 13, 14, 18, 19, 15, 16, 17], ], [ ["3rd", "2nd", "1st"], [0, 1, 2, 3, 4, 10, 11, 12, 5, 6, 7, 8, 9, 15, 16, 17, 18, 19, 13, 14], ], [["2nd", "3rd"], [0, 1, 5, 6, 7, 10, 11, 12, 18, 19, 15, 16, 17]], [["3rd", "1st"], [0, 1, 2, 3, 4, 10, 11, 12, 8, 9, 15, 16, 17, 13, 14]], ], ) def test_reindex_level_verify_repeats(self, idx, indexer): df = DataFrame( { "jim": ["mid"] * 5 + ["btm"] * 8 + ["top"] * 7, "joe": ["3rd"] * 2 + ["1st"] * 3 + ["2nd"] * 3 + ["1st"] * 2 + ["3rd"] * 3 + ["1st"] * 2 + ["3rd"] * 3 + ["2nd"] * 2, # this needs to be jointly unique with jim and joe or # reindexing will fail ~1.5% of the time, this works # out to needing unique groups of same size as joe "jolie": np.concatenate( [ np.random.choice(1000, x, replace=False) for x in [2, 3, 3, 2, 3, 2, 3, 2] ] ), "joline": np.random.randn(20).round(3) * 10, } ) icol = ["jim", "joe", "jolie"] left = df.set_index(icol).reindex(idx, level="joe") right = df.iloc[indexer].set_index(icol) tm.assert_frame_equal(left, right) @pytest.mark.parametrize( "idx, indexer, check_index_type", [ [list("abcde"), [3, 2, 1, 0, 5, 4, 8, 7, 6], True], [list("abcd"), [3, 2, 1, 0, 5, 8, 7, 6], True], [list("abc"), [3, 2, 1, 8, 7, 6], True], [list("eca"), [1, 3, 4, 6, 8], True], [list("edc"), [0, 1, 4, 5, 6], True], [list("eadbc"), [3, 0, 2, 1, 4, 5, 8, 7, 6], True], [list("edwq"), [0, 4, 5], True], [list("wq"), [], False], ], ) def test_reindex_level_verify(self, idx, indexer, check_index_type): df = DataFrame( { "jim": list("B" * 4 + "A" * 2 + "C" * 3), "joe": list("abcdeabcd")[::-1], "jolie": [10, 20, 30] * 3, "joline": np.random.randint(0, 1000, 9), } ) icol = ["jim", "joe", "jolie"] left = df.set_index(icol).reindex(idx, level="joe") right = df.iloc[indexer].set_index(icol) tm.assert_frame_equal(left, right, check_index_type=check_index_type) def test_non_monotonic_reindex_methods(self): dr = date_range("2013-08-01", periods=6, freq="B") data = np.random.randn(6, 1) df = DataFrame(data, index=dr, columns=list("A")) df_rev = DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], columns=list("A")) # index is not monotonic increasing or decreasing msg = "index must be monotonic increasing or decreasing" with pytest.raises(ValueError, match=msg): df_rev.reindex(df.index, method="pad") with pytest.raises(ValueError, match=msg): df_rev.reindex(df.index, method="ffill") with pytest.raises(ValueError, match=msg): df_rev.reindex(df.index, method="bfill") with pytest.raises(ValueError, match=msg): df_rev.reindex(df.index, method="nearest") def test_reindex_sparse(self): # https://github.com/pandas-dev/pandas/issues/35286 df = DataFrame( {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} ) result = df.reindex([0, 2]) expected = DataFrame( { "A": [0.0, np.nan], "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), }, index=[0, 2], ) tm.assert_frame_equal(result, expected) def test_reindex(self, float_frame): datetime_series = tm.makeTimeSeries(nper=30) newFrame = float_frame.reindex(datetime_series.index) for col in newFrame.columns: for idx, val in newFrame[col].items(): if idx in float_frame.index: if np.isnan(val): assert np.isnan(float_frame[col][idx]) else: assert val == float_frame[col][idx] else: assert np.isnan(val) for col, series in newFrame.items(): assert tm.equalContents(series.index, newFrame.index) emptyFrame = float_frame.reindex(Index([])) assert len(emptyFrame.index) == 0 # Cython code should be unit-tested directly nonContigFrame = float_frame.reindex(datetime_series.index[::2]) for col in nonContigFrame.columns: for idx, val in nonContigFrame[col].items(): if idx in float_frame.index: if np.isnan(val): assert np.isnan(float_frame[col][idx]) else: assert val == float_frame[col][idx] else: assert np.isnan(val) for col, series in nonContigFrame.items(): assert tm.equalContents(series.index, nonContigFrame.index) # corner cases # Same index, copies values but not index if copy=False newFrame = float_frame.reindex(float_frame.index, copy=False) assert newFrame.index is float_frame.index # length zero newFrame = float_frame.reindex([]) assert newFrame.empty assert len(newFrame.columns) == len(float_frame.columns) # length zero with columns reindexed with non-empty index newFrame = float_frame.reindex([]) newFrame = newFrame.reindex(float_frame.index) assert len(newFrame.index) == len(float_frame.index) assert len(newFrame.columns) == len(float_frame.columns) # pass non-Index newFrame = float_frame.reindex(list(datetime_series.index)) expected = datetime_series.index._with_freq(None) tm.assert_index_equal(newFrame.index, expected) # copy with no axes result = float_frame.reindex() tm.assert_frame_equal(result, float_frame) assert result is not float_frame def test_reindex_nan(self): df = DataFrame( [[1, 2], [3, 5], [7, 11], [9, 23]], index=[2, np.nan, 1, 5], columns=["joe", "jim"], ) i, j = [np.nan, 5, 5, np.nan, 1, 2, np.nan], [1, 3, 3, 1, 2, 0, 1] tm.assert_frame_equal(df.reindex(i), df.iloc[j]) df.index = df.index.astype("object") tm.assert_frame_equal(df.reindex(i), df.iloc[j], check_index_type=False) # GH10388 df = DataFrame( { "other": ["a", "b", np.nan, "c"], "date": ["2015-03-22", np.nan, "2012-01-08", np.nan], "amount": [2, 3, 4, 5], } ) df["date"] = pd.to_datetime(df.date) df["delta"] = (pd.to_datetime("2015-06-18") - df["date"]).shift(1) left = df.set_index(["delta", "other", "date"]).reset_index() right = df.reindex(columns=["delta", "other", "date", "amount"]) tm.assert_frame_equal(left, right) def test_reindex_name_remains(self): s = Series(np.random.rand(10)) df = DataFrame(s, index=np.arange(len(s))) i = Series(np.arange(10), name="iname") df = df.reindex(i) assert df.index.name == "iname" df = df.reindex(Index(np.arange(10), name="tmpname")) assert df.index.name == "tmpname" s = Series(np.random.rand(10)) df = DataFrame(s.T, index=np.arange(len(s))) i = Series(np.arange(10), name="iname") df = df.reindex(columns=i) assert df.columns.name == "iname" def test_reindex_int(self, int_frame): smaller = int_frame.reindex(int_frame.index[::2]) assert smaller["A"].dtype == np.int64 bigger = smaller.reindex(int_frame.index) assert bigger["A"].dtype == np.float64 smaller = int_frame.reindex(columns=["A", "B"]) assert smaller["A"].dtype == np.int64 def test_reindex_columns(self, float_frame): new_frame = float_frame.reindex(columns=["A", "B", "E"]) tm.assert_series_equal(new_frame["B"], float_frame["B"]) assert np.isnan(new_frame["E"]).all() assert "C" not in new_frame # Length zero new_frame = float_frame.reindex(columns=[]) assert new_frame.empty def test_reindex_columns_method(self): # GH 14992, reindexing over columns ignored method df = DataFrame( data=[[11, 12, 13], [21, 22, 23], [31, 32, 33]], index=[1, 2, 4], columns=[1, 2, 4], dtype=float, ) # default method result = df.reindex(columns=range(6)) expected = DataFrame( data=[ [np.nan, 11, 12, np.nan, 13, np.nan], [np.nan, 21, 22, np.nan, 23, np.nan], [np.nan, 31, 32, np.nan, 33, np.nan], ], index=[1, 2, 4], columns=range(6), dtype=float, ) tm.assert_frame_equal(result, expected) # method='ffill' result = df.reindex(columns=range(6), method="ffill") expected = DataFrame( data=[ [np.nan, 11, 12, 12, 13, 13], [np.nan, 21, 22, 22, 23, 23], [np.nan, 31, 32, 32, 33, 33], ], index=[1, 2, 4], columns=range(6), dtype=float, ) tm.assert_frame_equal(result, expected) # method='bfill' result = df.reindex(columns=range(6), method="bfill") expected = DataFrame( data=[ [11, 11, 12, 13, 13, np.nan], [21, 21, 22, 23, 23, np.nan], [31, 31, 32, 33, 33, np.nan], ], index=[1, 2, 4], columns=range(6), dtype=float, ) tm.assert_frame_equal(result, expected) def test_reindex_axes(self): # GH 3317, reindexing by both axes loses freq of the index df = DataFrame( np.ones((3, 3)), index=[datetime(2012, 1, 1), datetime(2012, 1, 2), datetime(2012, 1, 3)], columns=["a", "b", "c"], ) time_freq = date_range("2012-01-01", "2012-01-03", freq="d") some_cols = ["a", "b"] index_freq = df.reindex(index=time_freq).index.freq both_freq = df.reindex(index=time_freq, columns=some_cols).index.freq seq_freq = df.reindex(index=time_freq).reindex(columns=some_cols).index.freq assert index_freq == both_freq assert index_freq == seq_freq def test_reindex_fill_value(self): df = DataFrame(np.random.randn(10, 4)) # axis=0 result = df.reindex(list(range(15))) assert np.isnan(result.values[-5:]).all() result = df.reindex(range(15), fill_value=0) expected = df.reindex(range(15)).fillna(0) tm.assert_frame_equal(result, expected) # axis=1 result = df.reindex(columns=range(5), fill_value=0.0) expected = df.copy() expected[4] = 0.0 tm.assert_frame_equal(result, expected) result = df.reindex(columns=range(5), fill_value=0) expected = df.copy() expected[4] = 0 tm.assert_frame_equal(result, expected) result = df.reindex(columns=range(5), fill_value="foo") expected = df.copy() expected[4] = "foo" tm.assert_frame_equal(result, expected) # other dtypes df["foo"] = "foo" result = df.reindex(range(15), fill_value=0) expected = df.reindex(range(15)).fillna(0) tm.assert_frame_equal(result, expected) def test_reindex_dups(self): # GH4746, reindex on duplicate index error messages arr = np.random.randn(10) df = DataFrame(arr, index=[1, 2, 3, 4, 5, 1, 2, 3, 4, 5]) # set index is ok result = df.copy() result.index = list(range(len(df))) expected = DataFrame(arr, index=list(range(len(df)))) tm.assert_frame_equal(result, expected) # reindex fails msg = "cannot reindex on an axis with duplicate labels" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match="non-unique"): df.reindex(index=list(range(len(df)))) def test_reindex_with_duplicate_columns(self): # reindex is invalid! df = DataFrame( [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] ) msg = "cannot reindex on an axis with duplicate labels" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match="non-unique"): df.reindex(columns=["bar"]) with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match="non-unique"): df.reindex(columns=["bar", "foo"]) def test_reindex_axis_style(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) expected = DataFrame( {"A": [1, 2, np.nan], "B": [4, 5, np.nan]}, index=[0, 1, 3] ) result = df.reindex([0, 1, 3]) tm.assert_frame_equal(result, expected) result = df.reindex([0, 1, 3], axis=0) tm.assert_frame_equal(result, expected) result = df.reindex([0, 1, 3], axis="index") tm.assert_frame_equal(result, expected) def test_reindex_positional_warns(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) expected = DataFrame({"A": [1.0, 2], "B": [4.0, 5], "C": [np.nan, np.nan]}) with tm.assert_produces_warning(FutureWarning): result = df.reindex([0, 1], ["A", "B", "C"]) tm.assert_frame_equal(result, expected) def test_reindex_axis_style_raises(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex([0, 1], ["A"], axis=1) with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex([0, 1], ["A"], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(columns=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], columns=[0, 1], axis="columns") with pytest.raises(TypeError, match="Cannot specify all"): df.reindex([0, 1], [0], ["A"]) # Mixing styles with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis="index") with pytest.raises(TypeError, match="Cannot specify both 'axis'"): df.reindex(index=[0, 1], axis="columns") # Duplicates with pytest.raises(TypeError, match="multiple values"): df.reindex([0, 1], labels=[0, 1]) def test_reindex_single_named_indexer(self): # https://github.com/pandas-dev/pandas/issues/12392 df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3]}) result = df.reindex([0, 1], columns=["A"]) expected = DataFrame({"A": [1, 2]}) tm.assert_frame_equal(result, expected) def test_reindex_api_equivalence(self): # https://github.com/pandas-dev/pandas/issues/12392 # equivalence of the labels/axis and index/columns API's df = DataFrame( [[1, 2, 3], [3, 4, 5], [5, 6, 7]], index=["a", "b", "c"], columns=["d", "e", "f"], ) res1 = df.reindex(["b", "a"]) res2 = df.reindex(index=["b", "a"]) res3 = df.reindex(labels=["b", "a"]) res4 = df.reindex(labels=["b", "a"], axis=0) res5 = df.reindex(["b", "a"], axis=0) for res in [res2, res3, res4, res5]: tm.assert_frame_equal(res1, res) res1 = df.reindex(columns=["e", "d"]) res2 = df.reindex(["e", "d"], axis=1) res3 = df.reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) with tm.assert_produces_warning(FutureWarning) as m: res1 = df.reindex(["b", "a"], ["e", "d"]) assert "reindex" in str(m[0].message) res2 = df.reindex(columns=["e", "d"], index=["b", "a"]) res3 = df.reindex(labels=["b", "a"], axis=0).reindex(labels=["e", "d"], axis=1) for res in [res2, res3]: tm.assert_frame_equal(res1, res) def test_reindex_boolean(self): frame = DataFrame( np.ones((10, 2), dtype=bool), index=np.arange(0, 20, 2), columns=[0, 2] ) reindexed = frame.reindex(np.arange(10)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[0][1]) reindexed = frame.reindex(columns=range(3)) assert reindexed.values.dtype == np.object_ assert isna(reindexed[1]).all() def test_reindex_objects(self, float_string_frame): reindexed = float_string_frame.reindex(columns=["foo", "A", "B"]) assert "foo" in reindexed reindexed = float_string_frame.reindex(columns=["A", "B"]) assert "foo" not in reindexed def test_reindex_corner(self, int_frame): index = Index(["a", "b", "c"]) dm = DataFrame({}).reindex(index=[1, 2, 3]) reindexed = dm.reindex(columns=index) tm.assert_index_equal(reindexed.columns, index) # ints are weird smaller = int_frame.reindex(columns=["A", "B", "E"]) assert smaller["E"].dtype == np.float64 def test_reindex_with_nans(self): df = DataFrame( [[1, 2], [3, 4], [np.nan, np.nan], [7, 8], [9, 10]], columns=["a", "b"], index=[100.0, 101.0, np.nan, 102.0, 103.0], ) result = df.reindex(index=[101.0, 102.0, 103.0]) expected = df.iloc[[1, 3, 4]] tm.assert_frame_equal(result, expected) result = df.reindex(index=[103.0]) expected = df.iloc[[4]] tm.assert_frame_equal(result, expected) result = df.reindex(index=[101.0]) expected = df.iloc[[1]] tm.assert_frame_equal(result, expected) def test_reindex_multi(self): df = DataFrame(np.random.randn(3, 3)) result = df.reindex(index=range(4), columns=range(4)) expected = df.reindex(list(range(4))).reindex(columns=range(4)) tm.assert_frame_equal(result, expected) df = DataFrame(np.random.randint(0, 10, (3, 3))) result = df.reindex(index=range(4), columns=range(4)) expected = df.reindex(list(range(4))).reindex(columns=range(4)) tm.assert_frame_equal(result, expected) df = DataFrame(np.random.randint(0, 10, (3, 3))) result = df.reindex(index=range(2), columns=range(2)) expected = df.reindex(range(2)).reindex(columns=range(2)) tm.assert_frame_equal(result, expected) df = DataFrame(np.random.randn(5, 3) + 1j, columns=["a", "b", "c"]) result = df.reindex(index=[0, 1], columns=["a", "b"]) expected = df.reindex([0, 1]).reindex(columns=["a", "b"]) tm.assert_frame_equal(result, expected) def test_reindex_multi_categorical_time(self): # https://github.com/pandas-dev/pandas/issues/21390 midx = MultiIndex.from_product( [ Categorical(["a", "b", "c"]), Categorical(date_range("2012-01-01", periods=3, freq="H")), ] ) df = DataFrame({"a": range(len(midx))}, index=midx) df2 = df.iloc[[0, 1, 2, 3, 4, 5, 6, 8]] result = df2.reindex(midx) expected = DataFrame({"a": [0, 1, 2, 3, 4, 5, 6, np.nan, 8]}, index=midx) tm.assert_frame_equal(result, expected) def test_reindex_with_categoricalindex(self): df = DataFrame( { "A": np.arange(3, dtype="int64"), }, index=CategoricalIndex(list("abc"), dtype=CDT(list("cabe")), name="B"), ) # reindexing # convert to a regular index result = df.reindex(["a", "b", "e"]) expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( "B" ) tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["a", "b"]) expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["d"]) expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) # since we are actually reindexing with a Categorical # then return a Categorical cats = list("cabe") result = df.reindex(Categorical(["a", "e"], categories=cats)) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a"], categories=cats)) expected = DataFrame( {"A": [0], "B": Series(list("a")).astype(CDT(cats))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["a", "b", "e"]) expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index( "B" ) tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["a", "b"]) expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(["e"]) expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) # give back the type of categorical that we received result = df.reindex(Categorical(["a", "e"], categories=cats, ordered=True)) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) result = df.reindex(Categorical(["a", "d"], categories=["a", "d"])) expected = DataFrame( {"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))} ).set_index("B") tm.assert_frame_equal(result, expected, check_index_type=True) df2 = DataFrame( { "A": np.arange(6, dtype="int64"), }, index=CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B"), ) # passed duplicate indexers are not allowed msg = "cannot reindex on an axis with duplicate labels" with pytest.raises(ValueError, match=msg): with tm.assert_produces_warning(FutureWarning, match="non-unique"): df2.reindex(["a", "b"]) # args NotImplemented ATM msg = r"argument {} is not implemented for CategoricalIndex\.reindex" with pytest.raises(NotImplementedError, match=msg.format("method")): df.reindex(["a"], method="ffill") with pytest.raises(NotImplementedError, match=msg.format("level")): df.reindex(["a"], level=1) with pytest.raises(NotImplementedError, match=msg.format("limit")): df.reindex(["a"], limit=2) def test_reindex_signature(self): sig = inspect.signature(DataFrame.reindex) parameters = set(sig.parameters) assert parameters == { "self", "labels", "index", "columns", "axis", "limit", "copy", "level", "method", "fill_value", "tolerance", } def test_reindex_multiindex_ffill_added_rows(self): # GH#23693 # reindex added rows with nan values even when fill method was specified mi = MultiIndex.from_tuples([("a", "b"), ("d", "e")]) df = DataFrame([[0, 7], [3, 4]], index=mi, columns=["x", "y"]) mi2 = MultiIndex.from_tuples([("a", "b"), ("d", "e"), ("h", "i")]) result = df.reindex(mi2, axis=0, method="ffill") expected = DataFrame([[0, 7], [3, 4], [3, 4]], index=mi2, columns=["x", "y"]) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "kwargs", [ {"method": "pad", "tolerance": timedelta(seconds=9)}, {"method": "backfill", "tolerance": timedelta(seconds=9)}, {"method": "nearest"}, {"method": None}, ], ) def test_reindex_empty_frame(self, kwargs): # GH#27315 idx = date_range(start="2020", freq="30s", periods=3) df = DataFrame([], index=Index([], name="time"), columns=["a"]) result = df.reindex(idx, **kwargs) expected = DataFrame({"a": [pd.NA] * 3}, index=idx) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "src_idx", [ Index([]), CategoricalIndex([]), ], ) @pytest.mark.parametrize( "cat_idx", [ # No duplicates Index([]), CategoricalIndex([]), Index(["A", "B"]), CategoricalIndex(["A", "B"]), # Duplicates: GH#38906 Index(["A", "A"]), CategoricalIndex(["A", "A"]), ], ) def test_reindex_empty(self, src_idx, cat_idx): df = DataFrame(columns=src_idx, index=["K"], dtype="f8") result = df.reindex(columns=cat_idx) expected = DataFrame(index=["K"], columns=cat_idx, dtype="f8") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]"]) def test_reindex_datetimelike_to_object(self, dtype): # GH#39755 dont cast dt64/td64 to ints mi = MultiIndex.from_product([list("ABCDE"), range(2)]) dti = date_range("2016-01-01", periods=10) fv = np.timedelta64("NaT", "ns") if dtype == "m8[ns]": dti = dti - dti[0] fv = np.datetime64("NaT", "ns") ser = Series(dti, index=mi) ser[::3] = pd.NaT df = ser.unstack() index = df.index.append(Index([1])) columns = df.columns.append(Index(["foo"])) res = df.reindex(index=index, columns=columns, fill_value=fv) expected = DataFrame( { 0: df[0].tolist() + [fv], 1: df[1].tolist() + [fv], "foo": np.array(["NaT"] * 6, dtype=fv.dtype), }, index=index, ) assert (res.dtypes[[0, 1]] == object).all() assert res.iloc[0, 0] is pd.NaT assert res.iloc[-1, 0] is fv assert res.iloc[-1, 1] is fv tm.assert_frame_equal(res, expected) @pytest.mark.parametrize( "index_df,index_res,index_exp", [ ( CategoricalIndex([], categories=["A"]), Index(["A"]), Index(["A"]), ), ( CategoricalIndex([], categories=["A"]), Index(["B"]), Index(["B"]), ), ( CategoricalIndex([], categories=["A"]), CategoricalIndex(["A"]), CategoricalIndex(["A"]), ), ( CategoricalIndex([], categories=["A"]), CategoricalIndex(["B"]), CategoricalIndex(["B"]), ), ], ) def test_reindex_not_category(self, index_df, index_res, index_exp): # GH#28690 df = DataFrame(index=index_df) result = df.reindex(index=index_res) expected = DataFrame(index=index_exp) tm.assert_frame_equal(result, expected)
def makeCategoricalIndex(k=10, n=3, name=None, **kwargs): """make a length k index or n categories""" x = rands_array(nchars=4, size=n) return CategoricalIndex( Categorical.from_codes(np.arange(k) % n, categories=x), name=name, **kwargs )
def test_constructor_period_values_mismatched_dtype(self): pi = period_range("2016-01-01", periods=3, freq="D") result = Index(pi, dtype="category") expected = CategoricalIndex(pi) tm.assert_index_equal(result, expected)
def test_categorical_categories(self): # GH17884 c1 = CategoricalDtype(Categorical(["a", "b"])) tm.assert_index_equal(c1.categories, pd.Index(["a", "b"])) c1 = CategoricalDtype(CategoricalIndex(["a", "b"])) tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
def test_constructor_interval_values_mismatched_dtype(self): dti = date_range("2016-01-01", periods=3) ii = IntervalIndex.from_breaks(dti) result = Index(ii, dtype="category") expected = CategoricalIndex(ii) tm.assert_index_equal(result, expected)
def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes index = CategoricalIndex(['a', 'b', 'c']) df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) # unique slice res = df.loc[['a', 'b']] exp_index = CategoricalIndex(['a', 'b'], categories=index.categories) exp = DataFrame({'A': [1, 2], 'B': [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp_index = CategoricalIndex(['a', 'a', 'b'], categories=index.categories) exp = DataFrame({'A': [1, 1, 2], 'B': [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) msg = ('a list-indexer must only include ' 'values that are in the categories') with pytest.raises(KeyError, match=msg): df.loc[['a', 'x']] # duplicated categories and codes index = CategoricalIndex(['a', 'b', 'a']) df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}, index=index) # unique slice res = df.loc[['a', 'b']] exp = DataFrame({ 'A': [1, 3, 2], 'B': [4, 6, 5] }, index=CategoricalIndex(['a', 'a', 'b'])) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp = DataFrame({ 'A': [1, 3, 1, 3, 2], 'B': [4, 6, 4, 6, 5] }, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'])) tm.assert_frame_equal(res, exp, check_index_type=True) msg = ('a list-indexer must only include values ' 'that are in the categories') with pytest.raises(KeyError, match=msg): df.loc[['a', 'x']] # contains unused category index = CategoricalIndex(['a', 'b', 'a', 'c'], categories=list('abcde')) df = DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=index) res = df.loc[['a', 'b']] exp = DataFrame({ 'A': [1, 3, 2], 'B': [5, 7, 6] }, index=CategoricalIndex(['a', 'a', 'b'], categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) res = df.loc[['a', 'e']] exp = DataFrame({ 'A': [1, 3, np.nan], 'B': [5, 7, np.nan] }, index=CategoricalIndex(['a', 'a', 'e'], categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[['a', 'a', 'b']] exp = DataFrame({ 'A': [1, 3, 1, 3, 2], 'B': [5, 7, 5, 7, 6] }, index=CategoricalIndex(['a', 'a', 'a', 'a', 'b'], categories=list('abcde'))) tm.assert_frame_equal(res, exp, check_index_type=True) msg = ('a list-indexer must only include values ' 'that are in the categories') with pytest.raises(KeyError, match=msg): df.loc[['a', 'x']]
def test_construction(self): ci = CategoricalIndex(list("aabbca"), categories=list("abcd"), ordered=False) categories = ci.categories result = Index(ci) tm.assert_index_equal(result, ci, exact=True) assert not result.ordered result = Index(ci.values) tm.assert_index_equal(result, ci, exact=True) assert not result.ordered # empty result = CategoricalIndex(categories=categories) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal(result.codes, np.array([], dtype="int8")) assert not result.ordered # passing categories result = CategoricalIndex(list("aabbca"), categories=categories) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") ) c = Categorical(list("aabbca")) result = CategoricalIndex(c) tm.assert_index_equal(result.categories, Index(list("abc"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") ) assert not result.ordered result = CategoricalIndex(c, categories=categories) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") ) assert not result.ordered ci = CategoricalIndex(c, categories=list("abcd")) result = CategoricalIndex(ci) tm.assert_index_equal(result.categories, Index(categories)) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, 2, 0], dtype="int8") ) assert not result.ordered result = CategoricalIndex(ci, categories=list("ab")) tm.assert_index_equal(result.categories, Index(list("ab"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") ) assert not result.ordered result = CategoricalIndex(ci, categories=list("ab"), ordered=True) tm.assert_index_equal(result.categories, Index(list("ab"))) tm.assert_numpy_array_equal( result.codes, np.array([0, 0, 1, 1, -1, 0], dtype="int8") ) assert result.ordered result = CategoricalIndex(ci, categories=list("ab"), ordered=True) expected = CategoricalIndex( ci, categories=list("ab"), ordered=True, dtype="category" ) tm.assert_index_equal(result, expected, exact=True) # turn me to an Index result = Index(np.array(ci)) assert isinstance(result, Index) assert not isinstance(result, CategoricalIndex)
def test_groupby(self): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b").mean() tm.assert_frame_equal(result, expected) raw_cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) raw_cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": raw_cat1, "B": raw_cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A") exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # multiple groupers gb = df.groupby(['A', 'B']) exp_index = pd.MultiIndex.from_product([ Categorical(["a", "b", "z"], ordered=True), Categorical(["c", "d", "y"], ordered=True) ], names=['A', 'B']) expected = DataFrame( {'values': [1, 2, np.nan, 3, 4, np.nan, np.nan, np.nan, np.nan]}, index=exp_index) result = gb.sum() tm.assert_frame_equal(result, expected) # multiple groupers with a non-cat df = df.copy() df['C'] = ['foo', 'bar'] * 2 gb = df.groupby(['A', 'B', 'C']) exp_index = pd.MultiIndex.from_product([ Categorical(["a", "b", "z"], ordered=True), Categorical(["c", "d", "y"], ordered=True), ['foo', 'bar'] ], names=['A', 'B', 'C']) expected = DataFrame({ 'values': Series(np.nan, index=exp_index) }).sort_index() expected.iloc[[1, 2, 7, 8], 0] = [1, 2, 3, 4] result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 x = DataFrame( [[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical(x.person_name) g = x.groupby(['person_id']) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[['person_name']]) result = x.drop_duplicates('person_name') expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): return x.drop_duplicates('person_name').iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name='person_id') expected['person_name'] = expected['person_name'].astype('object') tm.assert_frame_equal(result, expected) # GH 9921 # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c).transform(lambda xs: np.max(xs)), df[['a']]) # Filter tm.assert_series_equal(df.a.groupby(c).filter(np.all), df['a']) tm.assert_frame_equal(df.groupby(c).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal(df.groupby(c).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c).transform(lambda xs: np.sum(xs)), df[['a']]) # GH 9603 df = DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) result = df.groupby(c).apply(len) exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) expected.index.name = 'a' tm.assert_series_equal(result, expected)
def test_reindex_dtype(self): c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(["a", "c"]) tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"]) res, indexer = c.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(["a", "c"]) exp = Index(["a", "a", "c"], dtype="object") tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp)) c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) res, indexer = c.reindex(Categorical(["a", "c"])) exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"]) tm.assert_index_equal(res, exp, exact=True) tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
def test_groupby_sort_categorical_datetimelike(self): # GH10505 # use same data as test_groupby_sort_categorical, which category is # corresponding to datetime.month df = DataFrame( { 'dt': [ datetime(2011, 7, 1), datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 2, 1), datetime(2011, 1, 1), datetime(2011, 5, 1) ], 'foo': [10, 8, 5, 6, 4, 1, 7], 'bar': [10, 20, 30, 40, 50, 60, 70] }, columns=['dt', 'foo', 'bar']) # ordered=True df['dt'] = Categorical(df['dt'], ordered=True) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1) ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) result_sort.index = CategoricalIndex(index, name='dt', ordered=True) index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1) ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=['foo', 'bar']) result_nosort.index = CategoricalIndex(index, categories=index, name='dt', ordered=True) col = 'dt' assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) # when categories is ordered, group is ordered by category's order assert_frame_equal(result_sort, df.groupby(col, sort=False).first()) # ordered = False df['dt'] = Categorical(df['dt'], ordered=False) index = [ datetime(2011, 1, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 7, 1) ] result_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar']) result_sort.index = CategoricalIndex(index, name='dt') index = [ datetime(2011, 7, 1), datetime(2011, 2, 1), datetime(2011, 5, 1), datetime(2011, 1, 1) ] result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]], columns=['foo', 'bar']) result_nosort.index = CategoricalIndex(index, categories=index, name='dt') col = 'dt' assert_frame_equal(result_sort, df.groupby(col, sort=True).first()) assert_frame_equal(result_nosort, df.groupby(col, sort=False).first())
("foo", "two", "min"), ("foo", "two", "max"), ("bar", "one", "min"), ("bar", "one", "max"), ("bar", "three", "min"), ("bar", "three", "max"), ], names=["A", "B", None], ), [1, 1, 3, 3, 2, 2, 4, 4], ), ( False, MultiIndex.from_product( [ CategoricalIndex(["bar", "foo"], ordered=False), CategoricalIndex(["one", "three", "two"], ordered=False), Index(["min", "max"]), ], names=["A", "B", None], ), [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3], ), ( None, MultiIndex.from_product( [ CategoricalIndex(["bar", "foo"], ordered=False), CategoricalIndex(["one", "three", "two"], ordered=False), Index(["min", "max"]), ],
def test_loc_listlike_dtypes(self): # GH 11586 # unique categories and codes index = CategoricalIndex(["a", "b", "c"]) df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice res = df.loc[["a", "b"]] exp_index = CategoricalIndex(["a", "b"], categories=index.categories) exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[["a", "a", "b"]] exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories) exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index) tm.assert_frame_equal(res, exp, check_index_type=True) msg = "a list-indexer must only include " "values that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] # duplicated categories and codes index = CategoricalIndex(["a", "b", "a"]) df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index) # unique slice res = df.loc[["a", "b"]] exp = DataFrame({ "A": [1, 3, 2], "B": [4, 6, 5] }, index=CategoricalIndex(["a", "a", "b"])) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[["a", "a", "b"]] exp = DataFrame( { "A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5] }, index=CategoricalIndex(["a", "a", "a", "a", "b"]), ) tm.assert_frame_equal(res, exp, check_index_type=True) msg = "a list-indexer must only include values " "that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]] # contains unused category index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde")) df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index) res = df.loc[["a", "b"]] exp = DataFrame( { "A": [1, 3, 2], "B": [5, 7, 6] }, index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")), ) tm.assert_frame_equal(res, exp, check_index_type=True) res = df.loc[["a", "e"]] exp = DataFrame( { "A": [1, 3, np.nan], "B": [5, 7, np.nan] }, index=CategoricalIndex(["a", "a", "e"], categories=list("abcde")), ) tm.assert_frame_equal(res, exp, check_index_type=True) # duplicated slice res = df.loc[["a", "a", "b"]] exp = DataFrame( { "A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6] }, index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")), ) tm.assert_frame_equal(res, exp, check_index_type=True) msg = "a list-indexer must only include values " "that are in the categories" with pytest.raises(KeyError, match=msg): df.loc[["a", "x"]]
def test_basic(): cats = Categorical( ["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True, ) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True) expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True) expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 x = DataFrame( [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]], columns=["person_id", "person_name"], ) x["person_name"] = Categorical(x.person_name) g = x.groupby(["person_id"], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[["person_name"]]) result = x.drop_duplicates("person_name") expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): return x.drop_duplicates("person_name").iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name="person_id") expected["person_name"] = expected["person_name"].astype("object") tm.assert_frame_equal(result, expected) # GH 9921 # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[["a"]]) # Filter tm.assert_series_equal( df.a.groupby(c, observed=False).filter(np.all), df["a"]) tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df["a"]) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[["a"]]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]) # GH 9603 df = DataFrame({"a": [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd"))) result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) expected.index.name = "a" tm.assert_series_equal(result, expected) # more basic levels = ["foo", "bar", "baz", "qux"] codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) grouped = data.groupby(cats, observed=False) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) exp_cats = Categorical(ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]) expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp) exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4) tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
def test_describe(self): # string type desc = self.factor.describe() assert self.factor.ordered exp_index = CategoricalIndex(["a", "b", "c"], name="categories", ordered=self.factor.ordered) expected = DataFrame( { "counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0] }, index=exp_index) tm.assert_frame_equal(desc, expected) # check unused categories cat = self.factor.copy() cat.set_categories(["a", "b", "c", "d"], inplace=True) desc = cat.describe() exp_index = CategoricalIndex(list("abcd"), ordered=self.factor.ordered, name="categories") expected = DataFrame( { "counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0] }, index=exp_index, ) tm.assert_frame_equal(desc, expected) # check an integer one cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1]) desc = cat.describe() exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories") expected = DataFrame( { "counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0] }, index=exp_index, ) tm.assert_frame_equal(desc, expected) # https://github.com/pandas-dev/pandas/issues/3678 # describe should work with NaN cat = Categorical([np.nan, 1, 2, 2]) desc = cat.describe() expected = DataFrame( { "counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0] }, index=CategoricalIndex([1, 2, np.nan], categories=[1, 2], name="categories"), ) tm.assert_frame_equal(desc, expected)
def test_construction_empty_with_bool_categories(self): # see GH#22702 cat = CategoricalIndex([], categories=[True, False]) categories = sorted(cat.categories.tolist()) assert categories == [False, True]
idx = index idx_non_unique = idx[[0, 0, 1, 2]] check_intersection_commutative(idx, idx_non_unique) assert idx.intersection(idx_non_unique).is_unique @pytest.mark.parametrize( "cls", [ Int64Index, Float64Index, DatetimeIndex, CategoricalIndex, lambda x: CategoricalIndex(x, categories=set(x)), TimedeltaIndex, lambda x: Index(x, dtype=object), UInt64Index, ], ) def test_union_duplicate_index_subsets_of_each_other(cls): # GH#31326 a = cls([1, 2, 2, 3]) b = cls([3, 3, 4]) expected = cls([1, 2, 2, 3, 3, 4]) if isinstance(a, CategoricalIndex): expected = Index([1, 2, 2, 3, 3, 4]) result = a.union(b) tm.assert_index_equal(result, expected) result = a.union(b, sort=False)
def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa:E501 assert repr(idx) == expected # multiple lines idx = CategoricalIndex(["a", "bb", "ccc"] * 10) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" assert repr(idx) == expected # truncated idx = CategoricalIndex(["a", "bb", "ccc"] * 100) expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa:E501 assert repr(idx) == expected # larger categories idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], categories=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', ...], ordered=False, dtype='category')""" # noqa:E501 assert repr(idx) == expected # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa:E501 assert repr(idx) == expected # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" assert repr(idx) == expected # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa:E501 assert repr(idx) == expected # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa:E501 assert repr(idx) == expected # Enable Unicode option ----------------------------------------- with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa:E501 assert repr(idx) == expected # multiple lines idx = CategoricalIndex(["あ", "いい", "ううう"] * 10) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" assert repr(idx) == expected # truncated idx = CategoricalIndex(["あ", "いい", "ううう"] * 100) expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa:E501 assert repr(idx) == expected # larger categories idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], categories=['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', ...], ordered=False, dtype='category')""" # noqa:E501 assert repr(idx) == expected
def test_get_indexer(self): idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc")) idx2 = CategoricalIndex(list("abf")) for indexer in [idx2, list("abf"), Index(list("abf"))]: r1 = idx1.get_indexer(idx2) tm.assert_almost_equal(r1, np.array([0, 1, 2, -1], dtype=np.intp)) msg = ("method='pad' and method='backfill' not implemented yet for " "CategoricalIndex") with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="pad") with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="backfill") msg = "method='nearest' not implemented yet for CategoricalIndex" with pytest.raises(NotImplementedError, match=msg): idx2.get_indexer(idx1, method="nearest")