def test_dataframe_join_suffix():
    """Joining frames with overlapping column names requires suffixes.

    Without ``lsuffix``/``rsuffix`` the join must raise ``ValueError``; with
    both suffixes supplied the result is compared against the same join
    performed on the ``to_pandas()`` copy of the data.
    """
    np.random.seed(0)

    frame = DataFrame()
    for col in "abc":
        frame[col] = np.random.randint(0, 5, 5)

    lhs = frame.set_index("a")
    rhs = frame.set_index("c")

    # Overlapping column names and no suffixes: the join has to be rejected.
    with pytest.raises(ValueError) as excinfo:
        lhs.join(rhs)
    excinfo.match(
        "there are overlapping columns but lsuffix"
        " and rsuffix are not defined"
    )

    suffixes = {"lsuffix": "_left", "rsuffix": "_right"}
    got = lhs.join(rhs, sort=True, **suffixes)

    # Get expected value
    host = frame.to_pandas()
    expect = host.set_index("a").join(host.set_index("c"), **suffixes)

    # Check
    assert list(expect.columns) == list(got.columns)
    assert_eq(expect.index.values, got.index.values)
    # Missing entries are replaced by -1 on both sides before comparing.
    for col in expect.columns:
        _check_series(expect[col].fillna(-1), got[col].fillna(-1))
def test_dataframe_join_cats():
    """Join two frames that are indexed by a categorical column.

    The joined frame is compared against the join of the ``to_pandas()``
    copies, followed by a few coarse sanity checks on columns, length and
    values.
    """
    lhs = DataFrame()
    lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    lhs["b"] = bb = np.arange(len(lhs))
    lhs = lhs.set_index("a")

    rhs = DataFrame()
    rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc"))
    rhs["c"] = cc = np.arange(len(rhs))
    rhs = rhs.set_index("a")

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining, so both sides drop
    # their index before the comparison.
    # ``pd.testing`` is the public location of assert_frame_equal;
    # ``pd.util.testing`` is deprecated and no longer exists in pandas 2.x.
    pd.testing.assert_frame_equal(
        got.sort_values(by="b")
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True),
    )

    # Just do some rough checking here.
    assert list(got.columns) == ["b", "c"]
    assert len(got) > 0
    assert set(got.index.to_pandas()) & set("abc")
    assert set(got["b"]) & set(bb)
    assert set(got["c"]) & set(cc)
def test_typecast_on_join_indexes_matching_categorical():
    """Inner-join a category-typed index with a str-typed index.

    Both frames carry the same five keys and the same ``B`` values; the
    expected result is built from a plain list of keys with ``B_x``/``B_y``
    columns.
    """
    keys = ["a", "b", "c", "d", "e"]
    values = [1, 2, 3, 4, 5]

    # Left index is categorical, right index is plain string.
    left_keys = Series(keys, dtype="category")
    right_keys = Series(keys, dtype="str")

    left = DataFrame({"join_col": left_keys, "B": values})
    left = left.set_index("join_col")
    right = DataFrame({"join_col": right_keys, "B": values})
    right = right.set_index("join_col")

    expect = DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "B_x": [1, 2, 3, 4, 5],
            "B_y": [1, 2, 3, 4, 5],
        }
    ).set_index("join_col")

    got = left.join(right, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
def test_dataframe_scatter_by_map(map_size, nelem, keep):
    """Check ``DataFrame.scatter_by_map`` for several kinds of key column.

    Parameters
    ----------
    map_size
        Number of partitions requested from ``scatter_by_map``; also bounds
        the generated key values.
    nelem
        Number of rows in the generated frame.
    keep
        Forwarded to ``scatter_by_map`` as ``keep_index``.
    """
    strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"]
    np.random.seed(0)
    df = DataFrame()
    df["a"] = np.random.choice(strlist[:map_size], nelem)
    df["b"] = np.random.uniform(low=0, high=map_size, size=nelem)
    df["c"] = np.random.randint(map_size, size=nelem)
    df["d"] = df["a"].astype("category")

    def _check_scatter_by_map(dfs, col):
        """Validate the partitions ``dfs`` produced for key column ``col``."""
        assert len(dfs) == map_size
        nrows = 0
        name = col.name
        for i, part in enumerate(dfs):
            nrows += len(part)
            if len(part) > 0:
                # Make sure the column types were preserved
                assert isinstance(part[name]._column, type(col._column))
            try:
                sr = part[name].astype(np.int32)
            except ValueError:
                # Keys that cannot be cast to int32 are checked as they are.
                sr = part[name]
            # Every partition holds at most one distinct key value.
            assert sr.nunique() <= 1
            if sr.nunique() == 1:
                if isinstance(part[name]._column, NumericalColumn):
                    # For numeric keys, the value equals the partition number.
                    assert sr.iloc[0] == i
        # No rows were lost or duplicated across partitions.
        assert nrows == nelem

    def _check_index_type(frames, expected_index):
        """Assert each partition's index has the type of ``expected_index``."""
        for frame in frames:
            assert isinstance(frame.index, type(expected_index))

    for key in ("a", "b", "c", "d"):
        _check_scatter_by_map(
            df.scatter_by_map(key, map_size, keep_index=keep), df[key]
        )

    if map_size == 2 and nelem == 100:
        df.scatter_by_map("a")  # Auto-detect map_size
        with pytest.raises(ValueError):
            df.scatter_by_map("a", map_size=1, debug=True)  # Bad map_size

    # Test GenericIndex
    df2 = df.set_index("c")
    generic_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(generic_result, df2["b"])
    if keep:
        # Previously a bare ``isinstance(...)`` expression that checked nothing.
        _check_index_type(generic_result, df2.index)

    # Test MultiIndex
    df2 = df.set_index(["a", "c"])
    multiindex_result = df2.scatter_by_map("b", map_size, keep_index=keep)
    _check_scatter_by_map(multiindex_result, df2["b"])
    if keep:
        _check_index_type(multiindex_result, df2.index)
def test_merge_multi(kwargs):
    """Compare a multi-key ``merge`` with the same merge on pandas copies.

    Parameters
    ----------
    kwargs : dict
        Merge options; must contain the keys ``left_on``, ``right_on``,
        ``left_index`` and ``right_index``. They decide which columns are
        moved into the index of each frame before merging. The dict is not
        modified by this test.
    """
    left = DataFrame(
        {
            "a": [1, 2, 3, 4, 3, 5, 6],
            "b": [1, 3, 5, 7, 5, 9, 0],
            "c": ["o", "p", "q", "r", "s", "t", "u"],
            "d": ["v", "w", "x", "y", "z", "1", "2"],
        }
    )
    right = DataFrame(
        {
            "a": [0, 9, 3, 4, 3, 7, 8],
            "b": [2, 4, 5, 7, 5, 6, 8],
            "c": ["a", "b", "c", "d", "e", "f", "g"],
            "d": ["j", "i", "j", "k", "l", "m", "n"],
        }
    )

    # Choose the index of each side from the requested merge mode.
    if (
        kwargs["left_on"] is not None
        and kwargs["right_on"] is not None
        and kwargs["left_index"] is False
        and kwargs["right_index"] is False
    ):
        left = left.set_index(["c", "d"])
        right = right.set_index(["c", "d"])
    elif (
        kwargs["left_on"] is None
        and kwargs["right_on"] is None
        and kwargs["left_index"] is True
        and kwargs["right_index"] is True
    ):
        left = left.set_index(["a", "b"])
        right = right.set_index(["a", "b"])
    elif kwargs["left_on"] is not None and kwargs["right_index"] is True:
        left = left.set_index(["c", "d"])
        right = right.set_index(["a", "b"])
    elif kwargs["right_on"] is not None and kwargs["left_index"] is True:
        left = left.set_index(["a", "b"])
        right = right.set_index(["c", "d"])

    pd_left = left.to_pandas()
    pd_right = right.to_pandas()

    # Work on a copy so the caller's (parametrized) dict is left untouched.
    merge_kwargs = {**kwargs, "sort": True}

    expect = pd_left.merge(pd_right, **merge_kwargs)
    got = left.merge(right, **merge_kwargs)

    assert_eq(expect.sort_index().index, got.sort_index().index)

    # Row order is not part of the comparison: drop the index, sort by all
    # columns, then renumber before comparing the frames.
    expect.index = range(len(expect))
    got.index = range(len(got))
    expect = expect.sort_values(list(expect.columns))
    got = got.sort_values(list(got.columns))
    expect.index = range(len(expect))
    got.index = range(len(got))

    assert_eq(expect, got)
def test_typecast_on_join_multiindices():
    """Inner-join two frames on a three-level index with differing dtypes.

    The left keys are int8/float32/float32, the right keys are
    int32/int32/float64. The expected result keeps the two matching rows with
    int32/float64/float64 index levels.
    """
    index_cols = ["join_col_0", "join_col_1", "join_col_2"]
    payload = ["a", "b", "c", "d", "e"]

    left = DataFrame(
        {
            "join_col_0": Series([1, 2, 3, 4, 5], dtype="int8"),
            "join_col_1": Series([2, 3, 4.1, 5.9, 6], dtype="float32"),
            "join_col_2": Series([7, 8, 9, 0, 1], dtype="float32"),
            "B": payload,
        }
    )
    right = DataFrame(
        {
            "join_col_0": Series([1, 2, 3, 4, 5], dtype="int32"),
            "join_col_1": Series([2, 3, 4, 5, 6], dtype="int32"),
            "join_col_2": Series([7, 8, 9, 0, 0], dtype="float64"),
            "B": payload,
        }
    )
    left = left.set_index(index_cols)
    right = right.set_index(index_cols)

    expected_payload = Series(["a", "b"])
    expect = DataFrame(
        {
            "join_col_0": Series([1, 2], dtype="int32"),
            "join_col_1": Series([2, 3], dtype="float64"),
            "join_col_2": Series([7, 8], dtype="float64"),
            "B_x": expected_payload,
            "B_y": expected_payload,
        }
    )
    expect = expect.set_index(index_cols)

    got = left.join(right, how="inner", lsuffix="_x", rsuffix="_y")

    assert_eq(expect, got)
def test_categorical_index():
    """A categorical column used as index yields a ``CategoricalIndex``.

    Two routes are checked against the pandas frame: converting an
    already-indexed pandas frame with ``from_pandas``, and calling
    ``set_index`` on a frame built column by column.
    """
    pdf = pd.DataFrame()
    pdf["a"] = [1, 2, 3]
    pdf["index"] = pd.Categorical(["a", "b", "c"])
    # Converted before set_index, so it still has its default index.
    unindexed = DataFrame.from_pandas(pdf)
    pdf = pdf.set_index("index")
    from_pandas_frame = DataFrame.from_pandas(pdf)

    built_frame = DataFrame()
    built_frame["a"] = [1, 2, 3]
    built_frame["index"] = pd.Categorical(["a", "b", "c"])
    assert_eq(unindexed.index, built_frame.index)
    built_frame = built_frame.set_index("index")

    for candidate in (from_pandas_frame, built_frame):
        assert isinstance(candidate.index, CategoricalIndex)
        assert_eq(pdf, candidate)
        assert_eq(pdf.index, candidate.index)
        # Codes are cast to the pandas codes dtype before comparison.
        codes_dtype = pdf.index.codes.dtype
        assert_eq(
            pdf.index.codes,
            candidate.index.codes.astype(codes_dtype).to_array(),
        )
def test_factorize_series_index():
    """``Series.factorize`` matches pandas with and without a custom index.

    The comparison runs once on the frame as built and once after ``col2``
    has been moved into the index.
    """
    df = DataFrame()
    df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    df["col2"] = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    def _compare_with_pandas(frame):
        # Element 0 and element 1 of each factorize() result are compared
        # separately against the pandas counterparts.
        got_first, got_second = frame.col1.factorize()
        exp_first, exp_second = frame.to_pandas().col1.factorize()
        assert_eq(got_first.get(), exp_first)
        assert_eq(got_second.to_pandas().values, exp_second.values)

    _compare_with_pandas(df)
    _compare_with_pandas(df.set_index("col2"))
def test_df_cat_sort_index():
    """Sorting by a categorical index gives the same frame as pandas.

    NOTE(review): this file defines ``test_df_cat_sort_index`` a second time
    further down; at import time the later definition replaces this one, so
    this version is not collected unless one of the two is renamed.
    """
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    indexed = frame.set_index("a")
    got = indexed.sort_index()

    host_indexed = frame.to_pandas().set_index("a")
    expect = host_indexed.sort_index()

    assert_eq(got, expect)
def test_df_cat_set_index():
    """``set_index`` on a categorical column matches the pandas result."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    got = frame.set_index("a")

    # Reference: the same operation on the pandas copy of the data.
    host = frame.to_pandas(nullable_pd_dtype=False)
    expect = host.set_index("a")

    assert_eq(got, expect)
def test_df_cat_sort_index():
    """Sorting by a categorical index matches pandas, checked piecewise.

    Columns, index values and the ``b`` column are each compared against the
    pandas result.

    NOTE(review): an earlier function in this file has the same name; this
    definition replaces it at import time, so only this version is collected
    unless one of the two is renamed.
    """
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    got = frame.set_index("a").sort_index()
    expect = frame.to_pandas().set_index("a").sort_index()

    expected_index = expect.index.values
    actual_index = got.index.values

    assert list(expect.columns) == list(got.columns)
    assert list(expected_index) == list(actual_index)
    np.testing.assert_array_equal(expected_index, actual_index)
    np.testing.assert_array_equal(expect["b"].values, got["b"].to_array())
def test_df_set_index_from_series():
    """``set_index`` with a Series keeps the column and allows label slicing."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    # Check set_index(Series)
    indexed = frame.set_index(frame["b"])
    # Passing the Series itself leaves column 'b' in place.
    assert list(indexed.columns) == ["a", "b"]

    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
def test_index_join_exception_cases():
    """``Index.join`` raises for the two unsupported cases exercised here."""
    left_frame = DataFrame({"a": [2, 3, 1, 4], "b": [3, 7, 8, 1]})
    right_frame = DataFrame(
        {"a": [1, 5, 4, 0], "b": [3, 9, 8, 4], "c": [2, 3, 6, 0]}
    )
    join_how = "outer"
    left_keys = ["a", "b"]

    # Join between two MultiIndex
    left_index = left_frame.set_index(left_keys).index
    right_index = right_frame.set_index(["a", "c"]).index
    with pytest.raises(TypeError):
        left_index.join(right_index, level="a", how=join_how)

    # Improper level value, level should be an int or scalar value
    left_index = left_frame.set_index(left_keys).index
    right_index = right_frame.set_index(["a"]).index
    with pytest.raises(ValueError):
        left_index.join(right_index, level=["a"], how=join_how)
def test_df_set_index_from_name():
    """``set_index`` by column name removes the column and allows slicing."""
    frame = DataFrame()
    frame["a"] = list(range(10))
    frame["b"] = list(range(0, 20, 2))

    # Check set_index(column_name)
    indexed = frame.set_index("b")
    print(indexed)
    # 1 less column because 'b' is used as index
    assert list(indexed.columns) == ["a"]

    window = indexed.loc[2:6]
    print(window)
    assert len(window) == 3
    assert list(window.index.values) == [2, 4, 6]
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """``sort_values`` with ``ignore_index`` matches the pandas result.

    Parameters
    ----------
    index
        Passed to ``set_index`` before sorting.
    ignore_index
        Forwarded to ``sort_values`` on both the frame under test and its
        pandas copy.
    """
    data = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(data).set_index(index)
    pdf = gdf.to_pandas()

    # Sort by every remaining column on both sides.
    got = gdf.sort_values(gdf.columns, ignore_index=ignore_index)
    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)

    assert_eq(expect, got)
def test_factorize_index_obj(ncats, nelem):
    """``Index.factorize`` returns codes and labels matching a hand encoding.

    Parameters
    ----------
    ncats : int
        Exclusive upper bound of the generated integer values, i.e. the
        maximum number of distinct categories.
    nelem : int
        Number of generated elements.

    Both values are presumably supplied by a parametrize decorator outside
    this view. Earlier the data was hard-coded to ``randint(2, size=10)``,
    so every parametrized case exercised identical input.
    """
    df = DataFrame()
    np.random.seed(0)

    # initialize data frame
    df["cats"] = arr = np.random.randint(ncats, size=nelem, dtype=np.int32)
    df = df.set_index("cats")

    uvals, labels = df.index.factorize()
    np.testing.assert_array_equal(labels.values.get(), sorted(set(arr)))
    # ``cp.ndarray`` is the public name of the class that the private path
    # ``cp.core.core.ndarray`` pointed to.
    assert isinstance(uvals, cp.ndarray)
    assert isinstance(labels, Index)

    # Map each label to its position, then encode the input by hand.
    encoder = {labels[idx]: idx for idx in range(len(labels))}
    handcoded = [encoder[v] for v in arr]
    np.testing.assert_array_equal(uvals.get(), handcoded)