def test_factorize_series_index():
    """Compare ``col1.factorize()`` with pandas, before and after ``set_index``."""
    df = DataFrame()
    df["col1"] = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    df["col2"] = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    def _check_against_pandas(frame):
        # Element 0 of the factorize result is compared via ``.get()``,
        # element 1 via its pandas ``.values``.
        got = frame.col1.factorize()
        expect = frame.to_pandas().col1.factorize()
        assert_eq(got[0].get(), expect[0])
        assert_eq(got[1].to_pandas().values, expect[1].values)

    # First with the default index, then with "col2" as the index.
    _check_against_pandas(df)
    _check_against_pandas(df.set_index("col2"))
def test_dataframe_join_cats():
    """Join two frames indexed by a categorical column and compare with pandas."""
    lhs = DataFrame()
    lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    lhs["b"] = bb = np.arange(len(lhs))
    lhs = lhs.set_index("a")

    rhs = DataFrame()
    rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc"))
    rhs["c"] = cc = np.arange(len(rhs))
    rhs = rhs.set_index("a")

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining, so both sides are
    # compared with the index dropped.
    # ``pd.testing`` is the public home of ``assert_frame_equal``; the old
    # ``pd.util.testing`` path is deprecated.
    pd.testing.assert_frame_equal(
        got.sort_values(by="b")
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True),
    )

    # Just do some rough checking here.
    assert list(got.columns) == ["b", "c"]
    assert len(got) > 0
    assert set(got.index.to_pandas()) & set("abc")
    assert set(got["b"]) & set(bb)
    assert set(got["c"]) & set(cc)
def test_merge_multi(kwargs):
    """Merge frames on multi-column keys/indexes and compare with pandas.

    ``kwargs`` holds the merge options under test; the keys ``left_on``,
    ``right_on``, ``left_index`` and ``right_index`` decide which columns
    are moved into the index before merging.
    """
    left = DataFrame(
        {
            "a": [1, 2, 3, 4, 3, 5, 6],
            "b": [1, 3, 5, 7, 5, 9, 0],
            "c": ["o", "p", "q", "r", "s", "t", "u"],
            "d": ["v", "w", "x", "y", "z", "1", "2"],
        }
    )
    right = DataFrame(
        {
            "a": [0, 9, 3, 4, 3, 7, 8],
            "b": [2, 4, 5, 7, 5, 6, 8],
            "c": ["a", "b", "c", "d", "e", "f", "g"],
            "d": ["j", "i", "j", "k", "l", "m", "n"],
        }
    )

    if (
        kwargs["left_on"] is not None
        and kwargs["right_on"] is not None
        and kwargs["left_index"] is False
        and kwargs["right_index"] is False
    ):
        left = left.set_index(["c", "d"])
        right = right.set_index(["c", "d"])
    elif (
        kwargs["left_on"] is None
        and kwargs["right_on"] is None
        and kwargs["left_index"] is True
        and kwargs["right_index"] is True
    ):
        left = left.set_index(["a", "b"])
        right = right.set_index(["a", "b"])
    elif kwargs["left_on"] is not None and kwargs["right_index"] is True:
        left = left.set_index(["c", "d"])
        right = right.set_index(["a", "b"])
    elif kwargs["right_on"] is not None and kwargs["left_index"] is True:
        left = left.set_index(["a", "b"])
        right = right.set_index(["c", "d"])

    gleft = left.to_pandas()
    gright = right.to_pandas()

    # Build a fresh dict rather than writing "sort" into the caller's dict:
    # pytest hands parametrized values to tests without copying them, so an
    # in-place mutation would leak into any other use of the same object.
    merge_kwargs = {**kwargs, "sort": True}
    expect = gleft.merge(gright, **merge_kwargs)
    got = left.merge(right, **merge_kwargs)

    assert_eq(expect.sort_index().index, got.sort_index().index)

    # Replace the indexes with plain ranges, order rows by all columns, then
    # renumber again so the final comparison ignores row order and index.
    expect.index = range(len(expect))
    got.index = range(len(got))
    expect = expect.sort_values(list(expect.columns))
    got = got.sort_values(list(got.columns))
    expect.index = range(len(expect))
    got.index = range(len(got))

    assert_eq(expect, got)
def test_dataframe_merge_on(on):
    """Left-merge two random frames on ``on`` and compare with pandas.

    Both ``DataFrame.merge`` and the module-level ``cudf.merge`` are checked.
    """
    np.random.seed(0)
    nelem = 500

    # Make cuDF
    df_left = DataFrame()
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    df_right = DataFrame()
    df_right["key1"] = np.random.randint(0, 30, nelem)
    df_right["key2"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how="left")
    sort_cols = list(pddf_joined.columns)

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how="left")
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how="left")

    def _as_nan_float(series):
        # Cast to float64 and fill missing entries with NaN.
        return series.astype(np.float64).fillna(np.nan)

    def _ordered(pandas_frame):
        # Sort by every expected column and drop the index so row order and
        # index labels do not affect the comparison.
        return pandas_frame.sort_values(sort_cols).reset_index(drop=True)

    join_result["right_val"] = _as_nan_float(join_result["right_val"])
    join_result_cudf["right_val"] = _as_nan_float(
        join_result_cudf["right_val"]
    )

    # Columns whose name contains "_y" get the same float/NaN treatment.
    for col in sort_cols:
        if "_y" in col:
            join_result[col] = _as_nan_float(join_result[col])
            join_result_cudf[col] = _as_nan_float(join_result_cudf[col])

    # Test dataframe equality (ignore order of rows and columns).
    # ``pd.testing`` is the public home of ``assert_frame_equal``; the old
    # ``pd.util.testing`` path is deprecated.
    cdf_result = _ordered(join_result.to_pandas())
    pdf_result = _ordered(pddf_joined)
    pd.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = _ordered(join_result_cudf.to_pandas())
    pd.testing.assert_frame_equal(
        merge_func_result_cdf, cdf_result, check_like=True
    )
def test_dataframe_join_suffix():
    """Joining frames with overlapping columns needs explicit suffixes."""
    np.random.seed(0)

    df = DataFrame()
    for name in "abc":
        df[name] = np.random.randint(0, 5, 5)

    left = df.set_index("a")
    right = df.set_index("c")

    # Without suffixes the overlapping columns must be rejected.
    with pytest.raises(ValueError) as excinfo:
        left.join(right)
    excinfo.match(
        "there are overlapping columns but lsuffix"
        " and rsuffix are not defined"
    )

    got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True)

    # Get expected value
    pddf = df.to_pandas()
    pd_left = pddf.set_index("a")
    pd_right = pddf.set_index("c")
    expect = pd_left.join(pd_right, lsuffix="_left", rsuffix="_right")

    # Check
    assert list(expect.columns) == list(got.columns)
    assert_eq(expect.index.values, got.index.values)
    for name in expect.columns:
        _check_series(expect[name].fillna(-1), got[name].fillna(-1))
def test_dataframe_join_how(aa, bb, how, method):
    """Self-join a two-column frame on its columns and compare with pandas.

    ``aa`` and ``bb`` are the column values, ``how`` is the join type and
    ``method`` is forwarded to the cuDF ``join`` call.
    """
    df = DataFrame()
    df["a"] = aa
    df["b"] = bb

    def work_pandas(df, how):
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        # These two join types go through the ``pd_odd_joins`` helper
        # instead of ``DataFrame.join``.
        if how in ("leftanti", "leftsemi"):
            return pd_odd_joins(df1, df2, how)
        return df1.join(df2, how=how, sort=True)

    def work_gdf(df):
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        return df1.join(df2, how=how, sort=True, method=method)

    expect = work_pandas(df.to_pandas(), how)
    got = work_gdf(df)
    # Untouched copies, used by the "outer" branch below.
    expecto = expect.copy()
    goto = got.copy()

    expect = expect.astype(np.float64).fillna(np.nan)[expect.columns]
    got = got.astype(np.float64).fillna(np.nan)[expect.columns]

    assert got.index.name is None

    assert list(expect.columns) == list(got.columns)
    # test disabled until libgdf sort join gets updated with new api
    if method == "hash":
        assert_eq(sorted(expect.index.values), sorted(got.index.values))
        if how != "outer":
            # Newly introduced ambiguous ValueError thrown when
            # an index and column have the same name. Rename the
            # index so sorts work.
            # TODO: What is the less hacky way?
            expect.index.name = "bob"
            got.index.name = "mary"
            # ``pd.testing`` is the public home of ``assert_frame_equal``;
            # the old ``pd.util.testing`` path is deprecated.
            pd.testing.assert_frame_equal(
                got.to_pandas()
                .sort_values(got.columns.to_list())
                .reset_index(drop=True),
                expect.sort_values(expect.columns.to_list()).reset_index(
                    drop=True
                ),
            )
        else:
            for c in expecto.columns:
                _check_series(expecto[c].fillna(-1), goto[c].fillna(-1))
def test_df_cat_sort_index():
    """``sort_index`` on a categorical index should match pandas."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    indexed = frame.set_index("a")
    got = indexed.sort_index()

    pandas_indexed = frame.to_pandas().set_index("a")
    expect = pandas_indexed.sort_index()

    assert_eq(got, expect)
def test_df_cat_set_index():
    """``set_index`` on a categorical column should match pandas."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    got = frame.set_index("a")

    # Build the pandas reference with ``nullable_pd_dtype=False``.
    expect = frame.to_pandas(nullable_pd_dtype=False).set_index("a")

    assert_eq(got, expect)
def test_groupby_apply_basic_agg_single_column():
    """Sum one column after grouping by two keys; result should match pandas."""
    keys = ["key", "val"]

    gdf = DataFrame()
    gdf["key"] = [0, 0, 1, 1, 2, 2, 0]
    gdf["val"] = [0, 1, 2, 3, 4, 5, 6]
    gdf["mult"] = gdf["key"] * gdf["val"]

    pdf = gdf.to_pandas()

    got = gdf.groupby(keys).mult.sum()
    expect = pdf.groupby(keys).mult.sum()

    assert_eq(expect, got)
def test_df_cat_sort_index():
    """``sort_index`` on a categorical index: compare pieces with pandas."""
    frame = DataFrame()
    frame["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    frame["b"] = np.arange(len(frame))

    got = frame.set_index("a").sort_index()
    expect = frame.to_pandas().set_index("a").sort_index()

    # Column labels first, then the index (as lists and as arrays), and
    # finally the values of the remaining column.
    assert list(expect.columns) == list(got.columns)
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)
    np.testing.assert_array_equal(expect["b"].values, got["b"].to_array())
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """``sort_values(..., ignore_index=...)`` should agree with pandas."""
    data = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(data).set_index(index)
    pdf = gdf.to_pandas()

    # Sort by every column remaining after ``set_index``.
    got = gdf.sort_values(gdf.columns, ignore_index=ignore_index)
    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)

    assert_eq(expect, got)
def test_groupby_apply():
    """``groupby(...).apply`` with a column-adding function should match pandas."""
    np.random.seed(0)
    nelem = 20
    keys = ["key1", "key2"]

    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(keys, as_index=False)
    got_grpby = df.groupby(keys)

    def add_out(group):
        # Attach the sum of the two value columns to each group.
        group["out"] = group["val1"] + group["val2"]
        return group

    expect = expect_grpby.apply(add_out)
    got = got_grpby.apply(add_out)
    assert_eq(expect, got)
def test_groupby_apply_grouped():
    """``apply_grouped`` with a numba kernel should match a pandas emulation."""
    from numba import cuda

    np.random.seed(0)
    nelem = 20
    keys = ["key1", "key2"]

    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(keys, as_index=False, sort=True)
    got_grpby = df.groupby(keys, sort=True)

    # Parameter names mirror the ``incols`` / ``outcols`` names passed below.
    def combine_kernel(key1, val1, com1, com2):
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    outcols = {"com1": np.float64, "com2": np.int32}
    got = got_grpby.apply_grouped(
        combine_kernel,
        incols=["key1", "val1"],
        outcols=outcols,
        tpb=8,
    ).to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(group):
        group["com1"] = group.key1 * 10000 + group.val1
        group["com2"] = np.arange(len(group), dtype=np.int32)
        return group

    expect = expect_grpby.apply(emulate).sort_values(keys)

    assert_eq(expect, got)
def test_to_pandas():
    """``to_pandas`` should keep column names, lengths and numeric dtypes."""
    df = DataFrame()
    df["a"] = np.arange(5, dtype=np.int32)
    df["b"] = np.arange(10, 15, dtype=np.float64)
    df["c"] = np.array([True, False, None, True, True])

    pdf = df.to_pandas()

    assert tuple(df.columns) == tuple(pdf.columns)

    assert df["a"].dtype == pdf["a"].dtype
    assert df["b"].dtype == pdf["b"].dtype

    # Notice, the dtype differ when Pandas and cudf boolean series
    # contains None/NaN.
    # ``np.bool`` / ``np.object`` were deprecated aliases of the builtins and
    # are gone from newer NumPy releases; the scalar types are used instead.
    assert df["c"].dtype == np.bool_
    assert pdf["c"].dtype == np.object_

    assert len(df["a"]) == len(pdf["a"])
    assert len(df["b"]) == len(pdf["b"])
    assert len(df["c"]) == len(pdf["c"])
def test_groupby_apply():
    """``groupby(..., method="cudf").apply`` should match sorted pandas output."""
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False)
    got_grpby = df.groupby(["key1", "key2"], method="cudf")

    def foo(df):
        # Attach the sum of the two value columns to each group.
        df["out"] = df["val1"] + df["val2"]
        return df

    expect = expect_grpby.apply(foo)
    # The pandas result is ordered by the keys and renumbered before the
    # comparison.
    expect = expect.sort_values(["key1", "key2"]).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()
    # ``pd.testing`` is the public home of ``assert_frame_equal``; the old
    # ``pd.util.testing`` path is deprecated.
    pd.testing.assert_frame_equal(expect, got)
def test_set_index_as_property():
    """Assigning to ``DataFrame.index`` should replace the index."""
    first = np.arange(10)
    second = np.arange(0, 20, 2)

    cdf = DataFrame()
    cdf["a"] = first
    cdf["b"] = second

    # Check set_index(Series)
    cdf.index = cdf["b"]
    np.testing.assert_array_equal(cdf.index.values, second)

    # A list wrapping a single 10-element list is rejected.
    with pytest.raises(ValueError):
        cdf.index = [list(range(10))]

    idx = np.arange(0, 1000, 100)
    cdf.index = idx
    np.testing.assert_array_equal(cdf.index.values, idx)

    # The index should carry over to pandas, both in full and for ``head()``.
    as_pandas = cdf.to_pandas()
    np.testing.assert_array_equal(as_pandas.index.values, idx)

    head_pandas = cdf.head().to_pandas()
    np.testing.assert_array_equal(head_pandas.index.values, idx[:5])
def test_set_index_as_property():
    """Assigning to ``DataFrame.index`` should replace the index."""
    first = np.arange(10)
    second = np.arange(0, 20, 2)

    cdf = DataFrame()
    cdf["a"] = first
    cdf["b"] = second

    # Check set_index(Series)
    cdf.index = cdf["b"]
    assert_eq(cdf.index._values.to_array(), second)

    # A list wrapping a single 10-element list is rejected.
    with pytest.raises(ValueError):
        cdf.index = [list(range(10))]

    idx = pd.Index(np.arange(0, 1000, 100))
    cdf.index = idx
    assert_eq(cdf.index.to_pandas(), idx)

    # The index should carry over to pandas, both in full and for ``head()``.
    as_pandas = cdf.to_pandas()
    assert_eq(as_pandas.index, idx)

    head_pandas = cdf.head().to_pandas()
    assert_eq(head_pandas.index, idx[:5])
def test_dataframe_join_how(aa, bb, how, method):
    """Self-join a two-column frame on its columns and compare with pandas.

    ``aa`` and ``bb`` are the column values, ``how`` is the join type and
    ``method`` is forwarded to the cuDF ``join`` call. Each side prints how
    long its join took.
    """
    df = DataFrame()
    df["a"] = aa
    df["b"] = bb

    def work_pandas(df):
        ts = timer()
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print("timing", type(df), te - ts)
        return joined

    def work_gdf(df):
        ts = timer()
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print("timing", type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    # Untouched copies, used by the "outer" branch below.
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    del got["b"]
    got.insert(len(got._data), "b", gotb.astype(np.float64).fillna(np.nan))
    del got["a"]
    got.insert(len(got._data), "a", gota.astype(np.float64).fillna(np.nan))
    # The pandas columns are overwritten in place. The earlier
    # ``expect.drop([...], axis=1)`` calls discarded their result and so had
    # no effect; they have been removed.
    expect["b"] = expectb.astype(np.float64).fillna(np.nan)
    expect["a"] = expecta.astype(np.float64).fillna(np.nan)

    assert got.index.name is None

    assert list(expect.columns) == list(got.columns)
    # test disabled until libgdf sort join gets updated with new api
    if method == "hash":
        assert np.all(expect.index.values == got.index.values)
        if how != "outer":
            # Newly introduced ambiguous ValueError thrown when
            # an index and column have the same name. Rename the
            # index so sorts work.
            # TODO: What is the less hacky way?
            expect.index.name = "bob"
            got.index.name = "mary"
            # ``pd.testing`` is the public home of ``assert_frame_equal``;
            # the old ``pd.util.testing`` path is deprecated.
            pd.testing.assert_frame_equal(
                got.to_pandas().sort_values(["b", "a"]).reset_index(drop=True),
                expect.sort_values(["b", "a"]).reset_index(drop=True),
            )
        else:
            _check_series(expecto["b"].fillna(-1), goto["b"].fillna(-1))
            _check_series(expecto["a"].fillna(-1), goto["a"].fillna(-1))