def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r):
    """Inner-join two frames whose datetime key columns may differ in dtype.

    The two key columns hold the same date strings except for the final
    entry, so the expected result contains only the first four rows. The
    expected key dtype is the larger of the two input dtypes according to
    ``max`` over the corresponding NumPy dtypes.
    """
    labels = ["a", "b", "c", "d", "e"]
    shared_dates = ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01"]

    left_keys = Series(shared_dates + ["2019-08-15"]).astype(dtype_l)
    right_keys = Series(shared_dates + ["2019-08-16"]).astype(dtype_r)

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    left_np_dtype = np.dtype(dtype_l)
    right_np_dtype = np.dtype(dtype_r)
    exp_dtype = max(left_np_dtype, right_np_dtype)

    matched_labels = ["a", "b", "c", "d"]
    expect = DataFrame(
        {
            "join_col": Series(shared_dates, dtype=exp_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    got = left.merge(right, on="join_col", how="inner")

    assert_eq(expect, got)
def test_typecast_on_join_categorical(dtype_l, dtype_r):
    """Inner-join a categorical key column against a non-categorical one.

    The test is skipped unless exactly one side is ``"category"``. The
    expected key column takes the dtype of the categorical side, with its
    categories set from that side's data cast to ``int``. Dtypes are not
    compared in the final assertion (``check_dtype=False``).
    """
    n_categorical = [dtype_l, dtype_r].count("category")
    if n_categorical == 0:
        pytest.skip("at least one side must be category for this set of tests")
    if n_categorical == 2:
        pytest.skip("Can't determine which categorical to use")

    labels = ["a", "b", "c", "d", "e"]
    left_keys = Series([1, 2, 3, 4, 5], dtype=dtype_l)
    right_keys = Series([1, 2, 3, 4, 6], dtype=dtype_r)

    # Exactly one side is categorical here; it drives the expected dtype.
    categorical_keys = left_keys if dtype_l == "category" else right_keys
    exp_dtype = categorical_keys.dtype
    exp_categories = categorical_keys.astype(int)._column

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    matched_labels = ["a", "b", "c", "d"]
    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4], dtype=exp_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )
    expect["join_col"] = expect["join_col"].cat.set_categories(exp_categories)

    got = left.merge(right, on="join_col", how="inner")

    assert_eq(expect, got, check_dtype=False)
def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r):
    """Inner-join an integer key column against a floating-point one.

    Combinations where both dtypes are integer or both are float are
    skipped. The key data contains non-integral values (0.9, 4.5) and a
    differing last entry; the expected result keeps only keys 1, 2 and 3,
    with the key column in the promoted dtype of the two inputs.
    """
    both_int = "int" in dtype_l and "int" in dtype_r
    both_float = "float" in dtype_l and "float" in dtype_r
    if both_int or both_float:
        pytest.skip("like types not tested in this function")

    labels = ["a", "b", "c", "d", "e", "f"]

    left_keys = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l)
    right_keys = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r)

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    # np.find_common_type is deprecated (NumPy 1.25) and removed in 2.0;
    # np.result_type yields the same promoted dtype for two numeric dtypes.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    matched_labels = ["a", "b", "c"]
    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3], dtype=exp_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    got = left.merge(right, on="join_col", how="inner")

    assert_eq(expect, got)
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner-join two integer key columns that may differ in dtype.

    The keys agree on 1 and 2 and differ on the last entry, so two rows are
    expected, with the key column in the promoted dtype of the two inputs.
    """
    labels = ["a", "b", "c"]

    left_keys = Series([1, 2, 3], dtype=dtype_l)
    right_keys = Series([1, 2, 4], dtype=dtype_r)

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    # np.find_common_type is deprecated (NumPy 1.25) and removed in 2.0;
    # np.result_type yields the same promoted dtype for two numeric dtypes.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    matched_labels = ["a", "b"]
    expect = DataFrame(
        {
            "join_col": Series([1, 2], dtype=exp_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    got = left.merge(right, on="join_col", how="inner")

    assert_eq(expect, got)
def test_typecast_on_join_float_to_float(dtype_l, dtype_r):
    """Inner-join two floating-point key columns that may differ in dtype.

    When the dtypes differ, the row keyed by 0.9 is not expected to match
    (presumably because 0.9 is represented differently at different float
    widths -- confirm), whereas 4.5 still is. When the dtypes are equal,
    every key except the differing last entry is expected to match.
    """
    labels = ["a", "b", "c", "d", "e", "f"]

    left_keys = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l)
    right_keys = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r)

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    # np.find_common_type is deprecated (NumPy 1.25) and removed in 2.0;
    # np.result_type yields the same promoted dtype for two numeric dtypes.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    if dtype_l != dtype_r:
        exp_join_data = [1, 2, 3, 4.5]
        exp_other_data = ["a", "b", "c", "e"]
    else:
        exp_join_data = [1, 2, 3, 0.9, 4.5]
        exp_other_data = ["a", "b", "c", "d", "e"]

    expect = DataFrame(
        {
            "join_col": Series(exp_join_data, dtype=exp_dtype),
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = left.merge(right, on="join_col", how="inner")

    assert_eq(expect, got)
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on an integer key where string columns are non-key payload.

    The left frame has keys 1..5; the right frame has keys taken from
    6..10, truncated to the length of ``str_data_nulls``. The cuDF result
    is compared against the pandas result of the same merge.
    """
    left_strings = ["a", "b", "c", "d", "e"]
    left_keys = [1, 2, 3, 4, 5]
    right_keys = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # Left-hand frames (pandas reference and cuDF under test).
    pdf = pd.DataFrame()
    pdf["vals"] = pd.Series(left_strings, dtype="str")
    pdf["key"] = left_keys

    gdf = DataFrame()
    gdf["vals"] = Series(left_strings, dtype="str")
    gdf["key"] = left_keys

    # Right-hand frames.
    pdf2 = pd.DataFrame()
    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
    pdf2["key"] = pd.Series(right_keys, dtype="int64")

    gdf2 = DataFrame()
    gdf2["vals"] = Series(str_data_nulls, dtype="str")
    gdf2["key"] = Series(right_keys, dtype="int64")

    expect = pdf.merge(pdf2, on="key", how="left")
    got = gdf.merge(gdf2, on="key", how="left")

    both_empty = len(expect) == 0 and len(got) == 0
    if both_empty:
        # Normalise index and column order before comparing empty results.
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    """Self-merge on an integer key with ``num_cols`` string payload columns.

    Each payload column (named 0..num_cols-1) holds ``str_data``; the key
    column ``"a"`` holds 1..5 truncated to ``len(str_data)``. The merge and
    comparison run inside the context returned by
    ``raise_builder([how_raise], NotImplementedError)`` (presumably it
    expects NotImplementedError when ``how_raise`` is truthy -- confirm
    against the helper).
    """
    key_values = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col_idx in range(num_cols):
        pdf[col_idx] = pd.Series(str_data, dtype="str")
        gdf[col_idx] = Series(str_data, dtype="str")
    pdf["a"] = key_values
    gdf["a"] = key_values

    pdf2 = pdf.copy()
    gdf2 = gdf.copy()

    expectation = raise_builder([how_raise], NotImplementedError)

    with expectation:
        expect = pdf.merge(pdf2, on=["a"], how=how)
        got = gdf.merge(gdf2, on=["a"], how=how)

        both_empty = len(expect) == 0 and len(got) == 0
        if both_empty:
            # Normalise index and column order before comparing empty results.
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
def test_merge_multi(kwargs):
    """Merge on multiple columns and/or multi-level indexes.

    ``kwargs`` is forwarded to ``merge`` and must contain ``left_on``,
    ``right_on``, ``left_index`` and ``right_index``; their combination
    decides which columns are moved into the index of each frame before
    merging. The cuDF result is compared with the pandas result of the same
    merge: first the sorted indexes, then the row contents after resetting
    the index and sorting by every column.
    """
    left = DataFrame(
        {
            "a": [1, 2, 3, 4, 3, 5, 6],
            "b": [1, 3, 5, 7, 5, 9, 0],
            "c": ["o", "p", "q", "r", "s", "t", "u"],
            "d": ["v", "w", "x", "y", "z", "1", "2"],
        }
    )
    right = DataFrame(
        {
            "a": [0, 9, 3, 4, 3, 7, 8],
            "b": [2, 4, 5, 7, 5, 6, 8],
            "c": ["a", "b", "c", "d", "e", "f", "g"],
            "d": ["j", "i", "j", "k", "l", "m", "n"],
        }
    )

    if (
        kwargs["left_on"] is not None
        and kwargs["right_on"] is not None
        and kwargs["left_index"] is False
        and kwargs["right_index"] is False
    ):
        # Column-on-column merge: park the unused columns in the index.
        left = left.set_index(["c", "d"])
        right = right.set_index(["c", "d"])
    elif (
        kwargs["left_on"] is None
        and kwargs["right_on"] is None
        and kwargs["left_index"] is True
        and kwargs["right_index"] is True
    ):
        # Index-on-index merge.
        left = left.set_index(["a", "b"])
        right = right.set_index(["a", "b"])
    elif kwargs["left_on"] is not None and kwargs["right_index"] is True:
        # Left columns against right index.
        left = left.set_index(["c", "d"])
        right = right.set_index(["a", "b"])
    elif kwargs["right_on"] is not None and kwargs["left_index"] is True:
        # Left index against right columns.
        left = left.set_index(["a", "b"])
        right = right.set_index(["c", "d"])

    pandas_left = left.to_pandas()
    pandas_right = right.to_pandas()

    # pytest hands the parametrized dict to the test without copying it, so
    # add ``sort`` on a private copy instead of mutating the shared object.
    merge_kwargs = dict(kwargs, sort=True)

    expect = pandas_left.merge(pandas_right, **merge_kwargs)
    got = left.merge(right, **merge_kwargs)

    assert_eq(expect.sort_index().index, got.sort_index().index)

    # Row order is not compared: drop the index, sort by every column, then
    # renumber so both frames line up positionally.
    expect.index = range(len(expect))
    got.index = range(len(got))
    expect = expect.sort_values(list(expect.columns))
    got = got.sort_values(list(got.columns))
    expect.index = range(len(expect))
    got.index = range(len(got))

    assert_eq(expect, got)
def test_dataframe_empty_merge_from_dict():
    """Left-merge two empty frames built from dicts of empty columns.

    The result is expected to be an empty frame with columns a, b and c.
    """
    # Named distinctly on purpose: this module defines another
    # ``test_dataframe_empty_merge`` further down, and sharing that name
    # would let the later definition shadow this one so it never ran.
    left = DataFrame({"a": [], "b": []})
    right = DataFrame({"a": [], "c": []})

    expect = DataFrame({"a": [], "b": [], "c": []})
    got = left.merge(right, how="left", on=["a"])

    assert_eq(expect, got)
def test_dataframe_empty_merge():
    """Left-merge two empty frames built from (name, values) pairs.

    The result is expected to be an empty frame with columns a, b and c.
    """
    left_columns = [("a", []), ("b", [])]
    right_columns = [("a", []), ("c", [])]
    expected_columns = [("a", []), ("b", []), ("c", [])]

    left = DataFrame(left_columns)
    right = DataFrame(right_columns)
    expect = DataFrame(expected_columns)

    got = left.merge(right, how="left", on=["a"])

    assert_eq(expect, got)
def test_dataframe_merge_no_common_column():
    """Merging frames that share no column names must raise ValueError.

    No ``on`` argument is given and the two frames have disjoint column
    names; the error message is checked against
    "No common columns to perform merge on".
    """
    np.random.seed(0)
    nelem = 500

    # Build the cuDF frames; the draw order below fixes the random data.
    left = DataFrame()
    left["key1"] = np.random.randint(0, 40, nelem)
    left["key2"] = np.random.randint(0, 50, nelem)
    left["left_val"] = np.arange(nelem)

    right = DataFrame()
    right["key3"] = np.random.randint(0, 30, nelem)
    right["key4"] = np.random.randint(0, 50, nelem)
    right["right_val"] = np.arange(nelem)

    with pytest.raises(ValueError) as excinfo:
        left.merge(right, how="left")
    excinfo.match("No common columns to perform merge on")
def test_dataframe_merge_on_unknown_column():
    """Merging on a column name present in neither frame must raise KeyError.

    The error message is checked to mention the offending name, "bad_key".
    """
    np.random.seed(0)
    nelem = 500

    # Build the cuDF frames; the draw order below fixes the random data.
    left = DataFrame()
    left["key1"] = np.random.randint(0, 40, nelem)
    left["key2"] = np.random.randint(0, 50, nelem)
    left["left_val"] = np.arange(nelem)

    right = DataFrame()
    right["key1"] = np.random.randint(0, 30, nelem)
    right["key2"] = np.random.randint(0, 50, nelem)
    right["right_val"] = np.arange(nelem)

    with pytest.raises(KeyError) as excinfo:
        left.merge(right, on="bad_key", how="left")
    excinfo.match("bad_key")
def test_dataframe_merge_on(on):
    """Left-merge on ``on`` and compare cuDF against pandas.

    Both ``DataFrame.merge`` and the module-level ``cudf.merge`` are
    exercised. Before comparison, ``right_val`` and every ``*_y`` column of
    the cuDF results are cast to float64 with nulls filled as NaN. Row and
    column order are ignored: both sides are sorted by all columns and
    compared with ``check_like=True``.
    """
    np.random.seed(0)
    nelem = 500

    # Make cuDF frames; the draw order below fixes the random data.
    df_left = DataFrame()
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    df_right = DataFrame()
    df_right["key1"] = np.random.randint(0, 30, nelem)
    df_right["key2"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    # Expected result (from pandas)
    pddf_joined = df_left.to_pandas().merge(
        df_right.to_pandas(), on=on, how="left"
    )
    sort_cols = list(pddf_joined.columns)

    # Results under test: the method form and the function form.
    join_result = df_left.merge(df_right, on=on, how="left")
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how="left")

    # Columns coming from the right frame: cast to float64 and fill nulls
    # with NaN so they can be compared with the pandas result.
    right_side_cols = ["right_val"] + [col for col in sort_cols if "_y" in col]
    for result in (join_result, join_result_cudf):
        for col in right_side_cols:
            result[col] = result[col].astype(np.float64).fillna(np.nan)

    def _sorted_pandas(frame):
        """Sort rows by every expected column and renumber the index."""
        return frame.sort_values(sort_cols).reset_index(drop=True)

    cdf_result = _sorted_pandas(join_result.to_pandas())
    pdf_result = _sorted_pandas(pddf_joined)

    # pd.util.testing is deprecated (pandas 1.0) and removed in pandas 2.0;
    # pd.testing is the supported public location of assert_frame_equal.
    pd.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = _sorted_pandas(join_result_cudf.to_pandas())
    pd.testing.assert_frame_equal(
        merge_func_result_cdf, cdf_result, check_like=True
    )
def test_typecast_on_join_no_float_round():
    """Left-join int8 keys against float32 keys holding non-integral values.

    The right keys 4.01 and 4.99 are expected not to match the left keys 4
    and 5, so those rows get a null ``B_y``. The expected key column is
    float32.
    """
    labels = ["a", "b", "c", "d", "e"]

    left_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    right_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    left = DataFrame({"join_col": left_keys, "B": labels})
    right = DataFrame({"join_col": right_keys, "B": labels})

    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
            "B_x": ["a", "b", "c", "d", "e"],
            "B_y": ["a", "b", "c", None, None],
        }
    )

    got = left.merge(right, on="join_col", how="left")

    assert_eq(expect, got)
def test_dataframe_merge_order():
    """Left-merge on two keys and compare the cuDF result with pandas.

    The left frame has columns id, timestamp, a and the right frame has
    id, a; the cuDF merge is run with ``method="hash"`` and the result is
    compared to the equivalent pandas merge.
    """
    # cuDF frames under test.
    gdf_left = DataFrame()
    gdf_left["id"] = [10, 11]
    gdf_left["timestamp"] = [1, 2]
    gdf_left["a"] = [3, 4]

    gdf_right = DataFrame()
    gdf_right["id"] = [4, 5]
    gdf_right["a"] = [7, 8]

    gdf_merged = gdf_left.merge(
        gdf_right, how="left", on=["id", "a"], method="hash"
    )

    # pandas reference frames with the same contents.
    pdf_left = pd.DataFrame()
    pdf_left["id"] = [10, 11]
    pdf_left["timestamp"] = [1, 2]
    pdf_left["a"] = [3, 4]

    pdf_right = pd.DataFrame()
    pdf_right["id"] = [4, 5]
    pdf_right["a"] = [7, 8]

    pdf_merged = pdf_left.merge(pdf_right, how="left", on=["id", "a"])

    assert_eq(gdf_merged, pdf_merged)