def test_string_join_non_key(str_data, num_cols, how, how_raise):
    """Self-merge frames with string payload columns on an integer key.

    A pandas frame and a cuDF frame are each given ``num_cols`` string
    columns (named ``0 .. num_cols - 1``, all holding ``str_data``) plus an
    integer column ``"a"``. Each frame is merged with a copy of itself on
    ``"a"`` using join type ``how`` and the two results are compared.

    ``how_raise`` is passed to ``raise_builder`` together with
    ``NotImplementedError``; presumably a truthy flag means the merge is
    expected to raise -- confirm against ``raise_builder``.
    """
    key_values = [1, 2, 3, 4, 5][:len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col in range(num_cols):
        pdf[col] = pd.Series(str_data, dtype="str")
        gdf[col] = Series(str_data, dtype="str")
    pdf["a"] = key_values
    gdf["a"] = key_values

    pdf_other = pdf.copy()
    gdf_other = gdf.copy()

    with raise_builder([how_raise], NotImplementedError):
        expect = pdf.merge(pdf_other, on=["a"], how=how)
        got = gdf.merge(gdf_other, on=["a"], how=how)

        both_empty = len(expect) == 0 and len(got) == 0
        if both_empty:
            # For empty results, reset the pandas index and align the
            # cuDF column order with the pandas one before comparing.
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on ``key`` with a string ``vals`` column on both sides.

    The left frames hold five strings keyed 1..5; the right frames hold
    ``str_data_nulls`` keyed by a prefix of 6..10, so no right key equals
    a left key. The cuDF result must equal the pandas result.
    """
    left_strings = ["a", "b", "c", "d", "e"]
    left_keys = [1, 2, 3, 4, 5]
    right_keys = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # pandas reference frames
    pdf = pd.DataFrame()
    pdf["vals"] = pd.Series(left_strings, dtype="str")
    pdf["key"] = left_keys
    pdf2 = pd.DataFrame()
    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
    pdf2["key"] = pd.Series(right_keys, dtype="int64")

    # frames under test
    gdf = DataFrame()
    gdf["vals"] = Series(left_strings, dtype="str")
    gdf["key"] = left_keys
    gdf2 = DataFrame()
    gdf2["vals"] = Series(str_data_nulls, dtype="str")
    gdf2["key"] = Series(right_keys, dtype="int64")

    expect = pdf.merge(pdf2, on="key", how="left")
    got = gdf.merge(gdf2, on="key", how="left")

    both_empty = len(expect) == 0 and len(got) == 0
    if both_empty:
        # Reset the pandas index and align column order before comparing.
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_string_join_non_key_nulls(str_data_nulls):
    """Compare a cuDF left merge on ``key`` with the pandas equivalent.

    Both libraries get a left frame of five strings keyed 1..5 and a right
    frame of ``str_data_nulls`` keyed by a prefix of 6..10 (keys that never
    occur on the left).
    """
    str_data = ['a', 'b', 'c', 'd', 'e']
    other_data = [1, 2, 3, 4, 5]
    other_data_nulls = [6, 7, 8, 9, 10][:len(str_data_nulls)]

    # Build the same (left, right) pair first with pandas, then with cuDF.
    constructors = ((pd.DataFrame, pd.Series), (DataFrame, Series))
    pairs = []
    for frame_cls, series_cls in constructors:
        left = frame_cls()
        left['vals'] = series_cls(str_data, dtype='str')
        left['key'] = other_data

        right = frame_cls()
        right['vals'] = series_cls(str_data_nulls, dtype='str')
        right['key'] = series_cls(other_data_nulls, dtype='int64')

        pairs.append((left, right))
    (pdf, pdf2), (gdf, gdf2) = pairs

    expect = pdf.merge(pdf2, on='key', how='left')
    got = gdf.merge(gdf2, on='key', how='left')

    if not (len(expect) or len(got)):
        # Both results are empty: normalise index and column order.
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
def test_dataframe_empty_merge():
    """Left-merging two empty frames on ``a`` yields empty ``a, b, c``."""
    left = DataFrame([(name, []) for name in ('a', 'b')])
    right = DataFrame([(name, []) for name in ('a', 'c')])

    expected = DataFrame([(name, []) for name in ('a', 'b', 'c')])
    merged = left.merge(right, how='left', on=['a'])

    assert_eq(expected, merged)
def test_dataframe_empty_merge():
    """Check the column layout of a left merge between two empty frames.

    Frames with columns ``a, b`` and ``a, c`` (all empty) are merged on
    ``a``; the result is compared with an empty ``a, b, c`` frame.
    """

    def empty_frame(*names):
        # One (name, empty list) pair per requested column.
        return DataFrame([(name, []) for name in names])

    gdf_left = empty_frame("a", "b")
    gdf_right = empty_frame("a", "c")
    expect = empty_frame("a", "b", "c")

    got = gdf_left.merge(gdf_right, how="left", on=["a"])
    assert_eq(expect, got)
def test_dataframe_merge_on(on):
    """Check a cuDF left merge on ``on`` against the pandas result.

    Two 500-row frames with random integer keys (fixed seed) are merged
    three ways: with pandas (the reference), with ``DataFrame.merge`` and
    with the module-level ``cudf.merge``. Row and column order are ignored
    in the comparison.

    ``on`` is forwarded unchanged to every merge call; it is supplied by
    the test's parametrization, which is not visible in this block.
    """
    np.random.seed(0)
    nelem = 500

    # Make cuDF
    df_left = DataFrame()
    df_left['key1'] = np.random.randint(0, 40, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['left_val'] = np.arange(nelem)

    df_right = DataFrame()
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['right_val'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how='left')

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how='left')
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how='left')

    # Cast the right-hand columns to float64 and replace nulls with NaN,
    # presumably so unmatched rows compare equal to the NaN that pandas
    # produces for them.
    for result in (join_result, join_result_cudf):
        result['right_val'] = (
            result['right_val'].astype(np.float64).fillna(np.nan)
        )
        for col in pddf_joined.columns:
            if '_y' in col:
                result[col] = (
                    result[col].astype(np.float64).fillna(np.nan)
                )

    # Test dataframe equality (ignore order of rows and columns)
    sort_cols = list(pddf_joined.columns)
    cdf_result = (
        join_result.to_pandas()
        .sort_values(sort_cols)
        .reset_index(drop=True)
    )
    pdf_result = pddf_joined.sort_values(sort_cols).reset_index(drop=True)

    # pd.testing is the public home of assert_frame_equal; the
    # pd.util.testing alias is deprecated and removed in recent pandas.
    pd.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = (
        join_result_cudf.to_pandas()
        .sort_values(sort_cols)
        .reset_index(drop=True)
    )
    pd.testing.assert_frame_equal(
        merge_func_result_cdf, cdf_result, check_like=True
    )
def test_dataframe_merge_no_common_column():
    """Merging without ``on`` must fail when no column name is shared.

    The left frame has ``key1, key2, left_val`` and the right frame has
    ``key3, key4, right_val``; the merge is expected to raise
    ``ValueError`` with a message about missing common columns.
    """
    np.random.seed(0)
    nelem = 500

    # Make cuDF
    left = DataFrame()
    for name, upper in (('key1', 40), ('key2', 50)):
        left[name] = np.random.randint(0, upper, nelem)
    left['left_val'] = np.arange(nelem)

    right = DataFrame()
    for name, upper in (('key3', 30), ('key4', 50)):
        right[name] = np.random.randint(0, upper, nelem)
    right['right_val'] = np.arange(nelem)

    with pytest.raises(ValueError) as excinfo:
        left.merge(right, how='left')
    excinfo.match('No common columns to perform merge on')
def test_dataframe_merge_on_unknown_column():
    """Merging on a column neither frame has must raise ``KeyError``.

    Both frames carry ``key1`` and ``key2``; the merge is requested on
    ``'bad_key'`` and the error message is expected to mention that name.
    """
    np.random.seed(0)
    nelem = 500

    # Make cuDF
    left = DataFrame()
    for name, upper in (('key1', 40), ('key2', 50)):
        left[name] = np.random.randint(0, upper, nelem)
    left['left_val'] = np.arange(nelem)

    right = DataFrame()
    for name, upper in (('key1', 30), ('key2', 50)):
        right[name] = np.random.randint(0, upper, nelem)
    right['right_val'] = np.arange(nelem)

    with pytest.raises(KeyError) as excinfo:
        left.merge(right, on='bad_key', how='left')
    excinfo.match('bad_key')
def test_dataframe_merge_no_common_column():
    """Expect ``ValueError`` from a merge between frames sharing no column.

    No ``on`` argument is given, and the two frames use disjoint column
    names (``key1/key2/left_val`` versus ``key3/key4/right_val``).
    """
    np.random.seed(0)
    row_count = 500

    # Make cuDF
    lhs = DataFrame()
    lhs["key1"] = np.random.randint(low=0, high=40, size=row_count)
    lhs["key2"] = np.random.randint(low=0, high=50, size=row_count)
    lhs["left_val"] = np.arange(row_count)

    rhs = DataFrame()
    rhs["key3"] = np.random.randint(low=0, high=30, size=row_count)
    rhs["key4"] = np.random.randint(low=0, high=50, size=row_count)
    rhs["right_val"] = np.arange(row_count)

    with pytest.raises(ValueError) as error_info:
        lhs.merge(rhs, how="left")

    # The message must contain the expected explanation.
    error_info.match("No common columns to perform merge on")
def test_dataframe_merge_on_unknown_column():
    """Expect ``KeyError`` when ``on`` names a column that does not exist.

    Both frames have ``key1`` and ``key2`` but the merge asks for
    ``"bad_key"``; the raised error must mention that name.
    """
    np.random.seed(0)
    row_count = 500

    # Make cuDF
    lhs = DataFrame()
    lhs["key1"] = np.random.randint(low=0, high=40, size=row_count)
    lhs["key2"] = np.random.randint(low=0, high=50, size=row_count)
    lhs["left_val"] = np.arange(row_count)

    rhs = DataFrame()
    rhs["key1"] = np.random.randint(low=0, high=30, size=row_count)
    rhs["key2"] = np.random.randint(low=0, high=50, size=row_count)
    rhs["right_val"] = np.arange(row_count)

    with pytest.raises(KeyError) as error_info:
        lhs.merge(rhs, on="bad_key", how="left")

    # The offending key must appear in the error text.
    error_info.match("bad_key")
def test_dataframe_merge_order():
    """A two-key left merge must give the same frame as pandas.

    The left frame has columns ``id, timestamp, a`` and the right frame
    ``id, a``; they are merged on ``['id', 'a']``. The cuDF call also
    passes ``method='hash'``.
    """
    left_columns = (('id', [10, 11]), ('timestamp', [1, 2]), ('a', [3, 4]))
    right_columns = (('id', [4, 5]), ('a', [7, 8]))

    # cuDF side
    gdf1 = DataFrame()
    gdf2 = DataFrame()
    for name, values in left_columns:
        gdf1[name] = values
    for name, values in right_columns:
        gdf2[name] = values
    gdf = gdf1.merge(gdf2, how='left', on=['id', 'a'], method='hash')

    # pandas side
    df1 = pd.DataFrame()
    df2 = pd.DataFrame()
    for name, values in left_columns:
        df1[name] = values
    for name, values in right_columns:
        df2[name] = values
    df = df1.merge(df2, how='left', on=['id', 'a'])

    assert_eq(gdf, df)
def test_dataframe_merge_order():
    """Compare a cuDF left merge on ``["id", "a"]`` with pandas.

    Identical small frames are built for both libraries: a left frame with
    ``id, timestamp, a`` and a right frame with ``id, a``. Only the cuDF
    merge is given ``method="hash"``.
    """

    def fill(left, right):
        # Populate an empty (left, right) pair in a fixed column order.
        left["id"] = [10, 11]
        left["timestamp"] = [1, 2]
        left["a"] = [3, 4]
        right["id"] = [4, 5]
        right["a"] = [7, 8]
        return left, right

    cudf_left, cudf_right = fill(DataFrame(), DataFrame())
    got = cudf_left.merge(
        cudf_right, how="left", on=["id", "a"], method="hash"
    )

    pandas_left, pandas_right = fill(pd.DataFrame(), pd.DataFrame())
    expected = pandas_left.merge(pandas_right, how="left", on=["id", "a"])

    assert_eq(got, expected)
def test_dataframe_multi_column_join():
    """Check a cuDF left merge on two key columns against pandas.

    Two 500-row frames with random ``key1``/``key2`` values (fixed seed)
    and a ``val1`` column each are merged on ``['key1', 'key2']``. The
    pandas reference is produced with ``sort=True``; the cuDF result is
    converted to pandas and sorted by every column before the comparison,
    so the row order of the cuDF merge itself is not checked.
    """
    np.random.seed(0)
    nelem = 500

    # Make GDF
    df_left = DataFrame()
    df_left['key1'] = np.random.randint(0, 30, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['val1'] = np.arange(nelem)

    df_right = DataFrame()
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['val1'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result
    pddf_joined = pddf_left.merge(
        pddf_right, on=['key1', 'key2'], how='left', sort=True
    )

    # Test (doesn't check for ordering)
    join_result = df_left.merge(df_right, on=['key1', 'key2'], how='left')

    # Columns whose name contains '_y' are cast to float64 with nulls
    # replaced by NaN, presumably so unmatched rows compare equal to the
    # NaN that pandas produces for them.
    for col in pddf_joined.columns:
        if '_y' in col:
            join_result[col] = (
                join_result[col].astype(np.float64).fillna(np.nan)
            )

    sorted_result = (
        join_result.to_pandas()
        .sort_values(list(pddf_joined.columns))
        .reset_index(drop=True)
    )

    # pd.testing is the public home of assert_frame_equal; the
    # pd.util.testing alias is deprecated and removed in recent pandas.
    pd.testing.assert_frame_equal(sorted_result, pddf_joined)