def test_dataframe_pairs_of_triples(pairs, max, rows, how):
    """Merge random-integer frames whose column names come from ``pairs``.

    When the frames share no columns, both pandas and cudf must refuse the
    merge; otherwise the merged results must agree column-by-column.
    Note: ``max`` shadows the builtin but is part of the test's parametrized
    signature, so it is left unchanged.
    """
    np.random.seed(0)
    pdf_left = pd.DataFrame()
    pdf_right = pd.DataFrame()
    for left_column in pairs[0]:
        pdf_left[left_column] = np.random.randint(0, max, rows)
    for right_column in pairs[1]:
        pdf_right[right_column] = np.random.randint(0, max, rows)
    gdf_left = DataFrame.from_pandas(pdf_left)
    gdf_right = DataFrame.from_pandas(pdf_right)
    # The original code had two branches (empty set intersection, and an
    # empty "columns of left present in right" list) with byte-identical
    # bodies; both detect "no common columns", so they are merged here.
    if not set(pdf_left.columns).intersection(pdf_right.columns) or not [
        value for value in pdf_left if value in pdf_right
    ]:
        with pytest.raises(pd.core.reshape.merge.MergeError) as raises:
            pdf_left.merge(pdf_right)
        raises.match("No common columns to perform merge on")
        with pytest.raises(ValueError) as raises:
            gdf_left.merge(gdf_right)
        raises.match("No common columns to perform merge on")
    else:
        pdf_result = pdf_left.merge(pdf_right, how=how)
        gdf_result = gdf_left.merge(gdf_right, how=how)
        assert np.array_equal(gdf_result.columns, pdf_result.columns)
        # Row order may differ between libraries: compare sorted values,
        # with NA filled so NaN != NaN does not break the comparison.
        for column in gdf_result:
            assert np.array_equal(
                gdf_result[column].fillna(-1).sort_values(),
                pdf_result[column].fillna(-1).sort_values(),
            )
def test_merge_left_right_index_left_right_on_kwargs2(kwargs):
    """Merge frames with mismatched indexes, driven entirely by ``kwargs``."""
    left = pd.DataFrame({"x": [1, 2, 3]}, index=[10, 20, 30])
    right = pd.DataFrame({"y": [10, 20, 30]}, index=[1, 2, 30])
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    got = gleft.merge(gright, **kwargs)
    expect = left.merge(right, **kwargs)
    # If pandas produced an empty frame, cudf must have as well.
    if expect.empty:
        assert got.empty
def test_merge_left_right_index_left_right_on_kwargs(kwargs):
    """pandas and cudf merges under parametrized ``kwargs`` must match."""
    left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6])
    right = pd.DataFrame(
        {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7]
    )
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    expect = left.merge(right, **kwargs)
    got = gleft.merge(gright, **kwargs)
    assert_eq(expect, got)
def test_merge_left_index_zero():
    """Column-on-column merge when both indexes start at zero."""
    left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5])
    right = pd.DataFrame(
        {"y": [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6]
    )
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    expect = left.merge(right, left_on="x", right_on="y")
    got = gleft.merge(gright, left_on="x", right_on="y")
    assert_eq(expect, got)
def test_merge_sort(kwargs, hows):
    """Sorted merge over five identical key columns, all join styles."""
    kwargs.update(hows)
    kwargs["sort"] = True
    values = range(3)
    # Column insertion order deliberately differs between the two frames.
    left = pd.DataFrame(
        {"k2": values, "k1": values, "k4": values, "k3": values, "k5": values}
    )
    right = pd.DataFrame(
        {"k1": values, "k4": values, "k2": values, "k3": values, "k5": values}
    )
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    got = gleft.merge(gright, **kwargs)
    expect = left.merge(right, **kwargs)
    # If pandas produced an empty frame, cudf must have as well.
    if expect.empty:
        assert got.empty
def test_merge_sort(kwargs, hows):
    """Sorted merge over five identical key columns, all join styles."""
    kwargs.update(hows)
    kwargs["sort"] = True
    d = range(3)
    # The two frames hold the same data but insert columns differently.
    left = pd.DataFrame(
        {"k2": d, "k1": d, "k4": d, "k3": d, "k5": d}
    )
    right = pd.DataFrame(
        {"k1": d, "k4": d, "k2": d, "k3": d, "k5": d}
    )
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    cudf_result = gleft.merge(gright, **kwargs)
    pandas_result = left.merge(right, **kwargs)
    # An empty pandas result implies the cudf result is empty too.
    if pandas_result.empty:
        assert cudf_result.empty
def test_dataframe_join_mismatch_cats(how):
    """Join on categorical indexes whose category sets do not match."""
    pdf1 = pd.DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "data_col_left": [10, 20, 30, 40, 50],
        }
    )
    pdf2 = pd.DataFrame(
        {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]}
    )
    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")
    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)

    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")
    pdf1 = pdf1.set_index("join_col")
    pdf2 = pdf2.set_index("join_col")

    join_gdf = gdf1.join(gdf2, how=how, sort=True, method="hash")
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    # cudf's join does not mask NA, so fill the pandas side to match.
    expect = join_pdf.fillna(-1)

    # cudf orders the columns differently than pandas for a right join.
    if how == "right":
        got = got[["data_col_left", "data_col_right"]]

    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)

    # Workaround: the expected frame carries the wrong index type. This
    # suggests CategoricalIndex.to_pandas() is not working correctly, since
    # the line below corrects it; removing it triggers:
    #   AssertionError: Categorical Expected type <class
    #   'pandas.core.arrays.categorical.Categorical'>, found <class
    #   'numpy.ndarray'> instead
    # TODO: make CategoricalIndex.to_pandas() handle this case correctly.
    expect.index = pd.Categorical(expect.index)

    pd.util.testing.assert_frame_equal(
        got,
        expect,
        check_names=False,
        check_index_type=False,
        # Inner joins in pandas yield odd categories; skip that check.
        check_categorical=how != "inner",
    )
    assert list(got.index) == list(expect.index)
def test_merge_left_right_index_left_right_on_kwargs(kwargs):
    """Merge under ``kwargs``; left_on+right_on together is unsupported."""
    left = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6])
    right = pd.DataFrame(
        {"y": [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7]
    )
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    pd_merge = left.merge(right, **kwargs)
    if kwargs.get("left_on") and kwargs.get("right_on"):
        # cudf does not yet support distinct left/right merge keys.
        with pytest.raises(NotImplementedError) as raises:
            gd_merge = gleft.merge(gright, **kwargs)
        raises.match("left_on='x', right_on='y' not supported")
    else:
        gd_merge = gleft.merge(gright, **kwargs)
        assert_eq(pd_merge, gd_merge)
def test_join_datetimes_index(dtype):
    """Index join where the right frame carries a datetime column."""
    datetimes = pd.Series(pd.date_range("20010101", "20010102", freq="12h"))
    pdf_lhs = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 1])
    pdf_rhs = pd.DataFrame({"d": datetimes})
    gdf_lhs = DataFrame.from_pandas(pdf_lhs)
    gdf_rhs = DataFrame.from_pandas(pdf_rhs)

    # Cast the cudf side to the parametrized datetime resolution.
    gdf_rhs["d"] = gdf_rhs["d"].astype(dtype)

    pdf = pdf_lhs.join(pdf_rhs, sort=True)
    gdf = gdf_lhs.join(gdf_rhs, sort=True)

    # The cast must survive the join.
    assert gdf["d"].dtype == np.dtype(dtype)
    assert_eq(pdf, gdf)
def test_groupby_column_numeral():
    """Group-by works with non-string (int and float) column labels."""
    # Integer column labels.
    pdf = pd.DataFrame({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    expect = pdf.groupby(1)[0].sum()
    got = gdf.groupby(1)[0].sum()
    assert_eq(expect, got)

    # Float column labels.
    pdf = pd.DataFrame({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    expect = pdf.groupby(1.5)[0.5].sum()
    got = gdf.groupby(1.5)[0.5].sum()
    assert_eq(expect, got)
def test_dataframe_setitem_from_masked_object():
    """Null masks are built correctly from NaN arrays, device arrays and
    Python lists containing ``None``."""
    ary = np.random.randn(100)
    mask = np.zeros(100, dtype=bool)
    mask[:20] = True
    np.random.shuffle(mask)
    ary[mask] = np.nan  # exactly 20 NaNs at random positions

    # From a host NumPy array.
    test1 = Series(ary)
    assert test1.has_null_mask
    assert test1.null_count == 20

    # From a pandas DataFrame.
    test2 = DataFrame.from_pandas(pd.DataFrame({"a": ary}))
    assert test2["a"].has_null_mask
    assert test2["a"].null_count == 20

    # From a device array.
    gpu_ary = rmm.to_device(ary)
    test3 = Series(gpu_ary)
    assert test3.has_null_mask
    assert test3.null_count == 20

    # From a list with explicit Nones.
    test4 = DataFrame()
    lst = [1, 2, None, 4, 5, 6, None, 8, 9]
    test4["lst"] = lst
    assert test4["lst"].has_null_mask
    assert test4["lst"].null_count == 2
def test_dataframe_multi_column_nulls(
    num_cols, num_rows, dtype, nulls, ascending, na_position
):
    """Multi-column sort_values with partial or total nulls matches pandas."""
    from string import ascii_lowercase

    np.random.seed(0)
    by = list(ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    # NOTE(review): five columns are always built regardless of num_cols;
    # presumably num_cols <= 5 in the parametrization — TODO confirm.
    for i in range(5):
        colname = ascii_lowercase[i]
        data = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            idx = np.array([], dtype="int64")
            if num_rows > 0:
                # Null out a random quarter of the rows.
                idx = np.random.choice(
                    num_rows, size=int(num_rows / 4), replace=False
                )
            data[idx] = np.nan
        elif nulls == "all":
            data[:] = np.nan
        pdf[colname] = data

    gdf = DataFrame.from_pandas(pdf)
    got = gdf.sort_values(by, ascending=ascending, na_position=na_position)
    expect = pdf.sort_values(by, ascending=ascending, na_position=na_position)
    assert_eq(
        got[by].reset_index(drop=True), expect[by].reset_index(drop=True)
    )
def test_fillna_dataframe(fill_type, inplace):
    """fillna with a scalar, a Series, or a per-column dict matches pandas."""
    pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    gdf = DataFrame.from_pandas(pdf)

    if fill_type == "scalar":
        fill_value_pd = 5
        fill_value_cudf = fill_value_pd
    elif fill_type == "series":
        fill_value_pd = pd.Series([3, 4, 5])
        fill_value_cudf = Series.from_pandas(fill_value_pd)
    else:
        fill_value_pd = {"a": 5, "b": pd.Series([3, 4, 5])}
        fill_value_cudf = {
            "a": fill_value_pd["a"],
            "b": Series.from_pandas(fill_value_pd["b"]),
        }

    # https://github.com/pandas-dev/pandas/issues/27197
    # pandas DataFrame.fillna with a Series is broken, so fill per column.
    if isinstance(fill_value_pd, pd.Series):
        expect = pd.DataFrame()
        for col in pdf.columns:
            expect[col] = pdf[col].fillna(fill_value_pd)
    else:
        expect = pdf.fillna(fill_value_pd)

    got = gdf.fillna(fill_value_cudf, inplace=inplace)
    if inplace:
        # In-place fill returns None; the mutated frame is the result.
        got = gdf
    assert_eq(expect, got)
def test_dataframe_replace_with_nulls():
    """replace() targeting None produces nulls that fillna can restore."""
    # Scalar replacement.
    pdf1 = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    gdf1 = DataFrame.from_pandas(pdf1)
    pdf2 = pdf1.replace(0, 4)
    gdf2 = gdf1.replace(0, None).fillna(4)
    pd.testing.assert_frame_equal(gdf2.to_pandas(), pdf2)

    # List inputs.
    pdf6 = pdf1.replace([0, 1], [4, 5])
    gdf6 = gdf1.replace([0, 1], [4, None]).fillna(5)
    pd.testing.assert_frame_equal(gdf6.to_pandas(), pdf6)

    pdf7 = pdf1.replace([0, 1], 4)
    gdf7 = gdf1.replace([0, 1], None).fillna(4)
    pd.testing.assert_frame_equal(gdf7.to_pandas(), pdf7)

    # Dict inputs.
    pdf8 = pdf1.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    gdf8 = gdf1.replace({"a": 0, "b": 0}, {"a": None, "b": 5}).fillna(4)
    pd.testing.assert_frame_equal(gdf8.to_pandas(), pdf8)

    # A frame that already contains a null before replace().
    gdf1 = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    gdf9 = gdf1.replace([0, 1], [4, 5]).fillna(3)
    pd.testing.assert_frame_equal(gdf9.to_pandas(), pdf6)
def test_dataframe_merge_strings_not_supported():
    """Converting a frame with string columns must raise NotImplementedError."""
    pleft = pd.DataFrame(
        {"x": [0, 1, 2, 3], "name": ["Alice", "Bob", "Charlie", "Dan"]}
    )
    with pytest.raises(NotImplementedError) as raises:
        gleft = DataFrame.from_pandas(pleft)  # noqa:F841
    raises.match("Strings are not yet supported")
def test_dataframe_join_mismatch_cats(how):
    """Join on categorical indexes with differing category sets."""
    pdf1 = pd.DataFrame(
        {
            "join_col": ["a", "b", "c", "d", "e"],
            "data_col_left": [10, 20, 30, 40, 50],
        }
    )
    pdf2 = pd.DataFrame(
        {"join_col": ["c", "e", "f"], "data_col_right": [6, 7, 8]}
    )
    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")
    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)

    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")
    pdf1 = pdf1.set_index("join_col")
    pdf2 = pdf2.set_index("join_col")

    join_gdf = gdf1.join(gdf2, how=how, sort=True, method="hash")
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    # cudf's join does not mask NA, so fill the pandas side to match.
    expect = join_pdf.fillna(-1)

    # cudf orders the columns differently than pandas for a right join.
    if how == "right":
        got = got[["data_col_left", "data_col_right"]]

    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)
    # Pandas returns an `object` dtype index for some reason...
    expect.index = expect.index.astype("category")
    pd.util.testing.assert_frame_equal(
        got,
        expect,
        check_names=False,
        check_index_type=False,
        # Inner joins in pandas yield odd categories; skip that check.
        check_categorical=how != "inner",
    )
    assert list(got.index) == list(expect.index)
def test_groupby_column_name():
    """Group-by on a named column and summing another matches pandas."""
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    got = gdf.groupby("yy")["xx"].sum()
    expect = pdf.groupby("yy")["xx"].sum()
    assert_eq(expect, got)
def test_groupby_level_zero(agg):
    """Group-by on index level 0 with a parametrized aggregation."""
    pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[0, 1, 1])
    gdf = DataFrame.from_pandas(pdf)
    pd_grouped = pdf.groupby(level=0)
    gd_grouped = gdf.groupby(level=0)
    # Resolve the aggregation (e.g. "sum", "mean") by name and invoke it.
    expect = getattr(pd_grouped, agg)()
    got = getattr(gd_grouped, agg)()
    assert_eq(expect, got)
def test_groupby_column_name():
    """Group-by on a named column and summing another matches pandas."""
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    cudf_grouped = gdf.groupby("yy")
    pandas_grouped = pdf.groupby("yy")
    gxx = cudf_grouped["xx"].sum()
    pxx = pandas_grouped["xx"].sum()
    assert_eq(pxx, gxx)
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """sort_values on a sliced frame matches pandas on the same slice."""
    np.random.seed(0)
    df = pd.DataFrame()
    df["a"] = np.random.random(nelem)

    expect = df[sliceobj]["a"].sort_values()
    gdf = DataFrame.from_pandas(df)
    got = gdf[sliceobj]["a"].sort_values()
    assert (got.to_pandas() == expect).all()
def test_groupby_level_zero(agg):
    """Group-by on index level 0; dtype check is relaxed for count."""
    pdf = pd.DataFrame({"x": [1, 2, 3]}, index=[0, 1, 1])
    gdf = DataFrame.from_pandas(pdf)
    pdg = pdf.groupby(level=0)
    gdg = gdf.groupby(level=0)
    # Look up the aggregation (e.g. "sum") by name and call it.
    pdresult = getattr(pdg, agg)()
    gdresult = getattr(gdg, agg)()
    # count produces a different dtype in cudf, so skip the dtype check there.
    check_dtype = agg != "count"
    assert_eq(pdresult, gdresult, check_dtype=check_dtype)
def test_string_set_scalar(scalar):
    """Assigning a string scalar to a new column matches pandas."""
    pdf = pd.DataFrame()
    pdf["a"] = [1, 2, 3, 4, 5]
    gdf = DataFrame.from_pandas(pdf)

    # Broadcast the scalar across every row of a new column.
    pdf["b"] = "a"
    gdf["b"] = "a"

    assert_eq(pdf["b"], gdf["b"])
    assert_eq(pdf, gdf)
def test_query_empty_frames():
    """query() on an empty frame returns an empty result, like pandas."""
    empty_pdf = pd.DataFrame({"a": [], "b": []})
    empty_gdf = DataFrame.from_pandas(empty_pdf)
    expr = "a > 2"
    got = empty_gdf.query(expr).to_pandas()
    expect = empty_pdf.query(expr)
    # Results must be equal (both empty, same structure).
    assert_frame_equal(got, expect)
def test_dataframe_nsmallest_sliced(counts, sliceobj):
    """nsmallest on a sliced frame matches pandas on the same slice."""
    nelem, n = counts
    np.random.seed(0)
    df = pd.DataFrame()
    df["a"] = np.random.random(nelem)
    df["b"] = np.random.random(nelem)

    expect = df[sliceobj].nsmallest(n, "a")
    gdf = DataFrame.from_pandas(df)
    got = gdf[sliceobj].nsmallest(n, "a")
    assert (got.to_pandas() == expect).all().all()
def test_dataframe_nlargest_sliced(counts, sliceobj):
    """nlargest on a sliced frame matches pandas on the same slice."""
    nelem, n = counts
    np.random.seed(0)
    df = pd.DataFrame()
    df["a"] = np.random.random(nelem)
    df["b"] = np.random.random(nelem)

    expect = df[sliceobj].nlargest(n, "a")
    gdf = DataFrame.from_pandas(df)
    got = gdf[sliceobj].nlargest(n, "a")
    assert (got.to_pandas() == expect).all().all()
def test_safe_merging_with_left_empty():
    """Merging when the left frame has zero rows must not crash."""
    import numpy as np
    from cudf import DataFrame
    import pandas as pd

    np.random.seed(0)
    pairs = ("bcd", "b")
    pdf_left = pd.DataFrame()
    pdf_right = pd.DataFrame()
    # Left columns are created with zero rows; right columns with five.
    for left_column in pairs[0]:
        pdf_left[left_column] = np.random.randint(0, 10, 0)
    for right_column in pairs[1]:
        pdf_right[right_column] = np.random.randint(0, 10, 5)
    gdf_left = DataFrame.from_pandas(pdf_left)
    gdf_right = DataFrame.from_pandas(pdf_right)

    pdf_result = pdf_left.merge(pdf_right)
    gdf_result = gdf_left.merge(gdf_right)
    # Compare only lengths: pandas does not consider an empty Index and a
    # RangeIndex equivalent. TODO: allow empty Index objects to compare equal.
    assert len(pdf_result) == len(gdf_result)
def test_from_pandas_with_index():
    """Round-trip a pandas frame with a custom index through cudf."""
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    pdf = pdf.set_index(np.asarray([4, 3, 2, 1]))
    df = DataFrame.from_pandas(pdf)

    # Columns survive the conversion (NaN filled pandas-style).
    np.testing.assert_array_equal(df.a.to_array(fillna="pandas"), pdf.a)
    np.testing.assert_array_equal(df.b.to_array(fillna="pandas"), pdf.b)
    # Index survives the conversion.
    np.testing.assert_array_equal(df.index.values, pdf.index.values)
    # Whole-frame comparison via pandas' own testing tool.
    pd.util.testing.assert_frame_equal(df.to_pandas(), pdf)
def test_categorical_index():
    """Categorical indexes built via from_pandas and via direct assignment
    both behave as CategoricalIndex and match pandas."""
    pdf = pd.DataFrame()
    pdf["a"] = [1, 2, 3]
    pdf["index"] = pd.Categorical(["a", "b", "c"])
    initial_df = DataFrame.from_pandas(pdf)
    pdf = pdf.set_index("index")

    # Path 1: convert after pandas sets the index.
    gdf1 = DataFrame.from_pandas(pdf)

    # Path 2: build the cudf frame directly, then set the index.
    gdf2 = DataFrame()
    gdf2["a"] = [1, 2, 3]
    gdf2["index"] = pd.Categorical(["a", "b", "c"])
    # Compare while "index" is still an ordinary column.
    assert_eq(initial_df.index, gdf2.index)
    gdf2 = gdf2.set_index("index")

    assert isinstance(gdf1.index, CategoricalIndex)
    assert_eq(pdf, gdf1)
    assert_eq(pdf.index, gdf1.index)
    assert_eq(pdf.index.codes, gdf1.index.codes.to_array())

    assert isinstance(gdf2.index, CategoricalIndex)
    assert_eq(pdf, gdf2)
    assert_eq(pdf.index, gdf2.index)
    assert_eq(pdf.index.codes, gdf2.index.codes.to_array())
def test_dataframe_append_empty():
    """Assigning a scalar to a new column broadcasts to every row."""
    pdf = pd.DataFrame(
        {
            "key": [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4],
            "value": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        }
    )
    gdf = DataFrame.from_pandas(pdf)

    gdf["newcol"] = 100
    pdf["newcol"] = 100

    # The broadcast column has the full row count in both libraries.
    assert len(gdf["newcol"]) == len(pdf)
    assert len(pdf["newcol"]) == len(pdf)
    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
def test_from_pandas_ex1():
    """Basic from_pandas conversion: columns, values, and NaN handling."""
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)
    assert np.all(df["a"].to_array() == pdf["a"])
    matches = df["b"].to_array(fillna="pandas") == pdf["b"]
    # Element 2 compares False because NaN == NaN is False.
    assert np.all(matches == [True, True, False, True])
    assert np.isnan(df["b"].to_array(fillna="pandas")[2])
    assert np.isnan(pdf["b"][2])