示例#1
0
def test_dataframe_pairs_of_triples(pairs, max, rows, how):
    """Check that a default-key cudf merge agrees with pandas.

    Parameters
    ----------
    pairs : sequence of two iterables
        ``pairs[0]`` yields the column names of the left frame and
        ``pairs[1]`` those of the right frame.
    max : int
        Exclusive upper bound of the random integers stored in each column.
        The name shadows the builtin but is part of the test signature, so
        it is left unchanged.
    rows : int
        Number of rows generated for every column.
    how : str
        Join type forwarded to ``merge`` when the frames share a column.
    """
    np.random.seed(0)

    pdf_left = pd.DataFrame()
    pdf_right = pd.DataFrame()
    for left_column in pairs[0]:
        pdf_left[left_column] = np.random.randint(0, max, rows)
    for right_column in pairs[1]:
        pdf_right[right_column] = np.random.randint(0, max, rows)
    gdf_left = DataFrame.from_pandas(pdf_left)
    gdf_right = DataFrame.from_pandas(pdf_right)

    if not set(pdf_left.columns).intersection(pdf_right.columns):
        # Without a shared column there is no implicit join key: both
        # libraries must refuse the merge with the same message.
        # (A second branch used to repeat this exact condition in list
        # form; it could never be reached and has been removed.)
        with pytest.raises(pd.core.reshape.merge.MergeError) as raises:
            pdf_left.merge(pdf_right)
        raises.match("No common columns to perform merge on")
        with pytest.raises(ValueError) as raises:
            gdf_left.merge(gdf_right)
        raises.match("No common columns to perform merge on")
        return

    pdf_result = pdf_left.merge(pdf_right, how=how)
    gdf_result = gdf_left.merge(gdf_right, how=how)
    assert np.array_equal(gdf_result.columns, pdf_result.columns)
    for column in gdf_result:
        # Rows are compared after sorting, so row order may differ between
        # the two results; nulls are replaced by -1 before comparing.
        assert np.array_equal(gdf_result[column].fillna(-1).sort_values(),
                              pdf_result[column].fillna(-1).sort_values())
示例#2
0
def test_merge_left_right_index_left_right_on_kwargs2(kwargs):
    """Merge two small frames with ``kwargs``; an empty pandas result must
    be matched by an empty cudf result.

    Nothing is asserted when the pandas merge produces rows.
    """
    pandas_left = pd.DataFrame({'x': [1, 2, 3]}, index=[10, 20, 30])
    pandas_right = pd.DataFrame({'y': [10, 20, 30]}, index=[1, 2, 30])
    cudf_left = DataFrame.from_pandas(pandas_left)
    cudf_right = DataFrame.from_pandas(pandas_right)

    cudf_merged = cudf_left.merge(cudf_right, **kwargs)
    pandas_merged = pandas_left.merge(pandas_right, **kwargs)

    # Equivalent to "pandas empty implies cudf empty".
    assert not pandas_merged.empty or cudf_merged.empty
示例#3
0
def test_merge_left_right_index_left_right_on_kwargs(kwargs):
    """Merge two six-row frames with ``kwargs`` and require the cudf result
    to equal the pandas result.
    """
    left_index = [1, 2, 3, 4, 5, 6]
    right_index = [1, 2, 3, 4, 5, 7]
    pandas_left = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]}, index=left_index)
    pandas_right = pd.DataFrame({'y': [10, 20, 30, 6, 5, 4]},
                                index=right_index)
    cudf_left = DataFrame.from_pandas(pandas_left)
    cudf_right = DataFrame.from_pandas(pandas_right)

    expected = pandas_left.merge(pandas_right, **kwargs)
    actual = cudf_left.merge(cudf_right, **kwargs)
    assert_eq(expected, actual)
示例#4
0
def test_merge_left_index_zero():
    """Merge on ``x``/``y`` value columns for frames whose index starts at
    zero and compare cudf against pandas.
    """
    pandas_left = pd.DataFrame(
        {'x': [1, 2, 3, 4, 5, 6]}, index=[0, 1, 2, 3, 4, 5]
    )
    pandas_right = pd.DataFrame(
        {'y': [10, 20, 30, 6, 5, 4]}, index=[0, 1, 2, 3, 4, 6]
    )
    cudf_left = DataFrame.from_pandas(pandas_left)
    cudf_right = DataFrame.from_pandas(pandas_right)

    # The same explicit key columns are used for both libraries.
    on_columns = {'left_on': "x", 'right_on': 'y'}
    expected = pandas_left.merge(pandas_right, **on_columns)
    actual = cudf_left.merge(cudf_right, **on_columns)

    assert_eq(expected, actual)
示例#5
0
def test_merge_sort(kwargs, hows):
    """Merge with ``sort=True``; an empty pandas result must be matched by
    an empty cudf result.

    Parameters
    ----------
    kwargs : dict
        Base keyword arguments for ``merge``.
    hows : dict
        Additional keyword arguments layered on top of ``kwargs``
        (presumably the ``how`` setting; confirm against the caller).
    """
    # Work on a copy: updating ``kwargs`` in place would leak the ``hows``
    # entries and the ``sort`` flag into the caller's dict, which may be
    # shared between test invocations.
    merge_kwargs = dict(kwargs)
    merge_kwargs.update(hows)
    merge_kwargs['sort'] = True

    d = range(3)
    # Same five columns on both sides, declared in a different order.
    left = pd.DataFrame({'k2': d, 'k1': d, 'k4': d, 'k3': d, 'k5': d})
    right = pd.DataFrame({'k1': d, 'k4': d, 'k2': d, 'k3': d, 'k5': d})
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    gd_merge = gleft.merge(gright, **merge_kwargs)
    pd_merge = left.merge(right, **merge_kwargs)
    # Only emptiness is compared; non-empty results are not checked here.
    if pd_merge.empty:
        assert gd_merge.empty
示例#6
0
def test_merge_sort(kwargs, hows):
    """Merge with ``sort=True``; an empty pandas result must be matched by
    an empty cudf result.

    Parameters
    ----------
    kwargs : dict
        Base keyword arguments for ``merge``.
    hows : dict
        Additional keyword arguments layered on top of ``kwargs``
        (presumably the ``how`` setting; confirm against the caller).
    """
    # Work on a copy: updating ``kwargs`` in place would leak the ``hows``
    # entries and the ``sort`` flag into the caller's dict, which may be
    # shared between test invocations.
    merge_kwargs = dict(kwargs)
    merge_kwargs.update(hows)
    merge_kwargs["sort"] = True

    d = range(3)
    # Same five columns on both sides, declared in a different order.
    left = pd.DataFrame({"k2": d, "k1": d, "k4": d, "k3": d, "k5": d})
    right = pd.DataFrame({"k1": d, "k4": d, "k2": d, "k3": d, "k5": d})
    gleft = DataFrame.from_pandas(left)
    gright = DataFrame.from_pandas(right)
    gd_merge = gleft.merge(gright, **merge_kwargs)
    pd_merge = left.merge(right, **merge_kwargs)
    # Only emptiness is compared; non-empty results are not checked here.
    if pd_merge.empty:
        assert gd_merge.empty
示例#7
0
def test_dataframe_join_mismatch_cats(how):
    """Join two frames indexed by categorical columns whose category sets
    differ, and compare the cudf result with pandas.

    Parameters
    ----------
    how : str
        Join type passed to both ``join`` calls.
    """
    pdf1 = pd.DataFrame({
        "join_col": ["a", "b", "c", "d", "e"],
        "data_col_left": [10, 20, 30, 40, 50],
    })
    pdf2 = pd.DataFrame({
        "join_col": ["c", "e", "f"],
        "data_col_right": [6, 7, 8]
    })

    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")

    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)

    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")

    pdf1 = pdf1.set_index("join_col")
    pdf2 = pdf2.set_index("join_col")
    join_gdf = gdf1.join(gdf2, how=how, sort=True, method="hash")
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    expect = join_pdf.fillna(-1)  # note: cudf join doesn't mask NA

    # cudf creates the columns in different order than pandas for right join
    if how == "right":
        got = got[["data_col_left", "data_col_right"]]

    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)
    # ``expect`` comes back with the wrong index type, so it is rebuilt as
    # a Categorical here. This probably means CategoricalIndex.to_pandas()
    # is not working correctly.
    # TODO: make CategoricalIndex.to_pandas() handle this case so the line
    # below can be removed. Without it the comparison fails with:
    #   AssertionError: Categorical Expected type <class
    #   'pandas.core.arrays.categorical.Categorical'>, found <class
    #   'numpy.ndarray'> instead
    expect.index = pd.Categorical(expect.index)
    # Use the public ``pd.testing`` namespace rather than the deprecated
    # ``pd.util.testing`` alias.
    pd.testing.assert_frame_equal(
        got,
        expect,
        check_names=False,
        check_index_type=False,
        # For inner joins, pandas returns
        # weird categories.
        check_categorical=how != "inner",
    )
    assert list(got.index) == list(expect.index)
示例#8
0
def test_merge_left_right_index_left_right_on_kwargs(kwargs):
    """Merge two six-row frames with ``kwargs``.

    When both ``left_on`` and ``right_on`` are given, cudf is expected to
    raise ``NotImplementedError``; otherwise its result must equal pandas'.
    """
    pandas_left = pd.DataFrame(
        {'x': [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 5, 6]
    )
    pandas_right = pd.DataFrame(
        {'y': [10, 20, 30, 6, 5, 4]}, index=[1, 2, 3, 4, 5, 7]
    )
    cudf_left = DataFrame.from_pandas(pandas_left)
    cudf_right = DataFrame.from_pandas(pandas_right)
    # pandas is merged up front in both cases, even though the result is
    # only compared on the supported path.
    expected = pandas_left.merge(pandas_right, **kwargs)

    on_both_sides = kwargs.get('left_on') and kwargs.get('right_on')
    if not on_both_sides:
        actual = cudf_left.merge(cudf_right, **kwargs)
        assert_eq(expected, actual)
        return

    with pytest.raises(NotImplementedError) as raises:
        cudf_left.merge(cudf_right, **kwargs)
    raises.match("left_on='x', right_on='y' not supported")
示例#9
0
def test_join_datetimes_index(dtype):
    """Join an index-only frame against a datetime column cast to ``dtype``
    and check both the resulting dtype and equality with pandas.
    """
    stamps = pd.Series(pd.date_range("20010101", "20010102", freq="12h"))
    left_pdf = pd.DataFrame(index=[1, 0, 1, 2, 0, 0, 1])
    right_pdf = pd.DataFrame({"d": stamps})
    left_gdf = DataFrame.from_pandas(left_pdf)
    right_gdf = DataFrame.from_pandas(right_pdf)

    # Only the cudf column is cast; the pandas side keeps its own dtype.
    right_gdf["d"] = right_gdf["d"].astype(dtype)

    expected = left_pdf.join(right_pdf, sort=True)
    actual = left_gdf.join(right_gdf, sort=True)

    # The cast dtype has to survive the join.
    assert actual["d"].dtype == np.dtype(dtype)

    assert_eq(expected, actual)
示例#10
0
def test_groupby_column_numeral():
    """Group by, and select, columns whose labels are numbers (first
    integers, then floats) and compare the summed result with pandas.
    """
    # (frame contents, label to group by, label of the column to sum)
    cases = (
        ({0: [1.0, 2.0, 3.0], 1: [1, 2, 3]}, 1, 0),
        ({0.5: [1.0, 2.0, 3.0], 1.5: [1, 2, 3]}, 1.5, 0.5),
    )
    for contents, by_label, value_label in cases:
        pdf = pd.DataFrame(contents)
        gdf = DataFrame.from_pandas(pdf)
        pandas_groups = pdf.groupby(by_label)
        cudf_groups = gdf.groupby(by_label)
        expected = pandas_groups[value_label].sum()
        actual = cudf_groups[value_label].sum()
        assert_eq(expected, actual)
示例#11
0
def test_dataframe_setitem_from_masked_object():
    """Check that missing values are detected as nulls when data reaches
    cudf from a host array, a pandas frame, a device array or a list.
    """
    values = np.random.randn(100)
    null_flags = np.zeros(100, dtype=bool)
    null_flags[:20] = True
    np.random.shuffle(null_flags)
    # Exactly 20 randomly placed entries become NaN.
    values[null_flags] = np.nan

    # Host array -> Series
    from_host = Series(values)
    assert from_host.has_null_mask
    assert from_host.null_count == 20

    # pandas DataFrame -> cudf DataFrame
    from_frame = DataFrame.from_pandas(pd.DataFrame({'a': values}))
    assert from_frame['a'].has_null_mask
    assert from_frame['a'].null_count == 20

    # Device array -> Series
    device_values = rmm.to_device(values)
    from_device = Series(device_values)
    assert from_device.has_null_mask
    assert from_device.null_count == 20

    # Python list containing None -> column assignment
    from_list = DataFrame()
    from_list['lst'] = [1, 2, None, 4, 5, 6, None, 8, 9]
    assert from_list['lst'].has_null_mask
    assert from_list['lst'].null_count == 2
示例#12
0
def test_dataframe_multi_column_nulls(
    num_cols, num_rows, dtype, nulls, ascending, na_position
):
    """Sort by the first ``num_cols`` columns of a five-column frame that
    may contain NaNs and compare cudf's ordering with pandas'.

    ``nulls`` selects how many NaNs are injected: ``"some"`` blanks a random
    quarter of each column, ``"all"`` blanks every value, and any other
    value leaves the data intact. Writing NaN presumably requires ``dtype``
    to be a floating type; confirm against the parametrization.
    """
    from string import ascii_lowercase

    np.random.seed(0)
    sort_keys = list(ascii_lowercase[:num_cols])
    pdf = pd.DataFrame()

    # Always build columns "a".."e", regardless of ``num_cols``.
    for colname in ascii_lowercase[:5]:
        column = np.random.randint(0, 26, num_rows).astype(dtype)
        if nulls == "some":
            if num_rows > 0:
                null_idx = np.random.choice(
                    num_rows, size=int(num_rows / 4), replace=False
                )
            else:
                # Nothing to sample from an empty column.
                null_idx = np.array([], dtype="int64")
            column[null_idx] = np.nan
        elif nulls == "all":
            column[:] = np.nan
        pdf[colname] = column

    gdf = DataFrame.from_pandas(pdf)

    got = gdf.sort_values(
        sort_keys, ascending=ascending, na_position=na_position
    )
    expect = pdf.sort_values(
        sort_keys, ascending=ascending, na_position=na_position
    )

    # Only the key columns are compared, with the index discarded.
    assert_eq(
        got[sort_keys].reset_index(drop=True),
        expect[sort_keys].reset_index(drop=True),
    )
示例#13
0
def test_fillna_dataframe(fill_type, inplace):
    """Fill nulls in a two-column frame with a scalar, a Series or a
    per-column dict, and compare cudf against pandas.

    ``fill_type`` of ``"scalar"`` or ``"series"`` selects those fill values;
    anything else selects the dict form. With ``inplace`` set, the mutated
    cudf frame itself is compared instead of the return value.
    """
    pdf = pd.DataFrame({"a": [1, 2, None], "b": [None, None, 5]})
    gdf = DataFrame.from_pandas(pdf)

    if fill_type == "scalar":
        pandas_fill = cudf_fill = 5
    elif fill_type == "series":
        pandas_fill = pd.Series([3, 4, 5])
        cudf_fill = Series.from_pandas(pandas_fill)
    else:
        fill_for_b = pd.Series([3, 4, 5])
        pandas_fill = {"a": 5, "b": fill_for_b}
        cudf_fill = {"a": 5, "b": Series.from_pandas(fill_for_b)}

    # pandas' DataFrame.fillna does not work with a Series fill value
    # (https://github.com/pandas-dev/pandas/issues/27197), so in that case
    # the expected frame is assembled one column at a time.
    if not isinstance(pandas_fill, pd.Series):
        expect = pdf.fillna(pandas_fill)
    else:
        expect = pd.DataFrame()
        for col in pdf.columns:
            expect[col] = pdf[col].fillna(pandas_fill)

    returned = gdf.fillna(cudf_fill, inplace=inplace)
    got = gdf if inplace else returned

    assert_eq(expect, got)
示例#14
0
def test_dataframe_replace_with_nulls():
    """Replace values with nulls in cudf, fill them back in, and check the
    outcome equals a direct value replacement done in pandas.
    """
    base_pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, 3]})
    base_gdf = DataFrame.from_pandas(base_pdf)

    # Scalar to-replace / scalar replacement.
    expected_scalar = base_pdf.replace(0, 4)
    got = base_gdf.replace(0, None).fillna(4)
    pd.testing.assert_frame_equal(got.to_pandas(), expected_scalar)

    # List to-replace / list replacement.
    expected_lists = base_pdf.replace([0, 1], [4, 5])
    got = base_gdf.replace([0, 1], [4, None]).fillna(5)
    pd.testing.assert_frame_equal(got.to_pandas(), expected_lists)

    # List to-replace / single replacement.
    expected_list_scalar = base_pdf.replace([0, 1], 4)
    got = base_gdf.replace([0, 1], None).fillna(4)
    pd.testing.assert_frame_equal(got.to_pandas(), expected_list_scalar)

    # Per-column dicts.
    expected_dicts = base_pdf.replace({"a": 0, "b": 0}, {"a": 4, "b": 5})
    got = base_gdf.replace(
        {"a": 0, "b": 0}, {"a": None, "b": 5}
    ).fillna(4)
    pd.testing.assert_frame_equal(got.to_pandas(), expected_dicts)

    # Frame that already holds a null before the replacement; the result
    # is expected to match the list/list case above.
    gdf_with_null = DataFrame({"a": [0, 1, 2, 3], "b": [0, 1, 2, None]})
    got = gdf_with_null.replace([0, 1], [4, 5]).fillna(3)
    pd.testing.assert_frame_equal(got.to_pandas(), expected_lists)
示例#15
0
def test_dataframe_merge_strings_not_supported():
    """Converting a pandas frame with a string column must raise
    ``NotImplementedError`` with the "not yet supported" message.
    """
    names = ['Alice', 'Bob', 'Charlie', 'Dan']
    pandas_frame = pd.DataFrame({'x': [0, 1, 2, 3], 'name': names})
    with pytest.raises(NotImplementedError) as raises:
        # The conversion itself is what should fail; its result is unused.
        DataFrame.from_pandas(pandas_frame)
    raises.match('Strings are not yet supported')
示例#16
0
def test_dataframe_join_mismatch_cats(how):
    """Join two frames indexed by categorical columns whose category sets
    differ, and compare the cudf result with pandas.

    Parameters
    ----------
    how : str
        Join type passed to both ``join`` calls.
    """
    pdf1 = pd.DataFrame({
        "join_col": ["a", "b", "c", "d", "e"],
        "data_col_left": [10, 20, 30, 40, 50]
    })
    pdf2 = pd.DataFrame({
        "join_col": ["c", "e", "f"],
        "data_col_right": [6, 7, 8]
    })

    pdf1["join_col"] = pdf1["join_col"].astype("category")
    pdf2["join_col"] = pdf2["join_col"].astype("category")

    gdf1 = DataFrame.from_pandas(pdf1)
    gdf2 = DataFrame.from_pandas(pdf2)

    gdf1 = gdf1.set_index("join_col")
    gdf2 = gdf2.set_index("join_col")

    pdf1 = pdf1.set_index('join_col')
    pdf2 = pdf2.set_index('join_col')
    join_gdf = gdf1.join(gdf2, how=how, sort=True, method='hash')
    join_pdf = pdf1.join(pdf2, how=how)

    got = join_gdf.to_pandas()
    expect = join_pdf.fillna(-1)  # note: cudf join doesn't mask NA

    # cudf creates the columns in different order than pandas for right join
    if how == 'right':
        got = got[['data_col_left', 'data_col_right']]

    expect.data_col_right = expect.data_col_right.astype(np.int64)
    expect.data_col_left = expect.data_col_left.astype(np.int64)

    # Pandas returns an `object` dtype index for some reason, so cast it
    # back to categorical before comparing.
    expect.index = expect.index.astype('category')

    # Use the public ``pd.testing`` namespace rather than the deprecated
    # ``pd.util.testing`` alias.
    pd.testing.assert_frame_equal(
        got,
        expect,
        check_names=False,
        check_index_type=False,
        # For inner joins, pandas returns
        # weird categories.
        check_categorical=how != 'inner')
    assert list(got.index) == list(expect.index)
示例#17
0
def test_groupby_column_name():
    """Group by one named column, sum another selected by name, and
    compare cudf with pandas.
    """
    pdf = pd.DataFrame({'xx': [1., 2., 3.], 'yy': [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    cudf_groups = gdf.groupby('yy')
    pandas_groups = pdf.groupby('yy')
    actual = cudf_groups['xx'].sum()
    expected = pandas_groups['xx'].sum()
    assert_eq(expected, actual)
示例#18
0
def test_groupby_level_zero(agg):
    """Group by index level 0 and apply the aggregation named ``agg``,
    comparing cudf with pandas.
    """
    pdf = pd.DataFrame({'x': [1, 2, 3]}, index=[0, 1, 1])
    gdf = DataFrame.from_pandas(pdf)
    pandas_grouped = pdf.groupby(level=0)
    cudf_grouped = gdf.groupby(level=0)
    # ``agg`` is looked up as a method name on each groupby object.
    expected = getattr(pandas_grouped, agg)()
    actual = getattr(cudf_grouped, agg)()
    assert_eq(expected, actual)
示例#19
0
def test_groupby_column_name():
    """Group by one named column, sum another selected by name, and
    compare cudf with pandas.
    """
    pdf = pd.DataFrame({"xx": [1.0, 2.0, 3.0], "yy": [1, 2, 3]})
    gdf = DataFrame.from_pandas(pdf)
    cudf_groups = gdf.groupby("yy")
    pandas_groups = pdf.groupby("yy")
    actual = cudf_groups["xx"].sum()
    expected = pandas_groups["xx"].sum()
    assert_eq(expected, actual)
示例#20
0
def test_dataframe_sort_values_sliced(nelem, sliceobj):
    """Sort column ``a`` of a sliced frame and check that cudf yields the
    same values as pandas.
    """
    np.random.seed(0)
    pdf = pd.DataFrame()
    pdf["a"] = np.random.random(nelem)

    # Slice first, then sort, on both sides.
    expect = pdf[sliceobj]["a"].sort_values()
    gdf = DataFrame.from_pandas(pdf)
    got = gdf[sliceobj]["a"].sort_values()

    matches = got.to_pandas() == expect
    assert matches.all()
示例#21
0
def test_groupby_level_zero(agg):
    """Group by index level 0 and apply the aggregation named ``agg``,
    comparing cudf with pandas (dtype is not checked for ``count``).
    """
    pdf = pd.DataFrame({'x': [1, 2, 3]}, index=[0, 1, 1])
    gdf = DataFrame.from_pandas(pdf)
    pandas_grouped = pdf.groupby(level=0)
    cudf_grouped = gdf.groupby(level=0)
    # ``agg`` is looked up as a method name on each groupby object.
    expected = getattr(pandas_grouped, agg)()
    actual = getattr(cudf_grouped, agg)()
    # The dtype comparison is skipped only for the 'count' aggregation.
    assert_eq(expected, actual, check_dtype=agg != 'count')
示例#22
0
def test_string_set_scalar(scalar):
    """Assign the string ``"a"`` as a whole new column in pandas and cudf
    and check that both the column and the frame match.

    NOTE(review): the ``scalar`` argument is never read; the literal ``"a"``
    is assigned regardless of its value. Confirm whether the test was meant
    to assign ``scalar`` instead.
    """
    pdf = pd.DataFrame()
    pdf["a"] = [1, 2, 3, 4, 5]
    gdf = DataFrame.from_pandas(pdf)

    # Broadcast the same string into a new column on both frames.
    for frame in (pdf, gdf):
        frame["b"] = "a"

    assert_eq(pdf["b"], gdf["b"])
    assert_eq(pdf, gdf)
示例#23
0
def test_query_empty_frames():
    """Run a query against an empty frame and check that cudf returns the
    same (empty) result as pandas.
    """
    empty_pdf = pd.DataFrame({'a': [], 'b': []})
    empty_gdf = DataFrame.from_pandas(empty_pdf)

    # Run the same expression through both libraries.
    expression = 'a > 2'
    got = empty_gdf.query(expression).to_pandas()
    expect = empty_pdf.query(expression)

    # Assert equal results.
    assert_frame_equal(got, expect)
示例#24
0
def test_dataframe_nsmallest_sliced(counts, sliceobj):
    """Take the ``n`` smallest rows by column ``a`` from a sliced frame and
    compare cudf with pandas.

    ``counts`` is a pair ``(nelem, n)``: the frame length and the number of
    rows requested.
    """
    nelem, n = counts
    np.random.seed(0)
    pdf = pd.DataFrame()
    for name in ("a", "b"):
        pdf[name] = np.random.random(nelem)

    expect = pdf[sliceobj].nsmallest(n, "a")
    gdf = DataFrame.from_pandas(pdf)
    got = gdf[sliceobj].nsmallest(n, "a")

    # Element-wise comparison, reduced over rows and then over columns.
    matches = got.to_pandas() == expect
    assert matches.all().all()
示例#25
0
def test_dataframe_nlargest_sliced(counts, sliceobj):
    """Take the ``n`` largest rows by column ``a`` from a sliced frame and
    compare cudf with pandas.

    ``counts`` is a pair ``(nelem, n)``: the frame length and the number of
    rows requested.
    """
    nelem, n = counts
    np.random.seed(0)
    pdf = pd.DataFrame()
    for name in ('a', 'b'):
        pdf[name] = np.random.random(nelem)

    expect = pdf[sliceobj].nlargest(n, 'a')
    gdf = DataFrame.from_pandas(pdf)
    got = gdf[sliceobj].nlargest(n, 'a')

    # Element-wise comparison, reduced over rows and then over columns.
    matches = got.to_pandas() == expect
    assert matches.all().all()
示例#26
0
def test_safe_merging_with_left_empty():
    """Merge a zero-row left frame (columns b, c, d) with a five-row right
    frame (column b) and check cudf returns as many rows as pandas.
    """
    import numpy as np
    from cudf import DataFrame
    import pandas as pd
    np.random.seed(0)

    left_names, right_names = 'bcd', 'b'
    pandas_left = pd.DataFrame()
    pandas_right = pd.DataFrame()
    for name in left_names:
        # Size 0: every left column is empty.
        pandas_left[name] = np.random.randint(0, 10, 0)
    for name in right_names:
        pandas_right[name] = np.random.randint(0, 10, 5)
    cudf_left = DataFrame.from_pandas(pandas_left)
    cudf_right = DataFrame.from_pandas(pandas_right)

    pandas_merged = pandas_left.merge(pandas_right)
    cudf_merged = cudf_left.merge(cudf_right)
    # Only the lengths are compared, because pandas does not treat an empty
    # Index and an empty RangeIndex as equivalent.
    # TODO: Allow empty Index objects to have equivalence.
    assert len(pandas_merged) == len(cudf_merged)
示例#27
0
def test_from_pandas_with_index():
    """Convert a pandas frame that has a custom index and a null value to
    cudf, and check columns, index and the round trip back to pandas.
    """
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    pdf = pdf.set_index(np.asarray([4, 3, 2, 1]))
    df = DataFrame.from_pandas(pdf)

    # Check columns
    np.testing.assert_array_equal(df.a.to_array(fillna="pandas"), pdf.a)
    np.testing.assert_array_equal(df.b.to_array(fillna="pandas"), pdf.b)
    # Check index
    np.testing.assert_array_equal(df.index.values, pdf.index.values)
    # Check again using pandas testing tool on frames. The public
    # ``pd.testing`` namespace is used rather than the deprecated
    # ``pd.util.testing`` alias.
    pd.testing.assert_frame_equal(df.to_pandas(), pdf)
示例#28
0
def test_categorical_index():
    """Build a categorical index in two ways (converted from pandas, and
    set directly on a cudf frame) and check both against pandas.
    """
    pdf = pd.DataFrame()
    pdf["a"] = [1, 2, 3]
    pdf["index"] = pd.Categorical(["a", "b", "c"])
    unindexed_gdf = DataFrame.from_pandas(pdf)
    pdf = pdf.set_index("index")
    converted_gdf = DataFrame.from_pandas(pdf)

    built_gdf = DataFrame()
    built_gdf["a"] = [1, 2, 3]
    built_gdf["index"] = pd.Categorical(["a", "b", "c"])
    # Before set_index, both cudf frames carry the same default index.
    assert_eq(unindexed_gdf.index, built_gdf.index)
    built_gdf = built_gdf.set_index("index")

    # The same four checks apply to each construction route.
    for gdf in (converted_gdf, built_gdf):
        assert isinstance(gdf.index, CategoricalIndex)
        assert_eq(pdf, gdf)
        assert_eq(pdf.index, gdf.index)
        assert_eq(pdf.index.codes, gdf.index.codes.to_array())
示例#29
0
def test_dataframe_append_empty():
    """Add a scalar-valued column to a cudf frame and a pandas frame and
    check that it is broadcast to full length in both.
    """
    keys = [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
    values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    pdf = pd.DataFrame({"key": keys, "value": values})
    gdf = DataFrame.from_pandas(pdf)

    # Scalar assignment should fill every row of the new column.
    gdf['newcol'] = 100
    pdf['newcol'] = 100

    expected_length = len(pdf)
    assert len(gdf['newcol']) == expected_length
    assert len(pdf['newcol']) == expected_length
    pd.testing.assert_frame_equal(gdf.to_pandas(), pdf)
示例#30
0
def test_from_pandas_ex1():
    """Convert a small pandas frame with one missing float to cudf and
    check column names, values and the NaN placement.
    """
    pdf = pd.DataFrame({"a": [0, 1, 2, 3], "b": [0.1, 0.2, None, 0.3]})
    print(pdf)
    df = DataFrame.from_pandas(pdf)
    print(df)

    assert tuple(df.columns) == tuple(pdf.columns)
    assert np.all(df["a"].to_array() == pdf["a"])

    b_values = df["b"].to_array(fillna="pandas")
    matches = b_values == pdf["b"]
    # The third element is False because (nan == nan) is False.
    expected_matches = [True, True, False, True]
    assert np.all(matches == expected_matches)
    # Both sides must hold NaN at the missing position.
    assert np.isnan(b_values[2])
    assert np.isnan(pdf["b"][2])