예제 #1
0
def test_factorize_series_index():
    """Compare ``Series.factorize`` with pandas before and after ``set_index``.

    The labels and the uniques returned by ``col1.factorize()`` are checked
    against the pandas result, first with the default index and then with
    ``col2`` used as the index.
    """
    labels = ["C", "H", "C", "W", "W", "W", "W", "W", "C", "W"]
    keys = [
        2992443.0,
        2992447.0,
        2992466.0,
        2992440.0,
        2992441.0,
        2992442.0,
        2992444.0,
        2992445.0,
        2992446.0,
        2992448.0,
    ]

    df = DataFrame()
    df["col1"] = labels
    df["col2"] = keys

    def check_against_pandas(frame):
        # Element 0 of factorize() holds the labels, element 1 the uniques.
        assert_eq(
            frame.col1.factorize()[0].get(),
            frame.to_pandas().col1.factorize()[0],
        )
        assert_eq(
            frame.col1.factorize()[1].to_pandas().values,
            frame.to_pandas().col1.factorize()[1].values,
        )

    check_against_pandas(df)
    check_against_pandas(df.set_index("col2"))
예제 #2
0
def test_dataframe_join_cats():
    """Join two frames on a categorical index and compare with pandas.

    Both sides are indexed by a categorical column ``a`` with the same
    categories. The joined result is compared against the pandas join,
    followed by a few coarse sanity checks on columns, index and values.
    """
    lhs = DataFrame()
    lhs["a"] = pd.Categorical(list("aababcabbc"), categories=list("abc"))
    lhs["b"] = bb = np.arange(len(lhs))
    lhs = lhs.set_index("a")

    rhs = DataFrame()
    rhs["a"] = pd.Categorical(list("abcac"), categories=list("abc"))
    rhs["c"] = cc = np.arange(len(rhs))
    rhs = rhs.set_index("a")

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining; both sides are
    # compared with the index dropped.
    # ``pd.testing`` is the public home of assert_frame_equal; the old
    # ``pd.util.testing`` alias is deprecated.
    pd.testing.assert_frame_equal(
        got.sort_values(by="b")
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True),
    )

    # Just do some rough checking here.
    assert list(got.columns) == ["b", "c"]
    assert len(got) > 0
    assert set(got.index.to_pandas()) & set("abc")
    assert set(got["b"]) & set(bb)
    assert set(got["c"]) & set(cc)
예제 #3
0
def test_merge_multi(kwargs):
    """Compare a multi-key ``merge`` with pandas for one keyword combination.

    Parameters
    ----------
    kwargs : dict
        Keyword arguments forwarded to ``merge``. The keys ``left_on``,
        ``right_on``, ``left_index`` and ``right_index`` are read to decide
        which columns are promoted to the index beforehand. The dict itself
        is not modified.
    """
    left = DataFrame(
        {
            "a": [1, 2, 3, 4, 3, 5, 6],
            "b": [1, 3, 5, 7, 5, 9, 0],
            "c": ["o", "p", "q", "r", "s", "t", "u"],
            "d": ["v", "w", "x", "y", "z", "1", "2"],
        }
    )
    right = DataFrame(
        {
            "a": [0, 9, 3, 4, 3, 7, 8],
            "b": [2, 4, 5, 7, 5, 6, 8],
            "c": ["a", "b", "c", "d", "e", "f", "g"],
            "d": ["j", "i", "j", "k", "l", "m", "n"],
        }
    )

    # Choose the index of each side depending on how the merge keys are given.
    if (
        kwargs["left_on"] is not None
        and kwargs["right_on"] is not None
        and kwargs["left_index"] is False
        and kwargs["right_index"] is False
    ):
        left = left.set_index(["c", "d"])
        right = right.set_index(["c", "d"])
    elif (
        kwargs["left_on"] is None
        and kwargs["right_on"] is None
        and kwargs["left_index"] is True
        and kwargs["right_index"] is True
    ):
        left = left.set_index(["a", "b"])
        right = right.set_index(["a", "b"])
    elif kwargs["left_on"] is not None and kwargs["right_index"] is True:
        left = left.set_index(["c", "d"])
        right = right.set_index(["a", "b"])
    elif kwargs["right_on"] is not None and kwargs["left_index"] is True:
        left = left.set_index(["a", "b"])
        right = right.set_index(["c", "d"])

    pleft = left.to_pandas()
    pright = right.to_pandas()

    # Work on a copy so the caller's (parametrized) dict is left untouched.
    merge_kwargs = {**kwargs, "sort": True}
    expect = pleft.merge(pright, **merge_kwargs)
    got = left.merge(right, **merge_kwargs)

    assert_eq(expect.sort_index().index, got.sort_index().index)

    # Compare the values independently of row order: replace the index,
    # sort by every column, then renumber the rows again.
    expect.index = range(len(expect))
    got.index = range(len(got))
    expect = expect.sort_values(list(expect.columns))
    got = got.sort_values(list(got.columns))
    expect.index = range(len(expect))
    got.index = range(len(got))

    assert_eq(expect, got)
예제 #4
0
def test_dataframe_merge_on(on):
    """Left-merge two random frames on ``on`` and compare with pandas.

    Both the ``DataFrame.merge`` method and the ``cudf.merge`` function are
    exercised; row and column order are ignored in the comparison.

    Parameters
    ----------
    on
        Value forwarded as the ``on`` argument of every merge call.
    """
    np.random.seed(0)
    nelem = 500

    # Make cuDF
    df_left = DataFrame()
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    df_right = DataFrame()
    df_right["key1"] = np.random.randint(0, 30, nelem)
    df_right["key2"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how="left")

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how="left")
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how="left")

    def _as_float_with_nan(series):
        # Cast to float64 and fill nulls with NaN.
        return series.astype(np.float64).fillna(np.nan)

    join_result["right_val"] = _as_float_with_nan(join_result["right_val"])
    join_result_cudf["right_val"] = _as_float_with_nan(
        join_result_cudf["right_val"]
    )

    # Columns whose name contains "_y" get the same conversion.
    for col in pddf_joined.columns:
        if "_y" in col:
            join_result[col] = _as_float_with_nan(join_result[col])
            join_result_cudf[col] = _as_float_with_nan(join_result_cudf[col])

    # Test dataframe equality (ignore order of rows and columns)
    sort_cols = list(pddf_joined.columns)
    cdf_result = (
        join_result.to_pandas().sort_values(sort_cols).reset_index(drop=True)
    )
    pdf_result = pddf_joined.sort_values(sort_cols).reset_index(drop=True)

    # ``pd.testing`` replaces the deprecated ``pd.util.testing`` alias.
    pd.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = (
        join_result_cudf.to_pandas()
        .sort_values(sort_cols)
        .reset_index(drop=True)
    )

    pd.testing.assert_frame_equal(
        merge_func_result_cdf, cdf_result, check_like=True
    )
예제 #5
0
def test_dataframe_join_suffix():
    """Check ``join`` with overlapping column names.

    Without suffixes the join must raise ``ValueError``; with ``lsuffix`` and
    ``rsuffix`` the result is compared column by column with pandas.
    """
    np.random.seed(0)

    df = DataFrame()
    for name in "abc":
        df[name] = np.random.randint(0, 5, 5)

    left = df.set_index("a")
    right = df.set_index("c")

    expected_message = (
        "there are overlapping columns but lsuffix"
        " and rsuffix are not defined"
    )
    with pytest.raises(ValueError, match=expected_message):
        left.join(right)

    got = left.join(right, lsuffix="_left", rsuffix="_right", sort=True)

    # Build the reference result with pandas.
    pddf = df.to_pandas()
    pd_left = pddf.set_index("a")
    pd_right = pddf.set_index("c")
    expect = pd_left.join(pd_right, lsuffix="_left", rsuffix="_right")

    # Same columns, same index values, same column contents
    # (-1 is substituted for missing values on both sides).
    assert list(expect.columns) == list(got.columns)
    assert_eq(expect.index.values, got.index.values)
    for name in expect.columns:
        _check_series(expect[name].fillna(-1), got[name].fillna(-1))
예제 #6
0
def test_dataframe_join_how(aa, bb, how, method):
    """Join a frame with itself on different index columns and compare.

    Parameters
    ----------
    aa, bb
        Values stored in columns ``a`` and ``b`` of the input frame.
    how
        Join type. ``"leftanti"`` and ``"leftsemi"`` are emulated on the
        pandas side through ``pd_odd_joins``.
    method
        Forwarded as ``method`` to the join of the frame under test.
    """
    df = DataFrame()
    df["a"] = aa
    df["b"] = bb

    def work_pandas(df, how):
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        if how in ("leftanti", "leftsemi"):
            return pd_odd_joins(df1, df2, how)
        return df1.join(df2, how=how, sort=True)

    def work_gdf(df):
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        return df1.join(df2, how=how, sort=True, method=method)

    expect = work_pandas(df.to_pandas(), how)
    got = work_gdf(df)
    # Untouched copies, used by the column-wise comparison for outer joins.
    expecto = expect.copy()
    goto = got.copy()

    expect = expect.astype(np.float64).fillna(np.nan)[expect.columns]
    got = got.astype(np.float64).fillna(np.nan)[expect.columns]

    assert got.index.name is None

    assert list(expect.columns) == list(got.columns)
    # Values are only compared for the hash method; the comparison for the
    # other method is disabled until libgdf sort join gets updated with the
    # new API.
    if method == "hash":
        assert_eq(sorted(expect.index.values), sorted(got.index.values))
        if how != "outer":
            # Newly introduced ambiguous ValueError thrown when
            # an index and column have the same name. Rename the
            # index so sorts work.
            # TODO: What is the less hacky way?
            expect.index.name = "bob"
            got.index.name = "mary"
            # ``pd.testing`` replaces the deprecated ``pd.util.testing``.
            pd.testing.assert_frame_equal(
                got.to_pandas()
                .sort_values(got.columns.to_list())
                .reset_index(drop=True),
                expect.sort_values(expect.columns.to_list()).reset_index(
                    drop=True
                ),
            )
        else:
            # Outer join: compare each column of the unconverted copies.
            for c in expecto.columns:
                _check_series(expecto[c].fillna(-1), goto[c].fillna(-1))
예제 #7
0
def test_df_cat_sort_index():
    """Compare ``sort_index`` on a categorical index with the pandas result."""
    categorical = pd.Categorical(list("aababcabbc"), categories=list("abc"))

    gdf = DataFrame()
    gdf["a"] = categorical
    gdf["b"] = np.arange(len(gdf))

    indexed = gdf.set_index("a")
    got = indexed.sort_index()

    pdf = gdf.to_pandas()
    expect = pdf.set_index("a").sort_index()

    assert_eq(got, expect)
예제 #8
0
def test_df_cat_set_index():
    """Compare ``set_index`` on a categorical column with the pandas result."""
    categorical = pd.Categorical(list("aababcabbc"), categories=list("abc"))

    gdf = DataFrame()
    gdf["a"] = categorical
    gdf["b"] = np.arange(len(gdf))

    got = gdf.set_index("a")
    # The pandas reference is built with nullable_pd_dtype=False.
    expect = gdf.to_pandas(nullable_pd_dtype=False).set_index("a")

    assert_eq(got, expect)
예제 #9
0
def test_groupby_apply_basic_agg_single_column():
    """Sum a single column after a two-key groupby and compare with pandas."""
    group_keys = ["key", "val"]

    gdf = DataFrame()
    gdf["key"] = [0, 0, 1, 1, 2, 2, 0]
    gdf["val"] = [0, 1, 2, 3, 4, 5, 6]
    gdf["mult"] = gdf["key"] * gdf["val"]
    pdf = gdf.to_pandas()

    got = gdf.groupby(group_keys).mult.sum()
    expect = pdf.groupby(group_keys).mult.sum()
    assert_eq(expect, got)
예제 #10
0
def test_df_cat_sort_index():
    """Compare ``sort_index`` on a categorical index with pandas piecewise.

    Column names, index values and the values of column ``b`` are each
    checked separately.
    """
    categorical = pd.Categorical(list("aababcabbc"), categories=list("abc"))

    gdf = DataFrame()
    gdf["a"] = categorical
    gdf["b"] = np.arange(len(gdf))

    got = gdf.set_index("a").sort_index()
    expect = gdf.to_pandas().set_index("a").sort_index()

    expected_columns = list(expect.columns)
    actual_columns = list(got.columns)
    assert expected_columns == actual_columns

    # Index values are checked both as plain lists and as arrays.
    assert list(expect.index.values) == list(got.index.values)
    np.testing.assert_array_equal(expect.index.values, got.index.values)

    np.testing.assert_array_equal(expect["b"].values, got["b"].to_array())
예제 #11
0
def test_dataframe_sort_values_ignore_index(index, ignore_index):
    """Compare ``sort_values(..., ignore_index=...)`` with pandas.

    Parameters
    ----------
    index
        Passed to ``set_index`` before sorting.
    ignore_index
        Forwarded as ``ignore_index`` to both ``sort_values`` calls.
    """
    data = {
        "a": [1, 3, 5, 2, 4],
        "b": [1, 1, 2, 2, 3],
        "c": [9, 7, 7, 7, 1],
    }
    gdf = DataFrame(data).set_index(index)
    pdf = gdf.to_pandas()

    # Sort by every remaining column on both sides.
    expect = pdf.sort_values(list(pdf.columns), ignore_index=ignore_index)
    got = gdf.sort_values(gdf.columns, ignore_index=ignore_index)

    assert_eq(expect, got)
예제 #12
0
def test_groupby_apply():
    """Apply a column-adding function per group and compare with pandas."""
    np.random.seed(0)
    nelem = 20
    group_keys = ["key1", "key2"]

    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    def foo(df):
        # Adds an "out" column holding the sum of the two value columns.
        df["out"] = df["val1"] + df["val2"]
        return df

    expect = df.to_pandas().groupby(group_keys, as_index=False).apply(foo)
    got = df.groupby(group_keys).apply(foo)

    assert_eq(expect, got)
예제 #13
0
def test_groupby_apply_grouped():
    """Run a kernel per group with ``apply_grouped`` and compare with pandas.

    The kernel fills two output columns; the same computation is emulated
    group by group in pandas to build the expected frame.
    """
    from numba import cuda

    np.random.seed(0)
    nelem = 20
    group_keys = ["key1", "key2"]

    df = DataFrame()
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(
        group_keys, as_index=False, sort=True
    )
    got_grpby = df.groupby(group_keys, sort=True)

    # NOTE(review): the parameter names presumably have to match the
    # incols/outcols names given to apply_grouped below - confirm before
    # renaming any of them.
    def foo(key1, val1, com1, com2):
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    output_columns = {"com1": np.float64, "com2": np.int32}
    got = got_grpby.apply_grouped(
        foo,
        incols=["key1", "val1"],
        outcols=output_columns,
        tpb=8,
    ).to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(df):
        df["com1"] = df.key1 * 10000 + df.val1
        df["com2"] = np.arange(len(df), dtype=np.int32)
        return df

    expect = expect_grpby.apply(emulate).sort_values(group_keys)

    assert_eq(expect, got)
예제 #14
0
def test_to_pandas():
    """Check column names, dtypes and lengths after ``to_pandas``.

    Integer and float columns must keep their dtype. The boolean column
    contains a ``None`` entry, for which the two sides are expected to
    report different dtypes.
    """
    df = DataFrame()
    df["a"] = np.arange(5, dtype=np.int32)
    df["b"] = np.arange(10, 15, dtype=np.float64)
    df["c"] = np.array([True, False, None, True, True])

    pdf = df.to_pandas()

    assert tuple(df.columns) == tuple(pdf.columns)

    assert df["a"].dtype == pdf["a"].dtype
    assert df["b"].dtype == pdf["b"].dtype

    # Notice, the dtypes differ when pandas and cudf boolean series
    # contain None/NaN.
    # ``np.bool_`` / ``np.object_`` are used because the ``np.bool`` and
    # ``np.object`` aliases were deprecated in NumPy 1.20 and removed in 1.24.
    assert df["c"].dtype == np.bool_
    assert pdf["c"].dtype == np.object_

    assert len(df["a"]) == len(pdf["a"])
    assert len(df["b"]) == len(pdf["b"])
    assert len(df["c"]) == len(pdf["c"])
예제 #15
0
def test_groupby_apply():
    """Apply a column-adding function per group and compare with pandas.

    The frame under test is grouped with ``method="cudf"``; the pandas
    result is sorted by the group keys and re-indexed before the comparison.
    """
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df["key1"] = np.random.randint(0, 3, nelem)
    df["key2"] = np.random.randint(0, 2, nelem)
    df["val1"] = np.random.random(nelem)
    df["val2"] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(["key1", "key2"], as_index=False)
    got_grpby = df.groupby(["key1", "key2"], method="cudf")

    def foo(df):
        # Adds an "out" column holding the sum of the two value columns.
        df["out"] = df["val1"] + df["val2"]
        return df

    expect = expect_grpby.apply(foo)
    expect = expect.sort_values(["key1", "key2"]).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()
    # ``pd.testing`` replaces the deprecated ``pd.util.testing`` alias.
    pd.testing.assert_frame_equal(expect, got)
예제 #16
0
def test_set_index_as_property():
    """Assign to ``DataFrame.index`` and check the index that results.

    Covers assignment from a column of the frame, rejection of a nested
    list with ``ValueError``, assignment from a NumPy array, and the index
    seen after ``to_pandas`` and ``head``.
    """
    values_a = np.arange(10)
    values_b = np.arange(0, 20, 2)

    cdf = DataFrame()
    cdf["a"] = values_a
    cdf["b"] = values_b

    # Check set_index(Series)
    cdf.index = cdf["b"]
    np.testing.assert_array_equal(cdf.index.values, values_b)

    # A list wrapping another list is expected to be rejected.
    nested = [list(range(10))]
    with pytest.raises(ValueError):
        cdf.index = nested

    idx = np.arange(0, 1000, 100)
    cdf.index = idx
    np.testing.assert_array_equal(cdf.index.values, idx)

    pdf = cdf.to_pandas()
    np.testing.assert_array_equal(pdf.index.values, idx)

    first_rows = cdf.head().to_pandas()
    np.testing.assert_array_equal(first_rows.index.values, idx[:5])
예제 #17
0
파일: test_index.py 프로젝트: vuule/cudf
def test_set_index_as_property():
    """Assign to ``DataFrame.index`` and check the index that results.

    Covers assignment from a column of the frame, rejection of a nested
    list with ``ValueError``, assignment from a pandas ``Index``, and the
    index seen after ``to_pandas`` and ``head``.
    """
    values_a = np.arange(10)
    values_b = np.arange(0, 20, 2)

    cdf = DataFrame()
    cdf["a"] = values_a
    cdf["b"] = values_b

    # Check set_index(Series)
    cdf.index = cdf["b"]
    assert_eq(cdf.index._values.to_array(), values_b)

    # A list wrapping another list is expected to be rejected.
    nested = [list(range(10))]
    with pytest.raises(ValueError):
        cdf.index = nested

    idx = pd.Index(np.arange(0, 1000, 100))
    cdf.index = idx
    assert_eq(cdf.index.to_pandas(), idx)

    pdf = cdf.to_pandas()
    assert_eq(pdf.index, idx)

    first_rows = cdf.head().to_pandas()
    assert_eq(first_rows.index, idx[:5])
예제 #18
0
def test_dataframe_join_how(aa, bb, how, method):
    """Join a frame with itself on different index columns and compare.

    The elapsed time of the pandas join and of the join under test is
    printed.

    Parameters
    ----------
    aa, bb
        Values stored in columns ``a`` and ``b`` of the input frame.
    how
        Join type forwarded to both joins.
    method
        Forwarded as ``method`` to the join of the frame under test.
    """
    df = DataFrame()
    df["a"] = aa
    df["b"] = bb

    def work_pandas(df):
        ts = timer()
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print("timing", type(df), te - ts)
        return joined

    def work_gdf(df):
        ts = timer()
        df1 = df.set_index("a")
        df2 = df.set_index("b")
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print("timing", type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    # Untouched copies, used by the column-wise comparison for outer joins.
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType
    expectb = expect.b
    expecta = expect.a
    gotb = got.b
    gota = got.a
    del got["b"]
    got.insert(len(got._data), "b", gotb.astype(np.float64).fillna(np.nan))
    del got["a"]
    got.insert(len(got._data), "a", gota.astype(np.float64).fillna(np.nan))
    # The pandas columns are overwritten in place. The former
    # ``expect.drop(...)`` calls discarded their result and had no effect,
    # so they were removed.
    expect["b"] = expectb.astype(np.float64).fillna(np.nan)
    expect["a"] = expecta.astype(np.float64).fillna(np.nan)

    assert got.index.name is None

    assert list(expect.columns) == list(got.columns)
    # Values are only compared for the hash method; the comparison for the
    # other method is disabled until libgdf sort join gets updated with the
    # new API.
    if method == "hash":
        assert np.all(expect.index.values == got.index.values)
        if how != "outer":
            # Newly introduced ambiguous ValueError thrown when
            # an index and column have the same name. Rename the
            # index so sorts work.
            # TODO: What is the less hacky way?
            expect.index.name = "bob"
            got.index.name = "mary"
            # ``pd.testing`` replaces the deprecated ``pd.util.testing``.
            pd.testing.assert_frame_equal(
                got.to_pandas().sort_values(["b", "a"]).reset_index(drop=True),
                expect.sort_values(["b", "a"]).reset_index(drop=True),
            )
        else:
            # Outer join: compare each column of the unconverted copies.
            _check_series(expecto["b"].fillna(-1), goto["b"].fillna(-1))
            _check_series(expecto["a"].fillna(-1), goto["a"].fillna(-1))