Exemplo n.º 1
0
def test_typecast_on_join_dt_to_dt(dtype_l, dtype_r):
    """Inner-merge on a date key cast to ``dtype_l`` / ``dtype_r``.

    Both key columns start with the same four dates and differ only in
    the fifth, so the expected result holds those four rows.  The expected
    key dtype is the ``max`` of the two numpy dtypes.
    """
    shared_dates = ["1991-11-20", "1999-12-31", "2004-12-04", "2015-01-01"]
    labels = ["a", "b", "c", "d", "e"]

    left_keys = Series(shared_dates + ["2019-08-15"]).astype(dtype_l)
    right_keys = Series(shared_dates + ["2019-08-16"]).astype(dtype_r)

    gdf_l = DataFrame({"join_col": left_keys, "B": labels})
    gdf_r = DataFrame({"join_col": right_keys, "B": labels})

    # Expected frame: only the rows whose dates appear on both sides.
    wider_dtype = max(np.dtype(dtype_l), np.dtype(dtype_r))
    matched_labels = ["a", "b", "c", "d"]
    expect = DataFrame(
        {
            "join_col": Series(shared_dates, dtype=wider_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
Exemplo n.º 2
0
def test_typecast_on_join_categorical(dtype_l, dtype_r):
    """Inner-merge where exactly one key column is categorical.

    Combinations with no categorical side, or with both sides categorical,
    are skipped.  The expected key column takes the dtype of the
    categorical side and that side's values (cast to int) as categories.
    The final comparison is made with ``check_dtype=False``.
    """
    left_is_cat = dtype_l == "category"
    right_is_cat = dtype_r == "category"
    if not (left_is_cat or right_is_cat):
        pytest.skip("at least one side must be category for this set of tests")
    if left_is_cat and right_is_cat:
        pytest.skip("Can't determine which categorical to use")

    labels = ["a", "b", "c", "d", "e"]
    left_keys = Series([1, 2, 3, 4, 5], dtype=dtype_l)
    right_keys = Series([1, 2, 3, 4, 6], dtype=dtype_r)

    # After the guards above exactly one side is categorical.
    cat_keys = left_keys if left_is_cat else right_keys
    exp_dtype = cat_keys.dtype
    exp_categories = cat_keys.astype(int)._column

    gdf_l = DataFrame({"join_col": left_keys, "B": labels})
    gdf_r = DataFrame({"join_col": right_keys, "B": labels})

    matched_labels = ["a", "b", "c", "d"]
    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4], dtype=exp_dtype),
            "B_x": matched_labels,
            "B_y": matched_labels,
        }
    )
    recategorized = expect["join_col"].cat.set_categories(exp_categories)
    expect["join_col"] = recategorized

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")
    assert_eq(expect, got, check_dtype=False)
Exemplo n.º 3
0
def test_typecast_on_join_mixed_int_float(dtype_l, dtype_r):
    """Inner-merge between an integer key column and a float key column.

    Parameter combinations where both dtypes are integer, or both are
    float, are skipped.  The expected result keeps the rows for keys
    1, 2 and 3, and the expected key dtype is the numpy promotion of the
    two input dtypes.

    Parameters
    ----------
    dtype_l, dtype_r : str
        Numpy dtype names for the left / right key columns (tested with
        ``"int" in ...`` / ``"float" in ...``, so they must be strings).
    """
    if ("int" in dtype_l and "int" in dtype_r) or (
        "float" in dtype_l and "float" in dtype_r
    ):
        pytest.skip("like types not tested in this function")

    other_data = ["a", "b", "c", "d", "e", "f"]

    join_data_l = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l)
    join_data_r = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # np.find_common_type is deprecated since NumPy 1.25 and removed in
    # NumPy 2.0; np.result_type performs the dtype promotion instead.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    exp_join_data = [1, 2, 3]
    exp_other_data = ["a", "b", "c"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame(
        {
            "join_col": exp_join_col,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
Exemplo n.º 4
0
def test_typecast_on_join_int_to_int(dtype_l, dtype_r):
    """Inner-merge between two integer key columns of given dtypes.

    The key columns share the values 1 and 2, so two rows are expected.
    The expected key dtype is the numpy promotion of the two input dtypes.

    Parameters
    ----------
    dtype_l, dtype_r
        Dtypes for the left / right key columns; each is passed to
        ``Series(..., dtype=...)`` and ``np.dtype``.
    """
    other_data = ["a", "b", "c"]

    join_data_l = Series([1, 2, 3], dtype=dtype_l)
    join_data_r = Series([1, 2, 4], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # np.find_common_type is deprecated since NumPy 1.25 and removed in
    # NumPy 2.0; np.result_type performs the dtype promotion instead.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    exp_join_data = [1, 2]
    exp_other_data = ["a", "b"]
    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame(
        {
            "join_col": exp_join_col,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
Exemplo n.º 5
0
def test_typecast_on_join_float_to_float(dtype_l, dtype_r):
    """Inner-merge between two float key columns of given dtypes.

    When the two dtypes are equal, keys 1, 2, 3, 0.9 and 4.5 are expected
    to match.  When they differ, 0.9 is expected to drop out of the result
    (presumably because its stored value differs between precisions --
    TODO confirm).  The expected key dtype is the numpy promotion of the
    two input dtypes.

    Parameters
    ----------
    dtype_l, dtype_r
        Dtypes for the left / right key columns; each is passed to
        ``Series(..., dtype=...)`` and ``np.dtype``.
    """
    other_data = ["a", "b", "c", "d", "e", "f"]

    join_data_l = Series([1, 2, 3, 0.9, 4.5, 6], dtype=dtype_l)
    join_data_r = Series([1, 2, 3, 0.9, 4.5, 7], dtype=dtype_r)

    gdf_l = DataFrame({"join_col": join_data_l, "B": other_data})
    gdf_r = DataFrame({"join_col": join_data_r, "B": other_data})

    # np.find_common_type is deprecated since NumPy 1.25 and removed in
    # NumPy 2.0; np.result_type performs the dtype promotion instead.
    exp_dtype = np.result_type(np.dtype(dtype_l), np.dtype(dtype_r))

    if dtype_l != dtype_r:
        # Mixed precision: the 0.9 row is not expected in the output.
        exp_join_data = [1, 2, 3, 4.5]
        exp_other_data = ["a", "b", "c", "e"]
    else:
        exp_join_data = [1, 2, 3, 0.9, 4.5]
        exp_other_data = ["a", "b", "c", "d", "e"]

    exp_join_col = Series(exp_join_data, dtype=exp_dtype)

    expect = DataFrame(
        {
            "join_col": exp_join_col,
            "B_x": exp_other_data,
            "B_y": exp_other_data,
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="inner")

    assert_eq(expect, got)
Exemplo n.º 6
0
def test_string_join_non_key_nulls(str_data_nulls):
    """Left-merge on an integer key with string payload columns.

    The left frame has five rows (``vals`` = "a".."e", ``key`` = 1..5).
    The right frame takes ``str_data_nulls`` as its ``vals`` column and as
    many keys from 6..10 as there are entries in ``str_data_nulls``.  The
    cuDF merge result is compared against the pandas one.
    """
    left_vals = ["a", "b", "c", "d", "e"]
    left_keys = [1, 2, 3, 4, 5]
    right_keys = [6, 7, 8, 9, 10][: len(str_data_nulls)]

    # Left-hand frames (pandas reference first, then cuDF).
    pdf = pd.DataFrame()
    pdf["vals"] = pd.Series(left_vals, dtype="str")
    pdf["key"] = left_keys

    gdf = DataFrame()
    gdf["vals"] = Series(left_vals, dtype="str")
    gdf["key"] = left_keys

    # Right-hand frames built from the parametrized string data.
    pdf2 = pd.DataFrame()
    pdf2["vals"] = pd.Series(str_data_nulls, dtype="str")
    pdf2["key"] = pd.Series(right_keys, dtype="int64")

    gdf2 = DataFrame()
    gdf2["vals"] = Series(str_data_nulls, dtype="str")
    gdf2["key"] = Series(right_keys, dtype="int64")

    expect = pdf.merge(pdf2, on="key", how="left")
    got = gdf.merge(gdf2, on="key", how="left")

    # For empty results, align index and column order before comparing.
    if len(expect) == len(got) == 0:
        expect = expect.reset_index(drop=True)
        got = got[expect.columns]

    assert_eq(expect, got)
Exemplo n.º 7
0
def test_string_join_non_key(str_data, num_cols, how, how_raise):
    """Self-merge on integer column ``a`` with string payload columns.

    ``num_cols`` string columns (named ``0 .. num_cols - 1``) are filled
    with ``str_data``; the key column ``a`` holds as many values from 1..5
    as ``str_data`` has entries.  The merge and comparison run inside the
    context built by ``raise_builder([how_raise], NotImplementedError)``.
    """
    key_values = [1, 2, 3, 4, 5][: len(str_data)]

    pdf = pd.DataFrame()
    gdf = DataFrame()
    for col_name in range(num_cols):
        pdf[col_name] = pd.Series(str_data, dtype="str")
        gdf[col_name] = Series(str_data, dtype="str")
    pdf["a"] = key_values
    gdf["a"] = key_values

    # Each frame is merged against a copy of itself.
    pdf2, gdf2 = pdf.copy(), gdf.copy()

    expectation = raise_builder([how_raise], NotImplementedError)

    with expectation:
        expect = pdf.merge(pdf2, on=["a"], how=how)
        got = gdf.merge(gdf2, on=["a"], how=how)

        # For empty results, align index and column order before comparing.
        if len(expect) == len(got) == 0:
            expect = expect.reset_index(drop=True)
            got = got[expect.columns]

        assert_eq(expect, got)
Exemplo n.º 8
0
def test_merge_multi(kwargs):
    """Compare a multi-column cuDF merge with the pandas equivalent.

    ``kwargs`` is forwarded to both ``merge`` calls (with ``sort`` forced
    to ``True``).  Depending on its ``left_on`` / ``right_on`` /
    ``left_index`` / ``right_index`` entries, the frames are first given
    an index made of either the ``a``/``b`` or the ``c``/``d`` columns.
    The indexes are compared first; then both results are re-indexed and
    sorted by all columns before the full comparison.
    """
    left = DataFrame(
        {
            "a": [1, 2, 3, 4, 3, 5, 6],
            "b": [1, 3, 5, 7, 5, 9, 0],
            "c": ["o", "p", "q", "r", "s", "t", "u"],
            "d": ["v", "w", "x", "y", "z", "1", "2"],
        }
    )
    right = DataFrame(
        {
            "a": [0, 9, 3, 4, 3, 7, 8],
            "b": [2, 4, 5, 7, 5, 6, 8],
            "c": ["a", "b", "c", "d", "e", "f", "g"],
            "d": ["j", "i", "j", "k", "l", "m", "n"],
        }
    )

    # Decide which columns (if any) become the index on each side.
    left_index_cols = right_index_cols = None
    if (
        kwargs["left_on"] is not None
        and kwargs["right_on"] is not None
        and kwargs["left_index"] is False
        and kwargs["right_index"] is False
    ):
        left_index_cols, right_index_cols = ["c", "d"], ["c", "d"]
    elif (
        kwargs["left_on"] is None
        and kwargs["right_on"] is None
        and kwargs["left_index"] is True
        and kwargs["right_index"] is True
    ):
        left_index_cols, right_index_cols = ["a", "b"], ["a", "b"]
    elif kwargs["left_on"] is not None and kwargs["right_index"] is True:
        left_index_cols, right_index_cols = ["c", "d"], ["a", "b"]
    elif kwargs["right_on"] is not None and kwargs["left_index"] is True:
        left_index_cols, right_index_cols = ["a", "b"], ["c", "d"]

    if left_index_cols is not None:
        left = left.set_index(left_index_cols)
        right = right.set_index(right_index_cols)

    # pandas copies of the (possibly re-indexed) frames for the reference.
    pd_left = left.to_pandas()
    pd_right = right.to_pandas()

    kwargs["sort"] = True
    expect = pd_left.merge(pd_right, **kwargs)
    got = left.merge(right, **kwargs)

    assert_eq(expect.sort_index().index, got.sort_index().index)

    def _canonical(frame):
        # Drop the index, order rows by every column, then renumber.
        frame.index = range(len(frame))
        frame = frame.sort_values(list(frame.columns))
        frame.index = range(len(frame))
        return frame

    expect = _canonical(expect)
    got = _canonical(got)

    assert_eq(expect, got)
Exemplo n.º 9
0
def test_dataframe_empty_merge():
    """Left-merge two empty frames on ``a``; expect empty a, b, c columns."""
    left = DataFrame({"a": [], "b": []})
    right = DataFrame({"a": [], "c": []})

    merged = left.merge(right, how="left", on=["a"])

    empty_expected = DataFrame({"a": [], "b": [], "c": []})
    assert_eq(empty_expected, merged)
Exemplo n.º 10
0
def test_dataframe_empty_merge():
    """Left-merge two empty frames on ``a``; expect empty a, b, c columns.

    The frames are built from lists of ``(name, values)`` pairs.
    """
    left = DataFrame([("a", []), ("b", [])])
    right = DataFrame([("a", []), ("c", [])])

    merged = left.merge(right, how="left", on=["a"])

    empty_expected = DataFrame([("a", []), ("b", []), ("c", [])])
    assert_eq(empty_expected, merged)
Exemplo n.º 11
0
def test_dataframe_merge_no_common_column():
    """Merging frames that share no column name must raise ``ValueError``.

    The left frame has columns key1/key2/left_val and the right frame has
    key3/key4/right_val; ``merge`` is called without ``on``.
    """
    np.random.seed(0)
    nelem = 500

    # Left cuDF frame
    df_left = DataFrame()
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    # Right cuDF frame (deliberately different key names)
    df_right = DataFrame()
    df_right["key3"] = np.random.randint(0, 30, nelem)
    df_right["key4"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    with pytest.raises(
        ValueError, match="No common columns to perform merge on"
    ):
        df_left.merge(df_right, how="left")
Exemplo n.º 12
0
def test_dataframe_merge_on_unknown_column():
    """Merging on a column name absent from the frames must raise.

    Both frames have key1/key2 plus a value column; ``on="bad_key"`` is
    expected to raise ``KeyError`` whose message matches ``bad_key``.
    """
    np.random.seed(0)
    nelem = 500

    # Left cuDF frame
    df_left = DataFrame()
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    # Right cuDF frame
    df_right = DataFrame()
    df_right["key1"] = np.random.randint(0, 30, nelem)
    df_right["key2"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    with pytest.raises(KeyError, match="bad_key"):
        df_left.merge(df_right, on="bad_key", how="left")
Exemplo n.º 13
0
def test_dataframe_merge_on(on):
    """Compare cuDF left merges on ``on`` against the pandas result.

    Two 500-row frames with random ``key1`` / ``key2`` columns are merged
    with ``how="left"`` three ways: ``DataFrame.merge`` on pandas (the
    reference), ``DataFrame.merge`` on cuDF, and the module-level
    ``cudf.merge``.  Row and column order are ignored in the comparison.

    Parameters
    ----------
    on
        Value forwarded as the ``on`` argument of every merge call.
    """
    np.random.seed(0)

    # Make cuDF
    nelem = 500
    df_left = DataFrame()
    df_left["key1"] = np.random.randint(0, 40, nelem)
    df_left["key2"] = np.random.randint(0, 50, nelem)
    df_left["left_val"] = np.arange(nelem)

    df_right = DataFrame()
    df_right["key1"] = np.random.randint(0, 30, nelem)
    df_right["key2"] = np.random.randint(0, 50, nelem)
    df_right["right_val"] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result (from pandas)
    pddf_joined = pddf_left.merge(pddf_right, on=on, how="left")

    # Test (from cuDF; doesn't check for ordering)
    join_result = df_left.merge(df_right, on=on, how="left")
    join_result_cudf = cudf.merge(df_left, df_right, on=on, how="left")

    # Columns taken from the right frame are cast to float64 with nulls
    # filled by NaN on the cuDF side before comparing with pandas.
    float_cols = ["right_val"]
    float_cols.extend(col for col in pddf_joined.columns if "_y" in col)
    for col in float_cols:
        join_result[col] = (
            join_result[col].astype(np.float64).fillna(np.nan)
        )
        join_result_cudf[col] = (
            join_result_cudf[col].astype(np.float64).fillna(np.nan)
        )

    # Test dataframe equality (ignore order of rows and columns)
    sort_cols = list(pddf_joined.columns)

    cdf_result = (
        join_result.to_pandas().sort_values(sort_cols).reset_index(drop=True)
    )
    pdf_result = pddf_joined.sort_values(sort_cols).reset_index(drop=True)

    # pd.util.testing is deprecated since pandas 1.0 and removed in 2.0;
    # pd.testing is the public home of assert_frame_equal.
    pd.testing.assert_frame_equal(cdf_result, pdf_result, check_like=True)

    merge_func_result_cdf = (
        join_result_cudf.to_pandas()
        .sort_values(sort_cols)
        .reset_index(drop=True)
    )

    pd.testing.assert_frame_equal(
        merge_func_result_cdf, cdf_result, check_like=True
    )
0
def test_typecast_on_join_no_float_round():
    """Left-merge an int8 key against a float32 key with fractional values.

    The right keys 4.01 and 4.99 are expected NOT to match the left keys
    4 and 5: the expected ``B_y`` is ``None`` for those two rows, and the
    expected key column is float32.
    """
    labels = ["a", "b", "c", "d", "e"]

    int_keys = Series([1, 2, 3, 4, 5], dtype="int8")
    float_keys = Series([1, 2, 3, 4.01, 4.99], dtype="float32")

    gdf_l = DataFrame({"join_col": int_keys, "B": labels})
    gdf_r = DataFrame({"join_col": float_keys, "B": labels})

    expect = DataFrame(
        {
            "join_col": Series([1, 2, 3, 4, 5], dtype="float32"),
            "B_x": ["a", "b", "c", "d", "e"],
            # The last two left rows have no partner on the right.
            "B_y": ["a", "b", "c", None, None],
        }
    )

    got = gdf_l.merge(gdf_r, on="join_col", how="left")

    assert_eq(expect, got)
Exemplo n.º 15
0
def test_dataframe_merge_order():
    """Left-merge on ``["id", "a"]`` and compare cuDF with pandas.

    The left frame has columns id/timestamp/a and the right frame has
    id/a; identical data is loaded into cuDF and pandas frames.  The cuDF
    merge is invoked with ``method="hash"``.
    """
    merge_keys = ["id", "a"]

    # cuDF inputs and result
    gdf1 = DataFrame()
    gdf1["id"] = [10, 11]
    gdf1["timestamp"] = [1, 2]
    gdf1["a"] = [3, 4]

    gdf2 = DataFrame()
    gdf2["id"] = [4, 5]
    gdf2["a"] = [7, 8]

    gdf = gdf1.merge(gdf2, how="left", on=merge_keys, method="hash")

    # pandas inputs and reference result
    df1 = pd.DataFrame()
    df1["id"] = [10, 11]
    df1["timestamp"] = [1, 2]
    df1["a"] = [3, 4]

    df2 = pd.DataFrame()
    df2["id"] = [4, 5]
    df2["a"] = [7, 8]

    df = df1.merge(df2, how="left", on=merge_keys)

    assert_eq(gdf, df)