Exemplo n.º 1
0
def test_join_on_same_col_multiple_times(backend):
    """Inner join where two left columns are matched against one right column.

    The join spec ``{("ii", "jj"): "ii"}`` pairs both ``ii`` and ``jj`` of the
    left frame with ``ii`` of the right frame.

    The ``backend`` fixture is requested explicitly: the body calls
    ``backend.load_df``, so without the parameter pytest would not inject it.
    """
    data = data_frame(ii=[1, 2, 3], jj=[1, 2, 9])
    df_a = backend.load_df(data)
    df_b = backend.load_df(data_frame(ii=[1, 2, 3]))

    out = inner_join(df_a, df_b, {("ii", "jj"): "ii"}) >> collect()
    # keeps all but last row: in the last row jj (9) differs from ii (3)
    assert_frame_sort_equal(out, data.iloc[:2, ])
Exemplo n.º 2
0
def test_join_on_list_arg(backend):
    """Inner join accepts a list of column names as its join-key argument.

    Both inputs get a ``jj`` column that is a copy of ``ii``, and the join is
    requested on ``["ii", "jj"]``.
    """
    # TODO: how to validate how cols are being matched up?
    left_data = DF1.assign(jj=lambda frame: frame.ii)
    right_data = DF2.assign(jj=lambda frame: frame.ii)

    lhs = backend.load_df(left_data)
    rhs = backend.load_df(right_data)

    result = inner_join(lhs, rhs, ["ii", "jj"]) >> collect()

    # expected: first two left rows, with y attached from the right frame
    expected = left_data.iloc[:2, :].assign(y=["a", "b"])
    assert_frame_sort_equal(result, expected)
Exemplo n.º 3
0
def test_anti_join_arg_sql_on(backend, df1, df2):
    """Anti join driven by a custom ``sql_on`` condition (``lhs.ii > rhs.ii``)."""
    # run the anti join on the backend and pull the result back locally
    result = (
        anti_join(df1, df2, sql_on=lambda lhs, rhs: lhs.ii > rhs.ii)
        >> collect()
    )

    # expected: a single left row (ii == 1, x == "a") survives the anti join
    # NOTE(review): presumably because no right-hand ii is smaller than 1;
    # the fixture data is defined outside this block -- confirm.
    expected = data_frame(ii=[1], x=["a"])

    assert_frame_sort_equal(result, expected)
Exemplo n.º 4
0
def test_left_join_arg_sql_on(backend, df1, df2):
    """Left join driven by a custom ``sql_on`` condition (``lhs.ii > rhs.ii``)."""
    # run the left join on the backend and pull the result back locally
    result = (
        left_join(df1, df2, sql_on=lambda lhs, rhs: lhs.ii > rhs.ii)
        >> collect()
    )

    # expected: every left/right pairing that satisfies the condition, plus
    # the left row with no match (its right-hand columns are None)
    expected = data_frame(
        ii_x=[1, 2, 3, 3, 4, 4],
        ii_y=[None, 1, 1, 2, 1, 2],
        x=["a", "b", "c", "c", "d", "d"],
        y=[None, "a", "a", "b", "a", "b"],
    )

    # TODO: SQL columns seem to be returned in random order, so sort
    #       not sure why it's happening, look into in SqlAlchemy?
    result_sorted_cols = result.sort_index(axis=1)
    assert_frame_sort_equal(result_sorted_cols, expected)
Exemplo n.º 5
0
def test_basic_left_join(df1, df2):
    """Left join on ``ii`` keeps every DF1 row; unmatched rows get a missing ``y``."""
    joined = left_join(df1, df2, {"ii": "ii"}) >> collect()

    # first two rows pick up y from the right frame, the last two do not
    expected = DF1.assign(y=["a", "b", None, None])
    assert_frame_sort_equal(joined, expected)
Exemplo n.º 6
0
def test_join_suffixes_dupe_names(df1):
    """Self-join: every non-key column must appear with both ``_x`` and ``_y`` suffixes."""
    joined = inner_join(df1, df1, {"ii": "ii"}) >> collect()

    # every DF1 column except the join key
    shared_cols = DF1.columns[DF1.columns != "ii"]

    for suffix in ("_x", "_y"):
        assert all((shared_cols + suffix).isin(joined))
Exemplo n.º 7
0
def test_join_on_str_arg(df1, df2):
    """Inner join accepts a single column name string as its join-key argument."""
    joined = inner_join(df1, df2, "ii") >> collect()

    # expected: first two DF1 rows, with y attached from the right frame
    expected = DF1.iloc[:2, ].assign(y=["a", "b"])
    assert_frame_sort_equal(joined, expected)
Exemplo n.º 8
0
def test_join_diff_vars_keeps_left(backend, df1, df2_jj):
    """Joining left ``ii`` to right ``jj`` keeps only the left key name in the output."""
    joined = inner_join(df1, df2_jj, {"ii": "jj"}) >> collect()

    column_names = joined.columns.tolist()
    # the right-hand key column (jj) must not show up in the result
    assert column_names == ["ii", "x", "y"]
Exemplo n.º 9
0
def test_basic_anti_join(backend, df1, df2):
    """Anti join on two key pairs (``ii`` to ``ii``, ``x`` to ``y``)."""
    key_map = {"ii": "ii", "x": "y"}
    result = anti_join(df1, df2, on=key_map) >> collect()

    # expected: DF1 without its first two rows
    expected = DF1.iloc[2:, ]
    assert_frame_sort_equal(result, expected)
Exemplo n.º 10
0
def test_semi_join_no_cross(backend, df1, df2):
    """Semi join must not duplicate left rows when the right key repeats."""
    # right-hand frame holds the same key value twice
    repeated_keys = backend.load_df(data_frame(ii=[1, 1]))

    result = semi_join(df1, repeated_keys, {"ii": "ii"}) >> collect()

    # expected: only the first DF1 row, exactly once
    expected = DF1.iloc[:1, ]
    assert_frame_sort_equal(result, expected)
Exemplo n.º 11
0
def test_basic_semi_join(backend, df1, df2):
    """Semi join on ``ii`` is expected to return the first two DF1 rows."""
    result = semi_join(df1, df2, {"ii": "ii"}) >> collect()

    expected = DF1.iloc[:2, ]
    assert_frame_sort_equal(result, expected)
Exemplo n.º 12
0
def test_basic_full_join(skip_backend, backend, df1, df2):
    """Full join on ``ii`` must agree with a pandas outer merge of DF1 and DF2.

    ``skip_backend`` and ``backend`` are not referenced in the body.
    NOTE(review): presumably requested for their fixture effects -- confirm.
    """
    joined = full_join(df1, df2, {"ii": "ii"}) >> collect()

    # reference result computed directly with pandas
    expected = DF1.merge(DF2, on="ii", how="outer")
    assert_frame_sort_equal(joined, expected)
Exemplo n.º 13
0
def test_basic_inner_join(df1, df2):
    """Inner join on ``ii`` is expected to return the first two DF1 rows plus ``y``."""
    joined = inner_join(df1, df2, {"ii": "ii"}) >> collect()

    expected = DF1.iloc[:2, :].assign(y=["a", "b"])
    assert_frame_sort_equal(joined, expected)
Exemplo n.º 14
0
def test_basic_right_join(backend, df1, df2):
    """Right join with ``df2`` on the left and ``df1`` on the right.

    The expected frame keeps every DF1 row, with ``y`` missing on the last two.
    """
    # same as left join, but flip df arguments
    joined = right_join(df2, df1, {"ii": "ii"}) >> collect()

    expected = DF1.assign(y=["a", "b", None, None])
    assert_frame_sort_equal(joined, expected)
Exemplo n.º 15
0
def test_collect(df):
    """``collect`` returns a DataFrame when given data, and a Pipeable when called bare."""
    # direct call form
    direct = collect(df)
    assert isinstance(direct, pd.DataFrame)

    # pipe form
    piped = df >> collect()
    assert isinstance(piped, pd.DataFrame)

    # no data supplied: the call itself yields a Pipeable
    deferred = collect()
    assert isinstance(deferred, Pipeable)
Exemplo n.º 16
0
def test_basic_anti_join_on_str(backend, df1, df2):
    """Anti join accepts a single column name string for ``on``."""
    result = anti_join(df1, df2, on="ii") >> collect()

    # expected: DF1 without its first two rows
    expected = DF1.iloc[2:, ]
    assert_frame_sort_equal(result, expected)