def test_join_on_same_col_multiple_times(backend):
    """Inner join where two left-hand columns are matched to one right-hand column.

    The ``on`` mapping pairs both ``ii`` and ``jj`` of the left table with
    ``ii`` of the right table, so only rows where all three agree survive.

    Args:
        backend: fixture exposing ``load_df``; it is requested here because
            the body calls ``backend.load_df`` (the name was previously
            missing from the signature).
    """
    data = data_frame(ii=[1, 2, 3], jj=[1, 2, 9])
    df_a = backend.load_df(data)
    df_b = backend.load_df(data_frame(ii=[1, 2, 3]))

    out = inner_join(df_a, df_b, {("ii", "jj"): "ii"}) >> collect()

    # keeps all but last row (jj == 9 has no matching ii on the right)
    assert_frame_sort_equal(out, data.iloc[:2, ])
def test_join_on_list_arg(backend):
    """Inner join with ``on`` given as a list of column names."""
    # TODO: how to validate how cols are being matched up?
    left_data = DF1.assign(jj=lambda d: d.ii)
    left_tbl = backend.load_df(left_data)
    right_tbl = backend.load_df(DF2.assign(jj=lambda d: d.ii))

    joined = inner_join(left_tbl, right_tbl, ["ii", "jj"]) >> collect()

    # expected: first two rows of the left data, plus the y column
    expected = left_data.iloc[:2, :].assign(y=["a", "b"])
    assert_frame_sort_equal(joined, expected)
def test_anti_join_arg_sql_on(backend, df1, df2):
    """Anti join whose match condition is supplied as a ``sql_on`` callable."""

    def cond(lhs, rhs):
        # join condition: left ii strictly greater than right ii
        return lhs.ii > rhs.ii

    # collect sql result
    result = anti_join(df1, df2, sql_on=cond) >> collect()

    # the single row expected to remain after the anti join
    expected = data_frame(ii=[1], x=["a"])
    assert_frame_sort_equal(result, expected)
def test_left_join_arg_sql_on(backend, df1, df2):
    """Left join whose match condition is supplied as a ``sql_on`` callable."""

    def cond(lhs, rhs):
        # join condition: left ii strictly greater than right ii
        return lhs.ii > rhs.ii

    # collect sql result
    result = left_join(df1, df2, sql_on=cond) >> collect()

    # expected rows: every (left, right) pair satisfying cond, with the
    # unmatched left row kept and padded with None
    expected = data_frame(
        ii_x=[1, 2, 3, 3, 4, 4],
        ii_y=[None, 1, 1, 2, 1, 2],
        x=["a", "b", "c", "c", "d", "d"],
        y=[None, "a", "a", "b", "a", "b"],
    )

    # TODO: SQL columns seem to be returned in random order, so sort
    #       not sure why it's happening, look into in SqlAlchemy?
    result_sorted_cols = result.sort_index(axis=1)
    assert_frame_sort_equal(result_sorted_cols, expected)
def test_basic_left_join(df1, df2):
    """Left join on ``ii`` keeps every DF1 row; unmatched rows get ``y`` of None."""
    joined = left_join(df1, df2, {"ii": "ii"}) >> collect()

    expected = DF1.assign(y=["a", "b", None, None])
    assert_frame_sort_equal(joined, expected)
def test_join_suffixes_dupe_names(df1):
    """Self-join should expose each non-key column under ``_x`` and ``_y`` suffixes."""
    joined = inner_join(df1, df1, {"ii": "ii"}) >> collect()

    # every DF1 column except the join key
    value_cols = DF1.columns[DF1.columns != "ii"]

    for suffix in ("_x", "_y"):
        assert all((value_cols + suffix).isin(joined))
def test_join_on_str_arg(df1, df2):
    """Inner join with ``on`` given as a single column name string."""
    joined = inner_join(df1, df2, "ii") >> collect()

    # expected: first two rows of DF1, plus the y column
    expected = DF1.iloc[:2].assign(y=["a", "b"])
    assert_frame_sort_equal(joined, expected)
def test_join_diff_vars_keeps_left(backend, df1, df2_jj):
    """Joining ``ii`` to ``jj`` should leave only the left-hand key name in the result."""
    joined = inner_join(df1, df2_jj, {"ii": "jj"}) >> collect()

    column_names = joined.columns.tolist()
    assert column_names == ["ii", "x", "y"]
def test_basic_anti_join(backend, df1, df2):
    """Anti join on two key pairs should equal DF1 from its third row onward."""
    join_keys = {"ii": "ii", "x": "y"}
    result = anti_join(df1, df2, on=join_keys) >> collect()

    expected = DF1.iloc[2:]
    assert_frame_sort_equal(result, expected)
def test_semi_join_no_cross(backend, df1, df2):
    """Semi join must not duplicate left rows when the right key repeats."""
    # right-hand table holds the same key value twice
    repeated_keys = backend.load_df(data_frame(ii=[1, 1]))

    result = semi_join(df1, repeated_keys, {"ii": "ii"}) >> collect()

    # only the first DF1 row is expected, exactly once
    expected = DF1.iloc[:1]
    assert_frame_sort_equal(result, expected)
def test_basic_semi_join(backend, df1, df2):
    """Semi join on ``ii`` should equal the first two rows of DF1."""
    result = semi_join(df1, df2, {"ii": "ii"}) >> collect()

    expected = DF1.iloc[:2]
    assert_frame_sort_equal(result, expected)
def test_basic_full_join(skip_backend, backend, df1, df2):
    """Full join on ``ii`` should agree with a pandas outer merge of DF1 and DF2."""
    # NOTE(review): skip_backend is not used in the body; presumably it is
    # requested only for its fixture side effect — confirm against conftest.
    joined = full_join(df1, df2, {"ii": "ii"}) >> collect()

    expected = DF1.merge(DF2, on="ii", how="outer")
    assert_frame_sort_equal(joined, expected)
def test_basic_inner_join(df1, df2):
    """Inner join on ``ii`` should keep the first two DF1 rows and add ``y``."""
    joined = inner_join(df1, df2, {"ii": "ii"}) >> collect()

    matched_rows = DF1.iloc[:2, :]
    expected = matched_rows.assign(y=["a", "b"])
    assert_frame_sort_equal(joined, expected)
def test_basic_right_join(backend, df1, df2):
    """Right join with swapped arguments should match the left-join expectation."""
    # same as left join, but flip df arguments
    joined = right_join(df2, df1, {"ii": "ii"}) >> collect()

    expected = DF1.assign(y=["a", "b", None, None])
    assert_frame_sort_equal(joined, expected)
def test_collect(df):
    """``collect`` gives a DataFrame when handed data, and a Pipeable when called bare."""
    called_directly = collect(df)
    called_by_pipe = df >> collect()
    called_without_data = collect()

    assert isinstance(called_directly, pd.DataFrame)
    assert isinstance(called_by_pipe, pd.DataFrame)
    assert isinstance(called_without_data, Pipeable)
def test_basic_anti_join_on_str(backend, df1, df2):
    """Anti join with ``on`` as a column name string should equal DF1 from row three on."""
    result = anti_join(df1, df2, on="ii") >> collect()

    expected = DF1.iloc[2:]
    assert_frame_sort_equal(result, expected)