def test_join_on_list_arg(backend):
    """Inner join with ``on`` given as a list of column names.

    Both inputs get a ``jj`` column copied from ``ii``; the result is compared
    against the first two rows of the left data with ``y = ["a", "b"]`` added.
    """
    # TODO: how to validate how cols are being matched up?
    left_data = DF1.assign(jj=lambda d: d.ii)
    left_tbl = backend.load_df(left_data)
    right_tbl = backend.load_df(DF2.assign(jj=lambda d: d.ii))

    joined = inner_join(left_tbl, right_tbl, ["ii", "jj"])
    result = joined >> collect()

    expected = left_data.iloc[:2, :].assign(y=["a", "b"])
    assert_frame_sort_equal(result, expected)
def test_join_on_same_col_multiple_times(backend):
    """Inner join where two left columns are both matched to one right column.

    The ``on`` mapping ``{("ii", "jj"): "ii"}`` pairs the left columns ``ii``
    and ``jj`` with the right column ``ii``. The expected result is the first
    two rows of the left data.

    ``backend`` is requested as a fixture argument (as the sibling tests do);
    the body calls ``backend.load_df`` and would otherwise hit an undefined
    name.
    """
    data = data_frame(ii=[1, 2, 3], jj=[1, 2, 9])
    df_a = backend.load_df(data)
    df_b = backend.load_df(data_frame(ii=[1, 2, 3]))

    out = inner_join(df_a, df_b, {("ii", "jj"): "ii"}) >> collect()

    # keeps all but last row
    assert_frame_sort_equal(out, data.iloc[:2, ])
def test_join_suffixes_dupe_names(df1):
    """Self-join on ``ii``: every other column must appear with both suffixes."""
    result = inner_join(df1, df1, {"ii": "ii"}) >> collect()

    # Every column of DF1 except the join key "ii".
    is_not_key = DF1.columns != "ii"
    non_index_cols = DF1.columns[is_not_key]

    # Each non-key column should show up once per side of the join.
    for side_suffix in ("_x", "_y"):
        suffixed = non_index_cols + side_suffix
        assert all(suffixed.isin(result))
def test_join_on_missing_col(df1, df2):
    """A join key naming a missing column raises ``KeyError`` on either side."""
    # First mapping has the unknown name on the left, second on the right.
    bad_mappings = (
        {"ABCDEF": "ii"},
        {"ii": "ABCDEF"},
    )
    for on_arg in bad_mappings:
        with pytest.raises(KeyError):
            inner_join(df1, df2, on_arg)
def test_join_on_str_arg(df1, df2):
    """Inner join with ``on`` given as a single column name string."""
    joined = inner_join(df1, df2, "ii")
    result = joined >> collect()

    # Expected: first two rows of DF1 with the matching ``y`` values added.
    expected = DF1.iloc[:2, ].assign(y=["a", "b"])
    assert_frame_sort_equal(result, expected)
def test_join_diff_vars_keeps_left(backend, df1, df2_jj):
    """Joining left ``ii`` to right ``jj`` yields exactly columns ii, x, y."""
    joined = inner_join(df1, df2_jj, {"ii": "jj"})
    result = joined >> collect()

    column_names = result.columns.tolist()
    assert column_names == ["ii", "x", "y"]
def test_basic_inner_join(df1, df2):
    """Inner join with ``on`` given as a ``{left: right}`` mapping."""
    joined = inner_join(df1, df2, {"ii": "ii"})
    result = joined >> collect()

    # Expected: first two rows of DF1 with the matching ``y`` values added.
    expected = DF1.iloc[:2, :].assign(y=["a", "b"])
    assert_frame_sort_equal(result, expected)
def test_inner_join_arrange(backend, df1, df2):
    """Joining an arranged table gives a result whose ``order_by`` is empty."""
    # NOTE: joins are free to scramble order in SQL. TODO: check dplyr
    sorted_left = arrange(df1, _.ii)
    joined = inner_join(sorted_left, df2, on="ii")

    assert joined.order_by == ()
def after_join(
    lhs,
    rhs,
    by_time,
    by_user,
    mode="inner",
    type="first-firstafter",
    max_gap=None,
    min_gap=None,
    gap_col=None,
    suffix=("_x", "_y"),
):
    """Join events in ``lhs`` to events in ``rhs`` that do not occur earlier.

    Rows of both tables are numbered (``__idx`` on the left, ``__idy`` on the
    right), paired up per user where the left time is ``<=`` the right time,
    and the surviving index pairs are used to join the two tables back
    together.

    Parameters
    ----------
    lhs, rhs
        Tables to join; they are piped through the verbs used below
        (``arrange``, ``mutate``, ``join`` ...).
    by_time, by_user
        Join keys for the time and user columns. Each is passed to
        ``_get_key_tuple`` to obtain a ``(left name, right name)`` pair.
    mode
        One of ``"inner"``, ``"left"``, ``"right"``, ``"full"``, ``"outer"``
        (forwarded as ``how`` to ``join``), or ``"semi"`` / ``"anti"``.
    type
        ``"<lhs type>-<rhs type>"``, e.g. ``"first-firstafter"``. Each half is
        forwarded to ``distinct_events``; an rhs type of ``"firstafter"``
        additionally keeps only the first pair per left row.
    max_gap, min_gap, gap_col
        Not implemented; any non-``None`` value raises.
    suffix
        Currently unused (see the TODO next to the joins).

    Raises
    ------
    NotImplementedError
        If ``max_gap``, ``min_gap`` or ``gap_col`` is given, or the lhs type
        is ``"firstwithin"`` or ``"lastbefore"``.
    ValueError
        If ``mode`` is not one of the recognized values.
    """
    gap_options = (max_gap, min_gap, gap_col)
    if any(option is not None for option in gap_options):
        raise NotImplementedError("max_gap, min_gap, gap_col not implemented")

    # Split e.g. "first-firstafter" into the per-table event types.
    type_lhs, type_rhs = type.split("-")

    # Resolve each join key into its (left, right) column names.
    by_time_x, by_time_y = _get_key_tuple(by_time)
    by_user_x, by_user_y = _get_key_tuple(by_user)

    # Number the rows of each table (ordered by user, then time) ----
    lhs_sorted = lhs >> arrange(_[by_user_x], _[by_time_x])
    lhs_numbered = lhs_sorted >> mutate(__idx = row_number(_))
    lhs_events = lhs_numbered >> distinct_events(by_time_x, by_user_x, type_lhs)

    rhs_sorted = rhs >> arrange(_[by_user_y], _[by_time_y])
    rhs_numbered = rhs_sorted >> mutate(__idy = row_number(_))
    rhs_events = rhs_numbered >> distinct_events(by_time_y, by_user_y, type_rhs)

    # When both tables share the time column name, the join below renames
    # them, so refer to the suffixed versions.
    # TODO: don't use implicit join suffix below
    shared_time_name = by_time_x == by_time_y
    pair_time_x = by_time_x + "_x" if shared_time_name else by_time_x
    pair_time_y = by_time_y + "_y" if shared_time_name else by_time_y

    # Pair events per user, keeping pairs where left time <= right time.
    user_pairs = inner_join(lhs_events, rhs_events, by_user)
    candidate_pairs = filter(user_pairs, _[pair_time_x] <= _[pair_time_y])

    # TODO: firstwithin
    if type_lhs in ("firstwithin", "lastbefore"):
        raise NotImplementedError("Can't currently handle lhs type %s" % type_lhs)

    # firstafter: per left row, keep the first pair when ordered by right time.
    if type_rhs == "firstafter":
        ordered_pairs = candidate_pairs >> arrange(_[pair_time_y])
        grouped_pairs = ordered_pairs >> group_by(_.__idx)
        first_pairs = grouped_pairs >> filter(row_number(_) == 1)
        candidate_pairs = first_pairs >> ungroup()

    index_pairs = select(candidate_pairs, _.__idx, _.__idy)

    if mode in ("semi", "anti"):
        if mode == "semi":
            filtering_join = semi_join
        else:
            filtering_join = anti_join
        matched = lhs_events >> filtering_join(_, index_pairs, "__idx")
        return matched >> select(-_["__idx", "__idy"])

    if mode not in ("inner", "left", "right", "full", "outer"):
        raise ValueError("mode not recognized: %s" %mode)

    # Attach the right-hand row index, then the right-hand rows themselves.
    rhs_keys = {by_user_x: by_user_y, "__idy": "__idy"}
    with_pairs = lhs_events >> join(_, index_pairs, on = "__idx", how = mode)
    # TODO: suffix arg
    with_rhs = with_pairs >> join(_, rhs_events, on = rhs_keys, how = mode)
    return with_rhs >> select(-_["__idx", "__idy"])