def test_arranges_back_to_back(backend):
    """Two consecutive arranges should leave both sort keys tracked, in order."""
    source = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    tbl = backend.load_df(source)

    arranged = tbl >> arrange(_.x) >> arrange(_.g)

    # the lazy table's own bookkeeping of sort keys
    tracked_names = tuple(map(simple_varname, arranged.order_by))
    assert tracked_names == ("x", "g")

    # the ORDER BY clause attached to the most recent operation
    clause_names = [col.name for col in arranged.last_op._order_by_clause]
    assert clause_names == ["x", "g"]
def test_filter_via_group_by_desc_arrange(backend):
    """A cumulative filter should follow the descending arrange inside each group."""
    dfs = backend.load_df(x=[3, 2, 1, 2, 3, 4], g=[1, 1, 1, 2, 2, 2])

    query = group_by(_.g) >> arrange(desc(_.x)) >> filter(_.x.cumsum() > 3)

    # g == 1: x sorted desc is 3, 2, 1 -> cumsum 3, 5, 6 -> keeps 2, 1
    # g == 2: x sorted desc is 4, 3, 2 -> cumsum 4, 7, 9 -> keeps all three
    expected = data_frame(x=[2, 1, 4, 3, 2], g=[1, 1, 2, 2, 2])

    assert_equal_query(dfs, query, expected)
def test_arrange_grouped(backend, df):
    """Arranging grouped data should match a plain pandas sort on x."""
    q = group_by(_.y) >> arrange(_.x)

    plain_sorted = DATA.sort_values(by=['x'])
    assert_equal_query(df, q, plain_sorted)

    # arrange w/ mutate is the same, whether used before or after group_by
    numbered_query = q >> mutate(res=row_number(_))
    sorted_then_grouped = DATA.sort_values(by=['x']).groupby('y')
    numbered_target = mutate(sorted_then_grouped, res=row_number(_))
    assert_equal_query(df, numbered_query, numbered_target)
def distinct_events(tbl, time_col, user_col, type):
    """Reduce ``tbl`` to a single event per user.

    With ``type == "first"`` rows are ordered by ``time_col`` ascending, with
    ``type == "last"`` by the negated ``time_col``; the first row of each
    ``user_col`` group is then kept. Any other ``type`` returns ``tbl`` as is.
    """
    if type == "first":
        sort_key = _[time_col]
    elif type == "last":
        # negating the time column puts the latest event at the top
        sort_key = -_[time_col]
    else:
        return tbl

    grouped = tbl >> group_by(_[user_col])
    ordered = grouped >> arrange(sort_key)
    top_rows = ordered >> filter(row_number(_) == 1)
    return top_rows >> ungroup()
# + st.write("Goals by month") st.write("Top 8 players not in our data") top8 >> filter(_.yr_start < 1979) # + from pandas.tseries.offsets import MonthBegin from siuba.experimental.pd_groups import fast_summarize top8_goals = ( top8_games >> mutate( date=_.date.astype("datetime64[D]"), age_years=top8_games.age.str.split('-').str.get(0).astype(int)) >> arrange(_.date) >> group_by(_.player, month=_.date - MonthBegin(1)) >> fast_summarize(ttl_goals=_.goals.sum(), age_years=_.age_years.min()) >> group_by(_.player) >> mutate(cuml_goals=_.ttl_goals.cumsum()) >> ungroup()) p_goals = alt.Chart(top8_goals).mark_line().encode(y="cuml_goals:Q", color="player") # + time = st.selectbox("Choose a time", ["month", "age_years"]) st.write(p_goals.encode(x=time)) # + st.write("Goals by seasons")
def test_inner_join_arrange(backend, df1, df2):
    """An inner join should not carry over the ordering of its arranged input."""
    # NOTE: joins are free to scramble order in SQL. TODO: check dplyr
    sorted_lhs = arrange(df1, _.ii)
    joined = inner_join(sorted_lhs, df2, on="ii")

    assert joined.order_by == ()
def test_arrange_grouped_trivial(df):
    """Grouping on a single-valued column should not affect the arrange result."""
    # note: only 1 level for z
    query = group_by(_.z) >> arrange(_.x)
    expected = DATA.sort_values(by=['x'])

    assert_equal_query(df, query, expected)
import pandas as pd
import pytest

from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql

# Shared three-row frame: loaded into the backend by the `df` fixture and also
# sorted directly with pandas to build the expected outputs below.
DATA = data_frame(x=[2, 2, 1], y=[2, 1, 1], z=['z'] * 3)


@pytest.fixture(scope="module")
def df(backend):
    """Return DATA loaded through the backend under test (module-scoped)."""
    return backend.load_df(DATA)


# Each case pairs an arrange call (symbolic columns, string names, or a mix)
# with the equivalent pandas sort of DATA.
@pytest.mark.parametrize("query, output", [(arrange(_.x), DATA.sort_values(['x'])), (arrange("x"), DATA.sort_values(['x'])), (arrange(_.x, _.y), DATA.sort_values(['x', 'y'])), (arrange("x", "y"), DATA.sort_values(['x', 'y'])), (arrange(_.x, "y"), DATA.sort_values(['x', 'y']))])
def test_basic_arrange(df, query, output):
    """Check that each arrange query yields the same rows as the pandas sort."""
    assert_equal_query(df, query, output)


# Descending cases: a negated column in arrange is paired with
# ascending=False for that column in the pandas sort.
@pytest.mark.parametrize("query, output", [
    (arrange(-_.x), DATA.sort_values(['x'], ascending=[False])),
    (arrange(-_.x, _.y), DATA.sort_values(['x', 'y'], ascending=[False, True ])),
    (arrange(-_.x, "y"), DATA.sort_values(['x', 'y'], ascending=[False, True]))
    ])
def test_arrange_desc(df, query, output):
def test_distinct_after_arrange(df):
    """Compare distinct-after-arrange against a fixed single-column frame."""
    query = arrange(_.x) >> distinct(_.y)
    expected = pd.DataFrame({"y": [5, 4, 3, 2, 1]})

    assert_equal_query(df, query, expected)
def after_join(
        lhs, rhs,
        by_time, by_user,
        mode = "inner",
        type = "first-firstafter",
        max_gap = None,
        min_gap = None,
        gap_col = None,
        suffix = ("_x", "_y")
        ):
    """Join rows of ``lhs`` to rows of ``rhs`` for the same user that occur at or after them.

    Both tables are sorted by user and time and given a row index
    (``__idx`` / ``__idy``). They are then inner joined on the user key, and
    only pairs whose lhs time is ``<=`` the rhs time are kept. Those index
    pairs drive the final join selected by ``mode``.

    Args:
        lhs: left table.
        rhs: right table.
        by_time: time key, split into an (lhs, rhs) column pair by
            ``_get_key_tuple``.
        by_user: user key, split the same way; also passed as-is to the
            pairing inner join.
        mode: one of "inner", "left", "right", "full", "outer" (passed as
            ``how`` to ``join``), or "semi" / "anti".
        type: "<lhs type>-<rhs type>", e.g. "first-firstafter". Each half is
            handed to ``distinct_events``; an rhs type of "firstafter" also
            keeps, per lhs row, only the pair with the earliest rhs time.
        max_gap, min_gap, gap_col: not implemented; must be None.
        suffix: currently unused (see the TODO on the final join).

    Returns:
        The joined table with the helper index columns dropped.

    Raises:
        NotImplementedError: if a gap argument is given, or the lhs type is
            "firstwithin" or "lastbefore".
        ValueError: if ``mode`` is not recognized, or ``type`` does not split
            into exactly two parts on "-".
    """
    if max_gap is not None or min_gap is not None or gap_col is not None:
        raise NotImplementedError("max_gap, min_gap, gap_col not implemented")

    # Get type of join for both tables, from e.g. "first-firstafter"
    type_lhs, type_rhs = type.split("-")

    # Validate everything up front, so unsupported requests fail before any
    # sorting or joining work is done on the (possibly large) inputs.
    # TODO: firstwithin
    if type_lhs in ("firstwithin", "lastbefore"):
        raise NotImplementedError("Can't currently handle lhs type %s" % type_lhs)

    standard_modes = ("inner", "left", "right", "full", "outer")
    filtering_modes = ("semi", "anti")
    if mode not in standard_modes and mode not in filtering_modes:
        raise ValueError("mode not recognized: %s" %mode)

    # Convert join keys to dictionary form
    by_time_x, by_time_y = _get_key_tuple(by_time)
    by_user_x, by_user_y = _get_key_tuple(by_user)

    # mutate in row_number ----
    lhs_i = (lhs
            >> arrange(_[by_user_x], _[by_time_x])
            >> mutate(__idx = row_number(_))
            >> distinct_events(by_time_x, by_user_x, type_lhs)
            )

    rhs_i = (rhs
            >> arrange(_[by_user_y], _[by_time_y])
            >> mutate(__idy = row_number(_))
            >> distinct_events(by_time_y, by_user_y, type_rhs)
            )

    # Handle when time column is in the other table
    if by_time_x == by_time_y:
        # TODO: don't use implicit join suffix below
        pair_time_x, pair_time_y = by_time_x + "_x", by_time_y + "_y"
    else:
        pair_time_x, pair_time_y = by_time_x, by_time_y

    # Inner join by user, filter by time
    pairs = filter(
            inner_join(lhs_i, rhs_i, by_user),
            _[pair_time_x] <= _[pair_time_y]
            )

    # Handle firstafter by subsetting
    if type_rhs == "firstafter":
        # earliest rhs event for each lhs row
        pairs = (pairs
                >> arrange(_[pair_time_y])
                >> group_by(_.__idx)
                >> filter(row_number(_) == 1)
                >> ungroup()
                )

    distinct_pairs = select(pairs, _.__idx, _.__idy)

    if mode in standard_modes:
        by_dict = {by_user_x: by_user_y, "__idy": "__idy"}
        res = (lhs_i
                >> join(_, distinct_pairs, on = "__idx", how = mode)
                # TODO: suffix arg
                >> join(_, rhs_i , on = by_dict, how = mode)#, suffix = suffix)
                >> select(-_["__idx", "__idy"])
                )
    else:
        # mode is "semi" or "anti" (validated above)
        join_func = semi_join if mode == "semi" else anti_join
        # NOTE(review): lhs_i itself only gains __idx; confirm that dropping
        # "__idy" here is valid for the semi/anti join result.
        res = (lhs_i
                >> join_func(_, distinct_pairs, "__idx")
                >> select(-_["__idx", "__idy"])
                )

    return res