def test_summarize_removes_series_index(): # Note: currently wouldn't work in postgresql, since _.x + _.y not an agg func df = data_frame(g=['a', 'b', 'c'], x=[1, 2, 3], y=[4, 5, 6]) assert_equal_query(df, group_by(_.g) >> summarize(res=_.x + _.y), df.assign(res=df.x + df.y).drop(columns=["x", "y"]))
def test_filter_via_group_by_arrange(backend): dfs = backend.load_df(x=[3, 2, 1] + [2, 3, 4], g=[1] * 3 + [2] * 3) assert_equal_query( dfs, group_by(_.g) >> arrange(_.x) >> filter(_.x.cumsum() > 3), data_frame(x=[3, 3, 4], g=[1, 2, 2]))
def test_filter_via_group_by_agg_two_args(backend): dfs = backend.load_df(x=range(1, 11), g=[1] * 5 + [2] * 5) assert_equal_query( dfs, group_by(_.g) >> filter(_.x > _.x.mean(), _.x != _.x.max()), data_frame(x=[4, 9], g=[1, 2]))
def test_frame_summarize(skip_backend, backend, agg_entry): entry = agg_entry test_missing_implementation(entry, backend) # Prepare input data ------------------------------------------------------ # case: inputs must be boolean crnt_data = get_data(entry, DATA, backend) df = backend.load_df(crnt_data) # Execute mutate ---------------------------------------------------------- str_expr, call_expr = get_df_expr(entry) dst = data_frame(result=call_expr(crnt_data)) # Process output ---------------------------------------------------------- # case: output is of a different type than w/ pandas dst['result'] = cast_result_type(entry, backend, dst['result']) # Run test for equality w/ pandas ---- # otherwise, verify returns same result as mutate assert_equal_query(df, summarize(result=call_expr), dst) dst_g = crnt_data.groupby('g').apply(call_expr).reset_index().rename( columns={0: 'result'}) assert_equal_query(df, group_by(_.g) >> summarize(result=call_expr), dst_g)
def test_grouping_is_preserved(): # In siuba this actually tests 3 things: # 1) Can we pipe a grouped DataFrame in to `pivot_longer`? - Not yet # 2) Does it retain grouping? - Not yet # 3) Can we get the names of grouping _columns_ ie variables? - Not yet? df = data_frame(g = [1, 2], x1 = [1, 2], x2 = [3, 4]) # Breaks; `pivot_longer` needs singledispatch for grouped DataFrames out = ( df >> group_by(_.g) >> pivot_longer(_["x1":"x2"], names_to = "x", values_to = "v") ) # Breaks, as group_vars does not exist yet. # For now, in pandas it is probably better to check if the DataFrame remains # grouped, and if it matches the expected output # assert group_vars(out) == "g" expected = data_frame( g = [1, 1, 2, 2], x = ["x1", "x2", "x1", "x2"], v = [1, 3, 2, 4], _index = [0, 0, 1, 1] ).groupby("g") # assert_frame_equal does not work with DataFrameGroupBy. isinstance(out, expected.__class__) assert_frame_equal(out.obj, expected.obj)
def test_raw_sql_mutate_refer_previous_raise_dberror(backend, df): # Note: unlikely will be able to support this case. Normally we analyze # the expression to know whether we need to create a subquery. with pytest.raises(sqlalchemy.exc.DatabaseError): assert_equal_query( df, group_by("x") >> mutate(z1=sql_raw("y + 1"), z2=sql_raw("z1 + 1")), DATA.assign(z1=lambda d: d.y + 1, z2=lambda d: d.z1 + 1))
def test_arrange_grouped(backend, df): q = group_by(_.y) >> arrange(_.x) assert_equal_query(df, q, DATA.sort_values(['x'])) # arrange w/ mutate is the same, whether used before or after group_by assert_equal_query( df, q >> mutate(res=row_number(_)), mutate(DATA.sort_values(['x']).groupby('y'), res=row_number(_)))
def test_mutate_using_agg_expr(backend): data = data_frame(x = range(1, 5), g = [1,1,2,2]) dfs = backend.load_df(data) assert_equal_query( dfs, group_by(_.g) >> mutate(y = _.x - _.x.mean()), data.assign(y = [-.5, .5, -.5, .5]) )
def test_mutate_window_funcs(backend): data = data_frame(x = range(1, 5), g = [1,1,2,2]) dfs = backend.load_df(data) assert_equal_query( dfs, group_by(_.g) >> mutate(row_num = row_number(_).astype(float)), data.assign(row_num = [1.0, 2, 1, 2]) )
def test_filter_via_group_by(backend): df = data_frame(x=range(1, 11), g=[1] * 5 + [2] * 5) dfs = backend.load_df(df) assert_equal_query(dfs, group_by(_.g) >> filter(row_number(_) < 3), data_frame(x=[1, 2, 6, 7], g=[1, 1, 2, 2]))
def test_summarize_removes_1_grouping(backend): data = data_frame(a=1, b=2, c=3) df = backend.load_df(data) q1 = df >> group_by(_.a, _.b) >> summarize(n=n(_)) assert q1.group_by == ("a") q2 = q1 >> summarize(n=n(_)) assert not len(q2.group_by)
def test_frame_set_aggregates_postgresql(): # TODO: probably shouldn't be creating backend here backend = SqlBackend("postgresql") dfs = backend.load_df(data[None]) expr = _.x.quantile(.75) assert_equal_query(dfs, group_by(_.g) >> summarize(result=expr), data_frame(g=['a', 'b'], result=[11., 13.]))
def test_mutate_using_cuml_agg(backend): data = data_frame(x=range(1, 5), g=[1, 1, 2, 2]) dfs = backend.load_df(data) # cuml window without arrange before generates warning with pytest.warns(None): assert_equal_query(dfs, group_by(_.g) >> mutate(y=_.x.cumsum()), data.assign(y=[1.0, 3, 3, 7]))
def test_filter_vector(backend, func, simple_data): if backend.name == 'sqlite': pytest.skip() df = backend.load_cached_df(simple_data) res = data_frame(y=func(simple_data)) assert_equal_query( df, filter(func), filter(simple_data, func), # ignore dtypes, since sql -> an empty data frame has object columns check_dtype=False) # grouped (vs slow_filter) assert_equal_query(df, group_by(_.g) >> filter(func), simple_data >> group_by(_.g) >> filter(func), check_dtype=False)
def test_summarize_subquery_op_vars(backend, df): query = mutate(x2 = _.x + 1) >> group_by(_.g) >> summarize(low = _.x2.min()) assert_equal_query( df, query, data_frame(g = ['a', 'b'], low = [2, 4]) ) # check that is uses a subquery, since x2 is defined in first query text = str(query(df).last_op) assert text.count('FROM') == 2
def test_summarize_subquery_group_vars(backend, df): query = mutate(g2 = _.g.str.upper()) >> group_by(_.g2) >> summarize(low = _.x.min()) assert_equal_query( df, query, data_frame(g2 = ['A', 'B'], low = [1, 3]) ) # check that is uses a subquery, since g2 is defined in first query text = str(query(df).last_op) assert text.count('FROM') == 2
def distinct_events(tbl, time_col, user_col, type): if type not in ["first", "last"]: return tbl res = (tbl >> group_by(_[user_col]) >> arrange(_[time_col] if type == "first" else -_[time_col]) >> filter(row_number(_) == 1) >> ungroup() ) return res
def test_agg_vector(backend, func, simple_data): if backend.name == 'sqlite': pytest.skip() df = backend.load_cached_df(simple_data) res = data_frame(y=func(simple_data)) assert_equal_query(df, summarize(y=func), res) # grouped assert_equal_query( df, group_by(_.g) >> summarize(y=func), simple_data.groupby('g').apply(func).reset_index().rename( columns={0: 'y'}))
def test_mutate_vector(backend, func, simple_data): if backend.name == 'sqlite': pytest.skip() df = backend.load_cached_df(simple_data) assert_equal_query(df, mutate(y=func), simple_data.assign(y=func), check_dtype=False) # grouped assert_equal_query( df, group_by(_.g) >> mutate(y=func), simple_data.groupby('g').apply(lambda d: d.assign(y=func)).reset_index( drop=True), check_dtype=False)
def test_frame_mutate(skip_backend, backend, entry): test_missing_implementation(entry, backend) # Prepare input data ------------------------------------------------------ # case: inputs must be boolean crnt_data = get_data(entry, DATA, backend) df = backend.load_df(crnt_data) # Execute mutate ---------------------------------------------------------- str_expr, call_expr = get_df_expr(entry) # Run test for equality w/ ungrouped pandas ---- dst = crnt_data.assign(result=call_expr(crnt_data)) dst['result'] = cast_result_type(entry, backend, dst['result']) assert_equal_query(df, mutate(result=call_expr), dst) # Run test for equality w/ grouped pandas ---- g_dst = crnt_data.groupby('g').apply( lambda d: d.assign(result=call_expr)).reset_index(drop=True) g_dst['result'] = cast_result_type(entry, backend, g_dst['result']) assert_equal_query(df, group_by(_.g) >> mutate(result=call_expr), g_dst)
def test_group_by_override(df): gdf = df >> group_by(_.x, _.y) >> group_by(_.g) assert gdf.group_by == ("g", )
DATA = data_frame(a=1, b=2, c=3) @pytest.fixture(scope="module") def dfs(backend): return backend.load_df(DATA) @pytest.mark.parametrize("query, output", [ (select(_.c), data_frame(c=3)), (select(_.b == _.c), data_frame(b=3)), (select(_["a":"c"]), data_frame(a=1, b=2, c=3)), (select(_[_.a:_.c]), data_frame(a=1, b=2, c=3)), (select(_.a, _.b) >> select(_.b), data_frame(b=2)), (mutate(a=_.b + _.c) >> select(_.a), data_frame(a=5)), pytest.param(group_by(_.a) >> select(_.b), data_frame(b=2, a=1), marks=pytest.mark.xfail), ]) def test_select_siu(dfs, query, output): assert_equal_query(dfs, query, output) @pytest.mark.skip("TODO: #63") def test_select_kwargs(dfs): assert_equal_query(dfs, select(x=_.a), data_frame(x=1)) # Rename ----------------------------------------------------------------------
def test_transmute_grouped_no_mutate_grouping(dfs): assert_equal_query( dfs, group_by(_.a) >> transmute(a = _.a + 1), data_frame(a = [1,2,3], x = [2,3,4]) )
# + st.write("Goals by month") st.write("Top 8 players not in our data") top8 >> filter(_.yr_start < 1979) # + from pandas.tseries.offsets import MonthBegin from siuba.experimental.pd_groups import fast_summarize top8_goals = ( top8_games >> mutate( date=_.date.astype("datetime64[D]"), age_years=top8_games.age.str.split('-').str.get(0).astype(int)) >> arrange(_.date) >> group_by(_.player, month=_.date - MonthBegin(1)) >> fast_summarize(ttl_goals=_.goals.sum(), age_years=_.age_years.min()) >> group_by(_.player) >> mutate(cuml_goals=_.ttl_goals.cumsum()) >> ungroup()) p_goals = alt.Chart(top8_goals).mark_line().encode(y="cuml_goals:Q", color="player") # + time = st.selectbox("Choose a time", ["month", "age_years"]) st.write(p_goals.encode(x=time)) # + st.write("Goals by seasons")
def gdf(df): return df >> group_by(_.g)
def test_group_by_no_add(df): gdf = group_by(df, _.x, _.y) assert gdf.group_by == ("x", "y")
def test_filter_via_group_by_agg(backend): dfs = backend.load_df(x=range(1, 11), g=[1] * 5 + [2] * 5) assert_equal_query(dfs, group_by(_.g) >> filter(_.x > _.x.mean()), data_frame(x=[4, 5, 9, 10], g=[1, 1, 2, 2]))
def test_group_by_add(df): gdf = group_by(df, _.x) >> group_by(_.y, add=True) assert gdf.group_by == ("x", "y")
def test_group_by_performs_mutate(df): assert_equal_query(df, group_by(z=_.x + _.y) >> summarize(n=n(_)), data_frame(z=10, n=3))
def test_group_by_ungroup(df): q1 = df >> group_by(_.g) assert q1.group_by == ("g", ) q2 = q1 >> ungroup() assert q2.group_by == tuple()