def test_summarize_removes_1_grouping(backend):
    """Check that each `summarize` call drops one level of grouping.

    After grouping by `a` and `b`, one `summarize` must leave only "a" as a
    grouping name, and a second `summarize` must leave none.
    """
    data = data_frame(a=1, b=2, c=3)
    df = backend.load_df(data)

    q1 = df >> group_by(_.a, _.b) >> summarize(n=n(_))
    # Compare against a real 1-tuple: ("a") without a trailing comma is just
    # the string "a". Normalising with tuple() keeps the check independent of
    # the concrete sequence type used to store the grouping names.
    assert tuple(q1.group_by) == ("a",)

    q2 = q1 >> summarize(n=n(_))
    assert not len(q2.group_by)
max_size=10)) OMNIBUS_VECTOR_FUNCS = [ #cumall, cumany, cummean, #desc, v.dense_rank(_.x, na_option="keep"), #v.percent_rank(_.x), v.min_rank(_.x, na_option="keep"), v.cume_dist(_.x, na_option="keep"), v.row_number(_.x), #ntile, v.between(_.x, 2, 5, default=False), v.coalesce(_.x, 2), v.lead(_.x), v.lag(_.x), v.n(_.x), v.na_if(_.x, 2), #near, v.nth(_.x, 2), v.first(_.x), v.last(_.x, order_by=_.x), # TODO: in SQL getting FROM LAST requires order by ] VECTOR_AGG_FUNCS = [ v.n(_.x), v.n(_), ] VECTOR_FILTER_FUNCS = [ v.dense_rank(_.x, na_option="keep") < 2,
def test_summarize_unnamed_args(df):
    """Check that a positional (unnamed) summarize expression yields a column
    named after the expression itself, here 'n(_)'.
    """
    # pandas cannot build a DataFrame from a dict of bare scalars without an
    # index (it raises ValueError), so the single expected row is given as a
    # one-element list.
    expected = pd.DataFrame({'n(_)': [4]})
    assert_equal_query(df, summarize(n(_)), expected)
def test_summarize_removes_order_vars(backend, df):
    """After `summarize`, the resulting table's `order_by` must be empty."""
    summarized = df >> summarize(n=n(_))
    assert len(summarized.order_by) == 0
def test_summarize_keeps_group_vars(backend, gdf):
    """Summarizing the grouped table must expose columns "g" then "n" on the
    last operation of the resulting query.
    """
    summarized = gdf >> summarize(n=n(_))
    column_names = list(summarized.last_op.c.keys())
    assert column_names == ["g", "n"]
def df(backend): return backend.load_df(DATA) @pytest.fixture(scope="module") def df_float(backend): return backend.load_df(DATA.assign(x=lambda d: d.x.astype(float))) @pytest.fixture(scope="module") def gdf(df): return df >> group_by(_.g) @pytest.mark.parametrize("query, output", [ (summarize(y=n(_)), data_frame(y=4)), (summarize(y=_.x.min()), data_frame(y=1)), ]) def test_summarize_ungrouped(df, query, output): assert_equal_query(df, query, output) @pytest.mark.skip("TODO: should return 1 row (#63)") def test_ungrouped_summarize_literal(df, query, output): assert_equal_query(df, summarize(y=1), data_frame(y=1)) @backend_notimpl("sqlite") def test_summarize_after_mutate_cuml_win(backend, df_float): assert_equal_query(df_float, mutate(y=_.x.cumsum()) >> summarize(z=_.y.max()),
def test_group_by_performs_mutate(df):
    """Grouping by a keyword expression (`z=_.x + _.y`) and then counting is
    expected to produce a single row with z=10 and n=3.
    """
    query = group_by(z=_.x + _.y) >> summarize(n=n(_))
    expected = data_frame(z=10, n=3)
    assert_equal_query(df, query, expected)
def test_vector_n(x):
    """`v.n` applied to the supplied `x` is expected to equal 3."""
    count = v.n(x)
    assert count == 3