def test_errors():
    """filter() rejects wrong-typed, wrong-sized and keyword inputs."""
    # NOTE(review): this module defines `test_errors` more than once;
    # in a single file pytest only collects the last definition —
    # consider giving each a unique name.
    # wrong type
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter(range(1, 10))
    with pytest.raises(ValueError):
        iris >> filter(range(1, 10))
    # wrong size
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter([True, False])
    with pytest.raises(ValueError):
        iris >> rowwise(f.Species) >> filter([True, False])
    with pytest.raises(ValueError):
        iris >> filter([True, False])
    # wrong size in column
    with pytest.raises(ValueError):
        iris >> group_by(f.Species) >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        iris >> rowwise() >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        iris >> filter(tibble([True, False]))
    with pytest.raises(ValueError):
        tibble(x=1) >> filter([True, False])
    # named inputs
    with pytest.raises(TypeError):
        mtcars >> filter(x=1)
    with pytest.raises(TypeError):
        mtcars >> filter(f.y > 2, z=3)
    with pytest.raises(TypeError):
        mtcars >> filter(True, x=1)
    # across() in filter() does not warn yet
    tibble(x=1, y=2) >> filter(across(everything(), lambda x: x > 0))
def test_compound_ungroup():
    """ungroup() on scalars, SeriesGroupBy and grouped data frames."""
    # non-grouped input passes through unchanged
    assert ungroup(1) == 1
    g = Series([1, 2, 3]).groupby([1, 1, 2])
    # for a SeriesGroupBy, ungroup returns the underlying Series object
    assert ungroup(g) is g.obj
    # extra variable arguments are invalid for a SeriesGroupBy
    with pytest.raises(ValueError):
        ungroup(g, "abc")
    df = tibble(x=1, y=2) >> group_by(f.x, f.y)
    out = ungroup(df)
    assert group_vars(out) == []
    # ungrouping a subset of variables keeps the remaining ones grouped
    out = ungroup(df, f.x)
    assert group_vars(out) == ["y"]
    out = ungroup(df, f.y)
    assert group_vars(out) == ["x"]
    out = group_by(df, f.y, _add=True)
    assert group_vars(out) == ["x", "y"]
    # a rowwise frame cannot drop individual grouping variables
    rf = df >> rowwise()
    with pytest.raises(ValueError):
        ungroup(rf, f.x)
    # grouping by a non-existent column raises
    with pytest.raises(KeyError):
        group_by(df, f.w)
def test_slice_works_with_grouped_data():
    """slice() applies per group and matches equivalent filter() calls."""
    g = mtcars >> arrange(f.cyl) >> group_by(f.cyl)
    # first two rows of each group == row_number() < 3
    res = slice(g, f[:2])
    exp = filter(g, row_number() < 3)
    assert_frame_equal(res, exp)
    # negated slice is the complementary filter
    res = slice(g, ~f[:2])
    exp = filter(g, row_number() >= 3)
    assert_tibble_equal(res, exp)
    g = group_by(tibble(x=c(1, 1, 2, 2, 2)), f.x)
    # out = group_keys(slice(g, 3, _preserve=True))
    # assert out.x.tolist() == [1, 2]
    # with _preserve=False, groups with no remaining rows are dropped
    out = group_keys(slice(g, 2, _preserve=False))
    assert out.x.tolist() == [2]
    gf = tibble(x=f[1:4]) >> group_by(
        g=Categorical([1, 1, 2], categories=[1, 2, 3]),
        _drop=False,
    )
    # non-integer indices are rejected
    with pytest.raises(TypeError):
        gf >> slice("a")
    # mixing negated and plain indices is rejected
    with pytest.raises(ValueError):
        gf >> slice(~f[:2], 1)
    out = gf >> slice(0)
    assert out.shape[0] == 2
    # a grouped Series of indices is applied per group; relies on the
    # internal grouper of the grouped tibble
    out = gf >> slice(
        Series([1, 0, 0]).groupby(gf._datar["grouped"].grouper.result_index))
    assert_iterable_equal(out.x.obj, [2, 3])
def test_summarise_maintains_drop():
    """summarise() keeps the _drop setting of its grouped input."""
    df = tibble(
        f1=factor("a", levels=c("a", "b", "c")),
        f2=factor("d", levels=c("d", "e", "f", "g")),
        x=42,
    )
    # _drop=True keeps only observed factor combinations
    res = df >> group_by(f.f1, f.f2, _drop=True)
    ng = n_groups(res)
    assert ng == 1
    assert group_by_drop_default(res)
    # DataFrame.groupby(..., observed=False) doesn't support
    # multiple categoricals
    # res1 = df >> group_by(f.f1, f.f2, _drop=False)
    # ng = n_groups(res1)
    # assert ng == 12
    res1 = df >> group_by(f.f1, _drop=True)
    ng = n_groups(res1)
    assert ng == 1
    # _drop=False keeps empty factor levels as groups
    res1 = df >> group_by(f.f1, _drop=False)
    ng = n_groups(res1)
    assert ng == 3
    res1 = df >> group_by(f.f2, _drop=False)
    ng = n_groups(res1)
    assert ng == 4
    # summarised output inherits the drop default from `res`
    res2 = res >> summarise(x=sum(f.x), _groups="drop_last")
    ng = n_groups(res2)
    assert ng == 1
    assert group_by_drop_default(res2)
def test_switch_groupby_distinct_equal():
    """distinct() and group_by() commute on already-unique data."""
    data = tibble(g=c(1, 2), x=c(1, 2))
    distinct_then_group = data >> distinct() >> group_by(f.g)
    group_then_distinct = data >> group_by(f.g) >> distinct()
    assert distinct_then_group.equals(group_then_distinct)
def test_auto_splicing():
    """group_by() auto-splices data-frame and across() arguments."""
    # a one-column tibble argument splices into its column
    df1 = iris >> group_by(f.Species)
    df2 = iris >> group_by(tibble(Species=iris.Species))
    assert df1.equals(df2)
    # across() of a single column is the same as the column itself
    df1 = iris >> group_by(f.Species)
    df2 = iris >> group_by(across(f.Species))
    assert df1.equals(df2)
    # across() with a function == mutate() then group_by()
    df1 = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> group_by(f.Sepal_Length, f.Sepal_Width)
    )
    df2 = iris >> group_by(across(starts_with("Sepal"), round))
    assert df1.equals(df2)
    # across(character()), across(NULL) not supported
    df1 = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> group_by(f.Sepal_Length, f.Sepal_Width, f.Species)
    )
    df2 = iris >> group_by(across(starts_with("Sepal"), round), f.Species)
    assert df1.equals(df2)
    # argument order is respected when mixing across() and plain columns
    df1 = (
        iris
        >> mutate(across(starts_with("Sepal"), round))
        >> group_by(f.Species, f.Sepal_Length, f.Sepal_Width)
    )
    df2 = iris >> group_by(f.Species, across(starts_with("Sepal"), round))
    assert df1.equals(df2)
def test_mutate_semantics():
    """group_by() with keyword expressions == mutate() then group_by()."""
    direct = tibble(a=1, b=2) >> group_by(c=f.a * f.b, d=f.c + 1)
    via_mutate = (
        tibble(a=1, b=2)
        >> mutate(c=f.a * f.b, d=f.c + 1)
        >> group_by(f.c, f.d)
    )
    assert direct.equals(via_mutate)
def test_group_map_errors():
    """group_modify() rejects callbacks with invalid return values."""
    # head1 = lambda df: head(df, 1)
    # group_modify()
    # the callback must not re-introduce grouping variables
    with pytest.raises(ValueError, match="grouping variables"):
        mtcars >> group_by(f.cyl) >> group_modify(lambda d: tibble(cyl=19))
    # the callback must return a data frame
    with pytest.raises(ValueError, match="should be a data frame"):
        mtcars >> group_by(f.cyl) >> group_modify(lambda d: 10)
def test_add(df):
    """_add=True appends to existing grouping variables."""
    # grouping two variables at once
    grouped = df >> group_by(f.x, f.y, _add=True)
    assert group_vars(grouped) == ["x", "y"]
    # grouping them one at a time accumulates
    grouped = df >> group_by(f.x, _add=True) >> group_by(f.y, _add=True)
    assert group_vars(grouped) == ["x", "y"]
def test_add_tally_can_be_given_a_weighting_variable():
    """add_tally(wt=...) sums the weights within each group."""
    data = tibble(a=c(1, 1, 2, 2, 2), w=c(1, 1, 2, 3, 4))
    # a plain column as weight
    tallied = data >> group_by(f.a) >> add_tally(wt=f.w) >> pull(f.n, to="list")
    assert tallied == [2, 2, 9, 9, 9]
    # an expression as weight
    tallied = (
        data >> group_by(f.a) >> add_tally(wt=f.w + 1) >> pull(f.n, to="list")
    )
    assert tallied == [4, 4, 12, 12, 12]
def test_handles_scalar_results():
    """filter() with a scalar (per-group) condition keeps all rows."""
    df1 = mtcars >> filter(min(f.mpg) > 0)
    assert df1.equals(mtcars)
    df2 = (mtcars >> group_by(f.cyl) >> filter(min(f.mpg) > 0) >> arrange(
        f.cyl, f.mpg))
    # See TibbleGrouped's Known issues
    df3 = mtcars >> group_by(f.cyl) >> arrange(f.cyl, f.mpg)
    assert_frame_equal(df2, df3)
def test_mutate_does_not_loose_variables():
    """Regrouping a summarised frame keeps all remaining columns."""
    data = tibble(
        a=rep([1, 2, 3, 4], 2),
        b=rep([1, 2, 3, 4], each=2),
        x=runif(8),
    )
    grouped_ab = data >> group_by(f.a, f.b)
    # drop_last leaves the frame grouped by `a` only
    summed = grouped_ab >> summarise(x=sum(f.x), _groups="drop_last")
    requantiled = summed >> group_by(quantile=ntile(f.x, 4))
    assert requantiled.columns.tolist() == ["a", "b", "x", "quantile"]
def test_remember_drop_False():
    """_drop=False is remembered when the frame is regrouped."""
    filtered = (
        iris
        >> filter(f.Species == "setosa")
        >> group_by(f.Species, _drop=False)
    )
    assert not group_by_drop_default(filtered)
    # regrouping without an explicit _drop keeps the remembered setting
    regrouped = filtered >> group_by(f.Species)
    assert not group_by_drop_default(regrouped)
def test_0_vars(df):
    """group_by() with no variables yields one group of all rows."""
    # NOTE(review): the `df` fixture parameter is unused — the body
    # operates on `iris`; confirm this is intentional.
    gdata = group_data(group_by(iris))
    assert names(gdata) == ["_rows"]
    out = gdata
    assert_iterable_equal(out._rows[0], range(nrow(iris)))
    # passing no keyword groupings behaves the same way
    gdata = group_data(group_by(iris, **{}))
    assert names(gdata) == ["_rows"]
    out = gdata
    assert_iterable_equal(out._rows[0], range(nrow(iris)))
def test_add_passes_drop():
    """group_by(_add=True) inherits the existing _drop setting."""
    data = tibble(
        f1=factor("b", levels=c("a", "b", "c")),
        f2=factor("g", levels=c("e", "f", "g")),
        x=48,
    )
    regrouped = group_by(group_by(data, f.f1, _drop=True), f.f2, _add=True)
    # only the single observed combination survives with _drop=True
    ng = n_groups(regrouped)
    assert ng == 1
    assert group_by_drop_default(regrouped)
def test_filter_false_handles_indices(caplog):
    """filter(False) on grouped data; _preserve=True is unsupported."""
    out = mtcars >> group_by(f.cyl) >> filter(False, _preserve=True)
    # _preserve=True only logs a "not supported"-style message
    assert "support" in caplog.text
    # out = group_rows(out)
    # assert out == [[], [], []]
    # _preserve=False drops all (now empty) groups
    out = mtcars >> group_by(f.cyl) >> filter(False, _preserve=False)
    out = group_rows(out)
    assert out == []
def test_deals_with_0_groups():
    """mutate() on a grouped empty frame keeps columns and grouping."""
    df = tibble(x=[]) >> group_by(f.x)
    out = mutate(df, y=f.x + 1)
    exp = tibble(x=[], y=[]) >> group_by(f.x)
    assert_iterable_equal(out, exp)
    assert group_vars(out) == group_vars(exp)
    # aggregating expressions also work with zero groups
    out = mutate(df, y=max(f.x))
    assert out.shape == (0, 2)
    assert group_vars(out) == ["x"]
def test_output_preserves_grouping():
    """add_count() keeps the grouping of its input frame."""
    df = tibble(g=c(1, 2, 2, 2))
    exp = tibble(g=c(1, 2, 2, 2), n=c(1, 3, 3, 3))
    # ungrouped input: count by the given column
    out = df >> add_count(f.g)
    assert out.equals(exp)
    # grouped input: count within existing groups, grouping preserved
    out = df >> group_by(f.g) >> add_count()
    exp >>= group_by(f.g)
    assert out.equals(exp)
    assert group_vars(out) == group_vars(exp)
def test_preserve_grouping():
    """count() keeps grouping when called on a grouped frame."""
    data = tibble(g=c(1, 2, 2, 2))
    expected = tibble(g=c(1, 2), n=c(1, 3))
    # ungrouped input counts by the given column
    counted = data >> count(f.g)
    assert counted.equals(expected)
    # grouped input counts within groups and stays grouped
    grouped_count = data >> group_by(f.g) >> count()
    grouped_exp = expected >> group_by(f.g)
    assert grouped_count.equals(grouped_exp)
    assert group_vars(grouped_count) == group_vars(grouped_exp)
def test_remember_drop_True():
    """_drop=True survives filtering and regrouping."""
    grouped = iris >> group_by(f.Species, _drop=True)
    assert group_by_drop_default(grouped)
    # filtering keeps the drop default
    filtered = grouped >> filter(f.Sepal_Length > 5)
    assert group_by_drop_default(filtered)
    # even with _preserve=False
    unpreserved = grouped >> filter(f.Sepal_Length > 5, _preserve=False)
    assert group_by_drop_default(unpreserved)
    # and regrouping keeps it too
    regrouped = unpreserved >> group_by(f.Species)
    assert group_by_drop_default(regrouped)
def test_can_add_tallies_of_a_variable():
    """add_tally() appends a per-group count column `n`."""
    df = tibble(a=c(2, 1, 1))
    out = df >> group_by(f.a) >> add_tally()
    exp = tibble(a=c(2, 1, 1), n=c(1, 2, 2)) >> group_by(f.a)
    assert_frame_equal(out, exp)
    assert group_vars(out) == group_vars(exp)
    # sort
    out = df >> group_by(f.a) >> add_tally(sort=True)
    exp = tibble(a=c(1, 1, 2), n=c(2, 2, 1)) >> group_by(f.a)
    # .equals() here: sorting changes the index, so the stricter
    # assert_frame_equal is not used
    assert out.equals(exp)
    # assert_frame_equal(out, exp)
    assert group_vars(out) == group_vars(exp)
def test_errors(caplog):
    """summarise() regrouping messages and error conditions."""
    # NOTE(review): this module defines `test_errors` more than once;
    # in a single file pytest only collects the last definition —
    # consider giving each a unique name.
    df = tibble(x=1, y=2)
    # scalar-per-group output: grouped by all but the last variable
    out = df >> group_by(f.x, f.y) >> summarise()
    assert "`summarise()` has grouped output by ['x']" in caplog.text
    assert out.equals(df)
    caplog.clear()
    # multi-row output per group keeps all grouping variables
    out = tibble(x=1, y=2) >> group_by(f.x, f.y) >> summarise(z=[2, 2])
    assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text
    exp = tibble(x=[1, 1], y=[2, 2], z=[2, 2])
    assert out.equals(exp)
    caplog.clear()
    # rowwise with explicit variables groups output by those variables
    out = df >> rowwise(f.x, f.y) >> summarise()
    assert "`summarise()` has grouped output by ['x', 'y']" in caplog.text
    assert out.equals(df)
    caplog.clear()
    # plain rowwise produces ungrouped output
    out = df >> rowwise() >> summarise()
    assert "`summarise()` has ungrouped output" in caplog.text
    d = dim(out)
    assert d == (1, 0)
    caplog.clear()
    # unsupported type (but python objects are supported by pandas)
    # not testing for types futher
    # tibble(x=1, y=c(1, 2, 2), z=runif(3)) >> summarise(a=object())
    # incompatible size
    with pytest.raises(ValueError):
        tibble(z=1) >> summarise(x=[1, 2, 3], y=[1, 2])
    with pytest.raises(ValueError):
        tibble(z=[1, 2]) >> group_by(f.z) >> summarise(x=[1, 2, 3], y=[1, 2])
    with pytest.raises(ValueError):
        (
            tibble(z=c(1, 3))
            >> group_by(f.z)
            >> summarise(x=seq_len(f.z), y=[1, 2])
        )
    # Missing variable
    with pytest.raises(KeyError):
        summarise(mtcars, a=mean(f.not_there))
    with pytest.raises(KeyError):
        summarise(group_by(mtcars, f.cyl), a=mean(f.not_there))
    # Duplicate column names
    x = 1
    df = tibble(x, x, _name_repair="minimal")
    with pytest.raises(NameNonUniqueError):
        df >> summarise(f.x)
def test_group_map_respects_empty_groups():
    """group_map() iterates once per group of its input."""
    res = group_by(mtcars, f.cyl) >> group_map(lambda df: head(df, 2))
    assert len(list(res)) == 3
    # after filtering to one species, only one group remains
    res = (iris
           >> group_by(f.Species)
           >> filter(f.Species == "setosa")
           >> group_map(tally))
    assert len(list(res)) == 1
    res = (iris
           >> group_by(f.Species, _drop=False)
           >> filter(f.Species == "setosa")
           >> group_map.list(tally))
    # filter unable to keep the structure
    # assert len(res) == 3
    assert len(res) == 1
def test_joins_preserve_groups():
    """Join verbs keep the grouping of the left-hand frame."""
    left = tibble(a=[1, 2, 3]) >> group_by(f.a)
    right = tibble(a=rep([1, 2, 3, 4], 2), b=1) >> group_by(f.b)
    joined = inner_join(left, right, by="a")
    assert group_vars(joined) == ["a"]
    joined = semi_join(left, right, by="a")
    assert group_vars(joined) == ["a"]
    # See comment in nest_join
    joined = nest_join(left, right, by="a")
    assert group_vars(joined) == ["a"]
def test_errors():
    """group_by()/ungroup() raise on unknown or invalid variables."""
    # NOTE(review): this module defines `test_errors` more than once;
    # in a single file pytest only collects the last definition —
    # consider giving each a unique name.
    df = tibble(x=1, y=2)
    # unknown grouping column
    with pytest.raises(KeyError):
        df >> group_by(f.unknown)
    # cannot drop a grouping variable from an ungrouped frame
    with pytest.raises(ValueError):
        df >> ungroup(f.x)
    # cannot drop a variable that is not a grouping variable
    with pytest.raises(KeyError):
        df >> group_by(f.x, f.y) >> ungroup(f.z)
    # expression referencing a missing column
    with pytest.raises(KeyError):
        df >> group_by(z=f.a + 1)
def test_slice_handles_df_columns():
    """slice() works when columns are themselves data frames."""
    df = tibble(x=[1, 2],
                y=tibble(a=[1, 2], b=[3, 4]),
                z=tibble(A=[1, 2], B=[3, 4]))
    out = slice(df, 0)
    assert out.equals(df.iloc[[0], :])
    # grouping by the plain column, then slicing each (1-row) group
    gdf = group_by(df, f.x)
    assert slice(gdf, 0).equals(gdf)
    # TODO: group_by a stacked df is not supported yet
    # grouping by nested columns via the `$` accessor syntax
    gdf = group_by(df, f["y$a"], f["y$b"])
    assert slice(gdf, 0).equals(gdf)
    gdf = group_by(df, f["z$A"], f["z$B"])
    assert slice(gdf, 0).equals(gdf)
def test_preserve_order_across_groups():
    """Filtered row order is consistent however grouping is interleaved."""
    df = tibble(g=c(1, 2, 1, 2, 1), time=[5, 4, 3, 2, 1], x=f.time)
    # filter inside the grouping
    res1 = (df >> group_by(f.g) >> filter(f.x <= 4) >> ungroup() >> arrange(
        f.g, f.time))
    # pre-sorted before grouping
    res2 = (df >> arrange(f.g) >> group_by(f.g) >> filter(f.x <= 4)
            >> ungroup() >> arrange(f.g, f.time))
    # filter before grouping
    res3 = (df >> filter(f.x <= 4) >> group_by(f.g) >> ungroup() >> arrange(
        f.g, f.time))
    # drop the index so only row order/content is compared
    res1.reset_index(drop=True, inplace=True)
    res2.reset_index(drop=True, inplace=True)
    res3.reset_index(drop=True, inplace=True)
    assert res1.equals(res2)
    assert res1.equals(res3)
def test_group_column_names_reflect_renamed_duplicate_columns():
    """Joining a grouped frame keeps the group variable names."""
    # test_that("group column names reflect renamed duplicate columns (#2330)", {
    grouped = tibble(x=range(1, 6), y=range(1, 6)) >> group_by(f.x, f.y)
    plain = tibble(x=range(1, 6), y=range(1, 6))
    joined = inner_join(grouped, plain, by="x")
    assert group_vars(joined) == ["x"]
def test_not_duplicating_cols():
    """distinct() does not duplicate repeated or grouping columns."""
    data = tibble(a=[1, 2, 3], b=[4, 5, 6])
    # the same column given twice appears once
    dedup = data >> distinct(f.a, f.a)
    assert dedup.columns.tolist() == ["a"]
    # a grouping column selected again is not duplicated either
    dedup = data >> group_by(f.a) >> distinct(f.a)
    assert dedup.columns.tolist() == ["a"]
def test_input_recycled():
    """summarise() recycles scalar inputs against longer ones per group."""
    df1 = tibble() >> summarise(x=1, y=[1, 2, 3], z=1)
    df2 = tibble(x=1, y=[1, 2, 3], z=1)
    assert df1.equals(df2)
    # recycling happens within each group of a grouped frame
    gf = group_by(tibble(a=[1, 2]), f.a)
    df1 = gf >> summarise(x=1, y=[1, 2, 3], z=1)
    df2 = tibble(
        a=rep([1, 2], each=3),
        x=1,
        y=rep([1, 2, 3], 2),
        z=1
    ) >> group_by(f.a)
    assert_tibble_equal(df1, df2)
    # group-dependent lengths (seq_len of the group value) also recycle
    df1 = gf >> summarise(x=seq_len(f.a), y=1)
    df2 = tibble(a=c(1, 2, 2), x=c(1, 1, 2), y=1) >> group_by(f.a)
    # assert df1.equals(df2)
    assert_tibble_equal(df1, df2)