示例#1
0
def test_n():
    """n() reports the total row count, or one count per group."""
    data = tibble(x=[1, 2, 3], g=[1, 1, 2])

    # Ungrouped input: a single count covering all three rows.
    total = data >> summarise(n=n())
    assert_iterable_equal(total.n, [3])

    # Grouped by g: two rows have g == 1 and one row has g == 2.
    grouped = data >> group_by(f.g)
    per_group = grouped >> summarise(n=n())
    assert_iterable_equal(per_group.n, [2, 1])
示例#2
0
def test_peels_off_a_single_layer_of_grouping():
    """Each summarise() call drops the last grouping variable."""
    data = tibble(
        x=rep([1, 2, 3, 4], each=4),
        y=rep([1, 2], each=8),
        z=runif(16),
    )
    by_x_and_y = data >> group_by(f.x, f.y)

    # One summarise: grouped by x only.
    assert group_vars(summarise(by_x_and_y)) == ["x"]
    # Two summarises: no grouping left.
    assert group_vars(summarise(summarise(by_x_and_y))) == []
示例#3
0
def test_cur_group_rows():
    """cur_group_rows() gives the row positions belonging to each group."""
    data = tibble(x=c("b", "a", "b"), y=[1, 2, 3])
    grouped = data >> group_by(f.x, _sort=True)

    # Grouped: "a" sits at row 1, "b" at rows 0 and 2.
    rows = grouped >> summarise(x=cur_group_rows()) >> pull()
    assert rows.values.tolist() == [[1], [0, 2]]

    # Plain data frame: one entry listing every row.
    rows = data >> summarise(x=cur_group_rows()) >> pull()
    assert rows.values.tolist() == [[0, 1, 2]]
示例#4
0
def test_recycling():
    """across() results of compatible lengths recycle; others raise."""
    # x=1 and y=2 drive rep(42, ...); the result must match a 2-row frame.
    compatible = tibble(x=1, y=2)
    recycled = compatible >> summarise(
        across(everything(), lambda times: rep(42, times))
    )
    expected = tibble(x=rep(42, 2), y=rep(42, 2))
    assert recycled.equals(expected)

    # x=2 and y=3 cannot be reconciled, so a ValueError is expected.
    incompatible = tibble(x=2, y=3)
    with pytest.raises(ValueError):
        incompatible >> summarise(
            across(everything(), lambda times: rep(42, times))
        )
示例#5
0
def test_works_with_empty_data_frames():
    """summarise() without expressions collapses to a single empty row."""
    expected = tibble(_rows=1)

    # Frame built from an empty column.
    no_rows = tibble(x=[])
    summary = summarise(no_rows)
    assert summary.equals(expected)

    # Frame built with ten rows and no columns.
    no_cols = tibble(_rows=10)
    summary = summarise(no_cols)
    assert summary.equals(expected)
示例#6
0
def test_summarise_cols_inside_func():
    """Column expressions passed through a registered function resolve."""
    source = tibble(x=2, y=4, z=8)

    # Thin registered wrapper that forwards its keyword arguments to tibble().
    @register_func(None, context=None)
    def data_frame(**columns):
        return tibble(**columns)

    # Every column divided by y, expressed via the wrapper ...
    via_func = source >> summarise(
        data_frame(x=f.x / f.y, y=f.y / f.y, z=f.z / f.y)
    )
    # ... and via across() for comparison.
    via_across = source >> summarise(
        across(everything(), lambda col: col / source.y)
    )
    assert via_func.equals(via_across)
示例#7
0
def test_works_with_grouped_empty_data_frames():
    """Grouped and rowwise summaries of a zero-row frame keep zero rows."""
    empty = tibble(x=[])

    # group_by: no rows, but both the key and the new column are present.
    by_group = empty >> group_by(f.x) >> summarise(y=1)
    assert dim(by_group) == (0, 2)
    assert by_group.columns.tolist() == ["x", "y"]

    # rowwise: same shape, and x is still reported as the grouping variable.
    by_row = empty >> rowwise(f.x) >> summarise(y=1)
    assert group_vars(by_row) == ["x"]
    assert dim(by_row) == (0, 2)
    assert by_row.columns.tolist() == ["x", "y"]
示例#8
0
def test_to_functions():
    """Extra keyword arguments to across() are forwarded to the functions."""
    data = tibble(x=c(1, NA))  # the NA makes this a float column

    # Single function: na_rm=True leaves only the value 1.
    single = data >> summarise(across(everything(), mean, na_rm=True))
    assert_frame_equal(single, tibble(x=1.0))

    # Named functions: one output column per (column, function) pair.
    named = data >> summarise(
        across(everything(), {"mean": mean, "median": median}, na_rm=True)
    )
    assert_frame_equal(named, tibble(x_mean=1.0, x_median=1.0))
示例#9
0
def test_cur_group():
    """cur_group() yields the group keys, or NaN for ungrouped data."""
    data = tibble(g=1, x=1)
    grouped = data >> group_by(f.g)

    # Ungrouped: a single missing value.
    ungrouped_key = data >> summarise(key=cur_group()) >> pull(f.key)
    assert len(ungrouped_key) == 1
    assert_iterable_equal(ungrouped_key, [np.nan])

    # Grouped: a single frame holding the key of the only group.
    grouped_keys = (
        grouped
        >> summarise(key=cur_group())
        >> pull(f.key, to="list")
    )
    assert len(grouped_keys) == 1
    assert grouped_keys[0].equals(tibble(g=1))
示例#10
0
def test_cur_group_id():
    """cur_group_id() numbers groups from zero in group order."""
    data = tibble(x=c("b", "a", "b"))

    # Ungrouped: the whole frame is group 0.
    ungrouped = data >> summarise(id=cur_group_id())
    assert_iterable_equal(ungrouped.id, [0])

    grouped = data >> group_by(f.x, _sort=True)

    # Groups are requested sorted, so "a" is expected as 0 and "b" as 1.
    summarised = grouped >> summarise(id=cur_group_id())
    assert_frame_equal(summarised, tibble(x=c("a", "b"), id=[0, 1]))

    # mutate keeps the original row order while using the same ids.
    mutated = grouped >> mutate(id=cur_group_id())
    assert_frame_equal(mutated, tibble(x=["b", "a", "b"], id=[1, 0, 1]))
示例#11
0
def test_error_messages():
    """across()/c_across() raise ValueError with the expected messages."""
    bad_fns = "Argument `_fns` of across must be"
    outside_verb = "must only be used inside verbs"

    # A non-callable `_fns` value (42) is rejected.
    with pytest.raises(ValueError, match=bad_fns):
        tibble(x=1) >> summarise(res=across(where(is_numeric), 42))

    # Both helpers refuse to run outside of a verb.
    with pytest.raises(ValueError, match=outside_verb):
        across()
    with pytest.raises(ValueError, match=outside_verb):
        c_across()
示例#12
0
def test_zero_row_dfs():
    """Major verbs accept a grouped zero-row frame and keep it consistent."""
    empty = tibble(a=[], b=[], g=[])
    grouped = group_by(empty, f.g, _drop=False)
    assert grouped.shape == (0, 3)
    assert group_vars(grouped) == ["g"]
    assert group_size(grouped) == []

    # summarise: the key plus the new column, with no grouping left.
    res = summarise(grouped, n=n())
    assert res.shape == (0, 2)
    assert group_vars(res) == []

    # mutate: one extra column, grouping kept.
    res = mutate(grouped, c=f.b + 1)
    assert res.shape == (0, 4)
    assert group_vars(res) == ["g"]
    assert group_size(res) == []

    # filter: shape and grouping unchanged.
    res = filter(grouped, f.a == 100)
    assert res.shape == (0, 3)
    assert group_vars(res) == ["g"]
    assert group_size(res) == []

    # arrange: shape and grouping unchanged.
    res = arrange(grouped, f.a, f.g)
    assert res.shape == (0, 3)
    assert group_vars(res) == ["g"]
    assert group_size(res) == []

    # select: the grouping column comes along with the selected one.
    res = select(grouped, f.a)
    assert res.shape == (0, 2)
    assert group_vars(res) == ["g"]
    assert group_size(res) == []
示例#13
0
def test_input_recycled():
    """Scalar summaries are recycled to match the longest result."""
    # Ungrouped: x and z stretch to the three values of y.
    actual = tibble() >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(x=1, y=[1, 2, 3], z=1)
    assert actual.equals(expected)

    by_a = group_by(tibble(a=[1, 2]), f.a)

    # Grouped: each of the two groups yields three rows.
    actual = by_a >> summarise(x=1, y=[1, 2, 3], z=1)
    expected = tibble(
        a=rep([1, 2], each=3),
        x=1,
        y=rep([1, 2, 3], 2),
        z=1,
    ) >> group_by(f.a)
    assert_tibble_equal(actual, expected)

    # Group-dependent lengths: a == 1 gives one row, a == 2 gives two.
    actual = by_a >> summarise(x=seq_len(f.a), y=1)
    expected = tibble(a=c(1, 2, 2), x=c(1, 1, 2), y=1) >> group_by(f.a)
    assert_tibble_equal(actual, expected)
示例#14
0
def test_one_group_for_NA():
    """All NA keys fall into one group: ten distinct values plus NA."""
    keys = c(NA, NA, NA, range(10, 0, -1), range(10, 0, -1))
    weights = numpy.array(c(20, 30, 40, range(1, 11), range(1, 11))) * 10

    # Counting distinct values directly ...
    assert n_distinct(keys, na_rm=False) == 11

    # ... and counting groups must agree.
    counts = tibble(x=keys, w=weights) >> group_by(f.x) >> summarise(n=n())
    assert nrow(counts) == 11
示例#15
0
def test_allows_names():
    """Named summary expressions work; b holds each group's median."""
    grouped = tibble(x=[1, 2, 3], y=letters[:3]) >> group_by(f.y)
    summary = grouped >> summarise(a=length(f.x), b=quantile(f.x, 0.5))

    # Each group has a single x value, so the 0.5 quantile is that value.
    assert summary.b.tolist() == [1.0, 2.0, 3.0]
示例#16
0
def test_summarise_maintains_drop():
    """The `_drop` setting controls group counts and survives summarise()."""
    data = tibble(
        f1=factor("a", levels=c("a", "b", "c")),
        f2=factor("d", levels=c("d", "e", "f", "g")),
        x=42,
    )

    # Two factors, unused levels dropped: only the observed pair remains.
    by_both = data >> group_by(f.f1, f.f2, _drop=True)
    count = n_groups(by_both)
    assert count == 1
    assert group_by_drop_default(by_both)

    # NOTE: the two-factor `_drop=False` case (12 groups) is not exercised:
    # DataFrame.groupby(..., observed=False) doesn't support
    # multiple categoricals.

    # One factor, unused levels dropped.
    by_one = data >> group_by(f.f1, _drop=True)
    count = n_groups(by_one)
    assert count == 1

    # One factor, unused levels kept: one group per level of f1.
    by_one = data >> group_by(f.f1, _drop=False)
    count = n_groups(by_one)
    assert count == 3

    # Same for f2 and its four levels.
    by_one = data >> group_by(f.f2, _drop=False)
    count = n_groups(by_one)
    assert count == 4

    # Summarising the dropped grouping keeps the drop default.
    summarised = by_both >> summarise(x=sum(f.x), _groups="drop_last")
    count = n_groups(summarised)
    assert count == 1
    assert group_by_drop_default(summarised)
示例#17
0
def test_0col_df_in_results_ignored():
    """A zero-column tibble passed to summarise() contributes nothing."""
    keys_only = tibble(x=[1, 2])

    # Only the grouping column comes back.
    actual = keys_only >> group_by(f.x) >> summarise(tibble())
    assert actual.equals(keys_only)

    # A regular summary next to the empty tibble is still produced.
    actual = keys_only >> group_by(f.x) >> summarise(tibble(), y=65)
    expected = keys_only >> mutate(y=65)
    assert actual.equals(expected)

    # Same checks with an extra, non-grouping column in the input.
    with_extra = tibble(x=[1, 2], y=[3, 4])
    actual = with_extra >> group_by(f.x) >> summarise(tibble())
    assert actual.equals(keys_only)

    actual = with_extra >> group_by(f.x) >> summarise(tibble(), z=98)
    expected = keys_only >> mutate(z=98)
    assert actual.equals(expected)
示例#18
0
def test_rowwise_preserved_by_major_verbs():
    """Verbs return TibbleRowwise for rowwise input, except summarise()."""
    rowwise_df = rowwise(tibble(x=range(1, 6), y=range(5, 0, -1)), f.x)

    res = arrange(rowwise_df, f.y)
    assert isinstance(res, TibbleRowwise)
    assert group_vars(res) == ["x"]

    res = filter(rowwise_df, f.x < 3)
    assert isinstance(res, TibbleRowwise)
    assert group_vars(res) == ["x"]

    res = mutate(rowwise_df, x=f.x + 1)
    assert isinstance(res, TibbleRowwise)
    assert group_vars(res) == ["x"]

    # Renaming the key column renames the grouping variable too.
    res = rename(rowwise_df, X=f.x)
    assert isinstance(res, TibbleRowwise)
    assert group_vars(res) == ["X"]

    res = select(rowwise_df, "x")
    assert isinstance(res, TibbleRowwise)
    assert group_vars(res) == ["x"]

    res = slice(rowwise_df, c(0, 0))
    assert isinstance(res, TibbleRowwise)
    assert group_vars(res) == ["x"]

    # summarise is the exception: the result is a grouped tibble.
    res = summarise(rowwise_df, z=mean(f.x, f.y))
    assert isinstance(res, TibbleGrouped)
    assert group_vars(res) == ["x"]
示例#19
0
def test_names_output():
    """Output column names of across() follow `_fns` and `_names`."""
    grouped = tibble(x=1, y=2, z=3, s="") >> group_by(f.x)

    # No functions: columns pass through under their own names.
    res = grouped >> summarise(across())
    assert res.columns.tolist() == ["x", "y", "z", "s"]

    # `_names` template applied to every non-grouping column.
    res = grouped >> summarise(across(_names="id_{_col}"))
    assert res.columns.tolist() == ["x", "id_y", "id_z", "id_s"]

    # Single function: names unchanged, non-numeric column excluded.
    res = grouped >> summarise(across(where(is_numeric), mean))
    assert res.columns.tolist() == ["x", "y", "z"]

    res = grouped >> summarise(
        across(where(is_numeric), mean, _names="mean_{_col}")
    )
    assert res.columns.tolist() == ["x", "mean_y", "mean_z"]

    # Dict of functions: "<column>_<key>".
    res = grouped >> summarise(
        across(where(is_numeric), {"mean": mean, "sum": sum})
    )
    assert res.columns.tolist() == [
        "x", "y_mean", "y_sum", "z_mean", "z_sum"
    ]

    # Different from R's list: non-string keys are used as given.
    res = grouped >> summarise(
        across(where(is_numeric), {"mean": mean, 1: sum})
    )
    assert res.columns.tolist() == ["x", "y_mean", "y_1", "z_mean", "z_1"]

    # Different from R's list: non-string keys are used as given.
    res = grouped >> summarise(
        across(where(is_numeric), {0: mean, "sum": sum})
    )
    assert res.columns.tolist() == ["x", "y_0", "y_sum", "z_0", "z_sum"]

    # List of functions: zero-based position as the suffix.
    res = grouped >> summarise(across(where(is_numeric), [mean, sum]))
    assert res.columns.tolist() == ["x", "y_0", "y_1", "z_0", "z_1"]

    # `{_fn1}` gives the one-based position instead.
    res = grouped >> summarise(
        across(where(is_numeric), [mean, sum], _names="{_col}_{_fn1}")
    )
    assert res.columns.tolist() == ["x", "y_1", "y_2", "z_1", "z_2"]

    # Template placing the function name before the column name.
    res = grouped >> summarise(
        across(
            where(is_numeric),
            {"mean": mean, "sum": sum},
            _names="{_fn}_{_col}",
        )
    )
    assert res.columns.tolist() == [
        "x", "mean_y", "sum_y", "mean_z", "sum_z"
    ]
示例#20
0
def test_pd_cat():
    """pd_cat() exposes the full category list within every group."""
    levels = ["a", "b", "c"]
    grouped = tibble(
        x=Categorical(["a", "b"], categories=["a", "b", "c"])
    ) >> group_by(g=[1, 2])

    res = grouped >> summarise(lvls=pd_cat(f.x).categories)

    # Both groups report all three categories, including the unused "c".
    assert_iterable_equal(res.lvls[0], levels)
    assert_iterable_equal(res.lvls[1], levels)
示例#21
0
def test_list_output_columns():
    """A summary expression may collect a group's values into a list."""
    data = tibble(x=range(1, 11), g=rep([1, 2], each=5))
    grouped = data >> group_by(f.g)
    collected = grouped >> summarise(y=f.x.apply(list))

    # The first group holds the first five values of x.
    assert_iterable_equal(collected.y[0], [1, 2, 3, 4, 5])
示例#22
0
def test_cur_data_all():
    """cur_data() omits the grouping columns; cur_data_all() keeps them."""
    data = tibble(x=c("b", "a", "b"), y=[1, 2, 3])
    grouped = data >> group_by(f.x, _sort=True)

    # Ungrouped: both helpers return the whole frame.
    frames = data >> summarise(x=cur_data()) >> pull(f.x, to="list")
    assert frames[0].equals(data)

    frames = data >> summarise(x=cur_data_all()) >> pull(f.x, to="list")
    assert frames[0].equals(data)

    # Grouped, cur_data(): only the y values of each group.
    pieces = grouped >> summarise(x=cur_data()) >> pull(f.x)
    assert pieces.values[0].values.flatten().tolist() == [2]
    assert pieces.values[1].values.flatten().tolist() == [1, 3]

    # Grouped, cur_data_all(): the key column is included as well.
    pieces = grouped >> summarise(x=cur_data_all()) >> pull(f.x)
    assert pieces.values[0].values.flatten().tolist() == ["a", 2]
    assert pieces.values[1].values.flatten().tolist() == ["b", 1, "b", 3]
示例#23
0
def test_correctly_reconstructs_groups():
    """After summarise(), group rows reflect the remaining g1 grouping."""
    grouped = tibble(
        x=[1, 2, 3, 4],
        g1=rep([1, 2], 2),
        g2=[1, 2, 3, 4],
    ) >> group_by(f.g1, f.g2)
    summarised = grouped >> summarise(x=f.x + 1)

    # Different from dplyr, original df does not reorder.
    assert group_rows(summarised) == [[0, 2], [1, 3]]
示例#24
0
def test_summarise_rowwise():
    """Rowwise summarise() evaluates the expression once per row."""
    settings = tibble(
        sim=[1, 2, 3],
        n=[1, 2, 3],
        mean=[1, 2, 1],
        sd=[1, 4, 2],
    )

    sims = settings >> rowwise(f.sim) >> summarise(
        z=rnorm(f.n, f.mean, f.sd)
    )

    # Key column plus z.
    assert len(sims.columns) == 2
    # Row i draws n[i] values: 1, 2 and 3 respectively.
    assert len(sims.z.obj.values[0]) == 1
    assert len(sims.z.obj.values[1]) == 2
    assert len(sims.z.obj.values[2]) == 3
示例#25
0
def test_n_distinct_handles_in_na_rm():
    """n_distinct() accepts na_rm as a literal, a variable or an expression."""
    data = tibble(x=c([1, 2, 3, 4], NA))
    drop_na = True
    keep_na = False

    # Literal flags: NA is excluded (4) or counted (5).
    count = (
        data
        >> summarise(n=n_distinct(f.x, na_rm=True))
        >> pull(to="list")
    )
    assert count == [4]
    count = (
        data
        >> summarise(n=n_distinct(f.x, na_rm=False))
        >> pull(to="list")
    )
    assert count == [5]

    # Flags held in local variables.
    count = (
        data
        >> summarise(n=n_distinct(f.x, na_rm=drop_na))
        >> pull(to="list")
    )
    assert count == [4]
    count = (
        data
        >> summarise(n=n_distinct(f.x, na_rm=keep_na))
        >> pull(to="list")
    )
    assert count == [5]

    # Flag computed by a boolean expression.
    count = (
        data
        >> summarise(n=n_distinct(f.x, na_rm=True or True))
        >> pull(to="list")
    )
    assert count == [4]
示例#26
0
def test_n_distinct_works_with_str_col():
    """n_distinct() works when the column is looked up by a string name."""

    def wrapper(data, col):
        # Resolve the column through f[<name>] rather than f.<name>.
        return summarise(data, result=n_distinct(f[col], na_rm=True))

    source = tibble(x=[1, 1, 3, NA])
    actual = wrapper(source, "x")

    # The distinct non-missing values are 1 and 3.
    expected = tibble(result=2)
    assert actual.equals(expected)
示例#27
0
def test_mutate_does_not_loose_variables():
    """Regrouping by a computed column keeps all existing columns."""
    data = tibble(
        a=rep([1, 2, 3, 4], 2),
        b=rep([1, 2, 3, 4], each=2),
        x=runif(8),
    )
    grouped_ab = data >> group_by(f.a, f.b)
    grouped_a = grouped_ab >> summarise(x=sum(f.x), _groups="drop_last")

    # group_by() with a new expression appends it as a column.
    regrouped = grouped_a >> group_by(quantile=ntile(f.x, 4))

    assert regrouped.columns.tolist() == ["a", "b", "x", "quantile"]
示例#28
0
def test_can_be_before_group_by():
    """select() between group_by() and summarise() keeps the result columns."""
    panel = tibble(
        id=c(1, 1, 2, 2, 2, 3, 3, 4, 4, 5),
        year=c(2013, 2013, 2012, 2013, 2013, 2013, 2012, 2012, 2013, 2013),
        var1=rnorm(10),
    )
    averaged = (
        panel
        >> group_by(f.id, f.year)
        >> select(f.id, f.year, f.var1)
        >> summarise(var1=mean(f.var1))
    )

    assert_iterable_equal(names(averaged), ["id", "year", "var1"])
示例#29
0
def test_result_locations_aligned_with_column_names():
    """Each across() result lands under its own '<column>_<fn>' name."""
    data = tibble(x=[1, 2], y=["a", "b"])
    expected = tibble(
        x_cls=numpy.int64,
        x_type=True,
        y_cls=object,
        y_type=False,
    )

    # Two functions per column: the dtype and the numeric check.
    actual = data >> summarise(
        across(
            everything(),
            {"cls": lambda col: col.dtype, "type": is_numeric},
        )
    )
    assert_frame_equal(actual, expected)
示例#30
0
def test_c_across():
    """c_across() combines the selected (or all non-key) columns."""
    data = tibble(x=[1, 2], y=[3, 4])

    # Explicit selection: z ends up holding both columns.
    combined = data >> summarise(z=c_across([f.x, f.y]))
    assert_frame_equal(combined["z"], data)

    # No columns specified: in rowwise(x) only y is left to sum.
    by_row = data >> rowwise(f.x)
    summed = by_row >> mutate(z=sum(c_across()))
    assert summed.z.obj.tolist() == [3, 4]