Пример #1
0
def test_summarize_removes_series_index():
    # Note: currently wouldn't work in postgresql, since _.x + _.y not an agg func
    df = data_frame(g=['a', 'b', 'c'], x=[1, 2, 3], y=[4, 5, 6])

    assert_equal_query(df,
                       group_by(_.g) >> summarize(res=_.x + _.y),
                       df.assign(res=df.x + df.y).drop(columns=["x", "y"]))
Пример #2
0
def test_filter_via_group_by_arrange(backend):
    dfs = backend.load_df(x=[3, 2, 1] + [2, 3, 4], g=[1] * 3 + [2] * 3)

    assert_equal_query(
        dfs,
        group_by(_.g) >> arrange(_.x) >> filter(_.x.cumsum() > 3),
        data_frame(x=[3, 3, 4], g=[1, 2, 2]))
Пример #3
0
def test_filter_via_group_by_agg_two_args(backend):
    dfs = backend.load_df(x=range(1, 11), g=[1] * 5 + [2] * 5)

    assert_equal_query(
        dfs,
        group_by(_.g) >> filter(_.x > _.x.mean(), _.x != _.x.max()),
        data_frame(x=[4, 9], g=[1, 2]))
def test_frame_summarize(skip_backend, backend, agg_entry):
    entry = agg_entry
    test_missing_implementation(entry, backend)

    # Prepare input data ------------------------------------------------------
    # case: inputs must be boolean
    crnt_data = get_data(entry, DATA, backend)
    df = backend.load_df(crnt_data)

    # Execute mutate ----------------------------------------------------------
    str_expr, call_expr = get_df_expr(entry)

    dst = data_frame(result=call_expr(crnt_data))

    # Process output ----------------------------------------------------------
    # case: output is of a different type than w/ pandas
    dst['result'] = cast_result_type(entry, backend, dst['result'])

    # Run test for equality w/ pandas ----
    # otherwise, verify returns same result as mutate
    assert_equal_query(df, summarize(result=call_expr), dst)

    dst_g = crnt_data.groupby('g').apply(call_expr).reset_index().rename(
        columns={0: 'result'})
    assert_equal_query(df, group_by(_.g) >> summarize(result=call_expr), dst_g)
Пример #5
0
def test_grouping_is_preserved():
    # In siuba this actually tests 3 things:
    # 1) Can we pipe a grouped DataFrame in to `pivot_longer`? - Not yet
    # 2) Does it retain grouping? - Not yet
    # 3) Can we get the names of grouping _columns_ ie variables? - Not yet?
    df = data_frame(g = [1, 2], x1 = [1, 2], x2 = [3, 4])
    # Breaks; `pivot_longer` needs singledispatch for grouped DataFrames
    out = (
        df
        >> group_by(_.g)
        >> pivot_longer(_["x1":"x2"], names_to = "x", values_to = "v")
    )

    # Breaks, as group_vars does not exist yet.
    # For now, in pandas it is probably better to check if the DataFrame remains
    # grouped, and if it matches the expected output
    # assert group_vars(out) == "g"
    expected = data_frame(
        g = [1, 1, 2, 2],
        x = ["x1", "x2", "x1", "x2"],
        v = [1, 3, 2, 4],
        _index = [0, 0, 1, 1]
    ).groupby("g")

    # assert_frame_equal does not work with DataFrameGroupBy.
    isinstance(out, expected.__class__)
    assert_frame_equal(out.obj, expected.obj)
Пример #6
0
def test_raw_sql_mutate_refer_previous_raise_dberror(backend, df):
    # Note: unlikely will be able to support this case. Normally we analyze
    # the expression to know whether we need to create a subquery.
    with pytest.raises(sqlalchemy.exc.DatabaseError):
        assert_equal_query(
            df,
            group_by("x") >> mutate(z1=sql_raw("y + 1"), z2=sql_raw("z1 + 1")),
            DATA.assign(z1=lambda d: d.y + 1, z2=lambda d: d.z1 + 1))
Пример #7
0
def test_arrange_grouped(backend, df):
    q = group_by(_.y) >> arrange(_.x)
    assert_equal_query(df, q, DATA.sort_values(['x']))

    # arrange w/ mutate is the same, whether used before or after group_by
    assert_equal_query(
        df, q >> mutate(res=row_number(_)),
        mutate(DATA.sort_values(['x']).groupby('y'), res=row_number(_)))
Пример #8
0
def test_mutate_using_agg_expr(backend):
    data = data_frame(x = range(1, 5), g = [1,1,2,2])
    dfs = backend.load_df(data)
    assert_equal_query(
            dfs,
            group_by(_.g) >> mutate(y = _.x - _.x.mean()),
            data.assign(y = [-.5, .5, -.5, .5])
            )
Пример #9
0
def test_mutate_window_funcs(backend):
    data = data_frame(x = range(1, 5), g = [1,1,2,2])
    dfs = backend.load_df(data)
    assert_equal_query(
            dfs,
            group_by(_.g) >> mutate(row_num = row_number(_).astype(float)),
            data.assign(row_num = [1.0, 2, 1, 2])
            )
Пример #10
0
def test_filter_via_group_by(backend):
    df = data_frame(x=range(1, 11), g=[1] * 5 + [2] * 5)

    dfs = backend.load_df(df)

    assert_equal_query(dfs,
                       group_by(_.g) >> filter(row_number(_) < 3),
                       data_frame(x=[1, 2, 6, 7], g=[1, 1, 2, 2]))
Пример #11
0
def test_summarize_removes_1_grouping(backend):
    data = data_frame(a=1, b=2, c=3)
    df = backend.load_df(data)

    q1 = df >> group_by(_.a, _.b) >> summarize(n=n(_))
    assert q1.group_by == ("a")

    q2 = q1 >> summarize(n=n(_))
    assert not len(q2.group_by)
Пример #12
0
def test_frame_set_aggregates_postgresql():
    # TODO: probably shouldn't be creating backend here
    backend = SqlBackend("postgresql")
    dfs = backend.load_df(data[None])

    expr = _.x.quantile(.75)
    assert_equal_query(dfs,
                       group_by(_.g) >> summarize(result=expr),
                       data_frame(g=['a', 'b'], result=[11., 13.]))
Пример #13
0
def test_mutate_using_cuml_agg(backend):
    data = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    dfs = backend.load_df(data)

    # cuml window without arrange before generates warning
    with pytest.warns(None):
        assert_equal_query(dfs,
                           group_by(_.g) >> mutate(y=_.x.cumsum()),
                           data.assign(y=[1.0, 3, 3, 7]))
Пример #14
0
def test_filter_vector(backend, func, simple_data):
    if backend.name == 'sqlite':
        pytest.skip()

    df = backend.load_cached_df(simple_data)

    res = data_frame(y=func(simple_data))

    assert_equal_query(
        df,
        filter(func),
        filter(simple_data, func),
        # ignore dtypes, since sql -> an empty data frame has object columns
        check_dtype=False)

    # grouped (vs slow_filter)
    assert_equal_query(df,
                       group_by(_.g) >> filter(func),
                       simple_data >> group_by(_.g) >> filter(func),
                       check_dtype=False)
Пример #15
0
def test_summarize_subquery_op_vars(backend, df):
    query = mutate(x2 = _.x + 1) >> group_by(_.g) >> summarize(low = _.x2.min())
    assert_equal_query(
            df,
            query,
            data_frame(g = ['a', 'b'], low = [2, 4])
            )

    # check that is uses a subquery, since x2 is defined in first query
    text = str(query(df).last_op)
    assert text.count('FROM') == 2
Пример #16
0
def test_summarize_subquery_group_vars(backend, df):
    query = mutate(g2 = _.g.str.upper()) >> group_by(_.g2) >> summarize(low = _.x.min())
    assert_equal_query(
            df,
            query,
            data_frame(g2 = ['A', 'B'], low = [1, 3])
            )

    # check that is uses a subquery, since g2 is defined in first query
    text = str(query(df).last_op)
    assert text.count('FROM') == 2
Пример #17
0
def distinct_events(tbl, time_col, user_col, type):
    if type not in ["first", "last"]:
        return tbl

    res = (tbl
            >> group_by(_[user_col])
            >> arrange(_[time_col] if type == "first" else -_[time_col])
            >> filter(row_number(_) == 1)
            >> ungroup()
            )

    return res
Пример #18
0
def test_agg_vector(backend, func, simple_data):
    if backend.name == 'sqlite':
        pytest.skip()

    df = backend.load_cached_df(simple_data)

    res = data_frame(y=func(simple_data))

    assert_equal_query(df, summarize(y=func), res)

    # grouped
    assert_equal_query(
        df,
        group_by(_.g) >> summarize(y=func),
        simple_data.groupby('g').apply(func).reset_index().rename(
            columns={0: 'y'}))
Пример #19
0
def test_mutate_vector(backend, func, simple_data):
    if backend.name == 'sqlite':
        pytest.skip()

    df = backend.load_cached_df(simple_data)

    assert_equal_query(df,
                       mutate(y=func),
                       simple_data.assign(y=func),
                       check_dtype=False)

    # grouped
    assert_equal_query(
        df,
        group_by(_.g) >> mutate(y=func),
        simple_data.groupby('g').apply(lambda d: d.assign(y=func)).reset_index(
            drop=True),
        check_dtype=False)
Пример #20
0
def test_frame_mutate(skip_backend, backend, entry):
    test_missing_implementation(entry, backend)

    # Prepare input data ------------------------------------------------------
    # case: inputs must be boolean
    crnt_data = get_data(entry, DATA, backend)
    df = backend.load_df(crnt_data)

    # Execute mutate ----------------------------------------------------------
    str_expr, call_expr = get_df_expr(entry)

    # Run test for equality w/ ungrouped pandas ----
    dst = crnt_data.assign(result=call_expr(crnt_data))
    dst['result'] = cast_result_type(entry, backend, dst['result'])

    assert_equal_query(df, mutate(result=call_expr), dst)

    # Run test for equality w/ grouped pandas ----
    g_dst = crnt_data.groupby('g').apply(
        lambda d: d.assign(result=call_expr)).reset_index(drop=True)
    g_dst['result'] = cast_result_type(entry, backend, g_dst['result'])
    assert_equal_query(df, group_by(_.g) >> mutate(result=call_expr), g_dst)
Пример #21
0
def test_group_by_override(df):
    gdf = df >> group_by(_.x, _.y) >> group_by(_.g)
    assert gdf.group_by == ("g", )
Пример #22
0
DATA = data_frame(a=1, b=2, c=3)


@pytest.fixture(scope="module")
def dfs(backend):
    return backend.load_df(DATA)


@pytest.mark.parametrize("query, output", [
    (select(_.c), data_frame(c=3)),
    (select(_.b == _.c), data_frame(b=3)),
    (select(_["a":"c"]), data_frame(a=1, b=2, c=3)),
    (select(_[_.a:_.c]), data_frame(a=1, b=2, c=3)),
    (select(_.a, _.b) >> select(_.b), data_frame(b=2)),
    (mutate(a=_.b + _.c) >> select(_.a), data_frame(a=5)),
    pytest.param(group_by(_.a) >> select(_.b),
                 data_frame(b=2, a=1),
                 marks=pytest.mark.xfail),
])
def test_select_siu(dfs, query, output):
    assert_equal_query(dfs, query, output)


@pytest.mark.skip("TODO: #63")
def test_select_kwargs(dfs):
    assert_equal_query(dfs, select(x=_.a), data_frame(x=1))


# Rename ----------------------------------------------------------------------

Пример #23
0
def test_transmute_grouped_no_mutate_grouping(dfs):
    assert_equal_query(
            dfs,
            group_by(_.a) >> transmute(a = _.a + 1),
            data_frame(a = [1,2,3], x = [2,3,4])
            )
# +
st.write("Goals by month")

st.write("Top 8 players not in our data")
top8 >> filter(_.yr_start < 1979)

# +
from pandas.tseries.offsets import MonthBegin
from siuba.experimental.pd_groups import fast_summarize

top8_goals = (
    top8_games >> mutate(
        date=_.date.astype("datetime64[D]"),
        age_years=top8_games.age.str.split('-').str.get(0).astype(int)) >>
    arrange(_.date) >> group_by(_.player, month=_.date - MonthBegin(1)) >>
    fast_summarize(ttl_goals=_.goals.sum(), age_years=_.age_years.min()) >>
    group_by(_.player) >> mutate(cuml_goals=_.ttl_goals.cumsum()) >> ungroup())

p_goals = alt.Chart(top8_goals).mark_line().encode(y="cuml_goals:Q",
                                                   color="player")

# +
time = st.selectbox("Choose a time", ["month", "age_years"])

st.write(p_goals.encode(x=time))

# +
st.write("Goals by seasons")

Пример #25
0
def gdf(df):
    return df >> group_by(_.g)
Пример #26
0
def test_group_by_no_add(df):
    gdf = group_by(df, _.x, _.y)
    assert gdf.group_by == ("x", "y")
Пример #27
0
def test_filter_via_group_by_agg(backend):
    dfs = backend.load_df(x=range(1, 11), g=[1] * 5 + [2] * 5)

    assert_equal_query(dfs,
                       group_by(_.g) >> filter(_.x > _.x.mean()),
                       data_frame(x=[4, 5, 9, 10], g=[1, 1, 2, 2]))
Пример #28
0
def test_group_by_add(df):
    gdf = group_by(df, _.x) >> group_by(_.y, add=True)

    assert gdf.group_by == ("x", "y")
Пример #29
0
def test_group_by_performs_mutate(df):
    assert_equal_query(df,
                       group_by(z=_.x + _.y) >> summarize(n=n(_)),
                       data_frame(z=10, n=3))
Пример #30
0
def test_group_by_ungroup(df):
    q1 = df >> group_by(_.g)
    assert q1.group_by == ("g", )

    q2 = q1 >> ungroup()
    assert q2.group_by == tuple()