def test_frame_summarize(skip_backend, backend, agg_entry):
    entry = agg_entry
    test_missing_implementation(entry, backend)

    # Prepare input data ------------------------------------------------------
    # case: inputs must be boolean
    crnt_data = get_data(entry, DATA, backend)
    df = backend.load_df(crnt_data)

    # Execute summarize -------------------------------------------------------
    str_expr, call_expr = get_df_expr(entry)

    dst = data_frame(result=call_expr(crnt_data))

    # Process output ----------------------------------------------------------
    # case: output is of a different type than w/ pandas
    dst['result'] = cast_result_type(entry, backend, dst['result'])

    # Run test for equality w/ pandas ----
    # otherwise, verify summarize returns the same result as the direct call
    assert_equal_query(df, summarize(result=call_expr), dst)

    dst_g = crnt_data.groupby('g').apply(call_expr).reset_index().rename(
        columns={0: 'result'})
    assert_equal_query(df, group_by(_.g) >> summarize(result=call_expr), dst_g)
Example #2
def test_summarize_removes_1_grouping(backend):
    data = data_frame(a=1, b=2, c=3)
    df = backend.load_df(data)

    q1 = df >> group_by(_.a, _.b) >> summarize(n=n(_))
    assert q1.group_by == ("a")

    q2 = q1 >> summarize(n=n(_))
    assert not len(q2.group_by)
Example #3
def test_agg_vector(backend, func, simple_data):
    if backend.name == 'sqlite':
        pytest.skip()

    df = backend.load_cached_df(simple_data)

    res = data_frame(y=func(simple_data))

    assert_equal_query(df, summarize(y=func), res)

    # grouped
    assert_equal_query(
        df,
        group_by(_.g) >> summarize(y=func),
        simple_data.groupby('g').apply(func).reset_index().rename(
            columns={0: 'y'}))
Example #4
def test_summarize_removes_series_index():
    # Note: currently wouldn't work in postgresql, since _.x + _.y not an agg func
    df = data_frame(g=['a', 'b', 'c'], x=[1, 2, 3], y=[4, 5, 6])

    assert_equal_query(df,
                       group_by(_.g) >> summarize(res=_.x + _.y),
                       df.assign(res=df.x + df.y).drop(columns=["x", "y"]))
Example #5
def test_frame_set_aggregates_postgresql():
    # TODO: probably shouldn't be creating backend here
    backend = SqlBackend("postgresql")
    dfs = backend.load_df(data[None])

    expr = _.x.quantile(.75)
    assert_equal_query(dfs,
                       group_by(_.g) >> summarize(result=expr),
                       data_frame(g=['a', 'b'], result=[11., 13.]))
Example #6
def test_summarize_subquery_op_vars(backend, df):
    query = mutate(x2 = _.x + 1) >> group_by(_.g) >> summarize(low = _.x2.min())
    assert_equal_query(
            df,
            query,
            data_frame(g = ['a', 'b'], low = [2, 4])
            )

    # check that it uses a subquery, since x2 is defined in the first query
    text = str(query(df).last_op)
    assert text.count('FROM') == 2
Example #7
def test_summarize_subquery_group_vars(backend, df):
    query = mutate(g2 = _.g.str.upper()) >> group_by(_.g2) >> summarize(low = _.x.min())
    assert_equal_query(
            df,
            query,
            data_frame(g2 = ['A', 'B'], low = [1, 3])
            )

    # check that it uses a subquery, since g2 is defined in the first query
    text = str(query(df).last_op)
    assert text.count('FROM') == 2
Example #8
def plot_large_error_percentage(data, threshold):
    """
    calculate overall high error percentage based on 2 std devations from the mean 
        abs(resid_mean - resid) > 2 * resid.std()
        
    gather long form data of the columns, filter to only large errors and their percentages
    
    setup facet grid and pass in axes plotting function
    """
    resid_mean = data.final_residuals.mean()

    # Calculate distance from average and apply a threshold to indicate large errors
    data['high_error'] = (resid_mean - data.final_residuals).abs() > threshold

    # Percentage of large errors in the overall data set
    overall_large_error_percent = data.high_error.mean()

    # long form plotting data
    # Use siuba library to help with R/dplyr style aggregations
    data_melted = (
        data.loc[:, ['jobType', 'industry', 'degree', 'major',
                     'milesFromMetropolis', 'yearsExperience', 'high_error']]
        .melt(id_vars='high_error', var_name='column')
        >> group_by('column', 'value', 'high_error')
        >> summarize(total=_.value_counts())
        >> group_by('column', 'value')
        >> mutate(percent=_.total / _.total.sum())
        >> ungroup())

    # Filter for only large error percentages and apply a grouped sort to prepare for plotting
    data_melted = (
        data_melted.loc[data_melted.high_error, :]
        .groupby('column')
        .apply(lambda x: x.sort_values('percent', ascending=False))
        .reset_index(drop=True))

    plot_title = f"Percentage of large errors per category\nLarge error threshold = {round(threshold, 1)}"
    plotGrid = sns.FacetGrid(data_melted,
                             col='column',
                             sharey=False,
                             col_wrap=3,
                             height=4,
                             aspect=1.2)
    plotGrid.map(_large_error_axes_plot,
                 'percent',
                 'value',
                 error_threshold=overall_large_error_percent)
    plotGrid.set_xlabels('Percentage of large errors')
    plotGrid.set_ylabels('')
    plotGrid.add_legend(loc='upper center',
                        bbox_to_anchor=(0.5, 0),
                        fancybox=True,
                        frameon=True)
    plt.suptitle(plot_title, y=1.05)
    plt.show()
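
# A minimal, hypothetical sketch of the grouped-percentage pattern used above:
# count rows per (column, value, high_error) group, then convert the counts to
# percentages within each (column, value) pair. The toy data and column values are
# assumptions for illustration; the siuba verbs mirror those in the function above.
import pandas as pd
from siuba import _, group_by, summarize, mutate, ungroup
from siuba.dply.vector import n

toy = pd.DataFrame({
    "column": ["degree"] * 4,
    "value": ["BS", "BS", "MS", "MS"],
    "high_error": [True, False, True, True],
})

pcts = (toy
        >> group_by(_.column, _.value, _.high_error)
        >> summarize(total=n(_))                    # rows per group
        >> group_by(_.column, _.value)
        >> mutate(percent=_.total / _.total.sum())  # share within (column, value)
        >> ungroup())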
Example #9
def test_pandas_grouped_frame_fast_summarize(agg_entry):
    from siuba.experimental.pd_groups.dialect import fast_summarize, DataFrameGroupBy
    gdf = get_data(agg_entry, DATA).groupby('g')

    # Execute summarize ----------------------------------------------------------
    str_expr, call_expr = get_df_expr(agg_entry)

    res = fast_summarize(gdf, result=call_expr)
    dst = summarize(gdf, result=call_expr)

    # TODO: apply a mark to skip failing tests, rather than downcasting here.
    # pandas grouped aggs, when not using cython, _try_cast back to the original type,
    # but since summarize uses apply, it does not :/. Currently this only affects the median func.
    if str_expr == '_.x.median()':
        dst['result'] = gdf._try_cast(dst['result'], gdf.x.obj)

    assert_frame_equal(res, dst)
Example #10
def test_ungrouped_summarize_literal(df, query, output):
    assert_equal_query(df, summarize(y=1), data_frame(y=1))
Example #11
def test_frame_mode_returns_many():
    with pytest.raises(ValueError):
        df = data_frame(x=[1, 2, 3])
        res = summarize(df, result=_.x.mode())
Example #12
def df(backend):
    return backend.load_df(DATA)


@pytest.fixture(scope="module")
def df_float(backend):
    return backend.load_df(DATA.assign(x=lambda d: d.x.astype(float)))


@pytest.fixture(scope="module")
def gdf(df):
    return df >> group_by(_.g)


@pytest.mark.parametrize("query, output", [
    (summarize(y=n(_)), data_frame(y=4)),
    (summarize(y=_.x.min()), data_frame(y=1)),
])
def test_summarize_ungrouped(df, query, output):
    assert_equal_query(df, query, output)


@pytest.mark.skip("TODO: should return 1 row (#63)")
def test_ungrouped_summarize_literal(df, query, output):
    assert_equal_query(df, summarize(y=1), data_frame(y=1))


@backend_notimpl("sqlite")
def test_summarize_after_mutate_cuml_win(backend, df_float):
    assert_equal_query(df_float,
                       mutate(y=_.x.cumsum()) >> summarize(z=_.y.max()),
                       data_frame(z=[10.]))
Example #13
def test_summarize_removes_order_vars(backend, df):
    lazy_tbl = df >> summarize(n=n(_))

    assert not len(lazy_tbl.order_by)
Example #14
from siuba import _, mutate, select, group_by, summarize, filter
from siuba.dply.vector import row_number

import pytest
from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql
from string import ascii_lowercase 

DATA = data_frame(a = [1,2,3], b = [9,8,7])

@pytest.fixture(scope = "module")
def dfs(backend):
    return backend.load_df(DATA)

@pytest.mark.parametrize("query, output", [
    (mutate(x = _.a + _.b), DATA.assign(x = [10, 10, 10])),
    pytest.param( mutate(x = _.a + _.b) >> summarize(ttl = _.x.sum()), data_frame(ttl = 30.0), marks = pytest.mark.skip("TODO: failing sqlite?")),
    (mutate(x = _.a + 1, y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])),
    (mutate(x = _.a + 1) >> mutate(y = _.b - 1), DATA.assign(x = [2,3,4], y = [8,7,6])),
    (mutate(x = _.a + 1, y = _.x + 1), DATA.assign(x = [2,3,4], y = [3,4,5]))
    ])
def test_mutate_basic(dfs, query, output):
    assert_equal_query(dfs, query, output)

@pytest.mark.parametrize("query, output", [
    (mutate(x = 1), DATA.assign(x = 1)),
    (mutate(x = "a"), DATA.assign(x = "a")),
    (mutate(x = 1.2), DATA.assign(x = 1.2))
    ])
def test_mutate_literal(dfs, query, output):
    assert_equal_query(dfs, query, output)
Example #15
def test_distinct_after_summarize(df):
    query = group_by(g = _.x) >> summarize(z = (_.y - _.y).min()) >> distinct(_.z)
    
    assert_equal_query(df, query, pd.DataFrame({'z': [0]}))
Example #16
def test_summarize_after_mutate_cuml_win(backend, df_float):
    assert_equal_query(df_float,
                       mutate(y=_.x.cumsum()) >> summarize(z=_.y.max()),
                       data_frame(z=[10.]))
Example #17
def test_summarize_keeps_group_vars(backend, gdf):
    q = gdf >> summarize(n=n(_))
    assert list(q.last_op.c.keys()) == ["g", "n"]
Example #18
def test_summarize_validates_length():
    with pytest.raises(ValueError):
        summarize(data_frame(x=[1, 2]), res=_.x + 1)
Example #19
def test_summarize_no_same_call_var_refs(backend, df):
    with pytest.raises(NotImplementedError):
        df >> summarize(y=_.x.min(), z=_.y + 1)
Example #20
def test_frame_mode_returns_many():
    # related to length validation above
    with pytest.raises(ValueError):
        df = data_frame(x=[1, 2, 3])
        res = summarize(df, result=_.x.mode())
Example #21
def test_summarize_unnamed_args(df):
    assert_equal_query(df, summarize(n(_)), pd.DataFrame({'n(_)': 4}))
Example #22
def test_group_by_performs_mutate(df):
    assert_equal_query(df,
                       group_by(z=_.x + _.y) >> summarize(n=n(_)),
                       data_frame(z=10, n=3))
Example #23
import pytest
from .helpers import assert_equal_query, data_frame, backend_notimpl, backend_sql
from string import ascii_lowercase

DATA = data_frame(a=[1, 2, 3], b=[9, 8, 7])


@pytest.fixture(scope="module")
def dfs(backend):
    return backend.load_df(DATA)


@pytest.mark.parametrize(
    "query, output",
    [(mutate(x=_.a + _.b), DATA.assign(x=[10, 10, 10])),
     pytest.param(mutate(x=_.a + _.b) >> summarize(ttl=_.x.sum()),
                  data_frame(ttl=30.0),
                  marks=pytest.mark.skip("TODO: failing sqlite?")),
     (mutate(x=_.a + 1, y=_.b - 1), DATA.assign(x=[2, 3, 4], y=[8, 7, 6])),
     (mutate(x=_.a + 1) >> mutate(y=_.b - 1),
      DATA.assign(x=[2, 3, 4], y=[8, 7, 6])),
     (mutate(x=_.a + 1, y=_.x + 1), DATA.assign(x=[2, 3, 4], y=[3, 4, 5]))])
def test_mutate_basic(dfs, query, output):
    assert_equal_query(dfs, query, output)


@pytest.mark.parametrize("query, output",
                         [(mutate(x=1), DATA.assign(x=1)),
                          (mutate(x="a"), DATA.assign(x="a")),
                          (mutate(x=1.2), DATA.assign(x=1.2))])
def test_mutate_literal(dfs, query, output):
    assert_equal_query(dfs, query, output)
Example #24
def test_raw_sql_summarize(backend, df):
    assert_equal_query(
        df,
        summarize(z=sql_raw("SUM(y)")) >> mutate(z=_.z.astype(int)),
        data_frame(z=[3]))