示例#1
0
def test_mixed_none_concat():
    """Compare ``pd.concat`` on a mixed input list with ``pandas.concat``.

    The frames come from ``generate_none_dfs``.  Two of them are passed
    through ``from_pandas(..., 2)`` and the third is a plain pandas copy of
    the first, so the concat input mixes both kinds of object.
    """
    first, second = generate_none_dfs()
    first_copy = first.copy()

    # ``pd`` is presumably the Ray-backed module under test -- TODO confirm.
    ray_result = pd.concat(
        [from_pandas(first, 2), from_pandas(second, 2), first_copy])
    pandas_result = pandas.concat([first, second, first_copy])

    assert ray_df_equals_pandas(ray_result, pandas_result)
示例#2
0
def test_mixed_inner_concat():
    """Compare an inner-join ``pd.concat`` on mixed inputs with pandas.

    The frames come from ``generate_dfs``.  Two of them are passed through
    ``from_pandas(..., 2)`` and the third is a plain pandas copy of the
    first; both concat calls use ``join='inner'``.
    """
    first, second = generate_dfs()
    first_copy = first.copy()

    ray_result = pd.concat(
        [from_pandas(first, 2), from_pandas(second, 2), first_copy],
        join='inner')
    pandas_result = pandas.concat([first, second, first_copy], join='inner')

    assert ray_df_equals_pandas(ray_result, pandas_result)
示例#3
0
def test_ray_concat_on_column():
    """Check column-wise ``pd.concat`` against ``pandas.concat``.

    The comparison is made once per spelling of the column axis: the
    integer ``1`` and then the string ``"columns"``.
    """
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    for axis in (1, "columns"):
        ray_result = pd.concat([ray_left, ray_right], axis=axis)
        pandas_result = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(ray_result, pandas_result)
示例#4
0
def test_ray_concat_on_index():
    """Check row-wise ``pd.concat`` against ``pandas.concat``.

    The comparison is made once per spelling of the row axis, in this
    order: ``'index'``, ``'rows'`` and the integer ``0``.
    """
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    for axis in ('index', 'rows', 0):
        ray_result = pd.concat([ray_left, ray_right], axis=axis)
        pandas_result = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(ray_result, pandas_result)
示例#5
0
def test_simple_row_groupby():
    """Run the groupby comparison helpers on a small, row-grouped frame.

    A four-row, five-column pandas frame and its ``from_pandas(..., 2)``
    counterpart are grouped with the same ``by`` list.  Every helper then
    receives the resulting pair of groupby objects, in a fixed order.
    """
    pandas_df = pandas.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [3, 8, 12, 10],
        'col4': [17, 13, 16, 15],
        'col5': [-4, -5, -6, -7],
    })
    ray_df = from_pandas(pandas_df, 2)

    labels = [1, 2, 1, 2]
    n_rows = 1  # row count handed to the head/tail helpers

    ray_gb = ray_df.groupby(by=labels)
    pandas_gb = pandas_df.groupby(by=labels)

    # Helpers that only need the two groupby objects run in listed order.
    for check in (ray_groupby_equals_pandas, test_ngroups, test_skew,
                  test_ffill, test_sem, test_mean, test_any, test_min,
                  test_idxmax, test_ndim, test_cumsum, test_pct_change,
                  test_cummax):
        check(ray_gb, pandas_gb)

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_gb, pandas_gb, func)

    for check in (test_dtypes, test_first, test_backfill, test_cummin,
                  test_bfill, test_idxmin, test_prod, test_std):
        check(ray_gb, pandas_gb)

    # Each aggregation name goes through both the agg and aggregate helper.
    for func in ('min', 'max'):
        test_agg(ray_gb, pandas_gb, func)
        test_aggregate(ray_gb, pandas_gb, func)

    for check in (test_last, test_mad, test_rank, test_max, test_var,
                  test_len, test_sum, test_ngroup, test_nunique,
                  test_median):
        check(ray_gb, pandas_gb)

    test_head(ray_gb, pandas_gb, n_rows)

    for check in (test_cumprod, test_cov):
        check(ray_gb, pandas_gb)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_gb, pandas_gb, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_gb, pandas_gb, func)

    for check in (test_corr, test_fillna, test_count):
        check(ray_gb, pandas_gb)

    test_tail(ray_gb, pandas_gb, n_rows)

    for check in (test_quantile, test_take):
        check(ray_gb, pandas_gb)
示例#6
0
def test_simple_col_groupby():
    """Run the groupby comparison helpers on a small, column-grouped frame.

    A four-row, five-column pandas frame and its ``from_pandas(..., 2)``
    counterpart are grouped along ``axis=1`` with the same five-entry ``by``
    list.  Several helpers are deliberately left out; the comments below
    record why.
    """
    pandas_df = pandas.DataFrame({
        'col1': [0, 3, 2, 3],
        'col2': [4, 1, 6, 7],
        'col3': [3, 8, 2, 10],
        'col4': [1, 13, 6, 15],
        'col5': [-4, 5, 6, -7],
    })
    ray_df = from_pandas(pandas_df, 2)

    labels = [1, 2, 3, 2, 1]

    ray_gb = ray_df.groupby(axis=1, by=labels)
    pandas_gb = pandas_df.groupby(axis=1, by=labels)

    for check in (ray_groupby_equals_pandas, test_ngroups, test_skew,
                  test_ffill, test_sem, test_mean, test_any, test_min,
                  test_ndim):
        check(ray_gb, pandas_gb)

    if not PY2:
        # idxmax and idxmin fail on column groupby in pandas with python2;
        # the rank and quantile helpers are gated by the same condition.
        for check in (test_idxmax, test_idxmin, test_rank, test_quantile):
            check(ray_gb, pandas_gb)

    # Skipped: test_cumsum, test_cummax, test_cummin, test_cumprod.
    # See https://github.com/pandas-dev/pandas/issues/21127

    test_pct_change(ray_gb, pandas_gb)

    for func in (lambda df: -df, lambda df: df.sum(axis=1)):
        test_apply(ray_gb, pandas_gb, func)

    for check in (test_first, test_backfill, test_bfill, test_prod,
                  test_std, test_last, test_mad, test_max, test_var,
                  test_len, test_sum):
        check(ray_gb, pandas_gb)

    # Skipped: test_ngroup, test_nunique.
    # Pandas fails on this case with ValueError.

    for check in (test_median, test_cov):
        check(ray_gb, pandas_gb)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_gb, pandas_gb, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_gb, pandas_gb, func)

    for check in (test_corr, test_fillna, test_count, test_take):
        check(ray_gb, pandas_gb)
示例#7
0
def test_large_row_groupby():
    """Run the groupby comparison helpers on a 100x4 random integer frame.

    The frame holds ``np.random.randint(0, 8, ...)`` values in columns
    ``A``-``D`` and is grouped by the values of column ``A``.  The generator
    is not seeded here, so the data differs between runs.
    """
    pandas_df = pandas.DataFrame(
        np.random.randint(0, 8, size=(100, 4)),
        columns=list('ABCD'),
    )
    ray_df = from_pandas(pandas_df, 2)

    labels = pandas_df['A'].tolist()
    n_rows = 4  # row count handed to the head/tail helpers

    ray_gb = ray_df.groupby(by=labels)
    pandas_gb = pandas_df.groupby(by=labels)

    # Helpers that only need the two groupby objects run in listed order.
    for check in (ray_groupby_equals_pandas, test_ngroups, test_skew,
                  test_ffill, test_sem, test_mean, test_any, test_min,
                  test_idxmax, test_ndim, test_cumsum, test_pct_change,
                  test_cummax):
        check(ray_gb, pandas_gb)

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_gb, pandas_gb, func)

    # Skipped at the end of this group: test_prod (causes overflows).
    for check in (test_dtypes, test_first, test_backfill, test_cummin,
                  test_bfill, test_idxmin, test_std):
        check(ray_gb, pandas_gb)

    # Each aggregation name goes through both the agg and aggregate helper.
    for func in ('min', 'max'):
        test_agg(ray_gb, pandas_gb, func)
        test_aggregate(ray_gb, pandas_gb, func)

    for check in (test_last, test_mad, test_rank, test_max, test_var,
                  test_len, test_sum, test_ngroup, test_nunique,
                  test_median):
        check(ray_gb, pandas_gb)

    test_head(ray_gb, pandas_gb, n_rows)

    # Skipped: test_cumprod (causes overflows).
    test_cov(ray_gb, pandas_gb)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_gb, pandas_gb, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_gb, pandas_gb, func)

    for check in (test_corr, test_fillna, test_count):
        check(ray_gb, pandas_gb)

    test_tail(ray_gb, pandas_gb, n_rows)

    for check in (test_quantile, test_take):
        check(ray_gb, pandas_gb)
示例#8
0
def test_invalid_axis_errors():
    """Expect ``pd.concat`` to raise ``ValueError`` when given ``axis=2``."""
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    with pytest.raises(ValueError):
        pd.concat([ray_left, ray_right], axis=2)
示例#9
0
def test_ray_concat():
    """Check default ``pd.concat`` of two frames against ``pandas.concat``.

    Both frames from ``generate_dfs`` are passed through
    ``from_pandas(..., 2)`` before being concatenated with ``pd.concat``.
    """
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    ray_result = pd.concat([ray_left, ray_right])
    pandas_result = pandas.concat([left, right])

    assert ray_df_equals_pandas(ray_result, pandas_result)