def test_mixed_none_concat():
    """Concat a mix of converted and unconverted None-data frames.

    Two of the three inputs go through ``from_pandas`` and the third is a
    plain copy of the first frame; ``pd.concat`` of that mixture must
    compare equal to ``pandas.concat`` of the three source frames.
    """
    first, second = generate_none_dfs()
    third = first.copy()

    actual = pd.concat([from_pandas(first, 2), from_pandas(second, 2), third])
    expected = pandas.concat([first, second, third])

    assert ray_df_equals_pandas(actual, expected)
def test_mixed_inner_concat():
    """Inner-join concat of a mix of converted and unconverted frames.

    Two of the three inputs go through ``from_pandas`` and the third is a
    plain copy of the first frame; ``pd.concat(..., join='inner')`` must
    compare equal to the same call on ``pandas`` with the source frames.
    """
    first, second = generate_dfs()
    third = first.copy()

    actual = pd.concat(
        [from_pandas(first, 2), from_pandas(second, 2), third],
        join='inner',
    )
    expected = pandas.concat([first, second, third], join='inner')

    assert ray_df_equals_pandas(actual, expected)
def test_ray_concat_on_column():
    """Column-wise concat must match pandas for both axis spellings.

    The comparison is run once with ``axis=1`` and once with
    ``axis="columns"``.
    """
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    for axis in (1, "columns"):
        actual = pd.concat([ray_left, ray_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
def test_ray_concat_on_index():
    """Row-wise concat must match pandas for every axis spelling.

    The comparison is run with ``axis='index'``, ``axis='rows'`` and
    ``axis=0``, in that order.
    """
    left, right = generate_dfs()
    ray_left = from_pandas(left, 2)
    ray_right = from_pandas(right, 2)

    for axis in ('index', 'rows', 0):
        actual = pd.concat([ray_left, ray_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
def test_simple_row_groupby():
    """Run the groupby checks on a small frame grouped by row labels.

    The same ``groupby(by=...)`` is built on the frame returned by
    ``from_pandas`` and on the source pandas frame, and each helper check
    is then called with that pair, in a fixed order.
    """
    frame = pandas.DataFrame({
        'col1': [0, 1, 2, 3],
        'col2': [4, 5, 6, 7],
        'col3': [3, 8, 12, 10],
        'col4': [17, 13, 16, 15],
        'col5': [-4, -5, -6, -7],
    })
    labels = [1, 2, 1, 2]
    # Extra argument handed to the head/tail checks.
    n = 1

    ray_gb = from_pandas(frame, 2).groupby(by=labels)
    pandas_gb = frame.groupby(by=labels)

    def run_checks(*checks):
        # Call each two-argument check on the groupby pair, in order.
        for check in checks:
            check(ray_gb, pandas_gb)

    ray_groupby_equals_pandas(ray_gb, pandas_gb)
    run_checks(
        test_ngroups, test_skew, test_ffill, test_sem, test_mean,
        test_any, test_min, test_idxmax, test_ndim, test_cumsum,
        test_pct_change, test_cummax,
    )

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_gb, pandas_gb, func)

    run_checks(
        test_dtypes, test_first, test_backfill, test_cummin, test_bfill,
        test_idxmin, test_prod, test_std,
    )

    for func in ('min', 'max'):
        test_agg(ray_gb, pandas_gb, func)
        test_aggregate(ray_gb, pandas_gb, func)

    run_checks(
        test_last, test_mad, test_rank, test_max, test_var, test_len,
        test_sum, test_ngroup, test_nunique, test_median,
    )
    test_head(ray_gb, pandas_gb, n)
    run_checks(test_cumprod, test_cov)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_gb, pandas_gb, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_gb, pandas_gb, func)

    run_checks(test_corr, test_fillna, test_count)
    test_tail(ray_gb, pandas_gb, n)
    run_checks(test_quantile, test_take)
def test_simple_col_groupby():
    """Run the groupby checks on a small frame grouped by column labels.

    The same ``groupby(axis=1, by=...)`` is built on the frame returned by
    ``from_pandas`` and on the source pandas frame, and each helper check
    is then called with that pair, in a fixed order. Several checks are
    deliberately left out for column groupby; see the comments below.
    """
    frame = pandas.DataFrame({
        'col1': [0, 3, 2, 3],
        'col2': [4, 1, 6, 7],
        'col3': [3, 8, 2, 10],
        'col4': [1, 13, 6, 15],
        'col5': [-4, 5, 6, -7],
    })
    labels = [1, 2, 3, 2, 1]

    ray_gb = from_pandas(frame, 2).groupby(axis=1, by=labels)
    pandas_gb = frame.groupby(axis=1, by=labels)

    def run_checks(*checks):
        # Call each two-argument check on the groupby pair, in order.
        for check in checks:
            check(ray_gb, pandas_gb)

    ray_groupby_equals_pandas(ray_gb, pandas_gb)
    run_checks(
        test_ngroups, test_skew, test_ffill, test_sem, test_mean,
        test_any, test_min, test_ndim,
    )

    if not PY2:
        # idxmax and idxmin fail on column groupby in pandas with python2
        # NOTE(review): rank and quantile are assumed to sit under this
        # guard as well -- confirm against the original layout.
        run_checks(test_idxmax, test_idxmin, test_rank, test_quantile)

    # The cumsum, cummax, cummin and cumprod checks are disabled here; see
    # https://github.com/pandas-dev/pandas/issues/21127

    test_pct_change(ray_gb, pandas_gb)

    for func in (lambda df: -df, lambda df: df.sum(axis=1)):
        test_apply(ray_gb, pandas_gb, func)

    run_checks(
        test_first, test_backfill, test_bfill, test_prod, test_std,
        test_last, test_mad, test_max, test_var, test_len, test_sum,
    )

    # The ngroup and nunique checks are disabled here: pandas fails on
    # this case with ValueError.

    run_checks(test_median, test_cov)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_gb, pandas_gb, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_gb, pandas_gb, func)

    run_checks(test_corr, test_fillna, test_count, test_take)
def test_large_row_groupby():
    """Run the groupby checks on a 100x4 random frame grouped by column A.

    The frame holds integers drawn with ``np.random.randint(0, 8, ...)``
    and the group labels are the values of column ``'A'``. The same
    ``groupby(by=...)`` is built on the frame returned by ``from_pandas``
    and on the source pandas frame, and each helper check is then called
    with that pair, in a fixed order.
    """
    # No seed is set in this function, so the data can differ per run.
    values = np.random.randint(0, 8, size=(100, 4))
    frame = pandas.DataFrame(values, columns=list('ABCD'))
    ray_frame = from_pandas(frame, 2)

    labels = frame['A'].tolist()
    # Extra argument handed to the head/tail checks.
    n = 4

    ray_gb = ray_frame.groupby(by=labels)
    pandas_gb = frame.groupby(by=labels)

    def run_checks(*checks):
        # Call each two-argument check on the groupby pair, in order.
        for check in checks:
            check(ray_gb, pandas_gb)

    ray_groupby_equals_pandas(ray_gb, pandas_gb)
    run_checks(
        test_ngroups, test_skew, test_ffill, test_sem, test_mean,
        test_any, test_min, test_idxmax, test_ndim, test_cumsum,
        test_pct_change, test_cummax,
    )

    for func in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_gb, pandas_gb, func)

    run_checks(
        test_dtypes, test_first, test_backfill, test_cummin, test_bfill,
        test_idxmin,
    )
    # The prod check is disabled here: it causes overflows.
    test_std(ray_gb, pandas_gb)

    for func in ('min', 'max'):
        test_agg(ray_gb, pandas_gb, func)
        test_aggregate(ray_gb, pandas_gb, func)

    run_checks(
        test_last, test_mad, test_rank, test_max, test_var, test_len,
        test_sum, test_ngroup, test_nunique, test_median,
    )
    test_head(ray_gb, pandas_gb, n)
    # The cumprod check is disabled here: it causes overflows.
    test_cov(ray_gb, pandas_gb)

    for func in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_gb, pandas_gb, func)

    for func in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_gb, pandas_gb, func)

    run_checks(test_corr, test_fillna, test_count)
    test_tail(ray_gb, pandas_gb, n)
    run_checks(test_quantile, test_take)
def test_invalid_axis_errors():
    """``pd.concat`` with ``axis=2`` must raise ``ValueError``."""
    left, right = generate_dfs()
    frames = [from_pandas(left, 2), from_pandas(right, 2)]

    with pytest.raises(ValueError):
        pd.concat(frames, axis=2)
def test_ray_concat():
    """Default concat of two converted frames must match pandas.

    Both frames go through ``from_pandas`` before ``pd.concat``; the result
    is compared with ``pandas.concat`` of the two source frames.
    """
    left, right = generate_dfs()

    actual = pd.concat([from_pandas(left, 2), from_pandas(right, 2)])
    expected = pandas.concat([left, right])

    assert ray_df_equals_pandas(actual, expected)