示例#1
0
def test_mixed_none_concat():
    """Concatenate converted and plain pandas frames from ``generate_none_dfs``.

    Two of the three inputs go through ``from_pandas``; the third is a plain
    copy of the first pandas frame. The ``pd.concat`` result must equal
    ``pandas.concat`` over the three original pandas frames.
    """
    first, second = generate_none_dfs()
    third = first.copy()

    # The last entry is not passed through ``from_pandas``, so the input
    # list mixes both kinds of frame.
    concat_inputs = [
        from_pandas(first),
        from_pandas(second),
        third,
    ]

    actual = pd.concat(concat_inputs)
    expected = pandas.concat([first, second, third])
    df_equals(actual, expected)
示例#2
0
def test_ray_concat():
    """Check ``pd.concat`` of two converted frames against ``pandas.concat``."""
    left, right = generate_dfs()
    converted_left, converted_right = from_pandas(left), from_pandas(right)

    actual = pd.concat([converted_left, converted_right])
    expected = pandas.concat([left, right])
    assert modin_df_equals_pandas(actual, expected)
示例#3
0
def test_mixed_inner_concat():
    """Inner-join concat over a mix of converted and plain pandas frames.

    The third input is a plain copy of the first pandas frame; the other two
    are passed through ``from_pandas``.
    """
    first, second = generate_dfs()
    third = first.copy()

    concat_inputs = [from_pandas(first), from_pandas(second), third]

    actual = pd.concat(concat_inputs, join="inner")
    expected = pandas.concat([first, second, third], join="inner")
    assert modin_df_equals_pandas(actual, expected)
示例#4
0
def test_mixed_inner_concat():
    """Inner-join concat over a mix of converted and plain pandas frames.

    The first two inputs are converted with ``from_pandas(frame, 2)``; the
    third is a plain copy of the first pandas frame.
    """
    first, second = generate_dfs()
    third = first.copy()

    concat_inputs = [from_pandas(first, 2), from_pandas(second, 2), third]

    actual = pd.concat(concat_inputs, join='inner')
    expected = pandas.concat([first, second, third], join='inner')
    assert ray_df_equals_pandas(actual, expected)
示例#5
0
def test_ray_concat_on_column():
    """Compare column-wise ``pd.concat`` with ``pandas.concat``.

    The comparison is run once with ``axis=1`` and once with
    ``axis="columns"``.
    """
    left, right = generate_dfs()
    ray_left, ray_right = from_pandas(left, 2), from_pandas(right, 2)

    for axis in (1, "columns"):
        actual = pd.concat([ray_left, ray_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
示例#6
0
def test_ray_concat_on_column():
    """Compare column-wise ``pd.concat`` with ``pandas.concat``.

    The comparison is run once with ``axis=1`` and once with
    ``axis="columns"``.
    """
    left, right = generate_dfs()
    converted_left, converted_right = from_pandas(left), from_pandas(right)

    for axis in (1, "columns"):
        actual = pd.concat([converted_left, converted_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        assert modin_df_equals_pandas(actual, expected)
示例#7
0
def test_ray_concat_on_index():
    """Compare row-wise ``pd.concat`` with ``pandas.concat``.

    The comparison is run for ``axis='index'``, ``axis='rows'`` and
    ``axis=0`` in that order.
    """
    left, right = generate_dfs()
    ray_left, ray_right = from_pandas(left, 2), from_pandas(right, 2)

    for axis in ('index', 'rows', 0):
        actual = pd.concat([ray_left, ray_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
def test_sort_order(sort, join, axis):
    """Check that ``pd.concat`` matches ``pandas.concat`` in content and column order.

    Two single-row frames with disjoint, deliberately unsorted column lists
    (``["d", "c"]`` and ``["b", "a"]``) are concatenated with the given
    options, and both the resulting frames and their column orders are
    compared.

    Parameters
    ----------
    sort
        Forwarded as ``sort=`` to both ``concat`` calls.
    join
        Forwarded as ``join=`` to both ``concat`` calls.
    axis
        Forwarded as ``axis=`` to both ``concat`` calls. Previously this
        argument was accepted but never used, so the axis variations were
        not actually exercised.
    """
    pandas_df = pandas.DataFrame({"c": [3], "d": [4]}, columns=["d", "c"])
    pandas_df2 = pandas.DataFrame({"a": [1], "b": [2]}, columns=["b", "a"])
    modin_df, modin_df2 = from_pandas(pandas_df), from_pandas(pandas_df2)

    pandas_concat = pandas.concat(
        [pandas_df, pandas_df2],
        axis=axis,
        join=join,
        sort=sort,
    )
    modin_concat = pd.concat(
        [modin_df, modin_df2],
        axis=axis,
        join=join,
        sort=sort,
    )

    df_equals(
        pandas_concat,
        modin_concat,
    )
    # Compare column order explicitly, in addition to the frame comparison.
    assert list(pandas_concat.columns) == list(modin_concat.columns)
示例#9
0
def test_ray_concat_on_index():
    """Compare row-wise ``pd.concat`` with ``pandas.concat``.

    The comparison is run for ``axis="index"``, ``axis="rows"`` and
    ``axis=0`` in that order.
    """
    left, right = generate_dfs()
    ray_left, ray_right = from_pandas(left), from_pandas(right)

    for axis in ("index", "rows", 0):
        actual = pd.concat([ray_left, ray_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        assert ray_df_equals_pandas(actual, expected)
示例#10
0
def test_concat_dictionary(axis):
    """Compare ``concat`` over a dict of frames keyed ``"A"`` and ``"B"``.

    Parameters
    ----------
    axis
        Forwarded as ``axis=`` to both ``pd.concat`` and ``pandas.concat``.
    """
    pandas_first, pandas_second = generate_dfs()
    modin_first = from_pandas(pandas_first)
    modin_second = from_pandas(pandas_second)

    modin_mapping = {"A": modin_first, "B": modin_second}
    pandas_mapping = {"A": pandas_first, "B": pandas_second}

    actual = pd.concat(modin_mapping, axis=axis)
    expected = pandas.concat(pandas_mapping, axis=axis)
    df_equals(actual, expected)
示例#11
0
def test_ray_concat_with_series():
    """Concatenate two converted frames with a plain pandas Series.

    The Series is named ``"new_col"`` and is appended as the third input.
    The comparison against ``pandas.concat`` runs for ``axis=0`` and then
    ``axis=1``.
    """
    left, right = generate_dfs()
    converted_left, converted_right = from_pandas(left), from_pandas(right)
    extra_series = pandas.Series([1, 2, 3, 4], name="new_col")

    for axis in (0, 1):
        actual = pd.concat(
            [converted_left, converted_right, extra_series], axis=axis
        )
        expected = pandas.concat([left, right, extra_series], axis=axis)
        assert modin_df_equals_pandas(actual, expected)
示例#12
0
def test_shift_freq(groupby_axis, shift_axis):
    """Compare ``groupby(...).shift(freq="S")`` between the converted frame and pandas.

    Both frames get a 4-period datetime index. When both axes are 0 several
    ``by`` specifications (column-name lists and a positional key list) are
    tried; otherwise the columns are also replaced by the datetime range and
    only the positional key list is used.

    Parameters
    ----------
    groupby_axis
        Forwarded as ``axis=`` to ``groupby``.
    shift_axis
        Forwarded as ``axis=`` to ``shift``.
    """
    # ``np.nan`` is used rather than the ``np.NaN`` alias, which was removed
    # in NumPy 2.0; on older NumPy both names refer to the same value.
    pandas_df = pandas.DataFrame({
        "col1": [1, 0, 2, 3],
        "col2": [4, 5, np.nan, 7],
        "col3": [np.nan, np.nan, 12, 10],
        "col4": [17, 13, 16, 15],
    })
    modin_df = from_pandas(pandas_df)

    new_index = pandas.date_range("1/12/2020", periods=4, freq="S")
    # Every parameter combination needs the datetime index.
    pandas_df.index = modin_df.index = new_index
    if groupby_axis == 0 and shift_axis == 0:
        by = [["col2", "col3"], ["col2"], ["col4"], [0, 1, 0, 2]]
    else:
        # Column labels are replaced too, so only positional keys are used.
        pandas_df.columns = modin_df.columns = new_index
        by = [[0, 1, 0, 2]]

    for _by in by:
        pandas_groupby = pandas_df.groupby(by=_by, axis=groupby_axis)
        modin_groupby = modin_df.groupby(by=_by, axis=groupby_axis)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda groupby: groupby.shift(axis=shift_axis, freq="S"),
        )
示例#13
0
def test_multi_column_groupby():
    """Exercise grouping by two column names on a 1000-row random frame.

    ``count()`` and iteration over the groupby are each expected to emit a
    ``UserWarning``, and iteration must yield tuple keys. Grouping by row
    labels with ``axis=1`` is expected to raise ``KeyError``.
    """
    row_count = 1000
    column_data = {
        "col{}".format(number): np.random.randint(0, 100, size=row_count)
        for number in range(1, 6)
    }
    row_labels = ["row{}".format(position) for position in range(row_count)]
    pandas_df = pandas.DataFrame(column_data, index=row_labels)

    ray_df = from_pandas(pandas_df)

    column_keys = ["col1", "col2"]
    with pytest.warns(UserWarning):
        ray_df.groupby(column_keys).count()

    with pytest.warns(UserWarning):
        for group_key, _ in ray_df.groupby(column_keys):
            assert isinstance(group_key, tuple)

    row_keys = ["row0", "row1"]
    with pytest.raises(KeyError):
        ray_df.groupby(row_keys, axis=1).count()
示例#14
0
def test_ray_concat_on_index():
    """Compare row-wise ``pd.concat`` with ``pandas.concat`` via ``df_equals``.

    The comparison is run for ``axis="index"``, ``axis="rows"`` and
    ``axis=0`` in that order.
    """
    left, right = generate_dfs()
    converted_left, converted_right = from_pandas(left), from_pandas(right)

    for axis in ("index", "rows", 0):
        actual = pd.concat([converted_left, converted_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        df_equals(actual, expected)
示例#15
0
def test_concat_on_column():
    """Compare column-wise ``pd.concat`` with ``pandas.concat``.

    Frames are concatenated with ``axis=1`` and ``axis="columns"``. Then two
    Series of ten ones are concatenated column-wise with
    ``ignore_index=True``, and both the values and the resulting dtypes are
    compared.
    """
    left, right = generate_dfs()
    converted_left, converted_right = from_pandas(left), from_pandas(right)

    for axis in (1, "columns"):
        actual = pd.concat([converted_left, converted_right], axis=axis)
        expected = pandas.concat([left, right], axis=axis)
        df_equals(actual, expected)

    # Same keyword arguments for both libraries in the Series-only case.
    series_concat_kwargs = {"axis": 1, "ignore_index": True}
    modin_result = pd.concat(
        [pd.Series(np.ones(10)), pd.Series(np.ones(10))],
        **series_concat_kwargs
    )
    pandas_result = pandas.concat(
        [pandas.Series(np.ones(10)), pandas.Series(np.ones(10))],
        **series_concat_kwargs
    )
    df_equals(modin_result, pandas_result)
    assert modin_result.dtypes.equals(pandas_result.dtypes)
示例#16
0
def test_agg_func_None_rename():
    """Compare keyword-style ``agg`` (``name=(column, func)``) on a two-column groupby.

    The frame has four random integer columns and 1000 rows labelled
    ``row0`` .. ``row999``. It is grouped by ``col1`` and ``col2``, and
    ``col3`` is aggregated into outputs named ``max`` and ``min``.
    """
    row_count = 1000
    column_data = {
        "col{}".format(number): np.random.randint(0, 100, size=row_count)
        for number in range(1, 5)
    }
    row_labels = ["row{}".format(position) for position in range(row_count)]
    pandas_df = pandas.DataFrame(column_data, index=row_labels)
    modin_df = from_pandas(pandas_df)

    # Output name -> (source column, aggregation function).
    named_aggregations = {
        "max": ("col3", np.max),
        "min": ("col3", np.min),
    }

    modin_result = modin_df.groupby(["col1", "col2"]).agg(**named_aggregations)
    pandas_result = pandas_df.groupby(["col1", "col2"]).agg(**named_aggregations)
    df_equals(modin_result, pandas_result)
示例#17
0
def test_mixed_dtypes_groupby():
    """Run the groupby comparison helpers on a frame with mixed column dtypes.

    A 2**6 x 2**4 random integer frame is built with ``col``-prefixed column
    names, and every column at an even position is converted to
    single-character strings via ``chr``. Both the converted frame and the
    pandas frame are grouped by ``"col1"``, and the pair of groupby objects
    is passed to each ``test_*`` helper in turn.
    """
    frame_data = np.random.randint(97, 198, size=(2 ** 6, 2 ** 4))
    pandas_df = pandas.DataFrame(frame_data).add_prefix("col")
    # Convert every other column to string
    for col in pandas_df.iloc[
        :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0]
    ]:
        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
    ray_df = from_pandas(pandas_df)

    # Row count handed to the head/tail helpers below.
    n = 1

    ray_groupby = ray_df.groupby(by="col1")
    pandas_groupby = pandas_df.groupby(by="col1")

    # The same two groupby objects are reused by every helper call below.
    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    test_ngroups(ray_groupby, pandas_groupby)
    test_skew(ray_groupby, pandas_groupby)
    test_ffill(ray_groupby, pandas_groupby)
    test_sem(ray_groupby, pandas_groupby)
    test_mean(ray_groupby, pandas_groupby)
    test_any(ray_groupby, pandas_groupby)
    test_min(ray_groupby, pandas_groupby)
    test_idxmax(ray_groupby, pandas_groupby)
    test_ndim(ray_groupby, pandas_groupby)
    test_cumsum(ray_groupby, pandas_groupby)
    test_pct_change(ray_groupby, pandas_groupby)
    test_cummax(ray_groupby, pandas_groupby)

    # TODO Add more apply functions
    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        test_apply(ray_groupby, pandas_groupby, func)

    test_dtypes(ray_groupby, pandas_groupby)
    test_first(ray_groupby, pandas_groupby)
    test_backfill(ray_groupby, pandas_groupby)
    test_cummin(ray_groupby, pandas_groupby)
    test_bfill(ray_groupby, pandas_groupby)
    test_idxmin(ray_groupby, pandas_groupby)
    test_prod(ray_groupby, pandas_groupby)
    test_std(ray_groupby, pandas_groupby)

    # Each aggregation name is checked through both helpers.
    agg_functions = ["min", "max"]
    for func in agg_functions:
        test_agg(ray_groupby, pandas_groupby, func)
        test_aggregate(ray_groupby, pandas_groupby, func)

    test_last(ray_groupby, pandas_groupby)
    test_mad(ray_groupby, pandas_groupby)
    test_max(ray_groupby, pandas_groupby)
    test_var(ray_groupby, pandas_groupby)
    test_len(ray_groupby, pandas_groupby)
    test_sum(ray_groupby, pandas_groupby)
    test_ngroup(ray_groupby, pandas_groupby)
    test_nunique(ray_groupby, pandas_groupby)
    test_median(ray_groupby, pandas_groupby)
    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        test_transform(ray_groupby, pandas_groupby, func)

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        test_pipe(ray_groupby, pandas_groupby, func)

    test_corr(ray_groupby, pandas_groupby)
    test_fillna(ray_groupby, pandas_groupby)
    test_count(ray_groupby, pandas_groupby)
    test_tail(ray_groupby, pandas_groupby, n)
    test_quantile(ray_groupby, pandas_groupby)
    test_take(ray_groupby, pandas_groupby)
    test___getattr__(ray_groupby, pandas_groupby)
    test_groups(ray_groupby, pandas_groupby)
示例#18
0
def test_series_groupby(by, as_index_series_or_dataframe):
    """Compare groupby results on a Series (or a column picked from a frame groupby).

    Parameters
    ----------
    by
        Grouping key forwarded to ``groupby``. In the DataFrame mode it is
        replaced by random integers when it is an ``np.ndarray`` or ``None``.
    as_index_series_or_dataframe
        Mode selector. Values ``<= 1`` build a Series of 2**8 random integers
        and set ``as_index`` to ``value == 1``. Any other value builds a
        five-column DataFrame with ``as_index=True``; when the value is
        exactly ``2`` the resulting groupby is indexed with ``["col1"]``.

    If the pandas ``groupby`` call (or the ``["col1"]`` selection) raises,
    the test only checks that ``groupby`` on the converted object raises the
    same exception type. Otherwise the pair of groupby objects is run
    through the ``eval_*`` helpers.
    """
    if as_index_series_or_dataframe <= 1:
        as_index = as_index_series_or_dataframe == 1
        series_data = np.random.randint(97, 198, size=2**8)
        modin_series = pd.Series(series_data)
        pandas_series = pandas.Series(series_data)
    else:
        as_index = True
        # NOTE: despite the ``*_series`` names, these are DataFrames here.
        pandas_series = pandas.DataFrame({
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, 6, 7],
            "col3": [3, 8, 12, 10],
            "col4": [17, 13, 16, 15],
            "col5": [-4, -5, -6, -7],
        })
        modin_series = from_pandas(pandas_series)
        if isinstance(by, np.ndarray) or by is None:
            # Regenerate the key so its length matches the 4-row frame.
            by = np.random.randint(0, 100, size=len(pandas_series.index))

    # Row count used by the head/tail checks below.
    n = 1

    try:
        pandas_groupby = pandas_series.groupby(by, as_index=as_index)
        if as_index_series_or_dataframe == 2:
            pandas_groupby = pandas_groupby["col1"]
    except Exception as e:
        # pandas rejected these arguments; expect the same exception type
        # from the converted object.
        with pytest.raises(type(e)):
            modin_series.groupby(by, as_index=as_index)
    else:
        modin_groupby = modin_series.groupby(by, as_index=as_index)
        if as_index_series_or_dataframe == 2:
            modin_groupby = modin_groupby["col1"]

        modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
        eval_ngroups(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.ffill(),
                     is_default=True)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.sem(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_mean(modin_groupby, pandas_groupby)
        eval_any(modin_groupby, pandas_groupby)
        eval_min(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.idxmax(),
                     is_default=True)
        eval_ndim(modin_groupby, pandas_groupby)
        eval_cumsum(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.pct_change(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_cummax(modin_groupby, pandas_groupby)

        apply_functions = [lambda df: df.sum(), min]
        for func in apply_functions:
            eval_apply(modin_groupby, pandas_groupby, func)

        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.first(),
                     is_default=True)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.backfill(),
                     is_default=True)
        eval_cummin(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.bfill(),
                     is_default=True)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.idxmin(),
                     is_default=True)
        eval_prod(modin_groupby, pandas_groupby)
        # std/var/skew are only compared when as_index is true.
        if as_index:
            eval_std(modin_groupby, pandas_groupby)
            eval_var(modin_groupby, pandas_groupby)
            eval_skew(modin_groupby, pandas_groupby)

        agg_functions = ["min", "max"]
        for func in agg_functions:
            eval_agg(modin_groupby, pandas_groupby, func)
            eval_aggregate(modin_groupby, pandas_groupby, func)

        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.last(),
                     is_default=True)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.mad(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_rank(modin_groupby, pandas_groupby)
        eval_max(modin_groupby, pandas_groupby)
        eval_len(modin_groupby, pandas_groupby)
        eval_sum(modin_groupby, pandas_groupby)
        eval_ngroup(modin_groupby, pandas_groupby)
        eval_nunique(modin_groupby, pandas_groupby)
        eval_median(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.head(n),
                     is_default=True)
        eval_cumprod(modin_groupby, pandas_groupby)

        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            eval_transform(modin_groupby, pandas_groupby, func)

        pipe_functions = [lambda dfgb: dfgb.sum()]
        for func in pipe_functions:
            eval_pipe(modin_groupby, pandas_groupby, func)

        eval_fillna(modin_groupby, pandas_groupby)
        eval_count(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.tail(n),
                     is_default=True)
        eval_quantile(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.take(),
                     is_default=True)
        eval_groups(modin_groupby, pandas_groupby)
示例#19
0
def test_simple_col_groupby():
    """Compare column-wise (``axis=1``) groupby results on a small integer frame.

    The frame has four rows and five columns; the columns are grouped with
    the positional key list ``[1, 2, 3, 2, 1]``. The resulting pair of
    groupby objects is run through the ``eval_*`` helpers. Several checks
    are commented out, with the reason noted next to each.
    """
    pandas_df = pandas.DataFrame({
        "col1": [0, 3, 2, 3],
        "col2": [4, 1, 6, 7],
        "col3": [3, 8, 2, 10],
        "col4": [1, 13, 6, 15],
        "col5": [-4, 5, 6, -7],
    })

    modin_df = from_pandas(pandas_df)

    # One key per column, since the grouping is along axis=1.
    by = [1, 2, 3, 2, 1]

    modin_groupby = modin_df.groupby(axis=1, by=by)
    pandas_groupby = pandas_df.groupby(axis=1, by=by)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_skew(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.ffill(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_ndim(modin_groupby, pandas_groupby)

    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmax(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmin(),
                 is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)

    # https://github.com/pandas-dev/pandas/issues/21127
    # eval_cumsum(modin_groupby, pandas_groupby)
    # eval_cummax(modin_groupby, pandas_groupby)
    # eval_cummin(modin_groupby, pandas_groupby)
    # eval_cumprod(modin_groupby, pandas_groupby)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    apply_functions = [lambda df: -df, lambda df: df.sum(axis=1)]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)

    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.first(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.backfill(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.bfill(),
                 is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    eval_std(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.last(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_max(modin_groupby, pandas_groupby)
    eval_var(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)

    # Pandas fails on this case with ValueError
    # eval_ngroup(modin_groupby, pandas_groupby)
    # eval_nunique(modin_groupby, pandas_groupby)
    eval_median(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        eval_transform(modin_groupby, pandas_groupby, func)

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    eval_size(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.take(),
                 is_default=True)
    eval_groups(modin_groupby, pandas_groupby)
示例#20
0
def test_mixed_dtypes_groupby(as_index):
    """Compare groupby results on a frame with mixed column dtypes.

    A 2**6 x 2**4 random integer frame is built with ``col``-prefixed column
    names, and every column at an even position is converted to
    single-character strings via ``chr``. The frame is grouped three ways
    (by a column name, by a callable, and by a copy of ``col0``), and each
    pair of groupby objects is run through the ``eval_*`` helpers.

    Parameters
    ----------
    as_index
        Forwarded as ``as_index=`` to every ``groupby`` call; the
        std/var/skew checks only run when it is truthy.
    """
    frame_data = np.random.randint(97, 198, size=(2**6, 2**4))
    pandas_df = pandas.DataFrame(frame_data).add_prefix("col")
    # Convert every other column to string
    for col in pandas_df.iloc[:, [
            i for i in range(len(pandas_df.columns)) if i % 2 == 0
    ]]:
        pandas_df[col] = [str(chr(i)) for i in pandas_df[col]]
    modin_df = from_pandas(pandas_df)

    # Row count used by the head/tail checks below.
    n = 1

    # Each entry is a tuple whose first item is the key for the converted
    # frame and whose last item is the key for the pandas frame; one-element
    # tuples therefore use the same key for both.
    by_values = [
        ("col1", ),
        (lambda x: x % 2, ),
        (modin_df["col0"].copy(), pandas_df["col0"].copy()),
    ]

    for by in by_values:
        modin_groupby = modin_df.groupby(by=by[0], as_index=as_index)
        pandas_groupby = pandas_df.groupby(by=by[-1], as_index=as_index)

        modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
        eval_ngroups(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.ffill(),
                     is_default=True)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.sem(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_mean(modin_groupby, pandas_groupby)
        eval_any(modin_groupby, pandas_groupby)
        eval_min(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.idxmax(),
                     is_default=True)
        eval_ndim(modin_groupby, pandas_groupby)
        eval_cumsum(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.pct_change(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_cummax(modin_groupby, pandas_groupby)

        # TODO Add more apply functions
        apply_functions = [lambda df: df.sum(), min]
        # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
        pandas_groupby = pandas_df.groupby(by=by[-1], as_index=as_index)
        for func in apply_functions:
            eval_apply(modin_groupby, pandas_groupby, func)

        eval_dtypes(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.first(),
                     is_default=True)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.backfill(),
                     is_default=True)
        eval_cummin(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.bfill(),
                     is_default=True)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.idxmin(),
                     is_default=True)
        eval_prod(modin_groupby, pandas_groupby)
        # std/var/skew are only compared when as_index is true.
        if as_index:
            eval_std(modin_groupby, pandas_groupby)
            eval_var(modin_groupby, pandas_groupby)
            eval_skew(modin_groupby, pandas_groupby)

        agg_functions = ["min", "max"]
        for func in agg_functions:
            eval_agg(modin_groupby, pandas_groupby, func)
            eval_aggregate(modin_groupby, pandas_groupby, func)

        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.last(),
                     is_default=True)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.mad(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )
        eval_max(modin_groupby, pandas_groupby)
        eval_len(modin_groupby, pandas_groupby)
        eval_sum(modin_groupby, pandas_groupby)
        eval_ngroup(modin_groupby, pandas_groupby)
        eval_nunique(modin_groupby, pandas_groupby)
        eval_median(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.head(n),
                     is_default=True)
        eval_cumprod(modin_groupby, pandas_groupby)
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.cov(),
            modin_df_almost_equals_pandas,
            is_default=True,
        )

        transform_functions = [lambda df: df, lambda df: df + df]
        for func in transform_functions:
            eval_transform(modin_groupby, pandas_groupby, func)

        pipe_functions = [lambda dfgb: dfgb.sum()]
        for func in pipe_functions:
            eval_pipe(modin_groupby, pandas_groupby, func)

        # NOTE(review): unlike the other corr/cov checks, this call does not
        # pass is_default=True -- confirm whether that is intentional.
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.corr(),
            modin_df_almost_equals_pandas,
        )
        eval_fillna(modin_groupby, pandas_groupby)
        eval_count(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.tail(n),
                     is_default=True)
        eval_quantile(modin_groupby, pandas_groupby)
        eval_general(modin_groupby,
                     pandas_groupby,
                     lambda df: df.take(),
                     is_default=True)
        eval___getattr__(modin_groupby, pandas_groupby, "col2")
        eval_groups(modin_groupby, pandas_groupby)
示例#21
0
def test_large_row_groupby():
    """Compare row-wise groupby results on a 100 x 4 random integer frame.

    The frame has columns ``A``..``D``. The grouping key is a list holding
    the string form of each value in column ``A``. The resulting pair of
    groupby objects is run through the ``eval_*`` helpers; the ``prod`` and
    ``cumprod`` checks are disabled because they overflow.
    """
    pandas_df = pandas.DataFrame(np.random.randint(0, 8, size=(100, 4)),
                                 columns=list("ABCD"))

    modin_df = from_pandas(pandas_df)

    # One string key per row, derived from column "A".
    by = [str(i) for i in pandas_df["A"].tolist()]
    # Row count used by the head/tail checks below.
    n = 4

    modin_groupby = modin_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_skew(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.ffill(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmax(),
                 is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    eval_cumsum(modin_groupby, pandas_groupby)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_cummax(modin_groupby, pandas_groupby)

    apply_functions = [lambda df: df.sum(), lambda df: -df]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)

    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.first(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.backfill(),
                 is_default=True)
    eval_cummin(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.bfill(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmin(),
                 is_default=True)
    # eval_prod(modin_groupby, pandas_groupby) causes overflows
    eval_std(modin_groupby, pandas_groupby)

    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)

    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.last(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_rank(modin_groupby, pandas_groupby)
    eval_max(modin_groupby, pandas_groupby)
    eval_var(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_nunique(modin_groupby, pandas_groupby)
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.head(n),
                 is_default=True)
    # eval_cumprod(modin_groupby, pandas_groupby) causes overflows
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    transform_functions = [lambda df: df + 4, lambda df: -df - 10]
    for func in transform_functions:
        eval_transform(modin_groupby, pandas_groupby, func)

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    eval_size(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.tail(n),
                 is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.take(),
                 is_default=True)
    eval_groups(modin_groupby, pandas_groupby)
示例#22
0
def test_single_group_row_groupby():
    """Run the groupby helper suite on a frame whose rows share one key.

    Every entry of ``by`` is the string "1", so the modin and the pandas
    groupby objects are built from an identical, single-valued key list.
    The ``eval_*`` helpers are defined elsewhere in this file; each one is
    handed the modin/pandas groupby pair (presumably asserting that both
    sides agree -- confirm against the helper definitions).
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, 36, 7],
            "col3": [3, 8, 12, 10],
            "col4": [17, 3, 16, 15],
            "col5": [-4, 5, -6, -7],
        }
    )
    modin_df = from_pandas(pandas_df)

    by = ["1"] * 4
    n = 6  # head/tail size; larger than the 4-row frame

    modin_groupby = modin_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    def run_checks(*checks):
        # Call each two-argument helper, in the given order, on the pair.
        for check in checks:
            check(modin_groupby, pandas_groupby)

    def default_check(operation, *comparator):
        # eval_general with is_default=True; an optional comparator is
        # forwarded as the fourth positional argument.
        eval_general(
            modin_groupby, pandas_groupby, operation, *comparator, is_default=True
        )

    run_checks(modin_groupby_equals_pandas, eval_ngroups, eval_skew)
    default_check(lambda grp: grp.ffill())
    default_check(lambda grp: grp.sem(), modin_df_almost_equals_pandas)
    run_checks(eval_mean, eval_any, eval_min)
    default_check(lambda grp: grp.idxmax())
    run_checks(eval_ndim, eval_cumsum)
    default_check(lambda grp: grp.pct_change(), modin_df_almost_equals_pandas)
    run_checks(eval_cummax)

    for applied in (lambda df: df.sum(), lambda df: -df):
        eval_apply(modin_groupby, pandas_groupby, applied)

    run_checks(eval_dtypes)
    default_check(lambda grp: grp.first())
    default_check(lambda grp: grp.backfill())
    run_checks(eval_cummin)
    default_check(lambda grp: grp.bfill())
    default_check(lambda grp: grp.idxmin())
    run_checks(eval_prod, eval_std)

    for agg_name in ("min", "max"):
        eval_agg(modin_groupby, pandas_groupby, agg_name)
        eval_aggregate(modin_groupby, pandas_groupby, agg_name)

    default_check(lambda grp: grp.last())
    default_check(lambda grp: grp.mad(), modin_df_almost_equals_pandas)
    run_checks(
        eval_rank,
        eval_max,
        eval_var,
        eval_len,
        eval_sum,
        eval_ngroup,
        eval_nunique,
        eval_median,
    )
    default_check(lambda grp: grp.head(n))
    run_checks(eval_cumprod)
    default_check(lambda grp: grp.cov(), modin_df_almost_equals_pandas)

    for transformation in (lambda df: df + 4, lambda df: -df - 10):
        eval_transform(modin_groupby, pandas_groupby, transformation)

    for piped in (lambda dfgb: dfgb.sum(),):
        eval_pipe(modin_groupby, pandas_groupby, piped)

    default_check(lambda grp: grp.corr(), modin_df_almost_equals_pandas)
    run_checks(eval_fillna, eval_count, eval_size)
    default_check(lambda grp: grp.tail(n))
    run_checks(eval_quantile)
    default_check(lambda grp: grp.take())
    eval___getattr__(modin_groupby, pandas_groupby, "col2")
    eval_groups(modin_groupby, pandas_groupby)
示例#23
0
def test_simple_row_groupby(by, as_index):
    """Run the groupby helper suite on a small frame containing NaNs.

    Parameters
    ----------
    by
        Grouping key forwarded unchanged to both ``groupby`` calls
        (presumably supplied by a pytest parametrization outside this block).
    as_index
        Forwarded to both ``groupby`` calls; std/var/skew are only checked
        when it is truthy.

    Notes
    -----
    The fixture uses ``np.nan``: the ``np.NaN`` alias was removed in
    NumPy 2.0, while ``np.nan`` is available in every NumPy release.
    The ``eval_*`` helpers are defined elsewhere in this file.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, np.nan, 7],
            "col3": [np.nan, np.nan, 12, 10],
            "col4": [17, 13, 16, 15],
            "col5": [-4, -5, -6, -7],
        }
    )

    modin_df = from_pandas(pandas_df)
    n = 1
    modin_groupby = modin_df.groupby(by=by, as_index=as_index)
    pandas_groupby = pandas_df.groupby(by=by, as_index=as_index)

    # Both helpers read modin_groupby / pandas_groupby at call time, so they
    # pick up the pandas groupby object that is recreated further down.
    def run_checks(*checks):
        # Call each two-argument helper, in the given order, on the pair.
        for check in checks:
            check(modin_groupby, pandas_groupby)

    def default_check(operation, *comparator):
        # eval_general with is_default=True; an optional comparator is
        # forwarded as the fourth positional argument.
        eval_general(
            modin_groupby, pandas_groupby, operation, *comparator, is_default=True
        )

    run_checks(modin_groupby_equals_pandas, eval_ngroups)
    default_check(lambda grp: grp.ffill())
    default_check(lambda grp: grp.sem(), modin_df_almost_equals_pandas)
    run_checks(eval_mean, eval_any, eval_min)
    default_check(lambda grp: grp.idxmax())
    run_checks(eval_ndim)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs, so
        # they run only when the "by" columns contain no NaNs.
        run_checks(eval_cumsum, eval_cummax, eval_cummin, eval_cumprod)

    default_check(lambda grp: grp.pct_change(), modin_df_almost_equals_pandas)

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=by, as_index=as_index)
    for applied in (lambda df: df.sum(), min):
        eval_apply(modin_groupby, pandas_groupby, applied)

    run_checks(eval_dtypes)
    default_check(lambda grp: grp.first())
    default_check(lambda grp: grp.backfill())
    default_check(lambda grp: grp.bfill())
    default_check(lambda grp: grp.idxmin())
    run_checks(eval_prod)
    if as_index:
        run_checks(eval_std, eval_var, eval_skew)

    for agg_name in ("min", "max"):
        eval_agg(modin_groupby, pandas_groupby, agg_name)
        eval_aggregate(modin_groupby, pandas_groupby, agg_name)

    default_check(lambda grp: grp.last())
    default_check(lambda grp: grp.mad(), modin_df_almost_equals_pandas)
    run_checks(
        eval_rank,
        eval_max,
        eval_len,
        eval_sum,
        eval_ngroup,
        eval_nunique,
        eval_median,
    )
    default_check(lambda grp: grp.head(n))
    default_check(lambda grp: grp.cov(), modin_df_almost_equals_pandas)

    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in
        # grouping columns. See Pandas bug 17093.
        for transformation in (lambda df: df + 4, lambda df: -df - 10):
            eval_transform(modin_groupby, pandas_groupby, transformation)

    for piped in (lambda dfgb: dfgb.sum(),):
        eval_pipe(modin_groupby, pandas_groupby, piped)

    default_check(lambda grp: grp.corr(), modin_df_almost_equals_pandas)
    run_checks(eval_fillna, eval_count, eval_size)
    default_check(lambda grp: grp.tail(n))
    run_checks(eval_quantile)
    default_check(lambda grp: grp.take())
    eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)
示例#24
0
def test_ray_concat():
    """Check ``pd.concat`` of two converted frames against ``pandas.concat``.

    The frames come from ``generate_dfs`` and are converted with
    ``from_pandas``; both helpers are defined elsewhere in this file.
    """
    df, df2 = generate_dfs()
    ray_df = from_pandas(df, 2)
    ray_df2 = from_pandas(df2, 2)

    actual = pd.concat([ray_df, ray_df2])
    expected = pandas.concat([df, df2])
    assert ray_df_equals_pandas(actual, expected)
示例#25
0
def test_invalid_axis_errors():
    """``pd.concat`` must raise ``ValueError`` when called with ``axis=2``."""
    first_pandas, second_pandas = generate_dfs()
    modin_frames = [from_pandas(first_pandas), from_pandas(second_pandas)]

    with pytest.raises(ValueError):
        pd.concat(modin_frames, axis=2)
示例#26
0
def test_single_group_row_groupby():
    """Run the groupby helper suite on a frame whose rows share one key.

    Every entry of ``by`` is the string "1". The ``test_*`` helpers called
    here are defined elsewhere in this file; each receives the ray/pandas
    groupby pair (presumably asserting that both sides agree -- confirm
    against the helper definitions).
    """
    data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 36, 7],
        "col3": [3, 8, 12, 10],
        "col4": [17, 3, 16, 15],
        "col5": [-4, 5, -6, -7],
    }
    pandas_df = pandas.DataFrame(data)
    ray_df = from_pandas(pandas_df)

    by = ["1"] * 4
    n = 6  # head/tail size; larger than the 4-row frame

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    for check in (
        test_ngroups,
        test_skew,
        test_ffill,
        test_sem,
        test_mean,
        test_any,
        test_min,
        test_idxmax,
        test_ndim,
        test_cumsum,
        test_pct_change,
        test_cummax,
    ):
        check(ray_groupby, pandas_groupby)

    for applied in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, applied)

    for check in (
        test_dtypes,
        test_first,
        test_backfill,
        test_cummin,
        test_bfill,
        test_idxmin,
        test_prod,
        test_std,
    ):
        check(ray_groupby, pandas_groupby)

    for agg_name in ("min", "max"):
        test_agg(ray_groupby, pandas_groupby, agg_name)
        test_aggregate(ray_groupby, pandas_groupby, agg_name)

    for check in (
        test_last,
        test_mad,
        test_rank,
        test_max,
        test_var,
        test_len,
        test_sum,
        test_ngroup,
        test_nunique,
        test_median,
    ):
        check(ray_groupby, pandas_groupby)

    test_head(ray_groupby, pandas_groupby, n)
    test_cumprod(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for transformation in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, transformation)

    for piped in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, piped)

    for check in (test_corr, test_fillna, test_count):
        check(ray_groupby, pandas_groupby)

    test_tail(ray_groupby, pandas_groupby, n)
    for check in (test_quantile, test_take, test___getattr__, test_groups):
        check(ray_groupby, pandas_groupby)
示例#27
0
def test_large_row_groupby():
    """Run the groupby helper suite on a 100x4 frame of random integers.

    The grouping key is column "A" converted to a list of strings. The
    ``test_*`` helpers are defined elsewhere in this file; each receives the
    ray/pandas groupby pair.
    """
    random_values = np.random.randint(0, 8, size=(100, 4))
    pandas_df = pandas.DataFrame(random_values, columns=list("ABCD"))
    ray_df = from_pandas(pandas_df)

    by = list(map(str, pandas_df["A"].tolist()))
    n = 4

    ray_groupby = ray_df.groupby(by=by)
    pandas_groupby = pandas_df.groupby(by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    for check in (
        test_ngroups,
        test_skew,
        test_ffill,
        test_sem,
        test_mean,
        test_any,
        test_min,
        test_idxmax,
        test_ndim,
        test_cumsum,
        test_pct_change,
        test_cummax,
    ):
        check(ray_groupby, pandas_groupby)

    for applied in (lambda df: df.sum(), lambda df: -df):
        test_apply(ray_groupby, pandas_groupby, applied)

    # test_prod is intentionally left out of this run: it causes overflows.
    for check in (
        test_dtypes,
        test_first,
        test_backfill,
        test_cummin,
        test_bfill,
        test_idxmin,
        test_std,
    ):
        check(ray_groupby, pandas_groupby)

    for agg_name in ("min", "max"):
        test_agg(ray_groupby, pandas_groupby, agg_name)
        test_aggregate(ray_groupby, pandas_groupby, agg_name)

    for check in (
        test_last,
        test_mad,
        test_rank,
        test_max,
        test_var,
        test_len,
        test_sum,
        test_ngroup,
        test_nunique,
        test_median,
    ):
        check(ray_groupby, pandas_groupby)

    test_head(ray_groupby, pandas_groupby, n)
    # test_cumprod is intentionally left out here: it causes overflows.
    test_cov(ray_groupby, pandas_groupby)

    for transformation in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, transformation)

    for piped in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, piped)

    for check in (test_corr, test_fillna, test_count):
        check(ray_groupby, pandas_groupby)

    test_tail(ray_groupby, pandas_groupby, n)
    for check in (test_quantile, test_take, test_groups):
        check(ray_groupby, pandas_groupby)
示例#28
0
def test_simple_row_groupby(by, as_index, col1_category):
    """Run the groupby helper suite on a small frame containing NaNs.

    Parameters
    ----------
    by
        Grouping key. When it is a list, ``GetColumn`` entries are called
        with the frame being grouped to obtain the actual key; anything else
        is passed through unchanged.
    as_index
        Forwarded to both ``groupby`` calls; std/var/skew are only checked
        when it is truthy.
    col1_category
        When truthy, "col1" is cast to the "category" dtype before the modin
        frame is created.

    Notes
    -----
    The fixture uses ``np.nan``: the ``np.NaN`` alias was removed in
    NumPy 2.0, while ``np.nan`` is available in every NumPy release.
    The ``eval_*`` helpers are defined elsewhere in this file.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, np.nan, 7],
            "col3": [np.nan, np.nan, 12, 10],
            "col4": [17, 13, 16, 15],
            "col5": [-4, -5, -6, -7],
        }
    )

    if col1_category:
        pandas_df = pandas_df.astype({"col1": "category"})

    modin_df = from_pandas(pandas_df)
    n = 1

    def resolve_by(frame, keys):
        # Replace GetColumn entries of a list key by calling them on `frame`.
        if not isinstance(keys, list):
            return keys
        return [key(frame) if isinstance(key, GetColumn) else key for key in keys]

    modin_groupby = modin_df.groupby(
        by=resolve_by(modin_df, by), as_index=as_index
    )

    pandas_by = resolve_by(pandas_df, try_cast_to_pandas(by))
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    # Both helpers read modin_groupby / pandas_groupby at call time, so they
    # pick up the pandas groupby object that is recreated further down.
    def run_checks(*checks):
        # Call each two-argument helper, in the given order, on the pair.
        for check in checks:
            check(modin_groupby, pandas_groupby)

    def default_check(operation, *comparator):
        # eval_general with is_default=True; an optional comparator is
        # forwarded as the fourth positional argument.
        eval_general(
            modin_groupby, pandas_groupby, operation, *comparator, is_default=True
        )

    run_checks(modin_groupby_equals_pandas, eval_ngroups, eval_shift)
    default_check(lambda grp: grp.ffill())
    default_check(lambda grp: grp.sem(), modin_df_almost_equals_pandas)
    run_checks(eval_mean, eval_any, eval_min)
    default_check(lambda grp: grp.idxmax())
    run_checks(eval_ndim)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs, so
        # they run only when the "by" columns contain no NaNs.
        eval_general(modin_groupby, pandas_groupby, lambda grp: grp.cumsum(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda grp: grp.cummax(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda grp: grp.cummin(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda grp: grp.cumprod(axis=0))

    default_check(lambda grp: grp.pct_change(), modin_df_almost_equals_pandas)

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)
    for applied in (lambda df: df.sum(), min):
        eval_apply(modin_groupby, pandas_groupby, applied)

    run_checks(eval_dtypes)
    default_check(lambda grp: grp.first())
    default_check(lambda grp: grp.backfill())
    default_check(lambda grp: grp.bfill())
    default_check(lambda grp: grp.idxmin())
    run_checks(eval_prod)
    if as_index:
        run_checks(eval_std, eval_var, eval_skew)

    for agg_name in ("min", "max"):
        eval_agg(modin_groupby, pandas_groupby, agg_name)
        eval_aggregate(modin_groupby, pandas_groupby, agg_name)

    default_check(lambda grp: grp.last())
    default_check(lambda grp: grp.mad(), modin_df_almost_equals_pandas)
    eval_general(modin_groupby, pandas_groupby, lambda grp: grp.rank())
    run_checks(eval_max, eval_len, eval_sum, eval_ngroup)
    eval_general(modin_groupby, pandas_groupby, lambda grp: grp.nunique())
    run_checks(eval_median)
    default_check(lambda grp: grp.head(n))
    default_check(lambda grp: grp.cov(), modin_df_almost_equals_pandas)

    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in
        # grouping columns. See Pandas bug 17093.
        for transformation in (lambda df: df + 4, lambda df: -df - 10):
            # The lambda is invoked within this iteration, so it sees the
            # current `transformation`.
            eval_general(
                modin_groupby,
                pandas_groupby,
                lambda grp: grp.transform(transformation),
                check_exception_type=None,
            )

    for piped in (lambda dfgb: dfgb.sum(),):
        eval_pipe(modin_groupby, pandas_groupby, piped)

    default_check(lambda grp: grp.corr(), modin_df_almost_equals_pandas)
    run_checks(eval_fillna, eval_count)
    if get_current_backend() != "BaseOnPython":
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda grp: grp.size(),
            check_exception_type=None,
        )
    default_check(lambda grp: grp.tail(n))
    run_checks(eval_quantile)
    default_check(lambda grp: grp.take())
    if isinstance(by, list) and not any(
        isinstance(o, (pd.Series, pandas.Series)) for o in by
    ):
        # Not yet supported for non-original-column-from-dataframe Series in by:
        eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)
示例#29
0
def test_invalid_axis_errors():
    """``pd.concat`` must raise ``ValueError`` when called with ``axis=2``."""
    first_pandas, second_pandas = generate_dfs()
    ray_frames = [from_pandas(first_pandas, 2), from_pandas(second_pandas, 2)]

    with pytest.raises(ValueError):
        pd.concat(ray_frames, axis=2)
示例#30
0
def test_simple_col_groupby():
    """Run the groupby helper suite for a column-wise (``axis=1``) groupby.

    The five columns are grouped by the labels ``[1, 2, 3, 2, 1]``. The
    ``test_*`` helpers are defined elsewhere in this file; each receives the
    ray/pandas groupby pair.
    """
    data = {
        "col1": [0, 3, 2, 3],
        "col2": [4, 1, 6, 7],
        "col3": [3, 8, 2, 10],
        "col4": [1, 13, 6, 15],
        "col5": [-4, 5, 6, -7],
    }
    pandas_df = pandas.DataFrame(data)
    ray_df = from_pandas(pandas_df)

    by = [1, 2, 3, 2, 1]

    ray_groupby = ray_df.groupby(axis=1, by=by)
    pandas_groupby = pandas_df.groupby(axis=1, by=by)

    ray_groupby_equals_pandas(ray_groupby, pandas_groupby)
    for check in (
        test_ngroups,
        test_skew,
        test_ffill,
        test_sem,
        test_mean,
        test_any,
        test_min,
        test_ndim,
    ):
        check(ray_groupby, pandas_groupby)

    if not PY2:
        # idxmax and idxmin fail on column groupby in pandas with python2
        for check in (test_idxmax, test_idxmin, test_quantile):
            check(ray_groupby, pandas_groupby)

    # The cumsum / cummax / cummin / cumprod helpers are not run here; see
    # https://github.com/pandas-dev/pandas/issues/21127

    test_pct_change(ray_groupby, pandas_groupby)
    for applied in (lambda df: -df, lambda df: df.sum(axis=1)):
        test_apply(ray_groupby, pandas_groupby, applied)

    for check in (
        test_first,
        test_backfill,
        test_bfill,
        test_prod,
        test_std,
        test_last,
        test_mad,
        test_max,
        test_var,
        test_len,
        test_sum,
    ):
        check(ray_groupby, pandas_groupby)

    # The ngroup and nunique helpers are not run here: Pandas fails on this
    # case with ValueError.
    test_median(ray_groupby, pandas_groupby)
    test_cov(ray_groupby, pandas_groupby)

    for transformation in (lambda df: df + 4, lambda df: -df - 10):
        test_transform(ray_groupby, pandas_groupby, transformation)

    for piped in (lambda dfgb: dfgb.sum(),):
        test_pipe(ray_groupby, pandas_groupby, piped)

    for check in (
        test_corr,
        test_fillna,
        test_count,
        test_take,
        test___getattr__,
        test_groups,
    ):
        check(ray_groupby, pandas_groupby)