예제 #1
0
    def _update_factory(cls, _):
        """
        Refresh the stored factory so that it matches the current Modin config.

        The factory is looked up in ``factories`` under the name
        ``get_current_backend() + "Factory"`` and, when found, its ``prepare()``
        method is called.

        Parameters
        ----------
        _ : object
            Unused. Kept for compatibility purposes only; it does not
            affect the result.

        Raises
        ------
        FactoryNotFoundError
            If ``factories`` has no attribute with the computed name and
            ``IsExperimental.get()`` is falsy.
        """
        factory_name = get_current_backend() + "Factory"
        try:
            cls.__factory = getattr(factories, factory_name)
        except AttributeError:
            if IsExperimental.get():
                # Missing factories are allowed in experimental mode only:
                # store whatever StubFactory returns for the failing name.
                cls.__factory = StubFactory.set_failing_name(factory_name)
            else:
                if hasattr(factories, "Experimental" + factory_name):
                    # An experimental counterpart exists, so point the user to it.
                    template = (
                        "{0} on {1} is only accessible through the experimental API.\nRun "
                        "`import modin.experimental.pandas as pd` to use {0} on {1}."
                    )
                else:
                    template = (
                        "Cannot find a factory for partition '{}' and execution engine '{}'. "
                        "Potential reason might be incorrect environment variable value for "
                        f"{Backend.varname} or {Engine.varname}"
                    )
                raise FactoryNotFoundError(
                    template.format(Backend.get(), Engine.get())
                )
        else:
            # Only a factory that was actually found gets prepared.
            cls.__factory.prepare()
예제 #2
0
파일: conftest.py 프로젝트: Sdoof/modin
def pytest_runtest_call(item):
    """
    Attach backend-conditional ``xfail``/``skip`` markers to a test item.

    For every ``xfail_backends`` / ``skip_backends`` marker found on `item`,
    a matching ``pytest.mark.xfail`` / ``pytest.mark.skip`` marker is added.
    Its ``condition`` is whether the current backend is one of the backends
    listed in the custom marker's first positional argument.

    Parameters
    ----------
    item : object
        The test item being run; it must provide ``iter_markers`` and
        ``add_marker``.
    """
    custom_markers = ("xfail", "skip")

    # dynamically adding custom markers to tests
    # NOTE(review): `pytest.mark.skip` is given a `condition` keyword here and
    # is attached at call time; confirm the installed pytest honors that for
    # `skip` (as opposed to `skipif`).
    for custom_marker in custom_markers:
        for marker in item.iter_markers(name=f"{custom_marker}_backends"):
            backends = marker.args[0]
            if not isinstance(backends, list):
                # A single backend name is accepted as shorthand for a one-item list.
                backends = [backends]

            current_backend = get_current_backend()

            # Pop "reason" from a copy. Popping from `marker.kwargs` itself
            # would strip the reason from the marker object, so any later
            # look at the same marker (for example from another item that
            # shares it) would silently lose its reason.
            extra_kwargs = dict(marker.kwargs)
            reason = extra_kwargs.pop("reason", "")

            item.add_marker(
                getattr(pytest.mark, custom_marker)(
                    condition=current_backend in backends,
                    reason=f"Backend {current_backend} does not pass this test. {reason}",
                    **extra_kwargs,
                )
            )
예제 #3
0
 def _update_engine(cls, _):
     """
     Look up the factory for the current backend and store it as the engine.

     The name ``get_current_backend() + "Factory"`` is resolved on
     ``factories``; a factory that is found is stored and its ``prepare()``
     method is called.

     Parameters
     ----------
     _ : object
         Unused by this method.

     Raises
     ------
     FactoryNotFoundError
         If ``factories`` has no attribute with the computed name and
         ``IsExperimental.get()`` is falsy.
     """
     factory_name = get_current_backend() + "Factory"
     try:
         cls.__engine = getattr(factories, factory_name)
     except AttributeError:
         if IsExperimental.get():
             # Missing factories are allowed in experimental mode only:
             # store whatever StubFactory returns for the failing name.
             cls.__engine = StubFactory.set_failing_name(factory_name)
         else:
             if hasattr(factories, "Experimental" + factory_name):
                 # An experimental counterpart exists, so point the user to it.
                 template = (
                     "{0} on {1} is only accessible through the experimental API.\nRun "
                     "`import modin.experimental.pandas as pd` to use {0} on {1}."
                 )
             else:
                 template = (
                     "Cannot find a factory for partition '{}' and execution engine '{}'. "
                     "Potential reason might be incorrect environment variable value for "
                     f"{Backend.varname} or {Engine.varname}"
                 )
             raise FactoryNotFoundError(
                 template.format(Backend.get(), Engine.get())
             )
     else:
         # Only a factory that was actually found gets prepared.
         cls.__engine.prepare()
예제 #4
0
def test_copy(data):
    """
    Check that copying a Modin DataFrame yields a distinct but equal frame.

    The default copy must be a different object that compares equal to the
    source. A shallow copy (``deep=False``) must still compare equal to the
    source after a column of the source is overwritten.

    Parameters
    ----------
    data : object
        Input passed to both the Modin and the pandas ``DataFrame``
        constructors.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841

    # pandas_df is unused but there so there won't be confusing list comprehension
    # stuff in the pytest.mark.parametrize
    new_modin_df = modin_df.copy()

    assert new_modin_df is not modin_df
    if get_current_backend() != "BaseOnPython":
        # The partitions comparison is skipped for the "BaseOnPython" backend.
        assert np.array_equal(
            new_modin_df._query_compiler._modin_frame._partitions,
            modin_df._query_compiler._modin_frame._partitions,
        )
    df_equals(new_modin_df, modin_df)

    # Shallow copy tests
    modin_df = pd.DataFrame(data)
    modin_df_cp = modin_df.copy(deep=False)

    # Overwrite the first column of the source; the shallow copy is expected
    # to still compare equal to it.
    modin_df[modin_df.columns[0]] = 0
    df_equals(modin_df, modin_df_cp)
예제 #5
0
    md_df.index = index
    md_df.columns = columns

    pd_df = md_df._to_pandas()

    for axis in [0, 1]:
        assert md_df.axes[axis].equals(
            pd_df.axes[axis]
        ), f"Indices at axis {axis} are different!"
        assert md_df.axes[axis].equal_levels(
            pd_df.axes[axis]
        ), f"Levels of indices at axis {axis} are different!"


@pytest.mark.skipif(
    get_current_backend() != "BaseOnPython",
    reason="This test make sense only on BaseOnPython backend.",
)
@pytest.mark.parametrize(
    "func, regex",
    [
        (lambda df: df.mean(level=0), r"DataFrame\.mean"),
        (lambda df: df + df, r"DataFrame\.add"),
        (lambda df: df.index, r"DataFrame\.get_axis\(0\)"),
        (
            lambda df: df.drop(columns="col1").squeeze().repeat(2),
            r"Series\.repeat",
        ),
        (lambda df: df.groupby("col1").prod(), r"GroupBy\.prod"),
        (lambda df: df.rolling(1).count(), r"Rolling\.count"),
    ],
예제 #6
0
def test_append(data):
    """
    Verify that ``DataFrame.append`` in Modin mirrors pandas.

    Several kinds of appended objects are tried: a dict, a row, a plain list,
    a list of frames and a single frame. For each one, either both libraries
    produce equal results or Modin raises the same exception type that
    pandas raised.

    Parameters
    ----------
    data : object
        Input passed to both the Modin and the pandas ``DataFrame``
        constructors.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    def check_append(make_other, adjust=None, **kwargs):
        # `make_other(df)` builds the object to append from the given frame.
        # It is called inside the try / pytest.raises scopes so that errors
        # raised while building the argument are treated like append errors.
        try:
            expected = pandas_df.append(make_other(pandas_df), **kwargs)
        except Exception as err:
            with pytest.raises(type(err)):
                modin_df.append(make_other(modin_df), **kwargs)
        else:
            actual = modin_df.append(make_other(modin_df), **kwargs)
            if adjust is not None:
                actual = adjust(actual)
            df_equals(actual, expected)

    def reorder_like_pandas(result):
        # Pandas has bug where sort=False is ignored
        # (https://github.com/pandas-dev/pandas/issues/35092), but Modin
        # now does the right thing, so for now manually sort to workaround
        # this. Once the Pandas bug is fixed and Modin upgrades to that
        # Pandas release, this sort will cause the test to fail, and this
        # workaround should be deleted.
        if get_current_backend() != "BaseOnPython":
            assert list(result.columns) == list(modin_df.columns) + [0]
            result = result[[0] + sorted(modin_df.columns)]
        return result

    data_to_append = {"append_a": 2, "append_b": 1000}

    # Appending a dict, with and without ignoring the index.
    for ignore in (True, False):
        check_append(lambda _df: data_to_append, ignore_index=ignore)

    # Appending the last row of the frame itself.
    check_append(lambda df: df.iloc[-1])

    # Appending the last row converted to a plain list.
    check_append(lambda df: list(df.iloc[-1]), adjust=reorder_like_pandas)

    # Appending a list of frames, then a single frame, for each
    # `verify_integrity` setting.
    for verify_integrity in (True, False):
        check_append(lambda df: [df, df], verify_integrity=verify_integrity)
        check_append(lambda df: df, verify_integrity=verify_integrity)
예제 #7
0
def test_simple_row_groupby(by, as_index, col1_category):
    """
    Compare many row-wise groupby operations between Modin and pandas.

    A small five-column frame containing NaNs is grouped by `by` in both
    libraries, and the resulting groupby objects are passed pairwise to the
    ``eval_*`` helpers.

    Parameters
    ----------
    by : object
        Grouping key. When it is a list, ``GetColumn`` entries are resolved
        against the frame being grouped.
    as_index : bool
        Forwarded to ``groupby(as_index=...)``; also gates the std/var/skew checks.
    col1_category : bool
        If true, "col1" is cast to the "category" dtype before grouping.

    Notes
    -----
    The parameters are presumably supplied by pytest parametrization that is
    not visible in this snippet. The order of the checks is kept as is: the
    pandas groupby object is deliberately recreated midway (see the
    workaround comment below).
    """
    pandas_df = pandas.DataFrame({
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, np.NaN, 7],
        "col3": [np.NaN, np.NaN, 12, 10],
        "col4": [17, 13, 16, 15],
        "col5": [-4, -5, -6, -7],
    })

    if col1_category:
        pandas_df = pandas_df.astype({"col1": "category"})

    modin_df = from_pandas(pandas_df)
    # Row count used by the head()/tail() checks below.
    n = 1

    # For a list `by`, replace GetColumn entries with the result of calling
    # them on `df`; any other `by` is returned unchanged.
    def maybe_get_columns(df, by):
        if isinstance(by, list):
            return [o(df) if isinstance(o, GetColumn) else o for o in by]
        else:
            return by

    modin_groupby = modin_df.groupby(by=maybe_get_columns(modin_df, by),
                                     as_index=as_index)

    # The pandas side gets `by` passed through try_cast_to_pandas first.
    pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by))
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_shift(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.ffill(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmax(),
                 is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs so we run them only when "by" columns contain no NaNs
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cumsum(axis=0))
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cummax(axis=0))
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cummin(axis=0))
        eval_general(modin_groupby, pandas_groupby,
                     lambda df: df.cumprod(axis=0))

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)
    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)

    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.first(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.backfill(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.bfill(),
                 is_default=True)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.idxmin(),
                 is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    # std/var/skew are only compared when the group keys form the index.
    if as_index:
        eval_std(modin_groupby, pandas_groupby)
        eval_var(modin_groupby, pandas_groupby)
        eval_skew(modin_groupby, pandas_groupby)

    # Each name is checked through both eval_agg and eval_aggregate.
    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)

    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.last(),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.rank())
    eval_max(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.nunique())
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.head(n),
                 is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093.
        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            # NOTE(review): the lambda closes over the loop variable `func`;
            # this is fine only if eval_general calls it before the next
            # iteration - presumably it does, confirm against eval_general.
            eval_general(
                modin_groupby,
                pandas_groupby,
                lambda df: df.transform(func),
                check_exception_type=None,
            )

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    # The size() comparison is skipped for the "BaseOnPython" backend.
    if get_current_backend() != "BaseOnPython":
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.size(),
            check_exception_type=None,
        )
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.tail(n),
                 is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby,
                 pandas_groupby,
                 lambda df: df.take(),
                 is_default=True)
    if isinstance(by, list) and not any(
            isinstance(o, (pd.Series, pandas.Series)) for o in by):
        # Not yet supported for non-original-column-from-dataframe Series in by:
        eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)