def _update_factory(cls, _):
    """
    Re-resolve the active factory from the current Modin configuration.

    Parameters
    ----------
    _ : object
        Ignored; accepted only so this method can serve as a
        config-subscription callback.

    Raises
    ------
    FactoryNotFoundError
        If no factory matches the configured backend/engine pair and
        experimental mode is off.
    """
    factory_name = get_current_backend() + "Factory"
    try:
        new_factory = getattr(factories, factory_name)
    except AttributeError:
        if IsExperimental.get():
            # Experimental mode tolerates a missing factory: install a stub
            # that reports the unresolved name on first use instead of
            # failing right away.
            cls.__factory = StubFactory.set_failing_name(factory_name)
            return
        # Pick the most helpful message: either the factory exists but only
        # under the experimental namespace, or it does not exist at all.
        if hasattr(factories, "Experimental" + factory_name):
            msg = (
                "{0} on {1} is only accessible through the experimental API.\nRun "
                "`import modin.experimental.pandas as pd` to use {0} on {1}."
            )
        else:
            msg = (
                "Cannot find a factory for partition '{}' and execution engine '{}'. "
                "Potential reason might be incorrect environment variable value for "
                f"{Backend.varname} or {Engine.varname}"
            )
        raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
    # Lookup succeeded: activate the factory and let it initialize itself.
    cls.__factory = new_factory
    cls.__factory.prepare()
def pytest_runtest_call(item):
    """
    Pytest hook: translate ``xfail_backends``/``skip_backends`` custom
    markers on *item* into the corresponding built-in pytest marks.
    """
    # Dynamically add the built-in marker matching each custom one found.
    for kind in ("xfail", "skip"):
        for mark in item.iter_markers(name=f"{kind}_backends"):
            target = mark.args[0]
            # The marker accepts either one backend name or a list of them.
            affected = target if isinstance(target, list) else [target]

            backend = get_current_backend()
            extra_reason = mark.kwargs.pop("reason", "")

            builtin_mark = getattr(pytest.mark, kind)(
                condition=backend in affected,
                reason=f"Backend {backend} does not pass this test. {extra_reason}",
                **mark.kwargs,
            )
            item.add_marker(builtin_mark)
def _update_engine(cls, _):
    """
    Re-resolve the engine factory from the current Modin configuration.

    Parameters
    ----------
    _ : object
        Unused; present only so this method can be registered as a
        config-change callback.

    Raises
    ------
    FactoryNotFoundError
        If no factory matches the configured backend/engine pair and
        experimental mode is off.
    """
    factory_name = get_current_backend() + "Factory"
    try:
        engine_factory = getattr(factories, factory_name)
    except AttributeError:
        if IsExperimental.get():
            # Missing factories are tolerated in experimental mode only:
            # install a stub that raises on first actual use.
            cls.__engine = StubFactory.set_failing_name(factory_name)
            return
        # Choose the message depending on whether an experimental variant
        # of the requested factory is available.
        if hasattr(factories, "Experimental" + factory_name):
            msg = (
                "{0} on {1} is only accessible through the experimental API.\nRun "
                "`import modin.experimental.pandas as pd` to use {0} on {1}."
            )
        else:
            msg = (
                "Cannot find a factory for partition '{}' and execution engine '{}'. "
                "Potential reason might be incorrect environment variable value for "
                f"{Backend.varname} or {Engine.varname}"
            )
        raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
    # Resolution succeeded: switch over and initialize the new engine.
    cls.__engine = engine_factory
    cls.__engine.prepare()
def test_copy(data):
    """
    Check ``DataFrame.copy`` semantics against pandas.

    Deep copy must yield a distinct object (which may still share immutable
    partitions with the source); shallow copy must reflect later mutations
    of the original.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841
    # pandas_df is unused but there so there won't be confusing list comprehension
    # stuff in the pytest.mark.parametrize

    new_modin_df = modin_df.copy()
    # Fix: the original asserted `new_modin_df is not modin_df` twice;
    # the redundant duplicate was removed.
    assert new_modin_df is not modin_df
    if get_current_backend() != "BaseOnPython":
        # A deep copy may still share the underlying partition objects —
        # only the wrapping DataFrame must be new.
        assert np.array_equal(
            new_modin_df._query_compiler._modin_frame._partitions,
            modin_df._query_compiler._modin_frame._partitions,
        )
    df_equals(new_modin_df, modin_df)

    # Shallow copy tests
    modin_df = pd.DataFrame(data)
    modin_df_cp = modin_df.copy(False)

    # Mutating the original must be visible through the shallow copy.
    modin_df[modin_df.columns[0]] = 0
    df_equals(modin_df, modin_df_cp)
md_df.index = index md_df.columns = columns pd_df = md_df._to_pandas() for axis in [0, 1]: assert md_df.axes[axis].equals( pd_df.axes[axis] ), f"Indices at axis {axis} are different!" assert md_df.axes[axis].equal_levels( pd_df.axes[axis] ), f"Levels of indices at axis {axis} are different!" @pytest.mark.skipif( get_current_backend() != "BaseOnPython", reason="This test make sense only on BaseOnPython backend.", ) @pytest.mark.parametrize( "func, regex", [ (lambda df: df.mean(level=0), r"DataFrame\.mean"), (lambda df: df + df, r"DataFrame\.add"), (lambda df: df.index, r"DataFrame\.get_axis\(0\)"), ( lambda df: df.drop(columns="col1").squeeze().repeat(2), r"Series\.repeat", ), (lambda df: df.groupby("col1").prod(), r"GroupBy\.prod"), (lambda df: df.rolling(1).count(), r"Rolling\.count"), ],
def test_append(data):
    """
    Compare ``DataFrame.append`` against pandas for dict, row, list and
    DataFrame-list inputs, including the ``ignore_index`` and
    ``verify_integrity`` options.

    Each case follows the suite's try/except/else pattern: if pandas raises,
    Modin must raise the same exception type; otherwise results must match.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    data_to_append = {"append_a": 2, "append_b": 1000}

    ignore_idx_values = [True, False]

    # Appending a plain dict, with and without index reset.
    for ignore in ignore_idx_values:
        try:
            pandas_result = pandas_df.append(data_to_append, ignore_index=ignore)
        except Exception as e:
            with pytest.raises(type(e)):
                modin_df.append(data_to_append, ignore_index=ignore)
        else:
            modin_result = modin_df.append(data_to_append, ignore_index=ignore)
            df_equals(modin_result, pandas_result)

    # Appending the last row as a Series.
    try:
        pandas_result = pandas_df.append(pandas_df.iloc[-1])
    except Exception as e:
        with pytest.raises(type(e)):
            modin_df.append(modin_df.iloc[-1])
    else:
        modin_result = modin_df.append(modin_df.iloc[-1])
        df_equals(modin_result, pandas_result)

    # Appending the last row as a bare list (columns become positional).
    try:
        pandas_result = pandas_df.append(list(pandas_df.iloc[-1]))
    except Exception as e:
        with pytest.raises(type(e)):
            modin_df.append(list(modin_df.iloc[-1]))
    else:
        modin_result = modin_df.append(list(modin_df.iloc[-1]))
        # Pandas has bug where sort=False is ignored
        # (https://github.com/pandas-dev/pandas/issues/35092), but Modin
        # now does the right thing, so for now manually sort to workaround
        # this. Once the Pandas bug is fixed and Modin upgrades to that
        # Pandas release, this sort will cause the test to fail, and the
        # next three lines should be deleted.
        if get_current_backend() != "BaseOnPython":
            assert list(modin_result.columns) == list(modin_df.columns) + [0]
            modin_result = modin_result[[0] + sorted(modin_df.columns)]
        df_equals(modin_result, pandas_result)

    verify_integrity_values = [True, False]

    # Appending a list of DataFrames and a single DataFrame; with
    # verify_integrity=True the duplicated index is expected to raise.
    for verify_integrity in verify_integrity_values:
        try:
            pandas_result = pandas_df.append(
                [pandas_df, pandas_df], verify_integrity=verify_integrity
            )
        except Exception as e:
            with pytest.raises(type(e)):
                modin_df.append(
                    [modin_df, modin_df], verify_integrity=verify_integrity
                )
        else:
            modin_result = modin_df.append(
                [modin_df, modin_df], verify_integrity=verify_integrity
            )
            df_equals(modin_result, pandas_result)

        try:
            pandas_result = pandas_df.append(
                pandas_df, verify_integrity=verify_integrity
            )
        except Exception as e:
            with pytest.raises(type(e)):
                modin_df.append(modin_df, verify_integrity=verify_integrity)
        else:
            modin_result = modin_df.append(
                modin_df, verify_integrity=verify_integrity
            )
            df_equals(modin_result, pandas_result)
def test_simple_row_groupby(by, as_index, col1_category):
    """
    Exercise a broad set of groupby operations on a small fixed frame and
    compare Modin results with pandas via the suite's ``eval_*`` helpers.

    Parameters (supplied by pytest parametrization — TODO confirm at the
    decorators, which are outside this view):
    ``by`` — grouping key(s), possibly containing ``GetColumn`` wrappers;
    ``as_index`` — forwarded to ``groupby``;
    ``col1_category`` — when True, "col1" is cast to a categorical dtype.
    """
    pandas_df = pandas.DataFrame(
        {
            "col1": [0, 1, 2, 3],
            "col2": [4, 5, np.NaN, 7],
            "col3": [np.NaN, np.NaN, 12, 10],
            "col4": [17, 13, 16, 15],
            "col5": [-4, -5, -6, -7],
        }
    )

    if col1_category:
        pandas_df = pandas_df.astype({"col1": "category"})

    modin_df = from_pandas(pandas_df)
    # Row count used for head/tail checks below.
    n = 1

    def maybe_get_columns(df, by):
        # Resolve GetColumn placeholders in a list-like "by" against the
        # concrete frame; anything else passes through unchanged.
        if isinstance(by, list):
            return [o(df) if isinstance(o, GetColumn) else o for o in by]
        else:
            return by

    modin_groupby = modin_df.groupby(
        by=maybe_get_columns(modin_df, by), as_index=as_index
    )

    pandas_by = maybe_get_columns(pandas_df, try_cast_to_pandas(by))
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    modin_groupby_equals_pandas(modin_groupby, pandas_groupby)
    eval_ngroups(modin_groupby, pandas_groupby)
    eval_shift(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.ffill(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.sem(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_mean(modin_groupby, pandas_groupby)
    eval_any(modin_groupby, pandas_groupby)
    eval_min(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmax(), is_default=True)
    eval_ndim(modin_groupby, pandas_groupby)
    if not check_df_columns_have_nans(modin_df, by):
        # cum* functions produce undefined results for columns with NaNs so we run them only when "by" columns contain no NaNs
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumsum(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummax(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cummin(axis=0))
        eval_general(modin_groupby, pandas_groupby, lambda df: df.cumprod(axis=0))

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.pct_change(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )

    # Workaround for Pandas bug #34656. Recreate groupby object for Pandas
    pandas_groupby = pandas_df.groupby(by=pandas_by, as_index=as_index)

    apply_functions = [lambda df: df.sum(), min]
    for func in apply_functions:
        eval_apply(modin_groupby, pandas_groupby, func)

    eval_dtypes(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.first(), is_default=True)
    eval_general(
        modin_groupby, pandas_groupby, lambda df: df.backfill(), is_default=True
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.bfill(), is_default=True)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.idxmin(), is_default=True)
    eval_prod(modin_groupby, pandas_groupby)
    # std/var/skew are only checked in as_index mode — presumably their
    # non-as_index layout differs; verify against helper definitions.
    if as_index:
        eval_std(modin_groupby, pandas_groupby)
        eval_var(modin_groupby, pandas_groupby)
        eval_skew(modin_groupby, pandas_groupby)

    agg_functions = ["min", "max"]
    for func in agg_functions:
        eval_agg(modin_groupby, pandas_groupby, func)
        eval_aggregate(modin_groupby, pandas_groupby, func)

    eval_general(modin_groupby, pandas_groupby, lambda df: df.last(), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.mad(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.rank())
    eval_max(modin_groupby, pandas_groupby)
    eval_len(modin_groupby, pandas_groupby)
    eval_sum(modin_groupby, pandas_groupby)
    eval_ngroup(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.nunique())
    eval_median(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.head(n), is_default=True)
    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.cov(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    if not check_df_columns_have_nans(modin_df, by):
        # Pandas groupby.transform does not work correctly with NaN values in grouping columns. See Pandas bug 17093.
        transform_functions = [lambda df: df + 4, lambda df: -df - 10]
        for func in transform_functions:
            eval_general(
                modin_groupby,
                pandas_groupby,
                lambda df: df.transform(func),
                check_exception_type=None,
            )

    pipe_functions = [lambda dfgb: dfgb.sum()]
    for func in pipe_functions:
        eval_pipe(modin_groupby, pandas_groupby, func)

    eval_general(
        modin_groupby,
        pandas_groupby,
        lambda df: df.corr(),
        modin_df_almost_equals_pandas,
        is_default=True,
    )
    eval_fillna(modin_groupby, pandas_groupby)
    eval_count(modin_groupby, pandas_groupby)
    # size() is skipped on BaseOnPython — NOTE(review): reason not visible
    # here; confirm against backend capabilities.
    if get_current_backend() != "BaseOnPython":
        eval_general(
            modin_groupby,
            pandas_groupby,
            lambda df: df.size(),
            check_exception_type=None,
        )
    eval_general(modin_groupby, pandas_groupby, lambda df: df.tail(n), is_default=True)
    eval_quantile(modin_groupby, pandas_groupby)
    eval_general(modin_groupby, pandas_groupby, lambda df: df.take(), is_default=True)
    if isinstance(by, list) and not any(
        isinstance(o, (pd.Series, pandas.Series)) for o in by
    ):
        # Not yet supported for non-original-column-from-dataframe Series in by:
        eval___getattr__(modin_groupby, pandas_groupby, "col3")
    eval_groups(modin_groupby, pandas_groupby)