def test_hasattr_sparse(is_sparse_data):
    """Check that the `sparse` accessor presence matches between Modin and pandas."""
    if is_sparse_data:
        source = pandas.arrays.SparseArray(test_data["float_nan_data"].values())
    else:
        source = test_data["float_nan_data"]
    modin_df, pandas_df = create_test_dfs(source)
    eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))
def test_duplicate_indexes():
    """Binary op between a frame with duplicated index labels and one with a default index."""
    values = [0, 1, 2, 3, 4, 5]
    # First pair: index contains repeated labels.
    modin_df1, pandas_df1 = create_test_dfs(
        {"a": values, "b": values}, index=[0, 1, 2, 0, 1, 2]
    )
    # Second pair: default RangeIndex.
    modin_df2, pandas_df2 = create_test_dfs({"a": values, "b": values})
    df_equals(modin_df1 / modin_df2, pandas_df1 / pandas_df2)
def test_cov(min_periods, ddof):
    """`DataFrame.cov` on integer data and on float data containing NaNs."""
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.cov(min_periods=min_periods, ddof=ddof),
    )
    # The Modin result may differ slightly from pandas due to
    # floating-point arithmetic, so compare approximately.
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: df.cov(min_periods=min_periods),
        comparator=modin_df_almost_equals_pandas,
    )
def test_reduction_specific(fn, numeric_only, axis):
    """Reductions over mixed-dtype data with the `numeric_only` flag."""
    if fn == "mean" and axis == 1:
        pytest.skip("See issue #2313 for details")
    eval_general(
        *create_test_dfs(test_data_diff_dtype),
        lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),
    )
def test_unstack(data, is_multi_idx, is_multi_col):
    """`DataFrame.unstack` with plain and MultiIndex rows/columns."""
    modin_df, pandas_df = create_test_dfs(data)

    if is_multi_idx:
        new_index = generate_multiindex(len(pandas_df), nlevels=4, is_tree_like=True)
    else:
        new_index = pandas_df.index
    if is_multi_col:
        new_columns = generate_multiindex(
            len(pandas_df.columns), nlevels=3, is_tree_like=True
        )
    else:
        new_columns = pandas_df.columns

    pandas_df.columns = modin_df.columns = new_columns
    pandas_df.index = modin_df.index = new_index

    df_equals(modin_df.unstack(), pandas_df.unstack())
    df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1))
    if is_multi_idx:
        # Unstacking multiple levels at once only applies to the MultiIndex case.
        for levels in ([0, 1], [0, 1, 2], [0, 1, 2, 3]):
            df_equals(
                modin_df.unstack(level=levels), pandas_df.unstack(level=levels)
            )
def test_apply_udf(data, func):
    """`DataFrame.apply` with a user-defined function forwarded through eval_general."""
    eval_general(
        *create_test_dfs(data),
        lambda df, *fn_args, **fn_kwargs: df.apply(*fn_args, **fn_kwargs),
        func=func,
        other=lambda df: df,
    )
def test_loc_series():
    """Conditional `.loc` assignment from a computed Series, Modin vs. pandas."""
    md_df, pd_df = create_test_dfs({"a": [1, 2], "b": [3, 4]})

    # Overwrite "b" with its natural log wherever "a" exceeds 1.
    pd_df.loc[pd_df["a"] > 1, "b"] = np.log(pd_df["b"])
    md_df.loc[md_df["a"] > 1, "b"] = np.log(md_df["b"])

    df_equals(pd_df, md_df)
def test_prod_specific(min_count, numeric_only):
    """`DataFrame.prod` over mixed dtypes with `min_count` and `numeric_only`."""
    if numeric_only and min_count == 5:
        pytest.xfail("see #1953 for details")
    eval_general(
        *create_test_dfs(test_data_diff_dtype),
        lambda df: df.prod(min_count=min_count, numeric_only=numeric_only),
    )
def test_prod(
    data,
    axis,
    skipna,
    is_transposed,
    method,
):
    """`prod`/`product` on plain and transposed frames, plus level-wise prod."""
    eval_general(
        *create_test_dfs(data),
        lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)(
            axis=axis,
            skipna=skipna,
        ),
    )

    # Regression test for issue #1953: prod over level 0 of a string MultiIndex.
    index_arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
    rows = [[1, 2, 3, 4]] * 4
    modin_df = pd.DataFrame(rows, index=index_arrays)
    pandas_df = pandas.DataFrame(rows, index=index_arrays)
    df_equals(modin_df.prod(level=0), pandas_df.prod(level=0))
def test_min_max_mean(data, axis, skipna, numeric_only, is_transposed, method):
    """`min`/`max`/`mean` with optional transposition."""
    eval_general(
        *create_test_dfs(data),
        lambda df: getattr(df.T if is_transposed else df, method)(
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
        ),
    )
def test_all_any(data, axis, skipna, is_transposed, method):
    """`all`/`any` with optional transposition and `bool_only=None`."""
    eval_general(
        *create_test_dfs(data),
        lambda df: getattr(df.T if is_transposed else df, method)(
            axis=axis,
            skipna=skipna,
            bool_only=None,
        ),
    )
def test_idxmin_idxmax(data, axis, skipna, is_transposed, method):
    """`idxmin`/`idxmax` with optional transposition."""
    eval_general(
        *create_test_dfs(data),
        lambda df: getattr(df.T if is_transposed else df, method)(
            axis=axis,
            skipna=skipna,
        ),
    )
def test_describe_specific(exclude, include):
    """`describe` with include/exclude filters; the string column is dropped first."""
    eval_general(
        *create_test_dfs(test_data_diff_dtype),
        lambda df: df.drop("str_col", axis=1).describe(
            include=include, exclude=exclude
        ),
    )
def test___setitem__partitions_aligning():
    """Column assignment must realign partitions (regression for #2390 and #2442)."""
    # Issue #2390: the assigned Series is longer than the frame.
    modin_df = pd.DataFrame({"a": [1, 2, 3]})
    pandas_df = pandas.DataFrame({"a": [1, 2, 3]})
    modin_df["b"] = pd.Series([4, 5, 6, 7, 8])
    pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8])
    df_equals(modin_df, pandas_df)

    # Issue #2442: the index contains a duplicated timestamp.
    data = {"a": [1, 2, 3, 4]}
    index = pandas.to_datetime(
        ["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"]
    )
    md_df, pd_df = create_test_dfs(data, index=index)

    # Assigning a brand-new column.
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)

    # Re-assigning the column that was just added.
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)

    # Overwriting one of the original columns.
    pd_df["a"] = pandas.Series(np.arange(4))
    md_df["a"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)
def test_comparison_except(data, op, other):
    # `eq` and `ne` are excluded from this test so far: pandas raises an
    # exception for them while Modin does not, which trips the comparison.
    with pytest.raises(AssertionError):
        eval_general(
            *create_test_dfs(data),
            lambda df: getattr(df, op)(other),
        )
def test_loc_assignment(index, columns):
    """Cell-by-cell assignment through chained `.loc[row][col]` indexing."""
    md_df, pd_df = create_test_dfs(index=index, columns=columns)
    for row_pos, row_label in enumerate(index):
        for col_pos, col_label in enumerate(columns):
            # Encode the cell position as its value, e.g. row 1 / col 2 -> 12.
            cell_value = int(str(row_pos) + str(col_pos))
            md_df.loc[row_label][col_label] = cell_value
            pd_df.loc[row_label][col_label] = cell_value
    df_equals(md_df, pd_df)
def test_melt(data, id_vars, value_vars):
    """`melt`; output is sorted and reindexed so the comparison is order-stable."""
    eval_general(
        *create_test_dfs(data),
        lambda df, *args, **kwargs: (
            df.melt(*args, **kwargs)
            .sort_values(["variable", "value"])
            .reset_index(drop=True)
        ),
        id_vars=id_vars,
        value_vars=value_vars,
    )
def test_kurt_kurtosis(axis, skipna, numeric_only, method):
    """`kurt`/`kurtosis` over float data containing NaNs."""
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: getattr(df, method)(
            axis=axis, skipna=skipna, numeric_only=numeric_only
        ),
    )
def loc_iter_dfs():
    """Build a small pair of frames (3x3) for `.loc` iteration tests.

    Each column is filled with its own positional index.
    """
    columns = ["col1", "col2", "col3"]
    index = ["row1", "row2", "row3"]
    data = {}
    for position, name in enumerate(columns):
        data[name] = [position] * len(index)
    return create_test_dfs(data, columns=columns, index=index)
def test_describe_dtypes():
    """`describe` on a frame mixing string and integer columns."""
    data = {name: list("abc") for name in ("col1", "col2", "col3")}
    data["col4"] = [1, 2, 3]
    eval_general(*create_test_dfs(data), lambda df: df.describe())
def test_aligning_partitions():
    """Repr after appending a sliced frame to itself and adding a column must not fail."""
    values = [0, 1, 2, 3, 4, 5]
    modin_df1, _ = create_test_dfs({"a": values, "b": values})
    sliced = modin_df1.loc[:2]
    combined = sliced.append(sliced)
    combined["c"] = modin_df1["b"]
    # Materializing the repr exercises the partition alignment path.
    repr(combined)
def test_pivot_table_dropna(data):
    """`pivot_table` with `dropna=False`; index/columns/values derive from the frame."""
    eval_general(
        *create_test_dfs(data),
        operation=lambda df, *pt_args, **pt_kwargs: df.pivot_table(
            *pt_args, **pt_kwargs
        ),
        index=lambda df: df.columns[0],
        columns=lambda df: df.columns[1],
        values=lambda df: df.columns[-1],
        dropna=False,
    )
def test_pivot(data, index, columns, values):
    """`DataFrame.pivot`; exception types are not compared between the libraries."""
    eval_general(
        *create_test_dfs(data),
        lambda df, *p_args, **p_kwargs: df.pivot(*p_args, **p_kwargs),
        index=index,
        columns=columns,
        values=values,
        check_exception_type=None,
    )
def test_resample_getitem(columns):
    """Column selection on a resampler followed by `mean`."""
    ts_index = pandas.date_range("1/1/2013", periods=9, freq="T")
    data = {
        "price": range(9),
        "volume": range(10, 19),
    }
    eval_general(
        *create_test_dfs(data, index=ts_index),
        lambda df: df.resample("3T")[columns].mean(),
    )
def test_multiindex_from_frame(data, sortorder):
    """`MultiIndex.from_frame` dispatched to the library matching the frame type."""
    modin_df, pandas_df = create_test_dfs(data)

    def call_from_frame(df):
        # Use the implementation from whichever library the frame belongs to.
        if type(df).__module__.startswith("pandas"):
            return pandas.MultiIndex.from_frame(df, sortorder)
        return pd.MultiIndex.from_frame(df, sortorder)

    eval_general(modin_df, pandas_df, call_from_frame, comparator=assert_index_equal)
def test_agg_dict():
    """`agg` with a dict of functions and with named-aggregation kwargs."""
    md_df, pd_df = create_test_dfs(test_data_values[0])
    first_col, last_col = pd_df.columns[0], pd_df.columns[-1]

    agg_dict = {first_col: "sum", last_col: ("sum", "count")}
    eval_general(md_df, pd_df, lambda df: df.agg(agg_dict), raising_exceptions=True)

    named_aggs = {
        "new_col1": (first_col, "sum"),
        "new_col2": (last_col, "count"),
    }
    eval_general(
        md_df, pd_df, lambda df: df.agg(**named_aggs), raising_exceptions=True
    )
def test_value_counts_categorical():
    # Regression test for issue #3571: value_counts on categorical columns.
    data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
    # Fixed seed keeps the shuffle deterministic across runs.
    np.random.RandomState(seed=42).shuffle(data)
    eval_general(
        *create_test_dfs({"col1": data, "col2": data}, dtype="category"),
        lambda df: df.value_counts(),
    )
def test_agg_apply_axis_names(axis, func, op):
    # An AssertionError can be raised when Modin's and pandas' index/columns
    # mismatch; see pandas issue 36189 for details. Ignore that case.
    try:
        eval_general(
            *create_test_dfs(test_data["int_data"]),
            lambda df: getattr(df, op)(func, axis),
        )
    except AssertionError:
        pass
def test_apply_args(axis, args):
    """`apply` forwarding positional `args` to the applied function."""

    def apply_func(series, y):
        # Numeric rows add `y` directly; non-addable rows fall back to
        # string concatenation.
        try:
            return series + y
        except TypeError:
            return series.map(str) + str(y)

    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.apply(apply_func, axis=axis, args=args),
    )
def test_explode_all_partitions(column, ignore_index):
    """`explode` with enough rows to fill every partition.

    Every input row holds a two-element list, so explode doubles the row
    count; list-like cells that sit at partition boundaries (e.g. row 31)
    are the interesting cases.
    """
    num_rows = NPartitions.get() * MinPartitionSize.get()
    data = {"A": [[3, 4]] * num_rows, "C": [["a", "b"]] * num_rows}
    eval_general(
        *create_test_dfs(data),
        lambda df: df.explode(column, ignore_index=ignore_index),
    )