def test_compare(align_axis, keep_shape, keep_equal):
    """Check that modin ``compare`` agrees with pandas for frames and series.

    The three arguments are forwarded unchanged as keyword arguments to
    every ``compare`` call. Each pair of objects is compared in both
    directions (left vs. right, then right vs. left).
    """
    compare_options = dict(
        align_axis=align_axis,
        keep_shape=keep_shape,
        keep_equal=keep_equal,
    )

    def check_both_directions(modin_pair, pandas_pair):
        # Run the comparison once as given and once with the operands swapped.
        for m_self, m_other, p_self, p_other in (
            (modin_pair[0], modin_pair[1], pandas_pair[0], pandas_pair[1]),
            (modin_pair[1], modin_pair[0], pandas_pair[1], pandas_pair[0]),
        ):
            modin_out = m_self.compare(m_other, **compare_options)
            pandas_out = p_self.compare(p_other, **compare_options)
            assert to_pandas(modin_out).equals(pandas_out)

    # DataFrame case: two independent random 100x10 frames.
    column_labels = list("abcdefghij")
    first_values = random_state.randn(100, 10)
    second_values = random_state.randn(100, 10)
    pandas_frames = (
        pandas.DataFrame(first_values, columns=column_labels),
        pandas.DataFrame(second_values, columns=column_labels),
    )
    modin_frames = (
        pd.DataFrame(first_values, columns=column_labels),
        pd.DataFrame(second_values, columns=column_labels),
    )
    check_both_directions(modin_frames, pandas_frames)

    # Series case: string series that differ at positions 1 and 3.
    first_labels = ["a", "b", "c", "d", "e"]
    second_labels = ["a", "a", "c", "b", "e"]
    pandas_series = (pandas.Series(first_labels), pandas.Series(second_labels))
    modin_series = (pd.Series(first_labels), pd.Series(second_labels))
    check_both_directions(modin_series, pandas_series)
def test_eval_df_arithmetic_subexpression():
    """Check in-place ``eval`` of an assignment with nested arithmetic.

    The expression ``not_e = sin(a + b)`` is evaluated in place on a pandas
    frame and on a modin frame built from the same data, and the two
    resulting frames are compared.
    """
    columns = {"a": random_state.randn(10), "b": random_state.randn(10)}
    pandas_frame = pandas.DataFrame(columns)
    modin_frame = pd.DataFrame(columns)

    expression = "not_e = sin(a + b)"
    eval_options = {"engine": "python", "parser": "pandas", "inplace": True}
    pandas_frame.eval(expression, **eval_options)
    modin_frame.eval(expression, **eval_options)

    # TODO: Use a series equality validator.
    df_equals(modin_frame, pandas_frame)
def test_fillna_inplace():
    """Check in-place ``fillna`` on a modin frame against pandas.

    Two fill modes are exercised: a scalar ``value=0`` and
    ``method="ffill"``. In each case the pandas frame is filled first and
    the comparison is required to fail, then the modin frame is filled and
    the comparison is required to pass. The test also checks that an
    in-place ``fillna`` returns ``None``.
    """
    frame_data = random_state.randn(10, 4)
    df = pandas.DataFrame(frame_data)
    # Write the NaNs with a single ``iloc`` assignment. The chained form
    # ``df[col][rows] = ...`` can modify a temporary instead of ``df``
    # (always so under pandas Copy-on-Write), leaving nothing to fill.
    df.iloc[:4, 1] = np.nan
    df.iloc[-4:, 3] = np.nan

    modin_df = pd.DataFrame(df)
    df.fillna(value=0, inplace=True)
    # Only the pandas frame has been filled, so the comparison must fail.
    with pytest.raises(AssertionError):
        df_equals(modin_df, df)

    modin_df.fillna(value=0, inplace=True)
    df_equals(modin_df, df)

    # An in-place fill returns None rather than a new frame.
    modin_df = pd.DataFrame(df).fillna(value={0: 0}, inplace=True)
    assert modin_df is None

    # Re-inject the NaNs and repeat the scenario with forward fill.
    df.iloc[:4, 1] = np.nan
    df.iloc[-4:, 3] = np.nan
    modin_df = pd.DataFrame(df)
    df.fillna(method="ffill", inplace=True)
    with pytest.raises(AssertionError):
        df_equals(modin_df, df)

    modin_df.fillna(method="ffill", inplace=True)
    df_equals(modin_df, df)
def test_eval_df_use_case():
    """Check ``eval`` with a series result and with column assignment.

    Three uses are compared between pandas and modin on the same data: an
    expression without assignment (the modin result must be a modin
    ``Series``), an assignment that is not in place, and the same
    assignment performed in place.
    """
    columns = {"a": random_state.randn(10), "b": random_state.randn(10)}
    pandas_frame = pandas.DataFrame(columns)
    modin_frame = pd.DataFrame(columns)
    eval_options = {"engine": "python", "parser": "pandas"}

    # test eval for series results
    series_expr = "arctan2(sin(a), b)"
    pandas_out = pandas_frame.eval(series_expr, **eval_options)
    modin_out = modin_frame.eval(series_expr, **eval_options)
    assert isinstance(modin_out, pd.Series)
    df_equals(modin_out, pandas_out)

    # Test not inplace assignments
    assign_expr = "e = arctan2(sin(a), b)"
    pandas_out = pandas_frame.eval(assign_expr, **eval_options)
    modin_out = modin_frame.eval(assign_expr, **eval_options)
    df_equals(modin_out, pandas_out)

    # Test inplace assignments
    pandas_frame.eval(assign_expr, inplace=True, **eval_options)
    modin_frame.eval(assign_expr, inplace=True, **eval_options)

    # TODO: Use a series equality validator.
    df_equals(modin_frame, pandas_frame)
def test_where():
    """Check modin ``where`` against pandas for several kinds of ``other``.

    The replacement value is, in turn: the negated frame, a row taken from
    the pandas frame (applied with ``axis=1``), a column taken from the
    pandas frame (applied with ``axis=0``), and the scalar ``True``.
    """

    def assert_same_values(modin_out, pandas_out):
        # Elementwise comparison, reduced per column and then over columns.
        matches = to_pandas(modin_out) == pandas_out
        assert all(matches.all())

    values = random_state.randn(100, 10)
    labels = list("abcdefghij")
    pandas_df = pandas.DataFrame(values, columns=labels)
    modin_df = pd.DataFrame(values, columns=labels)
    pandas_mask = pandas_df % 5 < 2
    modin_mask = modin_df % 5 < 2

    # other is a frame
    expected = pandas_df.where(pandas_mask, -pandas_df)
    actual = modin_df.where(modin_mask, -modin_df)
    assert_same_values(actual, expected)

    # other is a row of the pandas frame, applied with axis=1
    row = pandas_df.loc[3]
    expected = pandas_df.where(pandas_mask, row, axis=1)
    actual = modin_df.where(modin_mask, row, axis=1)
    assert_same_values(actual, expected)

    # other is a column of the pandas frame, applied with axis=0
    column = pandas_df["e"]
    expected = pandas_df.where(pandas_mask, column, axis=0)
    actual = modin_df.where(modin_mask, column, axis=0)
    assert_same_values(actual, expected)

    # other is a scalar
    expected = pandas_df.where(pandas_df < 2, True)
    actual = modin_df.where(modin_df < 2, True)
    assert_same_values(actual, expected)
def test_drop():
    """Check modin ``drop`` against expected pandas selections.

    Covers dropping rows and columns by label, missing labels (expected to
    raise ``ValueError`` unless ``errors="ignore"``), non-unique column and
    index labels, an in-place drop driven by a boolean selection, and a
    drop on one level of a MultiIndex, which is expected to emit a
    ``UserWarning``.
    """
    frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}
    simple = pandas.DataFrame(frame_data)
    modin_simple = pd.DataFrame(frame_data)

    # Each entry: labels to drop, extra keyword options, expected pandas result.
    present_label_cases = [
        ("A", {"axis": 1}, simple[["B"]]),
        (["A", "B"], {"axis": "columns"}, simple[[]]),
        ([0, 1, 3], {"axis": 0}, simple.loc[[2], :]),
        ([0, 3], {"axis": "index"}, simple.loc[[1, 2], :]),
    ]
    for labels, options, expected in present_label_cases:
        df_equals(modin_simple.drop(labels, **options), expected)

    # Missing labels; arguments are passed positionally, as drop(labels, axis).
    for bad_args in ((5,), ("C", 1), ([1, 5],), (["A", "C"], 1)):
        with pytest.raises(ValueError):
            modin_simple.drop(*bad_args)

    # errors = 'ignore'
    ignored_label_cases = [
        (5, {"errors": "ignore"}, simple),
        ([0, 5], {"errors": "ignore"}, simple.loc[[1, 2, 3], :]),
        ("C", {"axis": 1, "errors": "ignore"}, simple),
        (["A", "C"], {"axis": 1, "errors": "ignore"}, simple[["B"]]),
    ]
    for labels, options, expected in ignored_label_cases:
        df_equals(modin_simple.drop(labels, **options), expected)

    # non-unique
    nu_df = pandas.DataFrame(
        zip(range(3), range(-3, 1), list("abc")),
        columns=["a", "a", "b"],
    )
    modin_nu_df = pd.DataFrame(nu_df)
    without_a = modin_nu_df.drop("a", axis=1)
    df_equals(without_a, nu_df[["b"]])
    without_b = modin_nu_df.drop("b", axis="columns")
    df_equals(without_b, nu_df["a"])
    without_nothing = modin_nu_df.drop([])
    df_equals(without_nothing, nu_df)

    # Same frame, now with a repeated index label and unique column names.
    nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"]))
    nu_df.columns = list("abc")
    modin_nu_df = pd.DataFrame(nu_df)
    without_x = modin_nu_df.drop("X", axis="rows")
    df_equals(without_x, nu_df.loc[["Y"], :])
    without_rows = modin_nu_df.drop(["X", "Y"], axis=0)
    df_equals(without_rows, nu_df.loc[[], :])

    # inplace cache issue
    frame_data = random_state.randn(10, 3)
    abc = list("abc")
    df = pandas.DataFrame(frame_data, columns=abc)
    modin_df = pd.DataFrame(frame_data, columns=abc)
    expected = df[~(df.b > 0)]
    rows_to_drop = df[df.b > 0].index
    modin_df.drop(labels=rows_to_drop, inplace=True)
    df_equals(modin_df, expected)

    # Drop by label on one level of a two-level index.
    midx = pd.MultiIndex(
        levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    measurements = [
        [45, 30],
        [200, 100],
        [1.5, 1],
        [30, 20],
        [250, 150],
        [1.5, 0.8],
        [320, 250],
        [1, 0.8],
        [0.3, 0.2],
    ]
    df = pd.DataFrame(index=midx, columns=["big", "small"], data=measurements)
    with pytest.warns(UserWarning):
        df.drop(index="length", level=1)