def test_fillna_dict_series():
    """Check that ``fillna`` with a dict or a Series agrees with pandas.

    Exercises a dict of per-column fill values, a dict that also carries a
    key ("d") which is not a column of the frame, and a Series holding the
    column-wise maximum.
    """
    raw = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    pandas_frame = pandas.DataFrame(raw)
    modin_frame = pd.DataFrame(raw)

    for fill_values in ({"a": 0, "b": 5}, {"a": 0, "b": 5, "d": 7}):
        df_equals(
            modin_frame.fillna(fill_values),
            pandas_frame.fillna(fill_values),
        )

    # Series treated same as dict
    df_equals(
        modin_frame.fillna(modin_frame.max()),
        pandas_frame.fillna(pandas_frame.max()),
    )
def test_reindex_multiindex():
    """Compare ``reindex`` onto a MultiIndex between Modin and pandas.

    A square frame indexed by currency codes is reindexed onto the product
    of the first level of a (bank, currency) MultiIndex and its own index,
    with ``fill_value=0`` and several combinations of ``axis``/``level``.
    """
    square_values = np.random.randint(1, 20, (5, 5))
    notional_values = np.random.randint(10, 25, 6)
    currencies = np.array(["AUD", "BRL", "CAD", "EUR", "INR"])

    level_values = [["Bank_1", "Bank_2"], ["AUD", "CAD", "EUR"]]
    level_names = ["Bank", "Curency"]
    modin_bank_index = pd.MultiIndex.from_product(level_values, names=level_names)
    pandas_bank_index = pandas.MultiIndex.from_product(
        level_values, names=level_names
    )

    modin_square = pd.DataFrame(
        data=square_values, index=currencies, columns=currencies
    )
    modin_notional = pd.DataFrame(notional_values, modin_bank_index)
    pandas_square = pandas.DataFrame(
        data=square_values, index=currencies, columns=currencies
    )
    pandas_notional = pandas.DataFrame(notional_values, pandas_bank_index)
    modin_notional.columns = ["Notional"]
    pandas_notional.columns = ["Notional"]

    modin_target = pd.MultiIndex.from_product(
        [modin_notional.index.levels[0], modin_square.index]
    )
    pandas_target = pandas.MultiIndex.from_product(
        [pandas_notional.index.levels[0], pandas_square.index]
    )

    # Keyword sets layered on top of ``fill_value=0``:
    #   1. reindex without axis, index, or columns
    #   2. reindex with only axis
    #   3. reindex with axis and level
    for extra_kwargs in ({}, {"axis": 0}, {"axis": 0, "level": 0}):
        modin_result = modin_square.reindex(
            modin_target, fill_value=0, **extra_kwargs
        )
        pandas_result = pandas_square.reindex(
            pandas_target, fill_value=0, **extra_kwargs
        )
        df_equals(modin_result, pandas_result)
def test___getitem__(data):
    """Check column and slice access through ``__getitem__`` against pandas.

    Parameters
    ----------
    data
        Input used to build both the Modin and the pandas frame.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Selecting a single column label must produce a Modin Series.
    first_label = modin_df.columns[0]
    modin_column = modin_df.__getitem__(first_label)
    assert isinstance(modin_column, pd.Series)
    df_equals(pandas_df[first_label], modin_column)

    # slice test: each tuple is unpacked into ``slice(start, stop[, step])``
    slice_arguments = [
        (None, -1),
        (-1, None),
        (1, 2),
        (1, None),
        (None, 1),
        (1, -1),
        (-3, -1),
        (1, -1, 2),
    ]
    for arguments in slice_arguments:
        window = slice(*arguments)
        df_equals(modin_df[window], pandas_df[window])

    # Test empty
    df_equals(pd.DataFrame([])[:10], pandas.DataFrame([])[:10])
def test_indexing_duplicate_axis(data):
    """Check ``iloc``/``loc`` lookups on a frame whose index has duplicates.

    Parameters
    ----------
    data
        Input used to build both the Modin and the pandas frame.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    # Integer division by three gives every label to (up to) three rows.
    repeated_labels = [position // 3 for position in range(len(modin_df))]
    modin_df.index = pandas_df.index = repeated_labels
    assert modin_df.index.duplicated().any()
    assert pandas_df.index.duplicated().any()

    df_equals(modin_df.iloc[0], pandas_df.iloc[0])
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.iloc[0, 0:4], pandas_df.iloc[0, 0:4])

    modin_selection = modin_df.loc[0, modin_df.columns[0:4]]
    pandas_selection = pandas_df.loc[0, pandas_df.columns[0:4]]
    df_equals(modin_selection, pandas_selection)
def test_export_indivisible_chunking(data_has_nulls):
    """
    Test ``.get_chunks(n_chunks)`` when the internal PyArrow table is 'indivisibly chunked'.

    The setup for the test is a PyArrow table in which one of the chunks
    consists of a single row, meaning that this chunk can't be subdivided.
    """
    data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"])
    pd_df = pandas.DataFrame(data)

    # First piece is exactly one row, the second piece holds everything else.
    single_row, remainder = pd_df.iloc[:1], pd_df.iloc[1:]
    arrow_pieces = [pa.Table.from_pandas(piece) for piece in (single_row, remainder)]
    chunked_at = pa.concat_tables(arrow_pieces)
    md_df = from_arrow(chunked_at)

    first_partition = md_df._query_compiler._modin_frame._partitions[0][0]
    first_column_chunks = first_partition.get().column(0).chunks
    assert len(first_column_chunks) == md_df.__dataframe__().num_chunks() == 2
    # Meaning that we can't subdivide first chunk
    np.testing.assert_array_equal(
        md_df.__dataframe__()._chunk_slices, [0, 1, len(pd_df)]
    )

    for requested_chunks in (2, 4, 40):
        exported_df = export_frame(md_df, n_chunks=requested_chunks)
        df_equals(md_df, exported_df)
def test_select_dtypes():
    """Check ``select_dtypes`` with ``include``/``exclude`` against pandas.

    The expected results are built by selecting the matching columns of the
    pandas frame by name. Calling ``select_dtypes`` with neither ``include``
    nor ``exclude`` is expected to raise ``ValueError``.
    """
    frame_data = {
        "test1": list("abc"),
        "test2": np.arange(3, 6).astype("u1"),
        "test3": np.arange(8.0, 11.0, dtype="float64"),
        "test4": [True, False, True],
        "test5": pandas.date_range("now", periods=3).values,
        "test6": list(range(5, 8)),
    }
    df = pandas.DataFrame(frame_data)
    rd = pd.DataFrame(frame_data)

    # ``np.float`` was a deprecated alias of the builtin ``float`` and is no
    # longer available in current NumPy releases; ``np.float64`` selects the
    # same float64 column ("test3") without relying on the alias.
    include = np.float64, "integer"
    exclude = (np.bool_,)
    r = rd.select_dtypes(include=include, exclude=exclude)
    e = df[["test2", "test3", "test6"]]
    df_equals(r, e)

    r = rd.select_dtypes(include=np.bool_)
    e = df[["test4"]]
    df_equals(r, e)

    r = rd.select_dtypes(exclude=np.bool_)
    e = df[["test1", "test2", "test3", "test5", "test6"]]
    df_equals(r, e)

    # At least one of ``include``/``exclude`` has to be supplied.
    with pytest.raises(ValueError):
        pd.DataFrame().select_dtypes()
def test_set_index(request, data, drop, append):
    """Check ``set_index`` (copying and in-place) against pandas.

    Parameters
    ----------
    request
        pytest request object; the test body is skipped when the test node
        name contains "empty_data".
    data
        Input used to build both the Modin and the pandas frame.
    drop, append
        Forwarded unchanged to ``set_index``.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    if "empty_data" not in request.node.name:
        key = modin_df.columns[0]
        modin_result = modin_df.set_index(key, drop=drop, append=append, inplace=False)
        pandas_result = pandas_df.set_index(
            key, drop=drop, append=append, inplace=False
        )
        df_equals(modin_result, pandas_result)

        modin_df_copy = modin_df.copy()
        modin_df.set_index(key, drop=drop, append=append, inplace=True)

        # Check that the copy and original are different: the comparison
        # helper has to fail once the original was modified in place.
        with pytest.raises(AssertionError):
            df_equals(modin_df, modin_df_copy)

        pandas_df.set_index(key, drop=drop, append=append, inplace=True)
        df_equals(modin_df, pandas_df)
def test_set_axis(data, axis):
    """Check ``set_axis`` (copying and in-place) against pandas.

    Parameters
    ----------
    data
        Input used to build both the Modin and the pandas frame.
    axis
        Axis specifier forwarded to ``set_axis``; it is normalised to an
        axis number only to pick which labels get rewritten.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    x = pandas.DataFrame()._get_axis_number(axis)
    index = modin_df.columns if x else modin_df.index

    # New labels are "<old label>_<position>" for every label on the axis.
    labels = ["{0}_{1}".format(label, i) for i, label in enumerate(index)]

    modin_result = modin_df.set_axis(labels, axis=axis, inplace=False)
    pandas_result = pandas_df.set_axis(labels, axis=axis, inplace=False)
    df_equals(modin_result, pandas_result)

    modin_df_copy = modin_df.copy()
    modin_df.set_axis(labels, axis=axis, inplace=True)

    # Check that the copy and original are different: the comparison helper
    # has to fail once the original was modified in place.
    with pytest.raises(AssertionError):
        df_equals(modin_df, modin_df_copy)

    pandas_df.set_axis(labels, axis=axis, inplace=True)
    df_equals(modin_df, pandas_df)
def test_matmul(data):
    """Check the ``@`` operator with array, Series and frame operands.

    Parameters
    ----------
    data
        Input used to build both the Modin and the pandas frame.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    n_columns = len(modin_df.columns)

    # Test list input
    vector = np.arange(n_columns)
    modin_product = modin_df @ vector
    pandas_product = pandas_df @ vector
    df_equals(modin_product, pandas_product)

    # Test bad dimensions
    with pytest.raises(ValueError):
        _ = modin_df @ np.arange(n_columns + 10)

    # Test series input
    modin_operand = pd.Series(np.arange(n_columns), index=modin_df.columns)
    pandas_operand = pandas.Series(np.arange(n_columns), index=pandas_df.columns)
    modin_product = modin_df @ modin_operand
    pandas_product = pandas_df @ pandas_operand
    df_equals(modin_product, pandas_product)

    # Test dataframe input
    modin_product = modin_df @ modin_df.T
    pandas_product = pandas_df @ pandas_df.T
    df_equals(modin_product, pandas_product)

    # Test when input series index doesn't line up with columns
    with pytest.raises(ValueError):
        _ = modin_df @ pd.Series(np.arange(n_columns))
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """Check partitioned ``read_sql`` (by query and by table name) against pandas.

    Parameters
    ----------
    make_sql_connection
        Fixture called with a file name and a table name; its return value
        is passed to ``read_sql`` as the connection.
    """
    if Engine.get() == "Ray":
        pytest.xfail("Distributed read_sql is broken, see GH#2194")

    table = "test_from_sql_distributed"
    conn = make_sql_connection("test_from_sql_distributed.db", table)
    query = "select * from {0}".format(table)

    pandas_df = pandas.read_sql(query, conn)

    # Both Modin reads use the same partitioning settings.
    partitioning = dict(partition_column="col1", lower_bound=0, upper_bound=6)
    modin_df_from_query = pd.read_sql(query, conn, **partitioning)
    modin_df_from_table = pd.read_sql(table, conn, **partitioning)

    for modin_df in (modin_df_from_query, modin_df_from_table):
        df_equals(modin_df, pandas_df)
def test_simple_import():
    """Import a Modin frame and its protocol object through ``from_dataframe``."""
    producer = pd.DataFrame(test_data["int_data"])
    producer_protocol = producer.__dataframe__()

    # Our configuration in pytest.ini requires that we explicitly catch all
    # instances of defaulting to pandas; `.from_dataframe` raises such a
    # warning here.
    with warns_that_defaulting_to_pandas():
        consumer = from_dataframe(producer)
        consumer_from_protocol = from_dataframe(producer_protocol)

    # TODO: the following assertions verify that `from_dataframe` doesn't return
    # the same object untouched due to optimization branching, it actually should
    # do so but the logic is not implemented yet, so the assertions are passing
    # for now. It's required to replace the producer's type with a different one
    # to consumer when we have some other implementation of the protocol as the
    # assertions may start failing shortly.
    assert producer is not consumer
    assert producer_protocol is not consumer_from_protocol
    producer_frame = producer._query_compiler._modin_frame
    consumer_frame = consumer._query_compiler._modin_frame
    assert producer_frame is not consumer_frame

    for imported in (consumer, consumer_from_protocol):
        df_equals(producer, imported)
def test_export_when_delayed_computations():
    """
    Test that export works properly when OmnisciOnNative has delayed computations.

    If there are delayed functions and export is required, the execution has
    to be triggered before the protocol's buffers are materialized, so that
    the buffers contain the actual result of the computations.
    """
    # OmniSci can't import 'uint64' as well as booleans, so exclude them
    # issue for bool: https://github.com/modin-project/modin/issues/4299
    data = get_data_of_all_types(has_nulls=True, exclude_dtypes=["uint64", "bool"])
    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)

    fill_values = {"float32_null": 32.0, "float64_null": 64.0}
    md_res = md_df.fillna(fill_values)
    pd_res = pd_df.fillna(fill_values)

    # The precondition of this test is that the result has no arrow table yet.
    has_arrow_table = md_res._query_compiler._modin_frame._has_arrow_table()
    assert not has_arrow_table, "There are no delayed computations for the frame"

    df_equals(export_frame(md_res), pd_res)
def test_tz_convert():
    """Check ``tz_convert`` on a flat and on a multi-level datetime index."""
    range_options = dict(periods=500, freq="2D", tz="America/Los_Angeles")
    modin_idx = pd.date_range("1/1/2012", **range_options)
    pandas_idx = pandas.date_range("1/1/2012", **range_options)
    data = np.random.randint(0, 100, size=(len(modin_idx), 4))

    # Flat, timezone-aware index.
    modin_flat = pd.DataFrame(data, index=modin_idx)
    pandas_flat = pandas.DataFrame(data, index=pandas_idx)
    df_equals(
        modin_flat.tz_convert("UTC", axis=0),
        pandas_flat.tz_convert("UTC", axis=0),
    )

    # Two-level index whose first level carries the timezone.
    positions = range(len(modin_idx))
    modin_multi = pd.MultiIndex.from_arrays([modin_idx, positions])
    pandas_multi = pandas.MultiIndex.from_arrays([pandas_idx, positions])
    modin_nested = pd.DataFrame(data, index=modin_multi)
    pandas_nested = pandas.DataFrame(data, index=pandas_multi)
    df_equals(
        modin_nested.tz_convert("UTC", axis=0, level=0),
        pandas_nested.tz_convert("UTC", axis=0, level=0),
    )
def test_h2o_q3(self):
    """Group by "id3" with sum of "v1" and mean of "v3" in lazy execution mode.

    The reference result is computed directly on the frame returned by
    ``self._get_h2o_df()`` (presumably a pandas frame -- confirm against
    the helper) and compared with the Modin result.
    """
    aggregations = {"v1": "sum", "v3": "mean"}

    df = self._get_h2o_df()
    ref = df.groupby(["id3"], observed=True).agg(dict(aggregations))
    ref.reset_index(inplace=True)

    modin_df = pd.DataFrame(df)
    set_execution_mode(modin_df, "lazy")
    modin_grouped = modin_df.groupby(["id3"], observed=True, as_index=False)
    modin_df = modin_grouped.agg(dict(aggregations))
    set_execution_mode(modin_df, None)

    exp = to_pandas(modin_df)
    # Cast the key column to categorical before comparing with the reference.
    exp["id3"] = exp["id3"].astype("category")

    df_equals(ref, exp)
def test_copy(data):
    """Check ``copy`` in its default form and with ``copy(False)``.

    Parameters
    ----------
    data
        Input used to build the frames.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)  # noqa F841
    # pandas_df is unused but there so there won't be confusing list comprehension
    # stuff in the pytest.mark.parametrize
    new_modin_df = modin_df.copy()

    assert new_modin_df is not modin_df
    copied_partitions = new_modin_df._query_compiler._modin_frame._partitions
    source_partitions = modin_df._query_compiler._modin_frame._partitions
    assert np.array_equal(copied_partitions, source_partitions)
    assert new_modin_df is not modin_df
    df_equals(new_modin_df, modin_df)

    # Shallow copy tests: a column is overwritten on the original after the
    # shallow copy was taken, and both frames are then expected to compare equal.
    modin_df = pd.DataFrame(data)
    modin_df_cp = modin_df.copy(False)
    first_label = modin_df.columns[0]
    modin_df[first_label] = 0
    df_equals(modin_df, modin_df_cp)
def test_fillna_dataframe():
    """Check ``fillna`` when the fill value is itself a frame."""
    frame_data = {
        "a": [np.nan, 1, 2, np.nan, np.nan],
        "b": [1, 2, 3, np.nan, np.nan],
        "c": [np.nan, 1, 2, 3, 4],
    }
    row_labels = list("VWXYZ")
    pandas_target = pandas.DataFrame(frame_data, index=row_labels)
    modin_target = pd.DataFrame(frame_data, index=row_labels)

    # The filler may have a different index and different columns.
    filler_data = {
        "a": [np.nan, 10, 20, 30, 40],
        "b": [50, 60, 70, 80, 90],
        "foo": ["bar"] * 5,
    }
    pandas_filler = pandas.DataFrame(filler_data, index=list("VWXuZ"))
    modin_filler = pd.DataFrame(pandas_filler)

    # only those columns and indices which are shared get filled
    df_equals(
        modin_target.fillna(modin_filler),
        pandas_target.fillna(pandas_filler),
    )
def test_reorder_levels():
    """Check ``reorder_levels`` on a three-level index against pandas."""
    data = np.random.randint(1, 100, 12)

    # 2 numbers x 3 letters x 2 colours = 12 index entries, one per data value.
    index_tuples = [
        (num, letter, color)
        for num in range(1, 3)
        for letter in ["a", "b", "c"]
        for color in ["Red", "Green"]
    ]
    level_names = ["Number", "Letter", "Color"]

    modin_index = pd.MultiIndex.from_tuples(index_tuples, names=level_names)
    modin_df = pd.DataFrame(data, index=modin_index)
    pandas_index = pandas.MultiIndex.from_tuples(index_tuples, names=level_names)
    pandas_df = pandas.DataFrame(data, index=pandas_index)

    new_order = ["Letter", "Color", "Number"]
    df_equals(
        modin_df.reorder_levels(new_order),
        pandas_df.reorder_levels(new_order),
    )
def test_getitem_same_name():
    """Check ``[]`` access on a frame in which the label "c1" occurs twice."""
    rows = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [9, 10, 11, 12],
        [13, 14, 15, 16],
        [17, 18, 19, 20],
    ]
    labels = ["c1", "c2", "c1", "c3"]
    modin_df = pd.DataFrame(rows, columns=labels)
    pandas_df = pandas.DataFrame(rows, columns=labels)

    # Duplicated label, unique label, a list of labels, then another unique label.
    for key in ("c1", "c2", ["c1", "c2"], "c3"):
        df_equals(modin_df[key], pandas_df[key])
def test_replace():
    """Check ``replace`` with dict, nested-dict and regex arguments."""
    mixed_data = {
        "A": [0, 1, 2, 3, 4],
        "B": [5, 6, 7, 8, 9],
        "C": ["a", "b", "c", "d", "e"],
    }
    modin_df = pd.DataFrame(mixed_data)
    pandas_df = pandas.DataFrame(mixed_data)

    # Per-column value to look for, with a single replacement value.
    df_equals(
        modin_df.replace({"A": 0, "B": 5}, 100),
        pandas_df.replace({"A": 0, "B": 5}, 100),
    )

    # Nested mapping: column -> {old value: new value}.
    df_equals(
        modin_df.replace({"A": {0: 100, 4: 400}}),
        pandas_df.replace({"A": {0: 100, 4: 400}}),
    )

    text_data = {"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]}
    modin_df = pd.DataFrame(text_data)
    pandas_df = pandas.DataFrame(text_data)

    # Regex patterns mapped to their replacements.
    df_equals(
        modin_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}),
        pandas_df.replace(regex={r"^ba.$": "new", "foo": "xyz"}),
    )

    # List of regex patterns sharing one replacement value.
    df_equals(
        modin_df.replace(regex=[r"^ba.$", "foo"], value="new"),
        pandas_df.replace(regex=[r"^ba.$", "foo"], value="new"),
    )

    # Same replacement again, this time modifying both frames in place.
    modin_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True)
    pandas_df.replace(regex=[r"^ba.$", "foo"], value="new", inplace=True)
    df_equals(modin_df, pandas_df)
def test_unwrap_partitions(axis):
    """Check that ``unwrap_partitions`` exposes the frame's underlying partitions.

    Parameters
    ----------
    axis
        ``None`` compares the unwrapped objects one-to-one with the frame's
        partition grid; any other value compares the unwrapped objects with
        materialized axis partitions built along ``axis ^ 1``.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)
    modin_frame = df._query_compiler._modin_frame
    engine = Engine.get()

    if axis is None:
        expected_partitions = modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
        # Both dimensions of the grid must match. The column count has to be
        # checked against the *actual* partitions; comparing the expected
        # shape with itself would always pass.
        assert expected_partitions.shape[0] == actual_partitions.shape[0]
        assert expected_partitions.shape[1] == actual_partitions.shape[1]
        for row_idx in range(expected_partitions.shape[0]):
            for col_idx in range(expected_partitions.shape[1]):
                expected = expected_partitions[row_idx][col_idx]
                actual = actual_partitions[row_idx][col_idx]
                if engine == "Ray":
                    assert expected.oid == actual
                if engine == "Dask":
                    assert expected.future == actual
    else:
        expected_axis_partitions = modin_frame._frame_mgr_cls.axis_partition(
            modin_frame._partitions, axis ^ 1
        )
        expected_axis_partitions = [
            axis_partition.force_materialization().unwrap(squeeze=True)
            for axis_partition in expected_axis_partitions
        ]
        actual_axis_partitions = unwrap_partitions(df, axis=axis)
        assert len(expected_axis_partitions) == len(actual_axis_partitions)
        for expected, actual in zip(expected_axis_partitions, actual_axis_partitions):
            if engine == "Ray":
                df_equals(ray.get(expected), ray.get(actual))
            if engine == "Dask":
                df_equals(expected.result(), actual.result())
def test_take():
    """Check positional ``take`` along rows and along columns against pandas."""
    records = [
        ("falcon", "bird", 389.0),
        ("parrot", "bird", 24.0),
        ("lion", "mammal", 80.5),
        ("monkey", "mammal", np.nan),
    ]
    frame_kwargs = dict(columns=["name", "class", "max_speed"], index=[0, 2, 3, 1])
    modin_df = pd.DataFrame(records, **frame_kwargs)
    pandas_df = pandas.DataFrame(records, **frame_kwargs)

    # Rows by position (default axis).
    df_equals(modin_df.take([0, 3]), pandas_df.take([0, 3]))
    # Columns by position.
    df_equals(modin_df.take([2], axis=1), pandas_df.take([2], axis=1))
def test_reset_index_with_named_index():
    """Check ``reset_index`` on frames whose index has been given a name."""
    source = test_data_values[0]

    modin_df = pd.DataFrame(source)
    pandas_df = pandas.DataFrame(source)
    modin_df.index.name = "NAME_OF_INDEX"
    pandas_df.index.name = "NAME_OF_INDEX"
    df_equals(modin_df, pandas_df)

    # Keeping the index as a column, returning a new frame.
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))

    # Dropping the index, modifying both frames in place.
    modin_df.reset_index(drop=True, inplace=True)
    pandas_df.reset_index(drop=True, inplace=True)
    df_equals(modin_df, pandas_df)

    # Fresh frames with a different index name.
    modin_df = pd.DataFrame(source)
    pandas_df = pandas.DataFrame(source)
    modin_df.index.name = "NEW_NAME"
    pandas_df.index.name = "NEW_NAME"
    df_equals(modin_df.reset_index(drop=False), pandas_df.reset_index(drop=False))
def test_nsmallest():
    """Check ``nsmallest`` with one column and with two columns plus ``keep="all"``."""
    populations = [
        59000000,
        65000000,
        434000,
        434000,
        434000,
        337000,
        11300,
        11300,
        11300,
    ]
    data = {
        "population": populations,
        "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
        "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
    }
    countries = [
        "Italy",
        "France",
        "Malta",
        "Maldives",
        "Brunei",
        "Iceland",
        "Nauru",
        "Tuvalu",
        "Anguilla",
    ]
    modin_df = pd.DataFrame(data=data, index=countries)
    pandas_df = pandas.DataFrame(data=data, index=countries)

    cases = (
        dict(n=3, columns="population"),
        dict(n=2, columns=["population", "GDP"], keep="all"),
    )
    for kwargs in cases:
        df_equals(modin_df.nsmallest(**kwargs), pandas_df.nsmallest(**kwargs))
def test_fillna_sanity():
    """Run several small ``fillna`` scenarios and compare each with pandas."""
    # with different dtype
    string_rows = [
        ["a", "a", np.nan, "a"],
        ["b", "b", np.nan, "b"],
        ["c", "c", np.nan, "c"],
    ]
    pandas_frame = pandas.DataFrame(string_rows)
    expected = pandas_frame.fillna({2: "foo"})
    modin_filled = pd.DataFrame(string_rows).fillna({2: "foo"})
    df_equals(modin_filled, expected)

    # Same fill, applied in place. The Modin frame is built from the pandas
    # frame before either of them is modified.
    modin_frame = pd.DataFrame(pandas_frame)
    pandas_frame.fillna({2: "foo"}, inplace=True)
    modin_frame.fillna({2: "foo"}, inplace=True)
    df_equals(modin_frame, expected)

    # Fill one datetime column from another column of the pandas frame.
    date_data = {
        "Date": [pandas.NaT, pandas.Timestamp("2014-1-1")],
        "Date2": [pandas.Timestamp("2013-1-1"), pandas.NaT],
    }
    pandas_frame = pandas.DataFrame(date_data)
    expected = pandas_frame.fillna(value={"Date": pandas_frame["Date2"]})
    modin_filled = pd.DataFrame(date_data).fillna(
        value={"Date": pandas_frame["Date2"]}
    )
    df_equals(modin_filled, expected)

    # Timestamp with a UTC offset, filled with "pad" and then with "bfill".
    aware_stamp = pandas.Timestamp("2012-11-11 00:00:00+01:00")
    scenarios = (
        ([aware_stamp, pandas.NaT], "pad"),
        ([pandas.NaT, aware_stamp], "bfill"),
    )
    for values, method in scenarios:
        column_data = {"A": values}
        pandas_frame = pandas.DataFrame(column_data)
        modin_filled = pd.DataFrame(column_data).fillna(method=method)
        df_equals(modin_filled, pandas_frame.fillna(method=method))
def test_at_time():
    """Check ``at_time`` along the index and, on the transpose, along columns."""
    i = pd.date_range("2008-01-01", periods=1000, freq="12H")
    column_values = {"A": list(range(1000)), "B": list(range(1000))}
    modin_df = pd.DataFrame(column_values, index=i)
    pandas_df = pandas.DataFrame(column_values, index=i)

    for clock_time in ("12:00", "3:00"):
        df_equals(modin_df.at_time(clock_time), pandas_df.at_time(clock_time))

    # After transposing, the datetime labels sit on the column axis.
    modin_transposed = modin_df.T.at_time("12:00", axis=1)
    pandas_transposed = pandas_df.T.at_time("12:00", axis=1)
    df_equals(modin_transposed, pandas_transposed)
def test_drop_api_equivalence():
    """Check that the ``labels``/``axis`` and ``index``/``columns`` forms of ``drop`` agree.

    Each pair of calls drops the same labels through the two spellings and
    the results are compared. Mixing ``labels`` with ``index``/``columns``,
    or passing only ``axis``, is expected to raise ``ValueError``.
    """
    # equivalence of the labels/axis and index/columns API's
    frame_data = [[1, 2, 3], [3, 4, 5], [5, 6, 7]]

    modin_df = pd.DataFrame(frame_data, index=["a", "b", "c"], columns=["d", "e", "f"])

    modin_df1 = modin_df.drop("a")
    modin_df2 = modin_df.drop(index="a")
    df_equals(modin_df1, modin_df2)

    # ``axis`` is passed by keyword: pandas deprecated passing it positionally
    # to ``drop`` and later made it keyword-only.
    modin_df1 = modin_df.drop("d", axis=1)
    modin_df2 = modin_df.drop(columns="d")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(labels="e", axis=1)
    modin_df2 = modin_df.drop(columns="e")
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(["a"], axis=0)
    modin_df2 = modin_df.drop(index=["a"])
    df_equals(modin_df1, modin_df2)

    modin_df1 = modin_df.drop(["a"], axis=0).drop(["d"], axis=1)
    modin_df2 = modin_df.drop(index=["a"], columns=["d"])
    df_equals(modin_df1, modin_df2)

    with pytest.raises(ValueError):
        modin_df.drop(labels="a", index="b")

    with pytest.raises(ValueError):
        modin_df.drop(labels="a", columns="b")

    with pytest.raises(ValueError):
        modin_df.drop(axis=1)
def test_astype_category_large():
    """Check ``astype`` to category on a 10,000-row frame, per column and frame-wide."""
    series_length = 10_000
    column_values = {
        "col1": ["str{0}".format(i) for i in range(0, series_length)],
        "col2": list(range(0, series_length)),
    }
    modin_df = pd.DataFrame(column_values)
    pandas_df = pandas.DataFrame(column_values)

    # First convert only "col1", then convert every column.
    for target in ({"col1": "category"}, "category"):
        modin_result = modin_df.astype(target)
        pandas_result = pandas_df.astype(target)
        df_equals(modin_result, pandas_result)
        assert modin_result.dtypes.equals(pandas_result.dtypes)
def test_fillna_datetime_columns():
    """Check ``fillna("?")`` on frames that contain a datetime column.

    Two variants of column "B" are used: a complete date range, and a list
    of timestamps ending in ``NaT``.
    """
    complete_dates = pd.date_range("20130101", periods=3)
    dates_with_gap = [
        pandas.Timestamp("2013-01-01"),
        pandas.Timestamp("2013-01-02"),
        pandas.NaT,
    ]

    for b_values in (complete_dates, dates_with_gap):
        frame_data = {
            "A": [-1, -2, np.nan],
            "B": b_values,
            "C": ["foo", "bar", None],
            "D": ["foo2", "bar2", None],
        }
        pandas_frame = pandas.DataFrame(
            frame_data, index=pd.date_range("20130110", periods=3)
        )
        modin_frame = pd.DataFrame(
            frame_data, index=pd.date_range("20130110", periods=3)
        )
        df_equals(modin_frame.fillna("?"), pandas_frame.fillna("?"))
def test_h2o_q10(self):
    """Group by "id1".."id6" with sum of "v3" and count of "v1".

    The reference result is computed directly on the frame returned by
    ``self._get_h2o_df()`` (presumably a pandas frame -- confirm against
    the helper) and compared with the Modin result.
    """
    keys = ["id1", "id2", "id3", "id4", "id5", "id6"]
    aggregations = {"v3": "sum", "v1": "count"}

    df = self._get_h2o_df()
    ref = df.groupby(list(keys), observed=True).agg(dict(aggregations))
    ref.reset_index(inplace=True)

    modin_df = pd.DataFrame(df)
    modin_df = modin_df.groupby(list(keys), observed=True).agg(dict(aggregations))
    modin_df.reset_index(inplace=True)

    exp = to_pandas(modin_df)
    # Only the first three key columns are cast to categorical.
    for column in ("id1", "id2", "id3"):
        exp[column] = exp[column].astype("category")

    df_equals(ref, exp)
def test_h2o_q7(self):
    """Group by "id3" and compute max("v1") - min("v2") in lazy execution mode.

    The reference result is computed directly on the frame returned by
    ``self._get_h2o_df()`` (presumably a pandas frame -- confirm against
    the helper) and compared with the Modin result.
    """
    aggregations = {"v1": "max", "v2": "min"}

    df = self._get_h2o_df()
    extremes = df.groupby(["id3"], observed=True).agg(dict(aggregations))
    # The column selection stays chained onto ``assign`` so that the frame
    # being sliced is a temporary, exactly as in a single chained expression.
    ref = extremes.assign(
        range_v1_v2=lambda frame: frame["v1"] - frame["v2"]
    )[["range_v1_v2"]]
    ref.reset_index(inplace=True)

    modin_df = pd.DataFrame(df)
    set_execution_mode(modin_df, "lazy")
    modin_df = modin_df.groupby(["id3"], observed=True).agg(dict(aggregations))
    modin_df["range_v1_v2"] = modin_df["v1"] - modin_df["v2"]
    modin_df = modin_df[["range_v1_v2"]]
    modin_df.reset_index(inplace=True)
    set_execution_mode(modin_df, None)

    exp = to_pandas(modin_df)
    # Cast the key column to categorical before comparing with the reference.
    exp["id3"] = exp["id3"].astype("category")

    df_equals(ref, exp)