def setup(self, self_type, value_type, shape, limit, inplace):
    """Prepare fillna benchmark state.

    Builds a NaN-filled DataFrame/Series of the requested ``shape`` as
    ``self.dataset`` and the fill value (scalar / dict / Series / DataFrame)
    as ``self.value``; ``self.limit`` is the absolute row limit or None.
    """
    pd = IMPL[ASV_USE_IMPL]
    columns = [f"col{x}" for x in range(shape[1])]
    if self_type == "DataFrame":
        self.dataset = pd.DataFrame(
            np.nan, index=pd.RangeIndex(shape[0]), columns=columns
        )
    elif self_type == "Series":
        self.dataset = pd.Series(np.nan, index=pd.RangeIndex(shape[0]))
    else:
        # Explicit error instead of `assert False`: asserts vanish under `python -O`.
        raise ValueError(f"unsupported self_type: {self_type}")
    if value_type == "scalar":
        self.value = 18.19
    elif value_type == "dict":
        # keyed by row position, matching the Series variant below
        self.value = {k: k * 1.23 for k in range(shape[0])}
    elif value_type == "Series":
        self.value = pd.Series(
            [k * 1.23 for k in range(shape[0])], index=pd.RangeIndex(shape[0])
        )
    elif value_type == "DataFrame":
        if self_type == "Series":
            # filling a Series with a DataFrame is not a meaningful combination
            raise NotImplementedError
        self.value = pd.DataFrame(
            {
                k: [i + j * 1.23 for j in range(shape[0])]
                for i, k in enumerate(columns)
            },
            index=pd.RangeIndex(shape[0]),
            columns=columns,
        )
    else:
        raise ValueError(f"unsupported value_type: {value_type}")
    # fractional `limit` is interpreted as a fraction of the row count
    self.limit = int(limit * shape[0]) if limit else None
def test___setitem__partitions_aligning():
    # Regression test for #2390: assigning a longer Series as a new column.
    md_frame = pd.DataFrame({"a": [1, 2, 3]})
    pd_frame = pandas.DataFrame({"a": [1, 2, 3]})
    md_frame["b"] = pd.Series([4, 5, 6, 7, 8])
    pd_frame["b"] = pandas.Series([4, 5, 6, 7, 8])
    df_equals(md_frame, pd_frame)

    # Regression test for #2442: index containing a duplicated timestamp.
    dates = pandas.to_datetime(
        ["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"]
    )
    md_df, pd_df = create_test_dfs({"a": [1, 2, 3, 4]}, index=dates)

    # assigning a brand-new column
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)

    # overwriting the column that was just added
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)

    # overwriting a pre-existing column
    pd_df["a"] = pandas.Series(np.arange(4))
    md_df["a"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)
def test_matmul(data):
    """`@` (matmul) on a DataFrame should match pandas for array/Series/frame operands."""
    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)
    n_cols = len(md_df.columns)

    # plain ndarray operand
    vec = np.arange(n_cols)
    df_equals(md_df @ vec, pd_df @ vec)

    # mismatched dimensions must raise
    with pytest.raises(ValueError):
        md_df @ np.arange(n_cols + 10)

    # Series operand aligned on the columns
    md_ser = pd.Series(np.arange(n_cols), index=md_df.columns)
    pd_ser = pandas.Series(np.arange(n_cols), index=pd_df.columns)
    df_equals(md_df @ md_ser, pd_df @ pd_ser)

    # DataFrame operand
    df_equals(md_df @ md_df.T, pd_df @ pd_df.T)

    # Series whose index does not line up with the columns must raise
    with pytest.raises(ValueError):
        md_df @ pd.Series(np.arange(n_cols))
def test_compare(align_axis, keep_shape, keep_equal):
    """DataFrame/Series.compare should match pandas in both comparison directions."""
    opts = {
        "align_axis": align_axis,
        "keep_shape": keep_shape,
        "keep_equal": keep_equal,
    }

    left = random_state.randn(100, 10)
    right = random_state.randn(100, 10)
    cols = list("abcdefghij")
    pd_left = pandas.DataFrame(left, columns=cols)
    pd_right = pandas.DataFrame(right, columns=cols)
    md_left = pd.DataFrame(left, columns=cols)
    md_right = pd.DataFrame(right, columns=cols)

    assert to_pandas(md_left.compare(md_right, **opts)).equals(
        pd_left.compare(pd_right, **opts)
    )
    assert to_pandas(md_right.compare(md_left, **opts)).equals(
        pd_right.compare(pd_left, **opts)
    )

    vals1 = ["a", "b", "c", "d", "e"]
    vals2 = ["a", "a", "c", "b", "e"]
    pd_s1 = pandas.Series(vals1)
    pd_s2 = pandas.Series(vals2)
    md_s1 = pd.Series(vals1)
    md_s2 = pd.Series(vals2)

    assert to_pandas(md_s1.compare(md_s2, **opts)).equals(
        pd_s1.compare(pd_s2, **opts)
    )
    assert to_pandas(md_s2.compare(md_s1, **opts)).equals(
        pd_s2.compare(pd_s1, **opts)
    )
def create_test_series(vals):
    """Build a (modin, pandas) Series pair from ``vals``.

    For a dict input only the first value is used as the series data.
    """
    if isinstance(vals, dict):
        data = vals[next(iter(vals.keys()))]
    else:
        data = vals
    return pd.Series(data), pandas.Series(data)
def test_unique():
    """pd.unique should agree with pandas.unique across input kinds."""
    # plain list
    assert_array_equal(pd.unique([2, 1, 3, 3]), pandas.unique([2, 1, 3, 3]))

    # numeric Series with duplicates
    assert_array_equal(
        pd.unique(pd.Series([2] + [1] * 5)),
        pandas.unique(pandas.Series([2] + [1] * 5)),
    )

    # naive timestamps
    assert_array_equal(
        pd.unique(
            pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])
        ),
        pandas.unique(
            pandas.Series(
                [pandas.Timestamp("20160101"), pandas.Timestamp("20160101")]
            )
        ),
    )

    # tz-aware timestamps, boxed both in a Series and in an Index
    for md_box, pd_box in ((pd.Series, pandas.Series), (pd.Index, pandas.Index)):
        md_stamps = [
            pd.Timestamp("20160101", tz="US/Eastern"),
            pd.Timestamp("20160101", tz="US/Eastern"),
        ]
        pd_stamps = [
            pandas.Timestamp("20160101", tz="US/Eastern"),
            pandas.Timestamp("20160101", tz="US/Eastern"),
        ]
        assert_array_equal(
            pd.unique(md_box(md_stamps)), pandas.unique(pd_box(pd_stamps))
        )

    # categorical data
    assert_array_equal(
        pd.unique(pd.Series(pd.Categorical(list("baabc")))),
        pandas.unique(pandas.Series(pandas.Categorical(list("baabc")))),
    )
def test_to_datetime():
    """to_datetime should match pandas for frame, series, scalar and list inputs."""
    # DataFrame input (year/month/day columns)
    frame_data = {"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}
    df_equals(
        pd.to_datetime(pd.DataFrame(frame_data)),
        pandas.to_datetime(pandas.DataFrame(frame_data)),
    )

    # Series of date strings
    string_dates = ["3/11/2000", "3/12/2000", "3/13/2000"] * 1000
    df_equals(
        pd.to_datetime(pd.Series(string_dates)),
        pandas.to_datetime(pandas.Series(string_dates)),
    )

    # scalar epoch values in seconds and nanoseconds
    for epoch, unit in ((1490195805, "s"), (1490195805433502912, "ns")):
        assert pd.to_datetime(epoch, unit=unit) == pandas.to_datetime(
            epoch, unit=unit
        )

    # list input with an explicit origin
    days = [1, 2, 3]
    md_result = pd.to_datetime(days, unit="D", origin=pd.Timestamp("2000-01-01"))
    pd_result = pandas.to_datetime(
        days, unit="D", origin=pandas.Timestamp("2000-01-01")
    )
    assert md_result.equals(pd_result)
def test_assign():
    """DataFrame.assign should match pandas for one and two new columns."""
    data = test_data_values[0]
    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)

    # single new column derived from the first existing column
    df_equals(
        md_df.assign(new_column=pd.Series(md_df.iloc[:, 0])),
        pd_df.assign(new_column=pandas.Series(pd_df.iloc[:, 0])),
    )

    # two new columns assigned at once
    df_equals(
        md_df.assign(
            new_column=pd.Series(md_df.iloc[:, 0]),
            new_column2=pd.Series(md_df.iloc[:, 1]),
        ),
        pd_df.assign(
            new_column=pandas.Series(pd_df.iloc[:, 0]),
            new_column2=pandas.Series(pd_df.iloc[:, 1]),
        ),
    )
def setup(self, value_type, shape, limit):
    """Prepare fillna benchmark state.

    Builds a NaN-filled frame as ``self.df`` and the keyword arguments for
    ``fillna`` (fill value plus absolute ``limit``) as ``self.kw``.
    """
    pd = IMPL[ASV_USE_IMPL]
    self.df = gen_nan_data(ASV_USE_IMPL, *shape)
    columns = self.df.columns
    if value_type == "scalar":
        self.value = 18.19
    elif value_type == "dict":
        # keyed by column name, one distinct fill value per column
        self.value = {k: i * 1.23 for i, k in enumerate(columns)}
    elif value_type == "Series":
        self.value = pd.Series(
            [i * 1.23 for i in range(len(columns))], index=columns
        )
    elif value_type == "DataFrame":
        self.value = pd.DataFrame(
            {
                k: [i + j * 1.23 for j in range(shape[0])]
                for i, k in enumerate(columns)
            },
            index=pd.RangeIndex(shape[0]),
            columns=columns,
        )
    else:
        # Explicit error instead of `assert False`: asserts vanish under `python -O`.
        raise ValueError(f"unsupported value_type: {value_type}")
    # fractional `limit` is interpreted as a fraction of the row count
    limit = int(limit * shape[0]) if limit else None
    self.kw = {"value": self.value, "limit": limit}
def test_apply_on_empty_modin_series(self):
    """`apply` on an empty modin Series must match swifter's apply."""
    # Fixed log message: it previously said "test_apply_on_empty_series",
    # which did not match this test's name.
    LOG.info("test_apply_on_empty_modin_series")
    md = self.modinSetUp()
    series = md.Series()
    md_val = series.apply(math_foo, compare_to=1)
    swifter_val = series.swifter.apply(math_foo, compare_to=1)
    self.assertEqual(md_val, swifter_val)  # equality test
def test_setitem_on_empty_df(data, value, convert_to_series, new_col_id):
    """Column assignment on an (possibly empty) frame should match pandas."""
    pd_df = pandas.DataFrame(data)
    md_df = pd.DataFrame(data)
    if convert_to_series:
        pd_df[new_col_id] = pandas.Series(value)
        md_df[new_col_id] = pd.Series(value)
    else:
        pd_df[new_col_id] = value
        md_df[new_col_id] = value
    df_equals(md_df, pd_df)
def test_object_dtype_categorical():
    """Filtering a categorical column of Python objects keeps one matching row."""
    cat_series = pd.Series(
        pd.Categorical(my_object_vals, categories=my_object_vals)
    )
    widget = show_grid(cat_series)

    # the filter enum must expose raw values, not dict representations
    enum_values = widget._columns[0]["constraints"]["enum"]
    assert not isinstance(enum_values[0], dict)
    assert not isinstance(enum_values[1], dict)

    widget._handle_view_msg_helper(
        {"type": "show_filter_dropdown", "field": 0, "search_val": None}
    )
    widget._handle_view_msg_helper(
        {
            "field": 0,
            "filter_info": {
                "field": 0,
                "selected": [0],
                "type": "text",
                "excluded": [],
            },
            "type": "change_filter",
        }
    )

    # selecting the first category leaves exactly its one row
    assert len(widget._df) == 1
    assert widget._df[0][0] == cat_series[0]
def test_add_row_button():
    """The add-row GUI message fires a row_added event and duplicates values."""
    widget = SpreadsheetWidget(df=create_df())
    history = init_event_history("row_added", widget=widget)

    widget._handle_view_msg_helper({"type": "add_row"})
    assert history == [{"name": "row_added", "index": 4, "source": "gui"}]

    # the freshly added row in the internal dataframe must carry these values
    new_index = history[0]["index"]
    expected = pd.Series(
        {
            "modin_spreadsheet_unfiltered_index": 4,
            "A": 1,
            "C": 1,
            "D": 3,
            "Date": pd.Timestamp("2013-01-02 00:00:00"),
            "E": "bar",
            "F": "fox",
        }
    )
    column_order = widget._df.loc[new_index].index
    assert (widget._df.loc[new_index] == expected[column_order]).all()
def test_astype():
    """DataFrame.astype should match pandas for scalar, dict and Series dtype specs."""
    base = pandas.DataFrame(test_data["int_data"])[
        ["col1", "index", "col3", "col4"]
    ]
    md_df = pd.DataFrame(base.values, index=base.index, columns=base.columns)
    pd_df = pandas.DataFrame(base.values, index=base.index, columns=base.columns)

    # whole-frame casts to a variety of dtypes
    for dtype in (np.int32, np.float64, str, "category"):
        df_equals(md_df.astype(dtype), pd_df.astype(dtype))

    # per-column dtype mapping
    dtype_map = {"col1": np.int32, "index": np.int64, "col3": str}
    df_equals(md_df.astype(dtype_map), pd_df.astype(dtype_map))

    # Ignore lint because this is testing bad input
    bad_dtype_map = {"index": np.int32, "index": np.int64, "index": str}  # noqa F601
    df_equals(md_df.astype(bad_dtype_map), pd_df.astype(bad_dtype_map))

    # a one-cell frame built via chained assignment
    md_df = pd.DataFrame(index=["row1"], columns=["col1"])
    md_df["col1"]["row1"] = 11
    pd_df = pandas.DataFrame(index=["row1"], columns=["col1"])
    pd_df["col1"]["row1"] = 11
    df_equals(md_df.astype(int), pd_df.astype(int))

    # casting a column that does not exist must raise
    with pytest.raises(KeyError):
        md_df.astype({"not_exists": np.uint8})

    # The dtypes series must have a unique index.
    eval_general(
        md_df,
        pd_df,
        lambda df: df.astype(
            pd.Series([str, str], index=["col1", "col1"])
            if isinstance(df, pd.DataFrame)
            else pandas.Series([str, str], index=["col1", "col1"])
        ),
    )
def applyier(df):
    """Assign ``value`` into ``df[new_col_id]``, matching the Series type to ``df``."""
    if convert_to_series:
        if isinstance(df, pandas.DataFrame):
            payload = pandas.Series(value)
        else:
            payload = pd.Series(value)
    else:
        payload = value
    df[new_col_id] = payload
    return df
def test_concat_series_only():
    """Concatenating a Series with itself should match pandas."""
    values = list(range(1000))
    md_ser = pd.Series(values)
    pd_ser = pandas.Series(values)
    df_equals(
        pd.concat([md_ser, md_ser]),
        pandas.concat([pd_ser, pd_ser]),
    )
def test_asfreq():
    """DataFrame.asfreq currently defaults to pandas, which is surfaced as a warning."""
    idx = pd.date_range("1/1/2000", periods=4, freq="T")
    frame = pd.DataFrame({"s": pd.Series([0.0, None, 2.0, 3.0], index=idx)})
    # We are only testing that this defaults to pandas, so we will just check for
    # the warning
    with pytest.warns(UserWarning):
        frame.asfreq(freq="30S")
def test___setitem__with_mismatched_partitions():
    """Assigning a new column whose partitioning differs from the frame's.

    A wide CSV is round-tripped through read_csv so the modin frame has many
    column partitions, then a fresh Series is set as a new column.
    """
    import os

    fname = "200kx99.csv"
    np.savetxt(
        fname, np.random.randint(0, 100, size=(200_000, 99)), delimiter=","
    )
    try:
        modin_df = pd.read_csv(fname)
        pandas_df = pandas.read_csv(fname)
        modin_df["new"] = pd.Series(list(range(len(modin_df))))
        pandas_df["new"] = pandas.Series(list(range(len(pandas_df))))
        df_equals(modin_df, pandas_df)
    finally:
        # the original test leaked the temporary CSV; always clean it up
        os.remove(fname)
def test_constructor(data):
    """DataFrame construction from raw data and from a dict of Series matches pandas."""
    pd_df = pandas.DataFrame(data)
    md_df = pd.DataFrame(data)
    df_equals(pd_df, md_df)

    # construction from a dict of Series
    pd_df = pandas.DataFrame(
        {key: pandas.Series(val) for key, val in data.items()}
    )
    md_df = pd.DataFrame({key: pd.Series(val) for key, val in data.items()})
    df_equals(pd_df, md_df)
def test_frame_fillna_limit(data, limit):
    """fillna with a limit should match pandas for pad/backfill and value fills."""
    pd_df = pandas.DataFrame(data)

    # shuffled replacement values: a Series/dict keyed by column, and a full frame
    rep_pd_series = pd_df.columns.to_series().sample(frac=1)
    rep_dict = rep_pd_series.to_dict()
    rep_pd_frame = pandas.DataFrame(
        {col: pd_df.index.to_series() for col in pd_df.columns},
        index=pd_df.index,
    ).sample(frac=1)
    rep_md_series = pd.Series(rep_pd_series)
    rep_md_frame = pd.DataFrame(rep_pd_frame)

    index = pd_df.index
    result = pd_df[:2].reindex(index)
    md_df = pd.DataFrame(result)

    # normalize fractional / negative limits into an absolute row count
    if isinstance(limit, float):
        limit = int(len(md_df) * limit)
    if limit is not None and limit < 0:
        limit = len(md_df) + limit

    # forward fill plus each kind of replacement value
    df_equals(
        md_df.fillna(method="pad", limit=limit),
        result.fillna(method="pad", limit=limit),
    )
    for md_value, pd_value in (
        (rep_dict, rep_dict),
        (rep_md_series, rep_pd_series),
        (rep_md_frame, rep_pd_frame),
    ):
        df_equals(
            md_df.fillna(md_value, limit=limit),
            result.fillna(pd_value, limit=limit),
        )

    # backward fill over the tail slice, same replacement values
    result = pd_df[-2:].reindex(index)
    md_df = pd.DataFrame(result)
    df_equals(
        md_df.fillna(method="backfill", limit=limit),
        result.fillna(method="backfill", limit=limit),
    )
    for md_value, pd_value in (
        (rep_dict, rep_dict),
        (rep_md_series, rep_pd_series),
        (rep_md_frame, rep_pd_frame),
    ):
        df_equals(
            md_df.fillna(md_value, limit=limit),
            result.fillna(pd_value, limit=limit),
        )
def get_test_data():
    """Return a column->value mapping covering a mix of dtypes."""
    return {
        "A": 1.0,  # scalar float, broadcast over the index
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "foo", "bar"]),
        "F": ["foo", "bar", "buzz", "fox"],
    }
def test_modin_series_errors_on_missing_transformations(self):
    """Unimplemented swifter transformations must raise NotImplementedError."""
    LOG.info("test_modin_series_errors_on_missing_transformations")
    md = self.modinSetUp()
    series = md.Series()
    # rolling/resample are not implemented for modin-backed swifter series
    for unsupported in ("rolling", "resample"):
        with self.assertRaises(NotImplementedError):
            getattr(series.swifter, unsupported)(1)
def test_concat_with_empty_frame():
    """Concatenating an empty frame with a Series should match pandas."""
    md_empty = pd.DataFrame()
    pd_empty = pandas.DataFrame()
    md_row = pd.Series({0: "a", 1: "b"})
    pd_row = pandas.Series({0: "a", 1: "b"})
    df_equals(
        pd.concat([md_empty, md_row]),
        pandas.concat([pd_empty, pd_row]),
    )
def calculateAuthorRanking(stars_list, authors_loves, authors_views, nFollowers, shared):
    """Score authors as (stars + followers + loves + views) / shared.

    NaN and +/-inf results (e.g. from division by zero) are replaced with 0.
    Returns a plain Python list.
    """
    import modin.pandas as pd
    import numpy as np

    stars = pd.Series(stars_list)
    loves = pd.Series(authors_loves)
    views = pd.Series(authors_views)
    followers = pd.Series(nFollowers)
    shares = pd.Series(shared)

    ranking = (stars + followers + loves + views) / shares
    # division by zero / missing data must not leak into the output
    ranking = ranking.fillna(0)
    ranking = ranking.replace([np.inf, -np.inf], 0)
    return ranking.tolist()
def test_get_dummies():
    """get_dummies: frame inputs match pandas; Series/scalar paths warn; sparse raises."""
    s = pd.Series(list("abca"))
    with pytest.warns(UserWarning):
        pd.get_dummies(s)

    s1 = ["a", "b", np.nan]
    with pytest.warns(UserWarning):
        pd.get_dummies(s1)
    with pytest.warns(UserWarning):
        pd.get_dummies(s1, dummy_na=True)

    data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]}
    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)

    # frame input with explicit prefixes
    md_result = pd.get_dummies(md_df, prefix=["col1", "col2"])
    pd_result = pandas.get_dummies(pd_df, prefix=["col1", "col2"])
    df_equals(md_result, pd_result)
    assert md_result._to_pandas().columns.equals(pd_result.columns)
    assert md_result.shape == pd_result.shape

    # single-column frame built from a Series
    md_result = pd.get_dummies(pd.DataFrame(pd.Series(list("abcdeabac"))))
    pd_result = pandas.get_dummies(
        pandas.DataFrame(pandas.Series(list("abcdeabac")))
    )
    df_equals(md_result, pd_result)
    assert md_result._to_pandas().columns.equals(pd_result.columns)
    assert md_result.shape == pd_result.shape

    # sparse output is not implemented
    with pytest.raises(NotImplementedError):
        pd.get_dummies(md_df, prefix=["col1", "col2"], sparse=True)

    # remaining Series/scalar call paths default to pandas and warn
    with pytest.warns(UserWarning):
        pd.get_dummies(pd.Series(list("abcaa")))
    with pytest.warns(UserWarning):
        pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)
    with pytest.warns(UserWarning):
        pd.get_dummies(pd.Series(list("abc")), dtype=float)
    with pytest.warns(UserWarning):
        pd.get_dummies(1)
def test_dot(data):
    """DataFrame.dot should match pandas for array/Series/frame operands and shapes."""
    md_df = pd.DataFrame(data)
    pd_df = pandas.DataFrame(data)
    n_cols = len(md_df.columns)

    # ndarray operand
    vec = np.arange(n_cols)
    df_equals(md_df.dot(vec), pd_df.dot(vec))

    # mismatched dimensions must raise
    with pytest.raises(ValueError):
        md_df.dot(np.arange(n_cols + 10))

    # Series operand aligned on the columns
    md_ser = pd.Series(np.arange(n_cols), index=md_df.columns)
    pd_ser = pandas.Series(np.arange(n_cols), index=pd_df.columns)
    df_equals(md_df.dot(md_ser), pd_df.dot(pd_ser))

    # DataFrame operand
    df_equals(md_df.dot(md_df.T), pd_df.dot(pd_df.T))

    # Series whose index does not line up with the columns must raise
    with pytest.raises(ValueError):
        md_df.dot(pd.Series(np.arange(n_cols)))

    # left frame (n x 1) against right frame (1 x n)
    md_df = pd.DataFrame(md_ser)
    pd_df = pandas.DataFrame(pd_ser)
    df_equals(md_df.dot(md_df.T), pd_df.dot(pd_df.T))

    # left frame (1 x 1) against right frame (1 x n)
    df_equals(
        pd.DataFrame([1]).dot(md_df.T), pandas.DataFrame([1]).dot(pd_df.T)
    )
def test_aligning_blocks():
    # Modin frames with the same number of rows but different blocks
    # (partition.list_of_blocks) used to break; see #2322 for details.
    frame = pd.DataFrame(["-22\n"] * 162)
    frame = frame.iloc[2:, :]
    frame.reset_index(drop=True, inplace=True)
    frame["T"] = pd.Series(["24.67\n"] * 145)  # see #2322 for details
    repr(frame)
def test_to_numeric(data, errors, downcast):
    """to_numeric should match pandas across errors/downcast parameter combinations."""
    md_series = pd.Series(data)
    pd_series = pandas.Series(data)
    df_equals(
        pd.to_numeric(md_series, errors=errors, downcast=downcast),
        pandas.to_numeric(pd_series, errors=errors, downcast=downcast),
    )
def test_notnull(data):
    """pd.notnull / pd.isna should match pandas for frame, Series and scalar inputs."""
    pd_df = pandas.DataFrame(data)
    md_df = pd.DataFrame(data)
    df_equals(pd.notnull(md_df), pandas.notnull(pd_df))

    df_equals(
        pd.notnull(pd.Series([1, np.nan, 2])),
        pandas.notnull(pandas.Series([1, np.nan, 2])),
    )

    assert pd.isna(np.nan) == pandas.isna(np.nan)
def test_series_dt_index(closed):
    """Time-based rolling over a datetime-indexed Series should match pandas."""
    idx = pandas.date_range("1/1/2000", periods=12, freq="T")
    pd_series = pandas.Series(range(12), index=idx)
    md_series = pd.Series(range(12), index=idx)

    pd_rolled = pd_series.rolling("3s", closed=closed)
    md_rolled = md_series.rolling("3s", closed=closed)

    df_equals(md_rolled.count(), pd_rolled.count())
    df_equals(md_rolled.skew(), pd_rolled.skew())
    df_equals(
        md_rolled.apply(np.sum, raw=True), pd_rolled.apply(np.sum, raw=True)
    )
    df_equals(md_rolled.aggregate(np.sum), pd_rolled.aggregate(np.sum))
    df_equals(md_rolled.quantile(0.1), pd_rolled.quantile(0.1))