def testEagerMode(self, *_):
    """Check eager-mode execution on a local cluster and through web sessions.

    Three scenarios run against one cluster started with ``web=True``:
    tensor arithmetic on the default session, the same tensor arithmetic
    through a web session (which must have submitted exactly three tasks),
    and DataFrame/Series operations through a second web session (four
    tasks). After each tensor scenario, with eager mode switched off again,
    fetching an unexecuted tensor must raise ``ValueError``.
    """
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        self.assertIsInstance(Session.default_or_local()._sess, ClusterSession)

        # --- tensors on the default session ---
        with option_context({'eager_mode': True}):
            raw = np.random.rand(10, 10)
            tensor = mt.tensor(raw, chunk_size=3)
            np.testing.assert_array_equal(tensor, raw)

            plus_one = tensor + 1
            raw_plus_one = raw + 1
            np.testing.assert_array_equal(plus_one, raw_plus_one)

            squared = plus_one.dot(plus_one)
            raw_squared = raw_plus_one.dot(raw_plus_one)
            np.testing.assert_array_almost_equal(squared, raw_squared)

        # Eager mode is off again: nothing has been executed for this tensor.
        tensor = mt.ones((10, 10), chunk_size=3)
        with self.assertRaises(ValueError):
            tensor.fetch()

        product = tensor.dot(tensor)
        np.testing.assert_array_equal(product.to_numpy(),
                                      np.ones((10, 10)) * 10)

        web_url = 'http://' + cluster._web_endpoint

        # --- tensors through a web session ---
        with new_session(web_url).as_default():
            self.assertIsInstance(Session.default_or_local()._sess, WebSession)

            with option_context({'eager_mode': True}):
                raw = np.random.rand(10, 10)
                tensor = mt.tensor(raw, chunk_size=3)
                np.testing.assert_array_equal(tensor, raw)

                plus_one = tensor + 1
                raw_plus_one = raw + 1
                np.testing.assert_array_equal(plus_one, raw_plus_one)

                squared = plus_one.dot(plus_one)
                raw_squared = raw_plus_one.dot(raw_plus_one)
                np.testing.assert_array_almost_equal(squared, raw_squared)

                # One task per eagerly evaluated tensor above.
                current_session = Session.default_or_local()._sess
                self.assertEqual(current_session.get_task_count(), 3)

            tensor = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                tensor.fetch()

            product = tensor.dot(tensor)
            np.testing.assert_array_equal(product.to_numpy(),
                                          np.ones((10, 10)) * 10)

        # --- DataFrame / Series through another web session ---
        with new_session(web_url).as_default():
            from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
            from mars.dataframe.datasource.series import from_pandas as from_pandas_series
            from mars.dataframe.arithmetic import add

            self.assertIsInstance(Session.default_or_local()._sess, WebSession)

            with option_context({'eager_mode': True}):
                left_raw = pd.DataFrame(
                    np.random.rand(10, 10),
                    index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                    columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                left = from_pandas_df(left_raw, chunk_size=5)
                pd.testing.assert_frame_equal(left.fetch(), left_raw)

                right_raw = pd.DataFrame(
                    np.random.rand(10, 10),
                    index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                    columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                right = from_pandas_df(right_raw, chunk_size=6)
                pd.testing.assert_frame_equal(right.fetch(), right_raw)

                total = add(left, right)
                pd.testing.assert_frame_equal(total.fetch(),
                                              left_raw + right_raw)

                series_raw = pd.Series(
                    np.random.rand(10),
                    index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                series = from_pandas_series(series_raw)
                pd.testing.assert_series_equal(series.fetch(), series_raw)

                # Two frames, their sum and one series: four tasks.
                current_session = Session.default_or_local()._sess
                self.assertEqual(current_session.get_task_count(), 4)
def testDecideChunks(self):
    """Check that ``decide_chunk_sizes`` yields integral splits covering the shape.

    For two values of ``options.tensor.chunk_store_limit`` and several
    ``chunk_size`` specifications (``None``, a per-axis dict, tuples), every
    split size returned must be an ``Integral`` and the sizes along each axis
    must add up to the corresponding extent of ``shape``.
    """
    shape = (10, 5)
    memory_usage = pd.Series([8, 22.2, 4, 2, 11.2], index=list('abcde'))

    # (chunk_store_limit, chunk_size arguments exercised under that limit)
    scenarios = [
        (64, [None, {0: 4}, (2, 3), (10, 3)]),
        (20, [None, {1: 3}, (2, 3), (10, 3)]),
    ]

    # option_context() restricts the option changes below to this block.
    with option_context() as options:
        for store_limit, chunk_sizes in scenarios:
            options.tensor.chunk_store_limit = store_limit
            for chunk_size in chunk_sizes:
                nsplit = decide_chunk_sizes(shape, chunk_size, memory_usage)
                label = 'chunk_store_limit=%r, chunk_size=%r' % (
                    store_limit, chunk_size)

                # Plain loops rather than a list comprehension used only for
                # its side effects; the message pinpoints a failing scenario.
                for axis_splits in nsplit:
                    for size in axis_splits:
                        self.assertIsInstance(size, Integral, msg=label)
                self.assertEqual(
                    shape, tuple(sum(ns) for ns in nsplit), msg=label)
def wrapped2():
    # Nested helper: `wrapped`, `self`, `options` and `utils` all come from an
    # enclosing scope that is not visible in this excerpt.
    # NOTE(review): the original indentation was lost; the `with` block is
    # assumed to belong to this function body -- confirm against the full file.
    wrapped()
    with option_context({'eager_mode': True}):
        # The option value itself reads as enabled inside the context ...
        self.assertTrue(options.eager_mode)
        # ... yet utils.is_eager_mode() is expected to report False here.
        # NOTE(review): presumably because this helper runs in a mode that
        # suppresses eager execution (e.g. via a decorator on the line above,
        # which is not visible) -- confirm.
        self.assertFalse(utils.is_eager_mode())
def test_arrow_list_functions():
    """Exercise the ``ArrowListArray`` API with the pandas-only option off and on.

    Covers indexing, item assignment, ``to_numpy``, ``fillna``, ``astype``,
    size reporting, ``isna``, ``take``, ``shift``, ``all``/``any``, ``repr``
    and concatenation of empty arrays, comparing against the plain object
    ndarray (or a pandas Series built from it).
    """
    raw_lists = np.array(
        [['a, bc'], ['de'], ['e', 'ee'], ['中文', '中文2']], dtype=object)
    raw_lists_with_na = raw_lists.copy()
    raw_lists_with_na[1] = None

    for pandas_only in (False, True):
        with option_context({'dataframe.arrow_array.pandas_only': pandas_only}):
            list_array = ArrowListArray(raw_lists)
            list_array_with_na = ArrowListArray(raw_lists_with_na)

            # scalar indexing
            assert list_array[1] == raw_lists[1]
            assert list(list_array[-1]) == raw_lists[-1]

            # slice indexing
            head = list_array[:2].to_numpy()
            np.testing.assert_array_equal(head, raw_lists[:2])

            # item assignment: list-likes and None are accepted ...
            mutable_array = list_array.copy()
            mutable_raw = raw_lists.copy()
            for new_value in [['ss'], pd.Series(['ss'])]:
                mutable_array[0] = new_value
                mutable_raw[0] = ['ss']
                np.testing.assert_array_equal(
                    mutable_array.to_numpy(), mutable_raw)
            mutable_array[0] = None
            mutable_raw[0] = None
            np.testing.assert_array_equal(mutable_array.to_numpy(), mutable_raw)
            # ... whereas a bare string is not list-like and must be rejected
            with pytest.raises(ValueError):
                mutable_array[0] = 'ss'

            # to_numpy
            np.testing.assert_array_equal(list_array.to_numpy(), raw_lists)
            np.testing.assert_array_equal(
                list_array.to_numpy(copy=True), raw_lists)
            np.testing.assert_array_equal(
                list_array_with_na.to_numpy(na_value=1),
                pd.Series(raw_lists_with_na).fillna(1).to_numpy())

            # fillna (only checked when the pandas-only option is off)
            if not pandas_only:
                filled = list_array_with_na.fillna(raw_lists[1])
                np.testing.assert_array_equal(filled.to_numpy(), raw_lists)

            # astype: non-list targets are refused
            with pytest.raises(TypeError):
                list_array.astype(np.int64)
            with pytest.raises(TypeError):
                list_array.astype(ArrowListDtype(np.int64))

            int_lists = ArrowListArray([[1, 2], [3]])
            as_strings = np.array([['1', '2'], ['3']], dtype=object)
            np.testing.assert_array_equal(
                int_lists.astype(ArrowListDtype(str)), as_strings)
            np.testing.assert_array_equal(
                int_lists.astype(ArrowListDtype(int_lists.dtype)), int_lists)
            np.testing.assert_array_equal(
                int_lists.astype(ArrowListDtype(int_lists.dtype), copy=False),
                int_lists)

            # nbytes
            object_series_bytes = pd.Series(raw_lists).memory_usage(deep=True)
            assert list_array.nbytes < object_series_bytes

            # memory_usage
            if not pandas_only:
                assert list_array.memory_usage(deep=True) == list_array.nbytes

            # isna
            np.testing.assert_array_equal(
                list_array_with_na.isna(), pd.Series(raw_lists_with_na).isna())

            # take
            picked = list(list_array.take([1, 2, -1]))
            picked_expected = list(pd.Series(raw_lists).take([1, 2, -1]))
            assert picked == picked_expected

            # shift
            shifted = list(list_array.shift(2, fill_value=['aa']))
            assert shifted == [['aa']] * 2 + raw_lists[:-2].tolist()

            # all / any: the reference object depends on a module-level flag
            reference = pd.array(raw_lists) if _use_bool_any_all else raw_lists
            assert list_array.all() == reference.all()
            assert list_array.any() == reference.any()

            # repr
            assert 'ArrowListArray' in repr(list_array)

            # concatenating empty arrays
            empty_array = ArrowListArray(
                pa.chunked_array([], type=pa.list_(pa.string())))
            joined = ArrowListArray._concat_same_type([empty_array, empty_array])
            if not pandas_only:
                assert len(joined._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(
                pd.Series(empty_array), pd.Series(joined))
def test_arrow_string_array_functions():
    """Compare ``ArrowStringArray`` with ``pd.arrays.StringArray`` operation by operation.

    The pandas ``StringArray`` serves as the reference answer. Every check is
    run twice, with the ``dataframe.arrow_array.pandas_only`` option off and on.
    """
    raw_strings = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    # the pandas string array provides the expected answers
    reference = pd.arrays.StringArray(raw_strings)
    arrow_with_na = ArrowStringArray(['abc', None, 'eee', '中文'])
    reference_with_na = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    for pandas_only in (False, True):
        with option_context({'dataframe.arrow_array.pandas_only': pandas_only}):
            arrow = ArrowStringArray(raw_strings)

            # scalar indexing
            assert arrow[1] == reference[1]
            assert arrow[-1] == reference[-1]

            # slice indexing
            for window in (slice(None, 2), slice(1, -1), slice(None, None, 2)):
                assert list(arrow[window]) == list(reference[window])

            # boolean mask indexing
            mask = np.array([len(item) > 2 for item in raw_strings])
            assert list(arrow[mask]) == list(reference[mask])

            # fancy indexing with lists and an ndarray
            for positions in ([3, 1, 2],
                              [3, -1, 2, -4],
                              np.array([3, -1, 2, -4])):
                assert list(arrow[positions]) == list(reference[positions])

            # item assignment
            arrow_mut = arrow.copy()
            reference_mut = reference.copy()

            arrow_mut[0] = 'ss'
            reference_mut[0] = 'ss'
            assert list(arrow_mut) == list(reference_mut)

            arrow_mut[1:3] = ['ss1', 'ss2']
            reference_mut[1:3] = ['ss1', 'ss2']
            assert list(arrow_mut) == list(reference_mut)

            arrow_mut[1:3] = arrow_mut[2:4]
            reference_mut[1:3] = reference_mut[2:4]
            assert list(arrow_mut) == list(reference_mut)

            arrow_mut[2:] = pd.Series(['ss3', 'ss4'])
            reference_mut[2:] = pd.Series(['ss3', 'ss4'])
            assert list(arrow_mut) == list(reference_mut)

            # a list cannot be stored in a single slot
            with pytest.raises(ValueError):
                arrow_mut[0] = ['a', 'b']

            arrow_mut[-1] = None
            reference_mut[-1] = None
            assert list(arrow_mut)[:-1] == list(reference_mut)[:-1]
            assert pd.isna(list(arrow_mut)[-1]) is True

            # non-string values are rejected
            with pytest.raises(ValueError):
                arrow_mut[0] = 2
            with pytest.raises(ValueError):
                arrow_mut[:2] = [1, 2]

            # to_numpy
            np.testing.assert_array_equal(
                arrow.to_numpy(), reference.to_numpy())
            np.testing.assert_array_equal(
                arrow.to_numpy(copy=True), reference.to_numpy(copy=True))
            np.testing.assert_array_equal(
                arrow_with_na.to_numpy(copy=True, na_value='ss'),
                reference_with_na.to_numpy(copy=True, na_value='ss'))

            # fillna
            arrow_filled = arrow_with_na.fillna('filled')
            reference_filled = reference_with_na.fillna('filled')
            assert list(arrow_filled) == list(reference_filled)

            # astype
            arrow_digits = ArrowStringArray(['1', '10', '100'])
            # again the pandas string array provides the expected answers
            reference_digits = pd.arrays.StringArray(
                np.array(['1', '10', '100'], dtype=object))
            np.testing.assert_array_equal(
                arrow_digits.astype(np.int64),
                reference_digits.astype(np.int64))
            np.testing.assert_almost_equal(
                arrow_digits.astype(float), reference_digits.astype(float))
            for copy in (False, True):
                arrow_cast = list(
                    arrow_digits.astype(ArrowStringDtype(), copy=copy))
                reference_cast = list(
                    reference_digits.astype(pd.StringDtype(), copy=copy))
                assert arrow_cast == reference_cast

            # factorize
            arrow_codes, arrow_uniques = arrow.factorize()
            reference_codes, reference_uniques = reference.factorize()
            assert list(arrow_codes) == list(reference_codes)
            assert list(arrow_uniques) == list(reference_uniques)

            # nbytes
            object_bytes = pd.Series(reference.astype(object)).memory_usage(
                deep=True, index=False)
            assert arrow.nbytes < object_bytes

            # memory_usage
            if pandas_only:
                shallow_usage = arrow.memory_usage(deep=False)
                assert shallow_usage == pd.Series(reference).memory_usage(
                    index=False)
            else:
                assert arrow.memory_usage(deep=True) == arrow.nbytes

            # isna, also with the pandas code path forced on a copy
            np.testing.assert_array_equal(
                arrow_with_na.isna(), reference_with_na.isna())
            forced_pandas = arrow_with_na.copy()
            forced_pandas._force_use_pandas = True
            np.testing.assert_array_equal(
                forced_pandas.isna(), reference_with_na.isna())

            # take
            arrow_taken = list(arrow.take([1, 2, -1]))
            assert arrow_taken == list(reference.take([1, 2, -1]))

            arrow_taken = list(
                arrow.take([1, 2, -1], allow_fill=True).fillna('aa'))
            assert arrow_taken == list(
                reference.take([1, 2, -1], allow_fill=True).fillna('aa'))

            arrow_taken = list(
                arrow.take([1, 2, -1], allow_fill=True, fill_value='aa'))
            assert arrow_taken == list(
                reference.take([1, 2, -1], allow_fill=True, fill_value='aa'))

            # shift
            arrow_shifted = list(arrow.shift(2, fill_value='aa'))
            assert arrow_shifted == list(reference.shift(2, fill_value='aa'))

            # value_counts
            arrow_counts = list(arrow.value_counts())
            assert arrow_counts == list(reference.value_counts())
            arrow_counts = list(arrow_with_na.value_counts(dropna=True))
            assert arrow_counts == list(
                reference_with_na.value_counts(dropna=True))

            # all / any
            assert arrow.all() == reference.all()
            assert arrow.any() == reference.any()

            # arithmetic
            assert list(arrow + 's') == list(reference + 's')
            arrow_sum = list((arrow + arrow_with_na).fillna('ss'))
            assert arrow_sum == list(
                (reference + reference_with_na).fillna('ss'))

            # comparison
            np.testing.assert_array_equal(arrow < 's', reference < 's')
            pd.testing.assert_series_equal(
                pd.Series(arrow < arrow_with_na),
                pd.Series(reference < reference_with_na))

            # repr
            assert 'ArrowStringArray' in repr(arrow)

            # concatenating empty arrays
            empty_array = ArrowStringArray(
                pa.chunked_array([], type=pa.string()))
            joined = ArrowStringArray._concat_same_type(
                [empty_array, empty_array])
            if not pandas_only:
                assert len(joined._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(
                pd.Series(empty_array), pd.Series(joined))
def test_euclidean_distances_execution(setup):
    """Compare ``euclidean_distances`` on tensors with the scikit-learn result.

    Dense and sparse inputs are both run through: the plain call, a call with
    caller-supplied norm arguments, the squared variant on float32 data, and
    the single-argument form. A final section repeats the plain call on small
    inputs under two ``chunk_store_limit`` settings.
    """
    dense_x = np.random.rand(30, 10)
    dense_y = np.random.rand(40, 10)
    sparse_x = SparseNDArray(sps.random(30, 10, density=0.5, format='csr'))
    sparse_y = SparseNDArray(sps.random(40, 10, density=0.5, format='csr'))

    input_pairs = [(dense_x, dense_y), (sparse_x, sparse_y)]
    for raw_x, raw_y in input_pairs:
        x = mt.tensor(raw_x, chunk_size=9)
        y = mt.tensor(raw_y, chunk_size=7)

        # plain call
        actual = euclidean_distances(x, y).execute().fetch()
        wanted = sk_euclidean_distances(raw_x, Y=raw_y)
        np.testing.assert_almost_equal(actual, wanted)

        # caller-supplied norm arguments (row sums, shaped as column / row)
        x_norm = x.sum(axis=1)[..., np.newaxis]
        y_norm = y.sum(axis=1)[np.newaxis, ...]
        pending = euclidean_distances(x, y, X_norm_squared=x_norm,
                                      Y_norm_squared=y_norm)
        raw_x_norm = raw_x.sum(axis=1)[..., np.newaxis]
        raw_y_norm = raw_y.sum(axis=1)[np.newaxis, ...]
        actual = pending.execute().fetch()
        wanted = sk_euclidean_distances(raw_x, raw_y,
                                        X_norm_squared=raw_x_norm,
                                        Y_norm_squared=raw_y_norm)
        np.testing.assert_almost_equal(actual, wanted)

        # squared distances on float32 data, checked to 6 decimals
        x_sq = (x**2).astype(np.float32)
        y_sq = (y**2).astype(np.float32)
        pending = euclidean_distances(x_sq, y_sq, squared=True)
        raw_x_sq = (raw_x**2).astype(np.float32)
        raw_y_sq = (raw_y**2).astype(np.float32)
        actual = pending.execute().fetch()
        wanted = sk_euclidean_distances(raw_x_sq, raw_y_sq, squared=True)
        np.testing.assert_almost_equal(actual, wanted, decimal=6)

        # only X given: distances of X against itself
        actual = euclidean_distances(x).execute().fetch()
        wanted = sk_euclidean_distances(raw_x)
        np.testing.assert_almost_equal(actual, wanted)

    # size adjustment under different chunk store limits
    small_a = np.random.rand(12, 4)
    small_b = np.random.rand(18, 4)
    tensor_a = mt.tensor(small_a, chunk_size=4)
    tensor_b = mt.tensor(small_b, chunk_size=6)

    with option_context({'chunk_store_limit': 80}):
        actual = euclidean_distances(tensor_a, tensor_b).execute().fetch()
        wanted = sk_euclidean_distances(small_a, small_b)
        np.testing.assert_almost_equal(actual, wanted)

        actual = euclidean_distances(tensor_b, tensor_a).execute().fetch()
        wanted = sk_euclidean_distances(small_b, small_a)
        np.testing.assert_almost_equal(actual, wanted)

    with option_context({'chunk_store_limit': 20}):
        actual = euclidean_distances(tensor_a, tensor_b).execute().fetch()
        wanted = sk_euclidean_distances(small_a, small_b)
        np.testing.assert_almost_equal(actual, wanted)
def setup():
    """Yield once while the ``show_progress`` option is set to ``False``.

    NOTE(review): presumably registered as a pytest fixture by a decorator
    that is outside this excerpt -- confirm.
    """
    progress_disabled = option_context({'show_progress': False})
    with progress_disabled:
        yield
def testGroupByPruneReadCSV(self):
    """Check column pruning of ``read_csv`` when it feeds a groupby aggregation.

    A small CSV is written to a temporary directory. Each groupby/agg
    pipeline is compared with pandas, and where the optimized tileable is
    inspected, the ``usecols`` of its ``read_csv`` input must contain only
    the aggregated column and the grouping key. With
    ``optimize_tileable_graph`` disabled, ``usecols`` must stay ``None``.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        csv_path = os.path.join(tempdir, 'test.csv')
        raw = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce')
        })
        raw.to_csv(csv_path, index=False)

        # run through the test executor
        mars_df = md.read_csv(csv_path).groupby('c').agg({'a': 'sum'})
        executed = self.executor.execute_dataframe(mars_df)[0]
        wanted = raw.groupby('c').agg({'a': 'sum'})
        pd.testing.assert_frame_equal(executed, wanted)

        # sum of 'a' grouped by 'c': only 'a' and 'c' need to be read
        mars_df = md.read_csv(csv_path).groupby('c').agg({'a': 'sum'})
        wanted = raw.groupby('c').agg({'a': 'sum'})
        pd.testing.assert_frame_equal(mars_df.to_pandas(), wanted)
        pd.testing.assert_frame_equal(mars_df.fetch(), wanted)
        optimized = tileable_optimized[mars_df.data]
        self.assertEqual(optimized.inputs[0].op.usecols, ['a', 'c'])

        # sum of 'b' grouped by 'c': only 'b' and 'c' need to be read
        mars_df = md.read_csv(csv_path).groupby('c').agg({'b': 'sum'})
        wanted = raw.groupby('c').agg({'b': 'sum'})
        pd.testing.assert_frame_equal(mars_df.to_pandas(), wanted)
        pd.testing.assert_frame_equal(mars_df.fetch(), wanted)
        optimized = tileable_optimized[mars_df.data]
        self.assertEqual(optimized.inputs[0].op.usecols, ['b', 'c'])

        # an arithmetic op after the aggregation
        mars_df = md.read_csv(csv_path).groupby('c').agg({'b': 'sum'}) + 1
        wanted = raw.groupby('c').agg({'b': 'sum'}) + 1
        pd.testing.assert_frame_equal(mars_df.to_pandas(), wanted)
        pd.testing.assert_frame_equal(mars_df.fetch(), wanted)

        # an explicit usecols is narrowed further
        mars_df = md.read_csv(
            csv_path, usecols=['a', 'b', 'c']).groupby('c').agg({'b': 'sum'})
        wanted = raw.groupby('c').agg({'b': 'sum'})
        pd.testing.assert_frame_equal(mars_df.to_pandas(), wanted)
        pd.testing.assert_frame_equal(mars_df.fetch(), wanted)
        optimized = tileable_optimized[mars_df.data]
        self.assertEqual(optimized.inputs[0].op.usecols, ['b', 'c'])

        # one read_csv shared by two different aggregations
        source = md.read_csv(csv_path)
        by_c = source.groupby('c').agg({'b': 'sum'})
        by_b = source.groupby('b').agg({'a': 'sum'})
        bundle = ExecutableTuple((by_c, by_b))
        fetched = bundle.execute().fetch()
        wanted_by_c = raw.groupby('c').agg({'b': 'sum'})
        wanted_by_b = raw.groupby('b').agg({'a': 'sum'})
        pd.testing.assert_frame_equal(fetched[0], wanted_by_c)
        pd.testing.assert_frame_equal(fetched[1], wanted_by_b)

        # the read_csv itself is fetched together with an aggregation of it
        source = md.read_csv(csv_path)
        by_c = source.groupby('c').agg({'b': 'sum'})
        bundle = ExecutableTuple((source, by_c))
        fetched = bundle.execute().fetch()
        wanted_by_c = raw.groupby('c').agg({'b': 'sum'})
        pd.testing.assert_frame_equal(fetched[0], raw)
        pd.testing.assert_frame_equal(fetched[1], wanted_by_c)

        # optimization disabled: usecols stays unset
        with option_context({'optimize_tileable_graph': False}):
            mars_df = md.read_csv(csv_path).groupby('c').agg({'b': 'sum'})
            wanted = raw.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(mars_df.to_pandas(), wanted)
            pd.testing.assert_frame_equal(mars_df.fetch(), wanted)

            graph = mars_df.build_graph()
            first_node = list(graph.topological_iter())[0]
            self.assertIsNone(first_node.op.usecols)