def testFetchDataFrame(self, *_):
    from mars.dataframe.expressions.datasource.dataframe import from_pandas as from_pandas_df
    from mars.dataframe.expressions.arithmetic import add

    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        session = cluster.session

        data1 = pd.DataFrame(np.random.rand(10, 10))
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        df2 = from_pandas_df(data2, chunk_size=6)
        df3 = add(df1, df2)

        r1 = session.run(df3, compose=False, timeout=_exec_timeout)
        r2 = session.fetch(df3)
        pd.testing.assert_frame_equal(r1, r2)

        data4 = pd.DataFrame(np.random.rand(10, 10))
        df4 = from_pandas_df(data4, chunk_size=6)
        df5 = add(df3, df4)

        r1 = session.run(df5, compose=False, timeout=_exec_timeout)
        r2 = session.fetch(df5)
        pd.testing.assert_frame_equal(r1, r2)

def testFromPandasDataFrameExecution(self):
    pdf = pd.DataFrame(np.random.rand(20, 30),
                       index=[np.arange(20), np.arange(20, 0, -1)])
    df = from_pandas_df(pdf, chunk_size=(13, 21))

    result = self.executor.execute_dataframe(df, concat=True)[0]
    pd.testing.assert_frame_equal(pdf, result)

def testChunkSerialize(self):
    data = pd.DataFrame(np.random.rand(10, 10),
                        index=np.random.randint(-100, 100, size=(10,)),
                        columns=[np.random.bytes(10) for _ in range(10)])
    df = from_pandas_df(data).tiles()

    # pb
    chunk = df.chunks[0]
    serials = self._pb_serial(chunk)
    op, pb = serials[chunk.op, chunk.data]

    self.assertEqual(tuple(pb.index), chunk.index)
    self.assertEqual(pb.key, chunk.key)
    self.assertEqual(tuple(pb.shape), chunk.shape)
    self.assertEqual(int(op.type.split('.', 1)[1]), OperandDef.DATAFRAME_DATA_SOURCE)

    chunk2 = self._pb_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(), chunk.index_value.to_pandas())
    pd.testing.assert_index_equal(chunk2.columns.to_pandas(), chunk.columns.to_pandas())

    # json
    chunk = df.chunks[0]
    serials = self._json_serial(chunk)

    chunk2 = self._json_deserial(serials)[chunk.data]

    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(), chunk.index_value.to_pandas())
    pd.testing.assert_index_equal(chunk2.columns.to_pandas(), chunk.columns.to_pandas())

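# A hypothetical helper (not part of the original suite), sketching how the duplicated
# pb/json round-trip assertions in testChunkSerialize above could be shared. It only
# reuses the serialization mix-in helpers already used there
# (_pb_serial/_pb_deserial and _json_serial/_json_deserial).
def _check_chunk_roundtrip(self, chunk, serial_fn, deserial_fn):
    serials = serial_fn(chunk)
    chunk2 = deserial_fn(serials)[chunk.data]

    # the round-tripped chunk should preserve index, key, shape and pandas metadata
    self.assertEqual(chunk.index, chunk2.index)
    self.assertEqual(chunk.key, chunk2.key)
    self.assertEqual(chunk.shape, chunk2.shape)
    pd.testing.assert_index_equal(chunk2.index_value.to_pandas(), chunk.index_value.to_pandas())
    pd.testing.assert_index_equal(chunk2.columns.to_pandas(), chunk.columns.to_pandas())

# usage sketch:
#   self._check_chunk_roundtrip(df.chunks[0], self._pb_serial, self._pb_deserial)
#   self._check_chunk_roundtrip(df.chunks[0], self._json_serial, self._json_deserial)
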
def testDataFrameGraphSerialize(self):
    df = from_pandas_df(pd.DataFrame(np.random.rand(10, 10),
                                     columns=[np.random.bytes(10) for _ in range(10)]))
    graph = df.build_graph(tiled=False)

    pb = graph.to_pb()
    graph2 = DAG.from_pb(pb)
    self.assertEqual(len(graph), len(graph2))
    t = next(iter(graph))
    t2 = next(iter(graph2))
    self.assertTrue(t2.op.outputs[0], ReferenceType)  # make sure outputs are all weak reference
    self.assertBaseEqual(t.op, t2.op)
    self.assertEqual(t.shape, t2.shape)
    self.assertEqual(sorted(i.key for i in t.inputs), sorted(i.key for i in t2.inputs))
    pd.testing.assert_index_equal(t2.index_value.to_pandas(), t.index_value.to_pandas())
    pd.testing.assert_index_equal(t2.columns.to_pandas(), t.columns.to_pandas())

    jsn = graph.to_json()
    graph2 = DAG.from_json(jsn)
    self.assertEqual(len(graph), len(graph2))
    t = next(iter(graph))
    t2 = next(iter(graph2))
    self.assertTrue(t2.op.outputs[0], ReferenceType)  # make sure outputs are all weak reference
    self.assertBaseEqual(t.op, t2.op)
    self.assertEqual(t.shape, t2.shape)
    self.assertEqual(sorted(i.key for i in t.inputs), sorted(i.key for i in t2.inputs))
    pd.testing.assert_index_equal(t2.index_value.to_pandas(), t.index_value.to_pandas())
    pd.testing.assert_index_equal(t2.columns.to_pandas(), t.columns.to_pandas())

    # test graph with tiled DataFrame
    t2 = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)), chunk_size=(5, 4)).tiles()
    graph = DAG()
    graph.add_node(t2)

    pb = graph.to_pb()
    graph2 = DAG.from_pb(pb)
    self.assertEqual(len(graph), len(graph2))
    chunks = next(iter(graph2)).chunks
    self.assertEqual(len(chunks), 6)
    self.assertIsInstance(chunks[0], DataFrameChunk)
    self.assertEqual(chunks[0].index, t2.chunks[0].index)
    self.assertBaseEqual(chunks[0].op, t2.chunks[0].op)
    pd.testing.assert_index_equal(chunks[0].index_value.to_pandas(),
                                  t2.chunks[0].index_value.to_pandas())
    pd.testing.assert_index_equal(chunks[0].columns.to_pandas(),
                                  t2.chunks[0].columns.to_pandas())

    jsn = graph.to_json()
    graph2 = DAG.from_json(jsn)
    self.assertEqual(len(graph), len(graph2))
    chunks = next(iter(graph2)).chunks
    self.assertEqual(len(chunks), 6)
    self.assertIsInstance(chunks[0], DataFrameChunk)
    self.assertEqual(chunks[0].index, t2.chunks[0].index)
    self.assertBaseEqual(chunks[0].op, t2.chunks[0].op)
    pd.testing.assert_index_equal(chunks[0].index_value.to_pandas(),
                                  t2.chunks[0].index_value.to_pandas())
    pd.testing.assert_index_equal(chunks[0].columns.to_pandas(),
                                  t2.chunks[0].columns.to_pandas())

def testFromPandasDataFrame(self):
    data = pd.DataFrame(np.random.rand(10, 10), columns=['c' + str(i) for i in range(10)])
    df = from_pandas_df(data, chunk_size=4)

    pd.testing.assert_series_equal(df.op.dtypes, data.dtypes)
    self.assertIsInstance(df.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(df.index_value._index_value._slice, slice(0, 10, 1))
    self.assertTrue(df.index_value.is_monotonic_increasing)
    self.assertFalse(df.index_value.is_monotonic_decreasing)
    self.assertTrue(df.index_value.is_unique)
    self.assertEqual(df.index_value.min_val, 0)
    self.assertEqual(df.index_value.max_val, 9)
    np.testing.assert_equal(df.columns._index_value._data, data.columns.values)

    df.tiles()

    self.assertEqual(len(df.chunks), 9)
    pd.testing.assert_frame_equal(df.chunks[0].op.data, df.op.data.iloc[:4, :4])
    self.assertEqual(df.chunks[0].index_value._index_value._slice, slice(0, 4, 1))
    self.assertTrue(df.chunks[0].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[0].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[0].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[1].op.data, df.op.data.iloc[:4, 4:8])
    self.assertEqual(df.chunks[1].index_value._index_value._slice, slice(0, 4, 1))
    self.assertTrue(df.chunks[1].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[1].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[1].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[2].op.data, df.op.data.iloc[:4, 8:])
    self.assertEqual(df.chunks[2].index_value._index_value._slice, slice(0, 4, 1))
    self.assertTrue(df.chunks[2].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[2].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[2].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[3].op.data, df.op.data.iloc[4:8, :4])
    self.assertEqual(df.chunks[3].index_value._index_value._slice, slice(4, 8, 1))
    self.assertTrue(df.chunks[3].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[3].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[3].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[4].op.data, df.op.data.iloc[4:8, 4:8])
    self.assertEqual(df.chunks[4].index_value._index_value._slice, slice(4, 8, 1))
    self.assertTrue(df.chunks[4].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[4].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[4].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[5].op.data, df.op.data.iloc[4:8, 8:])
    self.assertEqual(df.chunks[5].index_value._index_value._slice, slice(4, 8, 1))
    self.assertTrue(df.chunks[5].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[5].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[5].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[6].op.data, df.op.data.iloc[8:, :4])
    self.assertEqual(df.chunks[6].index_value._index_value._slice, slice(8, 10, 1))
    self.assertTrue(df.chunks[6].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[6].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[6].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[7].op.data, df.op.data.iloc[8:, 4:8])
    self.assertEqual(df.chunks[7].index_value._index_value._slice, slice(8, 10, 1))
    self.assertTrue(df.chunks[7].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[7].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[7].index_value._index_value._is_unique)
    pd.testing.assert_frame_equal(df.chunks[8].op.data, df.op.data.iloc[8:, 8:])
    self.assertEqual(df.chunks[8].index_value._index_value._slice, slice(8, 10, 1))
    self.assertTrue(df.chunks[8].index_value._index_value._is_monotonic_increasing)
    self.assertFalse(df.chunks[8].index_value._index_value._is_monotonic_decreasing)
    self.assertTrue(df.chunks[8].index_value._index_value._is_unique)

    data2 = data[::2]
    df2 = from_pandas_df(data2, chunk_size=4)

    pd.testing.assert_series_equal(df2.op.dtypes, data2.dtypes)
    self.assertIsInstance(df2.index_value._index_value, IndexValue.RangeIndex)
    self.assertEqual(df2.index_value._index_value._slice, slice(0, 10, 2))

    df2.tiles()

    self.assertEqual(len(df2.chunks), 6)
    pd.testing.assert_frame_equal(df2.chunks[0].op.data, df2.op.data.iloc[:4, :4])
    self.assertEqual(df2.chunks[0].index_value._index_value._slice, slice(0, 8, 2))
    pd.testing.assert_frame_equal(df2.chunks[1].op.data, df2.op.data.iloc[:4, 4:8])
    self.assertEqual(df2.chunks[1].index_value._index_value._slice, slice(0, 8, 2))
    pd.testing.assert_frame_equal(df2.chunks[2].op.data, df2.op.data.iloc[:4, 8:])
    self.assertEqual(df2.chunks[2].index_value._index_value._slice, slice(0, 8, 2))
    pd.testing.assert_frame_equal(df2.chunks[3].op.data, df2.op.data.iloc[4:, :4])
    self.assertEqual(df2.chunks[3].index_value._index_value._slice, slice(8, 10, 2))
    pd.testing.assert_frame_equal(df2.chunks[4].op.data, df2.op.data.iloc[4:, 4:8])
    self.assertEqual(df2.chunks[4].index_value._index_value._slice, slice(8, 10, 2))
    pd.testing.assert_frame_equal(df2.chunks[5].op.data, df2.op.data.iloc[4:, 8:])
    self.assertEqual(df2.chunks[5].index_value._index_value._slice, slice(8, 10, 2))

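# A hypothetical helper (not in the original test suite) that condenses the repeated
# per-chunk assertions of testFromPandasDataFrame into a loop. It assumes the row-major
# chunk ordering and the RangeIndex-backed index_value observed above; `step` mirrors
# the step of the source DataFrame's RangeIndex.
def _check_chunk_layout(self, df, row_bounds, col_bounds, step=1):
    n_col_chunks = len(col_bounds) - 1
    for i, (rs, re) in enumerate(zip(row_bounds[:-1], row_bounds[1:])):
        for j, (cs, ce) in enumerate(zip(col_bounds[:-1], col_bounds[1:])):
            chunk = df.chunks[i * n_col_chunks + j]
            # each chunk's data should be the corresponding block of the source DataFrame
            pd.testing.assert_frame_equal(chunk.op.data, df.op.data.iloc[rs:re, cs:ce])
            # and its index_value should carry the matching RangeIndex slice
            self.assertEqual(chunk.index_value._index_value._slice,
                             slice(rs * step, re * step, step))

# usage sketch, reproducing the checks above:
#   self._check_chunk_layout(df, [0, 4, 8, 10], [0, 4, 8, 10])
#   self._check_chunk_layout(df2, [0, 4, 5], [0, 4, 8, 10], step=2)
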
def testMainDataFrameWithoutEtcd(self):
    import pandas as pd
    from mars.dataframe.expressions.datasource.dataframe import from_pandas as from_pandas_df
    from mars.dataframe.expressions.datasource.series import from_pandas as from_pandas_series
    from mars.dataframe.expressions.arithmetic import add

    self.start_processes(etcd=False)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

    # add two DataFrames with default RangeIndex
    data1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = from_pandas_df(data2, chunk_size=6)
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    # add two DataFrames with unordered columns
    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas_df(data1, chunk_size=(10, 5))
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas_df(data2, chunk_size=(10, 6))
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    # add two DataFrames with unordered indexes and columns
    data1 = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas_df(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas_df(data2, chunk_size=6)
    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    # a Series created from pandas
    s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = from_pandas_series(s1)

    graph = series1.build_graph()
    targets = [series1.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, series1.key)
    pd.testing.assert_series_equal(s1, loads(result))

def testEagerMode(self, *_):
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True) as cluster:
        self.assertIsInstance(Session.default_or_local()._sess, LocalClusterSession)

        with option_context({'eager_mode': True}):
            a_data = np.random.rand(10, 10)

            a = mt.tensor(a_data, chunk_size=3)
            np.testing.assert_array_equal(a, a_data)

            r1 = a + 1
            expected1 = a_data + 1
            np.testing.assert_array_equal(r1, expected1)

            r2 = r1.dot(r1)
            expected2 = expected1.dot(expected1)
            np.testing.assert_array_almost_equal(r2, expected2)

        a = mt.ones((10, 10), chunk_size=3)
        with self.assertRaises(ValueError):
            a.fetch()
        r = a.dot(a)
        np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

        with new_session('http://' + cluster._web_endpoint).as_default():
            self.assertIsInstance(Session.default_or_local()._sess, WebSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 3)

            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()
            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

        with new_session('http://' + cluster._web_endpoint).as_default():
            from mars.dataframe.expressions.datasource.dataframe import from_pandas as from_pandas_df
            from mars.dataframe.expressions.datasource.series import from_pandas as from_pandas_series
            from mars.dataframe.expressions.arithmetic import add

            self.assertIsInstance(Session.default_or_local()._sess, WebSession)

            with option_context({'eager_mode': True}):
                data1 = pd.DataFrame(np.random.rand(10, 10),
                                     index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                                     columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                df1 = from_pandas_df(data1, chunk_size=5)
                pd.testing.assert_frame_equal(df1.fetch(), data1)

                data2 = pd.DataFrame(np.random.rand(10, 10),
                                     index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                                     columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                df2 = from_pandas_df(data2, chunk_size=6)
                pd.testing.assert_frame_equal(df2.fetch(), data2)

                df3 = add(df1, df2)
                pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                s1 = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                series1 = from_pandas_series(s1)
                pd.testing.assert_series_equal(series1.fetch(), s1)

                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)