示例#1
0
    def testFetchDataFrame(self, *_):
        """Check that ``session.fetch`` returns what ``session.run`` returned.

        Two chunked DataFrame additions are executed on a local cluster; the
        second one reuses the result of the first as an operand.
        """
        from mars.dataframe.expressions.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.expressions.arithmetic import add

        cluster_options = dict(scheduler_n_process=2,
                               worker_n_process=2,
                               shared_memory='20M',
                               web=True)
        with new_cluster(**cluster_options) as cluster:
            session = cluster.session

            # Operands use different chunk sizes (5 vs. 6).
            left = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)),
                                  chunk_size=5)
            right = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)),
                                   chunk_size=6)
            first_sum = add(left, right)

            executed = session.run(first_sum, compose=False,
                                   timeout=_exec_timeout)
            fetched = session.fetch(first_sum)
            pd.testing.assert_frame_equal(executed, fetched)

            # Add a fresh operand on top of the already executed sum.
            extra = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)),
                                   chunk_size=6)
            second_sum = add(first_sum, extra)

            executed = session.run(second_sum, compose=False,
                                   timeout=_exec_timeout)
            fetched = session.fetch(second_sum)
            pd.testing.assert_frame_equal(executed, fetched)
示例#2
0
    def testFromPandasDataFrameExecution(self):
        """Execute a DataFrame created from pandas data whose index is given
        as a list of two arrays, and compare the concatenated result with the
        source frame."""
        values = np.random.rand(20, 30)
        index_arrays = [np.arange(20), np.arange(20, 0, -1)]
        raw = pd.DataFrame(values, index=index_arrays)

        mars_df = from_pandas_df(raw, chunk_size=(13, 21))
        executed = self.executor.execute_dataframe(mars_df, concat=True)

        pd.testing.assert_frame_equal(raw, executed[0])
示例#3
0
    def testChunkSerialize(self):
        """Round-trip the first chunk of a tiled DataFrame through the pb and
        json serializers and compare the restored chunk with the original."""
        raw = pd.DataFrame(np.random.rand(10, 10),
                           index=np.random.randint(-100, 100, size=(10, )),
                           columns=[np.random.bytes(10) for _ in range(10)])
        tiled = from_pandas_df(raw).tiles()

        def check_restored(original, restored):
            # Assertions shared by the pb and json round trips.
            self.assertEqual(original.index, restored.index)
            self.assertEqual(original.key, restored.key)
            self.assertEqual(original.shape, restored.shape)
            pd.testing.assert_index_equal(restored.index_value.to_pandas(),
                                          original.index_value.to_pandas())
            pd.testing.assert_index_equal(restored.columns.to_pandas(),
                                          original.columns.to_pandas())

        # pb
        first_chunk = tiled.chunks[0]
        pb_serials = self._pb_serial(first_chunk)
        op_pb, chunk_pb = pb_serials[first_chunk.op, first_chunk.data]

        # Inspect the serialized form directly before deserializing it.
        self.assertEqual(tuple(chunk_pb.index), first_chunk.index)
        self.assertEqual(chunk_pb.key, first_chunk.key)
        self.assertEqual(tuple(chunk_pb.shape), first_chunk.shape)
        operand_code = int(op_pb.type.split('.', 1)[1])
        self.assertEqual(operand_code, OperandDef.DATAFRAME_DATA_SOURCE)

        pb_restored = self._pb_deserial(pb_serials)[first_chunk.data]
        check_restored(first_chunk, pb_restored)

        # json
        first_chunk = tiled.chunks[0]
        json_serials = self._json_serial(first_chunk)

        json_restored = self._json_deserial(json_serials)[first_chunk.data]
        check_restored(first_chunk, json_restored)
示例#4
0
    def testDataFrameGraphSerialize(self):
        """Round-trip DataFrame graphs through pb and json.

        Covers a graph built with ``build_graph(tiled=False)`` and a graph
        that holds a single tiled DataFrame.
        """
        source_df = from_pandas_df(
            pd.DataFrame(np.random.rand(10, 10),
                         columns=[np.random.bytes(10) for _ in range(10)]))
        untiled_graph = source_df.build_graph(tiled=False)

        def check_untiled(restored_graph):
            self.assertEqual(len(untiled_graph), len(restored_graph))
            expected = next(iter(untiled_graph))
            restored = next(iter(restored_graph))
            # NOTE(review): assertTrue takes (expr, msg), so ReferenceType is
            # only used as the failure message here and this merely checks
            # truthiness. The original comment says the intent is to make
            # sure outputs are all weak references -- confirm how
            # ``op.outputs`` exposes them before tightening this check.
            self.assertTrue(restored.op.outputs[0], ReferenceType)
            self.assertBaseEqual(expected.op, restored.op)
            self.assertEqual(expected.shape, restored.shape)
            self.assertEqual(sorted(i.key for i in expected.inputs),
                             sorted(i.key for i in restored.inputs))
            pd.testing.assert_index_equal(restored.index_value.to_pandas(),
                                          expected.index_value.to_pandas())
            pd.testing.assert_index_equal(restored.columns.to_pandas(),
                                          expected.columns.to_pandas())

        check_untiled(DAG.from_pb(untiled_graph.to_pb()))
        check_untiled(DAG.from_json(untiled_graph.to_json()))

        # test graph with tiled DataFrame
        tiled_df = from_pandas_df(pd.DataFrame(np.random.rand(10, 10)),
                                  chunk_size=(5, 4)).tiles()
        tiled_graph = DAG()
        tiled_graph.add_node(tiled_df)

        def check_tiled(restored_graph):
            self.assertEqual(len(tiled_graph), len(restored_graph))
            restored_chunks = next(iter(restored_graph)).chunks
            self.assertEqual(len(restored_chunks), 6)
            head = restored_chunks[0]
            self.assertIsInstance(head, DataFrameChunk)
            expected_head = tiled_df.chunks[0]
            self.assertEqual(head.index, expected_head.index)
            self.assertBaseEqual(head.op, expected_head.op)
            pd.testing.assert_index_equal(
                head.index_value.to_pandas(),
                expected_head.index_value.to_pandas())
            pd.testing.assert_index_equal(head.columns.to_pandas(),
                                          expected_head.columns.to_pandas())

        check_tiled(DAG.from_pb(tiled_graph.to_pb()))
        check_tiled(DAG.from_json(tiled_graph.to_json()))
示例#5
0
    def testFromPandasDataFrame(self):
        """Check metadata and tiling of DataFrames created from pandas data.

        Two inputs are covered: a 10x10 frame with the default index, and
        every second row of that frame. For each one the test checks the
        dtypes and index metadata of the untiled DataFrame, then the data
        and index slice of every chunk after tiling with ``chunk_size=4``.

        Fixes over the previous version of this test:

        * the dtypes of the second DataFrame are read from ``df2`` (the
          check used to read them from ``df`` again);
        * the index slices of ``df2.chunks[4]`` and ``df2.chunks[5]`` are
          verified (``df2.chunks[3]`` used to be checked three times).
        """
        data = pd.DataFrame(np.random.rand(10, 10),
                            columns=['c' + str(i) for i in range(10)])
        df = from_pandas_df(data, chunk_size=4)

        pd.testing.assert_series_equal(df.op.dtypes, data.dtypes)
        self.assertIsInstance(df.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertEqual(df.index_value._index_value._slice, slice(0, 10, 1))
        self.assertTrue(df.index_value.is_monotonic_increasing)
        self.assertFalse(df.index_value.is_monotonic_decreasing)
        self.assertTrue(df.index_value.is_unique)
        self.assertEqual(df.index_value.min_val, 0)
        self.assertEqual(df.index_value.max_val, 9)
        np.testing.assert_equal(df.columns._index_value._data,
                                data.columns.values)

        df.tiles()

        # Positional bounds of the 3x3 chunk grid; chunks are compared in
        # row-major order (all column blocks of a row block first).
        bounds = [(0, 4), (4, 8), (8, 10)]
        expected_grid = [(rows, cols) for rows in bounds for cols in bounds]

        # Checking the length first guarantees zip() below drops nothing.
        self.assertEqual(len(df.chunks), 9)
        for chunk, ((r0, r1), (c0, c1)) in zip(df.chunks, expected_grid):
            pd.testing.assert_frame_equal(chunk.op.data,
                                          df.op.data.iloc[r0:r1, c0:c1])
            chunk_index = chunk.index_value._index_value
            self.assertEqual(chunk_index._slice, slice(r0, r1, 1))
            self.assertTrue(chunk_index._is_monotonic_increasing)
            self.assertFalse(chunk_index._is_monotonic_decreasing)
            self.assertTrue(chunk_index._is_unique)

        data2 = data[::2]
        df2 = from_pandas_df(data2, chunk_size=4)

        pd.testing.assert_series_equal(df2.op.dtypes, data2.dtypes)
        self.assertIsInstance(df2.index_value._index_value,
                              IndexValue.RangeIndex)
        self.assertEqual(df2.index_value._index_value._slice, slice(0, 10, 2))

        df2.tiles()

        # data2 has 5 rows, so there are two row blocks: positions 0-3 and 4.
        # Each entry pairs the positional row bounds with the index slice
        # expected for every chunk of that row block.
        row_blocks = [((0, 4), slice(0, 8, 2)), ((4, 5), slice(8, 10, 2))]
        expected_grid2 = [(rows, index_slice, cols)
                          for rows, index_slice in row_blocks
                          for cols in bounds]

        self.assertEqual(len(df2.chunks), 6)
        for chunk, ((r0, r1), index_slice, (c0, c1)) in zip(df2.chunks,
                                                            expected_grid2):
            pd.testing.assert_frame_equal(chunk.op.data,
                                          df2.op.data.iloc[r0:r1, c0:c1])
            self.assertEqual(chunk.index_value._index_value._slice,
                             index_slice)
示例#6
0
    def testMainDataFrameWithoutEtcd(self):
        """Submit DataFrame and Series graphs to a session started with
        ``etcd=False`` and compare the fetched results with pandas."""
        import pandas as pd
        from mars.dataframe.expressions.datasource.dataframe import from_pandas as from_pandas_df
        from mars.dataframe.expressions.datasource.series import from_pandas as from_pandas_series
        from mars.dataframe.expressions.arithmetic import add

        self.start_processes(etcd=False)

        session_id = uuid.uuid1()
        actor_client = new_client()

        session_ref = actor_client.actor_ref(self.session_manager_ref.create_session(session_id))

        def run_and_load(tileable):
            # Submit the tileable's graph, wait for it to succeed and return
            # the deserialized result.
            graph = tileable.build_graph()
            targets = [tileable.key]
            graph_key = uuid.uuid1()
            session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                              graph_key, target_tileables=targets)

            state = self.wait_for_termination(actor_client, session_ref, graph_key)
            self.assertEqual(state, GraphState.SUCCEEDED)

            return loads(session_ref.fetch_result(graph_key, tileable.key))

        # Default index and columns on both operands.
        left = pd.DataFrame(np.random.rand(10, 10))
        mars_left = from_pandas_df(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10))
        mars_right = from_pandas_df(right, chunk_size=6)

        pd.testing.assert_frame_equal(left + right,
                                      run_and_load(add(mars_left, mars_right)))

        # Explicit index and column labels; chunked along columns only.
        left = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        mars_left = from_pandas_df(left, chunk_size=(10, 5))
        right = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        mars_right = from_pandas_df(right, chunk_size=(10, 6))

        pd.testing.assert_frame_equal(left + right,
                                      run_and_load(add(mars_left, mars_right)))

        # Explicit index and column labels; chunked on both axes.
        left = pd.DataFrame(np.random.rand(10, 10), index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        mars_left = from_pandas_df(left, chunk_size=5)
        right = pd.DataFrame(np.random.rand(10, 10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        mars_right = from_pandas_df(right, chunk_size=6)

        pd.testing.assert_frame_equal(left + right,
                                      run_and_load(add(mars_left, mars_right)))

        # A Series data source on its own.
        raw_series = pd.Series(np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        mars_series = from_pandas_series(raw_series)

        pd.testing.assert_series_equal(raw_series, run_and_load(mars_series))
示例#7
0
    def testEagerMode(self, *_):
        """Exercise the ``eager_mode`` option against a local cluster.

        The same tensor checks run first with the default session (asserted
        to wrap a ``LocalClusterSession``) and then with a session created
        from the cluster's web endpoint (asserted to wrap a ``WebSession``).
        A second web session repeats the exercise with DataFrames and a
        Series. Each web-session section also asserts the session's task
        count.
        """
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:

            self.assertIsInstance(Session.default_or_local()._sess,
                                  LocalClusterSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                # Inside the eager_mode context tensors are compared with
                # numpy arrays directly, without an explicit execute().
                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

            # Outside the eager_mode context fetch() on a freshly created
            # tensor must raise ValueError (presumably because nothing has
            # been executed yet -- confirm); execute() is called explicitly.
            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            r = a.dot(a)
            np.testing.assert_array_equal(r.execute(), np.ones((10, 10)) * 10)

            # Same tensor checks, now through a session on the web endpoint.
            with new_session('http://' + cluster._web_endpoint).as_default():
                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    a_data = np.random.rand(10, 10)

                    a = mt.tensor(a_data, chunk_size=3)
                    np.testing.assert_array_equal(a, a_data)

                    r1 = a + 1
                    expected1 = a_data + 1
                    np.testing.assert_array_equal(r1, expected1)

                    r2 = r1.dot(r1)
                    expected2 = expected1.dot(expected1)
                    np.testing.assert_array_almost_equal(r2, expected2)

                    # NOTE(review): 3 presumably corresponds to one task per
                    # tensor created above (a, r1, r2) -- confirm.
                    web_session = Session.default_or_local()._sess
                    self.assertEqual(web_session.get_task_count(), 3)

                a = mt.ones((10, 10), chunk_size=3)
                with self.assertRaises(ValueError):
                    a.fetch()

                r = a.dot(a)
                np.testing.assert_array_equal(r.execute(),
                                              np.ones((10, 10)) * 10)

            # DataFrame / Series checks in a new web session.
            with new_session('http://' + cluster._web_endpoint).as_default():
                from mars.dataframe.expressions.datasource.dataframe import from_pandas as from_pandas_df
                from mars.dataframe.expressions.datasource.series import from_pandas as from_pandas_series
                from mars.dataframe.expressions.arithmetic import add

                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    # Here fetch() is called without a preceding execute().
                    data1 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                    df1 = from_pandas_df(data1, chunk_size=5)
                    pd.testing.assert_frame_equal(df1.fetch(), data1)

                    data2 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                    df2 = from_pandas_df(data2, chunk_size=6)
                    pd.testing.assert_frame_equal(df2.fetch(), data2)

                    df3 = add(df1, df2)
                    pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                    s1 = pd.Series(np.random.rand(10),
                                   index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                    series1 = from_pandas_series(s1)
                    pd.testing.assert_series_equal(series1.fetch(), s1)

                # NOTE(review): 4 presumably corresponds to df1, df2, df3 and
                # series1 -- confirm.
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)