示例#1
0
    def testEagerMode(self, *_):
        """Exercise eager mode against a local cluster and its web endpoint.

        A cluster is started with two scheduler processes, two worker
        processes, 20M of shared memory and the web service enabled.  The
        same tensor scenario is run first on the default (cluster) session
        and then on a web session; a second web session repeats the idea
        with DataFrame / Series objects.

        Inside ``option_context({'eager_mode': True})`` the Mars objects are
        handed straight to the numpy / pandas testing helpers without an
        explicit execute step.  Outside the eager block, ``fetch()`` on a
        freshly created tensor is expected to raise ``ValueError``.

        Extra positional arguments are accepted and ignored.
        """
        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:

            # the default session inside the cluster context must wrap a
            # ClusterSession
            self.assertIsInstance(Session.default_or_local()._sess,
                                  ClusterSession)

            with option_context({'eager_mode': True}):
                a_data = np.random.rand(10, 10)

                # tensors are compared with numpy data directly, no explicit
                # execute()/fetch() call is made here
                a = mt.tensor(a_data, chunk_size=3)
                np.testing.assert_array_equal(a, a_data)

                r1 = a + 1
                expected1 = a_data + 1
                np.testing.assert_array_equal(r1, expected1)

                r2 = r1.dot(r1)
                expected2 = expected1.dot(expected1)
                np.testing.assert_array_almost_equal(r2, expected2)

            # eager mode is off again: fetching a tensor that was never
            # executed is expected to raise ValueError
            a = mt.ones((10, 10), chunk_size=3)
            with self.assertRaises(ValueError):
                a.fetch()

            # ones(10, 10) . ones(10, 10) -> every entry equals 10
            r = a.dot(a)
            np.testing.assert_array_equal(r.to_numpy(), np.ones((10, 10)) * 10)

            # repeat the tensor scenario through the web endpoint
            with new_session('http://' + cluster._web_endpoint).as_default():
                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    a_data = np.random.rand(10, 10)

                    a = mt.tensor(a_data, chunk_size=3)
                    np.testing.assert_array_equal(a, a_data)

                    r1 = a + 1
                    expected1 = a_data + 1
                    np.testing.assert_array_equal(r1, expected1)

                    r2 = r1.dot(r1)
                    expected2 = expected1.dot(expected1)
                    np.testing.assert_array_almost_equal(r2, expected2)

                    # NOTE(review): 3 presumably corresponds to one task per
                    # eagerly created tensor (a, r1, r2) -- confirm
                    web_session = Session.default_or_local()._sess
                    self.assertEqual(web_session.get_task_count(), 3)

                a = mt.ones((10, 10), chunk_size=3)
                with self.assertRaises(ValueError):
                    a.fetch()

                r = a.dot(a)
                np.testing.assert_array_equal(r.to_numpy(),
                                              np.ones((10, 10)) * 10)

            # a fresh web session for the DataFrame / Series scenario
            with new_session('http://' + cluster._web_endpoint).as_default():
                from mars.dataframe.datasource.dataframe import from_pandas as from_pandas_df
                from mars.dataframe.datasource.series import from_pandas as from_pandas_series
                from mars.dataframe.arithmetic import add

                self.assertIsInstance(Session.default_or_local()._sess,
                                      WebSession)

                with option_context({'eager_mode': True}):
                    # index and column labels are deliberately unsorted and
                    # only partly overlap between data1 and data2
                    data1 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
                    df1 = from_pandas_df(data1, chunk_size=5)
                    pd.testing.assert_frame_equal(df1.fetch(), data1)

                    data2 = pd.DataFrame(
                        np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
                    df2 = from_pandas_df(data2, chunk_size=6)
                    pd.testing.assert_frame_equal(df2.fetch(), data2)

                    df3 = add(df1, df2)
                    pd.testing.assert_frame_equal(df3.fetch(), data1 + data2)

                    s1 = pd.Series(np.random.rand(10),
                                   index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
                    series1 = from_pandas_series(s1)
                    pd.testing.assert_series_equal(series1.fetch(), s1)

                # NOTE(review): 4 presumably corresponds to df1, df2, df3 and
                # series1 -- confirm
                web_session = Session.default_or_local()._sess
                self.assertEqual(web_session.get_task_count(), 4)
示例#2
0
    def testDecideChunks(self):
        """Check ``decide_chunk_sizes`` for a (10, 5) shape under two limits.

        For every combination of ``options.tensor.chunk_store_limit`` and
        ``chunk_size`` argument listed in ``cases`` the returned ``nsplit``
        must

        * contain only ``Integral`` split sizes, and
        * sum, axis by axis, to the requested shape.

        The checks are table-driven and use plain ``for`` loops; the
        assertions are executed for their side effect only, so no throwaway
        lists are built.
        """
        shape = (10, 5)
        # (chunk_store_limit, chunk_size arguments exercised under that limit)
        cases = [
            (64, [None, {0: 4}, (2, 3), (10, 3)]),
            (20, [None, {1: 3}, (2, 3), (10, 3)]),
        ]

        with option_context() as options:
            memory_usage = pd.Series([8, 22.2, 4, 2, 11.2],
                                     index=list('abcde'))

            for store_limit, chunk_sizes in cases:
                options.tensor.chunk_store_limit = store_limit

                for chunk_size in chunk_sizes:
                    nsplit = decide_chunk_sizes(shape, chunk_size,
                                                memory_usage)

                    # every split size along every axis must be an integer
                    for ns in nsplit:
                        self.assertTrue(
                            all(isinstance(i, Integral) for i in ns))

                    # the splits along each axis must add up to its extent
                    self.assertEqual(shape,
                                     tuple(sum(ns) for ns in nsplit))
示例#3
0
 def wrapped2():
     """Call ``wrapped`` and then check the eager-mode flags.

     With ``eager_mode`` switched on through ``option_context`` the option
     itself must read as true while ``utils.is_eager_mode()`` must still
     report false.
     NOTE(review): presumably a decorator or enclosing scope outside this
     view suppresses eager mode for this call -- confirm.
     """
     wrapped()
     eager_on = {'eager_mode': True}
     with option_context(eager_on):
         self.assertTrue(options.eager_mode)
         self.assertFalse(utils.is_eager_mode())
示例#4
0
def test_arrow_list_functions():
    """Exercise the ``ArrowListArray`` API against plain numpy / pandas data.

    The whole scenario runs twice, once for each value of the
    ``dataframe.arrow_array.pandas_only`` option.  The ``fillna``,
    ``memory_usage`` and chunk-count checks are only performed when
    ``pandas_only`` is false.
    """
    # object array whose elements are Python lists of differing lengths
    lst = np.array([['a, bc'], ['de'], ['e', 'ee'], ['中文', '中文2']],
                   dtype=object)
    # same data with the second element replaced by a missing value
    has_na_lst = lst.copy()
    has_na_lst[1] = None

    for pandas_only in [False, True]:
        with option_context({'dataframe.arrow_array.pandas_only':
                             pandas_only}):
            arrow_array = ArrowListArray(lst)
            has_na_arrow_array = ArrowListArray(has_na_lst)

            # getitem, scalar
            assert arrow_array[1] == lst[1]
            assert list(arrow_array[-1]) == lst[-1]
            # getitem, slice
            np.testing.assert_array_equal(arrow_array[:2].to_numpy(), lst[:2])

            # setitem
            arrow_array2 = arrow_array.copy()
            lst2 = lst.copy()
            # both a plain list and a pandas Series are accepted as the
            # new value and must give the same result
            for s in [['ss'], pd.Series(['ss'])]:
                arrow_array2[0] = s
                lst2[0] = ['ss']
                np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2)
            arrow_array2[0] = None
            lst2[0] = None
            np.testing.assert_array_equal(arrow_array2.to_numpy(), lst2)
            with pytest.raises(ValueError):
                # must set list like object
                arrow_array2[0] = 'ss'

            # test to_numpy
            np.testing.assert_array_equal(arrow_array.to_numpy(), lst)
            np.testing.assert_array_equal(arrow_array.to_numpy(copy=True), lst)
            np.testing.assert_array_equal(
                has_na_arrow_array.to_numpy(na_value=1),
                pd.Series(has_na_lst).fillna(1).to_numpy())

            # test fillna
            if not pandas_only:
                # filling the hole with the original element restores lst
                arrow_array3 = has_na_arrow_array.fillna(lst[1])
                np.testing.assert_array_equal(arrow_array3.to_numpy(), lst)

            # test astype
            # string lists cannot be cast to int64, neither as a scalar
            # dtype nor as a list-of-int64 dtype
            with pytest.raises(TypeError):
                arrow_array.astype(np.int64)
            with pytest.raises(TypeError):
                arrow_array.astype(ArrowListDtype(np.int64))
            arrow_array4 = ArrowListArray([[1, 2], [3]])
            expected = np.array([['1', '2'], ['3']], dtype=object)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(str)), expected)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(arrow_array4.dtype)),
                arrow_array4)
            np.testing.assert_array_equal(
                arrow_array4.astype(ArrowListDtype(arrow_array4.dtype),
                                    copy=False), arrow_array4)

            # test nbytes
            assert arrow_array.nbytes < pd.Series(lst).memory_usage(deep=True)

            # test memory_usage
            if not pandas_only:
                assert arrow_array.memory_usage(
                    deep=True) == arrow_array.nbytes

            # test isna
            np.testing.assert_array_equal(has_na_arrow_array.isna(),
                                          pd.Series(has_na_lst).isna())

            # test take
            assert list(arrow_array.take([1, 2, -1])) == list(
                pd.Series(lst).take([1, 2, -1]))

            # test shift
            assert list(arrow_array.shift(
                2, fill_value=['aa'])) == [['aa']] * 2 + lst[:-2].tolist()

            # test all any
            # NOTE(review): _use_bool_any_all is defined outside this view;
            # presumably a pandas-version switch -- confirm
            if _use_bool_any_all:
                assert arrow_array.all() == pd.array(lst).all()
                assert arrow_array.any() == pd.array(lst).any()
            else:
                assert arrow_array.all() == lst.all()
                assert arrow_array.any() == lst.any()

            # test repr
            assert 'ArrowListArray' in repr(arrow_array)

            # test concat empty
            arrow_array5 = ArrowListArray(
                pa.chunked_array([], type=pa.list_(pa.string())))
            concatenated = ArrowListArray._concat_same_type(
                [arrow_array5, arrow_array5])
            if not pandas_only:
                # concatenating two empty arrays must leave exactly one chunk
                assert len(concatenated._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(arrow_array5),
                                           pd.Series(concatenated))
示例#5
0
def test_arrow_string_array_functions():
    """Compare ``ArrowStringArray`` with ``pd.arrays.StringArray`` as oracle.

    Each operation (indexing, assignment, conversion, reductions,
    arithmetic, comparison, concatenation, ...) is applied to both array
    types and the results are required to match.  The scenario runs once
    for each value of the ``dataframe.arrow_array.pandas_only`` option.
    """
    lst = np.array(['abc', 'de', 'eee', '中文'], dtype=object)
    # leverage string array to get the right answer
    string_array = pd.arrays.StringArray(lst)
    # NOTE: the arrays holding a missing value are built once, before the
    # loop, i.e. outside of any option_context
    has_na_arrow_array = ArrowStringArray(['abc', None, 'eee', '中文'])
    has_na_string_array = pd.arrays.StringArray(
        np.array(['abc', pd.NA, 'eee', '中文'], dtype=object))

    for pandas_only in [False, True]:
        with option_context({'dataframe.arrow_array.pandas_only':
                             pandas_only}):
            arrow_array = ArrowStringArray(lst)

            # getitem, scalar
            assert arrow_array[1] == string_array[1]
            assert arrow_array[-1] == string_array[-1]
            # getitem, slice
            assert list(arrow_array[:2]) == list(string_array[:2])
            assert list(arrow_array[1:-1]) == list(string_array[1:-1])
            assert list(arrow_array[::2]) == list(string_array[::2])
            # getitem, boolean index
            cond = np.array([len(c) > 2 for c in lst])
            assert list(arrow_array[cond]) == list(string_array[cond])
            # getitem, fancy index
            selection = [3, 1, 2]
            assert list(arrow_array[selection]) == list(
                string_array[selection])
            selection = [3, -1, 2, -4]
            assert list(arrow_array[selection]) == list(
                string_array[selection])
            selection = np.array([3, -1, 2, -4])
            assert list(arrow_array[selection]) == list(
                string_array[selection])

            # setitem
            # the same assignments are applied to both copies in lockstep
            arrow_array2 = arrow_array.copy()
            string_array2 = string_array.copy()
            arrow_array2[0] = 'ss'
            string_array2[0] = 'ss'
            assert list(arrow_array2) == list(string_array2)
            arrow_array2[1:3] = ['ss1', 'ss2']
            string_array2[1:3] = ['ss1', 'ss2']
            assert list(arrow_array2) == list(string_array2)
            arrow_array2[1:3] = arrow_array2[2:4]
            string_array2[1:3] = string_array2[2:4]
            assert list(arrow_array2) == list(string_array2)
            arrow_array2[2:] = pd.Series(['ss3', 'ss4'])
            string_array2[2:] = pd.Series(['ss3', 'ss4'])
            assert list(arrow_array2) == list(string_array2)
            # a list cannot be assigned to a scalar position
            with pytest.raises(ValueError):
                arrow_array2[0] = ['a', 'b']
            arrow_array2[-1] = None
            string_array2[-1] = None
            # the trailing missing value is checked separately via pd.isna
            assert list(arrow_array2)[:-1] == list(string_array2)[:-1]
            assert pd.isna(list(arrow_array2)[-1]) is True
            # non-string values are rejected
            with pytest.raises(ValueError):
                arrow_array2[0] = 2
            with pytest.raises(ValueError):
                arrow_array2[:2] = [1, 2]

            # test to_numpy
            np.testing.assert_array_equal(arrow_array.to_numpy(),
                                          string_array.to_numpy())
            np.testing.assert_array_equal(arrow_array.to_numpy(copy=True),
                                          string_array.to_numpy(copy=True))
            np.testing.assert_array_equal(
                has_na_arrow_array.to_numpy(copy=True, na_value='ss'),
                has_na_string_array.to_numpy(copy=True, na_value='ss'))

            # test fillna
            arrow_array3 = has_na_arrow_array.fillna('filled')
            string_array3 = has_na_string_array.fillna('filled')
            assert list(arrow_array3) == list(string_array3)

            # test astype
            arrow_array4 = ArrowStringArray(['1', '10', '100'])
            # leverage string array to get the right answer
            string_array4 = pd.arrays.StringArray(
                np.array(['1', '10', '100'], dtype=object))
            np.testing.assert_array_equal(arrow_array4.astype(np.int64),
                                          string_array4.astype(np.int64))
            np.testing.assert_almost_equal(arrow_array4.astype(float),
                                           string_array4.astype(float))
            assert list(arrow_array4.astype(
                ArrowStringDtype(), copy=False)) == list(
                    string_array4.astype(pd.StringDtype(), copy=False))
            assert list(arrow_array4.astype(
                ArrowStringDtype(), copy=True)) == list(
                    string_array4.astype(pd.StringDtype(), copy=True))

            # test factorize
            codes, unique = arrow_array.factorize()
            codes2, unique2 = string_array.factorize()
            assert list(codes) == list(codes2)
            assert list(unique) == list(unique2)

            # test nbytes
            assert arrow_array.nbytes < pd.Series(
                string_array.astype(object)).memory_usage(deep=True,
                                                          index=False)

            # test memory_usage
            if pandas_only:
                assert arrow_array.memory_usage(
                    deep=False) == pd.Series(string_array).memory_usage(
                        index=False)
            else:
                assert arrow_array.memory_usage(
                    deep=True) == arrow_array.nbytes

            # test isna
            np.testing.assert_array_equal(has_na_arrow_array.isna(),
                                          has_na_string_array.isna())
            # repeat the check on a copy with the private
            # _force_use_pandas flag switched on
            has_na_arrow_array2 = has_na_arrow_array.copy()
            has_na_arrow_array2._force_use_pandas = True
            np.testing.assert_array_equal(has_na_arrow_array2.isna(),
                                          has_na_string_array.isna())

            # test take
            assert list(arrow_array.take([1, 2, -1])) == list(
                string_array.take([1, 2, -1]))
            assert list(arrow_array.take([1, 2, -1], allow_fill=True).fillna('aa')) \
                   == list(string_array.take([1, 2, -1], allow_fill=True).fillna('aa'))
            assert list(arrow_array.take([1, 2, -1], allow_fill=True, fill_value='aa')) \
                   == list(string_array.take([1, 2, -1], allow_fill=True, fill_value='aa'))

            # test shift
            assert list(arrow_array.shift(2, fill_value='aa')) == list(
                string_array.shift(2, fill_value='aa'))

            # test value_counts
            assert list(arrow_array.value_counts()) == list(
                string_array.value_counts())
            assert list(has_na_arrow_array.value_counts(dropna=True)) == list(
                has_na_string_array.value_counts(dropna=True))

            # test all any
            assert arrow_array.all() == string_array.all()
            assert arrow_array.any() == string_array.any()

            # test arithmetic
            assert list(arrow_array + 's') == list(string_array + 's')
            assert list(
                (arrow_array + has_na_arrow_array).fillna('ss')) == list(
                    (string_array + has_na_string_array).fillna('ss'))

            # test comparison
            np.testing.assert_array_equal(arrow_array < 's',
                                          string_array < 's')
            pd.testing.assert_series_equal(
                pd.Series(arrow_array < has_na_arrow_array),
                pd.Series(string_array < has_na_string_array))

            # test repr
            assert 'ArrowStringArray' in repr(arrow_array)

            # test concat empty
            arrow_array5 = ArrowStringArray(
                pa.chunked_array([], type=pa.string()))
            concatenated = ArrowStringArray._concat_same_type(
                [arrow_array5, arrow_array5])
            if not pandas_only:
                # concatenating two empty arrays must leave exactly one chunk
                assert len(concatenated._arrow_array.chunks) == 1
            pd.testing.assert_series_equal(pd.Series(arrow_array5),
                                           pd.Series(concatenated))
示例#6
0
def test_euclidean_distances_execution(setup):
    """Check ``euclidean_distances`` against the ``sk_euclidean_distances`` reference.

    Dense and sparse inputs are both run through four variants: plain
    pairwise distances, caller-supplied norm arguments, float32 inputs with
    ``squared=True``, and the single-argument form.  Afterwards the
    computation is repeated on small tensors under two different
    ``chunk_store_limit`` settings.

    :param setup: fixture value requested from the test framework; it is
        not used directly in the body.
    """
    dense_x = np.random.rand(30, 10)
    dense_y = np.random.rand(40, 10)
    sparse_x = SparseNDArray(sps.random(30, 10, density=0.5, format='csr'))
    sparse_y = SparseNDArray(sps.random(40, 10, density=0.5, format='csr'))
    input_pairs = [(dense_x, dense_y), (sparse_x, sparse_y)]

    for raw_x, raw_y in input_pairs:
        x = mt.tensor(raw_x, chunk_size=9)
        y = mt.tensor(raw_y, chunk_size=7)

        # plain pairwise distances
        actual = euclidean_distances(x, y).execute().fetch()
        reference = sk_euclidean_distances(raw_x, Y=raw_y)
        np.testing.assert_almost_equal(actual, reference)

        # caller-supplied norm arguments (row sums shaped as column / row)
        x_norm = x.sum(axis=1)[..., np.newaxis]
        y_norm = y.sum(axis=1)[np.newaxis, ...]
        with_norms = euclidean_distances(x,
                                         y,
                                         X_norm_squared=x_norm,
                                         Y_norm_squared=y_norm)
        raw_x_norm = raw_x.sum(axis=1)[..., np.newaxis]
        raw_y_norm = raw_y.sum(axis=1)[np.newaxis, ...]

        actual = with_norms.execute().fetch()
        reference = sk_euclidean_distances(raw_x,
                                           raw_y,
                                           X_norm_squared=raw_x_norm,
                                           Y_norm_squared=raw_y_norm)
        np.testing.assert_almost_equal(actual, reference)

        # float32 inputs with squared distances, compared to 6 decimals
        x_f32 = (x**2).astype(np.float32)
        y_f32 = (y**2).astype(np.float32)
        squared_dist = euclidean_distances(x_f32, y_f32, squared=True)

        raw_x_f32 = (raw_x**2).astype(np.float32)
        raw_y_f32 = (raw_y**2).astype(np.float32)

        actual = squared_dist.execute().fetch()
        reference = sk_euclidean_distances(raw_x_f32, raw_y_f32,
                                           squared=True)
        np.testing.assert_almost_equal(actual, reference, decimal=6)

        # test x is y
        actual = euclidean_distances(x).execute().fetch()
        reference = sk_euclidean_distances(raw_x)
        np.testing.assert_almost_equal(actual, reference)

    # test size adjust
    small = np.random.rand(12, 4)
    large = np.random.rand(18, 4)

    small_t = mt.tensor(small, chunk_size=4)
    large_t = mt.tensor(large, chunk_size=6)

    # (chunk_store_limit, [(lhs tensor, rhs tensor, lhs raw, rhs raw), ...])
    scenarios = [
        (80, [(small_t, large_t, small, large),
              (large_t, small_t, large, small)]),
        (20, [(small_t, large_t, small, large)]),
    ]
    for store_limit, runs in scenarios:
        with option_context({'chunk_store_limit': store_limit}):
            for lhs, rhs, raw_lhs, raw_rhs in runs:
                actual = euclidean_distances(lhs, rhs).execute().fetch()
                reference = sk_euclidean_distances(raw_lhs, raw_rhs)
                np.testing.assert_almost_equal(actual, reference)
示例#7
0
def setup():
    """Yield once while the ``show_progress`` option is set to ``False``.

    The option override is active for as long as the generator stays
    suspended at its ``yield``.
    NOTE(review): presumably registered as a pytest fixture by a decorator
    outside this view -- confirm.
    """
    quiet_options = {'show_progress': False}
    with option_context(quiet_options):
        yield
示例#8
0
    def testGroupByPruneReadCSV(self):
        """Check groupby/agg on top of ``read_csv``, including column pruning.

        A small CSV file is written to a temporary directory and read back
        through ``md.read_csv``.  Results of ``groupby(...).agg(...)`` are
        compared with pandas, and for several pipelines the test asserts
        that the optimized ``read_csv`` operand carries a ``usecols`` list
        restricted to the group key and the aggregated column.  With
        ``optimize_tileable_graph`` disabled, ``usecols`` must stay ``None``.
        """
        with tempfile.TemporaryDirectory() as workdir:
            csv_path = os.path.join(workdir, 'test.csv')

            raw = pd.DataFrame({
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce'),
                'd': list('abaaaddce')
            })
            raw.to_csv(csv_path, index=False)

            # first pass goes through the test executor
            grouped = md.read_csv(csv_path).groupby('c').agg({'a': 'sum'})
            executed = self.executor.execute_dataframe(grouped)[0]
            wanted = raw.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(executed, wanted)

            # aggregate 'a' by 'c': only those two columns should be read
            grouped = md.read_csv(csv_path).groupby('c').agg({'a': 'sum'})
            wanted = raw.groupby('c').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(grouped.to_pandas(), wanted)
            pd.testing.assert_frame_equal(grouped.fetch(), wanted)

            pruned = tileable_optimized[grouped.data]
            self.assertEqual(pruned.inputs[0].op.usecols, ['a', 'c'])

            # aggregate 'b' by 'c'
            grouped = md.read_csv(csv_path).groupby('c').agg({'b': 'sum'})
            wanted = raw.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(grouped.to_pandas(), wanted)
            pd.testing.assert_frame_equal(grouped.fetch(), wanted)

            pruned = tileable_optimized[grouped.data]
            self.assertEqual(pruned.inputs[0].op.usecols, ['b', 'c'])

            # an extra arithmetic step after the aggregation
            grouped = md.read_csv(csv_path).groupby('c').agg({'b': 'sum'}) + 1
            wanted = raw.groupby('c').agg({'b': 'sum'}) + 1
            pd.testing.assert_frame_equal(grouped.to_pandas(), wanted)
            pd.testing.assert_frame_equal(grouped.fetch(), wanted)

            # caller already passes usecols; it is expected to be narrowed
            grouped = md.read_csv(
                csv_path,
                usecols=['a', 'b', 'c']).groupby('c').agg({'b': 'sum'})
            wanted = raw.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(grouped.to_pandas(), wanted)
            pd.testing.assert_frame_equal(grouped.fetch(), wanted)
            pruned = tileable_optimized[grouped.data]
            self.assertEqual(pruned.inputs[0].op.usecols, ['b', 'c'])

            # one read_csv source shared by two different aggregations
            source = md.read_csv(csv_path)
            by_c = source.groupby('c').agg({'b': 'sum'})
            by_b = source.groupby('b').agg({'a': 'sum'})

            fetched = ExecutableTuple((by_c, by_b)).execute().fetch()
            wanted_by_c = raw.groupby('c').agg({'b': 'sum'})
            wanted_by_b = raw.groupby('b').agg({'a': 'sum'})
            pd.testing.assert_frame_equal(fetched[0], wanted_by_c)
            pd.testing.assert_frame_equal(fetched[1], wanted_by_b)

            # the source itself is fetched together with an aggregation,
            # so the full frame must still come back intact
            source = md.read_csv(csv_path)
            by_c = source.groupby('c').agg({'b': 'sum'})

            fetched = ExecutableTuple((source, by_c)).execute().fetch()
            wanted_by_c = raw.groupby('c').agg({'b': 'sum'})
            pd.testing.assert_frame_equal(fetched[0], raw)
            pd.testing.assert_frame_equal(fetched[1], wanted_by_c)

            # with graph optimization switched off no pruning may happen
            with option_context({'optimize_tileable_graph': False}):
                grouped = md.read_csv(csv_path).groupby('c').agg({'b': 'sum'})
                wanted = raw.groupby('c').agg({'b': 'sum'})
                pd.testing.assert_frame_equal(grouped.to_pandas(), wanted)
                pd.testing.assert_frame_equal(grouped.fetch(), wanted)

                graph = grouped.build_graph()
                first_node = list(graph.topological_iter())[0]
                self.assertIsNone(first_node.op.usecols)