Пример #1
0
class TestIndexReduction(TestBase):
    """Checks ``min``/``max``/``all``/``any`` on ``md.Index`` against pandas."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testIndexReduction(self):
        """Every reduction must equal the pandas result, with and without
        an explicit ``chunk_size``."""
        rs = np.random.RandomState(0)
        # The two draws stay in this order so the seeded values are unchanged.
        data = pd.Index(rs.randint(0, 5, (100, )))
        data2 = pd.Index(rs.randint(1, 6, (100, )))

        # (pandas index, keyword arguments handed to md.Index), in run order.
        cases = [
            (data, {}),
            (data, {'chunk_size': 10}),
            (data2, {}),
            (data2, {'chunk_size': 10}),
        ]

        for method in ['min', 'max', 'all', 'any']:
            for raw, index_kwargs in cases:
                idx = md.Index(raw, **index_kwargs)
                reduced = getattr(idx, method)()
                result = self.executor.execute_dataframe(reduced,
                                                         concat=True)[0]
                self.assertEqual(result, getattr(raw, method)())
Пример #2
0
class TestCustomAggregate(TestBase):
    """Runs the ``MockReduction1``/``MockReduction2`` aggregations through
    the test executor and compares them with pandas ``agg``."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testDataFrameAggregate(self):
        """DataFrame ``agg`` with the mock reductions yields the pandas series."""
        data = pd.DataFrame(np.random.rand(30, 20))

        def check(frame, reduction):
            # A fresh reduction instance is built for each side of the check.
            result = frame.agg(reduction())
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(result, concat=True)[0],
                data.agg(reduction()))

        whole = md.DataFrame(data)
        check(whole, MockReduction1)
        check(whole, MockReduction2)

        chunked = md.DataFrame(data, chunk_size=5)
        # The chunked frame is checked twice with MockReduction2.
        check(chunked, MockReduction2)
        check(chunked, MockReduction2)

    def testSeriesAggregate(self):
        """Series ``agg`` with the mock reductions yields the pandas scalar."""
        data = pd.Series(np.random.rand(20))

        def run(series, reduction):
            return self.executor.execute_dataframe(
                series.agg(reduction()), concat=True)[0]

        whole = md.Series(data)
        # Exact equality is asserted when no chunk_size is given.
        self.assertEqual(run(whole, MockReduction1),
                         data.agg(MockReduction1()))
        self.assertEqual(run(whole, MockReduction2),
                         data.agg(MockReduction2()))

        chunked = md.Series(data, chunk_size=5)
        # With chunk_size=5 the comparison is approximate, and is done twice.
        for _ in range(2):
            self.assertAlmostEqual(run(chunked, MockReduction2),
                                   data.agg(MockReduction2()))
Пример #3
0
class Test(TestBase):
    """Writes a chunked DataFrame to CSV and reads it back with pandas."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToCSVExecution(self):
        """``to_csv`` output must round-trip, for a single path and for a
        ``*`` pattern path."""
        row_index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            },
            index=row_index)
        df = DataFrame(raw, chunk_size=33)

        def load(csv_path):
            # Read back with the source dtypes so the frames compare equal.
            return pd.read_csv(csv_path, dtype=raw.dtypes.to_dict())

        with tempfile.TemporaryDirectory() as base_path:
            # single output file
            single_path = os.path.join(base_path, 'out.csv')
            self.executor.execute_dataframe(df.to_csv(single_path))

            restored = load(single_path)
            restored.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(restored, raw)

            # pattern path: the test expects out-0.csv ... out-3.csv
            pattern_path = os.path.join(base_path, 'out-*.csv')
            self.executor.execute_dataframe(df.to_csv(pattern_path))

            parts = []
            for i in range(4):
                part_path = os.path.join(base_path, 'out-{}.csv'.format(i))
                parts.append(load(part_path))

            combined = pd.concat(parts, axis=0)
            combined.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(combined, raw)
            # the second file must hold exactly rows 33..65 of the source
            pd.testing.assert_frame_equal(parts[1].set_index('index'),
                                          raw.iloc[33:66])
Пример #4
0
class TestUnary(TestBase):
    """Execution test for the unary ``abs`` operator on a chunked frame."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testAbs(self):
        """``abs(df)`` must equal pandas ``DataFrame.abs``."""
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        tiled = from_pandas(raw, chunk_size=5)

        computed = self.executor.execute_dataframe(abs(tiled), concat=True)[0]
        pd.testing.assert_frame_equal(raw.abs(), computed)
Пример #5
0
class TestGPUReduction(TestBase):
    """Runs reductions on data moved with ``to_gpu`` and compares the
    results with the pandas equivalents."""

    def setUp(self):
        self.executor = ExecutorForTest()

    def testGPUExecution(self):
        """``sum``, ``kurt``, ``agg`` and ``unique`` on ``to_gpu`` data must
        match pandas."""
        df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
        df = to_gpu(md.DataFrame(df_raw, chunk_size=6))

        r = df.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

        r = df.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

        r = df.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(res.to_pandas(),
                                      df_raw.agg(['sum', 'var']))

        s_raw = pd.Series(np.random.rand(30))
        s = to_gpu(md.Series(s_raw, chunk_size=6))

        r = s.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.sum())

        r = s.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.kurt())

        r = s.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(),
                                       s_raw.agg(['sum', 'var']))

        # Product of two small integer draws, so values repeat and
        # ``unique`` has duplicates to remove.
        s_raw = pd.Series(
            np.random.randint(0, 3, size=(30, )) *
            np.random.randint(0, 5, size=(30, )))
        s = to_gpu(md.Series(s_raw, chunk_size=6))

        r = s.unique()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        # ``ndarray.sort()`` sorts in place and returns None, so comparing
        # its return values would always pass; ``np.sort`` returns the
        # sorted copies that actually need comparing.
        np.testing.assert_array_equal(
            np.sort(cp.asnumpy(res)),
            np.sort(s_raw.unique()))
Пример #6
0
class Test(TestBase):
    def setUp(self):
        """Run the base-class setup, then create the executor used by the tests."""
        super().setUp()
        self.executor = ExecutorForTest()

    @require_cudf
    def testToGPUExecution(self):
        """``to_gpu`` results must be cudf objects holding the source data."""
        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))

        # DataFrame, via the to_gpu function
        gpu_df = to_gpu(from_pandas_df(pdf, chunk_size=(13, 21)))
        fetched = self.executor.execute_dataframe(gpu_df, concat=True)[0]
        self.assertIsInstance(fetched, cudf.DataFrame)
        pd.testing.assert_frame_equal(fetched.to_pandas(), pdf)

        # Series (first column), via the to_gpu method
        first_col = pdf.iloc[:, 0]
        gpu_series = from_pandas_series(first_col).to_gpu()
        fetched = self.executor.execute_dataframe(gpu_series, concat=True)[0]
        self.assertIsInstance(fetched, cudf.Series)
        pd.testing.assert_series_equal(fetched.to_pandas(), first_col)

    @require_cudf
    def testToCPUExecution(self):
        """A ``to_gpu`` followed by ``to_cpu`` must return the pandas data."""
        pdf = pd.DataFrame(np.random.rand(20, 30), index=np.arange(20, 0, -1))

        # DataFrame round trip
        round_tripped = to_cpu(to_gpu(from_pandas_df(pdf, chunk_size=(13, 21))))
        fetched = self.executor.execute_dataframe(round_tripped, concat=True)[0]
        self.assertIsInstance(fetched, pd.DataFrame)
        pd.testing.assert_frame_equal(fetched, pdf)

        # Series round trip (first column)
        first_col = pdf.iloc[:, 0]
        # NOTE(review): a 2-tuple chunk_size is passed for a 1-D series,
        # same as for the frame above - confirm this is intended.
        round_tripped = to_cpu(
            to_gpu(from_pandas_series(first_col, chunk_size=(13, 21))))
        fetched = self.executor.execute_dataframe(round_tripped, concat=True)[0]
        self.assertIsInstance(fetched, pd.Series)
        pd.testing.assert_series_equal(fetched, first_col)

    def testRechunkExecution(self):
        """``rechunk`` must leave the data of frames, series and indexes unchanged."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # DataFrame: chunk_size=3 rechunked to (3, 4)
        raw_df = pd.DataFrame(np.random.rand(8, 10))
        tiled = from_pandas_df(pd.DataFrame(raw_df), chunk_size=3)
        pd.testing.assert_frame_equal(raw_df, run(tiled.rechunk((3, 4))))

        # DataFrame with random integer row labels and random bytes column labels
        raw_df = pd.DataFrame(
            np.random.rand(10, 10),
            index=np.random.randint(-100, 100, size=(10,)),
            columns=[np.random.bytes(10) for _ in range(10)])
        tiled = from_pandas_df(raw_df)
        pd.testing.assert_frame_equal(raw_df, run(tiled.rechunk(5)))

        # Series rechunked to 3 and then to 1
        raw_series = pd.Series(np.random.rand(10,))
        series = from_pandas_series(raw_series)
        for size in (3, 1):
            pd.testing.assert_series_equal(raw_series, run(series.rechunk(size)))

        # Index rechunked to 3 and then to 1
        raw_index = pd.Index(np.random.rand(10,))
        index = from_pandas_index(raw_index)
        for size in (3, 1):
            pd.testing.assert_index_equal(raw_index, run(index.rechunk(size)))

    def testResetIndexExecution(self):
        """``reset_index`` on frames and series must match pandas, including
        inputs built from the sum of two differently indexed operands."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def check_frame(raw, chunk_kwargs, **reset_kwargs):
            # Same keyword arguments go to df_reset_index and to pandas.
            tiled = df_reset_index(from_pandas_df(raw, **chunk_kwargs),
                                   **reset_kwargs)
            result = run(tiled)
            pd.testing.assert_frame_equal(result,
                                          raw.reset_index(**reset_kwargs))

        rows = [('bird',    389.0),
                ('bird',     24.0),
                ('mammal',   80.5),
                ('mammal', np.nan)]

        # flat string index
        data = pd.DataFrame(rows,
                            index=['falcon', 'parrot', 'lion', 'monkey'],
                            columns=('class', 'max_speed'))
        check_frame(data, {})
        check_frame(data, {'chunk_size': 2})
        check_frame(data, {'chunk_size': 1}, drop=True)

        # two-level row index, resetting only the 'class' level
        multi_index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                                 ('bird', 'parrot'),
                                                 ('mammal', 'lion'),
                                                 ('mammal', 'monkey')],
                                                names=['class', 'name'])
        data = pd.DataFrame(rows,
                            index=multi_index,
                            columns=('type', 'max_speed'))
        check_frame(data, {'chunk_size': 1}, level='class')

        # two-level columns as well
        data.columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                                  ('species', 'type')])
        check_frame(data, {'chunk_size': 2},
                    level='class', col_level=1, col_fill='species')

        # Series
        s = pd.Series([1, 2, 3, 4], name='foo',
                      index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))

        renamed = series_reset_index(from_pandas_series(s), name='bar')
        pd.testing.assert_frame_equal(run(renamed), s.reset_index(name='bar'))

        dropped = series_reset_index(from_pandas_series(s, chunk_size=2),
                                     drop=True)
        pd.testing.assert_series_equal(run(dropped), s.reset_index(drop=True))

        # Unknown shape: the operands carry different labels; together the
        # two label lists cover 0..11, hence the expected RangeIndex(12).
        sess = new_session()
        labels1 = [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]
        labels2 = [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]

        data1 = pd.DataFrame(np.random.rand(10, 3), index=labels1)
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 3), index=labels2)
        df2 = from_pandas_df(data2, chunk_size=6)
        result = sess.run((df1 + df2).reset_index())
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Row order is inconsistent with pandas when the input shape is
        # unknown, so sort by the first column before comparing values.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

        data1 = pd.Series(np.random.rand(10,), index=labels1)
        series1 = from_pandas_series(data1, chunk_size=3)
        data2 = pd.Series(np.random.rand(10,), index=labels2)
        series2 = from_pandas_series(data2, chunk_size=3)
        result = sess.run((series1 + series2).reset_index())
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Same ordering caveat as for the frame case above.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

    def testSeriesMapExecution(self):
        """``Series.map`` with dict, callable and series arguments must
        match pandas."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.Series(np.arange(10))
        s = from_pandas_series(raw, chunk_size=7)

        with self.assertRaises(ValueError):
            # dtype cannot be inferred: the inferred one is int, but the
            # real result is float because unmapped values become nan
            s.map({5: 10})

        # explicit dtype with dict arguments
        pd.testing.assert_series_equal(
            run(s.map({5: 10}, dtype=float)),
            raw.map({5: 10}))

        pd.testing.assert_series_equal(
            run(s.map({i: 10 + i for i in range(7)}, dtype=float)),
            raw.map({i: 10 + i for i in range(7)}))

        pd.testing.assert_series_equal(
            run(s.map({5: 10}, dtype=float, na_action='ignore')),
            raw.map({5: 10}, na_action='ignore'))

        # dtype can be inferred from the float value
        pd.testing.assert_series_equal(
            run(s.map({5: 10.})),
            raw.map({5: 10.}))

        # callable with an explicit dtype
        pd.testing.assert_series_equal(
            run(s.map(lambda x: x + 1, dtype=int)),
            raw.map(lambda x: x + 1))

        def f(x: int) -> float:
            return x + 1.

        # dtype can be inferred for an annotated function
        pd.testing.assert_series_equal(
            run(s.map(f)),
            raw.map(lambda x: x + 1.))

        # the argument is itself a tiled series
        raw2 = pd.Series([10], index=[5])
        s2 = from_pandas_series(raw2)
        pd.testing.assert_series_equal(
            run(s.map(s2, dtype=float)),
            raw.map(raw2))

        # tiled series argument whose dtype can be inferred
        raw2 = pd.Series([10.], index=[5])
        s2 = from_pandas_series(raw2)
        pd.testing.assert_series_equal(
            run(s.map(s2)),
            raw.map(raw2))

        # string values
        raw = pd.Series(['a', 'b', 'c', 'd'])
        s = from_pandas_series(raw, chunk_size=2)
        pd.testing.assert_series_equal(
            run(s.map({'c': 'e'})),
            raw.map({'c': 'e'}))

    def testDescribeExecution(self):
        """``describe`` on series and frames must match pandas for both
        chunk sizes, and reject an out-of-range percentile."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        s_raw = pd.Series(np.random.rand(10))

        # chunk_size=10 is the one-chunk case, chunk_size=3 the multi-chunk one
        for chunk_size in (10, 3):
            series = from_pandas_series(s_raw, chunk_size=chunk_size)

            pd.testing.assert_series_equal(
                run(series.describe()),
                s_raw.describe())

            pd.testing.assert_series_equal(
                run(series.describe(percentiles=[])),
                s_raw.describe(percentiles=[]))

        df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd'))
        df_raw['e'] = np.random.randint(100, size=10)

        # one-chunk frame
        df = from_pandas_df(df_raw, chunk_size=10)
        pd.testing.assert_frame_equal(run(df.describe()), df_raw.describe())

        # NOTE(review): this check runs on ``series`` (the chunk_size=3 one
        # left over from the loop above), not on ``df`` - possibly a
        # copy-paste slip; confirm whether ``df.describe`` was intended.
        pd.testing.assert_series_equal(
            run(series.describe(percentiles=[], include=np.float64)),
            s_raw.describe(percentiles=[], include=np.float64))

        # multi-chunk frame
        df = from_pandas_df(df_raw, chunk_size=3)
        pd.testing.assert_frame_equal(run(df.describe()), df_raw.describe())

        pd.testing.assert_frame_equal(
            run(df.describe(percentiles=[], include=np.float64)),
            df_raw.describe(percentiles=[], include=np.float64))

        with self.assertRaises(ValueError):
            df.describe(percentiles=[1.1])

    def testDataFrameFillNAExecution(self):
        """DataFrame ``fillna``/``ffill``/``bfill`` must match pandas for
        scalar, method, frame and series fills."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # all-NaN 20x10 frame with up to 20 randomly placed values
        df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            df_raw.iloc[random.randint(0, 19), random.randint(0, 9)] = random.randint(0, 99)

        value_df_raw = pd.DataFrame(
            np.random.randint(0, 100, (10, 7)).astype(np.float32),
            columns=list('ABCDEFG'))

        # default chunking, scalar fill
        tiled = from_pandas_df(df_raw)
        pd.testing.assert_frame_equal(run(tiled.fillna(1)), df_raw.fillna(1))

        # default chunking on both the frame and the fill value
        tiled = from_pandas_df(df_raw)
        fill_frame = from_pandas_df(value_df_raw)
        pd.testing.assert_frame_equal(run(tiled.fillna(fill_frame)),
                                      df_raw.fillna(value_df_raw))

        # chunked, scalar fill
        tiled = from_pandas_df(df_raw, chunk_size=3)
        pd.testing.assert_frame_equal(run(tiled.fillna(1)), df_raw.fillna(1))

        # chunked, in-place fill: the frame itself is executed afterwards
        tiled = from_pandas_df(df_raw, chunk_size=3)
        tiled.fillna(1, inplace=True)
        pd.testing.assert_frame_equal(run(tiled), df_raw.fillna(1))

        # forward, then backward fill along axis=0 without limit
        for fill_method in ('pad', 'backfill'):
            tiled = from_pandas_df(df_raw, chunk_size=3)
            pd.testing.assert_frame_equal(
                run(tiled.fillna(method=fill_method)),
                df_raw.fillna(method=fill_method))

        # forward, then backward fill along axis=1 without limit
        for fill_name in ('ffill', 'bfill'):
            tiled = from_pandas_df(df_raw, chunk_size=3)
            pd.testing.assert_frame_equal(
                run(getattr(tiled, fill_name)(axis=1)),
                getattr(df_raw, fill_name)(axis=1))

        # chunked frame filled from a differently chunked frame
        tiled = from_pandas_df(df_raw, chunk_size=3)
        fill_frame = from_pandas_df(value_df_raw, chunk_size=4)
        pd.testing.assert_frame_equal(run(tiled.fillna(fill_frame)),
                                      df_raw.fillna(value_df_raw))

        # chunked frame filled from a series labelled by column name
        value_series_raw = pd.Series(
            np.random.randint(0, 100, (10,)).astype(np.float32),
            index=list('ABCDEFGHIJ'))
        tiled = from_pandas_df(df_raw, chunk_size=3)
        fill_series = from_pandas_series(value_series_raw, chunk_size=4)
        pd.testing.assert_frame_equal(run(tiled.fillna(fill_series)),
                                      df_raw.fillna(value_series_raw))

    def testSeriesFillNAExecution(self):
        """Series ``fillna`` must match pandas for scalar, method and
        series fills."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # all-NaN series of 20 entries with up to 3 randomly placed values
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            series_raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
        value_series_raw = pd.Series(np.random.randint(0, 100, (10,)).astype(np.float32))

        # default chunking, scalar fill
        tiled = from_pandas_series(series_raw)
        pd.testing.assert_series_equal(run(tiled.fillna(1)),
                                       series_raw.fillna(1))

        # default chunking on both the series and the fill value
        tiled = from_pandas_series(series_raw)
        fill_values = from_pandas_series(value_series_raw)
        pd.testing.assert_series_equal(run(tiled.fillna(fill_values)),
                                       series_raw.fillna(value_series_raw))

        # chunked, scalar fill
        tiled = from_pandas_series(series_raw, chunk_size=3)
        pd.testing.assert_series_equal(run(tiled.fillna(1)),
                                       series_raw.fillna(1))

        # chunked, in-place fill: the series itself is executed afterwards
        tiled = from_pandas_series(series_raw, chunk_size=3)
        tiled.fillna(1, inplace=True)
        pd.testing.assert_series_equal(run(tiled), series_raw.fillna(1))

        # forward, then backward fill without limit
        for fill_method in ('pad', 'backfill'):
            tiled = from_pandas_series(series_raw, chunk_size=3)
            pd.testing.assert_series_equal(
                run(tiled.fillna(method=fill_method)),
                series_raw.fillna(method=fill_method))

        # chunked series filled from a differently chunked series
        tiled = from_pandas_series(series_raw, chunk_size=3)
        fill_values = from_pandas_series(value_series_raw, chunk_size=4)
        pd.testing.assert_series_equal(run(tiled.fillna(fill_values)),
                                       series_raw.fillna(value_series_raw))

    def testDataFrameApplyExecute(self):
        """``DataFrame.apply`` must match pandas for string, list and
        callable arguments along both axes and for each ``result_type``."""
        cols = [chr(ord('A') + i) for i in range(10)]
        df_raw = pd.DataFrame({c: [i ** 2 for i in range(20)] for c in cols})

        frame_equal = pd.testing.assert_frame_equal
        series_equal = pd.testing.assert_series_equal

        # chunk_store_limit is set to 20 for this test and restored in
        # ``finally`` even when an assertion fails.
        saved_limit = options.chunk_store_limit
        try:
            options.chunk_store_limit = 20

            df = from_pandas_df(df_raw, chunk_size=5)

            def check(compare, func, **apply_kwargs):
                # Same function and keyword arguments on both sides.
                tiled = df.apply(func, **apply_kwargs)
                result = self.executor.execute_dataframe(tiled, concat=True)[0]
                compare(result, df_raw.apply(func, **apply_kwargs))

            # default axis
            check(frame_equal, 'ffill')
            check(frame_equal, ['sum', 'max'])
            check(frame_equal, np.sqrt)
            check(frame_equal, lambda x: pd.Series([1, 2]))

            # reductions along a named axis give a series
            check(series_equal, np.sum, axis='index')
            check(series_equal, np.sum, axis='columns')

            # row-wise callables returning lists or series
            check(series_equal, lambda x: [1, 2], axis=1)
            check(frame_equal,
                  lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)

            # explicit result_type values
            check(frame_equal, lambda x: [1, 2], axis=1, result_type='expand')
            check(series_equal, lambda x: list(range(10)),
                  axis=1, result_type='reduce')
            check(frame_equal, lambda x: list(range(10)),
                  axis=1, result_type='broadcast')
        finally:
            options.chunk_store_limit = saved_limit

    def testSeriesApplyExecute(self):
        """``Series.apply`` must match pandas for string, list and callable
        arguments."""
        labels = [chr(code) for code in range(ord('A'), ord('A') + 20)]
        s_raw = pd.Series([i ** 2 for i in range(20)], index=labels)

        series = from_pandas_series(s_raw, chunk_size=5)

        # (function argument, extra keyword arguments), in execution order
        cases = [
            ('add', {'args': (1,)}),
            (['sum', 'max'], {}),
            (np.sqrt, {}),
            ('sqrt', {}),
            (lambda x: [x, x + 1], {'convert_dtype': False}),
        ]

        for func, apply_kwargs in cases:
            tiled = series.apply(func, **apply_kwargs)
            result = self.executor.execute_dataframe(tiled, concat=True)[0]
            expected = s_raw.apply(func, **apply_kwargs)
            pd.testing.assert_series_equal(result, expected)

    def testTransformExecute(self):
        """Compare ``transform`` on chunked DataFrames and Series with pandas.

        Covers plain transforms (a callable, a list of functions, a dict of
        functions, both axes) and the ``_call_agg=True`` mode, whose results
        are checked against pandas ``agg``.  ``options.chunk_store_limit`` is
        set to 20 for the duration of the test and restored afterwards.
        """
        def rename_fn(f, new_name):
            # Give an anonymous function an explicit ``__name__``.
            f.__name__ = new_name
            return f

        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        col_names = [chr(code) for code in range(ord('A'), ord('A') + 10)]
        df_raw = pd.DataFrame({name: [i ** 2 for i in range(20)]
                               for name in col_names})

        row_labels = [chr(code) for code in range(ord('A'), ord('A') + 20)]
        s_raw = pd.Series([i ** 2 for i in range(20)], index=row_labels)

        saved_limit = options.chunk_store_limit
        try:
            options.chunk_store_limit = 20

            # ---- DataFrame input ----
            mdf = from_pandas_df(df_raw, chunk_size=5)

            # plain transform: single callable, default axis
            pd.testing.assert_frame_equal(
                run(mdf.transform(lambda x: list(range(len(x))))),
                df_raw.transform(lambda x: list(range(len(x)))))

            # plain transform: single callable, row-wise
            pd.testing.assert_frame_equal(
                run(mdf.transform(lambda x: list(range(len(x))), axis=1)),
                df_raw.transform(lambda x: list(range(len(x))), axis=1))

            # plain transform: list mixing named functions and a lambda
            pd.testing.assert_frame_equal(
                run(mdf.transform(['cumsum', 'cummax', lambda x: x + 1])),
                df_raw.transform(['cumsum', 'cummax', lambda x: x + 1]))

            # plain transform: per-column specification
            per_column = OrderedDict([
                ('A', 'cumsum'),
                ('D', ['cumsum', 'cummax']),
                ('F', lambda x: x + 1),
            ])
            pd.testing.assert_frame_equal(
                run(mdf.transform(per_column)),
                df_raw.transform(per_column))

            # _call_agg=True on data frames: compared with DataFrame.agg
            pd.testing.assert_frame_equal(
                run(mdf.transform(lambda x: x.iloc[:-1], _call_agg=True)),
                df_raw.agg(lambda x: x.iloc[:-1]))

            pd.testing.assert_frame_equal(
                run(mdf.transform(lambda x: x.iloc[:-1], axis=1, _call_agg=True)),
                df_raw.agg(lambda x: x.iloc[:-1], axis=1))

            agg_funcs = [
                rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                lambda x: x.iloc[:-1].reset_index(drop=True),
            ]
            pd.testing.assert_frame_equal(
                run(mdf.transform(agg_funcs, _call_agg=True)),
                df_raw.agg(agg_funcs))

            # a reducing callable yields a Series
            pd.testing.assert_series_equal(
                run(mdf.transform(lambda x: x.sum(), _call_agg=True)),
                df_raw.agg(lambda x: x.sum()))

            agg_per_column = OrderedDict([
                ('A', rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1')),
                ('D', [rename_fn(lambda x: x.iloc[1:].reset_index(drop=True), 'f1'),
                       lambda x: x.iloc[:-1].reset_index(drop=True)]),
                ('F', lambda x: x.iloc[:-1].reset_index(drop=True)),
            ])
            pd.testing.assert_frame_equal(
                run(mdf.transform(agg_per_column, _call_agg=True)),
                df_raw.agg(agg_per_column))

            # ---- Series input ----
            mseries = from_pandas_series(s_raw, chunk_size=5)

            pd.testing.assert_series_equal(
                run(mseries.transform(lambda x: x + 1)),
                s_raw.transform(lambda x: x + 1))

            # a list of functions on a Series produces a frame
            pd.testing.assert_frame_equal(
                run(mseries.transform(['cumsum', lambda x: x + 1])),
                s_raw.transform(['cumsum', lambda x: x + 1]))
        finally:
            options.chunk_store_limit = saved_limit

    def testStringMethodExecution(self):
        """Compare ``.str`` accessor operations on a chunked Series with pandas.

        Covers slicing, ``split``/``rsplit``, ``cat`` (whole series, list and
        series arguments), ``extractall`` and ``extract``.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.Series(['s1,s2', 'ef,', 'dd', np.nan])
        raw_tripled = pd.concat([raw, raw, raw])

        mseries = from_pandas_series(raw, chunk_size=2)
        mseries_tripled = from_pandas_series(raw_tripled, chunk_size=2)

        # item access through the accessor
        pd.testing.assert_series_equal(run(mseries.str[:3]), raw.str[:3])

        # split without expansion gives a Series
        pd.testing.assert_series_equal(
            run(mseries.str.split(',', n=2)),
            raw.str.split(',', n=2))

        # split with expansion gives a DataFrame
        pd.testing.assert_frame_equal(
            run(mseries.str.split(',', expand=True, n=1)),
            raw.str.split(',', expand=True, n=1))

        # rsplit with expansion
        pd.testing.assert_frame_equal(
            run(mseries.str.rsplit(',', expand=True, n=1)),
            raw.str.rsplit(',', expand=True, n=1))

        # cat over the whole series collapses to one scalar
        joined = run(mseries_tripled.str.cat(sep='/', na_rep='e'))
        self.assertEqual(joined, raw_tripled.str.cat(sep='/', na_rep='e'))

        # cat with a plain list
        pd.testing.assert_series_equal(
            run(mseries.str.cat(['a', 'b', np.nan, 'c'])),
            raw.str.cat(['a', 'b', np.nan, 'c']))

        # cat with another series
        pd.testing.assert_series_equal(
            run(mseries.str.cat(mseries.str.capitalize(), join='outer')),
            raw.str.cat(raw.str.capitalize(), join='outer'))

        # extractall with named groups
        pd.testing.assert_frame_equal(
            run(mseries.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")),
            raw.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)"))

        # extract without expansion gives a Series
        pd.testing.assert_series_equal(
            run(mseries.str.extract(r'[ab](\d)', expand=False)),
            raw.str.extract(r'[ab](\d)', expand=False))

        # extract with expansion gives a DataFrame
        pd.testing.assert_frame_equal(
            run(mseries.str.extract(r'[ab](\d)', expand=True)),
            raw.str.extract(r'[ab](\d)', expand=True))

    def testDatetimeMethodExecution(self):
        """Compare ``.dt`` accessor results on chunked Series with pandas.

        Uses one timestamp series (``year``, ``strftime``) and one timedelta
        series (``days``); each contains a missing value.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # timestamps
        stamps = pd.Series([pd.Timestamp('2020-1-1'),
                            pd.Timestamp('2020-2-1'),
                            np.nan])
        m_stamps = from_pandas_series(stamps, chunk_size=2)

        pd.testing.assert_series_equal(run(m_stamps.dt.year), stamps.dt.year)

        pd.testing.assert_series_equal(
            run(m_stamps.dt.strftime('%m-%d-%Y')),
            stamps.dt.strftime('%m-%d-%Y'))

        # timedeltas
        deltas = pd.Series([pd.Timedelta('1 days'),
                            pd.Timedelta('3 days'),
                            np.nan])
        m_deltas = from_pandas_series(deltas, chunk_size=2)

        pd.testing.assert_series_equal(run(m_deltas.dt.days), deltas.dt.days)

    def testSeriesIsin(self):
        """Compare ``Series.isin`` on chunked Series with pandas.

        The candidate values are supplied as a chunked Series (several chunk
        layouts), a raw pandas Series, a chunked tensor and a Python set.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # (chunk size of the tested series, chunk size of the candidates):
        # one chunk vs. many, many vs. one, many vs. many
        chunk_layouts = [(10, 2), (2, 4), (2, 2)]
        for left_chunk, right_chunk in chunk_layouts:
            a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
            b = pd.Series([2, 1, 9, 3])
            sa = from_pandas_series(a, chunk_size=left_chunk)
            sb = from_pandas_series(b, chunk_size=right_chunk)

            pd.testing.assert_series_equal(run(sa.isin(sb)), a.isin(b))

        # candidates given as a raw pandas Series
        a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        candidates = pd.Series([2, 1, 9, 3])
        sa = from_pandas_series(a, chunk_size=2)
        pd.testing.assert_series_equal(run(sa.isin(candidates)),
                                       a.isin(candidates))

        # candidates given as a chunked tensor
        a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        candidates = np.array([2, 1, 9, 3])
        sa = from_pandas_series(a, chunk_size=2)
        candidates_t = tensor(candidates, chunk_size=3)
        pd.testing.assert_series_equal(run(sa.isin(candidates_t)),
                                       a.isin(candidates))

        # candidates given as a Python set
        a = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
        candidates = {2, 1, 9, 3}
        sa = from_pandas_series(a, chunk_size=2)
        pd.testing.assert_series_equal(run(sa.isin(candidates)),
                                       a.isin(candidates))

    def testCheckNA(self):
        """Compare ``isna``/``notna`` on chunked DataFrames and Series with pandas.

        The fixtures start out all-NaN and get a few cells filled in at
        random positions.  A seeded ``RandomState`` is used so that the data,
        and therefore any failure, is reproducible from run to run.
        """
        rs = np.random.RandomState(0)

        df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
        for _ in range(20):
            # RandomState.randint excludes the upper bound: rows 0..19,
            # columns 0..9, values 0..99
            df_raw.iloc[rs.randint(0, 20), rs.randint(0, 10)] = rs.randint(0, 100)

        df = from_pandas_df(df_raw, chunk_size=4)

        pd.testing.assert_frame_equal(self.executor.execute_dataframe(df.isna(), concat=True)[0],
                                      df_raw.isna())
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(df.notna(), concat=True)[0],
                                      df_raw.notna())

        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            series_raw.iloc[rs.randint(0, 20)] = rs.randint(0, 100)

        series = from_pandas_series(series_raw, chunk_size=4)

        pd.testing.assert_series_equal(self.executor.execute_dataframe(series.isna(), concat=True)[0],
                                       series_raw.isna())
        pd.testing.assert_series_equal(self.executor.execute_dataframe(series.notna(), concat=True)[0],
                                       series_raw.notna())

    def testDropNA(self):
        """Compare ``dropna`` on chunked DataFrames and Series with pandas.

        The fixtures start out all-NaN; random cells are then filled in and
        between one and five randomly chosen rows are filled completely.
        A seeded ``RandomState`` is used so that the data, and therefore any
        failure, is reproducible from run to run.
        """
        rs = np.random.RandomState(0)

        # dataframe cases
        df_raw = pd.DataFrame(np.nan, index=range(0, 20), columns=list('ABCDEFGHIJ'))
        for _ in range(30):
            # RandomState.randint excludes the upper bound: rows 0..19,
            # columns 0..9, values 0..99
            df_raw.iloc[rs.randint(0, 20), rs.randint(0, 10)] = rs.randint(0, 100)
        # fill 1..5 randomly chosen rows in every column
        for _ in range(rs.randint(1, 6)):
            row = rs.randint(0, 20)
            for idx in range(0, 10):
                df_raw.iloc[row, idx] = rs.randint(0, 100)

        # only one chunk in columns, can run dropna directly
        r = from_pandas_df(df_raw, chunk_size=(4, 10)).dropna()
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                      df_raw.dropna())

        # multiple chunks in columns, count() will be called first
        r = from_pandas_df(df_raw, chunk_size=4).dropna()
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                      df_raw.dropna())

        r = from_pandas_df(df_raw, chunk_size=4).dropna(how='all')
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                      df_raw.dropna(how='all'))

        r = from_pandas_df(df_raw, chunk_size=4).dropna(subset=list('ABFI'))
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                      df_raw.dropna(subset=list('ABFI')))

        r = from_pandas_df(df_raw, chunk_size=4).dropna(how='all', subset=list('BDHJ'))
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                      df_raw.dropna(how='all', subset=list('BDHJ')))

        # in-place variant on the chunked frame
        r = from_pandas_df(df_raw, chunk_size=4)
        r.dropna(how='all', inplace=True)
        pd.testing.assert_frame_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                      df_raw.dropna(how='all'))

        # series cases
        series_raw = pd.Series(np.nan, index=range(20))
        for _ in range(10):
            series_raw.iloc[rs.randint(0, 20)] = rs.randint(0, 100)

        r = from_pandas_series(series_raw, chunk_size=4).dropna()
        pd.testing.assert_series_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                       series_raw.dropna())

        # in-place variant on the chunked series
        r = from_pandas_series(series_raw, chunk_size=4)
        r.dropna(inplace=True)
        pd.testing.assert_series_equal(self.executor.execute_dataframe(r, concat=True)[0],
                                       series_raw.dropna())

    def testCutExecution(self):
        """Compare ``cut`` on Series, tensors and raw inputs with ``pd.cut``.

        Exercises explicit bin edges, interval-index bins, integer bin
        counts, labels (list, tensor, ``False``), ``retbins``, ``right``,
        ``include_lowest`` and ``duplicates='drop'``.  The last case checks
        that integer bins with an infinite value raise ``ValueError``.
        """
        def run_df(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def run_tensor(tileable):
            return self.executor.execute_tensor(tileable, concat=True)[0]

        def run_tileable(tileable):
            return self.executor.execute_tileable(tileable, concat=True)[0]

        def assert_same_items(actual, wanted):
            # Length check followed by an item-by-item comparison.
            self.assertEqual(len(actual), len(wanted))
            for got, exp in zip(actual, wanted):
                np.testing.assert_equal(got, exp)

        raw = np.random.RandomState(0).random(15) * 1000
        s = pd.Series(raw, index=['i{}'.format(i) for i in range(15)])
        bins = [10, 100, 500]
        ii = pd.interval_range(10, 500, 3)
        labels = ['a', 'b']

        t = tensor(raw, chunk_size=4)
        series = from_pandas_series(s, chunk_size=4)
        iii = from_pandas_index(ii, chunk_size=2)

        # Series input, explicit edges
        pd.testing.assert_series_equal(run_df(cut(series, bins)),
                                       pd.cut(s, bins))

        # Series input, also returning the bins
        cut_out, bins_out = cut(series, bins, retbins=True)
        got_cut = run_df(cut_out)
        got_bins = run_tensor(bins_out)
        want_cut, want_bins = pd.cut(s, bins, retbins=True)
        pd.testing.assert_series_equal(got_cut, want_cut)
        np.testing.assert_array_equal(got_bins, want_bins)

        # tensor input; compared item by item
        assert_same_items(run_df(cut(t, bins)), pd.cut(raw, bins))

        # raw Series input with bins given as a chunked tensor
        pd.testing.assert_series_equal(
            run_df(cut(s, tensor(bins, chunk_size=2), right=False, include_lowest=True)),
            pd.cut(s, bins, right=False, include_lowest=True))

        # explicit labels; compared item by item
        assert_same_items(run_df(cut(t, bins, labels=labels)),
                          pd.cut(raw, bins, labels=labels))

        # labels=False, executed as a tensor and compared as arrays
        np.testing.assert_array_equal(
            run_tensor(cut(t, bins, labels=False)),
            pd.cut(raw, bins, labels=False))

        # labels supplied as a tensor
        labels_t = tensor(['a', 'b'], chunk_size=1)
        assert_same_items(
            run_df(cut(raw, bins, labels=labels_t, include_lowest=True)),
            pd.cut(raw, bins, labels=labels, include_lowest=True))

        # labels=False with interval-index bins, also returning the bins
        cut_out, bins_out = cut(raw, ii, labels=False, retbins=True)
        got_cut = run_tileable(cut_out)
        got_bins = run_tileable(bins_out)
        want_cut, want_bins = pd.cut(raw, ii, labels=False, retbins=True)
        for got, exp in zip(got_cut, want_cut):
            np.testing.assert_equal(got, exp)
        pd.testing.assert_index_equal(got_bins, want_bins)

        # bins supplied as a chunked index built from the interval index
        cut_out, bins_out = cut(series, iii, labels=tensor(labels, chunk_size=1),
                                retbins=True)
        got_cut = run_df(cut_out)
        got_bins = run_df(bins_out)
        want_cut, want_bins = pd.cut(s, ii, labels=labels, retbins=True)
        pd.testing.assert_series_equal(got_cut, want_cut)
        pd.testing.assert_index_equal(got_bins, want_bins)

        # duplicated edges dropped
        bins2 = [0, 2, 4, 6, 10, 10]
        cut_out, bins_out = cut(s, bins2, labels=False, retbins=True,
                                right=False, duplicates='drop')
        got_cut = run_df(cut_out)
        got_bins = run_tensor(bins_out)
        want_cut, want_bins = pd.cut(s, bins2, labels=False, retbins=True,
                                     right=False, duplicates='drop')
        pd.testing.assert_series_equal(got_cut, want_cut)
        np.testing.assert_array_equal(got_bins, want_bins)

        ctx, executor = self._create_test_context(self.executor)
        with ctx:
            # integer number of bins
            fetched = executor.execute_dataframes([cut(series, 3)])[0]
            pd.testing.assert_series_equal(fetched, pd.cut(s, 3))

            cut_out, bins_out = cut(series, 3, right=False, retbins=True)
            got_cut, got_bins = executor.execute_dataframes([cut_out, bins_out])
            want_cut, want_bins = pd.cut(s, 3, right=False, retbins=True)
            pd.testing.assert_series_equal(got_cut, want_cut)
            np.testing.assert_array_equal(got_bins, want_bins)

            # every value identical, so min equals max
            s2 = pd.Series([1.1] * 15)
            fetched = executor.execute_dataframes([cut(s2, 3)])[0]
            pd.testing.assert_series_equal(fetched, pd.cut(s2, 3))

            # an infinite value is present
            s3 = s2.copy()
            s3[-1] = np.inf
            with self.assertRaises(ValueError):
                executor.execute_dataframes([cut(s3, 3)])

    def testShiftExecution(self):
        """Compare ``shift``/``tshift`` on chunked DataFrames and Series with pandas.

        Iterates over several ``periods``, both axes and several
        ``fill_value`` settings, with and without ``freq``.  An assertion
        failure is re-raised with the parameter combination in the message.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        fill_values = (None, 0, 1.)
        axes = (0, 1)

        # ---- DataFrame ----
        raw = pd.DataFrame(np.random.RandomState(0).randint(1000, size=(10, 8)),
                           columns=['col' + str(i + 1) for i in range(8)])
        df = from_pandas_df(raw, chunk_size=5)

        for periods in (2, -2, 6, -6):
            for axis in axes:
                for fill_value in fill_values:
                    shift_kw = dict(periods=periods, axis=axis,
                                    fill_value=fill_value)
                    shifted = df.shift(**shift_kw)

                    try:
                        actual = run(shifted)
                        wanted = raw.shift(**shift_kw)
                        pd.testing.assert_frame_equal(actual, wanted)
                    except AssertionError as err:  # pragma: no cover
                        message = 'Failed when periods: {}, axis: {}, fill_value: {}'.format(
                            periods, axis, fill_value)
                        raise AssertionError(message) from err

        # same values, but with datetime row and column labels
        raw2 = raw.copy()
        raw2.index = pd.date_range('2020-1-1', periods=10)
        raw2.columns = pd.date_range('2020-3-1', periods=8)
        df2 = from_pandas_df(raw2, chunk_size=5)

        # shifting with an explicit frequency
        for periods in (2, -2):
            for axis in axes:
                for fill_value in fill_values:
                    shift_kw = dict(periods=periods, freq='D', axis=axis,
                                    fill_value=fill_value)
                    shifted = df2.shift(**shift_kw)

                    try:
                        actual = run(shifted)
                        wanted = raw2.shift(**shift_kw)
                        pd.testing.assert_frame_equal(actual, wanted)
                    except AssertionError as err:  # pragma: no cover
                        message = 'Failed when periods: {}, axis: {}, fill_value: {}'.format(
                            periods, axis, fill_value)
                        raise AssertionError(message) from err

        # tshift on the datetime-indexed frame
        pd.testing.assert_frame_equal(run(df2.tshift(periods=1)),
                                      raw2.tshift(periods=1))

        # tshift on the frame without datetime labels is rejected
        with self.assertRaises(ValueError):
            _ = df.tshift(periods=1)

        # ---- Series ----
        s = raw.iloc[:, 0]
        series = from_pandas_series(s, chunk_size=5)

        for periods in (0, 2, -2, 6, -6):
            for fill_value in fill_values:
                shift_kw = dict(periods=periods, fill_value=fill_value)
                shifted = series.shift(**shift_kw)

                try:
                    actual = run(shifted)
                    wanted = s.shift(**shift_kw)
                    pd.testing.assert_series_equal(actual, wanted)
                except AssertionError as err:  # pragma: no cover
                    message = 'Failed when periods: {}, fill_value: {}'.format(
                        periods, fill_value)
                    raise AssertionError(message) from err

        # Series with an explicit frequency
        s2 = raw2.iloc[:, 0]
        series2 = from_pandas_series(s2, chunk_size=5)

        for periods in (2, -2):
            for fill_value in fill_values:
                shift_kw = dict(periods=periods, freq='D', fill_value=fill_value)
                shifted = series2.shift(**shift_kw)

                try:
                    actual = run(shifted)
                    wanted = s2.shift(**shift_kw)
                    pd.testing.assert_series_equal(actual, wanted)
                except AssertionError as err:  # pragma: no cover
                    message = 'Failed when periods: {}, fill_value: {}'.format(
                        periods, fill_value)
                    raise AssertionError(message) from err
Пример #7
0
class Test(TestBase):
    def setUp(self):
        """Create the test executor, wrap it in a test context and enter it."""
        super().setUp()
        self.executor = ExecutorForTest('numpy')
        # _create_test_context returns (context, executor); the returned
        # executor replaces the one created above.
        context, wrapped_executor = self._create_test_context(self.executor)
        self.ctx = context
        self.executor = wrapped_executor
        self.ctx.__enter__()

    def tearDown(self) -> None:
        """Leave the context entered in ``setUp`` and run base-class teardown.

        ``setUp`` chains to ``super().setUp()``, so the matching base-class
        ``tearDown`` is invoked here as well; it runs even if leaving the
        context raises.
        """
        try:
            self.ctx.__exit__(None, None, None)
        finally:
            super().tearDown()

    def testGroupBy(self):
        """Compare executed ``groupby`` objects with their pandas counterparts.

        DataFrames are grouped by column name, by one or several series and
        by index level; Series are grouped by a callable and by a series.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def frame_columns():
            # A fresh column mapping for each DataFrame built below.
            return {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce')
            }

        # default integer index, grouped by column name
        df1 = pd.DataFrame(frame_columns())
        mdf = md.DataFrame(df1, chunk_size=3)
        assert_groupby_equal(run(mdf.groupby('b')), df1.groupby('b'))

        # string index, grouped by column name
        df2 = pd.DataFrame(frame_columns(),
                           index=['i' + str(i) for i in range(9)])
        mdf = md.DataFrame(df2, chunk_size=3)
        assert_groupby_equal(run(mdf.groupby('b')), df2.groupby('b'))

        # grouped by a series
        assert_groupby_equal(run(mdf.groupby(mdf['b'])),
                             df2.groupby(df2['b']))

        # grouped by several series
        assert_groupby_equal(run(mdf.groupby(by=[mdf['b'], mdf['c']])),
                             df2.groupby(by=[df2['b'], df2['c']]))

        # multi-index, grouped by its first level
        multi_index = pd.MultiIndex.from_tuples([(i % 3, 'i' + str(i))
                                                 for i in range(9)])
        df3 = pd.DataFrame(frame_columns(), index=multi_index)
        mdf = md.DataFrame(df3, chunk_size=3)
        assert_groupby_equal(run(mdf.groupby(level=0)), df3.groupby(level=0))

        # Series grouped by a callable
        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)
        assert_groupby_equal(run(ms1.groupby(lambda x: x % 3)),
                             series1.groupby(lambda x: x % 3))

        # Series grouped by itself
        assert_groupby_equal(run(ms1.groupby(ms1)), series1.groupby(series1))

        # Series with string labels, grouped by a callable on the label
        series2 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3],
                            index=['i' + str(i) for i in range(9)])
        ms2 = md.Series(series2, chunk_size=3)
        assert_groupby_equal(run(ms2.groupby(lambda x: int(x[1:]) % 3)),
                             series2.groupby(lambda x: int(x[1:]) % 3))

    def testGroupByGetItem(self):
        """Compare column selection on ``groupby`` objects with pandas.

        Covers list selection and attribute selection, each followed by
        aggregation (``sum``, ``agg``), ``apply``, ``transform`` and
        ``cumsum``.  Results whose row order is not compared directly are
        sorted by index on both sides first.
        """
        def run(tileable):
            # Execute and return the single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        multi_index = pd.MultiIndex.from_tuples([(i % 3, 'i' + str(i))
                                                 for i in range(9)])
        df1 = pd.DataFrame(
            {
                'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                'c': list('aabaaddce')
            },
            index=multi_index)
        mdf = md.DataFrame(df1, chunk_size=3)

        # ---- grouped by index level, list selection ----
        selected = mdf.groupby(level=0)[['a', 'b']]
        assert_groupby_equal(run(selected),
                             df1.groupby(level=0)[['a', 'b']],
                             with_selection=True)

        summed = mdf.groupby(level=0)[['a', 'b']].sum(method='tree')
        pd.testing.assert_frame_equal(
            run(summed),
            df1.groupby(level=0)[['a', 'b']].sum())

        applied = mdf.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1)
        pd.testing.assert_frame_equal(
            run(applied).sort_index(),
            df1.groupby(level=0)[['a', 'b']].apply(lambda x: x + 1).sort_index())

        # ---- grouped by column, list selection ----
        selected = mdf.groupby('b')[['a', 'b']]
        assert_groupby_equal(run(selected),
                             df1.groupby('b')[['a', 'b']],
                             with_selection=True)

        selected = mdf.groupby('b')[['a', 'c']]
        assert_groupby_equal(run(selected),
                             df1.groupby('b')[['a', 'c']],
                             with_selection=True)

        summed = mdf.groupby('b')[['a', 'b']].sum(method='tree')
        pd.testing.assert_frame_equal(
            run(summed),
            df1.groupby('b')[['a', 'b']].sum())

        aggregated = mdf.groupby('b')[['a', 'b']].agg(['sum', 'count'], method='tree')
        pd.testing.assert_frame_equal(
            run(aggregated),
            df1.groupby('b')[['a', 'b']].agg(['sum', 'count']))

        aggregated = mdf.groupby('b')[['a', 'c']].agg(['sum', 'count'], method='tree')
        pd.testing.assert_frame_equal(
            run(aggregated),
            df1.groupby('b')[['a', 'c']].agg(['sum', 'count']))

        applied = mdf.groupby('b')[['a', 'b']].apply(lambda x: x + 1)
        pd.testing.assert_frame_equal(
            run(applied).sort_index(),
            df1.groupby('b')[['a', 'b']].apply(lambda x: x + 1).sort_index())

        transformed = mdf.groupby('b')[['a', 'b']].transform(lambda x: x + 1)
        pd.testing.assert_frame_equal(
            run(transformed).sort_index(),
            df1.groupby('b')[['a', 'b']].transform(lambda x: x + 1).sort_index())

        cumulative = mdf.groupby('b')[['a', 'b']].cumsum()
        pd.testing.assert_frame_equal(
            run(cumulative).sort_index(),
            df1.groupby('b')[['a', 'b']].cumsum().sort_index())

        # ---- grouped by column, attribute selection ----
        selected = mdf.groupby('b').a
        assert_groupby_equal(run(selected),
                             df1.groupby('b').a,
                             with_selection=True)

        summed = mdf.groupby('b').a.sum(method='tree')
        pd.testing.assert_series_equal(
            run(summed),
            df1.groupby('b').a.sum())

        aggregated = mdf.groupby('b').a.agg(['sum', 'mean', 'var'], method='tree')
        pd.testing.assert_frame_equal(
            run(aggregated),
            df1.groupby('b').a.agg(['sum', 'mean', 'var']))

        applied = mdf.groupby('b').a.apply(lambda x: x + 1)
        pd.testing.assert_series_equal(
            run(applied).sort_index(),
            df1.groupby('b').a.apply(lambda x: x + 1).sort_index())

        transformed = mdf.groupby('b').a.transform(lambda x: x + 1)
        pd.testing.assert_series_equal(
            run(transformed).sort_index(),
            df1.groupby('b').a.transform(lambda x: x + 1).sort_index())

        cumulative = mdf.groupby('b').a.cumsum()
        pd.testing.assert_series_equal(
            run(cumulative).sort_index(),
            df1.groupby('b').a.cumsum().sort_index())

    def testDataFrameGroupByAgg(self):
        rs = np.random.RandomState(0)
        df1 = pd.DataFrame({
            'a': rs.choice([2, 3, 4], size=(100, )),
            'b': rs.choice([2, 3, 4], size=(100, ))
        })
        mdf = md.DataFrame(df1, chunk_size=3)

        df2 = pd.DataFrame({
            'c1': np.arange(10).astype(np.int64),
            'c2': rs.choice(['a', 'b', 'c'], (10, )),
            'c3': rs.rand(10)
        })
        mdf2 = md.DataFrame(df2, chunk_size=2)

        for method in ['tree', 'shuffle']:
            r0 = mdf2.groupby('c2').agg('size', method=method)
            pd.testing.assert_series_equal(
                self.executor.execute_dataframe(r0, concat=True)[0],
                df2.groupby('c2').agg('size'))

            r1 = mdf.groupby('a').agg('sum', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r1, concat=True)[0],
                df1.groupby('a').agg('sum'))
            r2 = mdf.groupby('b').agg('min', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r2, concat=True)[0],
                df1.groupby('b').agg('min'))

            r1 = mdf2.groupby('c2').agg('prod', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r1, concat=True)[0],
                df2.groupby('c2').agg('prod'))
            r2 = mdf2.groupby('c2').agg('max', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r2, concat=True)[0],
                df2.groupby('c2').agg('max'))
            r3 = mdf2.groupby('c2').agg('count', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg('count'))
            r4 = mdf2.groupby('c2').agg('mean', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r4, concat=True)[0],
                df2.groupby('c2').agg('mean'))
            r5 = mdf2.groupby('c2').agg('var', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r5, concat=True)[0],
                df2.groupby('c2').agg('var'))
            r6 = mdf2.groupby('c2').agg('std', method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r6, concat=True)[0],
                df2.groupby('c2').agg('std'))

            agg = ['std', 'mean', 'var', 'max', 'count', 'size']
            r3 = mdf2.groupby('c2').agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg(agg))

            agg = OrderedDict([('c1', ['min', 'mean']), ('c3', 'std')])
            r3 = mdf2.groupby('c2').agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg(agg))

            agg = OrderedDict([('c1', 'min'), ('c3', 'sum')])
            r3 = mdf2.groupby('c2').agg(agg, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg(agg))

            r3 = mdf2.groupby('c2').agg({'c1': 'min'}, method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby('c2').agg({'c1': 'min'}))

            # test groupby series
            r3 = mdf2.groupby(mdf2['c2']).sum(method=method)
            pd.testing.assert_frame_equal(
                self.executor.execute_dataframe(r3, concat=True)[0],
                df2.groupby(df2['c2']).sum())

        r8 = mdf2.groupby('c2').size(method='tree')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(r8, concat=True)[0],
            df2.groupby('c2').size())

        r4 = mdf2.groupby('c2').sum(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r4, concat=True)[0],
            df2.groupby('c2').sum())

        r5 = mdf2.groupby('c2').prod(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r5, concat=True)[0],
            df2.groupby('c2').prod())

        r6 = mdf2.groupby('c2').min(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r6, concat=True)[0],
            df2.groupby('c2').min())

        r7 = mdf2.groupby('c2').max(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r7, concat=True)[0],
            df2.groupby('c2').max())

        r8 = mdf2.groupby('c2').count(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r8, concat=True)[0],
            df2.groupby('c2').count())

        r9 = mdf2.groupby('c2').mean(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r9, concat=True)[0],
            df2.groupby('c2').mean())

        r10 = mdf2.groupby('c2').var(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r10, concat=True)[0],
            df2.groupby('c2').var())

        r11 = mdf2.groupby('c2').std(method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r11, concat=True)[0],
            df2.groupby('c2').std())

        # test as_index=False
        r12 = mdf2.groupby('c2', as_index=False).agg('mean', method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r12, concat=True)[0],
            df2.groupby('c2', as_index=False).agg('mean'))
        self.assertFalse(r12.op.groupby_params['as_index'])

        # test as_index=False takes no effect
        r13 = mdf2.groupby(['c1', 'c2'], as_index=False).agg(['mean', 'count'],
                                                             method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r13, concat=True)[0],
            df2.groupby(['c1', 'c2'], as_index=False).agg(['mean', 'count']))
        self.assertTrue(r13.op.groupby_params['as_index'])

        r14 = mdf2.groupby('c2').agg(['cumsum', 'cumcount'], method='tree')
        pd.testing.assert_frame_equal(
            self.executor.execute_dataframe(r14, concat=True)[0].sort_index(),
            df2.groupby('c2').agg(['cumsum', 'cumcount']).sort_index())

        # test auto method
        r15 = mdf2.groupby('c2').agg('prod')
        self.assertEqual(r15.op.method, 'auto')
        self.assertTrue(
            all((not isinstance(c.op, ShuffleProxy))
                for c in r15.build_graph(tiled=True)))

    def testSeriesGroupByAgg(self):
        """Compare grouped aggregations of a chunked Series with pandas.

        Each aggregation is built on the chunked series, executed through
        ``self.executor`` and checked against what pandas returns for the
        same raw series and the same grouping.
        """
        rs = np.random.RandomState(0)
        series1 = pd.Series(rs.rand(10))
        ms1 = md.Series(series1, chunk_size=3)

        def parity(label):
            # grouping key derived from the index label
            return label % 2

        def fetch(tileable):
            # execute and return the first (concatenated) result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        single_aggs = ['size', 'sum', 'min', 'prod', 'max', 'count', 'mean',
                       'var', 'std']
        multi_aggs = ['std', 'mean', 'var', 'max', 'count', 'size']

        # named aggregations through ``agg`` for every ``method`` value
        for method in ('tree', 'shuffle'):
            for func in single_aggs:
                lazy = ms1.groupby(parity).agg(func, method=method)
                pd.testing.assert_series_equal(
                    fetch(lazy), series1.groupby(parity).agg(func))

            # a list of aggregations is compared as a frame
            lazy = ms1.groupby(parity).agg(multi_aggs, method=method)
            pd.testing.assert_frame_equal(
                fetch(lazy), series1.groupby(parity).agg(multi_aggs))

            # test groupby series
            lazy = ms1.groupby(ms1).sum(method=method)
            pd.testing.assert_series_equal(
                fetch(lazy), series1.groupby(series1).sum())

        # the same reductions through the dedicated groupby methods
        direct_methods = ('size', 'sum', 'prod', 'min', 'max', 'count',
                          'mean', 'var', 'std')
        for func in direct_methods:
            lazy = getattr(ms1.groupby(parity), func)(method='tree')
            pd.testing.assert_series_equal(
                fetch(lazy), getattr(series1.groupby(parity), func)())

        # cumulative functions: both sides are sorted by index before comparing
        cumulative = ['cumsum', 'cumcount']
        lazy = ms1.groupby(parity).agg(['cumsum', 'cumcount'], method='tree')
        pd.testing.assert_frame_equal(
            fetch(lazy).sort_index(),
            series1.groupby(parity).agg(cumulative).sort_index())

    def testGroupByApply(self):
        """Compare ``groupby(...).apply`` on chunked data with pandas.

        Covers a frame-returning function, a series-returning lambda and a
        scalar-returning lambda on a DataFrame, then a Series with a plain
        index and a Series with a MultiIndex.  Both sides are sorted by
        index before they are compared.
        """
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })

        def apply_df(df):
            # add column ``b`` onto ``a``; drop the last row of non-empty input
            df = df.sort_index()
            df.a += df.b
            return df.iloc[:-1, :] if len(df.index) > 0 else df

        def apply_series(s, truncate=True):
            # sort, then optionally drop the last element of non-empty input
            ordered = s.sort_index()
            if not truncate or len(ordered.index) == 0:
                return ordered
            return ordered.iloc[:-1]

        def fetch_sorted(tileable):
            outcome = self.executor.execute_dataframe(tileable,
                                                      concat=True)[0]
            return outcome.sort_index()

        def check_series(raw, key):
            # apply ``apply_series`` per group on a chunked copy of ``raw``
            tiled = md.Series(raw, chunk_size=3)
            applied = tiled.groupby(key).apply(apply_series)
            pd.testing.assert_series_equal(
                fetch_sorted(applied),
                raw.groupby(key).apply(apply_series).sort_index())

        mdf = md.DataFrame(df1, chunk_size=3)

        applied = mdf.groupby('b').apply(apply_df)
        pd.testing.assert_frame_equal(
            fetch_sorted(applied),
            df1.groupby('b').apply(apply_df).sort_index())

        # functions returning a column, then a scalar, per group
        for func in (lambda df: df.a, lambda df: df.a.sum()):
            applied = mdf.groupby('b').apply(func)
            pd.testing.assert_series_equal(
                fetch_sorted(applied),
                df1.groupby('b').apply(func).sort_index())

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        check_series(series1, lambda x: x % 3)

        # MultiIndex labels are tuples, so the key uses the first level
        sindex2 = pd.MultiIndex.from_arrays(
            [list(range(9)), list('ABCDEFGHI')])
        series2 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3], index=sindex2)
        check_series(series2, lambda x: x[0] % 3)

    def testGroupByTransform(self):
        """Compare ``groupby(...).transform`` on chunked data with pandas.

        A plain transform is checked against pandas ``transform``; calls
        made with ``_call_agg=True`` are checked against pandas ``agg``
        with the same specification.  Both sides are sorted by index
        before they are compared.
        """
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'e': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'f': list('aabaaddce'),
        })

        def transform_series(s, truncate=True):
            # sort; when truncating input longer than one element, drop the
            # last element and renumber the index
            ordered = s.sort_index()
            if truncate and len(ordered.index) > 1:
                return ordered.iloc[:-1].reset_index(drop=True)
            return ordered

        def fetch_sorted(tileable):
            outcome = self.executor.execute_dataframe(tileable,
                                                      concat=True)[0]
            return outcome.sort_index()

        mdf = md.DataFrame(df1, chunk_size=3)

        r = mdf.groupby('b').transform(transform_series, truncate=False)
        pd.testing.assert_frame_equal(
            fetch_sorted(r),
            df1.groupby('b').transform(transform_series,
                                       truncate=False).sort_index())

        r = mdf.groupby('b').transform(['cummax', 'cumsum'], _call_agg=True)
        pd.testing.assert_frame_equal(
            fetch_sorted(r),
            df1.groupby('b').agg(['cummax', 'cumsum']).sort_index())

        # specifications shared by both sides: list, ordered mapping, and a
        # list mixing a named function with a callable
        frame_specs = (
            ['cummax', 'cumsum'],
            OrderedDict([('d', 'cummax'), ('b', 'cumsum')]),
            ['sum', lambda s: s.sum()],
        )
        for spec in frame_specs:
            r = mdf.groupby('b').transform(spec, _call_agg=True)
            pd.testing.assert_frame_equal(
                fetch_sorted(r),
                df1.groupby('b').agg(spec).sort_index())

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)

        def mod3(label):
            # grouping key derived from the index label
            return label % 3

        r = ms1.groupby(mod3).transform(lambda x: x + 1)
        pd.testing.assert_series_equal(
            fetch_sorted(r),
            series1.groupby(mod3).transform(lambda x: x + 1).sort_index())

        r = ms1.groupby(mod3).transform('cummax', _call_agg=True)
        pd.testing.assert_series_equal(
            fetch_sorted(r),
            series1.groupby(mod3).agg('cummax').sort_index())

        series_spec = ['cummax', 'cumcount']
        r = ms1.groupby(mod3).transform(series_spec, _call_agg=True)
        pd.testing.assert_frame_equal(
            fetch_sorted(r),
            series1.groupby(mod3).agg(series_spec).sort_index())

    def testGroupByCum(self):
        """Compare grouped cumulative functions on chunked data with pandas.

        DataFrame groups are checked with the default arguments and with
        ``axis=1``; ``cumcount`` is checked separately because it is
        compared as a series.  Both sides are sorted by index first.
        """
        df1 = pd.DataFrame({
            'a': [3, 5, 2, 7, 1, 2, 4, 6, 2, 4],
            'b': [8, 3, 4, 1, 8, 2, 2, 2, 2, 3],
            'c': [1, 8, 8, 5, 3, 5, 0, 0, 5, 4]
        })
        mdf = md.DataFrame(df1, chunk_size=3)

        def fetch_sorted(tileable):
            outcome = self.executor.execute_dataframe(tileable,
                                                      concat=True)[0]
            return outcome.sort_index()

        for fun in ('cummin', 'cummax', 'cumprod', 'cumsum'):
            # default call first, then the column-wise variant
            for kwargs in ({}, {'axis': 1}):
                lazy = getattr(mdf.groupby('b'), fun)(**kwargs)
                pd.testing.assert_frame_equal(
                    fetch_sorted(lazy),
                    getattr(df1.groupby('b'), fun)(**kwargs).sort_index())

        lazy = mdf.groupby('b').cumcount()
        pd.testing.assert_series_equal(
            fetch_sorted(lazy),
            df1.groupby('b').cumcount().sort_index())

        series1 = pd.Series([3, 4, 5, 3, 5, 4, 1, 2, 3])
        ms1 = md.Series(series1, chunk_size=3)

        def parity(label):
            # grouping key derived from the index label
            return label % 2

        for fun in ('cummin', 'cummax', 'cumprod', 'cumsum', 'cumcount'):
            lazy = getattr(ms1.groupby(parity), fun)()
            pd.testing.assert_series_equal(
                fetch_sorted(lazy),
                getattr(series1.groupby(parity), fun)().sort_index())
Пример #8
0
class Test(TestBase):
    """Execution tests for merge, join and append on chunked data.

    Each test builds the operation on objects created with ``from_pandas``
    or ``series_from_pandas``, executes it, and compares the outcome with
    what pandas returns for the raw objects.
    """

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def _execute_concat(self, tileable):
        """Execute *tileable* on ``self.executor``; return the first result."""
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testMerge(self):
        """Check ``merge`` for several key specifications against pandas."""
        df1 = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1,
                           columns=['a', 'b', 'c', 'd', 'e'])
        df2 = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1,
                           columns=['a', 'b', 'x', 'y'])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # Note [Index of Merge]
        #
        # When `left_index` and `right_index` of `merge` are both false, pandas generates a RangeIndex
        # for the final result dataframe.
        #
        # We chunked the `left` and `right` dataframes, thus every result chunk has its own RangeIndex.
        # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot
        # obtain the same index values as pandas. But we guarantee that the content of the dataframe
        # is correct.

        # (merge keyword arguments, column moved into the index before comparing)
        cases = [
            # merge with default arguments
            ({}, None),
            # merge on left index and `right_on`
            ({'how': 'left', 'right_on': 'x', 'left_index': True}, 'a_x'),
            # merge on `left_on` and right index
            ({'how': 'right', 'left_on': 'a', 'right_index': True}, 'a'),
            # merge on `left_on` and `right_on`
            ({'how': 'left', 'left_on': 'a', 'right_on': 'x'}, 'a_x'),
            # merge on `on`
            ({'how': 'right', 'on': 'a'}, 'a'),
            # merge on multiple columns
            ({'how': 'inner', 'on': ['a', 'b']}, None),
        ]
        for merge_kwargs, index_column in cases:
            expected = df1.merge(df2, **merge_kwargs)
            result = self._execute_concat(mdf1.merge(mdf2, **merge_kwargs))
            if index_column is not None:
                expected.set_index(index_column, inplace=True)
                result.set_index(index_column, inplace=True)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                          sort_dataframe_inplace(result, 0))

    def testJoin(self):
        """Check index-based ``join`` for every ``how`` value against pandas."""
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           index=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                           index=['a1', 'b2', 'b3']) + 1
        df2 = pd.concat([df2, df2 + 1])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        # the empty mapping exercises the default `how`
        variants = ({}, {'how': 'left'}, {'how': 'right'}, {'how': 'inner'},
                    {'how': 'outer'})
        for how_kwargs in variants:
            expected = df1.join(df2, lsuffix='l_', rsuffix='r_', **how_kwargs)
            joined = mdf1.join(mdf2, lsuffix='l_', rsuffix='r_', **how_kwargs)
            result = self._execute_concat(joined)
            pd.testing.assert_frame_equal(expected.sort_index(),
                                          result.sort_index())

    def testJoinOn(self):
        """Check ``join`` with the ``on`` argument against pandas."""
        df1 = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]],
                           columns=['a1', 'a2', 'a3'])
        df2 = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]],
                           columns=['a1', 'b2', 'b3']) + 1
        df2 = pd.concat([df2, df2 + 1])

        mdf1 = from_pandas(df1, chunk_size=2)
        mdf2 = from_pandas(df2, chunk_size=2)

        suffixes = {'lsuffix': '_l', 'rsuffix': '_r'}

        expected0 = df1.join(df2, on=None, **suffixes)
        result0 = self._execute_concat(mdf1.join(mdf2, on=None, **suffixes))
        pd.testing.assert_frame_equal(sort_dataframe_inplace(expected0, 0),
                                      sort_dataframe_inplace(result0, 0))

        expected1 = df1.join(df2, how='left', on='a1', **suffixes)
        jdf1 = mdf1.join(mdf2, how='left', on='a1', **suffixes)
        result1 = self._execute_concat(jdf1)

        # Note [Columns of Left Join]
        #
        # I believe we have no chance to obtain entirely the same result as pandas here.
        #
        # Look at the following example:
        #
        # >>> df1
        #     a1  a2  a3
        # 0   1   3   3
        # >>> df2
        #     a1  b2  b3
        # 1   2   6   7
        # >>> df3
        #     a1  b2  b3
        # 1   2   6   7
        # 1   2   6   7
        #
        # >>> df1.merge(df2, how='left', left_on='a1', left_index=False, right_index=True)
        #     a1_x  a2  a3  a1_y  b2  b3
        # 0   1   3   3     2   6   7
        # >>> df1.merge(df3, how='left', left_on='a1', left_index=False, right_index=True)
        #     a1  a1_x  a2  a3  a1_y  b2  b3
        # 0   1     1   3   3     2   6   7
        # 0   1     1   3   3     2   6   7
        #
        # Note that the result of `df1.merge(df3)` has an extra column `a1` compared to `df1.merge(df2)`.
        # The value of column `a1` is the same as `a1_x`, just because `1` occurs twice in the index of
        # `df3`. I haven't investigated why pandas has such behaviour...
        #
        # We cannot yield the same result as pandas because `df3` is chunked: some of the result chunks
        # have 6 columns, others may have 7 columns, and when concatenated into one DataFrame some cells
        # of column `a1` will have value `NaN`, which is different from the result of pandas.
        #
        # But we can guarantee that the other effective columns have exactly the same values as pandas.

        columns_to_compare = jdf1.columns_value.to_pandas()

        pd.testing.assert_frame_equal(
            sort_dataframe_inplace(expected1[columns_to_compare], 0, 1),
            sort_dataframe_inplace(result1[columns_to_compare], 0, 1))

        # Note [Index of Join on EmptyDataFrame]
        #
        # It is non-trivial to get the same `index` result as pandas.
        #
        # Look at the following example:
        #
        # >>> df1
        #    a1  a2  a3
        # 1   4   2   6
        # >>> df2
        #    a1  b2  b3
        # 1   2   6   7
        # 2   8   9  10
        # >>> df3
        # Empty DataFrame
        # Columns: [a1, a2, a3]
        # Index: []
        # >>> df1.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #       a1_l  a2   a3  a1_r  b2  b3
        # 1.0   4.0   2  6.0     8   9  10
        # NaN   NaN   1  NaN     2   6   7
        # >>> df3.join(df2, how='right', on='a2', lsuffix='_l', rsuffix='_r')
        #     a1_l  a2  a3  a1_r  b2  b3
        # 1   NaN   1 NaN     2   6   7
        # 2   NaN   2 NaN     8   9  10
        #
        # When the `left` dataframe is not empty, the mismatched rows in `right` have index value `NaN`,
        # and the matched rows have index values from `right`. When the `left` dataframe is empty, the
        # mismatched rows have index values from `right`.
        #
        # Since we chunked the `left` dataframe, it is not easy to obtain the same index values as pandas
        # in the final result dataframe, but we guarantee that the dataframe content is correct.

        # (how, whether `a2` is moved into the index before comparing)
        for how, reindex in (('right', True), ('inner', False), ('outer', True)):
            expected = df1.join(df2, how=how, on='a2', **suffixes)
            joined = mdf1.join(mdf2, how=how, on='a2', **suffixes)
            result = self._execute_concat(joined)
            if reindex:
                expected.set_index('a2', inplace=True)
                result.set_index('a2', inplace=True)
            pd.testing.assert_frame_equal(sort_dataframe_inplace(expected, 0),
                                          sort_dataframe_inplace(result, 0))

    def testMergeOneChunk(self):
        """Check ``merge`` when one or both inputs are built without ``chunk_size``."""
        labels = ['a1', 'a2', 'a3', 'a4']
        df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                            'value': [1, 2, 3, 5]}, index=labels)
        df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
                            'value': [5, 6, 7, 8]}, index=labels)

        # (keyword arguments for the left input, for the right input)
        layouts = [
            ({}, {}),                 # all have one chunk
            ({}, {'chunk_size': 2}),  # left has one chunk
            ({'chunk_size': 3}, {}),  # right has one chunk
        ]
        for left_kwargs, right_kwargs in layouts:
            mdf1 = from_pandas(df1, **left_kwargs)
            mdf2 = from_pandas(df2, **right_kwargs)

            expected = df1.merge(df2, left_on='lkey', right_on='rkey')
            merged = mdf1.merge(mdf2, left_on='lkey', right_on='rkey')
            result = self._execute_concat(merged)

            # row order is not compared: sort by the second column, renumber
            pd.testing.assert_frame_equal(
                expected.sort_values(by=expected.columns[1]).reset_index(drop=True),
                result.sort_values(by=result.columns[1]).reset_index(drop=True))

    def testAppendExecution(self):
        """Check ``append`` for DataFrames and Series against pandas."""
        executor = ExecutorForTest(storage=new_session().context)

        def check_pairwise(raw_left, raw_right, left, right, assert_equal):
            # plain append runs on the default executor
            appended = left.append(right)
            expected = raw_left.append(raw_right)
            assert_equal(expected, self._execute_concat(appended))

            # NOTE(review): the `ignore_index=True` cases run on the
            # session-backed executor, presumably because they need the
            # session context - confirm.
            appended = left.append(right, ignore_index=True)
            expected = raw_left.append(raw_right, ignore_index=True)
            assert_equal(
                expected, executor.execute_dataframe(appended, concat=True)[0])

        df1 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        df2 = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        # both inputs with the same chunk size
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=3)
        check_pairwise(df1, df2, mdf1, mdf2, pd.testing.assert_frame_equal)

        # inputs with different chunk sizes
        mdf1 = from_pandas(df1, chunk_size=3)
        mdf2 = from_pandas(df2, chunk_size=2)
        check_pairwise(df1, df2, mdf1, mdf2, pd.testing.assert_frame_equal)

        # appending a list of frames
        df3 = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
        mdf3 = from_pandas(df3, chunk_size=3)
        expected = df1.append([df2, df3])
        adf = mdf1.append([mdf2, mdf3])
        pd.testing.assert_frame_equal(expected, self._execute_concat(adf))

        # appending a single row given as a mapping
        row = dict(A=1, B=2, C=3, D=4)
        adf = mdf1.append(row, ignore_index=True)
        expected = df1.append(row, ignore_index=True)
        pd.testing.assert_frame_equal(
            expected, executor.execute_dataframe(adf, concat=True)[0])

        # test for series
        series1 = pd.Series(np.random.rand(10,))
        series2 = pd.Series(np.random.rand(10,))

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=3)
        check_pairwise(series1, series2, mseries1, mseries2,
                       pd.testing.assert_series_equal)

        mseries1 = series_from_pandas(series1, chunk_size=3)
        mseries2 = series_from_pandas(series2, chunk_size=2)
        check_pairwise(series1, series2, mseries1, mseries2,
                       pd.testing.assert_series_equal)

        # appending a list of series
        series3 = pd.Series(np.random.rand(4,))
        mseries3 = series_from_pandas(series3, chunk_size=2)
        expected = series1.append([series2, series3])
        aseries = mseries1.append([mseries2, mseries3])
        pd.testing.assert_series_equal(expected,
                                       self._execute_concat(aseries))
Пример #9
0
    def testSortIndexExecution(self):
        """Compare ``sort_index`` on chunked DataFrame/Series against pandas.

        Exercises sorting along rows and along columns, descending order,
        in-place sorting and ``ignore_index=True`` with several chunk sizes.
        Every executed result must equal what pandas produces for the same
        raw data.
        """
        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

        # no explicit chunk_size
        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        # in-place sort mutates the wrapper itself
        mdf = DataFrame(raw)
        mdf.sort_index(inplace=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=30)
        result = self.executor.execute_dataframe(mdf.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=20)
        result = self.executor.execute_dataframe(
            mdf.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        # the ignore_index case runs on an executor backed by a new session
        # context (presumably required by that code path -- TODO confirm)
        executor = ExecutorForTest(storage=new_session().context)

        mdf = DataFrame(raw, chunk_size=10)
        result = executor.execute_dataframe(mdf.sort_index(ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5
            expected = raw.sort_index(ignore_index=True)
        except TypeError:
            # pandas rejected the ignore_index keyword: emulate it by
            # relabelling the sorted (row) axis with 0..n-1.
            expected = raw.sort_index()
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

        mdf = DataFrame(raw)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=3)
        result = self.executor.execute_dataframe(mdf.sort_index(axis=1),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        result = self.executor.execute_dataframe(mdf.sort_index(
            axis=1, ascending=False),
                                                 concat=True)[0]
        expected = raw.sort_index(axis=1, ascending=False)
        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(raw, chunk_size=4)
        executor = ExecutorForTest(storage=new_session().context)

        result = executor.execute_dataframe(mdf.sort_index(axis=1,
                                                           ignore_index=True),
                                            concat=True)[0]
        try:  # for python3.5
            expected = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            expected = raw.sort_index(axis=1)
            # ignore_index relabels the axis that was sorted; with axis=1
            # that is the column axis, so reset the columns, not the index.
            expected.columns = pd.RangeIndex(expected.shape[1])
        pd.testing.assert_frame_equal(result, expected)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

        series = Series(raw)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = self.executor.execute_dataframe(series.sort_index(),
                                                 concat=True)[0]
        expected = raw.sort_index()
        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = self.executor.execute_dataframe(
            series.sort_index(ascending=False), concat=True)[0]
        expected = raw.sort_index(ascending=False)
        pd.testing.assert_series_equal(result, expected)
Пример #10
0
class Test(unittest.TestCase):
    """Execution checks for ``dot`` with DataFrame, Series and ndarray operands."""

    def setUp(self) -> None:
        """Create the executor shared by the tests of this class."""
        self.executor = ExecutorForTest('numpy')

    def _evaluate(self, tileable):
        """Execute ``tileable`` and return its single concatenated result."""
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testDotExecution(self):
        """Each ``dot`` operand pairing must give the same answer as pandas."""
        left_raw = pd.DataFrame(np.random.rand(4, 7))
        right_raw = pd.DataFrame(np.random.rand(7, 5), columns=list('efghi'))
        vec_a_raw = pd.Series(np.random.rand(7))
        vec_b_raw = pd.Series(np.random.rand(7))

        left = DataFrame(left_raw, chunk_size=(3, 2))
        right = DataFrame(right_raw, chunk_size=(3, 4))

        # DataFrame . DataFrame, via the method ...
        pd.testing.assert_frame_equal(
            self._evaluate(left.dot(right)), left_raw.dot(right_raw))

        # ... and via the matmul operator
        pd.testing.assert_frame_equal(
            self._evaluate(left @ right), left_raw @ right_raw)

        vec_a = Series(vec_a_raw, chunk_size=5)

        # DataFrame . Series
        pd.testing.assert_series_equal(
            self._evaluate(left.dot(vec_a)), left_raw.dot(vec_a_raw))

        # DataFrame . 2-d ndarray
        right_values = right_raw.to_numpy()
        pd.testing.assert_frame_equal(
            self._evaluate(left.dot(right_values)), left_raw.dot(right_values))

        # DataFrame . 1-d ndarray
        vec_a_values = vec_a_raw.to_numpy()
        pd.testing.assert_series_equal(
            self._evaluate(left.dot(vec_a_values)), left_raw.dot(vec_a_values))

        vec_b = Series(vec_b_raw, chunk_size=4)

        # Series . Series yields a scalar
        self.assertAlmostEqual(
            self._evaluate(vec_a.dot(vec_b)), vec_a_raw.dot(vec_b_raw))

        # Series . DataFrame
        pd.testing.assert_series_equal(
            self._evaluate(vec_a.dot(right)), vec_a_raw.dot(right_raw))

        # Series . 2-d ndarray
        np.testing.assert_almost_equal(
            self._evaluate(vec_a.dot(right_values)),
            vec_a_raw.dot(right_values))

        # Series . 1-d ndarray yields a scalar
        vec_b_values = vec_b_raw.to_numpy()
        self.assertAlmostEqual(
            self._evaluate(vec_a.dot(vec_b_values)),
            vec_a_raw.dot(vec_b_values))
Пример #11
0
class TestReduction(TestBase):
    """Execution tests shared by reduction operations.

    The reduction under test is looked up by name through ``self.func_name``
    and the ``min_count`` cases are gated by ``self.has_min_count``.  Neither
    attribute is defined in this class; they are expected to be supplied
    elsewhere (e.g. by concrete subclasses).
    """

    def setUp(self):
        """Create the executor shared by the tests of this class."""
        self.executor = ExecutorForTest()

    def compute(self, data, **kwargs):
        """Call the reduction named ``self.func_name`` on ``data``.

        ``kwargs`` are forwarded unchanged to the reduction method.
        """
        return getattr(data, self.func_name)(**kwargs)

    def testSeriesReduction(self):
        """Series reductions must match pandas for several chunk sizes."""
        data = pd.Series(np.random.randint(0, 8, (10,)), index=[str(i) for i in range(10)], name='a')
        reduction_df1 = self.compute(from_pandas_series(data))
        self.assertAlmostEqual(
            self.compute(data), self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
        self.assertAlmostEqual(
            self.compute(data), self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data), self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4), axis='index')
        self.assertAlmostEqual(
            self.compute(data, axis='index'), self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan
        reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data), self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3), skipna=False)
        self.assertTrue(
            np.isnan(self.executor.execute_dataframe(reduction_df2, concat=True)[0]))

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3), skipna=False, min_count=2)
            self.assertTrue(
                np.isnan(self.executor.execute_dataframe(reduction_df3, concat=True)[0]))

            reduction_df4 = self.compute(from_pandas_series(data, chunk_size=3), min_count=1)
            self.assertAlmostEqual(
                self.compute(data, min_count=1),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            # min_count exceeds the series length (20), so the result is NaN
            reduction_df5 = self.compute(from_pandas_series(data, chunk_size=3), min_count=21)
            self.assertTrue(
                np.isnan(self.executor.execute_dataframe(reduction_df5, concat=True)[0]))

    def testDataFrameReduction(self):
        """DataFrame reductions must match pandas along both axes."""
        data = pd.DataFrame(np.random.rand(20, 10))
        reduction_df1 = self.compute(from_pandas_df(data))
        pd.testing.assert_series_equal(
            self.compute(data), self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data), self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=6), axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3), axis=1)
        pd.testing.assert_series_equal(
            self.compute(data, axis=1),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        np_data = np.random.rand(20, 10)
        np_data[np_data > 0.6] = np.nan
        data = pd.DataFrame(np_data)

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data), self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3), skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False), self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3), min_count=15)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=15),
                self.executor.execute_dataframe(reduction_df3, concat=True)[0])

            reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3), min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=3),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3), axis=1, min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=3),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3), axis=1, min_count=8)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=8),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

        # test numeric_only
        data = pd.DataFrame(np.random.rand(10, 10), index=np.random.randint(-100, 100, size=(10,)),
                            columns=[np.random.bytes(10) for _ in range(10)])
        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=2))
        pd.testing.assert_series_equal(
            self.compute(data), self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=6), axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3), axis='columns')
        pd.testing.assert_series_equal(
            self.compute(data, axis='columns'),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        # mix float columns with a string and a bool column
        data_dict = {str(i): np.random.rand(10) for i in range(10)}
        data_dict['string'] = [str(i) for i in range(10)]
        data_dict['bool'] = np.random.choice([True, False], (10,))
        data = pd.DataFrame(data_dict)
        reduction_df = self.compute(from_pandas_df(data, chunk_size=3), axis='index', numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df, concat=True)[0])

        # reduce the sum of two frames built with different chunk sizes;
        # both sides are sorted by label before comparing
        data1 = pd.DataFrame(np.random.rand(10, 10), columns=[str(i) for i in range(10)])
        data2 = pd.DataFrame(np.random.rand(10, 10), columns=[str(i) for i in range(10)])
        df = from_pandas_df(data1, chunk_size=5) + from_pandas_df(data2, chunk_size=6)
        reduction_df = self.compute(df)
        pd.testing.assert_series_equal(
            self.compute(data1 + data2).sort_index(),
            self.executor.execute_dataframe(reduction_df, concat=True)[0].sort_index())

    @require_cudf
    @require_cupy
    def testGPUExecution(self):
        """Run a few reductions on data moved through ``to_gpu``."""
        df_raw = pd.DataFrame(np.random.rand(30, 3), columns=list('abc'))
        df = to_gpu(from_pandas_df(df_raw, chunk_size=6))

        r = df.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.sum())

        r = df.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), df_raw.kurt())

        r = df.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(res.to_pandas(), df_raw.agg(['sum', 'var']))

        s_raw = pd.Series(np.random.rand(30))
        s = to_gpu(from_pandas_series(s_raw, chunk_size=6))

        r = s.sum()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.sum())

        r = s.kurt()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        self.assertAlmostEqual(res, s_raw.kurt())

        r = s.agg(['sum', 'var'])
        res = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_series_equal(res.to_pandas(), s_raw.agg(['sum', 'var']))

        s_raw = pd.Series(np.random.randint(0, 3, size=(30,))
                          * np.random.randint(0, 5, size=(30,)))
        s = to_gpu(from_pandas_series(s_raw, chunk_size=6))

        r = s.unique()
        res = self.executor.execute_dataframe(r, concat=True)[0]
        # ndarray.sort() sorts in place and returns None, which would make
        # this comparison None == None; np.sort returns the sorted copies.
        np.testing.assert_array_equal(np.sort(cp.asnumpy(res)),
                                      np.sort(s_raw.unique()))
Пример #12
0
class TestCount(TestBase):
    """Execution tests for ``count`` on chunked Series and DataFrame."""

    def setUp(self):
        """Create the executor shared by the tests of this class."""
        self.executor = ExecutorForTest()

    def _evaluate(self, tileable):
        """Execute ``tileable`` and return its single concatenated result."""
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testSeriesCount(self):
        """``Series.count`` must skip NaN exactly like pandas does."""
        values = np.random.rand(10)
        values[[2, 7, 9]] = np.nan
        raw = pd.Series(values)

        # default chunking first, then two explicit chunk sizes
        for build_kwargs in ({}, {'chunk_size': 1}, {'chunk_size': 3}):
            counted = from_pandas_series(raw, **build_kwargs).count()
            self.assertEqual(self._evaluate(counted), raw.count())

    def testDataFrameCount(self):
        """``DataFrame.count`` must match pandas along both axes."""
        raw = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]
        })

        per_column = {}
        per_row = {'axis': 'columns'}
        # (construction kwargs, count kwargs to try on that frame)
        scenarios = [
            ({}, [per_column, per_row]),
            ({'chunk_size': 2}, [per_column, per_row]),
            ({'chunk_size': 3}, [{'numeric_only': True},
                                 {'axis': 'columns', 'numeric_only': True}]),
        ]
        for build_kwargs, count_variants in scenarios:
            tiled = from_pandas_df(raw, **build_kwargs)
            for count_kwargs in count_variants:
                actual = self._evaluate(tiled.count(**count_kwargs))
                wanted = raw.count(**count_kwargs)
                pd.testing.assert_series_equal(actual, wanted)
Пример #13
0
class TestCount(TestBase):
    """Execution tests for ``count`` and ``nunique`` on chunked data."""

    def setUp(self):
        """Create the executor shared by the tests of this class."""
        self.executor = ExecutorForTest()

    def _evaluate(self, tileable):
        """Execute ``tileable`` and return its single concatenated result."""
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testSeriesCount(self):
        """``Series.count`` must skip NaN exactly like pandas does."""
        values = np.random.rand(10)
        values[[2, 7, 9]] = np.nan
        raw = pd.Series(values)

        # default chunking first, then two explicit chunk sizes
        for build_kwargs in ({}, {'chunk_size': 1}, {'chunk_size': 3}):
            counted = from_pandas_series(raw, **build_kwargs).count()
            self.assertEqual(self._evaluate(counted), raw.count())

    def testDataFrameCount(self):
        """``DataFrame.count`` must match pandas along both axes."""
        raw = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]
        })

        per_column = {}
        per_row = {'axis': 'columns'}
        # (construction kwargs, count kwargs to try on that frame)
        scenarios = [
            ({}, [per_column, per_row]),
            ({'chunk_size': 2}, [per_column, per_row]),
            ({'chunk_size': 3}, [{'numeric_only': True},
                                 {'axis': 'columns', 'numeric_only': True}]),
        ]
        for build_kwargs, count_variants in scenarios:
            tiled = from_pandas_df(raw, **build_kwargs)
            for count_kwargs in count_variants:
                actual = self._evaluate(tiled.count(**count_kwargs))
                wanted = raw.count(**count_kwargs)
                pd.testing.assert_series_equal(actual, wanted)

    def testNunique(self):
        """``nunique`` must match pandas for Series and DataFrame inputs."""
        def check_series(raw, build_kwargs, **nunique_kwargs):
            # Series.nunique is a scalar, so plain equality is enough.
            tiled = from_pandas_series(raw, **build_kwargs)
            actual = self._evaluate(tiled.nunique(**nunique_kwargs))
            self.assertEqual(actual, raw.nunique(**nunique_kwargs))

        def check_frame(raw, build_kwargs, **nunique_kwargs):
            # DataFrame.nunique is a Series with one entry per label.
            tiled = from_pandas_df(raw, **build_kwargs)
            actual = self._evaluate(tiled.nunique(**nunique_kwargs))
            pd.testing.assert_series_equal(actual,
                                           raw.nunique(**nunique_kwargs))

        series_raw = pd.Series(np.random.randint(0, 5, size=(20, )))
        check_series(series_raw, {})
        check_series(series_raw, {'chunk_size': 6})

        # series containing missing values, with and without dropna
        series_with_nan = series_raw.copy()
        series_with_nan[[2, 9, 18]] = np.nan
        check_series(series_with_nan, {})
        check_series(series_with_nan, {'chunk_size': 3}, dropna=False)

        # dataframe input
        frame_raw = pd.DataFrame(np.random.randint(0, 6, size=(20, 20)),
                                 columns=['c{}'.format(i) for i in range(20)])
        check_frame(frame_raw, {})
        check_frame(frame_raw, {'chunk_size': 6})
        check_frame(frame_raw, {}, axis=1)
        check_frame(frame_raw, {'chunk_size': 3}, axis=1)

        # dataframe containing missing values, with and without dropna
        frame_with_nan = frame_raw.copy()
        frame_with_nan.iloc[[2, 9, 18], [2, 9, 18]] = np.nan
        check_frame(frame_with_nan, {})
        check_frame(frame_with_nan, {'chunk_size': 3}, dropna=False)
        # NOTE(review): this last check uses the frame without NaN again;
        # it may have been meant for the NaN frame -- confirm.
        check_frame(frame_raw, {'chunk_size': 3}, axis=1)
Пример #14
0
    def testSortValuesExecution(self):
        """Compare ``sort_values`` on chunked DataFrame/Series against pandas.

        Walks through single-chunk and multi-chunk frames, descending order,
        mixed column types, NaN values, ``ignore_index``, in-place sorting,
        a filtered input and finally Series.
        """
        def run(tileable, executor=None):
            # Execute a tileable and return its single concatenated result.
            chosen = self.executor if executor is None else executor
            return chosen.execute_dataframe(tileable, concat=True)[0]

        def check_frame(source, tiled, by, **sort_kwargs):
            # Sort both sides identically and require equal frames.
            actual = run(tiled.sort_values(by, **sort_kwargs))
            wanted = source.sort_values(by, **sort_kwargs)
            pd.testing.assert_frame_equal(actual, wanted)

        def check_series(source, tiled, **sort_kwargs):
            actual = run(tiled.sort_values(**sort_kwargs))
            wanted = source.sort_values(**sort_kwargs)
            pd.testing.assert_series_equal(actual, wanted)

        frame = pd.DataFrame(np.random.rand(100, 10),
                             columns=['a{}'.format(i) for i in range(10)])

        # one chunk
        tiled = DataFrame(frame)
        check_frame(frame, tiled, 'a0')
        check_frame(frame, tiled, ['a6', 'a7'], ascending=False)

        # psrs
        tiled = DataFrame(frame, chunk_size=10)
        check_frame(frame, tiled, 'a0')
        check_frame(frame, tiled, ['a3', 'a4'])

        # descending order on the same chunked frame
        check_frame(frame, tiled, ['a0', 'a1'], ascending=False)
        check_frame(frame, tiled, ['a7'], ascending=False)

        # rechunk
        tiled = DataFrame(frame, chunk_size=3)
        check_frame(frame, tiled, 'a0')
        check_frame(frame, tiled, ['a3', 'a4'])

        # other column types
        typed = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            }, )
        tiled = DataFrame(typed, chunk_size=3)
        for column in typed.columns:
            check_frame(typed, tiled, column)
        check_frame(typed, tiled, ['a', 'b', 'e'], ascending=False)

        # NaN in the sort key
        with_nan = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        check_frame(with_nan, DataFrame(with_nan), ['col2'])
        check_frame(with_nan, DataFrame(with_nan, chunk_size=3), ['col2'])

        # ignore_index, run on an executor backed by a new session context
        session_executor = ExecutorForTest(storage=new_session().context)

        small = pd.DataFrame(np.random.rand(10, 3),
                             columns=['a{}'.format(i) for i in range(3)])

        tiled = DataFrame(small, chunk_size=3)
        actual = run(tiled.sort_values(['a0', 'a1'], ignore_index=True),
                     executor=session_executor)
        try:  # for python3.5
            wanted = small.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            wanted = small.sort_values(['a0', 'a1'])
            wanted.index = pd.RangeIndex(len(wanted))
        pd.testing.assert_frame_equal(actual, wanted)

        # inplace: both the wrapper and the pandas frame are sorted in place
        tiled = DataFrame(small)
        tiled.sort_values('a0', inplace=True)
        actual = run(tiled)
        small.sort_values('a0', inplace=True)
        pd.testing.assert_frame_equal(actual, small)

        # unknown shape: sort the outcome of a boolean filter
        filterable = pd.DataFrame({'a': list(range(10)),
                                   'b': np.random.random(10)})
        tiled = DataFrame(filterable, chunk_size=4)
        kept = tiled[tiled['a'] > 2]
        actual = run(kept.sort_values(by='b'))
        pd.testing.assert_frame_equal(
            actual, filterable[filterable['a'] > 2].sort_values(by='b'))

        # Series.sort_values
        values = pd.Series(np.random.rand(10))
        check_series(values, Series(values))
        check_series(values, Series(values, chunk_size=3))
        check_series(values, Series(values, chunk_size=2), ascending=False)
Пример #15
0
class Test(TestBase):
    """Execution tests for the ``to_csv``, ``to_sql`` and ``to_vineyard`` exports.

    Each test wraps pandas data in chunked Mars objects, runs the export and
    reads the stored data back to compare it with the pandas source.
    """

    def setUp(self):
        """Run the base-class set-up and create the executor used by the tests."""
        super().setUp()
        self.executor = ExecutorForTest()

    def testToCSVExecution(self):
        """Export a DataFrame and a Series to CSV and compare what is read back.

        Both a plain file name and a ``*`` pattern are used as targets; for
        the pattern the test reads ``out-0.csv`` .. ``out-3.csv``.
        """
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            },
            index=index)
        df = DataFrame(raw, chunk_size=33)
        # dtypes handed to pd.read_csv so that the read-back columns keep the
        # dtypes of the source frame
        dtypes = raw.dtypes.to_dict()
        # 100 rows in chunks of 33 rows -> 33 + 33 + 33 + 1; the test expects
        # one output file per row chunk when a '*' pattern is given
        n_files = 4

        with tempfile.TemporaryDirectory() as base_path:
            # DATAFRAME TESTS
            # test one file with dataframe
            path = os.path.join(base_path, 'out.csv')

            r = df.to_csv(path)
            self.executor.execute_dataframe(r)

            result = pd.read_csv(path, dtype=dtypes)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)

            # test multi files with dataframe
            path = os.path.join(base_path, 'out-*.csv')
            r = df.to_csv(path)
            self.executor.execute_dataframe(r)

            dfs = [
                pd.read_csv(os.path.join(base_path, 'out-{}.csv'.format(i)),
                            dtype=dtypes) for i in range(n_files)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw)
            # the second file must hold exactly the second row chunk
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.iloc[33:66])

            # SERIES TESTS
            series = md.Series(raw.col1, chunk_size=33)

            # test one file with series
            path = os.path.join(base_path, 'out.csv')
            r = series.to_csv(path)
            self.executor.execute_dataframe(r)

            result = pd.read_csv(path, dtype=dtypes)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw.col1.to_frame())

            # test multi files with series
            path = os.path.join(base_path, 'out-*.csv')
            r = series.to_csv(path)
            self.executor.execute_dataframe(r)

            dfs = [
                pd.read_csv(os.path.join(base_path, 'out-{}.csv'.format(i)),
                            dtype=dtypes) for i in range(n_files)
            ]
            result = pd.concat(dfs, axis=0)
            result.set_index('index', inplace=True)
            pd.testing.assert_frame_equal(result, raw.col1.to_frame())
            pd.testing.assert_frame_equal(dfs[1].set_index('index'),
                                          raw.col1.to_frame().iloc[33:66])

    @unittest.skipIf(sqlalchemy is None, 'sqlalchemy not installed')
    def testToSQL(self):
        """Write a DataFrame and a Series into a SQLite file and read them back.

        Also checks that writing to an already existing table raises
        ``ValueError``.
        """
        index = pd.RangeIndex(100, 0, -1, name='index')
        raw = pd.DataFrame(
            {
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100).astype('int64'),
            },
            index=index)

        with tempfile.TemporaryDirectory() as d:
            table_name1 = 'test_table'
            table_name2 = 'test_table2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            engine = sqlalchemy.create_engine(uri)
            try:
                # test write dataframe
                df = DataFrame(raw, chunk_size=33)
                r = df.to_sql(table_name1, con=engine)
                self.executor.execute_dataframe(r)

                # rows are re-sorted descending to match the 100..1 index of raw
                written = pd.read_sql(table_name1, con=engine,
                                      index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw, written)

                # test write with existing table
                with self.assertRaises(ValueError):
                    df.to_sql(table_name1, con=uri).execute()

                # test write series
                series = md.Series(raw.col1, chunk_size=33)
                with engine.connect() as conn:
                    r = series.to_sql(table_name2, con=conn)
                    self.executor.execute_dataframe(r)

                written = pd.read_sql(table_name2, con=engine,
                                      index_col='index') \
                    .sort_index(ascending=False)
                pd.testing.assert_frame_equal(raw.col1.to_frame(), written)
            finally:
                # release the engine's connections so that no handle on
                # test.db is left open when the temporary directory is removed
                engine.dispose()

    @unittest.skipIf(vineyard is None, 'vineyard not installed')
    @mock.patch('webbrowser.open_new_tab', new=lambda *_, **__: True)
    def testToVineyard(self):
        """Round-trip a DataFrame through vineyard with three kinds of session.

        ``webbrowser.open_new_tab`` is replaced by a stub returning ``True``
        for the duration of the test.
        """
        def testWithGivenSession(session):
            # store df1 in vineyard, load it again and compare both executed
            # results, ignoring the index
            with option_context({'vineyard.socket': '/tmp/vineyard.sock'}):
                df1 = DataFrame(pd.DataFrame(np.arange(12).reshape(3, 4),
                                             columns=['a', 'b', 'c', 'd']),
                                chunk_size=2)
                object_id = df1.to_vineyard().execute(session=session)
                df2 = md.from_vineyard(object_id)

                df1_value = df1.execute(session=session)
                df2_value = df2.execute(session=session)
                pd.testing.assert_frame_equal(df1_value.reset_index(drop=True),
                                              df2_value.reset_index(drop=True))

        # session created without an endpoint
        with new_session().as_default() as session:
            testWithGivenSession(session)

        with new_cluster(scheduler_n_process=2,
                         worker_n_process=2,
                         shared_memory='20M',
                         web=True) as cluster:
            # session connected to the cluster endpoint
            with new_session(cluster.endpoint).as_default() as session:
                testWithGivenSession(session)

            # session connected through the cluster's web endpoint
            with new_session(
                    'http://' +
                    cluster._web_endpoint).as_default() as web_session:
                testWithGivenSession(web_session)
Пример #16
0
class Test(TestBase):
    """Execution tests for the DataFrame/Series data sources.

    Covers construction from pandas objects, from tensors, from structured
    arrays (``from_records``) and from CSV files (``read_csv``); every result
    is compared with the equivalent pandas object.
    """

    def setUp(self):
        """Run the base-class set-up and create the executor used by the tests."""
        super().setUp()
        self.executor = ExecutorForTest()

    def testFromPandasDataFrameExecution(self):
        """A chunked ``from_pandas_df`` result executes back to the source frame."""
        # the index is given as two arrays, i.e. a two-level index
        pdf = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20),
                                  np.arange(20, 0, -1)])
        df = from_pandas_df(pdf, chunk_size=(13, 21))

        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

    def testFromPandasSeriesExecution(self):
        """A chunked ``from_pandas_series`` result executes back to the source series."""
        ps = pd.Series(np.random.rand(20),
                       index=[np.arange(20),
                              np.arange(20, 0, -1)],
                       name='a')
        series = from_pandas_series(ps, chunk_size=13)

        result = self.executor.execute_dataframe(series, concat=True)[0]
        pd.testing.assert_series_equal(ps, result)

    def testInitializerExecution(self):
        """``md.DataFrame`` / ``md.Series`` built from pandas data execute back to it."""
        pdf = pd.DataFrame(np.random.rand(20, 30),
                           index=[np.arange(20),
                                  np.arange(20, 0, -1)])
        df = md.DataFrame(pdf, chunk_size=(15, 10))
        result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

        ps = pd.Series(np.random.rand(20),
                       index=[np.arange(20),
                              np.arange(20, 0, -1)],
                       name='a')
        series = md.Series(ps, chunk_size=7)
        result = self.executor.execute_dataframe(series, concat=True)[0]
        pd.testing.assert_series_equal(ps, result)

    def testSeriesFromTensor(self):
        """``md.Series`` built from tensors matches ``pd.Series`` on the same data."""
        data = np.random.rand(10)

        # unchunked tensor, with a name
        series = md.Series(mt.tensor(data), name='a')
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data, name='a'))

        # chunked tensor
        series = md.Series(mt.tensor(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data))

        # tensor created by mt.ones
        series = md.Series(mt.ones((10, ), chunk_size=4))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(np.ones(10, )))

        # index given as a tensor whose chunk size differs from the data's
        index_data = np.random.rand(10)
        series = md.Series(mt.tensor(data, chunk_size=3),
                           name='a',
                           index=mt.tensor(index_data, chunk_size=4))
        pd.testing.assert_series_equal(
            self.executor.execute_dataframe(series, concat=True)[0],
            pd.Series(data, name='a', index=index_data))

    def testFromTensorExecution(self):
        """DataFrames built from 2-d and 1-d tensors match their pandas equivalents."""
        tensor = mt.random.rand(10, 10, chunk_size=5)
        df = dataframe_from_tensor(tensor)
        tensor_res = self.executor.execute_tensor(tensor, concat=True)[0]
        pdf_expected = pd.DataFrame(tensor_res)
        df_result = self.executor.execute_dataframe(df, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(df_result, pdf_expected)

        # test converted with specified index_value and columns
        tensor2 = mt.random.rand(2, 2, chunk_size=1)
        df2 = dataframe_from_tensor(tensor2,
                                    index=pd.Index(['a', 'b']),
                                    columns=pd.Index([3, 4]))
        df_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_index_equal(df_result.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        tensor3 = mt.array([1, 2, 3])
        df3 = dataframe_from_tensor(tensor3)
        result3 = self.executor.execute_dataframe(df3, concat=True)[0]
        pdf_expected = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(pdf_expected, result3)

        # test converted from identical chunks
        tensor4 = mt.ones((10, 10), chunk_size=3)
        df4 = dataframe_from_tensor(tensor4)
        result4 = self.executor.execute_dataframe(df4, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor4, concat=True)[0])
        pd.testing.assert_frame_equal(pdf_expected, result4)

        # from tensor with given index
        tensor5 = mt.ones((10, 10), chunk_size=3)
        df5 = dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2))
        result5 = self.executor.execute_dataframe(df5, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor5, concat=True)[0],
            index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(pdf_expected, result5)

        # from tensor with given index that is a tensor
        raw7 = np.random.rand(10, 10)
        tensor7 = mt.tensor(raw7, chunk_size=3)
        index_raw7 = np.random.rand(10)
        index7 = mt.tensor(index_raw7, chunk_size=4)
        df7 = dataframe_from_tensor(tensor7, index=index7)
        result7 = self.executor.execute_dataframe(df7, concat=True)[0]
        pdf_expected = pd.DataFrame(raw7, index=index_raw7)
        pd.testing.assert_frame_equal(pdf_expected, result7)

        # from tensor with given columns
        tensor6 = mt.ones((10, 10), chunk_size=3)
        df6 = dataframe_from_tensor(tensor6, columns=list('abcdefghij'))
        result6 = self.executor.execute_dataframe(df6, concat=True)[0]
        pdf_expected = pd.DataFrame(
            self.executor.execute_tensor(tensor6, concat=True)[0],
            columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(pdf_expected, result6)

        # from 1d tensors: one float, one integer and one string column
        raws8 = [('a', np.random.rand(8)),
                 ('b', np.random.randint(10, size=8)),
                 ('c', [
                     ''.join(np.random.choice(list(printable), size=6))
                     for _ in range(8)
                 ])]
        tensors8 = [mt.tensor(r[1], chunk_size=3) for r in raws8]
        df8 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8])
        result = self.executor.execute_dataframe(df8, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8))
        pd.testing.assert_frame_equal(result, pdf_expected)

        # from 1d tensors and specify index with a tensor
        index_raw9 = np.random.rand(8)
        index9 = mt.tensor(index_raw9, chunk_size=4)
        df9 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8],
                                        index=index9)
        result = self.executor.execute_dataframe(df9, concat=True)[0]
        pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9)
        pd.testing.assert_frame_equal(result, pdf_expected)

    def testFromRecordsExecution(self):
        """``from_records`` on structured arrays matches ``pd.DataFrame.from_records``."""
        dtype = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])

        ndarr = np.ones((10, ), dtype=dtype)
        pdf_expected = pd.DataFrame.from_records(ndarr,
                                                 index=pd.RangeIndex(10))

        # from structured array of mars
        tensor = mt.ones((10, ), dtype=dtype, chunk_size=3)
        df1 = from_records(tensor)
        df1_result = self.executor.execute_dataframe(df1, concat=True)[0]
        pd.testing.assert_frame_equal(df1_result, pdf_expected)

        # from structured array of numpy
        df2 = from_records(ndarr)
        df2_result = self.executor.execute_dataframe(df2, concat=True)[0]
        pd.testing.assert_frame_equal(df2_result, pdf_expected)

    def testReadCSVExecution(self):
        """``md.read_csv`` matches ``pd.read_csv`` for several kinds of input.

        Each scenario writes its own file(s) into a fresh temporary directory
        and reads them twice: once without ``chunk_bytes`` and once with a
        small ``chunk_bytes`` value.
        """
        # test plain integer data
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0, chunk_bytes=10),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test sep
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, sep=';', index_col=0),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, sep=';', index_col=0, chunk_bytes=10),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test missing value
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame({
                'c1': [np.nan, 'a', 'b', 'c'],
                'c2': [1, 2, 3, np.nan],
                'c3': [np.nan, np.nan, 3.4, 2.2]
            })
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0, chunk_bytes=12),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test index built with pd.date_range and mixed column types
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, index_col=0, chunk_bytes=100),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test compression
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.gzip')
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path, compression='gzip')

            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, compression='gzip', index_col=0),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, compression='gzip', index_col=0,
                            chunk_bytes='1k'),
                concat=True)[0]
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test multiple files
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            mdf = self.executor.execute_dataframe(
                md.read_csv(file_paths, index_col=0), concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_paths, index_col=0, chunk_bytes=50),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2)

        # test wildcards in path
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, 'test{}.csv'.format(i)) for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we can not guarantee the order in which these files are processed,
            # the result may not keep the original order.
            mdf = self.executor.execute_dataframe(
                md.read_csv('{}/*.csv'.format(tempdir), index_col=0),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = self.executor.execute_dataframe(
                md.read_csv('{}/*.csv'.format(tempdir), index_col=0,
                            chunk_bytes=50),
                concat=True)[0]
            pd.testing.assert_frame_equal(df, mdf2.sort_index())

    @require_cudf
    def testReadCSVGPUExecution(self):
        """``md.read_csv(..., gpu=True)`` matches ``pd.read_csv`` after ``to_pandas()``."""
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            })
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = self.executor.execute_dataframe(
                md.read_csv(file_path, gpu=True), concat=True)[0]
            # both sides get a fresh default index before the comparison
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf.to_pandas().reset_index(drop=True))

            mdf2 = self.executor.execute_dataframe(
                md.read_csv(file_path, gpu=True, chunk_bytes=200),
                concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf2.to_pandas().reset_index(drop=True))
Пример #17
0
class Test(unittest.TestCase):
    """Execution tests comparing Series/DataFrame ``quantile`` with pandas."""

    def setUp(self) -> None:
        """Create the executor shared by the tests."""
        super().setUp()
        self.executor = ExecutorForTest('numpy')

    def _new_context_executor(self):
        """Return a ``(ctx, executor)`` pair for the cases where ``q`` is a tensor.

        ``ctx`` is a ``LocalContext`` built around a minimal session object
        that only exposes this test's executor; ``executor`` is a new
        ``ExecutorForTest`` that uses ``ctx`` as its storage.
        """
        outer_executor = self.executor

        class MockSession:
            def __init__(self):
                self.executor = outer_executor

        ctx = LocalContext(MockSession())
        return ctx, ExecutorForTest('numpy', storage=ctx)

    def testSeriesQuantileExecution(self):
        """Series quantile for scalar, list and tensor ``q`` and an interpolation mode."""
        raw = pd.Series(np.random.rand(10), name='a')
        a = Series(raw, chunk_size=3)

        # q = 0.5, scalar
        r = a.quantile()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile()

        # the result is a float computed by two different code paths, so
        # compare with a tolerance instead of exact equality
        self.assertAlmostEqual(result, expected)

        # q is a list
        r = a.quantile([0.3, 0.7])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7])

        pd.testing.assert_series_equal(result, expected)

        # test interpolation
        r = a.quantile([0.3, 0.7], interpolation='midpoint')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], interpolation='midpoint')

        pd.testing.assert_series_equal(result, expected)

        ctx, executor = self._new_context_executor()
        with ctx:
            q = tensor([0.3, 0.7])

            # q is a tensor
            r = a.quantile(q)
            result = executor.execute_dataframes([r])[0]
            expected = raw.quantile([0.3, 0.7])

            pd.testing.assert_series_equal(result, expected)

    def testDataFrameQuantileExecution(self):
        """DataFrame quantile along both axes, for list/tensor ``q`` and ``numeric_only``."""
        # columns 'd', 'e' and 'f' hold bytes, timestamps and timedeltas
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
                'f': [pd.Timedelta('{} days'.format(i)) for i in range(10)]
            },
            index=pd.RangeIndex(1, 11))
        df = DataFrame(raw, chunk_size=3)

        # q = 0.5, axis = 0, series
        r = df.quantile()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile()

        pd.testing.assert_series_equal(result, expected)

        # q = 0.5, axis = 1, series
        r = df.quantile(axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile(axis=1)

        pd.testing.assert_series_equal(result, expected)

        # q is a list, axis = 0, dataframe
        r = df.quantile([0.3, 0.7])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7])

        pd.testing.assert_frame_equal(result, expected)

        # q is a list, axis = 1, dataframe
        r = df.quantile([0.3, 0.7], axis=1)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], axis=1)

        pd.testing.assert_frame_equal(result, expected)

        # test interpolation
        r = df.quantile([0.3, 0.7], interpolation='midpoint')
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw.quantile([0.3, 0.7], interpolation='midpoint')

        pd.testing.assert_frame_equal(result, expected)

        ctx, executor = self._new_context_executor()
        with ctx:
            q = tensor([0.3, 0.7])

            # q is a tensor
            r = df.quantile(q)
            result = executor.execute_dataframes([r])[0]
            expected = raw.quantile([0.3, 0.7])

            pd.testing.assert_frame_equal(result, expected)

        # test numeric_only
        raw2 = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [pd.Timestamp('201{}'.format(i)) for i in range(10)],
            },
            index=pd.RangeIndex(1, 11))
        df2 = DataFrame(raw2, chunk_size=3)

        r = df2.quantile([0.3, 0.7], numeric_only=False)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw2.quantile([0.3, 0.7], numeric_only=False)

        pd.testing.assert_frame_equal(result, expected)

        r = df2.quantile(numeric_only=False)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = raw2.quantile(numeric_only=False)

        pd.testing.assert_series_equal(result, expected)
Пример #18
0
class TestReduction(TestBase):
    """Compare a reduction on chunked Mars objects with the same pandas reduction.

    The reduction is looked up by name through ``self.func_name`` and the
    ``min_count`` cases only run when ``self.has_min_count`` is true.
    Neither attribute is assigned in this class.
    NOTE(review): presumably they are supplied by a parameterizing decorator
    or by subclasses - confirm.
    """

    def setUp(self):
        """Create the executor shared by the tests."""
        self.executor = ExecutorForTest()

    def compute(self, data, **kwargs):
        """Call the method named ``self.func_name`` on ``data`` with ``kwargs``.

        ``data`` may be a pandas object or a Mars object, so the same call
        produces both the expected value and the operand under test.
        """
        return getattr(data, self.func_name)(**kwargs)

    def testSeriesReduction(self):
        """Series reduction for several chunk sizes, with NaNs and ``min_count``."""
        data = pd.Series(np.random.rand(20),
                         index=[str(i) for i in range(20)],
                         name='a')
        reduction_df1 = self.compute(from_pandas_series(data))
        # float result: compare with a tolerance, like the chunked cases below
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=6))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_series(data, chunk_size=4),
                                     axis='index')
        self.assertAlmostEqual(
            self.compute(data, axis='index'),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan
        reduction_df1 = self.compute(from_pandas_series(data, chunk_size=3))
        self.assertAlmostEqual(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        # with skipna=False the result is expected to be NaN
        reduction_df2 = self.compute(from_pandas_series(data, chunk_size=3),
                                     skipna=False)
        self.assertTrue(
            np.isnan(
                self.executor.execute_dataframe(reduction_df2,
                                                concat=True)[0]))

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_series(data,
                                                            chunk_size=3),
                                         skipna=False,
                                         min_count=2)
            self.assertTrue(
                np.isnan(
                    self.executor.execute_dataframe(reduction_df3,
                                                    concat=True)[0]))

            reduction_df4 = self.compute(from_pandas_series(data,
                                                            chunk_size=3),
                                         min_count=1)
            self.assertAlmostEqual(
                self.compute(data, min_count=1),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            # min_count exceeds the 20 elements of the series
            reduction_df5 = self.compute(from_pandas_series(data,
                                                            chunk_size=3),
                                         min_count=21)
            self.assertTrue(
                np.isnan(
                    self.executor.execute_dataframe(reduction_df5,
                                                    concat=True)[0]))

    def testDataFrameReduction(self):
        """DataFrame reduction along both axes, with NaNs, ``min_count`` and ``numeric_only``."""
        data = pd.DataFrame(np.random.rand(20, 10))
        reduction_df1 = self.compute(from_pandas_df(data))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=6),
                                     axis='index',
                                     numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3),
                                     axis=1)
        pd.testing.assert_series_equal(
            self.compute(data, axis=1),
            self.executor.execute_dataframe(reduction_df4, concat=True)[0])

        # test null
        np_data = np.random.rand(20, 10)
        np_data[np_data > 0.6] = np.nan
        data = pd.DataFrame(np_data)

        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=3))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=3),
                                     skipna=False)
        pd.testing.assert_series_equal(
            self.compute(data, skipna=False),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        if self.has_min_count:
            reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                         min_count=15)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=15),
                self.executor.execute_dataframe(reduction_df3, concat=True)[0])

            reduction_df4 = self.compute(from_pandas_df(data, chunk_size=3),
                                         min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, min_count=3),
                self.executor.execute_dataframe(reduction_df4, concat=True)[0])

            reduction_df5 = self.compute(from_pandas_df(data, chunk_size=3),
                                         axis=1,
                                         min_count=3)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=3),
                self.executor.execute_dataframe(reduction_df5, concat=True)[0])

            reduction_df6 = self.compute(from_pandas_df(data, chunk_size=3),
                                         axis=1,
                                         min_count=8)
            pd.testing.assert_series_equal(
                self.compute(data, axis=1, min_count=8),
                self.executor.execute_dataframe(reduction_df6, concat=True)[0])

        # test numeric_only
        # random integer row labels and random bytes as column labels
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10, )),
                            columns=[np.random.bytes(10) for _ in range(10)])
        reduction_df1 = self.compute(from_pandas_df(data, chunk_size=2))
        pd.testing.assert_series_equal(
            self.compute(data),
            self.executor.execute_dataframe(reduction_df1, concat=True)[0])

        reduction_df2 = self.compute(from_pandas_df(data, chunk_size=6),
                                     axis='index',
                                     numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df2, concat=True)[0])

        reduction_df3 = self.compute(from_pandas_df(data, chunk_size=3),
                                     axis='columns')
        pd.testing.assert_series_equal(
            self.compute(data, axis='columns'),
            self.executor.execute_dataframe(reduction_df3, concat=True)[0])

        # frame mixing float columns with a string and a boolean column
        data_dict = dict((str(i), np.random.rand(10)) for i in range(10))
        data_dict['string'] = [str(i) for i in range(10)]
        data_dict['bool'] = np.random.choice([True, False], (10, ))
        data = pd.DataFrame(data_dict)
        reduction_df = self.compute(from_pandas_df(data, chunk_size=3),
                                    axis='index',
                                    numeric_only=True)
        pd.testing.assert_series_equal(
            self.compute(data, axis='index', numeric_only=True),
            self.executor.execute_dataframe(reduction_df, concat=True)[0])
Пример #19
0
class TestCount(TestBase):
    def setUp(self):
        """Create the ``ExecutorForTest`` instance stored on ``self.executor``."""
        self.executor = ExecutorForTest()

    def testSeriesCount(self):
        """Check ``Series.count`` against pandas for several chunk settings."""
        values = np.random.rand(10)
        values[[2, 7, 9]] = np.nan
        data = pd.Series(values)

        # Default chunking first, then chunk_size=1, then chunk_size=3.
        for chunk_kwargs in ({}, {'chunk_size': 1}, {'chunk_size': 3}):
            series = from_pandas_series(data, **chunk_kwargs)
            counted = self.executor.execute_dataframe(series.count(),
                                                      concat=True)
            self.assertEqual(counted[0], data.count())

    def testDataFrameCount(self):
        """Compare ``DataFrame.count`` with pandas over both axes.

        The frame mixes string, float (with one NaN) and boolean columns and
        is counted with default chunking, ``chunk_size=2`` and, for the
        ``numeric_only`` variants, ``chunk_size=3``.
        """
        data = pd.DataFrame({
            "Person": ["John", "Myla", "Lewis", "John", "Myla"],
            "Age": [24., np.nan, 21., 33, 26],
            "Single": [False, True, True, True, False]
        })

        def check(frame, **count_kwargs):
            # Execute the distributed count first, then the pandas reference.
            actual = self.executor.execute_dataframe(
                frame.count(**count_kwargs), concat=True)[0]
            pd.testing.assert_series_equal(actual, data.count(**count_kwargs))

        # default chunking, then chunk_size=2: default axis and column axis
        for frame_kwargs in ({}, {'chunk_size': 2}):
            frame = from_pandas_df(data, **frame_kwargs)
            check(frame)
            check(frame, axis='columns')

        # chunk_size=3: numeric_only on both axes
        df3 = from_pandas_df(data, chunk_size=3)
        check(df3, numeric_only=True)
        check(df3, axis='columns', numeric_only=True)

    def testNunique(self):
        """Compare ``nunique`` on Series and DataFrames with pandas.

        Covers default and explicit chunk sizes, both DataFrame axes, and
        ``dropna=False`` on data into which NaN values have been written.
        """
        def run(tileable):
            # Execute a tileable and return its single concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def check_series(data, chunk_kwargs, **nunique_kwargs):
            series = from_pandas_series(data, **chunk_kwargs)
            result = run(series.nunique(**nunique_kwargs))
            self.assertEqual(result, data.nunique(**nunique_kwargs))

        def check_frame(data, chunk_kwargs, **nunique_kwargs):
            df = from_pandas_df(data, **chunk_kwargs)
            result = run(df.nunique(**nunique_kwargs))
            pd.testing.assert_series_equal(result,
                                           data.nunique(**nunique_kwargs))

        data1 = pd.Series(np.random.randint(0, 5, size=(20, )))
        check_series(data1, {})
        check_series(data1, {'chunk_size': 6})

        # test dropna
        data2 = data1.copy()
        data2[[2, 9, 18]] = np.nan
        check_series(data2, {})
        check_series(data2, {'chunk_size': 3}, dropna=False)

        # test dataframe
        data1 = pd.DataFrame(np.random.randint(0, 6, size=(20, 20)),
                             columns=['c' + str(i) for i in range(20)])
        check_frame(data1, {})
        check_frame(data1, {'chunk_size': 6})
        check_frame(data1, {}, axis=1)
        check_frame(data1, {'chunk_size': 3}, axis=1)

        # test dropna
        data2 = data1.copy()
        data2.iloc[[2, 9, 18], [2, 9, 18]] = np.nan
        check_frame(data2, {})
        check_frame(data2, {'chunk_size': 3}, dropna=False)

    def testUseArrowDtypeNUnique(self):
        """Run ``DataFrame.nunique`` on both axes with arrow dtypes enabled.

        The options ``dataframe.use_arrow_dtype=True`` and ``combine_size=2``
        are active for the whole test body.
        """
        options = {'dataframe.use_arrow_dtype': True, 'combine_size': 2}
        with option_context(options):
            rs = np.random.RandomState(0)
            floats = rs.random(10)
            labels = [f's{i}' for i in rs.randint(100, size=10)]
            data1 = pd.DataFrame({'a': floats, 'b': labels})
            # columns 'c', 'd' and 'e' are copies of the string column 'b'
            for extra in ('c', 'd', 'e'):
                data1[extra] = data1['b'].copy()

            df = from_pandas_df(data1, chunk_size=(3, 2))
            for axis in (0, 1):
                r = df.nunique(axis=axis)
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = data1.nunique(axis=axis)
                pd.testing.assert_series_equal(result, expected)

    def testUnique(self):
        """Compare ``Series.unique`` with pandas for integer and timestamp data."""
        def check(data):
            # default chunking first, then chunk_size=6
            for chunk_kwargs in ({}, {'chunk_size': 6}):
                series = from_pandas_series(data, **chunk_kwargs)
                result = self.executor.execute_dataframe(series.unique(),
                                                         concat=True)[0]
                np.testing.assert_array_equal(result, data.unique())

        check(pd.Series(np.random.randint(0, 5, size=(20, ))))

        # three distinct timestamps repeated 5, 1 and 9 times
        stamps = [pd.Timestamp('20200101')] * 5
        stamps += [pd.Timestamp('20200202')]
        stamps += [pd.Timestamp('20020101')] * 9
        check(pd.Series(stamps))
Пример #20
0
class Test(TestBase):
    def setUp(self):
        """Run the base-class ``setUp`` and create ``self.executor``."""
        super().setUp()
        self.executor = ExecutorForTest()

    def testFromPandasDataFrameExecution(self):
        """Round-trip a DataFrame with a two-array index through ``from_pandas_df``."""
        values = np.random.rand(20, 30)
        row_index = [np.arange(20), np.arange(20, 0, -1)]
        pdf = pd.DataFrame(values, index=row_index)

        df = from_pandas_df(pdf, chunk_size=(13, 21))
        executed = self.executor.execute_dataframe(df, concat=True)
        pd.testing.assert_frame_equal(pdf, executed[0])

    def testFromPandasSeriesExecution(self):
        """Round-trip a named Series with a two-array index through ``from_pandas_series``."""
        values = np.random.rand(20)
        row_index = [np.arange(20), np.arange(20, 0, -1)]
        ps = pd.Series(values, index=row_index, name='a')

        series = from_pandas_series(ps, chunk_size=13)
        executed = self.executor.execute_dataframe(series, concat=True)
        pd.testing.assert_series_equal(ps, executed[0])

    def testInitializerExecution(self):
        """Build ``md.DataFrame`` / ``md.Series`` from pandas objects and round-trip them."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        frame_values = np.random.rand(20, 30)
        pdf = pd.DataFrame(frame_values,
                           index=[np.arange(20), np.arange(20, 0, -1)])
        pd.testing.assert_frame_equal(
            pdf, run(md.DataFrame(pdf, chunk_size=(15, 10))))

        series_values = np.random.rand(20)
        ps = pd.Series(series_values,
                       index=[np.arange(20), np.arange(20, 0, -1)],
                       name='a')
        pd.testing.assert_series_equal(ps, run(md.Series(ps, chunk_size=7)))

    def testSeriesFromTensor(self):
        """Build ``md.Series`` from tensors and compare with pandas equivalents."""
        def run(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        data = np.random.rand(10)

        # named series from a tensor with default chunking
        named = md.Series(mt.tensor(data), name='a')
        pd.testing.assert_series_equal(run(named), pd.Series(data, name='a'))

        # unnamed series from a chunked tensor
        chunked = md.Series(mt.tensor(data, chunk_size=3))
        pd.testing.assert_series_equal(run(chunked), pd.Series(data))

        # series from a tensor of ones
        ones = md.Series(mt.ones((10, ), chunk_size=4))
        pd.testing.assert_series_equal(run(ones), pd.Series(np.ones(10, )))

        # data and index are both tensors, chunked differently
        index_data = np.random.rand(10)
        index_tensor = mt.tensor(index_data, chunk_size=4)
        indexed = md.Series(mt.tensor(data, chunk_size=3),
                            name='a',
                            index=index_tensor)
        pd.testing.assert_series_equal(
            run(indexed), pd.Series(data, name='a', index=index_data))

    def testFromTensorExecution(self):
        """Convert tensors to DataFrames and compare with pandas constructions.

        Exercises ``dataframe_from_tensor`` (2-d and 1-d input, explicit index
        and columns, tensor-valued index) and ``dataframe_from_1d_tensors``.
        """
        def run_df(tileable):
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def run_tensor(t):
            return self.executor.execute_tensor(t, concat=True)[0]

        tensor = mt.random.rand(10, 10, chunk_size=5)
        df = dataframe_from_tensor(tensor)
        # the tensor is executed before the dataframe built from it
        pdf_expected = pd.DataFrame(run_tensor(tensor))
        df_result = run_df(df)
        default_labels = pd.RangeIndex(0, 10)
        pd.testing.assert_index_equal(df_result.index, default_labels)
        pd.testing.assert_index_equal(df_result.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(df_result, pdf_expected)

        # test converted with specified index_value and columns
        tensor2 = mt.random.rand(2, 2, chunk_size=1)
        df2 = dataframe_from_tensor(tensor2,
                                    index=pd.Index(['a', 'b']),
                                    columns=pd.Index([3, 4]))
        df_result = run_df(df2)
        pd.testing.assert_index_equal(df_result.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(df_result.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        df3 = dataframe_from_tensor(mt.array([1, 2, 3]))
        result3 = run_df(df3)
        pd.testing.assert_frame_equal(pd.DataFrame(np.array([1, 2, 3])),
                                      result3)

        # test converted from identical chunks
        tensor4 = mt.ones((10, 10), chunk_size=3)
        result4 = run_df(dataframe_from_tensor(tensor4))
        pd.testing.assert_frame_equal(pd.DataFrame(run_tensor(tensor4)),
                                      result4)

        # from tensor with given index
        tensor5 = mt.ones((10, 10), chunk_size=3)
        result5 = run_df(
            dataframe_from_tensor(tensor5, index=np.arange(0, 20, 2)))
        pdf_expected = pd.DataFrame(run_tensor(tensor5),
                                    index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(pdf_expected, result5)

        # from tensor with given index that is a tensor
        raw7 = np.random.rand(10, 10)
        tensor7 = mt.tensor(raw7, chunk_size=3)
        index_raw7 = np.random.rand(10)
        index7 = mt.tensor(index_raw7, chunk_size=4)
        result7 = run_df(dataframe_from_tensor(tensor7, index=index7))
        pd.testing.assert_frame_equal(pd.DataFrame(raw7, index=index_raw7),
                                      result7)

        # from tensor with given columns
        tensor6 = mt.ones((10, 10), chunk_size=3)
        result6 = run_df(
            dataframe_from_tensor(tensor6, columns=list('abcdefghij')))
        pdf_expected = pd.DataFrame(run_tensor(tensor6),
                                    columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(pdf_expected, result6)

        # from 1d tensors
        floats8 = np.random.rand(8)
        ints8 = np.random.randint(10, size=8)
        strings8 = [
            ''.join(np.random.choice(list(printable), size=6))
            for _ in range(8)
        ]
        raws8 = [('a', floats8), ('b', ints8), ('c', strings8)]
        names8 = [name for name, _ in raws8]
        tensors8 = [mt.tensor(values, chunk_size=3) for _, values in raws8]
        df8 = dataframe_from_1d_tensors(tensors8, columns=names8)
        result = run_df(df8)
        pd.testing.assert_frame_equal(result,
                                      pd.DataFrame(OrderedDict(raws8)))

        # from 1d tensors and specify index with a tensor
        index_raw9 = np.random.rand(8)
        index9 = mt.tensor(index_raw9, chunk_size=4)
        df9 = dataframe_from_1d_tensors(tensors8,
                                        columns=[r[0] for r in raws8],
                                        index=index9)
        result = run_df(df9)
        pdf_expected = pd.DataFrame(OrderedDict(raws8), index=index_raw9)
        pd.testing.assert_frame_equal(result, pdf_expected)

    def testFromRecordsExecution(self):
        """Check ``from_records`` on a structured tensor and a structured ndarray."""
        dtype = np.dtype([('x', 'int'), ('y', 'double'), ('z', '<U16')])

        ndarr = np.ones((10, ), dtype=dtype)
        pdf_expected = pd.DataFrame.from_records(ndarr,
                                                 index=pd.RangeIndex(10))

        # a structured tensor first, then the numpy structured array itself
        tensor = mt.ones((10, ), dtype=dtype, chunk_size=3)
        for records in (tensor, ndarr):
            converted = from_records(records)
            executed = self.executor.execute_dataframe(converted,
                                                       concat=True)[0]
            pd.testing.assert_frame_equal(executed, pdf_expected)

    def testReadCSVExecution(self):
        """Check ``md.read_csv`` results against pandas for several inputs.

        Scenarios: a plain CSV, a custom separator, missing values, a date
        index, gzip compression, a list of files and a wildcard path. Each
        scenario writes into its own ``tempfile.TemporaryDirectory``, which
        removes the directory on exit even when an assertion fails.
        """
        def read(path, **kwargs):
            # Execute md.read_csv and return the concatenated result.
            return self.executor.execute_dataframe(md.read_csv(path, **kwargs),
                                                   concat=True)[0]

        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = read(file_path, index_col=0)
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = read(file_path, index_col=0, chunk_bytes=10)
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test sep
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, sep=';')

            pdf = pd.read_csv(file_path, sep=';', index_col=0)
            mdf = read(file_path, sep=';', index_col=0)
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = read(file_path, sep=';', index_col=0, chunk_bytes=10)
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test missing value
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame({
                'c1': [np.nan, 'a', 'b', 'c'],
                'c2': [1, 2, 3, np.nan],
                'c3': [np.nan, np.nan, 3.4, 2.2]
            })
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = read(file_path, index_col=0)
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = read(file_path, index_col=0, chunk_bytes=12)
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test date index
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path)

            pdf = pd.read_csv(file_path, index_col=0)
            mdf = read(file_path, index_col=0)
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = read(file_path, index_col=0, chunk_bytes=100)
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test compression
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.gzip')
            index = pd.date_range(start='1/1/2018', periods=100)
            df = pd.DataFrame(
                {
                    'col1': np.random.rand(100),
                    'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                    'col3': np.arange(100)
                },
                index=index)
            df.to_csv(file_path, compression='gzip')

            pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
            mdf = read(file_path, compression='gzip', index_col=0)
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = read(file_path,
                        compression='gzip',
                        index_col=0,
                        chunk_bytes='1k')
            pd.testing.assert_frame_equal(pdf, mdf2)

        # test multiple files
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, f'test{i}.csv') for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            mdf = read(file_paths, index_col=0)
            pd.testing.assert_frame_equal(df, mdf)

            mdf2 = read(file_paths, index_col=0, chunk_bytes=50)
            pd.testing.assert_frame_equal(df, mdf2)

        # test wildcards in path
        with tempfile.TemporaryDirectory() as tempdir:
            df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

            file_paths = [
                os.path.join(tempdir, f'test{i}.csv') for i in range(3)
            ]
            df[:100].to_csv(file_paths[0])
            df[100:200].to_csv(file_paths[1])
            df[200:].to_csv(file_paths[2])

            # As we can not guarantee the order in which these files are
            # processed, the result may not keep the original order.
            mdf = read(f'{tempdir}/*.csv', index_col=0)
            pd.testing.assert_frame_equal(df, mdf.sort_index())

            mdf2 = read(f'{tempdir}/*.csv', index_col=0, chunk_bytes=50)
            pd.testing.assert_frame_equal(df, mdf2.sort_index())

    @require_cudf
    def testReadCSVGPUExecution(self):
        """Check ``md.read_csv(..., gpu=True)`` against ``pd.read_csv``.

        The executed result is converted with ``to_pandas()`` and both sides
        have their index reset before comparison. The CSV lives in a
        ``tempfile.TemporaryDirectory`` that is removed on exit.
        """
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame({
                'col1': np.random.rand(100),
                'col2': np.random.choice(['a', 'b', 'c'], (100, )),
                'col3': np.arange(100)
            })
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = self.executor.execute_dataframe(md.read_csv(file_path,
                                                              gpu=True),
                                                  concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf.to_pandas().reset_index(drop=True))

            mdf2 = self.executor.execute_dataframe(md.read_csv(
                file_path, gpu=True, chunk_bytes=200),
                                                   concat=True)[0]
            pd.testing.assert_frame_equal(
                pdf.reset_index(drop=True),
                mdf2.to_pandas().reset_index(drop=True))

    def testReadCSVWithoutIndex(self):
        """Read a CSV written without an index column via a new session.

        Runs ``md.read_csv(..., sort_range_index=True)`` with and without
        ``chunk_bytes`` and compares each result with ``pd.read_csv``. The
        CSV lives in a ``tempfile.TemporaryDirectory`` that is removed on exit.
        """
        sess = new_session()

        # test csv file without storing index
        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = sess.run(md.read_csv(file_path, sort_range_index=True))
            pd.testing.assert_frame_equal(pdf, mdf)

            mdf2 = sess.run(
                md.read_csv(file_path, sort_range_index=True, chunk_bytes=10))
            pd.testing.assert_frame_equal(pdf, mdf2)

    def testReadSQLTableExecution(self):
        """Check ``md.read_sql_table`` against a SQLite database on disk.

        Covers reading by table name with a URI, reading through connections
        with ``index_col``/``columns``, reading a reflected ``sa.Table`` with
        column objects, and a table declared with an integer primary key.
        """
        import sqlalchemy as sa

        test_df = pd.DataFrame({
            'a': np.arange(10).astype(np.int64, copy=False),
            'b': ['s%d' % i for i in range(10)],
            'c': np.random.rand(10)
        })

        with tempfile.TemporaryDirectory() as d:
            table_name = 'test'
            table_name2 = 'test2'
            uri = 'sqlite:///' + os.path.join(d, 'test.db')

            test_df.to_sql(table_name, uri, index=False)

            r = md.read_sql_table('test', uri, chunk_size=4)
            result = self.executor.execute_dataframe(r, concat=True)[0]
            pd.testing.assert_frame_equal(result, test_df)

            engine = sa.create_engine(uri)
            m = sa.MetaData()

            try:
                # test index_col and columns
                r = md.read_sql_table('test',
                                      engine.connect(),
                                      chunk_size=4,
                                      index_col='a',
                                      columns=['b'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index('a', inplace=True)
                del expected['c']
                pd.testing.assert_frame_equal(result, expected)

                # do not specify chunk_size
                r = md.read_sql_table('test',
                                      engine.connect(),
                                      index_col='a',
                                      columns=['b'])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, expected)

                # Reflect the table definition from the database.
                # ``autoload_with`` alone triggers reflection; the separate
                # ``autoload=True`` flag is deprecated and is not passed.
                table = sa.Table(table_name, m, autoload_with=engine)
                r = md.read_sql_table(
                    table,
                    engine,
                    chunk_size=4,
                    index_col=[table.columns['a'], table.columns['b']],
                    columns=[table.columns['c']])
                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = test_df.copy(deep=True)
                expected.set_index(['a', 'b'], inplace=True)
                pd.testing.assert_frame_equal(result, expected)

                # test primary key
                sa.Table(table_name2, m,
                         sa.Column('id', sa.Integer, primary_key=True),
                         sa.Column('a', sa.Integer), sa.Column('b', sa.String),
                         sa.Column('c', sa.Float))
                m.create_all(engine)
                # name the index 'id' so to_sql writes it into the 'id' column
                test_df = test_df.copy(deep=True)
                test_df.index.name = 'id'
                test_df.to_sql(table_name2, uri, if_exists='append')

                r = md.read_sql_table(table_name2,
                                      engine,
                                      chunk_size=4,
                                      index_col='id')
                result = self.executor.execute_dataframe(r, concat=True)[0]
                pd.testing.assert_frame_equal(result, test_df)
            finally:
                engine.dispose()
Пример #21
0
class TestCumReduction(TestBase):
    def setUp(self):
        """Create the ``ExecutorForTest`` instance stored on ``self.executor``."""
        self.executor = ExecutorForTest()

    def compute(self, data, **kwargs):
        """Call the method named ``self.func_name`` on ``data``.

        All keyword arguments are forwarded to that method and its return
        value is returned unchanged.
        """
        reducer = getattr(data, self.func_name)
        return reducer(**kwargs)

    def testSeriesCumReduction(self):
        """Compare the reduction named by ``self.func_name`` on Series with pandas.

        Runs with several chunk sizes, with ``axis='index'``, and on data
        containing NaN with both the default ``skipna`` and ``skipna=False``.
        """
        def check(data, chunk_kwargs, **reduction_kwargs):
            # Build the distributed reduction, then compare it with pandas.
            reduced = self.compute(from_pandas_series(data, **chunk_kwargs),
                                   **reduction_kwargs)
            pd.testing.assert_series_equal(
                self.compute(data, **reduction_kwargs),
                self.executor.execute_dataframe(reduced, concat=True)[0])

        labels = [str(i) for i in range(20)]
        data = pd.Series(np.random.rand(20), index=labels, name='a')
        for chunk_kwargs in ({}, {'chunk_size': 6}, {'chunk_size': 3}):
            check(data, chunk_kwargs)
        check(data, {'chunk_size': 4}, axis='index')

        data = pd.Series(np.random.rand(20), name='a')
        data[0] = 0.1  # make sure not all elements are NAN
        data[data > 0.5] = np.nan
        check(data, {'chunk_size': 3})
        check(data, {'chunk_size': 3}, skipna=False)

    def testDataFrameCumReduction(self):
        """Compare the reduction named by ``self.func_name`` on DataFrames with pandas.

        Runs with default and explicit chunk sizes, along both axes, on data
        containing NaN (default ``skipna`` and ``skipna=False``), and on a
        frame with a random integer index and bytes column labels.
        """
        def check(data, chunk_kwargs, **reduction_kwargs):
            # Build the distributed reduction, then compare it with pandas.
            reduced = self.compute(from_pandas_df(data, **chunk_kwargs),
                                   **reduction_kwargs)
            pd.testing.assert_frame_equal(
                self.compute(data, **reduction_kwargs),
                self.executor.execute_dataframe(reduced, concat=True)[0])

        data = pd.DataFrame(np.random.rand(20, 10))
        check(data, {})
        check(data, {'chunk_size': 3})
        check(data, {'chunk_size': 3}, axis=1)

        # test null
        np_data = np.random.rand(20, 10)
        np_data[np_data > 0.6] = np.nan
        data = pd.DataFrame(np_data)
        check(data, {'chunk_size': 3})
        check(data, {'chunk_size': 3}, skipna=False)

        # test a random integer index and bytes column labels
        data = pd.DataFrame(np.random.rand(10, 10),
                            index=np.random.randint(-100, 100, size=(10, )),
                            columns=[np.random.bytes(10) for _ in range(10)])
        check(data, {'chunk_size': 2})
        check(data, {'chunk_size': 3}, axis='columns')
Пример #22
0
class Test(unittest.TestCase):
    """Execution tests for exponentially weighted (``ewm``) aggregations.

    Each case builds a Mars ``ewm(...).agg(...)`` expression, executes it with
    the test executor and compares the concatenated result with what pandas
    computes for the same raw data.
    """

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def _run(self, tileable):
        """Execute ``tileable`` and return its first concatenated result."""
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testDataFrameEwmAgg(self):
        """Compare DataFrame ``ewm`` aggregations with pandas."""
        np.random.seed(0)

        raw = pd.DataFrame({
            'a': np.random.randint(100, size=(10, )),
            'b': np.random.rand(10),
            'c': np.random.randint(100, size=(10, )),
            'd': ['c' * i for i in np.random.randint(4, size=10)]
        })
        # Punch NaN holes into column 'b' (rows 0-2, 5-6 and 9).  A single
        # ``.loc`` write is used instead of chained ``raw.b[...] = nan``:
        # chained assignment goes through an intermediate Series and is not
        # guaranteed to modify ``raw`` itself (SettingWithCopy / copy-on-write).
        raw.loc[[0, 1, 2, 5, 6, 9], 'b'] = np.nan

        df = md.DataFrame(raw, chunk_size=(10, 3))

        r = df.ewm(alpha=0.5).agg(['mean'])
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.5).agg(['mean']))

        df = md.DataFrame(raw, chunk_size=(3, 3))

        aggs = ['mean', 'var', 'std']

        # every aggregation on its own, with and without ignore_na
        for fun_name in aggs:
            r = df.ewm(alpha=0.3).agg(fun_name)
            pd.testing.assert_frame_equal(self._run(r),
                                          raw.ewm(alpha=0.3).agg(fun_name))

            r = df.ewm(alpha=0.3, ignore_na=True).agg(fun_name)
            pd.testing.assert_frame_equal(
                self._run(r),
                raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name))

        # list-valued specs
        r = df.ewm(alpha=0.3).agg(['mean'])
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(['mean']))

        r = df.ewm(alpha=0.3).agg(aggs)
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(aggs))

        # dict-valued specs (per-column aggregations)
        agg_dict = {'c': 'mean'}
        r = df.ewm(alpha=0.3).agg(agg_dict)
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(agg_dict))

        agg_dict = OrderedDict([('a', ['mean', 'var']), ('b', 'var')])
        r = df.ewm(alpha=0.3).agg(agg_dict)
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(agg_dict))

        # min_periods variations
        r = df.ewm(alpha=0.3, min_periods=0).agg(aggs)
        pd.testing.assert_frame_equal(
            self._run(r),
            raw.ewm(alpha=0.3, min_periods=0).agg(aggs))

        r = df.ewm(alpha=0.3, min_periods=2).agg(aggs)
        pd.testing.assert_frame_equal(
            self._run(r),
            raw.ewm(alpha=0.3, min_periods=2).agg(aggs))

        agg_dict = OrderedDict([('a', ['mean', 'var']), ('b', 'var'),
                                ('c', 'mean')])
        r = df.ewm(alpha=0.3, min_periods=2).agg(agg_dict)
        pd.testing.assert_frame_equal(
            self._run(r),
            raw.ewm(alpha=0.3, min_periods=2).agg(agg_dict))

    def testSeriesExpandingAgg(self):
        """Compare Series ``ewm`` aggregations with pandas.

        NOTE: despite the "Expanding" in its name, this test only exercises
        ``ewm``; the name is kept so existing test ids stay valid.
        """
        raw = pd.Series(np.random.rand(10), name='a')
        # positional writes: NaN at positions 0-2 and 5, 7, 9
        raw.iloc[:3] = np.nan
        raw.iloc[5:10:2] = np.nan

        series = md.Series(raw, chunk_size=10)

        r = series.ewm(alpha=0.3).agg(['mean'])
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(['mean']))

        r = series.ewm(alpha=0.3).agg('mean')
        pd.testing.assert_series_equal(self._run(r),
                                       raw.ewm(alpha=0.3).agg('mean'))

        series = md.Series(raw, chunk_size=3)

        aggs = ['mean', 'var', 'std']

        # a single function name yields a Series result
        for fun_name in aggs:
            r = series.ewm(alpha=0.3).agg(fun_name)
            pd.testing.assert_series_equal(self._run(r),
                                           raw.ewm(alpha=0.3).agg(fun_name))

            r = series.ewm(alpha=0.3, ignore_na=True).agg(fun_name)
            pd.testing.assert_series_equal(
                self._run(r),
                raw.ewm(alpha=0.3, ignore_na=True).agg(fun_name))

        # a list of function names yields a DataFrame result
        r = series.ewm(alpha=0.3).agg(['mean'])
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(['mean']))

        r = series.ewm(alpha=0.3).agg(aggs)
        pd.testing.assert_frame_equal(self._run(r),
                                      raw.ewm(alpha=0.3).agg(aggs))

        r = series.ewm(alpha=0.3, min_periods=0).agg(aggs)
        pd.testing.assert_frame_equal(
            self._run(r),
            raw.ewm(alpha=0.3, min_periods=0).agg(aggs))

        r = series.ewm(alpha=0.3, min_periods=2).agg(aggs)
        pd.testing.assert_frame_equal(
            self._run(r),
            raw.ewm(alpha=0.3, min_periods=2).agg(aggs))
Пример #23
0
class TestAggregate(TestBase):
    """Execution tests comparing Mars ``agg`` results with pandas ``agg``."""

    def setUp(self):
        # Run the base-class fixture before installing the executor so that
        # anything it prepares is not silently skipped by this override.
        super().setUp()
        self.executor = ExecutorForTest()

    def _run(self, tileable):
        """Execute ``tileable`` and return its first concatenated result."""
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def testDataFrameAggregate(self):
        """Check DataFrame aggregations given as name, list and dict specs."""
        all_aggs = ['sum', 'prod', 'min', 'max', 'count', 'mean', 'var', 'std']
        data = pd.DataFrame(np.random.rand(20, 20))

        # default chunking
        df = from_pandas_df(data)
        pd.testing.assert_frame_equal(self._run(df.agg(all_aggs)),
                                      data.agg(all_aggs))

        df = from_pandas_df(data, chunk_size=3)

        # will redirect to transform
        cumulative = ['cumsum', 'cummax']
        pd.testing.assert_frame_equal(self._run(df.agg(cumulative)),
                                      data.agg(cumulative))

        # each aggregation alone, along both axes
        for func in all_aggs:
            pd.testing.assert_series_equal(self._run(df.agg(func)),
                                           data.agg(func))
            pd.testing.assert_series_equal(self._run(df.agg(func, axis=1)),
                                           data.agg(func, axis=1))

        pd.testing.assert_frame_equal(self._run(df.agg(['sum'])),
                                      data.agg(['sum']))

        pd.testing.assert_frame_equal(self._run(df.agg(all_aggs)),
                                      data.agg(all_aggs))

        pd.testing.assert_frame_equal(self._run(df.agg(all_aggs, axis=1)),
                                      data.agg(all_aggs, axis=1))

        # per-column specification; one dict is shared by both sides so the
        # Mars and pandas calls cannot drift apart
        per_column = {0: ['sum', 'min', 'var'], 9: ['mean', 'var', 'std']}
        pd.testing.assert_frame_equal(self._run(df.agg(per_column)),
                                      data.agg(per_column))

    def testSeriesAggregate(self):
        """Check Series aggregations given as a name and as a list."""
        all_aggs = ['sum', 'prod', 'min', 'max', 'count', 'mean', 'var', 'std']
        data = pd.Series(np.random.rand(20),
                         index=[str(i) for i in range(20)],
                         name='a')

        # default chunking
        series = from_pandas_series(data)
        pd.testing.assert_series_equal(self._run(series.agg(all_aggs)),
                                       data.agg(all_aggs))

        series = from_pandas_series(data, chunk_size=3)

        # a single aggregation gives a scalar; compare with float tolerance
        for func in all_aggs:
            self.assertAlmostEqual(self._run(series.agg(func)),
                                   data.agg(func))

        pd.testing.assert_series_equal(self._run(series.agg(all_aggs)),
                                       data.agg(all_aggs))
Пример #24
0
class Test(unittest.TestCase):
    """Execution tests for ``sort_values`` / ``sort_index``.

    Each case executes the Mars expression and compares the concatenated
    result with what pandas returns for the same raw data.
    """

    def setUp(self) -> None:
        super().setUp()
        self.executor = ExecutorForTest('numpy')

    def _run(self, tileable, executor=None):
        """Execute ``tileable`` and return its first concatenated result.

        ``executor`` defaults to the fixture's executor; the ignore_index
        cases pass their own session-backed executor instead.
        """
        if executor is None:
            executor = self.executor
        return executor.execute_dataframe(tileable, concat=True)[0]

    def testSortValuesExecution(self):
        """Check DataFrame and Series ``sort_values`` against pandas."""
        df = pd.DataFrame(np.random.rand(100, 10),
                          columns=['a' + str(i) for i in range(10)])

        # test one chunk
        mdf = DataFrame(df)
        pd.testing.assert_frame_equal(self._run(mdf.sort_values('a0')),
                                      df.sort_values('a0'))
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values(['a6', 'a7'], ascending=False)),
            df.sort_values(['a6', 'a7'], ascending=False))

        # test psrs
        mdf = DataFrame(df, chunk_size=10)
        pd.testing.assert_frame_equal(self._run(mdf.sort_values('a0')),
                                      df.sort_values('a0'))
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values(['a3', 'a4'])),
            df.sort_values(['a3', 'a4']))

        # test ascending=False
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values(['a0', 'a1'], ascending=False)),
            df.sort_values(['a0', 'a1'], ascending=False))
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values(['a7'], ascending=False)),
            df.sort_values(['a7'], ascending=False))

        # test multiindex
        df2 = df.copy(deep=True)
        df2.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
        mdf = DataFrame(df2, chunk_size=10)
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values([('A', 'C')])),
            df2.sort_values([('A', 'C')]))

        # test rechunk
        mdf = DataFrame(df, chunk_size=3)
        pd.testing.assert_frame_equal(self._run(mdf.sort_values('a0')),
                                      df.sort_values('a0'))
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values(['a3', 'a4'])),
            df.sort_values(['a3', 'a4']))

        # test other types
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                'f': [pd.Timedelta(f'{i} days') for i in range(10)]
            }, )
        mdf = DataFrame(raw, chunk_size=3)

        for label in raw.columns:
            pd.testing.assert_frame_equal(self._run(mdf.sort_values(label)),
                                          raw.sort_values(label))

        pd.testing.assert_frame_equal(
            self._run(mdf.sort_values(['a', 'b', 'e'], ascending=False)),
            raw.sort_values(['a', 'b', 'e'], ascending=False))

        # test nan
        df = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        mdf = DataFrame(df)
        pd.testing.assert_frame_equal(self._run(mdf.sort_values(['col2'])),
                                      df.sort_values(['col2']))

        mdf = DataFrame(df, chunk_size=3)
        pd.testing.assert_frame_equal(self._run(mdf.sort_values(['col2'])),
                                      df.sort_values(['col2']))

        # test ignore_index
        executor = ExecutorForTest(storage=new_session().context)

        df = pd.DataFrame(np.random.rand(10, 3),
                          columns=['a' + str(i) for i in range(3)])

        mdf = DataFrame(df, chunk_size=3)
        result = self._run(mdf.sort_values(['a0', 'a1'], ignore_index=True),
                           executor)
        try:
            expected = df.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            # pandas releases without the ``ignore_index`` keyword (the
            # original note calls this the python3.5 case): emulate it by
            # relabelling the rows 0..n-1.
            expected = df.sort_values(['a0', 'a1'])
            expected.index = pd.RangeIndex(len(expected))

        pd.testing.assert_frame_equal(result, expected)

        # test inplace
        mdf = DataFrame(df)
        mdf.sort_values('a0', inplace=True)
        result = self._run(mdf)
        df.sort_values('a0', inplace=True)

        pd.testing.assert_frame_equal(result, df)

        # test unknown shape
        df = pd.DataFrame({'a': list(range(10)), 'b': np.random.random(10)})
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['a'] > 2]
        pd.testing.assert_frame_equal(self._run(filtered.sort_values(by='b')),
                                      df[df['a'] > 2].sort_values(by='b'))

        # test Series.sort_values
        raw = pd.Series(np.random.rand(10))
        series = Series(raw)
        pd.testing.assert_series_equal(self._run(series.sort_values()),
                                       raw.sort_values())

        series = Series(raw, chunk_size=3)
        pd.testing.assert_series_equal(self._run(series.sort_values()),
                                       raw.sort_values())

        series = Series(raw, chunk_size=2)
        pd.testing.assert_series_equal(
            self._run(series.sort_values(ascending=False)),
            raw.sort_values(ascending=False))

    def testSortIndexExecution(self):
        """Check DataFrame and Series ``sort_index`` against pandas."""
        raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

        mdf = DataFrame(raw)
        pd.testing.assert_frame_equal(self._run(mdf.sort_index()),
                                      raw.sort_index())

        # in-place variant
        mdf = DataFrame(raw)
        mdf.sort_index(inplace=True)
        pd.testing.assert_frame_equal(self._run(mdf), raw.sort_index())

        mdf = DataFrame(raw, chunk_size=30)
        pd.testing.assert_frame_equal(self._run(mdf.sort_index()),
                                      raw.sort_index())

        mdf = DataFrame(raw, chunk_size=20)
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_index(ascending=False)),
            raw.sort_index(ascending=False))

        executor = ExecutorForTest(storage=new_session().context)

        mdf = DataFrame(raw, chunk_size=10)
        result = self._run(mdf.sort_index(ignore_index=True), executor)
        try:
            expected = raw.sort_index(ignore_index=True)
        except TypeError:
            # pandas without the ``ignore_index`` keyword: relabel rows.
            expected = raw.sort_index()
            expected.index = pd.RangeIndex(len(expected))
        pd.testing.assert_frame_equal(result, expected)

        # test axis=1
        raw = pd.DataFrame(np.random.rand(10, 10), columns=np.random.rand(10))

        mdf = DataFrame(raw)
        pd.testing.assert_frame_equal(self._run(mdf.sort_index(axis=1)),
                                      raw.sort_index(axis=1))

        mdf = DataFrame(raw, chunk_size=3)
        pd.testing.assert_frame_equal(self._run(mdf.sort_index(axis=1)),
                                      raw.sort_index(axis=1))

        mdf = DataFrame(raw, chunk_size=4)
        pd.testing.assert_frame_equal(
            self._run(mdf.sort_index(axis=1, ascending=False)),
            raw.sort_index(axis=1, ascending=False))

        mdf = DataFrame(raw, chunk_size=4)
        executor = ExecutorForTest(storage=new_session().context)

        result = self._run(mdf.sort_index(axis=1, ignore_index=True),
                           executor)
        try:
            expected = raw.sort_index(axis=1, ignore_index=True)
        except TypeError:
            # pandas without the ``ignore_index`` keyword.  ``ignore_index``
            # relabels the axis that was sorted, so for axis=1 it is the
            # column labels (not the row index) that become 0..n-1.
            expected = raw.sort_index(axis=1)
            expected.columns = pd.RangeIndex(expected.shape[1])
        pd.testing.assert_frame_equal(result, expected)

        # test series
        raw = pd.Series(np.random.rand(10, ), index=np.random.rand(10))

        series = Series(raw)
        pd.testing.assert_series_equal(self._run(series.sort_index()),
                                       raw.sort_index())

        series = Series(raw, chunk_size=2)
        pd.testing.assert_series_equal(self._run(series.sort_index()),
                                       raw.sort_index())

        series = Series(raw, chunk_size=3)
        pd.testing.assert_series_equal(
            self._run(series.sort_index(ascending=False)),
            raw.sort_index(ascending=False))

    def testArrowStringSortValues(self):
        """Check ``sort_values`` on a column stored as ``ArrowStringDtype``."""
        rs = np.random.RandomState(0)
        raw = pd.DataFrame({
            'a': rs.rand(10),
            'b': [f's{rs.randint(1000)}' for _ in range(10)]
        })
        raw['b'] = raw['b'].astype(ArrowStringDtype())
        mdf = DataFrame(raw, chunk_size=3)

        pd.testing.assert_frame_equal(self._run(mdf.sort_values(by='b')),
                                      raw.sort_values(by='b'))
Пример #25
0
class Test(TestBase):
    """Execution tests comparing Mars ``to_datetime`` with ``pd.to_datetime``."""

    def setUp(self):
        super().setUp()
        self.executor = ExecutorForTest()

    def testToDatetimeExecution(self):
        """Run ``to_datetime`` on several input kinds and compare with pandas."""

        def fetch(tileable):
            # execute and take the first concatenated result
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # scalar
        scalar_result = fetch(to_datetime(1490195805, unit='s'))
        scalar_expected = pd.to_datetime(1490195805, unit='s')
        self.assertEqual(pd.to_datetime(scalar_result.item()),
                         scalar_expected)

        # test list like
        raw = ['3/11/2000', '3/12/2000', '3/13/2000']
        date_tensor = tensor(raw, chunk_size=2)
        pd.testing.assert_index_equal(
            fetch(to_datetime(date_tensor, infer_datetime_format=True)),
            pd.to_datetime(raw, infer_datetime_format=True))

        # test series
        raw_series = pd.Series(raw)
        mars_series = Series(raw_series, chunk_size=2)
        pd.testing.assert_series_equal(fetch(to_datetime(mars_series)),
                                       pd.to_datetime(raw_series))

        # test DataFrame
        raw_df = pd.DataFrame({
            'year': [2015, 2016],
            'month': [2, 3],
            'day': [4, 5]
        })
        mars_df = DataFrame(raw_df, chunk_size=(1, 2))
        pd.testing.assert_series_equal(fetch(to_datetime(mars_df)),
                                       pd.to_datetime(raw_df))

        # test Index
        raw_index = pd.Index([1, 2, 3])
        mars_index = Index(raw_index, chunk_size=2)
        pd.testing.assert_index_equal(fetch(to_datetime(mars_index)),
                                      pd.to_datetime(raw_index))

        # test raises == 'ignore'
        raw = ['13000101']
        pd.testing.assert_index_equal(
            fetch(to_datetime(raw, format='%Y%m%d', errors='ignore')),
            pd.to_datetime(raw, format='%Y%m%d', errors='ignore'))

        # test unit
        pd.testing.assert_index_equal(
            fetch(to_datetime([1490195805], unit='s')),
            pd.to_datetime([1490195805], unit='s'))

        # test origin
        pd.testing.assert_index_equal(
            fetch(to_datetime([1, 2, 3], unit='D',
                              origin=pd.Timestamp('1960-01-01'))),
            pd.to_datetime([1, 2, 3],
                           unit='D',
                           origin=pd.Timestamp('1960-01-01')))
Пример #26
0
class TestBinary(TestBase):
    def setUp(self):
        """Prepare the fixture: run the base-class setup, then create the executor."""
        # Chain to the parent fixture so that whatever it prepares is not
        # silently skipped by this override.
        super().setUp()
        self.executor = ExecutorForTest()

    def to_boolean_if_needed(self, value, split_value=0.5):
        if self.func_name in ['__and__', '__or__', '__xor__']:
            return value > split_value
        else:
            return value

    def testWithoutShuffleExecution(self):
        """Compare ``self.func`` on Mars and pandas frames whose axes are all monotonic."""
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # Left operand: rows labelled 0..9, columns 3..12, chunk_size=5.
        lhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10),
                         index=np.arange(10),
                         columns=np.arange(3, 13)))
        lhs = from_pandas(lhs_raw, chunk_size=5)

        # Right operand: rows labelled 11 down to 2, columns 4..13,
        # chunk_size=6.
        rhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10),
                         index=np.arange(11, 1, -1),
                         columns=np.arange(4, 14)))
        rhs = from_pandas(rhs_raw, chunk_size=6)

        combined = self.func(lhs, rhs)

        expected = self.func(lhs_raw, rhs_raw)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]

        pd.testing.assert_frame_equal(expected, actual)

    def testWithOneShuffleExecution(self):
        """Compare ``self.func`` on Mars and pandas when only one axis is monotonic.

        Two scenarios run back to back: monotonic rows with shuffled columns,
        then shuffled rows with monotonic columns.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        shuffled_lhs = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        shuffled_rhs = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        scenarios = [
            # rows monotonic (0..9 and 11 down to 2), columns shuffled
            (dict(index=np.arange(10), columns=shuffled_lhs),
             dict(index=np.arange(11, 1, -1), columns=shuffled_rhs)),
            # rows shuffled, columns monotonic (0..9 and 11 down to 2)
            (dict(index=shuffled_lhs, columns=np.arange(10)),
             dict(index=shuffled_rhs, columns=np.arange(11, 1, -1))),
        ]

        for lhs_labels, rhs_labels in scenarios:
            lhs_raw = self.to_boolean_if_needed(
                pd.DataFrame(np.random.rand(10, 10), **lhs_labels))
            lhs = from_pandas(lhs_raw, chunk_size=5)
            rhs_raw = self.to_boolean_if_needed(
                pd.DataFrame(np.random.rand(10, 10), **rhs_labels))
            rhs = from_pandas(rhs_raw, chunk_size=6)

            combined = self.func(lhs, rhs)

            expected = self.func(lhs_raw, rhs_raw)
            actual = self.executor.execute_dataframe(combined,
                                                     concat=True)[0]

            pd.testing.assert_frame_equal(expected, actual)

    def testWithAllShuffleExecution(self):
        """Compare ``self.func`` on Mars and pandas when no axis is monotonic."""
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # Row and column labels of both operands are in non-monotonic order.
        lhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10),
                         index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7]))
        lhs = from_pandas(lhs_raw, chunk_size=5)

        rhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10),
                         index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2]))
        rhs = from_pandas(rhs_raw, chunk_size=6)

        combined = self.func(lhs, rhs)

        expected = self.func(lhs_raw, rhs_raw)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]

        pd.testing.assert_frame_equal(expected, actual)

    def testBothWithOneChunk(self):
        """Compare ``self.func`` on Mars and pandas with ``chunk_size=10`` on 10x10 operands.

        Two scenarios run back to back: monotonic rows with shuffled columns,
        then shuffled rows with monotonic columns.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        shuffled_lhs = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        shuffled_rhs = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        scenarios = [
            # rows monotonic (0..9 and 11 down to 2), columns shuffled
            (dict(index=np.arange(10), columns=shuffled_lhs),
             dict(index=np.arange(11, 1, -1), columns=shuffled_rhs)),
            # rows shuffled, columns monotonic (0..9 and 11 down to 2)
            (dict(index=shuffled_lhs, columns=np.arange(10)),
             dict(index=shuffled_rhs, columns=np.arange(11, 1, -1))),
        ]

        for lhs_labels, rhs_labels in scenarios:
            lhs_raw = self.to_boolean_if_needed(
                pd.DataFrame(np.random.rand(10, 10), **lhs_labels))
            lhs = from_pandas(lhs_raw, chunk_size=10)
            rhs_raw = self.to_boolean_if_needed(
                pd.DataFrame(np.random.rand(10, 10), **rhs_labels))
            rhs = from_pandas(rhs_raw, chunk_size=10)

            combined = self.func(lhs, rhs)

            expected = self.func(lhs_raw, rhs_raw)
            actual = self.executor.execute_dataframe(combined,
                                                     concat=True)[0]

            pd.testing.assert_frame_equal(expected, actual)

    def testWithoutShuffleAndWithOneChunk(self):
        """Compare ``self.func`` on Mars and pandas, chunking only the monotonic axis.

        In each scenario the shuffled axis gets a chunk size of 10 (its full
        length) while the monotonic axis uses a chunk size of 5 or 6.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        shuffled_lhs = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        shuffled_rhs = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]
        # (left labels, left chunk_size, right labels, right chunk_size)
        scenarios = [
            # rows monotonic and chunked, columns shuffled
            (dict(index=np.arange(10), columns=shuffled_lhs), (5, 10),
             dict(index=np.arange(11, 1, -1), columns=shuffled_rhs), (6, 10)),
            # rows shuffled, columns monotonic and chunked
            (dict(index=shuffled_lhs, columns=np.arange(10)), (10, 5),
             dict(index=shuffled_rhs, columns=np.arange(11, 1, -1)), (10, 6)),
        ]

        for lhs_labels, lhs_chunks, rhs_labels, rhs_chunks in scenarios:
            lhs_raw = self.to_boolean_if_needed(
                pd.DataFrame(np.random.rand(10, 10), **lhs_labels))
            lhs = from_pandas(lhs_raw, chunk_size=lhs_chunks)
            rhs_raw = self.to_boolean_if_needed(
                pd.DataFrame(np.random.rand(10, 10), **rhs_labels))
            rhs = from_pandas(rhs_raw, chunk_size=rhs_chunks)

            combined = self.func(lhs, rhs)

            expected = self.func(lhs_raw, rhs_raw)
            actual = self.executor.execute_dataframe(combined,
                                                     concat=True)[0]

            pd.testing.assert_frame_equal(expected, actual)

    def testWithShuffleAndWithOneChunk(self):
        """Compare ``self.func`` on Mars and pandas, chunking only the shuffled axis.

        In each scenario the monotonic axis gets a chunk size of 10 (its full
        length) while the shuffled axis uses a chunk size of 5 or 6.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # pandas fails to compute some expected values due to `na`.
            return

        shuffled_lhs = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
        shuffled_rhs = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

        # Scenario 1: rows monotonic, columns shuffled and chunked.
        # The operands are used as generated here (no boolean conversion).
        lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=np.arange(10),
                               columns=shuffled_lhs)
        lhs = from_pandas(lhs_raw, chunk_size=(10, 5))
        rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=np.arange(11, 1, -1),
                               columns=shuffled_rhs)
        rhs = from_pandas(rhs_raw, chunk_size=(10, 6))

        combined = self.func(lhs, rhs)

        expected = self.func(lhs_raw, rhs_raw)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]

        pd.testing.assert_frame_equal(expected, actual)

        # Scenario 2: rows shuffled and chunked, columns monotonic.
        lhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10),
                         index=shuffled_lhs,
                         columns=np.arange(10)))
        lhs = from_pandas(lhs_raw, chunk_size=(5, 10))
        rhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10),
                         index=shuffled_rhs,
                         columns=np.arange(11, 1, -1)))
        rhs = from_pandas(rhs_raw, chunk_size=(6, 10))

        combined = self.func(lhs, rhs)

        expected = self.func(lhs_raw, rhs_raw)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]

        pd.testing.assert_frame_equal(expected, actual)

    def testSameIndex(self):
        """Check binary ops between operands that share the same labels.

        Covers a frame combined with itself, with its own first row, and with
        its own first column (``axis=0``).  The index holds repeated 0/1
        labels drawn at random.
        """

        def fetch(tileable):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        values = np.random.rand(10, 10)
        row_labels = np.random.randint(0, 2, size=(10, ))
        raw = pd.DataFrame(values,
                           index=row_labels,
                           columns=[f'c{i}' for i in range(10)])
        raw = self.to_boolean_if_needed(raw)
        mdf = from_pandas(raw, chunk_size=3)

        # frame <op> the very same frame
        with_self = self.func(mdf, mdf)
        wanted = self.func(raw, raw)
        pd.testing.assert_frame_equal(wanted, fetch(with_self))

        # frame <op> its first row
        first_row = raw.iloc[0]
        with_row = self.func(mdf, from_pandas_series(first_row, chunk_size=3))
        wanted = self.func(raw, first_row)
        pd.testing.assert_frame_equal(wanted, fetch(with_row))

        # frame.<op>(first column, axis=0)
        first_col = raw.iloc[:, 0]
        mars_col = from_pandas_series(first_col, chunk_size=3)
        with_col = getattr(mdf, self.func_name)(mars_col, axis=0)
        wanted = getattr(raw, self.func_name)(first_col, axis=0)
        pd.testing.assert_frame_equal(wanted, fetch(with_col))

    def testChained(self):
        """Check that the output of one binary op can feed a second one.

        ``self.func(self.func(df1, df2), df4)`` is executed through the test
        executor and compared with the same chain evaluated by pandas on the
        underlying frames.
        """
        data1 = pd.DataFrame(np.random.rand(10, 10))
        data1 = self.to_boolean_if_needed(data1)
        df1 = from_pandas(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 10))
        data2 = self.to_boolean_if_needed(data2)
        df2 = from_pandas(data2, chunk_size=6)

        df3 = self.func(df1, df2)

        data4 = pd.DataFrame(np.random.rand(10, 10))
        # Convert the third operand itself so the chain is exercised with
        # independent data rather than with a second copy of ``data1``.
        data4 = self.to_boolean_if_needed(data4)
        df4 = from_pandas(data4, chunk_size=6)

        df5 = self.func(df3, df4)

        result = self.executor.execute_dataframe(df5, concat=True)[0]
        expected = self.func(self.func(data1, data2), data4)

        pd.testing.assert_frame_equal(expected, result)

    def testRfunc(self):
        """Check the reflected form of the operator (``self.rfunc_name``).

        The reflected method is called first with another frame and then with
        the scalar ``1``; pandas results come from ``self.func`` with the
        operands swapped.
        """

        def fetch(tileable, **extra):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable,
                                                   concat=True,
                                                   **extra)[0]

        # reflected op between two frames
        lhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10)))
        lhs = from_pandas(lhs_raw, chunk_size=5)
        rhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10)))
        rhs = from_pandas(rhs_raw, chunk_size=6)

        reflected = getattr(lhs, self.rfunc_name)(rhs)
        actual = fetch(reflected)
        pd.testing.assert_frame_equal(self.func(rhs_raw, lhs_raw), actual)

        # reflected op between a frame and a scalar
        scalar_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10)))
        scalar_df = from_pandas(scalar_raw, chunk_size=5)
        reflected_scalar = getattr(scalar_df, self.rfunc_name)(1)
        # todo check dtypes when pandas reverts its behavior on broadcasting
        is_bitwise = self.func_name in ('__and__', '__or__', '__xor__')
        actual = fetch(reflected_scalar, check_dtypes=not is_bitwise)
        pd.testing.assert_frame_equal(self.func(1, scalar_raw), actual)

    def testWithMultiForms(self):
        """Check that the different spellings of the operation agree.

        ``self.func(df1, df2)`` is executed twice, then the method named by
        ``self.func_name`` and finally the reflected method named by
        ``self.rfunc_name``; every result is compared with pandas.
        """

        def fetch(tileable):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        lhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10)))
        lhs = from_pandas(lhs_raw, chunk_size=5)
        rhs_raw = self.to_boolean_if_needed(
            pd.DataFrame(np.random.rand(10, 10)))
        rhs = from_pandas(rhs_raw, chunk_size=6)

        wanted = self.func(lhs_raw, rhs_raw)

        # function form, executed twice on freshly built expressions
        for _ in range(2):
            actual = fetch(self.func(lhs, rhs))
            pd.testing.assert_frame_equal(wanted, actual)

        # method form
        actual = fetch(getattr(lhs, self.func_name)(rhs))
        pd.testing.assert_frame_equal(wanted, actual)

        # reflected method form: operands are swapped on the pandas side
        actual = fetch(getattr(lhs, self.rfunc_name)(rhs))
        pd.testing.assert_frame_equal(self.func(rhs_raw, lhs_raw), actual)

    def testDataframeAndScalar(self):
        """Check the operation between a frame and the scalar ``1``.

        Both operand orders are covered, through ``self.func`` as well as
        through the methods named by ``self.func_name`` / ``self.rfunc_name``.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators doesn\'t support floating point scalars
            return

        def fetch(tileable):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # test dataframe and scalar
        raw = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
        mdf = from_pandas(raw, chunk_size=2)
        frame_first = self.func(raw, 1)

        # the function form is executed twice, then the method form once
        for _ in range(2):
            pd.testing.assert_frame_equal(frame_first,
                                          fetch(self.func(mdf, 1)))
        via_method = fetch(getattr(mdf, self.func_name)(1))
        pd.testing.assert_frame_equal(frame_first, via_method)

        # test scalar and dataframe
        pd.testing.assert_frame_equal(frame_first, fetch(self.func(mdf, 1)))

        scalar_first = self.func(1, raw)
        pd.testing.assert_frame_equal(scalar_first, fetch(self.func(1, mdf)))

        via_reflected = fetch(getattr(mdf, self.rfunc_name)(1))
        pd.testing.assert_frame_equal(scalar_first, via_reflected)

    def testWithShuffleOnStringIndex(self):
        """Check a binary op when neither axis is monotonic and rows are strings."""
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
            return

        # no axis is monotonic, and the index values are strings.
        lhs_rows = list(map(str, [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]))
        lhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=lhs_rows,
                               columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        lhs_raw = self.to_boolean_if_needed(lhs_raw)
        lhs = from_pandas(lhs_raw, chunk_size=5)

        rhs_rows = list(map(str, [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]))
        rhs_raw = pd.DataFrame(np.random.rand(10, 10),
                               index=rhs_rows,
                               columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        rhs_raw = self.to_boolean_if_needed(rhs_raw)
        rhs = from_pandas(rhs_raw, chunk_size=6)

        combined = self.func(lhs, rhs)
        wanted = self.func(lhs_raw, rhs_raw)
        actual = self.executor.execute_dataframe(combined, concat=True)[0]

        pd.testing.assert_frame_equal(wanted, actual)

    def testDataframeAndSeries(self):
        """Check the method named by ``self.func_name`` between a frame and a series.

        The first part combines 10x10 random frames with a column taken from
        another frame along ``axis='index'`` and ``axis='columns'``.  The
        second part runs six small fixed cases (three per axis) that vary the
        chunking and the label order of the operands.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # pandas fails to compute some expected values due to `na`.
            return

        op_name = self.func_name

        def fetch(tileable):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        left_raw = pd.DataFrame(np.random.rand(10, 10),
                                index=np.arange(10),
                                columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        left_raw = self.to_boolean_if_needed(left_raw)
        right_raw = pd.DataFrame(np.random.rand(10, 10),
                                 index=np.arange(11, 1, -1),
                                 columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
        right_raw = self.to_boolean_if_needed(right_raw)

        right_col = right_raw[1]
        mars_col = from_pandas_series(right_col, chunk_size=(6, ))

        # operate on single-column dataframe and series
        single_raw = left_raw[[1]]
        single = from_pandas(single_raw, chunk_size=(5, 5))
        tileable = getattr(single, op_name)(mars_col, axis='index')
        wanted = getattr(single_raw, op_name)(right_col, axis='index')
        pd.testing.assert_frame_equal(wanted, fetch(tileable))

        # operate on dataframe and series: first without shuffle
        # (axis='index'), then with shuffle (axis='columns')
        for axis in ('index', 'columns'):
            full = from_pandas(left_raw, chunk_size=(5, 5))
            tileable = getattr(full, op_name)(mars_col, axis=axis)
            wanted = getattr(left_raw, op_name)(right_col, axis=axis)
            pd.testing.assert_frame_equal(wanted, fetch(tileable))

        def check_small(frame, series, axis, chunk_opts,
                        adjust_expected=None, adjust_result=None):
            # Run one small frame/series case; the optional callables reorder
            # the pandas and Mars results before they are compared.
            frame = self.to_boolean_if_needed(frame)
            mars_frame = from_pandas(frame, **chunk_opts)
            mars_series = from_pandas_series(series)
            actual = fetch(
                getattr(mars_frame, op_name)(mars_series, axis=axis))
            wanted = getattr(frame, op_name)(series, axis=axis)
            if adjust_expected is not None:
                wanted = adjust_expected(wanted)
            if adjust_result is not None:
                actual = adjust_result(actual)
            pd.testing.assert_frame_equal(wanted, actual)

        def named_frame(row_labels):
            # 3x2 frame with string column names used by the axis=0 cases.
            return pd.DataFrame({
                'ca': [1, 3, 2],
                'cb': [360, 180, 2]
            }, index=row_labels)

        # test both one chunk, axis=0
        check_small(named_frame([1, 2, 3]),
                    pd.Series([0, 1, 2], index=[1, 2, 3]),
                    0, {})

        # test different number of chunks, axis=0
        check_small(named_frame([1, 2, 3]),
                    pd.Series([0, 1, 2], index=[1, 2, 3]),
                    0, {'chunk_size': 1})

        # test with row shuffle, axis=0; both sides are put into the same
        # row order before comparing
        check_small(named_frame([2, 1, 3]),
                    pd.Series([0, 1, 2], index=[3, 1, 2]),
                    0, {'chunk_size': 1},
                    adjust_expected=lambda frame: frame.reindex([3, 1, 2]),
                    adjust_result=lambda frame: frame.reindex(index=[3, 1, 2]))

        row_names = ['ra', 'rb', 'rc']

        # test both one chunk, axis=1
        check_small(pd.DataFrame({
                        1: [1, 3, 2],
                        2: [360, 180, 2],
                        3: [1, 2, 3]
                    }, index=row_names),
                    pd.Series([0, 1, 2], index=[1, 2, 3]),
                    1, {})

        # test different number of chunks, axis=1
        check_small(pd.DataFrame({
                        1: [1, 3, 2],
                        2: [360, 180, 2],
                        3: [1, 2, 3]
                    }, index=row_names),
                    pd.Series([0, 1, 2], index=[1, 2, 3]),
                    1, {'chunk_size': 1})

        # test with row shuffle, axis=1; the columns of the Mars result are
        # put into the order 1, 2, 3 before comparing
        check_small(pd.DataFrame({
                        1: [1, 3, 2],
                        3: [1, 2, 3],
                        2: [360, 180, 2]
                    }, index=row_names),
                    pd.Series([0, 1, 2], index=[3, 1, 2]),
                    1, {'chunk_size': 1},
                    adjust_result=lambda frame: frame[[1, 2, 3]])

    def testSeries(self):
        """Check the operation between two series and between a series and a scalar.

        Series pairs are tried with one chunk each, with differing chunk
        sizes, with reversed labels and with randomly permuted labels.  The
        scalar cases are skipped for the bitwise operators.
        """
        to_bool = self.to_boolean_if_needed

        def fetch(tileable):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def compare(left, right, left_chunk, right_chunk):
            # Apply the op to both series in Mars and in pandas, then compare.
            tileable = self.func(
                from_pandas_series(left, chunk_size=left_chunk),
                from_pandas_series(right, chunk_size=right_chunk))
            actual = fetch(tileable)
            pd.testing.assert_series_equal(self.func(left, right), actual)

        # only one chunk
        lhs = to_bool(pd.Series(np.arange(10) + 1))
        rhs = to_bool(pd.Series(np.arange(10) + 1))
        compare(lhs, rhs, 10, 10)

        # same index
        lhs = to_bool(pd.Series(np.arange(10) + 1))
        rhs = to_bool(pd.Series(np.arange(10) + 1))
        compare(lhs, rhs, 4, 6)

        # no shuffle
        lhs = to_bool(pd.Series(np.arange(10) + 1, index=range(10)))
        rhs = to_bool(pd.Series(np.arange(10) + 1, index=range(10, 0, -1)))
        compare(lhs, rhs, 4, 6)

        # shuffle
        values = (np.arange(10) + 1).astype(np.int64, copy=False)
        lhs = to_bool(
            pd.Series(values, index=np.random.permutation(range(10))))
        rhs = to_bool(
            pd.Series(values, index=np.random.permutation(range(10, 0, -1))))
        compare(lhs, rhs, 4, 6)

        if self.func_name in ('__and__', '__or__', '__xor__'):
            # bitwise logical operators doesn\'t support floating point scalars
            return

        # operate with scalar
        lhs = to_bool(
            pd.Series(np.arange(10) + 1,
                      index=np.random.permutation(range(10))))
        actual = fetch(self.func(from_pandas_series(lhs, chunk_size=4), 4))
        pd.testing.assert_series_equal(self.func(lhs, 4), actual)

        # reverse with scalar
        lhs = to_bool(
            pd.Series(np.arange(10) + 1,
                      index=np.random.permutation(range(10))))
        actual = fetch(self.func(4, from_pandas_series(lhs, chunk_size=4)))
        pd.testing.assert_series_equal(self.func(4, lhs), actual)

    def testWithPlainValue(self):
        """Check the method named by ``self.func_name`` with plain sequences.

        The right-hand operand is the numbers 1..10 given as a list, a tuple,
        a NumPy array and the result of ``from_array`` on such an array, used
        against both a frame (``axis=0``) and one of its columns.
        """
        if self.func_name in ('__and__', '__or__', '__xor__'):
            # skip tests for bitwise logical operators on plain value.
            return

        op_name = self.func_name

        def fetch(tileable):
            # Execute a Mars object and return the concatenated result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def one_to_ten(container):
            # Fresh sequence 1..10 of the requested container type.
            return container(range(1, 11))

        raw = pd.DataFrame(np.random.rand(10, 10),
                           index=np.arange(10),
                           columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
        raw = self.to_boolean_if_needed(raw)
        mdf = from_pandas(raw, chunk_size=6)
        mseries = mdf[2]

        # frame against list, then tuple
        for container in (list, tuple):
            tileable = getattr(mdf, op_name)(one_to_ten(container), axis=0)
            actual = fetch(tileable)
            wanted = getattr(raw, op_name)(one_to_ten(container), axis=0)
            pd.testing.assert_frame_equal(wanted, actual)

        # series against list, then tuple
        for container in (list, tuple):
            tileable = getattr(mseries, op_name)(one_to_ten(container))
            actual = fetch(tileable)
            wanted = getattr(raw[2], op_name)(one_to_ten(container))
            pd.testing.assert_series_equal(wanted, actual)

        # specify index, not the default range index
        raw = pd.DataFrame(np.random.rand(10, 7),
                           index=np.arange(5, 15),
                           columns=[4, 1, 3, 2, 5, 6, 7])
        raw = self.to_boolean_if_needed(raw)
        mdf = from_pandas(raw, chunk_size=6)
        mseries = mdf[2]

        def ndarray_values():
            # Fresh NumPy array holding 1..10.
            return np.array(one_to_ten(list))

        # Mars side receives the raw ndarray first, then `from_array` of it;
        # the pandas side always receives the raw ndarray.
        mars_wrappers = (lambda arr: arr, from_array)

        for wrap in mars_wrappers:
            tileable = getattr(mdf, op_name)(wrap(ndarray_values()), axis=0)
            actual = fetch(tileable)
            wanted = getattr(raw, op_name)(ndarray_values(), axis=0)
            pd.testing.assert_frame_equal(wanted, actual)

        for wrap in mars_wrappers:
            tileable = getattr(mseries, op_name)(wrap(ndarray_values()))
            actual = fetch(tileable)
            wanted = getattr(raw[2], op_name)(ndarray_values())
            pd.testing.assert_series_equal(wanted, actual)
Пример #27
0
    def testSortValuesExecution(self):
        """Run the ``sort_values`` checks for each ``PSRS_DISTINCT_COL`` setting.

        The environment variable is set to ``'0'`` and, except on Windows,
        also to ``'1'``.  Its previous state is restored afterwards so the
        setting does not leak into tests that run later in the same process.
        """
        env_key = 'PSRS_DISTINCT_COL'
        distinct_opts = ['0'] if sys.platform.lower().startswith('win') else [
            '0', '1'
        ]
        previous = os.environ.get(env_key)
        try:
            for add_distinct in distinct_opts:
                os.environ[env_key] = add_distinct
                self._check_sort_values()
        finally:
            # Put the environment back exactly as it was found.
            if previous is None:
                os.environ.pop(env_key, None)
            else:
                os.environ[env_key] = previous

    def _check_sort_values(self):
        """Run every ``sort_values`` scenario once and compare with pandas."""

        def fetch(tileable, executor=None):
            # Execute a Mars object and return the concatenated result;
            # defaults to the executor stored on the test instance.
            executor = self.executor if executor is None else executor
            return executor.execute_dataframe(tileable, concat=True)[0]

        df = pd.DataFrame(np.random.rand(100, 10),
                          columns=['a' + str(i) for i in range(10)])

        # test one chunk
        mdf = DataFrame(df)
        result = fetch(mdf.sort_values('a0'))
        expected = df.sort_values('a0')

        pd.testing.assert_frame_equal(result, expected)

        result = fetch(mdf.sort_values(['a6', 'a7'], ascending=False))
        expected = df.sort_values(['a6', 'a7'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        # test psrs
        mdf = DataFrame(df, chunk_size=10)
        result = fetch(mdf.sort_values('a0'))
        expected = df.sort_values('a0')

        pd.testing.assert_frame_equal(result, expected)

        result = fetch(mdf.sort_values(['a3', 'a4']))
        expected = df.sort_values(['a3', 'a4'])

        pd.testing.assert_frame_equal(result, expected)

        # test ascending=False
        result = fetch(mdf.sort_values(['a0', 'a1'], ascending=False))
        expected = df.sort_values(['a0', 'a1'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        result = fetch(mdf.sort_values(['a7'], ascending=False))
        expected = df.sort_values(['a7'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        # test multiindex
        df2 = df.copy(deep=True)
        df2.columns = pd.MultiIndex.from_product(
            [list('AB'), list('CDEFG')])
        mdf = DataFrame(df2, chunk_size=10)

        result = fetch(mdf.sort_values([('A', 'C')]))
        expected = df2.sort_values([('A', 'C')])

        pd.testing.assert_frame_equal(result, expected)

        # test rechunk
        mdf = DataFrame(df, chunk_size=3)
        result = fetch(mdf.sort_values('a0'))
        expected = df.sort_values('a0')

        pd.testing.assert_frame_equal(result, expected)

        result = fetch(mdf.sort_values(['a3', 'a4']))
        expected = df.sort_values(['a3', 'a4'])

        pd.testing.assert_frame_equal(result, expected)

        # test other types
        raw = pd.DataFrame(
            {
                'a': np.random.rand(10),
                'b': np.random.randint(1000, size=10),
                'c': np.random.rand(10),
                'd': [np.random.bytes(10) for _ in range(10)],
                'e': [pd.Timestamp(f'201{i}') for i in range(10)],
                'f': [pd.Timedelta(f'{i} days') for i in range(10)]
            }, )
        mdf = DataFrame(raw, chunk_size=3)

        for label in raw.columns:
            result = fetch(mdf.sort_values(label))
            expected = raw.sort_values(label)
            pd.testing.assert_frame_equal(result, expected)

        result = fetch(mdf.sort_values(['a', 'b', 'e'], ascending=False))
        expected = raw.sort_values(['a', 'b', 'e'], ascending=False)

        pd.testing.assert_frame_equal(result, expected)

        # test nan
        df = pd.DataFrame({
            'col1': ['A', 'A', 'B', 'B', 'D', 'C'],
            'col2': [2, 1, 9, np.nan, 7, 4],
            'col3': [0, 1, 9, 4, 2, 3],
        })
        mdf = DataFrame(df)
        result = fetch(mdf.sort_values(['col2']))
        expected = df.sort_values(['col2'])

        pd.testing.assert_frame_equal(result, expected)

        mdf = DataFrame(df, chunk_size=3)
        result = fetch(mdf.sort_values(['col2']))
        expected = df.sort_values(['col2'])

        pd.testing.assert_frame_equal(result, expected)

        # test ignore_index, using a separate executor backed by a new
        # session's context
        executor = ExecutorForTest(storage=new_session().context)

        df = pd.DataFrame(np.random.rand(10, 3),
                          columns=['a' + str(i) for i in range(3)])

        mdf = DataFrame(df, chunk_size=3)
        result = fetch(mdf.sort_values(['a0', 'a1'], ignore_index=True),
                       executor=executor)
        try:
            expected = df.sort_values(['a0', 'a1'], ignore_index=True)
        except TypeError:
            # Fallback for a pandas whose `sort_values` does not accept
            # `ignore_index`: sort first, then reset the index by hand.
            expected = df.sort_values(['a0', 'a1'])
            expected.index = pd.RangeIndex(len(expected))

        pd.testing.assert_frame_equal(result, expected)

        # test inplace
        mdf = DataFrame(df)
        mdf.sort_values('a0', inplace=True)
        result = fetch(mdf)
        df.sort_values('a0', inplace=True)

        pd.testing.assert_frame_equal(result, df)

        # test unknown shape
        df = pd.DataFrame({
            'a': list(range(10)),
            'b': np.random.random(10)
        })
        mdf = DataFrame(df, chunk_size=4)
        filtered = mdf[mdf['a'] > 2]
        result = fetch(filtered.sort_values(by='b'))

        pd.testing.assert_frame_equal(result,
                                      df[df['a'] > 2].sort_values(by='b'))

        # test Series.sort_values
        raw = pd.Series(np.random.rand(10))
        series = Series(raw)
        result = fetch(series.sort_values())
        expected = raw.sort_values()

        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=3)
        result = fetch(series.sort_values())
        expected = raw.sort_values()

        pd.testing.assert_series_equal(result, expected)

        series = Series(raw, chunk_size=2)
        result = fetch(series.sort_values(ascending=False))
        expected = raw.sort_values(ascending=False)

        pd.testing.assert_series_equal(result, expected)
Пример #28
0
class TestUnary(TestBase):
    """Execution tests for unary operators and ufuncs on DataFrame / Series."""

    def setUp(self):
        """Run the base-class setup, then create the executor used by tests."""
        super().setUp()
        self.executor = ExecutorForTest()

    def testAbs(self):
        """Both ``df.abs()`` and the builtin ``abs(df)`` must match pandas."""
        raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
        chunked = from_pandas(raw, chunk_size=5)

        # method form
        fetched = self.executor.execute_dataframe(chunked.abs(),
                                                  concat=True)[0]
        wanted = raw.abs()
        pd.testing.assert_frame_equal(wanted, fetched)

        # builtin form, compared against the same expected frame
        fetched = self.executor.execute_dataframe(abs(chunked),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(wanted, fetched)

    def testNot(self):
        """The ``~`` operator on a boolean frame must match pandas."""
        raw = pd.DataFrame(
            np.random.uniform(low=-1, high=1, size=(10, 10)) > 0)
        chunked = from_pandas(raw, chunk_size=5)

        fetched = self.executor.execute_dataframe(~chunked, concat=True)[0]
        wanted = ~raw
        pd.testing.assert_frame_equal(wanted, fetched)

    def testUfunc(self):
        """Apply numpy ufuncs and their ``mt`` counterparts to chunked data.

        Every function is tried twice per input: once through the ``mt``
        function and once by calling the numpy function directly on the
        chunked object. Both results are compared with the numpy function
        applied to the raw pandas object.
        """
        frame_raw = pd.DataFrame(np.random.uniform(size=(10, 10)),
                                 index=pd.RangeIndex(9, -1, -1))
        frame = from_pandas(frame_raw, chunk_size=5)

        ser_raw = pd.Series(np.random.uniform(size=10),
                            index=pd.RangeIndex(9, -1, -1))
        ser = from_pandas_series(ser_raw, chunk_size=5)

        # Functions that exist under the same name in ``np`` and ``mt``;
        # ``around`` sits between the two groups because it needs an
        # extra ``decimals`` argument.
        leading = ['abs', 'log', 'log2', 'log10', 'sin', 'cos', 'tan',
                   'sinh', 'cosh', 'tanh', 'arcsin', 'arccos', 'arctan',
                   'arcsinh', 'arccosh', 'arctanh', 'radians', 'degrees',
                   'ceil', 'floor']
        trailing = ['exp', 'exp2', 'expm1', 'sqrt']

        func_pairs = [(getattr(np, name), getattr(mt, name))
                      for name in leading]
        func_pairs.append((partial(np.around, decimals=2),
                           partial(mt.around, decimals=2)))
        func_pairs.extend((getattr(np, name), getattr(mt, name))
                          for name in trailing)

        for raw, chunked in ((frame_raw, frame), (ser_raw, ser)):
            # pick the comparator once per input kind
            if isinstance(raw, pd.DataFrame):
                assert_equal = pd.testing.assert_frame_equal
            else:
                assert_equal = pd.testing.assert_series_equal

            for np_func, mt_func in func_pairs:
                fetched = self.executor.execute_tensor(mt_func(chunked),
                                                       concat=True)[0]
                wanted = np_func(raw)
                assert_equal(fetched, wanted)

                # test numpy ufunc
                fetched = self.executor.execute_tensor(np_func(chunked),
                                                       concat=True)[0]
                assert_equal(fetched, wanted)
Пример #29
0
    def testAppendExecution(self):
        """Compare ``append`` on chunked DataFrame / Series with pandas.

        Plain appends are executed with ``self.executor``; every append that
        passes ``ignore_index=True`` is executed with a separate executor
        built on a new session's context.
        """
        session_executor = ExecutorForTest(storage=new_session().context)

        def run(tileable, use_session=False):
            """Execute *tileable* and return the concatenated result."""
            runner = session_executor if use_session else self.executor
            return runner.execute_dataframe(tileable, concat=True)[0]

        left = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
        right = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))

        # same chunking on both sides first, then a different one on the right
        for right_chunk in (3, 2):
            mleft = from_pandas(left, chunk_size=3)
            mright = from_pandas(right, chunk_size=right_chunk)

            appended = mleft.append(mright)
            wanted = left.append(right)
            pd.testing.assert_frame_equal(wanted, run(appended))

            appended = mleft.append(mright, ignore_index=True)
            wanted = left.append(right, ignore_index=True)
            pd.testing.assert_frame_equal(wanted,
                                          run(appended, use_session=True))

        # append a list of frames (mleft / mright keep the 3 / 2 chunking)
        extra = pd.DataFrame(np.random.rand(8, 4), columns=list('ABCD'))
        mextra = from_pandas(extra, chunk_size=3)
        wanted = left.append([right, extra])
        appended = mleft.append([mright, mextra])
        pd.testing.assert_frame_equal(wanted, run(appended))

        # append a single row given as a dict
        appended = mleft.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        wanted = left.append(dict(A=1, B=2, C=3, D=4), ignore_index=True)
        pd.testing.assert_frame_equal(wanted, run(appended, use_session=True))

        # test for series
        s_left = pd.Series(np.random.rand(10,))
        s_right = pd.Series(np.random.rand(10,))

        for right_chunk in (3, 2):
            ms_left = series_from_pandas(s_left, chunk_size=3)
            ms_right = series_from_pandas(s_right, chunk_size=right_chunk)

            appended = ms_left.append(ms_right)
            wanted = s_left.append(s_right)
            pd.testing.assert_series_equal(wanted, run(appended))

            appended = ms_left.append(ms_right, ignore_index=True)
            wanted = s_left.append(s_right, ignore_index=True)
            pd.testing.assert_series_equal(wanted,
                                           run(appended, use_session=True))

        # append a list of series
        s_extra = pd.Series(np.random.rand(4,))
        ms_extra = series_from_pandas(s_extra, chunk_size=2)
        wanted = s_left.append([s_right, s_extra])
        appended = ms_left.append([ms_right, ms_extra])
        pd.testing.assert_series_equal(wanted, run(appended))
Пример #30
0
class Test(TestBase):
    def setUp(self):
        """Run the base-class setup, then create the executor used by tests."""
        super().setUp()
        self.executor = ExecutorForTest()

    @require_cudf
    def testToGPUExecution(self):
        """Data moved to GPU must come back as cudf objects with equal content."""
        raw_df = pd.DataFrame(np.random.rand(20, 30),
                              index=np.arange(20, 0, -1))
        chunked_df = from_pandas_df(raw_df, chunk_size=(13, 21))
        gpu_df = to_gpu(chunked_df)

        fetched = self.executor.execute_dataframe(gpu_df, concat=True)[0]
        self.assertIsInstance(fetched, cudf.DataFrame)
        pd.testing.assert_frame_equal(fetched.to_pandas(), raw_df)

        # Series: first column of the frame, moved with the method form
        raw_series = raw_df.iloc[:, 0]
        chunked_series = from_pandas_series(raw_series)
        gpu_series = chunked_series.to_gpu()

        fetched = self.executor.execute_dataframe(gpu_series, concat=True)[0]
        self.assertIsInstance(fetched, cudf.Series)
        pd.testing.assert_series_equal(fetched.to_pandas(), raw_series)

    @require_cudf
    def testToCPUExecution(self):
        """A GPU round trip (``to_gpu`` then ``to_cpu``) must return pandas data."""
        raw_df = pd.DataFrame(np.random.rand(20, 30),
                              index=np.arange(20, 0, -1))
        round_trip_df = to_cpu(
            to_gpu(from_pandas_df(raw_df, chunk_size=(13, 21))))

        fetched = self.executor.execute_dataframe(round_trip_df,
                                                  concat=True)[0]
        self.assertIsInstance(fetched, pd.DataFrame)
        pd.testing.assert_frame_equal(fetched, raw_df)

        raw_series = raw_df.iloc[:, 0]
        # NOTE(review): a two-element chunk_size is passed for a 1-D Series;
        # confirm this is intended rather than copied from the frame case.
        round_trip_series = to_cpu(
            to_gpu(from_pandas_series(raw_series, chunk_size=(13, 21))))

        fetched = self.executor.execute_dataframe(round_trip_series,
                                                  concat=True)[0]
        self.assertIsInstance(fetched, pd.Series)
        pd.testing.assert_series_equal(fetched, raw_series)

    def testRechunkExecution(self):
        """Rechunking must leave DataFrame and Series contents unchanged."""
        raw = pd.DataFrame(np.random.rand(8, 10))
        chunked = from_pandas_df(pd.DataFrame(raw), chunk_size=3)
        fetched = self.executor.execute_dataframe(chunked.rechunk((3, 4)),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(raw, fetched)

        # random integer index labels and random bytes as column labels
        raw = pd.DataFrame(np.random.rand(10, 10),
                           index=np.random.randint(-100, 100, size=(10, )),
                           columns=[np.random.bytes(10) for _ in range(10)])
        chunked = from_pandas_df(raw)
        fetched = self.executor.execute_dataframe(chunked.rechunk(5),
                                                  concat=True)[0]
        pd.testing.assert_frame_equal(raw, fetched)

        # test Series rechunk execution.
        raw = pd.Series(np.random.rand(10, ))
        chunked = from_pandas_series(raw)
        for new_size in (3, 1):
            fetched = self.executor.execute_dataframe(
                chunked.rechunk(new_size), concat=True)[0]
            pd.testing.assert_series_equal(raw, fetched)

    def testResetIndexExecution(self):
        """Check ``reset_index`` for DataFrame and Series against pandas.

        Covers single- and multi-chunk inputs, ``drop=True``, a MultiIndex
        with ``level`` / ``col_level`` / ``col_fill``, a Series with ``name``
        and with ``drop``, and inputs produced by adding two differently
        indexed operands (those are run through a session).
        """
        data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5),
                             ('mammal', np.nan)],
                            index=['falcon', 'parrot', 'lion', 'monkey'],
                            columns=('class', 'max_speed'))
        # default chunking
        df = from_pandas_df(data)
        df2 = df_reset_index(df)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index()
        pd.testing.assert_frame_equal(result, expected)

        # explicit chunk_size=2
        df = from_pandas_df(data, chunk_size=2)
        df2 = df_reset_index(df)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index()
        pd.testing.assert_frame_equal(result, expected)

        # chunk_size=1, dropping the old index instead of keeping it
        df = from_pandas_df(data, chunk_size=1)
        df2 = df_reset_index(df, drop=True)
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(drop=True)
        pd.testing.assert_frame_equal(result, expected)

        # two-level row index; only the 'class' level is reset
        index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                           ('bird', 'parrot'),
                                           ('mammal', 'lion'),
                                           ('mammal', 'monkey')],
                                          names=['class', 'name'])
        data = pd.DataFrame([('bird', 389.0), ('bird', 24.0), ('mammal', 80.5),
                             ('mammal', np.nan)],
                            index=index,
                            columns=('type', 'max_speed'))
        df = from_pandas_df(data, chunk_size=1)
        df2 = df_reset_index(df, level='class')
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(level='class')
        pd.testing.assert_frame_equal(result, expected)

        # two-level column index combined with col_level / col_fill
        columns = pd.MultiIndex.from_tuples([('speed', 'max'),
                                             ('species', 'type')])
        df = from_pandas_df(data, chunk_size=2)
        df2 = df_reset_index(df,
                             level='class',
                             col_level=1,
                             col_fill='species')
        # NOTE(review): the pandas frame's columns are replaced after `df`
        # and `df2` were built from it but before execution; confirm this
        # ordering is intentional.
        data.columns = columns
        result = self.executor.execute_dataframe(df2, concat=True)[0]
        expected = data.reset_index(level='class',
                                    col_level=1,
                                    col_fill='species')
        pd.testing.assert_frame_equal(result, expected)

        # Test Series

        s = pd.Series([1, 2, 3, 4],
                      name='foo',
                      index=pd.Index(['a', 'b', 'c', 'd'], name='idx'))

        # with `name` the result is compared as a frame
        series = from_pandas_series(s)
        s2 = series_reset_index(series, name='bar')
        result = self.executor.execute_dataframe(s2, concat=True)[0]
        expected = s.reset_index(name='bar')
        pd.testing.assert_frame_equal(result, expected)

        # with `drop=True` the result is compared as a series
        series = from_pandas_series(s, chunk_size=2)
        s2 = series_reset_index(series, drop=True)
        result = self.executor.execute_dataframe(s2, concat=True)[0]
        expected = s.reset_index(drop=True)
        pd.testing.assert_series_equal(result, expected)

        # Test Unknown shape
        sess = new_session()
        data1 = pd.DataFrame(np.random.rand(10, 3),
                             index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
        df1 = from_pandas_df(data1, chunk_size=5)
        data2 = pd.DataFrame(np.random.rand(10, 3),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        df2 = from_pandas_df(data2, chunk_size=6)
        df = (df1 + df2).reset_index()
        result = sess.run(df)
        # the two operands' index labels together cover 0..11, hence 12 rows
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Inconsistent with Pandas when input dataframe's shape is unknown.
        # Rows are therefore sorted by the first column and only the raw
        # values are compared.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

        # same scenario with Series operands
        data1 = pd.Series(np.random.rand(10, ),
                          index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9])
        series1 = from_pandas_series(data1, chunk_size=3)
        data2 = pd.Series(np.random.rand(10, ),
                          index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
        series2 = from_pandas_series(data2, chunk_size=3)
        df = (series1 + series2).reset_index()
        result = sess.run(df)
        pd.testing.assert_index_equal(result.index, pd.RangeIndex(12))
        # Inconsistent with Pandas when input dataframe's shape is unknown.
        result = result.sort_values(by=result.columns[0])
        expected = (data1 + data2).reset_index()
        np.testing.assert_array_equal(result.to_numpy(), expected.to_numpy())

    def testSeriesMapExecution(self):
        """Compare ``Series.map`` with pandas for dict, callable and Series args.

        Also checks that building the map raises ``ValueError`` when no
        ``dtype`` is given for an int-to-int dict mapping.
        """
        def fetch(tileable):
            """Execute *tileable* and return the concatenated result."""
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.Series(np.arange(10))
        chunked = from_pandas_series(raw, chunk_size=7)

        with self.assertRaises(ValueError):
            # cannot infer dtype, the inferred is int,
            # but actually it is float
            # just due to nan
            chunked.map({5: 10})

        # explicit dtype, partial dict
        fetched = fetch(chunked.map({5: 10}, dtype=float))
        pd.testing.assert_series_equal(fetched, raw.map({5: 10}))

        # explicit dtype, dict with keys 0..6
        fetched = fetch(chunked.map({i: 10 + i for i in range(7)},
                                    dtype=float))
        pd.testing.assert_series_equal(
            fetched, raw.map({i: 10 + i for i in range(7)}))

        # explicit dtype together with na_action
        fetched = fetch(chunked.map({5: 10}, dtype=float, na_action='ignore'))
        pd.testing.assert_series_equal(
            fetched, raw.map({5: 10}, na_action='ignore'))

        # dtype can be inferred
        fetched = fetch(chunked.map({5: 10.}))
        pd.testing.assert_series_equal(fetched, raw.map({5: 10.}))

        # callable with explicit dtype
        fetched = fetch(chunked.map(lambda x: x + 1, dtype=int))
        pd.testing.assert_series_equal(fetched, raw.map(lambda x: x + 1))

        def add_one(x: int) -> float:
            return x + 1.

        # dtype can be inferred for function
        fetched = fetch(chunked.map(add_one))
        pd.testing.assert_series_equal(fetched, raw.map(lambda x: x + 1.))

        # test arg is a md.Series
        lookup_raw = pd.Series([10], index=[5])
        lookup = from_pandas_series(lookup_raw)

        fetched = fetch(chunked.map(lookup, dtype=float))
        pd.testing.assert_series_equal(fetched, raw.map(lookup_raw))

        # test arg is a md.Series, and dtype can be inferred
        lookup_raw = pd.Series([10.], index=[5])
        lookup = from_pandas_series(lookup_raw)

        fetched = fetch(chunked.map(lookup))
        pd.testing.assert_series_equal(fetched, raw.map(lookup_raw))

        # test str
        raw = pd.Series(['a', 'b', 'c', 'd'])
        chunked = from_pandas_series(raw, chunk_size=2)

        fetched = fetch(chunked.map({'c': 'e'}))
        pd.testing.assert_series_equal(fetched, raw.map({'c': 'e'}))

    def testDescribeExecution(self):
        """Compare ``describe`` on chunked Series / DataFrame with pandas.

        Exercises single- and multi-chunk inputs, an empty ``percentiles``
        list, ``include=np.float64``, and checks that a percentile of 1.1
        raises ``ValueError``.
        """
        s_raw = pd.Series(np.random.rand(10))

        # test one chunk
        series = from_pandas_series(s_raw, chunk_size=10)

        r = series.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe()
        pd.testing.assert_series_equal(result, expected)

        r = series.describe(percentiles=[])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[])
        pd.testing.assert_series_equal(result, expected)

        # test multi chunks
        series = from_pandas_series(s_raw, chunk_size=3)

        r = series.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe()
        pd.testing.assert_series_equal(result, expected)

        r = series.describe(percentiles=[])
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[])
        pd.testing.assert_series_equal(result, expected)

        # four float columns plus one integer column
        df_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('abcd'))
        df_raw['e'] = np.random.randint(100, size=10)

        # test one chunk
        df = from_pandas_df(df_raw, chunk_size=10)

        r = df.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe()
        pd.testing.assert_frame_equal(result, expected)

        # Single-chunk DataFrame with ``include``: this path was previously
        # not exercised, because the check at this position only targeted
        # ``series``.
        r = df.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_frame_equal(result, expected)

        # ``series`` here is still the multi-chunk Series created above
        r = series.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = s_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_series_equal(result, expected)

        # test multi chunks
        df = from_pandas_df(df_raw, chunk_size=3)

        r = df.describe()
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe()
        pd.testing.assert_frame_equal(result, expected)

        r = df.describe(percentiles=[], include=np.float64)
        result = self.executor.execute_dataframe(r, concat=True)[0]
        expected = df_raw.describe(percentiles=[], include=np.float64)
        pd.testing.assert_frame_equal(result, expected)

        # an invalid percentile must be rejected when the call is made
        with self.assertRaises(ValueError):
            df.describe(percentiles=[1.1])

    def testDataFrameFillNAExecution(self):
        """Compare ``DataFrame.fillna`` / ``ffill`` / ``bfill`` with pandas.

        The input is an all-NaN 20x10 frame with a handful of randomly
        placed integer values; fills are tried with a scalar, a method,
        another DataFrame and a Series.
        """
        def fetch(tileable):
            """Execute *tileable* and return the concatenated result."""
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.DataFrame(np.nan,
                           index=range(0, 20),
                           columns=list('ABCDEFGHIJ'))
        # at most 20 cells get a value (random positions may repeat)
        for _ in range(20):
            raw.iloc[random.randint(0, 19),
                     random.randint(0, 9)] = random.randint(0, 99)

        fill_frame_raw = pd.DataFrame(
            np.random.randint(0, 100, (10, 7)).astype(np.float32),
            columns=list('ABCDEFG'))

        # test DataFrame single chunk with numeric fill
        chunked = from_pandas_df(raw)
        fetched = fetch(chunked.fillna(1))
        pd.testing.assert_frame_equal(fetched, raw.fillna(1))

        # test DataFrame single chunk with value as single chunk
        chunked = from_pandas_df(raw)
        fill_frame = from_pandas_df(fill_frame_raw)
        fetched = fetch(chunked.fillna(fill_frame))
        pd.testing.assert_frame_equal(fetched, raw.fillna(fill_frame_raw))

        # test chunked with numeric fill
        chunked = from_pandas_df(raw, chunk_size=3)
        fetched = fetch(chunked.fillna(1))
        pd.testing.assert_frame_equal(fetched, raw.fillna(1))

        # test inplace tile
        chunked = from_pandas_df(raw, chunk_size=3)
        chunked.fillna(1, inplace=True)
        fetched = fetch(chunked)
        pd.testing.assert_frame_equal(fetched, raw.fillna(1))

        # forward / backward fill without limit: along axis=0 via
        # ``fillna(method=...)``, along axis=1 via ``ffill`` / ``bfill``
        method_fills = (
            lambda d: d.fillna(method='pad'),
            lambda d: d.fillna(method='backfill'),
            lambda d: d.ffill(axis=1),
            lambda d: d.bfill(axis=1),
        )
        for fill in method_fills:
            chunked = from_pandas_df(raw, chunk_size=3)
            fetched = fetch(fill(chunked))
            pd.testing.assert_frame_equal(fetched, fill(raw))

        # test fill with dataframe
        chunked = from_pandas_df(raw, chunk_size=3)
        fill_frame = from_pandas_df(fill_frame_raw, chunk_size=4)
        fetched = fetch(chunked.fillna(fill_frame))
        pd.testing.assert_frame_equal(fetched, raw.fillna(fill_frame_raw))

        # test fill with series
        fill_series_raw = pd.Series(
            np.random.randint(0, 100, (10, )).astype(np.float32),
            index=list('ABCDEFGHIJ'))
        chunked = from_pandas_df(raw, chunk_size=3)
        fill_series = from_pandas_series(fill_series_raw, chunk_size=4)
        fetched = fetch(chunked.fillna(fill_series))
        pd.testing.assert_frame_equal(fetched, raw.fillna(fill_series_raw))

    def testSeriesFillNAExecution(self):
        """Compare ``Series.fillna`` with pandas.

        The input is an all-NaN Series of length 20 with at most three
        randomly placed integer values; fills are tried with a scalar, a
        method and another Series.
        """
        def fetch(tileable):
            """Execute *tileable* and return the concatenated result."""
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw = pd.Series(np.nan, index=range(20))
        for _ in range(3):
            raw.iloc[random.randint(0, 19)] = random.randint(0, 99)
        fill_raw = pd.Series(
            np.random.randint(0, 100, (10, )).astype(np.float32))

        # single chunk with numeric fill
        chunked = from_pandas_series(raw)
        fetched = fetch(chunked.fillna(1))
        pd.testing.assert_series_equal(fetched, raw.fillna(1))

        # test Series single chunk with value as single chunk
        chunked = from_pandas_series(raw)
        fill_values = from_pandas_series(fill_raw)
        fetched = fetch(chunked.fillna(fill_values))
        pd.testing.assert_series_equal(fetched, raw.fillna(fill_raw))

        # test chunked with numeric fill
        chunked = from_pandas_series(raw, chunk_size=3)
        fetched = fetch(chunked.fillna(1))
        pd.testing.assert_series_equal(fetched, raw.fillna(1))

        # test inplace tile
        chunked = from_pandas_series(raw, chunk_size=3)
        chunked.fillna(1, inplace=True)
        fetched = fetch(chunked)
        pd.testing.assert_series_equal(fetched, raw.fillna(1))

        # test forward, then backward fill in axis=0 without limit
        for how in ('pad', 'backfill'):
            chunked = from_pandas_series(raw, chunk_size=3)
            fetched = fetch(chunked.fillna(method=how))
            pd.testing.assert_series_equal(fetched, raw.fillna(method=how))

        # test fill with series
        chunked = from_pandas_series(raw, chunk_size=3)
        fill_values = from_pandas_series(fill_raw, chunk_size=4)
        fetched = fetch(chunked.fillna(fill_values))
        pd.testing.assert_series_equal(fetched, raw.fillna(fill_raw))

    def testDataFrameApplyExecute(self):
        """Compare ``DataFrame.apply`` / ``transform`` with pandas.

        ``options.chunk_store_limit`` is set to 20 while the test runs and
        is restored afterwards, even if an assertion fails.
        """
        col_names = [chr(ord('A') + i) for i in range(10)]
        raw = pd.DataFrame({c: [i**2 for i in range(20)] for c in col_names})

        frame_eq = pd.testing.assert_frame_equal
        series_eq = pd.testing.assert_series_equal

        saved_limit = options.chunk_store_limit
        try:
            options.chunk_store_limit = 20

            chunked = from_pandas_df(raw, chunk_size=5)

            def verify(call, assert_equal):
                """Run *call* on the chunked and the pandas frame, compare."""
                fetched = self.executor.execute_dataframe(call(chunked),
                                                          concat=True)[0]
                assert_equal(fetched, call(raw))

            # string and ufunc arguments
            verify(lambda d: d.apply('ffill'), frame_eq)
            verify(lambda d: d.apply(np.sqrt), frame_eq)

            # callable returning a Series for every column
            verify(lambda d: d.apply(lambda x: pd.Series([1, 2])), frame_eq)

            # reductions along either axis, axis given by name
            verify(lambda d: d.apply(np.sum, axis='index'), series_eq)
            verify(lambda d: d.apply(np.sum, axis='columns'), series_eq)

            # row-wise callables returning a list / a labelled Series
            verify(lambda d: d.apply(lambda x: [1, 2], axis=1), series_eq)
            verify(
                lambda d: d.apply(
                    lambda x: pd.Series([1, 2], index=['foo', 'bar']),
                    axis=1),
                frame_eq)

            # explicit result_type values
            verify(
                lambda d: d.apply(lambda x: [1, 2],
                                  axis=1,
                                  result_type='expand'),
                frame_eq)
            verify(
                lambda d: d.apply(lambda x: list(range(10)),
                                  axis=1,
                                  result_type='reduce'),
                series_eq)
            verify(
                lambda d: d.apply(lambda x: list(range(10)),
                                  axis=1,
                                  result_type='broadcast'),
                frame_eq)

            # transform along both axes
            verify(lambda d: d.transform(lambda x: list(range(len(x)))),
                   frame_eq)
            verify(
                lambda d: d.transform(lambda x: list(range(len(x))), axis=1),
                frame_eq)
        finally:
            options.chunk_store_limit = saved_limit

    def testSeriesApplyExecute(self):
        """Compare ``Series.apply`` / ``transform`` with pandas."""
        labels = [chr(ord('A') + i) for i in range(20)]
        raw = pd.Series([i**2 for i in range(20)], index=labels)

        chunked = from_pandas_series(raw, chunk_size=5)

        def verify(call):
            """Run *call* on the chunked and the pandas series, compare."""
            fetched = self.executor.execute_dataframe(call(chunked),
                                                      concat=True)[0]
            pd.testing.assert_series_equal(fetched, call(raw))

        # method name given as a string, with positional args
        verify(lambda s: s.apply('add', args=(1, )))

        # numpy ufunc, as an object and by name
        verify(lambda s: s.apply(np.sqrt))
        verify(lambda s: s.apply('sqrt'))

        # callable returning a list per element
        verify(lambda s: s.apply(lambda x: [x, x + 1], convert_dtype=False))

        verify(lambda s: s.transform(lambda x: x + 1))

    def testStringMethodExecution(self):
        """Compare the ``.str`` accessor on a chunked Series with pandas."""
        raw = pd.Series(['s1,s2', 'ef,', 'dd', np.nan])
        raw_tripled = pd.concat([raw, raw, raw])

        chunked = from_pandas_series(raw, chunk_size=2)
        chunked_tripled = from_pandas_series(raw_tripled, chunk_size=2)

        frame_eq = pd.testing.assert_frame_equal
        series_eq = pd.testing.assert_series_equal

        def verify(call, assert_equal):
            """Run *call* on the chunked and the pandas series, compare."""
            fetched = self.executor.execute_dataframe(call(chunked),
                                                      concat=True)[0]
            assert_equal(fetched, call(raw))

        # test getitem
        verify(lambda s: s.str[:3], series_eq)

        # test split, expand=False
        verify(lambda s: s.str.split(',', n=2), series_eq)

        # test split, expand=True
        verify(lambda s: s.str.split(',', expand=True, n=1), frame_eq)

        # test rsplit
        verify(lambda s: s.str.rsplit(',', expand=True, n=1), frame_eq)

        # test cat all data
        joined = chunked_tripled.str.cat(sep='/', na_rep='e')
        fetched = self.executor.execute_dataframe(joined, concat=True)[0]
        self.assertEqual(fetched, raw_tripled.str.cat(sep='/', na_rep='e'))

        # test cat list
        verify(lambda s: s.str.cat(['a', 'b', np.nan, 'c']), series_eq)

        # test cat series
        verify(lambda s: s.str.cat(s.str.capitalize(), join='outer'),
               series_eq)

        # test extractall
        verify(
            lambda s: s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)"),
            frame_eq)

        # test extract, expand=False
        verify(lambda s: s.str.extract(r'[ab](\d)', expand=False), series_eq)

        # test extract, expand=True
        verify(lambda s: s.str.extract(r'[ab](\d)', expand=True), frame_eq)

    def testDatetimeMethodExecution(self):
        # Compare ``.dt`` accessor results on series chunked with
        # chunk_size=2 against plain pandas, first for datetime values and
        # then for timedelta values; both inputs end with a NaN entry.
        def run(tileable):
            # Execute and take the first element of the returned results.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        # datetime input
        timestamps = pd.Series([
            pd.Timestamp('2020-1-1'),
            pd.Timestamp('2020-2-1'),
            np.nan,
        ])
        chunked_timestamps = from_pandas_series(timestamps, chunk_size=2)

        pd.testing.assert_series_equal(
            run(chunked_timestamps.dt.year),
            timestamps.dt.year)

        pd.testing.assert_series_equal(
            run(chunked_timestamps.dt.strftime('%m-%d-%Y')),
            timestamps.dt.strftime('%m-%d-%Y'))

        # timedelta input
        deltas = pd.Series([
            pd.Timedelta('1 days'),
            pd.Timedelta('3 days'),
            np.nan,
        ])
        chunked_deltas = from_pandas_series(deltas, chunk_size=2)

        pd.testing.assert_series_equal(
            run(chunked_deltas.dt.days),
            deltas.dt.days)