Exemplo n.º 1
0
    def testInitializerExecution(self):
        """Construct md DataFrame/Series/Index objects and compare with pandas."""

        def fetch(tileable):
            # Run through the test executor; element 0 of the returned
            # sequence is the value compared against pandas below.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        raw_2d = np.random.rand(20, 30)

        # DataFrame built from a pandas frame whose index is given as two arrays.
        expected_df = pd.DataFrame(
            raw_2d, index=[np.arange(20), np.arange(20, 0, -1)])
        mars_df = md.DataFrame(expected_df, chunk_size=(15, 10))
        pd.testing.assert_frame_equal(expected_df, fetch(mars_df))

        # DataFrame built from an ndarray with an md.date_range index.
        mars_df = md.DataFrame(
            raw_2d, index=md.date_range('2020-1-1', periods=20))
        actual_df = fetch(mars_df)
        expected_df = pd.DataFrame(
            raw_2d, index=pd.date_range('2020-1-1', periods=20))
        pd.testing.assert_frame_equal(actual_df, expected_df)

        raw_1d = np.random.rand(20)

        # Series built from a pandas series with the same two-array index.
        expected_series = pd.Series(
            raw_1d, index=[np.arange(20), np.arange(20, 0, -1)], name='a')
        mars_series = md.Series(expected_series, chunk_size=7)
        pd.testing.assert_series_equal(expected_series, fetch(mars_series))

        # Series built from an ndarray with an md.date_range index.
        mars_series = md.Series(
            raw_1d, index=md.date_range('2020-1-1', periods=20))
        actual_series = fetch(mars_series)
        expected_series = pd.Series(
            raw_1d, index=pd.date_range('2020-1-1', periods=20))
        pd.testing.assert_series_equal(actual_series, expected_series)

        # Index built by wrapping an md.Index in another md.Index.
        intervals = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
        mars_index = md.Index(md.Index(intervals))
        pd.testing.assert_index_equal(intervals, fetch(mars_index))
Exemplo n.º 2
0
    def testSeriesFromTensor(self):
        """Build md.Series objects from tensors and compare with pandas.Series."""

        def fetch(tileable):
            # Run through the test executor and take element 0 of the result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        values = np.random.rand(10)

        # Named series from a tensor created without an explicit chunk size.
        named = md.Series(mt.tensor(values), name='a')
        actual = fetch(named)
        pd.testing.assert_series_equal(actual, pd.Series(values, name='a'))

        # Unnamed series from a tensor created with chunk_size=3.
        chunked = md.Series(mt.tensor(values, chunk_size=3))
        actual = fetch(chunked)
        pd.testing.assert_series_equal(actual, pd.Series(values))

        # Series from an mt.ones tensor.
        ones_series = md.Series(mt.ones((10, ), chunk_size=4))
        actual = fetch(ones_series)
        pd.testing.assert_series_equal(actual, pd.Series(np.ones(10, )))

        # Data and index are both tensors, created with different chunk sizes.
        index_values = np.random.rand(10)
        tensor_indexed = md.Series(
            mt.tensor(values, chunk_size=3),
            name='a',
            index=mt.tensor(index_values, chunk_size=4))
        actual = fetch(tensor_indexed)
        pd.testing.assert_series_equal(
            actual, pd.Series(values, name='a', index=index_values))

        # Index supplied as an md.date_range.
        date_indexed = md.Series(
            mt.tensor(values, chunk_size=3),
            name='a',
            index=md.date_range('2020-1-1', periods=10))
        actual = fetch(date_indexed)
        wanted = pd.Series(
            values, name='a', index=pd.date_range('2020-1-1', periods=10))
        pd.testing.assert_series_equal(actual, wanted)
Exemplo n.º 3
0
def test_series_from_tensor(setup):
    """Build md.Series objects from tensors and compare with pandas.Series."""
    values = np.random.rand(10)

    # Named series from a tensor created without an explicit chunk size.
    named = md.Series(mt.tensor(values), name='a')
    actual = named.execute().fetch()
    pd.testing.assert_series_equal(actual, pd.Series(values, name='a'))

    # Unnamed series from a tensor created with chunk_size=3.
    chunked = md.Series(mt.tensor(values, chunk_size=3))
    actual = chunked.execute().fetch()
    pd.testing.assert_series_equal(actual, pd.Series(values))

    # Series from an mt.ones tensor.
    ones_series = md.Series(mt.ones((10, ), chunk_size=4))
    actual = ones_series.execute().fetch()
    pd.testing.assert_series_equal(actual, pd.Series(np.ones(10, )))

    # Data and index are both tensors, created with different chunk sizes.
    index_values = np.random.rand(10)
    tensor_indexed = md.Series(
        mt.tensor(values, chunk_size=3),
        name='a',
        index=mt.tensor(index_values, chunk_size=4))
    actual = tensor_indexed.execute().fetch()
    pd.testing.assert_series_equal(
        actual, pd.Series(values, name='a', index=index_values))

    # Index supplied as an md.date_range.
    date_indexed = md.Series(
        mt.tensor(values, chunk_size=3),
        name='a',
        index=md.date_range('2020-1-1', periods=10))
    actual = date_indexed.execute().fetch()
    wanted = pd.Series(
        values, name='a', index=pd.date_range('2020-1-1', periods=10))
    pd.testing.assert_series_equal(actual, wanted)
Exemplo n.º 4
0
def test_initializer_execution(setup):
    """Construct md DataFrame/Series/Index objects and compare with pandas."""
    raw_2d = np.random.rand(20, 30)

    # DataFrame built from a pandas frame whose index is given as two arrays.
    expected_df = pd.DataFrame(
        raw_2d, index=[np.arange(20), np.arange(20, 0, -1)])
    mars_df = md.DataFrame(expected_df, chunk_size=(15, 10))
    pd.testing.assert_frame_equal(expected_df, mars_df.execute().fetch())

    # DataFrame built from an ndarray with an md.date_range index.
    mars_df = md.DataFrame(raw_2d, index=md.date_range('2020-1-1', periods=20))
    actual_df = mars_df.execute().fetch()
    expected_df = pd.DataFrame(
        raw_2d, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_frame_equal(actual_df, expected_df)

    # DataFrame built from a dict of lists (including a NaN) with a daily
    # md.date_range index.
    mars_df = md.DataFrame(
        {"prices": [100, 101, np.nan, 100, 89, 88]},
        index=md.date_range('1/1/2010', periods=6, freq='D'))
    actual_df = mars_df.execute().fetch()
    expected_df = pd.DataFrame(
        {"prices": [100, 101, np.nan, 100, 89, 88]},
        index=pd.date_range('1/1/2010', periods=6, freq='D'))
    pd.testing.assert_frame_equal(actual_df, expected_df)

    raw_1d = np.random.rand(20)

    # Series built from a pandas series with the same two-array index.
    expected_series = pd.Series(
        raw_1d, index=[np.arange(20), np.arange(20, 0, -1)], name='a')
    mars_series = md.Series(expected_series, chunk_size=7)
    pd.testing.assert_series_equal(
        expected_series, mars_series.execute().fetch())

    # Series built from an ndarray with an md.date_range index.
    mars_series = md.Series(
        raw_1d, index=md.date_range('2020-1-1', periods=20))
    actual_series = mars_series.execute().fetch()
    expected_series = pd.Series(
        raw_1d, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_series_equal(actual_series, expected_series)

    # Index built by wrapping an md.Index in another md.Index.
    intervals = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    mars_index = md.Index(md.Index(intervals))
    pd.testing.assert_index_equal(intervals, mars_index.execute().fetch())
Exemplo n.º 5
0
    def testDateRangeExecution(self):
        """Compare executed md.date_range results with pd.date_range."""

        def check(*args, chunk_size=None, **kwargs):
            # ``chunk_size`` is forwarded to md.date_range only (and only when
            # given); every other argument goes to both md and pd unchanged.
            md_kwargs = dict(kwargs)
            if chunk_size is not None:
                md_kwargs['chunk_size'] = chunk_size
            dr = md.date_range(*args, **md_kwargs)
            actual = self.executor.execute_dataframe(dr, concat=True)[0]
            wanted = pd.date_range(*args, **kwargs)
            pd.testing.assert_index_equal(actual, wanted)

        for closed in [None, 'left', 'right']:
            # start, periods, freq
            check('2020-1-1', periods=10, chunk_size=3, closed=closed)

            # end, periods, freq
            check(end='2020-1-10', periods=10, chunk_size=3, closed=closed)

            # start, end, freq
            check('2020-1-1', '2020-1-10', chunk_size=3, closed=closed)

            # start, end and periods
            check('2020-1-1', '2020-1-10',
                  periods=19, chunk_size=3, closed=closed)

            # start, end and freq
            check('2020-1-1', '2020-1-10',
                  freq='12H', chunk_size=3, closed=closed)

        # test timezone
        check('2020-1-1', periods=10, tz='Asia/Shanghai', chunk_size=7)

        # test periods=0
        check('2020-1-1', periods=0)

        # test start == end
        check('2020-1-1', '2020-1-1', periods=1)

        # test normalize=True
        check('2020-1-1', periods=10, normalize=True, chunk_size=4)

        # test freq
        check(start='1/1/2018', periods=5, freq='M', chunk_size=3)
Exemplo n.º 6
0
    def testFromTensorExecution(self):
        """Convert tensors to DataFrames and compare the results with pandas."""

        def run_df(tileable):
            # Element 0 of the executor's dataframe result.
            return self.executor.execute_dataframe(tileable, concat=True)[0]

        def run_tensor(tileable):
            # Element 0 of the executor's tensor result.
            return self.executor.execute_tensor(tileable, concat=True)[0]

        # 2-d random tensor, default index and columns
        rand_t = mt.random.rand(10, 10, chunk_size=5)
        rand_df = dataframe_from_tensor(rand_t)
        wanted = pd.DataFrame(run_tensor(rand_t))
        got = run_df(rand_df)
        pd.testing.assert_index_equal(got.index, pd.RangeIndex(0, 10))
        pd.testing.assert_index_equal(got.columns, pd.RangeIndex(0, 10))
        pd.testing.assert_frame_equal(got, wanted)

        # test converted with specified index_value and columns
        small_t = mt.random.rand(2, 2, chunk_size=1)
        labelled_df = dataframe_from_tensor(
            small_t, index=pd.Index(['a', 'b']), columns=pd.Index([3, 4]))
        got = run_df(labelled_df)
        pd.testing.assert_index_equal(got.index, pd.Index(['a', 'b']))
        pd.testing.assert_index_equal(got.columns, pd.Index([3, 4]))

        # test converted from 1-d tensor
        flat_t = mt.array([1, 2, 3])
        got = run_df(dataframe_from_tensor(flat_t))
        wanted = pd.DataFrame(np.array([1, 2, 3]))
        pd.testing.assert_frame_equal(wanted, got)

        # test converted from identical chunks
        ones_t = mt.ones((10, 10), chunk_size=3)
        got = run_df(dataframe_from_tensor(ones_t))
        wanted = pd.DataFrame(run_tensor(ones_t))
        pd.testing.assert_frame_equal(wanted, got)

        # from tensor with given index
        ones_t = mt.ones((10, 10), chunk_size=3)
        got = run_df(dataframe_from_tensor(ones_t, index=np.arange(0, 20, 2)))
        wanted = pd.DataFrame(run_tensor(ones_t), index=np.arange(0, 20, 2))
        pd.testing.assert_frame_equal(wanted, got)

        # from tensor with given index that is a tensor
        body_raw = np.random.rand(10, 10)
        body_t = mt.tensor(body_raw, chunk_size=3)
        idx_raw = np.random.rand(10)
        idx_t = mt.tensor(idx_raw, chunk_size=4)
        got = run_df(dataframe_from_tensor(body_t, index=idx_t))
        wanted = pd.DataFrame(body_raw, index=idx_raw)
        pd.testing.assert_frame_equal(wanted, got)

        # from tensor with given index is a md.Index
        body_raw = np.random.rand(10, 10)
        body_t = mt.tensor(body_raw, chunk_size=3)
        date_idx = md.date_range('2020-1-1', periods=10, chunk_size=3)
        got = run_df(dataframe_from_tensor(body_t, index=date_idx))
        wanted = pd.DataFrame(
            body_raw, index=pd.date_range('2020-1-1', periods=10))
        pd.testing.assert_frame_equal(wanted, got)

        # from tensor with given columns
        ones_t = mt.ones((10, 10), chunk_size=3)
        got = run_df(
            dataframe_from_tensor(ones_t, columns=list('abcdefghij')))
        wanted = pd.DataFrame(run_tensor(ones_t), columns=list('abcdefghij'))
        pd.testing.assert_frame_equal(wanted, got)

        # from 1d tensors
        column_specs = [
            ('a', np.random.rand(8)),
            ('b', np.random.randint(10, size=8)),
            ('c', [
                ''.join(np.random.choice(list(printable), size=6))
                for _ in range(8)
            ]),
        ]
        tileables = OrderedDict()
        for col_name, col_values in column_specs:
            tileables[col_name] = mt.tensor(col_values, chunk_size=3)
        # Two extra columns that are not tensors: a scalar and a pandas
        # date range (the same object is used on both sides).
        column_specs.append(('d', 1))
        column_specs.append(('e', pd.date_range('2020-1-1', periods=8)))
        tileables['d'] = 1
        tileables['e'] = column_specs[-1][1]
        from_1d = dataframe_from_1d_tileables(
            tileables, columns=[spec[0] for spec in column_specs])
        got = run_df(from_1d)
        wanted = pd.DataFrame(OrderedDict(column_specs))
        pd.testing.assert_frame_equal(got, wanted)

        # from 1d tensors and specify index with a tensor
        idx_raw = np.random.rand(8)
        idx_t = mt.tensor(idx_raw, chunk_size=4)
        from_1d = dataframe_from_1d_tileables(
            tileables,
            columns=[spec[0] for spec in column_specs],
            index=idx_t)
        got = run_df(from_1d)
        wanted = pd.DataFrame(OrderedDict(column_specs), index=idx_raw)
        pd.testing.assert_frame_equal(got, wanted)

        # from 1d tensors and specify index
        from_1d = dataframe_from_1d_tileables(
            tileables,
            columns=[spec[0] for spec in column_specs],
            index=md.date_range('2020-1-1', periods=8))
        got = run_df(from_1d)
        wanted = pd.DataFrame(
            OrderedDict(column_specs),
            index=pd.date_range('2020-1-1', periods=8))
        pd.testing.assert_frame_equal(got, wanted)
Exemplo n.º 7
0
    def testReindexExecution(self):
        """Check DataFrame/Series ``reindex`` results against pandas.

        Each scenario builds a pandas object, wraps it with ``md``, applies
        the same ``reindex`` call to both and compares the executed result
        with the pandas result for the listed ``enable_sparse`` values.
        """
        data = pd.DataFrame(np.random.rand(10, 5),
                            columns=['c1', 'c2', 'c3', 'c4', 'c5'])
        df = md.DataFrame(data, chunk_size=4)

        for enable_sparse in [True, False, None]:
            # reindex rows only, new index given as a tensor
            r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=3),
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=np.arange(10, 1, -1))
            pd.testing.assert_frame_equal(result, expected)

            # reindex columns only
            r = df.reindex(columns=['c5', 'c6', 'c2'],
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(columns=['c5', 'c6', 'c2'])
            pd.testing.assert_frame_equal(result, expected)

        for enable_sparse in [True, False]:
            # NOTE: every scenario below that needs its own dataset uses
            # scenario-specific names. Rebinding ``data``/``df`` inside this
            # loop would make the second pass run the checks that follow
            # against the single-chunk frame from the "one chunk" scenario
            # instead of the chunk_size=4 frame defined above.
            r = df.reindex(index=[5, 11, 1],
                           columns=['c5', 'c6', 'c2'],
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=[5, 11, 1],
                                    columns=['c5', 'c6', 'c2'])
            pd.testing.assert_frame_equal(result, expected)

            r = df.reindex(index=mt.tensor([2, 4, 10]),
                           columns=['c2', 'c3', 'c5', 'c7'],
                           method='bfill',
                           enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = data.reindex(index=[2, 4, 10],
                                    columns=['c2', 'c3', 'c5', 'c7'],
                                    method='bfill')
            pd.testing.assert_frame_equal(result, expected)

            # fill_value given both as a plain scalar and as the result of
            # a ``max()`` call on the md frame (pandas side uses the pandas
            # ``max()`` of the same column).
            for fill_value, test_fill_value in \
                    [(3, 3), (df.iloc[:, 0].max(), data.iloc[:, 0].max())]:
                r = df.reindex(index=mt.tensor([2, 4, 10]),
                               columns=['c2', 'c3', 'c5', 'c7'],
                               fill_value=fill_value,
                               enable_sparse=enable_sparse)

                result = self.executor.execute_dataframe(r, concat=True)[0]
                expected = data.reindex(index=[2, 4, 10],
                                        columns=['c2', 'c3', 'c5', 'c7'],
                                        fill_value=test_fill_value)
                pd.testing.assert_frame_equal(result, expected)

            # test date_range index
            date_data = pd.DataFrame(
                np.random.rand(10, 5),
                index=pd.date_range('2020-1-1', periods=10))
            date_df = md.DataFrame(date_data, chunk_size=5)

            r = date_df.reindex(index=md.date_range('2020-1-6', periods=6),
                                method='ffill',
                                enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = date_data.reindex(
                index=pd.date_range('2020-1-6', periods=6), method='ffill')
            pd.testing.assert_frame_equal(result, expected)

            # test MultiIndex
            multi_data = pd.DataFrame(np.random.rand(10, 5),
                                      index=pd.MultiIndex.from_arrays(
                                          [np.arange(10),
                                           np.arange(11, 1, -1)]))
            multi_df = md.DataFrame(multi_data, chunk_size=5)

            r = multi_df.reindex([2, 4, 9, 12],
                                 level=1,
                                 enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r,
                                                     concat=True,
                                                     check_shape=False)[0]
            expected = multi_data.reindex([2, 4, 9, 12], level=1)
            pd.testing.assert_frame_equal(result, expected)

            r = multi_df.reindex(mt.tensor([2, 4, 9, 12], chunk_size=2),
                                 level=1,
                                 enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r,
                                                     concat=True,
                                                     check_shape=False)[0]
            expected = multi_data.reindex([2, 4, 9, 12], level=1)
            pd.testing.assert_frame_equal(result, expected)

            # test duplicate index
            dup_index = np.arange(10)
            dup_index[-1] = 0  # label 0 now appears twice
            dup_data = pd.DataFrame(np.random.rand(10, 5), index=dup_index)
            dup_df = md.DataFrame(dup_data, chunk_size=5)

            # Both the reindex call and its execution are inside the
            # context: a ValueError from either one satisfies the check.
            with self.assertRaises(ValueError):
                r = dup_df.reindex([0, 1], enable_sparse=enable_sparse)
                self.executor.execute_dataframe(r)

            # test one chunk
            one_chunk_data = pd.DataFrame(
                np.random.rand(10, 5),
                columns=['c1', 'c2', 'c3', 'c4', 'c5'])
            one_chunk_df = md.DataFrame(one_chunk_data, chunk_size=10)

            r = one_chunk_df.reindex(
                index=mt.arange(10, 1, -1, chunk_size=10),
                fill_value=one_chunk_df['c1'].max(),
                enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = one_chunk_data.reindex(
                index=np.arange(10, 1, -1),
                fill_value=one_chunk_data['c1'].max())
            pd.testing.assert_frame_equal(result, expected)

            # test series
            s_data = pd.Series(np.random.rand(10),
                               index=[f'c{i + 1}' for i in range(10)])
            series = md.Series(s_data, chunk_size=6)

            r = series.reindex(['c2', 'c11', 'c4'],
                               copy=False,
                               enable_sparse=enable_sparse)

            result = self.executor.execute_dataframe(r, concat=True)[0]
            expected = s_data.reindex(['c2', 'c11', 'c4'], copy=False)
            pd.testing.assert_series_equal(result, expected)