def testInitializerExecution(self):
    """Build Mars DataFrame / Series / Index objects from pandas and NumPy
    inputs and check the executed results against their pandas equivalents.
    """

    def _run(tileable):
        # Execute through the test executor and take the single concatenated result.
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    raw_2d = np.random.rand(20, 30)

    # DataFrame wrapping a pandas frame whose index is given as two arrays.
    pandas_frame = pd.DataFrame(
        raw_2d, index=[np.arange(20), np.arange(20, 0, -1)])
    mars_frame = md.DataFrame(pandas_frame, chunk_size=(15, 10))
    executed = _run(mars_frame)
    pd.testing.assert_frame_equal(pandas_frame, executed)

    # DataFrame from the raw array, indexed by a Mars date_range.
    mars_frame = md.DataFrame(
        raw_2d, index=md.date_range('2020-1-1', periods=20))
    executed = _run(mars_frame)
    expected_frame = pd.DataFrame(
        raw_2d, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_frame_equal(executed, expected_frame)

    raw_1d = np.random.rand(20)

    # Series wrapping a named pandas series whose index is given as two arrays.
    pandas_series = pd.Series(
        raw_1d, index=[np.arange(20), np.arange(20, 0, -1)], name='a')
    mars_series = md.Series(pandas_series, chunk_size=7)
    executed = _run(mars_series)
    pd.testing.assert_series_equal(pandas_series, executed)

    # Series from the raw array, indexed by a Mars date_range.
    mars_series = md.Series(
        raw_1d, index=md.date_range('2020-1-1', periods=20))
    executed = _run(mars_series)
    expected_series = pd.Series(
        raw_1d, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_series_equal(executed, expected_series)

    # Index created from another Mars Index that wraps an IntervalIndex.
    interval_index = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    mars_index = md.Index(md.Index(interval_index))
    executed = _run(mars_index)
    pd.testing.assert_index_equal(interval_index, executed)
def testSeriesFromTensor(self):
    """Create Mars Series from Mars tensors and compare the executed
    results with pandas Series built from the same raw data.
    """

    def _run(tileable):
        # Execute through the test executor and take the single concatenated result.
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    raw = np.random.rand(10)

    # Tensor with default chunking, named series.
    mars_series = md.Series(mt.tensor(raw), name='a')
    pd.testing.assert_series_equal(
        _run(mars_series), pd.Series(raw, name='a'))

    # Tensor split into chunks of 3, unnamed series.
    mars_series = md.Series(mt.tensor(raw, chunk_size=3))
    pd.testing.assert_series_equal(
        _run(mars_series), pd.Series(raw))

    # Tensor produced by mt.ones with chunk_size=4.
    mars_series = md.Series(mt.ones((10, ), chunk_size=4))
    pd.testing.assert_series_equal(
        _run(mars_series), pd.Series(np.ones(10, )))

    # Index supplied as a tensor chunked differently from the data.
    raw_index = np.random.rand(10)
    mars_series = md.Series(
        mt.tensor(raw, chunk_size=3),
        name='a',
        index=mt.tensor(raw_index, chunk_size=4))
    pd.testing.assert_series_equal(
        _run(mars_series), pd.Series(raw, name='a', index=raw_index))

    # Index supplied as a Mars date_range.
    mars_series = md.Series(
        mt.tensor(raw, chunk_size=3),
        name='a',
        index=md.date_range('2020-1-1', periods=10))
    pd.testing.assert_series_equal(
        _run(mars_series),
        pd.Series(raw, name='a',
                  index=pd.date_range('2020-1-1', periods=10)))
def test_series_from_tensor(setup):
    """Create Mars Series from Mars tensors and compare the fetched
    results with pandas Series built from the same raw data.
    """
    raw = np.random.rand(10)

    # Tensor with default chunking, named series.
    mars_series = md.Series(mt.tensor(raw), name='a')
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(fetched, pd.Series(raw, name='a'))

    # Tensor split into chunks of 3, unnamed series.
    mars_series = md.Series(mt.tensor(raw, chunk_size=3))
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(fetched, pd.Series(raw))

    # Tensor produced by mt.ones with chunk_size=4.
    mars_series = md.Series(mt.ones((10, ), chunk_size=4))
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(fetched, pd.Series(np.ones(10, )))

    # Index supplied as a tensor chunked differently from the data.
    raw_index = np.random.rand(10)
    mars_series = md.Series(
        mt.tensor(raw, chunk_size=3),
        name='a',
        index=mt.tensor(raw_index, chunk_size=4))
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(
        fetched, pd.Series(raw, name='a', index=raw_index))

    # Index supplied as a Mars date_range.
    mars_series = md.Series(
        mt.tensor(raw, chunk_size=3),
        name='a',
        index=md.date_range('2020-1-1', periods=10))
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(
        fetched,
        pd.Series(raw, name='a',
                  index=pd.date_range('2020-1-1', periods=10)))
def test_initializer_execution(setup):
    """Build Mars DataFrame / Series / Index objects from pandas, NumPy and
    dict inputs and check the fetched results against their pandas equivalents.
    """
    raw_2d = np.random.rand(20, 30)

    # DataFrame wrapping a pandas frame whose index is given as two arrays.
    pandas_frame = pd.DataFrame(
        raw_2d, index=[np.arange(20), np.arange(20, 0, -1)])
    mars_frame = md.DataFrame(pandas_frame, chunk_size=(15, 10))
    fetched = mars_frame.execute().fetch()
    pd.testing.assert_frame_equal(pandas_frame, fetched)

    # DataFrame from the raw array, indexed by a Mars date_range.
    mars_frame = md.DataFrame(
        raw_2d, index=md.date_range('2020-1-1', periods=20))
    fetched = mars_frame.execute().fetch()
    expected_frame = pd.DataFrame(
        raw_2d, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_frame_equal(fetched, expected_frame)

    # DataFrame from a dict containing a NaN, indexed by a daily date_range.
    mars_frame = md.DataFrame(
        {"prices": [100, 101, np.nan, 100, 89, 88]},
        index=md.date_range('1/1/2010', periods=6, freq='D'))
    fetched = mars_frame.execute().fetch()
    expected_frame = pd.DataFrame(
        {"prices": [100, 101, np.nan, 100, 89, 88]},
        index=pd.date_range('1/1/2010', periods=6, freq='D'))
    pd.testing.assert_frame_equal(fetched, expected_frame)

    raw_1d = np.random.rand(20)

    # Series wrapping a named pandas series whose index is given as two arrays.
    pandas_series = pd.Series(
        raw_1d, index=[np.arange(20), np.arange(20, 0, -1)], name='a')
    mars_series = md.Series(pandas_series, chunk_size=7)
    fetched = mars_series.execute().fetch()
    pd.testing.assert_series_equal(pandas_series, fetched)

    # Series from the raw array, indexed by a Mars date_range.
    mars_series = md.Series(
        raw_1d, index=md.date_range('2020-1-1', periods=20))
    fetched = mars_series.execute().fetch()
    expected_series = pd.Series(
        raw_1d, index=pd.date_range('2020-1-1', periods=20))
    pd.testing.assert_series_equal(fetched, expected_series)

    # Index created from another Mars Index that wraps an IntervalIndex.
    interval_index = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    mars_index = md.Index(md.Index(interval_index))
    fetched = mars_index.execute().fetch()
    pd.testing.assert_index_equal(interval_index, fetched)
def testDateRangeExecution(self):
    """Compare executed ``md.date_range`` results with ``pd.date_range``
    for several argument combinations.
    """

    def _check(args, shared_kwargs, mars_only_kwargs):
        # ``shared_kwargs`` go to both libraries; ``mars_only_kwargs``
        # (chunk_size) are passed to md.date_range only.
        mars_index = md.date_range(*args, **shared_kwargs, **mars_only_kwargs)
        actual = self.executor.execute_dataframe(mars_index, concat=True)[0]
        expected = pd.date_range(*args, **shared_kwargs)
        pd.testing.assert_index_equal(actual, expected)

    # Cases repeated for every value of ``closed``; all use chunk_size=3.
    closed_cases = [
        # start, periods, freq
        (('2020-1-1',), {'periods': 10}),
        # end, periods, freq
        ((), {'end': '2020-1-10', 'periods': 10}),
        # start, end, freq
        (('2020-1-1', '2020-1-10'), {}),
        # start, end and periods
        (('2020-1-1', '2020-1-10'), {'periods': 19}),
        # start, end and freq
        (('2020-1-1', '2020-1-10'), {'freq': '12H'}),
    ]
    for closed in [None, 'left', 'right']:
        for args, kwargs in closed_cases:
            _check(args, dict(kwargs, closed=closed), {'chunk_size': 3})

    # Cases that run once each, without a ``closed`` argument.
    single_cases = [
        # test timezone
        (('2020-1-1',), {'periods': 10, 'tz': 'Asia/Shanghai'},
         {'chunk_size': 7}),
        # test periods=0
        (('2020-1-1',), {'periods': 0}, {}),
        # test start == end
        (('2020-1-1', '2020-1-1'), {'periods': 1}, {}),
        # test normalize=True
        (('2020-1-1',), {'periods': 10, 'normalize': True},
         {'chunk_size': 4}),
        # test freq
        ((), {'start': '1/1/2018', 'periods': 5, 'freq': 'M'},
         {'chunk_size': 3}),
    ]
    for args, kwargs, mars_kwargs in single_cases:
        _check(args, kwargs, mars_kwargs)
def testFromTensorExecution(self):
    """Build DataFrames with ``dataframe_from_tensor`` and
    ``dataframe_from_1d_tileables`` and compare the executed results with
    pandas DataFrames constructed from the same data.
    """

    def _run_frame(frame):
        # Execute a DataFrame and take the single concatenated result.
        return self.executor.execute_dataframe(frame, concat=True)[0]

    def _run_tensor(tensor):
        # Execute a tensor and take the single concatenated result.
        return self.executor.execute_tensor(tensor, concat=True)[0]

    # 2-d random tensor, no index/columns given
    random_tensor = mt.random.rand(10, 10, chunk_size=5)
    frame = dataframe_from_tensor(random_tensor)
    expected = pd.DataFrame(_run_tensor(random_tensor))
    actual = _run_frame(frame)
    pd.testing.assert_index_equal(actual.index, pd.RangeIndex(0, 10))
    pd.testing.assert_index_equal(actual.columns, pd.RangeIndex(0, 10))
    pd.testing.assert_frame_equal(actual, expected)

    # test converted with specified index_value and columns
    small_tensor = mt.random.rand(2, 2, chunk_size=1)
    frame = dataframe_from_tensor(
        small_tensor, index=pd.Index(['a', 'b']), columns=pd.Index([3, 4]))
    actual = _run_frame(frame)
    pd.testing.assert_index_equal(actual.index, pd.Index(['a', 'b']))
    pd.testing.assert_index_equal(actual.columns, pd.Index([3, 4]))

    # test converted from 1-d tensor
    flat_tensor = mt.array([1, 2, 3])
    frame = dataframe_from_tensor(flat_tensor)
    actual = _run_frame(frame)
    expected = pd.DataFrame(np.array([1, 2, 3]))
    pd.testing.assert_frame_equal(expected, actual)

    # test converted from identical chunks
    ones_tensor = mt.ones((10, 10), chunk_size=3)
    frame = dataframe_from_tensor(ones_tensor)
    actual = _run_frame(frame)
    expected = pd.DataFrame(_run_tensor(ones_tensor))
    pd.testing.assert_frame_equal(expected, actual)

    # from tensor with given index
    ones_tensor = mt.ones((10, 10), chunk_size=3)
    frame = dataframe_from_tensor(ones_tensor, index=np.arange(0, 20, 2))
    actual = _run_frame(frame)
    expected = pd.DataFrame(
        _run_tensor(ones_tensor), index=np.arange(0, 20, 2))
    pd.testing.assert_frame_equal(expected, actual)

    # from tensor with given index that is a tensor
    raw_values = np.random.rand(10, 10)
    value_tensor = mt.tensor(raw_values, chunk_size=3)
    raw_index = np.random.rand(10)
    index_tensor = mt.tensor(raw_index, chunk_size=4)
    frame = dataframe_from_tensor(value_tensor, index=index_tensor)
    actual = _run_frame(frame)
    expected = pd.DataFrame(raw_values, index=raw_index)
    pd.testing.assert_frame_equal(expected, actual)

    # from tensor with given index is a md.Index
    raw_values = np.random.rand(10, 10)
    value_tensor = mt.tensor(raw_values, chunk_size=3)
    mars_index = md.date_range('2020-1-1', periods=10, chunk_size=3)
    frame = dataframe_from_tensor(value_tensor, index=mars_index)
    actual = _run_frame(frame)
    expected = pd.DataFrame(
        raw_values, index=pd.date_range('2020-1-1', periods=10))
    pd.testing.assert_frame_equal(expected, actual)

    # from tensor with given columns
    ones_tensor = mt.ones((10, 10), chunk_size=3)
    frame = dataframe_from_tensor(ones_tensor, columns=list('abcdefghij'))
    actual = _run_frame(frame)
    expected = pd.DataFrame(
        _run_tensor(ones_tensor), columns=list('abcdefghij'))
    pd.testing.assert_frame_equal(expected, actual)

    # from 1d tensors
    named_raws = [
        ('a', np.random.rand(8)),
        ('b', np.random.randint(10, size=8)),
        ('c', [
            ''.join(np.random.choice(list(printable), size=6))
            for _ in range(8)
        ]),
    ]
    column_sources = OrderedDict()
    for col_name, col_raw in named_raws:
        column_sources[col_name] = mt.tensor(col_raw, chunk_size=3)
    # Two further columns are passed through as-is rather than as tensors:
    # a scalar and a pandas DatetimeIndex.
    named_raws.append(('d', 1))
    named_raws.append(('e', pd.date_range('2020-1-1', periods=8)))
    column_sources['d'] = 1
    column_sources['e'] = named_raws[-1][1]
    column_names = [entry[0] for entry in named_raws]

    frame = dataframe_from_1d_tileables(column_sources, columns=column_names)
    actual = _run_frame(frame)
    expected = pd.DataFrame(OrderedDict(named_raws))
    pd.testing.assert_frame_equal(actual, expected)

    # from 1d tensors and specify index with a tensor
    raw_index = np.random.rand(8)
    index_tensor = mt.tensor(raw_index, chunk_size=4)
    frame = dataframe_from_1d_tileables(
        column_sources, columns=column_names, index=index_tensor)
    actual = _run_frame(frame)
    expected = pd.DataFrame(OrderedDict(named_raws), index=raw_index)
    pd.testing.assert_frame_equal(actual, expected)

    # from 1d tensors and specify index
    frame = dataframe_from_1d_tileables(
        column_sources, columns=column_names,
        index=md.date_range('2020-1-1', periods=8))
    actual = _run_frame(frame)
    expected = pd.DataFrame(
        OrderedDict(named_raws),
        index=pd.date_range('2020-1-1', periods=8))
    pd.testing.assert_frame_equal(actual, expected)
# testReindexExecution: runs `reindex` on Mars DataFrame/Series objects and
# compares each executed result with the same `reindex` call on the pandas data.
# Cases visible in the body, in order:
#   - index given as a chunked `mt.arange` tensor, and columns given as a list
#     that includes a missing label ('c6'), for enable_sparse in True/False/None;
#   - index and columns given together as lists, for enable_sparse in True/False;
#   - tensor index plus columns with method='bfill';
#   - tensor index plus columns with fill_value as a plain scalar (3) and as
#     `df.iloc[:, 0].max()` (checked against `data.iloc[:, 0].max()`);
#   - a date_range-indexed frame reindexed by `md.date_range` with method='ffill';
#   - a MultiIndex frame reindexed with level=1, from a list and from a chunked
#     tensor (executed with check_shape=False);
#   - a frame with a duplicated index label, where ValueError is expected;
#   - a single-chunk frame (chunk_size=10) with fill_value=df['c1'].max();
#   - a Series reindexed with copy=False and a missing label ('c11').
# NOTE(review): line breaks and indentation of this test were lost. It cannot be
# read off the text where the second `for enable_sparse in [True, False]` loop
# ends. Every later statement still passes `enable_sparse=enable_sparse`, and
# `data`/`df` are reassigned several times after the loop header, so the nesting
# affects which frame each iteration sees. Confirm against version control
# before reformatting; the code below is left untouched for that reason.
def testReindexExecution(self): data = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5']) df = md.DataFrame(data, chunk_size=4) for enable_sparse in [True, False, None]: r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=3), enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(index=np.arange(10, 1, -1)) pd.testing.assert_frame_equal(result, expected) r = df.reindex(columns=['c5', 'c6', 'c2'], enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(columns=['c5', 'c6', 'c2']) pd.testing.assert_frame_equal(result, expected) for enable_sparse in [True, False]: r = df.reindex(index=[5, 11, 1], columns=['c5', 'c6', 'c2'], enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(index=[5, 11, 1], columns=['c5', 'c6', 'c2']) pd.testing.assert_frame_equal(result, expected) r = df.reindex(index=mt.tensor([2, 4, 10]), columns=['c2', 'c3', 'c5', 'c7'], method='bfill', enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(index=[2, 4, 10], columns=['c2', 'c3', 'c5', 'c7'], method='bfill') pd.testing.assert_frame_equal(result, expected) for fill_value, test_fill_value in \ [(3, 3), (df.iloc[:, 0].max(), data.iloc[:, 0].max())]: r = df.reindex(index=mt.tensor([2, 4, 10]), columns=['c2', 'c3', 'c5', 'c7'], fill_value=fill_value, enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(index=[2, 4, 10], columns=['c2', 'c3', 'c5', 'c7'], fill_value=test_fill_value) pd.testing.assert_frame_equal(result, expected) # test date_range index data = pd.DataFrame(np.random.rand(10, 5), index=pd.date_range('2020-1-1', periods=10)) df = md.DataFrame(data, chunk_size=5) r = df.reindex(index=md.date_range('2020-1-6', periods=6), method='ffill', enable_sparse=enable_sparse) result = 
self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(index=pd.date_range('2020-1-6', periods=6), method='ffill') pd.testing.assert_frame_equal(result, expected) # test MultiIndex data = pd.DataFrame(np.random.rand(10, 5), index=pd.MultiIndex.from_arrays( [np.arange(10), np.arange(11, 1, -1)])) df = md.DataFrame(data, chunk_size=5) r = df.reindex([2, 4, 9, 12], level=1, enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True, check_shape=False)[0] expected = data.reindex([2, 4, 9, 12], level=1) pd.testing.assert_frame_equal(result, expected) r = df.reindex(mt.tensor([2, 4, 9, 12], chunk_size=2), level=1, enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True, check_shape=False)[0] expected = data.reindex([2, 4, 9, 12], level=1) pd.testing.assert_frame_equal(result, expected) # test duplicate index index = np.arange(10) index[-1] = 0 data = pd.DataFrame(np.random.rand(10, 5), index=index) df = md.DataFrame(data, chunk_size=5) with self.assertRaises(ValueError): r = df.reindex([0, 1], enable_sparse=enable_sparse) self.executor.execute_dataframe(r) # test one chunk data = pd.DataFrame(np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5']) df = md.DataFrame(data, chunk_size=10) r = df.reindex(index=mt.arange(10, 1, -1, chunk_size=10), fill_value=df['c1'].max(), enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = data.reindex(index=np.arange(10, 1, -1), fill_value=data['c1'].max()) pd.testing.assert_frame_equal(result, expected) # test series s_data = pd.Series(np.random.rand(10), index=[f'c{i + 1}' for i in range(10)]) series = md.Series(s_data, chunk_size=6) r = series.reindex(['c2', 'c11', 'c4'], copy=False, enable_sparse=enable_sparse) result = self.executor.execute_dataframe(r, concat=True)[0] expected = s_data.reindex(['c2', 'c11', 'c4'], copy=False) pd.testing.assert_series_equal(result, expected)