def testExecutedPruning(self):
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        pd_df = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                              'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                              'c': list('aabaaddce'),
                              'd': list('abaaaddce')})
        pd_df.to_csv(file_path, index=False)

        in_df = md.read_csv(file_path)
        mdf = in_df.groupby('c').agg({'a': 'sum'})

        expected = pd_df.groupby('c').agg({'a': 'sum'})
        pd.testing.assert_frame_equal(mdf.to_pandas(), expected)

        optimized_df = tileable_optimized[mdf.data]
        self.assertEqual(optimized_df.inputs[0].op.usecols, ['a', 'c'])

        # make sure in_df has correct columns
        pd.testing.assert_frame_equal(in_df.to_pandas(), pd_df)

        # skip pruning
        in_df = md.read_csv(file_path)
        df1 = in_df.groupby('d').agg({'b': 'min'})
        df2 = in_df[in_df.d.isin(df1.index)]

        expected1 = pd_df.groupby('d').agg({'b': 'min'})
        expected2 = pd_df[pd_df.d.isin(expected1.index)]
        pd.testing.assert_frame_equal(df2.to_pandas(), expected2)
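# --- Illustrative sketch (not one of the original tests) ---
# A minimal, self-contained example of the column-pruning pattern the test
# above exercises: when downstream operations consume only a subset of
# columns, the optimizer can rewrite read_csv to load just those columns
# (`usecols`). The file name and data here are made up for illustration.
import os
import tempfile

import pandas as pd
import mars.dataframe as md

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'demo.csv')
    pd.DataFrame({'a': [1, 2, 3],
                  'b': [4, 5, 6],
                  'c': list('xyz')}).to_csv(path, index=False)
    # only columns 'a' and 'c' are consumed, so the optimizer can prune 'b'
    print(md.read_csv(path).groupby('c').agg({'a': 'sum'}).to_pandas())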
def testIterativeDependency(self, *_):
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True):
        with tempfile.TemporaryDirectory() as d:
            file_path = os.path.join(d, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path, index=False)

            mdf1 = md.read_csv(file_path, chunk_bytes=10)
            r1 = mdf1.iloc[:3].to_pandas()
            pd.testing.assert_frame_equal(df[:3], r1.reset_index(drop=True))

            mdf2 = md.read_csv(file_path, chunk_bytes=10)
            r2 = mdf2.iloc[:3].to_pandas()
            pd.testing.assert_frame_equal(df[:3], r2.reset_index(drop=True))

            f = mdf1[mdf1.a > mdf2.a]
            r3 = f.iloc[:3].to_pandas()
            pd.testing.assert_frame_equal(
                r3, df[df.a > df.a].reset_index(drop=True))

            mdf3 = md.read_csv(file_path, chunk_bytes=15,
                               incremental_index=True)
            r4 = mdf3.to_pandas()
            pd.testing.assert_frame_equal(df, r4.reset_index(drop=True))
def testReadCSVGPUExecution(self):
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        })
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, gpu=True), concat=True)[0]
        pd.testing.assert_frame_equal(
            pdf.reset_index(drop=True),
            mdf.to_pandas().reset_index(drop=True))

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, gpu=True, chunk_bytes=200),
            concat=True)[0]
        pd.testing.assert_frame_equal(
            pdf.reset_index(drop=True),
            mdf2.to_pandas().reset_index(drop=True))
def test_read_csv_without_index(setup):
    # test csv file without storing index
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = md.read_csv(file_path).execute().fetch()
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = md.read_csv(file_path, chunk_bytes=10).execute().fetch()
        pd.testing.assert_frame_equal(pdf, mdf2)

        file_path2 = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.random.RandomState(0).rand(100, 10),
                          columns=[f'col{i}' for i in range(10)])
        df.to_csv(file_path2, index=False)

        mdf3 = md.read_csv(file_path2,
                           chunk_bytes=os.stat(file_path2).st_size / 5)
        result = mdf3.execute().fetch()
        expected = pd.read_csv(file_path2)
        pd.testing.assert_frame_equal(result, expected)

        # test incremental_index = False
        mdf4 = md.read_csv(file_path2,
                           chunk_bytes=os.stat(file_path2).st_size / 5,
                           incremental_index=False)
        result = mdf4.execute().fetch()
        assert not result.index.is_monotonic_increasing
        expected = pd.read_csv(file_path2)
        pd.testing.assert_frame_equal(result.reset_index(drop=True), expected)
def testReadCSVExecution(self):
    with self.hdfs.open("{}/simple_test.csv".format(TEST_DIR), "wb",
                        replication=1) as f:
        f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

    df = md.read_csv(
        'hdfs://localhost:8020{}/simple_test.csv'.format(TEST_DIR))
    expected = pd.read_csv(
        BytesIO(b'name,amount,id\nAlice,100,1\nBob,200,2'))
    res = df.to_pandas()
    pd.testing.assert_frame_equal(expected, res)

    with self.hdfs.open("{}/chunk_test.csv".format(TEST_DIR), "wb",
                        replication=1) as f:
        f.write(csv_content)

    df = md.read_csv(
        'hdfs://localhost:8020{}/chunk_test.csv'.format(TEST_DIR),
        chunk_bytes=50)
    expected = pd.read_csv(BytesIO(csv_content))
    res = df.to_pandas()
    pd.testing.assert_frame_equal(expected.reset_index(drop=True),
                                  res.reset_index(drop=True))
def test_read_csv_execution(setup, setup_hdfs):
    hdfs = setup_hdfs

    with hdfs.open(f"{TEST_DIR}/simple_test.csv", "wb", replication=1) as f:
        f.write(b'name,amount,id\nAlice,100,1\nBob,200,2')

    df = md.read_csv(f'hdfs://localhost:8020{TEST_DIR}/simple_test.csv')
    expected = pd.read_csv(BytesIO(b'name,amount,id\nAlice,100,1\nBob,200,2'))
    res = df.to_pandas()
    pd.testing.assert_frame_equal(expected, res)

    test_df = pd.DataFrame({
        'A': np.random.rand(20),
        'B': [pd.Timestamp('2020-01-01')
              + pd.Timedelta(days=random.randint(0, 31)) for _ in range(20)],
        'C': np.random.rand(20),
        'D': np.random.randint(0, 100, size=(20,)),
        'E': ['foo' + str(random.randint(0, 999999)) for _ in range(20)],
    })
    buf = StringIO()
    test_df[:10].to_csv(buf)
    csv_content = buf.getvalue().encode()

    buf = StringIO()
    test_df[10:].to_csv(buf)
    csv_content2 = buf.getvalue().encode()

    with hdfs.open(f"{TEST_DIR}/chunk_test.csv", "wb", replication=1) as f:
        f.write(csv_content)

    df = md.read_csv(f'hdfs://localhost:8020{TEST_DIR}/chunk_test.csv',
                     chunk_bytes=50)
    expected = pd.read_csv(BytesIO(csv_content))
    res = df.to_pandas()
    pd.testing.assert_frame_equal(expected.reset_index(drop=True),
                                  res.reset_index(drop=True))

    test_read_dir = f'{TEST_DIR}/test_read_csv_directory'
    hdfs.mkdir(test_read_dir)
    with hdfs.open(f"{test_read_dir}/part.csv", "wb", replication=1) as f:
        f.write(csv_content)
    with hdfs.open(f"{test_read_dir}/part2.csv", "wb", replication=1) as f:
        f.write(csv_content2)

    df = md.read_csv(f'hdfs://localhost:8020{test_read_dir}', chunk_bytes=50)
    expected = pd.concat([pd.read_csv(BytesIO(csv_content)),
                          pd.read_csv(BytesIO(csv_content2))])
    res = df.to_pandas()
    pd.testing.assert_frame_equal(expected.reset_index(drop=True),
                                  res.reset_index(drop=True))
def test_read_csv_head(prepare_data, setup):
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)

    size = os.stat(file_path).st_size / 2
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    assert len(graph) == 1
    assert opt_df2 in graph.results

    result = df2.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf.head(5)
    pd.testing.assert_frame_equal(result, expected)

    # test multiple head
    df3 = df1.head(10)
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    assert opt_df1.op.nrows == 10
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    assert graph.predecessors(opt_df2)[0] is opt_df1
    assert opt_df2.inputs[0] is opt_df1
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None
    assert graph.predecessors(opt_df3)[0] is opt_df1
    assert opt_df3.inputs[0] is opt_df1

    # test head with successor
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    df3 = df2 + 1
    graph = TileableGraph([df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    assert len(graph) == 2
def test_sync_execute():
    session = new_session(n_cpu=2, web=False, use_uvloop=False)

    # web not started
    assert session._session.client.web_address is None
    assert session.get_web_endpoint() is None

    with session:
        raw = np.random.RandomState(0).rand(10, 5)
        a = mt.tensor(raw, chunk_size=5).sum(axis=1)
        b = a.execute(show_progress=False)
        assert b is a
        result = a.fetch()
        np.testing.assert_array_equal(result, raw.sum(axis=1))

        c = b + 1
        c.execute(show_progress=False)
        result = c.fetch()
        np.testing.assert_array_equal(result, raw.sum(axis=1) + 1)

        c = mt.tensor(raw, chunk_size=5).sum()
        d = session.execute(c)
        assert d is c
        assert abs(session.fetch(d) - raw.sum()) < 0.001

        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            pdf = pd.DataFrame(np.random.RandomState(0).rand(100, 10),
                               columns=[f'col{i}' for i in range(10)])
            pdf.to_csv(file_path, index=False)

            df = md.read_csv(file_path,
                             chunk_bytes=os.stat(file_path).st_size / 5)
            result = df.sum(axis=1).execute().fetch()
            expected = pd.read_csv(file_path).sum(axis=1)
            pd.testing.assert_series_equal(result, expected)

            df = md.read_csv(file_path,
                             chunk_bytes=os.stat(file_path).st_size / 5)
            result = df.head(10).execute().fetch()
            expected = pd.read_csv(file_path).head(10)
            pd.testing.assert_frame_equal(result, expected)

    for worker_pool in session._session.client._cluster._worker_pools:
        _assert_storage_cleaned(session.session_id,
                                worker_pool.external_address,
                                StorageLevel.MEMORY)

    session.stop_server()
    assert get_default_async_session() is None
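# --- Illustrative sketch (not one of the original tests) ---
# A minimal example of the synchronous-session lifecycle used above: create
# a local session, make it the default via the context manager, execute and
# fetch, then stop the server. The top-level import path for `new_session`
# is an assumption here.
import numpy as np
import mars.tensor as mt
from mars import new_session  # import location assumed

session = new_session(n_cpu=2, web=False)
with session:
    t = mt.tensor(np.arange(10), chunk_size=5).sum()
    t.execute(show_progress=False)
    print(t.fetch())  # 45
session.stop_server()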
def testReadCSVUseArrowDtype(self):
    rs = np.random.RandomState(0)
    df = pd.DataFrame({
        'col1': rs.rand(100),
        'col2': rs.choice(['a' * 2, 'b' * 3, 'c' * 4], (100,)),
        'col3': np.arange(100)
    })

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = md.read_csv(file_path, use_arrow_dtype=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        self.assertIsInstance(mdf.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result), pdf)

    with tempfile.TemporaryDirectory() as tempdir:
        with option_context({'dataframe.use_arrow_dtype': True}):
            file_path = os.path.join(tempdir, 'test.csv')
            df.to_csv(file_path, index=False)

            pdf = pd.read_csv(file_path)
            mdf = md.read_csv(file_path)
            result = self.executor.execute_dataframe(mdf, concat=True)[0]
            self.assertIsInstance(mdf.dtypes.iloc[1], md.ArrowStringDtype)
            self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
            pd.testing.assert_frame_equal(arrow_array_to_objects(result), pdf)

    # test compression
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.gzip')
        df.to_csv(file_path, compression='gzip', index=False)

        pdf = pd.read_csv(file_path, compression='gzip')
        mdf = md.read_csv(file_path, compression='gzip', use_arrow_dtype=True)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        self.assertIsInstance(mdf.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(result.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(result), pdf)
def testReadCSVWithoutIndex(self):
    sess = new_session()

    # test csv file without storing index
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = sess.run(md.read_csv(file_path, incremental_index=True))
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = sess.run(md.read_csv(file_path, incremental_index=True,
                                    chunk_bytes=10))
        pd.testing.assert_frame_equal(pdf, mdf2)
def testRayTask(self):
    with new_session(backend='ray').as_default():
        # test tensor task
        raw = np.random.rand(100, 100)
        t = (mt.tensor(raw, chunk_size=30) + 1).sum().to_numpy()
        self.assertAlmostEqual(t, (raw + 1).sum())

        # test DataFrame task
        raw = pd.DataFrame(np.random.random((20, 4)), columns=list('abcd'))
        df = md.DataFrame(raw, chunk_size=5)
        r = df.describe().to_pandas()
        pd.testing.assert_frame_equal(r, raw.describe())

        # test update shape
        raw = np.random.rand(100)
        t = mt.tensor(raw, chunk_size=30)
        selected = (t[t > 0.5] + 1).execute()
        r = selected.to_numpy()
        expected = raw[raw > 0.5] + 1
        np.testing.assert_array_equal(r, expected)

        with tempfile.TemporaryDirectory() as tempdir:
            file_path = os.path.join(tempdir, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                       dtype=np.int64),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            mdf = md.read_csv(file_path)
            r = mdf.groupby('a').agg({'c': 'sum'}).to_pandas()
            expected = df.groupby('a').agg({'c': 'sum'})
            pd.testing.assert_frame_equal(r, expected)
def testFetch(self):
    with tempfile.TemporaryDirectory() as tempdir:
        filename = os.path.join(tempdir, 'test_fetch.csv')
        pd_df = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                              'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                              'c': list('aabaaddce'),
                              'd': list('abaaaddce')})
        pd_df.to_csv(filename, index=False)

        df = md.read_csv(filename)
        df2 = df.groupby('d').agg({'b': 'min'})
        expected = pd_df.groupby('d').agg({'b': 'min'})
        _ = df2.execute()

        def _execute_read_csv(*_):  # pragma: no cover
            raise ValueError('cannot run read_csv again')

        try:
            register(DataFrameReadCSV, _execute_read_csv)

            pd.testing.assert_frame_equal(df2.fetch(), expected)
            pd.testing.assert_frame_equal(df2.iloc[:3].fetch(),
                                          expected.iloc[:3])
        finally:
            del Executor._op_runners[DataFrameReadCSV]
def testReadCSVHead(self):
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = self.df
        df.to_csv(file_path, index=False)

        size = os.stat(file_path).st_size / 2
        mdf = md.read_csv(file_path, chunk_bytes=size)

        with self._raise_iloc():
            hdf = mdf.head(5)
            expected = df.head(5)
            pd.testing.assert_frame_equal(hdf.execute().fetch(), expected)

            with self.assertRaises(ValueError) as cm:
                # need iloc
                mdf.head(99).execute()

            self.assertIn('cannot run iloc', str(cm.exception))

        with self._raise_iloc():
            s = mdf.head(5).sum()
            expected = df.head(5).sum()
            pd.testing.assert_series_equal(s.execute().fetch(), expected)

        pd.testing.assert_frame_equal(
            mdf.head(99).execute().fetch().reset_index(drop=True),
            df.head(99))
def test_read_csv_without_index(setup):
    # test csv file without storing index
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = md.read_csv(file_path, incremental_index=True).execute().fetch()
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = md.read_csv(file_path, incremental_index=True,
                           chunk_bytes=10).execute().fetch()
        pd.testing.assert_frame_equal(pdf, mdf2)
def test_read_csv_gpu_execution(setup_gpu):
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        })
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = md.read_csv(file_path, gpu=True).execute().fetch()
        pd.testing.assert_frame_equal(
            pdf.reset_index(drop=True),
            mdf.to_pandas().reset_index(drop=True))

        mdf2 = md.read_csv(file_path, gpu=True,
                           chunk_bytes=200).execute().fetch()
        pd.testing.assert_frame_equal(
            pdf.reset_index(drop=True),
            mdf2.to_pandas().reset_index(drop=True))
def test_read_csv_head(prepare_data):
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)

    size = os.stat(file_path).st_size / 2
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    assert len(graph) == 1
    assert opt_df2 in graph.results

    # test multiple head
    df3 = df1.head(10)
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    assert opt_df1.op.nrows == 10
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    assert graph.predecessors(opt_df2)[0] is opt_df1
    assert opt_df2.inputs[0] is opt_df1
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None
    assert graph.predecessors(opt_df3)[0] is opt_df1
    assert opt_df3.inputs[0] is opt_df1

    # test head with successor
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.head(5)
    df3 = df2 + 1
    graph = TileableGraph([df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2.op.nrows == 5
    assert len(graph) == 2
def test_cannot_prune(gen_data1):
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    df1 = md.read_csv(file_path)
    df2 = df1.groupby('c').agg({'a': 'sum'})
    # does not support prune
    df3 = df1 + 1
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is None

    df1 = md.read_csv(file_path)
    df2 = df1.groupby('c').agg({'a': 'sum'})
    # does not support prune, another rule
    df3 = df1.head(3)
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is None

    df1 = md.read_csv(file_path)
    df2 = df1[df1.dtypes.index.tolist()]
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    # all columns selected
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is None
async def test_optimization(actor_pool):
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        pdf = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce'),
                            'd': list('abaaaddce')})
        pdf.to_csv(file_path, index=False)

        df = md.read_csv(file_path)
        df2 = df.groupby('c').agg({'a': 'sum'})
        df3 = df[['b', 'a']]

        graph = TileableGraph([df2.data, df3.data])
        next(TileableGraphBuilder(graph).build())

        task_id = await manager.submit_tileable_graph(graph)
        assert isinstance(task_id, str)

        await manager.wait_task(task_id)
        task_result: TaskResult = await manager.get_task_result(task_id)

        assert task_result.status == TaskStatus.terminated
        if task_result.error is not None:
            raise task_result.error.with_traceback(task_result.traceback)
        assert await manager.get_task_progress(task_id) == 1.0

        expect = pdf.groupby('c').agg({'a': 'sum'})
        result_tileables = (await manager.get_task_result_tileables(task_id))
        result1 = result_tileables[0]
        result = await _merge_data(result1, storage_api)
        np.testing.assert_array_equal(result, expect)

        expect = pdf[['b', 'a']]
        result2 = result_tileables[1]
        result = await _merge_data(result2, storage_api)
        np.testing.assert_array_equal(result, expect)

        # test ref counts
        assert (await lifecycle_api.get_tileable_ref_counts([df3.key]))[0] == 1
        assert (await lifecycle_api.get_chunk_ref_counts(
            [c.key for c in result_tileables[1].chunks])) == \
            [1] * len(result_tileables[1].chunks)
def testReadCSVWithoutIndex(self):
    sess = new_session()

    # test csv file without storing index
    tempdir = tempfile.mkdtemp()
    file_path = os.path.join(tempdir, 'test.csv')
    try:
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, index=False)

        pdf = pd.read_csv(file_path)
        mdf = sess.run(md.read_csv(file_path, sort_range_index=True))
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = sess.run(md.read_csv(file_path, sort_range_index=True,
                                    chunk_bytes=10))
        pd.testing.assert_frame_equal(pdf, mdf2)
    finally:
        shutil.rmtree(tempdir)
def testIterativeDependency(self, *_):
    with new_cluster(scheduler_n_process=2, worker_n_process=2,
                     shared_memory='20M', web=True):
        with tempfile.TemporaryDirectory() as d:
            file_path = os.path.join(d, 'test.csv')
            df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                              columns=['a', 'b', 'c'])
            df.to_csv(file_path)

            mdf1 = md.read_csv(file_path, index_col=0, chunk_bytes=10)
            r1 = mdf1.iloc[:3].execute()
            pd.testing.assert_frame_equal(df[:3], r1)

            mdf2 = md.read_csv(file_path, index_col=0, chunk_bytes=10)
            r2 = mdf2.iloc[:3].execute()
            pd.testing.assert_frame_equal(df[:3], r2)

            f = mdf1[mdf1.a > mdf2.a]
            r3 = f.iloc[:3].execute()
            pd.testing.assert_frame_equal(r3, df[df.a > df.a])
def test_read_csv_head(gen_data1):
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    df1 = md.read_csv(file_path)
    df2 = df1.head(5)
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())

    context = dict()
    chunk_graph_builder = ChunkGraphBuilder(graph, fuse_enabled=False,
                                            tile_context=context)
    chunk_graph = next(chunk_graph_builder.build())

    chunk1 = context[df1.data].chunks[0].data
    chunk2 = context[df2.data].chunks[0].data

    records = optimize(chunk_graph)
    assert records.get_optimization_result(chunk1) is None
    opt_chunk2 = records.get_optimization_result(chunk2)
    assert opt_chunk2.op.nrows == 5
    assert len(chunk_graph) == 1
    assert opt_chunk2 in chunk_graph.results
def test_groupby_and_getitem(gen_data1):
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    df1 = md.read_csv(file_path)
    df2 = df1.groupby('c').agg({'a': 'sum'})
    df3 = df1[['b', 'a']]
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    assert opt_df1 in graph.predecessors(opt_df2)
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None
    assert opt_df1 in graph.predecessors(opt_df3)
    assert opt_df1.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    assert df2.inputs[0] is df1.data
    assert df3.inputs[0] is df1.data
def test_groupby_read_csv(gen_data1):
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    df1 = md.read_csv(file_path)
    df2 = df1[['a', 'b']]
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())

    context = dict()
    chunk_graph_builder = ChunkGraphBuilder(graph, fuse_enabled=False,
                                            tile_context=context)
    chunk_graph = next(chunk_graph_builder.build())

    chunk1 = context[df1.data].chunks[0].data
    chunk2 = context[df2.data].chunks[0].data

    records = optimize(chunk_graph)
    opt_chunk1 = records.get_optimization_result(chunk1)
    assert opt_chunk1 is None
    opt_chunk2 = records.get_optimization_result(chunk2)
    assert opt_chunk2 is not None
    assert opt_chunk2.op.usecols == ['a', 'b']
    # original tileable should not be modified
    assert chunk2.inputs[0] is chunk1
def test_no_head(prepare_data):
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)

    size = os.stat(file_path).st_size / 2
    df1 = md.read_csv(file_path, chunk_bytes=size)
    df2 = df1.iloc[1:10]
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    assert records.get_optimization_result(df2.data) is None

    df2 = df1.head(3)
    df3 = df1 + 1
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(df1.data) is None
    assert records.get_optimization_result(df2.data) is None
    assert records.get_optimization_result(df3.data) is None
def testDistributedReadCSVHead(self):
    service_ep = 'http://127.0.0.1:' + self.web_port
    timeout = 120 if 'CI' in os.environ else -1
    with new_session(service_ep) as sess:
        rs = np.random.RandomState(0)

        # test md.read_csv().head()
        with tempfile.TemporaryDirectory() as d:
            file_path = os.path.join(d, 'test.csv')

            df = pd.DataFrame({
                'a': rs.rand(100),
                'b': [f's{i}' for i in range(100)],
            })
            df.to_csv(file_path, index=False)

            chunk_bytes = os.stat(file_path).st_size // 3 - 2
            mdf = md.read_csv(file_path, chunk_bytes=chunk_bytes)

            r = mdf.head(3)
            result = r.execute(session=sess, timeout=timeout).fetch()
            expected = df.head(3)
            pd.testing.assert_frame_equal(result, expected)
def process_error_data_01(data):
    # drop rows whose ONTIME does not equal WORKTIME + STOPTIME
    for index, row in data.iterrows():
        if row['ONTIME'] != row['WORKTIME'] + row['STOPTIME']:
            # print(index, row['MACHINE_ID'], row['PDLINE_ID'])
            data.drop(index=index, axis=0, inplace=True)
    return data


# https://blog.csdn.net/qq_18254385/article/details/90401181
if __name__ == '__main__':  # the script errors without this guard
    original_data = md.read_csv(
        'F:/Projects/Python/data_process/csv/splited_data/data_tianzheng_assembly_2.csv'
    ).to_pandas()
    original_data.index = pd.DatetimeIndex(original_data["UPDATE_DATE"])
    data_len = original_data.shape[0]
    print(data_len)

    data_1 = original_data[:(data_len * 1) // 8]
    data_2 = original_data[(data_len * 1) // 8:(data_len * 2) // 8]
    data_3 = original_data[(data_len * 2) // 8:(data_len * 3) // 8]
    data_4 = original_data[(data_len * 3) // 8:(data_len * 4) // 8]
    data_5 = original_data[(data_len * 4) // 8:(data_len * 5) // 8]
    data_6 = original_data[(data_len * 5) // 8:(data_len * 6) // 8]
    data_7 = original_data[(data_len * 6) // 8:(data_len * 7) // 8]
    data_8 = original_data[(data_len * 7) // 8:]
    data_list = [data_1, data_2, data_3, data_4,
                 data_5, data_6, data_7, data_8]
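# --- Illustrative sketch (not from the original script) ---
# Dropping rows inside an iterrows() loop is slow and mutates the frame
# while iterating. A hedged, vectorized alternative that keeps only the
# consistent rows, assuming the same column names as above:
def process_error_data_vectorized(data):
    # boolean mask selects rows where ONTIME == WORKTIME + STOPTIME
    return data[data['ONTIME'] == data['WORKTIME'] + data['STOPTIME']]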
def test_groupby_read_csv(gen_data1):
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    df1 = md.read_csv(file_path)
    df2 = df1.groupby('c').agg({'a': 'sum'})
    df3 = df2 + 1
    graph = TileableGraph([df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is None
    assert opt_df1 in graph.predecessors(opt_df2)
    assert opt_df1 in opt_df2.inputs
    assert opt_df1.op.usecols == ['a', 'c']
    assert opt_df2 in graph.predecessors(df3.data)
    assert opt_df2 in df3.inputs

    df4 = md.read_csv(file_path, usecols=['a', 'b', 'c'])
    df5 = df4.groupby('c').agg({'b': 'sum'})
    graph = TileableGraph([df5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df4 = records.get_optimization_result(df4.data)
    assert opt_df4 is not None
    opt_df5 = records.get_optimization_result(df5.data)
    assert opt_df5 is not None
    assert opt_df4.op.usecols == ['b', 'c']

    df6 = md.read_csv(file_path)
    df7 = df6.groupby('c').agg({'b': 'sum'})
    df8 = df6.groupby('b').agg({'a': 'sum'})
    graph = TileableGraph([df7.data, df8.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df6 = records.get_optimization_result(df6.data)
    assert opt_df6 is not None
    opt_df7 = records.get_optimization_result(df7.data)
    assert opt_df7 is not None
    opt_df8 = records.get_optimization_result(df8.data)
    assert opt_df8 is not None
    assert opt_df6.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    assert df7.inputs[0] is df6.data
    assert df8.inputs[0] is df6.data

    # test data source in result tileables
    graph = TileableGraph([df6.data, df7.data, df8.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    opt_df6 = records.get_optimization_result(df6.data)
    assert opt_df6 is None
    opt_df7 = records.get_optimization_result(df7.data)
    assert opt_df7 is None
    opt_df8 = records.get_optimization_result(df8.data)
    assert opt_df8 is None
def testReadCSVExecution(self):
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                   dtype=np.int64),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        r = md.read_csv(file_path, index_col=0)
        mdf = self.executor.execute_dataframe(r, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        size_res = self.executor.execute_dataframe(r, mock=True)
        self.assertEqual(sum(s[0] for s in size_res),
                         os.stat(file_path).st_size)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, chunk_bytes=10),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, nrows=1), concat=True)[0]
        pd.testing.assert_frame_equal(df[:1], mdf)

    # test names and usecols
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                   dtype=np.int64),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, index=False)

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, usecols=['c', 'b']), concat=True)[0]
        pd.testing.assert_frame_equal(
            pd.read_csv(file_path, usecols=['c', 'b']), mdf)

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, names=['a', 'b', 'c'], usecols=['c', 'b']),
            concat=True)[0]
        pd.testing.assert_frame_equal(
            pd.read_csv(file_path, names=['a', 'b', 'c'], usecols=['c', 'b']),
            mdf)

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, names=['a', 'b', 'c'], usecols=['a', 'c']),
            concat=True)[0]
        pd.testing.assert_frame_equal(
            pd.read_csv(file_path, names=['a', 'b', 'c'], usecols=['a', 'c']),
            mdf)

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, usecols=['a', 'c']), concat=True)[0]
        pd.testing.assert_frame_equal(
            pd.read_csv(file_path, usecols=['a', 'c']), mdf)

    # test sep
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, sep=';')

        pdf = pd.read_csv(file_path, sep=';', index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, sep=';', index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, sep=';', index_col=0, chunk_bytes=10),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    # test missing value
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame({'c1': [np.nan, 'a', 'b', 'c'],
                           'c2': [1, 2, 3, np.nan],
                           'c3': [np.nan, np.nan, 3.4, 2.2]})
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, chunk_bytes=12),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        index = pd.date_range(start='1/1/2018', periods=100)
        df = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        }, index=index)
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, chunk_bytes=100),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    # test nan
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame({
            'col1': np.random.rand(100,),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        })
        df.iloc[20:, :] = pd.NA
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        mdf = md.read_csv(file_path, index_col=0, head_lines=10,
                          chunk_bytes=200)
        result = self.executor.execute_dataframe(mdf, concat=True)[0]
        pd.testing.assert_frame_equal(pdf, result)

        # dtypes is inferred as expected
        pd.testing.assert_series_equal(
            mdf.dtypes,
            pd.Series(['float64', 'object', 'int64'], index=df.columns))

    # test compression
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.gzip')

        index = pd.date_range(start='1/1/2018', periods=100)
        df = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        }, index=index)
        df.to_csv(file_path, compression='gzip')

        pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, compression='gzip', index_col=0),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, compression='gzip', index_col=0,
                        chunk_bytes='1k'), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    # test multiple files
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

        file_paths = [os.path.join(tempdir, f'test{i}.csv') for i in range(3)]
        df[:100].to_csv(file_paths[0])
        df[100:200].to_csv(file_paths[1])
        df[200:].to_csv(file_paths[2])

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_paths, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_paths, index_col=0, chunk_bytes=50),
            concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf2)

    # test wildcards in path
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

        file_paths = [os.path.join(tempdir, f'test{i}.csv') for i in range(3)]
        df[:100].to_csv(file_paths[0])
        df[100:200].to_csv(file_paths[1])
        df[200:].to_csv(file_paths[2])

        # As we cannot guarantee the order in which these files are processed,
        # the result may not keep the original order.
        mdf = self.executor.execute_dataframe(
            md.read_csv(f'{tempdir}/*.csv', index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf.sort_index())

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(f'{tempdir}/*.csv', index_col=0, chunk_bytes=50),
            concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf2.sort_index())

    # test read directory
    with tempfile.TemporaryDirectory() as tempdir:
        testdir = os.path.join(tempdir, 'test_dir')
        os.makedirs(testdir, exist_ok=True)

        df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

        file_paths = [os.path.join(testdir, f'test{i}.csv') for i in range(3)]
        df[:100].to_csv(file_paths[0])
        df[100:200].to_csv(file_paths[1])
        df[200:].to_csv(file_paths[2])

        # As we cannot guarantee the order in which these files are processed,
        # the result may not keep the original order.
        mdf = self.executor.execute_dataframe(
            md.read_csv(testdir, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf.sort_index())

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(testdir, index_col=0, chunk_bytes=50),
            concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf2.sort_index())
def testOptimizedHeadTail(self):
    import sqlalchemy as sa

    with tempfile.TemporaryDirectory() as tempdir:
        executor = ExecutorForTest(storage=self.executor.storage)

        filename = os.path.join(tempdir, 'test_head.csv')
        rs = np.random.RandomState(0)
        pd_df = pd.DataFrame({
            'a': rs.randint(1000, size=(100,)).astype(np.int64),
            'b': rs.randint(1000, size=(100,)).astype(np.int64),
            'c': ['sss' for _ in range(100)],
            'd': ['eeee' for _ in range(100)]
        })
        pd_df.to_csv(filename, index=False)

        size = os.path.getsize(filename)
        chunk_bytes = size / 3
        df = md.read_csv(filename, chunk_bytes=chunk_bytes)

        # test DataFrame.head
        r = df.head(3)
        with self._inject_execute_data_source(3, DataFrameReadCSV):
            result = executor.execute_tileables([r])[0]
            expected = pd_df.head(3)
            pd.testing.assert_frame_equal(result, expected)

        # test DataFrame.tail
        r = df.tail(3)
        result = executor.execute_tileables([r])[0]
        expected = pd_df.tail(3)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        # test head more than 1 chunk
        r = df.head(99)
        result = executor.execute_tileables([r])[0]
        result.reset_index(drop=True, inplace=True)
        expected = pd_df.head(99)
        pd.testing.assert_frame_equal(result, expected)

        # test Series.tail more than 1 chunk
        r = df.tail(99)
        result = executor.execute_tileables([r])[0]
        expected = pd_df.tail(99)
        pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                      expected.reset_index(drop=True))

        filename = os.path.join(tempdir, 'test_sql.db')
        conn = sa.create_engine('sqlite:///' + filename)
        pd_df.to_sql('test_sql', conn)

        df = md.read_sql('test_sql', conn, index_col='index', chunk_size=20)

        # test DataFrame.head
        r = df.head(3)
        with self._inject_execute_data_source(3, DataFrameReadSQL):
            result = executor.execute_tileables([r])[0]
            result.index.name = None
            expected = pd_df.head(3)
            pd.testing.assert_frame_equal(result, expected)
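# --- Illustrative sketch (not one of the original tests) ---
# The head optimization exercised above in miniature: with a small `nrows`
# pushed into read_csv, chunks beyond the first need not be fully scanned.
# The file name and sizes are illustrative only.
import os
import tempfile

import pandas as pd
import mars.dataframe as md

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'head_demo.csv')
    pd.DataFrame({'x': range(1000)}).to_csv(path, index=False)
    chunk_bytes = os.path.getsize(path) // 4  # force several chunks
    df = md.read_csv(path, chunk_bytes=chunk_bytes)
    # head(3) can be served from the first chunk alone
    print(df.head(3).execute().fetch())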
def testReadCSVExecution(self):
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                                   dtype=np.int64),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, chunk_bytes=10),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, nrows=1), concat=True)[0]
        pd.testing.assert_frame_equal(df[:1], mdf)

    # test sep
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                          columns=['a', 'b', 'c'])
        df.to_csv(file_path, sep=';')

        pdf = pd.read_csv(file_path, sep=';', index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, sep=';', index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, sep=';', index_col=0, chunk_bytes=10),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    # test missing value
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')
        df = pd.DataFrame({
            'c1': [np.nan, 'a', 'b', 'c'],
            'c2': [1, 2, 3, np.nan],
            'c3': [np.nan, np.nan, 3.4, 2.2]
        })
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, chunk_bytes=12),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        index = pd.date_range(start='1/1/2018', periods=100)
        df = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        }, index=index)
        df.to_csv(file_path)

        pdf = pd.read_csv(file_path, index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, index_col=0, chunk_bytes=100),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    # test compression
    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.gzip')

        index = pd.date_range(start='1/1/2018', periods=100)
        df = pd.DataFrame({
            'col1': np.random.rand(100),
            'col2': np.random.choice(['a', 'b', 'c'], (100,)),
            'col3': np.arange(100)
        }, index=index)
        df.to_csv(file_path, compression='gzip')

        pdf = pd.read_csv(file_path, compression='gzip', index_col=0)
        mdf = self.executor.execute_dataframe(
            md.read_csv(file_path, compression='gzip', index_col=0),
            concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_path, compression='gzip', index_col=0,
                        chunk_bytes='1k'), concat=True)[0]
        pd.testing.assert_frame_equal(pdf, mdf2)

    # test multiple files
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

        file_paths = [os.path.join(tempdir, f'test{i}.csv') for i in range(3)]
        df[:100].to_csv(file_paths[0])
        df[100:200].to_csv(file_paths[1])
        df[200:].to_csv(file_paths[2])

        mdf = self.executor.execute_dataframe(
            md.read_csv(file_paths, index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf)

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(file_paths, index_col=0, chunk_bytes=50),
            concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf2)

    # test wildcards in path
    with tempfile.TemporaryDirectory() as tempdir:
        df = pd.DataFrame(np.random.rand(300, 3), columns=['a', 'b', 'c'])

        file_paths = [os.path.join(tempdir, f'test{i}.csv') for i in range(3)]
        df[:100].to_csv(file_paths[0])
        df[100:200].to_csv(file_paths[1])
        df[200:].to_csv(file_paths[2])

        # As we cannot guarantee the order in which these files are processed,
        # the result may not keep the original order.
        mdf = self.executor.execute_dataframe(
            md.read_csv(f'{tempdir}/*.csv', index_col=0), concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf.sort_index())

        mdf2 = self.executor.execute_dataframe(
            md.read_csv(f'{tempdir}/*.csv', index_col=0, chunk_bytes=50),
            concat=True)[0]
        pd.testing.assert_frame_equal(df, mdf2.sort_index())