def test_read_parquet_execution(setup, setup_hdfs):
    """Write parquet data to HDFS and read it back with ``md.read_parquet``.

    Two cases are checked: a single parquet file, and a directory that
    holds two parquet files whose contents are expected back concatenated.
    """
    fs = setup_hdfs
    hdfs_prefix = 'hdfs://localhost:8020'

    first_raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })
    second_raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })

    # Case 1: one parquet file written with small row groups.
    single_path = f"{TEST_DIR}/test.parquet"
    with fs.open(single_path, "wb", replication=1) as sink:
        first_raw.to_parquet(sink, row_group_size=3)

    fetched = md.read_parquet(f'{hdfs_prefix}{single_path}').to_pandas()
    pd.testing.assert_frame_equal(fetched, first_raw)

    # Case 2: a directory containing two parquet files.
    part_dir = f"{TEST_DIR}/test_partitioned"
    fs.mkdir(part_dir)
    written = (('file1.parquet', first_raw), ('file2.parquet', second_raw))
    for file_name, frame in written:
        with fs.open(f"{part_dir}/{file_name}", "wb", replication=1) as sink:
            frame.to_parquet(sink, row_group_size=3)

    fetched = md.read_parquet(f'{hdfs_prefix}{part_dir}').to_pandas()
    pd.testing.assert_frame_equal(fetched, pd.concat([first_raw, second_raw]))
def test_local_classifier_from_to_parquet(setup):
    """Predict from parquet input, write the result to parquet, and compare.

    A fitted ``lightgbm.LGBMClassifier`` is loaded into ``LGBMClassifier``;
    its prediction over data read from a parquet directory is written to
    another parquet directory and checked against the local prediction.
    """
    n_rows, n_columns = 1000, 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = (rs.rand(n_rows) > 0.5).astype(np.int32)
    column_names = [f'c{i}' for i in range(n_columns)]
    raw = pd.DataFrame(X, columns=column_names)

    # test with existing model
    local_model = lightgbm.LGBMClassifier(n_estimators=2)
    local_model.fit(X, y, verbose=True)

    with tempfile.TemporaryDirectory() as d:
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        # Split the input into two files inside the data directory.
        raw.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
        raw.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

        mdf = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(local_model)
        prediction = model.predict(mdf, run=False)
        md.DataFrame(prediction).to_parquet(result_dir).execute()

        stored = md.read_parquet(result_dir).to_pandas()
        ret = stored.iloc[:, 0].to_numpy()

        expected = local_model.predict(X)
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
def testReadParquetArrow(self):
    """Exercise ``md.read_parquet`` on local files through the test executor.

    Covers a plain read with a mocked size estimate, row groups as chunks
    with a column subset, arrow dtypes with an incremental index, and
    wildcard paths.
    """
    raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })

    # Plain read, then a mocked execution to compare the size estimate.
    with tempfile.TemporaryDirectory() as tempdir:
        target = os.path.join(tempdir, 'test.csv')
        raw.to_parquet(target)

        mdf = md.read_parquet(target)
        fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
        pd.testing.assert_frame_equal(fetched, raw)

        sizes = self.executor.execute_dataframe(mdf, mock=True)
        estimated = sum(s[0] for s in sizes)
        self.assertGreater(estimated, raw.memory_usage(deep=True).sum())

    # Row groups as chunks, restricted to two columns.
    with tempfile.TemporaryDirectory() as tempdir:
        target = os.path.join(tempdir, 'test.csv')
        raw.to_parquet(target, row_group_size=3)

        mdf = md.read_parquet(target, groups_as_chunks=True,
                              columns=['a', 'b'])
        fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
        pd.testing.assert_frame_equal(fetched.reset_index(drop=True),
                                      raw[['a', 'b']])

    # Arrow dtypes together with an incremental index.
    with tempfile.TemporaryDirectory() as tempdir:
        target = os.path.join(tempdir, 'test.csv')
        raw.to_parquet(target, row_group_size=5)

        mdf = md.read_parquet(target, groups_as_chunks=True,
                              use_arrow_dtype=True,
                              incremental_index=True)
        fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
        self.assertIsInstance(mdf.dtypes.iloc[1], md.ArrowStringDtype)
        self.assertIsInstance(fetched.dtypes.iloc[1], md.ArrowStringDtype)
        pd.testing.assert_frame_equal(arrow_array_to_objects(fetched), raw)

    # test wildcards in path
    with tempfile.TemporaryDirectory() as tempdir:
        big = pd.DataFrame({
            'a': np.arange(300).astype(np.int64, copy=False),
            'b': [f's{i}' for i in range(300)],
            'c': np.random.rand(300),
        })

        paths = [os.path.join(tempdir, f'test{i}.parquet') for i in range(3)]
        big[:100].to_parquet(paths[0], row_group_size=50)
        big[100:200].to_parquet(paths[1], row_group_size=30)
        big[200:].to_parquet(paths[2])

        pattern = f'{tempdir}/*.parquet'
        # First with default options, then with row groups as chunks.
        for read_kwargs in ({}, {'groups_as_chunks': True}):
            mdf = md.read_parquet(pattern, **read_kwargs)
            fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
            pd.testing.assert_frame_equal(
                big, fetched.sort_values('a').reset_index(drop=True))
def test_getitem_prune_read_parquet(gen_data1):
    """Check the optimized graph when two getitems share one parquet read.

    ``df1.c`` and ``df1[['a']]`` are built on the same ``md.read_parquet``
    tileable. After ``optimize`` the test asserts that the optimized read
    feeds both optimized getitems, that its operand ``columns`` equals
    ``['a', 'c']``, and that the original tileables keep their inputs.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    df1 = md.read_parquet(file_path)
    df2 = df1.c
    df3 = df1[['a']]

    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    # Every tileable involved must have an optimization result.
    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None

    # The optimized read is wired to both consumers, in the graph and in
    # their inputs. (The original repeated the df3 pair a second time; the
    # redundant copy has been dropped.)
    assert opt_df1 in graph.predecessors(opt_df2)
    assert opt_df1 in opt_df2.inputs
    assert opt_df1 in graph.predecessors(opt_df3)
    assert opt_df1 in opt_df3.inputs

    assert opt_df1.op.columns == ['a', 'c']

    # original tileable should not be modified
    assert df2.inputs[0] is df1.data
    assert df3.inputs[0] is df1.data
def test_groupby_prune_read_parquet(gen_data1):
    """Check the read operand's ``columns`` after optimizing groupby graphs.

    Two groupby expressions over the same ``md.read_parquet`` tileable are
    optimized one after the other; the optimized read is expected to carry
    ``['a', 'c']`` for the first and ``['c']`` for the second.
    """
    pdf, tempdir = gen_data1
    parquet_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(parquet_path)

    df1 = md.read_parquet(parquet_path)

    # groupby + agg over a different column
    df2 = df1.groupby('c').agg({'a': 'sum'})
    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    optimized_read = records.get_optimization_result(df1.data)
    assert optimized_read is not None
    optimized_agg = records.get_optimization_result(df2.data)
    assert optimized_agg is not None
    assert optimized_read.op.columns == ['a', 'c']
    # original tileable should not be modified
    assert df2.inputs[0] is df1.data

    # groupby with as_index=False, aggregating the key column itself
    df3 = df1.groupby('c', as_index=False).c.agg({'cnt': 'count'})
    graph = TileableGraph([df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    optimized_read = records.get_optimization_result(df1.data)
    assert optimized_read is not None
    optimized_count = records.get_optimization_result(df3.data)
    assert optimized_count is not None
    assert optimized_read.op.columns == ['c']
def testReadParquetHead(self):
    """Check ``head`` on a frame read from a directory of parquet files.

    Inside ``self._raise_iloc()``, ``head(5)`` must still execute while
    ``head(99)`` must raise ``ValueError`` mentioning 'cannot run iloc'.
    Outside that context ``head(99)`` is compared with pandas.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        raw = self.df
        parquet_dir = os.path.join(tempdir, 'test_parquet')
        os.makedirs(parquet_dir)

        # Three files of 40 rows each.
        for i, start in enumerate(range(0, 120, 40)):
            target = os.path.join(parquet_dir, f'test{i}.parquet')
            raw[start:start + 40].to_parquet(target, index=False)

        mdf = md.read_parquet(parquet_dir)

        with self._raise_iloc():
            small_head = mdf.head(5)
            expected = raw.head(5)
            pd.testing.assert_frame_equal(small_head.execute().fetch(),
                                          expected)

            with self.assertRaises(ValueError) as cm:
                # need iloc
                mdf.head(99).execute()

            self.assertIn('cannot run iloc', str(cm.exception))

        large_head = mdf.head(99).execute().fetch()
        pd.testing.assert_frame_equal(large_head.reset_index(drop=True),
                                      raw.head(99))
def testGroupbyPruneReadParquet(self):
    """Execute groupby/getitem over parquet and inspect the optimized read.

    Each scenario compares the executed result with pandas, then looks up
    the optimized tileable in ``tileable_optimized`` and asserts the
    ``columns`` of the read operand feeding it.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        parquet_path = os.path.join(tempdir, 'test.parquet')

        raw = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce'),
        })
        raw.to_parquet(parquet_path, index=False)

        # Use test executor
        summed = md.read_parquet(parquet_path).groupby('c').agg({'a': 'sum'})
        result = self.executor.execute_dataframes([summed])[0]
        summed._shape = result.shape
        pd.testing.assert_frame_equal(
            result, raw.groupby('c').agg({'a': 'sum'}))

        optimized = tileable_optimized[summed.data]
        self.assertEqual(optimized.inputs[0].op.columns, ['a', 'c'])

        counted = md.read_parquet(parquet_path).groupby(
            'c', as_index=False).c.agg({'cnt': 'count'})
        result = self.executor.execute_dataframes([counted])[0]
        counted._shape = result.shape
        pd.testing.assert_frame_equal(
            result,
            raw.groupby('c', as_index=False).c.agg({'cnt': 'count'}))

        optimized = tileable_optimized[counted.data]
        self.assertEqual(optimized.inputs[0].op.columns, ['c'])

        # test getitem
        mdf = md.read_parquet(parquet_path)
        value_counts = mdf.c.value_counts()
        group_counts = mdf.groupby('b')['b'].count()
        fetched = self.executor.execute_dataframes(
            [value_counts, group_counts])
        value_counts._shape = fetched[0].shape
        group_counts._shape = fetched[1].shape

        expected_value_counts = raw.c.value_counts()
        expected_group_counts = raw.groupby('b')['b'].count()
        pd.testing.assert_series_equal(fetched[0], expected_value_counts)
        pd.testing.assert_series_equal(fetched[1], expected_group_counts)

        optimized = tileable_optimized[value_counts.data]
        self.assertEqual(optimized.inputs[0].inputs[0].op.columns,
                         ['b', 'c'])
def testToParquetExecution(self):
    """Write a chunked frame to HDFS parquet and read it back.

    Runs once with a directory target and once with a ``*.parquet``
    wildcard target; both results are read back from the directory.
    """
    raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })
    mdf = md.DataFrame(raw, chunk_size=5)

    # Directory target.
    target = f'hdfs://localhost:8020{TEST_DIR}/test_to_parquet/'
    self.hdfs.mkdir(target)
    mdf.to_parquet(target).execute()

    fetched = md.read_parquet(target).to_pandas()
    pd.testing.assert_frame_equal(fetched.reset_index(drop=True), raw)

    # test wildcard
    target = f'hdfs://localhost:8020{TEST_DIR}/test_to_parquet2/*.parquet'
    parent_dir = target.rsplit('/', 1)[0]
    self.hdfs.mkdir(parent_dir)
    mdf.to_parquet(target).execute()

    fetched = md.read_parquet(parent_dir).to_pandas()
    pd.testing.assert_frame_equal(fetched.reset_index(drop=True), raw)
def testLocalClassifierFromToParquet(self):
    """Predict with a saved xgboost model from parquet input to parquet output.

    A booster is trained and saved; ``XGBClassifier`` loads it and predicts
    over data read from a parquet directory. The tiled graph of the
    ``to_parquet`` result is inspected before execution, and the stored
    prediction is compared with ``xgboost.XGBClassifier`` loaded from the
    same model file.
    """
    n_rows, n_columns = 1000, 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = rs.rand(n_rows)
    raw = pd.DataFrame(X, columns=[f'c{i}' for i in range(n_columns)])
    raw['id'] = [f'i{i}' for i in range(n_rows)]

    booster = xgboost.train({}, xgboost.DMatrix(X, y), num_boost_round=2)

    with tempfile.TemporaryDirectory() as d:
        model_path = os.path.join(d, 'c.model')
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        booster.save_model(model_path)

        # Split the input into two files inside the data directory.
        raw.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
        raw.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

        mdf = md.read_parquet(data_dir).set_index('id')
        model = XGBClassifier()
        model.load_model(model_path)
        prediction = model.predict(mdf, run=False)
        r = md.DataFrame(prediction).to_parquet(result_dir)

        # tiles to ensure no iterative tiling exists
        g = r.build_graph(tiled=True)
        self.assertTrue(all(isinstance(n.op, Fuse) for n in g))
        self.assertEqual(len(g), 2)
        r.execute()

        stored = md.read_parquet(result_dir).to_pandas()
        ret = stored.iloc[:, 0].to_numpy()

        reference = xgboost.XGBClassifier()
        reference.load_model(model_path)
        expected = reference.predict(X)
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
def testToParquetArrowExecution(self):
    """Write a chunked frame to local parquet and read it back.

    Covers a wildcard output path (read back twice), writing a frame that
    was itself read from parquet back to the same path, and
    ``partition_cols``.
    """
    raw = pd.DataFrame({
        'col1': np.random.rand(100),
        'col2': np.arange(100),
        'col3': np.random.choice(['a', 'b', 'c'], (100, )),
    })
    df = DataFrame(raw, chunk_size=33)

    with tempfile.TemporaryDirectory() as base_path:
        # DATAFRAME TESTS
        path = os.path.join(base_path, 'out-*.parquet')
        self.executor.execute_dataframe(df.to_parquet(path))

        # The read-and-compare step is performed twice on the same path.
        for _ in range(2):
            read_df = md.read_parquet(path)
            fetched = self.executor.execute_dataframe(
                read_df, concat=True)[0]
            pd.testing.assert_frame_equal(fetched.sort_index(), raw)

        # test read_parquet then to_parquet
        rewritten = md.read_parquet(path).to_parquet(path)
        self.executor.execute_dataframes([rewritten])

        # test partition_cols
        path = os.path.join(base_path, 'out-partitioned')
        self.executor.execute_dataframe(
            df.to_parquet(path, partition_cols=['col3']))

        read_df = md.read_parquet(path)
        fetched = self.executor.execute_dataframe(read_df, concat=True)[0]
        # Cast the partition column to object before comparing with raw.
        fetched['col3'] = fetched['col3'].astype('object')
        pd.testing.assert_frame_equal(
            fetched.sort_values('col1').reset_index(drop=True),
            raw.sort_values('col1').reset_index(drop=True))
def test_read_parquet_fast_parquet(setup):
    """Read an uncompressed parquet file with ``engine='fastparquet'``."""
    raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })

    # test fastparquet engine
    with tempfile.TemporaryDirectory() as tempdir:
        target = os.path.join(tempdir, 'test.csv')
        raw.to_parquet(target, compression=None)

        mdf = md.read_parquet(target, engine='fastparquet')
        fetched = mdf.execute().fetch()
        pd.testing.assert_frame_equal(fetched, raw)
def testLocalClassifierFromToParquet(self):
    """Predict with a fitted lightgbm model from parquet input to parquet output.

    ``LGBMClassifier`` loads a fitted ``lightgbm.LGBMClassifier`` and
    predicts over data read from a parquet directory. The tiled graph of
    the ``to_parquet`` result is inspected before execution, and the stored
    prediction is compared with the local model's prediction.
    """
    n_rows, n_columns = 1000, 10
    rs = np.random.RandomState(0)
    X = rs.rand(n_rows, n_columns)
    y = (rs.rand(n_rows) > 0.5).astype(np.int32)
    column_names = [f'c{i}' for i in range(n_columns)]
    raw = pd.DataFrame(X, columns=column_names)

    # test with existing model
    local_model = lightgbm.LGBMClassifier(n_estimators=2)
    local_model.fit(X, y, verbose=True)

    with tempfile.TemporaryDirectory() as d:
        result_dir = os.path.join(d, 'result')
        os.mkdir(result_dir)
        data_dir = os.path.join(d, 'data')
        os.mkdir(data_dir)

        # Split the input into two files inside the data directory.
        raw.iloc[:500].to_parquet(os.path.join(data_dir, 'data1.parquet'))
        raw.iloc[500:].to_parquet(os.path.join(data_dir, 'data2.parquet'))

        mdf = md.read_parquet(data_dir)
        model = LGBMClassifier()
        model.load_model(local_model)
        prediction = model.predict(mdf, run=False)
        r = md.DataFrame(prediction).to_parquet(result_dir)

        # tiles to ensure no iterative tiling exists
        g = r.build_graph(tiled=True)
        self.assertTrue(all(isinstance(n.op, Fuse) for n in g))
        self.assertEqual(len(g), 2)
        r.execute()

        stored = md.read_parquet(result_dir).to_pandas()
        ret = stored.iloc[:, 0].to_numpy()

        expected = local_model.predict(X)
        expected = np.stack([1 - expected, expected]).argmax(axis=0)
        np.testing.assert_array_equal(ret, expected)
def testReadParquetFastParquet(self):
    """Read an uncompressed parquet file with ``engine='fastparquet'``.

    The executed result is compared with the source frame, then a mocked
    execution's summed size estimate is compared with the frame's deep
    memory usage.
    """
    raw = pd.DataFrame({
        'a': np.arange(10).astype(np.int64, copy=False),
        'b': [f's{i}' for i in range(10)],
        'c': np.random.rand(10),
    })

    # test fastparquet engine
    with tempfile.TemporaryDirectory() as tempdir:
        target = os.path.join(tempdir, 'test.csv')
        raw.to_parquet(target, compression=None)

        mdf = md.read_parquet(target, engine='fastparquet')
        fetched = self.executor.execute_dataframe(mdf, concat=True)[0]
        pd.testing.assert_frame_equal(fetched, raw)

        sizes = self.executor.execute_dataframe(mdf, mock=True)
        estimated = sum(s[0] for s in sizes)
        self.assertGreater(estimated, raw.memory_usage(deep=True).sum())
def test_read_parquet_head(prepare_data):
    """Check the optimized graph for ``head(5)`` over a parquet directory.

    After ``optimize`` the read tileable has no optimization result, the
    head tileable's optimized operand has ``nrows == 5``, and the graph
    holds a single node that is one of its results.
    """
    tempdir, pdf = prepare_data
    parquet_dir = os.path.join(tempdir, 'test_parquet')
    os.makedirs(parquet_dir)

    # Three files of 40 rows each.
    for i, start in enumerate(range(0, 120, 40)):
        target = os.path.join(parquet_dir, f'test{i}.parquet')
        pdf[start:start + 40].to_parquet(target, index=False)

    df1 = md.read_parquet(parquet_dir)
    df2 = df1.head(5)

    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(df1.data) is None
    optimized_head = records.get_optimization_result(df2.data)
    assert optimized_head.op.nrows == 5
    assert len(graph) == 1
    assert optimized_head in graph.results
def test_read_parquet_head(prepare_data, setup):
    """Check the optimized graph and the executed result of ``head(5)``.

    After ``optimize`` the read tileable has no optimization result, the
    head tileable's optimized operand has ``nrows == 5``, and the graph
    holds a single node that is one of its results. The head is then
    executed with ``_iloc_operand_executors`` passed as the
    ``operand_executors`` extra config and compared with pandas.
    """
    tempdir, pdf = prepare_data
    parquet_dir = os.path.join(tempdir, 'test_parquet')
    os.makedirs(parquet_dir)

    # Three files of 40 rows each.
    for i, start in enumerate(range(0, 120, 40)):
        target = os.path.join(parquet_dir, f'test{i}.parquet')
        pdf[start:start + 40].to_parquet(target, index=False)

    df1 = md.read_parquet(parquet_dir)
    df2 = df1.head(5)

    graph = TileableGraph([df2.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(df1.data) is None
    optimized_head = records.get_optimization_result(df2.data)
    assert optimized_head.op.nrows == 5
    assert len(graph) == 1
    assert optimized_head in graph.results

    extra_config = {'operand_executors': _iloc_operand_executors}
    fetched = df2.execute(extra_config=extra_config).fetch()
    pd.testing.assert_frame_equal(fetched, pdf.head(5))