import os

import pandas as pd
from fastparquet import ParquetFile, write, writer
from fastparquet.util import default_mkdirs


def test_merge(tempdir, dirs, row_groups):
    fn = str(tempdir)

    # Write two small files, each split into row groups at the given offsets.
    default_mkdirs(os.path.join(fn, dirs[0]))
    df0 = pd.DataFrame({'a': [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, dirs[0], 'out0.parq'])
    write(fn0, df0, row_group_offsets=row_groups)

    default_mkdirs(os.path.join(fn, dirs[1]))
    df1 = pd.DataFrame({'a': [5, 6, 7, 8]})
    fn1 = os.sep.join([fn, dirs[1], 'out1.parq'])
    write(fn1, df1, row_group_offsets=row_groups)

    # Merge by file name: the result must expose every row group from both
    # inputs and round-trip the concatenated data in order.
    pf = writer.merge([fn0, fn1])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        # Hive-style directory names become a categorical partition column.
        assert 'cat' in pf.cats

    # Merge by ParquetFile instance: behaviour must match the name-based path.
    pf = writer.merge([ParquetFile(fn0), ParquetFile(fn1)])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert 'cat' in pf.cats
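# The arguments above come from pytest: `tempdir` is a per-test temporary
# directory fixture, while `dirs` and `row_groups` are parametrized. A
# plausible parametrization (the exact values are an assumption) that
# exercises both a flat layout and hive-style "cat=..." partition
# directories, with one or two row groups per file, would be:
#
#     @pytest.mark.parametrize('row_groups', ([0], [0, 2]))
#     @pytest.mark.parametrize('dirs', (['', ''], ['cat=1', 'cat=2']))
#     def test_merge(tempdir, dirs, row_groups):
#         ...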
import os

import fastparquet

# `settings` (project configuration) and `default_mkdirs` are assumed to be
# provided elsewhere in the xpark package; their import paths are not shown
# in the original snippet.


def write_parquet(fname, chunk, **kwargs):
    from xpark.plan.dataframe.results import Result

    # A chunk may arrive wrapped in a Result; unwrap it to the raw data.
    if isinstance(chunk, Result):
        chunk = chunk.data
    # Make sure the destination directory exists before fastparquet writes.
    default_mkdirs(os.path.dirname(fname))
    fastparquet.write(fname, chunk,
                      compression=settings.PARQUET_COMPRESSION, **kwargs)
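# Usage sketch (hypothetical path and data): write one chunk of output,
# split into two row groups via fastparquet's row_group_offsets keyword,
# which is forwarded through **kwargs. PARQUET_COMPRESSION would come from
# the project's settings module, e.g. 'SNAPPY' or None.
import pandas as pd

df = pd.DataFrame({'x': range(10), 'y': list('abcdefghij')})
write_parquet('/tmp/xpark-demo/part-0.parq', df, row_group_offsets=[0, 5])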