Example #1

A parametrized test, apparently from fastparquet's test suite: it writes two small data frames into separate subdirectories created with default_mkdirs, then checks that writer.merge combines the resulting files, whether given file names or ParquetFile instances.

import os

import pandas as pd

from fastparquet import ParquetFile, write, writer
from fastparquet.util import default_mkdirs


def test_merge(tempdir, dirs, row_groups):
    # tempdir is a pytest fixture; dirs and row_groups are test parameters.
    fn = str(tempdir)

    # write the first half of the data into its own subdirectory
    default_mkdirs(os.path.join(fn, dirs[0]))
    df0 = pd.DataFrame({'a': [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, dirs[0], 'out0.parq'])
    write(fn0, df0, row_group_offsets=row_groups)

    # write the second half into a second subdirectory
    default_mkdirs(os.path.join(fn, dirs[1]))
    df1 = pd.DataFrame({'a': [5, 6, 7, 8]})
    fn1 = os.sep.join([fn, dirs[1], 'out1.parq'])
    write(fn1, df1, row_group_offsets=row_groups)

    # merge given file names
    pf = writer.merge([fn0, fn1])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert 'cat' in pf.cats

    # merge given ParquetFile instances
    pf = writer.merge([ParquetFile(fn0), ParquetFile(fn1)])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert 'cat' in pf.cats
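
For context, test_merge does not run standalone: tempdir is a pytest fixture assumed to be provided by the suite's test utilities, and dirs/row_groups arrive via parametrization. A minimal sketch of how it might be driven, with hypothetical parameter values (the real suite supplies its own):

import pytest

# Hypothetical values: one hive-partitioned layout and one plain layout,
# with either one row group ([0]) or two ([0, 2]) per four-row file.
@pytest.mark.parametrize('dirs', [['cat=1', 'cat=2'], ['a', 'b']])
@pytest.mark.parametrize('row_groups', [[0], [0, 2]])
def test_merge(tempdir, dirs, row_groups):
    ...
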
Example #2

A small helper from the xpark project: it unwraps an xpark Result if needed, ensures the parent directory of the target file exists via default_mkdirs, and writes the chunk with fastparquet:

import os

import fastparquet

# `settings` and `default_mkdirs` are supplied elsewhere in the xpark codebase.

def write_parquet(fname, chunk, **kwargs):
    # Imported locally, presumably to avoid a circular import at module load.
    from xpark.plan.dataframe.results import Result

    # Unwrap xpark Result objects to get at the underlying dataframe chunk.
    if isinstance(chunk, Result):
        chunk = chunk.data
    # Make sure the destination directory exists before writing.
    default_mkdirs(os.path.dirname(fname))
    fastparquet.write(fname,
                      chunk,
                      compression=settings.PARQUET_COMPRESSION,
                      **kwargs)
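
Both snippets lean on default_mkdirs, which in fastparquet is essentially a thin wrapper around os.makedirs. A minimal sketch of that behavior, assuming the fastparquet.util definition these examples appear to use:

import os

def default_mkdirs(path):
    # Create `path` and any missing parents; do nothing if it already exists.
    os.makedirs(path, exist_ok=True)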