# Imports assumed by these tests (the originals live at the top of the file):
import os

import numpy as np
import pandas as pd
import pytest

import dask
import dask.dataframe as dd
from dask.dataframe.io.parquet import read_parquet, to_parquet
from dask.dataframe.utils import assert_eq
from dask.utils import tmpdir

fastparquet = pytest.importorskip('fastparquet')


def test_read_parquet_custom_columns(engine):
    with tmpdir() as tmp:
        tmp = str(tmp)
        data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                             'f': np.arange(1000, dtype=np.float64)})
        df = dd.from_pandas(data, chunksize=50)
        df.to_parquet(tmp)

        # Compare the round-tripped frames against the source data.
        df2 = read_parquet(tmp, columns=['i32', 'f'], engine=engine)
        assert_eq(df2, data[['i32', 'f']], check_index=False)

        df3 = read_parquet(tmp, columns=['f', 'i32'], engine=engine)
        assert_eq(df3, data[['f', 'i32']], check_index=False)

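# Several tests below take ``fn``, ``engine``, or ``c`` fixtures and compare
# against a module-level ``df``. A minimal sketch of that setup, assuming the
# same frame that test_index builds (the fixture names come from the tests;
# the bodies here are illustrative, not the file's actual definitions):
df = pd.DataFrame({'x': [6, 2, 3, 4, 5],
                   'y': [1.0, 2.0, 1.0, 2.0, 1.0]},
                  index=pd.Index([10, 20, 30, 40, 50], name='myindex'))


@pytest.fixture
def fn(tmpdir):  # pytest's tmpdir fixture (shadows dask.utils.tmpdir locally)
    ddf = dd.from_pandas(df, npartitions=3)
    to_parquet(str(tmpdir), ddf)
    return str(tmpdir)


@pytest.fixture(params=['fastparquet'])
def engine(request):
    # Only fastparquet is exercised in this sketch; other engine names
    # would slot into ``params`` as assumptions.
    return request.param


@pytest.fixture(params=['x', 'y'])
def c(request):
    # A column name of ``df`` to select.
    return request.param
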
def test_names(fn):
    assert set(read_parquet(fn).dask) == set(read_parquet(fn).dask)

    assert (set(read_parquet(fn).dask) !=
            set(read_parquet(fn, columns=['x']).dask))

    assert (set(read_parquet(fn, columns='x').dask) !=
            set(read_parquet(fn, columns=['x']).dask))

    assert (set(read_parquet(fn, columns=('x',)).dask) ==
            set(read_parquet(fn, columns=['x']).dask))

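# The S3 tests reference an ``s3`` fixture and ``test_bucket_name`` from the
# s3fs test harness. A minimal sketch of the assumed setup, mocking S3 with
# moto (illustrative only; the real harness lives in s3fs's test suite, and
# moto's API has changed across major versions):
boto3 = pytest.importorskip('boto3')
moto = pytest.importorskip('moto')
s3fs = pytest.importorskip('s3fs')

test_bucket_name = 'test'


@pytest.fixture
def s3():
    # Stand up a mocked S3 endpoint with an empty test bucket.
    with moto.mock_s3():
        client = boto3.client('s3', region_name='us-east-1')
        client.create_bucket(Bucket=test_bucket_name)
        yield s3fs.S3FileSystem(anon=False)
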
def test_parquet(s3):
    dd = pytest.importorskip('dask.dataframe')
    pytest.importorskip('fastparquet')
    from dask.dataframe.io.parquet import to_parquet, read_parquet
    import pandas as pd
    import numpy as np

    url = 's3://%s/test.parquet' % test_bucket_name

    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'i64': np.arange(1000, dtype=np.int64),
                         'f': np.arange(1000, dtype=np.float64),
                         'bhello': np.random.choice(['hello', 'you', 'people'],
                                                    size=1000).astype("O")},
                        index=pd.Index(np.arange(1000), name='foo'))
    df = dd.from_pandas(data, chunksize=500)
    to_parquet(url, df, object_encoding='utf8')

    files = [f.split('/')[-1] for f in s3.ls(url)]
    assert '_metadata' in files
    assert 'part.0.parquet' in files

    df2 = read_parquet(url, index='foo')
    assert len(df2.divisions) > 1
    pd.util.testing.assert_frame_equal(data, df2.compute())

def test_optimize(fn, c):
    ddf = read_parquet(fn)
    assert_eq(df[c], ddf[c])
    x = ddf[c]
    dsk = x._optimize(x.dask, x._keys())
    assert len(dsk) == x.npartitions
    assert all(v[4] == c for v in dsk.values())

def test_roundtrip(df, write_kwargs, read_kwargs):
    with tmpdir() as tmp:
        tmp = str(tmp)
        if df.index.name is None:
            df.index.name = 'index'
        ddf = dd.from_pandas(df, npartitions=2)

        to_parquet(tmp, ddf, **write_kwargs)
        ddf2 = read_parquet(tmp, index=df.index.name, **read_kwargs)
        assert_eq(ddf, ddf2)

def test_categorical():
    with tmpdir() as tmp:
        df = pd.DataFrame({'x': ['a', 'b', 'c'] * 100}, dtype='category')
        ddf = dd.from_pandas(df, npartitions=3)
        to_parquet(tmp, ddf)

        ddf2 = read_parquet(tmp, categories=['x'])
        assert ddf2.x.cat.categories.tolist() == ['a', 'b', 'c']
        ddf2.loc[:1000].compute()

        df.index.name = 'index'  # defaults to 'index' in this case
        assert_eq(df, ddf2)

def test_ordering():
    with tmpdir() as tmp:
        tmp = str(tmp)
        df = pd.DataFrame({'a': [1, 2, 3],
                           'b': [10, 20, 30],
                           'c': [100, 200, 300]},
                          index=pd.Index([-1, -2, -3], name='myindex'),
                          columns=['c', 'a', 'b'])
        ddf = dd.from_pandas(df, npartitions=2)
        to_parquet(tmp, ddf)

        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

        ddf2 = read_parquet(tmp, index='myindex')
        assert_eq(ddf, ddf2)

def test_index():
    with tmpdir() as tmp:
        tmp = str(tmp)
        df = pd.DataFrame({'x': [6, 2, 3, 4, 5],
                           'y': [1.0, 2.0, 1.0, 2.0, 1.0]},
                          index=pd.Index([10, 20, 30, 40, 50], name='myindex'))
        ddf = dd.from_pandas(df, npartitions=3)
        to_parquet(tmp, ddf)

        ddf2 = read_parquet(tmp)
        assert_eq(ddf, ddf2)

def test_append():
    """Test that an appended parquet dataset equals the original."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'yo', 'people'],
                                                      size=1000).astype("O")})
        df.index.name = 'index'

        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half:], chunksize=100)
        ddf1.to_parquet(tmp)
        ddf2.to_parquet(tmp, append=True)

        ddf3 = read_parquet(tmp)
        assert_eq(df, ddf3)

def test_append_wo_index():
    """Test append with write_index=False."""
    with tmpdir() as tmp:
        df = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                           'i64': np.arange(1000, dtype=np.int64),
                           'f': np.arange(1000, dtype=np.float64),
                           'bhello': np.random.choice(['hello', 'you', 'people'],
                                                      size=1000).astype("O")})
        half = len(df) // 2
        ddf1 = dd.from_pandas(df.iloc[:half], chunksize=100)
        ddf2 = dd.from_pandas(df.iloc[half:], chunksize=100)
        ddf1.to_parquet(tmp)
        with pytest.raises(ValueError) as excinfo:
            ddf2.to_parquet(tmp, write_index=False, append=True)
        assert 'Appended columns' in str(excinfo.value)

    with tmpdir() as tmp:
        ddf1.to_parquet(tmp, write_index=False)
        ddf2.to_parquet(tmp, write_index=False, append=True)

        ddf3 = read_parquet(tmp, index='f')
        assert_eq(df.set_index('f'), ddf3)

def test_local():
    with tmpdir() as tmp:
        tmp = str(tmp)
        data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                             'i64': np.arange(1000, dtype=np.int64),
                             'f': np.arange(1000, dtype=np.float64),
                             'bhello': np.random.choice(['hello', 'you', 'people'],
                                                        size=1000).astype("O")})
        df = dd.from_pandas(data, chunksize=500)

        df.to_parquet(tmp, write_index=False, object_encoding='utf8')

        files = os.listdir(tmp)
        assert '_metadata' in files
        assert 'part.0.parquet' in files

        df2 = read_parquet(tmp, index=False)
        assert len(df2.divisions) > 1

        out = df2.compute(get=dask.get).reset_index()
        for column in df.columns:
            assert (data[column] == out[column]).all()

def test_parquet_wstoragepars(s3):
    dd = pytest.importorskip('dask.dataframe')
    pytest.importorskip('fastparquet')
    from dask.dataframe.io.parquet import to_parquet, read_parquet
    import pandas as pd
    import numpy as np

    url = 's3://%s/test.parquet' % test_bucket_name

    data = pd.DataFrame({'i32': np.array([0, 5, 2, 5])})
    df = dd.from_pandas(data, chunksize=500)
    to_parquet(url, df, write_index=False)

    read_parquet(url, storage_options={'default_fill_cache': False})
    assert s3.current().default_fill_cache is False
    read_parquet(url, storage_options={'default_fill_cache': True})
    assert s3.current().default_fill_cache is True

    read_parquet(url, storage_options={'default_block_size': 2**20})
    assert s3.current().default_block_size == 2**20
    with s3.current().open(url + '/_metadata') as f:
        assert f.blocksize == 2**20

def test_no_columns_no_index(fn):
    ddf = read_parquet(fn, columns=[])
    assert_eq(df[[]], ddf)


def test_index_column_false_index(fn):
    ddf = read_parquet(fn, columns=['myindex'], index=False)
    assert_eq(pd.DataFrame(df.index), ddf, check_index=False)


def test_no_columns_yes_index(fn):
    ddf = read_parquet(fn, columns=[], index='myindex')
    assert_eq(df[[]], ddf)


def test_index_column(fn):
    ddf = read_parquet(fn, columns=['myindex'], index='myindex')
    assert_eq(df[[]], ddf)


def test_index_column_no_index(fn):
    ddf = read_parquet(fn, columns=['myindex'])
    assert_eq(df[[]], ddf)


def test_auto_add_index(fn):
    ddf = read_parquet(fn, columns=['x'], index='myindex')
    assert_eq(df[['x']], ddf)


def test_glob(fn):
    os.unlink(os.path.join(fn, '_metadata'))
    files = os.listdir(fn)
    assert '_metadata' not in files

    ddf = read_parquet(os.path.join(fn, '*'))
    assert_eq(df, ddf)

def read(fn, **kwargs):
    # ``engine`` is a free variable here; this helper is presumably defined
    # inside an engine-parametrized test so the closure binds it.
    return read_parquet(fn, engine=engine, **kwargs)

def test_index(fn):
    ddf = read_parquet(fn)
    assert_eq(df, ddf)


def test_series(fn):
    ddf = read_parquet(fn, columns=['x'])
    assert_eq(df[['x']], ddf)

    ddf = read_parquet(fn, columns='x', index='myindex')
    assert_eq(df.x, ddf)