import os

import numpy as np
import pandas as pd
import pytest

from fastparquet import ParquetFile, write, writer
from fastparquet.util import join_path


def test_row_filter(tempdir):
    fn = os.path.join(tempdir, 'test.parquet')
    df = pd.DataFrame({
        'a': ['o'] * 10 + ['i'] * 5,
        'b': range(15)
    })
    write(fn, df, row_group_offsets=8)
    pf = ParquetFile(fn)
    # Without row_filter, counts include every row of any row group whose
    # statistics match the filter; with row_filter=True, rows are filtered
    # individually.
    assert pf.count(filters=[["a", "==", "o"]]) == 15
    assert pf.count(filters=[["a", "==", "o"]], row_filter=True) == 10
    assert pf.count(filters=[["a", "==", "i"]], row_filter=True) == 5
    assert pf.count(filters=[["b", "in", [1, 3, 4]]]) == 8
    assert pf.count(filters=[["b", "in", [1, 3, 4]]], row_filter=True) == 3
    assert pf.to_pandas(
        filters=[["b", "in", [1, 3, 4]]], row_filter=True
    ).b.tolist() == [1, 3, 4]
    assert pf.to_pandas(
        filters=[["a", "<", "o"]], row_filter=True
    ).b.tolist() == [10, 11, 12, 13, 14]
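
# A minimal 'tempdir' fixture sketch, assuming the suite does not already
# provide one (fastparquet's own conftest does). Included only so these tests
# can run standalone; drop it if a conftest.py supplies the fixture. pytest
# resolves fixtures by name at runtime, so its position in the module does
# not matter.
@pytest.fixture()
def tempdir(tmp_path):
    return str(tmp_path)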

# The 'scheme' parametrization below is an assumption; the 'drill' branch in
# the body implies the values include at least 'hive' and 'drill'.
@pytest.mark.parametrize('scheme', ['hive', 'drill'])
def test_groups_roundtrip(tempdir, scheme):
    df = pd.DataFrame({
        'a': np.random.choice(['a', 'b', None], size=1000),
        'b': np.random.randint(0, 64000, size=1000),
        'c': np.random.choice([True, False], size=1000)
    })
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme=scheme)
    r = ParquetFile(tempdir)
    # 'a' and 'c' became partition directories, leaving 'b' as the only
    # data column.
    assert r.columns == ['b']
    out = r.to_pandas()

    if scheme == 'drill':
        assert set(r.cats) == {'dir0', 'dir1'}
        assert set(out.columns) == {'b', 'dir0', 'dir1'}
        out.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50],
                 partition_on=['a', 'c'], file_scheme=scheme)
    r = ParquetFile(tempdir)
    # Rows where 'a' is null are dropped by partitioning on 'a'.
    assert r.fmd.num_rows == r.count() == sum(~df.a.isnull())
    # 2 row groups x 4 partition combinations ('a' x 'c') = 8
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    if scheme == 'drill':
        assert set(out.columns) == {'b', 'dir0', 'dir1'}
        out.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

# Parametrizations assumed; adjust the values to match the suite's real
# fixtures. Note 'snappy' needs an optional compression backend installed.
@pytest.mark.parametrize('scheme', ['simple', 'hive'])
@pytest.mark.parametrize('row_groups', [[0], [0, 500]])
@pytest.mark.parametrize('comp', [None, 'gzip', 'snappy'])
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame({
        'i32': np.arange(1000, dtype=np.int32),
        'i64': np.arange(1000, dtype=np.int64),
        'u64': np.arange(1000, dtype=np.uint64),
        'f': np.arange(1000, dtype=np.float64),
        'bhello': np.random.choice([b'hello', b'you', b'people'],
                                   size=1000).astype("O")
    })
    data['a'] = np.array([b'a', b'b', b'c', b'd', b'e'] * 200, dtype="S1")
    data['aa'] = data['a'].map(lambda x: 2 * x).astype("S2")
    data['hello'] = data.bhello.str.decode('utf8')
    data['bcat'] = data.bhello.astype('category')
    data['cat'] = data.hello.astype('category')

    fname = os.path.join(tempdir, 'test.parquet')
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp)
    r = ParquetFile(fname)
    assert r.fmd.num_rows == r.count() == 1000

    df = r.to_pandas()
    assert data.cat.dtype == 'category'
    for col in r.columns:
        assert (df[col] == data[col]).all()
        # tests https://github.com/dask/fastparquet/issues/250
        assert isinstance(data[col][0], type(df[col][0]))

def test_empty_dataframe(tempdir):
    df = pd.DataFrame({'a': [], 'b': []}, dtype=int)
    fn = os.path.join(str(tempdir), 'test.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    out = pf.to_pandas()
    assert pf.count() == 0
    assert len(out) == 0
    assert (out.columns == df.columns).all()
    assert pf.statistics

# Parametrization assumed; 'simple' and 'hive' are the usual schemes.
@pytest.mark.parametrize('scheme', ['simple', 'hive'])
def test_append_empty(tempdir, scheme):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0], 'b': ['a', 'a', 'b', 'b']})
    write(fn, df.head(0), write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    assert pf.count() == 0
    assert pf.file_scheme == 'empty'

    write(fn, df, append=True, write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), df, check_categorical=False, check_dtype=False)

def test_empty_groupby(tempdir):
    df = pd.DataFrame({
        'a': np.random.choice(['a', 'b', None], size=1000),
        'b': np.random.randint(0, 64000, size=1000),
        'c': np.random.choice([True, False], size=1000)
    })
    df.loc[499:, 'c'] = True  # no False in second half
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive',
                 row_group_offsets=[0, 500])
    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    # The (a, c=False) partitions are empty in the second row group, so only
    # 6 of the possible 8 row groups exist.
    assert len(r.row_groups) == 6
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

def test_attributes(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    pf = ParquetFile(fn)
    assert pf.columns == ['x', 'y', 'z']
    assert len(pf.row_groups) == 2
    assert pf.count() == 4
    assert join_path(fn).replace("\\", "/") == pf.info['name']
    assert join_path(fn).replace("\\", "/") in str(pf)
    for col in df:
        assert getattr(pf.dtypes[col], "numpy_dtype",
                       pf.dtypes[col]) == df.dtypes[col]
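
# Convenience entry point (an addition, not part of the original suite) so
# the module can be executed directly instead of via the pytest CLI.
if __name__ == "__main__":
    pytest.main([__file__])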