Example #1
0
def test_row_filter(tempdir):
    """Check count()/to_pandas() filtering with and without exact row filtering.

    Without ``row_filter`` a filter prunes at row-group granularity, so counts
    cover whole groups; with ``row_filter=True`` only the matching rows count.
    """
    path = os.path.join(tempdir, 'test.parquet')
    frame = pd.DataFrame({'a': ['o'] * 10 + ['i'] * 5, 'b': range(15)})
    # Two row groups (8 + 7 rows): both contain 'o' rows, so group-level
    # filtering on 'a' cannot exclude either group.
    write(path, frame, row_group_offsets=8)
    parquet = ParquetFile(path)
    assert parquet.count(filters=[["a", "==", "o"]]) == 15
    assert parquet.count(filters=[["a", "==", "o"]], row_filter=True) == 10
    assert parquet.count(filters=[["a", "==", "i"]], row_filter=True) == 5
    assert parquet.count(filters=[["b", "in", [1, 3, 4]]]) == 8
    assert parquet.count(filters=[["b", "in", [1, 3, 4]]], row_filter=True) == 3
    selected = parquet.to_pandas(filters=[["b", "in", [1, 3, 4]]],
                                 row_filter=True)
    assert selected.b.tolist() == [1, 3, 4]
    below = parquet.to_pandas(filters=[["a", "<", "o"]], row_filter=True)
    assert below.b.tolist() == [10, 11, 12, 13, 14]
Example #2
0
def test_groups_roundtrip(tempdir, scheme):
    """Write a partitioned dataset and verify partition columns round-trip.

    Partitioning on ``a``/``c`` moves those columns into the directory
    structure, leaving only ``b`` as a data column; rows where ``a`` is null
    are dropped by partitioning, which the second write checks via count().
    """
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme=scheme)

    pf = ParquetFile(tempdir)
    assert pf.columns == ['b']

    result = pf.to_pandas()
    if scheme == 'drill':
        # Drill layout names the partition levels dir0/dir1 rather than a/c.
        assert set(pf.cats) == {'dir0', 'dir1'}
        assert set(result.columns) == {'b', 'dir0', 'dir1'}
        result.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)

    # Every output row must trace back to an input row with the same (a, c).
    for _, rec in result.iterrows():
        assert rec.b in list(df[(df.a == rec.a) & (df.c == rec.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50],
                 partition_on=['a', 'c'], file_scheme=scheme)

    pf = ParquetFile(tempdir)
    # Null partition values are dropped, so the total shrinks accordingly.
    assert pf.fmd.num_rows == pf.count() == sum(~df.a.isnull())
    assert len(pf.row_groups) == 8
    result = pf.to_pandas()

    if scheme == 'drill':
        assert set(result.columns) == {'b', 'dir0', 'dir1'}
        result.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)
    for _, rec in result.iterrows():
        assert rec.b in list(df[(df.a == rec.a) & (df.c == rec.c)].b)
Example #3
0
def test_roundtrip(tempdir, scheme, row_groups, comp):
    """Round-trip a frame of varied dtypes through write/ParquetFile.

    Covers int32/int64/uint64/float64, fixed-width bytes, object bytes,
    decoded utf8 strings, and categoricals, across the parametrized file
    scheme, row-group layout, and compression codec.
    """
    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'i64': np.arange(1000, dtype=np.int64),
                         'u64': np.arange(1000, dtype=np.uint64),
                         'f': np.arange(1000, dtype=np.float64),
                         'bhello': np.random.choice([b'hello', b'you',
                            b'people'], size=1000).astype("O")})
    # Fixed-length byte columns (S1/S2) plus derived string/categorical views.
    data['a'] = np.array([b'a', b'b', b'c', b'd', b'e'] * 200, dtype="S1")
    data['aa'] = data['a'].map(lambda v: v * 2).astype("S2")
    data['hello'] = data.bhello.str.decode('utf8')
    data['bcat'] = data.bhello.astype('category')
    data['cat'] = data.hello.astype('category')

    target = os.path.join(tempdir, 'test.parquet')
    write(target, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp)

    pf = ParquetFile(target)
    assert pf.fmd.num_rows == pf.count() == 1000

    result = pf.to_pandas()

    assert data.cat.dtype == 'category'

    for name in pf.columns:
        assert (result[name] == data[name]).all()
        # Element types must survive too:
        # https://github.com/dask/fastparquet/issues/250
        assert isinstance(data[name][0], type(result[name][0]))
Example #4
0
def test_empty_dataframe(tempdir):
    """A zero-row frame must round-trip: zero count, same columns, stats present."""
    target = os.path.join(str(tempdir), 'test.parquet')
    empty = pd.DataFrame({'a': [], 'b': []}, dtype=int)
    write(target, empty)

    pf = ParquetFile(target)
    result = pf.to_pandas()

    assert pf.count() == 0
    assert len(result) == 0
    assert (result.columns == empty.columns).all()
    # Statistics metadata is written even when there are no rows.
    assert pf.statistics
Example #5
0
def test_append_empty(tempdir, scheme):
    """Appending real rows to an initially empty dataset must yield the rows."""
    target = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'b': ['a', 'a', 'b', 'b']})

    # First write only the (empty) header frame.
    write(target, df.head(0), write_index=False, file_scheme=scheme)
    pf = ParquetFile(target)
    assert pf.count() == 0
    # A dataset with no rows is reported under the special 'empty' scheme.
    assert pf.file_scheme == 'empty'

    # Appending the full frame should leave exactly its rows on disk.
    write(target, df, append=True, write_index=False, file_scheme=scheme)
    pf = ParquetFile(target)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), df, check_categorical=False, check_dtype=False)
Example #6
0
def test_empty_groupby(tempdir):
    """Partitioned write where one row group lacks some partition values.

    Rows 499+ are forced to c=True, so the second row group produces no
    c=False directories; the reader must still see a consistent dataset.
    """
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    df.loc[499:, 'c'] = True  # no False in second half
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive',
                 row_group_offsets=[0, 500])

    pf = ParquetFile(tempdir)
    # Rows with null 'a' are dropped by partitioning.
    assert pf.count() == sum(~df.a.isnull())
    # 2 'a' values x 2 'c' values for the first group, 2 x 1 for the second.
    assert len(pf.row_groups) == 6

    result = pf.to_pandas()
    for _, rec in result.iterrows():
        assert rec.b in list(df[(df.a == rec.a) & (df.c == rec.c)].b)
Example #7
0
def test_attributes(tempdir):
    """ParquetFile metadata attributes reflect what was written."""
    frame = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })

    target = os.path.join(tempdir, 'foo.parquet')
    # Two row groups of two rows each.
    write(target, frame, row_group_offsets=[0, 2])

    pf = ParquetFile(target)
    assert pf.columns == ['x', 'y', 'z']
    assert len(pf.row_groups) == 2
    assert pf.count() == 4

    # The normalized path shows up both in info and in the repr.
    normalized = join_path(target).replace("\\", "/")
    assert normalized == pf.info['name']
    assert normalized in str(pf)

    for name in frame:
        dtype = pf.dtypes[name]
        # Categorical/extension dtypes expose numpy_dtype; plain ones don't.
        assert getattr(dtype, "numpy_dtype", dtype) == frame.dtypes[name]