Example #1
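All of the examples below are pytest-style round-trip tests written against fastparquet's writer module and ParquetFile class, and they share imports and fixtures that the snippets do not repeat. A minimal preamble that should make most of them runnable is sketched here; the tempdir wrapper is an assumption, and the sql object used in the DELTA_BINARY_PACKED examples is additionally assumed to be a PySpark SparkSession.

import os

import numpy as np
import pandas as pd
import pandas.testing as tm
import pytest

from fastparquet import ParquetFile, writer


@pytest.fixture()
def tempdir(tmp_path):
    # hand each test a plain string path to a fresh temporary directory
    return str(tmp_path)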
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.columns == ["b"]
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50], partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    # 2 offset slices x 2 non-null 'a' values x 2 'c' values
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
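With file_scheme="hive", partition_on=["a", "c"] turns the two columns into key=value directory levels rather than data columns, which is why only "b" remains readable above. A quick, hypothetical way to see the resulting layout:

import glob
print(sorted(glob.glob(os.path.join(tempdir, '*', '*', 'part*'))))
# expected shape (file names illustrative):
#   .../a=a/c=False/part.0.parquet
#   .../a=a/c=True/part.0.parquet
#   .../a=b/c=False/part.0.parquet
#   .../a=b/c=True/part.0.parquet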
Example #2
def test_empty_row_group(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world'],
                                               size=1000)})
    writer.write(fname, data, row_group_offsets=[0, 900, 1800])
    pf = ParquetFile(fname)
    assert len(pf.row_groups) == 2
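The third offset, 1800, points past the end of the 1000-row frame, so no empty trailing row group is written. A follow-on check (not part of the original test) that makes the layout visible:

print([rg.num_rows for rg in pf.row_groups])  # expected: [900, 100]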
Example #3
def test_groups_roundtrip(tempdir, scheme):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme=scheme)

    r = ParquetFile(tempdir)
    assert r.columns == ['b']

    out = r.to_pandas()
    if scheme == 'drill':
        assert set(r.cats) == {'dir0', 'dir1'}
        assert set(out.columns) == {'b', 'dir0', 'dir1'}
        out.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50], partition_on=['a', 'c'],
                 file_scheme=scheme)

    r = ParquetFile(tempdir)
    assert r.fmd.num_rows == r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    if scheme == 'drill':
        assert set(out.columns) == {'b', 'dir0', 'dir1'}
        out.rename(columns={'dir0': 'a', 'dir1': 'c'}, inplace=True)
    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example #4
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame({
        'a': np.random.choice(['a', 'b', None], size=1000),
        'b': np.random.randint(0, 64000, size=1000),
        'c': np.random.choice([True, False], size=1000)
    })
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive')

    r = ParquetFile(tempdir)
    assert r.columns == ['b']
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir,
                 df,
                 row_group_offsets=[0, 50],
                 partition_on=['a', 'c'],
                 file_scheme='hive')

    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example #5
def test_too_many_partition_columns(tempdir):
    df = pd.DataFrame({
        'a': np.random.choice(['a', 'b', 'c'], size=1000),
        'c': np.random.choice([True, False], size=1000)
    })
    with pytest.raises(ValueError) as ve:
        writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive')
    assert "Cannot include all columns" in str(ve.value)
Example #6
def test_index(tempdir):
    fn = os.path.join(tempdir, "tmp.parq")
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]}, index=pd.Index([10, 20, 30], name="z"))

    writer.write(fn, df)

    r = ParquetFile(fn)
    assert set(r.columns) == {"x", "y", "z"}
Example #7
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()

    tm.assert_frame_equal(df, df2, check_categorical=False)
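The compression argument arrives through pytest parametrization in the original suite; a plausible decorator, with the parameter values assumed rather than taken from the source, would be:

@pytest.mark.parametrize('compression', [None, 'gzip', 'snappy'])  # values assumed
def test_write_compression_dict(tempdir, compression):
    ...  # body as in the example above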
Example #8
def test_dotted_column(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x.y': [1, 2, 3], 'y': [1., 2., 3.]})

    writer.write(fn, df)

    out = ParquetFile(fn).to_pandas()
    assert list(out.columns) == ['x.y', 'y']
Example #9
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
    fn = os.path.join(tempdir, 'tmp.parq')
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()

    tm.assert_frame_equal(df, df2, check_categorical=False)
Example #10
def test_index(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3],
                       'y': [1., 2., 3.]},
                       index=pd.Index([10, 20, 30], name='z'))

    writer.write(fn, df)

    r = ParquetFile(fn)
    assert set(r.columns) == {'x', 'y', 'z'}
Example #11
def test_write_compression_schema(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})
    fn = os.path.join(tempdir, 'tmp.parq')
    writer.write(fn, df, compression={'x': 'gzip'})
    r = ParquetFile(fn)

    assert all(c.meta_data.codec for row in r.row_groups for c in row.columns
               if c.meta_data.path_in_schema == ['x'])
    assert not any(c.meta_data.codec for row in r.row_groups
                   for c in row.columns if c.meta_data.path_in_schema == ['y'])
Example #12
def test_write_compression_schema(tempdir):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression={"x": "gzip"})
    r = ParquetFile(fn)

    assert all(c.meta_data.codec for row in r.row_groups for c in row.columns if c.meta_data.path_in_schema == ["x"])
    assert not any(
        c.meta_data.codec for row in r.row_groups for c in row.columns if c.meta_data.path_in_schema == ["y"]
    )
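My understanding of the fastparquet API is that a compression dict may also carry a "_default" key covering any column not named explicitly; a hedged sketch, worth checking against the version in use:

writer.write(fn, df, compression={"x": "gzip", "_default": "snappy"})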
Example #13
def test_write_delta(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'i1': np.arange(10, dtype=np.int32) + 2,
                         'i2': np.cumsum(np.random.randint(
                                 0, 5, size=10)).astype(np.int32) + 2})
    writer.write(fname, data, encoding="DELTA_BINARY_PACKED")

    df = sql.read.parquet(fname)
    ddf = df.toPandas()
    for col in data:
        assert (ddf[col] == data[col])[~ddf[col].isnull()].all()
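The Spark read-back (sql is assumed to be a PySpark SparkSession) confirms that another engine can decode the DELTA_BINARY_PACKED pages. A fastparquet-only round trip, offered as an assumed alternative check, would be:

out = ParquetFile(fname).to_pandas()
for col in data:
    assert (out[col] == data[col]).all()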
Example #14
def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame({"o": np.random.choice(["hello", "world", None], size=1000)})
    data["cat"] = data["o"].astype("category")
    writer.write(fname, data, has_nulls=["o", "cat"])

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()
Example #15
def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world', None],
                                               size=1000)})
    data['cat'] = data['o'].astype('category')
    writer.write(fname, data, has_nulls=['o', 'cat'])

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()
Example #16
def test_groups_iterable(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['aaa', 'bbb', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a'], file_scheme='hive')

    r = ParquetFile(tempdir)
    assert r.columns == ['b', 'c']
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example #17
def test_naive_index(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]})

    writer.write(fn, df)
    r = ParquetFile(fn)

    assert set(r.columns) == {'x', 'y'}

    writer.write(fn, df, write_index=True)
    r = ParquetFile(fn)

    assert set(r.columns) == {'x', 'y', 'index'}
Example #18
def test_naive_index(tempdir):
    fn = os.path.join(tempdir, "tmp.parq")
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})

    writer.write(fn, df)
    r = ParquetFile(fn)

    assert set(r.columns) == {"x", "y"}

    writer.write(fn, df, write_index=True)
    r = ParquetFile(fn)

    assert set(r.columns) == {"x", "y", "index"}
Example #19
def test_hasnulls_ordering(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'a': np.random.rand(100),
                         'b': np.random.rand(100),
                         'c': np.random.rand(100)})
    writer.write(fname, data, has_nulls=['a', 'c'])

    r = ParquetFile(fname)
    assert r._schema[1].name == 'a'
    assert r._schema[1].repetition_type == 1
    assert r._schema[2].name == 'b'
    assert r._schema[2].repetition_type == 0
    assert r._schema[3].name == 'c'
    assert r._schema[3].repetition_type == 1
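The integers checked here are the parquet-format Thrift FieldRepetitionType values: REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, so has_nulls=['a', 'c'] marks exactly those two fields optional. Named constants (hypothetical, values per the Thrift spec) make the asserts self-documenting:

REQUIRED, OPTIONAL, REPEATED = 0, 1, 2  # parquet FieldRepetitionType
assert r._schema[1].repetition_type == OPTIONAL  # 'a' may hold nulls
assert r._schema[2].repetition_type == REQUIRED  # 'b' may not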
Example #20
def test_read_partitioned_and_write_with_empty_partitions(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', 'c'], size=1000),
                       'c': np.random.choice([True, False], size=1000)})

    writer.write(tempdir, df, partition_on=['a'], file_scheme='hive')
    df_filtered = ParquetFile(tempdir).to_pandas(filters=[('a', '==', 'b')])

    writer.write(tempdir, df_filtered, partition_on=['a'], file_scheme='hive')

    df_loaded = ParquetFile(tempdir).to_pandas()

    tm.assert_frame_equal(df_filtered, df_loaded, check_categorical=False)
Example #21
def test_empty_groupby(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    df.loc[499:, 'c'] = True  # no False in second half
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive',
                 row_group_offsets=[0, 500])
    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    # first slice: 2 'a' values x 2 'c' values; second slice: 'c' is all True -> 2
    assert len(r.row_groups) == 6
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example #22
def test_write_delta(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame(
        {
            "i1": np.arange(10, dtype=np.int32) + 2,
            "i2": np.cumsum(np.random.randint(0, 5, size=10)).astype(np.int32) + 2,
        }
    )
    writer.write(fname, data, encoding="DELTA_BINARY_PACKED")

    df = sql.read.parquet(fname)
    ddf = df.toPandas()
    for col in data:
        assert (ddf[col] == data[col])[~ddf[col].isnull()].all()
Example #23
def test_cats_in_part_files(tempdir):
    df = pd.DataFrame({'a': pd.Categorical(['a', 'b'] * 100)})
    writer.write(tempdir, df, file_scheme='hive', row_group_offsets=50)
    import glob
    files = glob.glob(os.path.join(tempdir, 'part*'))
    pf = ParquetFile(tempdir)
    assert len(pf.row_groups) == 4
    kv = pf.fmd.key_value_metadata
    assert kv
    for f in files:
        pf = ParquetFile(f)
        assert pf.fmd.key_value_metadata == kv
        assert len(pf.row_groups) == 1
    out = pd.concat([ParquetFile(f).to_pandas() for f in files],
                    ignore_index=True)
    pd.testing.assert_frame_equal(df, out)
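Because every part file carries its own copy of the pandas key-value metadata, the categorical dtype of 'a' should survive even when one part file is opened alone; an assumed follow-on check:

single = ParquetFile(files[0]).to_pandas()
assert single['a'].dtype == 'category'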
Example #24
def test_decimal_roundtrip(tempdir):
    import decimal

    def decimal_convert(x):
        return decimal.Decimal(x)

    fname = os.path.join(tempdir, 'decitemp.parq')
    data = pd.DataFrame({'f64': np.arange(10000000, 10001000, dtype=np.float64) / 100000,
                         'f16': np.arange(1000, dtype=np.float16) / 10000})
    data['f64'] = data['f64'].apply(decimal_convert)
    data['f16'] = data['f16'].apply(decimal_convert)
    writer.write(fname, data)

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (data[col] == df[col]).all()
Example #25
def test_empty_groupby(tempdir):
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )
    df.loc[499:, "c"] = True  # no False in second half
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive", row_group_offsets=[0, 500])
    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 6
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example #26
def test_multi_index(tempdir):
    import json
    fn = os.path.join(tempdir, 'tmp.parq')
    idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3]])
    idx.names = ['index0', 'index1']
    df = pd.DataFrame(list(range(9)), idx, ['col'])
    writer.write(fn, df)

    pf = ParquetFile(fn)
    assert set(pf.columns) == {'col', 'index0', 'index1'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == idx.names
    out = pf.to_pandas()
    assert out.index.names == idx.names
    pd.testing.assert_frame_equal(df, out)
    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(9)).all()
    assert len(out.columns) == 3
Example #27
def test_index(tempdir):
    import json
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3],
                       'y': [1., 2., 3.]},
                       index=pd.Index([10, 20, 30], name='z'))

    writer.write(fn, df)

    pf = ParquetFile(fn)
    assert set(pf.columns) == {'x', 'y', 'z'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == ['z']
    out = pf.to_pandas()
    assert out.index.name == 'z'
    pd.testing.assert_frame_equal(df, out, check_dtype=False)
    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(3)).all()
    assert (out.z == df.index).all()
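The 'pandas' entry in key_value_metadata is a JSON blob following the pandas parquet metadata convention; index_columns is the piece these tests rely on, alongside a per-column list. A hedged sketch of inspecting it:

meta = json.loads(pf.key_value_metadata['pandas'])
assert meta['index_columns'] == ['z']
assert {c['name'] for c in meta['columns']} >= {'x', 'y', 'z'}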
Example #28
def test_int_rowgroups(tempdir):
    df = pd.DataFrame({'a': [1]*100})
    fname = os.path.join(tempdir, 'test.parq')
    writer.write(fname, df, row_group_offsets=30)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [25, 25, 25, 25]
    writer.write(fname, df, row_group_offsets=33)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [25, 25, 25, 25]
    writer.write(fname, df, row_group_offsets=34)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [34, 34, 32]
    writer.write(fname, df, row_group_offsets=35)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [34, 34, 32]
Example #29
def test_int_rowgroups(tempdir):
    df = pd.DataFrame({"a": [1] * 100})
    fname = os.path.join(tempdir, "test.parq")
    writer.write(fname, df, row_group_offsets=30)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [25, 25, 25, 25]
    writer.write(fname, df, row_group_offsets=33)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [25, 25, 25, 25]
    writer.write(fname, df, row_group_offsets=34)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [34, 34, 32]
    writer.write(fname, df, row_group_offsets=35)
    r = ParquetFile(fname)
    assert [rg.num_rows for rg in r.row_groups] == [34, 34, 32]
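When row_group_offsets is an integer it acts as a target chunk size, not a literal one: fastparquet derives a number of roughly equal parts and real offsets from it, which is why 30 and 33 both yield four groups of 25. A sketch of that computation, reflecting my reading of the library's logic rather than its exact code:

def approx_offsets(n_rows, target):
    # parts needed if each holds at most `target` rows
    nparts = max((n_rows - 1) // target + 1, 1)
    # even-ish chunk size that covers all rows with that many parts
    chunksize = max(min((n_rows - 1) // nparts + 1, n_rows), 1)
    return list(range(0, n_rows, chunksize))

assert approx_offsets(100, 30) == [0, 25, 50, 75]  # 4 groups of 25
assert approx_offsets(100, 34) == [0, 34, 68]      # 34, 34, 32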
Example #30
def test_empty_row_group(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame({"o": np.random.choice(["hello", "world"], size=1000)})
    writer.write(fname, data, row_group_offsets=[0, 900, 1800])
    pf = ParquetFile(fname)
    assert len(pf.row_groups) == 2