Example No. 1
def test_bad_file_paths(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.=.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y/z')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
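    # The oddly named paths do not form a consistent hive layout, so the scheme falls
    # back to 'other' and no partition column ('dir0') is added.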
    assert pf.file_scheme == 'other'
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in out

    path1 = os.path.join(tempdir, 'data')
    fn1 = os.path.join(path1, 'out.parq')
    os.makedirs(path1)
    write(fn1, df)
    path2 = os.path.join(tempdir, 'data2')
    fn2 = os.path.join(path2, 'out.parq')
    os.makedirs(path2)
    write(fn2, df)
    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
Example No. 2
def test_text_convert(tempdir):
    df = pd.DataFrame({"a": ["a"] * 100, "b": [b"a"] * 100})
    fn = os.path.join(tempdir, "tmp.parq")

    write(fn, df, fixed_text={"a": 1, "b": 2})
    pf = ParquetFile(fn)
    assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    assert pf.schema[1].type_length == 1
    assert pf.schema[2].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    assert pf.schema[2].type_length == 2
    assert pf.statistics["max"]["a"] == ["a"]
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df)
    pf = ParquetFile(fn)
    assert pf.schema[1].type == parquet_thrift.Type.BYTE_ARRAY
    assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
    assert pf.statistics["max"]["a"] == ["a"]
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, fixed_text={"a": 1})
    pf = ParquetFile(fn)
    assert pf.schema[1].type == parquet_thrift.Type.FIXED_LEN_BYTE_ARRAY
    assert pf.schema[2].type == parquet_thrift.Type.BYTE_ARRAY
    assert pf.statistics["max"]["a"] == ["a"]
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df, df2, check_categorical=False)
Example No. 3
def time_text():
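    # Benchmark helper: times writing and reading utf8 and bytes columns,
    # with and without fixed-width text encoding.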
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result
Example No. 4
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.columns == ["b"]
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50], partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
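    # Rows with a null 'a' are dropped by partitioning; splitting first by
    # row_group_offsets and then by the (a, c) values yields 8 row groups.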
    assert r.count == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example No. 5
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    df["e"] = df["d"].astype("category")
    fn = os.path.join(tmp, "test.parq")
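    # Several columns contain NaN/None, which has_nulls=False cannot encode,
    # so the first write is expected to fail.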

    with pytest.raises(TypeError):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf.schema[2:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    assert pf.schema[1].repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf.schema[1:3]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    assert pf.schema[4].repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)
Example No. 6
def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]
Example No. 7
def test_request_nonexistent_column(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)

    pf = ParquetFile(fn)
    with pytest.raises(ValueError):
        pf.to_pandas(columns=['y'])
Example No. 8
def test_in_filter_numbers(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['values'])
    pf = ParquetFile(tempdir)
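    # Partition values come from directory names, so the 'in' filter matches
    # whether the values are given as strings or as ints.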
    out = pf.to_pandas(filters=[('values', 'in', ['1', '4'])])
    assert set(out.symbols) == {'a', 'c'}
    out = pf.to_pandas(filters=[('values', 'in', [1, 4])])
    assert set(out.symbols) == {'a', 'c'}
Example No. 9
def test_multi_cat_fail(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)

    pf = ParquetFile(fn)
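    # With the data split into many small row groups, the categorical multi-index
    # cannot be rebuilt consistently, so reading is expected to fail.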
    with pytest.raises(RuntimeError):
        pf.to_pandas()
Example No. 10
def test_filter_without_paths(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
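    # filters= prunes whole row groups via their min/max statistics; the single row
    # group here contains values matching x > 3, so every row is returned.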
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.util.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
Example No. 11
def test_single_upper_directory(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, df, file_scheme='hive', partition_on='y')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
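    # With the metadata files removed, pass root= so the single 'y=aa' directory
    # is still recognised as a partition.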
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*/*'))))
    pf = ParquetFile(flist, root=tempdir)
    assert pf.fn == join_path(os.path.join(tempdir, '_metadata'))
    out = pf.to_pandas()
    assert (out.y == 'aa').all()
Example No. 12
def test_multi_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x')
    write(dir1, df, file_scheme='hive')
    dir2 = os.path.join(tempdir, 'y')
    write(dir2, df, file_scheme='hive')
    dir3 = os.path.join(tempdir, 'z', 'deep')
    write(dir3, df, file_scheme='hive')

    pf = ParquetFile([dir1, dir2])
    out = pf.to_pandas()  # this version may have extra column!
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    pf = ParquetFile([dir1, dir2, dir3])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 3
Example No. 13
def test_filelike(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    with open(fn, 'rb') as f:
        pf = ParquetFile(f, open_with=open)
        d2 = pf.to_pandas()
        pd.util.testing.assert_frame_equal(d2, df)

    b = io.BytesIO(open(fn, 'rb').read())
    pf = ParquetFile(b, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
Example No. 14
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile
    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)
    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
Example No. 15
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]
Example No. 16
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
Example No. 17
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice([b"hello", b"you", b"people"], size=1000).astype("O"),
        }
    )
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")
    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups, compression=comp)

    r = ParquetFile(fname)

    df = r.to_pandas()

    assert data.cat.dtype == "category"

    for col in r.columns:
        assert (df[col] == data[col]).all()
Example No. 18
def test_roundtrip_complex(tempdir, scheme):
    import datetime

    data = pd.DataFrame(
        {
            "ui32": np.arange(1000, dtype=np.uint32),
            "i16": np.arange(1000, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(1000, dtype=np.float16),
            "dicts": [{"oi": "you"}] * 1000,
            "t": [datetime.datetime.now()] * 1000,
            "td": [datetime.timedelta(seconds=1)] * 1000,
            "bool": np.random.choice([True, False], size=1000),
        }
    )
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)

    r = ParquetFile(fname)

    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
Example No. 19
def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']
Example No. 20
def test_to_pandas():
    fname = TEST_DATA+'/airlines_parquet/4345e5eef217aa1b-c8f16177f35fd983_1150363067_data.1.parq'
    pf = ParquetFile(fname)
    out = pf.to_pandas()
    assert len(out.columns) == 29
    # test for bad integer conversion
    assert (out.dep_time < 0).sum() == 0
    assert out.dep_time.dtype == 'float64'
Example No. 21
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()

    tm.assert_frame_equal(df, df2, check_categorical=False)
Example No. 22
def test_filter_stats(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
    })
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
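    # Row groups hold x = 1-4 and x = 5-7; the statistics-based filter discards
    # the first group entirely.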
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]
Example No. 23
def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), "test.parq")
    df = pd.DataFrame({"a": [1, 2, 3, 0], "b": ["a", "a", "b", "b"]})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)

    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.util.testing.assert_frame_equal(pf.to_pandas(), expected, check_categorical=False)
Example No. 24
    @classmethod
    def read_partition(
        cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
    ):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        if pf is None:
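            # No pre-built ParquetFile was supplied: open this piece directly and patch
            # its row-group paths, file scheme and partition categories so it behaves
            # like part of the larger dataset.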
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            pf.fn = base
            if null_index_name and "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                if isinstance(pf[0], list):
                    pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                else:
                    pf = ParquetFile(
                        pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                    )
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            if null_index_name:
                if "__index_level_0__" in pf.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index
                    pf.fmd.key_value_metadata = None
            else:
                pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(
                rg_piece, columns, categories, index=index, **kwargs.get("read", {})
            )
Example No. 25
def test_spark_date_empty_rg():
    # https://github.com/dask/fastparquet/issues/634
    # first file has header size much smaller than others as it contains no row groups
    fn = os.path.join(TEST_DATA, 'spark-date-empty-rg.parq')
    pf = ParquetFile(fn)
    out = pf.to_pandas(columns=['Date'])
    assert out.Date.tolist() == [
        pd.Timestamp("2020-1-1"),
        pd.Timestamp("2020-1-2")
    ]
Example No. 26
def test_null_time(tempdir):
    """Test reading a file that contains null records."""
    tmp = str(tempdir)
    expected = pd.DataFrame({"t": [np.timedelta64(), np.timedelta64("NaT")]})
    fn = os.path.join(tmp, "test-time-null.parquet")

    # with NaT
    write(fn, expected, has_nulls=False)
    p = ParquetFile(fn)
    data = p.to_pandas()
    assert (data["t"] == expected["t"])[~expected["t"].isnull()].all()
    assert sum(data["t"].isnull()) == sum(expected["t"].isnull())

    # with NULL
    write(fn, expected, has_nulls=True)
    p = ParquetFile(fn)
    data = p.to_pandas()
    assert (data["t"] == expected["t"])[~expected["t"].isnull()].all()
    assert sum(data["t"].isnull()) == sum(expected["t"].isnull())
Example No. 27
def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']
Example No. 28
def test_filter_dates(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'date': [
            '2015-05-09', '2017-05-15', '2017-05-14', 
            '2017-05-13', '2015-05-10', '2015-05-11', '2017-05-12'
        ]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
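    # 'date' is a partition column here, so the filters are evaluated against the
    # directory (partition) values.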
    out_1 = pf.to_pandas(filters=[('date', '>', '2017-01-01')])
    
    assert set(out_1.x.tolist()) == {2, 3, 4, 7}
    expected_dates = set(pd.to_datetime(['2017-05-15', '2017-05-14', '2017-05-13', '2017-05-12']))
    assert set(out_1.date.tolist()) == expected_dates

    out_2 = pf.to_pandas(filters=[('date', '==', pd.to_datetime('may 9 2015'))])
    assert out_2.x.tolist() == [1]
    assert out_2.date.tolist() == pd.to_datetime(['2015-05-09']).tolist()
Example No. 29
def test_multi_index(tempdir):
    import json
    fn = os.path.join(tempdir, 'tmp.parq')
    idx = pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2, 3]])
    idx.names = ['index0', 'index1']
    df = pd.DataFrame(list(range(9)), idx, ['col'])
    writer.write(fn, df)

    pf = ParquetFile(fn)
    assert set(pf.columns) == {'col', 'index0', 'index1'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == idx.names
    out = pf.to_pandas()
    assert out.index.names == idx.names
    pd.util.testing.assert_frame_equal(df, out)
    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(9)).all()
    assert len(out.columns) == 3
Example No. 30
def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
Example No. 31
def test_null_time(tempdir):
    """Test reading a file that contains null records."""
    tmp = str(tempdir)
    expected = pd.DataFrame({"t": [np.timedelta64(), np.timedelta64('NaT')]})
    fn = os.path.join(tmp, "test-time-null.parquet")

    # with NaT
    write(fn, expected, has_nulls=False)
    p = ParquetFile(fn)
    data = p.to_pandas()
    assert (data['t'] == expected['t'])[~expected['t'].isnull()].all()
    assert sum(data['t'].isnull()) == sum(expected['t'].isnull())

    # with NULL
    write(fn, expected, has_nulls=True)
    p = ParquetFile(fn)
    data = p.to_pandas()
    assert (data['t'] == expected['t'])[~expected['t'].isnull()].all()
    assert sum(data['t'].isnull()) == sum(expected['t'].isnull())
Example No. 32
def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    import glob
    flist = glob.glob(os.path.join(tempdir, '*'))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)
Example No. 33
def parquet_heatmap():
    pf = ParquetFile(os.path.join(project_dir, 'data', 'interim', 'data.parq'))
    df = pf.to_pandas(
        filters=[('user', '==', 194), ('modality', '==', 'cpm')]).set_index(
            'date')  # .drop(['modality', 'user'], axis=1)
    print(df.shape)

    data = DataLoader.convert_to_npy(df, save=False)
    p = sns.heatmap(np.nan_to_num(data[:, :, 0]))
    plt.show(p)
Example No. 34
def read_local_parquet(file_name):
    """
    Read Local ParquetFile
    :param file_name:
    :return:
    """
    pf = ParquetFile(file_name)
    print(pf.columns)
    print(len(pf.columns))
    print(pf.to_pandas())
Example No. 35
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({
        'x': [1e99, 5e-10, 2e+2, -0.1],
        'y1': ['aa', 'aa', 'bb', 'aa']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
Example No. 36
def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']
Example No. 37
def test_datetime_partition_names(tempdir):
    dates = pd.to_datetime(
        ['2015-05-09', '2018-10-15', '2020-10-17', '2015-05-09'])
    df = pd.DataFrame({'date': dates, 'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert set(out.date.tolist()) == set(dates.tolist())
    assert out[out.date == '2015-05-09'].x.tolist() == [1, 5]
    assert out[out.date == '2020-10-17'].x.tolist() == [2]
Example No. 38
def test_filter_dates(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'date': [
            '2015-05-09', '2017-05-15', '2017-05-14',
            '2017-05-13', '2015-05-10', '2015-05-11', '2017-05-12'
        ]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out_1 = pf.to_pandas(filters=[('date', '>', '2017-01-01')])

    assert set(out_1.x.tolist()) == {2, 3, 4, 7}
    expected_dates = set(pd.to_datetime(['2017-05-15', '2017-05-14', '2017-05-13', '2017-05-12']))
    assert set(out_1.date.tolist()) == expected_dates

    out_2 = pf.to_pandas(filters=[('date', '==', pd.to_datetime('may 9 2015'))])
    assert out_2.x.tolist() == [1]
    assert out_2.date.tolist() == pd.to_datetime(['2015-05-09']).tolist()
Example No. 39
def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
Example No. 40
def test_empty_dataframe(tempdir):
    df = pd.DataFrame({'a': [], 'b': []}, dtype=int)
    fn = os.path.join(str(tempdir), 'test.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    out = pf.to_pandas()
    assert pf.count() == 0
    assert len(out) == 0
    assert (out.columns == df.columns).all()
    assert pf.statistics
Example No. 41
def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame({"o": np.random.choice(["hello", "world", None], size=1000)})
    data["cat"] = data["o"].astype("category")
    writer.write(fname, data, has_nulls=["o", "cat"])

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()
Example No. 42
def test_index(tempdir):
    import json
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'x': [1, 2, 3],
                       'y': [1., 2., 3.]},
                       index=pd.Index([10, 20, 30], name='z'))

    writer.write(fn, df)

    pf = ParquetFile(fn)
    assert set(pf.columns) == {'x', 'y', 'z'}
    meta = json.loads(pf.key_value_metadata['pandas'])
    assert meta['index_columns'] == ['z']
    out = pf.to_pandas()
    assert out.index.name == 'z'
    pd.testing.assert_frame_equal(df, out, check_dtype=False)
    out = pf.to_pandas(index=False)
    assert out.index.name is None
    assert (out.index == range(3)).all()
    assert (out.z == df.index).all()
Example No. 43
def test_many_categories(tempdir, n):
    tmp = str(tempdir)
    cats = np.arange(n)
    codes = np.random.randint(0, n, size=1000000)
    df = pd.DataFrame({"x": pd.Categorical.from_codes(codes, cats), "y": 1})
    fn = os.path.join(tmp, "test.parq")

    write(fn, df, has_nulls=False)
    pf = ParquetFile(fn)
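    # categories={"x": n} requests 'x' back as a categorical column, n being the
    # expected number of category labels.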
    out = pf.to_pandas(categories={"x": n})

    tm.assert_frame_equal(df, out, check_categorical=False)

    df.set_index("x", inplace=True)
    write(fn, df, has_nulls=False, write_index=True)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={"x": n}, index="x")

    assert (out.index == df.index).all()
    assert (out.y == df.y).all()
Example No. 44
def test_many_categories(tempdir, n):
    tmp = str(tempdir)
    cats = np.arange(n)
    codes = np.random.randint(0, n, size=1000000)
    df = pd.DataFrame({'x': pd.Categorical.from_codes(codes, cats), 'y': 1})
    fn = os.path.join(tmp, "test.parq")

    write(fn, df, has_nulls=False)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={'x': n})

    tm.assert_frame_equal(df, out, check_categorical=False, check_dtype=False)

    df.set_index('x', inplace=True)
    write(fn, df, has_nulls=False, write_index=True)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories={'x': n}, index='x')

    assert (out.index == df.index).all()
    assert (out.y == df.y).all()
Example No. 45
def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'b': ['a', 'a', 'b', 'b']})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)

    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), expected, check_categorical=False, check_dtype=False)
Example No. 46
def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*'))))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)
Example No. 47
def test_many_categories(tempdir, n):
    tmp = str(tempdir)
    cats = np.arange(n)
    codes = np.random.randint(0, n, size=1000000)
    df = pd.DataFrame({'x': pd.Categorical.from_codes(codes, cats)})
    fn = os.path.join(tmp, "test.parq")

    write(fn, df, has_nulls=False)
    pf = ParquetFile(fn)
    out = pf.to_pandas(categories=['x'])

    tm.assert_frame_equal(df, out)
Example No. 48
def test_groups_iterable(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['aaa', 'bbb', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a'], file_scheme='hive')

    r = ParquetFile(tempdir)
    assert r.columns == ['b', 'c']
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a==row.a)&(df.c==row.c)].b)
Example No. 49
def test_directory_local(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'
    write(os.path.join(tempdir, 'foo1.parquet'), df)
    write(os.path.join(tempdir, 'foo2.parquet'), df)
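    # Pointing ParquetFile at a directory with no _metadata scans it and
    # concatenates the data files found there.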
    pf = ParquetFile(tempdir)
    assert pf.info['rows'] == 8
    assert pf.to_pandas()['z'].tolist() == ['a', 'b', 'c', 'd'] * 2
Example No. 50
def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'o': np.random.choice(['hello', 'world', None],
                                               size=1000)})
    data['cat'] = data['o'].astype('category')
    writer.write(fname, data, has_nulls=['o', 'cat'])

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()
Example No. 51
def test_no_index_name(tempdir):
    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]}).set_index('__index_level_0__')
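    # '__index_level_0__' is the pandas marker for an unnamed index, so it should
    # round-trip with index.name left as None.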
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(index='__index_level_0__', columns=['b'])
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == [0, 1, 2]
Example No. 52
def get_epa_by_date(start_date, end_date, hourly=True):
    """
    Gets EPA data by whole day at a time
    Args:
      - start_date, end_date (str, format "YYYY/MM/DD" although pandas is pretty smart about picking that stuff up)
      - hourly (boolean, default = True) whether only values on the hour (or interpolated values in between hours)
        are returned
    """

    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    df_list = []

    # Get File from s3
    try:
        for one_day in date_range:
            filename = 'epa_' + one_day.strftime("%Y%m%d")
            folder = 'EpaDaily'

            s3 = s3fs.S3FileSystem()
            myopen = s3.open
            s3_resource = boto3.resource('s3')
            s3_resource.Object('midscapstone-whos-polluting-my-air',
                               '{}/{}.parquet'.format(folder,
                                                      filename)).load()
            pf = ParquetFile(
                'midscapstone-whos-polluting-my-air/{}/{}.parquet'.format(
                    folder, filename),
                open_with=myopen)
            df = pf.to_pandas()
            df.reset_index(inplace=True, drop=True)
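            # 'created' appears to encode minutes in its last two digits, so
            # % 100 == 0 keeps only on-the-hour rows.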
            if hourly:
                hourly_filter = np.where(df.created % 100 == 0, True, False)
                df_list.append(df[hourly_filter])
            else:
                df_list.append(df)

    except Exception as e:
        print(f"Processing {folder}/{filename} failed")
        print(e)

    all_df = pd.concat(df_list, ignore_index = True) \
        .assign(
            ts_ = lambda da: da['created'].map(parse_date),
            site_id = lambda da: da.apply(lambda l: str(l['ts_']) + "_" + format_name(l['site_name']), axis = 1)
        ) \
        .set_index("site_id", drop = True)['epa_pm25_value']

    # create lookup dictionary based site id and value
    lookup = {}
    for site_id, val in all_df.iteritems():
        lookup[site_id] = val

    return lookup
Example No. 53
    def __init__(self, pq_file_dir, output, mimic_notes):
        self.pq_file_dir = pq_file_dir
        self.output = output
        self.mimic_notes_file = mimic_notes

        pf = ParquetFile(self.mimic_notes_file)
        self.notes = pf.to_pandas()

        self.preds = self.get_df_from_pq(self.pq_file_dir, 'predicates')
        self.mentions = self.get_df_from_pq(self.pq_file_dir, 'mentions')
        self.umls = self.get_df_from_pq(self.pq_file_dir, 'umls_concepts')
        self.sents = self.get_df_from_pq(self.pq_file_dir, 'sentences')
        print("Finished loading data...")
Example No. 54
def test_append_empty(tempdir, scheme):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'b': ['a', 'a', 'b', 'b']})
    write(fn, df.head(0), write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    assert pf.count() == 0
    assert pf.file_scheme == 'empty'
    write(fn, df, append=True, write_index=False, file_scheme=scheme)

    pf = ParquetFile(fn)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), df, check_categorical=False, check_dtype=False)
Example No. 55
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame({
        'a': [1, 2, 3, 0],
        'aa': [1, 2, 3, None],
        'b': [1., 2., 3., np.nan],
        'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
        'd': ['a', 'b', 'c', None],
        'f': [True, False, True, True],
        'ff': [True, False, None, True]
    })
    df['e'] = df['d'].astype('category')
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises((TypeError, AttributeError)):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    cols = list(set(df) - {'ff'})
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in ['d', 'ff']:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
Example No. 56
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive')

    r = ParquetFile(tempdir)
    assert r.columns == ['b']
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a==row.a)&(df.c==row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50], partition_on=['a', 'c'],
                 file_scheme='hive')

    r = ParquetFile(tempdir)
    assert r.count == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a==row.a)&(df.c==row.c)].b)
Example No. 57
def test_multi(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df)

    pf = ParquetFile(fn)
    df1 = pf.to_pandas()
    assert df1.equals(df)
    assert df1.loc[1, 'a'].equals(df.loc[1, 'a'])
Example No. 58
def test_only_partition_columns(tempdir):
    df = pd.DataFrame({'a': np.random.rand(20),
                       'b': np.random.choice(['hi', 'ho'], size=20),
                       'c': np.random.choice(['a', 'b'], size=20)})
    write(tempdir, df, file_scheme='hive', partition_on=['b'])
    pf = ParquetFile(tempdir)
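    # Reading only the partition column reconstructs its values purely from the
    # hive directory names.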
    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    write(tempdir, df, file_scheme='hive', partition_on=['a', 'b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['a', 'b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b', 'c'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    with pytest.raises(ValueError):
        # because this leaves no data to write
        write(tempdir, df[['b']], file_scheme='hive', partition_on=['b'])
Example No. 59
def test_pickle(tempdir):
    import pickle
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    pf2 = pickle.loads(pickle.dumps(pf))
    assert pf.to_pandas().equals(pf2.to_pandas())
Example No. 60
def test_empty_groupby(tempdir):
    df = pd.DataFrame({'a': np.random.choice(['a', 'b', None], size=1000),
                       'b': np.random.randint(0, 64000, size=1000),
                       'c': np.random.choice([True, False], size=1000)})
    df.loc[499:, 'c'] = True  # no False in second half
    writer.write(tempdir, df, partition_on=['a', 'c'], file_scheme='hive',
                 row_group_offsets=[0, 500])
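    # The second chunk (rows 500+) has no c == False rows, so only 6 of the 8
    # possible (a, c) row groups are written.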
    r = ParquetFile(tempdir)
    assert r.count() == sum(~df.a.isnull())
    assert len(r.row_groups) == 6
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a==row.a)&(df.c==row.c)].b)