Example #1
def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result
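Example #1 relies on helper utilities that are not shown here: `join_path` comes from `fastparquet.util` (see the imports in Example #34), while `tmpdir` and `measure` are benchmark helpers. A minimal sketch of the latter two, assuming they only create a scratch directory and time a labelled block:

import contextlib
import shutil
import tempfile
import time

@contextlib.contextmanager
def tmpdir():
    # create a throw-away directory and remove it afterwards
    d = tempfile.mkdtemp()
    try:
        yield d
    finally:
        shutil.rmtree(d)

@contextlib.contextmanager
def measure(name, result):
    # record the elapsed wall-clock time of the block under `name`
    t0 = time.perf_counter()
    yield
    result[name] = time.perf_counter() - t0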
Example #2
def test_bad_file_paths(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.=.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y/z')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    assert pf.file_scheme == 'other'
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in out

    path1 = os.path.join(tempdir, 'data')
    fn1 = os.path.join(path1, 'out.parq')
    os.makedirs(path1)
    write(fn1, df)
    path2 = os.path.join(tempdir, 'data2')
    fn2 = os.path.join(path2, 'out.parq')
    os.makedirs(path2)
    write(fn2, df)
    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
Example #3
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
Example #4
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice([b"hello", b"you", b"people"], size=1000).astype("O"),
        }
    )
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")
    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups, compression=comp)

    r = ParquetFile(fname)

    df = r.to_pandas()

    assert data.cat.dtype == "category"

    for col in r.columns:
        assert (df[col] == data[col]).all()
Example #5
def test_roundtrip_complex(tempdir, scheme):
    import datetime

    data = pd.DataFrame(
        {
            "ui32": np.arange(1000, dtype=np.uint32),
            "i16": np.arange(1000, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(1000, dtype=np.float16),
            "dicts": [{"oi": "you"}] * 1000,
            "t": [datetime.datetime.now()] * 1000,
            "td": [datetime.timedelta(seconds=1)] * 1000,
            "bool": np.random.choice([True, False], size=1000),
        }
    )
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)

    r = ParquetFile(fname)

    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
Example #6
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]
Example #7
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.columns == ["b"]
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50], partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.count == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
Example #8
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    df["e"] = df["d"].astype("category")
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(TypeError):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
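    # pf.schema is indexable like a flat list of thrift schema elements;
    # entry 0 is the schema root, so the dataframe's columns start at index 1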
    for col in pf.schema[2:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    assert pf.schema[1].repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf.schema[1:3]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    assert pf.schema[4].repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)
Example #9
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile
    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
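    # rewrite each column chunk's file_path to be relative to `base`, so the
    # single-file metadata can be reused as part of a dataset rooted at `base`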
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)
    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
Example #10
def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']
Example #11
def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]
Example #12
def test_filter_stats(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
    })
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
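    # filters drop whole row groups based on their min/max statistics: with
    # row_group_offsets=[0, 4] the first group (x = 1..4) is skipped and the
    # second group (x = 5..7) is read in full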
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]
Example #13
def test_in_filter(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbols', 'in', ['a', 'c'])])
    assert set(out.symbols) == {'a', 'c'}
Example #14
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()

    tm.assert_frame_equal(df, df2, check_categorical=False)
Example #15
def test_to_pandas():
    fname = TEST_DATA+'/airlines_parquet/4345e5eef217aa1b-c8f16177f35fd983_1150363067_data.1.parq'
    pf = ParquetFile(fname)
    out = pf.to_pandas()
    assert len(out.columns) == 29
    # test for bad integer conversion
    assert (out.dep_time < 0).sum() == 0
    assert out.dep_time.dtype == 'float64'
Example #16
def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), "test.parq")
    df = pd.DataFrame({"a": [1, 2, 3, 0], "b": ["a", "a", "b", "b"]})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)

    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.util.testing.assert_frame_equal(pf.to_pandas(), expected, check_categorical=False)
Example #17
def test_request_nonexistent_column(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)

    pf = ParquetFile(fn)
    with pytest.raises(ValueError):
        pf.to_pandas(columns=['y'])
Example #18
def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']
Example #19
def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
Example #20
def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*'))))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)
Example #21
def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame({"o": np.random.choice(["hello", "world", None], size=1000)})
    data["cat"] = data["o"].astype("category")
    writer.write(fname, data, has_nulls=["o", "cat"])

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()
Example #22
def test_datetime_partition_names(tempdir):
    date_strings = ['2015-05-09', '2018-10-15', '2020-10-17', '2015-05-09']
    df = pd.DataFrame({
        'date': date_strings,
        'x': [1, 5, 2, 5]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert set(out.date.tolist()) == set(pd.to_datetime(date_strings).tolist())
    assert out[out.date == '2015-05-09'].x.tolist() == [1, 5]
    assert out[out.date == '2020-10-17'].x.tolist() == [2]
Example #23
def test_filter_without_paths(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.util.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
Example #24
def test_multi_cat_fail(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)

    pf = ParquetFile(fn)
    with pytest.raises(RuntimeError):
        pf.to_pandas()
Example #25
def test_multi(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df)

    pf = ParquetFile(fn)
    df1 = pf.to_pandas()
    assert df1.equals(df)
    assert df1.loc[1, 'a'].equals(df.loc[1, 'a'])
Example #26
def test_hive_and_drill_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.0.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert out.dir0.tolist() == ['x=0'] * 3 + ['y'] * 3
Example #27
def test_single_upper_directory(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, df, file_scheme='hive', partition_on='y')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*/*'))))
    pf = ParquetFile(flist, root=tempdir)
    assert pf.fn == join_path(os.path.join(tempdir, '_metadata'))
    out = pf.to_pandas()
    assert (out.y == 'aa').all()
Example #28
def test_datetime_roundtrip(tempdir, df, capsys):
    fname = os.path.join(tempdir, "test.parquet")
    write(fname, df)

    r = ParquetFile(fname)
    out, err = capsys.readouterr()
    if "x" in df and str(df.x.dtype.tz) == "Europe/London":
        # warning happens first time only
        assert "UTC" in err

    df2 = r.to_pandas()
    if "x" in df:
        df["x"] = df.x.dt.tz_convert(None)

    pd.util.testing.assert_frame_equal(df, df2, check_categorical=False)
Example #29
def test_multi_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x')
    write(dir1, df, file_scheme='hive')
    dir2 = os.path.join(tempdir, 'y')
    write(dir2, df, file_scheme='hive')
    dir3 = os.path.join(tempdir, 'z', 'deep')
    write(dir3, df, file_scheme='hive')

    pf = ParquetFile([dir1, dir2])
    out = pf.to_pandas()  # this version may have extra column!
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    pf = ParquetFile([dir1, dir2, dir3])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 3
Example #30
def test_append(tempdir, row_groups, partition):
    fn = str(tempdir)
    df0 = pd.DataFrame({"a": [1, 2, 3, 0], "b": ["a", "b", "a", "b"], "c": True})
    df1 = pd.DataFrame({"a": [4, 5, 6, 7], "b": ["a", "b", "a", "b"], "c": False})
    write(fn, df0, partition_on=partition, file_scheme="hive", row_group_offsets=row_groups)
    write(fn, df1, partition_on=partition, file_scheme="hive", row_group_offsets=row_groups, append=True)

    pf = ParquetFile(fn)

    expected = pd.concat([df0, df1], ignore_index=True)

    assert len(pf.row_groups) == 2 * len(row_groups) * (len(partition) + 1)
    items_out = {tuple(row[1]) for row in pf.to_pandas()[["a", "b", "c"]].iterrows()}
    items_in = {tuple(row[1]) for row in expected.iterrows()}
    assert items_in == items_out
Example #31
def create_train_test_features(
    tokenizer: tokenization.FullTokenizer
) -> Tuple[run_classifier.InputFeatures, run_classifier.InputFeatures]:
    train_input_examples, test_input_examples = (ParquetFile(
        data_filename(dataset_name)).to_pandas().sample(SAMPLE_SIZE).apply(
            create_bert_input_example,
            axis=1) for dataset_name in DATASET_NAMES)

    train_features, test_features = (
        run_classifier.convert_examples_to_features(input_examples, LABEL_LIST,
                                                    MAX_SEQ_LENGTH, tokenizer)
        for input_examples in (train_input_examples, test_input_examples))

    return train_features, test_features
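Example #31 depends on module-level names that are not shown (`DATASET_NAMES`, `SAMPLE_SIZE`, `LABEL_LIST`, `MAX_SEQ_LENGTH`, `data_filename`, `create_bert_input_example`). A rough, purely hypothetical sketch of the data-related pieces, assuming two parquet datasets on local disk:

# hypothetical values and layout, not taken from the original project
DATASET_NAMES = ("train", "test")
SAMPLE_SIZE = 10000

def data_filename(dataset_name):
    # hypothetical helper: one parquet file per dataset
    return "data/{}.parquet".format(dataset_name)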
Example #32
def test_consolidate_cats(tempdir):
    import json
    df = pd.DataFrame({'x': pd.Categorical([1, 2, 1])})
    fn = os.path.join(tempdir, 'temp.parq')
    write(fn, df)
    pf = ParquetFile(fn)
    assert 2 == json.loads(pf.fmd.key_value_metadata[0].value
                           )['columns'][0]['metadata']['num_categories']
    start = pf.row_groups[0].columns[0].meta_data.key_value_metadata[0].value
    assert start == '2'
    pf.row_groups[0].columns[0].meta_data.key_value_metadata[0].value = '5'
    writer.consolidate_categories(pf.fmd)
    assert 5 == json.loads(pf.fmd.key_value_metadata[0].value
                           )['columns'][0]['metadata']['num_categories']
Example #33
def test_append(tempdir, row_groups, partition):
    fn = str(tempdir)
    df0 = pd.DataFrame({'a': [1, 2, 3, 0],
                        'b': ['a', 'b', 'a', 'b'],
                        'c': True})
    df1 = pd.DataFrame({'a': [4, 5, 6, 7],
                        'b': ['a', 'b', 'a', 'b'],
                        'c': False})
    write(fn, df0, partition_on=partition, file_scheme='hive',
          row_group_offsets=row_groups)
    write(fn, df1, partition_on=partition, file_scheme='hive',
          row_group_offsets=row_groups, append=True)

    pf = ParquetFile(fn)

    expected = pd.concat([df0, df1], ignore_index=True)

    assert len(pf.row_groups) == 2 * len(row_groups) * (len(partition) + 1)
    items_out = {tuple(row[1])
                 for row in pf.to_pandas()[['a', 'b', 'c']].iterrows()}
    items_in = {tuple(row[1])
                for row in expected.iterrows()}
    assert items_in == items_out
Example #34
def _read_fp_multifile(fs, fs_token, paths, columns=None,
                       categories=None, index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path
    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(
        pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns,
                                      categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base,
                       index_names, all_columns, out_type == Series,
                       categories, pf.cats,
                       pf.file_scheme, storage_name_mapping)
           for i, path in enumerate(parsed_paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
Example #35
def getEPAHistData(month, yr):
    """Function to get raw epa data from s3"""

    # Change this to use the csv file being modified every hour
    try:
        try:
            s3 = s3fs.S3FileSystem()
            myopen = s3.open
            s3_resource = boto3.resource('s3')
            s3_resource.Object('midscapstone-whos-polluting-my-air', 'EpaRaw/epa_20{}{}.parquet'.format(yr, month)).load()
            pf=ParquetFile('midscapstone-whos-polluting-my-air/EpaRaw/epa_20{}{}.parquet'.format(yr, month), open_with=myopen)
            epa_df=pf.to_pandas()
        except:
            raise CustomError("FILE ERROR: Epa Raw Dataframe not found")

        # Add a datekey column based on local date
        epa_df.rename(columns={'Latitude':'lat', 'Longitude':'lon', 'UTC':'utc', 'Parameter':'parameter', 'Unit':'epa_pm25_unit', 'Value':'epa_pm25_value',
                    'RawConcentration':'raw_concentration', 'AQI':'aqi', 'Category':'category', 'SiteName':'site_name', 'AgencyName':'agency_name',
                    'FullAQSCode':'full_aqs_code', 'IntlAQSCode':'intl_aqs_code'}, inplace=True)
        epa_df['created'] = epa_df['utc'].apply(lambda x: int(datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S').replace(tzinfo=tz.tzutc()).astimezone(timezone('US/Pacific')).strftime("%Y%m%d%H%M")))
    except Exception as e:
        print("*** EXCEPTION IN GET EPA HIST DATA *** {}".format(e))
    return epa_df
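Example #35 assumes several imports and a project-specific `CustomError` exception that are not shown; a minimal sketch of those assumptions (choosing `pytz` for `timezone` is a guess):

import datetime

import boto3
import s3fs
from dateutil import tz
from pytz import timezone

from fastparquet import ParquetFile

class CustomError(Exception):
    # placeholder for the project's own error type
    pass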
Example #36
def test_cat_order(tempdir):
    # #629
    fn = os.path.join(tempdir, 'temp.parq')
    cat = ['hot', 'moderate', 'cold']
    catdtype = pd.CategoricalDtype(cat, ordered=True)
    val = [30, -10, 10]
    cities = ['Lisbonne', 'Paris', 'Paris']
    df = pd.DataFrame({'val': val, 'cat': cat, 'city': cities})
    df['cat'] = df['cat'].astype(catdtype)
    write(fn, df, file_scheme='hive', partition_on=['city'])

    out = ParquetFile(fn).to_pandas()
    assert out.cat.cat.ordered
    assert out.cat.cat.categories.tolist() == catdtype.categories.tolist()
Example #37
def test_hasnulls_ordering(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'a': np.random.rand(100),
                         'b': np.random.rand(100),
                         'c': np.random.rand(100)})
    writer.write(fname, data, has_nulls=['a', 'c'])

    r = ParquetFile(fname)
    assert r._schema[1].name == 'a'
    assert r._schema[1].repetition_type == 1
    assert r._schema[2].name == 'b'
    assert r._schema[2].repetition_type == 0
    assert r._schema[3].name == 'c'
    assert r._schema[3].repetition_type == 1
Example #38
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame({'i32': np.arange(1000, dtype=np.int32),
                         'i64': np.arange(1000, dtype=np.int64),
                         'f': np.arange(1000, dtype=np.float64),
                         'bhello': np.random.choice([b'hello', b'you',
                            b'people'], size=1000).astype("O")})
    data['a'] = np.array([b'a', b'b', b'c', b'd', b'e']*200, dtype="S1")
    data['aa'] = data['a'].map(lambda x: 2*x).astype("S2")
    data['hello'] = data.bhello.str.decode('utf8')
    data['bcat'] = data.bhello.astype('category')
    data['cat'] = data.hello.astype('category')
    fname = os.path.join(tempdir, 'test.parquet')
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp)

    r = ParquetFile(fname)

    df = r.to_pandas()

    assert data.cat.dtype == 'category'

    for col in r.columns:
        assert (df[col] == data[col]).all()
Example #39
    def incremental_train_with_parquet(self, parquet_path):
        print("Training incrementally with parquet...")
        nrows = 0
        pf = ParquetFile(parquet_path)
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes).get_unique_binarized_labels(
                parquet_path, "tema", True)
        for df in pf.iter_row_groups():
            df = df.reset_index()
            self._update_dataframe(df,
                                   is_parquet=True,
                                   labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector.toarray(),
                                           y_train,
                                           classes=classes)
            nrows += len(self.df)
            print("{} rows already trained\n".format(nrows))
            clear_output(wait=True)
Example #40
def test_cast_index(tempdir):
    df = pd.DataFrame({'i8': np.array([1, 2, 3, 4], dtype='uint8'),
                       'i16': np.array([1, 2, 3, 4], dtype='int16'),
                       'i32': np.array([1, 2, 3, 4], dtype='int32'),
                       'i62': np.array([1, 2, 3, 4], dtype='int64'),
                       'f16': np.array([1, 2, 3, 4], dtype='float16'),
                       'f32': np.array([1, 2, 3, 4], dtype='float32'),
                       'f64': np.array([1, 2, 3, 4], dtype='float64'),
                       })
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    for col in list(df):
        d = pf.to_pandas(index=col)
        if d.index.dtype.kind == 'i':
            assert d.index.dtype == 'int64'
        elif d.index.dtype.kind == 'u':
            # new UInt64Index
            assert pd.__version__ >= '0.20'
            assert d.index.dtype == 'uint64'
        else:
            assert d.index.dtype == 'float64'
        assert (d.index == df[col]).all()
Example #41
    def convert_to_npy(df=None, save=True, modalities=None):
        if df is None:
            df = ParquetFile(
                os.path.join(project_dir, 'data', 'interim',
                             'data.parq')).to_pandas().set_index('date')

        user_data = list()
        for user, group in df.groupby('user'):
            # Select activity
            activity = group[group['modality'] == 'cpm']

            # Require 8 hours of data
            activity = activity[pd.isnull(activity).sum(axis=1) < (16 * 12)]

            if activity.modality.count() >= 120:
                group = group.loc[activity.index.tolist()]

                # Extract modalities
                modality_data = list()
                modality_grouped = group.groupby('modality')
                for modality in modalities:
                    modality_data.append(
                        modality_grouped.get_group(modality).drop(['modality'],
                                                                  axis=1))

                # We concatenate on dates to ensure the same dimension across modalities
                user_data.append(
                    pd.concat(modality_data,
                              axis=1).values.reshape(-1, len(modality_data),
                                                     289).transpose(0, 2, 1))

        data = np.concatenate(user_data, axis=0)
        if save:
            np.save(os.path.join(project_dir, 'data', 'interim', 'data.npy'),
                    data)

        return data
Example #42
def run_test(input_file: str, output_dir: str, filters: list):
    print('Using fastparquet')

    pf = ParquetFile(input_file)
    print('Parquet metadata: ' + str(pf.info))
    print('Parquet schema: ' + str(pf.schema))
    print('Parquet columns: ' + str(pf.columns))
    print('Parquet count (total number of rows): ' + str(pf.count))
    print('Parquet dtypes: ' + str(pf.dtypes))
    print('Parquet statistics: ' + str(pf.statistics))
    print('Parquet cats: ' +
          str(pf.cats))  # possible values of each partitioning field
    print('Parquet row_groups number: ' + str(len(pf.row_groups)))
    # print('Parquet row_groups: ' + str(pf.row_groups))

    with timeblock('fastparquet read and filter'):
        data = pf.to_pandas(filters=filters)
        # data: RowGroup = pf.filter_row_groups(filters=filters)
    # for df in pf.iter_row_groups():
    #     print(df.shape)

    size = sys.getsizeof(data)
    print('Size of filtered Pandas dataframe in memory: ' + str(size) +
          ' bytes (' + str(size / 1000000) + ' MB)')

    milliseconds_since_epoch = int(time() * 1000)
    output_file = output_dir + str(milliseconds_since_epoch) + '.parquet'
    print('Output file name: ' + output_file)

    with timeblock('fastparquet write()'):
        write(output_file, data, compression='SNAPPY')

    pf = ParquetFile(output_file)
    print('Parquet metadata of output: ' + str(pf.info))
    print('Parquet schema of output: ' + str(pf.schema))
    print('Size of output file on disk: ' + str(os.path.getsize(output_file)) +
          ' bytes (' + str(os.path.getsize(output_file) / 1000000) + ' MB)')
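The `timeblock` context manager used above is not part of fastparquet; a minimal sketch of what such a timer might look like, assuming it only prints the elapsed wall-clock time of the block:

import contextlib
import time

@contextlib.contextmanager
def timeblock(label):
    # print how long the enclosed block took
    start = time.perf_counter()
    try:
        yield
    finally:
        print('{}: {:.3f} s'.format(label, time.perf_counter() - start))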
Example #43
def test_compression_lz4(tempdir):
    pytest.importorskip('lz4')

    df = pd.DataFrame({
        'x': np.arange(1000),
        'y': np.arange(1, 1001),
        'z': np.arange(2, 1002),
    })

    fn = os.path.join(tempdir, 'foocomp.parquet')

    c = {
        "x": {
            "type": "gzip",
            "args": {
                "compresslevel": 5,
            }
        },
        "y": {
            "type": "lz4",
            "args": {
                "compression": 5,
                "store_size": False,
            }
        },
        "_default": {
            "type": "gzip",
            "args": None
        }
    }
    write(fn, df, compression=c)

    p = ParquetFile(fn)

    df2 = p.to_pandas()

    pd.util.testing.assert_frame_equal(df, df2)
Example #44
def read_parquet_on_ha_hdfs():
    """
    Read parquet file on HA mode hdfs
    :return:
    """

    ns = "nameservice1"
    conf = {
        "dfs.nameservices": "nameservice1",
        "dfs.ha.namenodes.nameservice1": "namenode113,namenode188",
        "dfs.namenode.rpc-address.nameservice1.namenode113":
        "hostname_of_server1:8020",
        "dfs.namenode.rpc-address.nameservice1.namenode188":
        "hostname_of_server2:8020",
        "dfs.namenode.http-address.nameservice1.namenode113":
        "hostname_of_server1:50070",
        "dfs.namenode.http-address.nameservice1.namenode188":
        "hostname_of_server2:50070",
        "hadoop.security.authentication": "kerberos"
    }
    hdfs = HDFileSystem(host=ns, pars=conf)
    sc = hdfs.open
    pf = ParquetFile("/user/hive/warehouse/test.db/test.parquet", open_with=sc)
    print(pf.to_pandas())
Example #45
def test_iter(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    out = iter(pf.iter_row_groups(index='index'))
    d1 = next(out)
    pd.testing.assert_frame_equal(d1,
                                  df[:2],
                                  check_dtype=False,
                                  check_index_type=False)
    d2 = next(out)
    pd.testing.assert_frame_equal(d2,
                                  df[2:],
                                  check_dtype=False,
                                  check_index_type=False)
    with pytest.raises(StopIteration):
        next(out)
Example #46
def test_attributes(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    pf = ParquetFile(fn)
    assert pf.columns == ['x', 'y', 'z']
    assert len(pf.row_groups) == 2
    assert pf.count == 4
    assert fn == pf.info['name']
    assert fn in str(pf)
    for col in df:
        assert pf.dtypes[col] == df.dtypes[col]
Example #47
def test_sorted_row_group_columns(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])

    pf = ParquetFile(fn)

    result = sorted_partitioned_columns(pf)
    expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}

    assert result == expected
Example #48
def test_timestamp_filer(tempdir):
    fn = os.path.join(tempdir, 'test.parquet')
    ts = [
        pd.Timestamp('2021/01/01 08:00:00'),
        pd.Timestamp('2021/01/05 10:00:00')
    ]
    val = [10, 34]
    df = pd.DataFrame({'val': val, 'ts': ts})
    # two row-groups
    write(fn, df, row_group_offsets=1, file_scheme='hive')

    ts_filter = pd.Timestamp('2021/01/03 00:00:00')
    pf = ParquetFile(fn)
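    # a list of lists of tuples is treated as an OR of AND-groups; here each of
    # the two row groups matches one of the OR branches, so every row is returned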
    filt = [[('ts', '<', ts_filter)], [('ts', '>=', ts_filter)]]
    assert pf.to_pandas(filters=filt).val.tolist() == [10, 34]

    filt = [[('ts', '>=', ts_filter)], [('ts', '<', ts_filter)]]
    assert pf.to_pandas(filters=filt).val.tolist() == [10, 34]

    ts_filter_down = pd.Timestamp('2021/01/03 00:00:00')
    ts_filter_up = pd.Timestamp('2021/01/06 00:00:00')
    # AND filter
    filt = [[('ts', '>=', ts_filter_down), ('ts', '<', ts_filter_up)]]
    assert pf.to_pandas(filters=filt).val.tolist() == [34]
Example #49
def test_sorted_row_group_columns(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': [{
            'a': 0
        }, {
            'b': -1
        }, {
            'c': 5
        }, {
            'a': 0
        }],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn,
          df,
          row_group_offsets=[0, 2],
          object_encoding={
              'v': 'json',
              'z': 'utf8'
          })

    pf = ParquetFile(fn)

    # string stats should be stored without byte-encoding
    zcol = [
        c for c in pf.row_groups[0].columns
        if c.meta_data.path_in_schema == ['z']
    ][0]
    assert zcol.meta_data.statistics.min == b'a'

    result = sorted_partitioned_columns(pf)
    expected = {
        'x': {
            'min': [1, 3],
            'max': [2, 4]
        },
        'z': {
            'min': ['a', 'c'],
            'max': ['b', 'd']
        }
    }

    # NB column v should not feature, as dict are unorderable
    assert result == expected
Example #50
def test_no_index_name(tempdir):
    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]}).set_index('__index_level_0__')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(index='__index_level_0__', columns=['b'])
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == [0, 1, 2]
Example #51
    def add_indices(self, chunk, part_id):
        chunk_fname = self.get_fname(part_id)
        pf = ParquetFile(chunk_fname)
        row_group_offsets = [0]
        for rg in pf.row_groups:
            row_group_offsets.append(row_group_offsets[-1] + rg.num_rows)
        row_group_offsets = row_group_offsets[:len(pf.row_groups)]
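        # keep only the starting row index of each row group; the final entry
        # (the cumulative total) is dropped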

        high_cardinality_cols = set(chunk.cols) - set(
            self.categorical_cols) - set(self.partition_cols)
        for col in high_cardinality_cols:
            self.add_high_cardinality_index(chunk, col, part_id,
                                            row_group_offsets)

        for col in self.categorical_cols:
            self.add_categorical_index(chunk, col, part_id, row_group_offsets)
Example #52
def append(bucket, key1, key2, s3, output_filename):
    s3_open = s3.open
    path1 = '{}{}'.format(bucket, key1)
    pf1 = ParquetFile(path1, open_with=s3_open)
    df1 = pf1.to_pandas()
    path2 = '{}{}'.format(bucket, key2)
    pf2 = ParquetFile(path2, open_with=s3_open)
    df2 = pf2.to_pandas()
    data = df1.append(df2)

    # pwrite is assumed to be fastparquet.write imported under an alias
    pwrite('{}{}'.format(bucket, output_filename), data, open_with=s3_open,
           compression='GZIP', append=False, has_nulls=True)
Example #53
def check_exists(s3: S3, frame: pd.DataFrame, table_name: str, table_partitions: List[AnyStr]):
    table_exists = s3.fs.exists(f'structured/{table_name}/_metadata')
    if table_exists:
        dataset = ParquetFile(f'structured/{table_name}', open_with=s3.fs.open)
        if not verify_schema(dataset, frame, table_partitions):
            old_files = [
                fn.split(f'structured/{table_name}/')[-1]
                for fn in s3.fs.find(f'structured/{table_name}')
            ]
            deprecation_date = datetime.now().replace(microsecond=0).isoformat()
            for old_file in old_files:
                s3.fs.copy(f'structured/{table_name}/{old_file}',
                           f'structured/deprecated/{deprecation_date}/{table_name}/{old_file}')

            s3.fs.rm(f'structured/{table_name}', recursive=True)
            table_exists = False
    return table_exists
Example #54
def test_partition_columns(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)

    # partition columns always come after actual columns
    assert pf.to_pandas().columns.tolist() == ['values', 'symbols']
    assert pf.to_pandas(columns=['symbols']).columns.tolist() == ['symbols']
    assert pf.to_pandas(columns=['values']).columns.tolist() == ['values']
    assert pf.to_pandas(columns=[]).columns.tolist() == []
Example #55
def test_write_with_rgp_by_date_as_index(tempdir):

    # Step 1 - Writing of a 1st df, with `row_group_offsets=0`,
    # `file_scheme=hive` and `partition_on=['location', 'color`].
    df1 = pd.DataFrame({
        'humidity': [0.3, 0.8, 0.9],
        'pressure': [1e5, 1.1e5, 0.95e5],
        'location': ['Paris', 'Paris', 'Milan'],
        'color': ['red', 'black', 'blue']
    })
    write(tempdir,
          df1,
          row_group_offsets=0,
          file_scheme='hive',
          partition_on=['location', 'color'])

    # Step 2 - Overwriting with a 2nd df having overlapping data, in
    # 'overwrite' mode:
    # `row_group_offsets=0`, `file_scheme=hive`,
    # `partition_on=['location', 'color`] and `append=True`.
    df2 = pd.DataFrame({
        'humidity': [0.5, 0.3, 0.4, 0.8, 1.1],
        'pressure': [9e4, 1e5, 1.1e5, 1.1e5, 0.95e5],
        'location': ['Milan', 'Paris', 'Paris', 'Paris', 'Paris'],
        'color': ['red', 'black', 'black', 'green', 'green']
    })

    write(tempdir,
          df2,
          row_group_offsets=0,
          file_scheme='hive',
          append='overwrite',
          partition_on=['location', 'color'])

    expected = pd.DataFrame({'humidity': [0.9, 0.5, 0.3, 0.4, 0.8, 1.1, 0.3],
                             'pressure': [9.5e4, 9e4, 1e5, 1.1e5, 1.1e5, 9.5e4, 1e5],
                             'location': ['Milan', 'Milan', 'Paris', 'Paris', 'Paris', 'Paris', 'Paris'],
                             'color': ['blue', 'red', 'black', 'black', 'green', 'green', 'red']})\
                           .astype({'location': 'category', 'color': 'category'})
    recorded = ParquetFile(tempdir).to_pandas()
    # df1 is 3 rows, df2 is 5 rows. Because of overlapping data with keys
    # 'location' = 'Paris' & 'color' = 'black' (1 row in df1, 2 rows in df2),
    # the resulting df contains the values from df2 for this combination, not
    # those from df1. Total resulting number of rows is 7.
    assert expected.equals(recorded)
Example #56
def test_cmd_bytesize(tempdir, cmp):
    from fastparquet import core
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'s': ['a', 'b']}, dtype='category')
    write(fn, df, compression=cmp)
    pf = ParquetFile(fn)
    chunk = pf.row_groups[0].columns[0]
    cmd = chunk.meta_data
    csize = cmd.total_compressed_size
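    # the column chunk holds a dictionary page followed by one data page;
    # walking both page headers and their payloads should end exactly
    # total_compressed_size bytes past dictionary_page_offset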
    f = open(fn, 'rb')
    f.seek(cmd.dictionary_page_offset)
    ph = core.read_thrift(f, parquet_thrift.PageHeader)
    c1 = ph.compressed_page_size
    f.seek(c1, 1)
    ph = core.read_thrift(f, parquet_thrift.PageHeader)
    c2 = ph.compressed_page_size
    f.seek(c2, 1)
    assert csize == f.tell() - cmd.dictionary_page_offset
Example #57
def test_sorted_row_group_columns(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], object_encoding={'v': 'json',
                                                             'z': 'utf8'})

    pf = ParquetFile(fn)

    result = sorted_partitioned_columns(pf)
    expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}

    # NB column v should not feature, as dict are unorderable
    assert result == expected
Example #58
def test_append_empty(tempdir, scheme):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'b': ['a', 'a', 'b', 'b']})
    write(fn, df.head(0), write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    assert pf.count() == 0
    assert pf.file_scheme == 'empty'
    write(fn, df, append=True, write_index=False, file_scheme=scheme)

    pf = ParquetFile(fn)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), df, check_categorical=False, check_dtype=False)
Example #59
def read_single(n, type):
    #print(type)
    if type == 'item':
        pf = ParquetFile(
            '/itemFactors/part-0000' + str(n) +
            '-bb0e8317-d384-4c08-824c-0b2a8661846f-c000.snappy.parquet')
        return pf.to_pandas()
    elif type == 'user':
        pf = ParquetFile(
            '/userFactors/part-0000' + str(n) +
            '-e7a03551-5ae9-4231-b614-549034330d20-c000.snappy.parquet')
        return pf.to_pandas()
    return -1
Example #60
def test_write_partitioned_with_empty_categories(tempdir):
    df = pd.DataFrame({
        'b':
        np.random.random(size=1000),
        'a':
        pd.Series(np.random.choice(['x', 'z'], size=1000)).astype(
            CategoricalDtype(categories=['x', 'y', 'z'])),
    })
    write(tempdir,
          df,
          partition_on=['a'],
          file_scheme='hive',
          write_index=True)
    out = ParquetFile(tempdir).to_pandas()
    assert_frame_equal(out,
                       df,
                       check_like=True,
                       check_categorical=False,
                       check_names=False)