Example No. 1
def test_pyspark_roundtrip(tempdir, scheme, row_groups, comp, sql):
    if comp in ['BROTLI', 'ZSTD', 'LZO', "LZ4"]:
        pytest.xfail("spark doesn't support this compression codec")
    data = pd.DataFrame({'i32': np.random.randint(-2**17, 2**17, size=1001,
                                                  dtype=np.int32),
                         'i64': np.random.randint(-2**33, 2**33, size=1001,
                                                  dtype=np.int64),
                         'f': np.random.randn(1001),
                         'bhello': np.random.choice([b'hello', b'you',
                            b'people'], size=1001).astype("O"),
                         't': [datetime.datetime.now()]*1001})

    data['t'] += pd.to_timedelta('1ns')
    data['hello'] = data.bhello.str.decode('utf8')
    data.loc[100, 'f'] = np.nan
    data['bcat'] = data.bhello.astype('category')
    data['cat'] = data.hello.astype('category')

    fname = os.path.join(tempdir, 'test.parquet')
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp, times='int96', write_index=True)

    df = sql.read.parquet(fname)
    ddf = df.sort('index').toPandas()
    for col in data:
        if data[col].dtype.kind == "M":
            # pyspark auto-converts timezones
            offset = round((datetime.datetime.utcnow() -
                            datetime.datetime.now()).seconds / 3600)
            assert (ddf[col] + datetime.timedelta(hours=offset) == data[col]).all()
        else:
            assert (ddf[col] == data[col])[~ddf[col].isnull()].all()
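The test above receives tempdir, scheme, row_groups, comp and sql from pytest fixtures and parametrization defined elsewhere in fastparquet's test suite. A minimal sketch of what such definitions could look like (names and parameter values here are assumptions, not the library's actual conftest):

import shutil
import tempfile

import pytest


@pytest.fixture()
def tempdir():
    # fresh temporary directory per test, removed afterwards
    d = tempfile.mkdtemp()
    yield d
    shutil.rmtree(d, ignore_errors=True)

# scheme/row_groups/comp would be supplied by parametrization, e.g.:
# @pytest.mark.parametrize('scheme', ['simple', 'hive'])
# @pytest.mark.parametrize('row_groups', [[0], [0, 500]])
# @pytest.mark.parametrize('comp', [None, 'GZIP', 'SNAPPY'])
# and 'sql' would be a SparkSession fixture, e.g.:
# @pytest.fixture(scope="module")
# def sql():
#     from pyspark.sql import SparkSession
#     return SparkSession.builder.master("local[2]").getOrCreate()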
Example No. 2
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
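For context: file_scheme='hive' with partition_on writes one subdirectory per partition value, and the partition column is rebuilt from the directory names on read (the float values of x round-trip unchanged, while y1 becomes the directory key). A hypothetical on-disk layout for this test; the actual part-file names may differ:

# tempdir/
#     _metadata
#     _common_metadata
#     y1=aa/part.0.parquet
#     y1=bb/part.0.parquet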
Example No. 3
def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result
Example No. 4
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice([b"hello", b"you", b"people"], size=1000).astype("O"),
        }
    )
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")
    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups, compression=comp)

    r = ParquetFile(fname)

    df = r.to_pandas()

    assert data.cat.dtype == "category"

    for col in r.columns:
        assert (df[col] == data[col]).all()
Example No. 5
def test_roundtrip_complex(tempdir, scheme):
    import datetime

    data = pd.DataFrame(
        {
            "ui32": np.arange(1000, dtype=np.uint32),
            "i16": np.arange(1000, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(1000, dtype=np.float16),
            "dicts": [{"oi": "you"}] * 1000,
            "t": [datetime.datetime.now()] * 1000,
            "td": [datetime.timedelta(seconds=1)] * 1000,
            "bool": np.random.choice([True, False], size=1000),
        }
    )
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)

    r = ParquetFile(fname)

    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
Example No. 6
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]
Example No. 7
def test_merge(tempdir, dirs, row_groups):
    fn = str(tempdir)

    os.makedirs(os.path.join(fn, dirs[0]), exist_ok=True)
    df0 = pd.DataFrame({"a": [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, dirs[0], "out0.parq"])
    write(fn0, df0, row_group_offsets=row_groups)

    os.makedirs(os.path.join(fn, dirs[1]), exist_ok=True)
    df1 = pd.DataFrame({"a": [5, 6, 7, 8]})
    fn1 = os.sep.join([fn, dirs[1], "out1.parq"])
    write(fn1, df1, row_group_offsets=row_groups)

    # with file-names
    pf = writer.merge([fn0, fn1])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert "cat" in pf.cats

    # with instances
    pf = writer.merge([ParquetFile(fn0), ParquetFile(fn1)])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert "cat" in pf.cats
Example No. 8
def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']
Example No. 9
def test_filter_stats(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
    })
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]
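For plain (non-partition) columns, fastparquet evaluates filters against each row group's stored min/max statistics, so with row_group_offsets=[0, 4] the first row group (x = 1..4) is skipped entirely and the second (x = 5..7) is read. A short sketch of inspecting those statistics, assuming the same tempdir as in the test:

pf = ParquetFile(tempdir)
print(pf.statistics['min']['x'])  # one entry per row group, e.g. [1, 5]
print(pf.statistics['max']['x'])  # e.g. [4, 7]
out = pf.to_pandas(filters=[('x', '>=', 5)])  # only the second row group is loaded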
Example No. 10
def test_in_filter(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbols', 'in', ['a', 'c'])])
    assert set(out.symbols) == {'a', 'c'}
Example No. 11
def test_mixed_partition_types_warning(tempdir, partitions):
    df = pd.DataFrame({
        'partitions': partitions,
        'x': [1, 2]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    with pytest.warns(UserWarning, match=r'Partition names coerce to values of different types.*'):
        ParquetFile(tempdir)
Example No. 12
def test_datetime_partition_no_dupilcates(tempdir, partitions):
    df = pd.DataFrame({
        'partitions': partitions,
        'x': [1, 2]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    with pytest.raises(ValueError, match=r'Partition names map to the same value.*'):
        ParquetFile(tempdir)
Example No. 13
def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]
Example No. 14
def test_request_nonexistent_column(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)

    pf = ParquetFile(fn)
    with pytest.raises(ValueError):
        pf.to_pandas(columns=['y'])
Example No. 15
def test_grab_cats(tempdir):
    s = pd.Series(['a', 'c', 'b']*20)
    df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                       'c': s.astype('category').cat.as_ordered()})
    fastparquet.write(tempdir, df, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    cats = pf.grab_cats(['b', 'c'])
    assert (cats['b'] == df.b.cat.categories).all()
    assert (cats['c'] == df.c.cat.categories).all()
Example No. 16
def test_2():
    # to make and save a large-ish DataFrame
    N = 10000000
 
    df = pd.DataFrame({'ints': np.random.randint(0, 1000, size=N),
                       'floats': np.random.randn(N),
                       'times': pd.date_range(start='1980', freq='s', periods=N)})
    df.to_csv('test_2.csv')
    fastparquet.write('test_2_UNCOMPRESSED.parq', df, compression='UNCOMPRESSED')
Example No. 17
def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), "test.parq")
    df = pd.DataFrame({"a": [1, 2, 3, 0], "b": ["a", "a", "b", "b"]})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)

    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.util.testing.assert_frame_equal(pf.to_pandas(), expected, check_categorical=False)
Example No. 18
def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
Example No. 19
def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']
Example No. 20
def test_datetime_category_no_dupilcates(tempdir, categories):
    # The purpose of this test is to ensure that the changes made for the previous test
    # haven't broken categories in general.
    df = pd.DataFrame({
        'categories': categories,
        'x': [1, 2]
    }).astype({'categories': 'category'})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    assert ParquetFile(fn).to_pandas().categories.tolist() == categories
Example No. 21
def test_index(tempdir):
    s = pd.Series(['a', 'c', 'b']*20)
    df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                       'c': range(60, 0, -1)})

    for column in df:
        d2 = df.set_index(column)
        fastparquet.write(tempdir, d2, file_scheme='hive', write_index=True)
        pf = fastparquet.ParquetFile(tempdir)
        out = pf.to_pandas(index=column, categories=['b'])
        pd.util.testing.assert_frame_equal(out, d2, check_categorical=False)
Example No. 22
def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*'))))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)
Example No. 23
def test_logical_types(tempdir):
    df = pd.util.testing.makeMixedDataFrame()

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])

    p = ParquetFile(fn)

    s = statistics(p)

    assert isinstance(s['min']['D'][0], (np.datetime64, pd.Timestamp))
Example No. 24
def test_int96_stats(tempdir):
    df = pd.util.testing.makeMixedDataFrame()

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], times='int96')

    p = ParquetFile(fn)

    s = statistics(p)
    assert isinstance(s['min']['D'][0], (np.datetime64, pd.Timestamp))
    assert 'D' in sorted_partitioned_columns(p)
Example No. 25
def test_zero_child_leaf(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})

    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)

    pf = ParquetFile(fn)
    assert pf.columns == ['x']

    pf._schema[1].num_children = 0
    assert pf.columns == ['x']
Example No. 26
def test_datetime_partition_names(tempdir):
    date_strings = ['2015-05-09', '2018-10-15', '2020-10-17', '2015-05-09']
    df = pd.DataFrame({
        'date': date_strings,
        'x': [1, 5, 2, 5]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert set(out.date.tolist()) == set(pd.to_datetime(date_strings).tolist())
    assert out[out.date == '2015-05-09'].x.tolist() == [1, 5]
    assert out[out.date == '2020-10-17'].x.tolist() == [2]
Example No. 27
def test_filter_without_paths(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.util.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
Example No. 28
def test_multi_cat_fail(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)

    pf = ParquetFile(fn)
    with pytest.raises(RuntimeError):
        pf.to_pandas()
Example No. 29
def test_statistics(tempdir):
    s = pd.Series([b'a', b'b', b'c']*20)
    df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                       'c': s.astype('category').cat.as_ordered()})
    fastparquet.write(tempdir, df, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    stat = pf.statistics
    assert stat['max']['a'] == [b'c']
    assert stat['min']['a'] == [b'a']
    assert stat['max']['b'] == [None]
    assert stat['min']['b'] == [None]
    assert stat['max']['c'] == [b'c']
    assert stat['min']['c'] == [b'a']
Example No. 30
def test_multi(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df)

    pf = ParquetFile(fn)
    df1 = pf.to_pandas()
    assert df1.equals(df)
    assert df1.loc[1, 'a'].equals(df.loc[1, 'a'])
Example No. 31
def test_multi_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x')
    write(dir1, df, file_scheme='hive')
    dir2 = os.path.join(tempdir, 'y')
    write(dir2, df, file_scheme='hive')
    dir3 = os.path.join(tempdir, 'z', 'deep')
    write(dir3, df, file_scheme='hive')

    pf = ParquetFile([dir1, dir2])
    out = pf.to_pandas()  # this version may have extra column!
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    pf = ParquetFile([dir1, dir2, dir3])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 3
Example No. 32
def parquet_conv(filename, cwd=os.getcwd(), datasourceformat=".xlsx"):
    """Converts a file of .xlsx or .csv into .parquet and reads prints/returns the first column

    :param filename: base filename to be converted to .parquet
    :param cwd: current working directory
    :param datasourceformat: what format the datasource comes in
    :return: the requested column from pset instructions
    """
    parquetfilename = filename + ".parquet"
    data_wd = os.path.abspath(os.path.join(cwd, "data"))
    data_source = os.path.join(data_wd, filename + datasourceformat)
    try:
        df = pd.read_csv(data_source)
    except Exception:
        # fall back to Excel if the source cannot be parsed as CSV
        df = pd.read_excel(data_source)

    atomic_write(fastparquet.write(parquetfilename, df, compression=None))
    result = pd.read_parquet(parquetfilename,
                             engine="fastparquet",
                             columns=["hashed_id"])
    print(result)
    return result
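A hypothetical call to the helper above, assuming a data/ directory next to the working directory that contains hashed.xlsx with a hashed_id column:

first_column = parquet_conv("hashed", datasourceformat=".xlsx")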
Example No. 33
def test_merge_fail(tempdir):
    fn = str(tempdir)

    df0 = pd.DataFrame({'a': [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, 'out0.parq'])
    write(fn0, df0)

    df1 = pd.DataFrame({'a': ['a', 'b', 'c']})
    fn1 = os.sep.join([fn, 'out1.parq'])
    write(fn1, df1)

    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    assert 'schemas' in str(e)

    os.remove(fn1)
    write(fn1, df0, file_scheme='hive')
    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    assert 'multi-file' in str(e)
Example No. 34
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame({
        'a': [1, 2, 3, 0],
        'aa': [1, 2, 3, None],
        'b': [1., 2., 3., np.nan],
        'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
        'd': ['a', 'b', 'c', None],
        'f': [True, False, True, True],
        'ff': [True, False, None, True]
    })
    df['e'] = df['d'].astype('category')
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises((TypeError, AttributeError)):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    cols = list(set(df) - {'ff'})
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in ['d', 'ff']:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
Example No. 35
def test_only_partition_columns(tempdir):
    df = pd.DataFrame({'a': np.random.rand(20),
                       'b': np.random.choice(['hi', 'ho'], size=20),
                       'c': np.random.choice(['a', 'b'], size=20)})
    write(tempdir, df, file_scheme='hive', partition_on=['b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    write(tempdir, df, file_scheme='hive', partition_on=['a', 'b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['a', 'b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b', 'c'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    with pytest.raises(ValueError):
        # because this leaves no data to write
        write(tempdir, df[['b']], file_scheme='hive', partition_on=['b'])
Example No. 36
def test_custom_metadata(tempdir):
    df = pd.DataFrame({'a': [15]})
    fn = os.path.join(tempdir, 'temp.parq')
    write(fn, df, custom_metadata={"hello": "world"})
    pf = ParquetFile(fn)
    assert pf.key_value_metadata['hello'] == 'world'
Example No. 37
def test_append_w_partitioning(tempdir):
    fn = str(tempdir)
    df = pd.DataFrame({'a': np.random.choice([1, 2, 3], size=50),
                       'b': np.random.choice(['hello', 'world'], size=50),
                       'c': np.random.randint(50, size=50)})
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'])
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'], append=True)
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'], append=True)
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'], append=True)
    pf = ParquetFile(fn)
    out = pf.to_pandas()
    assert len(out) == 200
    assert sorted(out.a)[::4] == sorted(df.a)
    with pytest.raises(ValueError):
        write(fn, df, file_scheme='hive', partition_on=['a'], append=True)
    with pytest.raises(ValueError):
        write(fn, df, file_scheme='hive', partition_on=['b', 'a'], append=True)
Example No. 38
def test_duplicate_columns(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('aaa'))
    with pytest.raises(ValueError) as e:
        write(fn, df)
    assert 'duplicate' in str(e.value)
Example No. 39
def test_bad_coltype(tempdir):
    df = pd.DataFrame({'0': [1, 2], (0, 1): [3, 4]})
    fn = os.path.join(tempdir, 'temp.parq')
    with pytest.raises((ValueError, TypeError)) as e:
        write(fn, df)
    assert "tuple" in str(e.value)
Example No. 40
df.body = df.body.apply(pre_processing)

word_grams = TfidfVectorizer(analyzer="word",
                             ngram_range=(1, 5),
                             stop_words="english",
                             max_features=10000)

word_vector = word_grams.fit_transform(df.body)

word_df = pd.DataFrame()

for i, col in enumerate(word_grams.get_feature_names()):
    word_df[col] = pd.Series(word_vector[:, i].toarray().ravel())

df = pd.merge(df, word_df, left_index=True, right_index=True)

del word_df

# Note: the merge suffixes overlapping column names (e.g. body -> body_x)
print(df.head())

df = df.drop(["body_x"], axis=1)

df = df.set_index("date_created")

print("write parquet")

fastparquet.write("processed_tweets.parquet", df)

print(df.shape)
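Building word_df one column at a time densifies the sparse TF-IDF matrix in a Python loop, which is slow for 10,000 features. A sketch of a one-shot alternative that yields the same frame (assuming a scikit-learn version providing get_feature_names_out, and enough memory for the dense array):

word_df = pd.DataFrame(word_vector.toarray(),
                       columns=word_grams.get_feature_names_out())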
Example No. 41
def to_parquet(filename, prefix="maccdc2012"):
    with open(filename) as f:
        traffic = {}
        nodes = set()

        for line in f.readlines():
            if "unreachable" in line:
                continue
            fields = line.split()
            if not fields:
                continue
            if fields[1] != "IP":
                continue
            protocol = get_ip_protocol(line)
            if protocol not in ("tcp", "udp", "eigrp", "icmp"):
                continue
            try:
                addresses = []

                # Extract source IP address and convert to integer
                m = re.match(r'(?P<address>\d+\.\d+\.\d+\.\d+)', fields[2])
                if not m:
                    continue
                addresses.append(ip_to_integer(m.group('address')))

                # Extract target IP address and convert to integer
                m = re.match(r'(?P<address>\d+\.\d+\.\d+\.\d+)', fields[4])
                if not m:
                    continue
                addresses.append(ip_to_integer(m.group('address')))

                nodes = nodes.union(addresses)
                src, dst = sorted(addresses)
                key = (protocol, src, dst)

                # Extract packet size
                nbytes = int(fields[-1])

                if key in traffic:
                    traffic[key] += nbytes
                else:
                    traffic[key] = nbytes
            except Exception:
                # skip lines that cannot be parsed
                pass

        nodes = dict([(node, i) for i, node in enumerate(sorted(nodes))])

        edges = []
        for key in traffic:
            edge = [nodes[key[1]], nodes[key[2]], key[0], traffic[key]]
            edges.append(edge)

        nodes_df = pd.DataFrame(np.arange(len(nodes)), columns=['id'])
        nodes_df = nodes_df.set_index('id')

        edges_df = pd.DataFrame(
            np.array(edges),
            columns=['source', 'target', 'protocol', 'weight'])
        edges_df['source'] = pd.to_numeric(edges_df['source'])
        edges_df['target'] = pd.to_numeric(edges_df['target'])
        edges_df['weight'] = pd.to_numeric(edges_df['weight'])
        edges_df['protocol'] = edges_df['protocol'].astype('category')

        fp.write('{}_nodes.parq'.format(prefix), nodes_df)
        fp.write('{}_edges.parq'.format(prefix), edges_df)
Example No. 42
            owner.append(ipinfo['autonomous_system_organization'])
            asn.append(ipinfo['autonomous_system_number'])
            ispname.append(ipinfo['isp'])
        else:
            print("error: for ip %s, ipinfo==None" % (ip))
            owner.append('')
            asn.append(0)
            ispname.append('')
    print("\n  DONE getting ISP names")

    # add IP_owner and IP_ASN columns to the dataframe
    df["IP_owner"] = owner
    df["IP_ASN"] = asn
    # get company name from owner string
    df["ISP_name"] = ispname

    return df


#############################################################################

the_query = query_writer("06/15/14", "05/13/15", limit=999)
print(the_query)

project_id = 'mlab-185523'
df = acquire_mlab_data(project_id, "01/01/13", "02/01/13")

from fastparquet import write

write('mlab-test-data-0.parquet', df)
Example No. 43
def write_parquet_gzip(df, file_name, num_of_samples):
    print("Gzip Parquet writing started...")
    filename = file_name + '.parq'
    write(filename, df, row_group_offsets=num_of_samples, compression="GZIP")
Example No. 44
def write_parquet_snappy(df, file_name, num_of_samples):
    print("Snappy Parquet writing started...")
    filename = file_name + '.parq'
    write(filename, df, row_group_offsets=num_of_samples, compression="SNAPPY")
Example No. 45
import pandas as pd 
import numpy as np
from fastparquet import write

df = pd.read_csv('/etc/adult.data', names = ["Age", "Workclass", "fnlwgt", "Education", "Education_Num", "Martial_Status",
        "Occupation", "Relationship", "Race", "Sex", "Capital_Gain", "Capital_Loss",
        "Hours_per_week", "Country", "Target"])

write('adult.parq', df, compression='GZIP')
Example No. 46
print("Remove constant cols")
train_df = train_df.drop(['ID', 'y'], axis=1)
test_df = test_df.drop(['ID'], axis=1)
print("Removed")

###########################################################

# Create interaction features
interactions2way = list(set(list(train_df)) - set(BASE_COLS))
interactions2way_list = list(combinations(interactions2way, 2))
for A, B in interactions2way_list:
    feat = "_".join([A, B])
    train_df[feat] = abs(train_df[A] - train_df[B])
    test_df[feat] = abs(test_df[A] - test_df[B])

# Now split into train_df and test_df and save the output of the processed dataset.
train_df['ID'] = id_train_df
train_df['y'] = y_train_df
test_df['ID'] = id_test_df

print('Writing Parquets')
# store
fastparquet.write('./data/processed/metalvl2/xtrain' + BUILD_NAME + '.parq',
                  train_df,
                  write_index=False)
fastparquet.write('./data/processed/metalvl2/xtest' + BUILD_NAME + '.parq',
                  test_df,
                  write_index=False)
print('Finished')
Example No. 47
def main():
    global baseCode
    print("Connecting to IB Gateway")
    print("Client ID: " + str(clientId))
    print("Host ID: " + str(host))
    ibConn = IBTrader.IBTrader()
    time.sleep(3)
    ibConn.connect(clientId=clientId, host=host, port=port)
    time.sleep(5)

    ibConn.contracts = {}
    time.sleep(5)
    ibConn.contracts = {}
    ibConn.createCashContract(baseCode[:3], currency=baseCode[3:])
    print("Adding: " + baseCode)
    print("Contracts Processing: " + str(len(ibConn.contracts)))

    for contract in ibConn.contracts:
        #baseCode = ibConn.contract_details[contract]['m_summary']['m_localSymbol'].replace('.','')
        print("Processing: " + baseCode)

        print("Retrieving Hourly Data")
        ibConn.requestHistoricalData(
            ibConn.contracts[contract],
            resolution="1 hour",
            end_datetime='{} 22:00:00'.format(
                (datetime.datetime.today()).strftime("%Y%m%d")),
            lookback="1 M")
        waiting = True
        lastLen = 0
        while waiting:
            try:
                if len(ibConn.historicalData[baseCode + '_CASH']) > lastLen:
                    lastLen = len(ibConn.historicalData[baseCode + '_CASH'])
                    time.sleep(2)
                else:
                    waiting = False
            except KeyError:
                pass
        time.sleep(5)

        print("Saving Hourly Data")
        hourlyData = ibConn.historicalData[baseCode + '_CASH']
        hourlyData = hourlyData.drop(['V', 'OI', 'WAP'],
                                     axis=1).reset_index().sort_values('datetime')
        filename = baseCode + '_H' + str(
            (datetime.datetime.today()).strftime("%Y%m%d")) + '.parq'
        write('/root/data/hour/' + filename, hourlyData)
        bucket.upload_file('/root/data/hour/' + filename,
                           s3_StorageLocation + filename)
        ibConn.historicalData = {}

        years = [2019]

        for year in years:
            print("Retrieving Minute Data " + str(year))
            for i in range(3, 0, -1):
                d = datetime.datetime(year, i, calendar.monthrange(year, i)[1])
                dateStr = d.strftime("%Y%m%d")
                print("Month: " + str(d.strftime("%Y %m")))
                for contract in ibConn.contracts:
                    #baseCode = ibConn.contract_details[contract]['m_summary']['m_localSymbol'].replace('.','')
                    print("\tProcessing: " + baseCode)

                    ibConn.requestHistoricalData(
                        ibConn.contracts[contract],
                        resolution="1 min",
                        end_datetime='{} 22:00:00'.format(dateStr),
                        lookback="1 M")
                    waiting = True
                    lastLen = 0
                    while waiting:
                        try:
                            if len(ibConn.historicalData[baseCode +
                                                         '_CASH']) > lastLen:
                                lastLen = len(ibConn.historicalData[baseCode +
                                                                    '_CASH'])
                                #print("\tBars Received: "+str(lastLen))
                                time.sleep(10)
                            else:
                                waiting = False
                        except KeyError:
                            pass
                    time.sleep(5)

                    minuteData = ibConn.historicalData[baseCode + '_CASH']
                    minuteData = minuteData.drop(
                        ['V', 'OI', 'WAP'],
                        axis=1).reset_index().sort_values('datetime')
                    filename = baseCode + '_M' + str(
                        d.strftime("%Y_%m")) + '.parq'
                    write('/root/data/min/' + filename, minuteData)
                    bucket.upload_file('/root/data/min/' + filename,
                                       s3_StorageLocation + "min/" + filename)
                    ibConn.historicalData = {}

        ibConn.historicalData = {}
        ibConn.cancelHistoricalData()
        ibConn.cancelMarketData()
        ibConn.contracts = {}
        ibConn.disconnect()
        print("Collection Complete")
        exit()
Example No. 48
import os
import pandas as pd
import datetime, time
from fastparquet import write
import urllib3
import json

import warnings
warnings.filterwarnings('ignore')

https = urllib3.PoolManager()

while True:
    try:
        now = datetime.datetime.now()
        datetimeval = datetime.datetime.now().strftime("%Y%m%d%H%M")
        parquet_file = "data.parquet"
        if now.minute % 5 == 4 and now.second == 56:
            r = https.request('GET',"https://www.purpleair.com/json?*")
            if r.status != 200:
                time.sleep(240)
                continue
            j = json.loads(r.data.decode('utf-8'))
            data_df = pd.DataFrame(j)
            write(parquet_file, data_df, compression='GZIP')
            os.system("aws s3 cp data.parquet s3://utkarsh-midscapstone-whos-polluting-my-air/PurpleAir/{}.parquet".format(datetimeval))
            time.sleep(120)
    except Exception:
        # ignore transient request errors and keep polling
        pass
Example No. 49
def test_columns_index_with_multi_index(tmpdir, engine):
    fn = os.path.join(str(tmpdir), 'test.parquet')
    index = pd.MultiIndex.from_arrays(
        [np.arange(10), np.arange(10) + 1], names=['x0', 'x1'])
    df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'], index=index)
    df2 = df.reset_index(drop=False)

    if engine == 'fastparquet':
        fastparquet.write(fn, df, write_index=True)

        # fastparquet doesn't support multi-index
        with pytest.raises(ValueError):
            ddf = dd.read_parquet(fn, engine=engine)
    else:
        import pyarrow as pa
        pq.write_table(pa.Table.from_pandas(df), fn)

        # Pyarrow supports multi-index reads
        ddf = dd.read_parquet(fn, engine=engine)
        assert_eq(ddf, df)

        d = dd.read_parquet(fn, columns='a', engine=engine)
        assert_eq(d, df['a'])

        d = dd.read_parquet(fn,
                            index=['a', 'b'],
                            columns=['x0', 'x1'],
                            engine=engine)
        assert_eq(d, df2.set_index(['a', 'b'])[['x0', 'x1']])

    # Just index
    d = dd.read_parquet(fn, index=False, engine=engine)
    assert_eq(d, df2)

    d = dd.read_parquet(fn, index=['a'], engine=engine)
    assert_eq(d, df2.set_index('a')[['b']])

    d = dd.read_parquet(fn, index=['x0'], engine=engine)
    assert_eq(d, df2.set_index('x0')[['a', 'b']])

    # Just columns
    d = dd.read_parquet(fn, columns=['x0', 'a'], engine=engine)
    assert_eq(d, df2.set_index('x1')[['x0', 'a']])

    # Both index and columns
    d = dd.read_parquet(fn, index=False, columns=['x0', 'b'], engine=engine)
    assert_eq(d, df2[['x0', 'b']])

    for index in ['x1', 'b']:
        d = dd.read_parquet(fn,
                            index=index,
                            columns=['x0', 'a'],
                            engine=engine)
        assert_eq(d, df2.set_index(index)[['x0', 'a']])

    # Columns and index intersect
    for index in ['a', 'x0']:
        with pytest.raises(ValueError):
            d = dd.read_parquet(fn,
                                index=index,
                                columns=['x0', 'a'],
                                engine=engine)

    # Series output
    for ind, col, sol_df in [(None, 'x0', df2.set_index('x1')),
                             (False, 'b', df2), (False, 'x0', df2),
                             ('a', 'x0', df2.set_index('a')),
                             ('a', 'b', df2.set_index('a'))]:
        d = dd.read_parquet(fn, index=ind, columns=col, engine=engine)
        assert_eq(d, sol_df[col])
Example No. 50
extractor_data = pd.read_csv(
    '/run/user/1000/gvfs/smb-share:server=nas01.local,share=rnd/data/date/date_extractions.csv'
)
extractor_data['imaginary_id'] = extractor_data['croppedImageId_url'].map(
    lambda x: x.split('/')[-1])

text = ParquetFile(
    '/run/user/1000/gvfs/smb-share:server=nas01.local,share=rnd/data/parquet_data/text_extractions_temp.parq'
).to_pandas()
text.columns = ['imaginary_id', 'Text']

df = pd.merge(extractor_data, text, on='imaginary_id')

write(
    '/run/user/1000/gvfs/smb-share:server=nas01.local,share=rnd/data/parquet_data/date_alg_results_and_ocr.parq',
    df,
    compression='GZIP',
    file_scheme='hive')

print('rows in extractor data: ', len(extractor_data))
print('rows in text Parquet: ', len(text))
print('rows in merged df: ', len(df))

extractor_data.loc[extractor_data['conclusion'] == r'N\A',
                   'conclusion'] = np.nan
extractor_data.loc[extractor_data['conclusionConfidence'] == r'N\A',
                   'conclusionConfidence'] = np.nan
extractor_data.loc[:, 'conclusionConfidence'] = extractor_data[
    'conclusionConfidence'].astype('float')

## sanity check
Example No. 51
    twitter_pred_df = pd.DataFrame({
        "date_col":
        twitter_date_col,
        "twitter_pred":
        twitter_pred.reshape(twitter_pred.shape[0], )
    })

    del twitter_test, twitter_train, twitter_data

    wallstreet_test, wallstreet_train = get_wallstreet_data()
    wallstreet_data = np.vstack((wallstreet_test, wallstreet_train))
    wallstreet_data = np.expand_dims(wallstreet_data, axis=0)

    wallstreet_pred = wallstreet_model.predict(wallstreet_data)

    wallstreet_pred = y_scaler.inverse_transform(wallstreet_pred[0])

    wallstreet_pred_df = pd.DataFrame({
        "date_col":
        non_twitter_dates,
        "iex_pred":
        wallstreet_pred.reshape(wallstreet_pred.shape[0], )
    })

    del wallstreet_test, wallstreet_train, wallstreet_data

    ensamble_data = pd.merge(wallstreet_pred_df, iex_pred_df, on="date_col")
    ensamble_data = pd.merge(ensamble_data, twitter_pred_df, on="date_col")

    fastparquet.write("ensamble/ensamble_data.pq", ensamble_data)
Example No. 52
def test_null_sizes(tempdir):
    df = pd.DataFrame({'a': [True, None], 'b': [3000, np.nan]}, dtype="O")
    fastparquet.write(tempdir, df, has_nulls=True, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    assert pf.dtypes['a'] == 'float16'
    assert pf.dtypes['b'] == 'float64'
Example No. 53
def test_bad_col(tempdir):
    df = pd.DataFrame({'x': [1, 2]})
    fn = os.path.join(tempdir, 'temp.parq')
    with pytest.raises(ValueError) as e:
        write(fn, df, has_nulls=['y'])
Example No. 54
import fastparquet
import numpy as np
import pandas as pd
from dateutil.parser import parse

print("--Start--")
print("clean_iex_data")
# Get Data
df = pd.read_csv("./iex/mintue_trade_data.csv", na_values=[-1])

# Combine date and minute into timestamps
df.date = df.date.astype(str) + " " + df.minute
df = df.drop(columns=["minute"])
df.date = df.date.apply(parse)

# Fill Na with values above
df = df.fillna(method="ffill")
df = df.dropna()

df.average = df.average.shift(-1)
df = df[:-1]

# Save all data to a Parquet file
fastparquet.write("iex_data/iex_clean.parquet", df)

# create a smaller dataframe to join with the Twitter and Wall Street Journal data
date_df = pd.DataFrame({"date_col": df.date, "stock_price_col": df.average})

# save the smaller df to a Parquet file
fastparquet.write("iex_data/date_iex_data.parquet", date_df)

print("--End--")
Example No. 55
def test_auto_null_object(tempdir, pnull):
    tmp = str(tempdir)
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'aa': pd.Series([1, 2, 3, None], dtype=object),
                       'b': [1., 2., 3., np.nan],
                       'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
                       'd': ['a', 'b', 'c', None],
                       'f': [True, False, True, True],
                       'ff': [True, False, None, True]})  # object
    df['e'] = df['d'].astype('category')
    df['bb'] = df['b'].astype('object')
    df['aaa'] = df['a'].astype('object')
    object_cols = ['d', 'ff', 'bb', 'aaa', 'aa']
    test_cols = list(set(df) - set(object_cols)) + ['d']
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(ValueError):
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn, pandas_nulls=pnull)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False,
                          check_dtype=False)
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])
    if pnull:
        tm.assert_frame_equal(df[['aa']].astype('Int64'), df2[['aa']])
        tm.assert_frame_equal(df[['ff']].astype("boolean"), df2[['ff']])
    else:
        tm.assert_frame_equal(df[['aa']].astype('float'), df2[['aa']])
        tm.assert_frame_equal(df[['ff']].astype("float"), df2[['ff']])

    # not giving any value is the same as has_nulls=True
    write(fn, df)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False,
                          check_dtype=False)
    tm.assert_frame_equal(df[['ff']].astype('boolean'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # 'infer' is new recommended auto-null
    write(fn, df, has_nulls='infer')
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in object_cols:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('boolean'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # but legacy None still works
    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in object_cols:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('boolean'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])
Example No. 56
    def _write(refs, outpath, filetype=None):
        types = {"json": "json", "parquet": "parquet", "zarr": "zarr"}
        if filetype is None:
            ext = os.path.splitext(outpath)[1].lstrip(".")
            filetype = types[ext]
        elif filetype not in types:
            raise KeyError
        if filetype == "json":
            with open(outpath, "w") as f:
                json.dump(refs, f)
            return
        import pandas as pd
        references2 = {
            k: {
                "data": v.encode('ascii') if not isinstance(v, list) else None,
                "url": v[0] if isinstance(v, list) else None,
                "offset": v[1] if isinstance(v, list) else None,
                "size": v[2] if isinstance(v, list) else None
            }
            for k, v in refs['refs'].items()
        }
        # use pandas for sorting
        df = pd.DataFrame(references2.values(),
                          index=list(references2)).sort_values("offset")

        if filetype == "zarr":
            # compression should be NONE, if intent is to store in single zip
            g = zarr.open_group(outpath, mode='w')
            g.attrs.update({
                k: v
                for k, v in refs.items()
                if k in ['version', "templates", "gen"]
            })
            g.array(name="key",
                    data=df.index.values,
                    dtype="object",
                    compression="zstd",
                    object_codec=numcodecs.VLenUTF8())
            g.array(name="offset",
                    data=df.offset.values,
                    dtype="uint32",
                    compression="zstd")
            g.array(name="size",
                    data=df['size'].values,
                    dtype="uint32",
                    compression="zstd")
            g.array(name="data",
                    data=df.data.values,
                    dtype="object",
                    object_codec=numcodecs.VLenBytes(),
                    compression="gzip")
            # may be better as fixed length
            g.array(name="url",
                    data=df.url.values,
                    dtype="object",
                    object_codec=numcodecs.VLenUTF8(),
                    compression='gzip')
        if filetype == "parquet":
            import fastparquet
            metadata = {
                k: v
                for k, v in refs.items()
                if k in ['version', "templates", "gen"]
            }
            fastparquet.write(outpath,
                              df,
                              custom_metadata=metadata,
                              compression="ZSTD")
Example No. 57
def test_bad_object_encoding(tempdir):
    df = pd.DataFrame({'x': ['a', 'ab']})
    with pytest.raises(ValueError) as e:
        write(str(tempdir), df, object_encoding='utf-8')
    assert "utf-8" in str(e.value)
Example No. 58
JSON_FILE = 'Parquet/output/nodes.json'

nodes = []
tree = ET.parse(open(SOURCE_FILE))
for node in tree.iterfind('node'):
    nodes.append({
        'id': int(node.get('id')),
        'longitude': float(node.get('lon')),
        'latitude': float(node.get('lat')),
        'username': node.get('user')
    })

df = pd.DataFrame.from_records(nodes)

# Write the nodes DataFrame to a Parquet file
write(PARQ_FILE, df)

# Write the nodes DataFrame to a Parquet file using the Snappy compression algorithm
write(PARQ_SNAPPY_FILE, df, compression='snappy')

# Write the nodes DataFrame to a Parquet file using the GZIP compression algorithm
write(PARQ_GZIP_FILE, df, compression='GZIP')

# do the same with JSON format (for comparison)
df.to_json(JSON_FILE)


# Compare the size of the file formats
def print_file_size(file_path):
    file_stats = os.stat(file_path)
    print(f'Size of file {file_path} is {file_stats.st_size}')
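A short usage sketch of the helper above, comparing the files written earlier (the path constants are the ones assumed at the top of this script):

for path in (PARQ_FILE, PARQ_SNAPPY_FILE, PARQ_GZIP_FILE, JSON_FILE):
    print_file_size(path)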
Example No. 59
#!/usr/bin/env python

"""
An example of writing parquet files with 'fastparquet'.

References:
- https://github.com/dask/fastparquet
"""

import pandas
from fastparquet import write

df = pandas.read_csv("/etc/passwd", sep=":")

# uncompressed write (fastparquet's default is no compression)
write('/tmp/file.parq', df)
# this is compressed write
write('/tmp/file_compressed.parq', df, compression='GZIP', file_scheme='hive')
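To confirm the round trip, a minimal read-back sketch using the same library:

from fastparquet import ParquetFile

df2 = ParquetFile('/tmp/file_compressed.parq').to_pandas()
print(df2.head())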
Example No. 60
                 verbose_eval=False,
                 params=params6,
                 early_stopping_rounds=50):
    'XGB6' + BUILD_NAME
}

merc = GeneralisedStacking(base_estimators_dict=estimators,
                           estimator_type='regression',
                           feval=r2_score,
                           stack_type='s',
                           folds_strategy=skf)
merc.fit(train, y_train)
lvl1meta_train_regressor = merc.meta_train
lvl1meta_test_regressor = merc.predict(test)

lvl1meta_train_regressor['ID'] = id_train
lvl1meta_train_regressor['y'] = y_train
lvl1meta_test_regressor['ID'] = id_test

print('Writing Parquets')
# store
fastparquet.write('./data/processed/metalvl1/xtrain_metalvl1' + BUILD_NAME +
                  '.parq',
                  lvl1meta_train_regressor,
                  write_index=False)
fastparquet.write('./data/processed/metalvl1/xtest_metalvl1' + BUILD_NAME +
                  '.parq',
                  lvl1meta_test_regressor,
                  write_index=False)
print('Finished')