def test_pyspark_roundtrip(tempdir, scheme, row_groups, comp, sql):
    if comp in ['BROTLI', 'ZSTD', 'LZO', 'LZ4']:
        pytest.xfail("spark doesn't support compression")
    data = pd.DataFrame({'i32': np.random.randint(-2**17, 2**17, size=1001,
                                                  dtype=np.int32),
                         'i64': np.random.randint(-2**33, 2**33, size=1001,
                                                  dtype=np.int64),
                         'f': np.random.randn(1001),
                         'bhello': np.random.choice([b'hello', b'you', b'people'],
                                                    size=1001).astype("O"),
                         't': [datetime.datetime.now()] * 1001})
    data['t'] += pd.to_timedelta('1ns')
    data['hello'] = data.bhello.str.decode('utf8')
    data.loc[100, 'f'] = np.nan
    data['bcat'] = data.bhello.astype('category')
    data['cat'] = data.hello.astype('category')

    fname = os.path.join(tempdir, 'test.parquet')
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp, times='int96', write_index=True)

    df = sql.read.parquet(fname)
    ddf = df.sort('index').toPandas()
    for col in data:
        if data[col].dtype.kind == "M":
            # pyspark auto-converts timezones
            offset = round((datetime.datetime.utcnow() -
                            datetime.datetime.now()).seconds / 3600)
            ddf[col] + datetime.timedelta(hours=offset) == data[col]
        else:
            assert (ddf[col] == data[col])[~ddf[col].isnull()].all()
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1],
                       'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up
                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()
        return result
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame({
        "i32": np.arange(1000, dtype=np.int32),
        "i64": np.arange(1000, dtype=np.int64),
        "f": np.arange(1000, dtype=np.float64),
        "bhello": np.random.choice([b"hello", b"you", b"people"],
                                   size=1000).astype("O"),
    })
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp)

    r = ParquetFile(fname)
    df = r.to_pandas()

    assert data.cat.dtype == "category"
    for col in r.columns:
        assert (df[col] == data[col]).all()
def test_roundtrip_complex(tempdir, scheme):
    import datetime
    data = pd.DataFrame({
        "ui32": np.arange(1000, dtype=np.uint32),
        "i16": np.arange(1000, dtype=np.int16),
        "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
        "f16": np.arange(1000, dtype=np.float16),
        "dicts": [{"oi": "you"}] * 1000,
        "t": [datetime.datetime.now()] * 1000,
        "td": [datetime.timedelta(seconds=1)] * 1000,
        "bool": np.random.choice([True, False], size=1000),
    })
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5],
                       'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]
def test_merge(tempdir, dirs, row_groups):
    fn = str(tempdir)

    os.makedirs(os.path.join(fn, dirs[0]), exist_ok=True)
    df0 = pd.DataFrame({"a": [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, dirs[0], "out0.parq"])
    write(fn0, df0, row_group_offsets=row_groups)

    os.makedirs(os.path.join(fn, dirs[1]), exist_ok=True)
    df1 = pd.DataFrame({"a": [5, 6, 7, 8]})
    fn1 = os.sep.join([fn, dirs[1], "out1.parq"])
    write(fn1, df1, row_group_offsets=row_groups)

    # with file-names
    pf = writer.merge([fn0, fn1])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert "cat" in pf.cats

    # with instances
    pf = writer.merge([ParquetFile(fn0), ParquetFile(fn1)])
    assert len(pf.row_groups) == 2 * len(row_groups)
    out = pf.to_pandas().a.tolist()
    assert out == [1, 2, 3, 4, 5, 6, 7, 8]
    if "cat=1" in dirs:
        assert "cat" in pf.cats
def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']
def test_filter_stats(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
    })
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]
def test_in_filter(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbols', 'in', ['a', 'c'])])
    assert set(out.symbols) == {'a', 'c'}
def test_mixed_partition_types_warning(tempdir, partitions):
    df = pd.DataFrame({
        'partitions': partitions,
        'x': [1, 2]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    with pytest.warns(UserWarning,
                      match=r'Partition names coerce to values of different types.*'):
        ParquetFile(tempdir)
def test_datetime_partition_no_duplicates(tempdir, partitions):
    df = pd.DataFrame({
        'partitions': partitions,
        'x': [1, 2]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    with pytest.raises(ValueError,
                       match=r'Partition names map to the same value.*'):
        ParquetFile(tempdir)
def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]
def test_request_nonexistent_column(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    with pytest.raises(ValueError):
        pf.to_pandas(columns=['y'])
def test_grab_cats(tempdir):
    s = pd.Series(['a', 'c', 'b'] * 20)
    df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                       'c': s.astype('category').cat.as_ordered()})
    fastparquet.write(tempdir, df, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    cats = pf.grab_cats(['b', 'c'])
    assert (cats['b'] == df.b.cat.categories).all()
    assert (cats['c'] == df.c.cat.categories).all()
def test_2():
    # make and save a large-ish DataFrame
    N = 10000000
    df = pd.DataFrame({'ints': np.random.randint(0, 1000, size=N),
                       'floats': np.random.randn(N),
                       'times': pd.date_range(start='1980', freq='s', periods=N)})
    df.to_csv('test_2.csv')
    fastparquet.write('test_2_UNCOMPRESSED.parq', df, compression='UNCOMPRESSED')
def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), "test.parq")
    df = pd.DataFrame({"a": [1, 2, 3, 0],
                       "b": ["a", "a", "b", "b"]})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)

    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.util.testing.assert_frame_equal(pf.to_pandas(), expected,
                                       check_categorical=False)
def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']
def test_datetime_category_no_duplicates(tempdir, categories):
    # The purpose of this test is to ensure that the changes made for the
    # previous test haven't broken categories in general.
    df = pd.DataFrame({
        'categories': categories,
        'x': [1, 2]
    }).astype({'categories': 'category'})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    assert ParquetFile(fn).to_pandas().categories.tolist() == categories
def test_index(tempdir):
    s = pd.Series(['a', 'c', 'b'] * 20)
    df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                       'c': range(60, 0, -1)})

    for column in df:
        d2 = df.set_index(column)
        fastparquet.write(tempdir, d2, file_scheme='hive', write_index=True)
        pf = fastparquet.ParquetFile(tempdir)
        out = pf.to_pandas(index=column, categories=['b'])
        pd.util.testing.assert_frame_equal(out, d2, check_categorical=False)
def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*'))))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)
def test_logical_types(tempdir):
    df = pd.util.testing.makeMixedDataFrame()
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    p = ParquetFile(fn)
    s = statistics(p)
    assert isinstance(s['min']['D'][0], (np.datetime64, pd.Timestamp))
def test_int96_stats(tempdir):
    df = pd.util.testing.makeMixedDataFrame()
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], times='int96')
    p = ParquetFile(fn)
    s = statistics(p)
    assert isinstance(s['min']['D'][0], (np.datetime64, pd.Timestamp))
    assert 'D' in sorted_partitioned_columns(p)
def test_zero_child_leaf(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    assert pf.columns == ['x']

    pf._schema[1].num_children = 0
    assert pf.columns == ['x']
def test_datetime_partition_names(tempdir):
    date_strings = ['2015-05-09', '2018-10-15', '2020-10-17', '2015-05-09']
    df = pd.DataFrame({
        'date': date_strings,
        'x': [1, 5, 2, 5]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert set(out.date.tolist()) == set(pd.to_datetime(date_strings).tolist())
    assert out[out.date == '2015-05-09'].x.tolist() == [1, 5]
    assert out[out.date == '2020-10-17'].x.tolist() == [2]
def test_filter_without_paths(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.util.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
def test_multi_cat_fail(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)

    pf = ParquetFile(fn)
    with pytest.raises(RuntimeError):
        pf.to_pandas()
def test_statistics(tempdir):
    s = pd.Series([b'a', b'b', b'c'] * 20)
    df = pd.DataFrame({'a': s, 'b': s.astype('category'),
                       'c': s.astype('category').cat.as_ordered()})
    fastparquet.write(tempdir, df, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    stat = pf.statistics
    assert stat['max']['a'] == [b'c']
    assert stat['min']['a'] == [b'a']
    assert stat['max']['b'] == [None]
    assert stat['min']['b'] == [None]
    assert stat['max']['c'] == [b'c']
    assert stat['min']['c'] == [b'a']
def test_multi(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame(
        {'a': np.random.randint(10, size=N),
         'b': np.random.choice(['a', 'b', 'c'], size=N),
         'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df)

    pf = ParquetFile(fn)
    df1 = pf.to_pandas()
    assert df1.equals(df)
    assert df1.loc[1, 'a'].equals(df.loc[1, 'a'])
def test_multi_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x')
    write(dir1, df, file_scheme='hive')
    dir2 = os.path.join(tempdir, 'y')
    write(dir2, df, file_scheme='hive')
    dir3 = os.path.join(tempdir, 'z', 'deep')
    write(dir3, df, file_scheme='hive')

    pf = ParquetFile([dir1, dir2])
    out = pf.to_pandas()  # this version may have extra column!
    assert out.a.tolist() == ['x', 'y', 'z'] * 2

    pf = ParquetFile([dir1, dir2, dir3])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 3
def parquet_conv(filename, cwd=os.getcwd(), datasourceformat=".xlsx"):
    """Convert an .xlsx or .csv file to .parquet, then print and return the requested column.

    :param filename: base filename to be converted to .parquet
    :param cwd: current working directory
    :param datasourceformat: format the data source comes in
    :return: the requested column from the pset instructions
    """
    parquetfilename = filename + ".parquet"
    data_wd = os.path.abspath(os.path.join(cwd, "data"))
    data_source = os.path.join(data_wd, filename + datasourceformat)

    try:
        df = pd.read_csv(data_source)
    except Exception:
        df = pd.read_excel(data_source)

    # fastparquet.write performs the write itself and returns None,
    # so there is nothing to pass on to another writer
    fastparquet.write(parquetfilename, df, compression=None)

    result = pd.read_parquet(parquetfilename, engine="fastparquet",
                             columns=["hashed_id"])
    print(result)
    return result
def test_merge_fail(tempdir):
    fn = str(tempdir)
    df0 = pd.DataFrame({'a': [1, 2, 3, 4]})
    fn0 = os.sep.join([fn, 'out0.parq'])
    write(fn0, df0)
    df1 = pd.DataFrame({'a': ['a', 'b', 'c']})
    fn1 = os.sep.join([fn, 'out1.parq'])
    write(fn1, df1)

    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    assert 'schemas' in str(e.value)

    os.remove(fn1)
    write(fn1, df0, file_scheme='hive')
    with pytest.raises(ValueError) as e:
        writer.merge([fn0, fn1])
    assert 'multi-file' in str(e.value)
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'aa': [1, 2, 3, None],
                       'b': [1., 2., 3., np.nan],
                       'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
                       'd': ['a', 'b', 'c', None],
                       'f': [True, False, True, True],
                       'ff': [True, False, None, True]})
    df['e'] = df['d'].astype('category')
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises((TypeError, AttributeError)):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    cols = list(set(df) - {'ff'})
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in ['d', 'ff']:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[cols], df2[cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('float16'), df2[['ff']])
def test_only_partition_columns(tempdir):
    df = pd.DataFrame({'a': np.random.rand(20),
                       'b': np.random.choice(['hi', 'ho'], size=20),
                       'c': np.random.choice(['a', 'b'], size=20)})
    write(tempdir, df, file_scheme='hive', partition_on=['b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    write(tempdir, df, file_scheme='hive', partition_on=['a', 'b'])
    pf = ParquetFile(tempdir)
    df2 = pf.to_pandas(columns=['a', 'b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    df2 = pf.to_pandas(columns=['b', 'c'])
    assert df.b.value_counts().to_dict() == df2.b.value_counts().to_dict()

    with pytest.raises(ValueError):
        # because this leaves no data to write
        write(tempdir, df[['b']], file_scheme='hive', partition_on=['b'])
def test_custom_metadata(tempdir):
    df = pd.DataFrame({'a': [15]})
    fn = os.path.join(tempdir, 'temp.parq')
    write(fn, df, custom_metadata={"hello": "world"})
    pf = ParquetFile(fn)
    assert pf.key_value_metadata['hello'] == 'world'
def test_append_w_partitioning(tempdir):
    fn = str(tempdir)
    df = pd.DataFrame({'a': np.random.choice([1, 2, 3], size=50),
                       'b': np.random.choice(['hello', 'world'], size=50),
                       'c': np.random.randint(50, size=50)})
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'])
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'], append=True)
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'], append=True)
    write(fn, df, file_scheme='hive', partition_on=['a', 'b'], append=True)
    pf = ParquetFile(fn)
    out = pf.to_pandas()
    assert len(out) == 200
    assert sorted(out.a)[::4] == sorted(df.a)
    with pytest.raises(ValueError):
        write(fn, df, file_scheme='hive', partition_on=['a'], append=True)
    with pytest.raises(ValueError):
        write(fn, df, file_scheme='hive', partition_on=['b', 'a'], append=True)
def test_duplicate_columns(tempdir):
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('aaa'))
    with pytest.raises(ValueError) as e:
        write(fn, df)
    assert 'duplicate' in str(e.value)
def test_bad_coltype(tempdir):
    df = pd.DataFrame({'0': [1, 2], (0, 1): [3, 4]})
    fn = os.path.join(tempdir, 'temp.parq')
    with pytest.raises((ValueError, TypeError)) as e:
        write(fn, df)
    assert "tuple" in str(e.value)
df.body = df.body.apply(pre_processing)

word_grams = TfidfVectorizer(analyzer="word", ngram_range=(1, 5),
                             stop_words="english", max_features=10000)
word_vector = word_grams.fit_transform(df.body)

word_df = pd.DataFrame()
for i, col in enumerate(word_grams.get_feature_names()):
    word_df[col] = pd.Series(word_vector[:, i].toarray().ravel())

df = pd.merge(df, word_df, left_index=True, right_index=True)
del word_df

# Note: column names that collide in the merge pick up an _x suffix (hence body_x)
print(df.head())
df = df.drop(["body_x"], axis=1)
df = df.set_index("date_created")

print("write parquet")
fastparquet.write("processed_tweets.parquet", df)
print(df.shape)
def to_parquet(filename, prefix="maccdc2012"):
    with open(filename) as f:
        traffic = {}
        nodes = set()
        for line in f.readlines():
            if "unreachable" in line:
                continue
            fields = line.split()
            if not fields:
                continue
            if fields[1] != "IP":
                continue
            protocol = get_ip_protocol(line)
            if protocol not in ("tcp", "udp", "eigrp", "icmp"):
                continue
            try:
                addresses = []

                # Extract source IP address and convert to integer
                m = re.match(r'(?P<address>\d+\.\d+\.\d+\.\d+)', fields[2])
                if not m:
                    continue
                addresses.append(ip_to_integer(m.group('address')))

                # Extract target IP address and convert to integer
                m = re.match(r'(?P<address>\d+\.\d+\.\d+\.\d+)', fields[4])
                if not m:
                    continue
                addresses.append(ip_to_integer(m.group('address')))

                nodes = nodes.union(addresses)
                src, dst = sorted(addresses)
                key = (protocol, src, dst)

                # Extract packet size
                nbytes = int(fields[-1])

                if key in traffic:
                    traffic[key] += nbytes
                else:
                    traffic[key] = nbytes
            except Exception:
                pass

    nodes = dict([(node, i) for i, node in enumerate(sorted(nodes))])
    edges = []
    for key in traffic:
        edge = [nodes[key[1]], nodes[key[2]], key[0], traffic[key]]
        edges.append(edge)

    nodes_df = pd.DataFrame(np.arange(len(nodes)), columns=['id'])
    nodes_df = nodes_df.set_index('id')
    edges_df = pd.DataFrame(np.array(edges),
                            columns=['source', 'target', 'protocol', 'weight'])
    edges_df['source'] = pd.to_numeric(edges_df['source'])
    edges_df['target'] = pd.to_numeric(edges_df['target'])
    edges_df['weight'] = pd.to_numeric(edges_df['weight'])
    edges_df['protocol'] = edges_df['protocol'].astype('category')

    fp.write('{}_nodes.parq'.format(prefix), nodes_df)
    fp.write('{}_edges.parq'.format(prefix), edges_df)
            owner.append(ipinfo['autonomous_system_organization'])
            asn.append(ipinfo['autonomous_system_number'])
            ispname.append(ipinfo['isp'])
        else:
            print("error: for ip %s, ipinfo==None" % (ip))
            owner.append('')
            asn.append(0)
            ispname.append('')

    print("\n DONE getting ISP names")

    # add IP_owner and IP_ASN columns to the dataframe
    df["IP_owner"] = owner
    df["IP_ASN"] = asn
    # get company name from owner string
    df["ISP_name"] = ispname

    return df


#############################################################################

the_query = query_writer("06/15/14", "05/13/15", limit=999)
print(the_query)

project_id = 'mlab-185523'
df = acquire_mlab_data(project_id, "01/01/13", "02/01/13")

from fastparquet import write
write('mlab-test-data-0.parquet', df)
def write_parquet_gzip(df, file_name, num_of_samples):
    print("Gzip Parquet writing started...")
    filename = file_name + '.parq'
    write(filename, df, row_group_offsets=num_of_samples, compression="GZIP")
def write_parquet_snappy(df, file_name, num_of_samples):
    print("Snappy Parquet writing started...")
    filename = file_name + '.parq'
    write(filename, df, row_group_offsets=num_of_samples, compression="SNAPPY")
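# A minimal usage sketch for the two helpers above (not in the original source).
# It assumes num_of_samples is meant to set fastparquet.write's row_group_offsets
# (the row-group size); the DataFrame and output names here are hypothetical.
import pandas as pd

sample_df = pd.DataFrame({"x": range(10), "y": list("abcdefghij")})
write_parquet_snappy(sample_df, "example_snappy", num_of_samples=5)  # -> example_snappy.parq
write_parquet_gzip(sample_df, "example_gzip", num_of_samples=5)      # -> example_gzip.parq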
import pandas as pd
import numpy as np
from fastparquet import write

df = pd.read_csv('/etc/adult.data',
                 names=["Age", "Workclass", "fnlwgt", "Education",
                        "Education_Num", "Martial_Status", "Occupation",
                        "Relationship", "Race", "Sex", "Capital_Gain",
                        "Capital_Loss", "Hours_per_week", "Country", "Target"])
write('adult.parq', df, compression='GZIP')
print("Remove constant cols") train_df = train_df.drop(['ID', 'y'], axis=1) test_df = test_df.drop(['ID'], axis=1) print("Removed") ########################################################### # Create interaction features interactions2way = list(set(list(train_df)) - set(BASE_COLS)) interactions2way_list = list(combinations(interactions2way, 2)) for A, B in interactions2way_list: feat = "_".join([A, B]) train_df[feat] = abs(train_df[A] - train_df[B]) test_df[feat] = abs(test_df[A] - test_df[B]) # Now split into train_df and test_df and save the output of the processed dataset. train_df['ID'] = id_train_df train_df['y'] = y_train_df test_df['ID'] = id_test_df print('Writing Parquets') # store fastparquet.write('./data/processed/metalvl2/xtrain' + BUILD_NAME + '.parq', train_df, write_index=False) fastparquet.write('./data/processed/metalvl2/xtest' + BUILD_NAME + '.parq', test_df, write_index=False) print('Finished')
def main():
    global baseCode
    print("Connecting to IB Gateway")
    print("Client ID: " + str(clientId))
    print("Host ID: " + str(host))
    ibConn = IBTrader.IBTrader()
    time.sleep(3)
    ibConn.connect(clientId=clientId, host=host, port=port)
    time.sleep(5)
    ibConn.contracts = {}
    time.sleep(5)
    ibConn.contracts = {}
    ibConn.createCashContract(baseCode[:3], currency=baseCode[3:])
    print("Adding: " + baseCode)
    print("Contracts Processing: " + str(len(ibConn.contracts)))

    for contract in ibConn.contracts:
        # baseCode = ibConn.contract_details[contract]['m_summary']['m_localSymbol'].replace('.','')
        print("Processing: " + baseCode)
        print("Retrieving Hourly Data")
        ibConn.requestHistoricalData(
            ibConn.contracts[contract], resolution="1 hour",
            end_datetime='{} 22:00:00'.format(
                (datetime.datetime.today()).strftime("%Y%m%d")),
            lookback="1 M")

        waiting = True
        lastLen = 0
        while waiting:
            try:
                if len(ibConn.historicalData[baseCode + '_CASH']) > lastLen:
                    lastLen = len(ibConn.historicalData[baseCode + '_CASH'])
                    time.sleep(2)
                else:
                    waiting = False
            except KeyError:
                pass

        time.sleep(5)
        print("Saving Hourly Data")
        hourlyData = ibConn.historicalData[baseCode + '_CASH']
        hourlyData = hourlyData.drop(['V', 'OI', 'WAP'],
                                     axis=1).reset_index().sort_values('datetime')
        filename = baseCode + '_H' + str(
            (datetime.datetime.today()).strftime("%Y%m%d")) + '.parq'
        write('/root/data/hour/' + filename, hourlyData)
        bucket.upload_file('/root/data/hour/' + filename,
                           s3_StorageLocation + filename)
        ibConn.historicalData = {}

    years = [2019]
    for year in years:
        print("Retrieving Minute Data " + str(year))
        for i in range(3, 0, -1):
            d = datetime.datetime(year, i, calendar.monthrange(year, i)[1])
            dateStr = d.strftime("%Y%m%d")
            print("Month: " + str(d.strftime("%Y %m")))
            for contract in ibConn.contracts:
                # baseCode = ibConn.contract_details[contract]['m_summary']['m_localSymbol'].replace('.','')
                print("\tProcessing: " + baseCode)
                ibConn.requestHistoricalData(
                    ibConn.contracts[contract], resolution="1 min",
                    end_datetime='{} 22:00:00'.format(dateStr),
                    lookback="1 M")

                waiting = True
                lastLen = 0
                while waiting:
                    try:
                        if len(ibConn.historicalData[baseCode + '_CASH']) > lastLen:
                            lastLen = len(ibConn.historicalData[baseCode + '_CASH'])
                            # print("\tBars Received: " + str(lastLen))
                            time.sleep(10)
                        else:
                            waiting = False
                    except KeyError:
                        pass

                time.sleep(5)
                minuteData = ibConn.historicalData[baseCode + '_CASH']
                minuteData = minuteData.drop(
                    ['V', 'OI', 'WAP'], axis=1).reset_index().sort_values('datetime')
                filename = baseCode + '_M' + str(d.strftime("%Y_%m")) + '.parq'
                write('/root/data/min/' + filename, minuteData)
                bucket.upload_file('/root/data/min/' + filename,
                                   s3_StorageLocation + "min/" + filename)
                ibConn.historicalData = {}
            ibConn.historicalData = {}

    ibConn.cancelHistoricalData()
    ibConn.cancelMarketData()
    ibConn.contracts = {}
    ibConn.disconnect()
    print("Collection Complete")
    exit()
import os
import pandas as pd
import datetime, time
from fastparquet import write
import urllib3
import json
import warnings

warnings.filterwarnings('ignore')

https = urllib3.PoolManager()

while True:
    try:
        now = datetime.datetime.now()
        datetimeval = datetime.datetime.now().strftime("%Y%m%d%H%M")
        parquet_file = "data.parquet"
        if now.minute % 5 == 4 and now.second == 56:
            r = https.request('GET', "https://www.purpleair.com/json?*")
            if r.status != 200:
                time.sleep(240)
                continue
            j = json.loads(r.data.decode('utf-8'))
            data_df = pd.DataFrame(j)
            write(parquet_file, data_df, compression='GZIP')
            os.system("aws s3 cp data.parquet s3://utkarsh-midscapstone-whos-polluting-my-air/PurpleAir/{}.parquet".format(datetimeval))
            time.sleep(120)
    except Exception:
        pass
def test_columns_index_with_multi_index(tmpdir, engine):
    fn = os.path.join(str(tmpdir), 'test.parquet')
    index = pd.MultiIndex.from_arrays([np.arange(10), np.arange(10) + 1],
                                      names=['x0', 'x1'])
    df = pd.DataFrame(np.random.randn(10, 2), columns=['a', 'b'], index=index)
    df2 = df.reset_index(drop=False)

    if engine == 'fastparquet':
        fastparquet.write(fn, df, write_index=True)

        # fastparquet doesn't support multi-index
        with pytest.raises(ValueError):
            ddf = dd.read_parquet(fn, engine=engine)
    else:
        import pyarrow as pa
        import pyarrow.parquet as pq
        pq.write_table(pa.Table.from_pandas(df), fn)

        # Pyarrow supports multi-index reads
        ddf = dd.read_parquet(fn, engine=engine)
        assert_eq(ddf, df)

    d = dd.read_parquet(fn, columns='a', engine=engine)
    assert_eq(d, df['a'])

    d = dd.read_parquet(fn, index=['a', 'b'], columns=['x0', 'x1'], engine=engine)
    assert_eq(d, df2.set_index(['a', 'b'])[['x0', 'x1']])

    # Just index
    d = dd.read_parquet(fn, index=False, engine=engine)
    assert_eq(d, df2)

    d = dd.read_parquet(fn, index=['a'], engine=engine)
    assert_eq(d, df2.set_index('a')[['b']])

    d = dd.read_parquet(fn, index=['x0'], engine=engine)
    assert_eq(d, df2.set_index('x0')[['a', 'b']])

    # Just columns
    d = dd.read_parquet(fn, columns=['x0', 'a'], engine=engine)
    assert_eq(d, df2.set_index('x1')[['x0', 'a']])

    # Both index and columns
    d = dd.read_parquet(fn, index=False, columns=['x0', 'b'], engine=engine)
    assert_eq(d, df2[['x0', 'b']])

    for index in ['x1', 'b']:
        d = dd.read_parquet(fn, index=index, columns=['x0', 'a'], engine=engine)
        assert_eq(d, df2.set_index(index)[['x0', 'a']])

    # Columns and index intersect
    for index in ['a', 'x0']:
        with pytest.raises(ValueError):
            d = dd.read_parquet(fn, index=index, columns=['x0', 'a'],
                                engine=engine)

    # Series output
    for ind, col, sol_df in [(None, 'x0', df2.set_index('x1')),
                             (False, 'b', df2),
                             (False, 'x0', df2),
                             ('a', 'x0', df2.set_index('a')),
                             ('a', 'b', df2.set_index('a'))]:
        d = dd.read_parquet(fn, index=ind, columns=col, engine=engine)
        assert_eq(d, sol_df[col])
extractor_data = pd.read_csv(
    '/run/user/1000/gvfs/smb-share:server=nas01.local,share=rnd/data/date/date_extractions.csv'
)
extractor_data['imaginary_id'] = extractor_data['croppedImageId_url'].map(
    lambda x: x.split('/')[-1])

text = ParquetFile(
    '/run/user/1000/gvfs/smb-share:server=nas01.local,share=rnd/data/parquet_data/text_extractions_temp.parq'
).to_pandas()
text.columns = ['imaginary_id', 'Text']

df = pd.merge(extractor_data, text, on='imaginary_id')
write(
    '/run/user/1000/gvfs/smb-share:server=nas01.local,share=rnd/data/parquet_data/date_alg_results_and_ocr.parq',
    df,
    compression='GZIP',
    file_scheme='hive')

print('rows in extractor data: ', len(extractor_data))
print('rows in text Parquet: ', len(text))
print('rows in merged df: ', len(df))

extractor_data.loc[extractor_data['conclusion'] == r'N\A', 'conclusion'] = np.nan
extractor_data.loc[extractor_data['conclusionConfidence'] == r'N\A',
                   'conclusionConfidence'] = np.nan
extractor_data.loc[:, 'conclusionConfidence'] = extractor_data[
    'conclusionConfidence'].astype('float')

## sanity check
twitter_pred_df = pd.DataFrame({
    "date_col": twitter_date_col,
    "twitter_pred": twitter_pred.reshape(twitter_pred.shape[0], )
})
del twitter_test, twitter_train, twitter_data

wallstreet_test, wallstreet_train = get_wallstreet_data()
wallstreet_data = np.vstack((wallstreet_test, wallstreet_train))
wallstreet_data = np.expand_dims(wallstreet_data, axis=0)
wallstreet_pred = wallstreet_model.predict(wallstreet_data)
wallstreet_pred = y_scaler.inverse_transform(wallstreet_pred[0])
wallstreet_pred_df = pd.DataFrame({
    "date_col": non_twitter_dates,
    "iex_pred": wallstreet_pred.reshape(wallstreet_pred.shape[0], )
})
del wallstreet_test, wallstreet_train, wallstreet_data

ensamble_data = pd.merge(wallstreet_pred_df, iex_pred_df, on="date_col")
ensamble_data = pd.merge(ensamble_data, twitter_pred_df, on="date_col")
fastparquet.write("ensamble/ensamble_data.pq", ensamble_data)
def test_null_sizes(tempdir):
    df = pd.DataFrame({'a': [True, None], 'b': [3000, np.nan]}, dtype="O")
    fastparquet.write(tempdir, df, has_nulls=True, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    assert pf.dtypes['a'] == 'float16'
    assert pf.dtypes['b'] == 'float64'
def test_bad_col(tempdir):
    df = pd.DataFrame({'x': [1, 2]})
    fn = os.path.join(tempdir, 'temp.parq')
    with pytest.raises(ValueError) as e:
        write(fn, df, has_nulls=['y'])
import numpy as np
import pandas as pd
import fastparquet
from dateutil.parser import parse

print("--Start--")
print("clean_iex_data")

# Get Data
df = pd.read_csv("./iex/mintue_trade_data.csv", na_values=[-1])

# Combine date and minute into timestamps
df.date = df.date.astype(str) + " " + df.minute
df = df.drop(columns=["minute"])
df.date = df.date.apply(parse)

# Fill NA with the values above
df = df.fillna(method="ffill")
df = df.dropna()
df.average = df.average.shift(-1)
df = df[:-1]

# Save all data to a parquet file
fastparquet.write("iex_data/iex_clean.parquet", df)

# Create a smaller dataframe to add to twitter and wallstreet journal
date_df = pd.DataFrame({"date_col": df.date, "stock_price_col": df.average})

# Save the smaller df to a parquet file
fastparquet.write("iex_data/date_iex_data.parquet", date_df)
print("--End--")
def test_auto_null_object(tempdir, pnull):
    tmp = str(tempdir)
    df = pd.DataFrame({'a': [1, 2, 3, 0],
                       'aa': pd.Series([1, 2, 3, None], dtype=object),
                       'b': [1., 2., 3., np.nan],
                       'c': pd.to_timedelta([1, 2, 3, np.nan], unit='ms'),
                       'd': ['a', 'b', 'c', None],
                       'f': [True, False, True, True],
                       'ff': [True, False, None, True]})  # object
    df['e'] = df['d'].astype('category')
    df['bb'] = df['b'].astype('object')
    df['aaa'] = df['a'].astype('object')
    object_cols = ['d', 'ff', 'bb', 'aaa', 'aa']
    test_cols = list(set(df) - set(object_cols)) + ['d']
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(ValueError):
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn, pandas_nulls=pnull)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])

    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False,
                          check_dtype=False)
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])
    if pnull:
        tm.assert_frame_equal(df[['aa']].astype('Int64'), df2[['aa']])
        tm.assert_frame_equal(df[['ff']].astype("boolean"), df2[['ff']])
    else:
        tm.assert_frame_equal(df[['aa']].astype('float'), df2[['aa']])
        tm.assert_frame_equal(df[['ff']].astype("float"), df2[['ff']])

    # not giving any value is the same as has_nulls=True
    write(fn, df)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=['e'])
    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False,
                          check_dtype=False)
    tm.assert_frame_equal(df[['ff']].astype('boolean'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # 'infer' is the new recommended auto-null
    write(fn, df, has_nulls='infer')
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in object_cols:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('boolean'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])

    # but legacy None still works
    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf._schema[1:]:
        if col.name in object_cols:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
        else:
            assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas()
    tm.assert_frame_equal(df[test_cols], df2[test_cols], check_categorical=False)
    tm.assert_frame_equal(df[['ff']].astype('boolean'), df2[['ff']])
    tm.assert_frame_equal(df[['bb']].astype('float64'), df2[['bb']])
    tm.assert_frame_equal(df[['aaa']].astype('int64'), df2[['aaa']])
def _write(refs, outpath, filetype=None):
    types = {"json": "json", "parquet": "parquet", "zarr": "zarr"}
    if filetype is None:
        ext = os.path.splitext(outpath)[1].lstrip(".")
        filetype = types[ext]
    elif filetype not in types:
        raise KeyError
    if filetype == "json":
        with open(outpath, "w") as f:
            json.dump(refs, f)
        return

    import pandas as pd
    references2 = {
        k: {
            "data": v.encode('ascii') if not isinstance(v, list) else None,
            "url": v[0] if isinstance(v, list) else None,
            "offset": v[1] if isinstance(v, list) else None,
            "size": v[2] if isinstance(v, list) else None
        }
        for k, v in refs['refs'].items()
    }
    # use pandas for sorting
    df = pd.DataFrame(references2.values(),
                      index=list(references2)).sort_values("offset")

    if filetype == "zarr":
        # compression should be NONE, if intent is to store in single zip
        g = zarr.open_group(outpath, mode='w')
        g.attrs.update({
            k: v for k, v in refs.items()
            if k in ['version', "templates", "gen"]
        })
        g.array(name="key", data=df.index.values, dtype="object",
                compression="zstd", object_codec=numcodecs.VLenUTF8())
        g.array(name="offset", data=df.offset.values, dtype="uint32",
                compression="zstd")
        g.array(name="size", data=df['size'].values, dtype="uint32",
                compression="zstd")
        g.array(name="data", data=df.data.values, dtype="object",
                object_codec=numcodecs.VLenBytes(),
                compression="gzip")  # may be better as fixed length
        g.array(name="url", data=df.url.values, dtype="object",
                object_codec=numcodecs.VLenUTF8(), compression='gzip')

    if filetype == "parquet":
        import fastparquet
        metadata = {
            k: v for k, v in refs.items()
            if k in ['version', "templates", "gen"]
        }
        fastparquet.write(outpath, df, custom_metadata=metadata,
                          compression="ZSTD")
def test_bad_object_encoding(tempdir):
    df = pd.DataFrame({'x': ['a', 'ab']})
    with pytest.raises(ValueError) as e:
        write(str(tempdir), df, object_encoding='utf-8')
    assert "utf-8" in str(e.value)
JSON_FILE = 'Parquet/output/nodes.json'

nodes = []
tree = ET.parse(open(SOURCE_FILE))
for node in tree.iterfind('node'):
    nodes.append({
        'id': int(node.get('id')),
        'longitude': float(node.get('lon')),
        'latitude': float(node.get('lat')),
        'username': node.get('user')
    })
df = pd.DataFrame.from_records(nodes)

# Write the nodes records to a parquet file
write(PARQ_FILE, df)
# Write the nodes records to a parquet file using the snappy compression algorithm
write(PARQ_SNAPPY_FILE, df, compression='snappy')
# Write the nodes records to a parquet file using the GZIP compression algorithm
write(PARQ_GZIP_FILE, df, compression='GZIP')
# do the same with JSON format (for comparison)
df.to_json(JSON_FILE)

# Compare the sizes of the file formats
def print_file_size(file_path):
    file_stats = os.stat(file_path)
    print(f'Size of file {file_path} is {file_stats.st_size}')
#!/usr/bin/env python
"""
An example of writing parquet files with 'fastparquet'.

References:
- https://github.com/dask/fastparquet
"""
import pandas
from fastparquet import write

df = pandas.read_csv("/etc/passwd", sep=":")

# this is uncompressed write (I think!)
write('/tmp/file.parq', df)

# this is compressed write
write('/tmp/file_compressed.parq', df, compression='GZIP', file_scheme='hive')
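# A hedged follow-up sketch (not in the original script): reading the files
# written above back with fastparquet.ParquetFile to confirm the round trip.
from fastparquet import ParquetFile

pf = ParquetFile('/tmp/file_compressed.parq')
print(pf.columns)             # column names recovered from the parquet schema
print(pf.to_pandas().head())  # back to a pandas DataFrame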
                                   verbose_eval=False, params=params6,
                                   early_stopping_rounds=50): 'XGB6' + BUILD_NAME}

merc = GeneralisedStacking(base_estimators_dict=estimators,
                           estimator_type='regression',
                           feval=r2_score,
                           stack_type='s',
                           folds_strategy=skf)
merc.fit(train, y_train)
lvl1meta_train_regressor = merc.meta_train
lvl1meta_test_regressor = merc.predict(test)

lvl1meta_train_regressor['ID'] = id_train
lvl1meta_train_regressor['y'] = y_train
lvl1meta_test_regressor['ID'] = id_test

print('Writing Parquets')
# store
fastparquet.write('./data/processed/metalvl1/xtrain_metalvl1' + BUILD_NAME + '.parq',
                  lvl1meta_train_regressor, write_index=False)
fastparquet.write('./data/processed/metalvl1/xtest_metalvl1' + BUILD_NAME + '.parq',
                  lvl1meta_test_regressor, write_index=False)
print('Finished')