def time_text():
    with tmpdir() as tempdir:
        result = {}
        fn = join_path(tempdir, 'temp.parq')
        n = 1000000
        d = pd.DataFrame({
            'a': np.random.choice(['hi', 'you', 'people'], size=n),
            'b': np.random.choice([b'hi', b'you', b'people'], size=n)})

        for col in d.columns:
            for fixed in [None, 6]:
                df = d[[col]]
                if isinstance(df.iloc[0, 0], bytes):
                    t = "bytes"
                else:
                    t = 'utf8'
                write(fn, df)
                with measure('%s: write, fixed: %s' % (t, fixed), result):
                    write(fn, df, has_nulls=False, write_index=False,
                          fixed_text={col: fixed}, object_encoding=t)

                pf = ParquetFile(fn)
                pf.to_pandas()  # warm-up

                with measure('%s: read, fixed: %s' % (t, fixed), result):
                    pf.to_pandas()

        return result
def test_bad_file_paths(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.=.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y/z')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    assert pf.file_scheme == 'other'
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert 'dir0' not in out

    path1 = os.path.join(tempdir, 'data')
    fn1 = os.path.join(path1, 'out.parq')
    os.makedirs(path1)
    write(fn1, df)
    path2 = os.path.join(tempdir, 'data2')
    fn2 = os.path.join(path2, 'out.parq')
    os.makedirs(path2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
def test_floating_point_partition_name(tempdir):
    df = pd.DataFrame({'x': [1e99, 5e-10, 2e+2, -0.1],
                       'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1e99, 5e-10, -0.1]
    assert out[out.y1 == 'bb'].x.tolist() == [200.0]
def test_roundtrip(tempdir, scheme, row_groups, comp):
    data = pd.DataFrame(
        {
            "i32": np.arange(1000, dtype=np.int32),
            "i64": np.arange(1000, dtype=np.int64),
            "f": np.arange(1000, dtype=np.float64),
            "bhello": np.random.choice([b"hello", b"you", b"people"],
                                       size=1000).astype("O"),
        }
    )
    data["a"] = np.array([b"a", b"b", b"c", b"d", b"e"] * 200, dtype="S1")
    data["aa"] = data["a"].map(lambda x: 2 * x).astype("S2")
    data["hello"] = data.bhello.str.decode("utf8")
    data["bcat"] = data.bhello.astype("category")
    data["cat"] = data.hello.astype("category")

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme, row_group_offsets=row_groups,
          compression=comp)
    r = ParquetFile(fname)
    df = r.to_pandas()

    assert data.cat.dtype == "category"
    for col in r.columns:
        assert (df[col] == data[col]).all()
def test_roundtrip_complex(tempdir, scheme):
    import datetime
    data = pd.DataFrame(
        {
            "ui32": np.arange(1000, dtype=np.uint32),
            "i16": np.arange(1000, dtype=np.int16),
            "ui8": np.array([1, 2, 3, 4] * 250, dtype=np.uint8),
            "f16": np.arange(1000, dtype=np.float16),
            "dicts": [{"oi": "you"}] * 1000,
            "t": [datetime.datetime.now()] * 1000,
            "td": [datetime.timedelta(seconds=1)] * 1000,
            "bool": np.random.choice([True, False], size=1000),
        }
    )
    data.loc[100, "t"] = None

    fname = os.path.join(tempdir, "test.parquet")
    write(fname, data, file_scheme=scheme)
    r = ParquetFile(fname)
    df = r.to_pandas()

    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
def test_numerical_partition_name(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5],
                       'y1': ['aa', 'aa', 'bb', 'aa']})
    write(tempdir, df, file_scheme='hive', partition_on=['y1'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out[out.y1 == 'aa'].x.tolist() == [1, 5, 5]
    assert out[out.y1 == 'bb'].x.tolist() == [2]
def test_groups_roundtrip(tempdir):
    df = pd.DataFrame(
        {
            "a": np.random.choice(["a", "b", None], size=1000),
            "b": np.random.randint(0, 64000, size=1000),
            "c": np.random.choice([True, False], size=1000),
        }
    )
    writer.write(tempdir, df, partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.columns == ["b"]
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)

    writer.write(tempdir, df, row_group_offsets=[0, 50],
                 partition_on=["a", "c"], file_scheme="hive")

    r = ParquetFile(tempdir)
    assert r.count == sum(~df.a.isnull())
    assert len(r.row_groups) == 8
    out = r.to_pandas()

    for i, row in out.iterrows():
        assert row.b in list(df[(df.a == row.a) & (df.c == row.c)].b)
def test_auto_null(tempdir):
    tmp = str(tempdir)
    df = pd.DataFrame(
        {
            "a": [1, 2, 3, 0],
            "b": [1.0, 2.0, 3.0, np.nan],
            "c": pd.to_timedelta([1, 2, 3, np.nan], unit="ms"),
            "d": ["a", "b", "c", None],
        }
    )
    df["e"] = df["d"].astype("category")
    fn = os.path.join(tmp, "test.parq")

    with pytest.raises(TypeError):
        ## TODO: this should be a nicer error?
        write(fn, df, has_nulls=False)

    write(fn, df, has_nulls=True)
    pf = ParquetFile(fn)
    for col in pf.schema[2:]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    assert pf.schema[1].repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)

    write(fn, df, has_nulls=None)
    pf = ParquetFile(fn)
    for col in pf.schema[1:3]:
        assert col.repetition_type == parquet_thrift.FieldRepetitionType.REQUIRED
    assert pf.schema[4].repetition_type == parquet_thrift.FieldRepetitionType.OPTIONAL
    df2 = pf.to_pandas(categories=["e"])
    tm.assert_frame_equal(df, df2, check_categorical=False)
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile

    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)

    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0],
                                                     index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
def test_input_column_list_not_mutated(tempdir):
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    cols = ['a']
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=cols)
    assert cols == ['a']
def test_index_not_in_columns(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]}).set_index('a')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(columns=['b'])
    assert out.index.tolist() == ['x', 'y', 'z']
    out = pf.to_pandas(columns=['b'], index=False)
    assert out.index.tolist() == [0, 1, 2]
def test_filter_stats(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
    })
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 4])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('x', '>=', 5)])
    assert out.x.tolist() == [5, 6, 7]
def test_in_filter(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbols', 'in', ['a', 'c'])])
    assert set(out.symbols) == {'a', 'c'}
def test_write_compression_dict(tempdir, compression):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
    fn = os.path.join(tempdir, "tmp.parq")
    writer.write(fn, df, compression=compression)
    r = ParquetFile(fn)
    df2 = r.to_pandas()

    tm.assert_frame_equal(df, df2, check_categorical=False)
def test_to_pandas():
    fname = TEST_DATA + '/airlines_parquet/4345e5eef217aa1b-c8f16177f35fd983_1150363067_data.1.parq'
    pf = ParquetFile(fname)
    out = pf.to_pandas()
    assert len(out.columns) == 29
    # test for bad integer conversion
    assert (out.dep_time < 0).sum() == 0
    assert out.dep_time.dtype == 'float64'
def test_append_simple(tempdir):
    fn = os.path.join(str(tempdir), "test.parq")
    df = pd.DataFrame({"a": [1, 2, 3, 0], "b": ["a", "a", "b", "b"]})
    write(fn, df, write_index=False)
    write(fn, df, append=True, write_index=False)

    pf = ParquetFile(fn)
    expected = pd.concat([df, df], ignore_index=True)
    pd.util.testing.assert_frame_equal(pf.to_pandas(), expected,
                                       check_categorical=False)
def test_request_nonexistent_column(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3]})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    with pytest.raises(ValueError):
        pf.to_pandas(columns=['y'])
def test_filter_special(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'symbol': ['NOW', 'OI', 'OI', 'OI', 'NOW', 'NOW', 'OI']
    })
    write(tempdir, df, file_scheme='hive', partition_on=['symbol'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(filters=[('symbol', '==', 'NOW')])
    assert out.x.tolist() == [1, 5, 6]
    assert out.symbol.tolist() == ['NOW', 'NOW', 'NOW']
def test_open_standard(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], file_scheme='hive',
          open_with=open)
    pf = ParquetFile(fn, open_with=open)
    d2 = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d2, df)
def test_read_multiple_no_metadata(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5]})
    write(tempdir, df, file_scheme='hive', row_group_offsets=[0, 2])
    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*'))))
    pf = ParquetFile(flist)
    assert len(pf.row_groups) == 2
    out = pf.to_pandas()
    pd.util.testing.assert_frame_equal(out, df)
def test_nulls_roundtrip(tempdir):
    fname = os.path.join(tempdir, "temp.parq")
    data = pd.DataFrame({"o": np.random.choice(["hello", "world", None],
                                               size=1000)})
    data["cat"] = data["o"].astype("category")
    writer.write(fname, data, has_nulls=["o", "cat"])

    r = ParquetFile(fname)
    df = r.to_pandas()
    for col in r.columns:
        assert (df[col] == data[col])[~data[col].isnull()].all()
        assert (data[col].isnull() == df[col].isnull()).all()
def test_datetime_partition_names(tempdir):
    date_strings = ['2015-05-09', '2018-10-15', '2020-10-17', '2015-05-09']
    df = pd.DataFrame({
        'date': date_strings,
        'x': [1, 5, 2, 5]
    })
    write(tempdir, df, file_scheme='hive', partition_on=['date'])
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert set(out.date.tolist()) == set(pd.to_datetime(date_strings).tolist())
    assert out[out.date == '2015-05-09'].x.tolist() == [1, 5]
    assert out[out.date == '2020-10-17'].x.tolist() == [2]
def test_filter_without_paths(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, 5, 6, 7],
        'letter': ['a', 'b', 'c', 'd', 'e', 'f', 'g']
    })
    write(fn, df)

    pf = ParquetFile(fn)
    out = pf.to_pandas(filters=[['x', '>', 3]])
    pd.util.testing.assert_frame_equal(out, df)
    out = pf.to_pandas(filters=[['x', '>', 30]])
    assert len(out) == 0
def test_multi_cat_fail(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame({'a': np.random.randint(10, size=N),
                       'b': np.random.choice(['a', 'b', 'c'], size=N),
                       'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df, row_group_offsets=25)

    pf = ParquetFile(fn)
    with pytest.raises(RuntimeError):
        pf.to_pandas()
def test_multi(tempdir):
    fn = os.path.join(tempdir, 'test.parq')
    N = 200
    df = pd.DataFrame({'a': np.random.randint(10, size=N),
                       'b': np.random.choice(['a', 'b', 'c'], size=N),
                       'c': np.arange(200)})
    df = df.set_index(['a', 'b'])
    write(fn, df)

    pf = ParquetFile(fn)
    df1 = pf.to_pandas()
    assert df1.equals(df)
    assert df1.loc[1, 'a'].equals(df.loc[1, 'a'])
def test_hive_and_drill_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x=0')
    fn1 = os.path.join(dir1, 'part.0.parquet')
    os.makedirs(dir1)
    write(fn1, df)
    dir2 = os.path.join(tempdir, 'y')
    fn2 = os.path.join(dir2, 'part.0.parquet')
    os.makedirs(dir2)
    write(fn2, df)

    pf = ParquetFile([fn1, fn2])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 2
    assert out.dir0.tolist() == ['x=0'] * 3 + ['y'] * 3
def test_single_upper_directory(tempdir):
    df = pd.DataFrame({'x': [1, 5, 2, 5], 'y': ['aa'] * 4})
    write(tempdir, df, file_scheme='hive', partition_on='y')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert (out.y == 'aa').all()

    os.unlink(os.path.join(tempdir, '_metadata'))
    os.unlink(os.path.join(tempdir, '_common_metadata'))
    import glob
    flist = list(sorted(glob.glob(os.path.join(tempdir, '*/*'))))
    pf = ParquetFile(flist, root=tempdir)
    assert pf.fn == join_path(os.path.join(tempdir, '_metadata'))
    out = pf.to_pandas()
    assert (out.y == 'aa').all()
def test_datetime_roundtrip(tempdir, df, capsys):
    fname = os.path.join(tempdir, "test.parquet")
    write(fname, df)

    r = ParquetFile(fname)
    out, err = capsys.readouterr()
    if "x" in df and str(df.x.dtype.tz) == "Europe/London":
        # warning happens first time only
        assert "UTC" in err

    df2 = r.to_pandas()

    if "x" in df:
        df["x"] = df.x.dt.tz_convert(None)

    pd.util.testing.assert_frame_equal(df, df2, check_categorical=False)
def test_multi_list(tempdir):
    df = pd.DataFrame({'a': ['x', 'y', 'z'], 'b': [4, 5, 6]})
    dir1 = os.path.join(tempdir, 'x')
    write(dir1, df, file_scheme='hive')
    dir2 = os.path.join(tempdir, 'y')
    write(dir2, df, file_scheme='hive')
    dir3 = os.path.join(tempdir, 'z', 'deep')
    write(dir3, df, file_scheme='hive')

    pf = ParquetFile([dir1, dir2])
    out = pf.to_pandas()  # this version may have extra column!
    assert out.a.tolist() == ['x', 'y', 'z'] * 2

    pf = ParquetFile([dir1, dir2, dir3])
    out = pf.to_pandas()
    assert out.a.tolist() == ['x', 'y', 'z'] * 3
def test_append(tempdir, row_groups, partition):
    fn = str(tempdir)
    df0 = pd.DataFrame({"a": [1, 2, 3, 0],
                        "b": ["a", "b", "a", "b"],
                        "c": True})
    df1 = pd.DataFrame({"a": [4, 5, 6, 7],
                        "b": ["a", "b", "a", "b"],
                        "c": False})
    write(fn, df0, partition_on=partition, file_scheme="hive",
          row_group_offsets=row_groups)
    write(fn, df1, partition_on=partition, file_scheme="hive",
          row_group_offsets=row_groups, append=True)

    pf = ParquetFile(fn)
    expected = pd.concat([df0, df1], ignore_index=True)

    assert len(pf.row_groups) == 2 * len(row_groups) * (len(partition) + 1)
    items_out = {tuple(row[1])
                 for row in pf.to_pandas()[["a", "b", "c"]].iterrows()}
    items_in = {tuple(row[1]) for row in expected.iterrows()}
    assert items_in == items_out
def create_train_test_features(
    tokenizer: tokenization.FullTokenizer
) -> Tuple[run_classifier.InputFeatures, run_classifier.InputFeatures]:
    train_input_examples, test_input_examples = (
        ParquetFile(data_filename(dataset_name))
        .to_pandas()
        .sample(SAMPLE_SIZE)
        .apply(create_bert_input_example, axis=1)
        for dataset_name in DATASET_NAMES)
    train_features, test_features = (
        run_classifier.convert_examples_to_features(input_examples, LABEL_LIST,
                                                    MAX_SEQ_LENGTH, tokenizer)
        for input_examples in (train_input_examples, test_input_examples))
    return train_features, test_features
def test_consolidate_cats(tempdir):
    import json
    df = pd.DataFrame({'x': pd.Categorical([1, 2, 1])})
    fn = os.path.join(tempdir, 'temp.parq')
    write(fn, df)
    pf = ParquetFile(fn)
    assert 2 == json.loads(pf.fmd.key_value_metadata[0].value
                           )['columns'][0]['metadata']['num_categories']
    start = pf.row_groups[0].columns[0].meta_data.key_value_metadata[0].value
    assert start == '2'
    pf.row_groups[0].columns[0].meta_data.key_value_metadata[0].value = '5'
    writer.consolidate_categories(pf.fmd)
    assert 5 == json.loads(pf.fmd.key_value_metadata[0].value
                           )['columns'][0]['metadata']['num_categories']
def _read_fp_multifile(fs, fs_token, paths, columns=None, categories=None,
                       index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path

    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base, index_names,
                       all_columns, out_type == Series, categories, pf.cats,
                       pf.file_scheme, storage_name_mapping)
           for i, path in enumerate(parsed_paths)}
    divisions = (None, ) * (len(paths) + 1)

    return out_type(dsk, name, meta, divisions)
def getEPAHistData(month, yr):
    """Function to get raw epa data from s3"""
    # Change this to use the csv file being modified every hour
    try:
        try:
            s3 = s3fs.S3FileSystem()
            myopen = s3.open
            s3_resource = boto3.resource('s3')
            s3_resource.Object('midscapstone-whos-polluting-my-air',
                               'EpaRaw/epa_20{}{}.parquet'.format(yr, month)).load()
            pf = ParquetFile('midscapstone-whos-polluting-my-air/EpaRaw/epa_20{}{}.parquet'.format(yr, month),
                             open_with=myopen)
            epa_df = pf.to_pandas()
        except:
            raise CustomError("FILE ERROR: Epa Raw Dataframe not found")

        # Add a datekey column based on local date
        epa_df.rename(columns={'Latitude': 'lat', 'Longitude': 'lon', 'UTC': 'utc',
                               'Parameter': 'parameter', 'Unit': 'epa_pm25_unit',
                               'Value': 'epa_pm25_value',
                               'RawConcentration': 'raw_concentration',
                               'AQI': 'aqi', 'Category': 'category',
                               'SiteName': 'site_name', 'AgencyName': 'agency_name',
                               'FullAQSCode': 'full_aqs_code',
                               'IntlAQSCode': 'intl_aqs_code'}, inplace=True)
        epa_df['created'] = epa_df['utc'].apply(
            lambda x: int(datetime.datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
                          .replace(tzinfo=tz.tzutc())
                          .astimezone(timezone('US/Pacific'))
                          .strftime("%Y%m%d%H%M")))
    except Exception as e:
        print("*** EXCEPTION IN GET EPA HIST DATA *** {}".format(e))
    return epa_df
def test_cat_order(tempdir):
    # #629
    fn = os.path.join(tempdir, 'temp.parq')
    cat = ['hot', 'moderate', 'cold']
    catdtype = pd.CategoricalDtype(cat, ordered=True)
    val = [30, -10, 10]
    cities = ['Lisbonne', 'Paris', 'Paris']
    df = pd.DataFrame({'val': val, 'cat': cat, 'city': cities})
    df['cat'] = df['cat'].astype(catdtype)
    write(fn, df, file_scheme='hive', partition_on=['city'])
    out = ParquetFile(fn).to_pandas()
    assert out.cat.cat.ordered
    assert out.cat.cat.categories.tolist() == catdtype.categories.tolist()
def test_hasnulls_ordering(tempdir):
    fname = os.path.join(tempdir, 'temp.parq')
    data = pd.DataFrame({'a': np.random.rand(100),
                         'b': np.random.rand(100),
                         'c': np.random.rand(100)})
    writer.write(fname, data, has_nulls=['a', 'c'])

    r = ParquetFile(fname)
    assert r._schema[1].name == 'a'
    assert r._schema[1].repetition_type == 1
    assert r._schema[2].name == 'b'
    assert r._schema[2].repetition_type == 0
    assert r._schema[3].name == 'c'
    assert r._schema[3].repetition_type == 1
def incremental_train_with_parquet(self, parquet_path):
    print("Training incrementally with parquet...")
    nrows = 0
    pf = ParquetFile(parquet_path)
    classes, labels_freq = DataframePreprocessing(
        target_themes=self.target_themes).get_unique_binarized_labels(
            parquet_path, "tema", True)
    for df in pf.iter_row_groups():
        df = df.reset_index()
        self._update_dataframe(df, is_parquet=True, labels_freq=labels_freq)
        X_train, y_train = (
            self.df[self.x_column_name],
            self.df[self.target_themes + [self.other_themes_value]],
        )
        vector = self._vectorize(X_train)
        self.mo_classifier.partial_fit(vector.toarray(), y_train,
                                       classes=classes)
        nrows += len(self.df)
        print("{} rows already trained\n".format(nrows))
        clear_output(wait=True)
def test_cast_index(tempdir):
    df = pd.DataFrame({'i8': np.array([1, 2, 3, 4], dtype='uint8'),
                       'i16': np.array([1, 2, 3, 4], dtype='int16'),
                       'i32': np.array([1, 2, 3, 4], dtype='int32'),
                       'i62': np.array([1, 2, 3, 4], dtype='int64'),
                       'f16': np.array([1, 2, 3, 4], dtype='float16'),
                       'f32': np.array([1, 2, 3, 4], dtype='float32'),
                       'f64': np.array([1, 2, 3, 4], dtype='float64'),
                       })
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df)
    pf = ParquetFile(fn)
    for col in list(df):
        d = pf.to_pandas(index=col)
        if d.index.dtype.kind == 'i':
            assert d.index.dtype == 'int64'
        elif d.index.dtype.kind == 'u':
            # new UInt64Index
            assert pd.__version__ >= '0.20'
            assert d.index.dtype == 'uint64'
        else:
            assert d.index.dtype == 'float64'
        assert (d.index == df[col]).all()
def convert_to_npy(df=None, save=True, modalities=None):
    if df is None:
        df = ParquetFile(
            os.path.join(project_dir, 'data', 'interim',
                         'data.parq')).to_pandas().set_index('date')

    user_data = list()
    for user, group in df.groupby('user'):
        # Select activity
        activity = group[group['modality'] == 'cpm']

        # Require 8 hours of data
        activity = activity[pd.isnull(activity).sum(axis=1) < (16 * 12)]
        if activity.modality.count() >= 120:
            group = group.loc[activity.index.tolist()]

            # Extract modalities
            modality_data = list()
            modality_grouped = group.groupby('modality')
            for modality in modalities:
                modality_data.append(
                    modality_grouped.get_group(modality).drop(['modality'],
                                                              axis=1))

            # We concatenate on dates to ensure the same dimension across modalities
            user_data.append(
                pd.concat(modality_data,
                          axis=1).values.reshape(-1, len(modality_data),
                                                 289).transpose(0, 2, 1))

    data = np.concatenate(user_data, axis=0)

    if save:
        np.save(os.path.join(project_dir, 'data', 'interim', 'data.npy'), data)

    return data
def run_test(input_file: str, output_dir: str, filters: list):
    print('Using fastparquet')
    pf = ParquetFile(input_file)
    print('Parquet metadata: ' + str(pf.info))
    print('Parquet schema: ' + str(pf.schema))
    print('Parquet columns: ' + str(pf.columns))
    print('Parquet count (total number of rows): ' + str(pf.count))
    print('Parquet dtypes: ' + str(pf.dtypes))
    print('Parquet statistics: ' + str(pf.statistics))
    print('Parquet cats: ' + str(pf.cats))  # possible values of each partitioning field
    print('Parquet row_groups number: ' + str(len(pf.row_groups)))
    # print('Parquet row_groups: ' + str(pf.row_groups))

    with timeblock('fastparquet read and filter'):
        data = pf.to_pandas(filters=filters)
        # data: RowGroup = pf.filter_row_groups(filters=filters)
        # for df in pf.iter_row_groups():
        #     print(df.shape)

    size = sys.getsizeof(data)
    print('Size of filtered Pandas dataframe in memory: ' + str(size) +
          ' bytes (' + str(size / 1000000) + ' MB)')

    milliseconds_since_epoch = int(time() * 1000)
    output_file = output_dir + str(milliseconds_since_epoch) + '.parquet'
    print('Output file name: ' + output_file)
    with timeblock('pyarrow write_table()'):
        write(output_file, data, compression='SNAPPY')

    pf = ParquetFile(output_file)
    print('Parquet metadata of output: ' + str(pf.info))
    print('Parquet schema of output: ' + str(pf.schema))
    print('Size of output file on disk: ' + str(os.path.getsize(output_file)) +
          ' bytes (' + str(os.path.getsize(output_file) / 1000000) + ' MB)')
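# Hypothetical driver for run_test() above; a minimal sketch only, not part of
# the original script. The paths and the filter column 'year' are placeholder
# assumptions. Filters follow fastparquet's (column, op, value) convention and
# a flat list of tuples is AND-ed together by to_pandas(filters=...).
if __name__ == '__main__':
    run_test(
        input_file='/tmp/input.parquet',   # assumed local test file
        output_dir='/tmp/output/',         # trailing slash needed: run_test joins by string concatenation
        filters=[('year', '>=', 2015), ('year', '<', 2020)],
    )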
def test_compression_lz4(tempdir):
    pytest.importorskip('lz4')
    df = pd.DataFrame({
        'x': np.arange(1000),
        'y': np.arange(1, 1001),
        'z': np.arange(2, 1002),
    })
    fn = os.path.join(tempdir, 'foocomp.parquet')
    c = {
        "x": {
            "type": "gzip",
            "args": {
                "compresslevel": 5,
            }
        },
        "y": {
            "type": "lz4",
            "args": {
                "compression": 5,
                "store_size": False,
            }
        },
        "_default": {
            "type": "gzip",
            "args": None
        }
    }
    write(fn, df, compression=c)
    p = ParquetFile(fn)
    df2 = p.to_pandas()
    pd.util.testing.assert_frame_equal(df, df2)
def read_parquet_on_ha_hdfs():
    """
    Read parquet file on HA mode hdfs
    :return:
    """
    ns = "nameservice1"
    conf = {
        "dfs.nameservices": "nameservice1",
        "dfs.ha.namenodes.nameservice1": "namenode113,namenode188",
        "dfs.namenode.rpc-address.nameservice1.namenode113": "hostname_of_server1:8020",
        "dfs.namenode.rpc-address.nameservice1.namenode188": "hostname_of_server2:8020",
        "dfs.namenode.http-address.nameservice1.namenode113": "hostname_of_server1:50070",
        "dfs.namenode.http-address.nameservice1.namenode188": "hostname_of_server2:50070",
        "hadoop.security.authentication": "kerberos"
    }
    hdfs = HDFileSystem(host=ns, pars=conf)
    sc = hdfs.open
    pf = ParquetFile("/user/hive/warehouse/test.db/test.parquet", open_with=sc)
    print(pf.to_pandas())
def test_iter(tempdir):
    df = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [1.0, 2.0, 1.0, 2.0],
        'z': ['a', 'b', 'c', 'd']
    })
    df.index.name = 'index'
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2], write_index=True)
    pf = ParquetFile(fn)
    out = iter(pf.iter_row_groups(index='index'))
    d1 = next(out)
    pd.testing.assert_frame_equal(d1, df[:2], check_dtype=False,
                                  check_index_type=False)
    d2 = next(out)
    pd.testing.assert_frame_equal(d2, df[2:], check_dtype=False,
                                  check_index_type=False)
    with pytest.raises(StopIteration):
        next(out)
def test_attributes(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    pf = ParquetFile(fn)
    assert pf.columns == ['x', 'y', 'z']
    assert len(pf.row_groups) == 2
    assert pf.count == 4
    assert fn == pf.info['name']
    assert fn in str(pf)
    for col in df:
        assert pf.dtypes[col] == df.dtypes[col]
def test_sorted_row_group_columns(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2])
    pf = ParquetFile(fn)
    result = sorted_partitioned_columns(pf)
    expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}
    assert result == expected
def test_timestamp_filer(tempdir):
    fn = os.path.join(tempdir, 'test.parquet')
    ts = [pd.Timestamp('2021/01/01 08:00:00'),
          pd.Timestamp('2021/01/05 10:00:00')]
    val = [10, 34]
    df = pd.DataFrame({'val': val, 'ts': ts})
    # two row-groups
    write(fn, df, row_group_offsets=1, file_scheme='hive')

    ts_filter = pd.Timestamp('2021/01/03 00:00:00')
    pf = ParquetFile(fn)
    filt = [[('ts', '<', ts_filter)], [('ts', '>=', ts_filter)]]
    assert pf.to_pandas(filters=filt).val.tolist() == [10, 34]

    filt = [[('ts', '>=', ts_filter)], [('ts', '<', ts_filter)]]
    assert pf.to_pandas(filters=filt).val.tolist() == [10, 34]

    ts_filter_down = pd.Timestamp('2021/01/03 00:00:00')
    ts_filter_up = pd.Timestamp('2021/01/06 00:00:00')
    # AND filter
    filt = [[('ts', '>=', ts_filter_down), ('ts', '<', ts_filter_up)]]
    assert pf.to_pandas(filters=filt).val.tolist() == [34]
def test_sorted_row_group_columns(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2],
          object_encoding={'v': 'json', 'z': 'utf8'})
    pf = ParquetFile(fn)

    # string stats should be stored without byte-encoding
    zcol = [c for c in pf.row_groups[0].columns
            if c.meta_data.path_in_schema == ['z']][0]
    assert zcol.meta_data.statistics.min == b'a'

    result = sorted_partitioned_columns(pf)
    expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}
    # NB column v should not feature, as dict are unorderable
    assert result == expected
def test_no_index_name(tempdir):
    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]}).set_index('__index_level_0__')
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    df = pd.DataFrame({'__index_level_0__': ['x', 'y', 'z'],
                       'b': [4, 5, 6]})
    write(tempdir, df, file_scheme='hive')
    pf = ParquetFile(tempdir)
    out = pf.to_pandas(index='__index_level_0__', columns=['b'])
    assert out.index.name is None
    assert out.index.tolist() == ['x', 'y', 'z']

    pf = ParquetFile(tempdir)
    out = pf.to_pandas()
    assert out.index.name is None
    assert out.index.tolist() == [0, 1, 2]
def add_indices(self, chunk, part_id):
    chunk_fname = self.get_fname(part_id)
    pf = ParquetFile(chunk_fname)
    row_group_offsets = [0]
    for rg in pf.row_groups:
        row_group_offsets.append(row_group_offsets[-1] + rg.num_rows)
    row_group_offsets = row_group_offsets[:len(pf.row_groups)]

    high_cardinality_cols = set(chunk.cols) - set(
        self.categorical_cols) - set(self.partition_cols)
    for col in high_cardinality_cols:
        self.add_high_cardinality_index(chunk, col, part_id,
                                        row_group_offsets)
    for col in self.categorical_cols:
        self.add_categorical_index(chunk, col, part_id, row_group_offsets)
def append(bucket, key1, key2, s3, output_filename):
    s3_open = s3.open
    path1 = '{}{}'.format(bucket, key1)
    pf1 = ParquetFile(path1, open_with=s3_open)
    df1 = pf1.to_pandas()
    path2 = '{}{}'.format(bucket, key2)
    pf2 = ParquetFile(path2, open_with=s3_open)
    df2 = pf2.to_pandas()
    data = df1.append(df2)
    pwrite('{}{}'.format(bucket, output_filename), data, open_with=s3_open,
           compression='GZIP', append=False, has_nulls=True)
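# Hypothetical call to append() above; a sketch only, with placeholder bucket
# and key names not taken from the original code. Note that append() builds S3
# paths by plain string concatenation, so the bucket string carries its own
# trailing slash here.
import s3fs

s3 = s3fs.S3FileSystem()  # assumes AWS credentials are already configured
append(
    bucket='my-bucket/',                    # placeholder bucket (with trailing slash)
    key1='data/part-0001.parquet',          # placeholder input keys
    key2='data/part-0002.parquet',
    s3=s3,
    output_filename='data/combined.parquet',
)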
def check_exists(s3: S3, frame: pd.DataFrame, table_name: str,
                 table_partitions: List[AnyStr]):
    table_exists = s3.fs.exists(f'structured/{table_name}/_metadata')
    if table_exists:
        dataset = ParquetFile(f'structured/{table_name}', open_with=s3.fs.open)
        if not verify_schema(dataset, frame, table_partitions):
            old_files = [
                fn.split(f'structured/{table_name}/')[-1]
                for fn in s3.fs.find(f'structured/{table_name}')
            ]
            deprecation_date = datetime.now().replace(microsecond=0).isoformat()
            for old_file in old_files:
                s3.fs.copy(
                    f'structured/{table_name}/{old_file}',
                    f'structured/deprecated/{deprecation_date}/{table_name}/{old_file}')
            s3.fs.rm(f'structured/{table_name}', recursive=True)
            table_exists = False
    return table_exists
def test_partition_columns(tempdir):
    symbols = ['a', 'a', 'b', 'c', 'c', 'd']
    values = [1, 2, 3, 4, 5, 6]
    df = pd.DataFrame(data={'symbols': symbols, 'values': values})
    write(tempdir, df, file_scheme='hive', partition_on=['symbols'])
    pf = ParquetFile(tempdir)

    # partition columns always come after actual columns
    assert pf.to_pandas().columns.tolist() == ['values', 'symbols']
    assert pf.to_pandas(columns=['symbols']).columns.tolist() == ['symbols']
    assert pf.to_pandas(columns=['values']).columns.tolist() == ['values']
    assert pf.to_pandas(columns=[]).columns.tolist() == []
def test_write_with_rgp_by_date_as_index(tempdir):
    # Step 1 - Writing of a 1st df, with `row_group_offsets=0`,
    # `file_scheme='hive'` and `partition_on=['location', 'color']`.
    df1 = pd.DataFrame({'humidity': [0.3, 0.8, 0.9],
                        'pressure': [1e5, 1.1e5, 0.95e5],
                        'location': ['Paris', 'Paris', 'Milan'],
                        'color': ['red', 'black', 'blue']})
    write(tempdir, df1, row_group_offsets=0, file_scheme='hive',
          partition_on=['location', 'color'])

    # Step 2 - Overwriting with a 2nd df having overlapping data, in
    # 'overwrite' mode: `row_group_offsets=0`, `file_scheme='hive'`,
    # `partition_on=['location', 'color']` and `append='overwrite'`.
    df2 = pd.DataFrame({'humidity': [0.5, 0.3, 0.4, 0.8, 1.1],
                        'pressure': [9e4, 1e5, 1.1e5, 1.1e5, 0.95e5],
                        'location': ['Milan', 'Paris', 'Paris', 'Paris', 'Paris'],
                        'color': ['red', 'black', 'black', 'green', 'green']})
    write(tempdir, df2, row_group_offsets=0, file_scheme='hive',
          append='overwrite', partition_on=['location', 'color'])

    expected = pd.DataFrame(
        {'humidity': [0.9, 0.5, 0.3, 0.4, 0.8, 1.1, 0.3],
         'pressure': [9.5e4, 9e4, 1e5, 1.1e5, 1.1e5, 9.5e4, 1e5],
         'location': ['Milan', 'Milan', 'Paris', 'Paris', 'Paris', 'Paris', 'Paris'],
         'color': ['blue', 'red', 'black', 'black', 'green', 'green', 'red']})\
        .astype({'location': 'category', 'color': 'category'})
    recorded = ParquetFile(tempdir).to_pandas()
    # df1 is 3 rows, df2 is 5 rows. Because of overlapping data with keys
    # 'location' = 'Paris' & 'color' = 'black' (1 row in df1, 2 rows in df2),
    # the resulting df contains for this combination the values of df2, not
    # those of df1. Total resulting number of rows is 7.
    assert expected.equals(recorded)
def test_cmd_bytesize(tempdir, cmp):
    from fastparquet import core
    fn = os.path.join(tempdir, 'tmp.parq')
    df = pd.DataFrame({'s': ['a', 'b']}, dtype='category')
    write(fn, df, compression=cmp)
    pf = ParquetFile(fn)
    chunk = pf.row_groups[0].columns[0]
    cmd = chunk.meta_data
    csize = cmd.total_compressed_size
    f = open(fn, 'rb')
    f.seek(cmd.dictionary_page_offset)
    ph = core.read_thrift(f, parquet_thrift.PageHeader)
    c1 = ph.compressed_page_size
    f.seek(c1, 1)
    ph = core.read_thrift(f, parquet_thrift.PageHeader)
    c2 = ph.compressed_page_size
    f.seek(c2, 1)
    assert csize == f.tell() - cmd.dictionary_page_offset
def test_sorted_row_group_columns(tempdir):
    df = pd.DataFrame({'x': [1, 2, 3, 4],
                       'v': [{'a': 0}, {'b': -1}, {'c': 5}, {'a': 0}],
                       'y': [1.0, 2.0, 1.0, 2.0],
                       'z': ['a', 'b', 'c', 'd']})
    fn = os.path.join(tempdir, 'foo.parquet')
    write(fn, df, row_group_offsets=[0, 2],
          object_encoding={'v': 'json', 'z': 'utf8'})
    pf = ParquetFile(fn)
    result = sorted_partitioned_columns(pf)
    expected = {'x': {'min': [1, 3], 'max': [2, 4]},
                'z': {'min': ['a', 'c'], 'max': ['b', 'd']}}
    # NB column v should not feature, as dict are unorderable
    assert result == expected
def test_append_empty(tempdir, scheme):
    fn = os.path.join(str(tempdir), 'test.parq')
    df = pd.DataFrame({'a': [1, 2, 3, 0], 'b': ['a', 'a', 'b', 'b']})
    write(fn, df.head(0), write_index=False, file_scheme=scheme)
    pf = ParquetFile(fn)
    assert pf.count() == 0
    assert pf.file_scheme == 'empty'
    write(fn, df, append=True, write_index=False, file_scheme=scheme)

    pf = ParquetFile(fn)
    pd.testing.assert_frame_equal(
        pf.to_pandas(), df, check_categorical=False, check_dtype=False)
def read_single(n, type):
    # print(type)
    if type == 'item':
        pf = ParquetFile(
            '/itemFactors/part-0000' + str(n) +
            '-bb0e8317-d384-4c08-824c-0b2a8661846f-c000.snappy.parquet')
        return pf.to_pandas()
    elif type == 'user':
        pf = ParquetFile(
            '/userFactors/part-0000' + str(n) +
            '-e7a03551-5ae9-4231-b614-549034330d20-c000.snappy.parquet')
        return pf.to_pandas()
    return -1
def test_write_partitioned_with_empty_categories(tempdir):
    df = pd.DataFrame({
        'b': np.random.random(size=1000),
        'a': pd.Series(np.random.choice(['x', 'z'], size=1000)).astype(
            CategoricalDtype(categories=['x', 'y', 'z'])),
    })
    write(tempdir, df, partition_on=['a'], file_scheme='hive',
          write_index=True)
    out = ParquetFile(tempdir).to_pandas()
    assert_frame_equal(out, df, check_like=True, check_categorical=False,
                       check_names=False)