def test_statistics(tempdir):
    s = pd.Series([b'a', b'b', b'c'] * 20)
    df = pd.DataFrame({
        'a': s,
        'b': s.astype('category'),
        'c': s.astype('category').cat.as_ordered()
    })
    fastparquet.write(tempdir, df, file_scheme='hive')
    pf = fastparquet.ParquetFile(tempdir)
    stat = pf.statistics
    assert stat['max']['a'] == [b'c']
    assert stat['min']['a'] == [b'a']
    assert stat['max']['b'] == [b'c']
    assert stat['min']['b'] == [b'a']
    assert stat['max']['c'] == [b'c']
    assert stat['min']['c'] == [b'a']

def read_header(
        bucket: str,
        key: str,
        open_with: Callable[[str, str], Any] = _minio_open_random) -> ParquetFile:
    """
    Ensure a ParquetFile exists, and return it with headers read.

    May raise FileNotFoundError or FastparquetCouldNotHandleFile.

    `retval.fn` gives the filename; `retval.columns` gives column names;
    `retval.dtypes` gives pandas dtypes, and `retval.to_pandas()` reads
    the entire file.
    """
    filelike = open_with(bucket, key)  # raises FileNotFoundError
    return fastparquet.ParquetFile(filelike)

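# Usage sketch for read_header above. The bucket and key names are
# hypothetical placeholders, not part of the original snippet; any pair the
# opener can resolve works. Reading only the footer is cheap, and the full
# file is materialised only when to_pandas() is called.
def _example_read_header_usage():
    pf = read_header("my-bucket", "events/part-0000.parquet")
    print(pf.columns)      # column names, taken from the footer only
    print(pf.dtypes)       # pandas dtypes derived from the parquet schema
    return pf.to_pandas()  # read the whole file as a DataFrame when needed
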
def test_read_footer_fail(tempdir, size):
    """Test reading a file with a truncated footer."""
    import struct
    fn = os.path.join(TEST_DATA, "nation.impala.parquet")
    fout = os.path.join(tempdir, "temp.parquet")
    with open(fn, 'rb') as f1:
        with open(fout, 'wb') as f2:
            # the 4 bytes before the trailing magic hold the footer length
            f1.seek(-8, 2)
            head_size = struct.unpack('<i', f1.read(4))[0]
            f1.seek(-(head_size + 8), 2)
            block = f1.read(head_size)
            f2.write(b'0' * 25)  # padding
            # drop the last `size` bytes of the footer, keep length + magic
            f2.write(block[:-size])
            f2.write(f1.read())
    with pytest.raises(TypeError):
        p = fastparquet.ParquetFile(fout)

def test_ordering(tmpdir):
    check_fastparquet()
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp)

    pf = fastparquet.ParquetFile(tmp)
    assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex')
    assert_eq(ddf, ddf2)

def test_ordering(tmpdir, write_engine, read_engine):
    tmp = str(tmpdir)
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [10, 20, 30], 'c': [100, 200, 300]},
                      index=pd.Index([-1, -2, -3], name='myindex'),
                      columns=['c', 'a', 'b'])
    ddf = dd.from_pandas(df, npartitions=2)
    dd.to_parquet(ddf, tmp, engine=write_engine)

    if read_engine == 'fastparquet':
        pf = fastparquet.ParquetFile(tmp)
        assert pf.columns == ['myindex', 'c', 'a', 'b']

    ddf2 = dd.read_parquet(tmp, index='myindex', engine=read_engine)
    assert_eq(ddf, ddf2, check_divisions=False)

def test_partition_cols_supported(self, fp, df_full):
    # GH #23283
    partition_cols = ["bool", "int"]
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(
            path,
            engine="fastparquet",
            partition_cols=partition_cols,
            compression=None,
        )
        assert os.path.exists(path)
        import fastparquet

        actual_partition_cols = fastparquet.ParquetFile(path, False).cats
        assert len(actual_partition_cols) == 2

def test_map_multipage(tempdir):
    pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "map-test.snappy.parquet"))
    assert pf.count() == 3551
    df = pf.to_pandas()
    first_row_keys = [u'FoxNews.com', u'News Network', u'mobile technology',
                      u'broadcast', u'sustainability', u'collective intelligence',
                      u'radio', u'business law', u'LLC', u'telecommunications',
                      u'FOX News Network']
    last_row_keys = [u'protests', u'gas mask', u'Pot & Painting Party', u'Denver',
                     u'New Year', u'Anderson Cooper', u'gas mask bonk',
                     u'digital media', u'marijuana leaf earrings', u'Screengrab',
                     u'gas mask bongs', u'Randi Kaye', u'Lee Rogers',
                     u'Andy Cohen', u'CNN', u'Times Square', u'Colorado',
                     u'opera', u'slavery', u'Kathy Griffin',
                     u'marijuana cigarette', u'executive producer']
    assert len(df) == 3551
    assert sorted(df["topics"].iloc[0].keys()) == sorted(first_row_keys)
    assert sorted(df["topics"].iloc[-1].keys()) == sorted(last_row_keys)
    assert df.isnull().sum().sum() == 0  # ensure every row got converted

def test_partition_cols_string(self, fp, df_full):
    # GH #27117
    partition_cols = "bool"
    df = df_full
    with tm.ensure_clean_dir() as path:
        df.to_parquet(
            path,
            engine="fastparquet",
            partition_cols=partition_cols,
            compression=None,
        )
        assert os.path.exists(path)
        import fastparquet  # noqa: F811

        actual_partition_cols = fastparquet.ParquetFile(path, False).cats
        assert len(actual_partition_cols) == 1

def read_parquet(path, storage_options=None):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet file

    If the input parquet file does not contain compatible spatial metadata,
    then the resulting SpatialPointsFrame will have a .spatial property of
    None, and the spatial_query operation will be unavailable.

    Parameters
    ----------
    path: str
        Path to a spatially partitioned parquet file that was created
        using datashader.spatial.points.to_parquet
    storage_options : dict or None (default None)
        Key/value pairs to be passed on to the file-system backend, if any.

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Read parquet file
    frame = dd.read_parquet(path, storage_options=storage_options)

    # Open parquet file
    fs, _, paths = get_fs_token_paths(
        path, mode="rb", storage_options=storage_options)
    # Trim any protocol information from the path before forwarding
    path = fs._strip_protocol(path)
    pf = fp.ParquetFile(path, open_with=fs.open)

    # Check for spatial points metadata
    if 'SpatialPointsFrame' in pf.key_value_metadata:
        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])
    else:
        props = None

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)

def pq_to_np(rows, source, destination, prefix):
    """A function to save the parquet files as .npz files instead.

    The data can be split into many smaller .npz files by using the rows
    argument.
    """
    dir_path = pathlib.Path(source)
    id_prefix_len = len('Train_')
    if dir_path.is_dir():
        for file_path in dir_path.iterdir():
            if file_path.suffix == '.parquet':
                if file_path.stem.startswith(prefix):
                    print(f"Loading {file_path.resolve().as_posix()}.")
                    parquet = pq.ParquetFile(file_path.resolve().as_posix())
                    df = parquet.to_pandas()
                    # strip the 'Train_' prefix and store ids as integers
                    df['image_id'] = df['image_id'].map(
                        lambda x: x[id_prefix_len:])
                    df['image_id'] = pd.to_numeric(df['image_id'],
                                                   downcast='unsigned')
                    print(df.dtypes)
                    print("Exporting data as .npz files.")
                    df_samples = df.shape[0]
                    processed_samples = 0
                    file_idx = 0
                    while processed_samples < df_samples:
                        file_size = min(rows, df_samples - processed_samples)
                        output_file = f"{destination}/{file_path.stem}_" \
                                      f"{file_idx}_{file_size}rows.npz"
                        np_samples = df.iloc[
                            processed_samples:processed_samples + file_size,
                            1:].to_numpy()
                        np_ids = df.iloc[
                            processed_samples:processed_samples + file_size,
                            0].to_numpy()
                        np.savez(output_file, ids=np_ids, images=np_samples)
                        print(np_ids.shape, np_samples.shape)
                        processed_samples += rows
                        file_idx += 1
    print("Complete.")
    return

def count_remaining_interactions():
    interactions_remained = 0
    for file in listdir(path_dataset):
        if file.endswith('.parquet'):
            current_file = fastparquet.ParquetFile(path_dataset + file)
            current_df = current_file.to_pandas(['post_id', 'uid'])
            for row in current_df.values:
                current_oid = int(row[0].split('_')[0])
                current_uid = int(row[1])
                try:
                    # both ids must be known; a KeyError means the
                    # interaction was filtered out
                    temp = [oid2ind[current_oid], uid2ind[current_uid]]
                    interactions_remained += 1
                except KeyError:
                    pass
    print('Interactions remained:', interactions_remained)
    return interactions_remained

def test_index(tempdir):
    s = pd.Series(['a', 'c', 'b'] * 20)
    df = pd.DataFrame({
        'a': s,
        'b': s.astype('category'),
        'c': range(60, 0, -1)
    })
    for column in df:
        d2 = df.set_index(column)
        fastparquet.write(tempdir, d2, file_scheme='hive', write_index=True)
        pf = fastparquet.ParquetFile(tempdir)
        out = pf.to_pandas(index=column, categories=['b'])
        pd.testing.assert_frame_equal(out, d2, check_categorical=False,
                                      check_index_type=False,
                                      check_dtype=False)

def test_writing_parquet_with_compression(tmpdir, compression, engine):
    fn = str(tmpdir)
    if engine == 'fastparquet' and compression in ['snappy', 'default']:
        pytest.importorskip('snappy')

    df = pd.DataFrame({'x': ['a', 'b', 'c'] * 10, 'y': [1, 2, 3] * 10})
    ddf = dd.from_pandas(df, npartitions=3)

    ddf.to_parquet(fn, compression=compression, engine=engine)
    if engine == 'fastparquet' and compression == 'default':
        # ensure default compression for fastparquet is Snappy
        import fastparquet
        pf = fastparquet.ParquetFile(fn)
        assert pf.row_groups[0].columns[0].meta_data.codec == 1
    out = dd.read_parquet(fn, engine=engine)
    assert_eq(out, df, check_index=(engine != 'fastparquet'))

def save_embeddings_to_pickle_file():
    import pandas
    import datetime
    timestart = datetime.datetime.now()

    print("Embedding vector size =", vector_size)
    embedding_pickle_file = os.path.join(
        home_dir, "Models/w2vmodel_pubmed_vs_{}_ws_{}_mc_{}.pkl"
        .format(vector_size, window_size, min_count))

    Word2Vec_Model = {}

    print("Reading the Parquet embedding files ....")
    files = os.listdir(embedding_full_path)
    for index, filename in enumerate(files):
        if "part" in filename:
            parquet_file_path = os.path.join(embedding_full_path, filename)
            print("reading {}".format(parquet_file_path))
            try:
                pfile = fastparquet.ParquetFile(parquet_file_path)
                # convert to pandas dataframe
                df = pfile.to_pandas()
                # df = pandas.read_csv(tsv_full_path, sep='\t')
                # print(df.head())
                arr = list(df.values)
                for ind, vals in enumerate(arr):
                    word = vals[0]
                    word_vec = vals[-vector_size:]
                    word_vec = np.array(word_vec)
                    Word2Vec_Model[word] = word_vec.astype('float32')
            except:
                print("Skip {}".format(filename))

    # save the embedding matrix into a pickle file
    print("save the embedding matrix of {} entries into a pickle file".format(
        len(Word2Vec_Model)))
    pickle.dump(Word2Vec_Model, open(embedding_pickle_file, "wb"))

    timeend = datetime.datetime.now()
    timedelta = round((timeend - timestart).total_seconds() / 60, 2)
    print("Time taken to execute above cell: " + str(timedelta) + " mins")
    return (embedding_pickle_file)

def save_embeddings_to_pickle_file(embedding_full_path, embedding_pickle_file,
                                   embed_vector_size):
    import pandas
    import datetime
    timestart = datetime.datetime.now()

    print("Embedding vector size =", embed_vector_size)

    Word2Vec_Model = {}

    print("Reading the Parquet embedding files .... {}".format(
        embedding_full_path))
    files = os.listdir(embedding_full_path)
    for index, filename in enumerate(files):
        if "part" in filename:
            parquet_file_path = os.path.join(embedding_full_path, filename)
            print("reading {}".format(parquet_file_path))
            try:
                pfile = fastparquet.ParquetFile(parquet_file_path)
                # convert to pandas dataframe
                df = pfile.to_pandas()
                # print(df.head())
                arr = list(df.values)
                for ind, vals in enumerate(arr):
                    word = vals[0]
                    word_vec = vals[1:embed_vector_size + 1]
                    word_vec = np.array(word_vec)
                    Word2Vec_Model[word] = word_vec.astype('float32')
            except:
                print("Skip {}".format(filename))

    # save the embedding matrix into a pickle file
    print("save the embedding matrix of {} entries into a pickle file".format(
        len(Word2Vec_Model)))
    pickle.dump(Word2Vec_Model, open(embedding_pickle_file, "wb"))

    timeend = datetime.datetime.now()
    timedelta = round((timeend - timestart).total_seconds() / 60, 2)
    print(
        "Time taken to execute the save_embeddings_to_pickle_file function: "
        + str(timedelta) + " mins")
    print("Done.")

def test_v2():
    # from https://github.com/apache/parquet-testing/tree/master/data
    pf = fastparquet.ParquetFile(
        os.path.join(TEST_DATA, 'datapage_v2.snappy.parquet'))
    expected = {
        'a': {0: 'abc', 1: 'abc', 2: 'abc', 3: None, 4: 'abc'},
        'b': {0: 1, 1: 2, 2: 3, 3: 4, 4: 5},
        'c': {0: 2.0, 1: 3.0, 2: 4.0, 3: 5.0, 4: 2.0},
        'd': {0: True, 1: True, 2: True, 3: False, 4: True},
        'e': {0: [1, 2, 3], 1: None, 2: None, 3: [1, 2, 3], 4: [1, 2]}
    }
    out = pf.to_pandas()
    assert out.to_dict() == expected

def test_with_cache():
    import tempfile
    d = tempfile.mkdtemp()
    old = intake.config.conf['cache_dir']
    expected = fastparquet.ParquetFile(os.path.join(here, 'split')).to_pandas()
    try:
        intake.config.conf['cache_dir'] = d
        cat = intake.open_catalog(os.path.join(here, 'cache_cat.yaml'))
        s = cat.split()
        assert isinstance(s.cache[0], intake.source.cache.DirCache)
        outfiles = s.cache[0].load(s._urlpath, output=False)
        assert outfiles
        assert outfiles[0].startswith(s.cache_dirs[0])
        loc = s.cache[0]._path(s._urlpath)
        assert glob.glob(loc + '/*/*/*.parquet')
        assert s.read().reset_index(drop=True).equals(expected)
    finally:
        shutil.rmtree(d)
        intake.config.conf['cache_dir'] = old

def test_file_csv(parquet_file):
    """Test the various file types"""
    p = fastparquet.ParquetFile(parquet_file)
    data = p.to_pandas()
    if 'comment_col' in data.columns:
        mapping = {'comment_col': "n_comment", 'name': 'n_name',
                   'nation_key': 'n_nationkey', 'region_key': 'n_regionkey'}
        data.columns = [mapping[k] for k in data.columns]
    data.set_index('n_nationkey', inplace=True)

    for col in cols[1:]:
        if isinstance(data[col][0], bytes):
            data[col] = data[col].str.decode('utf8')
        assert (data[col] == expected[col]).all()

def value(self, erase_overlap=False):
    '''
    :rtype: Pandas dataframe of clustering
    '''
    ret = None
    if self.cluster is not None:
        ret = self.cluster
    else:
        self.logger.info("reading " + self.path)
        df = fastparquet.ParquetFile(self.path).to_pandas()
        self.cluster = df
        if self.cluster is not None:
            self.cluster = self.cluster.drop_duplicates()
        ret = self.cluster
    if not erase_overlap:
        return ret
    else:
        # keep one randomly chosen row per node to remove overlaps
        fn = lambda obj: obj.loc[np.random.choice(obj.index, 1, False), :]
        return ret.groupby('node', as_index=False).apply(fn)

def test_null_plain_dictionary():
    """Test reading a file that contains null records for a plain dictionary
    column."""
    p = fastparquet.ParquetFile(
        os.path.join(TEST_DATA, "test-null-dictionary.parquet"))
    data = p.to_pandas()
    expected = pd.DataFrame(
        [{"foo": None}] + [{"foo": "bar"}, {"foo": "baz"}] * 3)
    for col in data:
        if isinstance(data[col][1], bytes):
            # Remove when re-implemented converted types
            data[col] = data[col].str.decode('utf8')
        assert (data[col] == expected[col])[~expected[col].isnull()].all()
        assert sum(data[col].isnull()) == sum(expected[col].isnull())

def test_dir_partition():
    """Test creation of categories from directory structure"""
    x = np.arange(2000)
    df = pd.DataFrame({
        'num': x,
        'cat': pd.Series(np.array(['fred', 'freda'])[x % 2], dtype='category'),
        'catnum': pd.Series(np.array([1, 2, 3])[x % 3], dtype='category')
    })
    pf = fastparquet.ParquetFile(os.path.join(TEST_DATA, "split"))
    out = pf.to_pandas()
    for cat, catnum in product(['fred', 'freda'], [1, 2, 3]):
        assert (df.num[(df.cat == cat) & (df.catnum == catnum)].tolist()) ==\
            out.num[(out.cat == cat) & (out.catnum == catnum)].tolist()
    assert out.cat.dtype == 'category'
    assert out.catnum.dtype == 'category'
    assert out.catnum.cat.categories.dtype == 'int64'

def test_cat_filters():
    path = os.path.join(TEST_DATA, 'split')
    pf = fastparquet.ParquetFile(path)
    base_shape = len(pf.to_pandas())

    filters = [('cat', '==', 'freda')]
    assert len(pf.to_pandas(filters=filters)) == 1000

    filters = [('cat', '!=', 'freda')]
    assert len(pf.to_pandas(filters=filters)) == 1000

    filters = [('cat', 'in', ['fred', 'freda'])]
    assert 0 < len(pf.to_pandas(filters=filters)) == 2000

    filters = [('cat', 'not in', ['fred', 'frederick'])]
    assert 0 < len(pf.to_pandas(filters=filters)) == 1000

    filters = [('catnum', '==', 2000)]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('catnum', '>=', 2)]
    assert 0 < len(pf.to_pandas(filters=filters)) == 1333

    filters = [('catnum', '>=', 1)]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('catnum', 'in', [0, 1])]
    assert len(pf.to_pandas(filters=filters)) == 667

    filters = [('catnum', 'not in', [1, 2, 3])]
    assert len(pf.to_pandas(filters=filters)) == 0

    # AND
    filters = [[('cat', '==', 'freda'), ('catnum', '>=', 2.5)]]
    assert len(pf.to_pandas(filters=filters)) == 333

    # OR
    filters = [('cat', '==', 'freda'), ('catnum', '>=', 2.5)]
    assert len(pf.to_pandas(filters=filters)) == 1333

    # AND
    filters = [[('cat', '==', 'freda'), ('catnum', '!=', 2.5)]]
    assert len(pf.to_pandas(filters=filters)) == 1000

def test_stat_filters():
    path = os.path.join(TEST_DATA, 'split')
    pf = fastparquet.ParquetFile(path)
    base_shape = len(pf.to_pandas())

    filters = [('num', '>', 0)]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('num', '<', 0)]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('num', '>', 500)]
    assert 0 < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', '>', 1500)]
    assert 0 < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', '>', 2000)]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('num', '>=', 1999)]
    assert 0 < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', '!=', 1000)]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('num', 'in', [-1, -2])]
    assert len(pf.to_pandas(filters=filters)) == 0

    filters = [('num', 'not in', [-1, -2])]
    assert len(pf.to_pandas(filters=filters)) == base_shape

    filters = [('num', 'in', [0])]
    l = len(pf.to_pandas(filters=filters))
    assert 0 < l < base_shape

    filters = [('num', 'in', [0, 1500])]
    assert l < len(pf.to_pandas(filters=filters)) < base_shape

    filters = [('num', 'in', [-1, 1999])]
    l = len(pf.to_pandas(filters=filters))
    assert 0 < l < base_shape

def load_parquet_fp(path: str, **kwargs) -> pd.DataFrame:
    """
    Helper function to load a parquet Dataset as a Pandas DataFrame using
    fastparquet

    First creates a
    [ParquetFile](https://fastparquet.readthedocs.io/en/latest/api.html#fastparquet.ParquetFile)
    and then converts the ParquetFile to a DataFrame using .to_pandas.
    Refer to the fastparquet documentation for accepted arguments

    Parameters
    ----------
    path : str
        The root directory of the Parquet Dataset stored locally or in S3

    Returns
    -------
    pd.DataFrame
    """
    import fastparquet as fp

    logger.info(
        f"Reading in Parquet dataset to ParquetFile. kwargs passed {kwargs!r}")
    fs = kwargs.pop("fs", None)

    # Pull out arguments that should be directed to to_pandas
    to_pandas_args = parse_args(fp, ["ParquetFile", "to_pandas"], **kwargs)
    # Remove these args from kwargs
    kwargs = {
        k: v for k, v in kwargs.items()
        if k in set(kwargs) - set(to_pandas_args)
    }

    if s3.is_s3path(path):
        fs = fs or s3fs.S3FileSystem()
        myopen = fs.open
    else:
        myopen = open

    pf = fp.ParquetFile(path, open_with=myopen, **kwargs)
    df = pf.to_pandas(**to_pandas_args)
    return df

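# Usage sketch for load_parquet_fp above. The dataset path and column names
# are illustrative placeholders; the routing of keyword arguments relies on
# the parse_args helper from the snippet, which is presumed to direct
# to_pandas-compatible arguments (such as columns= and filters=) to
# ParquetFile.to_pandas and leave the rest for the ParquetFile constructor.
def _example_load_parquet_fp_usage():
    df = load_parquet_fp(
        "data/events_dataset",       # hypothetical local dataset root
        columns=["user_id", "ts"],   # presumably forwarded to to_pandas
        filters=[("ts", ">", 0)],    # presumably forwarded to to_pandas
    )
    return df
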
def test_empty_row_groups(tempdir, sql):
    fn = os.path.join(tempdir, 'output.parquet')
    d0 = pd.DataFrame({'name': ['alice'], 'age': [20]})
    df = sql.createDataFrame(d0)
    df.write.parquet(fn)
    import glob
    files = glob.glob(os.path.join(fn, '*.parquet'))
    sizes = [os.stat(p).st_size for p in files]
    msize = max(sizes)
    pf = fastparquet.ParquetFile(files)  # don't necessarily have metadata
    assert len(files) > 1  # more than one worker was writing
    d = pf.to_pandas(index=False)
    pd.util.testing.assert_frame_equal(d, d0)

    # destroy empty files
    [os.unlink(f) for (f, s) in zip(files, sizes) if s < msize]

    # loads anyway, since empty row-groups are not touched
    d = pf.to_pandas()
    pd.util.testing.assert_frame_equal(d, d0)

def test_or_filtering(tempdir):
    path = os.path.join(TEST_DATA, 'split')
    pf = fastparquet.ParquetFile(path)
    # Defining 2 filters resulting in 2 disjointed row groups.
    up_filter = [('num', '>=', 1925)]
    down_filter = [('num', '<=', 18)]
    # Check disjointed groups.
    empty_df = pf.to_pandas(filters=[up_filter + down_filter])
    assert empty_df.empty
    # Reading row groups separately for reference.
    up_df = pf.to_pandas(filters=up_filter)
    down_df = pf.to_pandas(filters=down_filter)
    cols = list(up_df.columns)
    ref_df = pd.concat([up_df, down_df]).sort_values(cols)\
        .reset_index(drop=True)
    # Reading row groups using OR operation in `filters`.
    or_filter = [up_filter, down_filter]
    or_df = pf.to_pandas(filters=or_filter).sort_values(cols)\
        .reset_index(drop=True)
    assert (or_df.equals(ref_df))

def from_parquet(filename):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet file

    Parameters
    ----------
    filename: str
        Path to a spatially partitioned parquet file that was created
        using SpatialPointsFrame.partition_and_write

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Open parquet file
    pf = fp.ParquetFile(filename)

    # Check for required metadata
    if 'SpatialPointsFrame' not in pf.key_value_metadata:
        raise ValueError("""
The parquet file at '{filename}' does not appear to be spatially partitioned.
Please construct a spatially partitioned parquet file using the
SpatialPointsFrame.partition_and_write static method.""".format(
            filename=filename))

    # Load metadata
    props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])

    # Read parquet file
    frame = dd.read_parquet(filename)

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)

def test_map_array(sql):
    """
    # Script used to generate the test data with pyspark:
    from pyspark.sql.types import *
    df_schema = StructType([
        StructField('map_op_op', MapType(StringType(), StringType(), True), True),
        StructField('map_op_req', MapType(StringType(), StringType(), False), True),
        StructField('map_req_op', MapType(StringType(), StringType(), True), False),
        StructField('map_req_req', MapType(StringType(), StringType(), False), False),
        StructField('arr_op_op', ArrayType(StringType(), True), True),
        StructField('arr_op_req', ArrayType(StringType(), False), True),
        StructField('arr_req_op', ArrayType(StringType(), True), False),
        StructField('arr_req_req', ArrayType(StringType(), False), False)])
    keys = ['fred', 'wilma', 'barney', 'betty']
    vals = ['franky', 'benji', 'mighty']
    out = []
    for i in range(1000):
        part = []
        for field in [f.name for f in df_schema.fields]:
            sort, nullable, nullvalue = field.split('_')
            if nullable == 'op' and np.random.random() < 0.3:
                part.append(None)
                continue
            N = np.random.randint(5)
            ks = np.random.choice(keys, size=N).tolist()
            vs = np.random.choice(vals + [None] if nullvalue == 'op' else vals,
                                  size=N).tolist()
            if sort == 'map':
                part.append({k: v for (k, v) in zip(ks, vs)})
            else:
                part.append(vs)
        out.append(part)
    df = sql.createDataFrame(out, df_schema)
    """
    fn = os.path.join(TEST_DATA, 'map_array.parq')
    expected = sql.read.parquet(fn).toPandas()
    pf = fastparquet.ParquetFile(fn)
    data = pf.to_pandas()
    pd.util.testing.assert_frame_equal(data, expected)

def read_parquet(path):
    """
    Construct a SpatialPointsFrame from a spatially partitioned parquet file

    If the input parquet file does not contain compatible spatial metadata,
    then the resulting SpatialPointsFrame will have a .spatial property of
    None, and the spatial_query operation will be unavailable.

    Parameters
    ----------
    path: str
        Path to a spatially partitioned parquet file that was created
        using datashader.spatial.points.to_parquet

    Returns
    -------
    SpatialPointsFrame
        A spatially sorted Dask dataframe reconstructed from disk
    """
    _validate_fastparquet()

    # Open parquet file
    pf = fp.ParquetFile(path)

    # Read parquet file
    frame = dd.read_parquet(path)

    # Check for spatial points metadata
    if 'SpatialPointsFrame' in pf.key_value_metadata:
        # Load metadata
        props = json.loads(pf.key_value_metadata['SpatialPointsFrame'])
    else:
        props = None

    # Call DataFrame constructor with the internals of frame
    return SpatialPointsFrame(frame.dask, frame._name, frame._meta,
                              frame.divisions, props)

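# Usage sketch for the read_parquet helper above. The file path is a
# hypothetical placeholder and is assumed to have been written with the
# matching datashader.spatial.points.to_parquet helper; per the docstring,
# .spatial is None when the SpatialPointsFrame metadata is absent, in which
# case spatial_query is unavailable and the frame behaves like a plain Dask
# DataFrame.
def _example_read_spatial_points():
    frame = read_parquet("data/points.spatial.parq")  # hypothetical path
    print(frame.spatial is not None)  # True only if spatial metadata was found
    return frame.compute()
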
def read_header(path: Path) -> ParquetFile:
    """
    Ensure a ParquetFile exists, and return it with headers read.

    May raise OSError (e.g., FileNotFoundError) or
    FastparquetCouldNotHandleFile.

    `retval.fn` gives the filename; `retval.columns` gives column names;
    `retval.dtypes` gives pandas dtypes, and `retval.to_pandas()` reads
    the entire file.
    """
    try:
        return fastparquet.ParquetFile(path)
    except IndexError:
        # TODO nix this when fastparquet resolves
        # https://github.com/dask/fastparquet/issues/361
        #
        # The file has a zero-length column list, and fastparquet can't
        # handle that.
        #
        # Our cached DataFrame should be "empty". No columns means no
        # rows.
        raise FastparquetIssue361