def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    """Read one output partition of a parquet dataset into pandas.

    Parameters
    ----------
    fs : file-system object used to open ``piece``.
    piece : path of the file to load (when ``pf`` is None), otherwise the
        row-group index into ``pf.row_groups``.
    columns : list of column names to materialize.  NOTE(review): this list
        is extended in place (``columns += index``) — confirm callers pass a
        throwaway copy.
    index : index-column name(s); when a list, it is appended to ``columns``.
    categories : column names to load as pandas categoricals.
    pf : optional ParquetFile, or a tuple of arguments from which one can be
        rebuilt via ``_determine_pf_parts``.
    **kwargs : may carry a ``"read"`` dict of extra fastparquet read options.

    Returns
    -------
    A pandas DataFrame for the requested file or row-group.
    """
    if isinstance(index, list):
        # Index columns must be read alongside the data columns.
        columns += index

    if pf is None:
        # No ParquetFile supplied: open ``piece`` directly and rewrite its
        # column-chunk paths relative to the dataset root so partition
        # (hive/drill) values can be recovered from the path.
        root, file_names = _analyze_paths([piece], fs)
        file_scheme = get_file_scheme(file_names)
        pf = ParquetFile(piece, open_with=fs.open)
        relative = piece.replace(root, "").lstrip("/")
        for row_group in pf.row_groups:
            for chunk in row_group.columns:
                chunk.file_path = relative
        pf.file_scheme = file_scheme
        pf.cats = _paths_to_cats(file_names, file_scheme)
        pf.fn = root
        return pf.to_pandas(columns, categories, index=index)

    # A ParquetFile (or the pieces needed to rebuild one) was supplied;
    # ``piece`` is then the index of the row-group to read.
    if isinstance(pf, tuple):
        pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
        pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
        pf.fmd.row_groups = None
    rg_piece = pf.row_groups[piece]
    # Drop the pandas key-value metadata before reading a single row-group.
    pf.fmd.key_value_metadata = None
    return pf.read_row_group_file(
        rg_piece, columns, categories, index=index, **kwargs.get("read", {})
    )
def read_partition(
    cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
):
    """Read one output partition of a parquet dataset into pandas.

    Parameters
    ----------
    fs : file-system object used to open ``piece``.
    piece : path of the file to load (when ``pf`` is None), otherwise the
        row-group index into ``pf.row_groups``.
    columns : list of column names to materialize.  NOTE(review): this list
        is extended in place (``columns += index``) — confirm callers pass a
        throwaway copy.
    index : list of index-column names, or ``[None]`` for a None-labeled
        (possibly Range-) index; see the comment below.
    categories : column names to load as pandas categoricals.
    pf : optional ParquetFile, or a tuple describing how to rebuild one.
    **kwargs : may carry ``"file"`` / ``"read"`` dicts of extra fastparquet
        options.
    """
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index
    if pf is None:
        # No ParquetFile supplied: open the single file directly and rewrite
        # its column-chunk paths relative to the dataset root so partition
        # (hive/drill) values can be recovered from the path.
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = paths_to_cats(fns, scheme)
        pf.fn = base
        if null_index_name and "__index_level_0__" in pf.columns:
            # See "Handling a None-labeled index" comment above
            index = ["__index_level_0__"]
            columns += index
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            # ``pf`` describes how to (re)build the ParquetFile: either a
            # list of paths plus metadata, or a single metadata path.
            if isinstance(pf[0], list):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            else:
                pf = ParquetFile(
                    pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                )
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        if null_index_name:
            if "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
                # Index column found, so the pandas metadata is no longer
                # needed to rebuild the index.
                pf.fmd.key_value_metadata = None
            # NOTE(review): when the column is absent (RangeIndex case) the
            # pandas metadata is deliberately kept so fastparquet can
            # reconstruct the index — nesting reconstructed from collapsed
            # source; confirm against upstream.
        else:
            pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
def read_partition(cls, fs, piece, columns, index, categories=(), **kwargs):
    """Read one partition (a path, or pre-selected row-groups) into pandas.

    Parameters
    ----------
    fs : file-system object used to open data files.
    piece : tuple.  ``piece[0]`` is either a path string to read, or the
        row-group payload (possibly pickled) to reattach to the shared
        ``parquet_file``; ``piece[1]`` holds row-group indices when
        ``piece[0]`` is a path.
    columns : list of column names to materialize (extended in place when an
        index list is appended).
    index : list of index-column names, or ``[None]`` for a None-labeled
        index (see comment below).
    categories : column names to load as pandas categoricals.
    **kwargs : may carry ``parquet_file`` (a shared ParquetFile object),
        plus ``"file"`` / ``"read"`` dicts of extra fastparquet options.

    Raises
    ------
    ValueError
        If ``piece`` is not a tuple, or if neither a path nor a
        ``parquet_file`` is available.
    """
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index
    # Use global `parquet_file` object. Need to reattach
    # the desired row_group
    parquet_file = kwargs.pop("parquet_file", None)
    if isinstance(piece, tuple):
        if isinstance(piece[0], str):
            # We have a path to read from
            assert parquet_file is None
            parquet_file = ParquetFile(
                piece[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
            # `piece[1]` will contain row-group indices; an empty/None value
            # means "read every row-group in the file".
            rg_indices = piece[1] or list(range(len(parquet_file.row_groups)))
            row_groups = [parquet_file.row_groups[rg] for rg in rg_indices]
        elif parquet_file:
            # `piece[1]` will contain actual row-group objects,
            # but they may be pickled
            # NOTE(review): the code reads ``piece[0]`` here although the
            # comment above says ``piece[1]`` — confirm the piece layout
            # against the task-graph producer.
            row_groups = piece[0]
            if isinstance(row_groups, bytes):
                row_groups = pickle.loads(row_groups)
            # Reattach the desired row-groups to the shared metadata object.
            parquet_file.fmd.row_groups = row_groups
            # NOTE: May lose cats after `_set_attrs` call
            save_cats = parquet_file.cats
            parquet_file._set_attrs()
            parquet_file.cats = save_cats
        else:
            raise ValueError("Neither path nor ParquetFile detected!")
        if null_index_name:
            if "__index_level_0__" in parquet_file.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
        parquet_file._dtypes = (
            lambda *args: parquet_file.dtypes
        )  # ugly patch, could be fixed
        # Read necessary row-groups and concatenate
        dfs = []
        for row_group in row_groups:
            dfs.append(
                parquet_file.read_row_group_file(
                    row_group,
                    columns,
                    categories,
                    index=index,
                    **kwargs.get("read", {}),
                )
            )
        return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]
    else:
        # `piece` is NOT a tuple
        raise ValueError(f"Expected tuple, got {type(piece)}")
def read_partition(
    cls,
    fs,
    pieces,
    columns,
    index,
    categories=(),
    root_cats=None,
    root_file_scheme=None,
    base_path=None,
    **kwargs,
):
    """Read one or more parquet pieces into a single pandas DataFrame.

    Parameters
    ----------
    fs : file-system object used to open data files.
    pieces : tuple or list of tuples.  Each tuple is either
        ``(path, row_group_indices)`` or ``(row_group_objects, ...)``
        (possibly pickled) to reattach to the shared ``parquet_file``.
    columns : list of column names to materialize (extended in place when an
        index list is appended).
    index : list of index-column names, or ``[None]`` for a None-labeled
        index (see comment below).
    categories : column names to load as pandas categoricals.
    root_cats : hive-partition categories of the dataset root, if any.
    root_file_scheme : file scheme to restore when ``root_cats`` is set.
    base_path : dataset root path; only honored when ``root_cats`` is set.
    **kwargs : may carry ``parquet_file`` (a shared ParquetFile object),
        plus ``"file"`` / ``"read"`` dicts of extra fastparquet options.

    Raises
    ------
    ValueError
        If the first piece is not a tuple, or if neither a path nor a
        ``parquet_file`` is available.
    """
    null_index_name = False
    # ``base_path`` is only meaningful alongside hive-partition categories.
    base_path = False if not root_cats else base_path
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index
    # Use global `parquet_file` object. Need to reattach
    # the desired row_group
    parquet_file = kwargs.pop("parquet_file", None)
    # Always convert pieces to list
    if not isinstance(pieces, list):
        pieces = [pieces]
    sample = pieces[0]
    if isinstance(sample, tuple):
        if isinstance(sample[0], str):
            # We have paths to read from
            assert parquet_file is None
            row_groups = []
            rg_offset = 0
            parquet_file = ParquetFile(
                [p[0] for p in pieces],
                open_with=fs.open,
                root=base_path or False,
                **kwargs.get("file", {}),
            )
            for piece in pieces:
                # For a single piece, reuse the combined ParquetFile; for
                # multiple pieces, open each file to learn its own
                # row-group count so global offsets can be computed.
                _pf = (
                    parquet_file
                    if len(pieces) == 1
                    else ParquetFile(
                        piece[0],
                        open_with=fs.open,
                        root=base_path or False,
                        **kwargs.get("file", {}),
                    )
                )
                n_local_row_groups = len(_pf.row_groups)
                # Empty/None indices mean "read every row-group in the file".
                local_rg_indices = piece[1] or list(range(n_local_row_groups))
                # Translate file-local indices to the combined file's
                # global row-group positions.
                row_groups += [
                    parquet_file.row_groups[rg + rg_offset]
                    for rg in local_rg_indices
                ]
                rg_offset += n_local_row_groups
            # Metadata only needs rewriting when a strict subset of
            # row-groups was selected.
            update_parquet_file = len(row_groups) < len(parquet_file.row_groups)
        elif parquet_file:
            row_groups = []
            for piece in pieces:
                # `piece[1]` will contain actual row-group objects,
                # but they may be pickled
                # NOTE(review): the code reads ``piece[0]`` here although
                # the comment above says ``piece[1]`` — confirm the piece
                # layout against the task-graph producer.
                rgs = piece[0]
                if isinstance(rgs, bytes):
                    rgs = pickle.loads(rgs)
                row_groups += rgs
            update_parquet_file = True
        else:
            raise ValueError("Neither path nor ParquetFile detected!")
        if update_parquet_file:
            # Hold the lock while mutating the shared metadata object.
            with _FP_FILE_LOCK:
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats
        if null_index_name:
            if "__index_level_0__" in parquet_file.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
        # Update hive-partitioning information if necessary
        parquet_file.cats = root_cats or {}
        if root_cats:
            parquet_file.file_scheme = root_file_scheme
        parquet_file._dtypes = (
            lambda *args: parquet_file.dtypes
        )  # ugly patch, could be fixed
        if set(columns).issubset(
            parquet_file.columns + list(parquet_file.cats.keys())
        ):
            # Convert ParquetFile to pandas
            return parquet_file.to_pandas(
                columns=columns,
                categories=categories,
                index=index,
            )
        else:
            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    )
                )
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]
    else:
        # `sample` is NOT a tuple
        raise ValueError(f"Expected tuple, got {type(sample)}")