class ParquetFile(object):
    """
    Open a Parquet binary file for reading

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    """

    def __init__(self, source, metadata=None):
        # The underlying Cython reader owns the opened file handle.
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)

    @property
    def metadata(self):
        # File-level Parquet metadata exposed by the reader.
        return self.reader.metadata

    @property
    def schema(self):
        # Schema is derived from the file metadata.
        return self.metadata.schema

    def read(self, nrows=None, columns=None, nthreads=1):
        """
        Read the file as a Table

        Parameters
        ----------
        nrows : int, default None
            Not implemented; passing a value raises NotImplementedError.
        columns : list, default None
            If not None, only these columns will be read from the file.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that
            the underlying file source is threadsafe.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        if nrows is not None:
            raise NotImplementedError("nrows argument")

        indices = None
        if columns is not None:
            # Translate requested column names into file column indices.
            indices = [self.reader.column_name_idx(name) for name in columns]
        return self.reader.read(column_indices=indices, nthreads=nthreads)
class ParquetFile(object):
    """
    Open a Parquet binary file for reading

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    """

    def __init__(self, source, metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        # The Parquet schema lives on the file metadata object.
        return self.metadata.schema

    def read(self, nrows=None, columns=None, nthreads=1):
        """
        Read the file contents as a Table

        Parameters
        ----------
        nrows : int, default None
            Not implemented; a non-None value raises NotImplementedError.
        columns : list, default None
            If not None, only these columns will be read from the file.
        nthreads : int, default 1
            Number of columns to read in parallel. Requires that the
            underlying file source is threadsafe.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        if nrows is not None:
            raise NotImplementedError("nrows argument")

        if columns is None:
            selected = None
        else:
            # Column selection happens by index at the reader level.
            selected = list(map(self.reader.column_name_idx, columns))
        return self.reader.read(column_indices=selected, nthreads=nthreads)
class ParquetFile(object):
    """
    Reader interface for a single Parquet file

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    """

    def __init__(self, source, metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        return self.metadata.schema

    @property
    def num_row_groups(self):
        # Number of row groups reported by the file metadata.
        return self.reader.num_row_groups

    def read_row_group(self, i, columns=None, nthreads=1):
        """
        Read a single row group from a Parquet file

        Parameters
        ----------
        i : int
            Index of the row group to read.
        columns : list
            If not None, only these columns will be read from the row group.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that
            the underlying file source is threadsafe.

        Returns
        -------
        pyarrow.table.Table
            Content of the row group as a table (of columns)
        """
        selected = self._get_column_indices(columns)
        if nthreads is not None:
            self.reader.set_num_threads(nthreads)
        return self.reader.read_row_group(i, column_indices=selected)

    def read(self, columns=None, nthreads=1):
        """
        Read the whole file as a Table

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that
            the underlying file source is threadsafe.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        selected = self._get_column_indices(columns)
        if nthreads is not None:
            self.reader.set_num_threads(nthreads)
        return self.reader.read_all(column_indices=selected)

    def read_pandas(self, columns=None, nthreads=1):
        """
        Read the file, also including any pandas index columns recorded
        in the file's custom key-value metadata (under the b'pandas' key).
        """
        selected = self._get_column_indices(columns)

        custom_metadata = self.metadata.metadata
        index_columns = []
        if custom_metadata and b'pandas' in custom_metadata:
            # pandas serializes its schema (index columns included) as JSON.
            pandas_meta = json.loads(custom_metadata[b'pandas'].decode('utf8'))
            index_columns = pandas_meta['index_columns']

        if selected is not None and index_columns:
            selected.extend(map(self.reader.column_name_idx, index_columns))

        if nthreads is not None:
            self.reader.set_num_threads(nthreads)
        return self.reader.read_all(column_indices=selected)

    def _get_column_indices(self, column_names):
        # Map column names to indices; None means "read all columns".
        if column_names is None:
            return None
        return [self.reader.column_name_idx(name) for name in column_names]
class ParquetFile(object):
    """
    Reader interface for a single Parquet file

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    common_metadata : ParquetFileMetadata, default None
        Will be used in reads for pandas schema metadata if not found in the
        main file's metadata, no other uses at the moment
    """

    def __init__(self, source, metadata=None, common_metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)
        self.common_metadata = common_metadata
        self._nested_paths_by_prefix = self._build_nested_paths()

    def _build_nested_paths(self):
        # Map every dotted prefix of each column path to the leaf column
        # indices below it, so selecting 'a' also selects 'a.b', 'a.c',
        # 'a.d.e', etc.
        prefix_map = defaultdict(list)
        for leaf_index, pieces in enumerate(self.reader.column_paths):
            prefix = pieces[0]
            prefix_map[prefix].append(leaf_index)
            for piece in pieces[1:]:
                prefix = '.'.join((prefix, piece))
                prefix_map[prefix].append(leaf_index)
        return prefix_map

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        return self.metadata.schema

    @property
    def num_row_groups(self):
        return self.reader.num_row_groups

    def read_row_group(self, i, columns=None, nthreads=1,
                       use_pandas_metadata=False):
        """
        Read a single row group from a Parquet file

        Parameters
        ----------
        i : int
            Index of the row group to read.
        columns : list
            If not None, only these columns will be read from the row group.
            A column name may be a prefix of a nested field, e.g. 'a' will
            select 'a.b', 'a.c', and 'a.d.e'.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the row group as a table (of columns)
        """
        selected = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_row_group(i, column_indices=selected,
                                          nthreads=nthreads)

    def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
        """
        Read the whole file as a Table

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
            A column name may be a prefix of a nested field, e.g. 'a' will
            select 'a.b', 'a.c', and 'a.d.e'.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        selected = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_all(column_indices=selected,
                                    nthreads=nthreads)

    def scan_contents(self, columns=None, batch_size=65536):
        """
        Read contents of file with a single thread for indicated columns and
        batch size. Number of rows in file is returned. This function is
        used for benchmarking.

        Parameters
        ----------
        columns : list of integers, default None
            If None, scan all columns
        batch_size : int, default 64K
            Number of rows to read at a time internally

        Returns
        -------
        num_rows : number of rows in file
        """
        selected = self._get_column_indices(columns)
        return self.reader.scan_contents(selected, batch_size=batch_size)

    def _get_column_indices(self, column_names, use_pandas_metadata=False):
        # Resolve names (possibly nested-field prefixes) to column indices;
        # names absent from the file are silently skipped.
        if column_names is None:
            return None

        indices = []
        for name in column_names:
            if name in self._nested_paths_by_prefix:
                indices.extend(self._nested_paths_by_prefix[name])

        if use_pandas_metadata:
            # Prefer pandas metadata on this file; fall back to the
            # dataset-level common metadata file when present.
            candidates = [self.metadata.metadata]
            if self.common_metadata is not None:
                candidates.append(self.common_metadata.metadata)

            index_columns = []
            for keyvalues in candidates:
                if keyvalues and b'pandas' in keyvalues:
                    index_columns = _get_pandas_index_columns(keyvalues)
                    break

            if index_columns:
                indices.extend(map(self.reader.column_name_idx,
                                   index_columns))
        return indices
class ParquetFile(object):
    """
    Reader interface for a single Parquet file

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    common_metadata : ParquetFileMetadata, default None
        Will be used in reads for pandas schema metadata if not found in the
        main file's metadata, no other uses at the moment
    """

    def __init__(self, source, metadata=None, common_metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)
        self.common_metadata = common_metadata

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        return self.metadata.schema

    @property
    def num_row_groups(self):
        return self.reader.num_row_groups

    def read_row_group(self, i, columns=None, nthreads=1,
                       use_pandas_metadata=False):
        """
        Read a single row group from a Parquet file

        Parameters
        ----------
        i : int
            Index of the row group to read.
        columns : list
            If not None, only these columns will be read from the row group.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the row group as a table (of columns)
        """
        selected = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_row_group(i, column_indices=selected,
                                          nthreads=nthreads)

    def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
        """
        Read the whole file as a Table

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        selected = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_all(column_indices=selected,
                                    nthreads=nthreads)

    def _get_column_indices(self, column_names, use_pandas_metadata=False):
        # Resolve column names to indices; None means "all columns".
        if column_names is None:
            return None

        indices = [self.reader.column_name_idx(name)
                   for name in column_names]

        if use_pandas_metadata:
            # File-level pandas metadata wins over the dataset-level
            # common metadata file.
            own_kv = self.metadata.metadata
            common_kv = (None if self.common_metadata is None
                         else self.common_metadata.metadata)
            if own_kv and b'pandas' in own_kv:
                index_columns = _get_pandas_index_columns(own_kv)
            elif common_kv and b'pandas' in common_kv:
                index_columns = _get_pandas_index_columns(common_kv)
            else:
                index_columns = []

            if index_columns:
                indices.extend(map(self.reader.column_name_idx,
                                   index_columns))
        return indices
class ParquetFile(object):
    """
    Reader interface for a single Parquet file

    Parameters
    ----------
    source : str, pyarrow.NativeFile, or file-like object
        Readable source. For passing bytes or a buffer-like file containing
        a Parquet file, use pyarrow.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    common_metadata : ParquetFileMetadata, default None
        Will be used in reads for pandas schema metadata if not found in the
        main file's metadata, no other uses at the moment
    """

    def __init__(self, source, metadata=None, common_metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)
        self.common_metadata = common_metadata
        self._nested_paths_by_prefix = self._build_nested_paths()

    def _build_nested_paths(self):
        # For each column path, register every dotted prefix so that a
        # prefix name like 'a' expands to all nested leaves 'a.b', 'a.c', ...
        result = defaultdict(list)
        for i, path in enumerate(self.reader.column_paths):
            key = None
            for piece in path:
                key = piece if key is None else '.'.join((key, piece))
                result[key].append(i)
        return result

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        return self.metadata.schema

    @property
    def num_row_groups(self):
        return self.reader.num_row_groups

    def read_row_group(self, i, columns=None, nthreads=1,
                       use_pandas_metadata=False):
        """
        Read a single row group from a Parquet file

        Parameters
        ----------
        i : int
            Index of the row group to read.
        columns : list
            If not None, only these columns will be read from the row group.
            A column name may be a prefix of a nested field, e.g. 'a' will
            select 'a.b', 'a.c', and 'a.d.e'.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the row group as a table (of columns)
        """
        indices = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_row_group(i, column_indices=indices,
                                          nthreads=nthreads)

    def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
        """
        Read the whole file as a Table

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
            A column name may be a prefix of a nested field, e.g. 'a' will
            select 'a.b', 'a.c', and 'a.d.e'.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        indices = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_all(column_indices=indices,
                                    nthreads=nthreads)

    def scan_contents(self, columns=None, batch_size=65536):
        """
        Read contents of file with a single thread for indicated columns and
        batch size. Number of rows in file is returned. This function is
        used for benchmarking.

        Parameters
        ----------
        columns : list of integers, default None
            If None, scan all columns
        batch_size : int, default 64K
            Number of rows to read at a time internally

        Returns
        -------
        num_rows : number of rows in file
        """
        indices = self._get_column_indices(columns)
        return self.reader.scan_contents(indices, batch_size=batch_size)

    def _get_column_indices(self, column_names, use_pandas_metadata=False):
        # Resolve names (possibly nested-field prefixes) to column indices;
        # names not present in the file are silently skipped.
        if column_names is None:
            return None

        indices = []
        for name in column_names:
            if name in self._nested_paths_by_prefix:
                indices.extend(self._nested_paths_by_prefix[name])

        if use_pandas_metadata:
            # Check this file's metadata first, then the dataset-level
            # common metadata file.
            own_kv = self.metadata.metadata
            common_kv = (self.common_metadata.metadata
                         if self.common_metadata is not None else None)
            if own_kv and b'pandas' in own_kv:
                index_columns = _get_pandas_index_columns(own_kv)
            elif common_kv and b'pandas' in common_kv:
                index_columns = _get_pandas_index_columns(common_kv)
            else:
                index_columns = []

            if index_columns:
                indices.extend(map(self.reader.column_name_idx,
                                   index_columns))
        return indices
class ParquetFile(object):
    """
    Reader interface for a single Parquet file

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    """

    def __init__(self, source, metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        return self.metadata.schema

    @property
    def num_row_groups(self):
        return self.reader.num_row_groups

    def read_row_group(self, i, columns=None, nthreads=1):
        """
        Read a single row group from a Parquet file

        Parameters
        ----------
        i : int
            Index of the row group to read.
        columns : list
            If not None, only these columns will be read from the row group.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that
            the underlying file source is threadsafe.

        Returns
        -------
        pyarrow.table.Table
            Content of the row group as a table (of columns)
        """
        indices = self._get_column_indices(columns)
        if nthreads is not None:
            self.reader.set_num_threads(nthreads)
        return self.reader.read_row_group(i, column_indices=indices)

    def read(self, columns=None, nthreads=1):
        """
        Read the entire file as a Table

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that
            the underlying file source is threadsafe.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        indices = self._get_column_indices(columns)
        if nthreads is not None:
            self.reader.set_num_threads(nthreads)
        return self.reader.read_all(column_indices=indices)

    def read_pandas(self, columns=None, nthreads=1):
        """
        Read the file, additionally loading any pandas index columns
        recorded in the file's custom metadata (under the b'pandas' key).
        """
        indices = self._get_column_indices(columns)

        meta = self.metadata.metadata
        if meta and b'pandas' in meta:
            # The pandas schema blob is JSON-encoded in the key-value
            # metadata.
            index_columns = json.loads(
                meta[b'pandas'].decode('utf8'))['index_columns']
        else:
            index_columns = []

        if indices is not None and index_columns:
            indices.extend(map(self.reader.column_name_idx, index_columns))

        if nthreads is not None:
            self.reader.set_num_threads(nthreads)
        return self.reader.read_all(column_indices=indices)

    def _get_column_indices(self, column_names):
        # None means "all columns"; otherwise resolve each name to its
        # index in the file schema.
        if column_names is None:
            return None
        return [self.reader.column_name_idx(name) for name in column_names]
class ParquetFile(object):
    """
    Reader interface for a single Parquet file

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    metadata : ParquetFileMetadata, default None
        Use existing metadata object, rather than reading from file.
    common_metadata : ParquetFileMetadata, default None
        Will be used in reads for pandas schema metadata if not found in the
        main file's metadata, no other uses at the moment
    """

    def __init__(self, source, metadata=None, common_metadata=None):
        self.reader = ParquetReader()
        self.reader.open(source, metadata=metadata)
        self.common_metadata = common_metadata

    @property
    def metadata(self):
        return self.reader.metadata

    @property
    def schema(self):
        return self.metadata.schema

    @property
    def num_row_groups(self):
        return self.reader.num_row_groups

    def read_row_group(self, i, columns=None, nthreads=1,
                       use_pandas_metadata=False):
        """
        Read a single row group from a Parquet file

        Parameters
        ----------
        i : int
            Index of the row group to read.
        columns : list
            If not None, only these columns will be read from the row group.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the row group as a table (of columns)
        """
        indices = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_row_group(i, column_indices=indices,
                                          nthreads=nthreads)

    def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
        """
        Read the whole file as a Table

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file.
        nthreads : int, default 1
            Number of columns to read in parallel. If > 1, requires that the
            underlying file source is threadsafe.
        use_pandas_metadata : boolean, default False
            If True and file has custom pandas schema metadata, ensure that
            index columns are also loaded.

        Returns
        -------
        pyarrow.table.Table
            Content of the file as a table (of columns)
        """
        indices = self._get_column_indices(
            columns, use_pandas_metadata=use_pandas_metadata)
        return self.reader.read_all(column_indices=indices,
                                    nthreads=nthreads)

    def scan_contents(self, columns=None, batch_size=65536):
        """
        Read contents of file with a single thread for indicated columns and
        batch size. Number of rows in file is returned. This function is
        used for benchmarking.

        Parameters
        ----------
        columns : list of integers, default None
            If None, scan all columns
        batch_size : int, default 64K
            Number of rows to read at a time internally

        Returns
        -------
        num_rows : number of rows in file
        """
        indices = self._get_column_indices(columns)
        return self.reader.scan_contents(indices, batch_size=batch_size)

    def _get_column_indices(self, column_names, use_pandas_metadata=False):
        # None means "all columns"; otherwise resolve names to indices and,
        # when requested, append the pandas index columns.
        if column_names is None:
            return None

        indices = list(map(self.reader.column_name_idx, column_names))

        if use_pandas_metadata:
            # The file's own metadata takes precedence over the
            # dataset-level common metadata file.
            candidates = [self.metadata.metadata]
            if self.common_metadata is not None:
                candidates.append(self.common_metadata.metadata)

            index_columns = []
            for keyvalues in candidates:
                if keyvalues and b'pandas' in keyvalues:
                    index_columns = _get_pandas_index_columns(keyvalues)
                    break

            if index_columns:
                indices.extend(map(self.reader.column_name_idx,
                                   index_columns))
        return indices