def _get_metadata(
        pieces: List["pyarrow._dataset.ParquetFileFragment"],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None):
    """Aggregate Parquet fragment metadata into a single BlockMetadata.

    Falls back to unknown row and byte counts if any fragment lacks metadata.
    """
    piece_metadata = []
    for p in pieces:
        try:
            piece_metadata.append(p.metadata)
        except AttributeError:
            break
    input_files = [p.path for p in pieces]
    if len(piece_metadata) == len(pieces):
        # Piece metadata was available, construct a normal
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=sum(m.num_rows for m in piece_metadata),
            size_bytes=sum(
                sum(
                    m.row_group(i).total_byte_size
                    for i in range(m.num_row_groups))
                for m in piece_metadata),
            schema=schema,
            input_files=input_files)
    else:
        # Piece metadata was not available, construct an empty
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=None,
            size_bytes=None,
            schema=schema,
            input_files=input_files)
    return block_metadata
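
# The aggregation above uses only standard PyArrow Parquet metadata
# (FileMetaData.num_rows and per-row-group total_byte_size). A minimal
# standalone sketch of the same computation, assuming a hypothetical local
# file path and table contents that are not part of the original code:
import pyarrow as pa
import pyarrow.parquet as pq

example_path = "/tmp/example.parquet"  # hypothetical path for illustration
pq.write_table(pa.table({"x": list(range(100))}), example_path)

# One FileMetaData per input file, mirroring piece_metadata above.
example_metadata = [pq.read_metadata(example_path)]
example_num_rows = sum(m.num_rows for m in example_metadata)
example_size_bytes = sum(
    sum(m.row_group(i).total_byte_size for i in range(m.num_row_groups))
    for m in example_metadata)
print(example_num_rows, example_size_bytes)  # 100 and the row-group byte size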
def _get_block_metadata(
    self,
    paths: List[str],
    schema: Optional[Union[type, "pyarrow.lib.Schema"]],
    *,
    pieces: List["pyarrow.dataset.ParquetFileFragment"],
    prefetched_metadata: Optional[List["pyarrow.parquet.FileMetaData"]],
) -> BlockMetadata:
    """Build a BlockMetadata from prefetched Parquet file metadata, if available."""
    if prefetched_metadata is not None and len(prefetched_metadata) == len(pieces):
        # Piece metadata was available, construct a normal
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=sum(m.num_rows for m in prefetched_metadata),
            size_bytes=sum(
                sum(m.row_group(i).total_byte_size for i in range(m.num_row_groups))
                for m in prefetched_metadata
            ),
            schema=schema,
            input_files=paths,
            exec_stats=None,
        )  # Exec stats filled in later.
    else:
        # Piece metadata was not available, construct an empty
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=None, size_bytes=None, schema=schema, input_files=paths
        )
    return block_metadata
def _build_block_metadata(
        pieces: List["pyarrow.dataset.ParquetFileFragment"],
        metadata: List["pyarrow.parquet.FileMetaData"],
        schema: Optional[Union[type, "pyarrow.lib.Schema"]]) -> BlockMetadata:
    """Build a BlockMetadata from per-piece Parquet file metadata, if available."""
    input_files = [p.path for p in pieces]
    if len(metadata) == len(pieces):
        # Piece metadata was available, construct a normal
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=sum(m.num_rows for m in metadata),
            size_bytes=sum(
                sum(
                    m.row_group(i).total_byte_size
                    for i in range(m.num_row_groups))
                for m in metadata),
            schema=schema,
            input_files=input_files,
            exec_stats=BlockExecStats.TODO)
    else:
        # Piece metadata was not available, construct an empty
        # BlockMetadata.
        block_metadata = BlockMetadata(
            num_rows=None,
            size_bytes=None,
            schema=schema,
            input_files=input_files)
    return block_metadata
def prepare_read(self, parallelism: int, paths: Union[str, List[str]],
                 filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                 schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
                 **reader_args) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    import pyarrow as pa
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_infos = _expand_paths(paths, filesystem)
    file_sizes = [file_info.size for file_info in file_infos]

    read_file = self._read_file

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    def read_files(read_paths: List[str],
                   fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        builder = DelegatingArrowBlockBuilder()
        for read_path in read_paths:
            with fs.open_input_stream(read_path) as f:
                data = read_file(f, read_path, **reader_args)
                if isinstance(data, (pa.Table, np.ndarray)):
                    builder.add_block(data)
                else:
                    builder.add(data)
        return builder.build()

    read_tasks = []
    for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism),
            np.array_split(file_sizes, parallelism)):
        if len(read_paths) <= 0:
            continue
        if self._rows_per_file() is None:
            num_rows = None
        else:
            num_rows = len(read_paths) * self._rows_per_file()
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem),
            BlockMetadata(
                num_rows=num_rows,
                size_bytes=sum(file_sizes),
                schema=schema,
                input_files=read_paths))
        read_tasks.append(read_task)

    return read_tasks
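
# The task construction above splits the path and size lists into
# `parallelism` groups with numpy's array_split, and binds each group into its
# lambda via a default argument, which avoids Python's late-binding closure
# pitfall. A small self-contained illustration with hypothetical paths:
import numpy as np

example_paths = [f"file_{i}.csv" for i in range(10)]  # hypothetical inputs
example_parallelism = 4

example_tasks = []
for chunk in np.array_split(example_paths, example_parallelism):
    if len(chunk) == 0:
        continue
    # Without the `chunk=chunk` default, every lambda would see the last chunk.
    example_tasks.append(lambda chunk=chunk: list(chunk))

for task in example_tasks:
    print(task())  # four groups of two or three paths each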
def _get_block_metadata(
    self,
    paths: List[str],
    schema: Optional[Union[type, "pyarrow.lib.Schema"]],
    *,
    rows_per_file: Optional[int],
    file_sizes: List[Optional[int]],
) -> BlockMetadata:
    """Estimate a BlockMetadata from a fixed rows-per-file count and file sizes."""
    if rows_per_file is None:
        num_rows = None
    else:
        num_rows = len(paths) * rows_per_file
    return BlockMetadata(
        num_rows=num_rows,
        size_bytes=None if None in file_sizes else sum(file_sizes),
        schema=schema,
        input_files=paths,
        exec_stats=None,
    )  # Exec stats filled in later.
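
# A quick standalone check of the estimation logic above: a single unknown
# file size (None) makes the whole size_bytes estimate unknown, while num_rows
# depends only on the path count. The inputs below are made up.
from typing import List, Optional

example_paths = ["a.bin", "b.bin", "c.bin"]
example_rows_per_file: Optional[int] = 1000
example_file_sizes: List[Optional[int]] = [2048, None, 4096]

example_num_rows = (
    None if example_rows_per_file is None
    else len(example_paths) * example_rows_per_file
)
example_size_bytes = (
    None if None in example_file_sizes else sum(example_file_sizes)
)
print(example_num_rows, example_size_bytes)  # 3000 None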
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_infos = _expand_paths(paths, filesystem)
    file_sizes = [file_info.size for file_info in file_infos]

    read_stream = self._read_stream

    filesystem = _wrap_s3_serialization_workaround(filesystem)

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            with fs.open_input_stream(read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue
        if self._rows_per_file() is None:
            num_rows = None
        else:
            num_rows = len(read_paths) * self._rows_per_file()
        meta = BlockMetadata(
            num_rows=num_rows,
            size_bytes=sum(file_sizes),
            schema=schema,
            input_files=read_paths,
            exec_stats=None,
        )  # Exec stats filled in later.
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks
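
# The streaming read path above accumulates rows in an output buffer and
# yields a block whenever the buffer reaches its target size, then flushes the
# remainder on finalize. Below is a simplified, generic stand-in for that
# pattern; it is an illustrative sketch, not Ray's BlockOutputBuffer.
from typing import Iterable, Iterator, List


def buffered_blocks(records: Iterable[int],
                    target_block_size: int) -> Iterator[List[int]]:
    buffer: List[int] = []
    for record in records:
        buffer.append(record)
        if len(buffer) >= target_block_size:  # analogous to has_next()/next()
            yield buffer
            buffer = []
    if buffer:  # analogous to finalize(): flush whatever is left
        yield buffer


print(list(buffered_blocks(range(7), target_block_size=3)))
# [[0, 1, 2], [3, 4, 5], [6]]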