def for_block(block: Block) -> "BlockAccessor[T]":
    """Create a block accessor for the given block."""
    _check_pyarrow_version()
    import pandas
    import pyarrow

    if isinstance(block, pyarrow.Table):
        from ray.data._internal.arrow_block import ArrowBlockAccessor

        return ArrowBlockAccessor(block)
    elif isinstance(block, pandas.DataFrame):
        from ray.data._internal.pandas_block import PandasBlockAccessor

        return PandasBlockAccessor(block)
    elif isinstance(block, bytes):
        from ray.data._internal.arrow_block import ArrowBlockAccessor

        return ArrowBlockAccessor.from_bytes(block)
    elif isinstance(block, list):
        from ray.data._internal.simple_block import SimpleBlockAccessor

        return SimpleBlockAccessor(block)
    else:
        raise TypeError("Not a block type: {} ({})".format(block, type(block)))
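# Usage sketch: assuming the dispatcher above is exposed as the
# BlockAccessor.for_block classmethod in ray.data.block, each supported
# block format gets its own accessor with a common interface. The
# _example_* helper below is purely illustrative.
def _example_for_block_dispatch():
    import pandas as pd
    import pyarrow as pa

    from ray.data.block import BlockAccessor

    arrow_acc = BlockAccessor.for_block(pa.table({"a": [1, 2, 3]}))
    pandas_acc = BlockAccessor.for_block(pd.DataFrame({"a": [1, 2, 3]}))
    simple_acc = BlockAccessor.for_block([1, 2, 3])
    # All three accessors answer the same questions regardless of format.
    return arrow_acc.num_rows(), pandas_acc.num_rows(), simple_acc.num_rows()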
def __init__(
    self,
    delegate: FileBasedDatasource,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: PathPartitionFilter = None,
    # TODO(ekl) deprecate this once read fusion is available.
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
):
    _check_pyarrow_version()
    self._delegate = delegate
    self._schema = schema
    self._open_stream_args = open_stream_args
    self._meta_provider = meta_provider
    self._partition_filter = partition_filter
    self._block_udf = _block_udf
    self._reader_args = reader_args
    paths, self._filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    self._paths, self._file_sizes = meta_provider.expand_paths(
        paths, self._filesystem
    )
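# Usage sketch: this reader is normally constructed indirectly by the
# public read APIs rather than by hand. A hypothetical end-to-end call
# (placeholder paths) that exercises the path/filesystem resolution and
# metadata expansion above:
def _example_file_based_read():
    import ray

    # A single path or a list of files and directories is accepted;
    # directories are expanded into individual files by the metadata
    # provider.
    return ray.data.read_csv(["example_dir/", "example.csv"])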
def get_read_tasks(
    self,
    parallelism: int,
) -> List[ReadTask]:
    _check_pyarrow_version()
    import pyarrow

    read_tasks: List[ReadTask] = []
    n = self._n
    num_columns = self._num_columns
    block_size = max(1, n // parallelism)

    def make_block(count: int, num_columns: int) -> Block:
        return pyarrow.Table.from_arrays(
            np.random.randint(
                np.iinfo(np.int64).max, size=(num_columns, count), dtype=np.int64
            ),
            names=[f"c_{i}" for i in range(num_columns)],
        )

    schema = pyarrow.Table.from_pydict(
        {f"c_{i}": [0] for i in range(num_columns)}
    ).schema

    i = 0
    while i < n:
        count = min(block_size, n - i)
        meta = BlockMetadata(
            num_rows=count,
            size_bytes=8 * count * num_columns,
            schema=schema,
            input_files=None,
            exec_stats=None,
        )
        read_tasks.append(
            ReadTask(
                lambda count=count, num_columns=num_columns: [
                    make_block(count, num_columns)
                ],
                meta,
            )
        )
        i += block_size

    return read_tasks
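# Side note on the `lambda count=count, num_columns=num_columns` pattern
# above: Python closures capture variables by reference, so without the
# default-argument binding every ReadTask would see the values from the
# final loop iteration. A minimal, Ray-free illustration:
def _example_late_binding_closures():
    naive = [lambda: i for i in range(3)]
    bound = [lambda i=i: i for i in range(3)]
    assert [f() for f in naive] == [2, 2, 2]
    assert [f() for f in bound] == [0, 1, 2]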
def prepare_read(
    self,
    parallelism: int,
    n: int,
    block_format: str = "list",
    tensor_shape: Tuple = (1,),
) -> List[ReadTask]:
    read_tasks: List[ReadTask] = []
    block_size = max(1, n // parallelism)

    # Example of a read task. In a real datasource, this would pull data
    # from an external system instead of generating dummy data.
    def make_block(start: int, count: int) -> Block:
        if block_format == "arrow":
            import pyarrow as pa

            return pa.Table.from_arrays(
                [np.arange(start, start + count)], names=["value"]
            )
        elif block_format == "tensor":
            import pyarrow as pa

            tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                np.arange(start, start + count),
                tuple(range(1, 1 + len(tensor_shape))),
            )
            return BlockAccessor.batch_to_block(tensor)
        else:
            return list(builtins.range(start, start + count))

    i = 0
    while i < n:
        count = min(block_size, n - i)
        if block_format == "arrow":
            _check_pyarrow_version()
            import pyarrow as pa

            schema = pa.Table.from_pydict({"value": [0]}).schema
        elif block_format == "tensor":
            _check_pyarrow_version()
            import pyarrow as pa

            tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                np.arange(0, 10), tuple(range(1, 1 + len(tensor_shape)))
            )
            schema = BlockAccessor.batch_to_block(tensor).schema
        elif block_format == "list":
            schema = int
        else:
            raise ValueError("Unsupported block type", block_format)
        if block_format == "tensor":
            element_size = np.product(tensor_shape)
        else:
            element_size = 1
        meta = BlockMetadata(
            num_rows=count,
            size_bytes=8 * count * element_size,
            schema=schema,
            input_files=None,
            exec_stats=None,
        )
        read_tasks.append(
            ReadTask(lambda i=i, count=count: [make_block(i, count)], meta)
        )
        i += block_size

    return read_tasks
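# Usage sketch: assuming this datasource backs the public range helpers,
# the "list" and "tensor" block formats above roughly correspond to the
# calls below (the "arrow" format is exposed by a similar table-based
# range helper). Purely illustrative.
def _example_range_datasets():
    import ray

    simple = ray.data.range(1000, parallelism=4)        # "list" blocks
    tensor = ray.data.range_tensor(1000, shape=(2, 2))  # "tensor" blocks
    return simple, tensor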
def __init__(
    self,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
):
    _check_pyarrow_version()
    import pyarrow as pa
    import pyarrow.parquet as pq

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    try:
        pq_ds = pq.ParquetDataset(
            paths,
            **dataset_kwargs,
            filesystem=filesystem,
            use_legacy_dataset=False,
        )
    except OSError as e:
        _handle_read_os_error(e, paths)
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema(
            [schema.field(column) for column in columns], schema.metadata
        )

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            logger.debug(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True,
            )
            inferred_schema = schema
    else:
        inferred_schema = schema

    try:
        self._metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
    except OSError as e:
        _handle_read_os_error(e, paths)
    self._pq_ds = pq_ds
    self._meta_provider = meta_provider
    self._inferred_schema = inferred_schema
    self._block_udf = _block_udf
    self._reader_args = reader_args
    self._columns = columns
    self._schema = schema
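# Usage sketch: this reader backs ray.data.read_parquet. A hypothetical
# call (placeholder path and column names) that exercises the column
# projection and schema inference above:
def _example_read_parquet():
    import ray

    ds = ray.data.read_parquet("example_data/", columns=["x", "y"])
    return ds.schema()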
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    columns: Optional[List[str]] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a Parquet file-based datasource."""
    # NOTE: We override the base class FileBasedDatasource.prepare_read
    # method in order to leverage pyarrow's ParquetDataset abstraction,
    # which simplifies partitioning logic. We still use
    # FileBasedDatasource's write side (do_write), however.
    _check_pyarrow_version()
    from ray import cloudpickle
    import pyarrow as pa
    import pyarrow.parquet as pq
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    if len(paths) == 1:
        paths = paths[0]

    dataset_kwargs = reader_args.pop("dataset_kwargs", {})
    pq_ds = pq.ParquetDataset(
        paths,
        **dataset_kwargs,
        filesystem=filesystem,
        use_legacy_dataset=False,
    )
    if schema is None:
        schema = pq_ds.schema
    if columns:
        schema = pa.schema(
            [schema.field(column) for column in columns], schema.metadata
        )

    def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
        # Implicitly trigger S3 subsystem initialization by importing
        # pyarrow.fs.
        import pyarrow.fs  # noqa: F401

        # Deserialize after loading the filesystem class.
        try:
            _register_parquet_file_fragment_serialization()
            pieces: List[
                "pyarrow._dataset.ParquetFileFragment"
            ] = cloudpickle.loads(serialized_pieces)
        finally:
            _deregister_parquet_file_fragment_serialization()

        # Ensure that we're reading at least one dataset fragment.
        assert len(pieces) > 0

        from pyarrow.dataset import _get_partition_keys

        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf,
            target_max_block_size=ctx.target_max_block_size,
        )

        logger.debug(f"Reading {len(pieces)} parquet pieces")
        use_threads = reader_args.pop("use_threads", False)
        for piece in pieces:
            part = _get_partition_keys(piece.partition_expression)
            batches = piece.to_batches(
                use_threads=use_threads,
                columns=columns,
                schema=schema,
                batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                **reader_args,
            )
            for batch in batches:
                table = pa.Table.from_batches([batch], schema=schema)
                if part:
                    for col, value in part.items():
                        table = table.set_column(
                            table.schema.get_field_index(col),
                            col,
                            pa.array([value] * len(table)),
                        )
                # If the table is empty, drop it.
                if table.num_rows > 0:
                    output_buffer.add_block(table)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    if _block_udf is not None:
        # Try to infer dataset schema by passing dummy table through UDF.
        dummy_table = schema.empty_table()
        try:
            inferred_schema = _block_udf(dummy_table).schema
            inferred_schema = inferred_schema.with_metadata(schema.metadata)
        except Exception:
            logger.debug(
                "Failed to infer schema of dataset by passing dummy table "
                "through UDF due to the following exception:",
                exc_info=True,
            )
            inferred_schema = schema
    else:
        inferred_schema = schema

    read_tasks = []
    metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
    try:
        _register_parquet_file_fragment_serialization()
        for pieces, metadata in zip(
            np.array_split(pq_ds.pieces, parallelism),
            np.array_split(metadata, parallelism),
        ):
            if len(pieces) <= 0:
                continue
            serialized_pieces = cloudpickle.dumps(pieces)
            input_files = [p.path for p in pieces]
            meta = meta_provider(
                input_files,
                inferred_schema,
                pieces=pieces,
                prefetched_metadata=metadata,
            )
            read_tasks.append(
                ReadTask(lambda p=serialized_pieces: read_pieces(p), meta)
            )
    finally:
        _deregister_parquet_file_fragment_serialization()

    return read_tasks
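# Sketch of how np.array_split distributes pieces across read tasks:
# chunk sizes differ by at most one, and empty chunks (when parallelism
# exceeds the piece count) are skipped by the `len(pieces) <= 0` check
# above. Plain ints stand in for the real ParquetFileFragment objects.
def _example_piece_splitting():
    import numpy as np

    pieces = list(range(10))
    chunks = [list(c) for c in np.array_split(pieces, 4)]
    assert chunks == [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]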
def prepare_read(
    self,
    parallelism: int,
    paths: Union[str, List[str]],
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
    open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: PathPartitionFilter = None,
    # TODO(ekl) deprecate this once read fusion is available.
    _block_udf: Optional[Callable[[Block], Block]] = None,
    **reader_args,
) -> List[ReadTask]:
    """Creates and returns read tasks for a file-based datasource."""
    _check_pyarrow_version()
    import numpy as np

    paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
    paths, file_sizes = meta_provider.expand_paths(paths, filesystem)
    if partition_filter is not None:
        filtered_paths = partition_filter(paths)
        if not filtered_paths:
            raise ValueError(
                "All provided and expanded paths have been filtered out by "
                "the path filter; please change the provided paths or the "
                f"path filter.\nPaths: {paths}\nFilter: {partition_filter}"
            )
        paths = filtered_paths

    read_stream = self._read_stream
    filesystem = _wrap_s3_serialization_workaround(filesystem)
    read_options = reader_args.get("read_options")
    if read_options is not None:
        import pyarrow.json as pajson

        if isinstance(read_options, pajson.ReadOptions):
            _register_arrow_json_readoptions_serializer()

    if open_stream_args is None:
        open_stream_args = {}

    def read_files(
        read_paths: List[str],
        fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
    ) -> Iterable[Block]:
        logger.debug(f"Reading {len(read_paths)} files.")
        if isinstance(fs, _S3FileSystemWrapper):
            fs = fs.unwrap()
        ctx = DatasetContext.get_current()
        output_buffer = BlockOutputBuffer(
            block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
        )
        for read_path in read_paths:
            compression = open_stream_args.pop("compression", None)
            if compression is None:
                import pyarrow as pa

                try:
                    # If no compression manually given, try to detect
                    # compression codec from path.
                    compression = pa.Codec.detect(read_path).name
                except (ValueError, TypeError):
                    # Arrow's compression inference on the file path
                    # doesn't work for Snappy, so we double-check ourselves.
                    import pathlib

                    suffix = pathlib.Path(read_path).suffix
                    if suffix and suffix[1:] == "snappy":
                        compression = "snappy"
                    else:
                        compression = None
            if compression == "snappy":
                # Pass Snappy compression as a reader arg, so datasource
                # subclasses can manually handle streaming decompression in
                # self._read_stream().
                reader_args["compression"] = compression
                reader_args["filesystem"] = fs
            elif compression is not None:
                # Non-Snappy compression, pass as open_input_stream() arg so
                # Arrow can take care of streaming decompression for us.
                open_stream_args["compression"] = compression
            with self._open_input_source(fs, read_path, **open_stream_args) as f:
                for data in read_stream(f, read_path, **reader_args):
                    output_buffer.add_block(data)
                    if output_buffer.has_next():
                        yield output_buffer.next()
        output_buffer.finalize()
        if output_buffer.has_next():
            yield output_buffer.next()

    # fix https://github.com/ray-project/ray/issues/24296
    parallelism = min(parallelism, len(paths))

    read_tasks = []
    for read_paths, file_sizes in zip(
        np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
    ):
        if len(read_paths) <= 0:
            continue

        meta = meta_provider(
            read_paths,
            schema,
            rows_per_file=self._rows_per_file(),
            file_sizes=file_sizes,
        )
        read_task = ReadTask(
            lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
        )
        read_tasks.append(read_task)

    return read_tasks
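# Sketch of the compression-inference logic above, condensed into a
# standalone helper (hypothetical name). Arrow detects most codecs from
# the file extension, but not Snappy, hence the manual ".snappy"
# fallback.
def _example_infer_compression(path: str):
    import pathlib

    import pyarrow as pa

    try:
        return pa.Codec.detect(path).name
    except (ValueError, TypeError):
        return "snappy" if pathlib.Path(path).suffix == ".snappy" else None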