Пример #1
0
    def for_block(block: Block) -> "BlockAccessor[T]":
        """Create a block accessor for the given block."""
        _check_pyarrow_version()
        import pyarrow
        import pandas

        if isinstance(block, pyarrow.Table):
            from ray.data.impl.arrow_block import ArrowBlockAccessor

            return ArrowBlockAccessor(block)
        elif isinstance(block, pandas.DataFrame):
            from ray.data.impl.pandas_block import PandasBlockAccessor

            return PandasBlockAccessor(block)
        elif isinstance(block, bytes):
            from ray.data.impl.arrow_block import ArrowBlockAccessor

            return ArrowBlockAccessor.from_bytes(block)
        elif isinstance(block, list):
            from ray.data.impl.simple_block import SimpleBlockAccessor

            return SimpleBlockAccessor(block)
        else:
            raise TypeError("Not a block type: {} ({})".format(
                block, type(block)))
Пример #2
0
    def prepare_read(self, parallelism: int, n: int,
                     num_columns: int) -> List[ReadTask]:
        _check_pyarrow_version()
        import pyarrow

        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)

        def make_block(count: int, num_columns: int) -> Block:
            return pyarrow.Table.from_arrays(
                np.random.randint(np.iinfo(np.int64).max,
                                  size=(num_columns, count),
                                  dtype=np.int64),
                names=[f"c_{i}" for i in range(num_columns)])

        schema = pyarrow.Table.from_pydict(
            {f"c_{i}": [0]
             for i in range(num_columns)}).schema

        i = 0
        while i < n:
            count = min(block_size, n - i)
            read_tasks.append(
                ReadTask(lambda count=count, num_columns=num_columns:
                         make_block(count, num_columns),
                         BlockMetadata(num_rows=count,
                                       size_bytes=8 * count * num_columns,
                                       schema=schema,
                                       input_files=None)))
            i += block_size

        return read_tasks
Пример #3
0
    def prepare_read(self,
                     parallelism: int,
                     n: int,
                     block_format: str = "list",
                     tensor_shape: Tuple = (1, )) -> List[ReadTask]:
        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)

        # Example of a read task. In a real datasource, this would pull data
        # from an external system instead of generating dummy data.
        def make_block(start: int, count: int) -> Block:
            if block_format == "arrow":
                return pyarrow.Table.from_arrays(
                    [np.arange(start, start + count)], names=["value"])
            elif block_format == "tensor":
                tensor = TensorArray(
                    np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                        np.arange(start, start + count),
                        tuple(range(1, 1 + len(tensor_shape)))))
                return pyarrow.Table.from_pydict({"value": tensor})
            else:
                return list(builtins.range(start, start + count))

        i = 0
        while i < n:
            count = min(block_size, n - i)
            if block_format == "arrow":
                _check_pyarrow_version()
                import pyarrow
                schema = pyarrow.Table.from_pydict({"value": [0]}).schema
            elif block_format == "tensor":
                _check_pyarrow_version()
                from ray.data.extensions import TensorArray
                import pyarrow
                tensor = TensorArray(
                    np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                        np.arange(0, 10), tuple(
                            range(1, 1 + len(tensor_shape)))))
                schema = pyarrow.Table.from_pydict({"value": tensor}).schema
            elif block_format == "list":
                schema = int
            else:
                raise ValueError("Unsupported block type", block_format)
            meta = BlockMetadata(
                num_rows=count,
                size_bytes=8 * count,
                schema=schema,
                input_files=None,
                exec_stats=None)
            read_tasks.append(
                ReadTask(
                    lambda i=i, count=count: [make_block(i, count)], meta))
            i += block_size

        return read_tasks
Пример #4
0
    def for_block(block: Block) -> "BlockAccessor[T]":
        """Create a block accessor for the given block."""
        _check_pyarrow_version()
        import pyarrow

        if isinstance(block, pyarrow.Table):
            from ray.data.impl.arrow_block import \
                ArrowBlockAccessor
            return ArrowBlockAccessor(block)
        elif isinstance(block, list):
            from ray.data.impl.simple_block import \
                SimpleBlockAccessor
            return SimpleBlockAccessor(block)
        elif isinstance(block, np.ndarray):
            from ray.data.impl.tensor_block import \
                TensorBlockAccessor
            return TensorBlockAccessor(block)
        else:
            raise TypeError("Not a block type: {}".format(block))
Пример #5
0
    def prepare_read(self,
                     parallelism: int,
                     paths: Union[str, List[str]],
                     filesystem: Optional["pyarrow.fs.FileSystem"] = None,
                     columns: Optional[List[str]] = None,
                     schema: Optional[Union[type,
                                            "pyarrow.lib.Schema"]] = None,
                     _block_udf: Optional[Callable[[Block], Block]] = None,
                     **reader_args) -> List[ReadTask]:
        """Creates and returns read tasks for a Parquet file-based datasource.
        """
        # NOTE: We override the base class FileBasedDatasource.prepare_read
        # method in order to leverage pyarrow's ParquetDataset abstraction,
        # which simplifies partitioning logic. We still use
        # FileBasedDatasource's write side (do_write), however.
        _check_pyarrow_version()
        from ray import cloudpickle
        import pyarrow as pa
        import pyarrow.parquet as pq
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        if len(paths) == 1:
            paths = paths[0]

        dataset_kwargs = reader_args.pop("dataset_kwargs", {})
        pq_ds = pq.ParquetDataset(paths,
                                  **dataset_kwargs,
                                  filesystem=filesystem,
                                  use_legacy_dataset=False)
        if schema is None:
            schema = pq_ds.schema
        if columns:
            schema = pa.schema([schema.field(column) for column in columns],
                               schema.metadata)

        def read_pieces(serialized_pieces: List[str]):
            # Implicitly trigger S3 subsystem initialization by importing
            # pyarrow.fs.
            import pyarrow.fs  # noqa: F401

            # Deserialize after loading the filesystem class.
            pieces: List["pyarrow._dataset.ParquetFileFragment"] = [
                cloudpickle.loads(p) for p in serialized_pieces
            ]

            # Ensure that we're reading at least one dataset fragment.
            assert len(pieces) > 0

            from pyarrow.dataset import _get_partition_keys

            logger.debug(f"Reading {len(pieces)} parquet pieces")
            use_threads = reader_args.pop("use_threads", False)
            tables = []
            for piece in pieces:
                table = piece.to_table(use_threads=use_threads,
                                       columns=columns,
                                       schema=schema,
                                       **reader_args)
                part = _get_partition_keys(piece.partition_expression)
                if part:
                    for col, value in part.items():
                        table = table.set_column(
                            table.schema.get_field_index(col), col,
                            pa.array([value] * len(table)))
                # If the table is empty, drop it.
                if table.num_rows > 0:
                    tables.append(table)
            if len(tables) > 1:
                table = pa.concat_tables(tables, promote=True)
            elif len(tables) == 1:
                table = tables[0]
            if _block_udf is not None:
                table = _block_udf(table)
            # If len(tables) == 0, all fragments were empty, and we return the
            # empty table from the last fragment.
            return table

        if _block_udf is not None:
            # Try to infer dataset schema by passing dummy table through UDF.
            dummy_table = schema.empty_table()
            try:
                inferred_schema = _block_udf(dummy_table).schema
                inferred_schema = inferred_schema.with_metadata(
                    schema.metadata)
            except Exception:
                logger.info(
                    "Failed to infer schema of dataset by passing dummy table "
                    "through UDF due to the following exception:",
                    exc_info=True)
                inferred_schema = schema
        else:
            inferred_schema = schema
        read_tasks = []
        for pieces in np.array_split(pq_ds.pieces, parallelism):
            if len(pieces) == 0:
                continue
            metadata = _get_metadata(pieces, inferred_schema)
            pieces = [cloudpickle.dumps(p) for p in pieces]
            read_tasks.append(
                ReadTask(lambda pieces_=pieces: read_pieces(pieces_),
                         metadata))

        return read_tasks
Пример #6
0
    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args,
    ) -> List[ReadTask]:
        """Creates and returns read tasks for a file-based datasource."""
        _check_pyarrow_version()
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        paths, file_infos = _expand_paths(paths, filesystem)
        file_sizes = [file_info.size for file_info in file_infos]

        read_stream = self._read_stream

        filesystem = _wrap_s3_serialization_workaround(filesystem)

        if open_stream_args is None:
            open_stream_args = {}

        def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
        ) -> Iterable[Block]:
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )
            for read_path in read_paths:
                with fs.open_input_stream(read_path, **open_stream_args) as f:
                    for data in read_stream(f, read_path, **reader_args):
                        output_buffer.add_block(data)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()

        read_tasks = []
        for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
        ):
            if len(read_paths) <= 0:
                continue

            if self._rows_per_file() is None:
                num_rows = None
            else:
                num_rows = len(read_paths) * self._rows_per_file()
            meta = BlockMetadata(
                num_rows=num_rows,
                size_bytes=sum(file_sizes),
                schema=schema,
                input_files=read_paths,
                exec_stats=None,
            )  # Exec stats filled in later.
            read_task = ReadTask(
                lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
            )
            read_tasks.append(read_task)

        return read_tasks
Пример #7
0
    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
        partition_filter: PathPartitionFilter = None,
        # TODO(ekl) deprecate this once read fusion is available.
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args,
    ) -> List[ReadTask]:
        """Creates and returns read tasks for a file-based datasource."""
        _check_pyarrow_version()
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        paths, file_sizes = meta_provider.expand_paths(paths, filesystem)
        if partition_filter is not None:
            paths = partition_filter(paths)

        read_stream = self._read_stream

        filesystem = _wrap_s3_serialization_workaround(filesystem)

        if open_stream_args is None:
            open_stream_args = {}

        def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
        ) -> Iterable[Block]:
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )
            for read_path in read_paths:
                compression = open_stream_args.pop("compression", None)
                if compression is None:
                    import pyarrow as pa

                    try:
                        # If no compression manually given, try to detect
                        # compression codec from path.
                        compression = pa.Codec.detect(read_path).name
                    except (ValueError, TypeError):
                        # Arrow's compression inference on the file path
                        # doesn't work for Snappy, so we double-check ourselves.
                        import pathlib

                        suffix = pathlib.Path(read_path).suffix
                        if suffix and suffix[1:] == "snappy":
                            compression = "snappy"
                        else:
                            compression = None
                if compression == "snappy":
                    # Pass Snappy compression as a reader arg, so datasource subclasses
                    # can manually handle streaming decompression in
                    # self._read_stream().
                    reader_args["compression"] = compression
                    reader_args["filesystem"] = fs
                elif compression is not None:
                    # Non-Snappy compression, pass as open_input_stream() arg so Arrow
                    # can take care of streaming decompression for us.
                    open_stream_args["compression"] = compression
                with self._open_input_source(fs, read_path, **open_stream_args) as f:
                    for data in read_stream(f, read_path, **reader_args):
                        output_buffer.add_block(data)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()

        # fix https://github.com/ray-project/ray/issues/24296
        parallelism = min(parallelism, len(paths))

        read_tasks = []
        for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
        ):
            if len(read_paths) <= 0:
                continue

            meta = meta_provider(
                read_paths,
                schema,
                rows_per_file=self._rows_per_file(),
                file_sizes=file_sizes,
            )
            read_task = ReadTask(
                lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
            )
            read_tasks.append(read_task)

        return read_tasks
Пример #8
0
    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        columns: Optional[List[str]] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        meta_provider: ParquetMetadataProvider = DefaultParquetMetadataProvider(),
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args,
    ) -> List[ReadTask]:
        """Creates and returns read tasks for a Parquet file-based datasource."""
        # NOTE: We override the base class FileBasedDatasource.prepare_read
        # method in order to leverage pyarrow's ParquetDataset abstraction,
        # which simplifies partitioning logic. We still use
        # FileBasedDatasource's write side (do_write), however.
        _check_pyarrow_version()
        from ray import cloudpickle
        import pyarrow as pa
        import pyarrow.parquet as pq
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        if len(paths) == 1:
            paths = paths[0]

        dataset_kwargs = reader_args.pop("dataset_kwargs", {})
        pq_ds = pq.ParquetDataset(
            paths, **dataset_kwargs, filesystem=filesystem, use_legacy_dataset=False
        )
        if schema is None:
            schema = pq_ds.schema
        if columns:
            schema = pa.schema(
                [schema.field(column) for column in columns], schema.metadata
            )

        def read_pieces(serialized_pieces: str) -> Iterator[pa.Table]:
            # Implicitly trigger S3 subsystem initialization by importing
            # pyarrow.fs.
            import pyarrow.fs  # noqa: F401

            # Deserialize after loading the filesystem class.
            try:
                _register_parquet_file_fragment_serialization()
                pieces: List[
                    "pyarrow._dataset.ParquetFileFragment"
                ] = cloudpickle.loads(serialized_pieces)
            finally:
                _deregister_parquet_file_fragment_serialization()

            # Ensure that we're reading at least one dataset fragment.
            assert len(pieces) > 0

            from pyarrow.dataset import _get_partition_keys

            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )

            logger.debug(f"Reading {len(pieces)} parquet pieces")
            use_threads = reader_args.pop("use_threads", False)
            for piece in pieces:
                part = _get_partition_keys(piece.partition_expression)
                batches = piece.to_batches(
                    use_threads=use_threads,
                    columns=columns,
                    schema=schema,
                    batch_size=PARQUET_READER_ROW_BATCH_SIZE,
                    **reader_args,
                )
                for batch in batches:
                    table = pyarrow.Table.from_batches([batch], schema=schema)
                    if part:
                        for col, value in part.items():
                            table = table.set_column(
                                table.schema.get_field_index(col),
                                col,
                                pa.array([value] * len(table)),
                            )
                    # If the table is empty, drop it.
                    if table.num_rows > 0:
                        output_buffer.add_block(table)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()

        if _block_udf is not None:
            # Try to infer dataset schema by passing dummy table through UDF.
            dummy_table = schema.empty_table()
            try:
                inferred_schema = _block_udf(dummy_table).schema
                inferred_schema = inferred_schema.with_metadata(schema.metadata)
            except Exception:
                logger.debug(
                    "Failed to infer schema of dataset by passing dummy table "
                    "through UDF due to the following exception:",
                    exc_info=True,
                )
                inferred_schema = schema
        else:
            inferred_schema = schema
        read_tasks = []
        metadata = meta_provider.prefetch_file_metadata(pq_ds.pieces) or []
        try:
            _register_parquet_file_fragment_serialization()
            for pieces, metadata in zip(
                np.array_split(pq_ds.pieces, parallelism),
                np.array_split(metadata, parallelism),
            ):
                if len(pieces) <= 0:
                    continue
                serialized_pieces = cloudpickle.dumps(pieces)
                input_files = [p.path for p in pieces]
                meta = meta_provider(
                    input_files,
                    inferred_schema,
                    pieces=pieces,
                    prefetched_metadata=metadata,
                )
                read_tasks.append(
                    ReadTask(lambda p=serialized_pieces: read_pieces(p), meta)
                )
        finally:
            _deregister_parquet_file_fragment_serialization()

        return read_tasks
Пример #9
0
    def prepare_read(
            self,
            parallelism: int,
            paths: Union[str, List[str]],
            filesystem: Optional["pyarrow.fs.FileSystem"] = None,
            schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
            open_stream_args: Optional[Dict[str, Any]] = None,
            _block_udf: Optional[Callable[[Block], Block]] = None,
            **reader_args) -> List[ReadTask]:
        """Creates and returns read tasks for a file-based datasource.
        """
        _check_pyarrow_version()
        import pyarrow as pa
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        paths, file_infos = _expand_paths(paths, filesystem)
        file_sizes = [file_info.size for file_info in file_infos]

        read_file = self._read_file

        filesystem = _wrap_s3_serialization_workaround(filesystem)

        if open_stream_args is None:
            open_stream_args = {}

        def read_files(
                read_paths: List[str],
                fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            builder = DelegatingArrowBlockBuilder()
            for read_path in read_paths:
                with fs.open_input_stream(read_path, **open_stream_args) as f:
                    data = read_file(f, read_path, **reader_args)
                    if isinstance(data, pa.Table) or isinstance(
                            data, np.ndarray):
                        builder.add_block(data)
                    else:
                        builder.add(data)
            block = builder.build()
            if _block_udf is not None:
                block = _block_udf(block)
            return block

        read_tasks = []
        for read_paths, file_sizes in zip(
                np.array_split(paths, parallelism),
                np.array_split(file_sizes, parallelism)):
            if len(read_paths) <= 0:
                continue

            if self._rows_per_file() is None:
                num_rows = None
            else:
                num_rows = len(read_paths) * self._rows_per_file()
            read_task = ReadTask(
                lambda read_paths=read_paths: read_files(
                    read_paths, filesystem),
                BlockMetadata(
                    num_rows=num_rows,
                    size_bytes=sum(file_sizes),
                    schema=schema,
                    input_files=read_paths)
            )
            read_tasks.append(read_task)

        return read_tasks
Пример #10
0
    def prepare_read(
        self,
        parallelism: int,
        paths: Union[str, List[str]],
        filesystem: Optional["pyarrow.fs.FileSystem"] = None,
        schema: Optional[Union[type, "pyarrow.lib.Schema"]] = None,
        open_stream_args: Optional[Dict[str, Any]] = None,
        meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
        # TODO(ekl) deprecate this once read fusion is available.
        _block_udf: Optional[Callable[[Block], Block]] = None,
        **reader_args,
    ) -> List[ReadTask]:
        """Creates and returns read tasks for a file-based datasource."""
        _check_pyarrow_version()
        import numpy as np

        paths, filesystem = _resolve_paths_and_filesystem(paths, filesystem)
        paths, file_sizes = meta_provider.expand_paths(paths, filesystem)

        read_stream = self._read_stream

        filesystem = _wrap_s3_serialization_workaround(filesystem)

        if open_stream_args is None:
            open_stream_args = {}

        def read_files(
            read_paths: List[str],
            fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper],
        ) -> Iterable[Block]:
            logger.debug(f"Reading {len(read_paths)} files.")
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            ctx = DatasetContext.get_current()
            output_buffer = BlockOutputBuffer(
                block_udf=_block_udf, target_max_block_size=ctx.target_max_block_size
            )
            for read_path in read_paths:
                with fs.open_input_stream(read_path, **open_stream_args) as f:
                    for data in read_stream(f, read_path, **reader_args):
                        output_buffer.add_block(data)
                        if output_buffer.has_next():
                            yield output_buffer.next()
            output_buffer.finalize()
            if output_buffer.has_next():
                yield output_buffer.next()

        read_tasks = []
        for read_paths, file_sizes in zip(
            np.array_split(paths, parallelism), np.array_split(file_sizes, parallelism)
        ):
            if len(read_paths) <= 0:
                continue

            meta = meta_provider(
                read_paths,
                schema,
                rows_per_file=self._rows_per_file(),
                file_sizes=file_sizes,
            )
            read_task = ReadTask(
                lambda read_paths=read_paths: read_files(read_paths, filesystem), meta
            )
            read_tasks.append(read_task)

        return read_tasks