def read_csv(
    paths: Union[str, List[str]],
    *,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: Optional[
        PathPartitionFilter
    ] = CSVDatasource.file_extension_filter(),
    **arrow_csv_args,
) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from csv files.

    Examples:
        >>> import ray
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_csv("s3://bucket/path")  # doctest: +SKIP

        >>> # Read multiple local files.
        >>> ray.data.read_csv(["/path/to/file1", "/path/to/file2"])  # doctest: +SKIP

        >>> # Read multiple directories.
        >>> ray.data.read_csv(  # doctest: +SKIP
        ...     ["s3://bucket/path1", "s3://bucket/path2"])

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation to read from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        arrow_open_stream_args: kwargs passed to
            pyarrow.fs.FileSystem.open_input_stream
        meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        partition_filter: Path-based partition filter, if any. Can be used
            with a custom callback to read only selected partitions of a
            dataset. By default, this filters out any file paths whose file
            extension does not match "*.csv*".
        arrow_csv_args: Other csv read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    return read_datasource(
        CSVDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        ray_remote_args=ray_remote_args,
        open_stream_args=arrow_open_stream_args,
        meta_provider=meta_provider,
        partition_filter=partition_filter,
        **arrow_csv_args,
    )
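# Illustrative usage sketch (not part of the module; the bucket path is a
# placeholder). Since ``arrow_csv_args`` is forwarded to pyarrow's CSV reader,
# pyarrow parse options such as a custom delimiter can typically be passed
# straight through:
#
#     >>> import ray
#     >>> from pyarrow import csv
#     >>> ray.data.read_csv(  # doctest: +SKIP
#     ...     "s3://bucket/path",
#     ...     parse_options=csv.ParseOptions(delimiter="\t"))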
def read_text(
    paths: Union[str, List[str]],
    *,
    encoding: str = "utf-8",
    errors: str = "ignore",
    drop_empty_lines: bool = True,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    parallelism: int = 200,
    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: Optional[PathPartitionFilter] = None,
) -> Dataset[str]:
    """Create a dataset from lines stored in text files.

    Examples:
        >>> import ray
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_text("s3://bucket/path")  # doctest: +SKIP

        >>> # Read multiple local files.
        >>> ray.data.read_text(["/path/to/file1", "/path/to/file2"])  # doctest: +SKIP

    Args:
        paths: A single file path or a list of file paths (or directories).
        encoding: The encoding of the files (e.g., "utf-8" or "ascii").
        errors: What to do with errors on decoding. Specify either "strict",
            "ignore", or "replace". Defaults to "ignore".
        drop_empty_lines: Whether to drop lines that are empty or contain
            only whitespace.
        filesystem: The filesystem implementation to read from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        arrow_open_stream_args: kwargs passed to
            pyarrow.fs.FileSystem.open_input_stream
        meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        partition_filter: Path-based partition filter, if any. Can be used
            with a custom callback to read only selected partitions of a
            dataset. By default, this does not filter out any files.

    Returns:
        Dataset holding lines of text read from the specified paths.
    """

    def to_text(s):
        lines = s.decode(encoding).split("\n")
        if drop_empty_lines:
            lines = [line for line in lines if line.strip() != ""]
        return lines

    return read_binary_files(
        paths,
        filesystem=filesystem,
        parallelism=parallelism,
        arrow_open_stream_args=arrow_open_stream_args,
        meta_provider=meta_provider,
        partition_filter=partition_filter,
    ).flat_map(to_text)
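# Illustrative usage sketch (not part of the module; the path is a
# placeholder). Non-UTF-8 text can be read by overriding ``encoding`` and
# ``errors``, which control how each file's bytes are decoded before being
# split into lines:
#
#     >>> import ray
#     >>> ray.data.read_text(  # doctest: +SKIP
#     ...     "/path/to/latin1_logs",
#     ...     encoding="latin-1",
#     ...     errors="replace")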
def read_binary_files(
    paths: Union[str, List[str]],
    *,
    include_paths: bool = False,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    parallelism: int = -1,
    ray_remote_args: Dict[str, Any] = None,
    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: Optional[PathPartitionFilter] = None,
) -> Dataset[Union[Tuple[str, bytes], bytes]]:
    """Create a dataset from binary files of arbitrary contents.

    Examples:
        >>> import ray
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_binary_files("s3://bucket/path")  # doctest: +SKIP

        >>> # Read multiple local files.
        >>> ray.data.read_binary_files(  # doctest: +SKIP
        ...     ["/path/to/file1", "/path/to/file2"])

    Args:
        paths: A single file path or a list of file paths (or directories).
        include_paths: Whether to include the full path of the file in the
            dataset records. When specified, the dataset records will be a
            tuple of the file path and the file contents.
        filesystem: The filesystem implementation to read from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        arrow_open_stream_args: kwargs passed to
            pyarrow.fs.FileSystem.open_input_stream
        meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        partition_filter: Path-based partition filter, if any. Can be used
            with a custom callback to read only selected partitions of a
            dataset. By default, this does not filter out any files.

    Returns:
        Dataset holding the file contents as bytes read from the specified
        paths, or (path, bytes) tuples if include_paths is set.
    """
    return read_datasource(
        BinaryDatasource(),
        parallelism=parallelism,
        paths=paths,
        include_paths=include_paths,
        filesystem=filesystem,
        ray_remote_args=ray_remote_args,
        open_stream_args=arrow_open_stream_args,
        schema=bytes,
        meta_provider=meta_provider,
        partition_filter=partition_filter,
    )
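# Illustrative usage sketch (not part of the module; the path is a
# placeholder). With ``include_paths=True`` each record is a (path, bytes)
# tuple, which makes it easy to keep track of which file each payload came
# from:
#
#     >>> import ray
#     >>> ds = ray.data.read_binary_files(  # doctest: +SKIP
#     ...     "s3://bucket/images", include_paths=True)
#     >>> ds.map(lambda rec: (rec[0], len(rec[1])))  # doctest: +SKIP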
def read_json(
    paths: Union[str, List[str]],
    *,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    **arrow_json_args,
) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from json files.

    Examples:
        >>> import ray
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_json("s3://bucket/path")  # doctest: +SKIP

        >>> # Read multiple local files.
        >>> ray.data.read_json(["/path/to/file1", "/path/to/file2"])  # doctest: +SKIP

        >>> # Read multiple directories.
        >>> ray.data.read_json(  # doctest: +SKIP
        ...     ["s3://bucket/path1", "s3://bucket/path2"])

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation to read from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        arrow_open_stream_args: kwargs passed to
            pyarrow.fs.FileSystem.open_input_stream
        meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        arrow_json_args: Other json read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    return read_datasource(
        JSONDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        ray_remote_args=ray_remote_args,
        open_stream_args=arrow_open_stream_args,
        meta_provider=meta_provider,
        **arrow_json_args,
    )
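# Illustrative usage sketch (not part of the module; the bucket path is a
# placeholder). Since ``arrow_json_args`` is forwarded to pyarrow's JSON
# reader, options such as a larger read block size can typically be passed
# straight through:
#
#     >>> import ray
#     >>> from pyarrow import json
#     >>> ray.data.read_json(  # doctest: +SKIP
#     ...     "s3://bucket/path",
#     ...     read_options=json.ReadOptions(block_size=4 * 1024 * 1024))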
def test_default_file_metadata_provider(caplog, fs, data_path, endpoint_url):
    storage_options = (
        {}
        if endpoint_url is None
        else dict(client_kwargs=dict(endpoint_url=endpoint_url))
    )

    path_module = os.path if urllib.parse.urlparse(data_path).scheme else posixpath
    path1 = path_module.join(data_path, "test1.csv")
    path2 = path_module.join(data_path, "test2.csv")
    paths = [path1, path2]
    paths, fs = _resolve_paths_and_filesystem(paths, fs)

    # Write two small CSV files for the metadata provider to inspect.
    df1 = pd.DataFrame({"one": [1, 2, 3], "two": ["a", "b", "c"]})
    df1.to_csv(path1, index=False, storage_options=storage_options)
    df2 = pd.DataFrame({"one": [4, 5, 6], "two": ["e", "f", "g"]})
    df2.to_csv(path2, index=False, storage_options=storage_options)

    meta_provider = DefaultFileMetadataProvider()
    with caplog.at_level(logging.WARNING):
        file_paths, file_sizes = meta_provider.expand_paths(paths, fs)
    # Path expansion should log a hint pointing at FastFileMetadataProvider.
    assert "meta_provider=FastFileMetadataProvider()" in caplog.text

    assert file_paths == paths
    expected_file_sizes = _get_file_sizes_bytes(paths, fs)
    assert file_sizes == expected_file_sizes

    # The provider is callable and aggregates the per-file metadata into a
    # single block metadata object.
    meta = meta_provider(
        paths,
        None,
        rows_per_file=3,
        file_sizes=file_sizes,
    )
    assert meta.size_bytes == sum(expected_file_sizes)
    assert meta.num_rows == 6
    assert len(paths) == 2
    assert all(path in meta.input_files for path in paths)
    assert meta.schema is None
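# Illustrative sketch (not part of the test; the path is a placeholder). The
# warning asserted above recommends FastFileMetadataProvider, a lighter-weight
# provider that speeds up path expansion, which a user would opt into by
# passing it to the read APIs:
#
#     >>> import ray
#     >>> from ray.data.datasource import FastFileMetadataProvider
#     >>> ray.data.read_csv(  # doctest: +SKIP
#     ...     "s3://bucket/path",
#     ...     meta_provider=FastFileMetadataProvider())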