def read_json(
    paths: Union[str, List[str]],
    *,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    parallelism: int = 200,
    # Fixed annotation: default is None, so the type must be Optional.
    ray_remote_args: Optional[Dict[str, Any]] = None,
    arrow_open_stream_args: Optional[Dict[str, Any]] = None,
    # NOTE(review): these two defaults are evaluated once at `def` time and
    # shared across all calls — safe only if the provider/filter objects are
    # stateless; confirm against their implementations.
    meta_provider: BaseFileMetadataProvider = DefaultFileMetadataProvider(),
    partition_filter: Optional[
        PathPartitionFilter
    ] = JSONDatasource.file_extension_filter(),
    **arrow_json_args,
) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from json files.

    Examples:
        >>> import ray
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_json("s3://bucket/path") # doctest: +SKIP

        >>> # Read multiple local files.
        >>> ray.data.read_json(["/path/to/file1", "/path/to/file2"]) # doctest: +SKIP

        >>> # Read multiple directories.
        >>> ray.data.read_json( # doctest: +SKIP
        ...     ["s3://bucket/path1", "s3://bucket/path2"])

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation to read from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the number of files of the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        arrow_open_stream_args: kwargs passed to
            pyarrow.fs.FileSystem.open_input_stream
        meta_provider: File metadata provider. Custom metadata providers may
            be able to resolve file metadata more quickly and/or accurately.
        partition_filter: Path-based partition filter, if any. Can be used
            with a custom callback to read only selected partitions of a
            dataset. By default, this filters out any file paths whose file
            extension does not match "*.json*".
        arrow_json_args: Other json read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    # Thin wrapper: all real work (file listing, task scheduling, parsing)
    # happens inside read_datasource with a JSON-specific datasource.
    return read_datasource(
        JSONDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        ray_remote_args=ray_remote_args,
        open_stream_args=arrow_open_stream_args,
        meta_provider=meta_provider,
        partition_filter=partition_filter,
        **arrow_json_args,
    )
# NOTE(review): this is a second `def read_json` in the same view — at module
# level it would shadow the earlier definition. Looks like a duplicated chunk
# from another file revision; confirm which version belongs here.
def read_json(
    paths: Union[str, List[str]],
    *,
    filesystem: Optional["pyarrow.fs.FileSystem"] = None,
    parallelism: int = 200,
    # Fixed annotation: default is None, so the type must be Optional.
    ray_remote_args: Optional[Dict[str, Any]] = None,
    **arrow_json_args,
) -> Dataset[ArrowRow]:
    """Create an Arrow dataset from json files.

    Examples:
        >>> # Read a directory of files in remote storage.
        >>> ray.data.read_json("s3://bucket/path")

        >>> # Read multiple local files.
        >>> ray.data.read_json(["/path/to/file1", "/path/to/file2"])

        >>> # Read multiple directories.
        >>> ray.data.read_json(["s3://bucket/path1", "s3://bucket/path2"])

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation to read from.
        parallelism: The amount of parallelism to use for the dataset.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.
        arrow_json_args: Other json read options to pass to pyarrow.

    Returns:
        Dataset holding Arrow records read from the specified paths.
    """
    # Delegate to the generic datasource reader with the JSON datasource.
    return read_datasource(
        JSONDatasource(),
        parallelism=parallelism,
        paths=paths,
        filesystem=filesystem,
        ray_remote_args=ray_remote_args,
        **arrow_json_args,
    )