def source(path_or_paths, filesystem=None, partitioning=None, format=None):
    """
    Open a (multi-file) data source.

    Parameters
    ----------
    path_or_paths : str, pathlib.Path, or list of those
        Path to a file or to a directory containing the data files, or
        a list of paths.
    filesystem : FileSystem, default None
        By default will be inferred from the path.
    partitioning : Partitioning(Factory) or str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut.
    format : str or FileFormat
        ``"parquet"`` (also used when nothing is passed) or an already
        constructed FileFormat instance.

    Returns
    -------
    FileSystemSourceFactory
        The factory is returned unfinished; see the TODO note in the body.

    Raises
    ------
    ValueError
        If ``format`` is neither "parquet" nor a FileFormat instance.
    """
    fs, selection = _ensure_fs_and_paths(path_or_paths, filesystem)
    scheme = _ensure_partitioning(partitioning)

    # A falsy ``format`` (e.g. None) falls back to the parquet default.
    file_format = format or "parquet"
    if file_format == "parquet":
        file_format = ParquetFileFormat()
    elif not isinstance(file_format, FileFormat):
        raise ValueError(
            "format '{0}' is not supported".format(file_format))

    # TODO pass through options
    factory_options = FileSystemFactoryOptions()
    if isinstance(scheme, PartitioningFactory):
        factory_options.partitioning_factory = scheme
    elif isinstance(scheme, Partitioning):
        factory_options.partitioning = scheme

    # TODO return Source if a specific schema was passed?
    # need to return SourceFactory since `dataset` might need to
    # finish the factory with a unified schema
    return FileSystemSourceFactory(
        fs, selection, file_format, factory_options)
def _ensure_format(obj):
    """
    Coerce a format specification into a FileFormat instance.

    Parameters
    ----------
    obj : FileFormat or str
        An existing FileFormat (returned unchanged) or one of the names
        "parquet", "ipc", "arrow", "feather", "csv" or "orc".

    Returns
    -------
    FileFormat

    Raises
    ------
    ValueError
        If ``obj`` is not one of the recognized values, or if "orc" is
        requested while ``_orc_available`` is false.
    """
    if isinstance(obj, FileFormat):
        return obj
    if obj == "parquet":
        return ParquetFileFormat()
    # Membership is tested against a tuple rather than a set so that an
    # unhashable argument (e.g. a list) reaches the ValueError below
    # instead of failing with "TypeError: unhashable type".
    if obj in ("ipc", "arrow", "feather"):
        return IpcFileFormat()
    if obj == "csv":
        return CsvFileFormat()
    if obj == "orc":
        if not _orc_available:
            raise ValueError(_orc_msg)
        return OrcFileFormat()
    raise ValueError("format '{}' is not supported".format(obj))
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single parquet metadata file.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it
        will not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then
        the filesystem will be inferred from the path. If an URI string
        is passed, then a filesystem object is constructed using the
        URI's optional path component as a directory prefix. Note that
        the URIs on Windows must follow 'file:///C:...' or 'file:/C:...'
        patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If ``format`` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem

    # Reject anything that is not a parquet format up front; a missing
    # format gets a default-constructed ParquetFileFormat.
    if format is not None and not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")
    parquet_format = ParquetFileFormat() if format is None else format

    if filesystem is not None:
        # Only the first element of the returned pair is needed here.
        fs, _ = _ensure_filesystem(filesystem)
    else:
        fs = LocalFileSystem()

    normalized = _normalize_path(fs, _stringify_path(metadata_path))
    return ParquetDatasetFactory(
        normalized, fs, parquet_format).finish(schema)
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single parquet metadata file.
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it
        will not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then
        the filesystem will be inferred from the path. If an URI string
        is passed, then a filesystem object is constructed using the
        URI's optional path component as a directory prefix. Note that
        the URIs on Windows must follow 'file:///C:...' or 'file:/C:...'
        patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a
        list of field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning
        discovery. The ignored files will still be part of the Dataset,
        but will not have partition information.

    Returns
    -------
    FileSystemDataset

    Raises
    ------
    ValueError
        If ``format`` is given but is not a ParquetFileFormat.
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    # Reject anything that is not a parquet format up front; a missing
    # format gets a default-constructed ParquetFileFormat.
    if format is not None and not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")
    parquet_format = ParquetFileFormat() if format is None else format

    if filesystem is not None:
        fs = _ensure_filesystem(filesystem)
    else:
        fs = LocalFileSystem()

    normalized = fs.normalize_path(_stringify_path(metadata_path))
    factory_options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning))
    return ParquetDatasetFactory(
        normalized, fs, parquet_format,
        options=factory_options).finish(schema)