def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # First check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = (infos.type != FileType.NotFound)

        if not local_path_exists:
            # Perhaps it's a URI?
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                if "empty scheme" not in str(e):
                    raise
                # ARROW-8213: not a URI, assume local path
                # to get a nice error message.

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
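# A minimal usage sketch for _ensure_fs above (the paths are illustrative,
# not from the source): a plain local path yields a LocalFileSystem, and an
# explicitly passed filesystem is kept, with the path normalized. Paths
# carrying a scheme are routed through FileSystem.from_uri instead.
from pyarrow.fs import LocalFileSystem

fs, path = _ensure_fs(None, "/tmp/example.parquet")
assert isinstance(fs, LocalFileSystem)

fs, path = _ensure_fs(LocalFileSystem(), "/tmp/example.parquet")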
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See
        the examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as shortcut, and with a list of
        field names a DirectoryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem = _ensure_filesystem(filesystem)

    metadata_path = filesystem.normalize_path(_stringify_path(metadata_path))

    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning)
    )

    factory = ParquetDatasetFactory(
        metadata_path, filesystem, format, options=options)
    return factory.finish(schema)
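# A hedged end-to-end sketch of the `_metadata` workflow this function
# expects (paths and column names are illustrative): collect per-file
# metadata while writing, combine it into a single `_metadata` file with
# pyarrow.parquet.write_metadata, then reopen the dataset from that file.
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds

table = pa.table({"a": [1, 2, 3]})
collector = []
pq.write_to_dataset(table, "/tmp/pq_ds", metadata_collector=collector)

# combine the collected row group metadata into one `_metadata` file
pq.write_metadata(table.schema, "/tmp/pq_ds/_metadata",
                  metadata_collector=collector)

dataset = ds.parquet_dataset("/tmp/pq_ds/_metadata")
print(dataset.to_table().to_pydict())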
def test_subtree_filesystem():
    localfs = LocalFileSystem()

    subfs = SubTreeFileSystem('/base', localfs)
    assert subfs.base_path == '/base/'
    assert subfs.base_fs == localfs

    subfs = SubTreeFileSystem('/another/base/', LocalFileSystem())
    assert subfs.base_path == '/another/base/'
    assert subfs.base_fs == localfs
def localfs_with_mmap(request, tempdir):
    return dict(
        fs=LocalFileSystem(use_mmap=True),
        pathfn=lambda p: (tempdir / p).as_posix(),
        allow_move_dir=True,
        allow_append_to_file=True,
    )
def py_localfs(request, tempdir):
    return dict(
        fs=PyFileSystem(ProxyHandler(LocalFileSystem())),
        pathfn=lambda p: (tempdir / p).as_posix(),
        allow_move_dir=True,
        allow_append_to_file=True,
    )
def write_dataset(data, base_dir, format=None, partitioning=None,
                  schema=None, filesystem=None, use_threads=True):
    """
    Write a dataset to a given format and partitioning.

    Parameters
    ----------
    data : Dataset, Table/RecordBatch, or list of Table/RecordBatch
        The data to write. This can be a Dataset instance or in-memory Arrow
        data. A Table or RecordBatch is written as a single fragment
        (resulting in a single file, or multiple files if split according to
        the `partitioning`). If you have a Table consisting of multiple
        record batches, you can pass ``table.to_batches()`` to handle each
        record batch as a separate fragment.
    base_dir : str
        The root directory where to write the dataset.
    format : FileFormat or str
        The format in which to write the dataset. Currently supported:
        "ipc"/"feather". If a FileSystemDataset is being written and `format`
        is not specified, it defaults to the same format as the specified
        FileSystemDataset. When writing a Table or RecordBatch, this keyword
        is required.
    partitioning : Partitioning, optional
        The partitioning scheme specified with the ``partitioning()``
        function.
    schema : Schema, optional
    filesystem : FileSystem, optional
    use_threads : bool, default True
        Write files in parallel. If enabled, maximum parallelism is used, as
        determined by the number of available CPU cores.
    """
    if isinstance(data, Dataset):
        schema = schema or data.schema
        if isinstance(data, FileSystemDataset):
            format = format or data.format
    elif isinstance(data, (pa.Table, pa.RecordBatch)):
        schema = schema or data.schema
        data = [data]
    elif isinstance(data, list):
        schema = schema or data[0].schema
    else:
        raise ValueError(
            "Only Dataset, Table/RecordBatch or a list of Table/RecordBatch "
            "objects are supported."
        )

    format = _ensure_format(format)
    partitioning = _ensure_write_partitioning(partitioning)

    if filesystem is None:
        # fall back to local file system as the default
        from pyarrow.fs import LocalFileSystem
        filesystem = LocalFileSystem()

    filesystem, _ = _ensure_fs(filesystem)

    _filesystemdataset_write(
        data, base_dir, schema, format, filesystem, partitioning, use_threads,
    )
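# A minimal usage sketch for this version of write_dataset (illustrative
# path and data): an in-memory table is written as Feather fragments, with
# the local filesystem inferred because none is passed.
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"part": ["a", "a", "b"], "value": [1, 2, 3]})
ds.write_dataset(table, "/tmp/ipc_ds", format="feather",
                 partitioning=ds.partitioning(
                     pa.schema([("part", pa.string())]), flavor="hive"))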
def test_localfs_options():
    options = LocalFileSystemOptions()
    assert options.use_mmap is False
    options.use_mmap = True
    assert options.use_mmap is True

    with pytest.raises(AttributeError):
        options.xxx = True

    options = LocalFileSystemOptions(use_mmap=True)
    assert options.use_mmap is True

    # LocalFileSystem instantiation
    LocalFileSystem(LocalFileSystemOptions(use_mmap=True))
    LocalFileSystem(use_mmap=False)

    with pytest.raises(AttributeError):
        LocalFileSystem(xxx=False)
def main():
    parser = argparse.ArgumentParser(
        description="Generate sample parquet data")
    parser.add_argument('path', type=str, nargs='?',
                        help='path to save data to',
                        default="./data/data.parquet")
    parser.add_argument(
        '--source', type=str,
        help='local path to import data from '
             '(optional; can be csv, json or parquet)')
    parser.add_argument(
        '--endpoint', type=str,
        help='S3 endpoint '
             '(e.g.: https://s3.eu-de.cloud-object-storage.appdomain.cloud)')
    parser.add_argument('--access_key', type=str, help='S3 access key')
    parser.add_argument('--secret_key', type=str, help='S3 secret key')
    args = parser.parse_args()

    if args.endpoint:
        print("Using S3 file system")
        parsed_endpoint = urlparse(args.endpoint)
        fs = S3FileSystem(endpoint_override=parsed_endpoint.netloc,
                          scheme=parsed_endpoint.scheme,
                          access_key=args.access_key,
                          secret_key=args.secret_key,
                          background_writes=False)
    else:
        print("Using local file system")
        os.makedirs(os.path.dirname(args.path), exist_ok=True)
        fs = LocalFileSystem()

    table = import_table(args.source)
    with fs.open_output_stream(args.path) as f:
        pq.write_table(table, f)
    print("Table written to", args.path)
    print(table.to_pandas())
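# import_table is referenced above but not shown; a plausible sketch (an
# assumption, not the original helper) that dispatches on the file extension
# and falls back to a small generated table when --source is omitted. The
# generated columns are illustrative.
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv, json


def import_table(source=None):
    if source is None:
        # no --source given: generate sample data
        return pa.table({"id": list(range(10)),
                         "value": [i * 0.5 for i in range(10)]})
    if source.endswith(".csv"):
        return csv.read_csv(source)
    if source.endswith(".json"):
        return json.read_json(source)
    return pq.read_table(source)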
def _filesystem_for_asset(asset_config: dict):
    connection = asset_config['connection']
    connection_type = connection['type']
    if connection_type == "s3":
        return s3filesystem_from_config(connection["s3"])
    elif connection_type == "localfs":
        return LocalFileSystem()
    elif connection_type == "httpfs":
        return httpfs_from_config()
    raise ValueError(
        "Unsupported connection type: {}".format(connection_type))
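# A hedged sketch of the asset configuration shape the dispatcher above
# expects. The nested "s3" keys are assumptions inferred from the
# connection["s3"] lookup, and s3filesystem_from_config /
# httpfs_from_config are presumed sibling helpers in the same module.
local_asset = {"connection": {"type": "localfs"}}
fs = _filesystem_for_asset(local_asset)  # returns a LocalFileSystem

s3_asset = {
    "connection": {
        "type": "s3",
        "s3": {
            # hypothetical keys consumed by s3filesystem_from_config
            "endpoint": "https://s3.example.com",
            "access_key": "...",
            "secret_key": "...",
        },
    }
}
fs = _filesystem_for_asset(s3_asset)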
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import FileSystem, LocalFileSystem

    if filesystem is None:
        try:
            filesystem, _ = FileSystem.from_uri(path)
        except Exception:
            # when path is not found, we fall back to local file system
            filesystem = LocalFileSystem()
    return filesystem
def _ensure_fs(filesystem, path):
    # Validate or infer the filesystem from the path
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            return FileSystem.from_uri(path)

        if infos.type == FileType.NotFound:
            return FileSystem.from_uri(path)

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
def test_read_pandas_passthrough_keywords(tempdir):
    # ARROW-11464 - previously not all keywords were passed through (such as
    # the filesystem keyword)
    df = pd.DataFrame({'a': [1, 2, 3]})

    filename = tempdir / 'data.parquet'
    _write_table(df, filename)

    result = pq.read_pandas(
        'data.parquet',
        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
    )
    assert result.equals(pa.table(df))
def test_py_filesystem_equality():
    handler1 = DummyHandler(1)
    handler2 = DummyHandler(2)
    handler3 = DummyHandler(2)
    fs1 = PyFileSystem(handler1)
    fs2 = PyFileSystem(handler1)
    fs3 = PyFileSystem(handler2)
    fs4 = PyFileSystem(handler3)

    assert fs2 is not fs1
    assert fs3 is not fs2
    assert fs4 is not fs3
    assert fs2 == fs1  # Same handler
    assert fs3 != fs2  # Unequal handlers
    assert fs4 == fs3  # Equal handlers

    assert fs1 != LocalFileSystem()
    assert fs1 != object()
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path,
        Path pointing to a single file parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If a URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See
        the examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import (
        LocalFileSystem, _ensure_filesystem, _normalize_path)

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem,
                                    _stringify_path(metadata_path))
    factory = ParquetDatasetFactory(metadata_path, filesystem, format)
    return factory.finish(schema)
class FakeHadoopFileSystem:
    def __init__(self, *args, **kwargs):
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        from pyarrow.fs import FileSelector

        if isinstance(path, FileSelector):
            return FileSelector(
                os.fspath(self._root / path.base_dir.lstrip("/")),
                path.allow_not_found,
                path.recursive,
            )
        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path):
        return self._fs.create_dir(self._path(path))

    def open_input_stream(self, path):
        return self._fs.open_input_stream(self._path(path))

    def open_output_stream(self, path):
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        self.create_dir(posixpath.dirname(path))
        return self._fs.open_output_stream(self._path(path))

    def get_file_info(self, path):
        return self._fs.get_file_info(self._path(path))

    def move(self, from_path, to_path):
        self._fs.move(self._path(from_path), self._path(to_path))

    def delete_file(self, path):
        self._fs.delete_file(self._path(path))
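# A hedged sketch of how such a fake is typically wired into a test:
# monkeypatching pyarrow.fs.HadoopFileSystem (pytest's monkeypatch fixture
# assumed) so HDFS-targeting code transparently operates under the temporary
# local root (_hdfs_root above).
import pyarrow.fs


def test_roundtrip_with_fake_hdfs(monkeypatch):
    monkeypatch.setattr(pyarrow.fs, "HadoopFileSystem", FakeHadoopFileSystem)
    fs = pyarrow.fs.HadoopFileSystem("default")
    with fs.open_output_stream("/data/example.txt") as f:
        f.write(b"hello")
    with fs.open_input_stream("/data/example.txt") as f:
        assert f.read() == b"hello"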
def test_filesystem_equals():
    fs0 = LocalFileSystem()
    fs1 = LocalFileSystem()
    fs2 = _MockFileSystem()

    assert fs0.equals(fs0)
    assert fs0.equals(fs1)
    with pytest.raises(TypeError):
        fs0.equals('string')
    assert fs0 == fs0 == fs1
    assert fs0 != 4

    assert fs2 == fs2
    assert fs2 != _MockFileSystem()

    assert SubTreeFileSystem('/base', fs0) == SubTreeFileSystem('/base', fs0)
    assert SubTreeFileSystem('/base', fs0) != SubTreeFileSystem('/base', fs2)
    assert SubTreeFileSystem('/base', fs0) != SubTreeFileSystem('/other', fs0)
def __init__(self, *args, **kwargs):
    from pyarrow.fs import LocalFileSystem

    self._root = Path(_hdfs_root.name)
    self._fs = LocalFileSystem()
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Treat a list of paths as files belonging to a single file system.

    If the file system is local then this also validates that all paths
    reference existing *files*; otherwise any non-file paths will be silently
    skipped (for example on a remote filesystem).

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available
        or not a file.
    """
    from pyarrow.fs import (
        LocalFileSystem, SubTreeFileSystem, _MockFileSystem, FileType,
        _ensure_filesystem
    )

    if filesystem is None:
        # fall back to local file system as the default
        filesystem = LocalFileSystem()
    else:
        # construct a filesystem if it is a valid URI
        filesystem = _ensure_filesystem(filesystem)

    is_local = (
        isinstance(filesystem, (LocalFileSystem, _MockFileSystem)) or
        (isinstance(filesystem, SubTreeFileSystem) and
         isinstance(filesystem.base_fs, LocalFileSystem))
    )

    # allow normalizing irregular paths such as Windows local paths
    paths = [filesystem.normalize_path(_stringify_path(p)) for p in paths]

    # validate that all of the paths are pointing to existing *files*
    # possible improvement is to group the file_infos by type and raise for
    # multiple paths per error category
    if is_local:
        for info in filesystem.get_file_info(paths):
            file_type = info.type
            if file_type == FileType.File:
                continue
            elif file_type == FileType.NotFound:
                raise FileNotFoundError(info.path)
            elif file_type == FileType.Directory:
                raise IsADirectoryError(
                    'Path {} points to a directory, but only file paths are '
                    'supported. To construct a nested or union dataset pass '
                    'a list of dataset objects instead.'.format(info.path)
                )
            else:
                raise IOError(
                    'Path {} exists but its type is unknown (could be a '
                    'special file such as a Unix socket or character device, '
                    'or Windows NUL / CON / ...)'.format(info.path)
                )

    return filesystem, paths
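# A minimal behavior sketch (illustrative paths, Unix-like machine assumed):
# with no filesystem given the local one is inferred, every path must point
# at an existing regular file, and directories raise instead of being
# silently accepted.
import pyarrow as pa
import pyarrow.parquet as pq

pq.write_table(pa.table({"a": [1]}), "/tmp/one.parquet")
pq.write_table(pa.table({"a": [2]}), "/tmp/two.parquet")

fs, paths = _ensure_multiple_sources(["/tmp/one.parquet",
                                      "/tmp/two.parquet"])

try:
    _ensure_multiple_sources(["/tmp"])
except IsADirectoryError as exc:
    print(exc)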
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single
    file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If a URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file
        or an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, FileSelector)

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one:
    # first check if the file exists as a local (relative) file path,
    # if not then try to parse the path as a URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is a URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither a URI nor a locally existing path,
                # so assume that a local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
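# A minimal behavior sketch (illustrative paths, Unix-like machine assumed):
# an existing directory becomes a recursive FileSelector, an existing file
# becomes a single-item list, and a "file://" URI is resolved through
# FileSystem.from_uri.
from pyarrow.fs import FileSelector

fs, selector = _ensure_single_source("/tmp")
assert isinstance(selector, FileSelector) and selector.recursive

fs, selector = _ensure_single_source("file:///tmp")
assert isinstance(selector, FileSelector)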
def test_localfs_options():
    # LocalFileSystem instantiation
    LocalFileSystem(use_mmap=False)

    with pytest.raises(TypeError):
        LocalFileSystem(xxx=False)
def test_type_name():
    fs = LocalFileSystem()
    assert fs.type_name == "local"

    fs = _MockFileSystem()
    assert fs.type_name == "mock"
def test_delta_table_with_filesystem():
    table_path = "../rust/tests/data/simple_table"
    dt = DeltaTable(table_path)
    filesystem = LocalFileSystem()
    assert dt.to_pandas(filesystem=filesystem).equals(
        pd.DataFrame({"id": [5, 7, 9]}))
from datetime import datetime

try:
    import pathlib
except ImportError:
    import pathlib2 as pathlib  # py2 compat

import pytest

from pyarrow import ArrowIOError
from pyarrow.fs import (FileType, Selector, FileSystem, LocalFileSystem,
                        SubTreeFileSystem)
from pyarrow.tests.test_io import gzip_compress, gzip_decompress


@pytest.fixture(params=[
    pytest.param(lambda tmp: LocalFileSystem(),
                 id='LocalFileSystem'),
    pytest.param(lambda tmp: SubTreeFileSystem(tmp, LocalFileSystem()),
                 id='SubTreeFileSystem(LocalFileSystem)')
])
def fs(request, tempdir):
    return request.param(tempdir.as_posix())


@pytest.fixture
def testpath(request, fs, tempdir):
    # we always use the tempdir for reading and writing test artifacts, but
    # if the filesystem is wrapped in a SubTreeFileSystem then we don't need
    # to prepend the path with the tempdir, we also test the API with both
    # pathlib.Path objects and plain python strings
    def convert(path):
        # (the original snippet is truncated here; per the comment above,
        # a SubTreeFileSystem is already rooted at tempdir so the bare path
        # is used, otherwise tempdir is prepended)
        if isinstance(fs, SubTreeFileSystem):
            return path
        return tempdir / path

    return convert
class FakeHadoopFileSystem:
    def __init__(self, *args, **kwargs):
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        from pyarrow.fs import FileSelector

        if isinstance(path, FileSelector):
            return FileSelector(
                os.fspath(self._root / path.base_dir.lstrip("/")),
                path.allow_not_found,
                path.recursive,
            )
        if isinstance(path, list):
            return [self._path(sub_path) for sub_path in path]
        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path, **kwargs):
        return self._fs.create_dir(self._path(path), **kwargs)

    def open_input_stream(self, path, **kwargs):
        return self._fs.open_input_stream(self._path(path), **kwargs)

    def open_output_stream(self, path, **kwargs):
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        self.create_dir(posixpath.dirname(path))
        return self._fs.open_output_stream(self._path(path), **kwargs)

    def get_file_info(self, path, **kwargs):
        from pyarrow.fs import FileInfo

        entries = self._fs.get_file_info(self._path(path), **kwargs)
        if isinstance(entries, FileInfo):
            ret = self._adjust_entry(entries)
        else:
            assert isinstance(entries, list)
            ret = list(map(self._adjust_entry, entries))
        return ret

    def _adjust_entry(self, entry):
        import posixpath
        from pyarrow.fs import FileInfo

        # remap the physical path under the local root back to the mocked
        # HDFS-style path
        mocked_path = os.path.relpath(entry.path, self._root)
        mocked_parts = mocked_path.split(os.path.sep)
        return FileInfo(
            path=posixpath.join(*mocked_parts),
            type=entry.type,
            mtime=entry.mtime,
            size=entry.size,
        )

    def move(self, from_path, to_path):
        self._fs.move(self._path(from_path), self._path(to_path))

    def delete_file(self, path):
        self._fs.delete_file(self._path(path))
def write_dataset(data, base_dir, basename_template=None, format=None,
                  partitioning=None, schema=None, filesystem=None,
                  file_options=None, use_threads=True):
    """
    Write a dataset to a given format and partitioning.

    Parameters
    ----------
    data : Dataset, Table/RecordBatch, or list of Table/RecordBatch
        The data to write. This can be a Dataset instance or in-memory Arrow
        data.
    base_dir : str
        The root directory where to write the dataset.
    basename_template : str, optional
        A template string used to generate basenames of written data files.
        The token '{i}' will be replaced with an automatically incremented
        integer. If not specified, it defaults to
        "part-{i}." + format.default_extname
    format : FileFormat or str
        The format in which to write the dataset. Currently supported:
        "parquet", "ipc"/"feather". If a FileSystemDataset is being written
        and `format` is not specified, it defaults to the same format as the
        specified FileSystemDataset. When writing a Table or RecordBatch,
        this keyword is required.
    partitioning : Partitioning, optional
        The partitioning scheme specified with the ``partitioning()``
        function.
    schema : Schema, optional
    filesystem : FileSystem, optional
    file_options : FileWriteOptions, optional
        FileFormat specific write options, created using the
        ``FileFormat.make_write_options()`` function.
    use_threads : bool, default True
        Write files in parallel. If enabled, maximum parallelism is used, as
        determined by the number of available CPU cores.
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    if isinstance(data, Dataset):
        schema = schema or data.schema
    elif isinstance(data, (pa.Table, pa.RecordBatch)):
        schema = schema or data.schema
        data = [data]
    elif isinstance(data, list):
        schema = schema or data[0].schema
    else:
        raise ValueError(
            "Only Dataset, Table/RecordBatch or a list of Table/RecordBatch "
            "objects are supported.")

    if format is None and isinstance(data, FileSystemDataset):
        format = data.format
    else:
        format = _ensure_format(format)

    if file_options is None:
        file_options = format.make_write_options()

    if format != file_options.format:
        raise TypeError("Supplied FileWriteOptions have format {}, "
                        "which doesn't match supplied FileFormat {}".format(
                            file_options.format, format))

    if basename_template is None:
        basename_template = "part-{i}." + format.default_extname

    partitioning = _ensure_write_partitioning(partitioning)

    if filesystem is None:
        # fall back to local file system as the default
        filesystem = LocalFileSystem()
    else:
        filesystem = _ensure_filesystem(filesystem)

    _filesystemdataset_write(
        data, base_dir, basename_template, schema, filesystem, partitioning,
        file_options, use_threads,
    )
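# A hedged usage sketch for this richer signature (illustrative path and
# compression choice): per-format options come from make_write_options, and
# basename_template controls the generated file names.
import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"a": [1, 2, 3]})
fmt = ds.ParquetFileFormat()
ds.write_dataset(table, "/tmp/pq_out", format=fmt,
                 file_options=fmt.make_write_options(compression="zstd"),
                 basename_template="chunk-{i}.parquet")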