def _ensure_fs(filesystem, path):
    """
    Validate the given filesystem or infer one from ``path``.

    Parameters
    ----------
    filesystem : FileSystem, optional
        Filesystem to use. When None, ``path`` is first looked up on the
        local filesystem and, if it is not found there, parsed as a URI.
    path : str
        File path, or a URI when no filesystem is given.

    Returns
    -------
    tuple
        ``(filesystem, path)``. When the path is resolved as a URI this is
        whatever ``FileSystem.from_uri`` returns; otherwise the path has
        been passed through ``_normalize_path``.

    Raises
    ------
    ValueError
        If URI parsing fails for a reason other than an empty scheme.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is not None:
        # An explicit filesystem only needs a proper path
        # (eg no backslashes on Windows).
        return filesystem, _normalize_path(filesystem, path)

    # Nothing was given: probe the local filesystem first, treating the
    # path as a local (relative) file path.
    filesystem = LocalFileSystem()
    try:
        info = filesystem.get_file_info([path])[0]
    except OSError:
        info = None
    found_locally = info is not None and info.type != FileType.NotFound

    if not found_locally:
        # Perhaps it's a URI?
        try:
            return FileSystem.from_uri(path)
        except ValueError as exc:
            if "empty scheme" not in str(exc):
                raise
            # ARROW-8213: not a URI, assume local path
            # to get a nice error message.

    # ensure we have a proper path (eg no backslashes on Windows)
    return filesystem, _normalize_path(filesystem, path)
def _ensure_fs(filesystem, path):
    """
    Validate or infer the filesystem from the path.

    Parameters
    ----------
    filesystem : FileSystem, optional
        Filesystem to use. When None, ``path`` is first checked against the
        local filesystem and, if missing there, parsed as a URI.
    path : str
        File path, or a URI when no filesystem is given.

    Returns
    -------
    tuple
        ``(filesystem, path)``. When the path is resolved as a URI this is
        the result of ``FileSystem.from_uri``; otherwise the path has been
        passed through ``_normalize_path``.

    Raises
    ------
    ValueError
        If URI parsing fails for a reason other than an empty scheme.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = infos.type != FileType.NotFound

        if not local_path_exists:
            # the path is not a local file, so it may be a URI
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                if "empty scheme" not in str(e):
                    raise
                # ARROW-8213: neither a URI nor an existing local path.
                # Fall through with the local filesystem so that the caller
                # reports the missing path instead of surfacing a confusing
                # scheme-parsing error.

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
class FakeHadoopFileSystem:
    """
    Stand-in for a Hadoop filesystem that stores everything locally.

    Each incoming path is re-rooted under the directory named by
    ``_hdfs_root.name`` and the operation is forwarded to a
    ``pyarrow.fs.LocalFileSystem``.
    """

    def __init__(self, *args, **kwargs):
        # Constructor arguments are accepted for signature compatibility
        # and otherwise ignored.
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        """
        Translate a fake-HDFS path into a path under the local root.

        Accepts a string, a ``FileSelector`` (its ``base_dir`` is
        translated, the other options are kept) or a list of either.
        """
        from pyarrow.fs import FileSelector

        if isinstance(path, FileSelector):
            return FileSelector(
                os.fspath(self._root / path.base_dir.lstrip("/")),
                path.allow_not_found,
                path.recursive,
            )
        if isinstance(path, list):
            # get_file_info forwards lists of paths here; a list has no
            # ``lstrip``, so translate each item.
            return [self._path(sub_path) for sub_path in path]
        # Strip leading slashes so the join stays below the root instead
        # of being replaced by an absolute path.
        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path):
        """Create the directory at ``path`` under the local root."""
        return self._fs.create_dir(self._path(path))

    def open_input_stream(self, path):
        """Open the file at ``path`` under the local root for reading."""
        return self._fs.open_input_stream(self._path(path))

    def open_output_stream(self, path):
        """Open the file at ``path`` under the local root for writing."""
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        self.create_dir(posixpath.dirname(path))
        return self._fs.open_output_stream(self._path(path))

    def get_file_info(self, path):
        """Return the local filesystem's file info for the translated path."""
        return self._fs.get_file_info(self._path(path))

    def move(self, from_path, to_path):
        """Move ``from_path`` to ``to_path``, both under the local root."""
        self._fs.move(self._path(from_path), self._path(to_path))

    def delete_file(self, path):
        """Delete the file at ``path`` under the local root."""
        self._fs.delete_file(self._path(path))
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Resolve a list of paths as files that all belong to one file system.

    When the file system is local, every path must reference an existing
    *file*; for any other file system no such validation is done and
    non-file paths will be silently skipped (for example on a remote
    filesystem).

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available or
        not a file.
    """
    from pyarrow.fs import (
        LocalFileSystem, SubTreeFileSystem, _MockFileSystem, FileType,
        _ensure_filesystem
    )

    # default to the local file system, otherwise construct a filesystem
    # if it is a valid URI
    filesystem = (
        LocalFileSystem() if filesystem is None
        else _ensure_filesystem(filesystem)
    )

    # local means: a local (or mock) file system, or a subtree of a local one
    if isinstance(filesystem, (LocalFileSystem, _MockFileSystem)):
        is_local = True
    else:
        is_local = (
            isinstance(filesystem, SubTreeFileSystem)
            and isinstance(filesystem.base_fs, LocalFileSystem)
        )

    # allow normalizing irregular paths such as Windows local paths
    paths = [filesystem.normalize_path(_stringify_path(p)) for p in paths]

    if not is_local:
        return filesystem, paths

    # validate that all of the paths are pointing to existing *files*
    # possible improvement is to group the file_infos by type and raise for
    # multiple paths per error category
    for info in filesystem.get_file_info(paths):
        kind = info.type
        if kind == FileType.File:
            continue
        if kind == FileType.NotFound:
            raise FileNotFoundError(info.path)
        if kind == FileType.Directory:
            raise IsADirectoryError(
                'Path {} points to a directory, but only file paths are '
                'supported. To construct a nested or union dataset pass '
                'a list of dataset objects instead.'.format(info.path)
            )
        raise IOError(
            'Path {} exists but its type is unknown (could be a '
            'special file such as a Unix socket or character device, '
            'or Windows NUL / CON / ...)'.format(info.path)
        )

    return filesystem, paths
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or FileSelector)
        File system object and either a single item list pointing to a file or
        a FileSelector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, FileSelector,
        _ensure_filesystem
    )

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is an URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None
    else:
        # construct a filesystem if it is a valid URI; this uses the same
        # pyarrow.fs helper as _ensure_multiple_sources, because the
        # _ensure_fs helper requires a ``path`` argument and cannot be
        # called with the filesystem alone
        filesystem = _ensure_filesystem(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
class FakeHadoopFileSystem:
    """
    Stand-in for a Hadoop filesystem that stores everything locally.

    Incoming paths are re-rooted under the directory named by
    ``_hdfs_root.name`` before being handed to a
    ``pyarrow.fs.LocalFileSystem``; paths in returned file infos are
    mapped back to POSIX-style paths relative to that root.
    """

    def __init__(self, *args, **kwargs):
        # Constructor arguments are accepted and otherwise ignored.
        from pyarrow.fs import LocalFileSystem

        self._fs = LocalFileSystem()
        self._root = Path(_hdfs_root.name)

    def _path(self, path):
        """Translate a string, list or FileSelector to the local root."""
        from pyarrow.fs import FileSelector

        if isinstance(path, list):
            return [self._path(item) for item in path]
        if isinstance(path, FileSelector):
            local_base = os.fspath(self._root / path.base_dir.lstrip("/"))
            return FileSelector(
                local_base, path.allow_not_found, path.recursive)
        # Leading slashes are stripped so the join stays below the root.
        return os.fspath(self._root / path.lstrip("/"))

    def create_dir(self, path, **kwargs):
        """Create a directory under the local root."""
        local_path = self._path(path)
        return self._fs.create_dir(local_path, **kwargs)

    def open_input_stream(self, path, **kwargs):
        """Open a file under the local root for reading."""
        local_path = self._path(path)
        return self._fs.open_input_stream(local_path, **kwargs)

    def open_output_stream(self, path, **kwargs):
        """Open a file under the local root for writing."""
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        parent = posixpath.dirname(path)
        self.create_dir(parent)
        return self._fs.open_output_stream(self._path(path), **kwargs)

    def get_file_info(self, path, **kwargs):
        """Return file info with paths mapped back to the fake namespace."""
        from pyarrow.fs import FileInfo

        found = self._fs.get_file_info(self._path(path), **kwargs)
        if isinstance(found, FileInfo):
            return self._adjust_entry(found)
        assert isinstance(found, list)
        return [self._adjust_entry(item) for item in found]

    def _adjust_entry(self, entry):
        """Rebuild ``entry`` with a POSIX path relative to the local root."""
        import posixpath
        from pyarrow.fs import FileInfo

        relative = os.path.relpath(entry.path, self._root)
        segments = relative.split(os.path.sep)
        return FileInfo(
            path=posixpath.join(*segments),
            type=entry.type,
            mtime=entry.mtime,
            size=entry.size,
        )

    def move(self, from_path, to_path):
        """Move a file or directory; both paths are under the local root."""
        self._fs.move(self._path(from_path), self._path(to_path))

    def delete_file(self, path):
        """Delete a file under the local root."""
        self._fs.delete_file(self._path(path))