Exemplo n.º 1
0
def _ensure_fs(filesystem, path):
    """
    Validate or infer the filesystem from the path.

    Parameters
    ----------
    filesystem : FileSystem or None
        Filesystem to use as-is. When None, one is inferred from ``path``.
    path : str
        Either a local (possibly relative) path or a URI.

    Returns
    -------
    (FileSystem, str)
        The filesystem together with the path to use on it.

    Raises
    ------
    ValueError
        If ``path`` does not exist locally and parsing it as a URI fails
        for any reason other than an empty scheme.
    """
    from pyarrow.fs import (FileSystem, LocalFileSystem, FileType,
                            _normalize_path)

    if filesystem is not None:
        # Caller chose the filesystem: only make sure the path is proper
        # (eg no backslashes on Windows).
        return filesystem, _normalize_path(filesystem, path)

    # No filesystem given: first try the path as a local (relative) file.
    local_fs = LocalFileSystem()
    try:
        info = local_fs.get_file_info([path])[0]
    except OSError:
        info = None
    missing_locally = info is None or info.type == FileType.NotFound

    if missing_locally:
        # Nothing there locally, so perhaps it is a URI.
        try:
            return FileSystem.from_uri(path)
        except ValueError as exc:
            if "empty scheme" not in str(exc):
                raise
            # ARROW-8213: not a URI either, so treat it as a local path;
            # that yields a nicer error message further down the line.

    return local_fs, _normalize_path(local_fs, path)
Exemplo n.º 2
0
def _ensure_fs(filesystem, path):
    """
    Validate or infer the filesystem from the path.

    When no filesystem is given, the path is first looked up on the local
    filesystem. If it is not found there it is parsed as a URI. A path that
    is neither an existing local file nor a URI is treated as a local path,
    so that the caller later fails with a "file not found" style error
    instead of a confusing URI scheme parsing error (ARROW-8213).

    Parameters
    ----------
    filesystem : FileSystem or None
        Filesystem to use as-is. When None, one is inferred from ``path``.
    path : str
        Either a local (possibly relative) path or a URI.

    Returns
    -------
    (FileSystem, str)
        The filesystem together with the path to use on it.

    Raises
    ------
    ValueError
        If ``path`` does not exist locally and parsing it as a URI fails
        for any reason other than an empty scheme.
    """
    from pyarrow.fs import (
        FileSystem, LocalFileSystem, FileType, _normalize_path)

    if filesystem is None:
        # first check if the file exists as a local (relative) file path
        filesystem = LocalFileSystem()
        try:
            infos = filesystem.get_file_info([path])[0]
        except OSError:
            local_path_exists = False
        else:
            local_path_exists = infos.type != FileType.NotFound

        if not local_path_exists:
            # not a local file, so perhaps it is a URI
            try:
                return FileSystem.from_uri(path)
            except ValueError as e:
                # Only swallow the "no scheme" failure: the path is a plain
                # (missing) local path. Any other URI error is genuine.
                if "empty scheme" not in str(e):
                    raise

    # ensure we have a proper path (eg no backslashes on Windows)
    path = _normalize_path(filesystem, path)

    return filesystem, path
Exemplo n.º 3
0
class FakeHadoopFileSystem:
    """Hadoop filesystem stand-in backed by a local directory.

    Every incoming path is re-rooted under the directory named by
    ``_hdfs_root`` and the call is then forwarded to a pyarrow
    ``LocalFileSystem``.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used.
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        """Map a path string or ``FileSelector`` onto the local root."""
        from pyarrow.fs import FileSelector

        if not isinstance(path, FileSelector):
            # Strip leading slashes so the join stays below the root.
            return os.fspath(self._root / path.lstrip("/"))

        rebased_dir = os.fspath(self._root / path.base_dir.lstrip("/"))
        return FileSelector(
            rebased_dir,
            path.allow_not_found,
            path.recursive,
        )

    def create_dir(self, path):
        """Create the directory for ``path`` under the local root."""
        local_path = self._path(path)
        return self._fs.create_dir(local_path)

    def open_input_stream(self, path):
        """Open the re-rooted ``path`` for reading."""
        local_path = self._path(path)
        return self._fs.open_input_stream(local_path)

    def open_output_stream(self, path):
        """Open the re-rooted ``path`` for writing, creating its parent."""
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        parent_dir = posixpath.dirname(path)
        self.create_dir(parent_dir)
        local_path = self._path(path)
        return self._fs.open_output_stream(local_path)

    def get_file_info(self, path):
        """Return the local filesystem's info for the re-rooted ``path``."""
        local_target = self._path(path)
        return self._fs.get_file_info(local_target)

    def move(self, from_path, to_path):
        """Move ``from_path`` to ``to_path``, both re-rooted locally."""
        source = self._path(from_path)
        destination = self._path(to_path)
        self._fs.move(source, destination)

    def delete_file(self, path):
        """Delete the file at the re-rooted ``path``."""
        local_path = self._path(path)
        self._fs.delete_file(local_path)
Exemplo n.º 4
0
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Resolve a list of file paths against one shared file system.

    When the file system counts as local, every path must reference an
    existing *file*; the first path that does not raises. For any other
    file system (for example a remote one) no such check is made here.

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        Defaults to the local file system. If an URI is passed, then its
        path component will act as a prefix for the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available or
        not a file.
    """
    from pyarrow.fs import (LocalFileSystem, SubTreeFileSystem,
                            _MockFileSystem, FileType, _ensure_filesystem)

    # default to the local file system, otherwise construct a filesystem
    # if a valid URI was given
    filesystem = (LocalFileSystem() if filesystem is None
                  else _ensure_filesystem(filesystem))

    # allow normalizing irregular paths such as Windows local paths
    paths = [
        filesystem.normalize_path(_stringify_path(item)) for item in paths
    ]

    # the mock file system is validated just like a real local one
    directly_local = isinstance(
        filesystem, (LocalFileSystem, _MockFileSystem))
    wraps_local = (isinstance(filesystem, SubTreeFileSystem)
                   and isinstance(filesystem.base_fs, LocalFileSystem))
    if not (directly_local or wraps_local):
        return filesystem, paths

    # validate that all of the paths are pointing to existing *files*
    # possible improvement is to group the file_infos by type and raise for
    # multiple paths per error category
    for info in filesystem.get_file_info(paths):
        kind = info.type
        if kind == FileType.File:
            continue
        if kind == FileType.NotFound:
            raise FileNotFoundError(info.path)
        if kind == FileType.Directory:
            raise IsADirectoryError(
                'Path {} points to a directory, but only file paths are '
                'supported. To construct a nested or union dataset pass '
                'a list of dataset objects instead.'.format(info.path))
        raise IOError(
            'Path {} exists but its type is unknown (could be a '
            'special file such as a Unix socket or character device, '
            'or Windows NUL / CON / ...)'.format(info.path))

    return filesystem, paths
Exemplo n.º 5
0
def _ensure_single_source(path, filesystem=None):
    """
    Resolve path to either a single file or a recursively walked directory.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file or
        an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # Without an explicit filesystem, guess one: look for the path on the
    # local filesystem first and only then try to parse it as an URI.
    info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            info = filesystem.get_file_info([path])[0]
        except OSError:
            info = None
        found_locally = (info is not None
                         and info.type != FileType.NotFound)

        if not found_locally:
            # nothing there locally, so assume the path is an URI that
            # describes the file system as well
            try:
                filesystem, path = FileSystem.from_uri(path)
                # forget the local lookup so the new filesystem is queried
                info = None
            except ValueError as exc:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(exc):
                    raise

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # dataset discovery expects a normalized path
    path = filesystem.normalize_path(path)

    # look the path up unless an earlier lookup is still valid
    if info is None:
        info = filesystem.get_file_info([path])[0]

    # a file becomes a one-element list, a directory a recursive selector
    if info.type == FileType.File:
        return filesystem, [path]
    if info.type == FileType.Directory:
        return filesystem, FileSelector(path, recursive=True)
    raise FileNotFoundError(path)
Exemplo n.º 6
0
class FakeHadoopFileSystem:
    """Hadoop filesystem stand-in backed by a local directory.

    Incoming paths are re-rooted under the directory named by
    ``_hdfs_root`` before being handed to a pyarrow ``LocalFileSystem``;
    file infos coming back have their paths rewritten relative to that
    root again.
    """

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used.
        from pyarrow.fs import LocalFileSystem

        self._root = Path(_hdfs_root.name)
        self._fs = LocalFileSystem()

    def _path(self, path):
        """Map a path string, list of paths or ``FileSelector`` to the root."""
        from pyarrow.fs import FileSelector

        if isinstance(path, list):
            return [self._path(item) for item in path]

        if not isinstance(path, FileSelector):
            # Strip leading slashes so the join stays below the root.
            return os.fspath(self._root / path.lstrip("/"))

        rebased_dir = os.fspath(self._root / path.base_dir.lstrip("/"))
        return FileSelector(
            rebased_dir,
            path.allow_not_found,
            path.recursive,
        )

    def create_dir(self, path, **kwargs):
        """Create the directory for ``path`` under the local root."""
        local_path = self._path(path)
        return self._fs.create_dir(local_path, **kwargs)

    def open_input_stream(self, path, **kwargs):
        """Open the re-rooted ``path`` for reading."""
        local_path = self._path(path)
        return self._fs.open_input_stream(local_path, **kwargs)

    def open_output_stream(self, path, **kwargs):
        """Open the re-rooted ``path`` for writing, creating its parent."""
        import posixpath

        # NOTE: HadoopFileSystem.open_output_stream creates directories
        # automatically.
        parent_dir = posixpath.dirname(path)
        self.create_dir(parent_dir)
        local_path = self._path(path)
        return self._fs.open_output_stream(local_path, **kwargs)

    def get_file_info(self, path, **kwargs):
        """Return file info with paths expressed relative to the root.

        A single ``FileInfo`` result is adjusted and returned on its own;
        otherwise the result must be a list and every entry is adjusted.
        """
        from pyarrow.fs import FileInfo

        local_target = self._path(path)
        found = self._fs.get_file_info(local_target, **kwargs)
        if isinstance(found, FileInfo):
            return self._adjust_entry(found)

        assert isinstance(found, list)
        return [self._adjust_entry(item) for item in found]

    def _adjust_entry(self, entry):
        """Copy ``entry`` with its path made relative to the root.

        The relative path is re-joined with forward slashes; type, mtime
        and size are carried over unchanged.
        """
        import posixpath

        from pyarrow.fs import FileInfo

        relative = os.path.relpath(entry.path, self._root)
        segments = relative.split(os.path.sep)
        return FileInfo(
            path=posixpath.join(*segments),
            type=entry.type,
            mtime=entry.mtime,
            size=entry.size,
        )

    def move(self, from_path, to_path):
        """Move ``from_path`` to ``to_path``, both re-rooted locally."""
        source = self._path(from_path)
        destination = self._path(to_path)
        self._fs.move(source, destination)

    def delete_file(self, path):
        """Delete the file at the re-rooted ``path``."""
        local_path = self._path(path)
        self._fs.delete_file(local_path)