示例#1
0
def py_fsspec_s3fs(request, s3_connection, s3_server):
    """Build the test configuration for an s3fs-backed ``PyFileSystem``.

    The test is skipped when ``s3fs`` cannot be imported. An
    ``s3fs.S3FileSystem`` is pointed at the endpoint described by
    ``s3_connection``, wrapped in ``PyFileSystem(FSSpecHandler(...))``, and
    the test bucket is created on a best-effort basis.

    Returns:
        A dict holding the wrapped filesystem (``fs``), a callable that
        prefixes a path with the bucket name (``pathfn``) and the
        ``allow_*`` flags.
    """
    s3fs = pytest.importorskip("s3fs")

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'
    endpoint_url = 'http://{}:{}'.format(host, port)

    s3 = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={'endpoint_url': endpoint_url},
    )
    fs = PyFileSystem(FSSpecHandler(s3))

    try:
        fs.create_dir(bucket)
    except IOError:
        # From the second test onwards the bucket already exists
        # (BucketAlreadyOwnedByYou); that is expected and ignored.
        pass

    config = {
        'fs': fs,
        'pathfn': bucket.__add__,
        'allow_copy_file': True,
        'allow_move_dir': False,
        'allow_append_to_file': True,
    }
    return config
示例#2
0
def _resolve_paths_and_filesystem(
    paths: Union[str, List[str]],
    filesystem: "pyarrow.fs.FileSystem" = None,
) -> Tuple[List[str], "pyarrow.fs.FileSystem"]:
    """
    Resolves and normalizes all provided paths, infers a filesystem from the
    paths and ensures that all paths use the same filesystem.

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, a filesystem will be inferred. If not
            None, the provided filesystem will still be validated against all
            filesystems inferred from the provided paths to ensure
            compatibility.
    """
    import pyarrow as pa
    from pyarrow.fs import (FileSystem, PyFileSystem, FSSpecHandler,
                            _resolve_filesystem_and_path)
    import fsspec

    if isinstance(paths, str):
        paths = [paths]
    elif (not isinstance(paths, list)
          or any(not isinstance(p, str) for p in paths)):
        raise ValueError(
            "paths must be a path string or a list of path strings.")
    elif len(paths) == 0:
        raise ValueError("Must provide at least one path.")

    if filesystem and not isinstance(filesystem, FileSystem):
        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
            raise TypeError(f"The filesystem passed must either conform to "
                            f"pyarrow.fs.FileSystem, or "
                            f"fsspec.spec.AbstractFileSystem. The provided "
                            f"filesystem was: {filesystem}")
        filesystem = PyFileSystem(FSSpecHandler(filesystem))

    resolved_paths = []
    for path in paths:
        try:
            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                path, filesystem)
        except pa.lib.ArrowInvalid as e:
            if "Cannot parse URI" in str(e):
                resolved_filesystem, resolved_path = (
                    _resolve_filesystem_and_path(_encode_url(path),
                                                 filesystem))
                resolved_path = _decode_url(resolved_path)
            else:
                raise
        if filesystem is None:
            filesystem = resolved_filesystem
        else:
            resolved_path = _unwrap_protocol(resolved_path)
        resolved_path = filesystem.normalize_path(resolved_path)
        resolved_paths.append(resolved_path)

    return resolved_paths, filesystem
示例#3
0
def py_fsspec_localfs(request, tempdir):
    """Build the test configuration for fsspec's ``'file'`` filesystem.

    The test is skipped when ``fsspec`` cannot be imported.

    Returns:
        A dict holding the ``PyFileSystem``-wrapped filesystem (``fs``), a
        callable that places a path under ``tempdir`` (``pathfn``) and the
        ``allow_*`` flags.
    """
    fsspec = pytest.importorskip("fsspec")
    local_fs = fsspec.filesystem('file')

    def pathfn(p):
        # Resolve test paths beneath the temporary directory, POSIX style.
        return (tempdir / p).as_posix()

    return {
        'fs': PyFileSystem(FSSpecHandler(local_fs)),
        'pathfn': pathfn,
        'allow_move_dir': True,
        'allow_append_to_file': True,
    }
示例#4
0
 def test_exists(self, versioned_parquet_data_set, dummy_dataframe, mocker):
     """Check that `exists` is False before the versioned data set is
     saved and True afterwards."""
     data_set = versioned_parquet_data_set
     # Nothing has been written yet.
     assert not data_set.exists()

     # Make `pyarrow.fs._ensure_filesystem` return the data set's own
     # filesystem wrapped for pyarrow.
     wrapped_fs = PyFileSystem(FSSpecHandler(data_set._fs))
     mocker.patch("pyarrow.fs._ensure_filesystem", return_value=wrapped_fs)

     data_set.save(dummy_dataframe)
     assert data_set.exists()
示例#5
0
def py_fsspec_memoryfs(request, tempdir):
    """Build the test configuration for fsspec's ``'memory'`` filesystem.

    The test is skipped unless ``fsspec`` 0.7.5 or newer can be imported.

    Returns:
        A dict holding the ``PyFileSystem``-wrapped filesystem (``fs``), a
        ``pathfn`` that returns its argument unchanged and the ``allow_*``
        flags.
    """
    fsspec = pytest.importorskip("fsspec", minversion="0.7.5")
    memory_fs = fsspec.filesystem('memory')

    def pathfn(p):
        # Paths are used as given.
        return p

    return {
        'fs': PyFileSystem(FSSpecHandler(memory_fs)),
        'pathfn': pathfn,
        'allow_copy_file': True,
        'allow_move_dir': True,
        'allow_append_to_file': True,
    }
示例#6
0
 def test_save_and_load(self, versioned_parquet_data_set, dummy_dataframe, mocker):
     """Check that a frame saved to the versioned data set is equal to the
     frame loaded back from it."""
     data_set = versioned_parquet_data_set

     # Make `pyarrow.fs._ensure_filesystem` return the data set's own
     # filesystem wrapped for pyarrow.
     wrapped_fs = PyFileSystem(FSSpecHandler(data_set._fs))
     mocker.patch("pyarrow.fs._ensure_filesystem", return_value=wrapped_fs)

     data_set.save(dummy_dataframe)
     assert_frame_equal(dummy_dataframe, data_set.load())
示例#7
0
def py_fsspec_memoryfs(request, tempdir):
    """Build the test configuration for fsspec's ``'memory'`` filesystem.

    The test is skipped unless ``fsspec`` 0.7.5 or newer can be imported,
    and also when the installed version is exactly 0.8.5.

    Returns:
        A dict holding the ``PyFileSystem``-wrapped filesystem (``fs``), a
        ``pathfn`` that returns its argument unchanged and the ``allow_*``
        flags.
    """
    fsspec = pytest.importorskip("fsspec", minversion="0.7.5")

    installed_version = fsspec.__version__
    if installed_version == "0.8.5":
        # see https://issues.apache.org/jira/browse/ARROW-10934
        pytest.skip("Bug in fsspec 0.8.5 for in-memory filesystem")

    memory_fs = fsspec.filesystem('memory')

    def pathfn(p):
        # Paths are used as given.
        return p

    return {
        'fs': PyFileSystem(FSSpecHandler(memory_fs)),
        'pathfn': pathfn,
        'allow_move_dir': True,
        'allow_append_to_file': True,
    }
 def test_prevent_overwrite(self, versioned_parquet_data_set,
                            dummy_dataframe, mocker):
     """Check that saving twice to the same versioned data set raises a
     `DataSetError` stating that the save path must not exist."""
     data_set = versioned_parquet_data_set

     # Make `pyarrow.fs._ensure_filesystem` return the data set's own
     # filesystem wrapped for pyarrow.
     wrapped_fs = PyFileSystem(FSSpecHandler(data_set._fs))
     mocker.patch("pyarrow.fs._ensure_filesystem", return_value=wrapped_fs)

     # The first save succeeds and creates the versioned path.
     data_set.save(dummy_dataframe)

     expected_message = (
         r"Save path \`.+\` for ParquetDataSet\(.+\) must "
         r"not exist if versioning is enabled\."
     )
     # The second save must be rejected.
     with pytest.raises(DataSetError, match=expected_message):
         data_set.save(dummy_dataframe)
 def test_save_version_warning(
     self,
     versioned_parquet_data_set,
     load_version,
     save_version,
     dummy_dataframe,
     mocker,
 ):
     """Check that saving emits a `UserWarning` reporting that the save
     version did not match the load version."""
     data_set = versioned_parquet_data_set

     # Make `pyarrow.fs._ensure_filesystem` return the data set's own
     # filesystem wrapped for pyarrow.
     wrapped_fs = PyFileSystem(FSSpecHandler(data_set._fs))
     mocker.patch("pyarrow.fs._ensure_filesystem", return_value=wrapped_fs)

     expected_warning = (
         fr"Save version `{save_version}` did not match load version "
         fr"`{load_version}` for ParquetDataSet\(.+\)"
     )
     with pytest.warns(UserWarning, match=expected_warning):
         data_set.save(dummy_dataframe)
示例#10
0
def py_fsspec_s3fs(request, s3_connection, s3_server):
    """Yield the test configuration for an s3fs-backed ``PyFileSystem``.

    The test is skipped when ``s3fs`` cannot be imported, or when the
    interpreter is older than Python 3.7 while ``s3fs`` is 0.5 or newer.
    The test bucket is created before yielding and deleted afterwards.

    Yields:
        A dict holding the wrapped filesystem (``fs``), a callable that
        prefixes a path with the bucket name (``pathfn``) and the
        ``allow_*`` flags.
    """
    s3fs = pytest.importorskip("s3fs")

    running_old_python = sys.version_info < (3, 7)
    if running_old_python and s3fs.__version__ >= LooseVersion("0.5"):
        pytest.skip("s3fs>=0.5 version is async and requires Python >= 3.7")

    host, port, access_key, secret_key = s3_connection
    bucket = 'pyarrow-filesystem/'
    endpoint_url = 'http://{}:{}'.format(host, port)

    s3 = s3fs.S3FileSystem(
        key=access_key,
        secret=secret_key,
        client_kwargs={'endpoint_url': endpoint_url},
    )
    fs = PyFileSystem(FSSpecHandler(s3))
    fs.create_dir(bucket)

    config = {
        'fs': fs,
        'pathfn': bucket.__add__,
        'allow_copy_file': True,
        'allow_move_dir': False,
        'allow_append_to_file': True,
    }
    yield config

    # Runs once the consumer of the yielded configuration is done with it.
    fs.delete_dir(bucket)
示例#11
0

# Script fragment: `fs`, `describe_all_csvs_in_zips` and the imported modules
# (fsspec, zipfile, csv, ZipFileSystem, PyFileSystem, FSSpecHandler) are
# defined earlier in the file, outside this excerpt.
describe_all_csvs_in_zips(fs)

# Open the local archive through fsspec and print its member list via zipfile.
dlf = fsspec.open("/tmp/dl.zip")
with dlf as f:
    zipf = zipfile.ZipFile(f)
    print(zipf.infolist())
# NOTE(review): the `with` block above has already exited here, so this
# explicit close looks redundant - confirm against fsspec's OpenFile behavior.
dlf.close()

# Read dummy1.csv from inside the archive using a chained "zip://...::..." URL,
# opened in text mode ("rt"), and print its contents.
d1f = fsspec.open("zip://dummy1.csv::/tmp/dl.zip", "rt")
with d1f as f:
    print(f.read())

# Disabled experiment: same chained URL, but with the archive on GitHub.
#d1f = fsspec.open("zip://dummy1.csv::github://tiagoantao:python-performance@/08-persistence/sec1-fsspec/dummy.zip")

#with d1f as f:
#    print(pd.read_csv(f))

# Expose the archive as a filesystem, wrap it for pyarrow and read dummy1.csv
# with `csv.read_csv` from an Arrow input stream.
zfs = ZipFileSystem("/tmp/dl.zip")
arrow_fs = PyFileSystem(FSSpecHandler(zfs))
# NOTE(review): the input stream opened here is never closed explicitly.
my_csv = csv.read_csv(arrow_fs.open_input_stream("dummy1.csv"))
print(my_csv)
# Disabled experiments kept for reference.
#with fsspec.open("zip:local.zip/dummy1.csv") as f:
#    pd.read_csv(f)

##fsa = fsspec.get_mapper("github://*****:*****@")
##print(fsa)
## fs = fsspec.open("git_https.py")
## with fs as f:
##     print(f)
示例#12
0
def httpfs_from_config():
    """Return a ``PyFileSystem`` wrapping an ``HTTPFileSystem`` that is
    created with no arguments."""
    http_fs = HTTPFileSystem()
    handler = FSSpecHandler(http_fs)
    return PyFileSystem(handler)
示例#13
0
def read_remote_parquet(path: str):
    """Read the parquet data at ``path`` and return what ``read_parquet``
    returns.

    ``get_fs_and_path`` splits ``path`` into a filesystem and a path on that
    filesystem; the filesystem is wrapped in ``PyFileSystem(FSSpecHandler())``
    before being handed to ``read_parquet``.
    """
    remote_fs, remote_path = get_fs_and_path(path)
    arrow_fs = PyFileSystem(FSSpecHandler(remote_fs))
    return read_parquet(remote_path, filesystem=arrow_fs)
示例#14
0
def _resolve_paths_and_filesystem(
    paths: Union[str, List[str]],
    filesystem: "pyarrow.fs.FileSystem" = None,
) -> Tuple[List[str], "pyarrow.fs.FileSystem"]:
    """
    Resolves and normalizes all provided paths, infers a filesystem from the
    paths and ensures that all paths use the same filesystem.

    Args:
        paths: A single file/directory path or a list of file/directory paths.
            A list of paths can contain both files and directories.
        filesystem: The filesystem implementation that should be used for
            reading these files. If None, a filesystem will be inferred. If not
            None, the provided filesystem will still be validated against all
            filesystems inferred from the provided paths to ensure
            compatibility.

    Returns:
        A tuple ``(resolved_paths, filesystem)``: the normalized paths, in
        input order, and the filesystem that was provided or inferred.

    Raises:
        ValueError: If ``paths`` is neither a string nor a list of strings,
            or if it is an empty list.
        TypeError: If ``filesystem`` is neither a pyarrow filesystem nor an
            fsspec filesystem (including the case where fsspec is not
            installed).
        ImportError: If an http/https path has to be handled through fsspec
            but fsspec is not installed.
    """
    import pyarrow as pa
    from pyarrow.fs import (
        FileSystem,
        FSSpecHandler,
        PyFileSystem,
        _resolve_filesystem_and_path,
    )

    # Accept a single path string by promoting it to a one-element list.
    if isinstance(paths, str):
        paths = [paths]
    elif not isinstance(paths, list) or any(not isinstance(p, str)
                                            for p in paths):
        raise ValueError(
            "paths must be a path string or a list of path strings.")
    elif len(paths) == 0:
        raise ValueError("Must provide at least one path.")

    # Whether resolved paths are passed through _unwrap_protocol when a
    # filesystem is already known. Switched off for HTTP filesystems below.
    need_unwrap_path_protocol = True
    if filesystem and not isinstance(filesystem, FileSystem):
        err_msg = (f"The filesystem passed must either conform to "
                   f"pyarrow.fs.FileSystem, or "
                   f"fsspec.spec.AbstractFileSystem. The provided "
                   f"filesystem was: {filesystem}")
        try:
            import fsspec
            from fsspec.implementations.http import HTTPFileSystem
        except ModuleNotFoundError:
            # If filesystem is not a pyarrow filesystem and fsspec isn't
            # installed, then filesystem is neither a pyarrow filesystem nor
            # an fsspec filesystem, so we raise a TypeError.
            raise TypeError(err_msg) from None
        if not isinstance(filesystem, fsspec.spec.AbstractFileSystem):
            raise TypeError(err_msg) from None
        if isinstance(filesystem, HTTPFileSystem):
            # If filesystem is fsspec HTTPFileSystem, the protocol/scheme of paths
            # should not be unwrapped/removed, because HTTPFileSystem expects full file
            # paths including protocol/scheme. This is different behavior compared to
            # file systems implementation in pyarrow.fs.FileSystem.
            need_unwrap_path_protocol = False

        # Wrap the fsspec filesystem so it can be used as a pyarrow filesystem.
        filesystem = PyFileSystem(FSSpecHandler(filesystem))

    resolved_paths = []
    for path in paths:
        # NOTE(review): _resolve_example_path is defined elsewhere; presumably
        # it maps an example-data shorthand to a real path - confirm.
        path = _resolve_example_path(path)
        try:
            resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                path, filesystem)
        except pa.lib.ArrowInvalid as e:
            if "Cannot parse URI" in str(e):
                # Retry with the encoded form of the path, then decode the
                # resolved path again.
                resolved_filesystem, resolved_path = _resolve_filesystem_and_path(
                    _encode_url(path), filesystem)
                resolved_path = _decode_url(resolved_path)
            elif "Unrecognized filesystem type in URI" in str(e):
                scheme = urllib.parse.urlparse(path,
                                               allow_fragments=False).scheme
                if scheme in ["http", "https"]:
                    # If scheme of path is HTTP and filesystem is not resolved,
                    # try to use fsspec HTTPFileSystem. This expects fsspec is
                    # installed.
                    try:
                        from fsspec.implementations.http import HTTPFileSystem
                    except ModuleNotFoundError:
                        raise ImportError(
                            "Please install fsspec to read files from HTTP."
                        ) from None

                    resolved_filesystem = PyFileSystem(
                        FSSpecHandler(HTTPFileSystem()))
                    # Keep the full URL, including its scheme, as the path.
                    resolved_path = path
                    need_unwrap_path_protocol = False
                else:
                    raise
            else:
                raise
        if filesystem is None:
            # No filesystem was supplied: adopt the one resolved for this path
            # and reuse it for every following path.
            filesystem = resolved_filesystem
        elif need_unwrap_path_protocol:
            resolved_path = _unwrap_protocol(resolved_path)
        resolved_path = filesystem.normalize_path(resolved_path)
        resolved_paths.append(resolved_path)

    return resolved_paths, filesystem