示例#1
0
    def compress_file(self, src_path, dest_path, compression):
        """
        Write the contents of ``src_path`` to ``dest_path`` using ``compression``.

        Parameters
        ----------
        src_path : str
            Path of the file to read.
        dest_path : str
            Path of the file to create.
        compression : {'gzip', 'bz2', 'zip', 'tar', 'xz', 'zstd', None}
            Compression/container format. ``None`` performs a plain copy.

        Raises
        ------
        ValueError
            If ``compression`` is not one of the supported values.
        """
        if compression is None:
            shutil.copyfile(src_path, dest_path)
            return

        # Archive formats store the source under its base name and manage
        # their own handles.
        if compression == "zip":
            with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
                f.write(src_path, os.path.basename(src_path))
            return
        if compression == "tar":
            with open(src_path, "rb") as fh, tarfile.open(dest_path, mode="w") as tar:
                tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path))
                tar.addfile(tarinfo, fh)
            return

        # Stream formats: choose the opener first so that an unknown value is
        # rejected before any file is touched.
        mode = "w"
        if compression == "gzip":
            opener = gzip.open
        elif compression == "bz2":
            opener = bz2.BZ2File
        elif compression == "xz":
            opener = get_lzma_file()
        elif compression == "zstd":
            opener = import_optional_dependency("zstandard").open
            mode = "wb"
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        # Open the source before the destination: a missing source then fails
        # without creating (and leaking a handle to) an empty output file.
        # copyfileobj streams in chunks instead of loading the whole file.
        with open(src_path, "rb") as fh, opener(dest_path, mode) as f:
            shutil.copyfileobj(fh, f)
示例#2
0
def write_to_compressed(compression, path, data, dest="test"):
    """
    Write data to a compressed file.

    Parameters
    ----------
    compression : {'gzip', 'bz2', 'zip', 'tar', 'xz', 'zstd'}
        The compression type to use.
    path : str
        The file path to write the data.
    data : bytes
        The data to write. The stream formats are opened in binary mode and
        the TAR branch wraps the data in ``io.BytesIO``.
    dest : str, default "test"
        The member name inside the archive (ZIP and TAR only).

    Raises
    ------
    ValueError : An invalid compression value was passed in.
    """
    # Defaults for the plain stream formats: open in binary mode and call
    # ``f.write(data)``. The archive formats override all three.
    args: tuple[Any, ...] = (data, )
    mode = "wb"
    method = "write"
    compress_method: Callable

    if compression == "zip":
        compress_method = zipfile.ZipFile
        mode = "w"
        args = (dest, data)
        method = "writestr"
    elif compression == "tar":
        compress_method = tarfile.TarFile
        mode = "w"
        # addfile() needs the member header plus a readable stream; the size
        # must be set explicitly on a hand-built TarInfo.
        tar_info = tarfile.TarInfo(name=dest)
        payload = io.BytesIO(data)
        tar_info.size = len(data)
        args = (tar_info, payload)
        method = "addfile"
    elif compression == "gzip":
        compress_method = gzip.GzipFile
    elif compression == "bz2":
        compress_method = bz2.BZ2File
    elif compression == "zstd":
        compress_method = import_optional_dependency("zstandard").open
    elif compression == "xz":
        compress_method = get_lzma_file()
    else:
        raise ValueError(f"Unrecognized compression type: {compression}")

    with compress_method(path, mode=mode) as f:
        getattr(f, method)(*args)
示例#3
0
def decompress_file(path, compression):
    """
    Open a compressed file and yield a binary file object for its contents.

    This is a generator: the handle is yielded once and closed when the
    generator is resumed or closed.

    Parameters
    ----------
    path : str
        The path where the file is read from.

    compression : {'gzip', 'bz2', 'zip', 'xz', None}
        Name of the decompression to use

    Yields
    ------
    file object
        Handle opened for reading bytes.

    Raises
    ------
    ValueError
        If ``compression`` is not recognized, or a ZIP archive does not
        contain exactly one member.
    """
    if compression is None:
        f = open(path, "rb")
    elif compression == "gzip":
        # mypy: Incompatible types in assignment
        # (expression has type "IO[Any]", variable has type "BinaryIO")
        f = gzip.open(path, "rb")  # type: ignore[assignment]
    elif compression == "bz2":
        # mypy: Incompatible types in assignment
        # (expression has type "BZ2File", variable has type "BinaryIO")
        f = bz2.BZ2File(path, "rb")  # type: ignore[assignment]
    elif compression == "xz":
        f = get_lzma_file(lzma)(path, "rb")
    elif compression == "zip":
        zip_file = zipfile.ZipFile(path)
        zip_names = zip_file.namelist()
        if len(zip_names) != 1:
            # Release the archive before bailing out; the cleanup in the
            # ``finally`` below is never reached on this path.
            zip_file.close()
            raise ValueError(f"ZIP file {path} error. Only one file per ZIP.")
        # mypy: Incompatible types in assignment
        # (expression has type "IO[bytes]", variable has type "BinaryIO")
        f = zip_file.open(zip_names.pop())  # type: ignore[assignment]
    else:
        raise ValueError(f"Unrecognized compression type: {compression}")

    try:
        yield f
    finally:
        f.close()
        # The member handle does not own the archive, so close it as well.
        if compression == "zip":
            zip_file.close()
示例#4
0
    def compress_file(self, src_path, dest_path, compression):
        """
        Write the contents of ``src_path`` to ``dest_path`` using ``compression``.

        Parameters
        ----------
        src_path : str
            Path of the file to read.
        dest_path : str
            Path of the file to create.
        compression : {'gzip', 'bz2', 'zip', 'xz', None}
            Compression/container format. ``None`` performs a plain copy.

        Raises
        ------
        ValueError
            If ``compression`` is not one of the supported values.
        """
        if compression is None:
            shutil.copyfile(src_path, dest_path)
            return

        # ZIP stores the source under its base name and manages its own handle.
        if compression == "zip":
            with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
                f.write(src_path, os.path.basename(src_path))
            return

        # Choose the opener first so that an unknown value is rejected before
        # any file is touched.
        if compression == "gzip":
            opener = gzip.open
        elif compression == "bz2":
            opener = bz2.BZ2File
        elif compression == "xz":
            opener = get_lzma_file(lzma)
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        # Open the source before the destination: a missing source then fails
        # without creating (and leaking a handle to) an empty output file.
        # copyfileobj streams in chunks instead of loading the whole file.
        with open(src_path, "rb") as fh, opener(dest_path, "w") as f:
            shutil.copyfileobj(fh, f)
示例#5
0
def write_to_compressed(compression, path, data, dest="test"):
    """
    Store ``data`` in a compressed file at ``path``.

    Parameters
    ----------
    compression : {'gzip', 'bz2', 'zip', 'xz'}
        Which compression format to produce.
    path : str
        Location of the file to create.
    data : str
        Content to store.
    dest : str, default "test"
        Name of the archive member (only used for ZIP).

    Raises
    ------
    ValueError : ``compression`` is not one of the supported values.
    """
    # Stream formats share these settings; ZIP replaces them below.
    writer_name = "write"
    writer_args: tuple[Any, ...] = (data,)
    open_mode = "wb"
    opener: Callable

    if compression == "gzip":
        opener = gzip.GzipFile
    elif compression == "bz2":
        opener = bz2.BZ2File
    elif compression == "xz":
        opener = get_lzma_file(lzma)
    elif compression == "zip":
        opener = zipfile.ZipFile
        open_mode, writer_name, writer_args = "w", "writestr", (dest, data)
    else:
        raise ValueError(f"Unrecognized compression type: {compression}")

    with opener(path, mode=open_mode) as handle:
        write = getattr(handle, writer_name)
        write(*writer_args)
示例#6
0
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: str | None = None,
    storage_options: StorageOptions = None,
) -> IOHandles[str] | IOHandles[bytes]:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    {compression_options}

        .. versionchanged:: 1.0.0
           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0
           Passing compression options as keys in dict is now
           supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'.

        .. versionchanged:: 1.4.0 Zstandard support.

    memory_map : bool, default False
        See parsers._parser_params for more information. Only used by read_csv.
    is_text : bool, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    .. versionchanged:: 1.2.0

    Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If the compression method is not recognized, or a ZIP/TAR archive
        opened for reading does not contain exactly one file.
    TypeError
        If a read mode is requested and the resulting handle has no ``read``
        attribute.
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    errors = errors or "strict"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate encoding and errors
    # (the return values are discarded; the lookups are only used for their
    # failure on unknown names)
    codecs.lookup(encoding)
    if isinstance(errors, str):
        codecs.lookup_error(errors)

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    # `handles` is bound by _maybe_memory_map below; every buffer created in
    # this function is appended to it and handed to IOHandles at the end.
    handles: list[BaseBuffer]

    # memory mapping needs to be the first step
    # only used for read_csv
    handle, memory_map, handles = _maybe_memory_map(handle, memory_map)

    is_path = isinstance(handle, str)
    # work on a copy so that popping "method" leaves ioargs.compression intact
    # (it is passed on unchanged to IOHandles)
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    # Only for write methods
    if "r" not in mode and is_path:
        check_parent_directory(str(handle))

    if compression:
        if compression != "zstd":
            # compression libraries do not like an explicit text-mode
            ioargs.mode = ioargs.mode.replace("t", "")
        elif compression == "zstd" and "b" not in ioargs.mode:
            # python-zstandard defaults to text mode, but we always expect
            # compression libraries to use binary mode.
            ioargs.mode += "b"

        # GZ Compression
        if compression == "gzip":
            # a path goes in as `filename`, an already-open buffer as `fileobj`
            if isinstance(handle, str):
                # error: Incompatible types in assignment (expression has type
                # "GzipFile", variable has type "Union[str, BaseBuffer]")
                handle = gzip.GzipFile(  # type: ignore[assignment]
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # No overload variant of "GzipFile" matches argument types
                    # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
                    fileobj=handle,  # type: ignore[call-overload]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            # No overload variant of "BZ2File" matches argument types
            # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
            handle = bz2.BZ2File(  # type: ignore[call-overload]
                handle,
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            # error: Argument 1 to "_BytesZipFile" has incompatible type
            # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
            # ReadBuffer[bytes], WriteBuffer[bytes]]"
            handle = _BytesZipFile(
                handle,
                ioargs.mode,
                **compression_args  # type: ignore[arg-type]
            )
            # When reading, register the archive wrapper in `handles` and
            # replace `handle` with the single member it contains.
            if handle.buffer.mode == "r":
                handles.append(handle)
                zip_names = handle.buffer.namelist()
                if len(zip_names) == 1:
                    handle = handle.buffer.open(zip_names.pop())
                elif not zip_names:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # TAR Encoding
        elif compression == "tar":
            # a "mode" supplied in the compression options wins over ioargs.mode
            compression_args.setdefault("mode", ioargs.mode)
            if isinstance(handle, str):
                handle = _BytesTarFile(name=handle, **compression_args)
            else:
                # error: Argument "fileobj" to "_BytesTarFile" has incompatible
                # type "BaseBuffer"; expected "Union[ReadBuffer[bytes],
                # WriteBuffer[bytes], None]"
                handle = _BytesTarFile(
                    fileobj=handle,
                    **compression_args  # type: ignore[arg-type]
                )
            assert isinstance(handle, _BytesTarFile)
            # Same single-member rule as for ZIP archives above.
            if "r" in handle.buffer.mode:
                handles.append(handle)
                files = handle.buffer.getnames()
                if len(files) == 1:
                    file = handle.buffer.extractfile(files[0])
                    assert file is not None
                    handle = file
                elif not files:
                    raise ValueError(
                        f"Zero files found in TAR archive {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in TAR archive. "
                                     f"Only one file per TAR archive: {files}")

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file()(handle, ioargs.mode)

        # Zstd Compression
        elif compression == "zstd":
            zstd = import_optional_dependency("zstandard")
            # the remaining compression options configure the decompressor
            # when reading and the compressor otherwise
            if "r" in ioargs.mode:
                open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)}
            else:
                open_args = {"cctx": zstd.ZstdCompressor(**compression_args)}
            handle = zstd.open(
                handle,
                mode=ioargs.mode,
                **open_args,
            )

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
        # not added to handles as it does not open/buffer resources
        handle = _BytesIOWrapper(
            handle,
            encoding=ioargs.encoding,
        )
    elif is_text and (compression or memory_map
                      or _is_binary_mode(handle, ioargs.mode)):
        # handles lacking any of readable/writable/seekable are first wrapped
        # in _IOWrapper before being given to TextIOWrapper
        if (not hasattr(handle, "readable") or not hasattr(handle, "writable")
                or not hasattr(handle, "seekable")):
            handle = _IOWrapper(handle)
        # error: Argument 1 to "TextIOWrapper" has incompatible type
        # "_IOWrapper"; expected "IO[bytes]"
        handle = TextIOWrapper(
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (isinstance(ioargs.filepath_or_buffer, str)
                          or ioargs.should_close)

    if "r" in ioargs.mode and not hasattr(handle, "read"):
        raise TypeError("Expected file path name or file-like object, "
                        f"got {type(ioargs.filepath_or_buffer)} type")

    handles.reverse()  # close the most recently added buffer first
    # the buffer obtained from _get_filepath_or_buffer goes last in the list
    if ioargs.should_close:
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    return IOHandles(
        # error: Argument "handle" to "IOHandles" has incompatible type
        # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
        # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
        handle=handle,  # type: ignore[arg-type]
        # error: Argument "created_handles" to "IOHandles" has incompatible type
        # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
        created_handles=handles,  # type: ignore[arg-type]
        is_wrapped=is_wrapped,
        compression=ioargs.compression,
    )
示例#7
0
def get_handle(
    path_or_buf,
    mode: str,
    encoding=None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors=None,
):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.

        .. versionadded:: 1.1.0

    Returns
    -------
    f : file-like
        A file-like object.
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If the compression method is not recognized, or a ZIP archive opened
        in mode "r" does not contain exactly one file.
    """
    # s3fs is optional: its file class is only included when it can be imported
    need_text_wrapping: Tuple[Type["IOBase"], ...]
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
    except ImportError:
        need_text_wrapping = (BufferedIOBase, RawIOBase)
    # fsspec is an optional dependency. If it is available, add its file-object
    # class to the list of classes that need text wrapping. If fsspec is too old and is
    # needed, get_filepath_or_buffer would already have thrown an exception.
    # NOTE(review): the class imported here is named AbstractFileSystem --
    # confirm it is the intended file-object class.
    try:
        from fsspec.spec import AbstractFileSystem

        need_text_wrapping = (*need_text_wrapping, AbstractFileSystem)
    except ImportError:
        pass

    # everything opened below is collected here and returned to the caller
    handles: List[Union[IO, _MMapWrapper]] = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    compression, compression_args = get_compression_method(compression)
    # compression is only inferred for paths; buffers keep the given method
    if is_path:
        compression = infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        if compression == "gzip":
            # a path goes in as `filename`, an already-open buffer as `fileobj`
            if is_path:
                f = gzip.GzipFile(filename=path_or_buf,
                                  mode=mode,
                                  **compression_args)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf,
                                  mode=mode,
                                  **compression_args)

        # BZ Compression
        elif compression == "bz2":
            f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

        # ZIP Compression
        elif compression == "zip":
            zf = _BytesZipFile(path_or_buf, mode, **compression_args)
            # Ensure the container is closed as well.
            handles.append(zf)
            # NOTE(review): in write mode `f` is `zf` itself, so the
            # `handles.append(f)` after this chain registers it a second time.
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # XZ Compression
        elif compression == "xz":
            f = get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        is_binary_mode = "b" in mode

        if encoding and not is_binary_mode:
            # Encoding
            f = open(path_or_buf,
                     mode,
                     encoding=encoding,
                     errors=errors,
                     newline="")
        elif is_text and not is_binary_mode:
            # No explicit encoding
            # (this branch hard-codes errors="replace" rather than using the
            # `errors` argument)
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="")
        # the wrapper is only registered in `handles` when the wrapped object
        # is not a BufferedIOBase/RawIOBase instance
        if not isinstance(f, (BufferedIOBase, RawIOBase)):
            handles.append(g)
        f = g

    if memory_map and hasattr(f, "fileno"):
        try:
            # swap the plain handle for the mmap wrapper: close the original
            # and replace its entry in `handles`
            wrapped = _MMapWrapper(f)
            f.close()
            handles.remove(f)
            handles.append(wrapped)
            f = wrapped
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
示例#8
0
def get_handle(
    path_or_buf: FilePathOrBuffer,
    mode: str,
    encoding: Optional[str] = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: Optional[str] = None,
    storage_options: StorageOptions = None,
) -> IOHandles:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    .. versionchanged:: 1.2.0

    Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If ``errors`` is not one of the accepted handler names, if the
        compression method is not recognized, or if a ZIP archive opened in
        mode "r" does not contain exactly one file.
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate errors
    # (string values are lower-cased first, so the check is case-insensitive)
    if isinstance(errors, str):
        errors = errors.lower()
    if errors not in (
            None,
            "strict",
            "ignore",
            "replace",
            "xmlcharrefreplace",
            "backslashreplace",
            "namereplace",
            "surrogateescape",
            "surrogatepass",
    ):
        raise ValueError(
            f"Invalid value for `encoding_errors` ({errors}). Please see " +
            "https://docs.python.org/3/library/codecs.html#error-handlers " +
            "for valid values.")

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    # `handles` is bound by _maybe_memory_map below; every buffer created in
    # this function is appended to it and handed to IOHandles at the end.
    handles: List[Buffer]

    # memory mapping needs to be the first step
    handle, memory_map, handles = _maybe_memory_map(handle, memory_map,
                                                    ioargs.encoding,
                                                    ioargs.mode, errors)

    is_path = isinstance(handle, str)
    # work on a copy so that popping "method" leaves ioargs.compression intact
    # (it is passed on unchanged to IOHandles)
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    if compression:
        # compression libraries do not like an explicit text-mode
        ioargs.mode = ioargs.mode.replace("t", "")

        # GZ Compression
        if compression == "gzip":
            # a path goes in as `filename`, an already-open buffer as `fileobj`
            if is_path:
                assert isinstance(handle, str)
                handle = gzip.GzipFile(
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # error: Argument "fileobj" to "GzipFile" has incompatible type
                    # "Union[str, Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
                    # TextIOWrapper, mmap]]"; expected "Optional[IO[bytes]]"
                    fileobj=handle,  # type: ignore[arg-type]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                # Argument 1 to "BZ2File" has incompatible type "Union[str,
                # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper,
                # mmap]]"; expected "Union[Union[str, bytes, _PathLike[str],
                # _PathLike[bytes]], IO[bytes]]"
                handle,  # type: ignore[arg-type]
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            handle = _BytesZipFile(handle, ioargs.mode, **compression_args)
            # When reading, register the archive in `handles` and replace
            # `handle` with the single member it contains.
            if handle.mode == "r":
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) == 1:
                    handle = handle.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file(lzma)(handle, ioargs.mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
        handle = TextIOWrapper(
            # error: Argument 1 to "TextIOWrapper" has incompatible type
            # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
            # expected "IO[bytes]"
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (isinstance(ioargs.filepath_or_buffer, str)
                          or ioargs.should_close)

    handles.reverse()  # close the most recently added buffer first
    # the buffer obtained from _get_filepath_or_buffer goes last in the list
    if ioargs.should_close:
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    assert not isinstance(handle, str)
    return IOHandles(
        handle=handle,
        created_handles=handles,
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
        compression=ioargs.compression,
    )
示例#9
0
文件: common.py 项目: x997/pandas
def get_handle(
    path_or_buf: FilePathOrBuffer,
    mode: str,
    encoding: Optional[str] = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: Optional[str] = None,
) -> IOHandles:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use. ``None`` is replaced by ``"utf-8"``.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str or None, default None
        Specifies how encoding and decoding errors are to be handled.
        The value is forwarded unchanged to :func:`open` and
        ``TextIOWrapper``; see the errors argument for :func:`open` for a
        full list of options.

    Returns
    -------
    IOHandles
        Holds the final handle, the list of handles created by this function
        (most recently created first), whether a ``TextIOWrapper`` was put
        around a handle that was not a path string, and whether memory
        mapping is in use.

    Raises
    ------
    ValueError
        If the compression method is not recognized, or if a ZIP archive
        opened for reading contains zero or more than one file.

    .. versionchanged:: 1.2.0

    Returns the dataclass IOHandles
    """
    need_text_wrapping: Tuple[Type["IOBase"], ...]
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
    except ImportError:
        need_text_wrapping = (BufferedIOBase, RawIOBase)
    # fsspec is an optional dependency. If it is available, add its file-object
    # class to the list of classes that need text wrapping. If fsspec is too old and is
    # needed, get_filepath_or_buffer would already have thrown an exception.
    # NOTE(review): AbstractFileSystem is named like a filesystem class rather
    # than a file-object class -- confirm that this is the intended type.
    try:
        from fsspec.spec import AbstractFileSystem

        need_text_wrapping = (*need_text_wrapping, AbstractFileSystem)
    except ImportError:
        pass

    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    if encoding is None:
        encoding = "utf-8"

    # Convert pathlib.Path/py.path.local or string
    handle = stringify_path(path_or_buf)

    compression, compression_args = get_compression_method(compression)
    compression = infer_compression(handle, compression)

    # memory mapping needs to be the first step
    handle, memory_map, handles = _maybe_memory_map(
        handle, memory_map, encoding, mode, errors
    )

    # evaluated after memory mapping, which may already have replaced a path
    is_path = isinstance(handle, str)
    if compression:
        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                handle = gzip.GzipFile(filename=handle, mode=mode, **compression_args)
            else:
                handle = gzip.GzipFile(
                    fileobj=handle,  # type: ignore[arg-type]
                    mode=mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                handle, mode=mode, **compression_args  # type: ignore[arg-type]
            )

        # ZIP Compression
        elif compression == "zip":
            handle = _BytesZipFile(handle, mode, **compression_args)
            if handle.mode == "r":
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) != 1:
                    # No IOHandles object is returned on this path, so the
                    # caller cannot close anything: release the handles that
                    # were created here (newest first) before raising.
                    for created_handle in reversed(handles):
                        created_handle.close()
                    if not zip_names:
                        raise ValueError(
                            f"Zero files found in ZIP file {path_or_buf}"
                        )
                    raise ValueError(
                        "Multiple files found in ZIP file. "
                        f"Only one file per ZIP: {zip_names}"
                    )
                # read the single archive member instead of the archive itself
                handle = handle.open(zip_names.pop())

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file(lzma)(handle, mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif is_path:
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        assert isinstance(handle, str)
        if encoding and "b" not in mode:
            # Encoding
            handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
        else:
            # Binary mode
            handle = open(handle, mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if is_text and (
        compression
        or isinstance(handle, need_text_wrapping)
        or "b" in getattr(handle, "mode", "")
    ):
        handle = TextIOWrapper(
            handle,  # type: ignore[arg-type]
            encoding=encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # do not mark as wrapped when the user provided a string
        is_wrapped = not is_path

    handles.reverse()  # close the most recently added buffer first
    assert not isinstance(handle, str)
    return IOHandles(
        handle=handle,
        created_handles=handles,
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
    )
示例#10
0
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: str | None = None,
    storage_options: StorageOptions = None,
) -> IOHandles[str] | IOHandles[bytes]:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use. A falsy value is replaced by ``"utf-8"``.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : bool, default False
        See parsers._parser_params for more information.
    is_text : bool, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str or None, default None
        Specifies how encoding and decoding errors are to be handled.
        The name is validated with :func:`codecs.lookup_error` and then
        forwarded to :func:`open` and ``TextIOWrapper``; see the errors
        argument for :func:`open` for a full list of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    Returns
    -------
    IOHandles
        Holds the final handle, the list of handles created by this function
        (most recently created first, followed by the buffer opened by
        ``_get_filepath_or_buffer`` when it is flagged ``should_close``),
        the ``is_wrapped`` and ``is_mmap`` flags, and the compression options.

    Raises
    ------
    ValueError
        If the compression method is not recognized, or if a ZIP archive
        opened for reading contains zero or more than one file.
    TypeError
        If the mode contains "r" and the resulting handle has no ``read``
        attribute.
    LookupError
        Raised by :func:`codecs.lookup` / :func:`codecs.lookup_error` when
        `encoding` or `errors` is not a known name.

    .. versionchanged:: 1.2.0

    Returns the dataclass IOHandles
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate encoding and errors
    if isinstance(encoding, str):
        codecs.lookup(encoding)
    if isinstance(errors, str):
        codecs.lookup_error(errors)

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    handles: list[BaseBuffer]

    # memory mapping needs to be the first step
    handle, memory_map, handles = _maybe_memory_map(
        handle,
        memory_map,
        ioargs.encoding,
        ioargs.mode,
        errors,
        ioargs.compression["method"] not in _compression_to_extension,
    )

    # evaluated after memory mapping, which may already have replaced a path
    is_path = isinstance(handle, str)
    # copy so that popping "method" does not mutate ioargs.compression
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    # Only for write methods
    if "r" not in mode and is_path:
        check_parent_directory(str(handle))

    if compression:
        # compression libraries do not like an explicit text-mode
        ioargs.mode = ioargs.mode.replace("t", "")

        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                # error: Incompatible types in assignment (expression has type
                # "GzipFile", variable has type "Union[str, BaseBuffer]")
                handle = gzip.GzipFile(  # type: ignore[assignment]
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # No overload variant of "GzipFile" matches argument types
                    # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
                    fileobj=handle,  # type: ignore[call-overload]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                # Argument 1 to "BZ2File" has incompatible type "Union[str,
                # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper,
                # mmap]]"; expected "Union[Union[str, bytes, _PathLike[str],
                # _PathLike[bytes]], IO[bytes]]"
                handle,  # type: ignore[arg-type]
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            # error: Argument 1 to "_BytesZipFile" has incompatible type "Union[str,
            # BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
            # ReadBuffer[bytes], WriteBuffer[bytes]]"
            handle = _BytesZipFile(
                handle,
                ioargs.mode,
                **compression_args  # type: ignore[arg-type]
            )
            if handle.mode == "r":
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) != 1:
                    # No IOHandles object is returned on this path, so the
                    # caller cannot close anything: release the handles that
                    # were created here (newest first) before raising.
                    for created_handle in reversed(handles):
                        # error: "BaseBuffer" has no attribute "close"
                        created_handle.close()  # type: ignore[attr-defined]
                    if not zip_names:
                        raise ValueError(
                            f"Zero files found in ZIP file {path_or_buf}")
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")
                # read the single archive member instead of the archive itself
                handle = handle.open(zip_names.pop())

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file()(handle, ioargs.mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
        handle = BytesIOWrapper(
            handle,
            encoding=ioargs.encoding,
        )
        handles.append(handle)
        # the (text) handle is always provided by the caller
        # since get_handle would have opened it in binary mode
        is_wrapped = True
    elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
        handle = TextIOWrapper(
            # error: Argument 1 to "TextIOWrapper" has incompatible type
            # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
            # expected "IO[bytes]"
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (isinstance(ioargs.filepath_or_buffer, str)
                          or ioargs.should_close)

    # NOTE: handles are intentionally not closed here; a wrapper added above
    # may sit on a caller-owned buffer, and closing it would close that too.
    if "r" in ioargs.mode and not hasattr(handle, "read"):
        raise TypeError("Expected file path name or file-like object, "
                        f"got {type(ioargs.filepath_or_buffer)} type")

    handles.reverse()  # close the most recently added buffer first
    if ioargs.should_close:
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    return IOHandles(
        # error: Argument "handle" to "IOHandles" has incompatible type
        # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
        # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
        handle=handle,  # type: ignore[arg-type]
        # error: Argument "created_handles" to "IOHandles" has incompatible type
        # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
        created_handles=handles,  # type: ignore[arg-type]
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
        compression=ioargs.compression,
    )