def compress_file(self, src_path, dest_path, compression):
    """
    Copy ``src_path`` to ``dest_path``, compressing the content on the way.

    Parameters
    ----------
    src_path : str
        Path of the existing file to read.
    dest_path : str
        Path of the file to create.
    compression : {None, 'gzip', 'bz2', 'zip', 'tar', 'xz', 'zstd'}
        Compression/container format. ``None`` performs a plain copy. For
        'zip' and 'tar' the source is stored as a single member named after
        the basename of ``src_path``.

    Raises
    ------
    ValueError
        If ``compression`` is not one of the values listed above.
    """
    if compression is None:
        shutil.copyfile(src_path, dest_path)
        return

    # Archive formats add the file as a named member instead of streaming
    # bytes into a compressor.
    if compression == "zip":
        with zipfile.ZipFile(
            dest_path, "w", compression=zipfile.ZIP_DEFLATED
        ) as archive:
            archive.write(src_path, os.path.basename(src_path))
        return

    if compression == "tar":
        with open(src_path, "rb") as fh, tarfile.open(dest_path, mode="w") as tar:
            tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path))
            tar.addfile(tarinfo, fh)
        return

    # Stream compressors: only pick the opener here. The destination is
    # opened later, together with the source, so that a failure to open the
    # source neither leaks a handle nor leaves an empty output file behind.
    if compression == "gzip":
        opener, mode = gzip.open, "w"
    elif compression == "bz2":
        opener, mode = bz2.BZ2File, "w"
    elif compression == "xz":
        opener, mode = get_lzma_file(), "w"
    elif compression == "zstd":
        opener, mode = import_optional_dependency("zstandard").open, "wb"
    else:
        msg = f"Unrecognized compression type: {compression}"
        raise ValueError(msg)

    with open(src_path, "rb") as fh, opener(dest_path, mode) as f:
        # Copy in chunks rather than loading the whole file into memory.
        shutil.copyfileobj(fh, f)
def write_to_compressed(compression, path, data, dest="test"):
    """
    Compress ``data`` and store the result at ``path``.

    Parameters
    ----------
    compression : {'gzip', 'bz2', 'zip', 'tar', 'xz', 'zstd'}
        The compression (or archive) format to produce.
    path : str
        Location of the file to create.
    data : bytes
        The payload to write. It is handed unchanged to the writer; the
        'tar' path additionally wraps it in ``io.BytesIO`` and uses
        ``len(data)`` as the member size.
    dest : str, default "test"
        Name of the member inside the archive (used for 'zip' and 'tar').

    Raises
    ------
    ValueError : An invalid compression value was passed in.
    """
    # Defaults describe a plain stream compressor: open binary, call write().
    open_mode = "wb"
    writer_name = "write"
    writer_args: tuple[Any, ...] = (data,)
    opener: Callable

    if compression == "gzip":
        opener = gzip.GzipFile
    elif compression == "bz2":
        opener = bz2.BZ2File
    elif compression == "xz":
        opener = get_lzma_file()
    elif compression == "zstd":
        opener = import_optional_dependency("zstandard").open
    elif compression == "zip":
        opener, open_mode = zipfile.ZipFile, "w"
        writer_name, writer_args = "writestr", (dest, data)
    elif compression == "tar":
        opener, open_mode = tarfile.TarFile, "w"
        member = tarfile.TarInfo(name=dest)
        payload = io.BytesIO(data)
        member.size = len(data)
        writer_name, writer_args = "addfile", (member, payload)
    else:
        raise ValueError(f"Unrecognized compression type: {compression}")

    with opener(path, mode=open_mode) as archive:
        write = getattr(archive, writer_name)
        write(*writer_args)
def decompress_file(path, compression):
    """
    Open a compressed file and yield a binary file object for its content.

    This is a generator: it yields exactly one file object and closes it
    (and, for ZIP input, the enclosing archive) when the generator is
    resumed or closed.

    Parameters
    ----------
    path : str
        The path where the file is read from.

    compression : {'gzip', 'bz2', 'zip', 'xz', None}
        Name of the decompression to use

    Yields
    ------
    file object

    Raises
    ------
    ValueError
        If ``compression`` is not recognized, or if a ZIP archive does not
        contain exactly one member.
    """
    # Only set for ZIP input; the archive must outlive the member handle.
    zip_file = None

    if compression is None:
        f = open(path, "rb")
    elif compression == "gzip":
        f = gzip.open(path, "rb")  # type: ignore[assignment]
    elif compression == "bz2":
        f = bz2.BZ2File(path, "rb")  # type: ignore[assignment]
    elif compression == "xz":
        f = get_lzma_file(lzma)(path, "rb")
    elif compression == "zip":
        zip_file = zipfile.ZipFile(path)
        try:
            zip_names = zip_file.namelist()
            if len(zip_names) != 1:
                raise ValueError(f"ZIP file {path} error. Only one file per ZIP.")
            f = zip_file.open(zip_names.pop())  # type: ignore[assignment]
        except BaseException:
            # Do not leave the archive open when no member handle is
            # handed out; the exception is re-raised unchanged.
            zip_file.close()
            raise
    else:
        raise ValueError(f"Unrecognized compression type: {compression}")

    try:
        yield f
    finally:
        try:
            f.close()
        finally:
            # Close the archive even if closing the member handle failed.
            if zip_file is not None:
                zip_file.close()
def compress_file(self, src_path, dest_path, compression):
    """
    Copy ``src_path`` to ``dest_path``, compressing the content on the way.

    Parameters
    ----------
    src_path : str
        Path of the existing file to read.
    dest_path : str
        Path of the file to create.
    compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
        Compression/container format. ``None`` performs a plain copy. For
        'zip' the source is stored as a single member named after the
        basename of ``src_path``.

    Raises
    ------
    ValueError
        If ``compression`` is not one of the values listed above.
    """
    if compression is None:
        shutil.copyfile(src_path, dest_path)
        return

    # ZIP stores the file as a named member instead of streaming bytes.
    if compression == "zip":
        with zipfile.ZipFile(
            dest_path, "w", compression=zipfile.ZIP_DEFLATED
        ) as archive:
            archive.write(src_path, os.path.basename(src_path))
        return

    # Only pick the opener here. The destination is opened later, together
    # with the source, so that a failure to open the source neither leaks a
    # handle nor leaves an empty output file behind.
    if compression == "gzip":
        opener = gzip.open
    elif compression == "bz2":
        opener = bz2.BZ2File
    elif compression == "xz":
        opener = get_lzma_file(lzma)
    else:
        msg = f"Unrecognized compression type: {compression}"
        raise ValueError(msg)

    with open(src_path, "rb") as fh, opener(dest_path, "w") as f:
        # Copy in chunks rather than loading the whole file into memory.
        shutil.copyfileobj(fh, f)
def write_to_compressed(compression, path, data, dest="test"):
    """
    Compress ``data`` and store the result at ``path``.

    Parameters
    ----------
    compression : {'gzip', 'bz2', 'zip', 'xz'}
        The compression (or archive) format to produce.
    path : str
        Location of the file to create.
    data : bytes
        The payload to write; it is handed unchanged to the writer.
    dest : str, default "test"
        Name of the member inside the archive (for ZIP only).

    Raises
    ------
    ValueError : An invalid compression value was passed in.
    """
    # Defaults describe a plain stream compressor: open binary, call write().
    open_mode = "wb"
    writer_name = "write"
    writer_args: tuple[Any, ...] = (data,)
    opener: Callable

    if compression == "gzip":
        opener = gzip.GzipFile
    elif compression == "bz2":
        opener = bz2.BZ2File
    elif compression == "xz":
        opener = get_lzma_file(lzma)
    elif compression == "zip":
        # ZIP is an archive: text-style mode and a (name, payload) writer.
        opener, open_mode = zipfile.ZipFile, "w"
        writer_name, writer_args = "writestr", (dest, data)
    else:
        raise ValueError(f"Unrecognized compression type: {compression}")

    with opener(path, mode=open_mode) as archive:
        write = getattr(archive, writer_name)
        write(*writer_args)
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: str | None = None,
    storage_options: StorageOptions = None,
) -> IOHandles[str] | IOHandles[bytes]:
    """
    Get file handle for given path/buffer and mode.

    The function resolves the input via ``_get_filepath_or_buffer``,
    optionally memory-maps it, layers a decompressor/compressor on top when a
    compression method is set, opens plain string paths, and finally wraps
    binary handles in a text layer when ``is_text`` is true. Every handle it
    creates along the way is recorded so it can be closed later.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    {compression_options}

        .. versionchanged:: 1.0.0
           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0
           Passing compression options as keys in dict is now
           supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'.

        .. versionchanged:: 1.4.0 Zstandard support.

    memory_map : bool, default False
        See parsers._parser_params for more information. Only used by read_csv.
    is_text : bool, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    Returns
    -------
    IOHandles
        Dataclass holding the final ``handle``, the handles created by this
        function (``created_handles``, ordered so the most recently added one
        is closed first), ``is_wrapped`` and the resolved ``compression``.

        .. versionchanged:: 1.2.0
           Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If a ZIP or TAR archive opened for reading holds zero or several
        members, or if the compression method is not recognized.
    TypeError
        If reading is requested but the resulting handle has no ``read``
        attribute.
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    errors = errors or "strict"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate encoding and errors
    # (the codecs lookups raise for unknown names before anything is opened)
    codecs.lookup(encoding)
    if isinstance(errors, str):
        codecs.lookup_error(errors)

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    handles: list[BaseBuffer]

    # memory mapping needs to be the first step
    # only used for read_csv
    handle, memory_map, handles = _maybe_memory_map(handle, memory_map)

    is_path = isinstance(handle, str)
    # Copy so that popping "method" does not mutate ioargs.compression,
    # which is returned to the caller unchanged below.
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    # Only for write methods
    if "r" not in mode and is_path:
        check_parent_directory(str(handle))

    if compression:
        if compression != "zstd":
            # compression libraries do not like an explicit text-mode
            ioargs.mode = ioargs.mode.replace("t", "")
        elif compression == "zstd" and "b" not in ioargs.mode:
            # python-zstandard defaults to text mode, but we always expect
            # compression libraries to use binary mode.
            ioargs.mode += "b"

        # GZ Compression
        if compression == "gzip":
            if isinstance(handle, str):
                # error: Incompatible types in assignment (expression has type
                # "GzipFile", variable has type "Union[str, BaseBuffer]")
                handle = gzip.GzipFile(  # type: ignore[assignment]
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # No overload variant of "GzipFile" matches argument types
                    # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
                    fileobj=handle,  # type: ignore[call-overload]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            # No overload variant of "BZ2File" matches argument types
            # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
            handle = bz2.BZ2File(  # type: ignore[call-overload]
                handle,
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            # error: Argument 1 to "_BytesZipFile" has incompatible type
            # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
            # ReadBuffer[bytes], WriteBuffer[bytes]]"
            handle = _BytesZipFile(
                handle, ioargs.mode, **compression_args  # type: ignore[arg-type]
            )
            if handle.buffer.mode == "r":
                # Reading: keep the archive in `handles` so it gets closed,
                # then replace `handle` with the single member it contains.
                handles.append(handle)
                zip_names = handle.buffer.namelist()
                if len(zip_names) == 1:
                    handle = handle.buffer.open(zip_names.pop())
                elif not zip_names:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # TAR Encoding
        elif compression == "tar":
            # A caller-supplied "mode" in the compression options wins over
            # the mode derived above.
            compression_args.setdefault("mode", ioargs.mode)
            if isinstance(handle, str):
                handle = _BytesTarFile(name=handle, **compression_args)
            else:
                # error: Argument "fileobj" to "_BytesTarFile" has incompatible
                # type "BaseBuffer"; expected "Union[ReadBuffer[bytes],
                # WriteBuffer[bytes], None]"
                handle = _BytesTarFile(
                    fileobj=handle, **compression_args  # type: ignore[arg-type]
                )
            assert isinstance(handle, _BytesTarFile)
            if "r" in handle.buffer.mode:
                # Same single-member rule as for ZIP archives.
                handles.append(handle)
                files = handle.buffer.getnames()
                if len(files) == 1:
                    file = handle.buffer.extractfile(files[0])
                    assert file is not None
                    handle = file
                elif not files:
                    raise ValueError(
                        f"Zero files found in TAR archive {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in TAR archive. "
                                     f"Only one file per TAR archive: {files}")

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file()(handle, ioargs.mode)

        # Zstd Compression
        elif compression == "zstd":
            zstd = import_optional_dependency("zstandard")
            # Remaining compression options configure the (de)compressor
            # context that is handed to zstd.open.
            if "r" in ioargs.mode:
                open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)}
            else:
                open_args = {"cctx": zstd.ZstdCompressor(**compression_args)}
            handle = zstd.open(
                handle,
                mode=ioargs.mode,
                **open_args,
            )

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
        # not added to handles as it does not open/buffer resources
        handle = _BytesIOWrapper(
            handle,
            encoding=ioargs.encoding,
        )
    elif is_text and (
        compression or memory_map or _is_binary_mode(handle, ioargs.mode)
    ):
        # NOTE(review): _IOWrapper presumably supplies the readable/writable/
        # seekable methods that TextIOWrapper expects -- confirm against its
        # definition.
        if (
            not hasattr(handle, "readable")
            or not hasattr(handle, "writable")
            or not hasattr(handle, "seekable")
        ):
            handle = _IOWrapper(handle)
        # error: Argument 1 to "TextIOWrapper" has incompatible type
        # "_IOWrapper"; expected "IO[bytes]"
        handle = TextIOWrapper(
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (
            isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
        )

    if "r" in ioargs.mode and not hasattr(handle, "read"):
        raise TypeError("Expected file path name or file-like object, "
                        f"got {type(ioargs.filepath_or_buffer)} type")

    handles.reverse()  # close the most recently added buffer first
    if ioargs.should_close:
        # The buffer produced by _get_filepath_or_buffer is appended last,
        # i.e. after every handle layered on top of it.
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    return IOHandles(
        # error: Argument "handle" to "IOHandles" has incompatible type
        # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
        # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
        handle=handle,  # type: ignore[arg-type]
        # error: Argument "created_handles" to "IOHandles" has incompatible type
        # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
        created_handles=handles,  # type: ignore[arg-type]
        is_wrapped=is_wrapped,
        compression=ioargs.compression,
    )
def get_handle(
    path_or_buf,
    mode: str,
    encoding=None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors=None,
):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.

        .. versionadded:: 1.1.0

    Returns
    -------
    f : file-like
        A file-like object.
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If a ZIP archive opened for reading holds zero or several members,
        or if the compression method is not recognized.
    """
    need_text_wrapping: Tuple[Type["IOBase"], ...]
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
    except ImportError:
        need_text_wrapping = (BufferedIOBase, RawIOBase)
    # fsspec is an optional dependency. If it is available, add its file-object
    # class to the list of classes that need text wrapping. If fsspec is too old and is
    # needed, get_filepath_or_buffer would already have thrown an exception.
    # NOTE(review): the comment above speaks of a file-object class, but the
    # name imported here is AbstractFileSystem -- confirm this is intended.
    try:
        from fsspec.spec import AbstractFileSystem

        need_text_wrapping = (*need_text_wrapping, AbstractFileSystem)
    except ImportError:
        pass

    handles: List[Union[IO, _MMapWrapper]] = list()
    # `f` tracks the handle that is eventually returned; it starts out as the
    # caller's object and is replaced as layers are added.
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    compression, compression_args = get_compression_method(compression)
    # Inference is only attempted for string paths.
    if is_path:
        compression = infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)

        # BZ Compression
        elif compression == "bz2":
            f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

        # ZIP Compression
        elif compression == "zip":
            zf = _BytesZipFile(path_or_buf, mode, **compression_args)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                # In write mode the archive itself is the handle; note that it
                # is appended to `handles` a second time further down.
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # XZ Compression
        elif compression == "xz":
            f = get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        is_binary_mode = "b" in mode

        if encoding and not is_binary_mode:
            # Encoding
            f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
        elif is_text and not is_binary_mode:
            # No explicit encoding
            # (this branch ignores the `errors` argument and uses "replace")
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="")
        # The wrapper is only recorded in `handles` when the wrapped object is
        # not a BufferedIOBase/RawIOBase instance.
        if not isinstance(f, (BufferedIOBase, RawIOBase)):
            handles.append(g)
        f = g

    if memory_map and hasattr(f, "fileno"):
        try:
            wrapped = _MMapWrapper(f)
            f.close()
            # NOTE(review): handles.remove(f) raises ValueError when `f` was
            # never recorded; that is swallowed below, but `f` has already
            # been closed at that point -- verify this is acceptable.
            handles.remove(f)
            handles.append(wrapped)
            f = wrapped
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
def get_handle(
    path_or_buf: FilePathOrBuffer,
    mode: str,
    encoding: Optional[str] = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: Optional[str] = None,
    storage_options: StorageOptions = None,
) -> IOHandles:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options. String values are lower-cased before validation.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    Returns
    -------
    IOHandles
        Dataclass holding the final ``handle``, the handles created by this
        function (``created_handles``, ordered so the most recently added one
        is closed first), ``is_wrapped``, ``is_mmap`` and the resolved
        ``compression``.

        .. versionchanged:: 1.2.0
           Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If ``errors`` is not one of the accepted error-handler names, if a
        ZIP archive opened for reading holds zero or several members, or if
        the compression method is not recognized.
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate errors
    if isinstance(errors, str):
        errors = errors.lower()
    if errors not in (
        None,
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace",
        "namereplace",
        "surrogateescape",
        "surrogatepass",
    ):
        raise ValueError(
            f"Invalid value for `encoding_errors` ({errors}). Please see "
            + "https://docs.python.org/3/library/codecs.html#error-handlers "
            + "for valid values.")

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    handles: List[Buffer]

    # memory mapping needs to be the first step
    handle, memory_map, handles = _maybe_memory_map(
        handle, memory_map, ioargs.encoding, ioargs.mode, errors)

    is_path = isinstance(handle, str)
    # Copy so that popping "method" does not mutate ioargs.compression,
    # which is returned to the caller unchanged below.
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    if compression:
        # compression libraries do not like an explicit text-mode
        ioargs.mode = ioargs.mode.replace("t", "")

        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                handle = gzip.GzipFile(
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # error: Argument "fileobj" to "GzipFile" has incompatible type
                    # "Union[str, Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
                    # TextIOWrapper, mmap]]"; expected "Optional[IO[bytes]]"
                    fileobj=handle,  # type: ignore[arg-type]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                # Argument 1 to "BZ2File" has incompatible type "Union[str,
                # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper,
                # mmap]]"; expected "Union[Union[str, bytes, _PathLike[str],
                # _PathLike[bytes]], IO[bytes]]"
                handle,  # type: ignore[arg-type]
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            handle = _BytesZipFile(handle, ioargs.mode, **compression_args)
            if handle.mode == "r":
                # Reading: keep the archive in `handles` so it gets closed,
                # then replace `handle` with the single member it contains.
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) == 1:
                    handle = handle.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file(lzma)(handle, ioargs.mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
        handle = TextIOWrapper(
            # error: Argument 1 to "TextIOWrapper" has incompatible type
            # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
            # expected "IO[bytes]"
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (
            isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
        )

    handles.reverse()  # close the most recently added buffer first
    if ioargs.should_close:
        # The buffer produced by _get_filepath_or_buffer is appended last,
        # i.e. after every handle layered on top of it.
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    assert not isinstance(handle, str)
    return IOHandles(
        handle=handle,
        created_handles=handles,
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
        compression=ioargs.compression,
    )
def get_handle(
    path_or_buf: FilePathOrBuffer,
    mode: str,
    encoding: Optional[str] = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: Optional[str] = None,
) -> IOHandles:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.

    Returns
    -------
    IOHandles
        Dataclass holding the final ``handle``, the handles created by this
        function (``created_handles``, ordered so the most recently added one
        is closed first), ``is_wrapped`` and ``is_mmap``.

        .. versionchanged:: 1.2.0
           Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If a ZIP archive opened for reading holds zero or several members,
        or if the compression method is not recognized.
    """
    need_text_wrapping: Tuple[Type["IOBase"], ...]
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
    except ImportError:
        need_text_wrapping = (BufferedIOBase, RawIOBase)
    # fsspec is an optional dependency. If it is available, add its file-object
    # class to the list of classes that need text wrapping. If fsspec is too old and is
    # needed, get_filepath_or_buffer would already have thrown an exception.
    # NOTE(review): the comment above speaks of a file-object class, but the
    # name imported here is AbstractFileSystem -- confirm this is intended.
    try:
        from fsspec.spec import AbstractFileSystem

        need_text_wrapping = (*need_text_wrapping, AbstractFileSystem)
    except ImportError:
        pass

    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    if encoding is None:
        encoding = "utf-8"

    # Convert pathlib.Path/py.path.local or string
    handle = stringify_path(path_or_buf)

    compression, compression_args = get_compression_method(compression)
    compression = infer_compression(handle, compression)

    # memory mapping needs to be the first step
    handle, memory_map, handles = _maybe_memory_map(
        handle, memory_map, encoding, mode, errors
    )

    is_path = isinstance(handle, str)
    if compression:
        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                handle = gzip.GzipFile(filename=handle, mode=mode, **compression_args)
            else:
                handle = gzip.GzipFile(
                    fileobj=handle,  # type: ignore[arg-type]
                    mode=mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                handle, mode=mode, **compression_args  # type: ignore[arg-type]
            )

        # ZIP Compression
        elif compression == "zip":
            handle = _BytesZipFile(handle, mode, **compression_args)
            if handle.mode == "r":
                # Reading: keep the archive in `handles` so it gets closed,
                # then replace `handle` with the single member it contains.
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) == 1:
                    handle = handle.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError(
                        "Multiple files found in ZIP file. "
                        f"Only one file per ZIP: {zip_names}"
                    )

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file(lzma)(handle, mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif is_path:
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        assert isinstance(handle, str)
        if encoding and "b" not in mode:
            # Encoding
            handle = open(handle, mode, encoding=encoding, errors=errors, newline="")
        else:
            # Binary mode
            handle = open(handle, mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    # A text layer is added for compressed streams, for instances of the
    # need_text_wrapping classes, and for handles whose `mode` attribute
    # contains "b".
    if is_text and (
        compression
        or isinstance(handle, need_text_wrapping)
        or "b" in getattr(handle, "mode", "")
    ):
        handle = TextIOWrapper(
            handle,  # type: ignore[arg-type]
            encoding=encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # do not mark as wrapped when the user provided a string
        is_wrapped = not is_path

    handles.reverse()  # close the most recently added buffer first
    assert not isinstance(handle, str)
    return IOHandles(
        handle=handle,
        created_handles=handles,
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
    )
def get_handle(
    path_or_buf: FilePath | BaseBuffer,
    mode: str,
    *,
    encoding: str | None = None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors: str | None = None,
    storage_options: StorageOptions = None,
) -> IOHandles[str] | IOHandles[bytes]:
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `filepath_or_buffer` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : bool, default False
        See parsers._parser_params for more information.
    is_text : bool, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    storage_options: StorageOptions = None
        Passed to _get_filepath_or_buffer

    Returns
    -------
    IOHandles
        Dataclass holding the final ``handle``, the handles created by this
        function (``created_handles``, ordered so the most recently added one
        is closed first), ``is_wrapped``, ``is_mmap`` and the resolved
        ``compression``.

        .. versionchanged:: 1.2.0
           Returns the dataclass IOHandles

    Raises
    ------
    ValueError
        If a ZIP archive opened for reading holds zero or several members,
        or if the compression method is not recognized.
    TypeError
        If reading is requested but the resulting handle has no ``read``
        attribute.
    """
    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
    encoding = encoding or "utf-8"

    # read_csv does not know whether the buffer is opened in binary/text mode
    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
        mode += "b"

    # validate encoding and errors
    # (the codecs lookups raise for unknown names before anything is opened)
    if isinstance(encoding, str):
        codecs.lookup(encoding)
    if isinstance(errors, str):
        codecs.lookup_error(errors)

    # open URLs
    ioargs = _get_filepath_or_buffer(
        path_or_buf,
        encoding=encoding,
        compression=compression,
        mode=mode,
        storage_options=storage_options,
    )

    handle = ioargs.filepath_or_buffer
    handles: list[BaseBuffer]

    # memory mapping needs to be the first step
    # NOTE(review): the last argument is True when the compression method is
    # not a key of _compression_to_extension; its meaning is defined by
    # _maybe_memory_map -- confirm there.
    handle, memory_map, handles = _maybe_memory_map(
        handle,
        memory_map,
        ioargs.encoding,
        ioargs.mode,
        errors,
        ioargs.compression["method"] not in _compression_to_extension,
    )

    is_path = isinstance(handle, str)
    # Copy so that popping "method" does not mutate ioargs.compression,
    # which is returned to the caller unchanged below.
    compression_args = dict(ioargs.compression)
    compression = compression_args.pop("method")

    # Only for write methods
    if "r" not in mode and is_path:
        check_parent_directory(str(handle))

    if compression:
        # compression libraries do not like an explicit text-mode
        ioargs.mode = ioargs.mode.replace("t", "")

        # GZ Compression
        if compression == "gzip":
            if is_path:
                assert isinstance(handle, str)
                # error: Incompatible types in assignment (expression has type
                # "GzipFile", variable has type "Union[str, BaseBuffer]")
                handle = gzip.GzipFile(  # type: ignore[assignment]
                    filename=handle,
                    mode=ioargs.mode,
                    **compression_args,
                )
            else:
                handle = gzip.GzipFile(
                    # No overload variant of "GzipFile" matches argument types
                    # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
                    fileobj=handle,  # type: ignore[call-overload]
                    mode=ioargs.mode,
                    **compression_args,
                )

        # BZ Compression
        elif compression == "bz2":
            handle = bz2.BZ2File(
                # Argument 1 to "BZ2File" has incompatible type "Union[str,
                # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper,
                # mmap]]"; expected "Union[Union[str, bytes, _PathLike[str],
                # _PathLike[bytes]], IO[bytes]]"
                handle,  # type: ignore[arg-type]
                mode=ioargs.mode,
                **compression_args,
            )

        # ZIP Compression
        elif compression == "zip":
            # error: Argument 1 to "_BytesZipFile" has incompatible type "Union[str,
            # BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
            # ReadBuffer[bytes], WriteBuffer[bytes]]"
            handle = _BytesZipFile(
                handle, ioargs.mode, **compression_args  # type: ignore[arg-type]
            )
            if handle.mode == "r":
                # Reading: keep the archive in `handles` so it gets closed,
                # then replace `handle` with the single member it contains.
                handles.append(handle)
                zip_names = handle.namelist()
                if len(zip_names) == 1:
                    handle = handle.open(zip_names.pop())
                elif len(zip_names) == 0:
                    raise ValueError(
                        f"Zero files found in ZIP file {path_or_buf}")
                else:
                    raise ValueError("Multiple files found in ZIP file. "
                                     f"Only one file per ZIP: {zip_names}")

        # XZ Compression
        elif compression == "xz":
            handle = get_lzma_file()(handle, ioargs.mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        assert not isinstance(handle, str)
        handles.append(handle)

    elif isinstance(handle, str):
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        if ioargs.encoding and "b" not in ioargs.mode:
            # Encoding
            handle = open(
                handle,
                ioargs.mode,
                encoding=ioargs.encoding,
                errors=errors,
                newline="",
            )
        else:
            # Binary mode
            handle = open(handle, ioargs.mode)
        handles.append(handle)

    # Convert BytesIO or file objects passed with an encoding
    is_wrapped = False
    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
        # Bytes were requested but the handle is a text stream: wrap it.
        handle = BytesIOWrapper(
            handle,
            encoding=ioargs.encoding,
        )
        handles.append(handle)
        # the (text) handle is always provided by the caller
        # since get_handle would have opened it in binary mode
        is_wrapped = True
    elif is_text and (compression or _is_binary_mode(handle, ioargs.mode)):
        handle = TextIOWrapper(
            # error: Argument 1 to "TextIOWrapper" has incompatible type
            # "Union[IO[bytes], IO[Any], RawIOBase, BufferedIOBase, TextIOBase, mmap]";
            # expected "IO[bytes]"
            handle,  # type: ignore[arg-type]
            encoding=ioargs.encoding,
            errors=errors,
            newline="",
        )
        handles.append(handle)
        # only marked as wrapped when the caller provided a handle
        is_wrapped = not (
            isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
        )

    if "r" in ioargs.mode and not hasattr(handle, "read"):
        raise TypeError("Expected file path name or file-like object, "
                        f"got {type(ioargs.filepath_or_buffer)} type")

    handles.reverse()  # close the most recently added buffer first
    if ioargs.should_close:
        # The buffer produced by _get_filepath_or_buffer is appended last,
        # i.e. after every handle layered on top of it.
        assert not isinstance(ioargs.filepath_or_buffer, str)
        handles.append(ioargs.filepath_or_buffer)

    return IOHandles(
        # error: Argument "handle" to "IOHandles" has incompatible type
        # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
        # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
        handle=handle,  # type: ignore[arg-type]
        # error: Argument "created_handles" to "IOHandles" has incompatible type
        # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
        created_handles=handles,  # type: ignore[arg-type]
        is_wrapped=is_wrapped,
        is_mmap=memory_map,
        compression=ioargs.compression,
    )