def compress_file(self, src_path, dest_path, compression):
    """
    Copy ``src_path`` to ``dest_path``, compressing the content on the way.

    Parameters
    ----------
    src_path : str
        Path of the uncompressed source file.
    dest_path : str
        Path the (compressed) copy is written to.
    compression : {'gzip', 'bz2', 'zip', 'xz', None}
        Compression to apply. ``None`` produces a plain copy.

    Raises
    ------
    ValueError
        If ``compression`` is not one of the recognized values.
    """
    if compression is None:
        shutil.copyfile(src_path, dest_path)
        return

    if compression == "zip":
        # ZipFile stores the source as a named member; the archive itself is
        # closed by the context manager.
        with zipfile.ZipFile(
            dest_path, "w", compression=zipfile.ZIP_DEFLATED
        ) as archive:
            archive.write(src_path, os.path.basename(src_path))
        return

    if compression == "gzip":
        opener = gzip.open
    elif compression == "bz2":
        opener = bz2.BZ2File
    elif compression == "xz":
        opener = _get_lzma_file(lzma)
    else:
        msg = "Unrecognized compression type: {}".format(compression)
        raise ValueError(msg)

    # Open the source first: if it cannot be read, no destination file is
    # created and no compressed handle is left dangling. The payload is
    # streamed in chunks rather than loaded into memory in one piece.
    with open(src_path, "rb") as fh, opener(dest_path, "w") as f:
        shutil.copyfileobj(fh, f)
def get_handle(
    path_or_buf,
    mode: str,
    encoding=None,
    compression: CompressionOptions = None,
    memory_map: bool = False,
    is_text: bool = True,
    errors=None,
):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `path_or_buf` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is one of
        {'zip', 'gzip', 'bz2'}, or inferred as one of the above,
        other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

        .. versionchanged:: 1.1.0

           Passing compression options as keys in dict is now
           supported for compression modes 'gzip' and 'bz2' as well as 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        Whether the type of the content passed to the file/buffer is string or
        bytes. This is not the same as `"b" not in mode`. If a string content is
        passed to a binary file/buffer, a wrapper is inserted.
    errors : str or None, default None
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options. ``None`` selects Python's default handling ('strict').

        .. versionadded:: 1.1.0

    Returns
    -------
    f : file-like
        A file-like object.
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If the compression type is not recognized, or if a ZIP archive opened
        for reading does not contain exactly one file.
    """
    need_text_wrapping: Tuple[Type["IOBase"], ...]
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, RawIOBase, S3File)
    except ImportError:
        need_text_wrapping = (BufferedIOBase, RawIOBase)
    # fsspec is an optional dependency. If it is available, add its file-object
    # class to the list of classes that need text wrapping. If fsspec is too old and is
    # needed, get_filepath_or_buffer would already have thrown an exception.
    try:
        from fsspec.spec import AbstractFileSystem

        need_text_wrapping = (*need_text_wrapping, AbstractFileSystem)
    except ImportError:
        pass

    handles: List[Union[IO, _MMapWrapper]] = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    compression, compression_args = get_compression_method(compression)
    if is_path:
        compression = infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        if compression == "gzip":
            if is_path:
                f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)

        # BZ Compression
        elif compression == "bz2":
            f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)

        # ZIP Compression
        elif compression == "zip":
            zf = _BytesZipFile(path_or_buf, mode, **compression_args)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                else:
                    # `handles` is not returned once we raise, so the caller
                    # can never close the archive: release it here.
                    zf.close()
                    if len(zip_names) == 0:
                        raise ValueError(
                            f"Zero files found in ZIP file {path_or_buf}"
                        )
                    raise ValueError(
                        "Multiple files found in ZIP file. "
                        f"Only one file per ZIP: {zip_names}"
                    )

        # XZ Compression
        elif compression == "xz":
            f = _get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = f"Unrecognized compression type: {compression}"
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        # Check whether the filename is to be opened in binary mode.
        # Binary mode does not support 'encoding' and 'newline'.
        is_binary_mode = "b" in mode

        if encoding and not is_binary_mode:
            # Encoding
            f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
        elif is_text and not is_binary_mode:
            # No explicit encoding
            # NOTE(review): `errors` is not consulted on this branch; "replace"
            # is always used - confirm this is intentional.
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="")
        # The wrapper is only tracked in `handles` when the wrapped object is
        # not a BufferedIOBase/RawIOBase instance.
        if not isinstance(f, (BufferedIOBase, RawIOBase)):
            handles.append(g)
        f = g

    if memory_map and hasattr(f, "fileno"):
        # NOTE(review): when `f` is not in `handles` (e.g. a caller-supplied
        # handle), `handles.remove(f)` raises after `f.close()` has already
        # run and the except below swallows it, so the closed `f` is
        # returned - confirm whether this is intended.
        try:
            wrapped = _MMapWrapper(f)
            f.close()
            handles.remove(f)
            handles.append(wrapped)
            f = wrapped
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
def _get_handle(
    path_or_buf,
    mode: str,
    encoding=None,
    compression: Optional[Union[str, Mapping[str, Any]]] = None,
    memory_map: bool = False,
    is_text: bool = True,
):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : str or file handle
        File path or object.
    mode : str
        Mode to open path_or_buf with.
    encoding : str or None
        Encoding to use.
    compression : str or dict, default None
        If string, specifies compression mode. If dict, value at key 'method'
        specifies compression mode. Compression mode must be one of {'infer',
        'gzip', 'bz2', 'zip', 'xz', None}. If compression mode is 'infer'
        and `path_or_buf` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise
        no compression). If dict and compression mode is 'zip' or inferred as
        'zip', other entries passed as additional compression options.

        .. versionchanged:: 1.0.0

           May now be a dict with key 'method' as compression mode
           and other keys as compression options if compression
           mode is 'zip'.

    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.).

    Returns
    -------
    f : file-like
        A file-like object.
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If the compression type is not recognized, or if a ZIP archive opened
        for reading does not contain exactly one file.
    """
    try:
        from s3fs import S3File

        need_text_wrapping = (BufferedIOBase, S3File)
    except ImportError:
        need_text_wrapping = BufferedIOBase  # type: ignore

    handles = list()  # type: List[IO]
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    compression, compression_args = _get_compression_method(compression)
    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        # NOTE(review): `mode` is only forwarded when a path is given; for
        # buffers the gzip/bz2 defaults apply - confirm this is intended.
        if compression == "gzip":
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == "zip":
            zf = BytesZipFile(path_or_buf, mode, **compression_args)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                else:
                    # `handles` is not returned once we raise, so the caller
                    # can never close the archive: release it here.
                    zf.close()
                    if len(zip_names) == 0:
                        raise ValueError(
                            "Zero files found in ZIP file {}".format(path_or_buf)
                        )
                    raise ValueError(
                        "Multiple files found in ZIP file."
                        " Only one file per ZIP: {}".format(zip_names)
                    )

        # XZ Compression
        elif compression == "xz":
            f = _get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = "Unrecognized compression type: {}".format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if encoding:
            # Encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # No explicit encoding
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper

        g = TextIOWrapper(f, encoding=encoding, newline="")
        # The wrapper is only tracked in `handles` when the wrapped object is
        # not a BufferedIOBase instance.
        if not isinstance(f, BufferedIOBase):
            handles.append(g)
        f = g

    if memory_map and hasattr(f, "fileno"):
        # NOTE(review): `handles` is not updated here, so the returned wrapper
        # is not tracked in it while the closed `f` may still be - confirm.
        try:
            wrapped = MMapWrapper(f)
            f.close()
            f = wrapped
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles
def _get_handle(path_or_buf, mode, encoding=None, compression=None,
                memory_map=False, is_text=True):
    """
    Get file handle for given path/buffer and mode.

    Parameters
    ----------
    path_or_buf : a path (str) or buffer
    mode : str
        mode to open path_or_buf with
    encoding : str or None
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
        If 'infer' and `path_or_buf` is path-like, then detect
        compression from the following extensions: '.gz', '.bz2', '.zip',
        or '.xz' (otherwise no compression).
    memory_map : boolean, default False
        See parsers._parser_params for more information.
    is_text : boolean, default True
        whether file/buffer is in text format (csv, json, etc.), or in binary
        mode (pickle, etc.)

    Returns
    -------
    f : file-like
        A file-like object
    handles : list of file-like objects
        A list of file-like object that were opened in this function.

    Raises
    ------
    ValueError
        If the compression type is not recognized, or if a ZIP archive opened
        for reading does not contain exactly one file.
    """
    try:
        from s3fs import S3File
        need_text_wrapping = (BytesIO, S3File)
    except ImportError:
        need_text_wrapping = (BytesIO, )

    handles = list()
    f = path_or_buf

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)
    is_path = isinstance(path_or_buf, str)

    if is_path:
        compression = _infer_compression(path_or_buf, compression)

    if compression:

        # GZ Compression
        # NOTE(review): `mode` is only forwarded when a path is given; for
        # buffers the gzip/bz2 defaults apply - confirm this is intended.
        if compression == "gzip":
            if is_path:
                f = gzip.open(path_or_buf, mode)
            else:
                f = gzip.GzipFile(fileobj=path_or_buf)

        # BZ Compression
        elif compression == "bz2":
            if is_path:
                f = bz2.BZ2File(path_or_buf, mode)
            else:
                f = bz2.BZ2File(path_or_buf)

        # ZIP Compression
        elif compression == "zip":
            zf = BytesZipFile(path_or_buf, mode)
            # Ensure the container is closed as well.
            handles.append(zf)
            if zf.mode == "w":
                f = zf
            elif zf.mode == "r":
                zip_names = zf.namelist()
                if len(zip_names) == 1:
                    f = zf.open(zip_names.pop())
                else:
                    # `handles` is not returned once we raise, so the caller
                    # can never close the archive: release it here.
                    zf.close()
                    if len(zip_names) == 0:
                        raise ValueError(
                            "Zero files found in ZIP file {}".format(path_or_buf))
                    raise ValueError(
                        "Multiple files found in ZIP file."
                        " Only one file per ZIP: {}".format(zip_names))

        # XZ Compression
        elif compression == "xz":
            f = _get_lzma_file(lzma)(path_or_buf, mode)

        # Unrecognized Compression
        else:
            msg = "Unrecognized compression type: {}".format(compression)
            raise ValueError(msg)

        handles.append(f)

    elif is_path:
        if encoding:
            # Encoding
            f = open(path_or_buf, mode, encoding=encoding, newline="")
        elif is_text:
            # No explicit encoding
            f = open(path_or_buf, mode, errors="replace", newline="")
        else:
            # Binary mode
            f = open(path_or_buf, mode)
        handles.append(f)

    # Convert BytesIO or file objects passed with an encoding
    if is_text and (compression or isinstance(f, need_text_wrapping)):
        from io import TextIOWrapper
        f = TextIOWrapper(f, encoding=encoding, newline="")
        handles.append(f)

    if memory_map and hasattr(f, "fileno"):
        # NOTE(review): `handles` is not updated here, so the returned wrapper
        # is not tracked in it while the closed `f` may still be - confirm.
        try:
            g = MMapWrapper(f)
            f.close()
            f = g
        except Exception:
            # we catch any errors that may have occurred
            # because that is consistent with the lower-level
            # functionality of the C engine (pd.read_csv), so
            # leave the file handler as is then
            pass

    return f, handles