def test_stringify_path_pathlib(self):
    tm._skip_if_no_pathlib()

    rel_path = common._stringify_path(Path('.'))
    assert rel_path == '.'
    redundant_path = common._stringify_path(Path('foo//bar'))
    assert redundant_path == os.path.join('foo', 'bar')
def test_stringify_path_pathlib(self):
    tm._skip_if_no_pathlib()

    rel_path = common._stringify_path(Path('.'))
    self.assertEqual(rel_path, '.')
    redundant_path = common._stringify_path(Path('foo//bar'))
    self.assertEqual(redundant_path, os.path.join('foo', 'bar'))
def write(self, writer, sheet_name='Sheet1', startrow=0,
          startcol=0, freeze_panes=None, engine=None):
    """
    writer : string or ExcelWriter object
        File path or existing ExcelWriter
    sheet_name : string, default 'Sheet1'
        Name of sheet which will contain DataFrame
    startrow :
        upper left cell row to dump data frame
    startcol :
        upper left cell column to dump data frame
    freeze_panes : tuple of integer (length 2), default None
        Specifies the one-based bottommost row and rightmost column
        that is to be frozen
    engine : string, default None
        write engine to use if writer is a path - you can also set this
        via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
        and ``io.excel.xlsm.writer``.
    """
    from pandas.io.excel import ExcelWriter
    from pandas.io.common import _stringify_path

    if isinstance(writer, ExcelWriter):
        need_save = False
    else:
        writer = ExcelWriter(_stringify_path(writer), engine=engine)
        need_save = True

    formatted_cells = self.get_formatted_cells()
    writer.write_cells(formatted_cells, sheet_name,
                       startrow=startrow, startcol=startcol,
                       freeze_panes=freeze_panes)
    if need_save:
        writer.save()
def read_feather(path, use_threads=True):
    """
    Load a feather-format object from the file path

    .. versionadded 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

        .. versionadded 0.21.0
        .. deprecated 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads

        .. versionadded 0.24.0

    Returns
    -------
    type of object stored in file
    """
    feather, pyarrow = _try_import()
    path = _stringify_path(path)

    if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
        int_use_threads = int(use_threads)
        if int_use_threads < 1:
            int_use_threads = 1
        return feather.read_feather(path, nthreads=int_use_threads)

    return feather.read_feather(path, use_threads=bool(use_threads))
def test_stringify_path_localpath(self):
    tm._skip_if_no_localpath()

    path = os.path.join('foo', 'bar')
    abs_path = os.path.abspath(path)
    lpath = LocalPath(path)
    self.assertEqual(common._stringify_path(lpath), abs_path)
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
def read_feather(path, nthreads=1):
    """
    Load a feather-format object from the file path

    .. versionadded 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

        .. versionadded 0.21.0

    Returns
    -------
    type of object stored in file
    """
    feather = _try_import()
    path = _stringify_path(path)

    if LooseVersion(feather.__version__) < LooseVersion('0.4.0'):
        return feather.read_dataframe(path)

    return feather.read_dataframe(path, nthreads=nthreads)
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False):

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != 'records':
        raise ValueError(
            "'lines' keyword only valid when 'orient' is records")

    if orient == 'table' and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or 'values')

    if orient == 'table' and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj, orient=orient, date_format=date_format,
        double_precision=double_precision, ensure_ascii=force_ascii,
        date_unit=date_unit, default_handler=default_handler).write()

    if lines:
        s = _convert_to_line_delimits(s)

    if isinstance(path_or_buf, compat.string_types):
        with open(path_or_buf, 'w') as fh:
            fh.write(s)
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
def __init__(self, io, engine=None):
    if engine is None:
        engine = 'xlrd'
    if engine not in self._engines:
        raise ValueError("Unknown engine: {engine}".format(engine=engine))

    # could be a str, ExcelFile, Book, etc.
    self.io = io
    # Always a string
    self._io = _stringify_path(io)

    self._reader = self._engines[engine](self._io)
def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string
        File path
    """
    path = _stringify_path(path)
    if not isinstance(df, DataFrame):
        raise ValueError("feather only supports IO with DataFrames")

    feather = _try_import()
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index() "
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have value column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_dataframe(df, path)
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression='infer',
            index=True):

    if not index and orient not in ['split', 'table']:
        raise ValueError("'index=False' is only valid when 'orient' is "
                         "'split' or 'table'")

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != 'records':
        raise ValueError(
            "'lines' keyword only valid when 'orient' is records")

    if orient == 'table' and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or 'values')

    if orient == 'table' and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj, orient=orient, date_format=date_format,
        double_precision=double_precision, ensure_ascii=force_ascii,
        date_unit=date_unit, default_handler=default_handler,
        index=index).write()

    if lines:
        s = _convert_to_line_delimits(s)

    if isinstance(path_or_buf, compat.string_types):
        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
def to_msgpack(path_or_buf, *args, **kwargs):
    """
    msgpack (serialize) object to input file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, buffer-like, or None
        if None, return generated string
    args : an object or objects to serialize
    encoding : encoding for unicode objects
    append : boolean whether to append to an existing msgpack
        (default is False)
    compress : type of compressor (zlib or blosc), default to None (no
        compression)
    """
    global compressor
    compressor = kwargs.pop('compress', None)
    if compressor:
        compressor = u(compressor)
    append = kwargs.pop('append', None)
    if append:
        mode = 'a+b'
    else:
        mode = 'wb'

    def writer(fh):
        for a in args:
            fh.write(pack(a, **kwargs))

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, compat.string_types):
        with open(path_or_buf, mode) as fh:
            writer(fh)
    elif path_or_buf is None:
        buf = compat.BytesIO()
        writer(buf)
        return buf.getvalue()
    else:
        writer(path_or_buf)
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For
        Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a
        valid value. For Python >= 3.4, 4 is a valid value. A negative value
        for the protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html

        .. versionadded:: 0.21.0
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        pkl.dump(obj, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
def test_stringify_path_localpath(self):
    path = os.path.join('foo', 'bar')
    abs_path = os.path.abspath(path)
    lpath = LocalPath(path)
    assert icom._stringify_path(lpath) == abs_path
def test_stringify_path_fspath(self):
    p = CustomFSPath('foo/bar.csv')
    result = icom._stringify_path(p)
    assert result == 'foo/bar.csv'
def __fspath__(self):
    return _stringify_path(self.path)
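# The snippets above and below exercise _stringify_path on pathlib.Path,
# py.path.local, and __fspath__-implementing objects. As a minimal,
# self-contained sketch (an assumption for illustration, not pandas's actual
# implementation), a helper with this contract converts path-like objects to
# plain strings and passes everything else through unchanged. The name
# _stringify_path_sketch is hypothetical.
import os
from pathlib import Path


def _stringify_path_sketch(filepath_or_buffer):
    """Return a plain string for path-like inputs; pass other objects through."""
    if hasattr(filepath_or_buffer, "__fspath__"):
        # covers pathlib.Path and any custom os.PathLike object
        return filepath_or_buffer.__fspath__()
    if isinstance(filepath_or_buffer, Path):
        # fallback for older Python where Path lacks __fspath__
        return str(filepath_or_buffer)
    return filepath_or_buffer


# Usage: both calls yield plain strings, mirroring the tests in this section.
assert _stringify_path_sketch(Path("foo") / "bar") == os.path.join("foo", "bar")
assert _stringify_path_sketch("foo/bar.csv") == "foo/bar.csv"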
def test_stringify_path_pathlib(self):
    rel_path = icom._stringify_path(Path('.'))
    assert rel_path == '.'
    redundant_path = icom._stringify_path(Path('foo//bar'))
    assert redundant_path == os.path.join('foo', 'bar')
def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
             float_format=None, cols=None, header=True, index=True,
             index_label=None, mode='w', nanRep=None, encoding=None,
             compression=None, quoting=None, line_terminator='\n',
             chunksize=None, tupleize_cols=False, quotechar='"',
             date_format=None, doublequote=True, escapechar=None,
             decimal='.'):

    self.obj = obj

    if path_or_buf is None:
        path_or_buf = StringIO()

    self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
    self.sep = sep
    self.na_rep = na_rep
    self.float_format = float_format
    self.decimal = decimal

    self.header = header
    self.index = index
    self.index_label = index_label
    self.mode = mode
    self.encoding = encoding
    self.compression = compression

    if quoting is None:
        quoting = csvlib.QUOTE_MINIMAL
    self.quoting = quoting

    if quoting == csvlib.QUOTE_NONE:
        # prevents crash in _csv
        quotechar = None
    self.quotechar = quotechar

    self.doublequote = doublequote
    self.escapechar = escapechar

    self.line_terminator = line_terminator
    self.date_format = date_format

    self.tupleize_cols = tupleize_cols
    self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                           not self.tupleize_cols)

    # validate mi options
    if self.has_mi_columns:
        if cols is not None:
            raise TypeError("cannot specify cols with a MultiIndex on the "
                            "columns")

    if cols is not None:
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)
        self.obj = self.obj.loc[:, cols]

    # update columns to include possible multiplicity of dupes
    # and make sure cols is just a list of labels
    cols = self.obj.columns
    if isinstance(cols, Index):
        cols = cols.to_native_types(na_rep=na_rep,
                                    float_format=float_format,
                                    date_format=date_format,
                                    quoting=self.quoting)
    else:
        cols = list(cols)

    # save it
    self.cols = cols

    # preallocate data 2d list
    self.blocks = self.obj._data.blocks
    ncols = sum(b.shape[0] for b in self.blocks)
    self.data = [None] * ncols

    if chunksize is None:
        chunksize = (100000 // (len(self.cols) or 1)) or 1
    self.chunksize = int(chunksize)

    self.data_index = obj.index
    if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
            date_format is not None):
        self.data_index = Index([x.strftime(date_format) if notna(x) else ''
                                 for x in self.data_index])

    self.nlevels = getattr(self.data_index, 'nlevels', 1)
    if not index:
        self.nlevels = 0
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g. moved modules.
                warnings.simplefilter("ignore", Warning)
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:  # noqa: E722
            # reg/patched pickle
            # compat not used in pandas/compat/pickle_compat.py::load
            # TODO: remove except block OR modify pc.load to use compat
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except Exception:  # noqa: E722
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except Exception:  # noqa: E722
        if PY3:
            return try_read(path, encoding='latin1')
        raise
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Notes
    -----
    read_pickle is only guaranteed to be backwards compatible to pandas
    0.20.3.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes
    # 3) try pickle_compat with latin1 encoding

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except Exception:  # noqa: E722
        try:
            return pc.load(f, encoding=None)
        except Exception:  # noqa: E722
            return pc.load(f, encoding='latin1')
    finally:
        f.close()
        for _f in fh:
            _f.close()
def read_pickle(path, compression="infer"): """ Load pickled pandas object (or any object) from file. .. warning:: Loading pickled data received from untrusted sources can be unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__. Parameters ---------- path : str File path where the pickled object will be loaded. compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz', or '.zip' respectively, and no decompression otherwise. Set to None for no decompression. Returns ------- unpickled : same type as object stored in file See Also -------- DataFrame.to_pickle : Pickle (serialize) DataFrame object to file. Series.to_pickle : Pickle (serialize) Series object to file. read_hdf : Read HDF5 file into a DataFrame. read_sql : Read SQL query or database table into a DataFrame. read_parquet : Load a parquet object, returning a DataFrame. Notes ----- read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3. Examples -------- >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)}) >>> original_df foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 >>> pd.to_pickle(original_df, "./dummy.pkl") >>> unpickled_df = pd.read_pickle("./dummy.pkl") >>> unpickled_df foo bar 0 0 5 1 1 6 2 2 7 3 3 8 4 4 9 >>> import os >>> os.remove("./dummy.pkl") """ path = _stringify_path(path) f, fh = _get_handle(path, "rb", compression=compression, is_text=False) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes excs_to_catch = (AttributeError, ImportError) if PY36: excs_to_catch += (ModuleNotFoundError,) try: with warnings.catch_warnings(record=True): # We want to silence any warnings about, e.g. moved modules. warnings.simplefilter("ignore", Warning) return pickle.load(f) except excs_to_catch: # e.g. # "No module named 'pandas.core.sparse.series'" # "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib" return pc.load(f, encoding=None) except UnicodeDecodeError: # e.g. can occur for files written in py27; see GH#28645 return pc.load(f, encoding="latin-1") finally: f.close() for _f in fh: _f.close()
def to_msgpack(path_or_buf, *args, **kwargs):
    """
    msgpack (serialize) object to input file path

    .. deprecated:: 0.25.0

    to_msgpack is deprecated and will be removed in a future version.
    It is recommended to use pyarrow for on-the-wire transmission of
    pandas objects.

    Example pyarrow usage:

    >>> import pandas as pd
    >>> import pyarrow as pa
    >>> df = pd.DataFrame({'A': [1, 2, 3]})
    >>> context = pa.default_serialization_context()
    >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes()

    For documentation on pyarrow, see `here
    <https://arrow.apache.org/docs/python/index.html>`__.

    Parameters
    ----------
    path_or_buf : string File path, buffer-like, or None
        if None, return generated bytes
    args : an object or objects to serialize
    encoding : encoding for unicode objects
    append : boolean whether to append to an existing msgpack
        (default is False)
    compress : type of compressor (zlib or blosc), default to None (no
        compression)
    """
    warnings.warn(
        "to_msgpack is deprecated and will be removed in a "
        "future version.\n"
        "It is recommended to use pyarrow for on-the-wire "
        "transmission of pandas objects.\n"
        "For a full example, check\n"
        "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_msgpack.html",  # noqa: E501
        FutureWarning,
        stacklevel=3,
    )

    global compressor
    compressor = kwargs.pop("compress", None)
    append = kwargs.pop("append", None)
    if append:
        mode = "a+b"
    else:
        mode = "wb"

    def writer(fh):
        for a in args:
            fh.write(pack(a, **kwargs))

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        try:
            with open(path_or_buf, mode) as fh:
                writer(fh)
        except FileNotFoundError:
            msg = "File b'{}' does not exist".format(path_or_buf)
            raise FileNotFoundError(msg)
    elif path_or_buf is None:
        buf = BytesIO()
        writer(buf)
        return buf.getvalue()
    else:
        writer(path_or_buf)
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes
    # 3) try pickle_compat with latin1 encoding

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except Exception:  # noqa: E722
        try:
            return pc.load(f, encoding=None)
        except Exception:  # noqa: E722
            return pc.load(f, encoding='latin1')
    finally:
        f.close()
        for _f in fh:
            _f.close()
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the
    specified file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: https://docs.python.org/3/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
             float_format=None, columns=None, header=True, index=True,
             index_label=None, startrow=0, startcol=0, engine=None,
             merge_cells=True, encoding=None, inf_rep='inf', verbose=True,
             **kwargs):
    """
    Monkeypatched DataFrame.to_excel by xl_link!

    Changes:
    --------

    Returns
    -------
    XLMap
        corresponding to position of frame as it appears in excel
        (see XLMap for details)

    See Also
    --------
    Pandas.DataFrame.to_excel for info on parameters

    Note
    ----
    When providing a path as excel_writer, the default engine used is
    'xlsxwriter'. As xlsxwriter workbooks can only be saved once, xl_link
    suppresses calling `excel_writer.save()`; as a result,
    `xlmap.writer.save()` should be called once no further changes are to
    be made to the spreadsheet.
    """
    if isinstance(excel_writer, pd.ExcelWriter):
        need_save = False
    else:
        excel_writer = pd.ExcelWriter(_stringify_path(excel_writer),
                                      engine=engine)
        # xlsxwriter can only save once!
        need_save = True if excel_writer.engine != 'xlsxwriter' else False

    super().to_excel(excel_writer, sheet_name=sheet_name, na_rep=na_rep,
                     float_format=float_format, columns=columns,
                     header=header, index=index, index_label=index_label,
                     startrow=startrow, startcol=startcol, engine=engine,
                     merge_cells=merge_cells, encoding=encoding,
                     inf_rep=inf_rep, verbose=verbose, **kwargs)

    if need_save:
        excel_writer.save()

    data_range, index_range, col_range, _ = get_xl_ranges(
        self.index, self.columns,
        sheet_name=sheet_name,
        columns=columns,
        header=header,
        index=index,
        index_label=index_label,
        startrow=startrow,
        startcol=startcol,
        merge_cells=merge_cells)

    f = self.copy()
    if isinstance(columns, list) or isinstance(columns, tuple):
        f = f[columns]

    return XLMap(data_range, index_range, col_range, f, writer=excel_writer)
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred. If 'xport' or 'sas7bdat',
        uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        try:
            fname = filepath_or_buffer.lower()
            if fname.endswith(".xpt"):
                format = "xport"
            elif fname.endswith(".sas7bdat"):
                format = "sas7bdat"
            else:
                raise ValueError("unable to infer format of SAS file")
        except:
            pass

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g. moved modules.
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
def test_stringify_path_pathlib(self):
    rel_path = icom._stringify_path(Path("."))
    assert rel_path == "."
    redundant_path = icom._stringify_path(Path("foo//bar"))
    assert redundant_path == os.path.join("foo", "bar")
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For
        Python 2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a
        valid value. For Python >= 3.4, 4 is a valid value. A negative value
        for the protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html

        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'wb',
                        compression=compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the
    specified file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: http://docs.python.org/2.7/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if its not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
def to_json(
    path_or_buf,
    obj,
    orient: Optional[str] = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool = False,
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
):

    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'"
        )

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: Type["Writer"]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if isinstance(path_or_buf, str):
        fh, handles = _get_handle(path_or_buf, "w", compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data. If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify "
            "a format string"
        )
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    elif format.lower() == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data