Example #1
    def test_stringify_path_pathlib(self):
        tm._skip_if_no_pathlib()

        rel_path = common._stringify_path(Path('.'))
        assert rel_path == '.'
        redundant_path = common._stringify_path(Path('foo//bar'))
        assert redundant_path == os.path.join('foo', 'bar')
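
Every example on this page exercises pandas' internal _stringify_path helper. As a rough mental model (an assumption, not the actual pandas source), the helper turns path-like objects such as pathlib.Path, py.path.local and anything implementing __fspath__ into a plain string, and returns every other input unchanged:

# Hypothetical sketch of a _stringify_path-style helper; the real pandas
# implementation differs in details (Python 2 compatibility, LocalPath handling).
def _stringify_path_sketch(filepath_or_buffer):
    """Return a plain string for path-like inputs, pass everything else through."""
    if hasattr(filepath_or_buffer, "__fspath__"):
        # pathlib.Path, recent py.path.local and custom os.PathLike objects
        return filepath_or_buffer.__fspath__()
    return filepath_or_buffer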
Example #2
    def test_stringify_path_pathlib(self):
        tm._skip_if_no_pathlib()

        rel_path = common._stringify_path(Path('.'))
        self.assertEqual(rel_path, '.')
        redundant_path = common._stringify_path(Path('foo//bar'))
        self.assertEqual(redundant_path, os.path.join('foo', 'bar'))
Example #3
    def write(self, writer, sheet_name='Sheet1', startrow=0,
              startcol=0, freeze_panes=None, engine=None):
        """
        writer : string or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow : int, default 0
            upper left cell row to dump data frame
        startcol : int, default 0
            upper left cell column to dump data frame
        freeze_panes : tuple of integer (length 2), default None
            Specifies the one-based bottommost row and rightmost column that
            are to be frozen
        engine : string, default None
            write engine to use if writer is a path - you can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
            and ``io.excel.xlsm.writer``.
        """
        from pandas.io.excel import ExcelWriter
        from pandas.io.common import _stringify_path

        if isinstance(writer, ExcelWriter):
            need_save = False
        else:
            writer = ExcelWriter(_stringify_path(writer), engine=engine)
            need_save = True

        formatted_cells = self.get_formatted_cells()
        writer.write_cells(formatted_cells, sheet_name,
                           startrow=startrow, startcol=startcol,
                           freeze_panes=freeze_panes)
        if need_save:
            writer.save()
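
For context, this formatter-level write method is what DataFrame.to_excel ultimately calls. A hedged usage sketch, assuming an Excel engine such as openpyxl or xlsxwriter is installed (file name illustrative):

# Illustrative only: writing via a plain path vs. a pre-built ExcelWriter.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

# Path case: an ExcelWriter is created internally and saved automatically.
df.to_excel("report.xlsx", sheet_name="Sheet1")

# ExcelWriter case: the caller keeps control; the writer saves on leaving the block.
with pd.ExcelWriter("report.xlsx") as writer:
    df.to_excel(writer, sheet_name="Sheet1", startrow=0, startcol=0)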
Example #4
def read_feather(path, use_threads=True):
    """
    Load a feather-format object from the file path

    .. versionadded:: 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

       .. versionadded:: 0.21.0
       .. deprecated:: 0.24.0
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads

       .. versionadded:: 0.24.0

    Returns
    -------
    type of object stored in file

    """

    feather, pyarrow = _try_import()
    path = _stringify_path(path)

    if LooseVersion(pyarrow.__version__) < LooseVersion('0.11.0'):
        int_use_threads = int(use_threads)
        if int_use_threads < 1:
            int_use_threads = 1
        return feather.read_feather(path, nthreads=int_use_threads)

    return feather.read_feather(path, use_threads=bool(use_threads))
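
A short usage sketch of the public entry point; the file name is illustrative and pyarrow must be installed:

# Illustrative round trip through the feather format.
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})
df.to_feather("data.feather")
same_df = pd.read_feather("data.feather", use_threads=True)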
Example #5
    def test_stringify_path_localpath(self):
        tm._skip_if_no_localpath()

        path = os.path.join('foo', 'bar')
        abs_path = os.path.abspath(path)
        lpath = LocalPath(path)
        self.assertEqual(common._stringify_path(lpath), abs_path)
Example #6
def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
             chunksize=None, iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer, index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer, index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
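
A hedged usage sketch of the extension-based format inference above (file names are placeholders):

# Illustrative calls to pandas.read_sas.
import pandas as pd

# Format inferred from the extension after _stringify_path normalizes the input.
df = pd.read_sas("airline.sas7bdat")

# With a buffer the format cannot be inferred and must be given explicitly,
# otherwise the ValueError above is raised.
with open("airline.xpt", "rb") as fh:
    df_xport = pd.read_sas(fh, format="xport")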
Example #7
def read_feather(path, nthreads=1):
    """
    Load a feather-format object from the file path

    .. versionadded:: 0.20.0

    Parameters
    ----------
    path : string file path, or file-like object
    nthreads : int, default 1
        Number of CPU threads to use when reading to pandas.DataFrame

       .. versionadded:: 0.21.0

    Returns
    -------
    type of object stored in file

    """

    feather = _try_import()
    path = _stringify_path(path)

    if LooseVersion(feather.__version__) < LooseVersion('0.4.0'):
        return feather.read_dataframe(path)

    return feather.read_dataframe(path, nthreads=nthreads)
Example #8
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False):

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != 'records':
        raise ValueError(
            "'lines' keyword only valid when 'orient' is records")

    if orient == 'table' and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or 'values')
    if orient == 'table' and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj, orient=orient, date_format=date_format,
        double_precision=double_precision, ensure_ascii=force_ascii,
        date_unit=date_unit, default_handler=default_handler).write()

    if lines:
        s = _convert_to_line_delimits(s)

    if isinstance(path_or_buf, compat.string_types):
        with open(path_or_buf, 'w') as fh:
            fh.write(s)
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
Example #9
    def __init__(self, io, engine=None):
        if engine is None:
            engine = 'xlrd'
        if engine not in self._engines:
            raise ValueError("Unknown engine: {engine}".format(engine=engine))

        # could be a str, ExcelFile, Book, etc.
        self.io = io
        # Always a string
        self._io = _stringify_path(io)

        self._reader = self._engines[engine](self._io)
Example #10
def to_feather(df, path):
    """
    Write a DataFrame to the feather-format

    Parameters
    ----------
    df : DataFrame
    path : string
        File path

    """
    path = _stringify_path(path)
    if not isinstance(df, DataFrame):
        raise ValueError("feather only supports IO with DataFrames")

    feather = _try_import()
    valid_types = {'string', 'unicode'}

    # validate index
    # --------------

    # validate that we have only a default index
    # raise on anything else as we don't serialize the index

    if not isinstance(df.index, Int64Index):
        raise ValueError("feather does not support serializing {} "
                         "for the index; you can .reset_index() "
                         "to make the index into column(s)".format(
                             type(df.index)))

    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
        raise ValueError("feather does not support serializing a "
                         "non-default index for the index; you "
                         "can .reset_index() to make the index "
                         "into column(s)")

    if df.index.name is not None:
        raise ValueError("feather does not serialize index meta-data on a "
                         "default index")

    # validate columns
    # ----------------

    # must have valid column names (strings only)
    if df.columns.inferred_type not in valid_types:
        raise ValueError("feather must have string column names")

    feather.write_dataframe(df, path)
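
The index validation above is the part users trip over most often; a brief illustration under the same assumptions (a default integer index is required):

# Illustrative: this version of to_feather only accepts a default RangeIndex.
import pandas as pd

df = pd.DataFrame({"v": [10, 20, 30]}, index=["a", "b", "c"])

# df.to_feather("bad.feather")               # would raise ValueError (non-default index)
df.reset_index().to_feather("ok.feather")    # index moved into an ordinary column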
Example #11
def to_json(path_or_buf, obj, orient=None, date_format='epoch',
            double_precision=10, force_ascii=True, date_unit='ms',
            default_handler=None, lines=False, compression='infer',
            index=True):

    if not index and orient not in ['split', 'table']:
        raise ValueError("'index=False' is only valid when 'orient' is "
                         "'split' or 'table'")

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != 'records':
        raise ValueError(
            "'lines' keyword only valid when 'orient' is records")

    if orient == 'table' and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or 'values')
    if orient == 'table' and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj, orient=orient, date_format=date_format,
        double_precision=double_precision, ensure_ascii=force_ascii,
        date_unit=date_unit, default_handler=default_handler,
        index=index).write()

    if lines:
        s = _convert_to_line_delimits(s)

    if isinstance(path_or_buf, compat.string_types):
        fh, handles = _get_handle(path_or_buf, 'w', compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
Example #12
def to_msgpack(path_or_buf, *args, **kwargs):
    """
    msgpack (serialize) object to input file path

    THIS IS AN EXPERIMENTAL LIBRARY and the storage format
    may not be stable until a future release.

    Parameters
    ----------
    path_or_buf : string File path, buffer-like, or None
                  if None, return generated string
    args : an object or objects to serialize
    encoding : encoding for unicode objects
    append : boolean whether to append to an existing msgpack
             (default is False)
    compress : type of compressor (zlib or blosc), default to None (no
               compression)
    """
    global compressor
    compressor = kwargs.pop('compress', None)
    if compressor:
        compressor = u(compressor)
    append = kwargs.pop('append', None)
    if append:
        mode = 'a+b'
    else:
        mode = 'wb'

    def writer(fh):
        for a in args:
            fh.write(pack(a, **kwargs))

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, compat.string_types):
        with open(path_or_buf, mode) as fh:
            writer(fh)
    elif path_or_buf is None:
        buf = compat.BytesIO()
        writer(buf)
        return buf.getvalue()
    else:
        writer(path_or_buf)
Example #13
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to input file path

    Parameters
    ----------
    obj : any object
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
        a string representing the compression to use in the output file

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0


    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)
    f, fh = _get_handle(path, 'wb',
                        compression=inferred_compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        pkl.dump(obj, f, protocol=protocol)
    finally:
        for _f in fh:
            _f.close()
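
A short usage sketch of the compression inference done through _infer_compression; file names are illustrative:

# Illustrative: with compression='infer' (the default), the codec is picked
# from the file extension.
import pandas as pd

df = pd.DataFrame({"a": range(3)})
pd.to_pickle(df, "frame.pkl.gz")             # gzip inferred from '.gz'
restored = pd.read_pickle("frame.pkl.gz")    # decompression inferred the same way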
Example #14
 def test_stringify_path_localpath(self):
     path = os.path.join('foo', 'bar')
     abs_path = os.path.abspath(path)
     lpath = LocalPath(path)
     assert icom._stringify_path(lpath) == abs_path
Example #15
 def test_stringify_path_fspath(self):
     p = CustomFSPath('foo/bar.csv')
     result = icom._stringify_path(p)
     assert result == 'foo/bar.csv'
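
CustomFSPath is a test helper whose definition is not shown here; presumably it is a tiny class implementing the os.PathLike protocol, along these lines (a sketch, not the actual pandas test fixture):

# Hypothetical stand-in for the CustomFSPath helper used above.
class CustomFSPath:
    """Minimal os.PathLike implementation for exercising _stringify_path."""

    def __init__(self, path):
        self.path = path

    def __fspath__(self):
        return self.path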
Example #16
 def __fspath__(self):
     return _stringify_path(self.path)
Example #17
 def test_stringify_path_pathlib(self):
     rel_path = icom._stringify_path(Path('.'))
     assert rel_path == '.'
     redundant_path = icom._stringify_path(Path('foo//bar'))
     assert redundant_path == os.path.join('foo', 'bar')
Example #18
    def __init__(self,
                 obj,
                 path_or_buf=None,
                 sep=",",
                 na_rep='',
                 float_format=None,
                 cols=None,
                 header=True,
                 index=True,
                 index_label=None,
                 mode='w',
                 nanRep=None,
                 encoding=None,
                 compression=None,
                 quoting=None,
                 line_terminator='\n',
                 chunksize=None,
                 tupleize_cols=False,
                 quotechar='"',
                 date_format=None,
                 doublequote=True,
                 escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding
        self.compression = compression

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, MultiIndex)
                               and not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        if cols is not None:
            if isinstance(cols, Index):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex))
                and date_format is not None):
            self.data_index = Index([
                x.strftime(date_format) if notna(x) else ''
                for x in self.data_index
            ])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0
Example #19
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g. moved modules.
                warnings.simplefilter("ignore", Warning)
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:  # noqa: E722
            # reg/patched pickle
            # compat not used in pandas/compat/pickle_compat.py::load
            # TODO: remove except block OR modify pc.load to use compat
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except Exception:  # noqa: E722
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))
    try:
        return try_read(path)
    except Exception:  # noqa: E722
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #20
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Notes
    -----
    read_pickle is only guaranteed to be backward compatible with pandas 0.20.3.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes
    # 3) try pickle_compat with latin1 encoding

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except Exception:  # noqa: E722
        try:
            return pc.load(f, encoding=None)
        except Exception:  # noqa: E722
            return pc.load(f, encoding='latin1')
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #21
def read_pickle(path, compression="infer"):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Notes
    -----
    read_pickle is only guaranteed to be backward compatible with pandas 0.20.3.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, "rb", compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes

    excs_to_catch = (AttributeError, ImportError)
    if PY36:
        excs_to_catch += (ModuleNotFoundError,)

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except excs_to_catch:
        # e.g.
        #  "No module named 'pandas.core.sparse.series'"
        #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
        return pc.load(f, encoding=None)
    except UnicodeDecodeError:
        # e.g. can occur for files written in py27; see GH#28645
        return pc.load(f, encoding="latin-1")
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #22
def to_msgpack(path_or_buf, *args, **kwargs):
    """
    msgpack (serialize) object to input file path

    .. deprecated:: 0.25.0

    to_msgpack is deprecated and will be removed in a future version.
    It is recommended to use pyarrow for on-the-wire transmission of
    pandas objects.

    Example pyarrow usage:

    >>> import pandas as pd
    >>> import pyarrow as pa
    >>> df = pd.DataFrame({'A': [1, 2, 3]})
    >>> context = pa.default_serialization_context()
    >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes()

    For documentation on pyarrow, see `here
    <https://arrow.apache.org/docs/python/index.html>`__.

    Parameters
    ----------
    path_or_buf : string File path, buffer-like, or None
                  if None, return generated bytes
    args : an object or objects to serialize
    encoding : encoding for unicode objects
    append : boolean whether to append to an existing msgpack
             (default is False)
    compress : type of compressor (zlib or blosc), default to None (no
               compression)
    """
    warnings.warn(
        "to_msgpack is deprecated and will be removed in a "
        "future version.\n"
        "It is recommended to use pyarrow for on-the-wire "
        "transmission of pandas objects.\n"
        "For a full example, check\n"
        "https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_msgpack.html",  # noqa: E501
        FutureWarning,
        stacklevel=3,
    )

    global compressor
    compressor = kwargs.pop("compress", None)
    append = kwargs.pop("append", None)
    if append:
        mode = "a+b"
    else:
        mode = "wb"

    def writer(fh):
        for a in args:
            fh.write(pack(a, **kwargs))

    path_or_buf = _stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        try:
            with open(path_or_buf, mode) as fh:
                writer(fh)
        except FileNotFoundError:
            msg = "File b'{}' does not exist".format(path_or_buf)
            raise FileNotFoundError(msg)
    elif path_or_buf is None:
        buf = BytesIO()
        writer(buf)
        return buf.getvalue()
    else:
        writer(path_or_buf)
Example #23
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'rb', compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes
    # 3) try pickle_compat with latin1 encoding

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except Exception:  # noqa: E722
        try:
            return pc.load(f, encoding=None)
        except Exception:  # noqa: E722
            return pc.load(f, encoding='latin1')
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #24
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the specified
    file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: https://docs.python.org/3/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path, 'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except Exception:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))
    try:
        return try_read(path)
    except Exception:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #25
    def to_excel(self,
                 excel_writer,
                 sheet_name='Sheet1',
                 na_rep='',
                 float_format=None,
                 columns=None,
                 header=True,
                 index=True,
                 index_label=None,
                 startrow=0,
                 startcol=0,
                 engine=None,
                 merge_cells=True,
                 encoding=None,
                 inf_rep='inf',
                 verbose=True,
                 **kwargs):
        """

        Monkeypatched DataFrame.to_excel by xl_link!

        Changes:
        --------

        Returns
        -------

        XLMap
            corresponding to position of frame as it appears in excel (see XLMap for details)

        See Also
        --------

        Pandas.DataFrame.to_excel for info on parameters

        Note
        ----
        When providing a path as excel_writer, the default engine used is 'xlsxwriter'. Because xlsxwriter
        workbooks can only be saved once, xl_link suppresses calling `excel_writer.save()`; as a result,
        `xlmap.writer.save()` should be called once no further changes are to be made to the spreadsheet.
        """

        if isinstance(excel_writer, pd.ExcelWriter):
            need_save = False
        else:
            excel_writer = pd.ExcelWriter(_stringify_path(excel_writer),
                                          engine=engine)
            need_save = excel_writer.engine != 'xlsxwriter'  # xlsxwriter can only save once!

        super().to_excel(excel_writer,
                         sheet_name=sheet_name,
                         na_rep=na_rep,
                         float_format=float_format,
                         columns=columns,
                         header=header,
                         index=index,
                         index_label=index_label,
                         startrow=startrow,
                         startcol=startcol,
                         engine=engine,
                         merge_cells=merge_cells,
                         encoding=encoding,
                         inf_rep=inf_rep,
                         verbose=verbose,
                         **kwargs)

        if need_save:
            excel_writer.save()

        data_range, index_range, col_range, _ = get_xl_ranges(
            self.index,
            self.columns,
            sheet_name=sheet_name,
            columns=columns,
            header=header,
            index=index,
            index_label=index_label,
            startrow=startrow,
            startcol=startcol,
            merge_cells=merge_cells)
        f = self.copy()

        if isinstance(columns, list) or isinstance(columns, tuple):
            f = f[columns]

        return XLMap(data_range,
                     index_range,
                     col_range,
                     f,
                     writer=excel_writer)
Example #26
def read_sas(filepath_or_buffer,
             format=None,
             index=None,
             encoding=None,
             chunksize=None,
             iterator=False):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : string or file-like object
        Path to the SAS file.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred.  If 'xport' or 'sas7bdat',
        uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = ("If this is a buffer object rather "
                            "than a string name, you must specify "
                            "a format string")
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, compat.string_types):
            raise ValueError(buffer_error_msg)
        try:
            fname = filepath_or_buffer.lower()
            if fname.endswith(".xpt"):
                format = "xport"
            elif fname.endswith(".sas7bdat"):
                format = "sas7bdat"
            else:
                raise ValueError("unable to infer format of SAS file")
        except Exception:
            pass

    if format.lower() == 'xport':
        from pandas.io.sas.sas_xport import XportReader
        reader = XportReader(filepath_or_buffer,
                             index=index,
                             encoding=encoding,
                             chunksize=chunksize)
    elif format.lower() == 'sas7bdat':
        from pandas.io.sas.sas7bdat import SAS7BDATReader
        reader = SAS7BDATReader(filepath_or_buffer,
                                index=index,
                                encoding=encoding,
                                chunksize=chunksize)
    else:
        raise ValueError('unknown SAS format')

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
Example #27
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path,
                            'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            with warnings.catch_warnings(record=True):
                # We want to silence any warnings about, e.g. moved modules.
                return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except Exception:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except Exception:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #28
 def test_stringify_path_pathlib(self):
     rel_path = icom._stringify_path(Path("."))
     assert rel_path == "."
     redundant_path = icom._stringify_path(Path("foo//bar"))
     assert redundant_path == os.path.join("foo", "bar")
Example #29
 def __fspath__(self):
     return _stringify_path(self.path)
Example #30
 def test_stringify_path_pathlib(self):
     rel_path = icom._stringify_path(Path('.'))
     assert rel_path == '.'
     redundant_path = icom._stringify_path(Path('foo//bar'))
     assert redundant_path == os.path.join('foo', 'bar')
Example #31
    def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
                 float_format=None, cols=None, header=True, index=True,
                 index_label=None, mode='w', nanRep=None, encoding=None,
                 compression=None, quoting=None, line_terminator='\n',
                 chunksize=None, tupleize_cols=False, quotechar='"',
                 date_format=None, doublequote=True, escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding
        self.compression = compression

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, MultiIndex) and
                               not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        if cols is not None:
            if isinstance(cols, Index):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex)) and
                date_format is not None):
            self.data_index = Index([x.strftime(date_format) if notna(x) else
                                     '' for x in self.data_index])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0
Example #32
 def test_stringify_path_localpath(self):
     path = os.path.join('foo', 'bar')
     abs_path = os.path.abspath(path)
     lpath = LocalPath(path)
     assert icom._stringify_path(lpath) == abs_path
Example #33
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'wb',
                        compression=compression,
                        is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
Example #34
 def test_stringify_path_fspath(self):
     p = CustomFSPath('foo/bar.csv')
     result = icom._stringify_path(p)
     assert result == 'foo/bar.csv'
Example #35
def read_pickle(path, compression='infer'):
    """
    Load pickled pandas object (or any other pickled object) from the specified
    file path

    Warning: Loading pickled data received from untrusted sources can be
    unsafe. See: http://docs.python.org/2.7/library/pickle.html

    Parameters
    ----------
    path : string
        File path
    compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

        .. versionadded:: 0.20.0

    Returns
    -------
    unpickled : type of object stored in file
    """
    path = _stringify_path(path)
    inferred_compression = _infer_compression(path, compression)

    def read_wrapper(func):
        # wrapper file handle open/close operation
        f, fh = _get_handle(path,
                            'rb',
                            compression=inferred_compression,
                            is_text=False)
        try:
            return func(f)
        finally:
            for _f in fh:
                _f.close()

    def try_read(path, encoding=None):
        # try with cPickle
        # try with current pickle, if we have a Type Error then
        # try with the compat pickle to handle subclass changes
        # pass encoding only if it's not None as py2 doesn't handle
        # the param

        # cpickle
        # GH 6899
        try:
            return read_wrapper(lambda f: pkl.load(f))
        except Exception:
            # reg/patched pickle
            try:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=False))
            # compat pickle
            except Exception:
                return read_wrapper(
                    lambda f: pc.load(f, encoding=encoding, compat=True))

    try:
        return try_read(path)
    except Exception:
        if PY3:
            return try_read(path, encoding='latin1')
        raise
Example #36
def to_json(
    path_or_buf,
    obj,
    orient: Optional[str] = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool = False,
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
):

    if not index and orient not in ["split", "table"]:
        raise ValueError("'index=False' is only valid when 'orient' is "
                         "'split' or 'table'")

    path_or_buf = _stringify_path(path_or_buf)
    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: Type["Writer"]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if isinstance(path_or_buf, str):
        fh, handles = _get_handle(path_or_buf, "w", compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
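
A short usage sketch of the lines/orient constraint enforced above (output shown for illustration):

# Illustrative: lines=True requires orient='records' and yields line-delimited JSON.
import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
print(df.to_json(orient="records", lines=True))
# {"a":1}
# {"a":2}

# df.to_json(orient="split", lines=True)   # would raise the ValueError above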
Example #37
def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.

        .. versionadded:: 0.20.0
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = _stringify_path(path)
    f, fh = _get_handle(path, 'wb', compression=compression, is_text=False)
    if protocol < 0:
        protocol = pkl.HIGHEST_PROTOCOL
    try:
        f.write(pkl.dumps(obj, protocol=protocol))
    finally:
        for _f in fh:
            _f.close()
Example #38
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : string {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : string, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify "
            "a format string"
        )
        filepath_or_buffer = _stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    if format.lower() == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    elif format.lower() == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data