Example #1
 def test_stringify_path_localpath(self):
     path = os.path.join("foo", "bar")
     abs_path = os.path.abspath(path)
     lpath = LocalPath(path)
     assert icom.stringify_path(lpath) == abs_path
Example #2
 def test_stringify_path_fspath(self):
     p = CustomFSPath("foo/bar.csv")
     result = icom.stringify_path(p)
     assert result == "foo/bar.csv"
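``CustomFSPath`` is not defined in this snippet; in the pandas test suite it is a minimal ``os.PathLike`` implementer. A hypothetical reconstruction of such a helper:

import os


class CustomFSPath:
    """Hypothetical sketch of the test helper: a bare os.PathLike."""

    def __init__(self, path):
        self.path = path

    def __fspath__(self):
        # os.fspath() calls this to obtain the string form of the path
        return self.path


assert os.fspath(CustomFSPath("foo/bar.csv")) == "foo/bar.csv"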
Example #3
 def __fspath__(self):
     return stringify_path(self.path)
Example #4
 def test_stringify_path_pathlib(self):
     rel_path = icom.stringify_path(Path("."))
     assert rel_path == "."
     redundant_path = icom.stringify_path(Path("foo//bar"))
     assert redundant_path == os.path.join("foo", "bar")
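Taken together, these tests pin down the contract of ``stringify_path``: any ``os.PathLike`` implementer (``pathlib.Path``, py.path's ``LocalPath``, the ``CustomFSPath`` helper) is reduced to its string form, while plain strings pass through. A minimal sketch consistent with these tests; this is not the real pandas implementation, and Example #10 below adds a caveat for file-like objects:

import os


def stringify_path_sketch(filepath_or_buffer):
    # os.PathLike objects are reduced to their string form; plain strings
    # and other objects pass through unchanged.
    if isinstance(filepath_or_buffer, os.PathLike):
        return os.fspath(filepath_or_buffer)
    return filepath_or_buffer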
Example #5
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = LooseVersion(get_version(xlrd))

        if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
            ext = "xls"
        else:
            ext = inspect_excel_format(content_or_path=path_or_buffer,
                                       storage_options=storage_options)

        if engine is None:
            # ext will always be valid, otherwise inspect_excel_format would raise
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
            if xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            else:
                caller = inspect.stack()[1]
                if (caller.filename.endswith(
                        os.path.join("pandas", "io", "excel", "_base.py"))
                        and caller.function == "read_excel"):
                    stacklevel = 4
                else:
                    stacklevel = 2
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. As a result, the "
                    f"openpyxl engine will be used if it is installed and the "
                    f"engine argument is not specified. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )

        self.engine = engine
        self.storage_options = storage_options

        self._reader = self._engines[engine](self._io,
                                             storage_options=storage_options)
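A hedged usage sketch for the constructor above; the workbook name is hypothetical and ``openpyxl`` is assumed to be installed:

import pandas as pd

# With engine unspecified, the constructor sniffs the format ("xlsx")
# and resolves the engine via the io.excel.xlsx.reader option.
with pd.ExcelFile("report.xlsx") as xls:
    df = xls.parse(sheet_name=0)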
Example #6
def read_html(
    io: FilePathOrBuffer,
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values=None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object or file-like object
        A URL, a file-like object, or a raw string containing HTML. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other; they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.

    skiprows : int, list-like or slice, optional
        Number of rows to skip from the start of the parsed table, 0-based. If
        a sequence of integers or a slice is given, will skip the rows indexed
        by that sequence. Note that a single element sequence means 'skip the
        nth row' whereas an integer means 'skip n rows'.

    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>`` element, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError("cannot skip rows starting from the end of the "
                         "data (you passed a negative value)")
    validate_header_arg(header)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
    )
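A hedged usage sketch; the URL and the ``match`` text are placeholders:

import pandas as pd

# read_html returns a list of DataFrames, one per matching <table>,
# and raises rather than returning an empty list, so [0] is safe here.
tables = pd.read_html(
    "https://example.com/stats.html",  # hypothetical URL
    match="Population",                # keep only tables containing this text
    header=0,
)
df = tables[0]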
Example #7
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if (import_optional_dependency(
                "xlrd", raise_on_missing=False, on_version="ignore") is None):
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = LooseVersion(xlrd.__version__)

        if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)):
            ext = inspect_excel_format(content=path_or_buffer,
                                       storage_options=storage_options)
        elif xlrd_version is not None and isinstance(path_or_buffer,
                                                     xlrd.Book):
            ext = "xls"
        else:
            # path_or_buffer is path-like, use stringified path
            ext = inspect_excel_format(path=str(self._io),
                                       storage_options=storage_options)

        if engine is None:
            if ext == "ods":
                engine = "odf"
            elif ext == "xls":
                engine = "xlrd"
            else:
                # GH 35029 - Prefer openpyxl except for xls files
                if (import_optional_dependency("openpyxl",
                                               raise_on_missing=False,
                                               on_version="ignore")
                        is not None):
                    engine = "openpyxl"
                else:
                    engine = "xlrd"

        if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
            if xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            else:
                caller = inspect.stack()[1]
                if (caller.filename.endswith(
                        os.path.join("pandas", "io", "excel", "_base.py"))
                        and caller.function == "read_excel"):
                    stacklevel = 4
                else:
                    stacklevel = 2
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. As a result, the "
                    f"openpyxl engine will be used if it is installed and the "
                    f"engine argument is not specified. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
        assert engine in self._engines, f"Engine {engine} not recognized"

        self.engine = engine
        self.storage_options = storage_options

        self._reader = self._engines[engine](self._io,
                                             storage_options=storage_options)
Example #8
 def __fspath__(self):
     # pandas\io\excel\_base.py:744: error: Argument 1 to "stringify_path"
     # has incompatible type "Optional[Any]"; expected "Union[str, Path,
     # IO[Any], IOBase]"  [arg-type]
     return stringify_path(self.path)  # type: ignore[arg-type]
Example #9
    def to_string(
        self,
        buf=None,
        na_rep="NaN",
        float_format=None,
        header=True,
        index=True,
        length=False,
        dtype=False,
        name=False,
        max_rows=None,
        min_rows=None,
    ) -> Optional[str]:
        """
        Render a string representation of the Series.

        Follows pandas implementation except when ``max_rows=None``. In this scenario, we set ``max_rows={0}`` to avoid
        accidentally dumping an entire index. This can be overridden by explicitly setting ``max_rows``.

        See Also
        --------
        :pandas_api_docs:`pandas.Series.to_string`
            for argument details.
        """
        # In pandas, calling 'to_string' without max_rows set will dump ALL rows;
        # we avoid this by limiting rows by default.
        num_rows = len(self)  # avoid multiple calls
        if num_rows <= DEFAULT_NUM_ROWS_DISPLAYED:
            if max_rows is None:
                max_rows = num_rows
            else:
                max_rows = min(num_rows, max_rows)
        elif max_rows is None:
            warnings.warn(
                f"Series.to_string called without max_rows set "
                f"- this will return entire index results. "
                f"Setting max_rows={DEFAULT_NUM_ROWS_DISPLAYED}"
                f" overwrite if different behaviour is required.",
                UserWarning,
            )
            max_rows = DEFAULT_NUM_ROWS_DISPLAYED

        # because of the way pandas handles max_rows=0, not having this throws an error
        # see eland issue #56
        if max_rows == 0:
            max_rows = 1

        # Create a slightly bigger dataframe than display
        temp_series = self._build_repr(max_rows + 1)

        if buf is not None:
            _buf = _expand_user(stringify_path(buf))
        else:
            _buf = StringIO()

        # Create repr of fake series without name, length, dtype summary
        temp_series.to_string(
            buf=_buf,
            na_rep=na_rep,
            float_format=float_format,
            header=header,
            index=index,
            length=False,
            dtype=False,
            name=False,
            max_rows=max_rows,
        )

        # Create the summary
        footer = []
        if name and self.name is not None:
            footer.append(f"Name: {self.name}")
        if length and len(self) > max_rows:
            footer.append(f"Length: {len(self.index)}")
        if dtype:
            footer.append(f"dtype: {temp_series.dtype}")

        if footer:
            _buf.write(f"\n{', '.join(footer)}")

        if buf is None:
            result = _buf.getvalue()
            return result
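A sketch of the two calling modes, assuming ``s`` is an eland ``Series`` (hypothetical): with ``buf=None`` the rendered text is returned; otherwise it is written to the given buffer, which may also be path-like (hence the ``stringify_path`` call above):

from io import StringIO

# buf=None: the rendered text is returned as a string.
text = s.to_string(max_rows=10)

# buf given: the text is written to the buffer and None is returned.
buf = StringIO()
s.to_string(buf=buf, max_rows=10)
text = buf.getvalue()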
Example #10
 def test_stringify_file_and_path_like(self):
     # GH 38125: do not stringify file objects that are also path-like
     fsspec = pytest.importorskip("fsspec")
     with tm.ensure_clean() as path:
         with fsspec.open(f"file://{path}", mode="wb") as fsspec_obj:
             assert fsspec_obj == icom.stringify_path(fsspec_obj)
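This test refines the sketch following Example #4: an fsspec file object can be path-like and must still come back unchanged, because stringifying it would discard the open handle. A hedged guard consistent with that behaviour (the real pandas check is its ``is_file_like`` helper):

import os


def stringify_path_sketch(obj):
    # File-likeness wins over path-likeness: keep the open handle.
    if hasattr(obj, "read") or hasattr(obj, "write"):
        return obj
    if isinstance(obj, os.PathLike):
        return os.fspath(obj)
    return obj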
Example #11
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        if engine is None:
            # Determine ext and use odf for ods stream/file
            if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
                ext = None
                if _is_ods_stream(path_or_buffer):
                    engine = "odf"
            else:
                ext = os.path.splitext(str(path_or_buffer))[-1]
                if ext == ".ods":
                    engine = "odf"

            if (import_optional_dependency("xlrd",
                                           raise_on_missing=False,
                                           on_version="ignore") is not None):
                from xlrd import Book

                if isinstance(path_or_buffer, Book):
                    engine = "xlrd"

            # GH 35029 - Prefer openpyxl except for xls files
            if engine is None:
                if ext is None or isinstance(path_or_buffer,
                                             bytes) or ext == ".xls":
                    engine = "xlrd"
                elif (import_optional_dependency("openpyxl",
                                                 raise_on_missing=False,
                                                 on_version="ignore")
                      is not None):
                    engine = "openpyxl"
                else:
                    caller = inspect.stack()[1]
                    if (caller.filename.endswith("pandas/io/excel/_base.py")
                            and caller.function == "read_excel"):
                        stacklevel = 4
                    else:
                        stacklevel = 2
                    warnings.warn(
                        "The xlrd engine is no longer maintained and is not "
                        "supported when using pandas with python >= 3.9. However, "
                        "the engine xlrd will continue to be allowed for the "
                        "indefinite future. Beginning with pandas 1.2.0, the "
                        "openpyxl engine will be used if it is installed and the "
                        "engine argument is not specified. Either install openpyxl "
                        "or specify engine='xlrd' to silence this warning.",
                        FutureWarning,
                        stacklevel=stacklevel,
                    )
                    engine = "xlrd"
        if engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        self.engine = engine
        self.storage_options = storage_options

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        self._reader = self._engines[engine](self._io,
                                             storage_options=storage_options)
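Given the fallback chain above, passing ``engine`` explicitly skips the sniffing and silences the ``FutureWarning``; a usage sketch with a hypothetical file:

import pandas as pd

# Explicit engine: no fallback logic, no FutureWarning.
df = pd.read_excel("report.xlsx", engine="openpyxl")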
Example #12
def to_pickle(obj,
              path,
              compression="infer",
              protocol=pickle.HIGHEST_PROTOCOL):
    """
    Pickle (serialize) object to file.

    Parameters
    ----------
    obj : any object
        Any python object.
    path : str
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in specified path.
    protocol : int
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
        values for this parameter depend on the version of Python. For Python
        2.x, possible values are 0, 1, 2. For Python >= 3.0, 3 is a valid value.
        For Python >= 3.4, 4 is a valid value. A negative value for the
        protocol parameter is equivalent to setting its value to
        HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
        .. versionadded:: 0.21.0

    See Also
    --------
    read_pickle : Load pickled pandas object (or any object) from file.
    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
    DataFrame.to_sql : Write DataFrame to a SQL database.
    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = stringify_path(path)
    f, fh = get_handle(path, "wb", compression=compression, is_text=False)
    if protocol < 0:
        protocol = pickle.HIGHEST_PROTOCOL
    try:
        f.write(pickle.dumps(obj, protocol=protocol))
    finally:
        f.close()
        for _f in fh:
            _f.close()
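A usage sketch for the ``compression`` parameter, which the docstring describes but does not demonstrate; the filename is hypothetical:

import pandas as pd

df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})

# compression="infer" (the default) picks gzip from the ".gz" suffix.
pd.to_pickle(df, "dummy.pkl.gz")
assert pd.read_pickle("dummy.pkl.gz").equals(df)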
Example #13
def read_pickle(path, compression="infer"):
    """
    Load pickled pandas object (or any object) from file.

    .. warning::

       Loading pickled data received from untrusted sources can be
       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.

    Parameters
    ----------
    path : str
        File path where the pickled object will be loaded.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer', then use
        gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
        or '.zip' respectively, and no decompression otherwise.
        Set to None for no decompression.

    Returns
    -------
    unpickled : same type as object stored in file

    See Also
    --------
    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
    Series.to_pickle : Pickle (serialize) Series object to file.
    read_hdf : Read HDF5 file into a DataFrame.
    read_sql : Read SQL query or database table into a DataFrame.
    read_parquet : Load a parquet object, returning a DataFrame.

    Notes
    -----
    read_pickle is only guaranteed to be backwards compatible to pandas 0.20.3.

    Examples
    --------
    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
    >>> original_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9
    >>> pd.to_pickle(original_df, "./dummy.pkl")

    >>> unpickled_df = pd.read_pickle("./dummy.pkl")
    >>> unpickled_df
       foo  bar
    0    0    5
    1    1    6
    2    2    7
    3    3    8
    4    4    9

    >>> import os
    >>> os.remove("./dummy.pkl")
    """
    path = stringify_path(path)
    f, fh = get_handle(path, "rb", compression=compression, is_text=False)

    # 1) try standard library Pickle
    # 2) try pickle_compat (older pandas version) to handle subclass changes

    excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError)

    try:
        with warnings.catch_warnings(record=True):
            # We want to silence any warnings about, e.g. moved modules.
            warnings.simplefilter("ignore", Warning)
            return pickle.load(f)
    except excs_to_catch:
        # e.g.
        #  "No module named 'pandas.core.sparse.series'"
        #  "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
        return pc.load(f, encoding=None)
    except UnicodeDecodeError:
        # e.g. can occur for files written in py27; see GH#28645
        return pc.load(f, encoding="latin-1")
    finally:
        f.close()
        for _f in fh:
            _f.close()
Example #14
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.

        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        By file-like object, we refer to objects with a ``read()`` method,
        such as a file handler (e.g. via builtin ``open`` function)
        or ``StringIO``.
    format : str {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify a format string"
        )
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    reader: ReaderBase
    if format.lower() == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    elif format.lower() == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    data = reader.read()
    reader.close()
    return data
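A usage sketch; the filename is hypothetical, with the format inferred from the ``.sas7bdat`` extension as coded above:

import pandas as pd

# Whole file at once ...
df = pd.read_sas("airline.sas7bdat", encoding="latin-1")

# ... or incrementally: with chunksize (or iterator=True) the reader
# itself is returned instead of a DataFrame.
reader = pd.read_sas("airline.sas7bdat", chunksize=1000)
for chunk in reader:
    ...  # process each 1000-row DataFrame
reader.close()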
Example #15
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # First argument can also be bytes, so create a buffer
        if isinstance(path_or_buffer, bytes):
            path_or_buffer = BytesIO(path_or_buffer)

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = Version(get_version(xlrd))

        ext = None
        if engine is None:
            # Only determine ext if it is needed
            if xlrd_version is not None and isinstance(path_or_buffer,
                                                       xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(content_or_path=path_or_buffer,
                                           storage_options=storage_options)
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually.")

            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        if engine == "xlrd" and xlrd_version is not None:
            if ext is None:
                # Need ext in order to decide whether to raise or warn
                if isinstance(path_or_buffer, xlrd.Book):
                    ext = "xls"
                else:
                    ext = inspect_excel_format(path_or_buffer,
                                               storage_options=storage_options)

            # Pass through if ext is None; otherwise check whether ext is valid for xlrd
            if ext and ext != "xls" and xlrd_version >= Version("2"):
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            elif ext and ext != "xls":
                stacklevel = find_stack_level()
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )

        self.engine = engine
        self.storage_options = storage_options

        self._reader = self._engines[engine](self._io,
                                             storage_options=storage_options)
Example #16
def to_json(
    path_or_buf,
    obj,
    orient: Optional[str] = None,
    date_format: str = "epoch",
    double_precision: int = 10,
    force_ascii: bool = True,
    date_unit: str = "ms",
    default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
    lines: bool = False,
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
):

    if not index and orient not in ["split", "table"]:
        raise ValueError(
            "'index=False' is only valid when 'orient' is 'split' or 'table'")

    path_or_buf = stringify_path(path_or_buf)
    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")

    if orient == "table" and isinstance(obj, Series):
        obj = obj.to_frame(name=obj.name or "values")

    writer: Type["Writer"]
    if orient == "table" and isinstance(obj, DataFrame):
        writer = JSONTableWriter
    elif isinstance(obj, Series):
        writer = SeriesWriter
    elif isinstance(obj, DataFrame):
        writer = FrameWriter
    else:
        raise NotImplementedError("'obj' should be a Series or a DataFrame")

    s = writer(
        obj,
        orient=orient,
        date_format=date_format,
        double_precision=double_precision,
        ensure_ascii=force_ascii,
        date_unit=date_unit,
        default_handler=default_handler,
        index=index,
        indent=indent,
    ).write()

    if lines:
        s = convert_to_line_delimits(s)

    if isinstance(path_or_buf, str):
        fh, handles = get_handle(path_or_buf, "w", compression=compression)
        try:
            fh.write(s)
        finally:
            fh.close()
    elif path_or_buf is None:
        return s
    else:
        path_or_buf.write(s)
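This is the internal writer behind the public ``to_json`` methods. A hedged sketch of the corresponding public call, exercising the ``lines`` path (which, per the check above, requires ``orient='records'``):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# path_or_buf=None: the JSON string is returned rather than written out.
payload = df.to_json(orient="records", lines=True)
# '{"a":1,"b":"x"}\n{"a":2,"b":"y"}'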
Example #17
def read_sas(
    filepath_or_buffer: FilePath | ReadBuffer[bytes],
    format: str | None = None,
    index: Hashable | None = None,
    encoding: str | None = None,
    chunksize: int | None = None,
    iterator: bool = False,
) -> DataFrame | ReaderBase:
    """
    Read SAS files stored as either XPORT or SAS7BDAT format files.

    Parameters
    ----------
    filepath_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be:
        ``file://localhost/path/to/table.sas``.
    format : str {'xport', 'sas7bdat'} or None
        If None, file format is inferred from file extension. If 'xport' or
        'sas7bdat', uses the corresponding format.
    index : identifier of index column, defaults to None
        Identifier of column that should be used as index of the DataFrame.
    encoding : str, default is None
        Encoding for text data.  If None, text data are stored as raw bytes.
    chunksize : int
        Read file `chunksize` lines at a time, returns iterator.

        .. versionchanged:: 1.2

            ``TextFileReader`` is a context manager.
    iterator : bool, defaults to False
        If True, returns an iterator for reading the file incrementally.

        .. versionchanged:: 1.2

            ``TextFileReader`` is a context manager.

    Returns
    -------
    DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
    or XportReader
    """
    if format is None:
        buffer_error_msg = (
            "If this is a buffer object rather "
            "than a string name, you must specify a format string"
        )
        filepath_or_buffer = stringify_path(filepath_or_buffer)
        if not isinstance(filepath_or_buffer, str):
            raise ValueError(buffer_error_msg)
        fname = filepath_or_buffer.lower()
        if fname.endswith(".xpt"):
            format = "xport"
        elif fname.endswith(".sas7bdat"):
            format = "sas7bdat"
        else:
            raise ValueError("unable to infer format of SAS file")

    reader: ReaderBase
    if format.lower() == "xport":
        from pandas.io.sas.sas_xport import XportReader

        reader = XportReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
        )
    elif format.lower() == "sas7bdat":
        from pandas.io.sas.sas7bdat import SAS7BDATReader

        reader = SAS7BDATReader(
            filepath_or_buffer,
            index=index,
            encoding=encoding,
            chunksize=chunksize,
        )
    else:
        raise ValueError("unknown SAS format")

    if iterator or chunksize:
        return reader

    with reader:
        return reader.read()
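Per the ``versionchanged:: 1.2`` notes above, the returned reader is a context manager; a usage sketch with a hypothetical file:

import pandas as pd

# The with-block closes the underlying handle automatically.
with pd.read_sas("airline.sas7bdat", chunksize=1000) as reader:
    for chunk in reader:
        ...  # process each 1000-row DataFrame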