示例#1
0
    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
        """
        Read every row of ``sheet`` into a rectangular list of converted cells.

        Parameters
        ----------
        sheet
            openpyxl worksheet to read. It may come from a workbook opened in
            read-only mode or from a regular workbook.
        convert_float : bool
            Forwarded unchanged to ``self._convert_cell`` for every cell.

        Returns
        -------
        List[List[Scalar]]
            One inner list per row of ``sheet.rows``. When the sheet's
            dimensions were reset, shorter rows are right-padded with ``""``
            so that all rows have the same length.
        """
        # GH 39001
        # Reading of excel file depends on dimension data being correct but
        # writers sometimes omit or get it wrong
        import openpyxl

        version = LooseVersion(get_version(openpyxl))

        # Only read-only worksheets expose reset_dimensions(); calling it on a
        # worksheet from a regular workbook raises AttributeError. There is no
        # good way of determining if a sheet is read-only, so probe for the
        # method instead:
        # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
        dimensions_reset = version >= "3.0.0" and hasattr(
            sheet, "reset_dimensions")
        if dimensions_reset:
            sheet.reset_dimensions()

        data: List[List[Scalar]] = [
            [self._convert_cell(cell, convert_float) for cell in row]
            for row in sheet.rows
        ]

        if dimensions_reset and data:
            # With dimension reset, openpyxl no longer pads rows
            max_width = max(len(data_row) for data_row in data)
            if min(len(data_row) for data_row in data) < max_width:
                empty_cell: List[Scalar] = [""]
                data = [
                    data_row + (max_width - len(data_row)) * empty_cell
                    for data_row in data
                ]

        return data
示例#2
0
    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
        """
        Convert all rows of ``sheet`` to lists of scalars of equal length.

        Each cell is passed through ``self._convert_cell`` together with
        ``convert_float``. If the sheet exposes ``reset_dimensions`` and
        openpyxl is at least 3.0.0, the dimensions are reset before reading
        and short rows are right-padded with ``""`` afterwards.
        """
        # GH 39001
        # Reading of excel file depends on dimension data being correct but
        # writers sometimes omit or get it wrong
        import openpyxl

        openpyxl_version = LooseVersion(get_version(openpyxl))

        # There is no good way of determining if a sheet is read-only
        # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
        has_reset = hasattr(sheet, "reset_dimensions")
        dimensions_reset = openpyxl_version >= "3.0.0" and has_reset

        if dimensions_reset:
            sheet.reset_dimensions()

        rows: List[List[Scalar]] = [
            [self._convert_cell(cell, convert_float) for cell in row]
            for row in sheet.rows
        ]

        # Nothing to pad unless the dimensions were reset and rows exist.
        if not (dimensions_reset and rows):
            return rows

        # With dimension reset, openpyxl no longer pads rows
        widths = [len(row) for row in rows]
        widest = max(widths)
        if min(widths) == widest:
            return rows

        filler: List[Scalar] = [""]
        return [row + (widest - len(row)) * filler for row in rows]
示例#3
0
def test_read_with_bad_dimension(datapath, ext, header, expected_data,
                                 filename, read_only, request):
    """
    Read a workbook whose dimension information is missing or incorrect.

    When ``read_only`` is None the file is read directly from its path;
    otherwise it is first loaded with ``openpyxl.load_workbook`` using the
    given ``read_only`` flag and the workbook object is handed to
    ``pd.read_excel``. The result must equal ``DataFrame(expected_data)``.
    """
    # GH 38956, 39001 - no/incorrect dimension information
    version = LooseVersion(get_version(openpyxl))
    if (read_only or read_only is None) and version < "3.0.0":
        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    path = datapath("io", "data", "excel", f"{filename}{ext}")
    if read_only is None:
        result = pd.read_excel(path, header=header)
    else:
        wb = openpyxl.load_workbook(path, read_only=read_only)
        try:
            result = pd.read_excel(wb, engine="openpyxl", header=header)
        finally:
            # Close the workbook even if read_excel raises, so the file
            # handle is not left open.
            wb.close()
    expected = DataFrame(expected_data)
    tm.assert_frame_equal(result, expected)
示例#4
0
def test_read_with_empty_trailing_rows(datapath, ext, read_only, request):
    """
    Read the ``empty_trailing_rows`` fixture workbook and compare the result.

    When ``read_only`` is None the file is read directly from its path;
    otherwise it is first loaded with ``openpyxl.load_workbook`` using the
    given ``read_only`` flag and the workbook object is handed to
    ``pd.read_excel``.
    """
    # GH 39181
    version = LooseVersion(get_version(openpyxl))
    if (read_only or read_only is None) and version < "3.0.0":
        msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    path = datapath("io", "data", "excel", f"empty_trailing_rows{ext}")
    if read_only is None:
        result = pd.read_excel(path)
    else:
        wb = openpyxl.load_workbook(path, read_only=read_only)
        try:
            result = pd.read_excel(wb, engine="openpyxl")
        finally:
            # Close the workbook even if read_excel raises, so the file
            # handle is not left open.
            wb.close()
    expected = DataFrame({
        "Title": [np.nan, "A", 1, 2, 3],
        "Unnamed: 1": [np.nan, "B", 4, 5, 6],
        "Unnamed: 2": [np.nan, "C", 7, 8, 9],
    })
    tm.assert_frame_equal(result, expected)
示例#5
0
def _get_dependency_info() -> Dict[str, JSONSerializable]:
    """
    Map each known dependency name to its version, or None if unavailable.

    The names are a fixed list followed by every key of ``VERSIONS``. A
    module that ``import_optional_dependency`` does not return is reported
    as None.
    """
    fixed_names = [
        "pandas",
        # required
        "numpy",
        "pytz",
        "dateutil",
        # install / build,
        "pip",
        "setuptools",
        "Cython",
        # test
        "pytest",
        "hypothesis",
        # docs
        "sphinx",
        # Other, need a min version
        "blosc",
        "feather",
        "xlsxwriter",
        "lxml.etree",
        "html5lib",
        "pymysql",
        "psycopg2",
        "jinja2",
        # Other, not imported.
        "IPython",
        "pandas_datareader",
    ]
    all_names = [*fixed_names, *VERSIONS]

    info: Dict[str, JSONSerializable] = {}
    for name in all_names:
        module = import_optional_dependency(name,
                                            raise_on_missing=False,
                                            on_version="ignore")
        if module:
            info[name] = get_version(module)
        else:
            info[name] = None
    return info
示例#6
0
    "header, expected_data",
    [
        (
            0,
            {
                "Title": [np.nan, "A", 1, 2, 3],
                "Unnamed: 1": [np.nan, "B", 4, 5, 6],
                "Unnamed: 2": [np.nan, "C", 7, 8, 9],
            },
        ),
        (2, {
            "A": [1, 2, 3],
            "B": [4, 5, 6],
            "C": [7, 8, 9]
        }),
    ],
)
@pytest.mark.parametrize(
    "filename", ["dimension_missing", "dimension_small", "dimension_large"])
@pytest.mark.xfail(
    LooseVersion(get_version(openpyxl)) < "3.0.0",
    reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
)
def test_read_with_bad_dimension(datapath, ext, header, expected_data,
                                 filename):
    """Read a workbook with bad dimension data and compare to the expected frame."""
    # GH 38956, 39001 - no/incorrect dimension information
    workbook_path = datapath("io", "data", "excel", f"{filename}{ext}")
    tm.assert_frame_equal(
        pd.read_excel(workbook_path, header=header),
        DataFrame(expected_data),
    )
示例#7
0
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        """
        Resolve the reader engine for ``path_or_buffer`` and build the reader.

        Parameters
        ----------
        path_or_buffer
            Input to read. Stored unchanged on ``self.io`` and, after
            ``stringify_path``, on ``self._io``.
        engine : optional
            Key into ``self._engines``. When None, the engine comes from the
            ``io.excel.<ext>.reader`` option, falling back to
            ``get_default_engine`` when that option is ``"auto"``.
        storage_options : StorageOptions
            Forwarded to ``inspect_excel_format`` and to the reader class.

        Raises
        ------
        ValueError
            If ``engine`` is not a known engine, if the format cannot be
            determined while ``engine`` is None, or if xlrd >= 2 is selected
            for an input whose detected format is not "xls".
        """
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed; stays None when xlrd is absent.
        xlrd_version = None
        if import_optional_dependency("xlrd", errors="ignore") is not None:
            import xlrd

            xlrd_version = Version(get_version(xlrd))
        xlrd_installed = xlrd_version is not None

        ext = None
        if engine is None:
            # Only determine ext if it is needed
            if xlrd_installed and isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(content_or_path=path_or_buffer,
                                           storage_options=storage_options)
                if ext is None:
                    raise ValueError(
                        "Excel file format cannot be determined, you must specify "
                        "an engine manually.")

            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        if engine == "xlrd" and xlrd_installed:
            if ext is None:
                # The engine was given explicitly, so ext has not been
                # computed yet; it is needed to decide whether to raise/warn.
                if isinstance(path_or_buffer, xlrd.Book):
                    ext = "xls"
                else:
                    ext = inspect_excel_format(path_or_buffer,
                                               storage_options=storage_options)

            # Pass through if ext is None, otherwise check if ext valid for xlrd
            if ext and ext != "xls":
                if xlrd_version >= Version("2"):
                    raise ValueError(
                        f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                        f"only the xls format is supported. Install openpyxl instead."
                    )
                level = find_stack_level()
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=level,
                )

        self.engine = engine
        self.storage_options = storage_options

        reader_cls = self._engines[engine]
        self._reader = reader_cls(self._io, storage_options=storage_options)
示例#8
0
from distutils.version import LooseVersion

import pytest

from pandas.compat._optional import get_version, import_optional_dependency

# One filterwarnings mark per ignored warning pattern, in the order listed.
pytestmark = [
    pytest.mark.filterwarnings(warning_filter)
    for warning_filter in (
        # Looks like tree.getiterator is deprecated in favor of tree.iter
        "ignore:This method will be removed in future versions:"
        "PendingDeprecationWarning",
        "ignore:This method will be removed in future versions:DeprecationWarning",
        # GH 26552
        "ignore:As the xlwt package is no longer maintained:FutureWarning",
        # GH 38571
        "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning",
    )
]

# Version of the installed xlrd, or None when xlrd cannot be imported.
xlrd_version = None
if import_optional_dependency("xlrd", errors="ignore") is not None:
    import xlrd

    xlrd_version = LooseVersion(get_version(xlrd))
示例#9
0
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        """
        Choose a reader engine for ``path_or_buffer`` and instantiate it.

        Parameters
        ----------
        path_or_buffer
            Input to read. Stored unchanged on ``self.io`` and, after
            ``stringify_path``, on ``self._io``.
        engine : optional
            Key into ``self._engines``. When None, the engine is picked from
            the detected extension: "ods" -> "odf", "xls" -> "xlrd", anything
            else -> "openpyxl" if it can be imported, otherwise "xlrd".
        storage_options : StorageOptions
            Forwarded to ``inspect_excel_format`` and to the reader class.

        Raises
        ------
        ValueError
            If ``engine`` is given but is not in ``self._engines``, or if the
            xlrd engine is selected with xlrd >= 2 for an input whose detected
            extension is not "xls".

        Warns
        -----
        FutureWarning
            If the xlrd engine is selected with xlrd < 2 for an input whose
            detected extension is not "xls".
        """
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if (import_optional_dependency(
                "xlrd", raise_on_missing=False, on_version="ignore") is None):
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = LooseVersion(get_version(xlrd))

        ext = None
        if engine is None:
            # Only determine ext if it is needed
            # `xlrd` is only bound when xlrd_version is not None, so the
            # short-circuit below protects the xlrd.Book lookup.
            if xlrd_version is not None and isinstance(path_or_buffer,
                                                       xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(content=path_or_buffer,
                                           storage_options=storage_options)

            if ext == "ods":
                engine = "odf"
            elif ext == "xls":
                engine = "xlrd"
            else:
                # GH 35029 - Prefer openpyxl except for xls files
                if (import_optional_dependency("openpyxl",
                                               raise_on_missing=False,
                                               on_version="ignore")
                        is not None):
                    engine = "openpyxl"
                else:
                    engine = "xlrd"

        if engine == "xlrd" and xlrd_version is not None:
            if ext is None:
                # The engine was given explicitly, so ext was not computed
                # above; it is needed to decide whether to raise/warn.
                if isinstance(path_or_buffer, xlrd.Book):
                    ext = "xls"
                else:
                    ext = inspect_excel_format(content=path_or_buffer,
                                               storage_options=storage_options)

            if ext != "xls" and xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            elif ext != "xls":
                # Pick the stacklevel from the immediate caller: 4 when this
                # constructor was called from read_excel in
                # pandas/io/excel/_base.py, 2 otherwise.
                # NOTE(review): presumably this makes the warning point at
                # user code in both cases - confirm.
                caller = inspect.stack()[1]
                if (caller.filename.endswith(
                        os.path.join("pandas", "io", "excel", "_base.py"))
                        and caller.function == "read_excel"):
                    stacklevel = 4
                else:
                    stacklevel = 2
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. As a result, the "
                    f"openpyxl engine will be used if it is installed and the "
                    f"engine argument is not specified. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )
        # Sanity check: every branch above should leave a known engine name.
        assert engine in self._engines, f"Engine {engine} not recognized"

        self.engine = engine
        self.storage_options = storage_options

        # Instantiate the reader class registered for the chosen engine.
        self._reader = self._engines[engine](self._io,
                                             storage_options=storage_options)
示例#10
0
    def __init__(self,
                 path_or_buffer,
                 engine=None,
                 storage_options: StorageOptions = None):
        """
        Choose a reader engine for ``path_or_buffer`` and instantiate it.

        Parameters
        ----------
        path_or_buffer
            Input to read. Stored unchanged on ``self.io`` and, after
            ``stringify_path``, on ``self._io``.
        engine : optional
            Key into ``self._engines``. When None, the engine comes from the
            ``io.excel.<ext>.reader`` option, falling back to
            ``get_default_engine`` when that option is ``"auto"``.
        storage_options : StorageOptions
            Forwarded to ``inspect_excel_format`` and to the reader class.

        Raises
        ------
        ValueError
            If ``engine`` is given but is not in ``self._engines``, or if the
            xlrd engine is selected with xlrd >= 2 for an input whose detected
            extension is not "xls".

        Warns
        -----
        FutureWarning
            If the xlrd engine is selected with xlrd < 2 for an input whose
            detected extension is not "xls".
        """
        if engine is not None and engine not in self._engines:
            raise ValueError(f"Unknown engine: {engine}")

        # Could be a str, ExcelFile, Book, etc.
        self.io = path_or_buffer
        # Always a string
        self._io = stringify_path(path_or_buffer)

        # Determine xlrd version if installed
        if import_optional_dependency("xlrd", errors="ignore") is None:
            xlrd_version = None
        else:
            import xlrd

            xlrd_version = LooseVersion(get_version(xlrd))

        ext = None
        if engine is None:
            # Only determine ext if it is needed
            # `xlrd` is only bound when xlrd_version is not None, so the
            # short-circuit below protects the xlrd.Book lookup.
            if xlrd_version is not None and isinstance(path_or_buffer,
                                                       xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(content_or_path=path_or_buffer,
                                           storage_options=storage_options)

            # ext will always be valid, otherwise inspect_excel_format would raise
            engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
            if engine == "auto":
                engine = get_default_engine(ext, mode="reader")

        if engine == "xlrd" and xlrd_version is not None:
            if ext is None:
                # The engine was given explicitly, so ext was not computed
                # above; it is needed to decide whether to raise/warn.
                if isinstance(path_or_buffer, xlrd.Book):
                    ext = "xls"
                else:
                    ext = inspect_excel_format(path_or_buffer,
                                               storage_options=storage_options)

            if ext != "xls" and xlrd_version >= "2":
                raise ValueError(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install openpyxl instead."
                )
            elif ext != "xls":
                # Pick the stacklevel from the immediate caller: 4 when this
                # constructor was called from read_excel in
                # pandas/io/excel/_base.py, 2 otherwise.
                # NOTE(review): presumably this makes the warning point at
                # user code in both cases - confirm.
                caller = inspect.stack()[1]
                if (caller.filename.endswith(
                        os.path.join("pandas", "io", "excel", "_base.py"))
                        and caller.function == "read_excel"):
                    stacklevel = 4
                else:
                    stacklevel = 2
                warnings.warn(
                    f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                    f"only the xls format is supported. Install "
                    f"openpyxl instead.",
                    FutureWarning,
                    stacklevel=stacklevel,
                )

        self.engine = engine
        self.storage_options = storage_options

        # Instantiate the reader class registered for the chosen engine.
        self._reader = self._engines[engine](self._io,
                                             storage_options=storage_options)
示例#11
0
from pandas.compat._optional import (
    get_version,
    import_optional_dependency,
)

from pandas.util.version import Version

# One filterwarnings mark per ignored warning pattern, in the order listed.
pytestmark = [
    pytest.mark.filterwarnings(warning_filter)
    for warning_filter in (
        # Looks like tree.getiterator is deprecated in favor of tree.iter
        "ignore:This method will be removed in future versions:"
        "PendingDeprecationWarning",
        "ignore:This method will be removed in future versions:DeprecationWarning",
        # GH 26552
        "ignore:As the xlwt package is no longer maintained:FutureWarning",
        # GH 38571
        "ignore:.*In xlrd >= 2.0, only the xls format is supported:FutureWarning",
    )
]

# Version of the installed xlrd, or None when xlrd cannot be imported.
xlrd_version = None
if import_optional_dependency("xlrd", errors="ignore") is not None:
    import xlrd

    xlrd_version = Version(get_version(xlrd))