def _handle_usecols( self, columns: list[list[str | int | None]], usecols_key: list[str | int | None], num_original_columns: int, ): """ Sets self._col_indices usecols_key is used if there are string usecols. """ if self.usecols is not None: if callable(self.usecols): col_indices = self._evaluate_usecols(self.usecols, usecols_key) elif any(isinstance(u, str) for u in self.usecols): if len(columns) > 1: raise ValueError( "If using multiple headers, usecols must be integers." ) col_indices = [] for col in self.usecols: if isinstance(col, str): try: col_indices.append(usecols_key.index(col)) except ValueError: self._validate_usecols_names(self.usecols, usecols_key) else: col_indices.append(col) else: missing_usecols = [ col for col in self.usecols if col >= num_original_columns ] if missing_usecols: warnings.warn( "Defining usecols with out of bounds indices is deprecated " "and will raise a ParserError in a future version.", FutureWarning, stacklevel=find_stack_level(), ) col_indices = self.usecols columns = [ [n for i, n in enumerate(column) if i in col_indices] for column in columns ] self._col_indices = sorted(col_indices) return columns
def transform_dict_like(self, func):
    """
    Run ``transform`` for a dict-like ``func``, one entry at a time.

    Each key selects a 1-dimensional piece of ``self.obj`` and the matching
    value is passed to that piece's ``transform``. Successful pieces are
    concatenated along axis 1. Entries that fail are collected and reported
    through a FutureWarning, unless nothing succeeded, in which case an
    error is raised.

    Raises
    ------
    ValueError
        If ``func`` is empty, or if every entry failed and at least one of
        the failures was not a TypeError.
    TypeError
        If every entry failed with a TypeError.
    """
    from pandas.core.reshape.concat import concat

    obj, positional, keywords = self.obj, self.args, self.kwargs

    # transform is currently only for Series/DataFrame
    assert isinstance(obj, ABCNDFrame)

    if len(func) == 0:
        raise ValueError("No transform functions were provided")

    func = self.normalize_dictlike_arg("transform", obj, func)

    # Errors carrying one of these messages are re-raised untouched.
    passthrough_messages = {
        "Function did not transform",
        "No transform functions were provided",
    }

    results: dict[Hashable, DataFrame | Series] = {}
    failed_names = []
    only_type_errors = True
    for name, how in func.items():
        piece = obj._gotitem(name, ndim=1)
        try:
            results[name] = piece.transform(how, 0, *positional, **keywords)
        except Exception as err:
            if str(err) in passthrough_messages:
                raise err
            only_type_errors = only_type_errors and isinstance(err, TypeError)
            failed_names.append(name)

    # combine results
    if not results:
        error_cls = TypeError if only_type_errors else ValueError
        raise error_cls("Transform function failed")

    if failed_names:
        warnings.warn(
            f"{failed_names} did not transform successfully. If any error is "
            f"raised, this will raise in a future version of pandas. "
            f"Drop these columns/ops to avoid this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return concat(results, axis=1)
def is_monotonic(self) -> bool:
    """
    Deprecated spelling of ``is_monotonic_increasing``.

    Emits a FutureWarning and then returns ``self.is_monotonic_increasing``.

    Returns
    -------
    bool
    """
    message = (
        "is_monotonic is deprecated and will be removed in a future version. "
        "Use is_monotonic_increasing instead."
    )
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())
    return self.is_monotonic_increasing
def parse_date_time(date_col, time_col):
    """
    Combine a date column and a time column into one datetime column.

    Both inputs are passed through ``_maybe_cast`` and then handed to
    ``parsing.try_parse_date_and_time``. A FutureWarning pointing at
    ``pd.to_datetime`` is emitted on every call.

    .. deprecated:: 1.2
    """
    message = """ Use pd.to_datetime(date_col + " " + time_col) instead to get a Pandas Series. Use pd.to_datetime(date_col + " " + time_col).to_pydatetime() instead to get a Numpy array. """  # noqa: E501
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())
    return parsing.try_parse_date_and_time(
        _maybe_cast(date_col), _maybe_cast(time_col)
    )
def deprecate_ndim_indexing(result, stacklevel: int = 3) -> None:
    """
    Warn when indexing a 1D Series/Index produced a multi-dimensional result.

    GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
    and keep an index, so we currently return ndarray, which is
    deprecated (Deprecation GH#30588).

    Parameters
    ----------
    result : object
        Outcome of the indexing operation; only its number of dimensions
        is inspected.
    stacklevel : int, default 3
        Not read by this function; the warning's stack level comes from
        ``find_stack_level()``.
    """
    if np.ndim(result) <= 1:
        # 0-d and 1-d results are fine, nothing to report.
        return

    warnings.warn(
        "Support for multi-dimensional indexing (e.g. `obj[:, None]`) "
        "is deprecated and will be removed in a future "
        "version. Convert to a numpy array before indexing instead.",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
def _deprecate_mismatched_indexing(self, key) -> None:
    """
    Warn when ``key`` and this index disagree on timezone awareness.

    Nothing happens when ``self._data._assert_tzawareness_compat(key)``
    accepts the key. When it raises TypeError, a FutureWarning is emitted
    whose wording depends on whether ``self.tz`` is None.
    """
    # GH#36148
    # we get here with isinstance(key, self._data._recognized_scalars)
    try:
        self._data._assert_tzawareness_compat(key)
    except TypeError:
        if self.tz is not None:
            msg = (
                "Indexing a timezone-aware DatetimeIndex with a "
                "timezone-naive datetime is deprecated and will "
                "raise KeyError in a future version. "
                "Use a timezone-aware object instead."
            )
        else:
            msg = (
                "Indexing a timezone-naive DatetimeIndex with a "
                "timezone-aware datetime is deprecated and will "
                "raise KeyError in a future version. "
                "Use a timezone-naive object instead."
            )
        warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
def get_offset(name: str) -> BaseOffset:
    """
    Look up the DateOffset object for a rule name.

    Emits a FutureWarning recommending ``to_offset`` and then returns
    ``_get_offset(name)``.

    .. deprecated:: 1.0.0

    Examples
    --------
    get_offset('EOM') --> BMonthEnd(1)
    """
    message = (
        "get_offset is deprecated and will be removed in a future version, "
        "use to_offset instead."
    )
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())
    return _get_offset(name)
def __getattr__(name: str):
    """
    Module-level attribute hook serving the deprecated ``CategoricalBlock``.

    Requesting ``CategoricalBlock`` emits a DeprecationWarning and returns
    the class imported from ``pandas.core.internals.blocks``. Any other
    name raises AttributeError.
    """
    import warnings

    from pandas.util._exceptions import find_stack_level

    if name != "CategoricalBlock":
        raise AttributeError(
            f"module 'pandas.core.internals' has no attribute '{name}'"
        )

    warnings.warn(
        "CategoricalBlock is deprecated and will be removed in a future version. "
        "Use ExtensionBlock instead.",
        DeprecationWarning,
        stacklevel=find_stack_level(),
    )
    # Imported lazily, only once the deprecated name is actually requested.
    from pandas.core.internals.blocks import CategoricalBlock

    return CategoricalBlock
def copy( self, name: Hashable = None, deep: bool = False, dtype: Dtype | None = None, names=None, ): name = self._validate_names(name=name, names=names, deep=deep)[0] new_index = self._rename(name=name) if dtype: warnings.warn( "parameter dtype is deprecated and will be removed in a future " "version. Use the astype method instead.", FutureWarning, stacklevel=find_stack_level(), ) new_index = new_index.astype(dtype) return new_index
def weekofyear(self):
    """
    The week ordinal of the year.

    Takes ``self.isocalendar().week``, renames it to ``self.name`` and
    casts it to float64 when it contains missing values, int64 otherwise.

    .. deprecated:: 1.1.0

    Series.dt.weekofyear and Series.dt.week have been deprecated.
    Please use Series.dt.isocalendar().week instead.
    """
    warnings.warn(
        "Series.dt.weekofyear and Series.dt.week have been deprecated. "
        "Please use Series.dt.isocalendar().week instead.",
        FutureWarning,
        stacklevel=find_stack_level(),
    )
    weeks = self.isocalendar().week
    weeks.name = self.name
    # Missing values cannot be held in int64, so fall back to float64.
    target_dtype = "float64" if weeks.hasnans else "int64"
    return weeks.astype(target_dtype)
def parse_date_fields(year_col, month_col, day_col):
    """
    Combine year, month and day columns into a single date column.

    Each input is passed through ``_maybe_cast`` and the three results are
    handed to ``parsing.try_parse_year_month_day``. A FutureWarning
    pointing at ``pd.to_datetime`` is emitted on every call.

    .. deprecated:: 1.2
    """
    message = """ Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) instead to get a Pandas Series. Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) and np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. """  # noqa: E501
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())
    return parsing.try_parse_year_month_day(
        _maybe_cast(year_col), _maybe_cast(month_col), _maybe_cast(day_col)
    )
def reconstruct(result):
    """
    Wrap a ufunc result back into the calling object's type.

    Scalars are returned untouched and tuples are handled element-wise.
    A result whose ``ndim`` differs from ``self.ndim`` is returned as-is,
    except for ``method == "outer"``, which warns for 2-dimensional
    ``self`` and raises NotImplementedError otherwise. Everything else is
    passed through ``self._constructor``.

    Relies on ``self``, ``method``, ``ufunc``, ``reconstruct_axes``,
    ``reconstruct_kwargs`` and ``alignable`` from the enclosing scope.
    """
    if lib.is_scalar(result):
        return result

    if isinstance(result, tuple):
        # np.modf, np.frexp, np.divmod
        return tuple(map(reconstruct, result))

    if result.ndim != self.ndim:
        if method != "outer":
            return result
        if self.ndim != 2:
            raise NotImplementedError
        # we already deprecated for Series
        msg = (
            "outer method for ufunc {} is not implemented on "
            "pandas objects. Returning an ndarray, but in the "
            "future this will raise a 'NotImplementedError'. "
            "Consider explicitly converting the DataFrame "
            "to an array with '.to_numpy()' first."
        )
        warnings.warn(
            msg.format(ufunc), FutureWarning, stacklevel=find_stack_level()
        )
        return result

    # A BlockManager result (we went through BlockManager.apply) is passed
    # without axes; a converted array lost our axes and gets them re-supplied.
    axes_kwargs = {} if isinstance(result, BlockManager) else reconstruct_axes
    result = self._constructor(
        result, **axes_kwargs, **reconstruct_kwargs, copy=False
    )

    # TODO: When we support multiple values in __finalize__, this
    # should pass alignable to `__finalize__` instead of self.
    # Then `np.add(a, b)` would consider attrs from both a and b
    # when a and b are NDFrames.
    if len(alignable) == 1:
        result = result.__finalize__(self)
    return result
def astype(self, dtype, copy: bool = True, how=lib.no_default):
    """
    Cast to ``dtype``.

    For datetime64 targets the conversion goes through
    ``self.to_timestamp(how=how)`` followed by ``tz_localize`` with the
    target's ``tz`` (None when the dtype has none). Every other dtype is
    delegated to the parent class. Passing ``how`` is deprecated and emits
    a FutureWarning; when omitted it defaults to "start".
    """
    dtype = pandas_dtype(dtype)

    if how is lib.no_default:
        how = "start"
    else:
        # GH#37982
        warnings.warn(
            "The 'how' keyword in PeriodIndex.astype is deprecated and "
            "will be removed in a future version. "
            "Use index.to_timestamp(how=how) instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    if not is_datetime64_any_dtype(dtype):
        return super().astype(dtype, copy=copy)

    # 'how' is index-specific, isn't part of the EA interface.
    tz = getattr(dtype, "tz", None)
    return self.to_timestamp(how=how).tz_localize(tz)
def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
    """
    Warn when the number of column names and data columns disagree.

    One set of trailing commas is allowed: a single extra data column made
    up only of empty strings / missing values does not trigger the
    warning. Nothing is checked when ``self.index_col`` is truthy or
    ``columns`` is empty. self.index_col not False results in a
    ParserError previously when lengths do not match.

    Parameters
    ----------
    columns: list of column names
    data: list of array-likes containing the data column-wise.
    """
    if self.index_col or len(columns) == len(data) or not columns:
        return

    if len(columns) == len(data) - 1:
        trailing = data[-1]
        blank = is_object_dtype(trailing) and trailing == ""
        if np.all(blank | isna(trailing)):
            # The extra column holds nothing: tolerated trailing delimiter.
            return

    warnings.warn(
        "Length of header or names does not match length of data. This leads "
        "to a loss of data with index_col=False.",
        ParserWarning,
        stacklevel=find_stack_level(),
    )
def generic_parser(parse_func, *cols):
    """
    Build one object column by calling ``parse_func`` row by row.

    For each row index, the values of every column in ``cols`` at that
    position are passed to ``parse_func`` as positional arguments. The
    number of rows comes from ``_check_columns(cols)``.

    .. deprecated:: 1.2
    """
    message = """ Use pd.to_datetime instead. """
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())

    n_rows = _check_columns(cols)
    results = np.empty(n_rows, dtype=object)

    for row in range(n_rows):
        results[row] = parse_func(*[col[row] for col in cols])

    return results
def is_inferred_bool_dtype(arr: ArrayLike) -> bool:
    """
    Check if this is a ndarray[bool] or an ndarray[object] of bool objects.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray

    Returns
    -------
    bool
        Always False for anything that is not an ``np.ndarray``.

    Notes
    -----
    This does not include the special treatment is_bool_dtype uses for
    Categorical. An object-dtype array made only of bools still counts,
    but a FutureWarning is emitted for it.
    """
    if not isinstance(arr, np.ndarray):
        return False

    dtype = arr.dtype
    if dtype == np.dtype(bool):
        return True

    all_bools = False
    if dtype == np.dtype("object"):
        all_bools = lib.is_bool_array(arr)
        if all_bools:
            # GH#46188
            warnings.warn(
                "In a future version, object-dtype columns with all-bool values "
                "will not be included in reductions with bool_only=True. "
                "Explicitly cast to bool dtype instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
    return all_bools
def union_many(self, others):
    """
    Union this index with every index in ``others``, one after another.

    A bit of a hack to accelerate unioning a collection of indexes.
    Emits a FutureWarning on every call. The result carries the name
    returned by ``get_unanimous_names(self, *others)``.
    """
    warnings.warn(
        "DatetimeIndex.union_many is deprecated and will be removed in "
        "a future version. Use obj.union instead.",
        FutureWarning,
        stacklevel=find_stack_level(),
    )

    this = self
    for other in others:
        if isinstance(this, DatetimeIndex):
            if not isinstance(other, DatetimeIndex):
                try:
                    other = DatetimeIndex(other)
                except TypeError:
                    # Not convertible; carry on with `other` unchanged.
                    pass

            this, other = this._maybe_utc_convert(other)

            # union already has fastpath handling for empty cases
            # NOTE(review): the emptiness check reads len(self), not len(this).
            take_fast_path = (
                len(self) and len(other) and this._can_fast_union(other)
            )
            if take_fast_path:
                this = this._fast_union(other)
            else:
                this = Index.union(this, other)
        else:
            # The running result is no longer a DatetimeIndex.
            this = Index.union(this, other)

    res_name = get_unanimous_names(self, *others)[0]
    return this.rename(res_name) if this.name != res_name else this
def astype(self, dtype, copy: bool = True, how=lib.no_default):
    """
    Cast to ``dtype``.

    Datetime64 targets are produced with ``self.to_timestamp(how=how)``
    and then localized to the target's ``tz`` (None when absent). All
    other dtypes are delegated to the parent class. The ``how`` keyword is
    deprecated: passing it emits a FutureWarning, omitting it means
    "start".
    """
    dtype = pandas_dtype(dtype)

    how_was_passed = how is not lib.no_default
    if how_was_passed:
        # GH#37982
        warnings.warn(
            "The 'how' keyword in PeriodIndex.astype is deprecated and "
            "will be removed in a future version. "
            "Use index.to_timestamp(how=how) instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    else:
        how = "start"

    if not is_datetime64_any_dtype(dtype):
        return super().astype(dtype, copy=copy)

    # 'how' is index-specific, isn't part of the EA interface.
    # GH#45038 implement this for PeriodArray (but without "how")
    #  once the "how" deprecation is enforced we can just dispatch
    #  directly to PeriodArray.
    target_tz = getattr(dtype, "tz", None)
    stamps = self.to_timestamp(how=how)
    return stamps.tz_localize(target_tz)
def is_categorical(arr) -> bool:
    """
    Check whether an array-like is a Categorical instance.

    Emits a FutureWarning on every call; ``is_categorical_dtype`` is the
    suggested replacement.

    Parameters
    ----------
    arr : array-like
        The array-like to check.

    Returns
    -------
    boolean
        Whether or not the array-like is of a Categorical instance.

    Examples
    --------
    >>> is_categorical([1, 2, 3])
    False

    Categoricals, Series Categoricals, and CategoricalIndex will return True.

    >>> cat = pd.Categorical([1, 2, 3])
    >>> is_categorical(cat)
    True
    >>> is_categorical(pd.Series(cat))
    True
    >>> is_categorical(pd.CategoricalIndex([1, 2, 3]))
    True
    """
    message = (
        "is_categorical is deprecated and will be removed in a future version. "
        "Use is_categorical_dtype instead."
    )
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())

    if isinstance(arr, ABCCategorical):
        return True
    return is_categorical_dtype(arr)
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_series_type=True,
    check_less_precise=no_default,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="Series",
    *,
    check_index=True,
):
    """
    Check that left and right Series are equal.

    Parameters
    ----------
    left : Series
    right : Series
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
         Whether to check the Series class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude less
        than 1e-5, we compare the two numbers directly and check whether
        they are equivalent within the specified precision. Otherwise, we
        compare the **ratio** of the second number to the first number and
        check whether it is equivalent to 1 within the specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare number exactly. Applies to numeric values,
        including those backed by ExtensionArrays.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.

        .. versionadded:: 1.0.2
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or
        TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.

        .. versionadded:: 1.2.0
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Series'
        Specify object name being compared, internally used to show
        appropriate assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only
        values.

        .. versionadded:: 1.3.0

    Raises
    ------
    AssertionError
        If any of the enabled checks finds a difference.

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b = pd.Series([1, 2, 3, 4])
    >>> tm.assert_series_equal(a, b)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # The deprecated keyword overrides both tolerances.
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, Series)

    if check_series_type:
        assert_class_equal(left, right, obj=obj)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{len(left)}, {left.index}"
        msg2 = f"{len(right)}, {right.index}"
        raise_assert_detail(obj, "Series length are different", msg1, msg2)

    if check_flags:
        assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"

    if check_index:
        # GH #38183
        assert_index_equal(
            left.index,
            right.index,
            exact=check_index_type,
            check_names=check_names,
            check_exact=check_exact,
            check_categorical=check_categorical,
            rtol=rtol,
            atol=atol,
            obj=f"{obj}.index",
        )

    if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
        lidx = left.index
        ridx = right.index
        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)

    if check_dtype:
        # We want to skip exact dtype checking when `check_categorical`
        # is False. We'll still raise if only one is a `Categorical`,
        # regardless of `check_categorical`
        if (
            isinstance(left.dtype, CategoricalDtype)
            and isinstance(right.dtype, CategoricalDtype)
            and not check_categorical
        ):
            pass
        else:
            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")

    if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype):
        left_values = left._values
        right_values = right._values
        # Only check exact if dtype is numeric
        if isinstance(left_values, ExtensionArray) and isinstance(
            right_values, ExtensionArray
        ):
            # Forward check_exact explicitly; otherwise the callee falls
            # back to its own default and compares with tolerances.
            assert_extension_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                index_values=np.asarray(left.index),
                check_exact=True,
            )
        else:
            assert_numpy_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                obj=str(obj),
                index_values=np.asarray(left.index),
            )
    elif check_datetimelike_compat and (
        needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
    ):
        # we want to check only if we have compat dtypes
        # e.g. integer and M|m are NOT compat, but we can simply check
        # the values in that case

        # datetimelike may have different objects (e.g. datetime.datetime
        # vs Timestamp) but will compare equal
        if not Index(left._values).equals(Index(right._values)):
            msg = (
                f"[datetimelike_compat=True] {left._values} "
                f"is not equal to {right._values}."
            )
            raise AssertionError(msg)
    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
        assert_interval_array_equal(left.array, right.array)
    elif isinstance(left.dtype, CategoricalDtype) or isinstance(
        right.dtype, CategoricalDtype
    ):
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(
        right.dtype
    ):
        assert_extension_array_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype_and_needs_i8_conversion(
        left.dtype, right.dtype
    ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
        # DatetimeArray or TimedeltaArray
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    else:
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("name", left, right, obj=obj)

    if check_categorical:
        if isinstance(left.dtype, CategoricalDtype) or isinstance(
            right.dtype, CategoricalDtype
        ):
            assert_categorical_equal(
                left._values,
                right._values,
                obj=f"{obj} category",
                check_category_order=check_category_order,
            )
def create_subplots(
    naxes: int,
    sharex: bool = False,
    sharey: bool = False,
    squeeze: bool = True,
    subplot_kw=None,
    ax=None,
    layout=None,
    layout_type: str = "box",
    **fig_kw,
):
    """
    Create a figure with a set of subplots already made.

    This utility wrapper makes it convenient to create common layouts of
    subplots, including the enclosing figure object, in a single call.

    Parameters
    ----------
    naxes : int
        Number of required axes. Exceeded axes are set invisible. Default is
        nrows * ncols.
    sharex : bool
        If True, the X axis will be shared amongst all subplots.
    sharey : bool
        If True, the Y axis will be shared amongst all subplots.
    squeeze : bool
        If True, extra dimensions are squeezed out from the returned axis
        object:

        - if only one subplot is constructed (nrows=ncols=1), the resulting
          single Axis object is returned as a scalar.
        - for Nx1 or 1xN subplots, a 1-d numpy object array of Axis objects
          is returned.
        - for NxM subplots with N>1 and M>1, a 2d array is returned.

        If False, no squeezing is done: the returned axis object is always
        a 2-d array containing Axis instances, even if it ends up being 1x1.
    subplot_kw : dict
        Dict with keywords passed to the add_subplot() call used to create
        each subplots. The dict is copied, so the caller's object is never
        modified.
    ax : Matplotlib axis object, optional
    layout : tuple
        Number of rows and columns of the subplot grid.
        If not specified, calculated from naxes and layout_type
    layout_type : {'box', 'horizontal', 'vertical'}, default 'box'
        Specify how to layout the subplot grid.
    fig_kw : Other keyword arguments to be passed to the figure() call.
        Note that all keywords not recognized above will be
        automatically included here.

    Returns
    -------
    fig, ax : tuple
        - fig is the Matplotlib Figure object
        - ax can be either a single axis object or an array of axis objects
          if more than one subplot was created. The dimensions of the
          resulting array can be controlled with the squeeze keyword, see
          above.

    Raises
    ------
    ValueError
        If several axes are passed and their number differs from ``naxes``.

    Examples
    --------
    x = np.linspace(0, 2*np.pi, 400)
    y = np.sin(x**2)

    # Just a figure and one subplot
    f, ax = plt.subplots()
    ax.plot(x, y)
    ax.set_title('Simple plot')

    # Two subplots, unpack the output array immediately
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
    ax1.plot(x, y)
    ax1.set_title('Sharing Y axis')
    ax2.scatter(x, y)

    # Four polar axes
    plt.subplots(2, 2, subplot_kw=dict(polar=True))
    """
    import matplotlib.pyplot as plt

    # Work on a private copy: sharex/sharey entries are added further down
    # and must not leak into the dict owned by the caller.
    subplot_kw = {} if subplot_kw is None else dict(subplot_kw)

    if ax is None:
        fig = plt.figure(**fig_kw)
    else:
        if is_list_like(ax):
            if squeeze:
                ax = flatten_axes(ax)
            if layout is not None:
                warnings.warn(
                    "When passing multiple axes, layout keyword is ignored.",
                    UserWarning,
                    stacklevel=find_stack_level(),
                )
            if sharex or sharey:
                warnings.warn(
                    "When passing multiple axes, sharex and sharey "
                    "are ignored. These settings must be specified when creating axes.",
                    UserWarning,
                    stacklevel=find_stack_level(),
                )
            if ax.size == naxes:
                fig = ax.flat[0].get_figure()
                return fig, ax
            else:
                raise ValueError(
                    f"The number of passed axes must be {naxes}, the "
                    "same as the output plot"
                )

        fig = ax.get_figure()
        # if ax is passed and a number of subplots is 1, return ax as it is
        if naxes == 1:
            if squeeze:
                return fig, ax
            else:
                return fig, flatten_axes(ax)
        else:
            warnings.warn(
                "To output multiple subplots, the figure containing "
                "the passed axes is being cleared.",
                UserWarning,
                stacklevel=find_stack_level(),
            )
            fig.clear()

    nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type)
    nplots = nrows * ncols

    # Create empty object array to hold all axes. It's easiest to make it 1-d
    # so we can just append subplots upon creation, and then reshape it to
    # (nrows, ncols) at the end.
    axarr = np.empty(nplots, dtype=object)

    # Create first subplot separately, so we can share it if requested
    ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)

    if sharex:
        subplot_kw["sharex"] = ax0
    if sharey:
        subplot_kw["sharey"] = ax0
    axarr[0] = ax0

    # Note off-by-one counting because add_subplot uses the MATLAB 1-based
    # convention.
    for i in range(1, nplots):
        kwds = subplot_kw.copy()
        # Set sharex and sharey to None for blank/dummy axes, these can
        # interfere with proper axis limits on the visible axes if
        # they share axes e.g. issue #7528
        if i >= naxes:
            kwds["sharex"] = None
            kwds["sharey"] = None
        ax = fig.add_subplot(nrows, ncols, i + 1, **kwds)
        axarr[i] = ax

    if naxes != nplots:
        for ax in axarr[naxes:]:
            ax.set_visible(False)

    handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)

    if squeeze:
        # Reshape the array to have the final desired dimension (nrow,ncol),
        # though discarding unneeded dimensions that equal 1. If we only have
        # one subplot, just return it instead of a 1-element array.
        if nplots == 1:
            axes = axarr[0]
        else:
            axes = axarr.reshape(nrows, ncols).squeeze()
    else:
        # returned axis array will be always 2-d, even if nrows=ncols=1
        axes = axarr.reshape(nrows, ncols)

    return fig, axes
def assert_almost_equal( left, right, check_dtype: bool | str = "equiv", check_less_precise: bool | int | NoDefault = no_default, rtol: float = 1.0e-5, atol: float = 1.0e-8, **kwargs, ): """ Check that the left and right objects are approximately equal. By approximately equal, we refer to objects that are numbers or that contain numbers which may be equivalent to specific levels of precision. Parameters ---------- left : object right : object check_dtype : bool or {'equiv'}, default 'equiv' Check dtype if both a and b are the same type. If 'equiv' is passed in, then `RangeIndex` and `Int64Index` are also considered equivalent when doing type checking. check_less_precise : bool or int, default False Specify comparison precision. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the number of digits to compare. When comparing two numbers, if the first number has magnitude less than 1e-5, we compare the two numbers directly and check whether they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. rtol : float, default 1e-5 Relative tolerance. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. .. versionadded:: 1.1.0 """ if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. 
" "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=find_stack_level(), ) # https://github.com/python/mypy/issues/7642 # error: Argument 1 to "_get_tol_from_less_precise" has incompatible # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]" rtol = atol = _get_tol_from_less_precise( check_less_precise # type: ignore[arg-type] ) if isinstance(left, Index): assert_index_equal( left, right, check_exact=False, exact=check_dtype, rtol=rtol, atol=atol, **kwargs, ) elif isinstance(left, Series): assert_series_equal( left, right, check_exact=False, check_dtype=check_dtype, rtol=rtol, atol=atol, **kwargs, ) elif isinstance(left, DataFrame): assert_frame_equal( left, right, check_exact=False, check_dtype=check_dtype, rtol=rtol, atol=atol, **kwargs, ) else: # Other sequences. if check_dtype: if is_number(left) and is_number(right): # Do not compare numeric classes, like np.float64 and float. pass elif is_bool(left) and is_bool(right): # Do not compare bool classes, like np.bool_ and bool. pass else: if isinstance(left, np.ndarray) or isinstance( right, np.ndarray): obj = "numpy array" else: obj = "Input" assert_class_equal(left, right, obj=obj) # if we have "equiv", this becomes True check_dtype = bool(check_dtype) _testing.assert_almost_equal(left, right, check_dtype=check_dtype, rtol=rtol, atol=atol, **kwargs)
def assert_extension_array_equal(
    left,
    right,
    check_dtype=True,
    index_values=None,
    check_less_precise=no_default,
    check_exact=False,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
):
    """
    Check that left and right ExtensionArrays are equal.

    Parameters
    ----------
    left, right : ExtensionArray
        The two arrays to compare.
    check_dtype : bool, default True
        Whether to check if the ExtensionArray dtypes are identical.
    index_values : numpy.ndarray, default None
        Optional index (shared by both left and right), used in output.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_exact : bool, default False
        Whether to compare number exactly.
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0

    Notes
    -----
    Missing values are checked separately from valid values.
    A mask of missing values is computed for each and checked to match.
    The remaining all-valid values are cast to object dtype and checked.
    Two datetime-like arrays of the same class are instead compared
    through their ``asi8`` values.

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b, c = a.array, a.array
    >>> tm.assert_extension_array_equal(b, c)
    """
    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    assert isinstance(left, ExtensionArray), "left is not an ExtensionArray"
    assert isinstance(right, ExtensionArray), "right is not an ExtensionArray"
    if check_dtype:
        assert_attr_equal("dtype", left, right, obj="ExtensionArray")

    both_datetimelike = isinstance(left, DatetimeLikeArrayMixin) and isinstance(
        right, DatetimeLikeArrayMixin
    )
    if both_datetimelike and type(right) == type(left):
        # Avoid slow object-dtype comparisons
        # np.asarray for case where we have a np.MaskedArray
        left_i8 = np.asarray(left.asi8)
        right_i8 = np.asarray(right.asi8)
        assert_numpy_array_equal(left_i8, right_i8, index_values=index_values)
        return

    # Missing-value positions have to agree before values are compared.
    left_na = np.asarray(left.isna())
    right_na = np.asarray(right.isna())
    assert_numpy_array_equal(
        left_na, right_na, obj="ExtensionArray NA mask", index_values=index_values
    )

    left_valid = np.asarray(left[~left_na].astype(object))
    right_valid = np.asarray(right[~right_na].astype(object))

    if not check_exact:
        _testing.assert_almost_equal(
            left_valid,
            right_valid,
            check_dtype=check_dtype,
            rtol=rtol,
            atol=atol,
            obj="ExtensionArray",
            index_values=index_values,
        )
        return

    assert_numpy_array_equal(
        left_valid, right_valid, obj="ExtensionArray", index_values=index_values
    )
def assert_frame_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_column_type="equiv",
    check_frame_type=True,
    check_less_precise=no_default,
    check_names=True,
    by_blocks=False,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_like=False,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="DataFrame",
):
    """
    Check that left and right DataFrame are equal.

    This function is intended to compare two DataFrames and output any
    differences. It is mostly intended for use in unit tests.
    Additional parameters allow varying the strictness of the
    equality checks performed.

    Parameters
    ----------
    left : DataFrame
        First DataFrame to compare.
    right : DataFrame
        Second DataFrame to compare.
    check_dtype : bool, default True
        Whether to check the DataFrame dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_column_type : bool or {'equiv'}, default 'equiv'
        Whether to check the columns class, dtype and inferred_type
        are identical. Is passed as the ``exact`` argument of
        :func:`assert_index_equal`.
    check_frame_type : bool, default True
        Whether to check the DataFrame class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude less
        than 1e-5, we compare the two numbers directly and check whether
        they are equivalent within the specified precision. Otherwise, we
        compare the **ratio** of the second number to the first number and
        check whether it is equivalent to 1 within the specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check that the `names` attribute for both the `index`
        and `column` attributes of the DataFrame is identical.
    by_blocks : bool, default False
        Specify how to compare internal data. If False, compare by columns.
        If True, compare by blocks.
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_like : bool, default False
        If True, ignore the order of index & columns.
        Note: index labels must match their respective rows
        (same as in columns) - same labels must be with the same data.
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'DataFrame'
        Specify object name being compared, internally used to show appropriate
        assertion message.

    See Also
    --------
    assert_series_equal : Equivalent method for asserting Series equality.
    DataFrame.equals : Check DataFrame equality.

    Examples
    --------
    This example shows comparing two DataFrames that are equal but with columns
    of differing dtypes.

    >>> from pandas.testing import assert_frame_equal
    >>> df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    >>> df2 = pd.DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})

    df1 equals itself.

    >>> assert_frame_equal(df1, df1)

    df1 differs from df2 as column 'b' is of a different type.

    >>> assert_frame_equal(df1, df2)
    Traceback (most recent call last):
    ...
    AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different

    Attribute "dtype" are different
    [left]:  int64
    [right]: float64

    Ignore differing dtypes in columns with check_dtype.

    >>> assert_frame_equal(df1, df2, check_dtype=False)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # The deprecated keyword overrides both tolerances.
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, DataFrame)

    if check_frame_type:
        assert isinstance(left, type(right))

    # shape comparison
    if left.shape != right.shape:
        raise_assert_detail(
            obj, f"{obj} shape mismatch", repr(left.shape), repr(right.shape)
        )

    if check_flags:
        assert left.flags == right.flags, f"{left.flags!r} != {right.flags!r}"

    # Options shared by the row-index and column-index comparisons.
    axis_kwargs = {
        "check_names": check_names,
        "check_exact": check_exact,
        "check_categorical": check_categorical,
        "check_order": not check_like,
        "rtol": rtol,
        "atol": atol,
    }

    # index comparison
    assert_index_equal(
        left.index,
        right.index,
        exact=check_index_type,
        obj=f"{obj}.index",
        **axis_kwargs,
    )

    # column comparison
    assert_index_equal(
        left.columns,
        right.columns,
        exact=check_column_type,
        obj=f"{obj}.columns",
        **axis_kwargs,
    )

    if check_like:
        # Align left to right's ordering before comparing the data.
        left = left.reindex_like(right)

    if by_blocks:
        # compare by blocks
        rblocks = right._to_dict_of_blocks()
        lblocks = left._to_dict_of_blocks()
        for dtype in set(lblocks).union(rblocks):
            assert dtype in lblocks
            assert dtype in rblocks
            assert_frame_equal(
                lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj
            )
        return

    # compare by columns
    for pos, label in enumerate(left.columns):
        # We have already checked that columns match, so we can do
        #  fast location-based lookups
        left_col = left._ixs(pos, axis=1)
        right_col = right._ixs(pos, axis=1)

        # GH #38183
        # use check_index=False, because we do not want to run
        # assert_index_equal for each column,
        # as we already checked it for the whole dataframe before.
        assert_series_equal(
            left_col,
            right_col,
            check_dtype=check_dtype,
            check_index_type=check_index_type,
            check_exact=check_exact,
            check_names=check_names,
            check_datetimelike_compat=check_datetimelike_compat,
            check_categorical=check_categorical,
            check_freq=check_freq,
            obj=f'{obj}.iloc[:, {pos}] (column name="{label}")',
            rtol=rtol,
            atol=atol,
            check_index=False,
            check_flags=False,
        )
def assert_index_equal(
    left: Index,
    right: Index,
    exact: bool | str = "equiv",
    check_names: bool = True,
    check_less_precise: bool | int | NoDefault = no_default,
    check_exact: bool = True,
    check_categorical: bool = True,
    check_order: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "Index",
) -> None:
    """
    Check that left and right Index are equal.

    Parameters
    ----------
    left : Index
    right : Index
    exact : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical. If 'equiv', then RangeIndex can be substituted for
        Int64Index as well.
    check_names : bool, default True
        Whether to check the names attribute.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_exact : bool, default True
        Whether to compare number exactly.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_order : bool, default True
        Whether to compare the order of index entries as well as their values.
        If True, both indexes must contain the same elements, in the same order.
        If False, both indexes must contain the same elements, but in any order.

        .. versionadded:: 1.2.0
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Index'
        Specify object name being compared, internally used to show appropriate
        assertion message.

    Examples
    --------
    >>> from pandas import testing as tm
    >>> a = pd.Index([1, 2, 3])
    >>> b = pd.Index([1, 2, 3])
    >>> tm.assert_index_equal(a, b)
    """
    __tracebackhide__ = True

    def _check_types(left, right, obj="Index") -> None:
        # Class / inferred_type / dtype checks; a no-op when `exact` is falsy.
        if not exact:
            return

        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)

        both_categorical = is_categorical_dtype(left.dtype) and is_categorical_dtype(
            right.dtype
        )
        # Skip exact dtype checking when `check_categorical` is False
        if both_categorical and not check_categorical:
            return

        assert_attr_equal("dtype", left, right, obj=obj)
        if both_categorical:
            assert_index_equal(left.categories, right.categories, exact=exact)

    def _get_ilevel_values(index, level):
        # accept level number only
        level_index = index.levels[level]
        codes = index.codes[level]
        values = take_nd(
            level_index._values, codes, fill_value=level_index._na_value
        )
        return level_index._shallow_copy(values, name=index.names[level])

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # https://github.com/python/mypy/issues/7642
        # error: Argument 1 to "_get_tol_from_less_precise" has incompatible
        # type "Union[bool, int, NoDefault]"; expected "Union[bool, int]"
        rtol = atol = _get_tol_from_less_precise(
            check_less_precise  # type: ignore[arg-type]
        )

    # instance validation
    _check_isinstance(left, right, Index)

    # class / dtype comparison
    _check_types(left, right, obj=obj)

    # level comparison
    if left.nlevels != right.nlevels:
        levels_msg = f"{obj} levels are different"
        left_detail = f"{left.nlevels}, {left}"
        right_detail = f"{right.nlevels}, {right}"
        raise_assert_detail(obj, levels_msg, left_detail, right_detail)

    # length comparison
    if len(left) != len(right):
        length_msg = f"{obj} length are different"
        left_detail = f"{len(left)}, {left}"
        right_detail = f"{len(right)}, {right}"
        raise_assert_detail(obj, length_msg, left_detail, right_detail)

    # If order doesn't matter then sort the index entries
    if not check_order:
        left = Index(safe_sort(left))
        right = Index(safe_sort(right))

    # MultiIndex special comparison for little-friendly error messages
    if left.nlevels > 1:
        left = cast(MultiIndex, left)
        right = cast(MultiIndex, right)

        for level in range(left.nlevels):
            # cannot use get_level_values here because it can change dtype
            left_level = _get_ilevel_values(left, level)
            right_level = _get_ilevel_values(right, level)

            assert_index_equal(
                left_level,
                right_level,
                exact=exact,
                check_names=check_names,
                check_exact=check_exact,
                rtol=rtol,
                atol=atol,
                obj=f"MultiIndex level [{level}]",
            )
            # get_level_values may change dtype
            _check_types(left.levels[level], right.levels[level], obj=obj)

    # skip exact index checking when `check_categorical` is False
    if check_exact and check_categorical:
        if not left.equals(right):
            mismatch = left._values != right._values
            diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
            raise_assert_detail(obj, msg, left, right)
    else:
        # if we have "equiv", this becomes True
        check_dtype = bool(exact)
        _testing.assert_almost_equal(
            left.values,
            right.values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=obj,
            lobj=left,
            robj=right,
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("names", left, right, obj=obj)
    if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex):
        assert_attr_equal("freq", left, right, obj=obj)
    if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex):
        assert_interval_array_equal(left._values, right._values)

    if check_categorical and (
        is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype)
    ):
        assert_categorical_equal(left._values, right._values, obj=f"{obj} category")
def concat_compat(to_concat, axis: int = 0, ea_compat_axis: bool = False):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    ea_compat_axis : bool, default False
        For ExtensionArray compat, behave as if axis == 1 when determining
        whether to drop empty arrays.

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x) -> bool:
        return x.ndim <= axis or x.shape[axis] > 0

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.
    non_empties = [x for x in to_concat if is_nonempty(x)]
    if non_empties and axis == 0 and not ea_compat_axis:
        # ea_compat_axis see GH#39574
        to_concat = non_empties

    kinds = {arr.dtype.kind for arr in to_concat}
    contains_datetime = bool(kinds & {"m", "M"}) or any(
        isinstance(arr, ABCExtensionArray) and arr.ndim > 1 for arr in to_concat
    )

    all_empty = not non_empties
    single_dtype = len({arr.dtype for arr in to_concat}) == 1
    any_ea = any(isinstance(arr.dtype, ExtensionDtype) for arr in to_concat)

    if contains_datetime:
        return _concat_datetime(to_concat, axis=axis)

    if any_ea:
        # we ignore axis here, as internally concatting with EAs is always
        # for axis=0
        if not single_dtype:
            target_dtype = find_common_type([arr.dtype for arr in to_concat])
            target_dtype = common_dtype_categorical_compat(to_concat, target_dtype)
            to_concat = [cast_to_common_type(arr, target_dtype) for arr in to_concat]

        first = to_concat[0]
        if isinstance(first, ABCExtensionArray):
            # TODO: what about EA-backed Index?
            return type(first)._concat_same_type(to_concat)
        return np.concatenate(to_concat)

    if all_empty and len(kinds) != 1:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        numpy_can_coerce = kinds <= {"i", "u", "f"} or kinds <= {"b", "i", "u"}
        if not numpy_can_coerce:
            # coerce to object
            to_concat = [arr.astype("object") for arr in to_concat]
            kinds = {"o"}

    result = np.concatenate(to_concat, axis=axis)
    if "b" in kinds and result.dtype.kind in ["i", "u", "f"]:
        # GH#39817
        warnings.warn(
            "Behavior when concatenating bool-dtype and numeric-dtype arrays is "
            "deprecated; in a future version these will cast to object dtype "
            "(instead of coercing bools to numeric values). To retain the old "
            "behavior, explicitly cast bool-dtype arrays to numeric dtype.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    return result
def reindex(
    self, target, method=None, level=None, limit=None, tolerance=None
) -> tuple[Index, npt.NDArray[np.intp] | None]:
    """
    Create index with target's values (move/add/delete values as necessary)

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray[np.intp] or None
        Indices of output values in original index

    Raises
    ------
    NotImplementedError
        If ``method``, ``level`` or ``limit`` is not None.
    """
    # These options are rejected up front, checked in this order.
    unsupported = (("method", method), ("level", level), ("limit", limit))
    for arg_name, arg_value in unsupported:
        if arg_value is not None:
            raise NotImplementedError(
                f"argument {arg_name} is not implemented for CategoricalIndex.reindex"
            )

    target = ibase.ensure_index(target)

    if self.equals(target):
        indexer = None
        missing = np.array([], dtype=np.intp)
    else:
        indexer, missing = self.get_indexer_non_unique(target)
        if not self.is_unique:
            # GH#42568
            warnings.warn(
                "reindexing with a non-unique Index is deprecated and will "
                "raise in a future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    new_target = self.take(indexer) if len(self) and indexer is not None else target

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        can_fill_codes = isinstance(target, CategoricalIndex) and not (
            cats == -1
        ).any()
        if can_fill_codes:
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            filled = self._data._from_backing_data(codes)
            new_target = type(self)._simple_new(filled, name=self.name)
        else:
            new_target, indexer, _ = super()._reindex_non_unique(target)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    if is_categorical_dtype(target):
        conformed = Categorical(new_target, dtype=target.dtype)
        new_target = type(self)._simple_new(conformed, name=self.name)
    else:
        # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
        values = np.asarray(new_target)
        new_target = Index._with_infer(values, name=self.name)

    return new_target, indexer
def to_arrays(
    data, columns: Index | None, dtype: DtypeObj | None = None
) -> tuple[list[ArrayLike], Index]:
    """
    Return list of arrays, columns.

    Parameters
    ----------
    data : DataFrame, np.ndarray or sequence
        A DataFrame, a (possibly structured) ndarray, or a sequence whose
        first element decides the conversion path (Categorical, list/tuple,
        Mapping, Series, or anything else convertible with ``tuple``).
    columns : Index or None
        Requested column labels; when None they are derived from ``data``
        where the input carries them.
    dtype : dtype, optional
        Forwarded to ``_finalize_columns_and_data`` for the list-like paths.

    Returns
    -------
    list[ArrayLike]
        These will become columns in a DataFrame.
    Index
        This will become frame.columns.

    Raises
    ------
    ValueError
        If ``data`` is a list of Categoricals and ``columns`` is longer
        than ``data``.

    Notes
    -----
    Ensures that len(result_arrays) == len(result_index).
    """
    if isinstance(data, ABCDataFrame):
        # see test_from_records_with_index_data, test_from_records_bad_index_column
        if columns is not None:
            arrays = [
                data._ixs(i, axis=1).values
                for i, col in enumerate(data.columns)
                if col in columns
            ]
        else:
            columns = data.columns
            arrays = [data._ixs(i, axis=1).values for i in range(len(columns))]

        return arrays, columns

    if not len(data):
        if isinstance(data, np.ndarray) and data.dtype.names is not None:
            # i.e. numpy structured array
            columns = ensure_index(data.dtype.names)
            # GH#42456 indexing an empty structured array by field name can
            #  give 2D ndarrays; keep only the first column of those.
            #  (data is known to be empty here, so no extra length check.)
            # TODO: is that an issue with numpy?
            arrays = [
                arr[:, 0] if arr.ndim == 2 else arr
                for arr in (data[name] for name in columns)
            ]
            return arrays, columns
        return [], ensure_index([])

    elif isinstance(data[0], Categorical):
        # GH#38845 deprecate special case
        warnings.warn(
            "The behavior of DataFrame([categorical, ...]) is deprecated and "
            "in a future version will be changed to match the behavior of "
            "DataFrame([any_listlike, ...]). "
            "To retain the old behavior, pass as a dictionary "
            "DataFrame({col: categorical, ..})",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        if columns is None:
            columns = default_index(len(data))
        elif len(columns) > len(data):
            raise ValueError("len(columns) > len(data)")
        elif len(columns) < len(data):
            # doing this here is akin to a pre-emptive reindex
            data = data[: len(columns)]
        return data, columns

    elif isinstance(data, np.ndarray) and data.dtype.names is not None:
        # e.g. recarray
        columns = Index(list(data.dtype.names))
        arrays = [data[k] for k in columns]
        return arrays, columns

    # Dispatch on the type of the first element.
    if isinstance(data[0], (list, tuple)):
        arr = _list_to_arrays(data)
    elif isinstance(data[0], abc.Mapping):
        arr, columns = _list_of_dict_to_arrays(data, columns)
    elif isinstance(data[0], ABCSeries):
        arr, columns = _list_of_series_to_arrays(data, columns)
    else:
        # last ditch effort
        data = [tuple(x) for x in data]
        arr = _list_to_arrays(data)

    content, columns = _finalize_columns_and_data(arr, columns, dtype)
    return content, columns
def parse(
    self,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype: DtypeArg | None = None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=None,
    mangle_dupe_cols=True,
    **kwds,
):
    """
    Read one or more sheets and parse each with ``TextParser``.

    Parameters
    ----------
    sheet_name : str, int, list or None, default 0
        Sheet(s) to read. Strings are looked up with ``get_sheet_by_name``,
        anything else with ``get_sheet_by_index``. A list, or None (meaning
        every name in ``self.sheet_names``), makes the result a dict.
    header : int, list of int or None, default 0
        Header row(s). A one-element list is unwrapped to its element; a
        longer list is treated as a MultiIndex header and forward filled.
    index_col : int, list of int or None
        Index column(s). When list-like, blank (``""`` or None) cells in
        those columns are forward filled below the header rows.
    convert_float : bool, optional
        Deprecated. Passing any value other than None emits a
        FutureWarning; None is treated as True.
    verbose : bool, default False
        If True, print the name of each sheet as it is read.
    names, usecols, squeeze, dtype, true_values, false_values, skiprows, \
nrows, na_values, parse_dates, date_parser, thousands, comment, skipfooter, \
mangle_dupe_cols, **kwds
        Forwarded to ``TextParser`` (``usecols`` after
        ``maybe_convert_usecols``).

    Returns
    -------
    dict or parsed object
        A dict keyed by sheet when ``sheet_name`` is a list or None,
        otherwise the parsed result for the single requested sheet. Sheets
        with no data, or for which parsing raises ``EmptyDataError``, yield
        an empty DataFrame.
    """
    if convert_float is None:
        convert_float = True
    else:
        stacklevel = find_stack_level()
        warnings.warn(
            "convert_float is deprecated and will be removed in a future version.",
            FutureWarning,
            stacklevel=stacklevel,
        )

    validate_header_arg(header)

    ret_dict = False

    # Keep sheetname to maintain backwards compatibility.
    if isinstance(sheet_name, list):
        sheets = sheet_name
        ret_dict = True
    elif sheet_name is None:
        sheets = self.sheet_names
        ret_dict = True
    else:
        sheets = [sheet_name]

    # handle same-type duplicates.
    sheets = list(dict.fromkeys(sheets))

    output = {}

    for asheetname in sheets:
        if verbose:
            print(f"Reading sheet {asheetname}")

        if isinstance(asheetname, str):
            sheet = self.get_sheet_by_name(asheetname)
        else:  # assume an integer if not a string
            sheet = self.get_sheet_by_index(asheetname)

        try:
            data = self.get_sheet_data(sheet, convert_float)
        finally:
            # Close the sheet even when reading fails, so its resources are
            # not leaked on the error path.
            if hasattr(sheet, "close"):
                # pyxlsb opens two TemporaryFiles
                sheet.close()
        usecols = maybe_convert_usecols(usecols)

        if not data:
            output[asheetname] = DataFrame()
            continue

        if is_list_like(header) and len(header) == 1:
            header = header[0]

        # forward fill and pull out names for MultiIndex column
        header_names = None
        if header is not None and is_list_like(header):
            header_names = []
            control_row = [True] * len(data[0])

            for row in header:
                if is_integer(skiprows):
                    row += skiprows

                data[row], control_row = fill_mi_header(data[row], control_row)

                if index_col is not None:
                    header_name, _ = pop_header_name(data[row], index_col)
                    header_names.append(header_name)

        # If there is a MultiIndex header and an index then there is also
        # a row containing just the index name(s)
        has_index_names = (
            is_list_like(header) and len(header) > 1 and index_col is not None
        )

        if is_list_like(index_col):
            # Forward fill values for MultiIndex index.
            if header is None:
                offset = 0
            elif not is_list_like(header):
                offset = 1 + header
            else:
                offset = 1 + max(header)

            # GH34673: if MultiIndex names present and not defined in the header,
            # offset needs to be incremented so that forward filling starts
            # from the first MI value instead of the name
            if has_index_names:
                offset += 1

            # Check if we have an empty dataset
            # before trying to collect data.
            if offset < len(data):
                for col in index_col:
                    last = data[offset][col]

                    for row in range(offset + 1, len(data)):
                        if data[row][col] == "" or data[row][col] is None:
                            data[row][col] = last
                        else:
                            last = data[row][col]

        # GH 12292 : error when read one empty column from excel file
        try:
            parser = TextParser(
                data,
                names=names,
                header=header,
                index_col=index_col,
                has_index_names=has_index_names,
                squeeze=squeeze,
                dtype=dtype,
                true_values=true_values,
                false_values=false_values,
                skiprows=skiprows,
                nrows=nrows,
                na_values=na_values,
                skip_blank_lines=False,  # GH 39808
                parse_dates=parse_dates,
                date_parser=date_parser,
                thousands=thousands,
                comment=comment,
                skipfooter=skipfooter,
                usecols=usecols,
                mangle_dupe_cols=mangle_dupe_cols,
                **kwds,
            )

            output[asheetname] = parser.read(nrows=nrows)

            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname
                    ].columns.set_names(header_names)

        except EmptyDataError:
            # No Data, return an empty DataFrame
            output[asheetname] = DataFrame()

    if ret_dict:
        return output
    return output[asheetname]
def __init__(self, path_or_buffer, engine=None, storage_options: StorageOptions = None):
    """
    Select a reader engine for ``path_or_buffer`` and instantiate it.

    Parameters
    ----------
    path_or_buffer : object
        Source to read; stored unchanged on ``self.io`` and, after
        ``stringify_path``, on ``self._io``.
    engine : str, optional
        Key into ``self._engines``. When None, the file format is detected
        and the engine is taken from the ``io.excel.<ext>.reader`` option
        (falling back to ``get_default_engine`` when that option is "auto").
    storage_options : StorageOptions, optional
        Passed to format inspection and to the reader.

    Raises
    ------
    ValueError
        If ``engine`` is not a known engine, if the format cannot be
        determined, or if xlrd >= 2 is selected for a non-xls file.
    """
    if engine is not None and engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    # Could be a str, ExcelFile, Book, etc.
    self.io = path_or_buffer
    # Same object passed through stringify_path
    self._io = stringify_path(path_or_buffer)

    # Determine xlrd version if installed
    xlrd_version = None
    if import_optional_dependency("xlrd", errors="ignore") is not None:
        import xlrd

        xlrd_version = Version(get_version(xlrd))

    ext = None
    if engine is None:
        # Only determine ext if it is needed
        if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
            ext = "xls"
        else:
            ext = inspect_excel_format(
                content_or_path=path_or_buffer, storage_options=storage_options
            )
            if ext is None:
                raise ValueError(
                    "Excel file format cannot be determined, you must specify "
                    "an engine manually."
                )

        engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
        if engine == "auto":
            engine = get_default_engine(ext, mode="reader")

    if engine == "xlrd" and xlrd_version is not None:
        if ext is None:
            # ext is needed to decide whether to raise/warn below
            if isinstance(path_or_buffer, xlrd.Book):
                ext = "xls"
            else:
                ext = inspect_excel_format(
                    path_or_buffer, storage_options=storage_options
                )

        # Pass through if ext is None, otherwise check if ext valid for xlrd
        if ext and ext != "xls":
            xlrd_msg = (
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. Install openpyxl instead."
            )
            if xlrd_version >= Version("2"):
                raise ValueError(xlrd_msg)
            stacklevel = find_stack_level()
            warnings.warn(
                xlrd_msg,
                FutureWarning,
                stacklevel=stacklevel,
            )

    self.engine = engine
    self.storage_options = storage_options
    self._reader = self._engines[engine](self._io, storage_options=storage_options)