def arithmetic_op(left: ArrayLike, right: Any, op):
    """
    Apply an arithmetic operator (`+`, `-`, `*`, `/`, `//`, `%`, `**`, ...).

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : object
        Cannot be a DataFrame or Index.  Series is *not* excluded.
    op : {operator.add, operator.sub, ...}
        Or one of the reversed variants from roperator.

    Returns
    -------
    ndarray or ExtensionArray
        Or a 2-tuple of these in the case of divmod or rdivmod.
    """
    # NB: extract_array is assumed to have been called on both operands
    # before we get here.
    lhs = ensure_wrapped_if_datetimelike(left)
    rhs = ensure_wrapped_if_datetimelike(right)
    rhs = _maybe_upcast_for_op(rhs, lhs.shape)

    # Timedelta scalars take the direct path as well, because numexpr will
    # fail on them (see GH#31457).
    use_operator_directly = should_extension_dispatch(lhs, rhs) or isinstance(
        rhs, Timedelta
    )
    if use_operator_directly:
        return op(lhs, rhs)

    # Plain numpy path: silence numpy floating-point warnings while computing.
    with np.errstate(all="ignore"):
        return _na_arithmetic_op(lhs, rhs, op)
def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
    """
    Apply a comparison operator (`==`, `!=`, `>=`, `>`, `<=`, or `<`).

    Note: the caller is responsible for ensuring that numpy warnings are
    suppressed (with np.errstate(all="ignore")) if needed.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : object
        Cannot be a DataFrame, Series, or Index.
    op : {operator.eq, operator.ne, operator.gt, operator.ge, operator.lt,
        operator.le}

    Returns
    -------
    ndarray or ExtensionArray

    Raises
    ------
    ValueError
        If `right` is an ndarray/ExtensionArray (or list) whose length
        differs from that of `left`.
    """
    # NB: extract_array is assumed to have been called on left and right
    lhs = ensure_wrapped_if_datetimelike(left)
    rhs = lib.item_from_zerodim(ensure_wrapped_if_datetimelike(right))

    if isinstance(rhs, list):
        # TODO: same for tuples?
        rhs = np.asarray(rhs)

    # TODO: make this treatment consistent across ops and classes.
    #  We are not catching all listlikes here (e.g. frozenset, tuple)
    #  The ambiguous case is object-dtype.  See GH#27803
    if isinstance(rhs, (np.ndarray, ABCExtensionArray)) and len(lhs) != len(rhs):
        raise ValueError("Lengths must match to compare", lhs.shape, rhs.shape)

    # Paths on which we simply call the operator on lhs.
    if should_extension_dispatch(lhs, rhs):
        return op(lhs, rhs)
    rhs_is_datetimelike_scalar = (
        isinstance(rhs, (Timedelta, BaseOffset, Timestamp)) or right is NaT
    )
    if rhs_is_datetimelike_scalar and not is_object_dtype(lhs.dtype):
        return op(lhs, rhs)

    if is_scalar(rhs) and isna(rhs):
        # numpy does not like comparisons vs None: everything is "not equal"
        # to a missing scalar, and every other comparison is False.
        make_constant = np.ones if op is operator.ne else np.zeros
        return make_constant(lhs.shape, dtype=bool)

    if is_numeric_v_string_like(lhs, rhs):
        # GH#36377 going through the numexpr path would incorrectly raise
        return invalid_comparison(lhs, rhs, op)

    if is_object_dtype(lhs.dtype) or isinstance(rhs, str):
        return comp_method_OBJECT_ARRAY(op, lhs, rhs)

    return _na_arithmetic_op(lhs, rhs, op, is_cmp=True)
def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
    """
    Insert a new column at position `loc`.

    Parameters
    ----------
    loc : int
    item : hashable
        Label inserted into the column axis at `loc`.
    value : np.ndarray or ExtensionArray
        A 1D array, or a 2D array with a single row.

    Raises
    ------
    ValueError
        If `value` is 2D with more than one row.
    """
    # Compute the new column axis first; this could possibly raise a TypeError
    updated_items = self.items.insert(loc, item)

    column = extract_array(value, extract_numpy=True)
    if column.ndim == 2:
        if column.shape[0] != 1:
            raise ValueError(
                f"Expected a 1D array, got an array with shape {column.shape}"
            )
        # error: Invalid index type "Tuple[int, slice]" for
        # "Union[Any, ExtensionArray, ndarray]"; expected type
        # "Union[int, slice, ndarray]"
        column = column[0, :]  # type: ignore[index]
    column = ensure_wrapped_if_datetimelike(column)

    # TODO self.arrays can be empty
    # assert len(column) == len(self.arrays[0])

    # TODO is this copy needed?
    updated_arrays = self.arrays.copy()
    updated_arrays.insert(loc, column)

    self.arrays = updated_arrays
    self._axes[1] = updated_items
def _concat_datetime(to_concat, axis=0):
    """
    Concatenate datetimelike arrays, each of which has a single
    M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.core.construction import ensure_wrapped_if_datetimelike

    wrapped = [ensure_wrapped_if_datetimelike(arr) for arr in to_concat]
    distinct_dtypes = {arr.dtype for arr in wrapped}

    if len(distinct_dtypes) != 1:
        # Mixed dtypes: fall back to object.  Because the inputs were wrapped
        # above, astype(object) boxes the values as Timestamp/Timedelta.
        as_object = [arr.astype(object) for arr in wrapped]
        return _concatenate_2d(as_object, axis=axis)

    array_type = type(wrapped[0])
    return array_type._concat_same_type(wrapped, axis=axis)
def to_array(self, dtype: DtypeObj) -> ArrayLike:
    """
    Materialize this proxy as an actual all-NA array of length ``self.n``.

    Parameters
    ----------
    dtype : the dtype for the resulting array

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if isinstance(dtype, ExtensionDtype):
        # Take with an all -1 indexer from an empty array so that every
        # position is filled with the dtype's missing value.
        array_type = dtype.construct_array_type()
        nothing = array_type._from_sequence([], dtype=dtype)
        all_missing = np.full(self.n, -1, dtype=np.intp)
        return nothing.take(all_missing, allow_fill=True)

    # when introducing missing values, int becomes float, bool becomes object
    na_capable_dtype = ensure_dtype_can_hold_na(dtype)
    missing = na_value_for_dtype(na_capable_dtype)
    result = np.empty(self.n, dtype=na_capable_dtype)
    result.fill(missing)
    return ensure_wrapped_if_datetimelike(result)
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Cast `arr` to `dtype`, like `arr.astype(common_dtype)` but with the
    special cases (categorical, sparse, datetimelike, extension dtypes)
    handled explicitly.
    """
    if is_categorical_dtype(arr.dtype):
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.integer):
            # problem case: categorical of int -> gives int as result dtype,
            # but categorical can contain NAs -> fall back to object dtype
            try:
                return arr.astype(dtype, copy=False)
            except ValueError:
                return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        dense = cast(SparseArray, arr).to_dense()
        return dense.astype(dtype, copy=False)

    is_raw_datetimelike = isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]
    if is_raw_datetimelike and dtype is np.dtype("object"):
        # wrap datetime-likes in EA to ensure astype(object) gives
        # Timestamp/Timedelta.  This can happen when concat_compat is called
        # directly on arrays (when arrays are not coming from
        # Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    if is_extension_array_dtype(dtype) and isinstance(arr, np.ndarray):
        # numpy's astype cannot handle ExtensionDtypes
        return pd_array(arr, dtype=dtype, copy=False)

    return arr.astype(dtype, copy=False)
def _concat_datetime(to_concat, axis=0):
    """
    Concatenate datetimelike arrays, each of which has a single
    M8[ns], datetime64[ns, tz] or m8[ns] dtype.

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    wrapped = [ensure_wrapped_if_datetimelike(arr) for arr in to_concat]

    if len({arr.dtype for arr in wrapped}) != 1:
        # Mixed dtypes: coerce to object.  Because the inputs were wrapped
        # above, astype(object) boxes the values as Timestamp/Timedelta.
        return _concatenate_2d([arr.astype(object) for arr in wrapped], axis=axis)

    if axis == 1:
        # TODO(EA2D): kludge not necessary with 2D EAs
        wrapped = [arr if arr.ndim != 1 else arr.reshape(1, -1) for arr in wrapped]

    combined = type(wrapped[0])._concat_same_type(wrapped, axis=axis)

    if combined.ndim == 2 and is_extension_array_dtype(combined.dtype):
        # TODO(EA2D): kludge not necessary with 2D EAs
        assert combined.shape[0] == 1
        combined = combined[0]
    return combined
def wrapper(
    arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan
):
    """
    Convert `arr` to the enclosing scope's `conv_dtype`, then delegate to the
    enclosing scope's `f` with the same indexer, output buffer and fill value.
    """
    # GH#39755 avoid casting dt64/td64 to integers when converting to object
    source = ensure_wrapped_if_datetimelike(arr) if conv_dtype == object else arr
    converted = source.astype(conv_dtype)
    f(converted, indexer, out, fill_value=fill_value)
def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): """ Set new column(s). This changes the ArrayManager in-place, but replaces (an) existing column(s), not changing column values in-place). Parameters ---------- loc : integer, slice or boolean mask Positional location (already bounds checked) value : np.ndarray or ExtensionArray """ # single column -> single integer index if lib.is_integer(loc): # TODO can we avoid needing to unpack this here? That means converting # DataFrame into 1D array when loc is an integer if isinstance(value, np.ndarray) and value.ndim == 2: assert value.shape[1] == 1 value = value[:, 0] # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item # but we should avoid that and pass directly the proper array value = ensure_wrapped_if_datetimelike(value) assert isinstance(value, (np.ndarray, ExtensionArray)) assert value.ndim == 1 assert len(value) == len(self._axes[0]) # error: Invalid index type "Union[int, slice, ndarray]" for # "List[Union[ndarray, ExtensionArray]]"; expected type "int" self.arrays[loc] = value # type: ignore[index] return # multiple columns -> convert slice or array to integer indices elif isinstance(loc, slice): indices = range( loc.start if loc.start is not None else 0, loc.stop if loc.stop is not None else self.shape_proper[1], loc.step if loc.step is not None else 1, ) else: assert isinstance(loc, np.ndarray) assert loc.dtype == "bool" # error: Incompatible types in assignment (expression has type "ndarray", # variable has type "range") indices = np.nonzero(loc)[0] # type: ignore[assignment] assert value.ndim == 2 assert value.shape[0] == len(self._axes[0]) for value_idx, mgr_idx in enumerate(indices): # error: Invalid index type "Tuple[slice, int]" for # "Union[ExtensionArray, ndarray]"; expected type # "Union[int, slice, ndarray]" value_arr = value[:, value_idx] # type: ignore[index] self.arrays[mgr_idx] = value_arr return
def _arith_method(self, other, op):
    """
    Evaluate the arithmetic operator `op` between this object's values and
    `other`, and wrap the outcome via ``self._construct_result``.
    """
    result_name = ops.get_op_result_name(self, other)

    left_values = self._values
    right_values = ensure_wrapped_if_datetimelike(
        ops.maybe_prepare_scalar_for_op(
            extract_array(other, extract_numpy=True, extract_range=True),
            left_values.shape,
        )
    )

    # Silence numpy floating-point warnings raised while computing.
    with np.errstate(all="ignore"):
        outcome = ops.arithmetic_op(left_values, right_values, op)

    return self._construct_result(outcome, name=result_name)
def iset(self, loc: Union[int, slice, np.ndarray], value):
    """
    Set new column(s).

    This changes the ArrayManager in-place, but replaces (an) existing
    column(s), not changing column values in-place).

    Parameters
    ----------
    loc : integer, slice or boolean mask
        Positional location (already bounds checked)
    value : array-like
        For an integer `loc`, a 1D array or a 2D ndarray of shape
        ``(n_rows, 1)``; otherwise a 2D array with one column per selected
        position.
    """
    # single column -> single integer index
    if lib.is_integer(loc):
        # TODO the extract array should in theory not be needed?
        value = extract_array(value, extract_numpy=True)

        # TODO can we avoid needing to unpack this here? That means converting
        # DataFrame into 1D array when loc is an integer
        if isinstance(value, np.ndarray) and value.ndim == 2:
            assert value.shape[1] == 1
            # The 2D input is (n_rows, 1), so the column is value[:, 0].
            # Taking value[0, :] would return only the first row (length 1)
            # and trip the length assertion below for any n_rows != 1.
            value = value[:, 0]

        # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
        # but we should avoid that and pass directly the proper array
        value = ensure_wrapped_if_datetimelike(value)

        assert isinstance(value, (np.ndarray, ExtensionArray))
        assert value.ndim == 1
        assert len(value) == len(self._axes[0])
        self.arrays[loc] = value
        return

    # multiple columns -> convert slice or array to integer indices
    elif isinstance(loc, slice):
        indices = range(
            loc.start if loc.start is not None else 0,
            loc.stop if loc.stop is not None else self.shape_proper[1],
            loc.step if loc.step is not None else 1,
        )
    else:
        assert isinstance(loc, np.ndarray)
        assert loc.dtype == "bool"
        indices = np.nonzero(loc)[0]

    # Rows are along axis 0 of `value`; each selected position receives the
    # matching column of `value`.
    assert value.ndim == 2
    assert value.shape[0] == len(self._axes[0])

    for value_idx, mgr_idx in enumerate(indices):
        value_arr = value[:, value_idx]
        self.arrays[mgr_idx] = value_arr
    return
def arithmetic_op(left: ArrayLike, right: Any, op):
    """
    Apply an arithmetic operator (`+`, `-`, `*`, `/`, `//`, `%`, `**`, ...).

    Note: the caller is responsible for ensuring that numpy warnings are
    suppressed (with np.errstate(all="ignore")) if needed.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : object
        Cannot be a DataFrame or Index.  Series is *not* excluded.
    op : {operator.add, operator.sub, ...}
        Or one of the reversed variants from roperator.

    Returns
    -------
    ndarray or ExtensionArray
        Or a 2-tuple of these in the case of divmod or rdivmod.
    """
    # NB: extract_array is assumed to have been called on both operands.

    # datetime64/timedelta64 dtypes need special-casing (e.g. because numpy
    # casts integer dtypes to timedelta64 when operating with timedelta64 -
    # GH#22390), hence the wrapping.
    lhs = ensure_wrapped_if_datetimelike(left)
    rhs = ensure_wrapped_if_datetimelike(right)
    rhs = _maybe_upcast_for_op(rhs, lhs.shape)

    # Timedelta scalars take the direct path as well, because numexpr will
    # fail on them (see GH#31457).
    use_operator_directly = should_extension_dispatch(lhs, rhs) or isinstance(
        rhs, Timedelta
    )
    if use_operator_directly:
        return op(lhs, rhs)
    return _na_arithmetic_op(lhs, rhs, op)
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Cast `arr` to `dtype`, like `arr.astype(common_dtype)` but with the
    special cases (categorical, sparse, datetimelike, extension dtypes)
    handled explicitly.  `arr` is returned unchanged when it already has
    the requested dtype.
    """
    if is_dtype_equal(arr.dtype, dtype):
        return arr

    if is_categorical_dtype(arr.dtype):
        if isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.integer):
            # problem case: categorical of int -> gives int as result dtype,
            # but categorical can contain NAs -> fall back to object dtype
            try:
                return arr.astype(dtype, copy=False)
            except ValueError:
                return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        dense = cast(SparseArray, arr).to_dense()
        # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has
        # incompatible type "Union[dtype[Any], ExtensionDtype]"  [arg-type]
        return dense.astype(dtype, copy=False)  # type: ignore[arg-type]

    is_raw_datetimelike = isinstance(arr, np.ndarray) and arr.dtype.kind in ["m", "M"]
    if is_raw_datetimelike and dtype is np.dtype("object"):
        # wrap datetime-likes in EA to ensure astype(object) gives
        # Timestamp/Timedelta.  This can happen when concat_compat is called
        # directly on arrays (when arrays are not coming from
        # Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    if isinstance(dtype, ExtensionDtype) and isinstance(arr, np.ndarray):
        # numpy's astype cannot handle ExtensionDtypes
        return pd_array(arr, dtype=dtype, copy=False)

    return arr.astype(dtype, copy=False)
def __init__(
    self,
    arrays: List[Union[np.ndarray, ExtensionArray]],
    axes: List[Index],
    verify_integrity: bool = True,
):
    """
    Store `arrays` and `axes` on the manager.

    With ``verify_integrity=True`` the axes are passed through
    ``ensure_index``, the arrays through ``ensure_wrapped_if_datetimelike``,
    and ``self._verify_integrity()`` is run; otherwise the inputs are stored
    as given.
    """
    # Note: we are storing the axes in "_axes" in the (row, columns) order
    # which contrasts the order how it is stored in BlockManager
    self._axes = axes
    self.arrays = arrays

    if not verify_integrity:
        return

    self._axes = list(map(ensure_index, axes))
    self.arrays = list(map(ensure_wrapped_if_datetimelike, arrays))
    self._verify_integrity()
def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False):
    """
    Insert a new column at position `loc`.

    Parameters
    ----------
    loc : int
    item : hashable
        Label inserted into the column axis at `loc`.
    value : array_like
        A 1D array, or a 2D array with a single row.
    allow_duplicates: bool
        If False, trying to insert non-unique item will raise

    Raises
    ------
    ValueError
        If `item` already exists and `allow_duplicates` is False, or if
        `value` is 2D with more than one row.
    TypeError
        If `loc` is not an int.
    """
    if not allow_duplicates:
        if item in self.items:
            # Should this be a different kind of error??
            raise ValueError(f"cannot insert {item}, already exists")

    if not isinstance(loc, int):
        raise TypeError("loc must be int")

    # Compute the new column axis first; this could possibly raise a TypeError
    updated_items = self.items.insert(loc, item)

    column = extract_array(value, extract_numpy=True)
    if column.ndim == 2:
        if column.shape[0] != 1:
            raise ValueError(
                f"Expected a 1D array, got an array with shape {column.shape}"
            )
        column = column[0, :]
    column = ensure_wrapped_if_datetimelike(column)

    # TODO self.arrays can be empty
    # assert len(column) == len(self.arrays[0])

    # TODO is this copy needed?
    updated_arrays = self.arrays.copy()
    updated_arrays.insert(loc, column)

    self.arrays = updated_arrays
    self._axes[1] = updated_items
def __init__(
    self,
    arrays: list[np.ndarray | ExtensionArray],
    axes: list[Index],
    verify_integrity: bool = True,
):
    """
    Store a single array and a single axis on the manager.

    With ``verify_integrity=True`` exactly one axis and one array are
    required; the axis is passed through ``ensure_index``, the array is
    wrapped if datetimelike and converted via ``to_numpy()`` if it is a
    PandasArray, and ``self._verify_integrity()`` is run.  Otherwise the
    inputs are stored as given.
    """
    self._axes = axes
    self.arrays = arrays

    if not verify_integrity:
        return

    assert len(axes) == 1
    assert len(arrays) == 1
    self._axes = [ensure_index(ax) for ax in self._axes]

    only_array = ensure_wrapped_if_datetimelike(arrays[0])
    if isinstance(only_array, ABCPandasArray):
        only_array = only_array.to_numpy()
    self.arrays = [only_array]

    self._verify_integrity()
def _cmp_method(self, other, op):
    """
    Evaluate `op` between the wrapped ndarray and `other`.

    ndarray results are re-wrapped with ``self._wrap_ndarray_result``; for
    ``divmod``/``rdivmod`` both elements of the result pair are handled.
    Results that are not ndarrays are returned as they are.
    """
    if isinstance(other, PandasArray):
        other = other._ndarray

    other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
    array_op = ops.get_array_op(op)
    other = ensure_wrapped_if_datetimelike(other)

    with np.errstate(all="ignore"):
        outcome = array_op(self._ndarray, other)

    if op is divmod or op is ops.rdivmod:
        first, second = outcome
        if not isinstance(first, np.ndarray):
            # for e.g. op vs TimedeltaArray, we may already
            #  have an ExtensionArray, in which case we do not wrap
            return first, second
        return self._wrap_ndarray_result(first), self._wrap_ndarray_result(second)

    if not isinstance(outcome, np.ndarray):
        # for e.g. multiplication vs TimedeltaArray, we may already
        #  have an ExtensionArray, in which case we do not wrap
        return outcome
    return self._wrap_ndarray_result(outcome)
def series_generator(self):
    """
    Yield one Series per element of ``self.values``.

    For non-extension dtypes a single Series object is created once and its
    underlying data and name are swapped on every iteration, so each yielded
    object is the *same* Series.  For extension dtypes a fresh Series is
    obtained from ``self.obj._ixs`` for every position instead.
    """
    rows = ensure_wrapped_if_datetimelike(self.values)
    assert len(rows) > 0

    # We create one Series object, and will swap out the data inside
    #  of it.  Kids: don't do this at home.
    shared = self.obj._ixs(0, axis=0)
    shared_mgr = shared._mgr

    if is_extension_array_dtype(shared.dtype):
        # values will be incorrect for this block
        # TODO(EA2D): special case would be unnecessary with 2D EAs
        frame = self.obj
        for position in range(len(frame)):
            yield frame._ixs(position, axis=0)
        return

    for row, label in zip(rows, self.index):
        # GH#35462 re-pin mgr in case setitem changed it
        shared._mgr = shared_mgr
        shared_mgr.set_values(row)
        shared.name = label
        yield shared
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a manager from array-like `values`, as used by DataFrame.__init__.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
    index, columns
        Requested axes; either may be None, in which case it is derived
        (from a Series input, or by ``_get_axes`` from the shape of `values`).
    dtype : DtypeObj or None
        If given and different from the dtype of the prepared values, the
        values are cast via ``sanitize_array``.
    copy : bool
        Whether the data should be copied.
    typ : str
        ``"array"`` produces an ArrayManager; any other value goes down the
        block-manager path.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        # A Series input can supply default column and index labels.
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # if the array preparation does a copy -> avoid this for ArrayManager,
    # since the copy is done on conversion to 1D arrays
    copy_on_sanitize = False if typ == "array" else copy

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or is_1d_only_ea_dtype(dtype):
        # GH#19157

        if isinstance(values, (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        # 1D-only extension dtypes are handled column by column.
        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ, PeriodDtype
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            # Treat a 1D array as a single column.
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy_on_sanitize)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # Cast on the flattened data, then restore the original shape.
        shape = values.shape
        flat = values.ravel()

        # GH#40110 see similar check inside sanitize_array
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(flat,
                                None,
                                dtype=dtype,
                                copy=copy_on_sanitize,
                                raise_cast_failure=rcf)

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: one 1D array per column.

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # No dtype requested: try datetimelike inference per column.
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i]))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        if copy:
            arrays = [arr.copy() for arr in arrays]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # Block path: values are transposed, so iterating them yields columns.
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            # At least one column was converted: one block per column.
            dvals_list = [
                ensure_block_shape(dval, 2) for dval in maybe_datetime
            ]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        # No columns means no blocks at all.
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index],
                                            verify_integrity=False)
def _simple_new(cls,
                left,
                right,
                closed=None,
                copy=False,
                dtype=None,
                verify_integrity=True):
    """
    Construct an instance of `cls` from left and right bounds.

    Parameters
    ----------
    left, right
        Bounds; each is passed through ``ensure_index``.
    closed : optional
        Defaults to "right" when falsy.
    copy : bool, default False
        Forwarded to ``ensure_index``.
    dtype : optional
        Must be an IntervalDtype; if its subtype is set, both sides are
        cast to that subtype.
    verify_integrity : bool, default True
        Whether to call ``_validate()`` on the result.

    Raises
    ------
    TypeError
        If `dtype` is not an IntervalDtype, or the bounds have a
        categorical/string dtype.
    ValueError
        If left and right have different types, are a PeriodIndex, or are
        datetime indexes with different time zones.
    """
    result = IntervalMixin.__new__(cls)

    closed = closed or "right"
    left = ensure_index(left, copy=copy)
    right = ensure_index(right, copy=copy)

    if dtype is not None:
        # GH 19262: dtype must be an IntervalDtype to override inferred
        dtype = pandas_dtype(dtype)
        if not is_interval_dtype(dtype):
            raise TypeError(f"dtype must be an IntervalDtype, got {dtype}")
        if dtype.subtype is not None:
            left = left.astype(dtype.subtype)
            right = right.astype(dtype.subtype)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    elif is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    # Validation; each failing check raises, so the checks run in sequence.
    if type(left) != type(right):
        raise ValueError(
            f"must not have differing left [{type(left).__name__}] and "
            f"right [{type(right).__name__}] types"
        )
    if is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
        # GH 19016
        raise TypeError(
            "category, object, and string subtypes are not supported "
            "for IntervalArray"
        )
    if isinstance(left, ABCPeriodIndex):
        raise ValueError(
            "Period dtypes are not supported, use a PeriodIndex instead"
        )
    if isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz):
        raise ValueError(
            "left and right must have the same time zone, got "
            f"'{left.tz}' and '{right.tz}'"
        )

    # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray
    left = extract_array(ensure_wrapped_if_datetimelike(left), extract_numpy=True)
    right = extract_array(ensure_wrapped_if_datetimelike(right), extract_numpy=True)

    left_base = getattr(left, "_ndarray", left).base
    right_base = getattr(right, "_ndarray", right).base
    shares_data = left_base is not None and left_base is right_base
    if shares_data:
        # If these share data, then setitem could corrupt our IA
        right = right.copy()

    result._left = left
    result._right = right
    result._closed = closed
    if verify_integrity:
        result._validate()
    return result
def concatenate_managers(
    mgrs_indexers, axes: list[Index], concat_axis: int, copy: bool
) -> Manager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    # TODO(ArrayManager) this assumes that all managers are of the same type
    if isinstance(mgrs_indexers[0][0], ArrayManager):
        return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

    per_manager_plans = [
        _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers
    ]
    combined_plan = _combine_concat_plans(per_manager_plans, concat_axis)

    result_blocks = []
    for placement, join_units in combined_plan:
        first_unit = join_units[0]
        first_block = first_unit.block

        if len(join_units) == 1 and not first_unit.indexers:
            # A single unit with no reindexing: reuse its values directly.
            source = first_block.values
            values = source.copy() if copy else source.view()
            same_class = True

        elif _is_uniform_join_units(join_units):
            pieces = [ju.block.values for ju in join_units]

            if first_block.is_extension:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(pieces, axis=1)
                values = ensure_block_shape(values, first_block.ndim)
            else:
                # _is_uniform_join_units ensures a single dtype, so
                #  we can use np.concatenate, which is more performant
                #  than concat_compat
                values = np.concatenate(pieces, axis=first_block.ndim - 1)

            values = ensure_wrapped_if_datetimelike(values)
            same_class = first_block.values.dtype == values.dtype

        else:
            values = _concatenate_join_units(join_units, concat_axis, copy=copy)
            same_class = False

        if same_class:
            combined = first_block.make_block_same_class(values, placement=placement)
        else:
            combined = new_block(values, placement=placement, ndim=len(axes))
        result_blocks.append(combined)

    return BlockManager(tuple(result_blocks), axes)
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a manager from array-like `values`, as used by DataFrame.__init__.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
    index, columns
        Requested axes; either may be None, in which case it is derived
        (from a Series input, or by ``_get_axes`` from the shape of `values`).
    dtype : DtypeObj or None
        If given and different from the dtype of the prepared values, the
        values are cast to it.  For integer dtypes a failed cast caused by
        NaNs is ignored and the original values are kept.
    copy : bool
        Whether the data should be copied.
    typ : str
        ``"array"`` produces an ArrayManager; any other value goes down the
        block-manager path.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        # A Series input can supply default column and index labels.
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))

        # Extension dtypes are handled column by column.
        return arrays_to_mgr(values,
                             columns,
                             index,
                             columns,
                             dtype=dtype,
                             typ=typ)

    if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            # Treat a 1D array as a single column.
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # Cast on the flattened data, then restore the original shape.
        shape = values.shape
        flat = values.ravel()

        if not is_integer_dtype(dtype):
            # TODO: skipping integer_dtype is needed to keep the tests passing,
            #  not clear it is correct
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(flat,
                                    None,
                                    dtype=dtype,
                                    copy=copy,
                                    raise_cast_failure=True)
        else:
            try:
                values = construct_1d_ndarray_preserving_na(flat,
                                                            dtype=dtype,
                                                            copy=False)
            except IntCastingNaNError:
                # following Series, we ignore the dtype and retain floating
                # values instead of casting nans to meaningless ints
                pass

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)
    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":
        # ArrayManager path: one copied 1D array per column.

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # No dtype requested: try datetimelike inference per column.
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i].copy()))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i].copy() for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # Block path: values are transposed, so iterating them yields columns.
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values]
            dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals]

            # TODO: What about re-joining object columns?
            block_values = [
                new_block(dvals_list[n], placement=n, ndim=2)
                for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            nb = new_block(datelike_vals,
                           placement=slice(len(columns)),
                           ndim=2)
            block_values = [nb]
    else:
        nb = new_block(values, placement=slice(len(columns)), ndim=2)
        block_values = [nb]

    if len(columns) == 0:
        # No columns means no blocks at all.
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index])
def _arith_method(self, other, op):
    """
    Evaluate the arithmetic operator `op` between this masked array and
    `other`.

    The data (``self._data``) and mask (``self._mask``) are handled
    separately: the values are computed with the array op for `op`, the
    mask is obtained from ``self._propagate_mask`` and then relaxed for the
    ``pow``/``rpow`` identities (``1 ** x`` and ``x ** 0``).  The pair is
    combined by ``self._maybe_mask_result``.

    Raises
    ------
    NotImplementedError
        If `other` is list-like with more than one dimension.
    """
    op_name = op.__name__
    # Mask of `other`; stays None unless `other` is itself a masked array.
    omask = None

    if isinstance(other, BaseMaskedArray):
        other, omask = other._data, other._mask

    elif is_list_like(other):
        if not isinstance(other, ExtensionArray):
            other = np.asarray(other)
        if other.ndim > 1:
            raise NotImplementedError(
                "can only perform ops with 1-d structures")

    # We wrap the non-masked arithmetic logic used for numpy dtypes
    #  in Series/Index arithmetic ops.
    other = ops.maybe_prepare_scalar_for_op(other, (len(self), ))
    pd_op = ops.get_array_op(op)
    other = ensure_wrapped_if_datetimelike(other)

    if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
        # Avoid DeprecationWarning: In future, it will be an error
        #  for 'np.bool_' scalars to be interpreted as an index
        #  e.g. test_array_scalar_like_equivalence
        other = bool(other)

    mask = self._propagate_mask(omask, other)

    if other is libmissing.NA:
        # Placeholder data; only its dtype is adjusted below.
        result = np.ones_like(self._data)
        if self.dtype.kind == "b":
            if op_name in {
                    "floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"
            }:
                dtype = "int8"
            elif op_name in {"truediv", "rtruediv"}:
                dtype = "float64"
            else:
                dtype = "bool"
            result = result.astype(dtype)
        elif "truediv" in op_name and self.dtype.kind != "f":
            # The actual data here doesn't matter since the mask
            #  will be all-True, but since this is division, we want
            #  to end up with floating dtype.
            result = result.astype(np.float64)
    else:
        # Make sure we do this before the "pow" mask checks
        #  to get an expected exception message on shape mismatch.
        if self.dtype.kind in ["i", "u"
                               ] and op_name in ["floordiv", "mod"]:
            # TODO(GH#30188) ATM we don't match the behavior of non-masked
            #  types with respect to floordiv-by-zero
            pd_op = op

        elif self.dtype.kind == "b" and ("div" in op_name or "pow" in op_name
                                         or "mod" in op_name):
            # TODO(GH#41165): should these be disallowed?
            pd_op = op

        with np.errstate(all="ignore"):
            result = pd_op(self._data, other)

    if op_name == "pow":
        # 1 ** x is 1.
        mask = np.where((self._data == 1) & ~self._mask, False, mask)
        # x ** 0 is 1.
        if omask is not None:
            mask = np.where((other == 0) & ~omask, False, mask)
        elif other is not libmissing.NA:
            mask = np.where(other == 0, False, mask)

    elif op_name == "rpow":
        # 1 ** x is 1.
        if omask is not None:
            mask = np.where((other == 1) & ~omask, False, mask)
        elif other is not libmissing.NA:
            mask = np.where(other == 1, False, mask)
        # x ** 0 is 1.
        mask = np.where((self._data == 0) & ~self._mask, False, mask)

    return self._maybe_mask_result(result, mask)
def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike:
    """
    Evaluate one of the logical operations `|`, `&` or `^`.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : object
        Cannot be a DataFrame, Series, or Index.
    op : {operator.and_, operator.or_, operator.xor}
        Or one of the reversed variants from roperator.

    Returns
    -------
    ndarray or ExtensionArray
    """

    def _na_to_false(values, reference=None):
        # Replace missing entries with False; cast to bool unless
        # `reference` is given and is specifically not boolean.
        if values.dtype.kind in ["c", "f", "O"]:
            # dtypes that can hold NA
            na_positions = isna(values)
            if na_positions.any():
                values = values.astype(object)
                values[na_positions] = False

        if reference is None or is_bool_dtype(reference.dtype):
            values = values.astype(bool)
        return values

    left_is_int = is_integer_dtype(left.dtype)

    right = lib.item_from_zerodim(right)
    if is_list_like(right) and not hasattr(right, "dtype"):
        # e.g. list, tuple
        right = construct_1d_object_array_from_listlike(right)

    # NB: We assume extract_array has already been called on left and right
    lvalues = ensure_wrapped_if_datetimelike(left)
    rvalues = right

    if should_extension_dispatch(lvalues, rvalues):
        # Let the left operand's own method handle it.
        return op(lvalues, rvalues)

    if isinstance(rvalues, np.ndarray):
        right_is_int = is_integer_dtype(rvalues.dtype)
        if not right_is_int:
            rvalues = _na_to_false(rvalues, lvalues)
    else:
        # i.e. scalar
        right_is_int = lib.is_integer(rvalues)

    res_values = na_logical_op(lvalues, rvalues, op)

    # For int vs int, `^`, `|`, `&` are bitwise operators and return integer
    # dtypes, so the result is passed through untouched. Otherwise these are
    # boolean ops and the result is NA-filled / cast to bool.
    if left_is_int and right_is_int:
        return res_values
    return _na_to_false(res_values)
def astype_nansafe(arr: np.ndarray,
                   dtype: DtypeObj,
                   copy: bool = True,
                   skipna: bool = False) -> ArrayLike:
    """
    Cast the elements of an array to a given dtype in a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype or ExtensionDtype
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna : bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit.
    """
    if arr.ndim > 1:
        # Cast the flattened data, then restore the original shape.
        flat_result = astype_nansafe(arr.ravel(), dtype, copy=copy, skipna=skipna)
        # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no
        # attribute "reshape"
        return flat_result.reshape(arr.shape)  # type: ignore[union-attr]

    # We get here with 0-dim from sparse
    arr = np.atleast_1d(arr)

    # dispatch on extension dtype if needed
    if isinstance(dtype, ExtensionDtype):
        array_cls = dtype.construct_array_type()
        return array_cls._from_sequence(arr, dtype=dtype, copy=copy)
    if not isinstance(dtype, np.dtype):  # pragma: no cover
        raise ValueError("dtype must be np.dtype or ExtensionDtype")

    target_is_str = issubclass(dtype.type, str)

    if arr.dtype.kind in ["m", "M"] and (target_is_str or dtype == _dtype_obj):
        from pandas.core.construction import ensure_wrapped_if_datetimelike

        wrapped = ensure_wrapped_if_datetimelike(arr)
        return wrapped.astype(dtype, copy=copy)

    if target_is_str:
        return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False)

    if is_datetime64_dtype(arr.dtype):
        if dtype == np.int64:
            warnings.warn(
                f"casting {arr.dtype} values to int64 with .astype(...) "
                "is deprecated and will raise in a future version. "
                "Use .view(...) instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        # allow frequency conversions
        if dtype.kind == "M":
            return arr.astype(dtype)

        raise TypeError(
            f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

    if is_timedelta64_dtype(arr.dtype):
        if dtype == np.int64:
            warnings.warn(
                f"casting {arr.dtype} values to int64 with .astype(...) "
                "is deprecated and will raise in a future version. "
                "Use .view(...) instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        if dtype.kind == "m":
            return astype_td64_unit_conversion(arr, dtype, copy=copy)

        raise TypeError(
            f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

    if np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):
        return _astype_float_to_int_nansafe(arr, dtype, copy)

    if is_object_dtype(arr.dtype):
        # work around NumPy brokenness, #1987
        if np.issubdtype(dtype.type, np.integer):
            return lib.astype_intsafe(arr, dtype)

        # if we have a datetime/timedelta array of objects
        # then coerce to a proper dtype and recall astype_nansafe
        if is_datetime64_dtype(dtype):
            from pandas import to_datetime

            return astype_nansafe(to_datetime(arr).values, dtype, copy=copy)
        if is_timedelta64_dtype(dtype):
            from pandas import to_timedelta

            return astype_nansafe(to_timedelta(arr)._values, dtype, copy=copy)

    if dtype.name in ("datetime64", "timedelta64"):
        msg = (f"The '{dtype.name}' dtype has no unit. Please pass in "
               f"'{dtype.name}[ns]' instead.")
        raise ValueError(msg)

    if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.astype(dtype, copy=copy)
def astype_dt64_to_dt64tz(
    values: ArrayLike, dtype: DtypeObj, copy: bool, via_utc: bool = False
) -> DatetimeArray:
    """
    Convert datetime64 data between timezone-naive and timezone-aware dtypes.

    GH#33401: Datetimeindex[naive].astype(tzaware) and
    Series[dt64].astype(tzaware) behave inconsistently; both are collected
    here to prevent further fragmentation.

    Parameters
    ----------
    values : ArrayLike
        Wrapped via ``ensure_wrapped_if_datetimelike`` before use.
    dtype : np.dtype or DatetimeTZDtype
        Target dtype; a ``DatetimeTZDtype`` means a timezone-aware target.
    copy : bool
        Whether to copy the data.
    via_utc : bool, default False
        True selects the Series.astype behavior (localize to UTC, then
        convert); False selects the DatetimeArray/DatetimeIndex.astype
        behavior.

    Returns
    -------
    DatetimeArray

    Raises
    ------
    NotImplementedError
        When both source and target are timezone-naive.
    """
    from pandas.core.construction import ensure_wrapped_if_datetimelike

    dta = cast("DatetimeArray", ensure_wrapped_if_datetimelike(values))
    target_aware = isinstance(dtype, DatetimeTZDtype)

    if via_utc:
        # Series.astype behavior

        # caller is responsible for checking this
        assert dta.tz is None and target_aware
        dtype = cast(DatetimeTZDtype, dtype)

        if copy:
            # this should be the only copy
            dta = dta.copy()

        warnings.warn(
            "Using .astype to convert from timezone-naive dtype to "
            "timezone-aware dtype is deprecated and will raise in a "
            "future version.  Use ser.dt.tz_localize instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        # GH#33401 this doesn't match DatetimeArray.astype, which
        # goes through the `not via_utc` path
        as_utc = dta.tz_localize("UTC")
        return as_utc.tz_convert(dtype.tz)

    # DatetimeArray/DatetimeIndex.astype behavior
    source_naive = dta.tz is None

    if source_naive and target_aware:
        dtype = cast(DatetimeTZDtype, dtype)
        warnings.warn(
            "Using .astype to convert from timezone-naive dtype to "
            "timezone-aware dtype is deprecated and will raise in a "
            "future version.  Use obj.tz_localize instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return dta.tz_localize(dtype.tz)

    if target_aware:
        # GH#18951: datetime64_tz dtype but not equal means different tz
        dtype = cast(DatetimeTZDtype, dtype)
        converted = dta.tz_convert(dtype.tz)
        return converted.copy() if copy else converted

    if not source_naive:
        warnings.warn(
            "Using .astype to convert from timezone-aware dtype to "
            "timezone-naive dtype is deprecated and will raise in a "
            "future version.  Use obj.tz_localize(None) or "
            "obj.tz_convert('UTC').tz_localize(None) instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        converted = dta.tz_convert("UTC").tz_localize(None)
        return converted.copy() if copy else converted

    raise NotImplementedError("dtype_equal case should be handled elsewhere")
def concatenate_managers(mgrs_indexers, axes: list[Index], concat_axis: int,
                         copy: bool) -> Manager:
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    # TODO(ArrayManager) this assumes that all managers are of the same type
    first_mgr = mgrs_indexers[0][0]
    if isinstance(first_mgr, ArrayManager):
        return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy)

    # The caller is responsible for ensuring that `concat_axis` is not a key
    # of any indexer dict; the assertion is disabled for performance.

    if concat_axis == 0:
        return _concat_managers_axis0(mgrs_indexers, axes, copy)

    mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers)

    # From here on every indexer dict is expected to be empty (the assertion
    # is disabled for performance), so only the managers are used.
    per_mgr_plans = [
        _get_mgr_concatenation_plan(mgr) for mgr, _ in mgrs_indexers
    ]
    combined_plan = _combine_concat_plans(per_mgr_plans)

    new_blocks = []
    for placement, join_units in combined_plan:
        # len(join_units) == len(mgrs_indexers) is expected here; the
        # assertion is disabled for performance.
        first_blk = join_units[0].block

        if len(join_units) == 1:
            values = first_blk.values.copy() if copy else first_blk.values.view()
            same_class = True
        elif _is_uniform_join_units(join_units):
            unit_values = [ju.block.values for ju in join_units]

            if first_blk.is_extension:
                # TODO(EA2D): special-casing not needed with 2D EAs
                values = concat_compat(unit_values, axis=1)
                values = ensure_block_shape(values, ndim=2)
            else:
                # _is_uniform_join_units ensures a single dtype, so
                # we can use np.concatenate, which is more performant
                # than concat_compat
                values = np.concatenate(unit_values, axis=1)

            values = ensure_wrapped_if_datetimelike(values)
            same_class = first_blk.values.dtype == values.dtype
        else:
            values = _concatenate_join_units(join_units, copy=copy)
            same_class = False

        if same_class:
            new_blk = first_blk.make_block_same_class(values, placement=placement)
        else:
            new_blk = new_block_2d(values, placement=placement)
        new_blocks.append(new_blk)

    return BlockManager(tuple(new_blocks), axes)
def get_new_values(self, values, fill_value=None):
    """
    Lay ``values`` out in the widened 2-D shape of this unstack operation.

    Parameters
    ----------
    values : array-like
        1-D input is treated as a single column.
    fill_value : object, optional
        Passed to ``maybe_promote`` and used to pre-fill the result when
        ``self.mask_all`` is False.

    Returns
    -------
    new_values : array-like
        Shape ``(length, width * stride)``, where ``(length, width)`` is
        ``self.full_shape`` and ``stride`` is ``values.shape[1]``.
    new_mask : np.ndarray of bool
        Same shape as ``new_values``. All True on the reshape-only path;
        otherwise it starts all False and is filled in by
        ``libreshape.unstack``.
    """
    if values.ndim == 1:
        values = values[:, np.newaxis]

    sorted_values = self._make_sorted_values(values)

    # target layout
    length, width = self.full_shape
    stride = values.shape[1]
    result_shape = (length, width * stride)
    mask = self.mask
    mask_all = self.mask_all

    # we can simply reshape if we don't have a mask
    if mask_all and len(values):
        # TODO: Under what circumstances can we rely on sorted_values
        # matching values? When that holds, we can slice instead
        # of take (in particular for EAs)
        cube = sorted_values.reshape(length, width, stride)
        new_values = cube.swapaxes(1, 2).reshape(result_shape)
        return new_values, np.ones(result_shape, dtype=bool)

    if not mask_all:
        dtype, fill_value = maybe_promote(values.dtype, fill_value)
        if not isinstance(dtype, ExtensionDtype):
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)
            name = np.dtype(dtype).name
        else:
            # GH#41875
            array_cls = dtype.construct_array_type()
            new_values = array_cls._empty(result_shape, dtype=dtype)
            new_values[:] = fill_value
            name = dtype.name
    else:
        # if our mask is all True, then we can use our existing dtype
        dtype = values.dtype
        new_values = np.empty(result_shape, dtype=dtype)
        name = np.dtype(dtype).name

    new_mask = np.zeros(result_shape, dtype=bool)

    # we need to convert to a basic dtype
    # and possibly coerce an input to our output dtype
    # e.g. ints -> floats
    is_i8 = needs_i8_conversion(values.dtype)
    if is_i8:
        sorted_values = sorted_values.view("i8")
        new_values = new_values.view("i8")
    elif is_bool_dtype(values.dtype):
        sorted_values = sorted_values.astype("object")
        new_values = new_values.astype("object")
    else:
        sorted_values = sorted_values.astype(name, copy=False)

    # fill in our values & mask
    libreshape.unstack(
        sorted_values,
        mask.view("u1"),
        stride,
        length,
        width,
        new_values,
        new_mask.view("u1"),
    )

    # reconstruct dtype if needed
    if is_i8:
        # view as datetime64 so we can wrap in DatetimeArray and use
        # DTA's view method
        new_values = new_values.view("M8[ns]")
        new_values = ensure_wrapped_if_datetimelike(new_values)
        new_values = new_values.view(values.dtype)

    return new_values, new_mask
def astype_nansafe(
    arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False
) -> ArrayLike:
    """
    Cast the elements of an array to a given dtype in a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype or ExtensionDtype
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna : bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit.
    """

    def _view_as_int64(values):
        # Reinterpret datetimelike data as integers, refusing NaT.
        if isna(values).any():
            raise ValueError("Cannot convert NaT values to integer")
        return values.view(dtype)

    # We get here with 0-dim from sparse
    arr = np.atleast_1d(arr)

    # dispatch on extension dtype if needed
    if isinstance(dtype, ExtensionDtype):
        array_cls = dtype.construct_array_type()
        return array_cls._from_sequence(arr, dtype=dtype, copy=copy)
    if not isinstance(dtype, np.dtype):  # pragma: no cover
        raise ValueError("dtype must be np.dtype or ExtensionDtype")

    target_is_str = issubclass(dtype.type, str)

    if arr.dtype.kind in ["m", "M"] and (target_is_str or dtype == _dtype_obj):
        from pandas.core.construction import ensure_wrapped_if_datetimelike

        wrapped = ensure_wrapped_if_datetimelike(arr)
        return wrapped.astype(dtype, copy=copy)

    if target_is_str:
        # The string conversion runs on flattened data; the original shape
        # is restored afterwards.
        original_shape = arr.shape
        if arr.ndim > 1:
            arr = arr.ravel()
        as_strings = lib.ensure_string_array(
            arr, skipna=skipna, convert_na_value=False
        )
        return as_strings.reshape(original_shape)

    if is_datetime64_dtype(arr.dtype):
        if dtype == np.int64:
            return _view_as_int64(arr)

        # allow frequency conversions
        if dtype.kind == "M":
            return arr.astype(dtype)

        raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

    if is_timedelta64_dtype(arr.dtype):
        if dtype == np.int64:
            return _view_as_int64(arr)

        if dtype.kind == "m":
            return astype_td64_unit_conversion(arr, dtype, copy=copy)

        raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

    if np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype):
        return _astype_float_to_int_nansafe(arr, dtype, copy)

    if is_object_dtype(arr.dtype):
        # if we have a datetime/timedelta array of objects
        # then coerce to a proper dtype and recall astype_nansafe
        if is_datetime64_dtype(dtype):
            from pandas import to_datetime

            parsed = to_datetime(arr.ravel()).values.reshape(arr.shape)
            return astype_nansafe(parsed, dtype, copy=copy)

        if is_timedelta64_dtype(dtype):
            # bc we know arr.dtype == object, this is equivalent to
            # `np.asarray(to_timedelta(arr))`, but using a lower-level API that
            # does not require a circular import.
            td_values = array_to_timedelta64(arr).view("m8[ns]")
            return td_values.astype(dtype, copy=False)

    if dtype.name in ("datetime64", "timedelta64"):
        msg = (
            f"The '{dtype.name}' dtype has no unit. Please pass in "
            f"'{dtype.name}[ns]' instead."
        )
        raise ValueError(msg)

    if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype):
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.astype(dtype, copy=copy)