def _simple_new(cls, left, right, closed=None, name=None, copy=False,
                verify_integrity=True):
    result = IntervalMixin.__new__(cls)

    if closed is None:
        closed = 'right'
    left = _ensure_index(left, copy=copy)
    right = _ensure_index(right, copy=copy)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    if is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    if type(left) != type(right):
        raise ValueError("must not have differing left [{}] "
                         "and right [{}] types".format(
                             type(left), type(right)))

    if isinstance(left, ABCPeriodIndex):
        raise ValueError("Period dtypes are not supported, "
                         "use a PeriodIndex instead")

    result._left = left
    result._right = right
    result._closed = closed
    result.name = name
    if verify_integrity:
        result._validate()
    result._reset_identity()
    return result
def nanskew(values, axis=None, skipna=True):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.
    """
    values = _values_from_object(values)
    mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_skew follow this behavior
    # to fix the fperr to treat m2 <1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid='ignore', divide='ignore'):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if m2 == 0 else result
        if count < 3:
            return np.nan
        return result
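# Illustrative usage only (not from the original source): the adjusted
# Fisher-Pearson G1 of a symmetric sample is 0, and NaNs are skipped when
# skipna=True. Assumes the internal pandas.core.nanops module shown above.
import numpy as np
import pandas as pd
import pandas.core.nanops as nanops

s = pd.Series([1.0, 2.0, 3.0, np.nan])
print(nanops.nanskew(s.to_numpy()))  # 0.0 for this symmetric sample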
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None,
                verify_integrity=True):
    result = IntervalMixin.__new__(cls)

    closed = closed or 'right'
    left = ensure_index(left, copy=copy)
    right = ensure_index(right, copy=copy)

    if dtype is not None:
        # GH 19262: dtype must be an IntervalDtype to override inferred
        dtype = pandas_dtype(dtype)
        if not is_interval_dtype(dtype):
            msg = 'dtype must be an IntervalDtype, got {dtype}'
            raise TypeError(msg.format(dtype=dtype))
        elif dtype.subtype is not None:
            left = left.astype(dtype.subtype)
            right = right.astype(dtype.subtype)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    elif is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    if type(left) != type(right):
        msg = ('must not have differing left [{ltype}] and right '
               '[{rtype}] types')
        raise ValueError(msg.format(ltype=type(left).__name__,
                                    rtype=type(right).__name__))
    elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
        # GH 19016
        msg = ('category, object, and string subtypes are not supported '
               'for IntervalArray')
        raise TypeError(msg)
    elif isinstance(left, ABCPeriodIndex):
        msg = 'Period dtypes are not supported, use a PeriodIndex instead'
        raise ValueError(msg)
    elif (isinstance(left, ABCDatetimeIndex) and
            str(left.tz) != str(right.tz)):
        msg = ("left and right must have the same time zone, got "
               "'{left_tz}' and '{right_tz}'")
        raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz))

    result._left = left
    result._right = right
    result._closed = closed
    if verify_integrity:
        result._validate()
    return result
def _isfinite(values):
    if is_datetime_or_timedelta_dtype(values):
        return isna(values)

    if (is_complex_dtype(values) or is_float_dtype(values) or
            is_integer_dtype(values) or is_bool_dtype(values)):
        return ~np.isfinite(values)

    return ~np.isfinite(values.astype('float64'))
def pad_2d(values, limit=None, mask=None, dtype=None):
    if dtype is None:
        dtype = values.dtype
    _method = None
    if is_float_dtype(values):
        name = 'pad_2d_inplace_{name}'.format(name=dtype.name)
        _method = getattr(algos, name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        _method = _pad_2d_datetime
    elif is_integer_dtype(values):
        values = ensure_float64(values)
        _method = algos.pad_2d_inplace_float64
    elif values.dtype == np.object_:
        _method = algos.pad_2d_inplace_object

    if _method is None:
        raise ValueError('Invalid dtype for pad_2d [{name}]'
                         .format(name=dtype.name))

    if mask is None:
        mask = isna(values)
    mask = mask.view(np.uint8)

    if np.all(values.shape):
        _method(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values
def backfill_2d(values, limit=None, mask=None, dtype=None):
    if dtype is None:
        dtype = values.dtype
    _method = None
    if is_float_dtype(values):
        _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        _method = _backfill_2d_datetime
    elif is_integer_dtype(values):
        values = _ensure_float64(values)
        _method = algos.backfill_2d_inplace_float64
    elif values.dtype == np.object_:
        _method = algos.backfill_2d_inplace_object

    if _method is None:
        raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name)

    if mask is None:
        mask = isnull(values)
    mask = mask.view(np.uint8)

    if np.all(values.shape):
        _method(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values
def backfill_1d(values, limit=None, mask=None, dtype=None):
    if dtype is None:
        dtype = values.dtype
    _method = None
    if is_float_dtype(values):
        name = 'backfill_inplace_{name}'.format(name=dtype.name)
        _method = getattr(algos, name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        _method = _backfill_1d_datetime
    elif is_integer_dtype(values):
        values = ensure_float64(values)
        _method = algos.backfill_inplace_float64
    elif values.dtype == np.object_:
        _method = algos.backfill_inplace_object

    if _method is None:
        raise ValueError('Invalid dtype for backfill_1d [{name}]'
                         .format(name=dtype.name))

    if mask is None:
        mask = isna(values)
    mask = mask.view(np.uint8)

    _method(values, mask, limit=limit)
    return values
def nansum(values, axis=None, skipna=True, min_count=0, mask=None):
    """
    Sum the elements along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray[dtype]
    axis: int, optional
    skipna : bool, default True
    min_count: int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nansum(s)
    3.0
    """
    values, mask, dtype, dtype_max, _ = _get_values(values, skipna, 0,
                                                    mask=mask)
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)

    return _wrap_results(the_sum, dtype)
def _simple_new(cls, values, name=None, freq=None, **kwargs):
    """
    Create a new PeriodIndex.

    Parameters
    ----------
    values : PeriodArray, PeriodIndex, Index[int64], ndarray[int64]
        Values that can be converted to a PeriodArray without inference
        or coercion.
    """
    # TODO: raising on floats is tested, but maybe not useful.
    # Should the callers know not to pass floats?
    # At the very least, I think we can ensure that lists aren't passed.
    if isinstance(values, list):
        values = np.asarray(values)
    if is_float_dtype(values):
        raise TypeError("PeriodIndex._simple_new does not accept floats.")
    values = PeriodArray(values, freq=freq)

    if not isinstance(values, PeriodArray):
        raise TypeError("PeriodIndex._simple_new only accepts PeriodArray")
    result = object.__new__(cls)
    result._data = values
    result.name = name
    result._reset_identity()
    return result
def pad_1d(values, limit=None, mask=None, dtype=None):
    if dtype is None:
        dtype = values.dtype
    _method = None
    if is_float_dtype(values):
        name = 'pad_inplace_{name}'.format(name=dtype.name)
        _method = getattr(algos, name, None)
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        _method = _pad_1d_datetime
    elif is_integer_dtype(values):
        values = ensure_float64(values)
        _method = algos.pad_inplace_float64
    elif values.dtype == np.object_:
        _method = algos.pad_inplace_object
    elif is_timedelta64_dtype(values):
        # NaTs are treated identically to datetime64, so we can dispatch
        # to that implementation
        _method = _pad_1d_datetime

    if _method is None:
        raise ValueError('Invalid dtype for pad_1d [{name}]'
                         .format(name=dtype.name))

    if mask is None:
        mask = isna(values)
    mask = mask.view(np.uint8)

    _method(values, mask, limit=limit)
    return values
def __sub__(self, other):
    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        result = self._sub_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        result = self._add_delta(-other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        result = self._add_offset(-other)
    elif isinstance(other, (datetime, np.datetime64)):
        result = self._sub_datetimelike_scalar(other)
    elif lib.is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        maybe_integer_op_deprecated(self)
        result = self._time_shift(-other)
    elif isinstance(other, Period):
        result = self._sub_period(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        result = self._add_delta(-other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        result = self._addsub_offset_array(other, operator.sub)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        result = self._sub_datetime_arraylike(other)
    elif is_period_dtype(other):
        # PeriodIndex
        result = self._sub_period_array(other)
    elif is_integer_dtype(other):
        maybe_integer_op_deprecated(self)
        result = self._addsub_int_array(other, operator.sub)
    elif isinstance(other, ABCIndexClass):
        raise TypeError("cannot subtract {cls} and {typ}"
                        .format(cls=type(self).__name__,
                                typ=type(other).__name__))
    elif is_float_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot subtract {dtype}-dtype from {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    elif is_extension_array_dtype(other):
        # Categorical op will raise; defer explicitly
        return NotImplemented
    else:  # pragma: no cover
        return NotImplemented

    if is_timedelta64_dtype(result) and isinstance(result, np.ndarray):
        from pandas.core.arrays import TimedeltaArrayMixin
        # TODO: infer freq?
        return TimedeltaArrayMixin(result)
    return result
def __sub__(self, other):
    from pandas import Index

    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        result = self._sub_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        result = self._add_delta(-other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        result = self._add_offset(-other)
    elif isinstance(other, (datetime, np.datetime64)):
        result = self._sub_datelike(other)
    elif is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        result = self.shift(-other)
    elif isinstance(other, Period):
        result = self._sub_period(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        result = self._add_delta(-other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        result = self._addsub_offset_array(other, operator.sub)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        result = self._sub_datelike(other)
    elif isinstance(other, Index):
        raise TypeError("cannot subtract {cls} and {typ}"
                        .format(cls=type(self).__name__,
                                typ=type(other).__name__))
    elif is_integer_dtype(other) and self.freq is None:
        # GH#19123
        raise NullFrequencyError("Cannot shift with no freq")
    elif is_float_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot subtract {dtype}-dtype from {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    else:  # pragma: no cover
        return NotImplemented

    if result is NotImplemented:
        return NotImplemented
    elif not isinstance(result, Index):
        # Index.__new__ will choose appropriate subclass for dtype
        result = Index(result)

    res_name = ops.get_op_result_name(self, other)
    result.name = res_name
    return result
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True, mask=None, compute_mask=True):
    """ utility to get the values view, mask, dtype
    if necessary copy and mask using the specified fill_value
    copy = True will force the copy
    """

    if skipna:
        compute_mask = True

    if is_datetime64tz_dtype(values):
        # com.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = com.values_from_object(values)
        dtype = values.dtype

    if mask is None and compute_mask:
        if isfinite:
            mask = _isfinite(values)
        else:
            mask = isna(values)

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        # finding `mask` above
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if skipna:
        if copy:
            values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)
        # promote if needed
        else:
            values, changed = maybe_upcast_putmask(values, mask, fill_value)
    elif copy:
        values = values.copy()

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max, fill_value
def __add__(self, other):
    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        result = self._add_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        result = self._add_delta(other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        result = self._add_offset(other)
    elif isinstance(other, (datetime, np.datetime64)):
        result = self._add_datetimelike_scalar(other)
    elif lib.is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        maybe_integer_op_deprecated(self)
        result = self._time_shift(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        result = self._add_delta(other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        result = self._addsub_offset_array(other, operator.add)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        return self._add_datetime_arraylike(other)
    elif is_integer_dtype(other):
        maybe_integer_op_deprecated(self)
        result = self._addsub_int_array(other, operator.add)
    elif is_float_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot add {dtype}-dtype to {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    elif is_period_dtype(other):
        # if self is a TimedeltaArray and other is a PeriodArray with
        # a timedelta-like (i.e. Tick) freq, this operation is valid.
        # Defer to the PeriodArray implementation.
        # In remaining cases, this will end up raising TypeError.
        return NotImplemented
    elif is_extension_array_dtype(other):
        # Categorical op will raise; defer explicitly
        return NotImplemented
    else:  # pragma: no cover
        return NotImplemented

    if is_timedelta64_dtype(result) and isinstance(result, np.ndarray):
        from pandas.core.arrays import TimedeltaArrayMixin
        # TODO: infer freq?
        return TimedeltaArrayMixin(result)
    return result
def nansem(values, axis=None, skipna=True, ddof=1):
    # The first nanvar call is kept for its side effect: it raises early on
    # non-numeric input before any dtype coercion happens below.
    var = nanvar(values, axis, skipna, ddof=ddof)

    mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
    count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
    var = nanvar(values, axis, skipna, ddof=ddof)

    return np.sqrt(var) / np.sqrt(count)
def nansum(values, axis=None, skipna=True, min_count=0):
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)

    return _wrap_results(the_sum, dtype)
def integer_arithmetic_method(self, other):
    op_name = op.__name__
    mask = None

    if isinstance(other, (ABCSeries, ABCIndexClass)):
        # Rely on pandas to unbox and dispatch to us.
        return NotImplemented

    if getattr(other, 'ndim', 0) > 1:
        raise NotImplementedError(
            "can only perform ops with 1-d structures")

    if isinstance(other, IntegerArray):
        other, mask = other._data, other._mask
    elif getattr(other, 'ndim', None) == 0:
        other = other.item()
    elif is_list_like(other):
        other = np.asarray(other)
        if not other.ndim:
            other = other.item()
        elif other.ndim == 1:
            if not (is_float_dtype(other) or is_integer_dtype(other)):
                raise TypeError(
                    "can only perform ops with numeric values")
    else:
        if not (is_float(other) or is_integer(other)):
            raise TypeError("can only perform ops with numeric values")

    # nans propagate
    if mask is None:
        mask = self._mask
    else:
        mask = self._mask | mask

    # 1 ** np.nan is 1. So we have to unmask those.
    if op_name == 'pow':
        mask = np.where(self == 1, False, mask)
    elif op_name == 'rpow':
        mask = np.where(other == 1, False, mask)

    with np.errstate(all='ignore'):
        result = op(self._data, other)

    # divmod returns a tuple
    if op_name == 'divmod':
        div, mod = result
        return (self._maybe_mask_result(div, mask, other, 'floordiv'),
                self._maybe_mask_result(mod, mask, other, 'mod'))

    return self._maybe_mask_result(result, mask, other, op_name)
def _simple_new(cls, values, name=None, freq=None, **kwargs):
    """
    Values can be any type that can be coerced to Periods.
    Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
    """
    if not is_integer_dtype(values):
        values = np.array(values, copy=False)
        if len(values) > 0 and is_float_dtype(values):
            raise TypeError("PeriodIndex can't take floats")
        return cls(values, name=name, freq=freq, **kwargs)

    return cls._from_ordinals(values, name, freq, **kwargs)
def fill_zeros(result, x, y, name, fill):
    """
    if this is a reversed op, then flip x,y

    if we have an integer value (or array in y)
    and we have 0's, fill them with the fill,
    return the result

    mask the nan's from x
    """
    if fill is None or is_float_dtype(result):
        return result

    if name.startswith(('r', '__r')):
        x, y = y, x

    is_variable_type = (hasattr(y, 'dtype') or hasattr(y, 'type'))
    is_scalar_type = is_scalar(y)

    if not is_variable_type and not is_scalar_type:
        return result

    if is_scalar_type:
        y = np.array(y)

    if is_integer_dtype(y):

        if (y == 0).any():

            # GH 7325, mask and nans must be broadcastable (also: PR 9308)
            # Raveling and then reshaping makes np.putmask faster
            mask = ((y == 0) & ~np.isnan(result)).ravel()

            shape = result.shape
            result = result.astype('float64', copy=False).ravel()

            np.putmask(result, mask, fill)

            # if we have a fill of inf, then sign it correctly
            # (GH 6178 and PR 9308)
            if np.isinf(fill):
                signs = y if name.startswith(('r', '__r')) else x
                signs = np.sign(signs.astype('float', copy=False))
                negative_inf_mask = (signs.ravel() < 0) & mask
                np.putmask(result, negative_inf_mask, -fill)

            if "floordiv" in name:  # (PR 9308)
                nan_mask = ((y == 0) & (x == 0)).ravel()
                np.putmask(result, nan_mask, np.nan)

            result = result.reshape(shape)

    return result
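# Illustration of the masking idea in fill_zeros (pure NumPy, example values
# chosen here, not from the original source): positions where the integer
# divisor is zero get the fill value, signed by the numerator for an inf fill.
import numpy as np

x = np.array([1, -1, 2])
y = np.array([0, 0, 2])
with np.errstate(divide='ignore'):
    result = x / y
mask = (y == 0) & ~np.isnan(result)
result[mask] = np.inf
result[mask & (np.sign(x) < 0)] = -np.inf
print(result)  # [ inf -inf   1.]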
def _get_next_label(label):
    dtype = getattr(label, 'dtype', type(label))
    if isinstance(label, (Timestamp, Timedelta)):
        dtype = 'datetime64'
    if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
        return label + np.timedelta64(1, 'ns')
    elif is_integer_dtype(dtype):
        return label + 1
    elif is_float_dtype(dtype):
        return np.nextafter(label, np.infty)
    else:
        raise TypeError('cannot determine next label for type {typ!r}'
                        .format(typ=type(label)))
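# Illustration of the float and datetime branches above (example values, not
# from the original source): np.nextafter yields the next representable float,
# and datetimelike labels advance by one nanosecond.
import numpy as np

assert np.nextafter(1.0, np.inf) > 1.0
assert (np.datetime64('2000-01-01', 'ns') + np.timedelta64(1, 'ns')
        > np.datetime64('2000-01-01', 'ns'))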
def _get_prev_label(label):
    dtype = getattr(label, 'dtype', type(label))
    if isinstance(label, (Timestamp, Timedelta)):
        dtype = 'datetime64'
    if is_datetime_or_timedelta_dtype(dtype):
        return label - np.timedelta64(1, 'ns')
    elif is_integer_dtype(dtype):
        return label - 1
    elif is_float_dtype(dtype):
        return np.nextafter(label, -np.infty)
    else:
        raise TypeError('cannot determine previous label for type %r'
                        % type(label))
def _simple_new(cls, values, freq=None):
    """
    Values can be any type that can be coerced to Periods.
    Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
    """
    if not is_integer_dtype(values):
        values = np.array(values, copy=False)
        if len(values) > 0 and is_float_dtype(values):
            raise TypeError("{cls} can't take floats"
                            .format(cls=cls.__name__))
        return cls(values, freq=freq)

    return cls._from_ordinals(values, freq)
def __add__(self, other):
    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        result = self._add_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        result = self._add_delta(other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        result = self._add_offset(other)
    elif isinstance(other, (datetime, np.datetime64)):
        result = self._add_datelike(other)
    elif is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        result = self.shift(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        result = self._add_delta(other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        result = self._addsub_offset_array(other, operator.add)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        return self._add_datelike(other)
    elif is_integer_dtype(other):
        result = self._addsub_int_array(other, operator.add)
    elif is_float_dtype(other) or is_period_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot add {dtype}-dtype to {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    elif is_categorical_dtype(other):
        # Categorical op will raise; defer explicitly
        return NotImplemented
    else:  # pragma: no cover
        return NotImplemented

    if result is NotImplemented:
        return NotImplemented
    elif not isinstance(result, Index):
        # Index.__new__ will choose appropriate subclass for dtype
        result = Index(result)

    res_name = ops.get_op_result_name(self, other)
    result.name = res_name
    return result
def test_is_float_dtype():
    assert not com.is_float_dtype(str)
    assert not com.is_float_dtype(int)
    assert not com.is_float_dtype(pd.Series([1, 2]))
    assert not com.is_float_dtype(np.array(['a', 'b']))

    assert com.is_float_dtype(float)
    assert com.is_float_dtype(pd.Index([1, 2.]))
def test_setitem_dtype_upcast(self):

    # GH3216
    df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
    df['c'] = np.nan
    assert df['c'].dtype == np.float64

    df.loc[0, 'c'] = 'foo'
    expected = DataFrame([{"a": 1, "c": 'foo'},
                          {"a": 3, "b": 2, "c": np.nan}])
    tm.assert_frame_equal(df, expected)

    # GH10280
    df = DataFrame(np.arange(6, dtype='int64').reshape(2, 3),
                   index=list('ab'),
                   columns=['foo', 'bar', 'baz'])

    for val in [3.14, 'wxyz']:
        left = df.copy()
        left.loc['a', 'bar'] = val
        right = DataFrame([[0, val, 2], [3, 4, 5]],
                          index=list('ab'),
                          columns=['foo', 'bar', 'baz'])

        tm.assert_frame_equal(left, right)
        assert is_integer_dtype(left['foo'])
        assert is_integer_dtype(left['baz'])

    left = DataFrame(np.arange(6, dtype='int64').reshape(2, 3) / 10.0,
                     index=list('ab'),
                     columns=['foo', 'bar', 'baz'])
    left.loc['a', 'bar'] = 'wxyz'

    right = DataFrame([[0, 'wxyz', .2], [.3, .4, .5]],
                      index=list('ab'),
                      columns=['foo', 'bar', 'baz'])

    tm.assert_frame_equal(left, right)
    assert is_float_dtype(left['foo'])
    assert is_float_dtype(left['baz'])
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
    # GH 4343
    tm.skip_if_no_package('scipy')

    # Make one ndarray and from it one sparse matrix, both to be used for
    # constructing frames and comparing results
    arr = np.eye(3, dtype=dtype)
    # GH 16179
    arr[0, 1] = dtype(2)
    try:
        spm = spmatrix(arr)
        assert spm.dtype == arr.dtype
    except (TypeError, AssertionError):
        # If conversion to sparse fails for this spmatrix type and arr.dtype,
        # then the combination is not currently supported in NumPy, so we
        # can just skip testing it thoroughly
        return

    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
                             default_fill_value=fill_value)

    # Expected result construction is kind of tricky for all
    # dtype-fill_value combinations; easiest to cast to something generic
    # and except later on
    rarr = arr.astype(object)
    rarr[arr == 0] = np.nan
    expected = pd.SparseDataFrame(rarr, index=index, columns=columns).fillna(
        fill_value if fill_value is not None else np.nan)

    # Assert frame is as expected
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Assert spmatrices equal
    assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # Ensure dtype is preserved if possible
    was_upcast = ((fill_value is None or is_float(fill_value)) and
                  not is_object_dtype(dtype) and
                  not is_float_dtype(dtype))
    res_dtype = (bool if is_bool_dtype(dtype) else
                 float if was_upcast else
                 dtype)
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype

    # However, adding a str column results in an upcast to object
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    assert sdf.to_coo().dtype == np.object_
def test_set_value(self):
    with catch_warnings(record=True):
        for label in self.panel4d.labels:
            for item in self.panel4d.items:
                for mjr in self.panel4d.major_axis[::2]:
                    for mnr in self.panel4d.minor_axis:
                        self.panel4d.set_value(label, item, mjr, mnr, 1.)
                        tm.assert_almost_equal(
                            self.panel4d[label][item][mnr][mjr], 1.)

        res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5)
        assert is_float_dtype(res3['l4'].values)

        # resize
        res = self.panel4d.set_value('l4', 'ItemE', 'foo', 'bar', 1.5)
        assert isinstance(res, Panel4D)
        assert res is not self.panel4d
        assert res.get_value('l4', 'ItemE', 'foo', 'bar') == 1.5

        res3 = self.panel4d.set_value('l4', 'ItemE', 'foobar', 'baz', 5)
        assert is_float_dtype(res3['l4'].values)
def nanvar(values, axis=None, skipna=True, ddof=1):

    values = _values_from_object(values)
    dtype = values.dtype
    mask = isna(values)
    if is_any_int_dtype(values):
        values = values.astype('f8')
        values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
    else:
        count, d = _get_counts_nanvar(mask, axis, ddof)

    if skipna:
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)

    sqr = _ensure_numeric((avg - values)**2)
    np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)
    return _wrap_results(result, values.dtype)
def nanmean(values, axis=None, skipna=True, mask=None):
    """
    Compute the mean of the element along an axis ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanmean(s)
    1.5
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, 0, mask=mask)
    dtype_sum = dtype_max
    dtype_count = np.float64
    if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or
            is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, 'ndim', False):
        with np.errstate(all="ignore"):
            # suppress division by zero warnings
            the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)
def astype(self, dtype, copy=True):
    if is_object_dtype(dtype):
        return self._box_values_as_index()
    elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
        return Index(self.format(), name=self.name, dtype=object)
    elif is_integer_dtype(dtype):
        return Index(self.values.astype('i8', copy=copy), name=self.name,
                     dtype='i8')
    elif (is_datetime_or_timedelta_dtype(dtype) and
          not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
        # disallow conversion between datetime/timedelta,
        # and conversions for any datetimelike to float
        msg = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
    return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
def array_equivalent(
    left, right, strict_nan: bool = False, dtype_equal: bool = False
) -> bool:
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations.  False otherwise. It is assumed that left and
    right are NumPy arrays of the same dtype. The behavior of this function
    (particularly with respect to NaNs) is not defined if the dtypes are
    different.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.
    dtype_equal : bool, default False
        Whether `left` and `right` are known to have the same dtype
        according to `is_dtype_equal`. Some methods like `BlockManager.equals`
        require that the dtypes match. Setting this to ``True`` can improve
        performance, but will give different results for arrays that are
        equal but different dtypes.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    if dtype_equal:
        # fastpath when we require that the dtypes match (Block.equals)
        if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
            return _array_equivalent_float(left, right)
        elif is_datetimelike_v_numeric(left.dtype, right.dtype):
            return False
        elif needs_i8_conversion(left.dtype):
            return _array_equivalent_datetimelike(left, right)
        elif is_string_dtype(left.dtype):
            # TODO: fastpath for pandas' StringDtype
            return _array_equivalent_object(left, right, strict_nan)
        else:
            return np.array_equal(left, right)

    # Slow path when we allow comparing different dtypes.
    # Object arrays can contain None, NaN and NaT.
    # string dtypes must come to this path for NumPy 1.7.1 compat
    if is_string_dtype(left.dtype) or is_string_dtype(right.dtype):
        return _array_equivalent_object(left, right, strict_nan)

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
        if not (left.size and right.size):
            return True
        return ((left == right) | (isna(left) & isna(right))).all()

    elif is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    if (
        left.dtype.type is np.void or right.dtype.type is np.void
    ) and left.dtype != right.dtype:
        return False

    return np.array_equal(left, right)
def sanitize_array(data, index, dtype=None, copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy,
                                   raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:
            # we will try to copy by definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.
        if dtype is not None:
            subarr = data.astype(dtype)

        if copy:
            subarr = data.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)
        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                pass

    return subarr
def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetimetz(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless
        # there are only null blocks, when same upcasting rules must be
        # applied to null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslibs.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT
    else:  # pragma
        g = np.find_common_type(upcast_classes, [])
        if is_float_dtype(g):
            return g, g.type(np.nan)
        elif is_numeric_dtype(g):
            if has_none_blocks:
                return np.float64, np.nan
            else:
                return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    array : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype
        ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, 'dtype'):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data) or is_string_dtype(data):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data):
        # treat as multiples of the given unit.  If after converting to nanos,
        # there are fractional components left, these are truncated
        # (i.e. NOT rounded)
        mask = np.isnan(data)
        coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns')
        data = (coeff * data).astype(np.int64).view('timedelta64[ns]')
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    elif is_datetime64_dtype(data):
        # GH#23539
        warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
                      "deprecated, will raise a TypeError in a future "
                      "version",
                      FutureWarning, stacklevel=4)
        data = ensure_int64(data).view(_TD_DTYPE)

    else:
        raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]"
                        .format(dtype=data.dtype))

    data = np.array(data, copy=copy)
    assert data.dtype == 'm8[ns]', data
    return data, inferred_freq
def nanvar(values, axis=None, skipna=True, ddof=1, mask=None):
    """
    Compute the variance along given axis while ignoring NaNs

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is
        N - ddof, where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanvar(s)
    1.0
    """
    values = extract_array(values, extract_numpy=True)
    dtype = values.dtype
    mask = _maybe_get_mask(values, skipna, mask)
    if is_any_int_dtype(values):
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof,
                                      values.dtype)
    else:
        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)

    sqr = _ensure_numeric((avg - values)**2)
    if mask is not None:
        np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    return _wrap_results(result, values.dtype)
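# Standalone sketch of the two-pass algorithm referenced above (pure NumPy,
# not the pandas implementation): pass one computes the mean, pass two sums
# squared deviations, and the result is divided by N - ddof.
import numpy as np

def two_pass_var(x, ddof=1):
    x = np.asarray(x, dtype=np.float64)
    avg = x.mean()                     # first pass
    sqr = (x - avg) ** 2               # second pass
    return sqr.sum() / (x.size - ddof)

assert np.isclose(two_pass_var([1.0, 2.0, 3.0, 4.0]),
                  np.var([1.0, 2.0, 3.0, 4.0], ddof=1))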
def _get_empty_dtype_and_na(
        join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]:
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.dtype(np.float64), np.nan

    if _is_uniform_reindex(join_units):
        # FIXME: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = _get_upcast_classes(join_units, dtypes)

    # TODO: de-duplicate with maybe_promote?
    # create the result
    if "extension" in upcast_classes:
        if len(upcast_classes) == 1:
            cls = upcast_classes["extension"][0]
            return cls, cls.na_value
        else:
            return np.dtype("object"), np.nan
    elif "object" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif "category" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0], NaT
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]"), np.datetime64("NaT", "ns")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
    else:
        try:
            common_dtype = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(common_dtype):
                return common_dtype, common_dtype.type(np.nan)
            elif is_numeric_dtype(common_dtype):
                if has_none_blocks:
                    return np.dtype(np.float64), np.nan
                else:
                    return common_dtype, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
def _can_hold_na(self) -> bool:  # type: ignore[override]
    if is_float_dtype(self.dtype):
        return True
    else:
        return False
def nankurt(
    values: np.ndarray,
    axis: Optional[int] = None,
    skipna: bool = True,
    mask: Optional[np.ndarray] = None,
) -> float:
    """
    Compute the sample excess kurtosis

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 1, 3, 2])
    >>> nanops.nankurt(s)
    -1.2892561983471076
    """
    values = extract_array(values, extract_numpy=True)
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted**2
    adjusted4 = adjusted2**2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid="ignore", divide="ignore"):
        adj = 3 * (count - 1)**2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2**2

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_kurt follow this behavior
    # to fix the fperr to treat denom <1e-14 as zero
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    with np.errstate(invalid="ignore", divide="ignore"):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result
def coerce_to_array(values, dtype, mask=None, copy=False):
    """
    Coerce the input values array to numpy arrays with a mask

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
    mask : boolean 1D array, optional
    copy : boolean, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    # if values is an integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, 'dtype'):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if (isinstance(dtype, string_types) and
                (dtype.startswith("Int") or dtype.startswith("UInt"))):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), _IntegerDtype):
            try:
                dtype = _dtypes[str(np.dtype(dtype))]
            except KeyError:
                raise ValueError("invalid dtype specified {}".format(dtype))

    if isinstance(values, IntegerArray):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values)
        if inferred_type == 'mixed' and isna(values).all():
            values = np.empty(len(values))
            values.fill(np.nan)
        elif inferred_type not in ['floating', 'integer',
                                   'mixed-integer', 'mixed-integer-float']:
            raise TypeError("{} cannot be converted to an IntegerDtype"
                            .format(values.dtype))

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError("{} cannot be converted to an IntegerDtype"
                        .format(values.dtype))

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if not values.ndim == 1:
        raise TypeError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype('int64')
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = 1
        values = safe_cast(values, dtype, copy=False)
    else:
        values = safe_cast(values, dtype, copy=False)

    return values, mask
def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Utility to get the values view, mask, dtype, dtype_max, and fill_value.

    If both mask and fill_value/fill_value_typ are not None and skipna is
    True, the values array will be copied.

    For input arrays of boolean or integer dtypes, copies will only occur if
    a precomputed mask, a fill_value/fill_value_typ, and skipna=True are
    provided.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : np.dtype
        dtype for values
    dtype_max : np.dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """
    # In _get_values is only called from within nanops, and in all cases
    # with scalar fill_value.  This guarantee is important for the
    # maybe_upcast_putmask call below
    assert is_scalar(fill_value)
    values = extract_array(values, extract_numpy=True)

    mask = _maybe_get_mask(values, skipna, mask)

    dtype = values.dtype

    if needs_i8_conversion(values.dtype):
        # changing timedelta64/datetime64 to int64 needs to happen after
        # finding `mask` above
        values = np.asarray(values.view("i8"))

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if skipna and (mask is not None) and (fill_value is not None):
        values = values.copy()
        if dtype_ok and mask.any():
            np.putmask(values, mask, fill_value)
        # promote if needed
        else:
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.dtype(np.int64)
    elif is_float_dtype(dtype):
        dtype_max = np.dtype(np.float64)

    return values, mask, dtype, dtype_max, fill_value
def nanmedian(values, axis=None, skipna=True, mask=None):
    """
    Parameters
    ----------
    values : ndarray
    axis: int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s)
    2.0
    """
    def get_median(x):
        mask = notna(x)
        if not skipna and not mask.all():
            return np.nan
        return np.nanmedian(x[mask])

    values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask)
    if not is_float_dtype(values.dtype):
        try:
            values = values.astype("f8")
        except ValueError as err:
            # e.g. "could not convert string to float: 'a'"
            raise TypeError from err
        if mask is not None:
            values[mask] = np.nan

    if axis is None:
        values = values.ravel()

    notempty = values.size

    # an array from a frame
    if values.ndim > 1:

        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                return _wrap_results(
                    np.apply_along_axis(get_median, axis, values), dtype)

            # fastpath for the skipna case
            return _wrap_results(np.nanmedian(values, axis), dtype)

        # must return the correct shape, but median is not defined for the
        # empty set so return nans of shape "everything but the passed axis"
        # since "axis" is where the reduction would occur if we had a nonempty
        # array
        shp = np.array(values.shape)
        dims = np.arange(values.ndim)
        ret = np.empty(shp[dims != axis])
        ret.fill(np.nan)
        return _wrap_results(ret, dtype)

    # otherwise return a scalar value
    return _wrap_results(get_median(values) if notempty else np.nan, dtype)
def _ea_wrap_cython_operation(
    self,
    cy_op: WrappedCythonOp,
    kind: str,
    values,
    how: str,
    axis: int,
    min_count: int = -1,
    **kwargs,
) -> ArrayLike:
    """
    If we have an ExtensionArray, unwrap, call _cython_operation, and
    re-wrap if appropriate.
    """
    # TODO: general case implementation overridable by EAs.
    orig_values = values

    if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
        # All of the functions implemented here are ordinal, so we can
        # operate on the tz-naive equivalents
        npvalues = values.view("M8[ns]")
        res_values = self._cython_operation(
            kind, npvalues, how, axis, min_count, **kwargs)
        if how in ["rank"]:
            # i.e. how in WrappedCythonOp.cast_blocklist, since
            # other cast_blocklist methods dont go through cython_operation
            # preserve float64 dtype
            return res_values

        res_values = res_values.astype("i8", copy=False)
        result = type(orig_values)(res_values, dtype=orig_values.dtype)
        return result

    elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
        # IntegerArray or BooleanArray
        values = values.to_numpy("float64", na_value=np.nan)
        res_values = self._cython_operation(
            kind, values, how, axis, min_count, **kwargs)
        if how in ["rank"]:
            # i.e. how in WrappedCythonOp.cast_blocklist, since
            # other cast_blocklist methods dont go through cython_operation
            return res_values

        dtype = cy_op.get_result_dtype(orig_values.dtype)
        # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]"
        # has no attribute "construct_array_type"
        cls = dtype.construct_array_type()  # type: ignore[union-attr]
        return cls._from_sequence(res_values, dtype=dtype)

    elif is_float_dtype(values.dtype):
        # FloatingArray
        values = values.to_numpy(values.dtype.numpy_dtype, na_value=np.nan)
        res_values = self._cython_operation(
            kind, values, how, axis, min_count, **kwargs)
        if how in ["rank"]:
            # i.e. how in WrappedCythonOp.cast_blocklist, since
            # other cast_blocklist methods dont go through cython_operation
            return res_values

        dtype = cy_op.get_result_dtype(orig_values.dtype)
        # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]"
        # has no attribute "construct_array_type"
        cls = dtype.construct_array_type()  # type: ignore[union-attr]
        return cls._from_sequence(res_values, dtype=dtype)

    raise NotImplementedError(
        f"function is not implemented for this dtype: {values.dtype}")
def __floordiv__(self, other):

    if is_scalar(other):
        if isinstance(other, self._recognized_scalars):
            other = Timedelta(other)
            if other is NaT:
                # treat this specifically as timedelta-NaT
                result = np.empty(self.shape, dtype=np.float64)
                result.fill(np.nan)
                return result

            # dispatch to Timedelta implementation
            result = other.__rfloordiv__(self._data)
            return result

        # at this point we should only have numeric scalars; anything
        # else will raise
        result = self.asi8 // other
        result[self._isnan] = iNaT
        freq = None
        if self.freq is not None:
            # Note: freq gets division, not floor-division
            freq = self.freq / other
            if freq.nanos == 0 and self.freq.nanos != 0:
                # e.g. if self.freq is Nano(1) then dividing by 2
                # rounds down to zero
                freq = None
        return type(self)(result.view("m8[ns]"), freq=freq)

    if not hasattr(other, "dtype"):
        # list, tuple
        other = np.array(other)
    if len(other) != len(self):
        raise ValueError("Cannot divide with unequal lengths")

    elif is_timedelta64_dtype(other.dtype):
        other = type(self)(other)

        # numpy timedelta64 does not natively support floordiv, so operate
        # on the i8 values
        result = self.asi8 // other.asi8
        mask = self._isnan | other._isnan
        if mask.any():
            result = result.astype(np.float64)
            result[mask] = np.nan
        return result

    elif is_object_dtype(other.dtype):
        result = [self[n] // other[n] for n in range(len(self))]
        result = np.array(result)
        if lib.infer_dtype(result, skipna=False) == "timedelta":
            result, _ = sequence_to_td64ns(result)
            return type(self)(result)
        return result

    elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
        result = self._data // other
        return type(self)(result)

    else:
        dtype = getattr(other, "dtype", type(other).__name__)
        raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
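# Illustration of the i8-based floor division above (pure NumPy, example
# values chosen here, not from the original source): timedelta64 floordiv is
# emulated by dividing the underlying int64 nanosecond counts.
import numpy as np

a = np.array([10, 25], dtype='m8[ns]')
b = np.array([3, 7], dtype='m8[ns]')
print(a.view('i8') // b.view('i8'))  # [3 3]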
def __floordiv__(self, other): if is_scalar(other): if isinstance(other, self._recognized_scalars): other = Timedelta(other) # mypy assumes that __new__ returns an instance of the class # github.com/python/mypy/issues/1020 if cast("Timedelta | NaTType", other) is NaT: # treat this specifically as timedelta-NaT result = np.empty(self.shape, dtype=np.float64) result.fill(np.nan) return result # dispatch to Timedelta implementation return other.__rfloordiv__(self._ndarray) # at this point we should only have numeric scalars; anything # else will raise result = self._ndarray // other freq = None if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other if freq.nanos == 0 and self.freq.nanos != 0: # e.g. if self.freq is Nano(1) then dividing by 2 # rounds down to zero freq = None return type(self)(result, freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") elif is_timedelta64_dtype(other.dtype): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): result = result.astype(np.float64) np.putmask(result, mask, np.nan) return result elif is_object_dtype(other.dtype): # error: Incompatible types in assignment (expression has type # "List[Any]", variable has type "ndarray") srav = self.ravel() orav = other.ravel() res_list = [srav[n] // orav[n] for n in range(len(srav))] result_flat = np.asarray(res_list) inferred = lib.infer_dtype(result_flat, skipna=False) result = result_flat.reshape(self.shape) if inferred == "timedelta": result, _ = sequence_to_td64ns(result) return type(self)(result) if inferred == "datetime": # GH#39750 occurs when result is all-NaT, which in this # case should be interpreted as td64nat. This can only # occur when self is all-td64nat return self * np.nan return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): result = self._ndarray // other return type(self)(result) else: dtype = getattr(other, "dtype", type(other).__name__) raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
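A rough standalone illustration of why both versions above drop to the integer nanosecond (asi8) representation for the array // array case: NumPy's timedelta64 has no native floordiv, so the division happens on int64 values and NaT slots are patched afterwards. This is plain NumPy/pandas usage, not the method itself.

import numpy as np
import pandas as pd

tdi = pd.to_timedelta(["1 hour", "90 min", pd.NaT])
other = pd.to_timedelta(["30 min", "60 min", "45 min"])

# floor-divide the underlying int64 nanosecond values
result = tdi.asi8 // other.asi8

# patch positions where either operand was NaT, as the method does
mask = np.asarray(tdi.isna()) | np.asarray(other.isna())
result = result.astype(np.float64)
result[mask] = np.nan
print(result)   # [ 2.  1. nan]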
def sanitize_array( data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False ): """ Sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified. """ if dtype is not None: dtype = pandas_dtype(dtype) if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: if copy: subarr = data.copy() else: subarr = np.array(data, copy=False) else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH if getattr(subarr, "ndim", 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: value = data # figure out the dtype from the value (upcast if necessary) if dtype is None: dtype, value = infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype) else: return subarr.item() # the result that we want elif subarr.ndim == 1: if index is not None: # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( subarr[0], len(index), subarr.dtype ) elif subarr.ndim > 1: if isinstance(data, np.ndarray): raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, subarr has already the result if not lib.is_scalar(data): if not np.all(isna(data)): data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) if inferred == "period": from pandas.core.arrays import period_array try: subarr = period_array(subarr) except IncompatibleFrequency: pass return subarr
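The "possibility of nan -> garbage" branch above guards the same situation a user hits when passing float data containing NaN while requesting an integer dtype. A small constructor-level sketch; the exact behavior (warn versus raise versus silently keeping float) depends on the pandas version, as the newer sanitize_array further down shows.

import numpy as np
import pandas as pd

data = np.array([1.0, 2.0, np.nan])

try:
    s = pd.Series(data, dtype="int64")       # NaN cannot be represented in int64
except ValueError as err:                    # IntCastingNaNError subclasses ValueError
    print("refused lossy cast:", err)
else:
    print(s.dtype, s.tolist())               # older versions silently kept float64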
def test_constructor(self): exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_) c1 = Categorical(exp_arr) tm.assert_numpy_array_equal(c1.__array__(), exp_arr) c2 = Categorical(exp_arr, categories=["a", "b", "c"]) tm.assert_numpy_array_equal(c2.__array__(), exp_arr) c2 = Categorical(exp_arr, categories=["c", "b", "a"]) tm.assert_numpy_array_equal(c2.__array__(), exp_arr) # categories must be unique msg = "Categorical categories must be unique" with pytest.raises(ValueError, match=msg): Categorical([1, 2], [1, 2, 2]) with pytest.raises(ValueError, match=msg): Categorical(["a", "b"], ["a", "b", "b"]) # The default should be unordered c1 = Categorical(["a", "b", "c", "a"]) assert not c1.ordered # Categorical as input c1 = Categorical(["a", "b", "c", "a"]) c2 = Categorical(c1) tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) c2 = Categorical(c1) tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) c2 = Categorical(c1) tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) c2 = Categorical(c1, categories=["a", "b", "c"]) tm.assert_numpy_array_equal(c1.__array__(), c2.__array__()) tm.assert_index_equal(c2.categories, Index(["a", "b", "c"])) # Series of dtype category c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) c2 = Categorical(Series(c1)) tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"]) c2 = Categorical(Series(c1)) tm.assert_categorical_equal(c1, c2) # Series c1 = Categorical(["a", "b", "c", "a"]) c2 = Categorical(Series(["a", "b", "c", "a"])) tm.assert_categorical_equal(c1, c2) c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]) c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"]) tm.assert_categorical_equal(c1, c2) # This should result in integer categories, not float! cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) assert is_integer_dtype(cat.categories) # https://github.com/pandas-dev/pandas/issues/3678 cat = Categorical([np.nan, 1, 2, 3]) assert is_integer_dtype(cat.categories) # this should result in floats cat = Categorical([np.nan, 1, 2.0, 3]) assert is_float_dtype(cat.categories) cat = Categorical([np.nan, 1.0, 2.0, 3.0]) assert is_float_dtype(cat.categories) # This doesn't work -> this would probably need some kind of "remember # the original type" feature to try to cast the array interface result # to... 
# vals = np.asarray(cat[cat.notna()]) # assert is_integer_dtype(vals) # corner cases cat = Categorical([1]) assert len(cat.categories) == 1 assert cat.categories[0] == 1 assert len(cat.codes) == 1 assert cat.codes[0] == 0 cat = Categorical(["a"]) assert len(cat.categories) == 1 assert cat.categories[0] == "a" assert len(cat.codes) == 1 assert cat.codes[0] == 0 with tm.assert_produces_warning(FutureWarning): # GH#38433 cat = Categorical(1) assert len(cat.categories) == 1 assert cat.categories[0] == 1 assert len(cat.codes) == 1 assert cat.codes[0] == 0 # two arrays # - when the first is an integer dtype and the second is not # - when the resulting codes are all -1/NaN with tm.assert_produces_warning(None): c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) with tm.assert_produces_warning(None): c_old = Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # noqa # the next one is from the old docs with tm.assert_produces_warning(None): c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) # noqa cat = Categorical([1, 2], categories=[1, 2, 3]) # this is a legitimate constructor with tm.assert_produces_warning(None): c = Categorical( # noqa np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True )
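The integer-versus-float categories behavior asserted in the test reads more clearly in isolation; a short sketch assuming a reasonably recent pandas:

import numpy as np
import pandas as pd

# NaN only produces a missing code (-1), so the categories stay integer
cat = pd.Categorical([np.nan, 1, 2, 3])
print(cat.categories.dtype)   # int64
print(cat.codes)              # [-1  0  1  2]

# an explicit float anywhere in the data makes the categories float
cat = pd.Categorical([np.nan, 1, 2.0, 3])
print(cat.categories.dtype)   # float64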
def putmask_smart(values: np.ndarray, mask: np.ndarray, new) -> np.ndarray: """ Return a new ndarray, try to preserve dtype if possible. Parameters ---------- values : np.ndarray `values`, updated in-place. mask : np.ndarray[bool] Applies to both sides (array like). new : `new values` either scalar or an array like aligned with `values` Returns ------- values : ndarray with updated values this *may* be a copy of the original See Also -------- ndarray.putmask """ # we cannot use np.asarray() here as we cannot have conversions # that numpy does when numeric are mixed with strings # n should be the length of the mask or a scalar here if not is_list_like(new): new = np.repeat(new, len(mask)) # see if we are only masking values that if putted # will work in the current dtype try: nn = new[mask] except TypeError: # TypeError: only integer scalar arrays can be converted to a scalar index pass else: # make sure that we have a nullable type if we have nulls if not isna_compat(values, nn[0]): pass elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): # only compare integers/floats pass elif not (is_float_dtype(values.dtype) or is_integer_dtype(values.dtype)): # only compare integers/floats pass else: # we ignore ComplexWarning here with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", np.ComplexWarning) nn_at = nn.astype(values.dtype) comp = nn == nn_at if is_list_like(comp) and comp.all(): nv = values.copy() nv[mask] = nn_at return nv new = np.asarray(new) if values.dtype.kind == new.dtype.kind: # preserves dtype if possible return _putmask_preserve(values, new, mask) dtype = find_common_type([values.dtype, new.dtype]) values = values.astype(dtype) return _putmask_preserve(values, new, mask)
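A hedged, NumPy-only sketch of the two outcomes putmask_smart distinguishes: masked-in values that round-trip losslessly into the existing dtype keep it, while values that do not fit force an upcast to a common dtype first. This is not the pandas helper itself, just the idea.

import numpy as np

values = np.array([1, 2, 3, 4], dtype="int64")
mask = np.array([False, True, False, True])

# new values that survive the cast to int64 unchanged -> dtype preserved
new = np.array([2.0, 20.0, 4.0, 40.0])
out = values.copy()
out[mask] = new[mask].astype(values.dtype)
print(out.dtype, out)           # int64 [ 1 20  3 40]

# non-integral new values do not fit -> upcast to a common dtype first
new = np.array([0.0, 2.5, 0.0, 4.5])
out = values.astype(np.result_type(values.dtype, new.dtype))
out[mask] = new[mask]
print(out.dtype, out)           # float64 [1.  2.5 3.  4.5]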
def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): """ Parameters ---------- data : list-like copy : bool, default False unit : str, optional The timedelta unit to treat integers as multiples of. For numeric data this defaults to ``'ns'``. Must be un-specified if the data contains a str and ``errors=="raise"``. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None if unit is not None: unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data elif isinstance(data, IntegerArray): data = data.to_numpy("int64", na_value=tslibs.iNaT) elif is_categorical_dtype(data.dtype): data = data.categories.take(data.codes, fill_value=NaT)._values copy = False # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int mask = np.isnan(data) m, p = precision_from_unit(unit or "ns") base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != TD64NS_DTYPE: # non-nano unit # TODO: watch out for overflows data = data.astype(TD64NS_DTYPE) copy = False else: # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError( f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) assert data.dtype == "m8[ns]", data return data, inferred_freq
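The base/fraction split used for float input above can be checked with a small standalone computation. The multiplier and precision for unit='s' are written out by hand here rather than taken from precision_from_unit, so treat the constants as illustrative.

import numpy as np

data = np.array([1.5, 2.000000001])     # seconds, as floats
m, p = 1_000_000_000, 9                 # nanoseconds per second, decimal precision

base = data.astype(np.int64)            # whole seconds
frac = np.round(data - base, p)         # fractional part, rounded to unit precision
td64 = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
print(td64)   # [1500000000 2000000001] with dtype timedelta64[ns]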
def period_array( data: Sequence[Period | str | None] | AnyArrayLike, freq: str | Tick | None = None, copy: bool = False, ) -> PeriodArray: """ Construct a new PeriodArray from a sequence of Period scalars. Parameters ---------- data : Sequence of Period objects A sequence of Period objects. These are required to all have the same ``freq.`` Missing values can be indicated by ``None`` or ``pandas.NaT``. freq : str, Tick, or Offset The frequency of every element of the array. This can be specified to avoid inferring the `freq` from `data`. copy : bool, default False Whether to ensure a copy of the data is made. Returns ------- PeriodArray See Also -------- PeriodArray pandas.PeriodIndex Examples -------- >>> period_array([pd.Period('2017', freq='A'), ... pd.Period('2018', freq='A')]) <PeriodArray> ['2017', '2018'] Length: 2, dtype: period[A-DEC] >>> period_array([pd.Period('2017', freq='A'), ... pd.Period('2018', freq='A'), ... pd.NaT]) <PeriodArray> ['2017', '2018', 'NaT'] Length: 3, dtype: period[A-DEC] Integers that look like years are handled >>> period_array([2000, 2001, 2002], freq='D') <PeriodArray> ['2000-01-01', '2001-01-01', '2002-01-01'] Length: 3, dtype: period[D] Datetime-like strings may also be passed >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q') <PeriodArray> ['2000Q1', '2000Q2', '2000Q3', '2000Q4'] Length: 4, dtype: period[Q-DEC] """ data_dtype = getattr(data, "dtype", None) if is_datetime64_dtype(data_dtype): return PeriodArray._from_datetime64(data, freq) if is_period_dtype(data_dtype): return PeriodArray(data, freq=freq) # other iterable of some kind if not isinstance(data, (np.ndarray, list, tuple, ABCSeries)): data = list(data) arrdata = np.asarray(data) dtype: PeriodDtype | None if freq: dtype = PeriodDtype(freq) else: dtype = None if is_float_dtype(arrdata) and len(arrdata) > 0: raise TypeError( "PeriodIndex does not allow floating point in construction") if is_integer_dtype(arrdata.dtype): arr = arrdata.astype(np.int64, copy=False) # error: Argument 2 to "from_ordinals" has incompatible type "Union[str, # Tick, None]"; expected "Union[timedelta, BaseOffset, str]" ordinals = libperiod.from_ordinals(arr, freq) # type: ignore[arg-type] return PeriodArray(ordinals, dtype=dtype) data = ensure_object(arrdata) return PeriodArray._from_sequence(data, dtype=dtype)
def __sub__(self, other): from pandas import Index other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): result = self._addsub_int_array(other, operator.sub) elif isinstance(other, Index): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_categorical_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
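The dispatch chain above is easiest to see from the user side. A few representative calls (the exact error message wording varies across versions):

import numpy as np
import pandas as pd

dti = pd.date_range("2023-01-01", periods=3, freq="D")

print(dti - pd.Timestamp("2023-01-01"))   # datetime - datetime -> TimedeltaIndex
print(dti - pd.Timedelta("1D"))           # datetime - timedelta -> shifted DatetimeIndex

try:
    dti - np.array([1.5, 2.5, 3.5])       # float-dtype other is explicitly rejected
except TypeError as err:
    print("TypeError:", err)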
def nanmean( values: np.ndarray, *, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, ) -> float: """ Compute the mean of the element along an axis ignoring NaNs Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- float Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanmean(s) 1.5 """ orig_values = values values, mask, dtype, dtype_max, _ = _get_values( values, skipna, fill_value=0, mask=mask ) dtype_sum = dtype_max dtype_count = np.float64 # not using needs_i8_conversion because that includes period datetimelike = False if dtype.kind in ["m", "M"]: datetimelike = True dtype_sum = np.float64 elif is_integer_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype count = _get_counts(values.shape, mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, "ndim", False): count = cast(np.ndarray, count) with np.errstate(all="ignore"): # suppress division by zero warnings the_mean = the_sum / count ct_mask = count == 0 if ct_mask.any(): the_mean[ct_mask] = np.nan else: the_mean = the_sum / count if count > 0 else np.nan the_mean = _wrap_results(the_mean, dtype) if datetimelike and not skipna: the_mean = _mask_datetimelike_result(the_mean, axis, mask, orig_values) return the_mean
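A compact NumPy-only restatement of the sum/count arithmetic nanmean performs for a float array with missing values (dtype promotion and the datetimelike paths are omitted):

import numpy as np

values = np.array([1.0, 2.0, np.nan, 4.0])
mask = np.isnan(values)

count = (~mask).sum()                        # number of valid observations
the_sum = np.where(mask, 0, values).sum()    # NaNs contribute zero to the sum
print(the_sum / count)                       # 2.333..., matches np.nanmean(values)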
def sanitize_array( data, index: Index | None, dtype: DtypeObj | None = None, copy: bool = False, raise_cast_failure: bool = True, *, allow_2d: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. Parameters ---------- data : Any index : Index or None, default None dtype : np.dtype, ExtensionDtype, or None, default None copy : bool, default False raise_cast_failure : bool, default True allow_2d : bool, default False If False, raise if we have a 2D Arraylike. Returns ------- np.ndarray or ExtensionArray Notes ----- raise_cast_failure=False is only intended to be True when called from the DataFrame constructor, as the dtype keyword there may be interpreted as only applying to a subset of columns, see GH#24435. """ if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) if isinstance(dtype, PandasDtype): # Avoid ending up with a PandasArray dtype = dtype.numpy_dtype # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: dtype = data.dtype data = lib.item_from_zerodim(data) elif isinstance(data, range): # GH#16804 data = range_to_ndarray(data) copy = False if not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data # GH#846 if isinstance(data, np.ndarray): if isinstance(data, np.matrix): data = data.A if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except IntCastingNaNError: warnings.warn( "In a future version, passing float-dtype values containing NaN " "and an integer dtype will raise IntCastingNaNError " "(subclass of ValueError) instead of silently ignoring the " "passed dtype. To retain the old behavior, call Series(arr) or " "DataFrame(arr) without passing a dtype.", FutureWarning, stacklevel=find_stack_level(), ) subarr = np.array(data, copy=copy) except ValueError: if not raise_cast_failure: # i.e. called via DataFrame constructor warnings.warn( "In a future version, passing float-dtype values and an " "integer dtype to DataFrame will retain floating dtype " "if they cannot be cast losslessly (matching Series behavior). " "To retain the old behavior, use DataFrame(data).astype(dtype)", FutureWarning, stacklevel=find_stack_level(), ) # GH#40110 until the deprecation is enforced, we _dont_ # ignore the dtype for DataFrame, and _do_ cast even though # it is lossy. dtype = cast(np.dtype, dtype) return np.array(data, dtype=dtype, copy=copy) subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() else: if isinstance(data, (set, frozenset)): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError(f"'{type(data).__name__}' type is unordered") # materialize e.g. generators, convert e.g. tuples, abc.ValueView if hasattr(data, "__array__"): # e.g. 
dask array GH#38645 data = np.asarray(data) else: data = list(data) if dtype is not None or len(data) == 0: try: subarr = _try_cast(data, dtype, copy, raise_cast_failure) except ValueError: casted = np.array(data, copy=False) if casted.dtype.kind == "f" and is_integer_dtype(dtype): # GH#40110 match the behavior we have if we passed # a ndarray[float] to begin with return sanitize_array( casted, index, dtype, copy=False, raise_cast_failure=raise_cast_failure, allow_2d=allow_2d, ) else: raise else: subarr = maybe_convert_platform(data) if subarr.dtype == object: subarr = cast(np.ndarray, subarr) subarr = maybe_infer_to_datetimelike(subarr) subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d) if isinstance(subarr, np.ndarray): # at this point we should have dtype be None or subarr.dtype == dtype dtype = cast(np.dtype, dtype) subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) return subarr
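The scalar branch near the top of this newer sanitize_array (construct_1d_arraylike_from_scalar) is what lets a bare scalar plus an index broadcast in the Series constructor; a small usage sketch:

import pandas as pd

s = pd.Series(3.5, index=range(4))   # scalar is broadcast to the index length
print(s.tolist())                    # [3.5, 3.5, 3.5, 3.5]
print(s.dtype)                       # float64, inferred from the scalar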
def array_equivalent(left, right, strict_nan: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and right are NumPy arrays of the same dtype. The behavior of this function (particularly with respect to NaNs) is not defined if the dtypes are different. Parameters ---------- left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. Returns ------- b : bool Returns True if the arrays are equivalent. Examples -------- >>> array_equivalent( ... np.array([1, 2, np.nan]), ... np.array([1, 2, np.nan])) True >>> array_equivalent( ... np.array([1, np.nan, 2]), ... np.array([1, 2, np.nan])) False """ left, right = np.asarray(left), np.asarray(right) # shape compat if left.shape != right.shape: return False # Object arrays can contain None, NaN and NaT. # string dtypes must come through this path for NumPy 1.7.1 compat if is_string_dtype(left) or is_string_dtype(right): if not strict_nan: # isna considers NaN and None to be equivalent. return lib.array_equivalent_object( ensure_object(left.ravel()), ensure_object(right.ravel()) ) for left_value, right_value in zip(left, right): if left_value is NaT and right_value is not NaT: return False elif left_value is libmissing.NA and right_value is not libmissing.NA: return False elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False else: try: if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False elif "boolean value of NA is ambiguous" in str(err): return False raise return True # NaNs can occur in float and complex arrays. if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): # empty if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() elif is_datetimelike_v_numeric(left, right): # GH#29553 avoid numpy deprecation warning return False elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # datetime64, timedelta64, Period if not is_dtype_equal(left.dtype, right.dtype): return False left = left.view("i8") right = right.view("i8") # if we have structured dtypes, compare first if left.dtype.type is np.void or right.dtype.type is np.void: if left.dtype != right.dtype: return False return np.array_equal(left, right)
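Usage of the helper above; it lives in the private pandas.core.dtypes.missing module, so the import path is shown only for illustration:

import numpy as np
from pandas.core.dtypes.missing import array_equivalent

print(array_equivalent(np.array([1.0, np.nan]), np.array([1.0, np.nan])))  # True: NaNs align
print(array_equivalent(np.array([1.0, np.nan]), np.array([np.nan, 1.0])))  # False: positions differ

# object dtype: None and NaN compare as equivalent unless strict_nan=True
left = np.array([None, "a"], dtype=object)
right = np.array([np.nan, "a"], dtype=object)
print(array_equivalent(left, right))                   # True
print(array_equivalent(left, right, strict_nan=True))  # False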
def _get_values(values: np.ndarray, skipna: bool, fill_value: Any = None, fill_value_typ: Optional[str] = None, mask: Optional[np.ndarray] = None ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. If both mask and fill_value/fill_value_typ are not None and skipna is True, the values array will be copied. For input arrays of boolean or integer dtypes, copies will only occur if a precomputed mask, a fill_value/fill_value_typ, and skipna=True are provided. Parameters ---------- values : ndarray input array to potentially compute mask for skipna : bool boolean for whether NaNs should be skipped fill_value : Any value to fill NaNs with fill_value_typ : str Set to '+inf' or '-inf' to handle dtype-specific infinities mask : Optional[np.ndarray] nan-mask if known Returns ------- values : ndarray Potential copy of input value array mask : Optional[ndarray[bool]] Mask for values, if deemed necessary to compute dtype : dtype dtype for values dtype_max : dtype platform independent dtype fill_value : Any fill value used """ mask = _maybe_get_mask(values, skipna, mask) if is_datetime64tz_dtype(values): # com.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = com.values_from_object(values) dtype = values.dtype if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) copy = (mask is not None) and (fill_value is not None) if skipna and copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def __floordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): return NotImplemented other = lib.item_from_zerodim(other) if is_scalar(other): if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT result = np.empty(self.shape, dtype=np.float64) result.fill(np.nan) return result # dispatch to Timedelta implementation result = other.__rfloordiv__(self._data) return result # at this point we should only have numeric scalars; anything # else will raise result = self.asi8 // other result[self._isnan] = iNaT freq = None if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other return type(self)(result.view('m8[ns]'), freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") elif is_timedelta64_dtype(other): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): result = result.astype(np.float64) result[mask] = np.nan return result elif is_object_dtype(other): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) if lib.infer_dtype(result) == 'timedelta': result, _ = sequence_to_td64ns(result) return type(self)(result) return result elif is_integer_dtype(other) or is_float_dtype(other): result = self._data // other return type(self)(result) else: dtype = getattr(other, "dtype", type(other).__name__) raise TypeError("Cannot divide {typ} by {cls}" .format(typ=dtype, cls=type(self).__name__))
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True): result = IntervalMixin.__new__(cls) closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): msg = f"dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) elif is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): msg = (f"must not have differing left [{type(left).__name__}] and " f"right [{type(right).__name__}] types") raise ValueError(msg) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ("category, object, and string subtypes are not supported " "for IntervalArray") raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): msg = ("left and right must have the same time zone, got " f"'{left.tz}' and '{right.tz}'") raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray left = ensure_wrapped_if_datetimelike(left) left = extract_array(left, extract_numpy=True) right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) lbase = getattr(left, "_ndarray", left).base rbase = getattr(right, "_ndarray", right).base if lbase is not None and lbase is rbase: # If these share data, then setitem could corrupt our IA right = right.copy() result._left = left result._right = right result._closed = closed if verify_integrity: result._validate() return result
def _arith_method(self, other, op): op_name = op.__name__ omask = None if getattr(other, "ndim", 0) > 1: raise NotImplementedError( "can only perform ops with 1-d structures") if isinstance(other, IntegerArray): other, omask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) if other.ndim > 1: raise NotImplementedError( "can only perform ops with 1-d structures") if len(self) != len(other): raise ValueError("Lengths must match") if not (is_float_dtype(other) or is_integer_dtype(other)): raise TypeError("can only perform ops with numeric values") elif isinstance(other, (timedelta, np.timedelta64)): other = Timedelta(other) else: if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") if omask is None: mask = self._mask.copy() if other is libmissing.NA: mask |= True else: mask = self._mask | omask if op_name == "pow": # 1 ** x is 1. mask = np.where((self._data == 1) & ~self._mask, False, mask) # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: mask = np.where((other == 1) & ~omask, False, mask) elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. mask = np.where((self._data == 0) & ~self._mask, False, mask) if other is libmissing.NA: result = np.ones_like(self._data) else: with np.errstate(all="ignore"): result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": div, mod = result return ( self._maybe_mask_result(div, mask, other, "floordiv"), self._maybe_mask_result(mod, mask, other, "mod"), ) return self._maybe_mask_result(result, mask, other, op_name)
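The pow special-casing above (1 ** x and x ** 0 are defined even when the other operand is missing) is observable directly with nullable integers:

import pandas as pd

a = pd.array([1, 2, 0], dtype="Int64")
print(a ** pd.NA)   # [1, <NA>, <NA>]: 1 ** x is 1 regardless of x
print(pd.NA ** a)   # [<NA>, <NA>, 1]: x ** 0 is 1 regardless of x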
def coerce_to_array(values, dtype=None, mask=None, copy: bool = False) -> tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask. Parameters ---------- values : 1D list-like dtype : float dtype mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ # if values is floating numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_float_dtype(values.dtype): dtype = values.dtype if dtype is not None: if isinstance(dtype, str) and dtype.startswith("Float"): # Avoid DeprecationWarning from NumPy about np.dtype("Float64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() if not issubclass(type(dtype), FloatingDtype): try: dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err if isinstance(values, FloatingArray): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask values = np.array(values, copy=copy) if is_object_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": pass elif inferred_type not in [ "floating", "integer", "mixed-integer", "integer-na", "mixed-integer-float", ]: raise TypeError( f"{values.dtype} cannot be converted to a FloatingDtype") elif is_bool_dtype(values) and is_float_dtype(dtype): values = np.array(values, dtype=float, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError( f"{values.dtype} cannot be converted to a FloatingDtype") if values.ndim != 1: raise TypeError("values must be a 1D list-like") if mask is None: mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) if not mask.ndim == 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = np.dtype("float64") else: dtype = dtype.type # if we are float, let's make sure that we can # safely cast # we copy as need to coerce here # TODO should this be a safe cast? if mask.any(): values = values.copy() values[mask] = np.nan values = values.astype(dtype, copy=False) # , casting="safe") return values, mask
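A short usage sketch of the coercion rules above through the public constructor (pd.array with the nullable Float64 dtype):

import numpy as np
import pandas as pd

print(pd.array([0.1, None, 2.5], dtype="Float64"))           # [0.1, <NA>, 2.5]

# booleans are accepted when a float dtype is requested explicitly
print(pd.array(np.array([True, False]), dtype="Float64"))    # [1.0, 0.0]

# non-numeric input is rejected
try:
    pd.array(np.array(["a", "b"], dtype=object), dtype="Float64")
except TypeError as err:
    print("TypeError:", err)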
def period_array(data, freq=None, copy=False): # type: (Sequence[Optional[Period]], Optional[Tick]) -> PeriodArray """ Construct a new PeriodArray from a sequence of Period scalars. Parameters ---------- data : Sequence of Period objects A sequence of Period objects. These are required to all have the same ``freq.`` Missing values can be indicated by ``None`` or ``pandas.NaT``. freq : str, Tick, or Offset The frequency of every element of the array. This can be specified to avoid inferring the `freq` from `data`. copy : bool, default False Whether to ensure a copy of the data is made. Returns ------- PeriodArray See Also -------- PeriodArray pandas.PeriodIndex Examples -------- >>> period_array([pd.Period('2017', freq='A'), ... pd.Period('2018', freq='A')]) <PeriodArray> ['2017', '2018'] Length: 2, dtype: period[A-DEC] >>> period_array([pd.Period('2017', freq='A'), ... pd.Period('2018', freq='A'), ... pd.NaT]) <PeriodArray> ['2017', '2018', 'NaT'] Length: 3, dtype: period[A-DEC] Integers that look like years are handled >>> period_array([2000, 2001, 2002], freq='D') ['2000-01-01', '2001-01-01', '2002-01-01'] Length: 3, dtype: period[D] Datetime-like strings may also be passed >>> period_array(['2000-Q1', '2000-Q2', '2000-Q3', '2000-Q4'], freq='Q') <PeriodArray> ['2000Q1', '2000Q2', '2000Q3', '2000Q4'] Length: 4, dtype: period[Q-DEC] """ if is_datetime64_dtype(data): return PeriodArray._from_datetime64(data, freq) if isinstance(data, (ABCPeriodIndex, ABCSeries, PeriodArray)): return PeriodArray(data, freq) # other iterable of some kind if not isinstance(data, (np.ndarray, list, tuple)): data = list(data) data = np.asarray(data) if freq: dtype = PeriodDtype(freq) else: dtype = None if is_float_dtype(data) and len(data) > 0: raise TypeError("PeriodIndex does not allow " "floating point in construction") data = ensure_object(data) return PeriodArray._from_sequence(data, dtype=dtype)
def nanskew( values: np.ndarray, axis: Optional[int] = None, skipna: bool = True, mask: Optional[np.ndarray] = None, ) -> float: """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G1. The algorithm computes this coefficient directly from the second and third central moment. Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : float64 Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, np.nan, 1, 2]) >>> nanops.nanskew(s) 1.7320508075688787 """ values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) if not is_float_dtype(values.dtype): values = values.astype("f8") count = _get_counts(values.shape, mask, axis) else: count = _get_counts(values.shape, mask, axis, dtype=values.dtype) if skipna and mask is not None: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna and mask is not None: np.putmask(adjusted, mask, 0) adjusted2 = adjusted**2 adjusted3 = adjusted2 * adjusted m2 = adjusted2.sum(axis, dtype=np.float64) m3 = adjusted3.sum(axis, dtype=np.float64) # floating point error # # #18044 in _libs/windows.pyx calc_skew follow this behavior # to fix the fperr to treat m2 <1e-14 as zero m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) with np.errstate(invalid="ignore", divide="ignore"): result = (count * (count - 1)**0.5 / (count - 2)) * (m3 / m2**1.5) dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(m2 == 0, 0, result) result[count < 3] = np.nan return result else: result = 0 if m2 == 0 else result if count < 3: return np.nan return result
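The adjusted Fisher-Pearson G1 coefficient returned above can be reproduced by hand from the second and third central moments; using the same data as the docstring example with the NaN dropped:

import numpy as np

x = np.array([1.0, 1.0, 2.0])
n = len(x)
m2 = ((x - x.mean()) ** 2).sum()     # second central moment (unnormalized)
m3 = ((x - x.mean()) ** 3).sum()     # third central moment (unnormalized)

g1 = (n * (n - 1) ** 0.5 / (n - 2)) * (m3 / m2 ** 1.5)
print(g1)   # 1.7320508..., matching nanskew([1, nan, 1, 2])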