def na_op(x, y): # dispatch to the categorical if we have a categorical # in either operand if is_categorical_dtype(x): return op(x, y) elif is_categorical_dtype(y) and not is_scalar(y): return op(y, x) if is_object_dtype(x.dtype): result = _comp_method_OBJECT_ARRAY(op, x, y) else: # we want to compare like types # we only want to convert to integer like if # we are not NotImplemented, otherwise # we would allow datetime64 (but viewed as i8) against # integer comparisons if is_datetimelike_v_numeric(x, y): raise TypeError("invalid type comparison") # numpy does not like comparisons vs None if is_scalar(y) and isna(y): if name == '__ne__': return np.ones(len(x), dtype=bool) else: return np.zeros(len(x), dtype=bool) # we have a datetime/timedelta and may need to convert mask = None if (needs_i8_conversion(x) or (not is_scalar(y) and needs_i8_conversion(y))): if is_scalar(y): mask = isna(x) y = libindex.convert_scalar(x, com._values_from_object(y)) else: mask = isna(x) | isna(y) y = y.view('i8') x = x.view('i8') try: with np.errstate(all='ignore'): result = getattr(x, name)(y) if result is NotImplemented: raise TypeError("invalid type comparison") except AttributeError: result = op(x, y) if mask is not None and mask.any(): result[mask] = masker return result
def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' 'values are required for conversion').format(dtype=dtype) raise TypeError(msg) elif is_integer_dtype(dtype) and self.hasnans: # GH 13149 raise ValueError('Cannot convert NA to integer') return super(Float64Index, self).astype(dtype, copy=copy)
def cmp_method(self, other): if isinstance(other, ABCDataFrame): return NotImplemented if isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): if other.ndim > 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') if needs_i8_conversion(self) and needs_i8_conversion(other): # we may need to directly compare underlying # representations return self._evaluate_compare(other, op) # numpy will show a DeprecationWarning on invalid elementwise # comparisons, this will raise in the future with warnings.catch_warnings(record=True): with np.errstate(all='ignore'): result = op(self.values, np.asarray(other)) return result
def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' 'values are required for conversion').format(dtype=dtype) raise TypeError(msg) elif (is_integer_dtype(dtype) and not is_extension_array_dtype(dtype)) and self.hasnans: # TODO(jreback); this can change once we have an EA Index type # GH 13149 raise ValueError('Cannot convert NA to integer') return super().astype(dtype, copy=copy)
def test_needs_i8_conversion(): assert not com.needs_i8_conversion(str) assert not com.needs_i8_conversion(np.int64) assert not com.needs_i8_conversion(pd.Series([1, 2])) assert not com.needs_i8_conversion(np.array(['a', 'b'])) assert com.needs_i8_conversion(np.datetime64) assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]")) assert com.needs_i8_conversion(pd.DatetimeIndex( [1, 2, 3], tz="US/Eastern"))
def test_fillna(self): # # GH 11343 # though Index.fillna and Series.fillna has separate impl, # test here to confirm these works as the same for orig in self.objs: o = orig.copy() values = o.values # values will not be changed result = o.fillna(o.astype(object).values[0]) if isinstance(o, Index): tm.assert_index_equal(o, result) else: tm.assert_series_equal(o, result) # check shallow_copied assert o is not result for null_obj in [np.nan, None]: for orig in self.objs: o = orig.copy() klass = type(o) if not self._allow_na_ops(o): continue if needs_i8_conversion(o): values = o.astype(object).values fill_value = values[0] values[0:2] = pd.NaT else: values = o.values.copy() fill_value = o.values[0] values[0:2] = null_obj expected = [fill_value] * 2 + list(values[2:]) expected = klass(expected) o = klass(values) # check values has the same dtype as the original assert o.dtype == orig.dtype result = o.fillna(fill_value) if isinstance(o, Index): tm.assert_index_equal(result, expected) else: tm.assert_series_equal(result, expected) # check shallow_copied assert o is not result
def test_get_unique_index(self): for ind in self.indices.values(): # MultiIndex tested separately if not len(ind) or isinstance(ind, MultiIndex): continue idx = ind[[0] * 5] idx_unique = ind[[0]] # We test against `idx_unique`, so first we make sure it's unique # and doesn't contain nans. assert idx_unique.is_unique try: assert not idx_unique.hasnans except NotImplementedError: pass for dropna in [False, True]: result = idx._get_unique_index(dropna=dropna) tm.assert_index_equal(result, idx_unique) # nans: if not ind._can_hold_na: continue if needs_i8_conversion(ind): vals = ind.asi8[[0] * 5] vals[0] = iNaT else: vals = ind.values[[0] * 5] vals[0] = np.nan vals_unique = vals[:2] idx_nan = ind._shallow_copy(vals) idx_unique_nan = ind._shallow_copy(vals_unique) assert idx_unique_nan.is_unique assert idx_nan.dtype == ind.dtype assert idx_unique_nan.dtype == ind.dtype for dropna, expected in zip([False, True], [idx_unique_nan, idx_unique]): for i in [idx_nan, idx_unique_nan]: result = i._get_unique_index(dropna=dropna) tm.assert_index_equal(result, expected)
def convert(values): """ convert the numpy values to a list """ dtype = values.dtype if is_categorical_dtype(values): return values elif is_object_dtype(dtype): return values.ravel().tolist() if needs_i8_conversion(dtype): values = values.view('i8') v = values.ravel() if compressor == 'zlib': _check_zlib() # return string arrays like they are if dtype == np.object_: return v.tolist() # convert to a bytes array v = v.tostring() return ExtType(0, zlib.compress(v)) elif compressor == 'blosc': _check_blosc() # return string arrays like they are if dtype == np.object_: return v.tolist() # convert to a bytes array v = v.tostring() return ExtType(0, blosc.compress(v, typesize=dtype.itemsize)) # ndarray (on original dtype) return ExtType(0, v.tostring())
def test_value_counts_unique_nunique_null(self, null_obj): for orig in self.objs: o = orig.copy() klass = type(o) values = o._ndarray_values if not self._allow_na_ops(o): continue # special assign to the numpy array if is_datetime64tz_dtype(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = iNaT values = o._shallow_copy(v) else: o = o.copy() o[0:2] = pd.NaT values = o._values elif needs_i8_conversion(o): values[0:2] = iNaT values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == o.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(o, (DatetimeIndex, PeriodIndex)): expected_index = o.copy() expected_index.name = None # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) o.name = "a" else: if isinstance(o, DatetimeIndex): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = "a" # check values has the same dtype as the original assert o.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(o), dtype=np.bool) nanloc[:3] = True if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: exp = Series(nanloc, o.index, name="a") tm.assert_series_equal(pd.isna(o), exp) expected_s_na = Series( list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], dtype="int64", name="a", ) expected_s = Series( list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype="int64", name="a", ) result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == "a" result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == "a" result = o.unique() if isinstance(o, Index): tm.assert_index_equal(result, Index(values[1:], name="a")) elif is_datetime64tz_dtype(o): # unable to compare NaT / nan tm.assert_extension_array_equal(result[1:], values[2:]) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert o.nunique() == 8 assert o.nunique(dropna=False) == 9
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ): assert kind in ["transform", "aggregate"] orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( "{dtype} dtype not supported".format(dtype=values.dtype) ) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {how} operations".format(how=how) ) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {how} operations".format(how=how) ) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups,) + values.shape[1:] try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: try: values = ensure_float64(values) except TypeError: if lib.infer_dtype(values, skipna=False) == "complex": values = values.astype(complex) else: raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize ) else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill( np.empty(out_shape, dtype=out_dtype), fill_value=np.nan ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, codes, func, is_datetimelike, min_count ) elif kind == "transform": result = _maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: names = self._name_functions[how]() # type: Optional[List[str]] else: names = None if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj): orig = index_or_series_obj obj = orig.copy() klass = type(obj) values = obj._ndarray_values num_values = len(orig) if not allow_na_ops(obj): pytest.skip("type doesn't allow for NA operations") elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)): pytest.skip(f"values of {klass} cannot be changed") elif isinstance(orig, pd.MultiIndex): pytest.skip("MultiIndex doesn't support isna") elif orig.duplicated().any(): pytest.xfail( "The test implementation isn't flexible enough to deal" " with duplicated values. This isn't a bug in the" " application code, but in the test code.") # special assign to the numpy array if is_datetime64tz_dtype(obj): if isinstance(obj, DatetimeIndex): v = obj.asi8 v[0:2] = iNaT values = obj._shallow_copy(v) else: obj = obj.copy() obj[0:2] = pd.NaT values = obj._values elif needs_i8_conversion(obj): values[0:2] = iNaT values = obj._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == obj.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(obj, (DatetimeIndex, PeriodIndex)): expected_index = obj.copy() expected_index.name = None # attach name to klass obj = klass(values.repeat(range(1, len(obj) + 1))) obj.name = "a" else: if isinstance(obj, DatetimeIndex): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None obj = obj.repeat(range(1, len(obj) + 1)) obj.name = "a" # check values has the same dtype as the original assert obj.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(obj), dtype=np.bool) nanloc[:3] = True if isinstance(obj, Index): tm.assert_numpy_array_equal(pd.isna(obj), nanloc) else: exp = Series(nanloc, obj.index, name="a") tm.assert_series_equal(pd.isna(obj), exp) expected_data = list(range(num_values, 2, -1)) expected_data_na = expected_data.copy() if expected_data_na: expected_data_na.append(3) expected_s_na = Series( expected_data_na, index=expected_index[num_values - 1:0:-1], dtype="int64", name="a", ) expected_s = Series( expected_data, index=expected_index[num_values - 1:1:-1], dtype="int64", name="a", ) result_s_na = obj.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == "a" result_s = obj.value_counts() tm.assert_series_equal(obj.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == "a" result = obj.unique() if isinstance(obj, Index): tm.assert_index_equal(result, Index(values[1:], name="a")) elif is_datetime64tz_dtype(obj): # unable to compare NaT / nan tm.assert_extension_array_equal(result[1:], values[2:]) assert result[0] is pd.NaT elif len(obj) > 0: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert obj.nunique() == max(0, num_values - 2) assert obj.nunique(dropna=False) == max(0, num_values - 1)
def interpolate_1d(xvalues, yvalues, method='linear', limit=None, limit_direction='forward', limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argumnet. """ # Treat the original, non-scipy methods first. invalid = isna(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which can't be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == 'time': if not getattr(xvalues, 'is_all_dates', None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError('time-weighted interpolation only works ' 'on Series or DataFrames with a ' 'DatetimeIndex') method = 'values' valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: msg = ('Invalid limit_direction: expecting one of {valid!r}, ' 'got {invalid!r}.') raise ValueError(msg.format(valid=valid_limit_directions, invalid=limit_direction)) if limit_area is not None: valid_limit_areas = ['inside', 'outside'] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError('Invalid limit_area: expecting one of {}, got ' '{}.'.format(valid_limit_areas, limit_area)) # default limit is unlimited GH #16282 if limit is None: # limit = len(xvalues) pass elif not is_integer(limit): raise ValueError('Limit must be an integer') elif limit < 1: raise ValueError('Limit must be greater than 0') from pandas import Series ys = Series(yvalues) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) mid_nans = all_nans - start_nans - end_nans # Like the sets above, preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. # For example if limit_direction='forward' then preserve_nans will # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit if limit_direction == 'forward': preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == 'backward': preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit preserve_nans = set(_interp_limit(invalid, limit, limit)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == 'inside': # preserve NaNs on the outside preserve_nans |= start_nans | end_nans elif limit_area == 'outside': # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) result = yvalues.copy() if method in ['linear', 'time', 'index', 'values']: if method in ('values', 'index'): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) result[preserve_nans] = np.nan return result sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) result[invalid] = _interpolate_scipy_wrapper(inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) result[preserve_nans] = np.nan return result
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( "{} are not support in cython ops".format(values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {} operations".format( how)) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {} operations".format( how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = "object" labels, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count, ) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def _get_values( values: np.ndarray, skipna: bool, fill_value: Any = None, fill_value_typ: Optional[str] = None, mask: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. If both mask and fill_value/fill_value_typ are not None and skipna is True, the values array will be copied. For input arrays of boolean or integer dtypes, copies will only occur if a precomputed mask, a fill_value/fill_value_typ, and skipna=True are provided. Parameters ---------- values : ndarray input array to potentially compute mask for skipna : bool boolean for whether NaNs should be skipped fill_value : Any value to fill NaNs with fill_value_typ : str Set to '+inf' or '-inf' to handle dtype-specific infinities mask : Optional[np.ndarray] nan-mask if known Returns ------- values : ndarray Potential copy of input value array mask : Optional[ndarray[bool]] Mask for values, if deemed necessary to compute dtype : dtype dtype for values dtype_max : dtype platform independent dtype fill_value : Any fill value used """ # In _get_values is only called from within nanops, and in all cases # with scalar fill_value. This guarantee is important for the # maybe_upcast_putmask call below assert is_scalar(fill_value) values = extract_array(values, extract_numpy=True) mask = _maybe_get_mask(values, skipna, mask) dtype = values.dtype if needs_i8_conversion(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = np.asarray(values.view("i8")) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) copy = (mask is not None) and (fill_value is not None) if skipna and copy: values = values.copy() assert mask is not None # for mypy if dtype_ok and mask.any(): np.putmask(values, mask, fill_value) # promote if needed else: values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def array_equivalent(left, right, strict_nan: bool = False, dtype_equal: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and right are NumPy arrays of the same dtype. The behavior of this function (particularly with respect to NaNs) is not defined if the dtypes are different. Parameters ---------- left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. dtype_equal : bool, default False Whether `left` and `right` are known to have the same dtype according to `is_dtype_equal`. Some methods like `BlockManager.equals`. require that the dtypes match. Setting this to ``True`` can improve performance, but will give different results for arrays that are equal but different dtypes. Returns ------- b : bool Returns True if the arrays are equivalent. Examples -------- >>> array_equivalent( ... np.array([1, 2, np.nan]), ... np.array([1, 2, np.nan])) True >>> array_equivalent( ... np.array([1, np.nan, 2]), ... np.array([1, 2, np.nan])) False """ left, right = np.asarray(left), np.asarray(right) # shape compat if left.shape != right.shape: return False if dtype_equal: # fastpath when we require that the dtypes match (Block.equals) if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): return _array_equivalent_float(left, right) elif is_datetimelike_v_numeric(left.dtype, right.dtype): return False elif needs_i8_conversion(left.dtype): return _array_equivalent_datetimelike(left, right) elif is_string_dtype(left.dtype): # TODO: fastpath for pandas' StringDtype return _array_equivalent_object(left, right, strict_nan) else: return np.array_equal(left, right) # Slow path when we allow comparing different dtypes. # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left.dtype) or is_string_dtype(right.dtype): return _array_equivalent_object(left, right, strict_nan) # NaNs can occur in float and complex arrays. if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() elif is_datetimelike_v_numeric(left, right): # GH#29553 avoid numpy deprecation warning return False elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # datetime64, timedelta64, Period if not is_dtype_equal(left.dtype, right.dtype): return False left = left.view("i8") right = right.view("i8") # if we have structured dtypes, compare first if left.dtype.type is np.void or right.dtype.type is np.void: if left.dtype != right.dtype: return False return np.array_equal(left, right)
def _call_cython_op( self, values: np.ndarray, # np.ndarray[ndim=2] *, min_count: int, ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, result_mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values dtype = values.dtype is_numeric = is_numeric_dtype(dtype) is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype_str = dtype.kind + "8" values = values.astype(dtype_str, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) values = values.T if mask is not None: mask = mask.T if result_mask is not None: result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) out_dtype = self.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max", "mean"]: func( result, counts, values, comp_ids, min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, ) elif self.how in ["add"]: # We support datetimelike func( result, counts, values, comp_ids, min_count, datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) else: # TODO: min_count if self.uses_mask(): func( result, values, comp_ids, ngroups, is_datetimelike, mask=mask, **kwargs, ) else: func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if self.kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan result = result.T if self.how not in self.cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) else: op_result = result # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "ndarray") return op_result # type: ignore[return-value]
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) cy_op = WrappedCythonOp(kind=kind, how=how) # can we do this operation with our cython functions # if not raise NotImplementedError cy_op.disallow_invalid_ops(dtype, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] res = self._cython_operation( kind=kind, values=values2d, how=how, axis=1, min_count=min_count, **kwargs, ) if res.shape[0] == 1: return res[0] # otherwise we have OHLC return res.T is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = values.astype("int64") elif is_integer_dtype(dtype): # e.g. uint8 -> uint64, int16 -> int64 dtype = dtype.kind + "8" values = values.astype(dtype, copy=False) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) ngroups = self.ngroups comp_ids, _, _ = self.group_info assert axis == 1 values = values.T out_shape = cy_op.get_output_shape(ngroups, values) func, values = cy_op.get_cython_func_and_vals(values, is_numeric) out_dtype = cy_op.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) if how in ["min", "max"]: func( result, counts, values, comp_ids, min_count, is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if kind == "aggregate": # i.e. counts is defined. Locations where count<min_count # need to have the result set to np.nan, which may require casting, # see GH#40767 if is_integer_dtype(result.dtype) and not is_datetimelike: cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): # Note: this conversion could be lossy, see GH#40767 result = result.astype("float64") result[empty_groups] = np.nan if self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] result = result.T if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def interpolate_1d( xvalues, yvalues, method="linear", limit=None, limit_direction="forward", limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs, ): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argument. """ # Treat the original, non-scipy methods first. invalid = isna(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which can't be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == "time": if not getattr(xvalues, "is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError("time-weighted interpolation only works " "on Series or DataFrames with a " "DatetimeIndex") method = "values" valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: msg = "Invalid limit_direction: expecting one of {valid!r}, got {invalid!r}." raise ValueError( msg.format(valid=valid_limit_directions, invalid=limit_direction)) if limit_area is not None: valid_limit_areas = ["inside", "outside"] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError("Invalid limit_area: expecting one of {}, got " "{}.".format(valid_limit_areas, limit_area)) # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) start_nans = set(range(find_valid_index(yvalues, "first"))) end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) mid_nans = all_nans - start_nans - end_nans # Like the sets above, preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. # For example if limit_direction='forward' then preserve_nans will # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == "backward": preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit preserve_nans = set(_interp_limit(invalid, limit, limit)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == "inside": # preserve NaNs on the outside preserve_nans |= start_nans | end_nans elif limit_area == "outside": # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) xvalues = getattr(xvalues, "values", xvalues) yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() if method in ["linear", "time", "index", "values"]: if method in ("values", "index"): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) result[preserve_nans] = np.nan return result sp_methods = [ "nearest", "zero", "slinear", "quadratic", "cubic", "barycentric", "krogh", "spline", "polynomial", "from_derivatives", "piecewise_polynomial", "pchip", "akima", ] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) result[invalid] = _interpolate_scipy_wrapper( inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs, ) result[preserve_nans] = np.nan return result
def interpolate_1d(xvalues, yvalues, method='linear', limit=None, limit_direction='forward', fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argumnet. """ # Treat the original, non-scipy methods first. invalid = isna(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which cant be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == 'time': if not getattr(xvalues, 'is_all_dates', None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError('time-weighted interpolation only works ' 'on Series or DataFrames with a ' 'DatetimeIndex') method = 'values' valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: msg = ('Invalid limit_direction: expecting one of {valid!r}, ' 'got {invalid!r}.') raise ValueError( msg.format(valid=valid_limit_directions, invalid=limit_direction)) from pandas import Series ys = Series(yvalues) start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) # violate_limit is a list of the indexes in the series whose yvalue is # currently NaN, and should still be NaN after the interpolation. # Specifically: # # If limit_direction='forward' or None then the list will contain NaNs at # the beginning of the series, and NaNs that are more than 'limit' away # from the prior non-NaN. # # If limit_direction='backward' then the list will contain NaNs at # the end of the series, and NaNs that are more than 'limit' away # from the subsequent non-NaN. # # If limit_direction='both' then the list will contain NaNs that # are more than 'limit' away from any non-NaN. # # If limit=None, then use default behavior of filling an unlimited number # of NaNs in the direction specified by limit_direction # default limit is unlimited GH #16282 if limit is None: # limit = len(xvalues) pass elif not is_integer(limit): raise ValueError('Limit must be an integer') elif limit < 1: raise ValueError('Limit must be greater than 0') # each possible limit_direction # TODO: do we need sorted? if limit_direction == 'forward' and limit is not None: violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) elif limit_direction == 'forward': violate_limit = sorted(start_nans) elif limit_direction == 'backward' and limit is not None: violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit))) elif limit_direction == 'backward': violate_limit = sorted(end_nans) elif limit_direction == 'both' and limit is not None: violate_limit = sorted(_interp_limit(invalid, limit, limit)) else: violate_limit = [] xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) result = yvalues.copy() if method in ['linear', 'time', 'index', 'values']: if method in ('values', 'index'): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) result[violate_limit] = np.nan return result sp_methods = [ 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima' ] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) result[invalid] = _interpolate_scipy_wrapper(inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) result[violate_limit] = np.nan return result
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups, ) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset(result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def to_numeric(arg, errors="raise", downcast=None): """ Convert argument to a numeric type. The default return dtype is `float64` or `int64` depending on the data supplied. Use the `downcast` parameter to obtain other dtypes. Please note that precision loss may occur if really large numbers are passed in. Due to the internal limitations of `ndarray`, if numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are passed in, it is very likely they will be converted to float so that they can stored in an `ndarray`. These warnings apply similarly to `Series` since it internally leverages `ndarray`. Parameters ---------- arg : scalar, list, tuple, 1-d array, or Series Argument to be converted. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. downcast : str, default None Can be 'integer', 'signed', 'unsigned', or 'float'. If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) As this behaviour is separate from the core conversion to numeric values, any errors raised during the downcasting will be surfaced regardless of the value of the 'errors' input. In addition, downcasting will only occur if the size of the resulting data's dtype is strictly larger than the dtype it is to be cast to, so if none of the dtypes checked satisfy that specification, no downcasting will be performed on the data. Returns ------- ret Numeric if parsing succeeded. Return type depends on input. Series if Series, otherwise ndarray. See Also -------- DataFrame.astype : Cast argument to a specified dtype. to_datetime : Convert argument to datetime. to_timedelta : Convert argument to timedelta. numpy.ndarray.astype : Cast a numpy array to a specified type. DataFrame.convert_dtypes : Convert dtypes. Examples -------- Take separate series and convert to numeric, coercing when told to >>> s = pd.Series(['1.0', '2', -3]) >>> pd.to_numeric(s) 0 1.0 1 2.0 2 -3.0 dtype: float64 >>> pd.to_numeric(s, downcast='float') 0 1.0 1 2.0 2 -3.0 dtype: float32 >>> pd.to_numeric(s, downcast='signed') 0 1 1 2 2 -3 dtype: int8 >>> s = pd.Series(['apple', '1.0', '2', -3]) >>> pd.to_numeric(s, errors='ignore') 0 apple 1 1.0 2 2 3 -3 dtype: object >>> pd.to_numeric(s, errors='coerce') 0 NaN 1 1.0 2 2.0 3 -3.0 dtype: float64 Downcasting of nullable integer and floating dtypes is supported: >>> s = pd.Series([1, 2, 3], dtype="Int64") >>> pd.to_numeric(s, downcast="integer") 0 1 1 2 2 3 dtype: Int8 >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64") >>> pd.to_numeric(s, downcast="float") 0 1.0 1 2.1 2 3.0 dtype: Float32 """ if downcast not in (None, "integer", "signed", "unsigned", "float"): raise ValueError("invalid downcasting method provided") if errors not in ("ignore", "raise", "coerce"): raise ValueError("invalid error value specified") is_series = False is_index = False is_scalars = False if isinstance(arg, ABCSeries): is_series = True values = arg.values elif isinstance(arg, ABCIndex): is_index = True if needs_i8_conversion(arg.dtype): values = arg.asi8 else: values = arg.values elif isinstance(arg, (list, tuple)): values = np.array(arg, dtype="O") elif is_scalar(arg): if is_decimal(arg): return float(arg) if is_number(arg): return arg is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: raise TypeError("arg must be a list, tuple, 1-d array, or Series") else: values = arg # GH33013: for IntegerArray & FloatingArray extract non-null values for casting # save mask to reconstruct the full array after casting mask: npt.NDArray[np.bool_] | None = None if isinstance(values, NumericArray): mask = values._mask values = values._data[~mask] values_dtype = getattr(values, "dtype", None) if is_numeric_dtype(values_dtype): pass elif is_datetime_or_timedelta_dtype(values_dtype): values = values.view(np.int64) else: values = ensure_object(values) coerce_numeric = errors not in ("ignore", "raise") try: values, _ = lib.maybe_convert_numeric( values, set(), coerce_numeric=coerce_numeric) except (ValueError, TypeError): if errors == "raise": raise # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): typecodes: str | None = None if downcast in ("integer", "signed"): typecodes = np.typecodes["Integer"] elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0): typecodes = np.typecodes["UnsignedInteger"] elif downcast == "float": typecodes = np.typecodes["Float"] # pandas support goes only to np.float32, # as float dtypes smaller than that are # extremely rare and not well supported float_32_char = np.dtype(np.float32).char float_32_ind = typecodes.index(float_32_char) typecodes = typecodes[float_32_ind:] if typecodes is not None: # from smallest to largest for typecode in typecodes: dtype = np.dtype(typecode) if dtype.itemsize <= values.dtype.itemsize: values = maybe_downcast_numeric(values, dtype) # successful conversion if values.dtype == dtype: break # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array if mask is not None: data = np.zeros(mask.shape, dtype=values.dtype) data[~mask] = values from pandas.core.arrays import ( FloatingArray, IntegerArray, ) klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray values = klass(data, mask.copy()) if is_series: return arg._constructor(values, index=arg.index, name=arg.name) elif is_index: # because we want to coerce to numeric if possible, # do not use _shallow_copy return pd.Index(values, name=arg.name) elif is_scalars: return values[0] else: return values
def get_new_values(self, values, fill_value=None): if values.ndim == 1: values = values[:, np.newaxis] sorted_values = self._make_sorted_values(values) # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): # TODO: Under what circumstances can we rely on sorted_values # matching values? When that holds, we can slice instead # of take (in particular for EAs) new_values = (sorted_values.reshape( length, width, stride).swapaxes(1, 2).reshape(result_shape)) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = maybe_promote(values.dtype, fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values.dtype): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") elif is_bool_dtype(values.dtype): sorted_values = sorted_values.astype("object") new_values = new_values.astype("object") else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask libreshape.unstack( sorted_values, mask.view("u1"), stride, length, width, new_values, new_mask.view("u1"), ) # reconstruct dtype if needed if needs_i8_conversion(values.dtype): new_values = new_values.view(values.dtype) return new_values, new_mask
def array_equivalent(left, right, strict_nan: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and right are NumPy arrays of the same dtype. The behavior of this function (particularly with respect to NaNs) is not defined if the dtypes are different. Parameters ---------- left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. Returns ------- b : bool Returns True if the arrays are equivalent. Examples -------- >>> array_equivalent( ... np.array([1, 2, np.nan]), ... np.array([1, 2, np.nan])) True >>> array_equivalent( ... np.array([1, np.nan, 2]), ... np.array([1, 2, np.nan])) False """ left, right = np.asarray(left), np.asarray(right) # shape compat if left.shape != right.shape: return False # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left) or is_string_dtype(right): if not strict_nan: # isna considers NaN and None to be equivalent. return lib.array_equivalent_object(ensure_object(left.ravel()), ensure_object(right.ravel())) for left_value, right_value in zip(left, right): if left_value is NaT and right_value is not NaT: return False elif left_value is libmissing.NA and right_value is not libmissing.NA: return False elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False else: try: if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False elif "boolean value of NA is ambiguous" in str(err): return False raise return True # NaNs can occur in float and complex arrays. if is_float_dtype(left) or is_complex_dtype(left): # empty if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() elif is_datetimelike_v_numeric(left, right): # GH#29553 avoid numpy deprecation warning return False elif needs_i8_conversion(left) or needs_i8_conversion(right): # datetime64, timedelta64, Period if not is_dtype_equal(left.dtype, right.dtype): return False left = left.view("i8") right = right.view("i8") # if we have structured dtypes, compare first if left.dtype.type is np.void or right.dtype.type is np.void: if left.dtype != right.dtype: return False return np.array_equal(left, right)
def _na_ok_dtype(dtype) -> bool: if needs_i8_conversion(dtype): return False return not issubclass(dtype.type, np.integer)
def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): new_values = (self.sorted_values.reshape( length, width, stride).swapaxes(1, 2).reshape(result_shape)) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name sorted_values = self.sorted_values # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") name = "int64" elif is_bool_dtype(values): sorted_values = sorted_values.astype("object") new_values = new_values.astype("object") name = "object" else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask libreshape.unstack( sorted_values, mask.view("u1"), stride, length, width, new_values, new_mask.view("u1"), ) # reconstruct dtype if needed if needs_i8_conversion(values): new_values = new_values.view(values.dtype) return new_values, new_mask
def interpolate_1d(xvalues, yvalues, method='linear', limit=None, limit_direction='forward', fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argumnet. """ # Treat the original, non-scipy methods first. invalid = isnull(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which cant be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == 'time': if not getattr(xvalues, 'is_all_dates', None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError('time-weighted interpolation only works ' 'on Series or DataFrames with a ' 'DatetimeIndex') method = 'values' valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: raise ValueError('Invalid limit_direction: expecting one of %r, got ' '%r.' % (valid_limit_directions, limit_direction)) from pandas import Series ys = Series(yvalues) start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) # violate_limit is a list of the indexes in the series whose yvalue is # currently NaN, and should still be NaN after the interpolation. # Specifically: # # If limit_direction='forward' or None then the list will contain NaNs at # the beginning of the series, and NaNs that are more than 'limit' away # from the prior non-NaN. # # If limit_direction='backward' then the list will contain NaNs at # the end of the series, and NaNs that are more than 'limit' away # from the subsequent non-NaN. # # If limit_direction='both' then the list will contain NaNs that # are more than 'limit' away from any non-NaN. # # If limit=None, then use default behavior of filling an unlimited number # of NaNs in the direction specified by limit_direction # default limit is unlimited GH #16282 if limit is None: # limit = len(xvalues) pass elif not is_integer(limit): raise ValueError('Limit must be an integer') elif limit < 1: raise ValueError('Limit must be greater than 0') # each possible limit_direction # TODO: do we need sorted? if limit_direction == 'forward' and limit is not None: violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) elif limit_direction == 'forward': violate_limit = sorted(start_nans) elif limit_direction == 'backward' and limit is not None: violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit))) elif limit_direction == 'backward': violate_limit = sorted(end_nans) elif limit_direction == 'both' and limit is not None: violate_limit = sorted(_interp_limit(invalid, limit, limit)) else: violate_limit = [] xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) result = yvalues.copy() if method in ['linear', 'time', 'index', 'values']: if method in ('values', 'index'): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) result[violate_limit] = np.nan return result sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) result[invalid] = _interpolate_scipy_wrapper(inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) result[violate_limit] = np.nan return result
def _cython_operation(self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations") if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, is_datetimelike, min_count) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def interpolate_1d(xvalues, yvalues, method='linear', limit=None, limit_direction='forward', fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argumnet. """ # Treat the original, non-scipy methods first. invalid = isnull(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which cant be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == 'time': if not getattr(xvalues, 'is_all_dates', None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError('time-weighted interpolation only works ' 'on Series or DataFrames with a ' 'DatetimeIndex') method = 'values' def _interp_limit(invalid, fw_limit, bw_limit): "Get idx of values that won't be filled b/c they exceed the limits." for x in np.where(invalid)[0]: if invalid[max(0, x - fw_limit):x + bw_limit + 1].all(): yield x valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: raise ValueError('Invalid limit_direction: expecting one of %r, got ' '%r.' % (valid_limit_directions, limit_direction)) from pandas import Series ys = Series(yvalues) start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) # This is a list of the indexes in the series whose yvalue is currently # NaN, but whose interpolated yvalue will be overwritten with NaN after # computing the interpolation. For each index in this list, one of these # conditions is true of the corresponding NaN in the yvalues: # # a) It is one of a chain of NaNs at the beginning of the series, and # either limit is not specified or limit_direction is 'forward'. # b) It is one of a chain of NaNs at the end of the series, and limit is # specified and limit_direction is 'backward' or 'both'. # c) Limit is nonzero and it is further than limit from the nearest non-NaN # value (with respect to the limit_direction setting). # # The default behavior is to fill forward with no limit, ignoring NaNs at # the beginning (see issues #9218 and #10420) violate_limit = sorted(start_nans) if limit is not None: if not is_integer(limit): raise ValueError('Limit must be an integer') if limit < 1: raise ValueError('Limit must be greater than 0') if limit_direction == 'forward': violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0))) if limit_direction == 'backward': violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit))) if limit_direction == 'both': violate_limit = sorted(_interp_limit(invalid, limit, limit)) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) result = yvalues.copy() if method in ['linear', 'time', 'index', 'values']: if method in ('values', 'index'): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) result[violate_limit] = np.nan return result sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima'] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) result[invalid] = _interpolate_scipy_wrapper(inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) result[violate_limit] = np.nan return result
def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: for orig in self.objs: o = orig.copy() klass = type(o) values = o._ndarray_values if not self._allow_na_ops(o): continue # special assign to the numpy array if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = iNaT values = o._shallow_copy(v) else: o = o.copy() o[0:2] = iNaT values = o._values elif needs_i8_conversion(o): values[0:2] = iNaT values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == o.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(o, (DatetimeIndex, PeriodIndex)): expected_index = o.copy() expected_index.name = None # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) o.name = 'a' else: if is_datetimetz(o): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' # check values has the same dtype as the original assert o.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(o), dtype=np.bool) nanloc[:3] = True if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: exp = Series(nanloc, o.index, name='a') tm.assert_series_equal(pd.isna(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], dtype='int64', name='a') expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64', name='a') result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == 'a' result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == 'a' result = o.unique() if isinstance(o, Index): tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetimetz(o): # unable to compare NaT / nan vals = values[2:].astype(object).values tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert o.nunique() == 8 assert o.nunique(dropna=False) == 9
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(values, how) if is_extension_array_dtype(values.dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(ensure_float(values)) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) result = maybe_downcast_to_dtype(result, dtype) return result, names
def _interpolate_2d_with_fill( data: np.ndarray, # floating dtype index: Index, axis: int, method: str = "linear", limit: int | None = None, limit_direction: str = "forward", limit_area: str | None = None, fill_value: Any | None = None, **kwargs, ) -> np.ndarray: """ Column-wise application of _interpolate_1d. Notes ----- The signature does differs from _interpolate_1d because it only includes what is needed for Block.interpolate. """ # validate the interp method clean_interp_method(method, index, **kwargs) if is_valid_na_for_dtype(fill_value, data.dtype): fill_value = na_value_for_dtype(data.dtype, compat=False) if method == "time": if not needs_i8_conversion(index.dtype): raise ValueError( "time-weighted interpolation only works " "on Series or DataFrames with a " "DatetimeIndex" ) method = "values" valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: raise ValueError( "Invalid limit_direction: expecting one of " f"{valid_limit_directions}, got '{limit_direction}'." ) if limit_area is not None: valid_limit_areas = ["inside", "outside"] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError( f"Invalid limit_area: expecting one of {valid_limit_areas}, got " f"{limit_area}." ) # default limit is unlimited GH #16282 limit = algos.validate_limit(nobs=None, limit=limit) indices = _index_to_interp_indices(index, method) def func(yvalues: np.ndarray) -> np.ndarray: # process 1-d slices in the axis direction, returning it # should the axis argument be handled below in apply_along_axis? # i.e. not an arg to _interpolate_1d return _interpolate_1d( indices=indices, yvalues=yvalues, method=method, limit=limit, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, bounds_error=False, **kwargs, ) # interp each column independently return np.apply_along_axis(func, axis, data)
def interpolate_1d( xvalues: np.ndarray, yvalues: np.ndarray, method: Optional[str] = "linear", limit: Optional[int] = None, limit_direction: str = "forward", limit_area: Optional[str] = None, fill_value: Optional[Any] = None, bounds_error: bool = False, order: Optional[int] = None, **kwargs, ): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argument. """ invalid = isna(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which can't be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == "time": if not getattr(xvalues, "_is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError("time-weighted interpolation only works " "on Series or DataFrames with a " "DatetimeIndex") method = "values" valid_limit_directions = ["forward", "backward", "both"] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: raise ValueError("Invalid limit_direction: expecting one of " f"{valid_limit_directions}, got '{limit_direction}'.") if limit_area is not None: valid_limit_areas = ["inside", "outside"] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError( f"Invalid limit_area: expecting one of {valid_limit_areas}, got " f"{limit_area}.") # default limit is unlimited GH #16282 limit = algos.validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) start_nans = set(range(find_valid_index(yvalues, "first"))) end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) mid_nans = all_nans - start_nans - end_nans # Like the sets above, preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. # For example if limit_direction='forward' then preserve_nans will # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit preserve_nans: Union[List, Set] if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == "backward": preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit preserve_nans = set(_interp_limit(invalid, limit, limit)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == "inside": # preserve NaNs on the outside preserve_nans |= start_nans | end_nans elif limit_area == "outside": # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() # xvalues to pass to NumPy/SciPy xvalues = getattr(xvalues, "values", xvalues) if method == "linear": inds = xvalues else: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype): inds = inds.view(np.int64) if method in ("values", "index"): if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) if method in NP_METHODS: # np.interp requires sorted X values, #21037 indexer = np.argsort(inds[valid]) result[invalid] = np.interp(inds[invalid], inds[valid][indexer], yvalues[valid][indexer]) else: result[invalid] = _interpolate_scipy_wrapper( inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs, ) result[preserve_nans] = np.nan return result
def interpolate_1d(xvalues, yvalues, method='linear', limit=None, limit_direction='forward', limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs xvalues and yvalues will each be 1-d arrays of the same length. Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argumnet. """ # Treat the original, non-scipy methods first. invalid = isna(yvalues) valid = ~invalid if not valid.any(): # have to call np.asarray(xvalues) since xvalues could be an Index # which can't be mutated result = np.empty_like(np.asarray(xvalues), dtype=np.float64) result.fill(np.nan) return result if valid.all(): return yvalues if method == 'time': if not getattr(xvalues, 'is_all_dates', None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError('time-weighted interpolation only works ' 'on Series or DataFrames with a ' 'DatetimeIndex') method = 'values' valid_limit_directions = ['forward', 'backward', 'both'] limit_direction = limit_direction.lower() if limit_direction not in valid_limit_directions: msg = ('Invalid limit_direction: expecting one of {valid!r}, ' 'got {invalid!r}.') raise ValueError( msg.format(valid=valid_limit_directions, invalid=limit_direction)) if limit_area is not None: valid_limit_areas = ['inside', 'outside'] limit_area = limit_area.lower() if limit_area not in valid_limit_areas: raise ValueError('Invalid limit_area: expecting one of {}, got ' '{}.'.format(valid_limit_areas, limit_area)) # default limit is unlimited GH #16282 if limit is None: # limit = len(xvalues) pass elif not is_integer(limit): raise ValueError('Limit must be an integer') elif limit < 1: raise ValueError('Limit must be greater than 0') from pandas import Series ys = Series(yvalues) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) mid_nans = all_nans - start_nans - end_nans # Like the sets above, preserve_nans contains indices of invalid values, # but in this case, it is the final set of indices that need to be # preserved as NaN after the interpolation. # For example if limit_direction='forward' then preserve_nans will # contain indices of NaNs at the beginning of the series, and NaNs that # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit if limit_direction == 'forward': preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == 'backward': preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: # both directions... just use _interp_limit preserve_nans = set(_interp_limit(invalid, limit, limit)) # if limit_area is set, add either mid or outside indices # to preserve_nans GH #16284 if limit_area == 'inside': # preserve NaNs on the outside preserve_nans |= start_nans | end_nans elif limit_area == 'outside': # preserve NaNs on the inside preserve_nans |= mid_nans # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) result = yvalues.copy() if method in ['linear', 'time', 'index', 'values']: if method in ('values', 'index'): inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if needs_i8_conversion(inds.dtype.type): inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) result[preserve_nans] = np.nan return result sp_methods = [ 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'krogh', 'spline', 'polynomial', 'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima' ] if method in sp_methods: inds = np.asarray(xvalues) # hack for DatetimeIndex, #1646 if issubclass(inds.dtype.type, np.datetime64): inds = inds.view(np.int64) result[invalid] = _interpolate_scipy_wrapper(inds[valid], yvalues[valid], inds[invalid], method=method, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) result[preserve_nans] = np.nan return result
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups,) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function( kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function( kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '{kind}{itemsize}'.format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform( result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset( result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def get_new_values(self, values, fill_value=None): if values.ndim == 1: values = values[:, np.newaxis] sorted_values = self._make_sorted_values(values) # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = self.mask_all # we can simply reshape if we don't have a mask if mask_all and len(values): # TODO: Under what circumstances can we rely on sorted_values # matching values? When that holds, we can slice instead # of take (in particular for EAs) new_values = (sorted_values.reshape( length, width, stride).swapaxes(1, 2).reshape(result_shape)) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask dtype = values.dtype # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: if isinstance(dtype, ExtensionDtype): # GH#41875 # We are assuming that fill_value can be held by this dtype, # unlike the non-EA case that promotes. cls = dtype.construct_array_type() new_values = cls._empty(result_shape, dtype=dtype) new_values[:] = fill_value else: dtype, fill_value = maybe_promote(dtype, fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) name = dtype.name new_mask = np.zeros(result_shape, dtype=bool) # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values.dtype): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask libreshape.unstack( sorted_values, mask.view("u1"), stride, length, width, new_values, new_mask.view("u1"), ) # reconstruct dtype if needed if needs_i8_conversion(values.dtype): # view as datetime64 so we can wrap in DatetimeArray and use # DTA's view method new_values = new_values.view("M8[ns]") new_values = ensure_wrapped_if_datetimelike(new_values) new_values = new_values.view(values.dtype) return new_values, new_mask
def get_new_values(self): values = self.values # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = mask.all() # we can simply reshape if we don't have a mask if mask_all and len(values): new_values = (self.sorted_values .reshape(length, width, stride) .swapaxes(1, 2) .reshape(result_shape) ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: dtype, fill_value = maybe_promote(values.dtype, self.fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) new_mask = np.zeros(result_shape, dtype=bool) name = np.dtype(dtype).name sorted_values = self.sorted_values # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values): sorted_values = sorted_values.view('i8') new_values = new_values.view('i8') name = 'int64' elif is_bool_dtype(values): sorted_values = sorted_values.astype('object') new_values = new_values.astype('object') name = 'object' else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask f = getattr(_reshape, "unstack_{name}".format(name=name)) f(sorted_values, mask.view('u1'), stride, length, width, new_values, new_mask.view('u1')) # reconstruct dtype if needed if needs_i8_conversion(values): new_values = new_values.view(values.dtype) return new_values, new_mask
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(dtype, how, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] res = self._cython_operation( kind=kind, values=values2d, how=how, axis=1, min_count=min_count, **kwargs, ) if res.shape[0] == 1: return res[0] # otherwise we have OHLC return res.T is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(dtype): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) ngroups = self.ngroups assert axis == 1 values = values.T if how == "ohlc": out_shape = (ngroups, 4) elif arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument") elif kind == "transform": out_shape = values.shape else: out_shape = (ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] result = result.T if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def assert_series_equal( left, right, check_dtype=True, check_index_type="equiv", check_series_type=True, check_less_precise=no_default, check_names=True, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_category_order=True, check_freq=True, check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", *, check_index=True, ): """ Check that left and right Series are equal. Parameters ---------- left : Series right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. When comparing two numbers, if the first number has magnitude less than 1e-5, we compare the two numbers directly and check whether they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals. .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. check_index : bool, default True Whether to check index equivalence. If False, then compare only values. .. versionadded:: 1.3.0 Examples -------- >>> from pandas.testing import assert_series_equal >>> a = pd.Series([1, 2, 3, 4]) >>> b = pd.Series([1, 2, 3, 4]) >>> assert_series_equal(a, b) """ __tracebackhide__ = True if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=2, ) rtol = atol = _get_tol_from_less_precise(check_less_precise) # instance validation _check_isinstance(left, right, Series) if check_series_type: assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): msg1 = f"{len(left)}, {left.index}" msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" if check_index: # GH #38183 assert_index_equal( left.index, right.index, exact=check_index_type, check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, atol=atol, obj=f"{obj}.index", ) if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)): lidx = left.index ridx = right.index assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if ( is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype) and not check_categorical ): pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): left_values = left._values right_values = right._values # Only check exact if dtype is numeric if isinstance(left_values, ExtensionArray) and isinstance( right_values, ExtensionArray ): assert_extension_array_equal( left_values, right_values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: assert_numpy_array_equal( left_values, right_values, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif check_datetimelike_compat and ( needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype) ): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left._values).equals(Index(right._values)): msg = ( f"[datetimelike_compat=True] {left._values} " f"is not equal to {right._values}." ) raise AssertionError(msg) elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif is_extension_array_dtype_and_needs_i8_conversion( left.dtype, right.dtype ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) # metadata comparison if check_names: assert_attr_equal("name", left, right, obj=obj) if check_categorical: if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): assert_categorical_equal( left._values, right._values, obj=f"{obj} category", check_category_order=check_category_order, )
def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: for orig in self.objs: o = orig.copy() klass = type(o) values = o._values if not self._allow_na_ops(o): continue # special assign to the numpy array if is_datetimetz(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = iNaT values = o._shallow_copy(v) else: o = o.copy() o[0:2] = iNaT values = o._values elif needs_i8_conversion(o): values[0:2] = iNaT values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original self.assertEqual(values.dtype, o.dtype) # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(o, (DatetimeIndex, PeriodIndex)): expected_index = o.copy() expected_index.name = None # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) o.name = 'a' else: if is_datetimetz(o): expected_index = orig._values._shallow_copy(values) else: expected_index = pd.Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' # check values has the same dtype as the original self.assertEqual(o.dtype, orig.dtype) # check values correctly have NaN nanloc = np.zeros(len(o), dtype=np.bool) nanloc[:3] = True if isinstance(o, Index): self.assert_numpy_array_equal(pd.isnull(o), nanloc) else: exp = pd.Series(nanloc, o.index, name='a') self.assert_series_equal(pd.isnull(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], dtype='int64', name='a') expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64', name='a') result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) self.assertTrue(result_s_na.index.name is None) self.assertEqual(result_s_na.name, 'a') result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) self.assertTrue(result_s.index.name is None) self.assertEqual(result_s.name, 'a') result = o.unique() if isinstance(o, Index): tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetimetz(o): # unable to compare NaT / nan tm.assert_numpy_array_equal(result[1:], values[2:].asobject.values) self.assertIs(result[0], pd.NaT) else: tm.assert_numpy_array_equal(result[1:], values[2:]) self.assertTrue(pd.isnull(result[0])) self.assertEqual(result.dtype, orig.dtype) self.assertEqual(o.nunique(), 8) self.assertEqual(o.nunique(dropna=False), 9)