def test_is_categorical_dtype():
    assert not com.is_categorical_dtype(object)
    assert not com.is_categorical_dtype([1, 2, 3])

    assert com.is_categorical_dtype(CategoricalDtype())
    assert com.is_categorical_dtype(pd.Categorical([1, 2, 3]))
    assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3]))
def reindex(self, target, method=None, level=None, limit=None,
            tolerance=None):
    """
    Create index with target's values (move/add/delete values as necessary)

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index
    """
    if method is not None:
        raise NotImplementedError("argument method is not implemented for "
                                  "CategoricalIndex.reindex")
    if level is not None:
        raise NotImplementedError("argument level is not implemented for "
                                  "CategoricalIndex.reindex")
    if limit is not None:
        raise NotImplementedError("argument limit is not implemented for "
                                  "CategoricalIndex.reindex")

    target = ibase._ensure_index(target)

    if not is_categorical_dtype(target) and not target.is_unique:
        raise ValueError("cannot reindex with a non-unique indexer")

    indexer, missing = self.get_indexer_non_unique(np.array(target))
    new_target = self.take(indexer)

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(
                np.array(target))
        else:
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            new_target = self._create_from_codes(codes)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if is_categorical_dtype(target):
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
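# Hedged usage sketch for the CategoricalIndex.reindex method above. Written
# against the pandas API of this snippet's era; exact return dtypes may vary
# between releases. It shows the fallback path: a target value that is not
# among the categories coerces the result to a plain Index and marks the miss
# with -1 in the indexer.
import numpy as np
import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c'])
new_index, indexer = ci.reindex(['b', 'c', 'd'])
print(new_index)  # Index(['b', 'c', 'd'], dtype='object')
print(indexer)    # [ 1  2 -1]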
def test_constructor_categorical_dtype(self):
    result = pd.Series(['a', 'b'],
                       dtype=CategoricalDtype(['a', 'b', 'c'],
                                              ordered=True))
    assert is_categorical_dtype(result) is True
    tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c']))
    assert result.cat.ordered

    result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
    assert is_categorical_dtype(result)
    tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
    assert result.cat.ordered is False
def na_op(x, y):
    # dispatch to the categorical if we have a categorical
    # in either operand
    if is_categorical_dtype(x):
        return op(x, y)
    elif is_categorical_dtype(y) and not is_scalar(y):
        return op(y, x)

    if is_object_dtype(x.dtype):
        result = _comp_method_OBJECT_ARRAY(op, x, y)
    else:
        # we want to compare like types
        # we only want to convert to integer like if
        # we are not NotImplemented, otherwise
        # we would allow datetime64 (but viewed as i8) against
        # integer comparisons
        if is_datetimelike_v_numeric(x, y):
            raise TypeError("invalid type comparison")

        # numpy does not like comparisons vs None
        if is_scalar(y) and isna(y):
            if name == '__ne__':
                return np.ones(len(x), dtype=bool)
            else:
                return np.zeros(len(x), dtype=bool)

        # we have a datetime/timedelta and may need to convert
        mask = None
        if (needs_i8_conversion(x) or
                (not is_scalar(y) and needs_i8_conversion(y))):
            if is_scalar(y):
                mask = isna(x)
                y = libindex.convert_scalar(x, com._values_from_object(y))
            else:
                mask = isna(x) | isna(y)
                y = y.view('i8')
            x = x.view('i8')

        try:
            with np.errstate(all='ignore'):
                result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

        if mask is not None and mask.any():
            result[mask] = masker

    return result
def wrapper(left, right, name=name, na_op=na_op):
    if isinstance(right, ABCDataFrame):
        return NotImplemented

    left, right = _align_method_SERIES(left, right)
    res_name = _get_series_op_result_name(left, right)

    if is_datetime64_dtype(left) or is_datetime64tz_dtype(left):
        result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex)
        return construct_result(left, result,
                                index=left.index, name=res_name,
                                dtype=result.dtype)

    elif is_timedelta64_dtype(left):
        result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex)
        return construct_result(left, result,
                                index=left.index, name=res_name,
                                dtype=result.dtype)

    elif is_categorical_dtype(left):
        raise TypeError("{typ} cannot perform the operation "
                        "{op}".format(typ=type(left).__name__,
                                      op=str_rep))

    lvalues = left.values
    rvalues = right
    if isinstance(rvalues, ABCSeries):
        rvalues = rvalues.values

    result = safe_na_op(lvalues, rvalues)
    return construct_result(left, result,
                            index=left.index, name=res_name, dtype=None)
def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
            copy=False, name=None, fastpath=None):

    if fastpath is not None:
        warnings.warn("The 'fastpath' keyword is deprecated, and will be "
                      "removed in a future version.",
                      FutureWarning, stacklevel=2)
        if fastpath:
            return cls._simple_new(data, name=name, dtype=dtype)

    dtype = CategoricalDtype._from_values_or_dtype(data, categories,
                                                   ordered, dtype)

    if name is None and hasattr(data, 'name'):
        name = data.name

    if not is_categorical_dtype(data):
        # don't allow scalars
        # if data is None, then categories must be provided
        if is_scalar(data):
            if data is not None or categories is None:
                cls._scalar_data_error(data)
            data = []

    data = cls._create_categorical(data, dtype=dtype)

    data = data.copy() if copy else data

    return cls._simple_new(data, name=name)
def _create_categorical(cls, data, dtype=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    dtype : CategoricalDtype, defaults to existing

    Returns
    -------
    Categorical
    """
    if (isinstance(data, (cls, ABCSeries)) and
            is_categorical_dtype(data)):
        data = data.values

    if not isinstance(data, ABCCategorical):
        return Categorical(data, dtype=dtype)

    if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
        # we want to silently ignore dtype='category'
        data = data._set_dtype(dtype)
    return data
def __new__(cls, data):
    # CombinedDatetimelikeProperties isn't really instantiated. Instead
    # we need to choose which parent (datetime or timedelta) is
    # appropriate. Since we're checking the dtypes anyway, we'll just
    # do all the validation here.
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = Series(orig.values.categories,
                      name=orig.name,
                      copy=False)

    try:
        if is_datetime64_dtype(data.dtype):
            return DatetimeProperties(data, orig)
        elif is_datetime64tz_dtype(data.dtype):
            return DatetimeProperties(data, orig)
        elif is_timedelta64_dtype(data.dtype):
            return TimedeltaProperties(data, orig)
        else:
            if is_period_arraylike(data):
                return PeriodProperties(data, orig)
            if is_datetime_arraylike(data):
                return DatetimeProperties(data, orig)
    except Exception:
        pass  # we raise an attribute error anyway

    raise AttributeError("Can only use .dt accessor with datetimelike "
                         "values")
def maybe_convert_platform_interval(values):
    """
    Try to do platform conversion, with special casing for IntervalArray.
    Wrapper around maybe_convert_platform that alters the default return
    dtype in certain cases to be compatible with IntervalArray.  For
    example, empty lists return with integer dtype instead of object
    dtype, which is prohibited for IntervalArray.

    Parameters
    ----------
    values : array-like

    Returns
    -------
    array
    """
    if isinstance(values, (list, tuple)) and len(values) == 0:
        # GH 19016
        # empty lists/tuples get object dtype by default, but object dtype
        # is prohibited for IntervalArray, so coerce to integer instead
        return np.array([], dtype=np.int64)
    elif is_categorical_dtype(values):
        values = np.asarray(values)

    return maybe_convert_platform(values)
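# Hedged sketch of the special case handled above, using numpy only: the
# GH 19016 branch returns an empty int64 array, whereas numpy's own default
# for an empty list is float64 (and object for an empty object-ish input),
# neither of which IntervalArray would accept for its edges.
import numpy as np

empty = np.array([], dtype=np.int64)  # what the GH 19016 branch returns
assert empty.dtype == np.int64 and len(empty) == 0
# numpy's default dtype for an empty list, by contrast:
assert np.array([]).dtype == np.float64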
def _is_dtype_compat(self, other):
    """
    *this is an internal non-public method*

    provide a comparison between the dtype of self and other (coercing if
    needed)

    Raises
    ------
    TypeError if the dtypes are not compatible
    """
    if is_categorical_dtype(other):
        if isinstance(other, CategoricalIndex):
            other = other._values
        if not other.is_dtype_equal(self):
            raise TypeError("categories must match existing categories "
                            "when appending")
    else:
        values = other
        if not is_list_like(values):
            values = [values]
        other = CategoricalIndex(self._create_categorical(
            self, other, categories=self.categories, ordered=self.ordered))
        if not other.isin(values).all():
            raise TypeError("cannot append a non-category item to a "
                            "CategoricalIndex")

    return other
def convert_pandas_type_to_json_field(arr, dtype=None):
    dtype = dtype or arr.dtype
    if arr.name is None:
        name = 'values'
    else:
        name = arr.name

    field = {'name': name,
             'type': as_json_table_type(dtype)}

    if is_categorical_dtype(arr):
        if hasattr(arr, 'categories'):
            cats = arr.categories
            ordered = arr.ordered
        else:
            cats = arr.cat.categories
            ordered = arr.cat.ordered
        field['constraints'] = {"enum": list(cats)}
        field['ordered'] = ordered
    elif is_period_dtype(arr):
        field['freq'] = arr.freqstr
    elif is_datetime64tz_dtype(arr):
        if hasattr(arr, 'dt'):
            field['tz'] = arr.dt.tz.zone
        else:
            field['tz'] = arr.tz.zone
    return field
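# Hedged usage sketch for the Table Schema field converter above, assuming
# a pandas era where this helper lives in the pandas.io.json table-schema
# module. A categorical Series should map to a field carrying an 'enum'
# constraint (its categories) plus its orderedness; the exact 'type' value
# ('any' for categoricals, per as_json_table_type) is hedged, not verified
# against every release.
import pandas as pd

s = pd.Series(pd.Categorical(['a', 'b', 'a'], ordered=True), name='grade')
# Expected shape of the result:
# {'name': 'grade', 'type': 'any',
#  'constraints': {'enum': ['a', 'b']}, 'ordered': True}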
def test_constructor_categorical(self):
    cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                         fastpath=True)
    res = Series(cat)
    tm.assert_categorical_equal(res.values, cat)

    # GH12574
    pytest.raises(
        ValueError, lambda: Series(pd.Categorical([1, 2, 3]),
                                   dtype='int64'))
    cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
    assert is_categorical_dtype(cat)
    assert is_categorical_dtype(cat.dtype)
    s = Series([1, 2, 3], dtype='category')
    assert is_categorical_dtype(s)
    assert is_categorical_dtype(s.dtype)
def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
    """
    This is intended to be a drop-in replacement for np.argsort which
    handles NaNs. It adds ascending and na_position parameters.
    GH #6399, #5231
    """

    # specially handle Categorical
    if is_categorical_dtype(items):
        return items.argsort(ascending=ascending)

    items = np.asanyarray(items)
    idx = np.arange(len(items))
    mask = isnull(items)
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]
    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == 'last':
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == 'first':
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError('invalid na_position: {!r}'.format(na_position))
    return indexer
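# Hedged sketch of nargsort's contract, reproduced with numpy only: NaNs are
# held out of the sort and then appended (na_position='last') or prepended
# ('first') to the argsort result, which is exactly what the function above
# does for non-categorical input.
import numpy as np

items = np.array([3.0, np.nan, 1.0, 2.0])
mask = np.isnan(items)
idx = np.arange(len(items))
indexer = idx[~mask][items[~mask].argsort()]
indexer = np.concatenate([indexer, np.nonzero(mask)[0]])  # 'last'
print(indexer)  # [2 3 0 1]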
def test_basic(self):

    assert is_categorical_dtype(self.dtype)

    factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])

    s = Series(factor, name='A')

    # dtypes
    assert is_categorical_dtype(s.dtype)
    assert is_categorical_dtype(s)
    assert not is_categorical_dtype(np.dtype('float64'))

    assert is_categorical(s.dtype)
    assert is_categorical(s)
    assert not is_categorical(np.dtype('float64'))
    assert not is_categorical(1.0)
def _concat_asobject(to_concat):
    # note: `axis` is a free variable here; this helper is nested inside
    # _concat_categorical, which supplies it from its own signature
    to_concat = [x.get_values() if is_categorical_dtype(x.dtype)
                 else x.ravel() for x in to_concat]
    res = _concat_compat(to_concat)
    if axis == 1:
        return res.reshape(1, len(res))
    else:
        return res
def test_basic(self):

    self.assertTrue(is_categorical_dtype(self.dtype))

    factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])

    s = Series(factor, name='A')

    # dtypes
    self.assertTrue(is_categorical_dtype(s.dtype))
    self.assertTrue(is_categorical_dtype(s))
    self.assertFalse(is_categorical_dtype(np.dtype('float64')))

    self.assertTrue(is_categorical(s.dtype))
    self.assertTrue(is_categorical(s))
    self.assertFalse(is_categorical(np.dtype('float64')))
    self.assertFalse(is_categorical(1.0))
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = _ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        # Numpy 1.9 support: ensure this mask is a Numpy array
        ids[np.asarray(x == bins[0])] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')

        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
def unconvert(values, dtype, compress=None):

    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in
                # many string creating functions in the capi. This case
                # should not warn even though we need to make a copy because
                # we are only copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to copying `np.fromstring`

    # Copy the bytes into a numpy array.
    buf = np.frombuffer(values, dtype=dtype)
    buf = buf.copy()  # required to not mutate the original data
    buf.flags.writeable = True
    return buf
def test_constructor_categorical_dtype(self):
    result = pd.Series(['a', 'b'],
                       dtype=CategoricalDtype(['a', 'b', 'c'],
                                              ordered=True))
    assert is_categorical_dtype(result) is True
    tm.assert_index_equal(result.cat.categories,
                          pd.Index(['a', 'b', 'c']))
    assert result.cat.ordered

    result = pd.Series(['a', 'b'], dtype=CategoricalDtype(['b', 'a']))
    assert is_categorical_dtype(result)
    tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a']))
    assert result.cat.ordered is False

    # GH 19565 - Check broadcasting of scalar with Categorical dtype
    result = Series('a', index=[0, 1],
                    dtype=CategoricalDtype(['a', 'b'], ordered=True))
    expected = Series(['a', 'a'], index=[0, 1],
                      dtype=CategoricalDtype(['a', 'b'], ordered=True))
    tm.assert_series_equal(result, expected, check_categorical=True)
def test_constructor_categorical(self):
    cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'],
                         fastpath=True)
    res = Series(cat)
    tm.assert_categorical_equal(res.values, cat)

    # can cast to a new dtype
    result = Series(pd.Categorical([1, 2, 3]), dtype='int64')
    expected = pd.Series([1, 2, 3], dtype='int64')
    tm.assert_series_equal(result, expected)

    # GH12574
    cat = Series(pd.Categorical([1, 2, 3]), dtype='category')
    assert is_categorical_dtype(cat)
    assert is_categorical_dtype(cat.dtype)
    s = Series([1, 2, 3], dtype='category')
    assert is_categorical_dtype(s)
    assert is_categorical_dtype(s.dtype)
def astype(self, dtype, copy=True):
    if is_interval_dtype(dtype):
        if copy:
            self = self.copy()
        return self
    elif is_object_dtype(dtype):
        return Index(self.values, dtype=object)
    elif is_categorical_dtype(dtype):
        from pandas import Categorical
        return Categorical(self, ordered=True)
    raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
def astype(self, dtype, copy=True):
    if is_interval_dtype(dtype):
        from pandas import IntervalIndex
        return IntervalIndex(np.array(self))
    elif is_categorical_dtype(dtype):
        # GH 18630
        dtype = self.dtype.update_dtype(dtype)
        if dtype == self.dtype:
            return self.copy() if copy else self

    return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is
    a single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation in the current implementation this
        is always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """
    # we could have object blocks and categoricals here
    # if we only have a single categorical then combine everything
    # else it's a non-compat categorical
    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

    # validate the categories
    if len(categoricals) != len(to_concat):
        pass
    else:
        # when all categories are identical
        first = to_concat[0]
        if all(first.is_dtype_equal(other) for other in to_concat[1:]):
            return union_categoricals(categoricals)

    # extract the categoricals & coerce to object if needed
    to_concat = [x.get_values() if is_categorical_dtype(x.dtype)
                 else np.asarray(x).ravel() if not is_datetime64tz_dtype(x)
                 else np.asarray(x.astype(object)) for x in to_concat]
    result = _concat_compat(to_concat)
    if axis == 1:
        result = result.reshape(1, len(result))
    return result
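# Hedged sketch of the fast path above via the public API: when every input
# shares an identical CategoricalDtype, concatenation can stay categorical,
# which is what union_categoricals (the public analogue of the branch above)
# does directly.
import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(['a', 'b'], categories=['a', 'b'])
b = pd.Categorical(['b', 'a'], categories=['a', 'b'])
combined = union_categoricals([a, b])
print(combined.categories.tolist())  # ['a', 'b']
print(list(combined))                # ['a', 'b', 'b', 'a']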
def __add__(self, other):
    other = lib.item_from_zerodim(other)
    if isinstance(other, (ABCSeries, ABCDataFrame)):
        return NotImplemented

    # scalar others
    elif other is NaT:
        result = self._add_nat()
    elif isinstance(other, (Tick, timedelta, np.timedelta64)):
        result = self._add_delta(other)
    elif isinstance(other, DateOffset):
        # specifically _not_ a Tick
        result = self._add_offset(other)
    elif isinstance(other, (datetime, np.datetime64)):
        result = self._add_datelike(other)
    elif is_integer(other):
        # This check must come after the check for np.timedelta64
        # as is_integer returns True for these
        result = self.shift(other)

    # array-like others
    elif is_timedelta64_dtype(other):
        # TimedeltaIndex, ndarray[timedelta64]
        result = self._add_delta(other)
    elif is_offsetlike(other):
        # Array/Index of DateOffset objects
        result = self._addsub_offset_array(other, operator.add)
    elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other):
        # DatetimeIndex, ndarray[datetime64]
        return self._add_datelike(other)
    elif is_integer_dtype(other):
        result = self._addsub_int_array(other, operator.add)
    elif is_float_dtype(other) or is_period_dtype(other):
        # Explicitly catch invalid dtypes
        raise TypeError("cannot add {dtype}-dtype to {cls}"
                        .format(dtype=other.dtype,
                                cls=type(self).__name__))
    elif is_categorical_dtype(other):
        # Categorical op will raise; defer explicitly
        return NotImplemented
    else:  # pragma: no cover
        return NotImplemented

    if result is NotImplemented:
        return NotImplemented
    elif not isinstance(result, Index):
        # Index.__new__ will choose appropriate subclass for dtype
        result = Index(result)
    res_name = ops.get_op_result_name(self, other)
    result.name = res_name
    return result
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
    (e.g. datetime64[ns], timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
        copy the input data

    Returns
    -------
    DelegatedClass
    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                freq='infer',
                                                ambiguous='infer'),
                                  index, data.name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'), index,
                                   name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy), index,
                                    name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'), index,
                                      name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
    """
    This is intended to be a drop-in replacement for np.argsort which
    handles NaNs. It adds ascending and na_position parameters.
    GH #6399, #5231
    """
    # specially handle Categorical
    if is_categorical_dtype(items):
        if na_position not in {'first', 'last'}:
            raise ValueError('invalid na_position: {!r}'
                             .format(na_position))

        mask = isna(items)
        cnt_null = mask.sum()
        sorted_idx = items.argsort(ascending=ascending, kind=kind)
        if ascending and na_position == 'last':
            # NaN is coded as -1 and is listed in front after sorting
            sorted_idx = np.roll(sorted_idx, -cnt_null)
        elif not ascending and na_position == 'first':
            # NaN is coded as -1 and is listed in the end after sorting
            sorted_idx = np.roll(sorted_idx, cnt_null)
        return sorted_idx

    with warnings.catch_warnings():
        # https://github.com/pandas-dev/pandas/issues/25439
        # can be removed once ExtensionArrays are properly handled by
        # nargsort
        warnings.filterwarnings(
            "ignore", category=FutureWarning,
            message="Converting timezone-aware DatetimeArray to")
        items = np.asanyarray(items)

    idx = np.arange(len(items))
    mask = isna(items)
    non_nans = items[~mask]
    non_nan_idx = idx[~mask]
    nan_idx = np.nonzero(mask)[0]
    if not ascending:
        non_nans = non_nans[::-1]
        non_nan_idx = non_nan_idx[::-1]
    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
    if not ascending:
        indexer = indexer[::-1]
    # Finally, place the NaNs at the end or the beginning according to
    # na_position
    if na_position == 'last':
        indexer = np.concatenate([indexer, nan_idx])
    elif na_position == 'first':
        indexer = np.concatenate([nan_idx, indexer])
    else:
        raise ValueError('invalid na_position: {!r}'.format(na_position))
    return indexer
def get_kwargs_from_breaks(self, breaks, closed='right'):
    """
    converts intervals in breaks format to a dictionary of kwargs
    specific to the format expected by IntervalIndex.from_tuples
    """
    if len(breaks) == 0:
        return {'data': breaks}

    tuples = lzip(breaks[:-1], breaks[1:])
    if isinstance(breaks, (list, tuple)):
        return {'data': tuples}
    elif is_categorical_dtype(breaks):
        return {'data': breaks._constructor(tuples)}
    return {'data': com._asarray_tuplesafe(tuples)}
def astype(self, dtype, copy=True):
    # TODO: Figure out something better here...
    # We have DatetimeLikeArrayMixin ->
    #     super(...), which ends up being... DatetimeIndexOpsMixin?
    # this is complicated.
    # need a pandas_astype(arr, dtype).
    from pandas import Categorical

    dtype = pandas_dtype(dtype)

    if is_object_dtype(dtype):
        return np.asarray(self, dtype=object)
    elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
        return self._format_native_types()
    elif is_integer_dtype(dtype):
        values = self._data

        if values.dtype != dtype:
            # int32 vs. int64
            values = values.astype(dtype)

        elif copy:
            values = values.copy()

        return values
    elif (is_datetime_or_timedelta_dtype(dtype) and
          not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
        # disallow conversion between datetime/timedelta,
        # and conversions for any datetimelike to float
        msg = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
    elif is_categorical_dtype(dtype):
        return Categorical(self, dtype=dtype)
    elif is_period_dtype(dtype):
        return self.asfreq(dtype.freq)
    else:
        return np.asarray(self, dtype=dtype)
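# Hedged sketch of the is_categorical_dtype branch above via the public API,
# assuming a period-dtype array of the same era (the asfreq branch suggests
# this astype belongs to a PeriodArray-like class): casting to 'category'
# routes through Categorical and keeps the period values as categories.
import pandas as pd

pi = pd.period_range('2020-01', periods=3, freq='M')
cat = pi.astype('category')
print(cat.dtype)                       # category
print(cat.categories.dtype)            # period[M]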
def test_setitem(self):
    df = DataFrame({'A': range(10)})
    s = pd.cut(df.A, 5)
    assert isinstance(s.cat.categories, IntervalIndex)

    # B & D end up as Categoricals
    # the remainder are converted to in-line objects
    # containing an IntervalIndex.values
    df['B'] = s
    df['C'] = np.array(s)
    df['D'] = s.values
    df['E'] = np.array(s.values)

    assert is_categorical_dtype(df['B'])
    assert is_interval_dtype(df['B'].cat.categories)
    assert is_categorical_dtype(df['D'])
    assert is_interval_dtype(df['D'].cat.categories)

    assert is_object_dtype(df['C'])
    assert is_object_dtype(df['E'])

    # they compare equal as Index
    # when converted to numpy objects
    c = lambda x: Index(np.array(x))
    tm.assert_index_equal(c(df.B), c(df.B), check_names=False)
    tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
    tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
    tm.assert_index_equal(c(df.B), c(df.D), check_names=False)

    # B & D are the same Series
    tm.assert_series_equal(df['B'], df['B'], check_names=False)
    tm.assert_series_equal(df['B'], df['D'], check_names=False)

    # C & E are the same Series
    tm.assert_series_equal(df['C'], df['C'], check_names=False)
    tm.assert_series_equal(df['C'], df['E'], check_names=False)
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None,
                verify_integrity=True):
    result = IntervalMixin.__new__(cls)

    closed = closed or 'right'
    left = ensure_index(left, copy=copy)
    right = ensure_index(right, copy=copy)

    if dtype is not None:
        # GH 19262: dtype must be an IntervalDtype to override inferred
        dtype = pandas_dtype(dtype)
        if not is_interval_dtype(dtype):
            msg = 'dtype must be an IntervalDtype, got {dtype}'
            raise TypeError(msg.format(dtype=dtype))
        elif dtype.subtype is not None:
            left = left.astype(dtype.subtype)
            right = right.astype(dtype.subtype)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    elif is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    if type(left) != type(right):
        msg = ('must not have differing left [{ltype}] and right '
               '[{rtype}] types')
        raise ValueError(msg.format(ltype=type(left).__name__,
                                    rtype=type(right).__name__))
    elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
        # GH 19016
        msg = ('category, object, and string subtypes are not supported '
               'for IntervalArray')
        raise TypeError(msg)
    elif isinstance(left, ABCPeriodIndex):
        msg = 'Period dtypes are not supported, use a PeriodIndex instead'
        raise ValueError(msg)
    elif (isinstance(left, ABCDatetimeIndex) and
            str(left.tz) != str(right.tz)):
        msg = ("left and right must have the same time zone, got "
               "'{left_tz}' and '{right_tz}'")
        raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz))

    result._left = left
    result._right = right
    result._closed = closed
    if verify_integrity:
        result._validate()
    return result
def __init__(
    self,
    data: Union[np.ndarray, pd.Series, pd.DataFrame],
    stats: Sequence[str] = None,
    *,
    numeric: bool = True,
    categorical: bool = True,
    alpha: float = 0.05,
    use_t: bool = False,
    percentiles: Sequence[Union[int, float]] = PERCENTILES,
    ntop: int = 5,
):
    data_arr = data
    if not isinstance(data, (pd.Series, pd.DataFrame)):
        data_arr = array_like(data, "data", maxdim=2)
    if data_arr.ndim == 1:
        data = pd.Series(data)
    numeric = bool_like(numeric, "numeric")
    categorical = bool_like(categorical, "categorical")
    include = []
    col_types = ""
    if numeric:
        include.append(np.number)
        col_types = "numeric"
    if categorical:
        include.append("category")
        col_types += " and " if col_types != "" else ""
        col_types += "categorical"
    if not numeric and not categorical:
        raise ValueError(
            "At least one of numeric and categorical must be True")
    self._data = pd.DataFrame(data).select_dtypes(include)
    if self._data.shape[1] == 0:
        raise ValueError(
            f"Selecting {col_types} results in an empty DataFrame")
    self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
    self._is_cat_like = [
        is_categorical_dtype(dt) for dt in self._data.dtypes
    ]

    if stats is not None:
        undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
        if undef:
            raise ValueError(
                f"{', '.join(undef)} are not known statistics")
    self._stats = (list(DEFAULT_STATISTICS) if stats is None
                   else list(stats))
    self._ntop = int_like(ntop, "ntop")
    self._compute_top = "top" in self._stats
    self._compute_freq = "freq" in self._stats
    if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
        raise ValueError("top must be a non-negative integer")

    # Expand special stats
    replacements = {
        "mode": ["mode", "mode_freq"],
        "ci": ["upper_ci", "lower_ci"],
        "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
        "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
        "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
    }

    for key in replacements:
        if key in self._stats:
            idx = self._stats.index(key)
            self._stats = (self._stats[:idx] + replacements[key]
                           + self._stats[idx + 1:])

    self._percentiles = array_like(percentiles, "percentiles",
                                   maxdim=1, dtype="d")
    self._percentiles = np.sort(self._percentiles)
    if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
        raise ValueError("percentiles must be distinct")
    if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
        raise ValueError("percentiles must be strictly between 0 and 100")
    self._alpha = float_like(alpha, "alpha")
    if not 0 < alpha < 1:
        raise ValueError("alpha must be strictly between 0 and 1")
    self._use_t = bool_like(use_t, "use_t")
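# Hedged usage sketch for the constructor above, assuming it belongs to
# statsmodels' Description class (statsmodels.stats.descriptivestats); the
# .frame attribute holding the summary table is an assumption about that
# API, not guaranteed by the snippet itself. Mixed numeric/categorical
# frames are filtered with select_dtypes as shown in the body.
import numpy as np
import pandas as pd
from statsmodels.stats.descriptivestats import Description

df = pd.DataFrame({
    'x': np.arange(10.0),
    'g': pd.Categorical(list('ababababab')),
})
desc = Description(df)       # numeric and categorical both included
print(desc.frame.shape[1])   # 2 columns summarized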
def _map_values(self, mapper, na_action=None):
    """
    An internal function that maps values using the input correspondence
    (which can be a dict, Series, or function).

    Parameters
    ----------
    mapper : function, dict, or Series
        The input correspondence object
    na_action : {None, 'ignore'}
        If 'ignore', propagate NA values, without passing them to the
        mapping function

    Returns
    -------
    Union[Index, MultiIndex], inferred
        The output of the mapping function applied to the index.
        If the function returns a tuple with more than one element
        a MultiIndex will be returned.
    """
    # we can fastpath dict/Series to an efficient map
    # as we know that we are not going to have to yield
    # python types
    if is_dict_like(mapper):
        if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
            # If a dictionary subclass defines a default value method,
            # convert mapper to a lookup function (GH #15999).
            dict_with_default = mapper
            mapper = lambda x: dict_with_default[x]
        else:
            # Dictionary does not have a default. Thus it's safe to
            # convert to a Series for efficiency.
            # we specify the keys here to handle the
            # possibility that they are tuples

            # The return value of mapping with an empty mapper is
            # expected to be pd.Series(np.nan, ...). As np.nan is
            # of dtype float64 the return value of this method should
            # be float64 as well
            mapper = create_series_with_explicit_dtype(
                mapper, dtype_if_empty=np.float64
            )

    if isinstance(mapper, ABCSeries):
        # Since values were input this means we came from either
        # a dict or a series and mapper should be an index
        if is_categorical_dtype(self._values):
            # use the built in categorical series mapper which saves
            # time by mapping the categories instead of all values
            return self._values.map(mapper)
        if is_extension_array_dtype(self.dtype):
            values = self._values
        else:
            values = self.values

        indexer = mapper.index.get_indexer(values)
        new_values = algorithms.take_1d(mapper._values, indexer)

        return new_values

    # we must convert to python types
    if is_extension_array_dtype(self.dtype) and hasattr(
            self._values, "map"):
        # GH#23179 some EAs do not have `map`
        values = self._values
        if na_action is not None:
            raise NotImplementedError
        map_f = lambda values, f: values.map(f)
    else:
        values = self.astype(object)
        values = getattr(values, "values", values)
        if na_action == "ignore":

            def map_f(values, f):
                return lib.map_infer_mask(values, f,
                                          isna(values).view(np.uint8))

        else:
            map_f = lib.map_infer

    # mapper is a function
    new_values = map_f(values, mapper)

    return new_values
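# Hedged sketch of the categorical fast path above via the public Series.map
# API: with a dict mapper, a categorical Series maps only its categories
# rather than every element, but the observable result is the same as an
# elementwise map.
import pandas as pd

s = pd.Series(['cat', 'dog', 'cat'], dtype='category')
print(s.map({'cat': 'kitten', 'dog': 'puppy'}).tolist())
# ['kitten', 'puppy', 'kitten']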
def test_slicing_and_getting_ops(self):
    # systematically test the slicing operations:
    #  for all slicing ops:
    #   - returning a dataframe
    #   - returning a column
    #   - returning a row
    #   - returning a single value

    cats = Categorical(["a", "c", "b", "c", "c", "c", "c"],
                       categories=["a", "b", "c"])
    idx = Index(["h", "i", "j", "k", "l", "m", "n"])
    values = [1, 2, 3, 4, 5, 6, 7]
    df = DataFrame({"cats": cats, "values": values}, index=idx)

    # the expected values
    cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
    idx2 = Index(["j", "k"])
    values2 = [3, 4]

    # 2:4,: | "j":"k",:
    exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)

    # :,"cats" | :,0
    exp_col = Series(cats, index=idx, name='cats')

    # "j",: | 2,:
    exp_row = Series(["b", 3], index=["cats", "values"], dtype="object",
                     name="j")

    # "j","cats" | 2,0
    exp_val = "b"

    # iloc
    # frame
    res_df = df.iloc[2:4, :]
    tm.assert_frame_equal(res_df, exp_df)
    assert is_categorical_dtype(res_df["cats"])

    # row
    res_row = df.iloc[2, :]
    tm.assert_series_equal(res_row, exp_row)
    assert isinstance(res_row["cats"], str)

    # col
    res_col = df.iloc[:, 0]
    tm.assert_series_equal(res_col, exp_col)
    assert is_categorical_dtype(res_col)

    # single value
    res_val = df.iloc[2, 0]
    assert res_val == exp_val

    # loc
    # frame
    res_df = df.loc["j":"k", :]
    tm.assert_frame_equal(res_df, exp_df)
    assert is_categorical_dtype(res_df["cats"])

    # row
    res_row = df.loc["j", :]
    tm.assert_series_equal(res_row, exp_row)
    assert isinstance(res_row["cats"], str)

    # col
    res_col = df.loc[:, "cats"]
    tm.assert_series_equal(res_col, exp_col)
    assert is_categorical_dtype(res_col)

    # single value
    res_val = df.loc["j", "cats"]
    assert res_val == exp_val

    # ix
    # frame
    # res_df = df.loc["j":"k",[0,1]] # doesn't work?
    res_df = df.loc["j":"k", :]
    tm.assert_frame_equal(res_df, exp_df)
    assert is_categorical_dtype(res_df["cats"])

    # row
    res_row = df.loc["j", :]
    tm.assert_series_equal(res_row, exp_row)
    assert isinstance(res_row["cats"], str)

    # col
    res_col = df.loc[:, "cats"]
    tm.assert_series_equal(res_col, exp_col)
    assert is_categorical_dtype(res_col)

    # single value
    res_val = df.loc["j", df.columns[0]]
    assert res_val == exp_val

    # iat
    res_val = df.iat[2, 0]
    assert res_val == exp_val

    # at
    res_val = df.at["j", "cats"]
    assert res_val == exp_val

    # fancy indexing
    exp_fancy = df.iloc[[2]]

    res_fancy = df[df["cats"] == "b"]
    tm.assert_frame_equal(res_fancy, exp_fancy)
    res_fancy = df[df["values"] == 3]
    tm.assert_frame_equal(res_fancy, exp_fancy)

    # get_value
    res_val = df.at["j", "cats"]
    assert res_val == exp_val

    # i : int, slice, or sequence of integers
    res_row = df.iloc[2]
    tm.assert_series_equal(res_row, exp_row)
    assert isinstance(res_row["cats"], str)

    res_df = df.iloc[slice(2, 4)]
    tm.assert_frame_equal(res_df, exp_df)
    assert is_categorical_dtype(res_df["cats"])

    res_df = df.iloc[[2, 3]]
    tm.assert_frame_equal(res_df, exp_df)
    assert is_categorical_dtype(res_df["cats"])

    res_col = df.iloc[:, 0]
    tm.assert_series_equal(res_col, exp_col)
    assert is_categorical_dtype(res_col)

    res_df = df.iloc[:, slice(0, 2)]
    tm.assert_frame_equal(res_df, df)
    assert is_categorical_dtype(res_df["cats"])

    res_df = df.iloc[:, [0, 1]]
    tm.assert_frame_equal(res_df, df)
    assert is_categorical_dtype(res_df["cats"])
def get_reindexed_values(self, empty_dtype: DtypeObj,
                         upcasted_na) -> ArrayLike:
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            blk_dtype = getattr(self.block, "dtype", None)

            if blk_dtype == np.dtype(object):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype(
                empty_dtype
            ):
                if self.block is None:
                    # TODO(EA2D): special case unneeded with 2D EAs
                    i8values = np.full(self.shape[1], fill_value.value)
                    return DatetimeArray(i8values, dtype=empty_dtype)
            elif is_categorical_dtype(blk_dtype):
                pass
            elif is_extension_array_dtype(blk_dtype):
                pass
            elif is_extension_array_dtype(empty_dtype):
                missing_arr = (
                    empty_dtype.construct_array_type()._from_sequence(
                        [], dtype=empty_dtype
                    )
                )
                ncols, nrows = self.shape
                assert ncols == 1, ncols
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            else:
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()

    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax,
                                   fill_value=fill_value)

    return values
def _bins_to_cuts(
    x,
    bins,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
):
    if duplicates not in ["raise", "drop"]:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the "
                f"'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    side = "left" if right else "right"
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument")
        elif labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of "
                    "bin edges"
                )
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
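# Hedged sketch of the labels handling above through the public pd.cut API:
# custom labels are wrapped into an ordered Categorical, and a value falling
# outside all bins comes back as NaN via the na_mask path.
import pandas as pd

out = pd.cut([1, 5, 9, 20], bins=[0, 3, 6, 10],
             labels=['lo', 'mid', 'hi'])
print(list(out))   # ['lo', 'mid', 'hi', nan]
print(out.dtype)   # category (ordered labels lo < mid < hi)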
def assert_index_equal(
    left: Index,
    right: Index,
    exact: bool | str = "equiv",
    check_names: bool = True,
    check_less_precise: bool | int | NoDefault = no_default,
    check_exact: bool = True,
    check_categorical: bool = True,
    check_order: bool = True,
    rtol: float = 1.0e-5,
    atol: float = 1.0e-8,
    obj: str = "Index",
) -> None:
    """
    Check that left and right Index are equal.

    Parameters
    ----------
    left : Index
    right : Index
    exact : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical. If 'equiv', then RangeIndex can be substituted
        for Int64Index as well.
    check_names : bool, default True
        Whether to check the names attribute.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is
        False. 5 digits (False) or 3 digits (True) after decimal points
        are compared. If int, then specify the digits to compare.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_exact : bool, default True
        Whether to compare number exactly.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_order : bool, default True
        Whether to compare the order of index entries as well as their
        values. If True, both indexes must contain the same elements, in
        the same order. If False, both indexes must contain the same
        elements, but in any order.

        .. versionadded:: 1.2.0
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Index'
        Specify object name being compared, internally used to show
        appropriate assertion message.

    Examples
    --------
    >>> from pandas.testing import assert_index_equal
    >>> a = pd.Index([1, 2, 3])
    >>> b = pd.Index([1, 2, 3])
    >>> assert_index_equal(a, b)
    """
    __tracebackhide__ = True

    def _check_types(left, right, obj="Index") -> None:
        if not exact:
            return

        assert_class_equal(left, right, exact=exact, obj=obj)

        # Skip exact dtype checking when `check_categorical` is False
        if check_categorical:
            assert_attr_equal("dtype", left, right, obj=obj)
            if is_categorical_dtype(left.dtype) and is_categorical_dtype(
                right.dtype
            ):
                assert_index_equal(left.categories, right.categories,
                                   exact=exact)

        # allow string-like to have different inferred_types
        if left.inferred_type in ("string"):
            assert right.inferred_type in ("string")
        else:
            assert_attr_equal("inferred_type", left, right, obj=obj)

    def _get_ilevel_values(index, level):
        # accept level number only
        unique = index.levels[level]
        level_codes = index.codes[level]
        filled = take_nd(unique._values, level_codes,
                         fill_value=unique._na_value)
        return unique._shallow_copy(filled, name=index.names[level])

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this "
            "warning.",
            FutureWarning,
            stacklevel=2,
        )
        # error: Argument 1 to "_get_tol_from_less_precise" has
        # incompatible type "Union[bool, int, NoDefault]"; expected
        # "Union[bool, int]"
        rtol = atol = _get_tol_from_less_precise(
            check_less_precise  # type: ignore[arg-type]
        )

    # instance validation
    _check_isinstance(left, right, Index)

    # class / dtype comparison
    _check_types(left, right, obj=obj)

    # level comparison
    if left.nlevels != right.nlevels:
        msg1 = f"{obj} levels are different"
        msg2 = f"{left.nlevels}, {left}"
        msg3 = f"{right.nlevels}, {right}"
        raise_assert_detail(obj, msg1, msg2, msg3)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{obj} length are different"
        msg2 = f"{len(left)}, {left}"
        msg3 = f"{len(right)}, {right}"
        raise_assert_detail(obj, msg1, msg2, msg3)

    # If order doesn't matter then sort the index entries
    if not check_order:
        left = Index(safe_sort(left))
        right = Index(safe_sort(right))

    # MultiIndex special comparison for little-friendly error messages
    if left.nlevels > 1:
        left = cast(MultiIndex, left)
        right = cast(MultiIndex, right)

        for level in range(left.nlevels):
            # cannot use get_level_values here because it can change dtype
            llevel = _get_ilevel_values(left, level)
            rlevel = _get_ilevel_values(right, level)

            lobj = f"MultiIndex level [{level}]"
            assert_index_equal(
                llevel,
                rlevel,
                exact=exact,
                check_names=check_names,
                check_exact=check_exact,
                rtol=rtol,
                atol=atol,
                obj=lobj,
            )
            # get_level_values may change dtype
            _check_types(left.levels[level], right.levels[level], obj=obj)

    # skip exact index checking when `check_categorical` is False
    if check_exact and check_categorical:
        if not left.equals(right):
            diff = (np.sum((left._values != right._values).astype(int))
                    * 100.0 / len(left))
            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
            raise_assert_detail(obj, msg, left, right)
    else:
        # if we have "equiv", this becomes True
        exact_bool = bool(exact)
        _testing.assert_almost_equal(
            left.values,
            right.values,
            rtol=rtol,
            atol=atol,
            check_dtype=exact_bool,
            obj=obj,
            lobj=left,
            robj=right,
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("names", left, right, obj=obj)
    if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex):
        assert_attr_equal("freq", left, right, obj=obj)
    if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex):
        assert_interval_array_equal(left._values, right._values)

    if check_categorical:
        if is_categorical_dtype(left.dtype) or is_categorical_dtype(
            right.dtype
        ):
            assert_categorical_equal(left._values, right._values,
                                     obj=f"{obj} category")
def _cython_operation(self, kind, values, how, axis, min_count=-1,
                      **kwargs):
    assert kind in ['transform', 'aggregate']

    # can we do this operation with our cython functions
    # if not raise NotImplementedError

    # we raise NotImplemented if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not setup for dim transforming
    if is_categorical_dtype(values):
        raise NotImplementedError(
            "categoricals are not supported in cython ops ATM")
    elif is_datetime64_any_dtype(values):
        if how in ['add', 'prod', 'cumsum', 'cumprod']:
            raise NotImplementedError(
                "datetime64 type does not support {} "
                "operations".format(how))
    elif is_timedelta64_dtype(values):
        if how in ['prod', 'cumprod']:
            raise NotImplementedError(
                "timedelta64 type does not support {} "
                "operations".format(how))

    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            values = values.swapaxes(0, axis)
        if arity > 1:
            raise NotImplementedError("arity of more than 1 is not "
                                      "supported for the 'how' argument")
        out_shape = (self.ngroups, ) + values.shape[1:]

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    if is_datetimelike:
        values = values.view('int64')
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int64_or_float64(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    try:
        func = self._get_cython_function(kind, how, values, is_numeric)
    except NotImplementedError:
        if is_numeric:
            values = ensure_float64(values)
            func = self._get_cython_function(kind, how, values, is_numeric)
        else:
            raise

    if how == 'rank':
        out_dtype = 'float'
    else:
        if is_numeric:
            out_dtype = '%s%d' % (values.dtype.kind,
                                  values.dtype.itemsize)
        else:
            out_dtype = 'object'

    labels, _, _ = self.group_info

    if kind == 'aggregate':
        result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                             fill_value=np.nan)
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(result, counts, values, labels, func,
                                 is_numeric, is_datetimelike, min_count)
    elif kind == 'transform':
        result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                             fill_value=np.nan)

        # TODO: min_count
        result = self._transform(result, values, labels, func, is_numeric,
                                 is_datetimelike, **kwargs)

    if is_integer_dtype(result) and not is_datetimelike:
        mask = result == iNaT
        if mask.any():
            result = result.astype('float64')
            result[mask] = np.nan

    if (kind == 'aggregate' and
            self._filter_empty_groups and not counts.all()):
        if result.ndim == 2:
            try:
                result = lib.row_bool_subset(
                    result, (counts > 0).view(np.uint8))
            except ValueError:
                result = lib.row_bool_subset_object(
                    ensure_object(result),
                    (counts > 0).view(np.uint8))
        else:
            result = result[counts > 0]

    if vdim == 1 and arity == 1:
        result = result[:, 0]

    if how in self._name_functions:
        # TODO
        names = self._name_functions[how]()
    else:
        names = None

    if swapped:
        result = result.swapaxes(0, axis)

    return result, names
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_series_type=True,
    check_less_precise=no_default,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="Series",
    *,
    check_index=True,
):
    """
    Check that left and right Series are equal.

    Parameters
    ----------
    left : Series
    right : Series
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
        Whether to check the Series class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is
        False. 5 digits (False) or 3 digits (True) after decimal points
        are compared. If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude
        less than 1e-5, we compare the two numbers directly and check
        whether they are equivalent within the specified precision.
        Otherwise, we compare the **ratio** of the second number to the
        first number and check whether it is equivalent to 1 within the
        specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.

        .. versionadded:: 1.0.2
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or
        TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.

        .. versionadded:: 1.2.0
    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Series'
        Specify object name being compared, internally used to show
        appropriate assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only
        values.

        .. versionadded:: 1.3.0

    Examples
    --------
    >>> from pandas.testing import assert_series_equal
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b = pd.Series([1, 2, 3, 4])
    >>> assert_series_equal(a, b)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this "
            "warning.",
            FutureWarning,
            stacklevel=2,
        )
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, Series)

    if check_series_type:
        assert_class_equal(left, right, obj=obj)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{len(left)}, {left.index}"
        msg2 = f"{len(right)}, {right.index}"
        raise_assert_detail(obj, "Series length are different", msg1, msg2)

    if check_flags:
        assert left.flags == right.flags, \
            f"{repr(left.flags)} != {repr(right.flags)}"

    if check_index:
        # GH #38183
        assert_index_equal(
            left.index,
            right.index,
            exact=check_index_type,
            check_names=check_names,
            check_exact=check_exact,
            check_categorical=check_categorical,
            rtol=rtol,
            atol=atol,
            obj=f"{obj}.index",
        )

    if check_freq and isinstance(left.index,
                                 (DatetimeIndex, TimedeltaIndex)):
        lidx = left.index
        ridx = right.index
        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)

    if check_dtype:
        # We want to skip exact dtype checking when `check_categorical`
        # is False. We'll still raise if only one is a `Categorical`,
        # regardless of `check_categorical`
        if (is_categorical_dtype(left.dtype)
                and is_categorical_dtype(right.dtype)
                and not check_categorical):
            pass
        else:
            assert_attr_equal("dtype", left, right,
                              obj=f"Attributes of {obj}")

    if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(
        right.dtype
    ):
        left_values = left._values
        right_values = right._values
        # Only check exact if dtype is numeric
        if isinstance(left_values, ExtensionArray) and isinstance(
            right_values, ExtensionArray
        ):
            assert_extension_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                index_values=np.asarray(left.index),
            )
        else:
            assert_numpy_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                obj=str(obj),
                index_values=np.asarray(left.index),
            )
    elif check_datetimelike_compat and (
        needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
    ):
        # we want to check only if we have compat dtypes
        # e.g. integer and M|m are NOT compat, but we can simply check
        # the values in that case

        # datetimelike may have different objects (e.g. datetime.datetime
        # vs Timestamp) but will compare equal
        if not Index(left._values).equals(Index(right._values)):
            msg = (f"[datetimelike_compat=True] {left._values} "
                   f"is not equal to {right._values}.")
            raise AssertionError(msg)
    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
        assert_interval_array_equal(left.array, right.array)
    elif is_categorical_dtype(left.dtype) or is_categorical_dtype(
        right.dtype
    ):
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(
        right.dtype
    ):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype_and_needs_i8_conversion(
        left.dtype, right.dtype
    ) or is_extension_array_dtype_and_needs_i8_conversion(
        right.dtype, left.dtype
    ):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(
        right.dtype
    ):
        # DatetimeArray or TimedeltaArray
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    else:
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("name", left, right, obj=obj)

    if check_categorical:
        if is_categorical_dtype(left.dtype) or is_categorical_dtype(
            right.dtype
        ):
            assert_categorical_equal(
                left._values,
                right._values,
                obj=f"{obj} category",
                check_category_order=check_category_order,
            )
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is
        more efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """
    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to
    # the hash values. (This check is above the complex check so that we
    # don't ask numpy if categorical is a subdtype of complex, as it will
    # choke.)
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    elif np.issubdtype(dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we
    # can manage it.
    # (the original `isinstance(dtype, np.bool)` never matches a dtype
    # object; compare against the bool dtype instead)
    elif dtype == np.bool_:
        vals = vals.astype('u8')
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view('i8').astype('u8', copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object
        # dtypes, then hash and rename categories. We allow skipping the
        # categorization when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
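# Hedged usage sketch for hash_array above, assuming the public location
# pandas.util.hash_array from the same era. The categorize flag is intended
# as a speed optimization only: hashing the categories and remapping codes
# is designed to produce the same 64-bit hashes as hashing the objects
# directly, so equal values must hash equally either way.
import numpy as np
from pandas.util import hash_array

vals = np.array(['a', 'b', 'a'], dtype=object)
h1 = hash_array(vals, categorize=True)
h2 = hash_array(vals, categorize=False)
assert (h1 == h2).all()
assert h1[0] == h1[2]  # identical values, identical hashes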
def _cast_types(self, values, cast_type, column): """ Cast values to specified type Parameters ---------- values : ndarray cast_type : string or np.dtype dtype to cast values to column : string column name - used only for error reporting Returns ------- converted : ndarray """ if is_categorical_dtype(cast_type): known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) if not is_object_dtype(values) and not known_cats: # TODO: this is for consistency with # c-parser which parses all categories # as strings values = astype_nansafe(values, np.dtype(str)) cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( cats, cats.get_indexer(values), cast_type, true_values=self.true_values) # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): # ensure cast_type is an actual dtype and not a string cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: if is_bool_dtype(cast_type): return array_type._from_sequence_of_strings( values, dtype=cast_type, true_values=self.true_values, false_values=self.false_values, ) else: return array_type._from_sequence_of_strings( values, dtype=cast_type) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " "_from_sequence_of_strings in order to be used in parser methods" ) from err else: try: values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err return values
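An illustrative example (not from the source) of the parser path above: passing a `CategoricalDtype` with explicit categories to `read_csv` makes `_cast_types` build the Categorical from the inferred string values.

from io import StringIO
import pandas as pd

csv = StringIO("col\nx\ny\nx\n")
dtype = pd.CategoricalDtype(categories=["x", "y", "z"])
df = pd.read_csv(csv, dtype={"col": dtype})
print(df["col"].dtype)                  # category
print(list(df["col"].cat.categories))   # ['x', 'y', 'z'], including unseen 'z'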
def wrapper(self, other, axis=None): # Validate the axis parameter if axis is not None: self._get_axis_number(axis) res_name = get_op_result_name(self, other) other = lib.item_from_zerodim(other) # TODO: shouldn't we be applying finalize whenever # not isinstance(other, ABCSeries)? finalizer = (lambda x: x.__finalize__(self) if isinstance(other, (np.ndarray, ABCIndexClass)) else x) if isinstance(other, list): # TODO: same for tuples? other = np.asarray(other) if isinstance(other, ABCDataFrame): # pragma: no cover # Defer to DataFrame implementation; fail early return NotImplemented if isinstance(other, ABCSeries) and not self._indexed_same(other): raise ValueError( "Can only compare identically-labeled Series objects") elif (is_list_like(other) and len(other) != len(self) and not isinstance(other, (set, frozenset))): raise ValueError("Lengths must match") elif isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): # TODO: make this treatment consistent across ops and classes. # We are not catching all listlikes here (e.g. frozenset, tuple) # The ambiguous case is object-dtype. See GH#27803 if len(self) != len(other): raise ValueError("Lengths must match to compare") if is_categorical_dtype(self): # Dispatch to Categorical implementation; CategoricalIndex # behavior is non-canonical GH#19513 res_values = dispatch_to_extension_op(op, self, other) elif is_datetime64_dtype(self) or is_datetime64tz_dtype(self): # Dispatch to DatetimeIndex to ensure identical # Series/Index behavior from pandas.core.arrays import DatetimeArray res_values = dispatch_to_extension_op(op, DatetimeArray(self), other) elif is_timedelta64_dtype(self): from pandas.core.arrays import TimedeltaArray res_values = dispatch_to_extension_op(op, TimedeltaArray(self), other) elif is_extension_array_dtype(self) or (is_extension_array_dtype(other) and not is_scalar(other)): # Note: the `not is_scalar(other)` condition rules out # e.g. other == "category" res_values = dispatch_to_extension_op(op, self, other) elif is_scalar(other) and isna(other): # numpy does not like comparisons vs None if op is operator.ne: res_values = np.ones(len(self), dtype=bool) else: res_values = np.zeros(len(self), dtype=bool) else: lvalues = extract_array(self, extract_numpy=True) rvalues = extract_array(other, extract_numpy=True) with np.errstate(all="ignore"): res_values = na_op(lvalues, rvalues) if is_scalar(res_values): raise TypeError( "Could not compare {typ} type with Series".format( typ=type(other))) result = self._constructor(res_values, index=self.index) # rename is needed in case res_name is None and result.name # is not. return finalizer(result).rename(res_name)
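A short sketch (illustrative) of the dispatch above: comparisons on a categorical Series defer to the Categorical implementation, so equality works but inequalities on unordered categories raise.

import pandas as pd

s = pd.Series(["a", "b", "a"], dtype="category")
print((s == "a").tolist())   # [True, False, True]
try:
    s < "b"                  # unordered categorical rejects ordering ops
except TypeError as err:
    print(err)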
def __sub__(self, other): from pandas import Index other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): result = self._addsub_int_array(other, operator.sub) elif isinstance(other, Index): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_categorical_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
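An illustrative walk through a few of the branches above using public datetime indexes (behaviour as of the pandas version this snippet targets):

import pandas as pd

dti = pd.date_range("2021-01-01", periods=3, freq="D")
print(dti - pd.Timedelta("1D"))   # timedelta-like other -> shifted DatetimeIndex
print(dti - dti[0])               # datetime-like other -> TimedeltaIndex
try:
    dti - pd.Index([1.5, 2.5, 3.5])   # float dtype is explicitly rejected
except TypeError as err:
    print(err)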
def __init__(self, index, grouper=None, obj=None, name=None, level=None, sort=True, observed=False, in_axis=False): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name if isinstance(grouper, MultiIndex): self.grouper = grouper.values # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError('Level %s not in index' % str(level)) level = index.names.index(level) if self.name is None: self.name = index.names[level] self.grouper, self._labels, self._group_index = \ index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get labels elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper else: if self.grouper is None and self.name is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): from pandas.core.groupby.categorical import recode_for_groupby self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._labels = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError("Grouper for '%s' not 1-dimensional" % t) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): errmsg = ('Grouper result violates len(labels) == ' 'len(data)\nresult: %s' % pprint_thing(self.grouper)) self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like if getattr(self.grouper, 'dtype', None) is not None: if is_datetime64_dtype(self.grouper): from pandas import to_datetime self.grouper = to_datetime(self.grouper) elif is_timedelta64_dtype(self.grouper): from pandas import to_timedelta self.grouper = to_timedelta(self.grouper)
def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.dtype(np.float64), np.nan if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): upcast_cls = "object" elif is_datetime64_dtype(dtype): upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif "category" in upcast_classes: return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0], NaT elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.dtype(np.float64), np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _cmp_method(self, other, op):
    # ensure pandas array for list-like and eliminate non-interval scalars
    if is_list_like(other):
        if len(self) != len(other):
            raise ValueError("Lengths must match to compare")
        other = array(other)
    elif not isinstance(other, Interval):
        # non-interval scalar -> no matches
        return invalid_comparison(self, other, op)

    # determine the dtype of the elements we want to compare
    if isinstance(other, Interval):
        other_dtype = pandas_dtype("interval")
    elif not is_categorical_dtype(other.dtype):
        other_dtype = other.dtype
    else:
        # for categorical defer to categories for dtype
        other_dtype = other.categories.dtype

        # extract intervals if we have interval categories with matching closed
        if is_interval_dtype(other_dtype):
            if self.closed != other.categories.closed:
                return invalid_comparison(self, other, op)
            other = other.categories.take(
                other.codes, allow_fill=True,
                fill_value=other.categories._na_value)

    # interval-like -> need same closed and matching endpoints
    if is_interval_dtype(other_dtype):
        if self.closed != other.closed:
            return invalid_comparison(self, other, op)
        elif not isinstance(other, Interval):
            other = type(self)(other)

        if op is operator.eq:
            return (self._left == other.left) & (self._right == other.right)
        elif op is operator.ne:
            return (self._left != other.left) | (self._right != other.right)
        elif op is operator.gt:
            return (self._left > other.left) | (
                (self._left == other.left) & (self._right > other.right))
        elif op is operator.ge:
            return (self == other) | (self > other)
        elif op is operator.lt:
            return (self._left < other.left) | (
                (self._left == other.left) & (self._right < other.right))
        else:
            # operator.le
            return (self == other) | (self < other)

    # non-interval/non-object dtype -> no matches
    if not is_object_dtype(other_dtype):
        return invalid_comparison(self, other, op)

    # object dtype -> iteratively check for intervals
    result = np.zeros(len(self), dtype=bool)
    for i, obj in enumerate(other):
        try:
            result[i] = op(self[i], obj)
        except TypeError:
            if obj is NA:
                # comparison with np.nan returns NA
                # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092
                result[i] = op is operator.ne
            else:
                raise
    return result
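A sketch (illustrative) of the endpoint logic above: an Interval scalar only matches elements with the same `closed`; otherwise `invalid_comparison` yields all-False for `==`.

import pandas as pd

arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2])       # (0, 1], (1, 2]
print((arr == pd.Interval(0, 1)).tolist())                 # [True, False]
print((arr == pd.Interval(0, 1, closed="left")).tolist())  # [False, False]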
def get_grouper( obj: FrameOrSeries, key=None, axis: int = 0, level=None, sort: bool = True, observed: bool = False, mutated: bool = False, validate: bool = True, dropna: bool = True, ) -> Tuple["ops.BaseGrouper", Set[Label], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values. If validate, then check for key/level overlaps. """ group_axis = obj._get_axis(axis) # validate that the passed single level is compatible with the passed # axis of the object if level is not None: # TODO: These if-block and else-block are almost same. # MultiIndex instance check is removable, but it seems that there are # some processes only for non-MultiIndex in else-block, # eg. `obj.index.name != level`. We have to consider carefully whether # these are applicable for MultiIndex. Even if these are applicable, # we need to check if it makes no side effect to subsequent processes # on the outside of this condition. # (GH 17621) if isinstance(group_axis, MultiIndex): if is_list_like(level) and len(level) == 1: level = level[0] if key is None and is_scalar(level): # Get the level values from group_axis key = group_axis.get_level_values(level) level = None else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 if is_list_like(level): nlevels = len(level) if nlevels == 1: level = level[0] elif nlevels == 0: raise ValueError("No group keys passed!") else: raise ValueError( "multiple levels only valid with MultiIndex") if isinstance(level, str): if obj._get_axis(axis).name != level: raise ValueError(f"level name {level} is not the name " f"of the {obj._get_axis_name(axis)}") elif level > 0 or level < -1: raise ValueError( "level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. level = None key = group_axis # a passed-in Grouper, directly convert if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, set(), obj else: return grouper, {key.key}, obj # already have a BaseGrouper, just return it elif isinstance(key, ops.BaseGrouper): return key, set(), obj if not isinstance(key, list): keys = [key] match_axis_length = False else: keys = key match_axis_length = len(keys) == len(group_axis) # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) # is this an index replacement? 
if (not any_callable and not any_arraylike and not any_groupers and match_axis_length and level is None): if isinstance(obj, DataFrame): all_in_columns_index = all(g in obj.columns or g in obj.index.names for g in keys) else: assert isinstance(obj, Series) all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): if key is None: keys = [None] * len(level) levels = level else: levels = [level] * len(keys) groupings: List[Grouping] = [] exclusions: Set[Label] = set() # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: if not _is_label_like(key): # items -> .columns for DataFrame, .index for Series items = obj.axes[-1] try: items.get_loc(key) except (KeyError, TypeError, InvalidIndexError): # TypeError shows up here if we pass e.g. Int64Index return False return True # if the grouper is obj[name] def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: return gpr is obj[gpr.name] except (KeyError, IndexError): # IndexError reached in e.g. test_skip_group_keys when we pass # lambda here return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.add(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.add(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.add(gpr.key) in_axis, name = False, None else: in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " "must be same length") # create the Grouping # allow us to passing the actual Grouping as the gpr ping = (Grouping( group_axis, gpr, obj=obj, name=name, level=level, sort=sort, observed=observed, in_axis=in_axis, dropna=dropna, ) if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") elif len(groupings) == 0: groupings.append( Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna) return grouper, exclusions, obj
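An illustrative example of the categorical-grouper path above: the grouper length must match the axis, and `observed=` decides whether unused categories still produce (empty) groups.

import pandas as pd

df = pd.DataFrame({
    "key": pd.Categorical(["a", "a", "b"], categories=["a", "b", "c"]),
    "val": [1, 2, 3],
})
print(df.groupby("key", observed=False)["val"].sum())  # includes empty 'c'
print(df.groupby("key", observed=True)["val"].sum())   # observed groups only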
def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.float64, np.nan if is_uniform_reindex(join_units): # XXX: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = 'category' elif is_datetimetz(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' elif issubclass(dtype.type, np.object_): upcast_cls = 'object' elif is_datetime64_dtype(dtype): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = 'float' # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan elif 'bool' in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslibs.iNaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslibs.iNaT elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslibs.iNaT else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.float64, np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def reindex(self, target, method=None, level=None, limit=None, tolerance=None) -> tuple[Index, npt.NDArray[np.intp] | None]: """ Create index with target's values (move/add/delete values as necessary) Returns ------- new_index : pd.Index Resulting index indexer : np.ndarray[np.intp] or None Indices of output values in original index """ if method is not None: raise NotImplementedError( "argument method is not implemented for CategoricalIndex.reindex" ) if level is not None: raise NotImplementedError( "argument level is not implemented for CategoricalIndex.reindex" ) if limit is not None: raise NotImplementedError( "argument limit is not implemented for CategoricalIndex.reindex" ) target = ibase.ensure_index(target) if self.equals(target): indexer = None missing = np.array([], dtype=np.intp) else: indexer, missing = self.get_indexer_non_unique(target) if not self.is_unique: # GH#42568 warnings.warn( "reindexing with a non-unique Index is deprecated and will " "raise in a future version.", FutureWarning, stacklevel=find_stack_level(), ) if len(self) and indexer is not None: new_target = self.take(indexer) else: new_target = target # filling in missing if needed if len(missing): cats = self.categories.get_indexer(target) if not isinstance(target, CategoricalIndex) or (cats == -1).any(): new_target, indexer, _ = super()._reindex_non_unique(target) else: codes = new_target.codes.copy() codes[indexer == -1] = cats[missing] cat = self._data._from_backing_data(codes) new_target = type(self)._simple_new(cat, name=self.name) # we always want to return an Index type here # to be consistent with .reindex for other index types (e.g. they don't # coerce based on the actual values, only on the dtype) # unless we had an initial Categorical to begin with # in which case we are going to conform to the passed Categorical if is_categorical_dtype(target): cat = Categorical(new_target, dtype=target.dtype) new_target = type(self)._simple_new(cat, name=self.name) else: # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target new_target = np.asarray(new_target) new_target = Index(new_target, name=self.name) return new_target, indexer
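A usage sketch (illustrative) of this version's behaviour: a plain list target comes back as a regular Index, while the indexer gives positions in the original CategoricalIndex.

import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "c"])
new_index, indexer = ci.reindex(["a", "c"])
print(new_index)   # Index(['a', 'c'], dtype='object')
print(indexer)     # [0 2]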
def hash_array(
    vals,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    1d uint64 numpy array of hash values, same length as vals
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)
    elif is_extension_array_dtype(dtype):
        vals, _ = vals._values_for_factorize()
        dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == np.bool_:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
def reindex(self, target, method=None, level=None, limit=None,
            tolerance=None):
    """
    Create index with target's values (move/add/delete values as necessary)

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index
    """
    if method is not None:
        raise NotImplementedError("argument method is not implemented for "
                                  "CategoricalIndex.reindex")
    if level is not None:
        raise NotImplementedError("argument level is not implemented for "
                                  "CategoricalIndex.reindex")
    if limit is not None:
        raise NotImplementedError("argument limit is not implemented for "
                                  "CategoricalIndex.reindex")

    target = ibase._ensure_index(target)

    if not is_categorical_dtype(target) and not target.is_unique:
        raise ValueError("cannot reindex with a non-unique indexer")

    indexer, missing = self.get_indexer_non_unique(np.array(target))

    if len(self.codes):
        new_target = self.take(indexer)
    else:
        new_target = target

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(
                np.array(target))
        else:
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            new_target = self._create_from_codes(codes)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if is_categorical_dtype(target):
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
def init_ndarray(values, index, columns, dtype=None, copy=False): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = [values.name] if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) # we could have a categorical type passed or coerced to 'category' # recast this to an arrays_to_mgr if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype(dtype): if not hasattr(values, "dtype"): values = prep_ndarray(values, copy=copy) values = values.ravel() elif copy: values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) return arrays_to_mgr([values], columns, index, columns, dtype=dtype) elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = list(range(len(values))) return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype values = prep_ndarray(values, copy=copy) if dtype is not None: if not is_dtype_equal(values.dtype, dtype): try: values = values.astype(dtype) except Exception as orig: # e.g. ValueError when trying to cast object dtype to float64 raise ValueError( f"failed to cast to '{dtype}' (Exception was: {orig})" ) from orig index, columns = _get_axes(*values.shape, index=index, columns=columns) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks dvals_list = [maybe_infer_to_datetimelike(row) for row in values] for n in range(len(dvals_list)): if isinstance(dvals_list[n], np.ndarray): dvals_list[n] = dvals_list[n].reshape(1, -1) from pandas.core.internals.blocks import make_block # TODO: What about re-joining object columns? block_values = [ make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) ] else: datelike_vals = maybe_infer_to_datetimelike(values) block_values = [datelike_vals] else: block_values = [values] return create_block_manager_from_blocks(block_values, [columns, index])
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3, include_lowest=False, dtype=None, duplicates='raise'): if duplicates not in ['raise', 'drop']: raise ValueError("invalid value for 'duplicates' parameter, " "valid options are: raise, drop") if isinstance(bins, IntervalIndex): # we have a fast-path here ids = bins.get_indexer(x) result = algos.take_nd(bins, ids) result = Categorical(result, categories=bins, ordered=True) return result, bins unique_bins = algos.unique(bins) if len(unique_bins) < len(bins) and len(bins) != 2: if duplicates == 'raise': raise ValueError("Bin edges must be unique: {bins!r}.\nYou " "can drop duplicate edges by setting " "the 'duplicates' kwarg".format(bins=bins)) else: bins = unique_bins side = 'left' if right else 'right' ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() if labels is not False: if labels is None: labels = _format_labels(bins, precision, right=right, include_lowest=include_lowest, dtype=dtype) else: if len(labels) != len(bins) - 1: raise ValueError('Bin labels must be one fewer than ' 'the number of bin edges') if not is_categorical_dtype(labels): labels = Categorical(labels, categories=labels, ordered=True) np.putmask(ids, na_mask, 0) result = algos.take_nd(labels, ids - 1) else: result = ids - 1 if has_nas: result = result.astype(np.float64) np.putmask(result, na_mask, np.nan) return result, bins
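An illustrative example (public API) of the duplicate-edge handling above, via `pd.cut`:

import pandas as pd

print(pd.cut([1, 7, 5], bins=[0, 3, 10]))   # ordered categorical of intervals
try:
    pd.cut([1, 7, 5], bins=[0, 3, 3, 10])   # duplicate edge -> ValueError
except ValueError as err:
    print(err)
print(pd.cut([1, 7, 5], bins=[0, 3, 3, 10], duplicates="drop"))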
def _get_grouper( obj, key=None, axis=0, level=None, sort=True, observed=False, mutated=False, validate=True, ): """ create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values If validate, then check for key/level overlaps """ group_axis = obj._get_axis(axis) # validate that the passed single level is compatible with the passed # axis of the object if level is not None: # TODO: These if-block and else-block are almost same. # MultiIndex instance check is removable, but it seems that there are # some processes only for non-MultiIndex in else-block, # eg. `obj.index.name != level`. We have to consider carefully whether # these are applicable for MultiIndex. Even if these are applicable, # we need to check if it makes no side effect to subsequent processes # on the outside of this condition. # (GH 17621) if isinstance(group_axis, MultiIndex): if is_list_like(level) and len(level) == 1: level = level[0] if key is None and is_scalar(level): # Get the level values from group_axis key = group_axis.get_level_values(level) level = None else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 if is_list_like(level): nlevels = len(level) if nlevels == 1: level = level[0] elif nlevels == 0: raise ValueError("No group keys passed!") else: raise ValueError("multiple levels only valid with " "MultiIndex") if isinstance(level, str): if obj.index.name != level: raise ValueError("level name {} is not the name of the " "index".format(level)) elif level > 0 or level < -1: raise ValueError( "level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. level = None key = group_axis # a passed-in Grouper, directly convert if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, [], obj else: return grouper, {key.key}, obj # already have a BaseGrouper, just return it elif isinstance(key, BaseGrouper): return key, [], obj # In the future, a tuple key will always mean an actual key, # not an iterable of keys. In the meantime, we attempt to provide # a warning. We can assume that the user wanted a list of keys when # the key is not in the index. We just have to be careful with # unhashable elements of `key`. Any unhashable elements implies that # they wanted a list of keys. # https://github.com/pandas-dev/pandas/issues/18314 is_tuple = isinstance(key, tuple) all_hashable = is_tuple and is_hashable(key) if is_tuple: if (all_hashable and key not in obj and set(key).issubset(obj)) or not all_hashable: # column names ('a', 'b') -> ['a', 'b'] # arrays like (a, b) -> [a, b] msg = ("Interpreting tuple 'by' as a list of keys, rather than " "a single key. Use 'by=[...]' instead of 'by=(...)'. 
In " "the future, a tuple will always mean a single key.") warnings.warn(msg, FutureWarning, stacklevel=5) key = list(key) if not isinstance(key, list): keys = [key] match_axis_length = False else: keys = key match_axis_length = len(keys) == len(group_axis) # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) # is this an index replacement? if (not any_callable and not any_arraylike and not any_groupers and match_axis_length and level is None): if isinstance(obj, DataFrame): all_in_columns_index = all(g in obj.columns or g in obj.index.names for g in keys) elif isinstance(obj, Series): all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): if key is None: keys = [None] * len(level) levels = level else: levels = [level] * len(keys) groupings = [] exclusions = [] # if the actual grouper should be obj[key] def is_in_axis(key): if not _is_label_like(key): try: obj._data.items.get_loc(key) except Exception: return False return True # if the grouper is obj[name] def is_in_obj(gpr): try: return id(gpr) == id(obj[gpr.name]) except Exception: return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) elif obj._is_level_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.append(gpr.key) in_axis, name = False, None else: in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( ("Length of grouper ({len_gpr}) and axis ({len_axis})" " must be same length".format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr ping = (Grouping( group_axis, gpr, obj=obj, name=name, level=level, sort=sort, observed=observed, in_axis=in_axis, ) if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") elif len(groupings) == 0: groupings.append( Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj
def wrapper(self, other, axis=None): # Validate the axis parameter if axis is not None: self._get_axis_number(axis) if isinstance(other, ABCSeries): name = com._maybe_match_name(self, other) if not self._indexed_same(other): msg = 'Can only compare identically-labeled Series objects' raise ValueError(msg) return self._constructor(na_op(self.values, other.values), index=self.index, name=name) elif isinstance(other, ABCDataFrame): # pragma: no cover return NotImplemented elif isinstance(other, (np.ndarray, pd.Index)): # do not check length of zerodim array # as it will broadcast if (not is_scalar(lib.item_from_zerodim(other)) and len(self) != len(other)): raise ValueError('Lengths must match to compare') if isinstance(other, ABCPeriodIndex): # temp workaround until fixing GH 13637 # tested in test_nat_comparisons # (pandas.tests.series.test_operators.TestSeriesOperators) return self._constructor(na_op(self.values, other.astype(object).values), index=self.index) return self._constructor(na_op(self.values, np.asarray(other)), index=self.index).__finalize__(self) elif isinstance(other, pd.Categorical): if not is_categorical_dtype(self): msg = ("Cannot compare a Categorical for op {op} with Series " "of dtype {typ}.\nIf you want to compare values, use " "'series <op> np.asarray(other)'.") raise TypeError(msg.format(op=op, typ=self.dtype)) if is_categorical_dtype(self): # cats are a special case as get_values() would return an ndarray, # which would then not take categories ordering into account # we can go directly to op, as the na_op would just test again and # dispatch to it. with np.errstate(all='ignore'): res = op(self.values, other) else: values = self.get_values() if isinstance(other, (list, np.ndarray)): other = np.asarray(other) with np.errstate(all='ignore'): res = na_op(values, other) if is_scalar(res): raise TypeError( 'Could not compare {typ} type with Series'.format( typ=type(other))) # always return a full value series here res = com._values_from_object(res) res = pd.Series(res, index=self.index, name=self.name, dtype='bool') return res
def _simple_new( cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True ): result = IntervalMixin.__new__(cls) closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): msg = f"dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) elif is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): msg = ( f"must not have differing left [{type(left).__name__}] and " f"right [{type(right).__name__}] types" ) raise ValueError(msg) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ( "category, object, and string subtypes are not supported " "for IntervalArray" ) raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): msg = ( "left and right must have the same time zone, got " f"'{left.tz}' and '{right.tz}'" ) raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array left = maybe_upcast_datetimelike_array(left) left = extract_array(left, extract_numpy=True) right = maybe_upcast_datetimelike_array(right) right = extract_array(right, extract_numpy=True) lbase = getattr(left, "_ndarray", left).base rbase = getattr(right, "_ndarray", right).base if lbase is not None and lbase is rbase: # If these share data, then setitem could corrupt our IA right = right.copy() result._left = left result._right = right result._closed = closed if verify_integrity: result._validate() return result
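A sketch (illustrative, version-dependent) of the validation rules above as seen from the public constructor: mismatched int/float sides are coerced, while category subtypes are rejected.

import pandas as pd

arr = pd.arrays.IntervalArray.from_arrays([0, 1], [1.5, 2.5])
print(arr.dtype)   # interval[float64, ...] -- the integer side was upcast
try:
    pd.arrays.IntervalArray.from_arrays(pd.Categorical(["a"]),
                                        pd.Categorical(["b"]))
except TypeError as err:
    print(err)     # category/object/string subtypes are not supported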
def __init__( self, index: Index, grouper=None, obj: Optional[FrameOrSeries] = None, name=None, level=None, sort: bool = True, observed: bool = False, in_axis: bool = False, dropna: bool = True, ): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis self.dropna = dropna # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name if isinstance(grouper, MultiIndex): self.grouper = grouper._values # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError(f"Level {level} not in index") level = index.names.index(level) if self.name is None: self.name = index.names[level] ( self.grouper, self._codes, self._group_index, ) = index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper._get_grouper() else: if self.grouper is None and self.name is not None and self.obj is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._codes = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] if sort or self.grouper.ordered: codes = np.sort(codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered), name=self.name, ) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): grper = pprint_thing(self.grouper) errmsg = ("Grouper result violates len(labels) == " f"len(data)\nresult: {grper}") self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like if getattr(self.grouper, "dtype", None) is not None: if is_datetime64_dtype(self.grouper): self.grouper = self.grouper.astype("datetime64[ns]") elif is_timedelta64_dtype(self.grouper): self.grouper = self.grouper.astype("timedelta64[ns]")
def __init__( self, index: Index, grouper=None, obj: NDFrame | None = None, level=None, sort: bool = True, observed: bool = False, in_axis: bool = False, dropna: bool = True, ) -> None: self.level = level self._orig_grouper = grouper self.grouping_vector = _convert_grouper(index, grouper) self._all_grouper = None self._index = index self._sort = sort self.obj = obj self._observed = observed self.in_axis = in_axis self._dropna = dropna self._passed_categorical = False # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level ilevel = self._ilevel if ilevel is not None: mapper = self.grouping_vector # In extant tests, the new self.grouping_vector matches # `index.get_level_values(ilevel)` whenever # mapper is None and isinstance(index, MultiIndex) ( self.grouping_vector, # Index self._codes, self._group_index, ) = index._get_grouper_for_level(mapper, level=ilevel) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouping_vector, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) assert self.obj is not None # for mypy _, newgrouper, newobj = self.grouping_vector._get_grouper( self.obj, validate=False) self.obj = newobj ng = newgrouper._get_grouper() if isinstance(newgrouper, ops.BinGrouper): # in this case we have `ng is newgrouper` self.grouping_vector = ng else: # ops.BaseGrouper # use Index instead of ndarray so we can recover the name self.grouping_vector = Index(ng, name=newgrouper.result_index.name) elif is_categorical_dtype(self.grouping_vector): # a passed Categorical self._passed_categorical = True self.grouping_vector, self._all_grouper = recode_for_groupby( self.grouping_vector, sort, observed) elif not isinstance(self.grouping_vector, (Series, Index, ExtensionArray, np.ndarray)): # no level passed if getattr(self.grouping_vector, "ndim", 1) != 1: t = self.name or str(type(self.grouping_vector)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouping_vector = index.map(self.grouping_vector) if not (hasattr(self.grouping_vector, "__len__") and len(self.grouping_vector) == len(index)): grper = pprint_thing(self.grouping_vector) errmsg = ("Grouper result violates len(labels) == " f"len(data)\nresult: {grper}") self.grouping_vector = None # Try for sanity raise AssertionError(errmsg) if isinstance(self.grouping_vector, np.ndarray): # if we have a date/time-like grouper, make sure that we have # Timestamps like self.grouping_vector = sanitize_to_nanoseconds( self.grouping_vector)
def to_orc( df: DataFrame, path: FilePath | WriteBuffer[bytes] | None = None, *, engine: Literal["pyarrow"] = "pyarrow", index: bool | None = None, engine_kwargs: dict[str, Any] | None = None, ) -> bytes | None: """ Write a DataFrame to the ORC format. .. versionadded:: 1.5.0 Parameters ---------- df : DataFrame The dataframe to be written to ORC. Raises NotImplementedError if dtype of one or more columns is category, unsigned integers, intervals, periods or sparse. path : str, file-like object or None, default None If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, we refer to objects with a write() method, such as a file handle (e.g. via builtin open function). If path is None, a bytes object is returned. engine : str, default 'pyarrow' ORC library to use. Pyarrow must be >= 7.0.0. index : bool, optional If ``True``, include the dataframe's index(es) in the file output. If ``False``, they will not be written to the file. If ``None``, similar to ``infer`` the dataframe's index(es) will be saved. However, instead of being saved as values, the RangeIndex will be stored as a range in the metadata so it doesn't require much space and is faster. Other indexes will be included as columns in the file output. engine_kwargs : dict[str, Any] or None, default None Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. Returns ------- bytes if no path argument is provided else None Raises ------ NotImplementedError Dtype of one or more columns is category, unsigned integers, interval, period or sparse. ValueError engine is not pyarrow. Notes ----- * Before using this function you should read the :ref:`user guide about ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`. * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ library. * For supported dtypes please refer to `supported ORC features in Arrow <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files. """ if index is None: index = df.index.names[0] is not None if engine_kwargs is None: engine_kwargs = {} # If unsupported dtypes are found raise NotImplementedError # In Pyarrow 9.0.0 this check will no longer be needed for dtype in df.dtypes: if (is_categorical_dtype(dtype) or is_interval_dtype(dtype) or is_period_dtype(dtype) or is_unsigned_integer_dtype(dtype)): raise NotImplementedError( "The dtype of one or more columns is not supported yet.") if engine != "pyarrow": raise ValueError("engine must be 'pyarrow'") engine = import_optional_dependency(engine, min_version="7.0.0") orc = import_optional_dependency("pyarrow.orc") was_none = path is None if was_none: path = io.BytesIO() assert path is not None # For mypy with get_handle(path, "wb", is_text=False) as handles: assert isinstance(engine, ModuleType) # For mypy try: orc.write_table( engine.Table.from_pandas(df, preserve_index=index), handles.handle, **engine_kwargs, ) except TypeError as e: raise NotImplementedError( "The dtype of one or more columns is not supported yet." ) from e if was_none: assert isinstance(path, io.BytesIO) # For mypy return path.getvalue() return None
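A minimal usage sketch (requires the optional pyarrow dependency, >= 7.0.0 per the check above): with `path=None` the ORC payload is returned as bytes, and a categorical column is rejected up front.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
orc_bytes = df.to_orc()          # path is None -> bytes returned
print(len(orc_bytes) > 0)

df["b"] = df["a"].astype("category")
try:
    df.to_orc()
except NotImplementedError as err:
    print(err)                   # category dtype is not supported yet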
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations" ) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations" ) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? # TODO(EA2D):kludge can be avoided when 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups,) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill( np.empty(out_shape, dtype=out_dtype), fill_value=np.nan ) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = _maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan elif ( how == "add" and is_integer_dtype(orig_values.dtype) and is_extension_array_dtype(orig_values.dtype) ): # We need this to ensure that Series[Int64Dtype].resample().sum() # remains int64 dtype. 
# Two options for avoiding this special case # 1. mask-aware ops and avoid casting to float with NaN above # 2. specify the result dtype when calling this method result = result.astype("int64") if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( orig_values.dtype ): # We need to use the constructors directly for these dtypes # since numpy won't recognize them # https://github.com/pandas-dev/pandas/issues/31471 result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
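An illustrative example of the nullable-integer special case at the end above (behaviour in the pandas version this snippet targets): summing an Int64 series through groupby keeps the Int64 dtype instead of silently upcasting.

import pandas as pd

s = pd.Series([1, 2, 3], dtype="Int64")
out = s.groupby([0, 0, 1]).sum()
print(out.dtype)   # Int64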
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs):
    assert kind in ["transform", "aggregate"]
    orig_values = values

    # can we do this operation with our cython functions?
    # if not, raise NotImplementedError

    # we raise NotImplementedError if this is an invalid operation
    # entirely, e.g. adding datetimes

    # categoricals are only 1d, so we
    # are not set up for dim transforming
    if is_categorical_dtype(values) or is_sparse(values):
        raise NotImplementedError("{} dtype not supported".format(values.dtype))
    elif is_datetime64_any_dtype(values):
        if how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(
                "datetime64 type does not support {} operations".format(how)
            )
    elif is_timedelta64_dtype(values):
        if how in ["prod", "cumprod"]:
            raise NotImplementedError(
                "timedelta64 type does not support {} operations".format(how)
            )

    if is_datetime64tz_dtype(values.dtype):
        # Cast to naive; we'll cast back at the end of the function
        # TODO: possible need to reshape? kludge can be avoided when
        # 2D EA is allowed.
        values = values.view("M8[ns]")

    is_datetimelike = needs_i8_conversion(values.dtype)
    is_numeric = is_numeric_dtype(values.dtype)

    if is_datetimelike:
        values = values.view("int64")
        is_numeric = True
    elif is_bool_dtype(values.dtype):
        values = ensure_float64(values)
    elif is_integer_dtype(values):
        # we use iNaT for the missing value on ints
        # so pre-convert to guard this condition
        if (values == iNaT).any():
            values = ensure_float64(values)
        else:
            values = ensure_int_or_float(values)
    elif is_numeric and not is_complex_dtype(values):
        values = ensure_float64(values)
    else:
        values = values.astype(object)

    arity = self._cython_arity.get(how, 1)

    vdim = values.ndim
    swapped = False
    if vdim == 1:
        values = values[:, None]
        out_shape = (self.ngroups, arity)
    else:
        if axis > 0:
            swapped = True
            assert axis == 1, axis
            values = values.T
        if arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument"
            )
        out_shape = (self.ngroups,) + values.shape[1:]

    try:
        func = self._get_cython_function(kind, how, values, is_numeric)
    except NotImplementedError:
        if is_numeric:
            try:
                values = ensure_float64(values)
            except TypeError:
                if lib.infer_dtype(values, skipna=False) == "complex":
                    values = values.astype(complex)
                else:
                    raise
            func = self._get_cython_function(kind, how, values, is_numeric)
        else:
            raise

    if how == "rank":
        out_dtype = "float"
    else:
        if is_numeric:
            out_dtype = "{kind}{itemsize}".format(
                kind=values.dtype.kind, itemsize=values.dtype.itemsize
            )
        else:
            out_dtype = "object"

    labels, _, _ = self.group_info

    if kind == "aggregate":
        result = _maybe_fill(
            np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
        )
        counts = np.zeros(self.ngroups, dtype=np.int64)
        result = self._aggregate(
            result,
            counts,
            values,
            labels,
            func,
            is_numeric,
            is_datetimelike,
            min_count,
        )
    elif kind == "transform":
        result = _maybe_fill(
            np.empty_like(values, dtype=out_dtype), fill_value=np.nan
        )

        # TODO: min_count
        result = self._transform(
            result, values, labels, func, is_numeric, is_datetimelike, **kwargs
        )

    if is_integer_dtype(result) and not is_datetimelike:
        mask = result == iNaT
        if mask.any():
            result = result.astype("float64")
            result[mask] = np.nan

    if kind == "aggregate" and self._filter_empty_groups and not counts.all():
        assert result.ndim != 2
        result = result[counts > 0]

    if vdim == 1 and arity == 1:
        result = result[:, 0]

    if how in self._name_functions:
        # TODO
        names = self._name_functions[how]()
    else:
        names = None

    if swapped:
        result = result.swapaxes(0, axis)

    if is_datetime64tz_dtype(orig_values.dtype):
        result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
    elif is_datetimelike and kind == "aggregate":
        result = result.astype(orig_values.dtype)

    return result, names
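# Both versions view tz-aware datetimes as naive "M8[ns]"/int64 on entry and
# rebuild the original dtype with its own constructor on exit. A hedged sketch
# of a public-API call that exercises that round-trip; the illustrative
# grouping keys are assumptions, not from the original source.
import pandas as pd

ts = pd.Series(pd.date_range("2019-01-01", periods=4, tz="US/Eastern"))
out = ts.groupby([0, 0, 1, 1]).min()
# The timezone survives the internal i8 round-trip performed inside
# _cython_operation.
print(out.dtype)  # datetime64[ns, US/Eastern]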