def _create_categorical(cls, data, dtype=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    dtype : CategoricalDtype, defaults to existing

    Returns
    -------
    Categorical
    """
    # Unwrap a Series/CategoricalIndex holding categorical data down to
    # the underlying Categorical before further processing.
    if isinstance(data, (cls, ABCSeries)) and is_categorical_dtype(data):
        data = data.values

    if not isinstance(data, ABCCategorical):
        # non-categorical input: build a fresh Categorical
        return Categorical(data, dtype=dtype)

    if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
        # we want to silently ignore dtype='category'
        data = data._set_dtype(dtype)
    return data
def _create_from_codes(self, codes, categories=None, ordered=None, name=None): """ *this is an internal non-public method* create the correct categorical from codes Parameters ---------- codes : new codes categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing name : optional name attribute, defaults to existing Returns ------- CategoricalIndex """ if categories is None: categories = self.categories if ordered is None: ordered = self.ordered if name is None: name = self.name cat = Categorical.from_codes(codes, categories=categories, ordered=ordered) return CategoricalIndex(cat, name=name)
def _shallow_copy(self, values=None, name: Label = no_default):
    """
    Copy this index, coercing any replacement ``values`` to our
    categorical dtype before delegating to the parent implementation.
    """
    if name is no_default:
        name = self.name
    # Re-wrap replacement values as a Categorical with our dtype so the
    # copy keeps the categorical nature of the index.
    coerced = None if values is None else Categorical(values, dtype=self.dtype)
    return super()._shallow_copy(values=coerced, name=name)
def _is_dtype_compat(self, other) -> bool:
    """
    *this is an internal non-public method*

    provide a comparison between the dtype of self and other (coercing if
    needed)

    Raises
    ------
    TypeError if the dtypes are not compatible
    """
    # NOTE(review): annotated ``-> bool`` but actually returns the
    # (possibly coerced) ``other`` — confirm against callers.
    if is_categorical_dtype(other):
        # categorical vs categorical: categories/orderedness must match
        if isinstance(other, CategoricalIndex):
            other = other._values
        if not other.is_dtype_equal(self):
            raise TypeError(
                "categories must match existing categories when appending")
    else:
        # non-categorical: every value must already be one of our
        # categories, otherwise appending would require new categories
        values = other
        if not is_list_like(values):
            values = [values]
        cat = Categorical(other, dtype=self.dtype)
        other = CategoricalIndex(cat)
        if not other.isin(values).all():
            raise TypeError(
                "cannot append a non-category item to a CategoricalIndex")

    return other
def where(self, cond, other=None):
    """
    Replace entries where ``cond`` is False with ``other`` (NA when no
    replacement is given), keeping the categorical dtype.
    """
    if other is None:
        other = self._na_value
    # Select elementwise between our values and the replacement, then
    # rebuild a Categorical with the original dtype.
    selected = np.where(cond, self.values, other)
    recat = Categorical(selected, dtype=self.dtype)
    return self._shallow_copy(recat, **self._get_attributes_dict())
def __new__(
    cls,
    data=None,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    copy: bool = False,
    name: Hashable = None,
) -> CategoricalIndex:
    """Construct a CategoricalIndex, coercing ``data`` to a Categorical."""

    name = maybe_extract_name(name, data, cls)

    if data is None:
        # GH#38944
        warnings.warn(
            "Constructing a CategoricalIndex without passing data is "
            "deprecated and will raise in a future version. "
            "Use CategoricalIndex([], ...) instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        data = []

    # scalars cannot back an index
    if is_scalar(data):
        raise cls._scalar_data_error(data)

    data = Categorical(data, categories=categories, ordered=ordered,
                       dtype=dtype, copy=copy)

    return cls._simple_new(data, name=name)
def _reindex_non_unique(  # type: ignore[override]
    self, target: Index
) -> tuple[Index, np.ndarray | None, np.ndarray | None]:
    """
    reindex from a non-unique; which CategoricalIndex's are almost always
    """
    # TODO: rule out `indexer is None` here to make the signature
    #  match the parent class's signature. This should be equivalent
    #  to ruling out `self.equals(target)`
    new_target, indexer = self.reindex(target)
    new_indexer = None

    check = indexer == -1
    # error: Item "bool" of "Union[Any, bool]" has no attribute "any"
    if check.any():  # type: ignore[union-attr]
        # positions missing from self get -1 in the secondary indexer;
        # intp keeps the indexer dtype platform-consistent
        new_indexer = np.arange(len(self.take(indexer)), dtype=np.intp)
        new_indexer[check] = -1

    cats = self.categories.get_indexer(target)
    if not (cats == -1).any():
        # .reindex returns normal Index. Revert to CategoricalIndex if
        # all targets are included in my categories
        cat = Categorical(new_target, dtype=self.dtype)
        new_target = type(self)._simple_new(cat, name=self.name)

    return new_target, indexer, new_indexer
def my_qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
    """
    Variant of ``pandas.qcut`` that renders the bin labels as left-closed
    intervals ``[a, b)`` (the last one closed on both ends) instead of
    pandas' default right-closed labels.

    NOTE(review): this relies on private ``pandas.core.reshape.tile``
    helpers and on parsing ``str(labels)`` (the IntervalIndex repr), so it
    is tied to the exact pandas version it was written against — any repr
    change breaks the label rewriting below.
    """
    x_is_series, series_index, name, x = pandas.core.reshape.tile._preprocess_for_cut(
        x)
    x, dtype = pandas.core.reshape.tile._coerce_to_type(x)
    # integer q -> q equal-probability quantiles; otherwise q is already
    # a sequence of quantile edges
    if is_integer(q):
        quantiles = np.linspace(0, 1, q + 1)
    else:
        quantiles = q
    # `quantile` is presumably pandas.core.algorithms.quantile — imported
    # elsewhere in this file; TODO confirm
    bins = quantile(x, quantiles)
    labels = pandas.core.reshape.tile._format_labels(bins, precision,
                                                     right=False, dtype=dtype)
    # widen the top edge by 1% of the range so the max value falls inside
    # the last (left-closed) bin
    bins[-1] += (bins[-1] - bins[0]) * 0.01
    # rewrite the IntervalIndex repr into "[a, b)" strings; the final
    # interval stays closed on the right
    t = str(labels).split("\n")[0].split("(")[1][1:-2].split("),")
    for i in range(len(t) - 1):
        t[i] += ")"
    t[-1] = t[-1].replace(")", "]")
    labels = Categorical(t)
    fac, bins = pandas.core.reshape.tile._bins_to_cuts(x, bins,
                                                       labels=labels,
                                                       right=False,
                                                       precision=precision,
                                                       include_lowest=True,
                                                       dtype=dtype,
                                                       duplicates=duplicates)
    return pandas.core.reshape.tile._postprocess_for_cut(
        fac, bins, retbins, x_is_series, series_index, name, dtype)
def _shallow_copy(self, values=None, **kwargs): if values is None: values = self.values cat = Categorical(values, dtype=self.dtype) name = kwargs.get("name", self.name) return type(self)._simple_new(cat, name=name)
def _shallow_copy(self, values=None, name: Label = no_default):
    """
    Copy this index, optionally substituting ``values`` and/or ``name``;
    the values used are coerced to this index's categorical dtype.
    """
    if name is no_default:
        name = self.name
    source = self.values if values is None else values
    recat = Categorical(source, dtype=self.dtype)
    return type(self)._simple_new(recat, name=name)
def astype(self, dtype, copy=True):
    """
    Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.
    copy : bool, default True
        Whether to copy the data, even if not necessary. If False,
        a copy is made only if the old dtype does not match the
        new dtype.

    Returns
    -------
    array : ExtensionArray or ndarray
        ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
    """
    from pandas import Index
    from pandas.core.arrays.string_ import StringDtype

    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if is_interval_dtype(dtype):
        if dtype == self.dtype:
            # same dtype: no conversion needed, honor `copy`
            return self.copy() if copy else self

        # need to cast to different subtype
        try:
            # We need to use Index rules for astype to prevent casting
            # np.nan entries to int subtypes
            new_left = Index(self._left, copy=False).astype(dtype.subtype)
            new_right = Index(self._right, copy=False).astype(dtype.subtype)
        except TypeError as err:
            msg = (
                f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible"
            )
            raise TypeError(msg) from err
        # TODO: do astype directly on self._combined
        combined = _get_combined_data(new_left, new_right)
        return type(self)._simple_new(combined, closed=self.closed)
    elif is_categorical_dtype(dtype):
        return Categorical(np.asarray(self))
    elif isinstance(dtype, StringDtype):
        return dtype.construct_array_type()._from_sequence(self, copy=False)

    # TODO: This try/except will be repeated.
    try:
        return np.asarray(self).astype(dtype, copy=copy)
    except (TypeError, ValueError) as err:
        msg = f"Cannot cast {type(self).__name__} to dtype {dtype}"
        raise TypeError(msg) from err
def where(self, cond, other=None):
    """
    Elementwise select between this index's values and ``other`` (NA
    when omitted), returning a shallow copy with our categorical dtype.
    """
    # TODO: Investigate an alternative implementation with
    #  1. copy the underlying Categorical
    #  2. setitem with `cond` and `other`
    #  3. Rebuild CategoricalIndex.
    fill = self._na_value if other is None else other
    chosen = np.where(cond, self.values, fill)
    recat = Categorical(chosen, dtype=self.dtype)
    return self._shallow_copy(recat, **self._get_attributes_dict())
def where(self, cond, other=None):
    """
    Replace positions where ``cond`` is False by ``other`` (NA when not
    given); the result keeps this index's categorical dtype and name.
    """
    # TODO: Investigate an alternative implementation with
    #  1. copy the underlying Categorical
    #  2. setitem with `cond` and `other`
    #  3. Rebuild CategoricalIndex.
    fill = self._na_value if other is None else other
    chosen = np.where(cond, self._values, fill)
    recat = Categorical(chosen, dtype=self.dtype)
    return type(self)._simple_new(recat, name=self.name)
def _create_categorical(self, data, categories=None, ordered=None,
                        dtype=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing
    dtype : CategoricalDtype, defaults to existing

    Returns
    -------
    Categorical
    """
    # unwrap a Series/CategoricalIndex down to its Categorical values
    if (isinstance(data, (ABCSeries, type(self))) and
            is_categorical_dtype(data)):
        data = data.values

    if not isinstance(data, ABCCategorical):
        # default ordered=False only when neither ordered nor dtype was
        # given, so an explicit dtype keeps its own orderedness
        if ordered is None and dtype is None:
            ordered = False
        data = Categorical(data, categories=categories, ordered=ordered,
                           dtype=dtype)
    else:
        # already categorical: adjust categories/ordered in place
        if categories is not None:
            data = data.set_categories(categories, ordered=ordered)
        elif ordered is not None and ordered != data.ordered:
            data = data.set_ordered(ordered)
        if isinstance(dtype, CategoricalDtype):
            # we want to silently ignore dtype='category'
            data = data._set_dtype(dtype)
    return data
def _shallow_copy(
    self, values: Optional[Categorical] = None, name: Label = no_default
):
    """
    Copy this index; any replacement ``values`` are re-coerced to our
    categorical dtype before delegating to the parent implementation.
    """
    resolved_name = self.name if name is no_default else name
    if values is None:
        coerced = None
    else:
        # In tests we only get here with Categorical objects that
        # have matching .ordered, and values.categories a subset of
        # our own. However we do _not_ have a dtype match in general.
        coerced = Categorical(values, dtype=self.dtype)
    return super()._shallow_copy(values=coerced, name=resolved_name)
def __new__(
    cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None
):
    """
    Construct a CategoricalIndex; the target CategoricalDtype is resolved
    up front from the combination of data/categories/ordered/dtype.
    """
    dtype = CategoricalDtype._from_values_or_dtype(data, categories, ordered, dtype)

    name = maybe_extract_name(name, data, cls)

    if not is_categorical_dtype(data):
        # don't allow scalars
        # if data is None, then categories must be provided
        if is_scalar(data):
            if data is not None or categories is None:
                raise cls._scalar_data_error(data)
            data = []

    assert isinstance(dtype, CategoricalDtype), dtype
    if not isinstance(data, Categorical) or data.dtype != dtype:
        # only re-coerce when data is not already a Categorical of the
        # resolved dtype
        data = Categorical(data, dtype=dtype)

    data = data.copy() if copy else data

    return cls._simple_new(data, name=name)
def __new__(
    cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None
):
    """
    Create a new CategoricalIndex by coercing ``data`` to a Categorical.
    """
    name = maybe_extract_name(name, data, cls)

    # scalars cannot back an index
    if is_scalar(data):
        raise cls._scalar_data_error(data)

    cat_data = Categorical(
        data, categories=categories, ordered=ordered, dtype=dtype, copy=copy
    )
    return cls._simple_new(cat_data, name=name)
def _create_categorical(cls, data, categories=None, ordered=None,
                        dtype=None):
    """
    *this is an internal non-public method*

    create the correct categorical from data and the properties

    Parameters
    ----------
    data : data for new Categorical
    categories : optional categories, defaults to existing
    ordered : optional ordered attribute, defaults to existing
    dtype : CategoricalDtype, defaults to existing

    Returns
    -------
    Categorical
    """
    # unwrap a Series/CategoricalIndex down to its Categorical values
    if (isinstance(data, (cls, ABCSeries)) and
            is_categorical_dtype(data)):
        data = data.values

    if not isinstance(data, ABCCategorical):
        # default ordered=False only when neither ordered nor dtype was
        # given, so an explicit dtype keeps its own orderedness
        if ordered is None and dtype is None:
            ordered = False
        data = Categorical(data, categories=categories, ordered=ordered,
                           dtype=dtype)
    else:
        # already categorical: adjust categories/ordered in place
        if categories is not None:
            data = data.set_categories(categories, ordered=ordered)
        elif ordered is not None and ordered != data.ordered:
            data = data.set_ordered(ordered)
        if isinstance(dtype, CategoricalDtype) and dtype != data.dtype:
            # we want to silently ignore dtype='category'
            data = data._set_dtype(dtype)
    return data
def _is_dtype_compat(self, other) -> Categorical:
    """
    *this is an internal non-public method*

    provide a comparison between the dtype of self and other (coercing if
    needed)

    Parameters
    ----------
    other : Index

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError if the dtypes are not compatible
    """
    if is_categorical_dtype(other):
        # categorical vs categorical: categories must match up to a
        # permutation for the dtypes to be compatible
        other = extract_array(other)
        if not other._categories_match_up_to_permutation(self):
            raise TypeError(
                "categories must match existing categories when appending"
            )
    elif other._is_multi:
        # preempt raising NotImplementedError in isna call
        raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")
    else:
        values = other

        # coerce into our dtype; values outside our categories become NaN,
        # which the checks below reject
        cat = Categorical(other, dtype=self.dtype)
        other = CategoricalIndex(cat)
        if not other.isin(values).all():
            raise TypeError(
                "cannot append a non-category item to a CategoricalIndex"
            )
        other = other._values

        if not ((other == values) | (isna(other) & isna(values))).all():
            # GH#37667 see test_equals_non_category
            raise TypeError(
                "categories must match existing categories when appending"
            )

    return other
def astype(self, dtype, copy=True):
    """
    Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.
    copy : bool, default True
        Whether to copy the data, even if not necessary. If False,
        a copy is made only if the old dtype does not match the
        new dtype.

    Returns
    -------
    array : ExtensionArray or ndarray
        ExtensionArray or NumPy ndarray with 'dtype' for its dtype.
    """
    dtype = pandas_dtype(dtype)
    if is_interval_dtype(dtype):
        if dtype == self.dtype:
            # same dtype: no conversion needed, honor `copy`
            return self.copy() if copy else self

        # need to cast to different subtype
        try:
            new_left = self.left.astype(dtype.subtype)
            new_right = self.right.astype(dtype.subtype)
        except TypeError:
            msg = (
                "Cannot convert {dtype} to {new_dtype}; subtypes are "
                "incompatible"
            )
            raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
        return self._shallow_copy(new_left, new_right)
    elif is_categorical_dtype(dtype):
        return Categorical(np.asarray(self))

    # TODO: This try/except will be repeated.
    try:
        return np.asarray(self).astype(dtype, copy=copy)
    except (TypeError, ValueError):
        msg = "Cannot cast {name} to dtype {dtype}"
        raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
def _reindex_non_unique(self, target): """ reindex from a non-unique; which CategoricalIndex's are almost always """ new_target, indexer = self.reindex(target) new_indexer = None check = indexer == -1 if check.any(): new_indexer = np.arange(len(self.take(indexer))) new_indexer[check] = -1 cats = self.categories.get_indexer(target) if not (cats == -1).any(): # .reindex returns normal Index. Revert to CategoricalIndex if # all targets are included in my categories new_target = Categorical(new_target, dtype=self.dtype) new_target = self._shallow_copy(new_target) return new_target, indexer, new_indexer
def _create_from_codes(self, codes, dtype=None, name=None): """ *this is an internal non-public method* create the correct categorical from codes Parameters ---------- codes : new codes dtype: CategoricalDtype, defaults to existing name : optional name attribute, defaults to existing Returns ------- CategoricalIndex """ if dtype is None: dtype = self.dtype if name is None: name = self.name cat = Categorical.from_codes(codes, dtype=dtype) return CategoricalIndex(cat, name=name)
def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
            copy=False, name=None):
    """
    Create a CategoricalIndex, coercing ``data`` to a Categorical.

    ``data=None`` is permitted only when ``categories`` is supplied, in
    which case an empty index with those categories is built.
    """
    name = maybe_extract_name(name, data, cls)

    if is_scalar(data):
        # don't allow scalars; data=None is only acceptable when the
        # categories are provided explicitly
        if data is None and categories is not None:
            data = []
        else:
            raise cls._scalar_data_error(data)

    cat = Categorical(data, categories=categories, ordered=ordered,
                      dtype=dtype, copy=copy)
    return cls._simple_new(cat, name=name)
def reindex(
    self, target, method=None, level=None, limit=None, tolerance=None
) -> tuple[Index, npt.NDArray[np.intp] | None]:
    """
    Create index with target's values (move/add/delete values as necessary)

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray[np.intp] or None
        Indices of output values in original index
    """
    # method/level/limit are accepted only for signature compatibility
    # with Index.reindex; they are not supported here
    if method is not None:
        raise NotImplementedError(
            "argument method is not implemented for CategoricalIndex.reindex"
        )
    if level is not None:
        raise NotImplementedError(
            "argument level is not implemented for CategoricalIndex.reindex"
        )
    if limit is not None:
        raise NotImplementedError(
            "argument limit is not implemented for CategoricalIndex.reindex"
        )

    target = ibase.ensure_index(target)

    if self.equals(target):
        indexer = None
        missing = np.array([], dtype=np.intp)
    else:
        indexer, missing = self.get_indexer_non_unique(target)
        if not self.is_unique:
            # GH#42568
            warnings.warn(
                "reindexing with a non-unique Index is deprecated and will "
                "raise in a future version.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )

    if len(self) and indexer is not None:
        new_target = self.take(indexer)
    else:
        new_target = target

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        if not isinstance(target, CategoricalIndex) or (cats == -1).any():
            new_target, indexer, _ = super()._reindex_non_unique(target)
        else:
            # every target is a valid category: fill missing codes directly
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            cat = self._data._from_backing_data(codes)
            new_target = type(self)._simple_new(cat, name=self.name)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    if is_categorical_dtype(target):
        cat = Categorical(new_target, dtype=target.dtype)
        new_target = type(self)._simple_new(cat, name=self.name)
    else:
        # e.g. test_reindex_with_categoricalindex, test_reindex_duplicate_target
        new_target = np.asarray(new_target)
        new_target = Index._with_infer(new_target, name=self.name)

    return new_target, indexer
def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
    """
    Create index with target's values (move/add/delete values as necessary)

    Returns
    -------
    new_index : pd.Index
        Resulting index
    indexer : np.ndarray or None
        Indices of output values in original index
    """
    # method/level/limit are accepted only for signature compatibility
    # with Index.reindex; they are not supported here
    if method is not None:
        raise NotImplementedError(
            "argument method is not implemented for CategoricalIndex.reindex"
        )
    if level is not None:
        raise NotImplementedError(
            "argument level is not implemented for CategoricalIndex.reindex"
        )
    if limit is not None:
        raise NotImplementedError(
            "argument limit is not implemented for CategoricalIndex.reindex"
        )

    target = ibase.ensure_index(target)

    missing: List[int]
    if self.equals(target):
        indexer = None
        missing = []
    else:
        indexer, missing = self.get_indexer_non_unique(np.array(target))

    if len(self.codes) and indexer is not None:
        new_target = self.take(indexer)
    else:
        new_target = target

    # filling in missing if needed
    if len(missing):
        cats = self.categories.get_indexer(target)

        if (cats == -1).any():
            # coerce to a regular index here!
            result = Index(np.array(self), name=self.name)
            new_target, indexer, _ = result._reindex_non_unique(np.array(target))
        else:
            # every target is a valid category: fill missing codes directly
            codes = new_target.codes.copy()
            codes[indexer == -1] = cats[missing]
            cat = self._data._from_backing_data(codes)
            new_target = type(self)._simple_new(cat, name=self.name)

    # we always want to return an Index type here
    # to be consistent with .reindex for other index types (e.g. they don't
    # coerce based on the actual values, only on the dtype)
    # unless we had an initial Categorical to begin with
    # in which case we are going to conform to the passed Categorical
    new_target = np.asarray(new_target)
    if is_categorical_dtype(target):
        new_target = Categorical(new_target, dtype=target.dtype)
        new_target = target._shallow_copy(new_target, name=self.name)
    else:
        new_target = Index(new_target, name=self.name)

    return new_target, indexer
def my_cut(x, bins, right=True, labels=None, retbins=False, precision=3,
           include_lowest=False, duplicates='raise'):
    """
    Variant of ``pandas.cut`` that renders the bin labels as left-closed
    intervals ``[a, b)`` (the last one closed on both ends) instead of
    pandas' default right-closed labels.

    Parameters mirror ``pandas.cut``; ``bins`` may be an integer count,
    a sequence of edges, or an IntervalIndex.

    NOTE(review): relies on private ``pandas.core.reshape.tile`` helpers
    and on parsing ``str(labels)`` (the IntervalIndex repr), so it is
    tied to the pandas version it was written against.
    """
    x_is_series, series_index, name, x = pandas.core.reshape.tile._preprocess_for_cut(
        x)
    x, dtype = pandas.core.reshape.tile._coerce_to_type(x)

    if not np.iterable(bins):
        if pd._libs.lib.is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")
        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size
        if sz == 0:
            raise ValueError('Cannot cut empty array')

        nanmin = pd.core.nanops._nanminmax('min', fill_value_typ='+inf')
        nanmax = pd.core.nanops._nanminmax('max', fill_value_typ='-inf')
        rng = (nanmin(x), nanmax(x))
        mn, mx = [mi + 0.0 for mi in rng]

        if mn == mx:  # adjust end points before binning
            mn -= .001 * abs(mn) if mn != 0 else .001
            mx += .001 * abs(mx) if mx != 0 else .001
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            # BUG FIX: bins2 was previously set only in the else branch,
            # so this path raised NameError at the _format_labels call
            bins2 = copy.deepcopy(bins)
        else:  # adjust end points after binning
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            # keep an unadjusted copy so labels show the true edges
            bins2 = copy.deepcopy(bins)
            adj = (mx - mn) * 0.001  # 0.1% of the range
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj
    elif isinstance(bins, pd.IntervalIndex):
        if bins.is_overlapping:
            raise ValueError('Overlapping IntervalIndex is not accepted.')
        # BUG FIX: bins2 was never assigned on this path (NameError below)
        bins2 = copy.deepcopy(bins)
    else:
        if pd.core.dtypes.common.is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=None)
        else:
            bins = np.asarray(bins)
        bins = pandas.core.reshape.tile._convert_bin_to_numeric_type(
            bins, dtype)
        if (np.diff(bins) < 0).any():
            raise ValueError('bins must increase monotonically.')
        # BUG FIX: bins2 was never assigned on this path (NameError below)
        bins2 = copy.deepcopy(bins)

    labels = pandas.core.reshape.tile._format_labels(bins2, precision,
                                                     right=False,
                                                     dtype=dtype)
    # rewrite the IntervalIndex repr into "[a, b)" strings; the final
    # interval stays closed on the right
    t = str(labels).split("\n")[0].split("(")[1][1:-2].split("),")
    for i in range(len(t) - 1):
        t[i] += ")"
    t[-1] = t[-1].replace(")", "]")
    labels = Categorical(t)

    fac, bins = pandas.core.reshape.tile._bins_to_cuts(
        x, bins, right=right, labels=labels, precision=precision,
        include_lowest=include_lowest, dtype=dtype, duplicates=duplicates)

    return pandas.core.reshape.tile._postprocess_for_cut(
        fac, bins, retbins, x_is_series, series_index, name, dtype)
def recode_for_groupby(c, sort, observed): """ Code the categories to ensure we can groupby for categoricals. If observed=True, we return a new Categorical with the observed categories only. If sort=False, return a copy of self, coded with categories as returned by .unique(), followed by any categories not appearing in the data. If sort=True, return self. This method is needed solely to ensure the categorical index of the GroupBy result has categories in the order of appearance in the data (GH-8868). Parameters ---------- c : Categorical sort : boolean The value of the sort parameter groupby was called with. observed : boolean Account only for the observed values Returns ------- New Categorical If sort=False, the new categories are set to the order of appearance in codes (unless ordered=True, in which case the original order is preserved), followed by any unrepresented categories in the original order. Categorical or None If we are observed, return the original categorical, otherwise None """ # we only care about observed values if observed: unique_codes = unique1d(c.codes) take_codes = unique_codes[unique_codes != -1] if c.ordered: take_codes = np.sort(take_codes) # we recode according to the uniques categories = c.categories.take(take_codes) codes = _recode_for_categories(c.codes, c.categories, categories) # return a new categorical that maps our new codes # and categories dtype = CategoricalDtype(categories, ordered=c.ordered) return Categorical(codes, dtype=dtype, fastpath=True), c # Already sorted according to c.categories; all is fine if sort: return c, None # sort=False should order groups in as-encountered order (GH-8868) cat = c.unique() # But for groupby to work, all categories should be present, # including those missing from the data (GH-13179), which .unique() # above dropped cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)]) return c.reorder_categories(cat.categories), None