def _compute_drop_idx(self):
    """Compute, per feature, the index of the category to drop.

    The behaviour depends on ``self.drop``:

    - ``None``: nothing is dropped and ``None`` is returned.
    - ``'first'``: index ``0`` is returned for every key of
      ``self._encoders``.
    - ``dict`` or ``list``: exactly one category per feature. A list is
      matched to features by position, i.e. it is treated as a dict keyed
      ``0 .. len(drop) - 1``.

    ``self.drop`` is only read here. Normalisation (list -> dict, value ->
    ``Series``) happens on local objects so that neither the estimator's
    hyper-parameter nor a dict/list owned by the caller is modified.

    Returns
    -------
    dict or None
        ``None`` when ``self.drop`` is ``None``. Otherwise a dict mapping
        each feature key either to ``0`` (``drop='first'``) or to an array
        built with ``cp.asarray`` holding the position of the dropped
        category inside the feature encoder's ``classes_``.

    Raises
    ------
    ValueError
        If ``self.drop`` has an unsupported type or value, if it does not
        have one entry per encoder, if more than one value is given for a
        feature, or if a value is not among the encoder's categories.
    """
    drop = self.drop

    if drop is None:
        return None

    if isinstance(drop, str) and drop == 'first':
        return {feature: 0 for feature in self._encoders}

    if not isinstance(drop, (dict, list)):
        msg = ("Wrong input for parameter `drop`. Expected "
               "'first', None, a list or a dict, got {}")
        raise ValueError(msg.format(type(drop)))

    if isinstance(drop, list):
        # Positional form: the i-th entry belongs to feature key ``i``.
        # Built as a new dict so the caller's list is left untouched.
        drop = dict(enumerate(drop))

    if len(drop) != len(self._encoders):
        msg = ("`drop` should have as many columns as the number "
               "of features ({}), got {}")
        raise ValueError(msg.format(len(self._encoders), len(drop)))

    drop_idx = {}
    for feature, value in drop.items():
        # Wrap in a Series locally instead of writing it back into `drop`.
        to_drop = Series(value)
        if len(to_drop) != 1:
            msg = ("Trying to drop multiple values for feature {}, "
                   "this is not supported.").format(feature)
            raise ValueError(msg)

        cats = self._encoders[feature].classes_
        if not to_drop.isin(cats).all():
            msg = ("Some categories for feature {} were supposed "
                   "to be dropped, but were not found in the encoder "
                   "categories.".format(feature))
            raise ValueError(msg)

        # Position of the dropped category within the encoder categories.
        cats = Series(cats)
        idx = cats.isin(to_drop)
        drop_idx[feature] = cp.asarray(cats[idx].index)

    return drop_idx
def remove_categories(self, removals, **kwargs):
    """Remove the given categories from the current set of categories.

    Every entry of ``removals`` must already be one of the current
    categories. The remaining categories are handed to
    ``self.set_categories`` together with ``**kwargs``, and its result is
    returned.

    Parameters
    ----------
    removals : category or list-like of category
        Categories to remove. They are converted to a ``cudf.Series`` with
        the dtype of the current categories before being compared.
    **kwargs
        Forwarded unchanged to ``set_categories``.

    Raises
    ------
    ValueError
        If any of ``removals`` is not among the current categories.
    """
    from cudf import Series

    current = self.categories.to_series()
    to_remove = Series(removals, dtype=current.dtype)
    present = to_remove.isin(current)

    # Match pandas: asking to remove a category that does not exist is an
    # error rather than a silent no-op.
    if not present.all():
        missing = to_remove[~present].to_array()
        raise ValueError(
            "removals must all be in old categories: {}".format(missing)
        )

    keep_mask = ~current.isin(to_remove)
    return self.set_categories(current[keep_mask], **kwargs)
def remove_categories(self, removals, **kwargs):
    """
    Remove the specified categories.

    Each entry of `removals` must be one of the existing categories.
    Values that belonged to a removed category become null.

    Parameters
    ----------
    removals : category or list-like of category
        The categories to remove.
    inplace : bool, default False
        Whether to remove the categories in place or to return a copy
        of this categorical with the categories removed.

    Returns
    -------
    cat
        Categorical with removed categories, or None if inplace.

    Raises
    ------
    ValueError
        If any of `removals` is not among the existing categories.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([10, 1, 1, 2, 10, 2, 10], dtype="category")
    >>> s
    0    10
    1     1
    2     1
    3     2
    4    10
    5     2
    6    10
    dtype: category
    Categories (3, int64): [1, 2, 10]
    >>> s.cat.remove_categories([1])
    0      10
    1    null
    2    null
    3       2
    4      10
    5       2
    6      10
    dtype: category
    Categories (2, int64): [2, 10]
    >>> s
    0    10
    1     1
    2     1
    3     2
    4    10
    5     2
    6    10
    dtype: category
    Categories (3, int64): [1, 2, 10]
    >>> s.cat.remove_categories([10], inplace=True)
    >>> s
    0    null
    1       1
    2       1
    3       2
    4    null
    5       2
    6    null
    dtype: category
    Categories (2, int64): [1, 2]
    """
    from cudf import Series

    existing = self.categories.to_series()
    requested = Series(removals, dtype=existing.dtype)
    found = requested.isin(existing)

    # Match pandas: every requested removal has to be an existing
    # category, otherwise the call is rejected.
    if not found.all():
        unknown = requested[~found].to_array()
        raise ValueError(
            "removals must all be in old categories: {}".format(unknown)
        )

    surviving = existing[~existing.isin(requested)]._column

    # Start from the current column and only rebuild it when the set of
    # categories actually changes.
    result = self._column
    if not self._categories_equal(surviving, **kwargs):
        result = self._set_categories(surviving, **kwargs)

    return self._return_or_inplace(result, **kwargs)