def copy(self, deep=True):
    """Return a copy of this categorical column.

    Parameters
    ----------
    deep : bool, default True
        When true, the column is first duplicated with
        ``libcudf.copying.copy_column`` and the result is rebuilt from the
        duplicate's buffers. When false, the result is rebuilt directly
        from this column's existing ``base_data`` / ``base_mask``.

    Returns
    -------
    A categorical column with the same categories and ``ordered`` flag.
    """
    if not deep:
        # Shallow path: re-wrap the buffers this column already holds.
        own_codes = column.as_column(
            self.codes.base_data, dtype=self.codes.dtype
        )
        return column.build_categorical_column(
            categories=self.dtype.categories,
            codes=own_codes,
            mask=self.base_mask,
            ordered=self.dtype.ordered,
            offset=self.offset,
            size=self.size,
        )

    duplicate = libcudf.copying.copy_column(self)
    duplicate_codes = column.as_column(
        duplicate.base_data, dtype=duplicate.dtype
    )
    return column.build_categorical_column(
        categories=self.dtype.categories,
        codes=duplicate_codes,
        offset=duplicate.offset,
        size=duplicate.size,
        mask=duplicate.base_mask,
        ordered=self.dtype.ordered,
    )
def __init__(self, values, **kwargs):
    """Build a CategoricalIndex from *values*.

    Parameters
    ----------
    values : CategoricalColumn, pandas categorical Series,
        pd.Categorical, pd.CategoricalIndex, list or tuple
        The data to index by. Pandas categoricals are converted through
        their codes/categories; a list or tuple becomes a categorical
        whose categories are its distinct elements in first-seen order.
        Any other input is handed to the parent constructor unchanged.
    **kwargs
        Forwarded to the parent constructor after ``_setdefault_name``
        has processed them.
    """
    kwargs = _setdefault_name(values, kwargs)
    if isinstance(values, CategoricalColumn):
        # Already in the representation the parent constructor expects.
        pass
    elif isinstance(values, pd.Series) and (
        is_categorical_dtype(values.dtype)
    ):
        codes_data = column.as_column(values.cat.codes.values)
        values = column.build_categorical_column(
            categories=values.cat.categories,
            codes=codes_data,
            ordered=values.cat.ordered,
        )
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        codes_data = column.as_column(values.codes)
        values = column.build_categorical_column(
            categories=values.categories,
            codes=codes_data,
            ordered=values.ordered,
        )
    elif isinstance(values, (list, tuple)):
        # pandas rejects duplicate categories, so passing the raw values
        # as ``categories`` fails as soon as an element repeats.
        # De-duplicate while keeping first-seen order; for inputs that are
        # already unique this is the same category list as before.
        unique_categories = list(dict.fromkeys(values))
        values = column.as_column(
            pd.Categorical(values, categories=unique_categories)
        )
    super(CategoricalIndex, self).__init__(values, **kwargs)
def _set_categories(
    self,
    current_categories: Any,
    new_categories: Any,
    is_unique: bool = False,
    ordered: bool = False,
) -> CategoricalColumn:
    """Return a new CategoricalColumn recoded against *new_categories*.

    Parameters
    ----------
    current_categories
        The categories the existing codes refer to.
    new_categories
        The categories the returned column should use.
    is_unique : bool, default False
        Set to skip the uniqueness check on *new_categories*.
    ordered : bool, default False
        ``ordered`` flag of the result; ``None`` falls back to
        ``self.ordered``.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current
    categories.
    """
    old_cats_col = column.as_column(current_categories)
    new_cats_col = column.as_column(new_categories)

    # The old->new code map is built by joining on the category values,
    # so the new categories have to be unique first.
    if not is_unique and not new_cats_col.is_unique:
        # drop_duplicates() rather than unique() so order is preserved
        deduped = cudf.Series(new_cats_col).drop_duplicates(
            ignore_index=True
        )
        new_cats_col = deduped._column

    existing_codes = self.codes
    # The code dtype must be able to index the larger category set.
    code_dtype = min_unsigned_type(max(len(old_cats_col), len(new_cats_col)))

    new_lookup = cudf.DataFrame(
        {
            "new_codes": column.arange(len(new_cats_col), dtype=code_dtype),
            "cats": new_cats_col,
        }
    )
    old_lookup = cudf.DataFrame(
        {
            "old_codes": column.arange(len(old_cats_col), dtype=code_dtype),
            "cats": old_cats_col,
        }
    )
    # "order" remembers each row's original position so it can be
    # restored after the joins.
    positions = cudf.DataFrame(
        {
            "old_codes": existing_codes,
            "order": column.arange(len(existing_codes)),
        }
    )

    # Line the old codes up with the new codes via the category values;
    # a left join keeps old categories that have no new counterpart.
    code_map = old_lookup.merge(new_lookup, on="cats", how="left")
    # Recode the actual data by joining on the old codes.
    recoded = positions.merge(code_map, on="old_codes", how="left")
    recoded = recoded.sort_values(by="order")
    recoded.reset_index(drop=True, inplace=True)

    if ordered is None:
        ordered = self.ordered
    recoded_codes = recoded["new_codes"]._column

    # codes can't carry a mask themselves, so the mask is passed
    # separately from the raw data buffer
    return column.build_categorical_column(
        categories=new_cats_col,
        codes=column.as_column(
            recoded_codes.base_data, dtype=recoded_codes.dtype
        ),
        mask=recoded_codes.base_mask,
        size=recoded_codes.size,
        offset=recoded_codes.offset,
        ordered=ordered,
    )
def find_and_replace(self, to_replace, replacement, all_nan):
    """
    Return col with *to_replace* replaced with *replacement*.

    Both value lists are translated to category codes with
    ``self._encode`` and the substitution is done on the codes; the
    categories and ``ordered`` flag of the result are those of this
    column. ``all_nan`` is accepted but not read by this implementation.
    """
    codes_col = column.as_column(self.cat().codes)
    code_dtype = codes_col.dtype

    def _as_code_column(values):
        # Encode each value to its category code, typed like the codes.
        encoded = [self._encode(item) for item in values]
        return column.as_column(np.asarray(encoded, dtype=code_dtype))

    old_code_col = _as_code_column(to_replace)
    new_code_col = _as_code_column(replacement)

    swapped = libcudf.replace.replace(codes_col, old_code_col, new_code_col)

    return column.build_categorical_column(
        categories=self.dtype.categories,
        codes=column.as_column(swapped.base_data, dtype=swapped.dtype),
        mask=swapped.base_mask,
        offset=swapped.offset,
        size=swapped.size,
        ordered=self.dtype.ordered,
    )
def copy(self, deep=True):
    """Return a copy of this categorical column.

    With ``deep=True`` the codes and mask come from
    ``libcudf.copying.copy_column(self)``; otherwise this column's own
    ``codes`` and ``mask`` are reused. Categories and the ``ordered`` flag
    are taken from ``self.dtype`` in both cases.
    """
    if deep:
        source = libcudf.copying.copy_column(self)
        new_codes, new_mask = source, source.mask
    else:
        new_codes, new_mask = self.codes, self.mask

    return column.build_categorical_column(
        categories=self.dtype.categories,
        codes=new_codes,
        mask=new_mask,
        ordered=self.dtype.ordered,
    )
def set_categories(self, new_categories, **kwargs):
    """Return a new Series whose categories are *new_categories*.

    Recognised keyword arguments
    ----------------------------
    ordered
        ``ordered`` flag for the renamed column; defaults to
        ``self.ordered``.
    rename : bool, default False
        When true, the existing codes are kept and the category values
        are swapped for *new_categories* (which must have the same
        length). It is removed from ``kwargs`` before they are forwarded.

    All remaining keyword arguments are forwarded to
    ``_categories_equal``, ``_set_categories`` and ``_return_or_inplace``.

    Raises
    ------
    ValueError
        If ``rename`` is true and the lengths differ.
    """
    ordered = kwargs.get("ordered", self.ordered)
    rename = kwargs.pop("rename", False)
    new_categories = column.as_column(new_categories)

    if not rename:
        # Only recode when the requested categories actually differ.
        result_col = self._column
        if not self._categories_equal(new_categories, **kwargs):
            result_col = self._set_categories(new_categories, **kwargs)
        return self._return_or_inplace(result_col, **kwargs)

    # rename=True follows pandas: the current category values are
    # replaced by the new ones, so both must have the same length.
    if len(new_categories) != len(self._column.categories):
        raise ValueError("new_categories must have the same "
                         "number of items as old categories")

    result_col = column.build_categorical_column(
        new_categories,
        self._column.children[0],
        self._column.mask,
        self._column.size,
        ordered=ordered,
    )
    return self._return_or_inplace(result_col, **kwargs)
def apply_boolean_mask(self, mask):
    """Filter the column with *mask* and keep it categorical.

    The filtering itself is delegated to the parent class; its result is
    re-wrapped with this column's categories and ``ordered`` flag.
    """
    filtered = super().apply_boolean_mask(mask)
    rebuilt = column.build_categorical_column(
        categories=self.dtype.categories,
        codes=filtered,
        mask=filtered.mask,
        ordered=self.dtype.ordered,
    )
    return rebuilt
def unique(self, method="sort"):
    """Return a categorical column holding the distinct codes.

    ``method`` is forwarded to the ``unique`` of the numerical view of
    this column; categories and ``ordered`` flag are preserved.
    """
    distinct_codes = self.as_numerical.unique(method=method)
    result = column.build_categorical_column(
        categories=self.categories,
        codes=distinct_codes,
        mask=distinct_codes.mask,
        ordered=self.ordered,
    )
    return result
def sort_by_values(self, ascending=True, na_position="last"):
    """Sort the column by its codes.

    Parameters
    ----------
    ascending : bool, default True
        Forwarded to the numerical sort.
    na_position : str, default "last"
        Forwarded to the numerical sort.

    Returns
    -------
    (col, inds)
        ``col`` is the sorted categorical column; ``inds`` is the second
        value returned by the numerical ``sort_by_values``.
    """
    codes, inds = self.as_numerical.sort_by_values(ascending, na_position)
    col = column.build_categorical_column(
        categories=self.dtype.categories,
        codes=codes,
        # Use the mask that belongs to the *sorted* codes. ``self.mask``
        # describes null positions before reordering and would mark the
        # wrong rows as null once the values have moved.
        mask=codes.mask,
        ordered=self.dtype.ordered,
    )
    return col, inds
def unique(self):
    """Return a categorical column holding the distinct codes.

    The distinct codes come from the numerical view of this column; the
    result is rebuilt from their raw buffers with the same categories and
    ``ordered`` flag.
    """
    distinct = self.as_numerical.unique()
    distinct_codes = column.as_column(distinct.base_data, dtype=distinct.dtype)
    return column.build_categorical_column(
        categories=self.categories,
        codes=distinct_codes,
        mask=distinct.base_mask,
        offset=distinct.offset,
        size=distinct.size,
        ordered=self.ordered,
    )
def normalize_binop_value(self, other):
    """Turn the scalar *other* into a categorical column like ``self``.

    *other* is encoded to its category code, that code is broadcast to
    ``len(self)`` elements, and the result reuses this column's
    categories, mask and ``ordered`` flag.
    """
    encoded = self._encode(other)
    broadcast = utils.scalar_broadcast_to(
        encoded, size=len(self), dtype=self.codes.dtype
    )
    return column.build_categorical_column(
        categories=self.dtype.categories,
        codes=column.as_column(broadcast),
        mask=self.mask,
        ordered=self.dtype.ordered,
    )
def _set_categories(self, new_categories, **kwargs):
    """Return a new CategoricalColumn recoded against *new_categories*.

    Recognised keyword arguments
    ----------------------------
    is_unique : bool, default False
        Set to skip the uniqueness check on *new_categories*.
    ordered
        ``ordered`` flag of the result; defaults to ``self.ordered``.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current
    categories.
    """
    from cudf import DataFrame, Series

    old_cats_col = self._column.categories
    new_cats_col = column.as_column(new_categories)

    # The old->new code map is built by joining on the category values,
    # so the new categories have to be unique first.
    if not kwargs.get("is_unique", False) and not new_cats_col.is_unique:
        # drop_duplicates() rather than unique() so order is preserved
        deduped = Series(new_cats_col).drop_duplicates(ignore_index=True)
        new_cats_col = deduped._column

    existing_codes = self.codes
    code_dtype = existing_codes.dtype

    new_lookup = DataFrame(
        {
            "new_codes": cupy.arange(len(new_cats_col), dtype=code_dtype),
            "cats": new_cats_col,
        }
    )
    old_lookup = DataFrame(
        {
            "old_codes": cupy.arange(len(old_cats_col), dtype=code_dtype),
            "cats": old_cats_col,
        }
    )
    # "order" remembers each row's original position so it can be
    # restored after the joins.
    positions = DataFrame(
        {
            "old_codes": existing_codes,
            "order": cupy.arange(len(existing_codes)),
        }
    )

    # Line the old codes up with the new codes via the category values.
    code_map = old_lookup.merge(new_lookup, on="cats", how="left")
    # Recode the actual data by joining on the old codes.
    recoded = positions.merge(code_map, on="old_codes", how="left")
    recoded = recoded.sort_values(by="order")
    recoded.reset_index(drop=True, inplace=True)

    ordered = kwargs.get("ordered", self.ordered)
    recoded_codes = recoded["new_codes"]._column

    # codes can't carry a mask themselves, so the mask is passed
    # separately from the raw data buffer
    return column.build_categorical_column(
        categories=new_cats_col,
        codes=column.as_column(
            recoded_codes.base_data, dtype=recoded_codes.dtype
        ),
        mask=recoded_codes.base_mask,
        size=recoded_codes.size,
        offset=recoded_codes.offset,
        ordered=ordered,
    )
def fillna(
    self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
) -> CategoricalColumn:
    """
    Fill null values with *fill_value*

    Parameters
    ----------
    fill_value
        A scalar that must be one of the categories (or equal to
        ``self.default_na_value()``), or a column-like of replacement
        values. It is translated to category codes before filling.
    method
        Forwarded to the parent ``fillna``.
    dtype
        Accepted but not read by this implementation.

    Returns
    -------
    ``self`` unchanged when the column is not nullable, otherwise a new
    categorical column with the same categories and ``ordered`` flag.

    Raises
    ------
    ValueError
        If a scalar *fill_value* is not among the categories, or a
        categorical *fill_value* has a different dtype.
    """
    if not self.nullable:
        return self

    if fill_value is not None:
        if np.isscalar(fill_value):
            if fill_value == self.default_na_value():
                fill_value = self.codes.dtype.type(fill_value)
            else:
                try:
                    encoded = self._encode(fill_value)
                    fill_value = self.codes.dtype.type(encoded)
                except ValueError as err:
                    err_msg = "fill value must be in categories"
                    raise ValueError(err_msg) from err
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
            if (
                isinstance(fill_value, CategoricalColumn)
                and self.dtype != fill_value.dtype
            ):
                raise ValueError(
                    "Cannot set a Categorical with another, "
                    "without identical categories"
                )
            # TODO: only required if fill_value has a subset of the
            # categories:
            fill_value = fill_value.cat()._set_categories(
                fill_value.cat().categories,
                self.categories,
                is_unique=True,
            )
            # Fill with codes, typed like this column's codes.
            fill_value = column.as_column(fill_value.codes).astype(
                self.codes.dtype
            )

    filled = super().fillna(value=fill_value, method=method)

    return column.build_categorical_column(
        categories=self.dtype.categories._values,
        codes=column.as_column(filled.base_data, dtype=filled.dtype),
        offset=filled.offset,
        size=filled.size,
        mask=filled.base_mask,
        ordered=self.dtype.ordered,
    )
def sort_by_values(
    self, ascending: bool = True, na_position="last"
) -> Tuple[CategoricalColumn, NumericalColumn]:
    """Sort the column by its codes.

    Parameters
    ----------
    ascending : bool, default True
        Forwarded to the numerical sort.
    na_position : str, default "last"
        Forwarded to the numerical sort.

    Returns
    -------
    (col, inds)
        ``col`` is the sorted categorical column; ``inds`` is the second
        value returned by the numerical ``sort_by_values``.
    """
    codes, inds = self.as_numerical.sort_by_values(ascending, na_position)
    col = column.build_categorical_column(
        categories=self.dtype.categories._values,
        codes=column.as_column(codes.base_data, dtype=codes.dtype),
        mask=codes.base_mask,
        size=codes.size,
        # The codes are rebuilt from the *base* buffers, so the view's
        # offset has to travel with them; without it a sorted result that
        # is itself a view would be read from the wrong position.
        offset=codes.offset,
        ordered=self.dtype.ordered,
    )
    return col, inds
def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
    """Attach categorical type metadata to this column when requested.

    For a ``CategoricalDtype`` the column's own data is used as the codes
    of a new categorical column carrying the dtype's categories and
    ``ordered`` flag. For any other *dtype* the column is returned as-is.
    """
    if not isinstance(dtype, CategoricalDtype):
        return self

    category_values = dtype.categories._values
    codes_col = build_column(self.base_data, dtype=self.dtype)
    return column.build_categorical_column(
        categories=category_values,
        codes=codes_col,
        mask=self.base_mask,
        ordered=dtype.ordered,
        size=self.size,
        offset=self.offset,
        null_count=self.null_count,
    )
def as_unordered(self, inplace=False):
    """Mark the categorical as unordered.

    Parameters
    ----------
    inplace : bool, default False
        When true, the parent's dtype is flagged unordered and nothing is
        returned. Otherwise a new Series is returned that reuses the
        parent's categories, codes and mask with ``ordered=False``.
    """
    if inplace:
        self._parent.dtype.ordered = False
        return None

    from cudf import Series

    source = self._parent
    unordered_col = column.build_categorical_column(
        categories=source.dtype.categories,
        codes=source.codes,
        mask=source.mask,
        ordered=False,
    )
    return Series(unordered_col)
def normalize_binop_value(self, other):
    """Turn the scalar *other* into a categorical column like ``self``.

    A zero-dimensional NumPy array is unwrapped to a Python scalar first.
    The value is then encoded to its category code and broadcast to
    ``len(self)`` elements; the result reuses this column's categories,
    base mask and ``ordered`` flag.
    """
    is_zero_dim_array = isinstance(other, np.ndarray) and other.ndim == 0
    if is_zero_dim_array:
        other = other.item()

    encoded = self._encode(other)
    broadcast = cudf.utils.utils.scalar_broadcast_to(
        encoded, size=len(self), dtype=self.codes.dtype
    )
    return column.build_categorical_column(
        categories=self.dtype.categories,
        codes=column.as_column(broadcast),
        mask=self.base_mask,
        ordered=self.dtype.ordered,
    )
def _create_empty_categorical_column(categorical_column, dtype):
    """Build a categorical column of *dtype* whose codes are all NA.

    The codes are ``categorical_column.default_na_value()`` broadcast to
    ``categorical_column.size`` elements; offset, size and base mask are
    taken from *categorical_column*, categories and ``ordered`` flag from
    *dtype*.
    """
    na_codes = cudf.utils.utils.scalar_broadcast_to(
        categorical_column.default_na_value(),
        categorical_column.size,
        np.dtype(categorical_column.cat().codes),
    )
    empty_col = column.build_categorical_column(
        categories=dtype.categories,
        codes=column.as_column(na_codes),
        offset=categorical_column.offset,
        size=categorical_column.size,
        mask=categorical_column.base_mask,
        ordered=dtype.ordered,
    )
    return empty_col
def as_ordered(self, **kwargs):
    """Mark the categorical as ordered.

    Recognised keyword arguments
    ----------------------------
    inplace : bool, default False
        Read here and also forwarded, with all other keyword arguments
        plus ``ordered=True``, to ``_set_categories`` when the
        categorical is not already ordered.

    Returns
    -------
    ``None`` when there is nothing to hand back (in-place request and no
    result from ``_set_categories``); otherwise a new Series built from
    the parent's categories, codes and mask with ``ordered=True``.
    """
    inplace = kwargs.get("inplace", False)
    if inplace:
        data = None
    else:
        data = self._parent

    if not self.ordered:
        kwargs["ordered"] = True
        data = self._set_categories(self.categories, **kwargs)

    if data is None:
        return None

    from cudf import Series

    source = self._parent
    ordered_col = column.build_categorical_column(
        categories=source.dtype.categories,
        codes=source.cat().codes,
        mask=source.mask,
        ordered=True,
    )
    return Series(ordered_col)
def indices_from_labels(obj, labels):
    """Map index *labels* to their integer positions in ``obj``.

    Parameters
    ----------
    obj
        Object whose ``index`` the labels are looked up in.
    labels
        Column-like of labels. They are cast to the index dtype (or, for
        a categorical index, to a categorical whose codes use the index's
        code dtype) before the lookup.

    Returns
    -------
    The ``"_"`` column of the join between the labels and the positions
    ``0..len(obj)-1`` keyed by ``obj.index``, ordered like *labels*.
    """
    from cudf.core.column import column

    labels = column.as_column(labels)

    if is_categorical_dtype(obj.index):
        labels = labels.astype("category")
        codes = labels.codes.astype(obj.index._values.codes.dtype)
        labels = column.build_categorical_column(
            categories=labels.dtype.categories,
            codes=codes,
            ordered=labels.dtype.ordered,
        )
    else:
        labels = labels.astype(obj.index.dtype)

    # The join is not guaranteed to keep the left-hand ordering, so the
    # original position of every label is recorded in "__" and the result
    # is sorted on it; otherwise positions could come back in an order
    # that does not match the requested labels.
    lhs = cudf.DataFrame({"__": cupy.arange(len(labels))}, index=labels)
    rhs = cudf.DataFrame({"_": cupy.arange(len(obj))}, index=obj.index)
    return lhs.join(rhs).sort_values("__")["_"]
def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series:
    """Convert the column to a pandas Series of categorical dtype.

    Parameters
    ----------
    index : pd.Index, optional
        Index to attach to the resulting Series.
    **kwargs
        Accepted but not read by this implementation.

    Notes
    -----
    For float categories the column is first rebuilt with a mask derived
    from ``self.notnull()``. Null codes are written as ``-1`` and null/NaN
    categories are dropped before calling ``pd.Categorical.from_codes``.
    """
    col = self
    if self.categories.dtype.kind == "f":
        not_null_mask = bools_to_mask(self.notnull())
        col = column.build_categorical_column(
            categories=self.categories,
            codes=column.as_column(self.codes, dtype=self.codes.dtype),
            mask=not_null_mask,
            ordered=self.dtype.ordered,
            size=self.codes.size,
        )

    # A signed dtype is needed so that -1 can stand in for nulls.
    signed_dtype = min_signed_type(len(col.categories))
    host_codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
    host_categories = col.categories.dropna(drop_nan=True).to_pandas()

    categorical = pd.Categorical.from_codes(
        host_codes, categories=host_categories, ordered=col.ordered
    )
    return pd.Series(categorical, index=index)
def fillna(self, fill_value):
    """
    Fill null values with *fill_value*

    A scalar *fill_value* must be one of the categories (or equal to
    ``self.default_na_value()``); a column-like one is recoded against
    this column's categories. Either way the fill is applied on the codes.

    Returns ``self`` unchanged when the column is not nullable, otherwise
    a new categorical column built without a mask.

    Raises
    ------
    ValueError
        If a scalar *fill_value* is not among the categories.
    """
    if not self.nullable:
        return self

    if np.isscalar(fill_value):
        if fill_value == self.default_na_value():
            fill_value = self.codes.dtype.type(fill_value)
        else:
            try:
                encoded = self._encode(fill_value)
                fill_value = self.codes.dtype.type(encoded)
            except ValueError as err:
                err_msg = "fill value must be in categories"
                raise ValueError(err_msg) from err
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)
        # TODO: only required if fill_value has a subset of the categories:
        fill_value = fill_value.cat()._set_categories(
            self.categories, is_unique=True
        )
        # Fill with codes, typed like this column's codes.
        fill_value = column.as_column(fill_value.codes).astype(
            self.codes.dtype
        )

    filled = libcudf.replace.replace_nulls(self, fill_value)

    return column.build_categorical_column(
        categories=self.dtype.categories,
        codes=column.as_column(filled.base_data, dtype=filled.dtype),
        offset=filled.offset,
        size=filled.size,
        mask=None,
        ordered=self.dtype.ordered,
    )
def pandas_categorical_as_column(categorical, codes=None):
    """Creates a CategoricalColumn from a pandas.Categorical

    If ``codes`` is defined, use it instead of ``categorical.codes``.
    Codes equal to ``-1`` are treated as invalid: when any are present a
    mask buffer is built from the validity comparison.
    """
    if codes is None:
        codes = categorical.codes
    codes = column.as_column(codes)

    is_valid = codes != -1

    mask = None
    if not np.all(is_valid):
        mask = Buffer(cudautils.compact_mask_bytes(is_valid))

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=codes,
        mask=mask,
        ordered=categorical.ordered,
    )
def pandas_categorical_as_column(categorical, codes=None):
    """Creates a CategoricalColumn from a pandas.Categorical

    If ``codes`` is defined, use it instead of ``categorical.codes``.
    Codes equal to ``-1`` (in the codes' own dtype) are treated as
    invalid: when any are present a mask is built from the validity
    comparison with ``bools_to_mask``.
    """
    raw_codes = categorical.codes if codes is None else codes
    codes_col = column.as_column(raw_codes)

    is_valid = codes_col != codes_col.dtype.type(-1)

    if is_valid.all():
        mask = None
    else:
        mask = bools_to_mask(is_valid)

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=column.as_column(codes_col.base_data, dtype=codes_col.dtype),
        size=codes_col.size,
        mask=mask,
        ordered=categorical.ordered,
    )
def indices_from_labels(obj, labels):
    """Map index *labels* to their integer positions in ``obj``.

    Labels that are not a ``cudf.MultiIndex`` are converted to a column
    and cast to the index dtype (or, for a categorical index, to a
    categorical whose codes use the index's code dtype). The positions
    are then obtained by joining the labels against ``0..len(obj)-1``
    keyed by ``obj.index``, and returned in the order of *labels*.
    """
    from cudf.core.column import column

    if not isinstance(labels, cudf.MultiIndex):
        labels = column.as_column(labels)

        if not is_categorical_dtype(obj.index):
            labels = labels.astype(obj.index.dtype)
        else:
            labels = labels.astype("category")
            index_code_dtype = obj.index._values.codes.dtype
            recast_codes = labels.codes.astype(index_code_dtype)
            labels = column.build_categorical_column(
                categories=labels.dtype.categories,
                codes=recast_codes,
                ordered=labels.dtype.ordered,
            )

    # join is not guaranteed to maintain the index ordering, so each
    # label's initial position is stored in column "__" and the joined
    # frame is sorted on it afterwards
    label_frame = cudf.DataFrame(
        {"__": column.arange(len(labels))}, index=labels
    )
    position_frame = cudf.DataFrame(
        {"_": column.arange(len(obj))}, index=obj.index
    )
    joined = label_frame.join(position_frame)
    return joined.sort_values("__")["_"]
def find_and_replace(self, to_replace, replacement, all_nan):
    """
    Return col with *to_replace* replaced with *replacement*.

    The replacement is performed on the category labels: a new category
    set is derived from the current one, an old-code -> new-code mapping
    is built by merging the two, and that mapping is applied to the
    codes. ``all_nan`` is accepted but not read by this implementation.

    Returns a categorical column with the new categories and this
    column's ``ordered`` flag.
    """
    # create a dataframe containing the pre-replacement categories
    # and a copy of them to work with. The index of this dataframe
    # represents the original ints that map to the categories
    old_cats = cudf.DataFrame()
    old_cats["cats"] = column.as_column(self.dtype.categories)
    new_cats = old_cats.copy(deep=True)

    # Create a column with the appropriate labels replaced
    old_cats["cats_replace"] = old_cats["cats"].replace(
        to_replace, replacement)

    # Construct the new categorical labels
    # If a category is being replaced by an existing one, we
    # want to map it to None. If it's totally new, we want to
    # map it to the new label it is to be replaced by
    dtype_replace = cudf.Series(replacement)
    dtype_replace[dtype_replace.isin(old_cats["cats"])] = None
    new_cats["cats"] = new_cats["cats"].replace(to_replace, dtype_replace)

    # anything we mapped to None, we want to now filter out since
    # those categories don't exist anymore
    # Resetting the index creates a column 'index' that associates
    # the original integers to the new labels
    bmask = new_cats["cats"]._column.notna()
    new_cats = cudf.DataFrame({
        "cats": new_cats["cats"]._column.apply_boolean_mask(bmask)
    }).reset_index()

    # old_cats contains replaced categories and the ints that
    # previously mapped to those categories and the index of
    # new_cats is a RangeIndex that contains the new ints
    catmap = old_cats.merge(new_cats, left_on="cats_replace",
                            right_on="cats", how="inner")

    # The index of this frame is now the old ints, but the column
    # named 'index', which came from the filtered categories,
    # contains the new ints that we need to map to
    to_replace_col = column.as_column(catmap.index).astype(
        self.cat().codes.dtype)
    replacement_col = catmap["index"]._column.astype(
        self.cat().codes.dtype)

    # Apply the old-code -> new-code mapping to the codes themselves.
    replaced = column.as_column(self.cat().codes)
    output = libcudf.replace.replace(replaced, to_replace_col,
                                     replacement_col)

    # Rebuild from the raw buffers of the replaced codes; the mask is
    # passed separately from the code data.
    return column.build_categorical_column(
        categories=new_cats["cats"],
        codes=column.as_column(output.base_data, dtype=output.dtype),
        mask=output.base_mask,
        offset=output.offset,
        size=output.size,
        ordered=self.dtype.ordered,
    )
def set_categories(
    self, new_categories, ordered=None, rename=False, inplace=False,
):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to null). If `rename==True`, the categories will simply be
    renamed (fewer or more items than in the old categories will result
    in values set to null or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder),
    which can result in surprising changes.

    Parameters
    ----------
    new_categories : list-like
        The categories in new order.
    ordered : bool, default None
        Whether or not the categorical is treated as an ordered
        categorical. If not given, do not change the ordered
        information.
    rename : bool, default False
        Whether or not the `new_categories` should be considered as a
        rename of the old categories or as reordered categories.
    inplace : bool, default False
        Whether or not to reorder the categories in-place or return a
        copy of this categorical with reordered categories.

    Returns
    -------
    cat
        Categorical with reordered categories or None if inplace.

    Raises
    ------
    ValueError
        If ``rename`` is true and `new_categories` does not have the
        same number of items as the current categories.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([1, 1, 2, 10, 2, 10], dtype='category')
    >>> s
    0     1
    1     1
    2     2
    3    10
    4     2
    5    10
    dtype: category
    Categories (3, int64): [1, 2, 10]
    >>> s.cat.set_categories([1, 10])
    0       1
    1       1
    2    null
    3      10
    4    null
    5      10
    dtype: category
    Categories (2, int64): [1, 10]
    >>> s.cat.set_categories([1, 10], inplace=True)
    >>> s
    0       1
    1       1
    2    null
    3      10
    4    null
    5      10
    dtype: category
    Categories (2, int64): [1, 10]
    """
    if ordered is None:
        ordered = self.ordered

    new_categories = column.as_column(new_categories)
    if isinstance(new_categories, CategoricalColumn):
        # Only the category values of a categorical input are used.
        new_categories = new_categories.categories

    if rename:
        # when called with rename=True, the pandas behavior is to replace
        # the current category values with the new categories, so the
        # lengths have to match
        if len(new_categories) != len(self._column.categories):
            raise ValueError("new_categories must have the same "
                             "number of items as old categories")
        result_col = column.build_categorical_column(
            categories=new_categories,
            codes=self._column.base_children[0],
            mask=self._column.base_mask,
            size=self._column.size,
            offset=self._column.offset,
            ordered=ordered,
        )
    elif type(self._column.categories) is not type(new_categories):
        # If both categories are of different Column types,
        # return a column full of Nulls.
        result_col = _create_empty_categorical_column(
            self._column,
            CategoricalDtype(categories=new_categories, ordered=ordered),
        )
    elif (not self._categories_equal(new_categories, ordered=ordered)
          or not self.ordered == ordered):
        result_col = self._set_categories(
            self._column.categories, new_categories, ordered=ordered,
        )
    else:
        # Nothing to change: same categories and same ordered flag.
        result_col = self._column

    return self._return_or_inplace(result_col, inplace=inplace)