def hash_object_cudf_index(ind, index=None):
    if isinstance(ind, cudf.MultiIndex):
        return safe_hash(ind.to_frame(index=False))

    col = column.as_column(ind)
    return safe_hash(cudf.Series(col))

def find_last_value(self, value, closest=False):
    """
    Returns offset of last value that matches
    """
    value = pd.to_datetime(value)
    value = column.as_column(value, dtype=self.dtype).as_numerical[0]
    return self.as_numerical.find_last_value(value, closest=closest)

def _normalize_find_and_replace_input(input_column_dtype, col_to_normalize):
    normalized_column = column.as_column(col_to_normalize)
    col_to_normalize_dtype = normalized_column.dtype
    if isinstance(col_to_normalize, list):
        col_to_normalize_dtype = min_numeric_column_type(normalized_column)
        # Scalar case
        if len(col_to_normalize) == 1:
            col_to_normalize_casted = input_column_dtype.type(
                col_to_normalize[0]
            )
            if not np.isnan(col_to_normalize_casted) and (
                col_to_normalize_casted != col_to_normalize[0]
            ):
                raise TypeError(
                    f"Cannot safely cast non-equivalent "
                    f"{col_to_normalize[0]} "
                    f"to {input_column_dtype.name}"
                )
            else:
                col_to_normalize_dtype = input_column_dtype
    elif hasattr(col_to_normalize, "dtype"):
        col_to_normalize_dtype = col_to_normalize.dtype
    else:
        raise TypeError(f"Type {type(col_to_normalize)} not supported")

    if (
        col_to_normalize_dtype.kind == "f" and input_column_dtype.kind == "i"
    ) or (col_to_normalize_dtype > input_column_dtype):
        raise TypeError(
            f"Potentially unsafe cast for non-equivalent "
            f"{col_to_normalize_dtype.name} "
            f"to {input_column_dtype.name}"
        )
    return normalized_column.astype(input_column_dtype)

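# A minimal usage sketch (assumes a GPU-enabled cuDF install): the public
# Series.replace path runs through this normalization, so a replacement
# target that cannot round-trip through the column dtype raises instead of
# being silently truncated.
import cudf

s = cudf.Series([0, 1, 2], dtype="int64")
print(s.replace(1, 5))  # fine: 5 is exactly representable as int64
# s.replace(0.5, 5)     # TypeError: cannot safely cast non-equivalent 0.5 to int64
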
def deserialize(cls, header: dict, frames: list) -> CategoricalColumn:
    n_dtype_frames = header["dtype_frames_count"]
    dtype = CategoricalDtype.deserialize(
        header["dtype"], frames[:n_dtype_frames]
    )
    n_data_frames = header["data_frames_count"]
    column_type = pickle.loads(header["data"]["type-serialized"])
    data = column_type.deserialize(
        header["data"],
        frames[n_dtype_frames : n_dtype_frames + n_data_frames],
    )
    mask = None
    if "mask" in header:
        mask = Buffer.deserialize(
            header["mask"], [frames[n_dtype_frames + n_data_frames]]
        )
    return cast(
        CategoricalColumn,
        column.build_column(
            data=None,
            dtype=dtype,
            mask=mask,
            children=(column.as_column(data.base_data, dtype=data.dtype),),
        ),
    )

def as_numerical_column(self, dtype, **kwargs):
    mem_dtype = np.dtype(dtype)
    str_dtype = mem_dtype
    out_dtype = mem_dtype

    if mem_dtype.type in (np.int8, np.int16):
        mem_dtype = np.dtype(np.int32)
        str_dtype = mem_dtype
    elif mem_dtype.type is np.datetime64:
        kwargs.update(units=np.datetime_data(mem_dtype)[0])
        mem_dtype = np.dtype(np.int64)

    out_arr = rmm.device_array(shape=len(self), dtype=mem_dtype)
    out_ptr = libcudf.cudf.get_ctype_ptr(out_arr)
    kwargs.update({"devptr": out_ptr})
    _str_to_numeric_typecast_functions[str_dtype](self.str(), **kwargs)

    out_col = column.as_column(out_arr)
    if self.null_count > 0:
        mask_size = utils.calc_chunk_size(
            len(self.data), utils.mask_bitsize
        )
        out_mask_arr = rmm.device_array(mask_size, dtype="int8")
        out_mask_ptr = libcudf.cudf.get_ctype_ptr(out_mask_arr)
        self.data.set_null_bitmask(out_mask_ptr, bdevmem=True)
        mask = Buffer(out_mask_arr)
        out_col = out_col.set_mask(mask)

    return out_col.astype(out_dtype)

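# Public-API sketch of the same conversion (assumes cuDF is installed):
# casting a string Series to a numeric dtype drives this column-level
# routine, and the null bitmask is carried across the cast.
import cudf

s = cudf.Series(["1", "2", None])
print(s.astype("int32"))  # nulls propagate through the string-to-int cast
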
def fillna(
    self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
) -> CategoricalColumn:
    """
    Fill null values with *fill_value*
    """
    if not self.nullable:
        return self

    if fill_value is not None:
        fill_is_scalar = np.isscalar(fill_value)

        if fill_is_scalar:
            if fill_value == self.default_na_value():
                fill_value = self.codes.dtype.type(fill_value)
            else:
                try:
                    fill_value = self._encode(fill_value)
                    fill_value = self.codes.dtype.type(fill_value)
                except ValueError as err:
                    err_msg = "fill value must be in categories"
                    raise ValueError(err_msg) from err
        else:
            fill_value = column.as_column(fill_value, nan_as_null=False)
            # TODO: only required if fill_value has a subset of the
            # categories:
            fill_value = fill_value.cat()._set_categories(
                fill_value.cat().categories,
                self.categories,
                is_unique=True,
            )
            fill_value = column.as_column(fill_value.codes).astype(
                self.codes.dtype
            )

    result = super().fillna(value=fill_value, method=method)

    result = column.build_categorical_column(
        categories=self.dtype.categories._values,
        codes=column.as_column(result.base_data, dtype=result.dtype),
        offset=result.offset,
        size=result.size,
        mask=result.base_mask,
        ordered=self.dtype.ordered,
    )

    return result

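# Usage sketch for the categorical fillna above (assumes cuDF is installed):
# the fill value must already be one of the column's categories.
import cudf

s = cudf.Series(["a", "b", None, "a"], dtype="category")
print(s.fillna("b"))  # "b" is an existing category, so nulls become "b"
# s.fillna("z")       # ValueError: fill value must be in categories
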
def find_and_replace(
    self,
    to_replace: ColumnLike,
    replacement: ColumnLike,
    all_nan: bool = False,
) -> NumericalColumn:
    """
    Return col with *to_replace* replaced with *replacement*.
    """
    to_replace_col = column.as_column(to_replace)
    replacement_col = column.as_column(replacement)

    if type(to_replace_col) != type(replacement_col):
        raise TypeError(
            f"to_replace and value should be of same types, "
            f"got to_replace dtype: {to_replace_col.dtype} and "
            f"value dtype: {replacement_col.dtype}"
        )

    if not isinstance(to_replace_col, NumericalColumn) and not isinstance(
        replacement_col, NumericalColumn
    ):
        return self.copy()

    to_replace_col = _normalize_find_and_replace_input(
        self.dtype, to_replace
    )
    if all_nan:
        replacement_col = column.as_column(replacement, dtype=self.dtype)
    else:
        replacement_col = _normalize_find_and_replace_input(
            self.dtype, replacement
        )
    replaced = self.copy()
    if len(replacement_col) == 1 and len(to_replace_col) > 1:
        replacement_col = column.as_column(
            utils.scalar_broadcast_to(
                replacement[0], (len(to_replace_col),), self.dtype
            )
        )
    elif len(replacement_col) == 1 and len(to_replace_col) == 0:
        return replaced
    to_replace_col, replacement_col, replaced = numeric_normalize_types(
        to_replace_col, replacement_col, replaced
    )
    df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col})
    df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
    if df._data["old"].null_count == 1:
        replaced = replaced.fillna(
            df._data["new"][df._data["old"].isna()][0]
        )
        df = df.dropna(subset=["old"])

    return libcudf.replace.replace(
        replaced, df["old"]._column, df["new"]._column
    )

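# List-based replace sketch (assumes cuDF): Series.replace with parallel
# lists is the public entry point into find_and_replace; the keep="last"
# dedup above means the last mapping wins when a value is listed twice.
import cudf

s = cudf.Series([0, 1, 2, 1])
print(s.replace([1, 2], [10, 20]))  # -> 0, 10, 20, 10
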
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
    Sort by the values.

    Parameters
    ----------
    by : Column or list of Column
        Column or list of Column objects to sort by.
    ascending : bool or list of bool, default True
        If True, sort values in ascending order, otherwise descending.
    na_position : {'first', 'last'}, default 'last'
        'first' puts NaNs at the beginning, 'last' puts NaNs at the end.

    Returns
    -------
    col_inds : cuDF Column of indices sorted based on input

    Difference from pandas:
      * Support axis='index' only.
      * Not supporting: inplace, kind
      * Ascending can be a list of bools to control per column
    """
    if isinstance(by, ColumnBase):
        by = [by]

    col_inds = column.as_column(cudautils.arange(len(by[0]), dtype="int32"))

    # This needs to be updated to handle list of bools for ascending
    if ascending is True:
        if na_position == "last":
            na_position = 0
        elif na_position == "first":
            na_position = 1
    elif ascending is False:
        if na_position == "last":
            na_position = 1
        elif na_position == "first":
            na_position = 0
    else:
        logging.warning(
            "When using a sequence of booleans for `ascending`, "
            "`na_position` flag is not yet supported and defaults to "
            "treating nulls as greater than all numbers"
        )
        na_position = 0

    # If given a scalar, construct a sequence with one entry per column
    if np.isscalar(ascending):
        ascending = [ascending] * len(by)

    # If given a list-like, convert to a numpy array and copy to device
    if isinstance(ascending, collections.abc.Sequence):
        # Need to flip the booleans here since libcudf has 0 as ascending
        ascending = [not val for val in ascending]
        ascending = rmm.to_device(np.array(ascending, dtype="int8"))
    else:
        raise ValueError("Must use a boolean or list of booleans")

    libcudf.sort.order_by(by, col_inds, ascending, na_position)

    return col_inds

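# Public analogue (assumes cuDF): Series.sort_values funnels into this
# argsort helper, including the null-placement handling.
import cudf

s = cudf.Series([3.0, 1.0, None, 2.0])
print(s.sort_values(ascending=False, na_position="first"))
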
def normalize_chunks(self, size, chunks):
    if isinstance(chunks, int):
        # *chunks* is the chunksize
        return column.arange(0, size, chunks).data_array_view
    else:
        # *chunks* is an array of chunk leading offset
        chunks = column.as_column(chunks)
        return chunks.data_array_view

def normalize_chunks(self, size, chunks):
    if isinstance(chunks, six.integer_types):
        # *chunks* is the chunksize
        return cudautils.arange(0, size, chunks)
    else:
        # *chunks* is an array of chunk leading offset
        chunks = column.as_column(chunks)
        return chunks.data_array_view

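# Quick illustration of the "chunk leading offsets" both versions above
# compute for an integer chunksize: arange(0, size, chunks) yields the
# starting index of each chunk.
import numpy as np

print(np.arange(0, 10, 4))  # [0 4 8]: chunks are [0:4], [4:8], [8:10]
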
def add_categories(self, new_categories, **kwargs):
    new_categories = column.as_column(new_categories)
    new_categories = self._column.categories.append(new_categories)
    out_col = self._column
    if not self._categories_equal(new_categories, **kwargs):
        out_col = self._set_categories(new_categories, **kwargs)
    return self._return_or_inplace(out_col, **kwargs)

def _create_empty_categorical_column(
    categorical_column: CategoricalColumn, dtype: "CategoricalDtype"
) -> CategoricalColumn:
    return column.build_categorical_column(
        categories=column.as_column(dtype.categories),
        codes=column.as_column(
            cudf.utils.utils.scalar_broadcast_to(
                categorical_column.default_na_value(),
                categorical_column.size,
                categorical_column.codes.dtype,
            )
        ),
        offset=categorical_column.offset,
        size=categorical_column.size,
        mask=categorical_column.base_mask,
        ordered=dtype.ordered,
    )

def add_categories(self, new_categories, **kwargs):
    """
    Add new categories.

    `new_categories` will be included at the last/highest
    place in the categories and will be unused directly
    after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.
    inplace : bool, default False
        Whether or not to add the categories inplace
        or return a copy of this categorical with
        added categories.

    Returns
    -------
    cat
        Categorical with new categories added or
        None if inplace.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([1, 2], dtype="category")
    >>> s
    0    1
    1    2
    dtype: category
    Categories (2, int64): [1, 2]
    >>> s.cat.add_categories([0, 3, 4])
    0    1
    1    2
    dtype: category
    Categories (5, int64): [1, 2, 0, 3, 4]
    >>> s
    0    1
    1    2
    dtype: category
    Categories (2, int64): [1, 2]
    >>> s.cat.add_categories([0, 3, 4], inplace=True)
    >>> s
    0    1
    1    2
    dtype: category
    Categories (5, int64): [1, 2, 0, 3, 4]
    """
    new_categories = column.as_column(new_categories)
    new_categories = self._column.categories.append(new_categories)
    out_col = self._column
    if not self._categories_equal(new_categories, **kwargs):
        out_col = self._set_categories(new_categories, **kwargs)
    return self._return_or_inplace(out_col, **kwargs)

def wrapper(*args, **kwargs):
    ret = passed_attr(*args, **kwargs)
    if isinstance(ret, nvstrings.nvstrings):
        ret = Series(
            column.as_column(ret),
            index=self._index,
            name=self._name,
        )
    return ret

def _getitem_tuple_arg(self, arg):
    from cudf import MultiIndex
    from cudf.core.column import column
    from cudf.core.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns_df = self._get_column_selection(arg[1])
    columns_df._index = self._df._index

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg[0], slice):
            df = columns_df[arg[0]]
        else:
            df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df.iloc[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
    else:
        if isinstance(arg[0], slice):
            df = columns_df._slice(arg[0])
        elif is_scalar(arg[0]):
            index = arg[0]
            if index < 0:
                index += len(columns_df)
            df = columns_df._slice(slice(index, index + 1, 1))
        else:
            arg = (column.as_column(arg[0]), arg[1])
            if pd.api.types.is_bool_dtype(arg[0]):
                df = columns_df._apply_boolean_mask(arg[0])
            else:
                df = columns_df._gather(arg[0])

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)

    if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
        from cudf.core.index import RangeIndex

        slice_len = len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df

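# iloc usage sketch (assumes cuDF): an array-like first argument takes the
# as_column/_gather branch above, and a scalar second argument lets the
# result be downcast to a Series.
import cudf

df = cudf.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]})
print(df.iloc[[0, 2, 4], 0])  # positional row gather, downcast to a Series
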
def make_aggregate_nullmask(df, columns=None, op="and"):
    out_mask = None
    for k in columns or df.columns:
        if not df[k].nullable:
            continue
        nullmask = df[k].nullmask

        if out_mask is None:
            out_mask = column.as_column(
                nullmask.copy(), dtype=utils.mask_dtype
            )
            continue

        out_mask = libcudfxx.binaryop.binaryop(
            column.as_column(nullmask), out_mask, op, out_mask.dtype
        )

    return out_mask

def test_gather_single_col():
    col = column.as_column(np.arange(100), dtype=np.int32)
    gather_map = np.array([0, 1, 2, 3, 5, 8, 13, 21], dtype=np.int32)

    device_gather_map = rmm.to_device(gather_map)

    out = libcudf.copying.gather(col, device_gather_map)

    np.testing.assert_array_equal(out.to_array(), gather_map)

def searchsorted(self, value, side="left"):
    if not self.ordered:
        raise ValueError("Requires ordered categories")

    value_col = column.as_column(value)
    if self.dtype != value_col.dtype:
        raise TypeError("Categoricals can only compare with the same type")

    return libcudf.search.search_sorted(self, value_col, side)

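# searchsorted sketch (assumes cuDF): the column must be an ordered
# categorical, and the probe values must share its dtype.
import cudf

s = cudf.Series([1, 2, 3], dtype="category").cat.as_ordered()
print(s.searchsorted(cudf.Series([2], dtype=s.dtype)))
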
def __init__(self, values, **kwargs):
    kwargs = _setdefault_name(values, kwargs)
    if isinstance(values, StringColumn):
        values = values.copy()
    elif isinstance(values, StringIndex):
        values = values._values.copy()
    else:
        values = column.as_column(nvstrings.to_device(values))
    super(StringIndex, self).__init__(values, **kwargs)

def find_first_value(
    self, value: ScalarLike, closest: bool = False
) -> int:
    """
    Returns offset of first value that matches
    """
    value = pd.to_datetime(value)
    value = column.as_column(value, dtype=self.dtype).as_numerical[0]
    return self.as_numerical.find_first_value(value, closest=closest)

def _append_new_row_inplace(col: ColumnLike, value: ScalarLike):
    """Append a scalar `value` to the end of `col` inplace.
    Cast to common type if possible
    """
    to_type = find_common_type([type(value), col.dtype])
    val_col = as_column(value, dtype=to_type)
    old_col = col.astype(to_type)

    col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True)

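# The same common-type promotion, seen through the public concat path
# (assumes cuDF): appending a float to an int64 column promotes the result.
import cudf

s = cudf.Series([1, 2, 3])  # int64
out = cudf.concat([s, cudf.Series([1.5])], ignore_index=True)
print(out.dtype)  # float64
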
def make_aggregate_nullmask(df, columns=None, op="and"):
    out_mask = None
    for k in columns or df._data:
        col = cudf.core.dataframe.extract_col(df, k)
        if not col.nullable:
            continue
        nullmask = col.nullmask

        if out_mask is None:
            out_mask = column.as_column(
                nullmask.copy(), dtype=utils.mask_dtype
            )
            continue

        out_mask = libcudf.binaryop.binaryop(
            column.as_column(nullmask), out_mask, op, out_mask.dtype
        )

    return out_mask

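# Boolean-mask analogue (assumes cuDF) of what the AND'd bitmask expresses:
# a row is valid only when every participating column is valid.
import cudf

df = cudf.DataFrame({"a": [1, None, 3], "b": [None, 2, 3]})
valid = df["a"].notna() & df["b"].notna()
print(valid)  # False, False, True
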
def fillna(self, fill_value):
    if is_scalar(fill_value):
        fill_value = np.datetime64(fill_value, self.time_unit)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)

    result = libcudf.replace.replace_nulls(self, fill_value)
    result = column.build_column(result.data, result.dtype, mask=None)

    return result

def unique(self, method="sort"):
    # The *method* argument will eventually select the algorithm used to
    # compute unique values; only the sort-based path exists right now.
    if method != "sort":
        msg = "non sort based unique() not implemented yet"
        raise NotImplementedError(msg)
    segs, sortedvals = self._unique_segments()
    # gather result
    out_col = column.as_column(sortedvals)[segs]
    return out_col

@pytest.mark.parametrize("nan_as_null", [True, False])
def test_as_column_scalar_with_nan(nan_as_null):
    size = 10
    scalar = np.nan

    expected = cudf.Series(
        [np.nan] * size, nan_as_null=nan_as_null
    ).to_array()
    got = cudf.Series(
        as_column(scalar, length=size, nan_as_null=nan_as_null)
    ).to_array()

    np.testing.assert_equal(expected, got)

def unique(self):
    codes = self.as_numerical.unique()
    return column.build_categorical_column(
        categories=self.categories,
        codes=column.as_column(codes.base_data, dtype=codes.dtype),
        mask=codes.base_mask,
        offset=codes.offset,
        size=codes.size,
        ordered=self.ordered,
    )

def sort_by_values(self, ascending=True, na_position="last"):
    codes, inds = self.as_numerical.sort_by_values(ascending, na_position)
    col = column.build_categorical_column(
        categories=self.dtype.categories,
        codes=column.as_column(codes.base_data, dtype=codes.dtype),
        mask=codes.base_mask,
        size=codes.size,
        ordered=self.dtype.ordered,
    )
    return col, inds

def extend(self, array):
    from cudf.core.column import column

    needed = array.size
    self._sentry_capacity(needed)
    array = column.as_column(array).astype(self.dtype).data.mem
    self.mem[self.size : self.size + needed].copy_to_device(array)
    self.size += needed

def fillna(self, fill_value, inplace=False):
    if is_scalar(fill_value):
        fill_value = np.datetime64(fill_value, self.time_unit)
    else:
        fill_value = column.as_column(fill_value, nan_as_null=False)

    result = libcudf.replace.replace_nulls(self, fill_value)
    result = result.replace(mask=None)
    return self._mimic_inplace(result, inplace)

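# Datetime fillna sketch (assumes cuDF): a scalar fill value goes through
# np.datetime64 conversion at the column's time unit, as in both versions
# above.
import cudf

s = cudf.Series(["2001-01-01", None], dtype="datetime64[ns]")
print(s.fillna("2001-01-02"))
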
def _set_categories(self, new_categories, **kwargs):
    """Returns a new CategoricalColumn with the categories set to the
    specified *new_categories*.

    Notes
    -----
    Assumes ``new_categories`` is the same dtype as the current categories
    """
    from cudf import DataFrame, Series

    cur_cats = self._parent.categories
    new_cats = column.as_column(new_categories)

    # Join the old and new categories to build a map from
    # old to new codes, inserting na_sentinel for any old
    # categories that don't exist in the new categories

    # Ensure new_categories is unique first
    if not (kwargs.get("is_unique", False) or new_cats.is_unique):
        # drop_duplicates() instead of unique() to preserve order
        new_cats = Series(new_cats).drop_duplicates()._column

    cur_codes = self.codes
    cur_order = cudautils.arange(len(cur_codes))
    old_codes = cudautils.arange(len(cur_cats), dtype=cur_codes.dtype)
    new_codes = cudautils.arange(len(new_cats), dtype=cur_codes.dtype)

    new_df = DataFrame({"new_codes": new_codes, "cats": new_cats})
    old_df = DataFrame({"old_codes": old_codes, "cats": cur_cats})
    cur_df = DataFrame({"old_codes": cur_codes, "order": cur_order})

    # Join the old and new categories and line up their codes
    df = old_df.merge(new_df, on="cats", how="left")
    # Join the old and new codes to "recode" the codes data buffer
    df = cur_df.merge(df, on="old_codes", how="left")
    df = df.sort_values(by="order").reset_index(drop=True)

    ordered = kwargs.get("ordered", self.ordered)
    new_codes = df["new_codes"]._column
    new_dtype = CategoricalDtype(categories=new_cats, ordered=ordered)

    if kwargs.get("inplace", False):
        self._parent.data = None
        self._parent.mask = new_codes.mask
        self._parent.dtype = new_dtype
        self._parent.children = (new_codes,)
        return None

    return column.build_column(
        data=None,
        dtype=new_dtype,
        mask=new_codes.mask,
        children=(new_codes,),
    )

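# Public analogue of _set_categories (assumes cuDF): values absent from the
# new categories are recoded to null by the left joins above.
import cudf

s = cudf.Series(["a", "b", "a"], dtype="category")
print(s.cat.set_categories(["b", "c"]))  # "a" rows become <NA>
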