def __getitem__(self, arg):
    """Select values from this column.

    Supports three addressing modes:
    * scalar (``Number``)  -> a single element via ``element_indexing``
    * ``slice``            -> a new column over the sliced data (and mask)
    * anything else        -> coerced to a column and used either as
      integer take-indices or as a boolean mask.
    """
    from cudf.dataframe import columnops
    if isinstance(arg, Number):
        arg = int(arg)
        return self.element_indexing(arg)
    elif isinstance(arg, slice):
        # compute mask slice
        if self.null_count > 0:
            # Stepped slices would need a strided gather of the null
            # mask, which is not implemented for masked columns.
            if arg.step is not None and arg.step != 1:
                raise NotImplementedError(arg)
            # slicing data
            subdata = self.data[arg]
            # slicing mask
            # NOTE(review): for string ("object") columns ``size`` appears
            # to be a method on the data container, elsewhere a plain
            # attribute — confirm against the Buffer/nvstrings types.
            if self.dtype == "object":
                data_size = self.data.size()
            else:
                data_size = self.data.size
            # Expand the packed null bits to one byte per row so the
            # Python slice applies directly, then re-pack the result.
            bytemask = cudautils.expand_mask_bits(
                data_size, self.mask.to_gpu_array())
            submask = Buffer(cudautils.compact_mask_bytes(bytemask[arg]))
            col = self.replace(data=subdata, mask=submask)
            return col
        else:
            # No nulls: slicing the data buffer alone is sufficient.
            newbuffer = self.data[arg]
            return self.replace(data=newbuffer)
    else:
        arg = columnops.as_column(arg)
        # An empty selection defaults to int32 indices so the
        # integer-dtype branch below handles it.
        if len(arg) == 0:
            arg = columnops.as_column([], dtype="int32")
        if pd.api.types.is_integer_dtype(arg.dtype):
            return self.take(arg.data.mem)
        if pd.api.types.is_bool_dtype(arg.dtype):
            return self.apply_boolean_mask(arg)
        raise NotImplementedError(type(arg))
def reorder_categories(self, new_categories, **kwargs):
    """Re-order this column's categories to *new_categories*.

    *new_categories* must contain exactly the same values as the current
    categories (order is ignored for the comparison).  Returns a Series
    over the updated data, or None when the underlying update produced
    nothing.
    """
    from cudf.dataframe.series import Series
    reordered = columnops.as_column(new_categories)
    # Only set membership matters for validation, so the comparison
    # deliberately ignores category ordering.
    same_categories = self._categories_equal(reordered, ordered=False)
    if not same_categories:
        raise ValueError(
            "items in new_categories are not the same as in "
            "old categories"
        )
    data = self._set_categories(reordered, **kwargs)
    if data is not None:
        return Series(data=data)
def __getattr__(self, attr, *args, **kwargs):
    """Forward unknown attribute access to the wrapped nvstrings object.

    Non-callable attributes are returned as-is.  Callable attributes are
    wrapped so that an ``nvstrings`` result is converted back into a cudf
    Series (carrying this accessor's index); any other result passes
    through untouched.

    Raises
    ------
    AttributeError
        If the underlying data object has no attribute *attr*.
    """
    from cudf.dataframe.series import Series
    if hasattr(self._parent._data, attr):
        passed_attr = getattr(self._parent._data, attr)
        if callable(passed_attr):
            def wrapper(*args, **kwargs):
                ret = getattr(self._parent._data, attr)(*args, **kwargs)
                # BUG FIX: the isinstance check previously ran on the
                # ``wrapper`` function object itself (always False), so
                # nvstrings results were returned raw instead of being
                # wrapped in a Series.  The check must apply to the
                # call's return value, inside the wrapper.
                if isinstance(ret, nvstrings.nvstrings):
                    ret = Series(columnops.as_column(ret),
                                 index=self._index)
                return ret
            return wrapper
        else:
            return passed_attr
    else:
        raise AttributeError(attr)
def __init__(self, levels, codes=None, labels=None, names=None):
    """Build a MultiIndex from *levels* and *codes*.

    Parameters
    ----------
    levels : sequence of array-likes
        The unique values of each level.
    codes : DataFrame or sequence of integer sequences, optional
        Per-level integer positions into *levels*.
    labels : optional
        Deprecated alias for *codes*.
    names : sequence, optional
        Level names; used as the code-frame column names when present.
    """
    self.names = names
    column_names = []
    if labels:
        warnings.warn("the 'labels' keyword is deprecated, use 'codes' "
                      "instead", FutureWarning)
    if labels and not codes:
        codes = labels
    # Derive the column names used for the codes DataFrame.  More than
    # one None in *names* falls back to positional integer names.
    if isinstance(names, (Sequence,
                          pd.core.indexes.frozen.FrozenNDArray,
                          pd.core.indexes.frozen.FrozenList)):
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names
    if len(levels) == 0:
        raise ValueError('Must pass non-zero number of levels/codes')
    import cudf
    # Codes must be either an existing DataFrame or a sequence of
    # sequences (one inner sequence per level).
    if not isinstance(codes, cudf.dataframe.dataframe.DataFrame) and \
            not isinstance(codes[0], (Sequence,
                                      pd.core.indexes.frozen.FrozenNDArray)):
        raise TypeError('Codes is not a Sequence of sequences')
    if not isinstance(codes, cudf.dataframe.dataframe.DataFrame):
        self.codes = cudf.dataframe.dataframe.DataFrame()
        for idx, code in enumerate(codes):
            code = np.array(code)
            self.codes.add_column(column_names[idx],
                                  columnops.as_column(code))
    else:
        self.codes = codes
    # converting levels to numpy array will produce a Float64Index
    # (on empty levels)for levels mimicking the behavior of Pandas
    self.levels = np.array([Series(level).to_array() for level in levels])
    self._validate_levels_and_codes(self.levels, self.codes)
    self.name = None
    self.names = names
def __init__(self, values, name=None):
    """Construct the index from *values*.

    Categorical pandas inputs are converted to a CategoricalColumn;
    plain lists/tuples become a categorical column whose categories are
    the values themselves.  Nulls are not permitted in index values.
    """
    is_categorical_series = (
        isinstance(values, pd.Series)
        and pd.api.types.is_categorical_dtype(values.dtype)
    )
    if is_categorical_series:
        values = CategoricalColumn(
            data=Buffer(values.cat.codes.values),
            categories=values.cat.categories.tolist(),
            ordered=values.cat.ordered)
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        values = CategoricalColumn(
            data=Buffer(values.codes),
            categories=values.categories.tolist(),
            ordered=values.ordered)
    elif isinstance(values, (list, tuple)):
        values = columnops.as_column(
            pd.Categorical(values, categories=values))
    # Index values are expected to be fully valid (no nulls).
    assert values.null_count == 0
    self._values = values
    self.name = name
    self.names = [name]
def astype(self, dtype):
    """Cast this string column to *dtype* using nvstrings typecast kernels.

    Returns a new column; ``self`` is unchanged.
    """
    if self.dtype == dtype:
        return self
    elif dtype in (np.dtype('int8'), np.dtype('int16')):
        # The typecast kernels only produce 32-bit integers, so parse
        # into int32 first and downcast to the requested width below.
        out_dtype = np.dtype(dtype)
        dtype = np.dtype('int32')
    else:
        out_dtype = np.dtype(dtype)
    # Allocate the device output buffer and hand its raw pointer to the
    # nvstrings conversion routine keyed by the parse dtype.
    out_arr = rmm.device_array(shape=len(self), dtype=dtype)
    out_ptr = get_ctype_ptr(out_arr)
    kwargs = {'devptr': out_ptr}
    if dtype == np.dtype('datetime64[ms]'):
        # presumably the timestamp parser needs the target resolution —
        # TODO confirm against the nvstrings API.
        kwargs['units'] = 'ms'
    _str_to_numeric_typecast_functions[np.dtype(dtype)](self.str(),
                                                        **kwargs)
    out_col = columnops.as_column(out_arr)
    # No-op unless the int8/int16 path above widened the parse dtype.
    return out_col.astype(out_dtype)
def __getitem__(self, arg):
    """Positionally index the underlying Series.

    Accepts an int (returns a scalar, like pandas), a tuple of ints, or
    a slice; out-of-bounds positions raise IndexError.
    """
    n = len(self._sr)
    if isinstance(arg, tuple):
        positions = list(arg)
    elif isinstance(arg, int):
        positions = [arg]
    elif isinstance(arg, slice):
        start, stop, step, slice_len = utils.standard_python_slice(n, arg)
        positions = list(range(start, stop, step)) if slice_len > 0 else []
    else:
        raise TypeError(type(arg))
    # Reject any position outside [-n, n).
    for pos in positions:
        if abs(pos) > n or pos == n:
            raise IndexError("positional indexers are out-of-bounds")
    # Normalize negative positions to their non-negative equivalents.
    positions = [pos + n if pos < 0 else pos for pos in positions]
    # A single integer argument yields a scalar, mirroring pandas.
    if isinstance(arg, int) and len(positions) == 1:
        return self._sr[positions[0]]
    gathered = [self._sr[pos] for pos in positions]
    col_data = columnops.as_column(gathered,
                                   dtype=self._sr.dtype,
                                   nan_as_null=True)
    return Series(col_data, index=as_index(np.asarray(positions)))
def as_index(arbitrary, **kwargs):
    """Create an Index from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of Index
        - CategoricalIndex for Categorical input.
        - DatetimeIndex for Datetime input.
        - GenericIndex for all other inputs.
    """
    kwargs = _setdefault_name(arbitrary, kwargs)
    if isinstance(arbitrary, Index):
        return arbitrary.rename(**kwargs)
    # Dispatch each concrete column type to its index wrapper; the order
    # matches the original isinstance chain.
    column_dispatch = (
        (NumericalColumn, GenericIndex),
        (StringColumn, StringIndex),
        (DatetimeColumn, DatetimeIndex),
        (CategoricalColumn, CategoricalIndex),
    )
    for column_type, index_type in column_dispatch:
        if isinstance(arbitrary, column_type):
            return index_type(arbitrary, **kwargs)
    if isinstance(arbitrary, cudf.Series):
        return as_index(arbitrary._column, **kwargs)
    if isinstance(arbitrary, pd.RangeIndex):
        return RangeIndex(start=arbitrary._start,
                          stop=arbitrary._stop,
                          **kwargs)
    # Fall back: coerce to a column and recurse.
    return as_index(columnops.as_column(arbitrary), **kwargs)
def deserialize(cls, header, frames):
    """Reconstruct a CategoricalColumn from a serialized header/frames pair."""
    data, mask = super(CategoricalColumn, cls).deserialize(header, frames)
    # The trailing frames hold the serialized categories column; the
    # header records how many frames belong to it.
    n_category_frames = header["category_frame_count"]
    category_frames = frames[len(frames) - n_category_frames:]
    cat_typ = pickle.loads(header["categories"]["type"])
    raw_categories = cat_typ.deserialize(header["categories"],
                                         category_frames)
    return cls(
        data=data,
        mask=mask,
        categories=columnops.as_column(raw_categories),
        ordered=header["ordered"],
    )
def find_last(arr, val):
    """ Returns the index of the last occurrence of *val* in *arr*.
    Otherwise, returns -1.

    Parameters
    ----------
    arr : device array
    val : scalar
    """
    marks = rmm.device_array_like(arr)
    if marks.size > 0:
        # Pick the kernel variant matching the element type; each kernel
        # writes the match index (or -1) into *marks*.
        use_float = arr.dtype in ('float32', 'float64')
        kernel = gpu_mark_found_float if use_float else gpu_mark_found_int
        kernel.forall(marks.size)(arr, val, marks, -1)
    from cudf.dataframe.columnops import as_column
    return as_column(marks).max()
def __init__(self, data=None, index=None, name=None, nan_as_null=True):
    """Build a Series from *data*, optionally with an explicit *index*.

    pandas Series inputs contribute their name and index; cudf Series
    inputs are unwrapped to their column.  *index*, when given, must
    already be an Index.
    """
    if isinstance(data, pd.Series):
        name = data.name
        index = as_index(data.index)
    if isinstance(data, Series):
        index = index if index is not None else data._index
        name = data.name
        data = data._column
    if data is None:
        data = {}
    if not isinstance(data, columnops.TypedColumnBase):
        data = columnops.as_column(data, nan_as_null=nan_as_null)
    if index is not None and not isinstance(index, Index):
        raise TypeError('index not a Index type: got {!r}'.format(index))
    assert isinstance(data, columnops.TypedColumnBase)
    self._column = data
    self._index = index if index is not None else RangeIndex(len(data))
    self.name = name
def from_masked_array(cls, data, mask, null_count=None):
    """Create a Series with null-mask.

    This is equivalent to:

        Series(data).set_mask(mask, null_count=null_count)

    Parameters
    ----------
    data : 1D array-like
        The values.  Null values must not be skipped.  They can appear
        as garbage values.
    mask : 1D array-like of numpy.uint8
        The null-mask.  Valid values are marked as ``1``; otherwise ``0``.
        The mask bit given the data index ``idx`` is computed as::

            (mask[idx // 8] >> (idx % 8)) & 1
    null_count : int, optional
        The number of null values.
        If None, it is calculated automatically.
    """
    column = columnops.as_column(data)
    masked_column = column.set_mask(mask, null_count=null_count)
    return cls(data=masked_column)
def __init__(self, values, **kwargs):
    """Construct a CategoricalIndex.

    Accepts an existing CategoricalColumn, a categorical pandas
    Series/Categorical/CategoricalIndex, or a plain list/tuple whose
    entries double as the categories.  Nulls are not permitted.
    """
    kwargs = _setdefault_name(values, kwargs)
    if isinstance(values, CategoricalColumn):
        pass  # already in the required form
    elif isinstance(values, pd.Series) and (
            is_categorical_dtype(values.dtype)):
        values = CategoricalColumn(
            data=Buffer(values.cat.codes.values),
            categories=values.cat.categories,
            ordered=values.cat.ordered,
        )
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        values = CategoricalColumn(
            data=Buffer(values.codes),
            categories=values.categories,
            ordered=values.ordered,
        )
    elif isinstance(values, (list, tuple)):
        values = columnops.as_column(
            pd.Categorical(values, categories=values))
    super(CategoricalIndex, self).__init__(values, **kwargs)
    # Index values are expected to be fully valid (no nulls).
    assert self._values.null_count == 0
def as_index(arbitrary, name=None):
    """Create an Index from an arbitrary object

    Currently supported inputs are:

    * ``Column``
    * ``Buffer``
    * ``Series``
    * ``Index``
    * numba device array
    * numpy array
    * pyarrow array
    * pandas.Categorical

    Returns
    -------
    result : subclass of Index
        - CategoricalIndex for Categorical input.
        - DatetimeIndex for Datetime input.
        - GenericIndex for all other inputs.
    """
    # This function should probably be moved to Index.__new__
    if isinstance(arbitrary, Index):
        return arbitrary
    # Dispatch each concrete column type to its index wrapper; the order
    # mirrors the original isinstance chain.
    column_dispatch = (
        (NumericalColumn, GenericIndex),
        (StringColumn, StringIndex),
        (DatetimeColumn, DatetimeIndex),
        (CategoricalColumn, CategoricalIndex),
    )
    for column_type, index_type in column_dispatch:
        if isinstance(arbitrary, column_type):
            return index_type(arbitrary, name=name)
    # Anything else: inherit the object's name when none was given,
    # special-case the empty input, then coerce and recurse.
    if name is None and hasattr(arbitrary, 'name'):
        name = arbitrary.name
    if len(arbitrary) == 0:
        return RangeIndex(0, 0, name=name)
    return as_index(columnops.as_column(arbitrary), name=name)
def __init__(self, data=None, index=None, name=None, nan_as_null=True,
             dtype=None):
    """Build a Series from *data*, optionally with *index* and *dtype*.

    pandas Series inputs contribute their name and index; cudf Series
    inputs are unwrapped to their column.  A non-Index *index* argument
    is coerced via as_index.
    """
    if isinstance(data, pd.Series):
        name = data.name
        index = as_index(data.index)
    if isinstance(data, Series):
        index = index if index is not None else data._index
        name = data.name
        data = data._column
    if data is None:
        data = {}
    if not isinstance(data, columnops.TypedColumnBase):
        data = columnops.as_column(data,
                                   nan_as_null=nan_as_null,
                                   dtype=dtype)
    if index is not None and not isinstance(index, Index):
        index = as_index(index)
    assert isinstance(data, columnops.TypedColumnBase)
    self._column = data
    self._index = index if index is not None else RangeIndex(len(data))
    self.name = name
def fillna(self, fill_value, inplace=False):
    """
    Fill null values with *fill_value*

    A scalar *fill_value* must be one of the existing categories; any
    other scalar raises ValueError.
    """
    result = self.copy()
    if np.isscalar(fill_value):
        if fill_value != self.default_na_value():
            if (fill_value not in self.cat().categories):
                raise ValueError("fill value must be in categories")
            # Wrap the scalar as a pandas Categorical over our categories
            # so it can be converted to a column below.
            # NOTE(review): pd.Categorical normally expects a list-like
            # — confirm a bare scalar is accepted by the pandas version
            # in use here.
            fill_value = pd.Categorical(fill_value,
                                        categories=self.cat().categories)
    fill_value_col = columnops.as_column(fill_value, nan_as_null=False)
    # TODO: only required if fill_value has a subset of the categories:
    fill_value_col = fill_value_col.cat()._set_categories(
        self.cat().categories)
    # Fills nulls in *result* in place on the device.
    cpp_replace.replace_nulls(result, fill_value_col)
    # Drop the now-stale null mask from the result.
    result = result.replace(mask=None)
    return self._mimic_inplace(result, inplace)
def fillna(self, fill_value, inplace=False):
    """
    Fill null values with *fill_value*
    """
    if np.isscalar(fill_value):
        # cast safely to the same dtype as self
        casted = self.dtype.type(fill_value)
        # The cast is safe when the value is NaN or round-trips exactly.
        cast_is_safe = np.isnan(fill_value) or (casted == fill_value)
        if not cast_is_safe:
            raise TypeError(
                "Cannot safely cast non-equivalent {} to {}".format(
                    type(fill_value).__name__, self.dtype.name
                )
            )
        fill_value = casted
    else:
        fill_column = columnops.as_column(fill_value, nan_as_null=False)
        # cast safely to the same dtype as self
        if is_integer_dtype(self.dtype):
            fill_value = safe_cast_to_int(fill_column, self.dtype)
        else:
            fill_value = fill_column.astype(self.dtype)
    result = cpp_replace.apply_replace_nulls(self, fill_value)
    return self._mimic_inplace(result, inplace)
def __init__(
    self, levels=None, codes=None, labels=None, names=None, **kwargs
):
    """Build a MultiIndex from *levels* and *codes*.

    Parameters
    ----------
    levels : sequence of array-likes, optional
        Unique values for each level.
    codes : DataFrame or sequence of integer sequences, optional
        Per-level integer positions into *levels*.
    labels : optional
        Deprecated alias for *codes*.
    names : sequence, optional
        Level names.
    **kwargs
        ``source_data`` short-circuits construction: codes/levels are
        stored as-is for lazy evaluation.
    """
    from cudf.dataframe.series import Series

    self.name = None
    self.names = names
    self._source_data = None
    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' "
            "instead",
            FutureWarning,
        )
    if labels and not codes:
        codes = labels

    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        self._source_data = kwargs["source_data"].reset_index(drop=True)
        self._codes = codes
        self._levels = levels
        return

    # name setup
    # More than one None in *names* falls back to positional names.
    if isinstance(
        names,
        (
            Sequence,
            pd.core.indexes.frozen.FrozenNDArray,
            pd.core.indexes.frozen.FrozenList,
        ),
    ):
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names

    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")

    from cudf import DataFrame

    # Codes must be either a DataFrame or a sequence of sequences.
    if not isinstance(codes, DataFrame) and not isinstance(
        codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
    ):
        raise TypeError("Codes is not a Sequence of sequences")

    if isinstance(codes, DataFrame):
        self._codes = codes
    elif len(levels) == len(codes):
        self._codes = DataFrame()
        # NOTE: the loop variable shadows the outer ``codes`` sequence;
        # safe here because the sequence is not used after the loop.
        for i, codes in enumerate(codes):
            # Empty/falsy column names fall back to the position ``i``.
            name = column_names[i] or i
            codes = columnops.as_column(codes)
            self._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError(
            "MultiIndex has unequal number of levels and "
            "codes and is inconsistent!"
        )

    self._levels = [Series(level) for level in levels]
    self._validate_levels_and_codes(self._levels, self._codes)

    # Materialize the equivalent flat representation by joining each
    # level's values onto its codes.
    self._source_data = DataFrame()
    for i, name in enumerate(self._codes.columns):
        codes = as_index(self._codes[name]._column)
        level = DataFrame({name: self._levels[i]})
        level = DataFrame(index=codes).join(level)
        self._source_data[name] = level[name].reset_index(drop=True)

    self.names = [None] * len(self._levels) if names is None else names
def _find_first_and_last(self, value):
    """Return the (first, last) row positions whose string equals *value*."""
    # Build an int32 0/1 mask of exact (anchored) matches.
    exact_pattern = f"^{value}$"
    match_mask = self.str().contains(exact_pattern).data.mem
    match_mask = cudautils.astype(match_mask, "int32")
    mask_column = columnops.as_column(match_mask)
    first = mask_column.find_first_value(1)
    last = mask_column.find_last_value(1)
    return first, last
def wrapper(*args, **kwargs):
    # Delegate to the wrapped attribute; convert an nvstrings result back
    # into a cudf Series (keeping the accessor's index), pass everything
    # else through unchanged.
    result = getattr(self._parent._data, attr)(*args, **kwargs)
    if not isinstance(result, nvstrings.nvstrings):
        return result
    return Series(columnops.as_column(result), index=self._index)
def get_sorted_inds(by, ascending=True, na_position="last"):
    """
    Sort by the values.

    Parameters
    ----------
    by : Column or list of Column
        Column or list of Column objects to sort by.
    ascending : bool or list of bool, default True
        If True, sort values in ascending order, otherwise descending.
    na_position : {‘first’ or ‘last’}, default ‘last’
        Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs
        at the end.

    Returns
    -------
    col_inds : cuDF Column of indices sorted based on input

    Difference from pandas:
    * Support axis='index' only.
    * Not supporting: inplace, kind
    * Ascending can be a list of bools to control per column
    """
    if isinstance(by, (Column)):
        by = [by]
    inds = Buffer(cudautils.arange(len(by[0])))
    # This is due to current limitation in libcudf of using int32
    col_inds = columnops.as_column(inds).astype('int32')
    # This needs to be updated to handle list of bools for ascending
    # Map na_position onto the integer flag libcudf expects.  The flag
    # meaning flips with sort direction: ascending last->0 / first->1,
    # descending last->1 / first->0.
    if ascending is True:
        if na_position == "last":
            na_position = 0
        elif na_position == "first":
            na_position = 1
    elif ascending is False:
        if na_position == "last":
            na_position = 1
        elif na_position == "first":
            na_position = 0
    else:
        logging.warning(
            "When using a sequence of booleans for `ascending`, `na_position` "
            "flag is not yet supported and defaults to treating nulls as "
            "greater than all numbers"
        )
        na_position = 0
    # If given a scalar need to construct a sequence of length
    # of columns
    if np.isscalar(ascending):
        ascending = [ascending] * len(by)
    # If given a list-like need to convert to a numpy array and copy to device
    if isinstance(ascending, collections.abc.Sequence):
        # Need to flip the boolean here since libcudf has 0 as ascending
        ascending = [not val for val in ascending]
        ascending = rmm.to_device(np.array(ascending, dtype='int8'))
    else:
        raise ValueError("Must use a boolean or list of booleans")
    # Fills col_inds with the sorted permutation on the device.
    cpp_sort.apply_order_by(by, col_inds, ascending, na_position)
    return col_inds
def append(self, other):
    """Return a new column with *other*'s values concatenated after self."""
    from cudf.dataframe.columnops import as_column
    other_column = as_column(other)
    return Column._concat([self, other_column])
def searchsorted(self, value, side="left"):
    """Find insertion points for *value* that keep this column sorted."""
    values_column = columnops.as_column(value)
    return cpp_search.search_sorted(self, values_column, side)
def take(self, indices):
    """Return the element(s) of this index at the given positions."""
    values_column = columnops.as_column(self._values)
    return values_column.element_indexing(indices)
def astype(self, dtype):
    """Cast to *dtype* and return the underlying data buffer."""
    from cudf.dataframe import columnops
    converted = columnops.as_column(self).astype(dtype)
    return converted.data