def __init__(self, values, name=None):
    """Initialise the index from categorical values.

    Parameters
    ----------
    values : pd.Series, pd.Categorical, pd.CategoricalIndex or other
        A pandas Series with categorical dtype, a ``pd.Categorical`` or a
        ``pd.CategoricalIndex`` is converted to a ``CategoricalColumn``
        built from its codes, categories and ``ordered`` flag. Any other
        object is stored unchanged.
    name : object, optional
        Stored as ``self.name`` and as the only entry of ``self.names``.
    """
    is_categorical_series = (
        isinstance(values, pd.Series)
        and pd.api.types.is_categorical_dtype(values.dtype)
    )

    if is_categorical_series:
        # A Series exposes its categorical parts through the ``.cat``
        # accessor; the codes come back as a Series, hence ``.values``.
        accessor = values.cat
        codes = Buffer(accessor.codes.values)
        values = CategoricalColumn(
            data=codes,
            categories=accessor.categories.tolist(),
            ordered=accessor.ordered,
        )
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        codes = Buffer(values.codes)
        values = CategoricalColumn(
            data=codes,
            categories=values.categories.tolist(),
            ordered=values.ordered,
        )

    self._values = values
    self.name = name
    self.names = [name]
def __setitem__(self, key, value):
    """
    Set the value of self[key] to value.

    If value and self are of different types,
    value is coerced to self.dtype

    Parameters
    ----------
    key : slice or column-like
        Either a slice with a step of 1, or anything accepted by
        ``columnops.as_column``. A boolean key is used as a mask and
        must have the same length as this column.
    value : scalar, None or column-like
        A scalar is broadcast to the number of selected elements;
        anything else is converted with ``columnops.as_column`` and must
        have exactly that many elements.

    Raises
    ------
    NotImplementedError
        If ``key`` is a slice with a step other than 1.
    ValueError
        If a boolean mask has the wrong length, or if the size of
        ``value`` does not match the number of selected elements.
    """
    import cudf.bindings.copying as cpp_copying
    from cudf.dataframe import columnops

    if isinstance(key, slice):
        key_start, key_stop, key_stride = key.indices(len(self))
        if key_stride != 1:
            raise NotImplementedError("Stride not supported in slice")
        # With a step of 1, a start at or past the stop selects nothing.
        # abs() would report a positive element count for such a slice.
        nelem = max(0, key_stop - key_start)
    else:
        key = columnops.as_column(key)
        if pd.api.types.is_bool_dtype(key.dtype):
            if not len(key) == len(self):
                raise ValueError(
                    "Boolean mask must be of same length as column")
            # Turn the mask into the positions it selects.
            key = columnops.as_column(cudautils.arange(len(self)))[key]
        nelem = len(key)

    if utils.is_scalar(value):
        if is_categorical_dtype(self.dtype):
            from cudf.dataframe.categorical import CategoricalColumn
            from cudf.dataframe.buffer import Buffer
            from cudf.utils.cudautils import fill_value

            # NOTE(review): the codes buffer is always int8 and
            # ``ordered`` is always False here - confirm this matches
            # columns with wider codes or ordered categories.
            data = rmm.device_array(nelem, dtype="int8")
            fill_value(data, self._encode(value))
            value = CategoricalColumn(
                data=Buffer(data),
                categories=self._categories,
                ordered=False,
            )
        elif value is None:
            value = columnops.column_empty(nelem, self.dtype, masked=True)
        else:
            to_dtype = pd.api.types.pandas_dtype(self.dtype)
            value = utils.scalar_broadcast_to(value, nelem, to_dtype)

    value = columnops.as_column(value).astype(self.dtype)

    if len(value) != nelem:
        msg = (f"Size mismatch: cannot set value "
               f"of size {len(value)} to indexing result of size "
               f"{nelem}")
        raise ValueError(msg)

    if isinstance(key, slice):
        if nelem == 0:
            # Empty slice: nothing to copy, leave the column untouched.
            return
        out = cpp_copying.apply_copy_range(self, value, key_start,
                                           key_stop, 0)
    else:
        out = cpp_copying.apply_scatter(value, key, self)

    self._data = out.data
    self._mask = out.mask
    self._update_null_count()
def __init__(self, values, name=None):
    """Initialise the index from categorical values.

    Parameters
    ----------
    values : pd.Series, pd.Categorical, pd.CategoricalIndex, list, tuple
        or column-like
        A categorical pandas Series, ``pd.Categorical`` or
        ``pd.CategoricalIndex`` is converted to a ``CategoricalColumn``
        built from its codes, categories and ``ordered`` flag. A list or
        tuple is converted through ``pd.Categorical`` using its distinct
        elements, in first-seen order, as the categories. Any other
        object is used as is and must expose ``null_count``.
    name : object, optional
        Stored as ``self.name`` and as the only entry of ``self.names``.

    Raises
    ------
    AssertionError
        If the resulting values report a non-zero ``null_count``.
    """
    if isinstance(values, pd.Series) and \
            pd.api.types.is_categorical_dtype(values.dtype):
        values = CategoricalColumn(
            data=Buffer(values.cat.codes.values),
            categories=values.cat.categories.tolist(),
            ordered=values.cat.ordered)
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        values = CategoricalColumn(data=Buffer(values.codes),
                                   categories=values.categories.tolist(),
                                   ordered=values.ordered)
    elif isinstance(values, (list, tuple)):
        # pd.Categorical rejects duplicate categories, so passing the raw
        # values as categories fails as soon as a label repeats. Use the
        # distinct labels instead, keeping their first-seen order so the
        # result is unchanged for inputs that were already unique.
        categories = list(dict.fromkeys(values))
        values = columnops.as_column(
            pd.Categorical(values, categories=categories))

    assert values.null_count == 0
    self._values = values
    self.name = name
    self.names = [name]
def __init__(self, values, **kwargs):
    """Initialise the index from categorical values.

    Parameters
    ----------
    values : CategoricalColumn, pd.Series, pd.Categorical,
        pd.CategoricalIndex, list or tuple
        A ``CategoricalColumn`` is used as is. A categorical pandas
        Series, ``pd.Categorical`` or ``pd.CategoricalIndex`` is
        converted to a ``CategoricalColumn`` built from its codes,
        categories and ``ordered`` flag. A list or tuple is converted
        through ``pd.Categorical`` using its distinct elements, in
        first-seen order, as the categories.
    **kwargs
        Passed through ``_setdefault_name`` and then on to the parent
        initialiser.

    Raises
    ------
    AssertionError
        If the stored values report a non-zero ``null_count``.
    """
    kwargs = _setdefault_name(values, kwargs)
    if isinstance(values, CategoricalColumn):
        # Already in column form; nothing to convert.
        pass
    elif isinstance(values, pd.Series) and (is_categorical_dtype(
            values.dtype)):
        values = CategoricalColumn(
            data=Buffer(values.cat.codes.values),
            categories=values.cat.categories,
            ordered=values.cat.ordered,
        )
    elif isinstance(values, (pd.Categorical, pd.CategoricalIndex)):
        values = CategoricalColumn(
            data=Buffer(values.codes),
            categories=values.categories,
            ordered=values.ordered,
        )
    elif isinstance(values, (list, tuple)):
        # pd.Categorical rejects duplicate categories, so passing the raw
        # values as categories fails as soon as a label repeats. Use the
        # distinct labels instead, keeping their first-seen order so the
        # result is unchanged for inputs that were already unique.
        categories = list(dict.fromkeys(values))
        values = columnops.as_column(
            pd.Categorical(values, categories=categories))

    super(CategoricalIndex, self).__init__(values, **kwargs)
    assert self._values.null_count == 0
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into a single column.

    Parameters
    ----------
    objs : list of columns
        Columns to concatenate. They must all be type-equivalent to the
        first one.
    dtype : dtype, optional
        Only consulted when ``objs`` is empty, to pick the kind of empty
        column to return.

    Returns
    -------
    A column holding the inputs' elements back to back.

    Raises
    ------
    ValueError
        If the inputs are not all type-equivalent to the first one.
    """
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn

    # Nothing to concatenate: build an empty column of the requested kind
    if len(objs) == 0:
        if pd.api.types.is_categorical_dtype(dtype):
            empty_codes = Column(Buffer.null(np.dtype('int8')))
            return CategoricalColumn(data=empty_codes, null_count=0,
                                     ordered=False)
        if dtype == np.dtype('object'):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        return Column(Buffer.null(np.dtype(dtype)))

    # Strings are concatenated by nvstrings directly
    if all(isinstance(o, StringColumn) for o in objs):
        string_data = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*string_data))

    # Categoricals are first re-encoded against the union of all values
    if all(isinstance(o, CategoricalColumn) for o in objs):
        new_cats = tuple({val for o in objs for val in o})
        objs = [o.cat()._set_categories(new_cats) for o in objs]

    head = objs[0]
    if not all(o.is_type_equivalent(head) for o in objs):
        raise ValueError("All series must be of same type")

    # Zero-length inputs contribute nothing to the output
    objs = [o for o in objs if len(o) > 0]

    nulls = sum(o.null_count for o in objs)
    newsize = sum(len(o) for o in objs)
    data = Buffer.from_empty(
        rmm.device_array(shape=newsize, dtype=head.data.dtype),
        size=newsize)

    # An output mask is only allocated when some input has nulls
    mask = Buffer(utils.make_mask(newsize)) if nulls else None

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _gdf._column_concat(objs, col)
    return col
def melt(frame, id_vars=None, value_vars=None, var_name='variable',
         value_name='value'):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
        A single column name (including a string) may be passed directly.
        default: None
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot.
        A single column name (including a string) may be passed directly.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Raises
    ------
    KeyError
        If `id_vars` or `value_vars` name columns that are not in
        `frame`, or if the two overlap.
    NotImplementedError
        If any selected column is categorical.
    ValueError
        If the `value_vars` columns do not all share one dtype.

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------

    .. code-block:: python

        import cudf
        import numpy as np

        df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
                             'B': {0: 1, 1: 3, 2: 6},
                             'C': {0: 1.0, 1: np.nan, 2: 4.0},
                             'D': {0: 2.0, 1: 5.0, 2: 6.0}})
        df2 = cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
        print(df2)

    Output:

    .. code-block:: python

             A    B variable value
        0    1    1        C   1.0
        1    1    3        C
        2    5    6        C   4.0
        3    1    1        D   2.0
        4    1    3        D   5.0
        5    5    6        D   6.0

    """

    # Arg cleaning
    import collections

    # id_vars
    if id_vars is not None:
        # is_list_like treats a string as one column name (a Sequence
        # check would split it into characters) and accepts ndarrays.
        if not pd.api.types.is_list_like(id_vars):
            id_vars = [id_vars]
        id_vars = list(id_vars)
        missing = set(id_vars) - set(frame.columns)
        if missing:
            raise KeyError(
                "The following 'id_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing)))
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        if not pd.api.types.is_list_like(value_vars):
            value_vars = [value_vars]
        value_vars = list(value_vars)
        missing = set(value_vars) - set(frame.columns)
        if missing:
            raise KeyError(
                "The following 'value_vars' are not present"
                " in the DataFrame: {missing}"
                "".format(missing=list(missing)))
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(pd.api.types.is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError('Categorical columns are not yet '
                                  'supported for function')

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError('all cols in value_vars must have the same dtype')

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if overlap:
        raise KeyError(
            "'value_vars' and 'id_vars' cannot have overlap."
            " The following 'value_vars' are ALSO present"
            " in 'id_vars': {overlap}"
            "".format(overlap=list(overlap)))

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        # Repeat series A back to back `reps` times.
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series(Buffer.null(dtype=A.dtype))

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    # Each value column contributes N copies of its position in
    # value_vars; those positions become the categorical codes.
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(Series(Buffer(
            cudautils.full(size=N, value=i, dtype=np.int8))))
    temp = Series._concat(objs=var_cols, index=None)
    mdata[var_name] = Series(CategoricalColumn(
        categories=tuple(value_vars), data=temp._column.data, ordered=False))

    # Step 3: add values
    mdata[value_name] = Series._concat(
        objs=[frame[val] for val in value_vars],
        index=None)

    return DataFrame(mdata)
def _apply_agg(self, agg_type, result, add_col_values,
               ctx, val_columns, val_columns_out, sort_result=True):
    """
    Run one aggregation over each value column and store the output in
    ``result``.

    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    result : DataFrame
        The DataFrame to store the result of the aggregation into.
    add_col_values : bool
        Boolean to indicate whether this is the first aggregation being
        run and should add the additional columns' values.
    ctx : gdf_context cffi object
        Context object to pass information such as if the dataframe
        is sorted and/or which method to use for grouping.
    val_columns : list of *str*
        The list of column names that the aggregation should be performed
        on.
    val_columns_out : list of *str*
        The list of columns names that the aggregation results should be
        output into.
    sort_result : bool, default True
        When true, ``ctx.flag_sort_result`` is set to 1 before running.

    Returns
    -------
    DataFrame
        The ``result`` object that was passed in.

    Raises
    ------
    RuntimeError
        If ``agg_type`` has no entry in ``self._NAMED_FUNCTIONS``, or if
        the aggregation function returns an error.
    """

    if sort_result:
        ctx.flag_sort_result = 1

    ncols = len(self._by)
    cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

    first_run = add_col_values
    need_to_index = False

    col_count = 0
    for val_col in val_columns:
        col_agg = self._df[val_col]._column.cffi_view

        # assuming here that if there are multiple aggregations that the
        # aggregated results will be in the same order for GDF_SORT method
        if need_to_index:
            out_col_indices_series = Series(
                Buffer(rmm.device_array(col_agg.size, dtype=np.int32)))
            out_col_indices = out_col_indices_series._column.cffi_view
        else:
            out_col_indices = ffi.NULL

        # Output buffers are sized like the input column; one per
        # group-by key, using that key column's data dtype.
        out_col_values_series = [Series(Buffer(rmm.device_array(
            col_agg.size,
            dtype=self._df[self._by[i]]._column.data.dtype)))
            for i in range(0, ncols)]
        out_col_values = [
            out_col_values_series[i]._column.cffi_view
            for i in range(0, ncols)]

        if agg_type == "count":
            out_col_agg_series = Series(
                Buffer(rmm.device_array(col_agg.size, dtype=np.int64)))
        else:
            out_col_agg_series = Series(Buffer(rmm.device_array(
                col_agg.size, dtype=self._df[val_col]._column.data.dtype)))

        out_col_agg = out_col_agg_series._column.cffi_view

        agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
        if agg_func is None:
            raise RuntimeError(
                "ERROR: this aggregator has not been implemented yet")

        err = agg_func(
            ncols,
            cols,
            col_agg,
            out_col_indices,
            out_col_values,
            out_col_agg,
            ctx)

        # The error is carried by the exception; it is not also printed.
        if (err is not None):
            raise RuntimeError(err)

        # Read after the call; presumably agg_func updates the view's
        # size to the number of result rows - TODO confirm.
        num_row_results = out_col_agg.size

        if first_run:
            for i, thisBy in enumerate(self._by):
                result[thisBy] = out_col_values_series[i][
                    :num_row_results]

                if is_categorical_dtype(self._df[thisBy].dtype):
                    # Rebuild the key as a categorical using the source
                    # column's categories and ordering.
                    result[thisBy] = CategoricalColumn(
                        data=result[thisBy].data,
                        categories=self._df[thisBy].cat.categories,
                        ordered=self._df[thisBy].cat.ordered)

        out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index()

        result[val_columns_out[col_count]
               ] = out_col_agg_series[:num_row_results]

        out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index()

        first_run = False
        col_count = col_count + 1

    return result
def _apply_agg(self, agg_type, result, add_col_values,
               ctx, val_columns, val_columns_out, sort_result=True):
    """
    Run one aggregation over each value column and store the output in
    ``result``.

    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    result : DataFrame
        The DataFrame to store the result of the aggregation into.
    add_col_values : bool
        Boolean to indicate whether this is the first aggregation being
        run and should add the additional columns' values.
    ctx : gdf_context cffi object
        Context object to pass information such as if the dataframe
        is sorted and/or which method to use for grouping.
    val_columns : list of *str*
        The list of column names that the aggregation should be performed
        on. A single str or Number is wrapped in a list.
    val_columns_out : list of *str*
        The list of columns names that the aggregation results should be
        output into. A single str or Number is used directly as the
        output column name.
    sort_result : bool, default True
        When true, ``ctx.flag_sort_result`` is set to 1 before running.

    Returns
    -------
    DataFrame
        The ``result`` object that was passed in.

    Raises
    ------
    RuntimeError
        If ``agg_type`` has no entry in ``self._NAMED_FUNCTIONS``, or if
        the aggregation function returns an error.
    """

    if sort_result:
        ctx.flag_sort_result = 1

    ncols = len(self._by)
    cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

    first_run = add_col_values
    need_to_index = self._as_index

    col_count = 0
    # Accept a single column name as well as a list of names.
    if isinstance(val_columns, (str, Number)):
        val_columns = [val_columns]

    for val_col in val_columns:
        col_agg = self._df[val_col]._column.cffi_view

        # assuming here that if there are multiple aggregations that the
        # aggregated results will be in the same order for GDF_SORT method
        if need_to_index:
            out_col_indices_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.int32
                    )
                )
            )
            out_col_indices = out_col_indices_series._column.cffi_view
        else:
            out_col_indices = ffi.NULL

        # One output series per group-by key, sized like the input column.
        out_col_values_series = []
        for i in range(0, ncols):
            if self._df[self._by[i]].dtype == np.dtype('object'):
                # This isn't ideal, but no better way to create an
                # nvstrings object of correct size
                gather_map = zeros(col_agg.size, dtype='int32')
                col = Series([''], dtype='str')[gather_map]\
                    .reset_index(drop=True)
            else:
                col = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=self._df[self._by[i]]._column.data.dtype
                        )
                    )
                )
            out_col_values_series.append(col)
        out_col_values = [
            out_col_values_series[i]._column.cffi_view
            for i in range(0, ncols)]

        # Output dtype: int64 for "count", float64 for "mean", otherwise
        # the value column's own dtype (or a string series for object).
        if agg_type == "count":
            out_col_agg_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.int64
                    )
                )
            )
        elif agg_type == "mean":
            out_col_agg_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.float64
                    )
                )
            )
        else:
            if self._df[val_col].dtype == np.dtype('object'):
                # This isn't ideal, but no better way to create an
                # nvstrings object of correct size
                gather_map = zeros(col_agg.size, dtype='int32')
                out_col_agg_series = Series(
                    [''],
                    dtype='str'
                )[gather_map].reset_index(drop=True)
            else:
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=self._df[val_col]._column.data.dtype
                        )
                    )
                )

        out_col_agg = out_col_agg_series._column.cffi_view

        agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
        if agg_func is None:
            raise RuntimeError(
                "ERROR: this aggregator has not been implemented yet")

        err = agg_func(
            ncols,
            cols,
            col_agg,
            out_col_indices,
            out_col_values,
            out_col_agg,
            ctx)

        if (err is not None):
            raise RuntimeError(err)

        # Read after the call; presumably agg_func updates the view's
        # size to the number of result rows - TODO confirm.
        num_row_results = out_col_agg.size

        # NVStrings columns are not the same going in as coming out but we
        # can't create entire CFFI views otherwise multiple objects will
        # try to free the memory
        for i, col in enumerate(out_col_values_series):
            if col.dtype == np.dtype("object") and len(col) > 0:
                import nvcategory
                # Category pointer taken from the output view; a zero
                # pointer falls back to an empty nvstrings object.
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_values[i].dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_values_series[i]._column._data = nvstr_obj
                out_col_values_series[i]._column._nvcategory = nvcat_obj
        # Same rebinding for the aggregated column when it holds strings.
        if out_col_agg_series.dtype == np.dtype("object") and \
                len(out_col_agg_series) > 0:
            import nvcategory
            nvcat_ptr = int(
                ffi.cast(
                    "uintptr_t",
                    out_col_agg.dtype_info.category
                )
            )
            nvcat_obj = None
            if nvcat_ptr:
                nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                nvstr_obj = nvcat_obj.to_strings()
            else:
                import nvstrings
                nvstr_obj = nvstrings.to_device([])
            out_col_agg_series._column._data = nvstr_obj
            out_col_agg_series._column._nvcategory = nvcat_obj

        # Group-by key columns are written only on the first run.
        if first_run:
            for i, thisBy in enumerate(self._by):
                result[thisBy] = out_col_values_series[i][
                    :num_row_results]

                if is_categorical_dtype(self._df[thisBy].dtype):
                    # Rebuild the key as a categorical using the source
                    # column's categories and ordering.
                    result[thisBy] = CategoricalColumn(
                        data=result[thisBy].data,
                        categories=self._df[thisBy].cat.categories,
                        ordered=self._df[thisBy].cat.ordered
                    )

        # The buffer size is only adjusted for non-string outputs.
        if out_col_agg_series.dtype != np.dtype("object"):
            out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index(drop=True)

        if isinstance(val_columns_out, (str, Number)):
            result[val_columns_out] = out_col_agg_series[:num_row_results]
        else:
            result[val_columns_out[col_count]
                   ] = out_col_agg_series[:num_row_results]

        if out_col_agg_series.dtype != np.dtype("object"):
            out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index(drop=True)

        first_run = False
        col_count = col_count + 1

    return result
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into a single column.

    Parameters
    ----------
    objs : sequence of columns
        Columns to concatenate. The caller's sequence is not modified.
    dtype : dtype, optional
        Only consulted when ``objs`` is empty, to pick the kind of empty
        column to return.

    Returns
    -------
    A column holding the inputs' elements back to back.

    Raises
    ------
    ValueError
        If, after the casts below, the inputs are not all
        type-equivalent to the first one.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn
    from cudf.dataframe.numerical import NumericalColumn

    # Entries are replaced in place below, so work on a private copy
    # rather than changing the caller's list (this also accepts tuples).
    objs = list(objs)

    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if dtype.type in (np.object_, np.str_):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        elif is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype("int8"))),
                null_count=0,
                ordered=False,
            )
        else:
            return Column(Buffer.null(dtype))

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (len([
        o for o in not_null_cols
        if not isinstance(o, NumericalColumn)
        or np.issubdtype(o.dtype, np.datetime64)
    ]) == 0):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for obj in objs:
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not objs[i].is_type_equivalent(head):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.dataframe.columnops import column_empty_like
                objs[i] = column_empty_like(head,
                                            dtype=head.dtype,
                                            masked=True,
                                            newsize=len(obj))

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (Series(Column._concat([o.categories for o in objs
                                       ])).drop_duplicates()._column)
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.is_type_equivalent(head)):
            raise ValueError("All series must be of same type")

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation
    if newsize > 0:
        col = _column_concat(objs, col)

    return col