def _apply_agg(self, agg_type, result, add_col_values,
               ctx, val_columns, val_columns_out, sort_result=True):
    """
    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    result : DataFrame
        The DataFrame to store the result of the aggregation into.
    add_col_values : bool
        Boolean to indicate whether this is the first aggregation being
        run and should add the additional columns' values.
    ctx : gdf_context cffi object
        Context object to pass information such as whether the dataframe
        is sorted and/or which method to use for grouping.
    val_columns : list of str
        The list of column names that the aggregation should be performed
        on.
    val_columns_out : list of str
        The list of column names that the aggregation results should be
        output into.
    """
    if sort_result:
        ctx.flag_sort_result = 1

    ncols = len(self._by)
    cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

    first_run = add_col_values
    need_to_index = False

    col_count = 0
    for val_col in val_columns:
        col_agg = self._df[val_col]._column.cffi_view

        # assuming here that if there are multiple aggregations, the
        # aggregated results will be in the same order for the GDF_SORT
        # method
        if need_to_index:
            out_col_indices_series = Series(
                Buffer(rmm.device_array(col_agg.size, dtype=np.int32)))
            out_col_indices = out_col_indices_series._column.cffi_view
        else:
            out_col_indices = ffi.NULL

        # Allocate output buffers for the key columns and the aggregate
        out_col_values_series = [
            Series(Buffer(rmm.device_array(
                col_agg.size,
                dtype=self._df[self._by[i]]._column.data.dtype)))
            for i in range(0, ncols)]
        out_col_values = [
            out_col_values_series[i]._column.cffi_view
            for i in range(0, ncols)]

        if agg_type == "count":
            # count always produces int64 results
            out_col_agg_series = Series(
                Buffer(rmm.device_array(col_agg.size, dtype=np.int64)))
        else:
            out_col_agg_series = Series(Buffer(rmm.device_array(
                col_agg.size,
                dtype=self._df[val_col]._column.data.dtype)))

        out_col_agg = out_col_agg_series._column.cffi_view

        agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
        if agg_func is None:
            raise RuntimeError(
                "ERROR: this aggregator has not been implemented yet")

        err = agg_func(
            ncols,
            cols,
            col_agg,
            out_col_indices,
            out_col_values,
            out_col_agg,
            ctx)

        if err is not None:
            raise RuntimeError(err)

        num_row_results = out_col_agg.size

        if first_run:
            for i, thisBy in enumerate(self._by):
                result[thisBy] = out_col_values_series[i][:num_row_results]

                if is_categorical_dtype(self._df[thisBy].dtype):
                    result[thisBy] = CategoricalColumn(
                        data=result[thisBy].data,
                        categories=self._df[thisBy].cat.categories,
                        ordered=self._df[thisBy].cat.ordered)

        # Truncate the output buffer to the number of resulting groups
        out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index()

        result[val_columns_out[col_count]] = \
            out_col_agg_series[:num_row_results]

        out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index()

        first_run = False
        col_count = col_count + 1

    return result
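# Schematic sketch (hypothetical helper, not part of the original source):
# the dispatch pattern _apply_agg follows, with a plain dict standing in
# for _NAMED_FUNCTIONS and Python lists standing in for the libgdf device
# buffers. It shows the same lookup-then-fail-fast flow over a dispatch
# table keyed by the aggregation name.
def _demo_agg_dispatch(agg_type, group_ids, values):
    named_functions = {"count": len, "max": max, "min": min, "sum": sum}
    agg_func = named_functions.get(agg_type, None)
    if agg_func is None:
        raise RuntimeError(
            "ERROR: this aggregator has not been implemented yet")
    groups = {}
    for gid, val in zip(group_ids, values):
        groups.setdefault(gid, []).append(val)
    return {gid: agg_func(vals) for gid, vals in groups.items()}


# e.g. _demo_agg_dispatch("max", [0, 0, 1], [1.0, 2.0, 3.0])
#      -> {0: 2.0, 1: 3.0}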
def _get_row_major(self, df, row_tuple):
    slice_access = False
    if isinstance(row_tuple[0], numbers.Number):
        valid_indices = row_tuple[0]
    elif isinstance(row_tuple[0], slice):
        # Handle the empty-slice case first
        if row_tuple[0].stop == 0:
            valid_indices = []
        else:
            slice_access = True
            start = row_tuple[0].start or 0
            stop = row_tuple[0].stop or len(df)
            step = row_tuple[0].step or 1
            valid_indices = cudautils.arange(start, stop, step)
    else:
        valid_indices = self._compute_validity_mask(df, row_tuple)
    from cudf import Series
    result = df.take(Series(valid_indices))

    # Build new index - INDEX based MultiIndex
    # ----------------------------------------
    from cudf import DataFrame
    out_index = DataFrame()
    # Select the last n-k columns where n is the number of source
    # levels and k is the length of the indexing tuple
    size = 0
    if not isinstance(row_tuple[0], (numbers.Number, slice)):
        size = len(row_tuple)
    for k in range(size, len(df.index.levels)):
        out_index.add_column(df.index.names[k],
                             df.index.codes[df.index.codes.columns[k]])

    # If there's only one column remaining in the output index, convert
    # it into an Index and name the final index values according
    # to the proper codes.
    if len(out_index.columns) == 1:
        out_index = []
        last = len(result.index.codes.columns) - 1
        for val in result.index.codes[result.index.codes.columns[last]]:
            out_index.append(result.index.levels[last][val])
        out_index = as_index(out_index)
        out_index.name = result.index.names[-1]
        result.index = out_index
    else:
        if len(result) == 1 and size == 0 and slice_access is False:
            # If the final result is one row and it was not indexed into
            # directly, convert it to a Series named by the key tuple
            result = result.T
            result = result[result.columns[0]]
            series_name = []
            for idx, code in enumerate(result.columns.codes):
                series_name.append(result.columns.levels[idx][
                    result.columns.codes[code][0]])
            result = Series(list(result._cols.values())[0],
                            name=series_name)
            result.name = tuple(series_name)
        elif len(out_index.columns) > 0:
            # Otherwise pop the leftmost levels, names, and codes from
            # the source index until it has the correct number of
            # columns (n-k)
            result.index = result.index._popn(size)
    return result
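# Illustrative sketch (pandas analogue, hypothetical name, not part of
# the original source): the row-major semantics _get_row_major mirrors.
# Indexing a k-tuple into an n-level MultiIndex keeps only the last n-k
# levels, and consuming every level downcasts the result to a Series.
import pandas as pd


def _demo_row_major_semantics():
    idx = pd.MultiIndex.from_tuples(
        [("a", 1), ("a", 2), ("b", 1)], names=["outer", "inner"])
    df = pd.DataFrame({"val": [10, 20, 30]}, index=idx)
    partial = df.loc["a"]        # one level consumed; index is now 'inner'
    assert partial.index.name == "inner"
    full = df.loc[("b", 1)]      # all levels consumed; downcasts to Series
    assert full.name == ("b", 1)
    return partial, full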
def _apply_agg(self, agg_type, result, add_col_values,
               ctx, val_columns, val_columns_out, sort_result=True):
    """
    Parameters
    ----------
    agg_type : str
        The aggregation function to run.
    result : DataFrame
        The DataFrame to store the result of the aggregation into.
    add_col_values : bool
        Boolean to indicate whether this is the first aggregation being
        run and should add the additional columns' values.
    ctx : gdf_context cffi object
        Context object to pass information such as whether the dataframe
        is sorted and/or which method to use for grouping.
    val_columns : list of str
        The list of column names that the aggregation should be performed
        on.
    val_columns_out : list of str
        The list of column names that the aggregation results should be
        output into.
    """
    if sort_result:
        ctx.flag_sort_result = 1

    ncols = len(self._by)
    cols = [self._df[thisBy]._column.cffi_view for thisBy in self._by]

    first_run = add_col_values
    need_to_index = self._as_index

    col_count = 0
    if isinstance(val_columns, (str, Number)):
        val_columns = [val_columns]
    for val_col in val_columns:
        col_agg = self._df[val_col]._column.cffi_view

        # assuming here that if there are multiple aggregations, the
        # aggregated results will be in the same order for the GDF_SORT
        # method
        if need_to_index:
            out_col_indices_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.int32
                    )
                )
            )
            out_col_indices = out_col_indices_series._column.cffi_view
        else:
            out_col_indices = ffi.NULL

        out_col_values_series = []
        for i in range(0, ncols):
            if self._df[self._by[i]].dtype == np.dtype('object'):
                # This isn't ideal, but no better way to create an
                # nvstrings object of correct size
                gather_map = zeros(col_agg.size, dtype='int32')
                col = Series([''], dtype='str')[gather_map]\
                    .reset_index(drop=True)
            else:
                col = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=self._df[self._by[i]]._column.data.dtype
                        )
                    )
                )
            out_col_values_series.append(col)
        out_col_values = [
            out_col_values_series[i]._column.cffi_view
            for i in range(0, ncols)]

        if agg_type == "count":
            # count always produces int64 results
            out_col_agg_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.int64
                    )
                )
            )
        elif agg_type == "mean":
            # mean always produces float64 results
            out_col_agg_series = Series(
                Buffer(
                    rmm.device_array(
                        col_agg.size,
                        dtype=np.float64
                    )
                )
            )
        else:
            if self._df[val_col].dtype == np.dtype('object'):
                # This isn't ideal, but no better way to create an
                # nvstrings object of correct size
                gather_map = zeros(col_agg.size, dtype='int32')
                out_col_agg_series = Series(
                    [''],
                    dtype='str'
                )[gather_map].reset_index(drop=True)
            else:
                out_col_agg_series = Series(
                    Buffer(
                        rmm.device_array(
                            col_agg.size,
                            dtype=self._df[val_col]._column.data.dtype
                        )
                    )
                )

        out_col_agg = out_col_agg_series._column.cffi_view

        agg_func = self._NAMED_FUNCTIONS.get(agg_type, None)
        if agg_func is None:
            raise RuntimeError(
                "ERROR: this aggregator has not been implemented yet")

        err = agg_func(
            ncols,
            cols,
            col_agg,
            out_col_indices,
            out_col_values,
            out_col_agg,
            ctx)

        if err is not None:
            raise RuntimeError(err)

        num_row_results = out_col_agg.size

        # NVStrings columns are not the same going in as coming out, but
        # we can't create entire CFFI views or multiple objects will try
        # to free the same memory
        for i, col in enumerate(out_col_values_series):
            if col.dtype == np.dtype("object") and len(col) > 0:
                import nvcategory
                nvcat_ptr = int(
                    ffi.cast(
                        "uintptr_t",
                        out_col_values[i].dtype_info.category
                    )
                )
                nvcat_obj = None
                if nvcat_ptr:
                    nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                    nvstr_obj = nvcat_obj.to_strings()
                else:
                    import nvstrings
                    nvstr_obj = nvstrings.to_device([])
                out_col_values_series[i]._column._data = nvstr_obj
                out_col_values_series[i]._column._nvcategory = nvcat_obj
        if out_col_agg_series.dtype == np.dtype("object") and \
                len(out_col_agg_series) > 0:
            import nvcategory
            nvcat_ptr = int(
                ffi.cast(
                    "uintptr_t",
                    out_col_agg.dtype_info.category
                )
            )
            nvcat_obj = None
            if nvcat_ptr:
                nvcat_obj = nvcategory.bind_cpointer(nvcat_ptr)
                nvstr_obj = nvcat_obj.to_strings()
            else:
                import nvstrings
                nvstr_obj = nvstrings.to_device([])
            out_col_agg_series._column._data = nvstr_obj
            out_col_agg_series._column._nvcategory = nvcat_obj

        if first_run:
            for i, thisBy in enumerate(self._by):
                result[thisBy] = out_col_values_series[i][:num_row_results]

                if is_categorical_dtype(self._df[thisBy].dtype):
                    result[thisBy] = CategoricalColumn(
                        data=result[thisBy].data,
                        categories=self._df[thisBy].cat.categories,
                        ordered=self._df[thisBy].cat.ordered
                    )

        # Truncate the output buffer to the number of resulting groups;
        # string columns already have the correct size
        if out_col_agg_series.dtype != np.dtype("object"):
            out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index(drop=True)

        if isinstance(val_columns_out, (str, Number)):
            result[val_columns_out] = out_col_agg_series[:num_row_results]
        else:
            result[val_columns_out[col_count]] = \
                out_col_agg_series[:num_row_results]

        if out_col_agg_series.dtype != np.dtype("object"):
            out_col_agg_series.data.size = num_row_results
        out_col_agg_series = out_col_agg_series.reset_index(drop=True)

        first_run = False
        col_count = col_count + 1

    return result
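# Illustrative sketch (pandas analogue, hypothetical name, not part of
# the original source): the gather-map trick used above to materialize a
# string column of a desired size, by gathering a one-element Series with
# an all-zeros int32 map and dropping the repeated index.
import numpy as np
import pandas as pd


def _demo_string_gather(size):
    gather_map = np.zeros(size, dtype="int32")
    col = pd.Series([""], dtype="str")[gather_map].reset_index(drop=True)
    assert len(col) == size
    return col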
def _index_and_downcast(self, result, index, index_key):
    from cudf import DataFrame
    from cudf import Series

    if isinstance(index_key, (numbers.Number, slice)):
        index_key = [index_key]
    if (
        len(index_key) > 0 and not isinstance(index_key, tuple)
    ) or isinstance(index_key[0], slice):
        index_key = index_key[0]

    slice_access = False
    if isinstance(index_key, slice):
        slice_access = True
    out_index = DataFrame()
    # Select the last n-k columns where n is the number of _source_data
    # columns and k is the length of the indexing tuple
    size = 0
    if not isinstance(index_key, (numbers.Number, slice)):
        size = len(index_key)
    for k in range(size, len(index._source_data.columns)):
        out_index.add_column(
            index.names[k],
            index._source_data[index._source_data.columns[k]],
        )

    if len(result) == 1 and size == 0 and slice_access is False:
        # If the final result is one row and it was not indexed into
        # directly, return a Series with a tuple as its name.
        result = result.T
        result = result[result.columns[0]]
        # convert to Series
        series_name = []
        for idx, code in enumerate(index._source_data.columns):
            series_name.append(result.columns._source_data[code][0])
        result = Series(list(result._cols.values())[0],
                        index=result.index)
        result.name = tuple(series_name)
    elif len(result) == 0 and slice_access is False:
        # Pandas returns an empty Series with a tuple as name for the
        # one expected result column
        series_name = []
        for idx, code in enumerate(index._source_data.columns):
            series_name.append(index._source_data[code][0])
        result = Series([])
        result.name = tuple(series_name)
    elif len(out_index.columns) == 1:
        # If there's only one column remaining in the output index,
        # convert it into an Index and name the final index values
        # according to the _source_data column names
        last_column = index._source_data.columns[-1]
        out_index = index._source_data[last_column]
        out_index = as_index(out_index)
        out_index.name = index.names[len(index.names) - 1]
        index = out_index
    elif len(out_index.columns) > 1:
        # Otherwise pop the leftmost levels, names, and codes from the
        # source index until it has the correct number of columns (n-k)
        index = index._popn(size)
    if isinstance(index_key, tuple):
        result = result.set_index(index)
    return result
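# Illustrative sketch (pandas analogue, hypothetical name, not part of
# the original source): the pandas behavior the one-row branch above
# reproduces; a fully specified key tuple becomes the name of the
# resulting Series, matching how series_name is assembled.
import pandas as pd


def _demo_downcast_name():
    idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["x", "y"])
    df = pd.DataFrame({"v": [10, 20]}, index=idx)
    row = df.loc[("a", 1)]       # one row, fully indexed into
    assert row.name == ("a", 1)  # tuple name, as in result.name above
    return row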