def _add_fold(s, kfold, fold_seed=None):
    """Deterministically computes a '__fold__' column, given an optional random seed"""
    typ = np.min_scalar_type(kfold * 2)
    if fold_seed is None:
        # If we don't have a specific seed,
        # just use a simple modulo-based mapping
        fold = _arange(len(s), like_df=s, dtype=typ)
        np.mod(fold, kfold, out=fold)
        return fold
    else:
        state = _random_state(fold_seed, like_df=s)
        return state.choice(_arange(kfold, like_df=s, dtype=typ), len(s))
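
# A minimal, CPU-only usage sketch of _add_fold above, assuming the dispatch helpers
# (_arange, _random_state) behave like their numpy counterparts on pandas data. The
# sketch function name, the kfold value, and the toy frame are illustrative, not
# taken from the original source.
import numpy as np
import pandas as pd

def _add_fold_sketch(index, kfold, fold_seed=None):
    typ = np.min_scalar_type(kfold * 2)
    if fold_seed is None:
        # Deterministic path: rows 0..n-1 cycle through folds 0..kfold-1
        fold = np.arange(len(index), dtype=typ)
        np.mod(fold, kfold, out=fold)
        return fold
    # Seeded path: reproducible random fold assignment
    state = np.random.RandomState(fold_seed)
    return state.choice(np.arange(kfold, dtype=typ), len(index))

df = pd.DataFrame({"x": range(6)})
print(_add_fold_sketch(df.index, 3))                # [0 1 2 0 1 2]
print(_add_fold_sketch(df.index, 3, fold_seed=42))  # reproducible, possibly unbalanced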
def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
    new_df = type(df)()
    tmp = "__tmp__"  # Temporary column for sorting
    df[tmp] = _arange(len(df), like_df=df, dtype="int32")
    cat_names, multi_col_group = nvt_cat._get_multicolumn_names(
        col_selector, df.columns, self.name_sep
    )
    _read_pq_func = _read_parquet_dispatch(df)
    for name in cat_names:
        storage_name = self.storage_name.get(name, name)
        name = multi_col_group.get(name, name)
        path = self.categories[storage_name]
        selection_l = list(name) if isinstance(name, tuple) else [name]
        selection_r = list(name) if isinstance(name, tuple) else [storage_name]
        stat_df = nvt_cat._read_groupby_stat_df(
            path, storage_name, self.cat_cache, _read_pq_func
        )
        tran_df = df[selection_l + [tmp]].merge(
            stat_df, left_on=selection_l, right_on=selection_r, how="left"
        )
        tran_df = tran_df.sort_values(tmp)
        tran_df.drop(columns=selection_l + [tmp], inplace=True)
        new_cols = [c for c in tran_df.columns if c not in new_df.columns]
        new_part = tran_df[new_cols].reset_index(drop=True)
        new_df = _concat_columns([new_df, new_part])
    df.drop(columns=[tmp], inplace=True)
    return new_df
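
# A hedged sketch of the order-restoring merge pattern used in transform above: a
# temporary "__tmp__" row-number column is carried through the left merge with the
# groupby-statistics table, then used to sort the result back to the input row order
# (merges, particularly on GPU backends, need not preserve order). The stats table
# below is a hypothetical stand-in for what nvt_cat._read_groupby_stat_df would load.
import pandas as pd

df = pd.DataFrame({"cat": ["b", "a", "b", "c"]})
stat_df = pd.DataFrame({"cat": ["a", "b"], "cat_count": [10, 20]})  # hypothetical stats

tmp = "__tmp__"
df[tmp] = range(len(df))
tran_df = df[["cat", tmp]].merge(stat_df, left_on=["cat"], right_on=["cat"], how="left")
tran_df = tran_df.sort_values(tmp)          # restore original row order
tran_df.drop(columns=["cat", tmp], inplace=True)
new_part = tran_df.reset_index(drop=True)   # clean 0..n-1 index for column-wise concat
df.drop(columns=[tmp], inplace=True)
print(new_part)  # cat_count: [20.0, 10.0, 20.0, NaN]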
def transform(self, columns: ColumnNames, df: DataFrameType) -> DataFrameType:
    # Add temporary column for sorting
    tmp = "__tmp__"
    df[tmp] = _arange(len(df), like_df=df, dtype="int32")

    fit_folds = self.kfold > 1
    if fit_folds:
        df[self.fold_name] = _add_fold(df.index, self.kfold, self.fold_seed)

    # Need mean of continuous target column
    y_mean = self.target_mean or self.means

    # Loop over categorical-column groups and apply logic
    new_df = None
    for ind, cat_group in enumerate(columns):
        if isinstance(cat_group, tuple):
            cat_group = list(cat_group)
        elif isinstance(cat_group, str):
            cat_group = [cat_group]

        if new_df is None:
            new_df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
        else:
            _df = self._op_group_logic(cat_group, df, y_mean, fit_folds, ind)
            new_df = _concat_columns([new_df, _df])

    # Drop temporary columns
    df.drop(
        columns=[tmp, self.fold_name] if fit_folds and self.drop_folds else [tmp],
        inplace=True,
    )
    if fit_folds and not self.drop_folds:
        new_df[self.fold_name] = df[self.fold_name]
    return new_df
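
# A hedged sketch of the per-group logic that self._op_group_logic is assumed to
# implement: out-of-fold target encoding. For each fold, category means are computed
# from the *other* folds only, so a row never sees its own target value; categories
# unseen in the training folds fall back to the global target mean (y_mean above).
# All names and values here are illustrative, not from the original source.
import pandas as pd

df = pd.DataFrame({
    "cat": ["a", "a", "b", "b", "a", "b"],
    "y":   [1.0, 0.0, 1.0, 1.0, 1.0, 0.0],
    "__fold__": [0, 1, 0, 1, 0, 1],
})
y_mean = df["y"].mean()

encoded = pd.Series(index=df.index, dtype="float64")
for fold in df["__fold__"].unique():
    holdout = df["__fold__"] == fold
    means = df[~holdout].groupby("cat")["y"].mean()  # fit on the other folds only
    encoded[holdout] = df.loc[holdout, "cat"].map(means).fillna(y_mean)
print(encoded)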
def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFrameType:
    self.cpu = isinstance(df, pd.DataFrame)
    tmp = "__tmp__"  # Temporary column for sorting
    df[tmp] = _arange(len(df), like_df=df, dtype="int32")
    new_df = self._merge(df, self._ext)
    new_df = new_df.sort_values(tmp)
    new_df.drop(columns=[tmp], inplace=True)
    df.drop(columns=[tmp], inplace=True)
    new_df.reset_index(drop=True, inplace=True)
    return new_df
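
# A hedged sketch of the external-join pattern above: the left merge against an
# external table (self._ext) can reorder or re-index rows, so a temporary row-number
# column restores the order and reset_index(drop=True) leaves a clean RangeIndex.
# `ext` and the toy data are hypothetical stand-ins for the external dataset.
import pandas as pd

df = pd.DataFrame({"user": [3, 1, 2]})
ext = pd.DataFrame({"user": [1, 2, 3], "age": [21, 35, 48]})

tmp = "__tmp__"
df[tmp] = range(len(df))
new_df = df.merge(ext, on="user", how="left")  # stand-in for self._merge(df, self._ext)
new_df = new_df.sort_values(tmp)
new_df.drop(columns=[tmp], inplace=True)
df.drop(columns=[tmp], inplace=True)
new_df.reset_index(drop=True, inplace=True)
print(new_df)  # user/age rows back in the original 3, 1, 2 order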
def _encode(
    name,
    storage_name,
    path,
    df,
    cat_cache,
    na_sentinel=-1,
    freq_threshold=0,
    search_sorted=False,
    buckets=None,
    encode_type="joint",
    cat_names=None,
    max_size=0,
):
    if isinstance(buckets, int):
        buckets = {name: buckets for name in cat_names}
    # this is to apply freq_hashing logic
    if max_size:
        freq_threshold = 1
    value = None
    # Left-hand selection uses the df column name(s); right-hand selection uses
    # the storage name of the persisted category table
    selection_l = name if isinstance(name, list) else [name]
    selection_r = name if isinstance(name, list) else [storage_name]
    list_col = _is_list_col(selection_l, df)

    if path:
        read_pq_func = _read_parquet_dispatch(df)
        if cat_cache is not None:
            cat_cache = (
                cat_cache if isinstance(cat_cache, str) else cat_cache.get(storage_name, "disk")
            )
            if len(df):
                with get_worker_cache("cats") as cache:
                    value = fetch_table_data(
                        cache,
                        path,
                        columns=selection_r,
                        cache=cat_cache,
                        cats_only=True,
                        reader=read_pq_func,
                    )
        else:
            value = read_pq_func(path, columns=selection_r)
            value.index.name = "labels"
            value.reset_index(drop=False, inplace=True)

    if value is None:
        # No category table was loaded; fall back to a single-row null table
        value = type(df)()
        for c in selection_r:
            typ = df[selection_l[0]].dtype if len(selection_l) == 1 else df[c].dtype
            value[c] = df._constructor_sliced([None], dtype=typ)
        value.index.name = "labels"
        value.reset_index(drop=False, inplace=True)

    if not search_sorted:
        if list_col:
            codes = _flatten_list_column(df[selection_l[0]])
            codes["order"] = _arange(len(codes), like_df=df)
        else:
            codes = type(df)({"order": _arange(len(df), like_df=df)}, index=df.index)
            for c in selection_l:
                codes[c] = df[c].copy()
        if buckets and storage_name in buckets:
            na_sentinel = _hash_bucket(df, buckets, selection_l, encode_type=encode_type)
        # apply frequency hashing
        if freq_threshold and buckets and storage_name in buckets:
            merged_df = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")
            merged_df.reset_index(drop=True, inplace=True)
            max_id = merged_df["labels"].max()
            merged_df["labels"].fillna(
                df._constructor_sliced(na_sentinel + max_id + 1), inplace=True
            )
            labels = merged_df["labels"].values
        # only do hashing
        elif buckets and storage_name in buckets:
            labels = na_sentinel
        # no hashing
        else:
            na_sentinel = 0
            labels = codes.merge(
                value, left_on=selection_l, right_on=selection_r, how="left"
            ).sort_values("order")["labels"]
            labels.fillna(na_sentinel, inplace=True)
            labels = labels.values
    else:
        # Use `searchsorted` if we are using a "full" encoding
        if list_col:
            labels = value[selection_r].searchsorted(
                df[selection_l[0]].list.leaves, side="left", na_position="first"
            )
        else:
            labels = value[selection_r].searchsorted(
                df[selection_l], side="left", na_position="first"
            )
        labels[labels >= len(value[selection_r])] = na_sentinel

    if list_col:
        labels = _encode_list_column(df[selection_l[0]], labels)

    return labels
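
# A hedged sketch of the search_sorted branch above: when the category table holds
# the full, sorted vocabulary, labels can be computed with searchsorted instead of a
# merge, and positions past the end of the table (values absent from the vocabulary)
# are mapped to na_sentinel. Toy data only; the real code reads the table from
# parquet, and its na_position="first" argument is a cudf extension that plain
# pandas searchsorted does not accept, so it is omitted here.
import pandas as pd

value = pd.DataFrame({"cat": ["a", "b", "c"]})    # sorted category table ("full" encoding)
df = pd.DataFrame({"cat": ["b", "z", "a", "c"]})  # "z" is out of vocabulary
na_sentinel = 0

labels = value["cat"].searchsorted(df["cat"], side="left")
labels[labels >= len(value["cat"])] = na_sentinel
print(labels)  # [1 0 0 2] -- "z" searchsorts past the end and becomes the sentinel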