def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Label-encode the selected columns of a data frame.

    Args:
        input_df (XDataFrame): Input data frame (pandas or cudf).

    Returns:
        XDataFrame : Copy of ``input_df`` with one encoded column added per
        input column, named ``prefix + col + suffix``.
    """
    new_df = input_df.copy()
    input_cols = self._input_cols
    if not input_cols:
        input_cols = new_df.columns.tolist()
    if self._exclude_cols:
        # BUGFIX: the original looped `input_cols.remove(col)`, which mutated
        # `self._input_cols` in place whenever it was non-empty (so a second
        # transform() call raised ValueError), and also raised when an
        # excluded column was absent.  A filtered copy avoids both problems.
        input_cols = [col for col in input_cols if col not in self._exclude_cols]
    for col in input_cols:
        out_col = self._output_prefix + col + self._output_suffix
        if cudf_is_available() and isinstance(new_df, cudf.DataFrame):
            # to_array() copies the column to host memory -- presumably
            # required by Index.get_indexer here; TODO confirm against the
            # cudf version in use.
            X = self._uniques[col].get_indexer(new_df[col].to_array())
        else:
            X = self._uniques[col].get_indexer(new_df[col])
        if self._unseen == "n_unique":
            # get_indexer marks both NaN and unseen values with -1.  The XOR
            # is True only for non-missing unseen values, so NaN stays at -1
            # while genuinely unseen values are remapped to n_unique.
            missing_values = new_df[col].isna()
            unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
            unseen_mask = np.bitwise_xor(missing_values, unseen_values)
            X[unseen_mask] = len(self._uniques[col])
        new_df[out_col] = X
    return new_df
def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Map the selected columns through the fitted label tables.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Copy of ``input_df`` with one encoded column added per
        input column, named ``prefix + col + suffix``.
    """
    new_df = input_df.copy()
    input_cols = self._input_cols
    if not input_cols:
        input_cols = new_df.columns.tolist()
    for col in input_cols:
        out_col = self._output_prefix + col + self._output_suffix
        X = self._uniques[col].get_indexer(new_df[col])
        # get_indexer marks both NaN and unseen values with -1.  The XOR of
        # the two masks is True only for non-missing unseen values.
        # BUGFIX(dedupe): the original computed these three lines twice --
        # once inside the "n_unique" branch and once unconditionally after
        # it -- with identical results; compute them once.
        missing_values = new_df[col].isna()
        unseen_values = np.invert(new_df[col].isin(self._uniques[col]))
        unseen_mask = np.bitwise_xor(missing_values, unseen_values)
        if self._unseen == "n_unique":
            X[unseen_mask] = len(self._uniques[col])
        # Positions outside the mask are translated through the label table.
        # NOTE(review): missing rows fall outside the mask while still
        # holding index -1, so they pick up the LAST label -- confirm this
        # is intended.
        X[~unseen_mask] = np.array(self._labels[col])[X[~unseen_mask]]
        new_df[out_col] = X
    return new_df
def fit_transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Fit to data frame, then transform it.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Output data frame.
    """
    df = input_df.copy()
    if not self._input_cols:
        # First fit: take every column except the explicitly excluded ones.
        excluded = self._exclude_cols
        self._input_cols = [c for c in df.columns.tolist() if c not in excluded]
    return self.transform(df)
def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Transform data frame with the fitted per-column count encoders.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame: Output data frame with one count column added per input
        column, named ``prefix + col + suffix``.
    """
    result = input_df.copy()
    prefix, suffix = self._output_prefix, self._output_suffix
    for column in self._input_cols:
        encoder = self._count_encoders[column]
        result[prefix + column + suffix] = encoder.transform(result[column].copy())
    return result
def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Transform data frame by arithmetically combining column groups.

    Args:
        input_df (XDataFrame): Input data frame.
    """
    # Dispatch table replaces the original if/elif chain; same operators,
    # same semantics.
    binary_ops = {
        "+": lambda lhs, rhs: lhs + rhs,
        "-": lambda lhs, rhs: lhs - rhs,
        "*": lambda lhs, rhs: lhs * rhs,
        "/": lambda lhs, rhs: lhs / rhs,
        "%": lambda lhs, rhs: lhs % rhs,
    }
    out_df = input_df.copy()
    created_cols = []
    n_free = self._r - len(self._include_cols)
    fixed_part = "".join(self._include_cols)
    for chosen in combinations(self._input_cols, r=n_free):
        col_name = (self._output_prefix + fixed_part + "".join(chosen)
                    + self._output_suffix)
        created_cols.append(col_name)
        members = self._include_cols + list(chosen)
        combined = None
        for member in members:
            if combined is None:
                # First member seeds the accumulator.
                combined = out_df[member].copy()
                continue
            op = binary_ops.get(self._operator)
            if op is None:
                raise RuntimeError("Unknown operator is used.")
            combined = op(combined, out_df[member])
        out_df[col_name] = combined
    if self._drop_origin:
        return out_df[created_cols]
    return out_df
def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Apply the fitted target encoders, optionally adding Gaussian noise.

    NOTE(review): when ``noise_level > 0`` the noise is added to EVERY
    column of the output frame (original columns included), not only the
    encoded ones -- confirm this is intended.
    """
    result = input_df.copy()
    for column in self._input_cols:
        series = input_df[column]
        if isinstance(series, pd.Series):
            values = column_or_1d(series, warn=True)
        elif cudf and isinstance(series, cudf.Series):
            values = series
        else:
            raise TypeError
        encoded_name = self._output_prefix + column + self._output_suffix
        result[encoded_name] = self._target_encoders[column].transform(values)
    if self.noise_level > 0:
        # Seeded for reproducibility across calls.
        np.random.seed(self.random_state)
        result += np.random.normal(0, self.noise_level, result.shape)
    return result
def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Transform data frame by applying the stored lambda to each column.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Output data frame (same frame type as the input).
    """
    if isinstance(input_df, pd.DataFrame):
        work_df = input_df.copy()
    elif cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        # Series.apply runs on pandas; converted back to cudf below.
        work_df = input_df.to_pandas()
    else:
        raise RuntimeError("Unexpected data type: {}".format(type(input_df)))

    target_cols = self._input_cols or work_df.columns.tolist()
    if len(self._exclude_cols) > 0:
        target_cols = [c for c in target_cols if c not in self._exclude_cols]

    created_cols = []
    for column in target_cols:
        series = work_df[column]
        if self._fillna is not None:
            series = series.fillna(self._fillna)
        out_name = self._output_prefix + column + self._output_suffix
        work_df[out_name] = series.apply(self._lambda_func)
        created_cols.append(out_name)

    if cudf_is_available() and isinstance(input_df, cudf.DataFrame):
        work_df = cudf.from_pandas(work_df)
    if self._drop_origin:
        return work_df[created_cols]
    return work_df
def transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Transform data frame with the fitted target encoders.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Output data frame with one encoded column added per
        input column, named ``prefix + col + suffix``.
    """
    result = input_df.copy()
    for column in self._input_cols:
        series = input_df[column]
        if isinstance(series, pd.Series):
            values = column_or_1d(series, warn=True)
        elif cudf and isinstance(series, cudf.Series):
            values = series
        else:
            raise TypeError
        out_name = self._output_prefix + column + self._output_suffix
        result[out_name] = self._target_encoders[column].transform(values)
    return result
def aggregation(
    input_df: "XDataFrame",
    group_key: str,
    group_values: List[str],
    agg_methods: List[str],
):
    """Aggregate values after grouping table rows by a given key.

    Arguments:
        input_df (XDataFrame) : Input data frame.
        group_key (str) : Used to determine the groups for the groupby.
        group_values (List[str]) : Used to aggregate values for the groupby.
        agg_methods (List[str]) : List of function names,
            e.g. ['mean', 'max', 'min', 'std'].

    Returns:
        Tuple[XDataFrame, List[str]] : Tuple of output dataframe and new
        column names.
    """
    out_df = input_df.copy()
    created_cols = []
    for method in agg_methods:
        for value_col in group_values:
            agg_col = f"agg_{method}_{value_col}_grpby_{group_key}"
            created_cols.append(agg_col)
            # NOTE(smly):
            # Failed when cudf.DataFrame try to merge with cudf.Series.
            # Use workaround to merge with cudf.DataFrame.
            # Ref: http://github.com/rapidsai/cudf/issues/5013
            grouped = input_df[[value_col] + [group_key]].groupby(group_key)
            agg_frame = grouped[[value_col]].agg(method)
            agg_frame.columns = [agg_col]
            out_df = out_df.merge(
                agg_frame, how="left", right_index=True, left_on=group_key)
    return out_df, created_cols
def fit_transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Fit to data frame, then transform it.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame: Output data frame.
    """
    result = input_df.copy()
    if not self._input_cols:
        # Default to every column when no explicit selection was given.
        self._input_cols = result.columns.tolist()
    for column in self._input_cols:
        # One freshly-fitted encoder per column, kept for later transform().
        encoder = _CountEncoder()
        self._count_encoders[column] = encoder
        out_name = self._output_prefix + column + self._output_suffix
        result[out_name] = encoder.fit_transform(result[column].copy())
    return result
def fit_transform(self, input_df: XDataFrame, y: XSeries = None) -> XDataFrame:
    """Fit per-column target encoders, then return the encoded frame.

    Args:
        input_df (XDataFrame): Input data frame (pandas or cudf).
        y (XSeries, optional): Target values; when None, the target is read
            from ``input_df[self._target_col]``.

    Returns:
        XDataFrame: Copy of ``input_df`` with one encoded column added per
        input column, named ``prefix + col + suffix``.
    """
    out_df = input_df.copy()
    input_cols = self._input_cols
    if not input_cols:
        input_cols = input_df.columns.tolist()
        self._input_cols = input_cols
    # Remove `target_col` from `self._input_cols`.
    if self._target_col in self._input_cols:
        self._input_cols.remove(self._target_col)
    for col in self._input_cols:
        out_col = self._output_prefix + col + self._output_suffix
        # One freshly-fitted encoder per column, kept for later transform().
        target_encoder = _TargetEncoder(self.fold)
        self._target_encoders[col] = target_encoder
        if isinstance(input_df[col], pd.Series):
            X = column_or_1d(input_df[col], warn=True)
            if y is None:
                y = column_or_1d(input_df[self._target_col], warn=True)
            else:
                # NOTE(review): once `y` is bound in the first iteration, this
                # branch re-validates it on every later iteration -- harmless
                # but redundant.
                y = column_or_1d(y, warn=True)
        elif cudf and isinstance(input_df[col], cudf.Series):
            X = input_df[col]
            if y is None:
                y = input_df[self._target_col]
        else:
            raise TypeError
        out_df[out_col] = target_encoder.fit_transform(X, y).copy()
    if self.noise_level > 0:
        # NOTE(review): noise is added to EVERY column of the output frame,
        # not just the encoded ones -- confirm this is intended.
        np.random.seed(self.random_state)
        out_df += np.random.normal(0, self.noise_level, out_df.shape)
    return out_df
def fit_transform(self, input_df: "XDataFrame") -> "XDataFrame":
    """Fit to data frame, then transform it.

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        XDataFrame : Output data frame.
    """
    result = input_df.copy()
    if not self._input_cols:
        self._input_cols = input_df.columns.tolist()
    # The target column itself must never be encoded.
    if self._target_col in self._input_cols:
        self._input_cols.remove(self._target_col)
    for column in self._input_cols:
        # One freshly-fitted encoder per column, kept for later transform().
        encoder = _TargetEncoder(self.fold)
        self._target_encoders[column] = encoder
        series = input_df[column]
        if isinstance(series, pd.Series):
            X = column_or_1d(series, warn=True)
            y = column_or_1d(input_df[self._target_col], warn=True)
        elif cudf and isinstance(series, cudf.Series):
            X = series
            y = input_df[self._target_col]
        else:
            raise TypeError
        out_name = self._output_prefix + column + self._output_suffix
        result[out_name] = encoder.fit_transform(X, y).copy()
    return result
def cv(
    self,
    y_train: AoS,
    train_features: XDataFrame,
    test_features: XDataFrame,
    y_valid: Optional[AoS],
    valid_features: Optional[XDataFrame],
    feature_name: List[str],
    folds_ids: List[Tuple[np.ndarray, np.ndarray]],
    target_scaler: Optional[MinMaxScaler],
    config: dict,
    log: bool = True,
) -> Tuple[List[Model], np.ndarray, np.ndarray, Optional[np.ndarray],
           pd.DataFrame, dict]:
    """Run K-fold cross-validation: fit one model per fold, accumulate
    out-of-fold / test / optional holdout predictions and feature
    importances, then post-process and score.

    Returns:
        Tuple of (models per fold, oof predictions, averaged test
        predictions, averaged holdout predictions or None, mean feature
        importance, evaluation-results dict).
    """
    # initialize
    valid_exists = True if valid_features is not None else False
    test_preds = np.zeros(len(test_features))
    oof_preds = np.zeros(len(train_features))
    if valid_exists:
        valid_preds = np.zeros(len(valid_features))
    else:
        valid_preds = None
    importances = pd.DataFrame(index=feature_name)
    best_iteration = 0.0
    cv_score_list: List[dict] = []
    models: List[Model] = []

    with timer("make X"):
        X_train = train_features.copy()
        X_test = test_features.copy()
        # NOTE(review): X_valid is created here but never used below --
        # predictions on the holdout use `valid_features` directly.
        X_valid = valid_features.copy(
        ) if valid_features is not None else None

    with timer("make y"):
        # Normalize targets to plain arrays when given as pandas Series.
        y = y_train.values if isinstance(y_train, pd.Series) else y_train
        y_valid = y_valid.values if isinstance(y_valid, pd.Series) else y_valid

    for i_fold, (trn_idx, val_idx) in enumerate(folds_ids):
        self.fold = i_fold

        # get train data and valid data
        x_trn = X_train.iloc[trn_idx]
        y_trn = y[trn_idx]
        x_val = X_train.iloc[val_idx]
        y_val = y[val_idx]
        # Optional re-sampling of the training fold (e.g. under/over-sampling),
        # driven by `config`.
        x_trn, y_trn = get_sampling(x_trn, y_trn, config)

        # train model
        model, best_score = self.fit(x_trn, y_trn, x_val, y_val, config)
        cv_score_list.append(best_score)
        models.append(model)
        # Running mean of per-fold best iterations.
        best_iteration += self.get_best_iteration(model) / len(folds_ids)

        # predict oof and test
        oof_preds[val_idx] = self.predict(model, x_val).reshape(-1)
        # Test (and holdout) predictions are averaged across folds.
        test_preds += self.predict(model, X_test).reshape(-1) / len(folds_ids)
        if valid_exists:
            valid_preds += self.predict(
                model, valid_features).reshape(-1) / len(folds_ids)

        # get feature importances
        importances_tmp = pd.DataFrame(
            self.get_feature_importance(model),
            columns=[f"gain_{i_fold+1}"],
            index=feature_name,
        )
        importances = importances.join(importances_tmp, how="inner")

    # summary of feature importance
    feature_importance = importances.mean(axis=1)

    # save raw prediction
    self.raw_oof_preds = oof_preds
    self.raw_test_preds = test_preds
    self.raw_valid_preds = valid_preds

    # post_process (if you have any)
    y, oof_preds, test_preds, y_valid, valid_preds = self.post_process(
        oof_preds=oof_preds,
        test_preds=test_preds,
        valid_preds=valid_preds,
        y_train=y_train,
        y_valid=y_valid,
        train_features=train_features,
        test_features=test_features,
        valid_features=valid_features,
        target_scaler=target_scaler,
        config=config,
    )

    # print oof score
    oof_score = calc_metric(y, oof_preds)
    print(f"oof score: {oof_score:.5f}")
    if valid_exists:
        valid_score = calc_metric(y_valid, valid_preds)
        print(f"valid score: {valid_score:.5f}")
    if log:
        logging.info(f"oof score: {oof_score:.5f}")
        if valid_exists:
            logging.info(f"valid score: {valid_score:.5f}")

    evals_results = {
        "evals_result": {
            "oof_score": oof_score,
            "cv_score": {
                f"cv{i + 1}": cv_score
                for i, cv_score in enumerate(cv_score_list)
            },
            "n_data": np.shape(X_train)[0],
            "best_iteration": best_iteration,
            "n_features": np.shape(X_train)[1],
            "feature_importance":
            feature_importance.sort_values(ascending=False).to_dict(),
        }
    }
    # NOTE(review): "valid_score" is stored at the top level of
    # `evals_results`, not inside "evals_result" -- confirm consumers expect
    # this asymmetry.
    if valid_exists:
        evals_results["valid_score"] = valid_score
    return (
        models,
        oof_preds,
        test_preds,
        valid_preds,
        feature_importance,
        evals_results,
    )