def objective(hyperparams):
    """Train one XGBoost candidate and report its validation loss to hyperopt.

    Relies on names from the enclosing scope (``self``, ``X_trn``, ``y_trn``,
    ``X_val``, ``y_val``).

    Args:
        hyperparams (dict): keyword arguments passed to ``XGBModel`` in
            addition to ``self.params``.

    Returns:
        (dict): ``loss`` (validation metric at the best iteration multiplied
        by ``self.loss_sign``), ``status`` and the fitted ``model``.
    """
    estimator = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
    estimator.fit(
        X=X_trn,
        y=y_trn,
        eval_set=[(X_val, y_val)],
        eval_metric=self.metric,
        early_stopping_rounds=self.n_stop,
        verbose=False,
    )

    # Metric history recorded on the validation set during training.
    metric_history = estimator.evals_result()['validation_0'][self.metric]
    best_value = metric_history[estimator.best_iteration]

    # presumably loss_sign flips the sign for metrics that are maximized -- confirm
    return {
        'loss': best_value * self.loss_sign,
        'status': STATUS_OK,
        'model': estimator,
    }
def objective(hyperparams):
    """Hyperopt objective: fit an ``XGBModel`` and score it on the hold-out set.

    Uses ``self``, ``X_trn``, ``y_trn``, ``X_val`` and ``y_val`` from the
    enclosing scope.

    Args:
        hyperparams (dict): sampled hyperparameters, merged with
            ``self.params`` when constructing the model.

    Returns:
        (dict): a hyperopt result with keys ``loss``, ``status`` and ``model``.
    """
    candidate = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)

    fit_kwargs = {
        "X": X_trn,
        "y": y_trn,
        "eval_set": [(X_val, y_val)],
        "eval_metric": self.metric,
        "early_stopping_rounds": self.n_stop,
        "verbose": False,
    }
    candidate.fit(**fit_kwargs)

    # Pick the validation metric at the iteration early stopping selected.
    history = candidate.evals_result()["validation_0"][self.metric]
    loss = history[candidate.best_iteration] * self.loss_sign

    result = {"loss": loss, "status": STATUS_OK}
    result["model"] = candidate
    return result
def _train_test(
    x: pd.DataFrame,
    y: pd.Series,
    model: xgb.XGBModel,
    scorer: Callable,
    test_samples: Optional[int] = None,
    test_ratio: Optional[float] = None,
    time_series: bool = False,
    random_state: Optional[int] = None,
):
    """Fit ``model`` on a train split of ``(x, y)`` and score it on the rest.

    Args:
        x: feature frame.
        y: target series.
        model: estimator; it is fitted in place on the training split.
        scorer: callable invoked as ``scorer(model, x_test, y_test)``.
        test_samples: forwarded to ``split``.
        test_ratio: forwarded to ``split``.
        time_series: forwarded to ``split``.
        random_state: forwarded to ``split``.

    Returns:
        Whatever ``scorer`` returns for the held-out split.
    """
    # ``split`` is expected to yield (x_train, x_test, y_train, y_test).
    x_fit, x_holdout, y_fit, y_holdout = split(
        x, y, test_samples, test_ratio, time_series, random_state
    )
    model.fit(x_fit, y_fit)
    return scorer(model, x_holdout, y_holdout)
class AutoXGB(BaseAutoML):
    """XGBoost AutoML estimator built on ``BaseAutoML``.

    Hyperparameters are searched with hyperopt's TPE over ``space``. The final
    model is an ``XGBModel`` trained on ``self.features`` with ``self.n_best``
    estimators (both attributes are expected to be set outside this class).
    """

    # Class-level defaults. They are copied per instance in ``__init__`` and
    # never updated in place, so instances cannot affect each other.
    params = {"random_state": RANDOM_SEED, "n_jobs": -1}

    default_xgb_space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
        # NOTE(review): the hyperopt label is "num_leaves" although the
        # parameter is max_depth; kept as-is so existing trial keys stay stable.
        "max_depth": hp.choice("num_leaves", [6, 8, 10]),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 0.9, 0.1),
        "subsample": hp.quniform("subsample", 0.5, 0.9, 0.1),
        "min_child_weight": hp.choice("min_child_weight", [10, 25, 100]),
    }

    def __init__(
        self,
        objective="reg:squarederror",
        metric="rmse",
        boosting="gbtree",
        params=params,
        space=default_xgb_space,
        n_est=500,
        n_stop=10,
        sample_size=SAMPLE_SIZE,
        feature_selection=True,
        n_fs=10,
        fs_th=1e-5,
        fs_pct=0.1,
        hyperparam_opt=True,
        n_hpopt=100,
        n_random_col=10,
        random_state=RANDOM_SEED,
        shuffle=True,
    ):
        """Initialize an AutoXGB instance.

        Args:
            objective (str): XGBoost objective, stored under ``"objective"``.
            metric (str): evaluation metric; validated by
                ``_get_metric_alias_minimize``.
            boosting (str): XGBoost booster, stored under ``"booster"``.
            params (dict): extra model parameters layered over the class
                defaults. The dict passed in is not modified.
            space (dict): hyperopt search space.
            n_est, n_stop, sample_size, feature_selection, n_fs, fs_th,
            fs_pct, hyperparam_opt, n_hpopt, n_random_col, random_state,
            shuffle: forwarded unchanged to ``BaseAutoML.__init__``.
        """
        self.metric, minimize = self._get_metric_alias_minimize(metric)

        # Build a per-instance parameter dict. Updating ``self.params`` in
        # place would mutate the class-level dict (which is also the default
        # argument) and leak objective/booster/overrides into other instances.
        merged_params = dict(self.params)
        if params:
            merged_params.update(params)
        merged_params.update({"objective": objective, "booster": boosting})
        self.params = merged_params

        super().__init__(
            params=self.params,
            space=space,
            n_est=n_est,
            n_stop=n_stop,
            sample_size=sample_size,
            feature_selection=feature_selection,
            n_fs=n_fs,
            fs_th=fs_th,
            fs_pct=fs_pct,
            hyperparam_opt=hyperparam_opt,
            n_hpopt=n_hpopt,
            minimize=minimize,
            n_random_col=n_random_col,
            random_state=random_state,
            shuffle=shuffle,
        )

    @staticmethod
    def _get_metric_alias_minimize(metric):
        """Get XGBoost metric alias.

        As defined at https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

        Args:
            metric (str): a metric name

        Returns:
            (tuple):

            - (str): the standard metric name for XGBoost
            - (bool): a flag whether to minimize or maximize the metric

        Raises:
            AssertionError: if ``metric`` is not one of the supported names.
        """
        supported_metrics = (
            "rmse",
            "rmsle",
            "mae",
            "logloss",
            "error",
            "merror",
            "mlogloss",
            "auc",
            "aucpr",
            "ndcg",
            "map",
            "poisson-nloglik",
            "gamma-nloglik",
            "cox-nloglik",
            "gamma-deviance",
            "tweedie-nloglik",
        )
        # Kept as an assert so callers still see AssertionError on bad input.
        assert metric in supported_metrics, "Invalid metric: {}".format(metric)

        # Ranking/AUC style metrics are better when larger; the rest are losses.
        minimize = metric not in ("auc", "aucpr", "ndcg", "map")
        return metric, minimize

    @staticmethod
    def get_feature_importance(model):
        """Return ``model.feature_importances_``."""
        return model.feature_importances_

    def feature_importance(self):
        """Return the feature importances of the fitted ``self.model``."""
        return self.model.feature_importances_

    def optimize_hyperparam(self, X, y, test_size=0.2, n_eval=100):
        """Search ``self.space`` with hyperopt TPE on a train/validation split.

        Args:
            X: training features.
            y: training target.
            test_size (float): validation fraction passed to ``train_test_split``.
            n_eval (int): number of hyperopt evaluations (``max_evals``).

        Returns:
            (tuple):

            - (dict): best hyperparameters, resolved with ``space_eval``
            - (hyperopt.Trials): all trials of the search
        """
        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size, shuffle=self.shuffle)

        def objective(hyperparams):
            model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
            model.fit(
                X=X_trn,
                y=y_trn,
                eval_set=[(X_val, y_val)],
                eval_metric=self.metric,
                early_stopping_rounds=self.n_stop,
                verbose=False,
            )
            # Validation metric at the iteration selected by early stopping.
            history = model.evals_result()["validation_0"][self.metric]
            score = history[model.best_iteration] * self.loss_sign
            return {"loss": score, "status": STATUS_OK, "model": model}

        trials = Trials()
        best = hyperopt.fmin(
            fn=objective,
            space=self.space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=n_eval,
            verbose=1,
        )

        # ``best`` holds raw hyperopt values (e.g. indices for hp.choice);
        # space_eval maps them back to actual parameter values.
        hyperparams = space_eval(self.space, best)
        return hyperparams, trials

    def fit(self, X, y):
        """Train the final ``XGBModel`` on ``X[self.features]`` and return ``self``."""
        self.model = XGBModel(n_estimators=self.n_best, **self.params)
        # NOTE(review): eval_metric is hard-coded to "mae" rather than
        # self.metric; no eval_set is passed here.
        self.model.fit(X=X[self.features], y=y, eval_metric="mae", verbose=False)
        return self

    def predict(self, X):
        """Predict with the fitted model using the columns in ``self.features``."""
        return self.model.predict(X[self.features])
class AutoXGB(BaseAutoML):
    """XGBoost AutoML estimator built on ``BaseAutoML``.

    Hyperparameters are searched with hyperopt's TPE over ``space``. The final
    model is an ``XGBModel`` trained on ``self.features`` with ``self.n_best``
    estimators (both attributes are expected to be set outside this class).
    """

    # Class-level defaults. They are copied per instance in ``__init__`` and
    # never updated in place, so instances cannot affect each other.
    params = {'random_state': RANDOM_SEED, 'n_jobs': -1}

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
        # NOTE(review): the hyperopt label is "num_leaves" although the
        # parameter is max_depth; kept as-is so existing trial keys stay stable.
        "max_depth": hp.choice("num_leaves", [6, 8, 10]),
        "colsample_bytree": hp.quniform("colsample_bytree", .5, .9, 0.1),
        "subsample": hp.quniform("subsample", .5, .9, 0.1),
        "min_child_weight": hp.choice('min_child_weight', [10, 25, 100]),
    }

    # NOTE(review): 'reg:linear' is deprecated in XGBoost in favour of
    # 'reg:squarederror'; the default is left unchanged for compatibility.
    def __init__(self, objective='reg:linear', metric='rmse', boosting='gbtree', params=params,
                 space=space, n_est=500, n_stop=10, sample_size=SAMPLE_SIZE, feature_selection=True,
                 n_fs=10, fs_th=1e-5, hyperparam_opt=True, n_hpopt=100, n_random_col=10,
                 random_state=RANDOM_SEED, shuffle=True):
        """Initialize an AutoXGB instance.

        Args:
            objective (str): XGBoost objective, stored under ``'objective'``.
            metric (str): evaluation metric; validated by
                ``_get_metric_alias_minimize``.
            boosting (str): XGBoost booster, stored under ``'booster'``.
            params (dict): extra model parameters layered over the class
                defaults. The dict passed in is not modified.
            space (dict): hyperopt search space.
            n_est, n_stop, sample_size, feature_selection, n_fs, fs_th,
            hyperparam_opt, n_hpopt, n_random_col, random_state, shuffle:
                forwarded unchanged to ``BaseAutoML.__init__``.
        """
        self.metric, minimize = self._get_metric_alias_minimize(metric)

        # Build a per-instance parameter dict. Updating ``self.params`` in
        # place would mutate the class-level dict (which is also the default
        # argument) and leak objective/booster/overrides into other instances.
        merged_params = dict(self.params)
        if params:
            merged_params.update(params)
        merged_params.update({'objective': objective, 'booster': boosting})
        self.params = merged_params

        super().__init__(
            params=self.params,
            space=space,
            n_est=n_est,
            n_stop=n_stop,
            sample_size=sample_size,
            feature_selection=feature_selection,
            n_fs=n_fs,
            # Forward the caller's threshold; a literal 1e-5 here would
            # silently ignore the ``fs_th`` argument.
            fs_th=fs_th,
            hyperparam_opt=hyperparam_opt,
            n_hpopt=n_hpopt,
            minimize=minimize,
            n_random_col=n_random_col,
            random_state=random_state,
            shuffle=shuffle,
        )

    @staticmethod
    def _get_metric_alias_minimize(metric):
        """Get XGBoost metric alias.

        As defined at https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

        Args:
            metric (str): a metric name

        Returns:
            (tuple):

            - (str): the standard metric name for XGBoost
            - (bool): a flag whether to minimize or maximize the metric

        Raises:
            AssertionError: if ``metric`` is not one of the supported names.
        """
        supported_metrics = (
            'rmse', 'rmsle', 'mae', 'logloss', 'error', 'merror', 'mlogloss', 'auc', 'aucpr',
            'ndcg', 'map', 'poisson-nloglik', 'gamma-nloglik', 'cox-nloglik', 'gamma-deviance',
            'tweedie-nloglik',
        )
        # Kept as an assert so callers still see AssertionError on bad input.
        assert metric in supported_metrics, 'Invalid metric: {}'.format(metric)

        # Ranking/AUC style metrics are better when larger; the rest are losses.
        minimize = metric not in ('auc', 'aucpr', 'ndcg', 'map')
        return metric, minimize

    @staticmethod
    def get_feature_importance(model):
        """Return ``model.feature_importances_``."""
        return model.feature_importances_

    def feature_importance(self):
        """Return the feature importances of the fitted ``self.model``."""
        return self.model.feature_importances_

    def optimize_hyperparam(self, X, y, test_size=.2, n_eval=100):
        """Search ``self.space`` with hyperopt TPE on a train/validation split.

        Args:
            X: training features.
            y: training target.
            test_size (float): validation fraction passed to ``train_test_split``.
            n_eval (int): number of hyperopt evaluations (``max_evals``).

        Returns:
            (tuple):

            - (dict): best hyperparameters, resolved with ``space_eval``
            - (hyperopt.Trials): all trials of the search
        """
        X_trn, X_val, y_trn, y_val = train_test_split(X, y, test_size=test_size, shuffle=self.shuffle)

        def objective(hyperparams):
            model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
            model.fit(X=X_trn, y=y_trn, eval_set=[(X_val, y_val)], eval_metric=self.metric,
                      early_stopping_rounds=self.n_stop, verbose=False)
            # Validation metric at the iteration selected by early stopping.
            history = model.evals_result()['validation_0'][self.metric]
            score = history[model.best_iteration] * self.loss_sign
            return {'loss': score, 'status': STATUS_OK, 'model': model}

        trials = Trials()
        # NOTE(review): recent hyperopt releases expect a numpy Generator for
        # ``rstate`` -- confirm the type of self.random_state.
        best = hyperopt.fmin(fn=objective, space=self.space, trials=trials, algo=tpe.suggest,
                             max_evals=n_eval, verbose=1, rstate=self.random_state)

        # ``best`` holds raw hyperopt values (e.g. indices for hp.choice);
        # space_eval maps them back to actual parameter values.
        hyperparams = space_eval(self.space, best)
        return hyperparams, trials

    def fit(self, X, y):
        """Train the final ``XGBModel`` on ``X[self.features]`` and return ``self``."""
        self.model = XGBModel(n_estimators=self.n_best, **self.params)
        # NOTE(review): eval_metric is hard-coded to 'mae' rather than
        # self.metric; no eval_set is passed here.
        self.model.fit(X=X[self.features], y=y, eval_metric='mae', verbose=False)
        return self

    def predict(self, X):
        """Predict with the fitted model using the columns in ``self.features``."""
        return self.model.predict(X[self.features])
def fit_xgb_model(out_fold_num: int, in_fold_num: int, run_cfg: Dict[str, Any], model: XGBModel,
                  X_train_in: FlattenCorrsDataset, X_val_in: FlattenCorrsDataset) -> Dict:
    """Fit ``model`` on one inner fold, pickle it, and log train/val metrics.

    Args:
        out_fold_num: outer split index (used for the file name and printing).
        in_fold_num: inner split index (used for the file name, printing and
            as suffix of the wandb metric keys).
        run_cfg: run configuration; ``run_cfg['target_var']`` must be
            ``'gender'`` (classification) or ``'age'`` (regression).
        model: the XGBoost model to fit in place.
        X_train_in: training dataset; each item exposes ``.x`` and the target
            attribute (``.sex`` or ``.age``).
        X_val_in: validation dataset with the same item layout.

    Returns:
        The validation metrics dict produced by ``return_classifier_metrics``
        or ``return_regressor_metrics``.

    Raises:
        ValueError: if ``run_cfg['target_var']`` is not supported.
    """
    model_saving_path = create_name_for_xgbmodel(model=model,
                                                 outer_split_num=out_fold_num,
                                                 inner_split_num=in_fold_num,
                                                 run_cfg=run_cfg)

    train_arr = np.array([data.x.numpy() for data in X_train_in])
    val_arr = np.array([data.x.numpy() for data in X_val_in])

    target_var = run_cfg['target_var']
    if target_var == 'gender':
        y_train = [int(data.sex.item()) for data in X_train_in]
        y_val = [int(data.sex.item()) for data in X_val_in]
    elif target_var == 'age':
        # np.array() because of printing calls in the regressor_metrics function
        y_train = np.array([float(data.age.item()) for data in X_train_in])
        y_val = np.array([float(data.age.item()) for data in X_val_in])
    else:
        # Without this guard the code would fail later on unbound labels.
        raise ValueError(
            "Unsupported target_var: {!r} (expected 'gender' or 'age')".format(target_var))

    model.fit(train_arr, y_train, callbacks=[wandb.xgboost.wandb_callback()])

    # Context manager guarantees the file is flushed and closed.
    with open(model_saving_path, "wb") as model_file:
        pickle.dump(model, model_file)

    if target_var == 'gender':
        train_metrics = return_classifier_metrics(
            y_train,
            pred_prob=model.predict_proba(train_arr)[:, 1],
            pred_binary=model.predict(train_arr),
            flatten_approach=True)
        val_metrics = return_classifier_metrics(
            y_val,
            pred_prob=model.predict_proba(val_arr)[:, 1],
            pred_binary=model.predict(val_arr),
            flatten_approach=True)

        print(
            '{:1d}-{:1d}: Auc: {:.4f} / {:.4f}, Acc: {:.4f} / {:.4f}, F1: {:.4f} /'
            ' {:.4f} '.format(out_fold_num, in_fold_num,
                              train_metrics['auc'], val_metrics['auc'],
                              train_metrics['acc'], val_metrics['acc'],
                              train_metrics['f1'], val_metrics['f1']))
        # (wandb key stem, metrics dict key)
        logged_metrics = (('auc', 'auc'), ('acc', 'acc'), ('sens', 'sensitivity'),
                          ('spec', 'specificity'), ('f1', 'f1'))
    else:
        train_metrics = return_regressor_metrics(y_train, pred_prob=model.predict(train_arr))
        val_metrics = return_regressor_metrics(y_val, pred_prob=model.predict(val_arr))

        print('{:1d}-{:1d}: R2: {:.4f} / {:.4f}, R: {:.4f} / {:.4f}'.format(
            out_fold_num, in_fold_num,
            train_metrics['r2'], val_metrics['r2'],
            train_metrics['r'], val_metrics['r']))
        logged_metrics = (('r2', 'r2'), ('r', 'r'))

    # Keys are suffixed with the inner fold number, e.g. "train_auc3".
    log_payload = {}
    for stem, metric_key in logged_metrics:
        log_payload[f'train_{stem}{in_fold_num}'] = train_metrics[metric_key]
        log_payload[f'val_{stem}{in_fold_num}'] = val_metrics[metric_key]
    wandb.log(log_payload)

    return val_metrics