def fit(self, X, y):
    self.model = XGBModel(n_estimators=self.n_best, **self.params)
    self.model.fit(X=X[self.features], y=y, eval_metric="mae", verbose=False)
    return self
def objective(hyperparams):
    model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
    model.fit(X=X_trn,
              y=y_trn,
              eval_set=[(X_val, y_val)],
              eval_metric=self.metric,
              early_stopping_rounds=self.n_stop,
              verbose=False)
    score = (model.evals_result()['validation_0'][self.metric][model.best_iteration]
             * self.loss_sign)
    return {'loss': score, 'status': STATUS_OK, 'model': model}
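# Context for the objective above (a sketch added here, not part of the
# original snippet): hyperopt's fmin always *minimizes* the returned loss,
# so `loss_sign` presumably flips the sign for metrics that should be
# maximized, matching the minimize logic in _get_metric_alias_minimize below.
loss_sign = -1 if metric in {"auc", "aucpr", "ndcg", "map"} else 1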
def _train_test(
    x: pd.DataFrame,
    y: pd.Series,
    model: xgb.XGBModel,
    scorer: Callable,
    test_samples: Optional[int] = None,
    test_ratio: Optional[float] = None,
    time_series: bool = False,
    random_state: Optional[int] = None,
):
    datasets = split(x, y, test_samples, test_ratio, time_series, random_state)
    x_train, x_test, y_train, y_test = datasets
    model.fit(x_train, y_train)
    return scorer(model, x_test, y_test)
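# A minimal usage sketch for _train_test, assuming the external `split`
# helper returns (x_train, x_test, y_train, y_test) in that order; the toy
# data and the sklearn scorer are illustrative, not part of the original.
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import get_scorer

x = pd.DataFrame(np.random.rand(100, 4), columns=list("abcd"))
y = pd.Series(np.random.rand(100))
score = _train_test(
    x, y,
    model=xgb.XGBRegressor(n_estimators=50),
    scorer=get_scorer("neg_mean_squared_error"),  # scorer(estimator, X, y)
    test_ratio=0.2,
)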
def create_name_for_xgbmodel(run_cfg: Dict[str, Any], outer_split_num: int,
                             model: XGBModel, inner_split_num: int,
                             prefix_location='logs/',
                             suffix='.pkl') -> str:
    if run_cfg['analysis_type'] == AnalysisType.FLATTEN_CORRS:
        model_str_representation = run_cfg['analysis_type'].value
        for key in ['colsample_bylevel', 'colsample_bynode', 'colsample_bytree',
                    'gamma', 'learning_rate', 'max_depth', 'min_child_weight',
                    'n_estimators', 'subsample']:
            model_str_representation += key[-3:] + '_' + str(model.get_params()[key])
    return prefix_location + '_'.join([
        run_cfg['target_var'],
        run_cfg['dataset_type'].value,
        str(outer_split_num),
        str(inner_split_num),
        model_str_representation,
        str(run_cfg['num_nodes']),
        run_cfg['param_conn_type'].value
    ]) + suffix
class AutoXGB(BaseAutoML):

    params = {"random_state": RANDOM_SEED, "n_jobs": -1}

    default_xgb_space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
        # label fixed: was "num_leaves", a leftover from a LightGBM search space
        "max_depth": hp.choice("max_depth", [6, 8, 10]),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.5, 0.9, 0.1),
        "subsample": hp.quniform("subsample", 0.5, 0.9, 0.1),
        "min_child_weight": hp.choice("min_child_weight", [10, 25, 100]),
    }

    def __init__(
        self,
        objective="reg:squarederror",
        metric="rmse",
        boosting="gbtree",
        params=params,
        space=default_xgb_space,
        n_est=500,
        n_stop=10,
        sample_size=SAMPLE_SIZE,
        feature_selection=True,
        n_fs=10,
        fs_th=1e-5,
        fs_pct=0.1,
        hyperparam_opt=True,
        n_hpopt=100,
        n_random_col=10,
        random_state=RANDOM_SEED,
        shuffle=True,
    ):
        self.metric, minimize = self._get_metric_alias_minimize(metric)

        self.params.update(params)
        self.params.update({"objective": objective, "booster": boosting})

        super(AutoXGB, self).__init__(
            params=self.params,
            space=space,
            n_est=n_est,
            n_stop=n_stop,
            sample_size=sample_size,
            feature_selection=feature_selection,
            n_fs=n_fs,
            fs_th=fs_th,
            fs_pct=fs_pct,
            hyperparam_opt=hyperparam_opt,
            n_hpopt=n_hpopt,
            minimize=minimize,
            n_random_col=n_random_col,
            random_state=random_state,
            shuffle=shuffle,
        )

    @staticmethod
    def _get_metric_alias_minimize(metric):
        """Get XGBoost metric alias.

        As defined at https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters

        Args:
            metric (str): a metric name

        Returns:
            (tuple):

                - (str): the standard metric name for XGBoost
                - (bool): a flag whether to minimize or maximize the metric
        """
        assert metric in [
            "rmse", "rmsle", "mae", "logloss", "error", "merror", "mlogloss",
            "auc", "aucpr", "ndcg", "map", "poisson-nloglik", "gamma-nloglik",
            "cox-nloglik", "gamma-deviance", "tweedie-nloglik",
        ], "Invalid metric: {}".format(metric)

        if metric in ["auc", "aucpr", "ndcg", "map"]:
            minimize = False
        else:
            minimize = True

        return metric, minimize

    @staticmethod
    def get_feature_importance(model):
        return model.feature_importances_

    def feature_importance(self):
        return self.model.feature_importances_

    def optimize_hyperparam(self, X, y, test_size=0.2, n_eval=100):
        X_trn, X_val, y_trn, y_val = train_test_split(
            X, y, test_size=test_size, shuffle=self.shuffle
        )

        def objective(hyperparams):
            model = XGBModel(n_estimators=self.n_est, **self.params, **hyperparams)
            model.fit(
                X=X_trn,
                y=y_trn,
                eval_set=[(X_val, y_val)],
                eval_metric=self.metric,
                early_stopping_rounds=self.n_stop,
                verbose=False,
            )
            score = (
                model.evals_result()["validation_0"][self.metric][model.best_iteration]
                * self.loss_sign
            )
            return {"loss": score, "status": STATUS_OK, "model": model}

        trials = Trials()
        best = hyperopt.fmin(
            fn=objective,
            space=self.space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=n_eval,
            verbose=1,
        )
        hyperparams = space_eval(self.space, best)

        return hyperparams, trials

    def fit(self, X, y):
        self.model = XGBModel(n_estimators=self.n_best, **self.params)
        self.model.fit(X=X[self.features], y=y, eval_metric="mae", verbose=False)
        return self

    def predict(self, X):
        return self.model.predict(X[self.features])
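# A hedged usage sketch for AutoXGB. It assumes BaseAutoML (not shown here)
# runs feature selection and hyperparameter optimization, populating
# self.features and self.n_best before fit() executes; the entry-point name
# `tune` and the toy data are assumptions for illustration only.
import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(1000, 20),
                 columns=[f"x{i}" for i in range(20)])
y = pd.Series(np.random.rand(1000))

automl = AutoXGB(objective="reg:squarederror", metric="rmse",
                 n_est=100, n_hpopt=20)
automl.tune(X, y)   # hypothetical BaseAutoML pipeline entry point
automl.fit(X, y)    # refits XGBModel on the selected features with n_best trees
preds = automl.predict(X)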
def build_model(self):
    model = XGBModel(**self.model_params)
    return model
def fit_xgb_model(out_fold_num: int, in_fold_num: int, run_cfg: Dict[str, Any],
                  model: XGBModel, X_train_in: FlattenCorrsDataset,
                  X_val_in: FlattenCorrsDataset) -> Dict:
    model_saving_path = create_name_for_xgbmodel(model=model,
                                                 outer_split_num=out_fold_num,
                                                 inner_split_num=in_fold_num,
                                                 run_cfg=run_cfg)

    train_arr = np.array([data.x.numpy() for data in X_train_in])
    val_arr = np.array([data.x.numpy() for data in X_val_in])

    if run_cfg['target_var'] == 'gender':
        y_train = [int(data.sex.item()) for data in X_train_in]
        y_val = [int(data.sex.item()) for data in X_val_in]
    elif run_cfg['target_var'] == 'age':
        # np.array() because of printing calls in the regressor_metrics function
        y_train = np.array([float(data.age.item()) for data in X_train_in])
        y_val = np.array([float(data.age.item()) for data in X_val_in])

    model.fit(train_arr, y_train, callbacks=[wandb.xgboost.wandb_callback()])
    pickle.dump(model, open(model_saving_path, "wb"))

    if run_cfg['target_var'] == 'gender':
        train_metrics = return_classifier_metrics(
            y_train,
            pred_prob=model.predict_proba(train_arr)[:, 1],
            pred_binary=model.predict(train_arr),
            flatten_approach=True)
        val_metrics = return_classifier_metrics(
            y_val,
            pred_prob=model.predict_proba(val_arr)[:, 1],
            pred_binary=model.predict(val_arr),
            flatten_approach=True)

        print('{:1d}-{:1d}: Auc: {:.4f} / {:.4f}, Acc: {:.4f} / {:.4f}, F1: {:.4f} /'
              ' {:.4f} '.format(out_fold_num, in_fold_num,
                                train_metrics['auc'], val_metrics['auc'],
                                train_metrics['acc'], val_metrics['acc'],
                                train_metrics['f1'], val_metrics['f1']))
        wandb.log({
            f'train_auc{in_fold_num}': train_metrics['auc'],
            f'val_auc{in_fold_num}': val_metrics['auc'],
            f'train_acc{in_fold_num}': train_metrics['acc'],
            f'val_acc{in_fold_num}': val_metrics['acc'],
            f'train_sens{in_fold_num}': train_metrics['sensitivity'],
            f'val_sens{in_fold_num}': val_metrics['sensitivity'],
            f'train_spec{in_fold_num}': train_metrics['specificity'],
            f'val_spec{in_fold_num}': val_metrics['specificity'],
            f'train_f1{in_fold_num}': train_metrics['f1'],
            f'val_f1{in_fold_num}': val_metrics['f1']
        })
    else:
        train_metrics = return_regressor_metrics(y_train,
                                                 pred_prob=model.predict(train_arr))
        val_metrics = return_regressor_metrics(y_val,
                                               pred_prob=model.predict(val_arr))

        print('{:1d}-{:1d}: R2: {:.4f} / {:.4f}, R: {:.4f} / {:.4f}'.format(
            out_fold_num, in_fold_num, train_metrics['r2'], val_metrics['r2'],
            train_metrics['r'], val_metrics['r']))
        wandb.log({
            f'train_r2{in_fold_num}': train_metrics['r2'],
            f'val_r2{in_fold_num}': val_metrics['r2'],
            f'train_r{in_fold_num}': train_metrics['r'],
            f'val_r{in_fold_num}': val_metrics['r']
        })

    return val_metrics
def to_mls(xgboost_model: xgboost.XGBModel, **kwargs):
    params = xgboost_model.get_params()

    def standardize_types(v):
        if isinstance(v, np.ndarray):
            return [normalize_float(x) for x in v.tolist()]
        elif isinstance(v, float):
            return normalize_float(v)
        elif callable(v):
            return str(v)  # TODO
        return v

    def deep_get_params(params):
        if isinstance(params, (list, tuple)):
            return [deep_get_params(x) for x in params]
        elif isinstance(params, dict):
            return {k: deep_get_params(v) for k, v in params.items()}
        else:
            v = standardize_types(params)
            try:
                p = v.get_params()
                t = type(v).__module__ + "." + type(v).__name__
                return {"@value": {"type": t, "params": deep_get_params(p)}}
            except AttributeError:
                try:
                    json.dumps(v)
                    return v
                except TypeError as e:
                    raise NotImplementedError(
                        "can't convert xgboost model of type {} to mls: {}".format(
                            type(xgboost_model), e
                        )
                    )

    params = deep_get_params(params)
    model_hash = xgboost_model.__hash__()
    model_class = "{}.{}".format(
        type(xgboost_model).__module__, type(xgboost_model).__name__
    )
    algo = Algorithm(_id=model_class)
    implementation = Implementation(
        _id=generate_unique_id("http://www.w3.org/ns/mls#Implementation"),
        parameters=[
            HyperParameter(key, model_hash=model_hash) for key in params.keys()
        ],
        implements=algo,
        version=xgboost.__version__,
    )
    input_values = [
        HyperParameterSetting(
            value=val,
            specified_by=HyperParameter(key, model_hash=model_hash),
            model_hash=model_hash,
        )
        for key, val in params.items()
        if val is not None
    ]
    output_values = []
    if EVALUATION_MEASURE_KEY in kwargs:
        eval_measure = kwargs[EVALUATION_MEASURE_KEY]
        output_values.append(evaluation_measure(eval_measure[0], eval_measure[1]))
    model = Run(model_hash, implementation, input_values, output_values, algo)
    return RunSchema().dumps(model)
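# A minimal usage sketch for to_mls. The fitting details are illustrative,
# and the return value is whatever string RunSchema().dumps() produces; the
# snippet itself references the W3C MLS vocabulary (http://www.w3.org/ns/mls#).
import numpy as np
import xgboost

reg = xgboost.XGBRegressor(n_estimators=10)
reg.fit(np.random.rand(50, 3), np.random.rand(50))
mls_json = to_mls(reg)  # serialized description of the model's hyperparameters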