def build_model(X_train, y_train, X_valid, y_valid):
    """Fit an ``XGBRFClassifier`` using a fixed set of hyper-parameters.

    Both the training pair and the validation pair are handed to ``fit`` as
    ``eval_set`` (validation pair last) and ``merror`` is the metric that is
    evaluated on them.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_valid, y_valid
        Features and labels of the hold-out set watched during fitting.

    Returns
    -------
    The fitted ``XGBRFClassifier`` instance.
    """
    hyper_params = dict(
        base_score=2,
        colsample_bylevel=0.75,
        colsample_bynode=0.57,
        colsample_bytree=0.95,
        gamma=0.25,
        learning_rate=1.7,
        max_depth=18,
        min_child_weight=0.025,
        n_estimators=353,
        n_jobs=-1,
        num_class=3,
        num_parallel_tree=105,
        objective='multi:softmax',
        random_state=42,
        subsample=0.8,
        verbosity=0,
        reg_alpha=0.05,
        reg_lambda=1,
        rate_drop=0.5,
    )
    classifier = XGBRFClassifier(**hyper_params)

    watchlist = [(X_train, y_train), (X_valid, y_valid)]
    # NOTE(review): early stopping is requested twice (50 rounds through the
    # keyword, 15 rounds through the callback) -- confirm this is intended.
    fit_callbacks = [
        print_evaluation(period=5),
        early_stop(stopping_rounds=15),
    ]
    classifier.fit(
        X_train,
        y_train,
        eval_set=watchlist,
        eval_metric=['merror'],
        early_stopping_rounds=50,
        callbacks=fit_callbacks,
        verbose=False,
    )
    return classifier
def train_and_evaluate_xgb(train, early, val, params):
    """Fit a model with early stopping and return its validation MSE.

    Parameters
    ----------
    train : tuple
        ``(X_train, y_train)`` used to fit the model.
    early : tuple
        ``(X_early, y_early)`` passed as the last ``eval_set`` entry, next to
        the training pair, while the early-stopping callback is active.
    val : tuple
        ``(X_val, y_val)`` used only for the final score.
    params
        Forwarded unchanged to ``create_xgb_model``.

    Returns
    -------
    The mean squared error between ``y_val`` and the model predictions.

    Notes
    -----
    Every column of ``X_train`` whose name contains ``'skew'`` is removed
    **in place** from ``X_train``, ``X_early`` and ``X_val``, so the caller's
    frames are modified. The ``'ID'`` and ``'ID_temp'`` columns are only
    excluded from the data given to the model; they stay in the frames.
    """
    X_train, y_train = train
    X_early, y_early = early
    X_val, y_val = val

    model = create_xgb_model(params)

    # Drop the skew features. ``columns=`` is used instead of a positional
    # axis argument, which recent pandas versions no longer accept.
    skew_cols = [col for col in X_train.columns if 'skew' in col]
    if skew_cols:
        for frame in (X_train, X_early, X_val):
            frame.drop(columns=skew_cols, inplace=True)

    # Identifier columns are not features; build each model input once.
    id_cols = ['ID', 'ID_temp']
    train_features = X_train.drop(columns=id_cols)
    early_features = X_early.drop(columns=id_cols)
    val_features = X_val.drop(columns=id_cols)

    # Named ``stopper`` so the ``early`` parameter is not shadowed.
    stopper = early_stop(stopping_rounds=30, maximize=False)
    model.fit(
        train_features,
        y_train,
        eval_set=[(train_features, y_train), (early_features, y_early)],
        callbacks=[stopper],
    )

    y_val_preds = model.predict(val_features)
    # Ground truth first, predictions second (MSE itself is symmetric).
    return mean_squared_error(y_val, y_val_preds)
def run(self, data, y, groups, test, eval_metric, n_splits=10, early_stopping_rounds=100):
    """Train ``self.clf`` with grouped K-fold CV and collect predictions.

    For every fold produced by ``GroupKFold(n_splits)`` the classifier is
    fitted on the training part with an early-stopping callback
    (``maximize=True``), then column 1 of ``predict_proba`` is stored for the
    validation rows (out-of-fold) and for ``test``.

    Parameters
    ----------
    data, y
        Features and target; indexed positionally with ``.iloc``.
    groups
        Group labels passed to ``GroupKFold.split``.
    test
        Data predicted after each fold.
    eval_metric
        Forwarded to ``self.clf.fit``.
    n_splits : int
        Number of folds.
    early_stopping_rounds : int
        Patience handed to the ``early_stop`` callback.

    Returns
    -------
    tuple
        ``(oof_preds, df_sub_preds, clf)``: the out-of-fold predictions, a
        DataFrame with one ``fold_<k>`` column (``k`` starting at 0) of test
        predictions per fold, and ``self.clf`` as left by the last fit.

    Side effects
    ------------
    Calls ``self.build_clf()`` when ``self.clf`` is falsy, stores the
    per-fold feature importances in ``self.df_feature_importance`` and
    prints progress plus the out-of-fold average precision.
    """
    oof_preds_LGBM = np.zeros((data.shape[0]))
    df_sub_preds_LGBM = pd.DataFrame()
    # Reset up front so the attribute exists even if a fold fails.
    self.df_feature_importance = pd.DataFrame()
    if not self.clf:
        self.build_clf()

    fold_importances = []
    folds = GroupKFold(n_splits=n_splits)
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data, y, groups)):
        train_x, train_y = data.iloc[train_idx], y.iloc[train_idx]
        valid_x, valid_y = data.iloc[valid_idx], y.iloc[valid_idx]
        print("Starting LightGBM. Fold {},Train shape: {}, test shape: {}".
              format(n_fold + 1, data.shape, test.shape))

        self.clf.fit(
            train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric=eval_metric,
            verbose=100,
            callbacks=[
                early_stop(early_stopping_rounds, maximize=True, verbose=True)
            ])

        oof_preds_LGBM[valid_idx] += self.clf.predict_proba(valid_x)[:, 1]
        df_sub_preds_LGBM['fold_{}'.format(n_fold)] = \
            self.clf.predict_proba(test)[:, 1]

        df_fold_importance = pd.DataFrame()
        df_fold_importance["feature"] = self.features
        df_fold_importance["importance"] = self.clf.feature_importances_
        df_fold_importance["fold"] = n_fold + 1
        fold_importances.append(df_fold_importance)

    # Concatenate once instead of growing the frame fold by fold from an
    # empty DataFrame (quadratic, and deprecated in recent pandas).
    if fold_importances:
        self.df_feature_importance = pd.concat(fold_importances, axis=0)

    print('Summary:')
    print('XGB Testing_Set average_precision_score %.6f' %
          average_precision_score(y, oof_preds_LGBM))
    return oof_preds_LGBM, df_sub_preds_LGBM, self.clf
def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample,
                 gamma, alpha, max_delta_step):
    """Score one hyper-parameter candidate with 5-fold ``xgb.cv``.

    The candidate values are cast / clamped and written into the
    module-level ``params`` dict, which is then used for cross-validation
    together with the module-level ``xgtrain``, ``num_rounds`` and
    ``random_state``.

    Returns
    -------
    The last ``test-auc-mean`` value of the cross-validation history.
    """
    def _clamp_unit(value):
        # Keep a sampling ratio inside [0, 1].
        return max(min(value, 1), 0)

    # Integer-valued parameters are truncated, the others only bounded.
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = _clamp_unit(colsample_bytree)
    params['max_depth'] = int(max_depth)
    params['subsample'] = _clamp_unit(subsample)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)
    params['max_delta_step'] = max(int(max_delta_step), 0)

    history = xgb.cv(
        params,
        xgtrain,
        num_boost_round=num_rounds,
        nfold=5,
        seed=random_state,
        callbacks=[callback.early_stop(50)],
    )
    auc_means = history['test-auc-mean'].values
    return auc_means[-1]
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          maximize=False, early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None):
    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
    """Train a booster with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    evals: list of pairs (DMatrix, string)
        List of items to be evaluated during training, this allows user to
        watch performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. Validation error needs to decrease at least
        every ``early_stopping_rounds`` round(s) to continue training.
        Requires at least one item in evals. If there's more than one, will
        use the last. Returns the model from the last iteration (not the best
        one). If early stopping occurs, the model will have three additional
        fields: bst.best_score, bst.best_iteration and bst.best_ntree_limit.
        (Use bst.best_ntree_limit to get the correct value if
        num_parallel_tree and/or num_class appears in the parameters)
    evals_result: dict
        This dictionary stores the evaluation results of all the items in
        watchlist.

        Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')]
        and a parameter containing ('eval_metric': 'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953']},
                  'eval': {'logloss': ['0.480385', '0.357756']}}
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation
        set is printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the
        validation set is printed at every given `verbose_eval` boosting
        stage. The last boosting stage / the boosting stage found by using
        `early_stopping_rounds` is also printed.

        Example: with verbose_eval=4 and at least one item in evals, an
        evaluation metric is printed every 4 boosting stages, instead of every
        boosting stage.
    learning_rates: list or function (deprecated - use callback API instead)
        List of learning rate for each boosting round or a customized function
        that calculates eta in terms of current number of round and the total
        number of boosting round (e.g. yields learning rate decay)
    xgb_model : file name of stored xgb model or 'Booster' instance
        Xgb model to be loaded before training (allows training continuation).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback
        module. The sequence is copied, so the caller's object is never
        modified.

        Example: [xgb.callback.reset_learning_rate(custom_rates)]

    Returns
    -------
    booster : a trained booster model
    """
    # Work on a copy: the legacy options below are appended as callbacks and
    # must not leak into a list the caller may reuse for another call.
    callbacks = [] if callbacks is None else list(callbacks)

    # Most of legacy advanced options becomes callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        # Also reached for ``False`` (a bool is an int); the value is passed
        # on as the printing period unchanged.
        callbacks.append(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=bool(verbose_eval)))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))

    if learning_rates is not None:
        warnings.warn(
            "learning_rates parameter is deprecated - use callback API instead",
            DeprecationWarning)
        callbacks.append(callback.reset_learning_rate(learning_rates))

    return _train_internal(params, dtrain,
                           num_boost_round=num_boost_round,
                           evals=evals,
                           obj=obj, feval=feval,
                           xgb_model=xgb_model, callbacks=callbacks)
def cv(params, X_train, y_train, features=None, num_boost_round=20, nfold=3,
       folds=None, metrics=(), obj=None, feval=None, maximize=False,
       early_stopping_rounds=None, fpreproc=None, as_pandas=True,
       verbose_eval=None, show_stdv=True, seed=1234, callbacks=None):
    '''
    Cross-validation with given parameters. Modified from the cv method found
    in the xgboost package (https://github.com/dmlc/xgboost) to use spatial
    data with statistical features without risking data bleed.

    Parameters
    ----------
    params : dict
        Booster params. A list of ``(key, value)`` pairs is also accepted.
    X_train : pandas.DataFrame
        X data to be trained
    y_train : pandas.DataFrame
        y data to be trained
    features : list
        Features selected to be trained. ``None`` or an empty sequence
        selects every column of ``X_train``.
    num_boost_round : int : 20
        Number of boosting iterations.
    nfold : int : 3
        Number of folds in CV.
    folds : a KFold or StratifiedKFold instance or list of fold indices
        Accepted for signature compatibility with ``xgboost.cv``; this
        implementation does not use it.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV. When empty, ``eval_metric``
        from ``params`` is used instead.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds : int
        Activates early stopping. Cross-validation metric (average of
        validation metric computed over CV folds) needs to improve at least
        once in every **early_stopping_rounds** round(s) to continue training.
        The last entry in the evaluation history will represent the best
        iteration. If there's more than one metric in the **eval_metric**
        parameter given **params**, the last metric will be used for early
        stopping.
    fpreproc : function
        Accepted for signature compatibility with ``xgboost.cv``; this
        implementation does not use it.
    as_pandas : bool : True
        Return a ``pandas.DataFrame``. If False, the raw dict mapping
        ``'<metric>-mean'`` / ``'<metric>-std'`` to per-round lists is
        returned.
    verbose_eval : bool, int, or None : None
        Whether to display the progress. If True, progress will be displayed
        at every boosting stage. If an integer is given, progress will be
        displayed at every given `verbose_eval` boosting stage.
    show_stdv : bool : True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
    seed : int : 1234
        Accepted for signature compatibility with ``xgboost.cv``; this
        implementation does not use it.
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks using
        :ref:`Callback API <callback_api>`. The sequence is copied, so the
        caller's object is never modified.
        Example:

        .. code-block:: python

            [xgb.callback.reset_learning_rate(custom_rates)]

    Returns
    -------
    results : pandas.DataFrame
        results of crossvalidated model with metrics of each boosted round
        (a dict of lists when ``as_pandas`` is False)
    '''
    # A bare string is one metric name; ``list('rmse')`` would split it into
    # single characters.
    metrics = [metrics] if isinstance(metrics, str) else list(metrics)

    # ``not features`` is ambiguous for a pandas Index / numpy array, so test
    # for "missing or empty" explicitly.
    if features is None or len(features) == 0:
        features = X_train.columns

    if isinstance(params, list):
        # Collect every 'eval_metric' entry before dict() keeps only the last.
        _metrics = [x[1] for x in params if x[0] == 'eval_metric']
        params = dict(params)
        if 'eval_metric' in params:
            params['eval_metric'] = _metrics
    else:
        # Shallow copy so the pop() below does not touch the caller's dict.
        params = dict((k, v) for k, v in params.items())

    if (not metrics) and 'eval_metric' in params:
        if isinstance(params['eval_metric'], list):
            metrics = params['eval_metric']
        else:
            metrics = [params['eval_metric']]

    params.pop("eval_metric", None)

    results = {}
    # create folds in data
    cvfolds, wt_list = mknfold(X_train, y_train, nfold, params, metrics,
                               features)

    # setup callbacks (on a copy: appending must not alter the caller's list)
    callbacks = [] if callbacks is None else list(callbacks)
    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=False))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.append(
            callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)
    ]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)
    ]

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                CallbackEnv(model=None,
                            cvfolds=cvfolds,
                            iteration=i,
                            begin_iteration=0,
                            end_iteration=num_boost_round,
                            rank=0,
                            evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(i, obj)
        res = aggcv([f.eval(i, feval) for f in cvfolds], wt_list)

        for key, mean, std in res:
            results.setdefault(key + '-mean', []).append(mean)
            results.setdefault(key + '-std', []).append(std)

        try:
            for cb in callbacks_after_iter:
                cb(
                    CallbackEnv(model=None,
                                cvfolds=cvfolds,
                                iteration=i,
                                begin_iteration=0,
                                end_iteration=num_boost_round,
                                rank=0,
                                evaluation_result_list=res))
        except EarlyStopException as e:
            # Keep the history only up to and including the best iteration.
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break

    if as_pandas:
        results = pd.DataFrame.from_dict(results)

    return results
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False,
       folds=None, metrics=(), obj=None, feval=None, maximize=False,
       early_stopping_rounds=None, fpreproc=None, as_pandas=True,
       verbose_eval=None, show_stdv=True, seed=0, callbacks=None,
       shuffle=True):
    # pylint: disable = invalid-name
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params. A list of ``(key, value)`` pairs is also accepted.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling.
    folds : a KFold or StratifiedKFold instance
        Sklearn KFolds or StratifiedKFolds.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV. When empty, ``eval_metric``
        from ``params`` is used instead.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. CV error needs to decrease at least
        every ``early_stopping_rounds`` round(s) to continue.
        Last entry in evaluation history is the one from best iteration.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    as_pandas : bool, default True
        Return pd.DataFrame when pandas is installed.
        If False or pandas is not installed, the raw dict mapping
        ``'<metric>-mean'`` / ``'<metric>-std'`` to per-round lists is
        returned.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If True, progress will be displayed
        at every boosting stage. If an integer is given, progress will be
        displayed at every given `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contains std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback
        module. The sequence is copied, so the caller's object is never
        modified.
        Example: [xgb.callback.reset_learning_rate(custom_rates)]
    shuffle : bool
        Shuffle data before creating folds.

    Returns
    -------
    (results, cvfolds) : tuple
        ``results`` is the evaluation history (DataFrame or dict, see
        ``as_pandas``); ``cvfolds`` is the fold collection returned by
        ``mknfold`` after training.

    Raises
    ------
    XGBoostError
        If ``stratified`` is True but sklearn is not installed.
    """
    if stratified is True and not SKLEARN_INSTALLED:
        raise XGBoostError(
            'sklearn needs to be installed in order to use stratified cv')

    if isinstance(metrics, str):
        metrics = [metrics]

    if isinstance(params, list):
        # Collect every 'eval_metric' entry before dict() keeps only the last.
        _metrics = [x[1] for x in params if x[0] == 'eval_metric']
        params = dict(params)
        if 'eval_metric' in params:
            params['eval_metric'] = _metrics
    else:
        # Shallow copy so the pop() below does not touch the caller's dict.
        params = dict((k, v) for k, v in params.items())

    if len(metrics) == 0 and 'eval_metric' in params:
        if isinstance(params['eval_metric'], list):
            metrics = params['eval_metric']
        else:
            metrics = [params['eval_metric']]

    params.pop("eval_metric", None)

    results = {}
    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
                      stratified, folds, shuffle)

    # setup callbacks (on a copy: appending must not alter the caller's list)
    callbacks = [] if callbacks is None else list(callbacks)
    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=False))

    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    elif isinstance(verbose_eval, int):
        callbacks.append(
            callback.print_evaluation(verbose_eval, show_stdv=show_stdv))

    callbacks_before_iter = [
        cb for cb in callbacks if cb.__dict__.get('before_iteration', False)
    ]
    callbacks_after_iter = [
        cb for cb in callbacks
        if not cb.__dict__.get('before_iteration', False)
    ]

    for i in range(num_boost_round):
        for cb in callbacks_before_iter:
            cb(
                CallbackEnv(model=None,
                            cvfolds=cvfolds,
                            iteration=i,
                            begin_iteration=0,
                            end_iteration=num_boost_round,
                            rank=0,
                            evaluation_result_list=None))
        for fold in cvfolds:
            fold.update(i, obj)
        res = aggcv([f.eval(i, feval) for f in cvfolds])

        for key, mean, std in res:
            results.setdefault(key + '-mean', []).append(mean)
            results.setdefault(key + '-std', []).append(std)

        try:
            for cb in callbacks_after_iter:
                cb(
                    CallbackEnv(model=None,
                                cvfolds=cvfolds,
                                iteration=i,
                                begin_iteration=0,
                                end_iteration=num_boost_round,
                                rank=0,
                                evaluation_result_list=res))
        except EarlyStopException as e:
            # Keep the history only up to and including the best iteration.
            for k in results:
                results[k] = results[k][:(e.best_iteration + 1)]
            break

    if as_pandas:
        try:
            import pandas as pd
            results = pd.DataFrame.from_dict(results)
        except ImportError:
            # pandas is optional here; fall back to the plain dict.
            pass

    return (results, cvfolds)
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          maximize=False, early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None):
    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
    """Train a booster with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    evals: list of pairs (DMatrix, string)
        List of items to be evaluated during training, this allows user to
        watch performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. Validation error needs to decrease at least
        every ``early_stopping_rounds`` round(s) to continue training.
        Requires at least one item in evals. If there's more than one, will
        use the last. Returns the model from the last iteration (not the best
        one). If early stopping occurs, the model will have three additional
        fields: bst.best_score, bst.best_iteration and bst.best_ntree_limit.
        (Use bst.best_ntree_limit to get the correct value if
        num_parallel_tree and/or num_class appears in the parameters)
    evals_result: dict
        This dictionary stores the evaluation results of all the items in
        watchlist.

        Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')]
        and a parameter containing ('eval_metric': 'logloss')
        Returns: {'train': {'logloss': ['0.48253', '0.35953']},
                  'eval': {'logloss': ['0.480385', '0.357756']}}
    verbose_eval : bool or int
        Requires at least one item in evals.
        If `verbose_eval` is True then the evaluation metric on the validation
        set is printed at each boosting stage.
        If `verbose_eval` is an integer then the evaluation metric on the
        validation set is printed at every given `verbose_eval` boosting
        stage. The last boosting stage / the boosting stage found by using
        `early_stopping_rounds` is also printed.

        Example: with verbose_eval=4 and at least one item in evals, an
        evaluation metric is printed every 4 boosting stages, instead of every
        boosting stage.
    learning_rates: list or function (deprecated - use callback API instead)
        List of learning rate for each boosting round or a customized function
        that calculates eta in terms of current number of round and the total
        number of boosting round (e.g. yields learning rate decay)
    xgb_model : file name of stored xgb model or 'Booster' instance
        Xgb model to be loaded before training (allows training continuation).
    callbacks : list of callback functions
        List of callback functions that are applied at end of each iteration.
        It is possible to use predefined callbacks by using xgb.callback
        module. The sequence is copied, so the caller's object is never
        modified.

        Example: [xgb.callback.reset_learning_rate(custom_rates)]

    Returns
    -------
    booster : a trained booster model
    """
    # Copy first: the options translated into callbacks below would otherwise
    # pile up in the caller's list every time it is reused.
    callbacks = [] if callbacks is None else list(callbacks)

    # Most of legacy advanced options becomes callbacks
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation())
    elif isinstance(verbose_eval, int):
        # Also reached for ``False`` (a bool is an int); the value is passed
        # on as the printing period unchanged.
        callbacks.append(callback.print_evaluation(verbose_eval))

    if early_stopping_rounds is not None:
        callbacks.append(
            callback.early_stop(early_stopping_rounds,
                                maximize=maximize,
                                verbose=bool(verbose_eval)))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))

    if learning_rates is not None:
        warnings.warn(
            "learning_rates parameter is deprecated - use callback API instead",
            DeprecationWarning)
        callbacks.append(callback.reset_learning_rate(learning_rates))

    return _train_internal(params, dtrain,
                           num_boost_round=num_boost_round,
                           evals=evals,
                           obj=obj, feval=feval,
                           xgb_model=xgb_model, callbacks=callbacks)