def get_callbacks_watchlist(train_cfg, train_dmatrix, val_dmatrix, model_dir, checkpoint_dir, is_master, fold=None):
    """Assemble the checkpoint state, callbacks and watchlist for one training run.

    :param train_cfg: Training hyperparameter configurations. The 'save_model_on_termination' entry is
                      popped from this dict.
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix, or None to evaluate on the training data only
    :param model_dir: Directory handed to the intermediate-model callback and to the SIGTERM handler
    :param checkpoint_dir: Directory checkpoints are loaded from and saved to. When 'fold' is given, the
                           "model-<fold>" sub-directory is used instead.
    :param is_master: Forwarded to add_sigterm_handler
    :param fold: Optional fold identifier used to suffix the checkpoint directory and model names
    :return: Tuple of (xgb_model, iteration, callbacks, watchlist)
    """
    fold_given = fold is not None
    if checkpoint_dir and fold_given:
        checkpoint_dir = os.path.join(checkpoint_dir, f"model-{fold}")

    # Restore any previous progress before building the callbacks that depend on it.
    resumed_model, start_iteration = checkpointing.load_checkpoint(checkpoint_dir)
    if resumed_model is not None:
        if fold_given:
            resumed_model = f"{resumed_model}-{fold}"
        logging.info("Checkpoint loaded from %s", resumed_model)
        logging.info("Resuming from iteration %s", start_iteration)

    callbacks = [checkpointing.print_checkpointed_evaluation(start_iteration=start_iteration)]
    if checkpoint_dir:
        callbacks.append(checkpointing.save_checkpoint(checkpoint_dir, start_iteration=start_iteration))

    # NOTE(review): pop() removes the key from the caller's dict, so a later call with the same
    # train_cfg falls back to "false" -- confirm this is intended when called once per fold.
    termination_flag = train_cfg.pop('save_model_on_termination', "false")
    if termination_flag == "true":
        model_name = MODEL_NAME if not fold_given else f"{MODEL_NAME}-{fold}"
        callbacks.append(checkpointing.save_intermediate_model(model_dir, model_name))
        add_sigterm_handler(model_dir, is_master)

    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist += [(val_dmatrix, 'validation')]

    return resumed_model, start_iteration, callbacks, watchlist
def train_job(train_cfg, train_dmatrix, val_dmatrix, model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    :param train_cfg: Training hyperparameter configurations. 'save_model_on_termination' and 'num_round' are
                      popped from this dict, and 'eval_metric' is rewritten or removed, before the dict is
                      passed to xgb.train.
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory passed to the checkpoint loader and, when truthy, to the
                           checkpoint-saving callback
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    :raises exc.UserError: if the training exception message contains one of CUSTOMER_ERRORS
    :raises exc.AlgorithmError: for any other exception raised by the xgb.train call
    """
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination', "false")

    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = train_cfg.pop("num_round")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))

    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    # Rounds already covered by the checkpoint are subtracted from the requested total.
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)

    callbacks = [checkpointing.print_checkpointed_evaluation(start_iteration=iteration)]
    if checkpoint_dir:
        callbacks.append(checkpointing.save_checkpoint(checkpoint_dir, start_iteration=iteration))

    if save_model_on_termination == "true":
        callbacks.append(checkpointing.save_intermediate_model(model_dir, MODEL_NAME))
        add_sigterm_handler(model_dir, is_master)

    add_debugging(callbacks=callbacks, hyperparameters=train_cfg, train_dmatrix=train_dmatrix,
                  val_dmatrix=val_dmatrix)

    logging.info("Train matrix has %s rows", train_dmatrix.num_row())
    # Same None test as the watchlist above, rather than relying on object truthiness.
    if val_dmatrix is not None:
        logging.info("Validation matrix has %s rows", val_dmatrix.num_row())

    try:
        bst = xgb.train(train_cfg, train_dmatrix, num_boost_round=num_round, evals=watchlist,
                        feval=configured_feval, early_stopping_rounds=early_stopping_rounds,
                        callbacks=callbacks, xgb_model=xgb_model, verbose_eval=False)
    except Exception as e:
        message = str(e)
        # Failures matching a known customer error are reported as user errors; everything else
        # is attributed to the algorithm. The original exception is chained for debugging.
        if any(customer_error_message in message for customer_error_message in CUSTOMER_ERRORS):
            raise exc.UserError(message) from e

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix, message)) from e

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_dir, exist_ok=True)

    if is_master:
        model_location = os.path.join(model_dir, 'xgboost-model')
        with open(model_location, 'wb') as f:
            pkl.dump(bst, f, protocol=4)
        logging.debug("Stored trained model at %s", model_location)
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix, model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    When the '_nfold' hyperparameter is set and 'train_val_dmatrix' is provided, a cross validation run is
    performed after training and its final metrics are printed to stdout.

    :param train_cfg: Training hyperparameter configurations. 'save_model_on_termination', 'num_round' and
                      '_nfold' are popped from this dict, and 'eval_metric' is rewritten or removed, before
                      the dict is passed to xgb.train / xgb.cv.
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix
    :param train_val_dmatrix: Training + Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory passed to the checkpoint loader and, when truthy, to the
                           checkpoint-saving callback
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    :raises exc.UserError: if the training or cross validation exception message contains one of CUSTOMER_ERRORS
    :raises exc.AlgorithmError: for any other exception raised by training or cross validation
    """
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination', "false")

    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = train_cfg.pop("num_round")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))

    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    # Rounds already covered by the checkpoint are subtracted from the requested total.
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)

    callbacks = [checkpointing.print_checkpointed_evaluation(start_iteration=iteration)]
    if checkpoint_dir:
        callbacks.append(checkpointing.save_checkpoint(checkpoint_dir, start_iteration=iteration))

    if save_model_on_termination == "true":
        callbacks.append(checkpointing.save_intermediate_model(model_dir, MODEL_NAME))
        add_sigterm_handler(model_dir, is_master)

    add_debugging(callbacks=callbacks, hyperparameters=train_cfg, train_dmatrix=train_dmatrix,
                  val_dmatrix=val_dmatrix)

    logging.info("Train matrix has %s rows and %s columns", train_dmatrix.num_row(), train_dmatrix.num_col())
    # Same None test as the watchlist above, rather than relying on object truthiness.
    if val_dmatrix is not None:
        logging.info("Validation matrix has %s rows", val_dmatrix.num_row())

    try:
        # '_nfold' is removed here so it is not among the parameters handed to xgb.train / xgb.cv.
        nfold = train_cfg.pop("_nfold", None)

        bst = xgb.train(train_cfg, train_dmatrix, num_boost_round=num_round, evals=watchlist,
                        feval=configured_feval, early_stopping_rounds=early_stopping_rounds,
                        callbacks=callbacks, xgb_model=xgb_model, verbose_eval=False)

        if nfold is not None and train_val_dmatrix is not None:
            logging.info("Run %s-fold cross validation on the data of %s rows",
                         nfold, train_val_dmatrix.num_row())
            # xgb.cv returns a pandas data frame of evaluation results.
            # Note that num_round has already been reduced by the checkpointed iteration count.
            cv_eval_result = xgb.cv(train_cfg, train_val_dmatrix, nfold=nfold, num_boost_round=num_round,
                                    feval=configured_feval, early_stopping_rounds=early_stopping_rounds,
                                    verbose_eval=True, show_stdv=True, shuffle=False)

            logging.info("The final metrics of cross validation")
            # Emitted with print (not logging), exactly as before, so the line format on stdout is unchanged.
            print(_format_cv_report(cv_eval_result))
    except Exception as e:
        message = str(e)
        # Failures matching a known customer error are reported as user errors; everything else
        # is attributed to the algorithm. The original exception is chained for debugging.
        if any(customer_error_message in message for customer_error_message in CUSTOMER_ERRORS):
            raise exc.UserError(message) from e

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(exception_prefix, message)) from e

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_dir, exist_ok=True)

    if is_master:
        model_location = os.path.join(model_dir, 'xgboost-model')
        with open(model_location, 'wb') as f:
            pkl.dump(bst, f, protocol=4)
        logging.debug("Stored trained model at %s", model_location)


def _format_cv_report(cv_eval_result):
    """Render the last row of a cross validation result frame as one tab-separated report line.

    The line has the form "[<last epoch>]\\t<metric>:<value>..." where only the "-mean" columns are
    reported and a leading "test-" in the metric name is shown as "validation-".

    :param cv_eval_result: Data frame of per-round metrics, with "<name>-mean" / "<name>-std" columns
    :return: The formatted report line
    """
    mean_suffix = "-mean"
    last_epoch = len(cv_eval_result.index) - 1
    fields = [f"[{last_epoch}]"]
    for column in cv_eval_result.columns:
        # Skip the standard deviation columns
        if not column.endswith(mean_suffix):
            continue
        metric_name = column[:-len(mean_suffix)].replace("test-", "validation-", 1)
        metric_val = cv_eval_result.at[last_epoch, column]
        fields.append('{0}:{1:.5f}'.format(metric_name, metric_val))
    return '\t'.join(fields)