示例#1
0
def get_callbacks_watchlist(train_cfg,
                            train_dmatrix,
                            val_dmatrix,
                            model_dir,
                            checkpoint_dir,
                            is_master,
                            fold=None):
    """Assemble the training callbacks and evaluation watchlist, resuming from a checkpoint if any.

    Note that 'save_model_on_termination' is removed from ``train_cfg`` as a side effect.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix, or None to watch the training data only
    :param model_dir: Directory handed to the intermediate-model callback and the SIGTERM handler
    :param checkpoint_dir: Directory for checkpoints; when falsy, no checkpoint-saving callback is added
    :param is_master: Forwarded to the SIGTERM handler
    :param fold: Optional fold identifier; when given, a per-fold checkpoint sub-directory
                 and a per-fold model name are used
    :return: Tuple of (xgb_model, iteration, callbacks, watchlist)
    """
    has_fold = fold is not None

    # Each fold keeps its checkpoints in its own "model-<fold>" sub-directory.
    if has_fold and checkpoint_dir:
        checkpoint_dir = os.path.join(checkpoint_dir, f"model-{fold}")

    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    if xgb_model is not None:
        if has_fold:
            xgb_model = f"{xgb_model}-{fold}"
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)

    # The evaluation printer is always installed; the remaining callbacks are optional.
    callbacks = [
        checkpointing.print_checkpointed_evaluation(start_iteration=iteration)
    ]
    if checkpoint_dir:
        callbacks.append(
            checkpointing.save_checkpoint(checkpoint_dir,
                                          start_iteration=iteration))

    # The flag is compared as the string "true", not as a boolean.
    if train_cfg.pop('save_model_on_termination', "false") == "true":
        model_name = MODEL_NAME if not has_fold else f"{MODEL_NAME}-{fold}"
        callbacks.append(
            checkpointing.save_intermediate_model(model_dir, model_name))
        add_sigterm_handler(model_dir, is_master)

    eval_sets = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        eval_sets.append((val_dmatrix, 'validation'))

    return xgb_model, iteration, callbacks, eval_sets
def train_job(train_cfg, train_dmatrix, val_dmatrix, model_dir, checkpoint_dir,
              is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    Note that ``train_cfg`` is modified in place: 'save_model_on_termination' and 'num_round' are removed,
    and 'eval_metric' is either replaced by its cleaned value or removed.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix, or None to evaluate on the training data only
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory checkpoints are loaded from; when falsy, no checkpoints are saved
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    :raises exc.UserError: if the training error message contains one of CUSTOMER_ERRORS
    :raises exc.AlgorithmError: for any other failure of the training call
    """
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination',
                                              "false")

    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = train_cfg.pop("num_round")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))

    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    # Only the rounds not already covered by the checkpoint remain to be run.
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)

    callbacks = [
        checkpointing.print_checkpointed_evaluation(start_iteration=iteration)
    ]
    if checkpoint_dir:
        callbacks.append(
            checkpointing.save_checkpoint(checkpoint_dir,
                                          start_iteration=iteration))

    # The flag is compared as the string "true", not as a boolean.
    if save_model_on_termination == "true":
        callbacks.append(
            checkpointing.save_intermediate_model(model_dir, MODEL_NAME))
        add_sigterm_handler(model_dir, is_master)

    add_debugging(callbacks=callbacks,
                  hyperparameters=train_cfg,
                  train_dmatrix=train_dmatrix,
                  val_dmatrix=val_dmatrix)

    logging.info("Train matrix has %s rows", train_dmatrix.num_row())
    # Explicit None check, consistent with how the watchlist is built above.
    if val_dmatrix is not None:
        logging.info("Validation matrix has %s rows", val_dmatrix.num_row())

    try:
        bst = xgb.train(train_cfg,
                        train_dmatrix,
                        num_boost_round=num_round,
                        evals=watchlist,
                        feval=configured_feval,
                        early_stopping_rounds=early_stopping_rounds,
                        callbacks=callbacks,
                        xgb_model=xgb_model,
                        verbose_eval=False)
    except Exception as e:
        error_message = str(e)
        # Known customer-caused failures are reported as user errors; the
        # original exception is chained so its traceback is preserved.
        for customer_error_message in CUSTOMER_ERRORS:
            if customer_error_message in error_message:
                raise exc.UserError(error_message) from e

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(
            exception_prefix, error_message)) from e

    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(model_dir, exist_ok=True)

    if is_master:
        model_location = os.path.join(model_dir, 'xgboost-model')
        with open(model_location, 'wb') as f:
            pkl.dump(bst, f, protocol=4)
        logging.debug("Stored trained model at %s", model_location)
def train_job(train_cfg, train_dmatrix, val_dmatrix, train_val_dmatrix,
              model_dir, checkpoint_dir, is_master):
    """Train and save XGBoost model using data on current node.

    If doing distributed training, XGBoost will use rabit to sync the trained model between each boosting iteration.
    Trained model is only saved if 'is_master' is True.

    When '_nfold' is present in ``train_cfg`` and ``train_val_dmatrix`` is given, k-fold cross validation is
    additionally run on ``train_val_dmatrix`` after training and its final-round metrics are printed.

    Note that ``train_cfg`` is modified in place: 'save_model_on_termination', 'num_round' and '_nfold' are
    removed, and 'eval_metric' is either replaced by its cleaned value or removed.

    :param train_cfg: Training hyperparameter configurations
    :param train_dmatrix: Training Data Matrix
    :param val_dmatrix: Validation Data Matrix, or None to evaluate on the training data only
    :param train_val_dmatrix: Training + Validation Data Matrix
    :param model_dir: Directory where model will be saved
    :param checkpoint_dir: Directory checkpoints are loaded from; when falsy, no checkpoints are saved
    :param is_master: True if single node training, or the current node is the master node in distributed training.
    :raises exc.UserError: if the training or cross validation error message contains one of CUSTOMER_ERRORS
    :raises exc.AlgorithmError: for any other failure of the training or cross validation call
    """
    # Parse arguments for intermediate model callback
    save_model_on_termination = train_cfg.pop('save_model_on_termination',
                                              "false")

    # Parse arguments for train() API
    early_stopping_rounds = train_cfg.get('early_stopping_rounds')
    num_round = train_cfg.pop("num_round")

    # Evaluation metrics to use with train() API
    tuning_objective_metric_param = train_cfg.get("_tuning_objective_metric")
    eval_metric = train_cfg.get("eval_metric")
    cleaned_eval_metric, configured_feval = train_utils.get_eval_metrics_and_feval(
        tuning_objective_metric_param, eval_metric)
    if cleaned_eval_metric:
        train_cfg['eval_metric'] = cleaned_eval_metric
    else:
        train_cfg.pop('eval_metric', None)

    # Set callback evals
    watchlist = [(train_dmatrix, 'train')]
    if val_dmatrix is not None:
        watchlist.append((val_dmatrix, 'validation'))

    xgb_model, iteration = checkpointing.load_checkpoint(checkpoint_dir)
    # Only the rounds not already covered by the checkpoint remain to be run.
    num_round -= iteration
    if xgb_model is not None:
        logging.info("Checkpoint loaded from %s", xgb_model)
        logging.info("Resuming from iteration %s", iteration)

    callbacks = [
        checkpointing.print_checkpointed_evaluation(start_iteration=iteration)
    ]
    if checkpoint_dir:
        callbacks.append(
            checkpointing.save_checkpoint(checkpoint_dir,
                                          start_iteration=iteration))

    # The flag is compared as the string "true", not as a boolean.
    if save_model_on_termination == "true":
        callbacks.append(
            checkpointing.save_intermediate_model(model_dir, MODEL_NAME))
        add_sigterm_handler(model_dir, is_master)

    add_debugging(callbacks=callbacks,
                  hyperparameters=train_cfg,
                  train_dmatrix=train_dmatrix,
                  val_dmatrix=val_dmatrix)

    logging.info("Train matrix has %s rows and %s columns",
                 train_dmatrix.num_row(), train_dmatrix.num_col())
    # Explicit None check, consistent with how the watchlist is built above.
    if val_dmatrix is not None:
        logging.info("Validation matrix has %s rows", val_dmatrix.num_row())

    # Removed from the config before it is handed to xgb.train / xgb.cv.
    # Popping with a default cannot fail, so it stays outside the try block.
    nfold = train_cfg.pop("_nfold", None)

    try:
        bst = xgb.train(train_cfg,
                        train_dmatrix,
                        num_boost_round=num_round,
                        evals=watchlist,
                        feval=configured_feval,
                        early_stopping_rounds=early_stopping_rounds,
                        callbacks=callbacks,
                        xgb_model=xgb_model,
                        verbose_eval=False)

        if nfold is not None and train_val_dmatrix is not None:
            logging.info(
                "Run %s-fold cross validation on the data of %s rows", nfold,
                train_val_dmatrix.num_row())
            # xgb.cv returns a pandas data frame of evaluation results.
            cv_eval_result = xgb.cv(
                train_cfg,
                train_val_dmatrix,
                nfold=nfold,
                num_boost_round=num_round,
                feval=configured_feval,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=True,
                show_stdv=True,
                shuffle=False)

            logging.info("The final metrics of cross validation")
            cv_last_epoch = len(cv_eval_result.index) - 1
            report_fields = [f"[{cv_last_epoch}]"]
            # Skip the standard deviation columns: the frame is expected to
            # alternate "<metric>-mean" / "<metric>-std", so take every other one.
            for mean_column in cv_eval_result.columns[::2]:
                # Drop the trailing "-mean" and report "test-" metrics as "validation-".
                metric_name = mean_column[:-5].replace("test-", "validation-",
                                                       1)
                metric_val = cv_eval_result.at[cv_last_epoch, mean_column]
                report_fields.append('{0}:{1:.5f}'.format(
                    metric_name, metric_val))
            # NOTE(review): printed to stdout rather than logged, presumably so
            # the line can be picked up like the per-round evaluation output.
            # Confirm before changing.
            print("\t".join(report_fields))
    except Exception as e:
        error_message = str(e)
        # Known customer-caused failures are reported as user errors; the
        # original exception is chained so its traceback is preserved.
        for customer_error_message in CUSTOMER_ERRORS:
            if customer_error_message in error_message:
                raise exc.UserError(error_message) from e

        exception_prefix = "XGB train call failed with exception"
        raise exc.AlgorithmError("{}:\n {}".format(
            exception_prefix, error_message)) from e

    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(model_dir, exist_ok=True)

    if is_master:
        model_location = os.path.join(model_dir, 'xgboost-model')
        with open(model_location, 'wb') as f:
            pkl.dump(bst, f, protocol=4)
        logging.debug("Stored trained model at %s", model_location)