예제 #1
0
 def gen_kappa_cv(self, bagging_iter, y_list_valid, cdf_list_valid,
                  num_valid_matrix, p_ens_list_valid_topk,
                  p_ens_list_valid):
     """
     Fold the current bag into the running bagged average and score it.

     For every (run, fold) the first ``num_valid_matrix[run][fold]`` entries
     of ``p_ens_list_valid`` are overwritten with
     ``(bagging_iter * previous + current) / (bagging_iter + 1)``, i.e. an
     equal-weight running mean over the bags seen so far. The updated slice
     is scored with ``getScore(..., "valid")`` and compared with the true
     labels through ``quadratic_weighted_kappa``.

     :param bagging_iter: zero-based index of the current bag; it is also
         the weight given to the average accumulated so far
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf per run/fold, indexed ``[run, fold, :]``
     :param num_valid_matrix: number of valid rows per ``[run][fold]``
     :param p_ens_list_valid_topk: ensemble predictions of the current bag
     :param p_ens_list_valid: running average over bags; updated by slice
         assignment, so the caller's array changes as well
     :return: ``(kappa_cv, cutoff, p_ens_list_valid)`` where ``kappa_cv``
         holds one kappa per run/fold and ``cutoff`` is the rescaled mean of
         the per-fold cutoffs returned by ``getScore``
     """
     n_runs, n_folds = config.n_runs, config.n_folds
     kappa_cv = np.zeros((n_runs, n_folds), dtype=float)
     cutoff = np.zeros((3), dtype=float)
     # np.ndindex walks (run, fold) in row-major order, like nested loops.
     for run, fold in np.ndindex(n_runs, n_folds):
         n_rows = num_valid_matrix[run][fold]
         labels = y_list_valid[run, fold, :n_rows]
         fold_cdf = cdf_list_valid[run, fold, :]
         accumulated = p_ens_list_valid[run, fold, :n_rows]
         current_bag = p_ens_list_valid_topk[run, fold, :n_rows]
         # Every bag carries weight 1, so this is a running mean.
         p_ens_list_valid[run, fold, :n_rows] = (
             bagging_iter * accumulated + current_bag) / (bagging_iter + 1.)
         score, fold_cutoff = getScore(
             p_ens_list_valid[run, fold, :n_rows], fold_cdf, "valid")
         kappa_cv[run][fold] = quadratic_weighted_kappa(score, labels)
         cutoff += fold_cutoff
     # Mean cutoff over all run/fold combinations.
     cutoff /= float(n_runs * n_folds)
     # NOTE(review): the literals below are not explained anywhere in this
     # block. Presumably they rescale cutoffs from the size of a validation
     # fold (2/3 of 10158 rows) to the size of the test set (22513 rows) --
     # confirm against the data set before relying on this.
     cutoff *= (22513 / ((2. / 3) * 10158))
     print("Bag %d, kappa: %.6f (%.6f)" %
           (bagging_iter + 1, np.mean(kappa_cv), np.std(kappa_cv)))
     return kappa_cv, cutoff, p_ens_list_valid
예제 #2
0
 def ensemble_selection_obj(self, param, p1_list, weight1, p2_list,
                            y_list_valid, cdf_list_valid, num_valid_matrix):
     """
     Objective for the weight search: negative mean kappa of a two-way blend.

     The candidate weight is read from ``param['weight_current_model']``.
     For every run/fold the two prediction sets are blended as
     ``(weight1 * p1 + w * p2) / (weight1 + w)``, scored with ``getScore``
     and evaluated with ``quadratic_weighted_kappa``.

     :param param: dict holding the key ``'weight_current_model'``
     :param p1_list: predictions of the ensemble built so far,
         indexed ``[run, fold, row]``
     :param weight1: weight given to ``p1_list``
     :param p2_list: predictions of the candidate model, same layout
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf per run/fold
     :param num_valid_matrix: number of valid rows per ``[run][fold]``
     :return: dict with ``'loss'`` (negated mean kappa, so that minimising
         the loss maximises kappa) and ``'status'`` set to ``STATUS_OK``
     """
     candidate_weight = param['weight_current_model']
     total_weight = weight1 + candidate_weight
     fold_kappas = np.zeros((config.n_runs, config.n_folds), dtype=float)
     for run, fold in np.ndindex(config.n_runs, config.n_folds):
         rows = slice(None, num_valid_matrix[run][fold])
         blended = (weight1 * p1_list[run, fold, rows] +
                    candidate_weight * p2_list[run, fold, rows]) / total_weight
         blended_score = getScore(blended, cdf_list_valid[run, fold, :])
         fold_kappas[run][fold] = quadratic_weighted_kappa(
             blended_score, y_list_valid[run, fold, rows])
     return {'loss': -np.mean(fold_kappas), 'status': STATUS_OK}
예제 #3
0
 def ensemble_bagging_models_prediction(self,
                                        best_bagged_model_list,
                                        best_bagged_model_weight,
                                        cdf,
                                        cutoff=None):
     """
     Blend the stored test predictions of the selected models over all bags.

     Within one bag the models are combined as a weighted running average
     using the weights in ``best_bagged_model_weight``; the per-bag blends
     are then averaged with equal weight. Predictions are read from
     ``<self.model_folder>/All/pred/test.pred.<model>.csv``, which must
     provide the columns ``id`` and ``prediction``.

     :param best_bagged_model_list: one sequence of model names per bag
     :param best_bagged_model_weight: one sequence of weights per bag,
         aligned with ``best_bagged_model_list``
     :param cdf: cdf handed to ``getScore`` when ``cutoff`` is ``None``
     :param cutoff: optional cutoffs; when given, ``getTestScore`` is used
         instead of ``getScore``
     :return: ``pandas.DataFrame`` with the columns ``id`` and
         ``prediction``
     :raises ValueError: if there are no bags, or a bag yields no
         (model, weight) pair
     """
     bagging_size = len(best_bagged_model_list)
     if bagging_size == 0:
         raise ValueError(
             "best_bagged_model_list must contain at least one bag")
     id_test = None
     p_ens_valid_bag = None
     for bagging_iter in range(bagging_size):
         # Accumulated weight of the models already blended in this bag.
         w_ens = 0
         p_ens_valid = None
         for model, this_w in zip(best_bagged_model_list[bagging_iter],
                                  best_bagged_model_weight[bagging_iter]):
             pred_file = "%s/All/pred/test.pred.%s.csv" % (
                 self.model_folder, model)
             # Parse the file once; both columns come from the same frame.
             pred_frame = pd.read_csv(pred_file, dtype=float)
             this_p_valid = pred_frame["prediction"].values
             if p_ens_valid is None:
                 # First model of the bag: start the blend at zero and take
                 # the ids from this file.
                 p_ens_valid = np.zeros((this_p_valid.shape[0]),
                                        dtype=float)
                 id_test = np.asarray(pred_frame["id"].values, dtype=int)
             # Weighted running average over the models of this bag.
             p_ens_valid = (w_ens * p_ens_valid +
                            this_w * this_p_valid) / (w_ens + this_w)
             w_ens += this_w
         if p_ens_valid is None:
             # Without this check an empty bag would silently reuse the
             # previous bag's blend (or fail with an unbound local).
             raise ValueError(
                 "bag %d contains no (model, weight) pairs" % bagging_iter)
         # Every bag has weight 1: equal-weight running mean over bags.
         if bagging_iter == 0:
             p_ens_valid_bag = p_ens_valid
         else:
             p_ens_valid_bag = (bagging_iter * p_ens_valid_bag +
                                p_ens_valid) / (bagging_iter + 1.)
     # Turn the blended raw predictions into final scores.
     if cutoff is None:
         p_ens_score = getScore(p_ens_valid_bag, cdf)
     else:
         p_ens_score = getTestScore(p_ens_valid_bag, cutoff)
     output = pd.DataFrame({"id": id_test, "prediction": p_ens_score})
     return output
예제 #4
0
 def init_topk_best_model(self, init_top_k, this_sorted_models,
                          pred_list_valid, y_list_valid, cdf_list_valid,
                          num_valid_matrix):
     """
     Seed the ensemble with the first ``init_top_k`` models, equally weighted.

     Every selected model gets weight 1.0, so the returned prediction array
     is the plain average of their validation predictions per run/fold.
     Once the last selected model has been added, the blend is scored with
     ``getScore`` / ``quadratic_weighted_kappa`` and the mean and standard
     deviation of the kappa are printed once.

     Reads ``self.max_num_valid`` and ``self.model2idx``; everything else
     is passed in as an argument.

     :param init_top_k: number of leading models to take
     :param this_sorted_models: sequence of ``(model, kappa)`` pairs; the
         first ``init_top_k`` entries are used
     :param pred_list_valid: predictions indexed
         ``[model_idx, run, fold, row]``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf per run/fold
     :param num_valid_matrix: number of valid rows per ``[run][fold]``
     :return: ``(best_model_list, best_model_weight,
         p_ens_list_valid_topk, w_ens)``
     """
     best_model_list = []
     best_model_weight = []
     p_ens_list_valid_topk = np.zeros(
         (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
     w_ens, this_w = 0, 1.0
     kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
     # Slice first so that "last model" refers to the models actually
     # available, even when fewer than init_top_k were supplied.
     selected_models = this_sorted_models[0:init_top_k]
     last_position = len(selected_models) - 1
     for cnt, (model, kappa) in enumerate(selected_models):
         print("add the following model to the ensembles ")
         print("model: %s" % model)
         print("kappa: %.6f" % kappa)
         is_last = cnt == last_position
         # Validation predictions of the model being added.
         this_p_list_valid = pred_list_valid[self.model2idx[model]]
         for run in range(config.n_runs):
             for fold in range(config.n_folds):
                 num_valid = num_valid_matrix[run][fold]
                 # Weighted running average over the models added so far.
                 p_ens_list_valid_topk[run, fold, :num_valid] = (
                     w_ens * p_ens_list_valid_topk[run, fold, :num_valid] +
                     this_w * this_p_list_valid[run, fold, :num_valid]) / (
                         w_ens + this_w)
                 # Score the blend only once it is complete.
                 if is_last:
                     cdf = cdf_list_valid[run, fold, :]
                     true_label = y_list_valid[run, fold, :num_valid]
                     score = getScore(
                         p_ens_list_valid_topk[run, fold, :num_valid], cdf)
                     kappa_cv[run][fold] = quadratic_weighted_kappa(
                         score, true_label)
         best_model_list.append(model)
         best_model_weight.append(this_w)
         w_ens += this_w
         # kappa_cv is only filled on the last model; printing it earlier
         # would report a meaningless 0.000000.
         if is_last:
             print("Init kappa: %.6f (%.6f)" %
                   (np.mean(kappa_cv), np.std(kappa_cv)))
     return best_model_list, best_model_weight, p_ens_list_valid_topk, w_ens
    def out_put_all(self, feat_folder, feat_name, kappa_cv_mean, kappa_cv_std,
                    pred_raw, pred_rank):
        """
        Write the raw, rank and score predictions for the test set to CSV.

        Three files are written, each with the columns ``id`` and
        ``prediction`` and without an index column. The target paths and
        the ids are taken from ``self.all_matrix`` (keys ``id_test``,
        ``raw_pred_test_path``, ``rank_pred_test_path``, ``subm_path`` and
        ``cdf_test``).

        :param feat_folder: not used by this method
        :param feat_name: not used by this method
        :param kappa_cv_mean: not used by this method
        :param kappa_cv_std: not used by this method
        :param pred_raw: raw predictions; also the input for the score file
        :param pred_rank: rank-transformed predictions
        :return: None
        """
        matrix = self.all_matrix

        def _dump(prediction, path_key):
            # One "id,prediction" table per output file.
            frame = pd.DataFrame({
                "id": matrix['id_test'],
                "prediction": prediction
            })
            frame.to_csv(matrix[path_key], index=False)

        _dump(pred_raw, 'raw_pred_test_path')
        _dump(pred_rank, 'rank_pred_test_path')
        # The score file is derived from pred_raw. An earlier comment in
        # this code notes that a previous version used the wrong input here:
        # pred_raw is the average over all bags, not a single bag's output.
        _dump(utils.getScore(pred_raw, matrix['cdf_test']), 'subm_path')
    def gen_bagging(self, param, set_obj, all):
        """
        Train ``bagging_size`` bagged models and average their predictions.

        For every bag a bootstrap index is drawn with
        ``utils.bootstrap_all``, a training ``xgb.DMatrix`` is built from
        the selected rows and ``self.train_predict`` (implemented by the
        subclasses) is called. Each bag's predictions fill one column of a
        buffer that is allocated once, so the mean really covers all bags.

        Side effects on ``set_obj``: the keys ``index_base``, ``dtrain``,
        ``watchlist`` and either ``dtest`` or ``dvalid`` are overwritten on
        every bag.

        :param param: model parameters, passed through to ``train_predict``
        :param set_obj: dict-like bundle of data and settings for this fit
        :param all: truthy to predict the test set (``numTest`` rows),
            falsy to predict the validation set (``numValid`` rows)
        :return: ``(pred_raw, pred_rank)`` when ``all`` is truthy, otherwise
            ``(pred_raw, pred_rank, kappa_valid)``
        :raises ValueError: if ``model_param_conf.bagging_size`` is below 1
        """
        bagging_size = model_param_conf.bagging_size
        if bagging_size < 1:
            raise ValueError(
                "model_param_conf.bagging_size must be >= 1, got %r" %
                (bagging_size, ))
        num_rows = set_obj['numTest'] if all else set_obj['numValid']
        # Allocate once, outside the loop: re-creating the buffer per bag
        # would zero the columns written by earlier bags.
        preds_bagging = np.zeros((num_rows, bagging_size), dtype=float)
        for n in range(bagging_size):
            # Bootstrap sample of the training rows for this bag.
            index_base, _index_meta = utils.bootstrap_all(
                model_param_conf.bootstrap_replacement, set_obj['numTrain'],
                model_param_conf.bootstrap_ratio)
            set_obj['index_base'] = index_base
            set_obj['dtrain'] = xgb.DMatrix(
                set_obj['X_train'][index_base],
                label=set_obj['labels_train'][index_base],
                weight=set_obj['weight_train'][index_base])
            if all:
                set_obj['dtest'] = xgb.DMatrix(set_obj['X_test'],
                                               label=set_obj['labels_test'])
                # Only watch the training set, and only when verbose.
                set_obj['watchlist'] = []
                if model_param_conf.verbose_level >= 2:
                    set_obj['watchlist'] = [(set_obj['dtrain'], 'train')]
                preds_bagging[:, n] = self.train_predict(param, set_obj, all)
            else:
                set_obj['dvalid'] = xgb.DMatrix(set_obj['X_valid'],
                                                label=set_obj['labels_valid'])
                set_obj['watchlist'] = []
                if model_param_conf.verbose_level >= 2:
                    # NOTE(review): this method stores the matrix under
                    # 'dvalid' but the watch list used to read
                    # 'dvalid_base'. Keep 'dvalid_base' when a caller
                    # provides it and fall back to the matrix built above
                    # otherwise -- confirm which key is intended.
                    if 'dvalid_base' in set_obj:
                        dvalid_watch = set_obj['dvalid_base']
                    else:
                        dvalid_watch = set_obj['dvalid']
                    set_obj['watchlist'] = [(set_obj['dtrain'], 'train'),
                                            (dvalid_watch, 'valid')]
                preds_bagging[:, n] = self.train_predict(param, set_obj, all)
                # Running mean over the bags trained so far.
                pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)
                # argsort of argsort turns values into their ranks: the
                # first call gives the sorting order, the second gives the
                # position of every element in that order.
                pred_rank = pred_raw.argsort().argsort()
                pred_score, _cutoff = utils.getScore(pred_rank,
                                                     set_obj['cdf_valid'],
                                                     valid=True)
                # Recomputed per bag; the value of the last bag is returned.
                kappa_valid = utils.quadratic_weighted_kappa(
                    pred_score, set_obj['Y_valid'])

        if all:
            pred_raw = np.mean(preds_bagging, axis=1)
            pred_rank = pred_raw.argsort().argsort()
            return pred_raw, pred_rank
        return pred_raw, pred_rank, kappa_valid
예제 #7
0
 def find_best_model(self, this_sorted_models, pred_list_valid,
                     y_list_valid, cdf_list_valid, num_valid_matrix, w_ens,
                     w_min, w_max, best_kappa, hypteropt_max_evals,
                     p_ens_list_valid_topk):
     """
     Pick the model (and weight) that improves the current ensemble most.

     For each candidate a weight is searched with hyperopt (``fmin`` with
     ``tpe.suggest``) in ``[w_min, w_max]`` relative to an ensemble weight
     of 1; the weight found is then multiplied by ``w_ens``. The blend of
     ensemble and candidate is scored per run/fold, and a candidate is kept
     when its mean kappa is strictly greater than the best seen so far.

     :param this_sorted_models: iterable of ``(model, kappa)`` pairs
     :param pred_list_valid: predictions indexed
         ``[model_idx, run, fold, row]``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf per run/fold
     :param num_valid_matrix: number of valid rows per ``[run][fold]``
     :param w_ens: accumulated weight of the current ensemble
     :param w_min: lower bound of the weight search
     :param w_max: upper bound of the weight search
     :param best_kappa: kappa a candidate has to beat
     :param hypteropt_max_evals: ``max_evals`` passed to ``fmin``
     :param p_ens_list_valid_topk: predictions of the current ensemble
     :return: ``(best_kappa, best_model, best_weight)``; ``best_model`` is
         ``None`` and ``best_weight`` is 0 when no candidate beats
         ``best_kappa``
     """
     best_model, best_weight = None, 0
     for model, kappa in this_sorted_models:
         candidate_preds = pred_list_valid[self.model2idx[model]]
         trials = Trials()
         param_space = {
             'weight_current_model':
             hp.uniform('weight_current_model', w_min, w_max)
         }

         def objective(param):
             # The ensemble has weight 1 during the search; only the
             # candidate's weight is tuned.
             return self.ensemble_selection_obj(
                 param, p_ens_list_valid_topk, 1., candidate_preds,
                 y_list_valid, cdf_list_valid, num_valid_matrix)

         best_params = fmin(objective,
                            param_space,
                            algo=tpe.suggest,
                            trials=trials,
                            max_evals=hypteropt_max_evals)
         # Rescale from "relative to 1" to "relative to w_ens".
         scaled_weight = best_params['weight_current_model'] * w_ens
         total_weight = w_ens + scaled_weight
         fold_kappas = np.zeros((config.n_runs, config.n_folds), dtype=float)
         for run, fold in np.ndindex(config.n_runs, config.n_folds):
             rows = slice(None, num_valid_matrix[run][fold])
             ensemble_part = p_ens_list_valid_topk[run, fold, rows]
             candidate_part = candidate_preds[run, fold, rows]
             blended = (w_ens * ensemble_part +
                        scaled_weight * candidate_part) / total_weight
             score = getScore(blended, cdf_list_valid[run, fold, :])
             fold_kappas[run][fold] = quadratic_weighted_kappa(
                 score, y_list_valid[run, fold, rows])
         mean_kappa = np.mean(fold_kappas)
         if mean_kappa > best_kappa:
             best_kappa = mean_kappa
             best_model = model
             best_weight = scaled_weight
     return best_kappa, best_model, best_weight
예제 #8
0
    def init_model_metrics_by_run_fold(self, feat_folder, cdf):
        """
        Load every model's validation predictions and score them per run/fold.

        Prediction files are read from
        ``<self.model_folder>/Run<r>/Fold<f>/pred/valid.pred.<model>.csv``
        and must provide the columns ``target`` and ``prediction``. Row
        counts, true labels and the cdf are taken from the first model only,
        on the assumption (stated in the original comments) that they are
        identical for all models. Predictions, scores and kappa are computed
        for every model.

        Reads ``self.model_list``, ``self.model2idx``, ``self.model_folder``
        and ``self.max_num_valid``.

        :param feat_folder: not used by this method
        :param cdf: cdf applied to every run/fold; when ``None`` the cdf is
            loaded from
            ``<config.solution_info>/Run<r>/Fold<f>/valid.cdf``
        :return: ``(kappa_list, pred_list_valid, y_list_valid,
            cdf_list_valid, num_valid_matrix)`` where ``kappa_list`` maps
            each model to its mean kappa over all run/fold combinations
        """
        kappa_list = dict()
        # [model, run, fold, row]: validation predictions.
        pred_list_valid = np.zeros((len(self.model_list), config.n_runs,
                                    config.n_folds, self.max_num_valid),
                                   dtype=float)
        # [run, fold, row]: true labels of the validation rows.
        y_list_valid = np.zeros(
            (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
        # [run, fold, class]: cdf used to turn predictions into scores.
        cdf_list_valid = np.zeros(
            (config.n_runs, config.n_folds, config.num_of_class), dtype=float)
        # [run, fold]: number of validation rows.
        num_valid_matrix = np.zeros((config.n_runs, config.n_folds), dtype=int)
        print("Load model...")
        for i, model in enumerate(self.model_list):
            print("model: %s" % model)
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    path = "%s/Run%d/Fold%d/pred" % (self.model_folder,
                                                     run + 1, fold + 1)
                    pred_file = "%s/valid.pred.%s.csv" % (path, model)
                    cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (
                        config.solution_info, run + 1, fold + 1)
                    this_p_valid = pd.read_csv(pred_file, dtype=float)
                    # Model-independent data: fill it in once.
                    if i == 0:
                        num_valid_matrix[run][fold] = this_p_valid.shape[0]
                        y_list_valid[run, fold, :num_valid_matrix[run]
                                     [fold]] = this_p_valid["target"].values
                        # "is None" rather than "== None": comparing an
                        # array with == yields an array, which cannot be
                        # used as a condition.
                        if cdf is None:
                            cdf_list_valid[run,
                                           fold, :] = np.loadtxt(cdf_file,
                                                                 dtype=float)
                        else:
                            cdf_list_valid[run, fold, :] = cdf
                    # Store this model's predictions for the run/fold.
                    pred_list_valid[
                        self.model2idx[model], run, fold, :this_p_valid.
                        shape[0]] = this_p_valid["prediction"].values
                    # Score every model, not just the first one; otherwise
                    # all later models would end up with a mean kappa of 0.
                    score = getScore(this_p_valid["prediction"].values,
                                     cdf_list_valid[run, fold, :])
                    kappa_cv[run][fold] = quadratic_weighted_kappa(
                        score,
                        y_list_valid[run,
                                     fold, :num_valid_matrix[run][fold]])
            print("kappa: %.6f" % np.mean(kappa_cv))
            # Mean kappa of this model over all run/fold combinations.
            kappa_list[model] = np.mean(kappa_cv)

        return kappa_list, pred_list_valid, y_list_valid, cdf_list_valid, num_valid_matrix