Example #1
    def cv(self):
        start = time.time()
        if self.verbose:
            self.logger.info("=" * 50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            self._print_param_dict(self.learner.param_dict)
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")

        rmse_cv = np.zeros(self.n_iter)
        for i in range(self.n_iter):
            # data
            X_train, y_train, X_valid, y_valid = self.feature._get_train_valid_data(
                i)
            # fit
            self.learner.fit(X_train, y_train)
            y_pred = self.learner.predict(X_valid)
            rmse_cv[i] = dist_utils._rmse(y_valid, y_pred)
            # log
            self.logger.info("      {:>3}    {:>8}    {} x {}".format(
                i + 1, np.round(rmse_cv[i], 6), X_train.shape[0],
                X_train.shape[1]))
            # save
            fname = "%s/Run%d/valid.pred.%s.csv" % (config.OUTPUT_DIR, i + 1,
                                                    self.__str__())
            df = pd.DataFrame({"target": y_valid, "prediction": y_pred})
            df.to_csv(fname, index=False, columns=["target", "prediction"])
            if hasattr(self.learner.learner, "predict_proba"):
                y_proba = self.learner.learner.predict_proba(X_valid)
                fname = "%s/Run%d/valid.proba.%s.csv" % (config.OUTPUT_DIR,
                                                         i + 1, self.__str__())
                columns = ["proba%d" % i for i in range(y_proba.shape[1])]
                df = pd.DataFrame(y_proba, columns=columns)
                df["target"] = y_valid
                df.to_csv(fname, index=False)

        self.rmse_cv_mean = np.mean(rmse_cv)
        self.rmse_cv_std = np.std(rmse_cv)
        end = time.time()
        _sec = end - start
        _min = int(_sec / 60.)
        if self.verbose:
            self.logger.info("RMSE")
            self.logger.info("      Mean: %.6f" % self.rmse_cv_mean)
            self.logger.info("      Std: %.6f" % self.rmse_cv_std)
            self.logger.info("Time")
            if _min > 0:
                self.logger.info("      %d mins" % _min)
            else:
                self.logger.info("      %d secs" % _sec)
            self.logger.info("-" * 50)
        return self
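Every snippet on this page delegates the error metric to dist_utils._rmse, which is not defined here. A minimal sketch of what it presumably computes, assuming a plain element-wise root mean squared error over two equal-length arrays:

import numpy as np

def _rmse(y_true, y_pred):
    # Hypothetical stand-in for dist_utils._rmse: root mean squared error
    # between two equal-length numeric arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))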
Example #2
    def cv(self):
        start = time.time()
        if self.verbose:
            self.logger.info("=" * 50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            self._print_param_dict(self.learner.param_dict)
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")

        rmse_cv = np.zeros(self.n_iter)
        total_train = self.feature.len_train
        train_pred = np.zeros(total_train)
        for i, (X_train, y_train, X_valid, y_valid, train_ind,
                valid_ind) in enumerate(self.feature._get_train_valid_data()):
            # debug: peek at the first few rows of this fold
            print(X_train[:10])
            print(y_train[:10])
            # data
            #X_train, y_train, X_valid, y_valid = self.feature._get_train_valid_data(i)
            # fit
            self.learner.fit(X_train, y_train)
            y_pred = self.learner.predict(X_valid)
            train_pred[valid_ind] = y_pred
            rmse_cv[i] = dist_utils._rmse(y_valid, y_pred)
            # log
            self.logger.info("      {:>3}    {:>8}    {} x {}".format(
                i + 1, np.round(rmse_cv[i], 6), X_train.shape[0],
                X_train.shape[1]))

        # save the predictions of the final fold (the full out-of-fold
        # predictions accumulated above are kept in train_pred)
        fname = "%s/cv_pred.%s.csv" % (config.OUTPUT_DIR, self.__str__())
        df = pd.DataFrame({"click_id": y_valid, "predicted": y_pred})
        df.to_csv(fname, index=False, columns=["click_id", "predicted"])

        self.rmse_cv_mean = np.mean(rmse_cv)
        self.rmse_cv_std = np.std(rmse_cv)
        end = time.time()
        _sec = end - start
        _min = int(_sec / 60.)
        if self.verbose:
            self.logger.info("AUC")
            self.logger.info("      Mean: %.6f" % self.rmse_cv_mean)
            self.logger.info("      Std: %.6f" % self.rmse_cv_std)
            self.logger.info("Time")
            if _min > 0:
                self.logger.info("      %d mins" % _min)
            else:
                self.logger.info("      %d secs" % _sec)
            self.logger.info("-" * 50)
        return self
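Example #2 iterates over a generator returned by self.feature._get_train_valid_data(), which must yield the fold data together with the row indices so the out-of-fold predictions can be written back into train_pred. The real splitter is not shown on this page; a minimal sketch of such a generator, assuming a plain KFold split with the modern scikit-learn API:

import numpy as np
from sklearn.model_selection import KFold

def _get_train_valid_data(X, y, n_splits=5, seed=2018):
    # Hypothetical generator matching the interface used in Example #2:
    # yields (X_train, y_train, X_valid, y_valid, train_ind, valid_ind).
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_ind, valid_ind in kf.split(X):
        yield (X[train_ind], y[train_ind],
               X[valid_ind], y[valid_ind],
               train_ind, valid_ind)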
Example #3
    def cv(self):
        start = time.time()
        if self.verbose:
            self.logger.info("="*50)
            self.logger.info("Task")
            self.logger.info("      %s" % str(self.__str__()))
            self.logger.info("Param")
            for k,v in sorted(self.learner.param_dict.items()):
                self.logger.info("      %s: %s" % (k,v))
            self.logger.info("Result")
            self.logger.info("      Run      RMSE        Shape")
    
        rmse_cv = np.zeros(self.n_iter)
        for i in range(self.n_iter):
            # data
            X_train, y_train, X_valid, y_valid = self.feature._get_train_valid_data(i)
            # fit
            self.learner.fit(X_train, y_train)
            y_pred = self.learner.predict(X_valid)
            rmse_cv[i] = dist_utils._rmse(y_valid, y_pred)
            # log
            self.logger.info("      {:>3}    {:>8}    {} x {}".format(
                i+1, np.round(rmse_cv[i],6), X_train.shape[0], X_train.shape[1]))
            # save
            fname = "%s/Run%d/valid.pred.%s.csv"%(config.OUTPUT_DIR, i+1, self.__str__())
            df = pd.DataFrame({"target": y_valid, "prediction": y_pred})
            df.to_csv(fname, index=False, columns=["target", "prediction"])
            if hasattr(self.learner.learner, "predict_proba"):
                y_proba = self.learner.learner.predict_proba(X_valid)
                fname = "%s/Run%d/valid.proba.%s.csv"%(config.OUTPUT_DIR, i+1, self.__str__())
                columns = ["proba%d"%i for i in range(y_proba.shape[1])]
                df = pd.DataFrame(y_proba, columns=columns)
                df["target"] = y_valid
                df.to_csv(fname, index=False)

        self.rmse_cv_mean = np.mean(rmse_cv)
        self.rmse_cv_std = np.std(rmse_cv)
        end = time.time()
        _sec = end - start
        _min = int(_sec/60.)
        if self.verbose:
            self.logger.info("RMSE")
            self.logger.info("      Mean: %.6f"%self.rmse_cv_mean)
            self.logger.info("      Std: %.6f"%self.rmse_cv_std)
            self.logger.info("Time")
            if _min > 0:
                self.logger.info("      %d mins"%_min)
            else:
                self.logger.info("      %d secs"%_sec)
            self.logger.info("-"*50)
        return self
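Examples #1 and #2 call self._print_param_dict, whose effect Example #3 inlines. A sketch consistent with that inline loop:

def _print_param_dict(self, param_dict):
    # Log each hyperparameter on its own indented line, sorted by name;
    # this mirrors the loop that Example #3 writes out directly.
    for k, v in sorted(param_dict.items()):
        self.logger.info("      %s: %s" % (k, v))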
Example #4
 def _ens_obj_generic(self, weight2, p1_list, weight1, p2_list,
                      true_label_list, numBSTMatrix, bst_inst_idx):
     rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
     for run in range(config.N_RUNS):
         for fold in range(config.N_FOLDS):
             numBST = numBSTMatrix[run, fold]
             bidx = bst_inst_idx[run, fold, :numBST].tolist()
             p1 = p1_list[run, fold, bidx]
             p2 = p2_list[run, fold, bidx]
             true_label = true_label_list[run, fold, bidx]
             p_ens = self._merge_pred(weight1, p1, weight2, p2)
             rmse_cv[run, fold] = dist_utils._rmse(p_ens, true_label)
     rmse_mean = np.mean(rmse_cv)
     rmse_std = np.std(rmse_cv)
     return rmse_mean, rmse_std
Example #5
 def _ens_obj_generic(self, weight2, p1_list, weight1, p2_list,
                     true_label_list, numBSTMatrix, bst_inst_idx):
     rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
     for run in range(config.N_RUNS):
         for fold in range(config.N_FOLDS):
             numBST = numBSTMatrix[run,fold]
             bidx = bst_inst_idx[run,fold,:numBST].tolist()
             p1 = p1_list[run,fold,bidx]
             p2 = p2_list[run,fold,bidx]
             true_label = true_label_list[run,fold,bidx]
             p_ens = self._merge_pred(weight1, p1, weight2, p2)
             rmse_cv[run,fold] = dist_utils._rmse(p_ens, true_label)
     rmse_mean = np.mean(rmse_cv)
     rmse_std = np.std(rmse_cv)
     return rmse_mean, rmse_std
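_ens_obj_generic scores a candidate blend produced by self._merge_pred, which is not listed here. Given that the first weight passed in is the accumulated ensemble weight, it is presumably a normalized weighted average; a sketch under that assumption:

import numpy as np

def _merge_pred(w1, p1, w2, p2):
    # Hypothetical blend: weighted average of the current ensemble p1
    # (total weight w1) and the candidate predictions p2 (weight w2).
    # Assumes w1 + w2 > 0, which holds in the examples below.
    p1 = np.asarray(p1, dtype=float)
    p2 = np.asarray(p2, dtype=float)
    return (w1 * p1 + w2 * p2) / (w1 + w2)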
Example #6
 def _get_centroid_rmse(self, text1, text2):
     centroid1 = self._get_centroid_vector(text1)
     centroid2 = self._get_centroid_vector(text2)
     return dist_utils._rmse(centroid1, centroid2)
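Example #6 compares two texts by the RMSE between their centroid vectors. _get_centroid_vector is not shown on this page; a plausible sketch, assuming a word-embedding lookup held on self (self.model and self.vector_size are assumptions, e.g. a gensim KeyedVectors object):

import numpy as np

def _get_centroid_vector(self, text):
    # Hypothetical centroid: average the embeddings of all in-vocabulary
    # tokens; fall back to a zero vector if none are found.
    vectors = [self.model[w] for w in text.split() if w in self.model]
    if not vectors:
        return np.zeros(self.vector_size)
    return np.mean(vectors, axis=0)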
Example #7
    def go(self):

        ## initialization
        pred_list_valid = np.zeros((self.n_models, config.N_RUNS,
                                    config.N_FOLDS, config.VALID_SIZE_MAX),
                                   dtype=float)
        Y_list_valid = np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX),
            dtype=float)
        numValidMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
        p_ens_list_valid = np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX),
            dtype=float)

        bst_inst_idx = np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX),
            dtype=float)
        numBSTMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
        oob_inst_idx = np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX),
            dtype=float)
        numOOBMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)

        self.logger.info("Perform Extreme Ensemble Selection...")
        ## model index
        model_index_dict = dict(zip(self.model_list, range(self.n_models)))
        model_rmse_dict = dict(zip(self.model_list, [0] * self.n_models))
        self.logger.info("=" * 80)
        self.logger.info("Load model...")
        for model in self.model_list:
            self.logger.info("model: %s" % model)
            model_id = model_index_dict[model]
            rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            ## load model
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    path = "%s/Run%d" % (self.model_folder, run + 1)
                    pred_file = "%s/valid.pred.%s.csv" % (path, model)

                    this_p_valid = pd.read_csv(pred_file, dtype=float)
                    numValidMatrix[run, fold] = this_p_valid.shape[0]
                    numValid = numValidMatrix[run, fold]
                    this_target = this_p_valid["target"].values
                    this_p_valid = this_p_valid["prediction"].values
                    pred_list_valid[model_id, run, fold, :numValid] = np.clip(
                        this_p_valid, 1., 3.)
                    Y_list_valid[run, fold, :numValid] = this_target

                    ##
                    rmse_cv[run, fold] = dist_utils._rmse(
                        pred_list_valid[model_id, run, fold, :numValid],
                        Y_list_valid[run, fold, :numValid])

            self.logger.info("rmse: %.6f (%.6f)" %
                             (np.mean(rmse_cv), np.std(rmse_cv)))
            model_rmse_dict[model] = (np.mean(rmse_cv), np.std(rmse_cv))
        self.logger.info("%d models in total." % self.n_models)

        sorted_models = sorted(model_rmse_dict.items(), key=lambda x: x[1][0])

        # greedy ensemble
        self.logger.info("=" * 80)
        best_bagged_model_list = [[]] * self.bagging_size
        best_bagged_model_weight = [[]] * self.bagging_size
        score_valid_bag_mean = np.nan * np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX,
             self.bagging_size),
            dtype=float)
        rmse_cv_mean_mean_lst = [0] * self.bagging_size
        rmse_cv_mean_std_lst = [0] * self.bagging_size
        for bagging_iter in range(self.bagging_size):
            seed_model = self.random_seed + 100 * bagging_iter
            if not self.enable_extreme:
                this_sorted_models = self._pick_random_models(
                    sorted_models, seed_model)
            #### instance level subsampling
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    if self.inst_splitter is None:
                        # GENERAL APPROACH
                        seed_inst = self.random_seed + 1000 * bagging_iter + 100 * run + 10 * fold
                        rng_inst = np.random.RandomState(seed_inst)
                        numValid = numValidMatrix[run, fold]
                        if self.inst_subsample_replacement:
                            sss = StratifiedShuffleSplitReplacement(
                                Y_list_valid[run, fold, :numValid],
                                n_iter=1,
                                test_size=1. - self.inst_subsample,
                                random_state=seed_inst)
                            iidx, oidx = list(sss)[0]
                        else:
                            if self.inst_subsample < 1:
                                # plain ShuffleSplit (not stratified)
                                sss = ShuffleSplit(
                                    len(Y_list_valid[run, fold, :numValid]),
                                    n_iter=1,
                                    test_size=1. - self.inst_subsample,
                                    random_state=seed_inst)
                                iidx, oidx = list(sss)[0]
                            elif self.inst_subsample == 1:
                                # set iidx (training) the same as oidx (validation)
                                iidx = np.arange(numValid)
                                oidx = np.arange(numValid)
                    else:
                        iidx, oidx = self.inst_splitter[run]
                    numBSTMatrix[run, fold] = len(iidx)
                    bst_inst_idx[run, fold, :numBSTMatrix[run, fold]] = iidx
                    numOOBMatrix[run, fold] = len(oidx)
                    oob_inst_idx[run, fold, :numOOBMatrix[run, fold]] = oidx

            #print this_model_list
            best_model_list = []
            best_model_weight = []
            best_model_rmse = []
            best_rmse = 0
            best_rmse_std = 0
            best_model = None
            p_ens_list_valid_tmp = np.zeros(
                (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX),
                dtype=float)
            #### Technique: Ensemble Initialization
            iter = 0
            w_ens, this_w = 0.0, 1.0
            if self.init_top_k > 0:
                # self.logger.info("** Ensemble Initialization **")
                # init_top_k = min(init_top_k, num_model)
                rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS),
                                   dtype=float)
                for cnt in range(self.init_top_k):
                    iter += 1
                    start = time.time()
                    seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                    if self.enable_extreme:
                        this_sorted_models = self._pick_random_models(
                            sorted_models, seed_model)
                        best_model, (rmse, rmse_std) = this_sorted_models[0]
                    else:
                        best_model, (rmse, rmse_std) = this_sorted_models[cnt]
                    this_p_list_valid = pred_list_valid[
                        model_index_dict[best_model]]
                    for run in range(config.N_RUNS):
                        for fold in range(config.N_FOLDS):
                            numValid = numValidMatrix[run, fold]
                            numBST = numBSTMatrix[run, fold]
                            bidx = bst_inst_idx[run, fold, :numBST].tolist()
                            p_ens_list_valid_tmp[
                                run, fold, :numValid] = self._merge_pred(
                                    w_ens,
                                    p_ens_list_valid_tmp[run, fold, :numValid],
                                    this_w, this_p_list_valid[run,
                                                              fold, :numValid])
                            true_label = Y_list_valid[run, fold, bidx]
                            rmse_cv[run, fold] = dist_utils._rmse(
                                p_ens_list_valid_tmp[run, fold, bidx],
                                true_label)
                    end = time.time()
                    best_weight = this_w
                    best_rmse = np.mean(rmse_cv)
                    best_rmse_std = np.std(rmse_cv)

                    self.logger.info("Iter: %d (%.2fs)" % (iter,
                                                           (end - start)))
                    self.logger.info("     model: %s" % best_model)
                    self.logger.info("     weight: %s" % best_weight)
                    self.logger.info("     rmse: %.6f (%.6f)" %
                                     (best_rmse, best_rmse_std))

                    best_model_list.append(best_model)
                    best_model_weight.append(best_weight)
                    w_ens += best_weight

            #### Technique: Ensemble Selection with Replacement
            while True:
                iter += 1
                seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                if self.enable_extreme:
                    this_sorted_models = self._pick_random_models(
                        sorted_models, seed_model)
                if self.multiprocessing:
                    start = time.time()
                    models_tmp = [
                        model for model, (_, _) in this_sorted_models
                    ]
                    best_trial_rmse_mean_lst, best_trial_rmse_std_lst, model_lst, this_w_lst = \
                        zip(*Parallel(n_jobs=self.multiprocessing_num_cores)(
                            delayed(self._find_optim_weight_scipy)(
                                p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, numBSTMatrix,
                                bst_inst_idx, w_ens, model_index_dict, m
                                ) for m in models_tmp
                            ))
                    ##
                    ind_best = np.argmin(best_trial_rmse_mean_lst)
                    best_trial_rmse_mean = best_trial_rmse_mean_lst[ind_best]
                    best_trial_rmse_std = best_trial_rmse_std_lst[ind_best]
                    model = model_lst[ind_best]
                    this_w = this_w_lst[ind_best]
                    if best_trial_rmse_mean < best_rmse:
                        best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                        best_model, best_weight = model, this_w
                    end = time.time()
                else:
                    start = time.time()
                    for model, (_, _) in this_sorted_models:
                        best_trial_rmse_mean, best_trial_rmse_std, model, this_w = \
                            self._find_optim_weight_scipy(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, numBSTMatrix,
                            bst_inst_idx, w_ens, model_index_dict, model)
                        if best_trial_rmse_mean < best_rmse:
                            best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                            best_model, best_weight = model, this_w
                    end = time.time()
                if best_model is None:
                    break
                if len(best_model_rmse) > 1 and (
                        best_model_rmse[-1] - best_rmse < self.epsilon):
                    break

                ##
                self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                self.logger.info("     model: %s" % best_model)
                self.logger.info("     weight: %s" % best_weight)
                self.logger.info("     rmse: %.6f (%.6f)" %
                                 (best_rmse, best_rmse_std))

                # valid
                this_p_list_valid = pred_list_valid[
                    model_index_dict[best_model]]
                pred_raw_list = []
                true_label_list = []
                for run in range(config.N_RUNS):
                    for fold in range(config.N_FOLDS):
                        numValid = numValidMatrix[run, fold]
                        numBST = numBSTMatrix[run, fold]
                        bidx = bst_inst_idx[run, fold, :numBST].tolist()
                        p_ens_list_valid_tmp[
                            run, fold, :numValid] = self._merge_pred(
                                w_ens, p_ens_list_valid_tmp[run,
                                                            fold, :numValid],
                                best_weight,
                                this_p_list_valid[run, fold, :numValid])

                        pred_raw_list.append(p_ens_list_valid_tmp[run, fold,
                                                                  bidx])
                        true_label_list.append(Y_list_valid[run, fold, bidx])

                best_model_list.append(best_model)
                best_model_weight.append(best_weight)
                best_model_rmse.append(best_rmse)

                best_model = None
                w_ens += best_weight

            ## compute OOB score
            rmse_cv_mean = np.zeros((config.N_RUNS, config.N_FOLDS),
                                    dtype=float)
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    numValid = numValidMatrix[run, fold]
                    true_label = Y_list_valid[run, fold, :numValid]
                    numOOB = numOOBMatrix[run, fold]
                    oidx = oob_inst_idx[run, fold, :numOOB].tolist()
                    pred_raw = p_ens_list_valid_tmp[run, fold, oidx]
                    ## mean
                    score_valid_bag_mean[run, fold, oidx,
                                         bagging_iter] = pred_raw
                    pred_mean = np_utils._array_mean(
                        score_valid_bag_mean[run,
                                             fold, :numValid, :(bagging_iter +
                                                                1)])
                    non_nan_idx = pred_mean != config.MISSING_VALUE_NUMERIC
                    rmse_cv_mean[run, fold] = dist_utils._rmse(
                        pred_mean[non_nan_idx], true_label[non_nan_idx])
            self.logger.info("-" * 80)
            self.logger.info("Bag: %d" % (bagging_iter + 1))
            self.logger.info("rmse-mean: %.6f (%.6f)" %
                             (np.mean(rmse_cv_mean), np.std(rmse_cv_mean)))
            self.logger.info("-" * 80)

            best_bagged_model_list[bagging_iter] = best_model_list
            best_bagged_model_weight[bagging_iter] = best_model_weight

            ## save the current prediction
            mr = "R" + str(self.model_subsample_replacement).upper()[0]
            ir = "R" + str(self.inst_subsample_replacement).upper()[0]
            ## mean
            best_rmse_mean = np.mean(rmse_cv_mean)
            best_rmse_std = np.std(rmse_cv_mean)
            output = self._ens_predict(
                best_bagged_model_list[:(bagging_iter + 1)],
                best_bagged_model_weight[:(bagging_iter + 1)])
            sub_file = "%s_[MS%.2f_%s]_[IS%.2f_%s]_[Top%d]_[Bag%d]_[Mean%.6f]_[Std%.6f].mean.csv" % (
                self.subm_prefix, self.model_subsample, mr,
                self.inst_subsample, ir, self.init_top_k, bagging_iter + 1,
                best_rmse_mean, best_rmse_std)
            output.to_csv(sub_file, index=False)
            rmse_cv_mean_mean_lst[bagging_iter] = best_rmse_mean
            rmse_cv_mean_std_lst[bagging_iter] = best_rmse_std

            ## plot OOB score
            x = np.arange(1, bagging_iter + 2, 1)
            label = "Mean (Best = %.6f, Bag = %d)" % (
                np.min(rmse_cv_mean_mean_lst[:(bagging_iter + 1)]),
                np.argmin(rmse_cv_mean_mean_lst[:(bagging_iter + 1)]) + 1)
            plt.errorbar(x,
                         rmse_cv_mean_mean_lst[:(bagging_iter + 1)],
                         yerr=rmse_cv_mean_std_lst[:(bagging_iter + 1)],
                         fmt='-o',
                         label=label)
            plt.xlim(1, self.bagging_size)
            plt.title("Extreme Ensemble Selection RMSE")
            plt.xlabel("Bag")
            plt.ylabel("CV/OOB RMSE")
            plt.legend(loc="upper right")
            fig_file = "%s/ensemble_selection_%d.pdf" % (config.FIG_DIR,
                                                         bagging_iter + 1)
            plt.savefig(fig_file)
            plt.clf()
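The selection loop above searches, for each candidate model, the weight that minimizes the fold-averaged RMSE computed by _ens_obj_generic (Examples #4 and #5). The original _find_optim_weight_scipy is not shown; the sketch below is one way it could wrap scipy.optimize.minimize_scalar, with the bounded search over [0, 1] being an assumption rather than the original implementation:

from scipy.optimize import minimize_scalar

def _find_optim_weight_scipy(self, p_ens_list_valid_tmp, pred_list_valid,
                             Y_list_valid, numBSTMatrix, bst_inst_idx,
                             w_ens, model_index_dict, model):
    # Optimize the candidate model's weight against the current ensemble.
    this_p = pred_list_valid[model_index_dict[model]]

    def obj(w):
        rmse_mean, _ = self._ens_obj_generic(
            w, p_ens_list_valid_tmp, w_ens, this_p,
            Y_list_valid, numBSTMatrix, bst_inst_idx)
        return rmse_mean

    res = minimize_scalar(obj, bounds=(0., 1.), method="bounded")
    best_w = float(res.x)
    rmse_mean, rmse_std = self._ens_obj_generic(
        best_w, p_ens_list_valid_tmp, w_ens, this_p,
        Y_list_valid, numBSTMatrix, bst_inst_idx)
    return rmse_mean, rmse_std, model, best_w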
Example #8
 def _get_rmse(self, sent1, sent2):
     vect1 = self._get_vector(sent1)
     vect2 = self._get_vector(sent2)
     return dist_utils._rmse(vect1, vect2)
Example #9
    def go(self):

        ## initialization
        pred_list_valid = np.zeros((self.n_models, config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        Y_list_valid = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        numValidMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
        p_ens_list_valid = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)

        bst_inst_idx = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        numBSTMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
        oob_inst_idx = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        numOOBMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)

        self.logger.info("Perform Extreme Ensemble Selection...")
        ## model index
        model_index_dict = dict(zip(self.model_list, range(self.n_models)))
        model_rmse_dict = dict(zip(self.model_list, [0]*self.n_models))
        self.logger.info("="*80)
        self.logger.info("Load model...")
        for model in self.model_list:
            self.logger.info("model: %s" % model)
            model_id = model_index_dict[model]
            rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            ## load model
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    path = "%s/Run%d" % (self.model_folder, run+1)
                    pred_file = "%s/valid.pred.%s.csv" % (path, model)

                    this_p_valid = pd.read_csv(pred_file, dtype=float)
                    numValidMatrix[run,fold] = this_p_valid.shape[0]
                    numValid = numValidMatrix[run,fold]
                    this_target = this_p_valid["target"].values
                    this_p_valid = this_p_valid["prediction"].values
                    pred_list_valid[model_id,run,fold,:numValid] = np.clip(this_p_valid, 1., 3.)
                    Y_list_valid[run,fold,:numValid] = this_target

                    ##
                    rmse_cv[run,fold] = dist_utils._rmse(pred_list_valid[model_id,run,fold,:numValid], 
                                                            Y_list_valid[run,fold,:numValid])     

            self.logger.info("rmse: %.6f (%.6f)" % (np.mean(rmse_cv), np.std(rmse_cv)))
            model_rmse_dict[model] = (np.mean(rmse_cv), np.std(rmse_cv))
        self.logger.info("%d models in total." % self.n_models)

        sorted_models = sorted(model_rmse_dict.items(), key=lambda x: x[1][0])
            
        # greedy ensemble
        self.logger.info("="*80)
        best_bagged_model_list = [[]]*self.bagging_size
        best_bagged_model_weight = [[]]*self.bagging_size
        score_valid_bag_mean = np.nan * np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX, self.bagging_size), dtype=float)
        rmse_cv_mean_mean_lst = [0]*self.bagging_size
        rmse_cv_mean_std_lst = [0]*self.bagging_size
        for bagging_iter in range(self.bagging_size):
            seed_model = self.random_seed + 100 * bagging_iter
            if not self.enable_extreme:
                this_sorted_models = self._pick_random_models(sorted_models, seed_model)
            #### instance level subsampling
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    if self.inst_splitter is None:
                        # GENERAL APPROACH
                        seed_inst = self.random_seed + 1000 * bagging_iter + 100 * run + 10 * fold
                        rng_inst = np.random.RandomState(seed_inst)
                        numValid = numValidMatrix[run,fold]
                        if self.inst_subsample_replacement:
                            sss = StratifiedShuffleSplitReplacement(Y_list_valid[run,fold,:numValid], n_iter=1,
                                test_size=1.-self.inst_subsample, random_state=seed_inst)
                            iidx, oidx = list(sss)[0]
                        else:
                            if self.inst_subsample < 1:
                                # plain ShuffleSplit (not stratified)
                                sss = ShuffleSplit(len(Y_list_valid[run,fold,:numValid]), n_iter=1,
                                    test_size=1.-self.inst_subsample, random_state=seed_inst)
                                iidx, oidx = list(sss)[0]
                            elif self.inst_subsample == 1:
                                # set iidx (training) the same as oidx (validation)
                                iidx = np.arange(numValid)
                                oidx = np.arange(numValid)
                    else:
                        iidx, oidx = self.inst_splitter[run]
                    numBSTMatrix[run,fold] = len(iidx)
                    bst_inst_idx[run,fold,:numBSTMatrix[run,fold]] = iidx
                    numOOBMatrix[run,fold] = len(oidx)
                    oob_inst_idx[run,fold,:numOOBMatrix[run,fold]] = oidx

            #print this_model_list
            best_model_list = []
            best_model_weight = []
            best_model_rmse = []
            best_rmse = 0
            best_rmse_std = 0
            best_model = None
            p_ens_list_valid_tmp = np.zeros((config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
            #### Technique: Ensemble Initialization
            iter = 0
            w_ens, this_w = 0.0, 1.0
            if self.init_top_k > 0:
                # self.logger.info("** Ensemble Initialization **")
                # init_top_k = min(init_top_k, num_model)
                rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
                for cnt in range(self.init_top_k):
                    iter += 1
                    start = time.time()
                    seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                    if self.enable_extreme:
                        this_sorted_models = self._pick_random_models(sorted_models, seed_model)
                        best_model,(rmse,rmse_std) = this_sorted_models[0]
                    else:
                        best_model,(rmse,rmse_std) = this_sorted_models[cnt]
                    this_p_list_valid = pred_list_valid[model_index_dict[best_model]]
                    for run in range(config.N_RUNS):
                        for fold in range(config.N_FOLDS):
                            numValid = numValidMatrix[run,fold]
                            numBST = numBSTMatrix[run,fold]
                            bidx = bst_inst_idx[run,fold,:numBST].tolist()
                            p_ens_list_valid_tmp[run,fold,:numValid] = self._merge_pred(
                                w_ens, p_ens_list_valid_tmp[run,fold,:numValid], 
                                this_w, this_p_list_valid[run,fold,:numValid])
                            true_label = Y_list_valid[run,fold,bidx]
                            rmse_cv[run,fold] = dist_utils._rmse(p_ens_list_valid_tmp[run,fold,bidx], true_label)
                    end = time.time()
                    best_weight = this_w
                    best_rmse = np.mean(rmse_cv)
                    best_rmse_std = np.std(rmse_cv)

                    self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                    self.logger.info("     model: %s" % best_model)
                    self.logger.info("     weight: %s" % best_weight)
                    self.logger.info("     rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))

                    best_model_list.append(best_model)
                    best_model_weight.append(best_weight)
                    w_ens += best_weight

            #### Technique: Ensemble Selection with Replacement
            while True:
                iter += 1
                seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                if self.enable_extreme:
                    this_sorted_models = self._pick_random_models(sorted_models, seed_model)
                if self.multiprocessing:
                    start = time.time()
                    models_tmp = [model for model,(_,_) in this_sorted_models]
                    best_trial_rmse_mean_lst, best_trial_rmse_std_lst, model_lst, this_w_lst = \
                        zip(*Parallel(n_jobs=self.multiprocessing_num_cores)(
                            delayed(self._find_optim_weight_scipy)(
                                p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, numBSTMatrix, 
                                bst_inst_idx, w_ens, model_index_dict, m
                                ) for m in models_tmp
                            ))
                    ##
                    ind_best = np.argmin(best_trial_rmse_mean_lst)
                    best_trial_rmse_mean = best_trial_rmse_mean_lst[ind_best]
                    best_trial_rmse_std = best_trial_rmse_std_lst[ind_best]
                    model = model_lst[ind_best]
                    this_w = this_w_lst[ind_best]
                    if best_trial_rmse_mean < best_rmse:
                        best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                        best_model, best_weight = model, this_w
                    end = time.time()
                else:
                    start = time.time()
                    for model,(_,_) in this_sorted_models:
                        best_trial_rmse_mean, best_trial_rmse_std, model, this_w = \
                            self._find_optim_weight_scipy(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid, numBSTMatrix, 
                            bst_inst_idx, w_ens, model_index_dict, model)
                        if best_trial_rmse_mean < best_rmse:
                            best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                            best_model, best_weight = model, this_w
                    end = time.time()
                if best_model is None:
                    break
                if len(best_model_rmse) > 1 and (best_model_rmse[-1] - best_rmse < self.epsilon):
                    break

                ##
                self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                self.logger.info("     model: %s" % best_model)
                self.logger.info("     weight: %s" % best_weight)
                self.logger.info("     rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))
                
                # valid
                this_p_list_valid = pred_list_valid[model_index_dict[best_model]]
                pred_raw_list = []
                true_label_list = []
                for run in range(config.N_RUNS):
                    for fold in range(config.N_FOLDS):
                        numValid = numValidMatrix[run,fold]
                        numBST = numBSTMatrix[run,fold]
                        bidx = bst_inst_idx[run,fold,:numBST].tolist()
                        p_ens_list_valid_tmp[run,fold,:numValid] = self._merge_pred(
                            w_ens, p_ens_list_valid_tmp[run,fold,:numValid], 
                            best_weight, this_p_list_valid[run,fold,:numValid])

                        pred_raw_list.append( p_ens_list_valid_tmp[run,fold,bidx] )
                        true_label_list.append( Y_list_valid[run,fold,bidx] )

                best_model_list.append(best_model)
                best_model_weight.append(best_weight)
                best_model_rmse.append(best_rmse)

                best_model = None
                w_ens += best_weight
            
            ## compute OOB score
            rmse_cv_mean = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    numValid = numValidMatrix[run,fold]
                    true_label = Y_list_valid[run,fold,:numValid]
                    numOOB = numOOBMatrix[run,fold]
                    oidx = oob_inst_idx[run,fold,:numOOB].tolist()                
                    pred_raw = p_ens_list_valid_tmp[run,fold,oidx]
                    ## mean
                    score_valid_bag_mean[run,fold,oidx,bagging_iter] = pred_raw
                    pred_mean = np_utils._array_mean(score_valid_bag_mean[run,fold,:numValid,:(bagging_iter+1)])
                    non_nan_idx = pred_mean != config.MISSING_VALUE_NUMERIC
                    rmse_cv_mean[run,fold] = dist_utils._rmse(pred_mean[non_nan_idx], true_label[non_nan_idx])
            self.logger.info("-"*80)
            self.logger.info( "Bag: %d"% (bagging_iter+1))
            self.logger.info( "rmse-mean: %.6f (%.6f)" % (np.mean(rmse_cv_mean), np.std(rmse_cv_mean)))
            self.logger.info("-"*80)

            best_bagged_model_list[bagging_iter] = best_model_list
            best_bagged_model_weight[bagging_iter] = best_model_weight

            ## save the current prediction
            mr = "R" + str(self.model_subsample_replacement).upper()[0]
            ir = "R" + str(self.inst_subsample_replacement).upper()[0]
            ## mean
            best_rmse_mean = np.mean(rmse_cv_mean)
            best_rmse_std = np.std(rmse_cv_mean)
            output = self._ens_predict(best_bagged_model_list[:(bagging_iter+1)], 
                best_bagged_model_weight[:(bagging_iter+1)])
            sub_file = "%s_[MS%.2f_%s]_[IS%.2f_%s]_[Top%d]_[Bag%d]_[Mean%.6f]_[Std%.6f].mean.csv" % (
                self.subm_prefix, self.model_subsample, mr, self.inst_subsample, ir, 
                self.init_top_k, bagging_iter+1, best_rmse_mean, best_rmse_std)
            output.to_csv(sub_file, index=False)
            rmse_cv_mean_mean_lst[bagging_iter] = best_rmse_mean
            rmse_cv_mean_std_lst[bagging_iter] = best_rmse_std

            ## plot OOB score
            x = np.arange(1,bagging_iter+2,1)
            label = "Mean (Best = %.6f, Bag = %d)"%(
                    np.min(rmse_cv_mean_mean_lst[:(bagging_iter+1)]), 
                    np.argmin(rmse_cv_mean_mean_lst[:(bagging_iter+1)])+1)
            plt.errorbar(x, rmse_cv_mean_mean_lst[:(bagging_iter+1)], 
                yerr=rmse_cv_mean_std_lst[:(bagging_iter+1)], 
                fmt='-o', label=label)
            plt.xlim(1, self.bagging_size)
            plt.title("Extreme Ensemble Selection RMSE")
            plt.xlabel("Bag")
            plt.ylabel("CV/OOB RMSE")
            plt.legend(loc="upper right")
            fig_file = "%s/ensemble_selection_%d.pdf"%(config.FIG_DIR, bagging_iter+1)
            plt.savefig(fig_file)
            plt.clf()
Example #10
 def _get_centroid_rmse(self, text1, text2):
     centroid1 = self._get_centroid_vector(text1)
     centroid2 = self._get_centroid_vector(text2)
     return dist_utils._rmse(centroid1, centroid2)
Example #11
 def _get_rmse(self, sent1, sent2):
     vect1 = self._get_vector(sent1)
     vect2 = self._get_vector(sent2)
     return dist_utils._rmse(vect1, vect2)
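Examples #8 and #11 take the RMSE between two fixed-length sentence vectors. _get_vector is not defined on this page; one plausible sketch, assuming a vectorizer fitted elsewhere and stored on the instance (self.vectorizer, e.g. a scikit-learn CountVectorizer, is an assumption):

import numpy as np

def _get_vector(self, sent):
    # Hypothetical: map one sentence to a dense fixed-length vector with a
    # pre-fitted bag-of-words vectorizer; any fixed-dimensional
    # representation would work as input to dist_utils._rmse.
    return np.asarray(self.vectorizer.transform([sent]).todense()).ravel()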