예제 #1
0
 def record_scores(self, X_test, y_test, metrics, results_path):
     """Append one line of XGB regression test scores to ``metric.txt``.

     Predictions come from ``self.predict(X_test, results_path)``. Each
     metric whose flag in ``metrics`` is truthy is computed and added to a
     single tab-separated line that is appended to
     ``<results_path>/model_scores/metric.txt``.

     Args:
         X_test: Test features, forwarded to ``self.predict``.
         y_test: Test targets, passed as the second argument to every
             metric helper.
         metrics: Mapping indexed with the keys ``'rmse'``, ``'mae'``,
             ``'r_squared'`` and ``'pearson_correlation'``; a truthy value
             enables the metric.
         results_path: Root results directory; ``/model_scores/`` below it
             must already exist.
     """
     models_scores_path = results_path + '/model_scores/'
     preds = self.predict(X_test, results_path)

     # Assemble the whole record first so that a metric helper raising
     # part-way cannot leave a truncated line in the file.
     # NOTE(review): helpers are called as (preds, y_test) here; confirm
     # that matches the argument order r2() expects.
     parts = ["XGB Regression\t"]
     if metrics['rmse']:
         parts.append("RMSE : " + str(rmse(preds, y_test)) + "\t")
     if metrics['mae']:
         parts.append("MAE : " + str(mae(preds, y_test)) + "\t")
     if metrics['r_squared']:
         parts.append("R^2 : " + str(r2(preds, y_test)) + "\t")
     if metrics['pearson_correlation']:
         # Only the first element of the helper's result is reported.
         pcorr = pearson_correlation(preds, y_test)
         parts.append("Pearson Correlation : " + str(pcorr[0]) + "\t")
     parts.append("\n")

     # The context manager closes the handle even if the write fails.
     with open(models_scores_path + "metric.txt", "a") as f:
         f.write("".join(parts))
예제 #2
0
    def record_scores(self, X_test, y_test, metrics, n_runs, results_path):
        """Score every saved linear model on the test set and keep the best.

        For each run ``n`` in ``range(n_runs)`` the predictions returned by
        ``self.predict(X_test, results_path, n)`` are scored with every
        metric enabled in ``metrics``. Scores are appended to
        ``<results_path>/model_scores/metric.txt`` (one tab-separated line
        per run) and written to ``linear_regression_results.xlsx`` in the
        same directory (one row per run, one column per enabled metric).
        The selected model is pickled to
        ``<results_path>/linear_models/linear_model_best.sav``.

        Args:
            X_test: Test features, forwarded to ``self.predict``.
            y_test: Test targets, passed as the first argument to every
                metric helper.
            metrics: Mapping indexed with the keys ``'rmse'``, ``'mae'``,
                ``'r_squared'`` and ``'pearson_correlation'``; a truthy
                value enables the metric.
            n_runs: Number of saved ``linear_model_<n>.sav`` files to score.
            results_path: Root results directory.

        Note:
            If no metric is enabled, or no score beats its starting value,
            ``None`` is what gets pickled as the best model.
        """
        models_scores_path = results_path + '/model_scores/'

        def pearson_coefficient(targets, predictions):
            # The helper returns a pair; only the coefficient is scored.
            coefficient, _ = pearson_correlation(targets, predictions)
            return coefficient

        # (metrics key, report label, scoring helper, lower is better?)
        metric_table = (
            ('rmse', "RMSE", rmse, True),
            ('mae', "MAE", mae, True),
            ('r_squared', "R^2", r2, False),
            ('pearson_correlation', "Pearson Correlation",
             pearson_coefficient, False),
        )

        # Error metrics improve downwards, so they start at +inf and are
        # compared with "<"; starting them at 0 with ">" would select the
        # model with the largest error.
        best_scores = {
            'rmse': float('inf'),
            'mae': float('inf'),
            'r_squared': 0,
            'pearson_correlation': 0,
        }
        # NOTE(review): one best_model is shared by all metrics, so with
        # several metrics enabled the last metric that improves decides the
        # winner - confirm this is the intended selection rule.
        best_model = None

        workbook = xlsxwriter.Workbook(
            models_scores_path + 'linear_regression_results.xlsx')
        worksheet = workbook.add_worksheet()
        worksheet.write(0, 0, "Model Name")

        with open(models_scores_path + "metric.txt", "a") as f:
            for n in range(n_runs):
                model_path = (results_path + '/linear_models/linear_model_'
                              + str(n) + '.sav')
                preds = self.predict(X_test, results_path, n)
                f.write("Linear Regression Model " + str(n) + "\t")
                row = n + 1
                worksheet.write(
                    row, 0, "Linear Regression Model " + str(n) + "\t")

                column = 0
                model = None
                model_loaded = False  # unpickle at most once per run
                for key, label, scorer, lower_is_better in metric_table:
                    if not metrics[key]:
                        continue
                    # Columns are allocated only to enabled metrics.
                    column += 1
                    if n == 0:
                        worksheet.write(0, column, label)
                    score = scorer(y_test, preds)
                    if not model_loaded:
                        # The pickle is produced by this project's training
                        # step; never point this at untrusted files.
                        with open(model_path, 'rb') as model_file:
                            model = pickle.load(model_file)
                        model_loaded = True
                    if lower_is_better:
                        improved = score < best_scores[key]
                    else:
                        improved = score > best_scores[key]
                    if improved:
                        best_scores[key] = score
                        best_model = model
                    f.write(label + " : " + str(score) + "\t")
                    worksheet.write(row, column, score)
                f.write("\n")

        filename = results_path + '/linear_models/linear_model_best.sav'
        with open(filename, 'wb') as best_file:
            pickle.dump(best_model, best_file)
        workbook.close()
    def __init__(self, X_train, y_train, X_val, y_val, gbm_params, n_runs,
                 results_path):
        """Grid-search GradientBoostingRegressor settings and save the winners.

        Creates ``<results_path>/gbm_models`` and, for each of ``n_runs``
        passes, fits one estimator per combination of the hyper-parameter
        lists in ``gbm_params``, scoring its predictions on ``X_val``
        against ``y_val`` with the configured selection metric. The best
        estimator of each pass is pickled to ``gbm_model_<n>.sav``; the best
        estimator overall is stored on ``self.gbm_model`` and pickled to
        ``gbm_model.sav``, with its parameters written to
        ``best_scores.txt``.

        Args:
            X_train: Training features passed to ``fit``.
            y_train: Training targets passed to ``fit``.
            X_val: Validation features passed to ``predict``.
            y_val: Validation targets, second argument of the metric helper.
            gbm_params: Mapping with iterable values under
                ``'learning_rate'``, ``'n_estimators'``,
                ``'min_samples_split'``, ``'min_samples_leaf'``,
                ``'max_depth'`` and ``'subsample'``, a single value under
                ``'max_features'``, and ``'model_selection_metric'`` set to
                ``"mae"``, ``"rmse"``, ``"r_squared"`` or
                ``"pearson_correlation"``.
            n_runs: Number of times the whole grid is searched.
            results_path: Root results directory.

        Raises:
            OSError: From ``os.mkdir`` (e.g. if ``gbm_models`` already
                exists).

        Note:
            An unrecognised metric name only prints a message for every
            combination; no model is selected and ``None`` is pickled.
        """
        # Local import: the file's import block is not part of this method.
        import itertools

        print("Training GBM Model...")
        os.mkdir(results_path + '/gbm_models')
        # Get params
        learning_rate_list = gbm_params['learning_rate']
        n_estimators_list = gbm_params['n_estimators']
        min_samples_split_list = gbm_params['min_samples_split']
        min_samples_leaf_list = gbm_params['min_samples_leaf']
        max_depth_list = gbm_params['max_depth']
        max_features = gbm_params['max_features']
        subsample_list = gbm_params['subsample']
        model_selection_metric = gbm_params['model_selection_metric']

        def pearson_coefficient(predictions, targets):
            # The helper returns a pair; only the coefficient is compared.
            coefficient, _ = pearson_correlation(predictions, targets)
            return coefficient

        # metric name -> (scoring helper, True when a lower score is better)
        scorers = {
            "mae": (mae, True),
            "rmse": (rmse, True),
            "r_squared": (r2, False),
            "pearson_correlation": (pearson_coefficient, False),
        }
        scorer, lower_is_better = scorers.get(
            model_selection_metric, (None, False))

        def improves(candidate, incumbent):
            if lower_is_better:
                return candidate < incumbent
            return candidate > incumbent

        # Errors start at +inf; R^2 / correlation start at 0.
        # NOTE(review): with the 0 floor, a grid whose every R^2 or
        # correlation is <= 0 selects no model at all - confirm intended.
        initial_score = float('inf') if lower_is_better else 0

        best_score = initial_score
        best_model = None
        best_model_params = {}

        # Same iteration order as six nested loops (last list fastest).
        grid = list(itertools.product(
            learning_rate_list, n_estimators_list, min_samples_split_list,
            min_samples_leaf_list, max_depth_list, subsample_list))

        for n in range(n_runs):
            best_score_run = initial_score
            best_model_run = None

            for (learning_rate, n_estimators, min_samples_split,
                 min_samples_leaf, max_depth, subsample) in grid:
                model_params = {
                    'learning_rate': learning_rate,
                    'n_estimators': n_estimators,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'max_depth': max_depth,
                    'subsample': subsample,
                    'max_features': max_features
                }
                gbm_estimator = GradientBoostingRegressor(**model_params)
                gbm_estimator.fit(X_train, y_train)
                preds = gbm_estimator.predict(X_val)

                if scorer is None:
                    print("Wrong model selection metric entered!")
                    continue

                score = scorer(preds, y_val)
                if improves(score, best_score_run):
                    best_score_run = score
                    best_model_run = gbm_estimator
                if improves(score, best_score):
                    best_score = score
                    best_model = gbm_estimator
                    best_model_params = model_params

            filename = results_path + '/gbm_models/gbm_model_' + str(
                n) + '.sav'
            with open(filename, 'wb') as model_file:
                pickle.dump(best_model_run, model_file)

        self.gbm_model = best_model
        filename = results_path + '/gbm_models/gbm_model.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(self.gbm_model, model_file)
        with open(results_path + '/gbm_models/best_scores.txt', 'w') as f:
            f.write(str(best_model_params))
        print("Training GBM Model completed.")
예제 #4
0
    def __init__(self, X_train, y_train, X_val, y_val, xgb_params,
                 results_path):
        """Grid-search XGBRegressor hyper-parameters and save the best model.

        Fits one ``xgb.XGBRegressor`` per combination of the hyper-parameter
        lists in ``xgb_params``, scores its predictions on ``X_val`` against
        ``y_val`` with the configured selection metric, stores the best
        estimator on ``self.xgb_model`` and pickles it to
        ``<results_path>/xgb_model.sav``.

        Args:
            X_train: Training features passed to ``fit``.
            y_train: Training targets passed to ``fit``.
            X_val: Validation features passed to ``predict``.
            y_val: Validation targets, second argument of the metric helper.
            xgb_params: Mapping with iterable values under
                ``'learning_rate'``, ``'n_estimators'``, ``'max_depth'``,
                ``'colsample_bytree'``, ``'gamma'``, ``'alpha'``,
                ``'lambda'`` and ``'subsample'``, plus
                ``'model_selection_metric'`` set to ``"mae"``, ``"rmse"``,
                ``"r_squared"`` or ``"pearson_correlation"``.
            results_path: Directory that receives ``xgb_model.sav``.

        Note:
            An unrecognised metric name only prints a message for every
            combination; no model is selected and ``None`` is pickled.
        """
        # Local import: the file's import block is not part of this method.
        import itertools

        print("Training XGBoost Model...")
        # Get params
        learning_rate_list = xgb_params['learning_rate']
        n_estimators_list = xgb_params['n_estimators']
        max_depth_list = xgb_params['max_depth']
        colsample_bytree_list = xgb_params['colsample_bytree']
        gamma_list = xgb_params['gamma']
        alpha_list = xgb_params['alpha']
        lambda_list = xgb_params['lambda']
        subsample_list = xgb_params['subsample']
        model_selection_metric = xgb_params['model_selection_metric']

        def pearson_coefficient(predictions, targets):
            # The helper returns a pair; comparing that pair with a number
            # raises TypeError, so only the coefficient is kept.
            coefficient, _ = pearson_correlation(predictions, targets)
            return coefficient

        # metric name -> (scoring helper, True when a lower score is better)
        scorers = {
            "mae": (mae, True),
            "rmse": (rmse, True),
            "r_squared": (r2, False),
            "pearson_correlation": (pearson_coefficient, False),
        }
        scorer, lower_is_better = scorers.get(
            model_selection_metric, (None, False))

        # Errors start at +inf; R^2 / correlation start at 0.
        # NOTE(review): with the 0 floor, a grid whose every R^2 or
        # correlation is <= 0 selects no model at all - confirm intended.
        best_score = float('inf') if lower_is_better else 0
        best_model = None

        # Same iteration order as eight nested loops (last list fastest).
        grid = itertools.product(
            learning_rate_list, n_estimators_list, colsample_bytree_list,
            gamma_list, max_depth_list, subsample_list, alpha_list,
            lambda_list)

        for (learning_rate, n_estimators, colsample_bytree, gamma,
             max_depth, subsample, alpha, lamda) in grid:
            xgb_estimator = xgb.XGBRegressor(
                objective='reg:squarederror',
                learning_rate=learning_rate,
                n_estimators=n_estimators,
                colsample_bytree=colsample_bytree,
                gamma=gamma,
                alpha=alpha,
                reg_lambda=lamda,
                max_depth=max_depth,
                subsample=subsample)
            xgb_estimator.fit(X_train, y_train)
            preds = xgb_estimator.predict(X_val)

            if scorer is None:
                print("Wrong model selection metric entered!")
                continue

            score = scorer(preds, y_val)
            if lower_is_better:
                improved = score < best_score
            else:
                improved = score > best_score
            if improved:
                best_score = score
                best_model = xgb_estimator

        self.xgb_model = best_model
        filename = results_path + '/xgb_model.sav'
        with open(filename, 'wb') as model_file:
            pickle.dump(self.xgb_model, model_file)
        print("Training XGBoost Model completed.")