    def __init__(self, X_train, y_train, X_val, y_val, n_runs, ext_params, results_path):
        print("Training Extra Trees Classification Model...")
        if not os.path.exists(results_path + '/extra_trees_models'):
            os.mkdir(results_path + '/extra_trees_models')
        # Get params
        n_estimators_list = ext_params['n_estimators']
        min_samples_split_list = ext_params['min_samples_split']
        min_samples_leaf_list = ext_params['min_samples_leaf']
        max_features = ext_params['max_features']
        model_selection_metric = ext_params['model_selection_metric']

        best_f1 = 0
        best_roc = 0
        best_auc = 0
        best_acc = 0
        best_model = None

        for n in range(n_runs):

            best_f1_run = 0
            best_roc_run = 0
            best_auc_run = 0
            best_acc_run = 0
            best_model_run = None

            for n_estimators in n_estimators_list:
                for min_samples_split in min_samples_split_list:
                    for min_samples_leaf in min_samples_leaf_list:
                        ext_estimator = ExtraTreesClassifier(
                            n_estimators=n_estimators,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            max_features=max_features)
                        ext_estimator.fit(X_train, y_train)
                        preds = ext_estimator.predict(X_val)

                        if model_selection_metric == "f1":
                            f = f1(y_val, preds)
                            best_f1_run, best_model_run = (f, ext_estimator) if f > best_f1_run else (best_f1_run, best_model_run)
                            best_f1, best_model = (f, ext_estimator) if f > best_f1 else (best_f1, best_model)
                        elif model_selection_metric == "roc":
                            r = roc(y_val, preds)
                            best_roc_run, best_model_run = (r, ext_estimator) if r > best_roc_run else (best_roc_run, best_model_run)
                            best_roc, best_model = (r, ext_estimator) if r > best_roc else (best_roc, best_model)
                        elif model_selection_metric == "auc":
                            au = auc(y_val, preds)
                            best_auc_run, best_model_run = (au, ext_estimator) if au > best_auc_run else (best_auc_run, best_model_run)
                            best_auc, best_model = (au, ext_estimator) if au > best_auc else (best_auc, best_model)
                        elif model_selection_metric == "accuracy":
                            acc = accuracy(y_val, preds)
                            best_acc_run, best_model_run = (acc, ext_estimator) if acc > best_acc_run else (best_acc_run, best_model_run)
                            best_acc, best_model = (acc, ext_estimator) if acc > best_acc else (best_acc, best_model)
                        else:
                            print("Wrong model selection metric entered!")

            filename = results_path + '/extra_trees_models/ext_model_' + str(n) + '.sav'
            pickle.dump(best_model_run, open(filename, 'wb'))

        self.ext_model = best_model
        filename = results_path + '/extra_trees_models/ext_model.sav'
        pickle.dump(self.ext_model, open(filename, 'wb'))
        print("Training Extra Trees Classification Model completed.")
    def record_scores(self, X_test, y_test, n_runs, metrics, results_path):
        models_scores_path = results_path + '/model_scores/'

        workbook = xlsxwriter.Workbook(models_scores_path+'extra_trees_results.xlsx')
        worksheet = workbook.add_worksheet()

        row, column = 0, 0
        worksheet.write(row, column, "Model Name")

        f = open(models_scores_path+"metric.txt", "a")
        for n in range(n_runs):
            preds = self.predict(X_test, results_path, model_name=n)
            row = n + 1
            worksheet.write(row, 0, "Extra Trees Model " + str(n)+ "\t")
            f.write("Extra Trees Classifier "+ str(n) + " \t")
            column = 0
            if metrics['f1']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "F1")
                f1_sc = f1(y_test, preds)
                f.write("F1 score : " + str(f1) + "\t")
                worksheet.write(row, column, f1_sc)
            if metrics['accuracy']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "Accuracy")
                acc = accuracy(y_test, preds)
                f.write("Accuracy : " + str(acc) + "\t")
                worksheet.write(row, column, acc)
            if metrics['roc']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "ROC")
                roc_curve = roc(y_test, preds)
                f.write("ROC : " + str(roc_curve) + "\t")
                worksheet.write(row, column, roc_curve)
            if metrics['auc']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "AUC")
                auroc = auc(y_test, preds)
                f.write("Area under ROC : " + str(auroc) + "\t")
                worksheet.write(row, column, auroc)
            f.write("\n")
        f.close()

        workbook.close()
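    # record_scores above relies on a predict(X, results_path, model_name)
    # helper that is not part of these snippets. A minimal sketch of what it
    # plausibly does, assuming the per-run pickles written by __init__ and the
    # pickle import used elsewhere in this file (hypothetical, illustration only):
    def predict(self, X, results_path, model_name=None):
        # model_name is a run index selecting a per-run model;
        # None selects the final model saved by __init__.
        if model_name is None:
            filename = results_path + '/extra_trees_models/ext_model.sav'
        else:
            filename = results_path + '/extra_trees_models/ext_model_' + str(model_name) + '.sav'
        with open(filename, 'rb') as model_file:
            model = pickle.load(model_file)
        return model.predict(X)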
    def record_scores(self, X_test, y_test, metrics, n_runs,
                      max_display_features, feature_names, results_path):
        models_scores_path = results_path + '/model_scores/'

        best_f1 = 0
        best_acc = 0
        best_roc = 0
        best_auc = 0
        best_bal_acc = 0
        best_model = None

        workbook = xlsxwriter.Workbook(models_scores_path + 'gbm_results.xlsx')
        worksheet = workbook.add_worksheet()

        row, column = 0, 0
        worksheet.write(row, column, "Model Name")

        f = open(models_scores_path + "metric.txt", "a")
        for n in range(n_runs):
            model_path = results_path + '/gbm_models/gbm_model_' + str(
                n) + '.sav'
            preds = self.predict(X_test, results_path, n)
            f.write("GBM Model " + str(n) + "\t")
            row = n + 1
            worksheet.write(row, 0, "GBM Model " + str(n) + "\t")

            column = 0

            if metrics['f1']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "F1")
                f1_sc = f1(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_f1, best_model = (f1_sc, model) if f1_sc > best_f1 else (
                    best_f1, best_model)
                f.write("F1 score : " + str(f1_sc) + "\t")
                worksheet.write(row, column, f1_sc)
            if metrics['accuracy']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "Accuracy")
                acc = accuracy(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_acc, best_model = (acc, model) if acc > best_acc else (
                    best_acc, best_model)
                f.write("Accuracy : " + str(acc) + "\t")
                worksheet.write(row, column, acc)
            if metrics['balanced_accuracy']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "Balanced Accuracy")
                bal_acc = balanced_accuracy(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_bal_acc, best_model = (
                    bal_acc, model) if bal_acc > best_bal_acc else (best_bal_acc,
                                                                    best_model)
                f.write("Balanced Accuracy : " + str(bal_acc) + "\t")
                worksheet.write(row, column, bal_acc)
            if metrics['roc']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "ROC")
                roc_curve = roc(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_roc, best_model = (
                    roc_curve, model) if roc_curve > best_roc else (best_roc,
                                                                    best_model)
                f.write("ROC : " + str(roc_curve) + "\t")
                worksheet.write(row, column, roc_curve)
            if metrics['auc']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "AUC")
                auroc = auc(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_auc, best_model = (
                    auroc, model) if auroc > best_auc else (best_auc,
                                                            best_model)
                f.write("AUC : " + str(auc) + "\t")
                worksheet.write(row, column, auc)
            f.write("\n")
        f.close()
        workbook.close()
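# The metric helpers f1, accuracy, balanced_accuracy, roc and auc used
# throughout are not included in these snippets. A minimal sketch, assuming
# thin wrappers around sklearn.metrics; note that roc and auc must return
# scalars here, since every caller compares them with ">". The exact original
# definitions may differ:
from sklearn.metrics import (f1_score, accuracy_score,
                             balanced_accuracy_score, roc_auc_score)


def f1(y_true, y_pred):
    return f1_score(y_true, y_pred)


def accuracy(y_true, y_pred):
    return accuracy_score(y_true, y_pred)


def balanced_accuracy(y_true, y_pred):
    return balanced_accuracy_score(y_true, y_pred)


def roc(y_true, y_pred):
    # scalar ROC-AUC-style score, as required by the ">" comparisons above
    return roc_auc_score(y_true, y_pred)


def auc(y_true, y_pred):
    return roc_auc_score(y_true, y_pred)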
    def __init__(self, x_train, y_train, x_val, y_val, gbm_params, n_runs,
                 results_path):
        self.explainer = None
        y_train = np.squeeze(np.array(y_train))
        y_val = np.squeeze(np.array(y_val))
        print("Training GBM Model...")
        if not os.path.exists(results_path + '/gbm_models'):
            os.mkdir(results_path + '/gbm_models')
        # Get params
        learning_rate_list = gbm_params['learning_rate']
        n_estimators_list = gbm_params['n_estimators']
        min_samples_split_list = gbm_params['min_samples_split']
        min_samples_leaf_list = gbm_params['min_samples_leaf']
        max_depth_list = gbm_params['max_depth']
        max_features = gbm_params['max_features']
        subsample_list = gbm_params['subsample']
        model_selection_metric = gbm_params['model_selection_metric']

        best_f1 = 0
        best_roc = 0
        best_auc = 0
        best_acc = 0
        best_bal_acc = 0
        best_model = None
        best_model_params = {}

        for n in range(n_runs):

            best_f1_run = 0
            best_roc_run = 0
            best_auc_run = 0
            best_acc_run = 0
            best_bal_acc_run = 0
            best_model_run = None

            for learning_rate in learning_rate_list:
                for n_estimators in n_estimators_list:
                    for min_samples_split in min_samples_split_list:
                        for min_samples_leaf in min_samples_leaf_list:
                            for max_depth in max_depth_list:
                                for subsample in subsample_list:
                                    gbm_estimator = GradientBoostingClassifier(
                                        learning_rate=learning_rate,
                                        n_estimators=n_estimators,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        max_depth=max_depth,
                                        subsample=subsample,
                                        max_features=max_features)
                                    model_params = {
                                        'learning_rate': learning_rate,
                                        'n_estimators': n_estimators,
                                        'min_samples_split': min_samples_split,
                                        'min_samples_leaf': min_samples_leaf,
                                        'max_depth': max_depth,
                                        'subsample': subsample,
                                        'max_features': max_features
                                    }
                                    gbm_estimator.fit(x_train, y_train)
                                    preds = gbm_estimator.predict(x_val)

                                    if model_selection_metric == "f1":
                                        f = f1(y_val, preds)
                                        best_f1_run, best_model_run = (
                                            f, gbm_estimator
                                        ) if f > best_f1_run else (
                                            best_f1_run, best_model_run)
                                        best_f1, best_model, best_model_params = (
                                            f, gbm_estimator,
                                            model_params) if f > best_f1 else (
                                                best_f1, best_model,
                                                best_model_params)
                                    elif model_selection_metric == "roc":
                                        r = roc(y_val, preds)
                                        best_roc_run, best_model_run = (
                                            r, gbm_estimator
                                        ) if r > best_roc_run else (
                                            best_roc_run, best_model_run)
                                        best_roc, best_model, best_model_params = (
                                            r, gbm_estimator, model_params
                                        ) if r > best_roc else (
                                            best_roc, best_model,
                                            best_model_params)
                                    elif model_selection_metric == "auc":
                                        au = auc(y_val, preds)
                                        best_auc_run, best_model_run = (
                                            au, gbm_estimator
                                        ) if au > best_auc_run else (
                                            best_auc_run, best_model_run)
                                        best_auc, best_model, best_model_params = (
                                            au, gbm_estimator, model_params
                                        ) if au > best_auc else (
                                            best_auc, best_model,
                                            best_model_params)
                                    elif model_selection_metric == "accuracy":
                                        acc = accuracy(y_val, preds)
                                        best_acc_run, best_model_run = (
                                            acc, gbm_estimator
                                        ) if acc > best_acc_run else (
                                            best_acc_run, best_model_run)
                                        best_acc, best_model, best_model_params = (
                                            acc, gbm_estimator, model_params
                                        ) if acc > best_acc else (
                                            best_acc, best_model,
                                            best_model_params)
                                    elif model_selection_metric == "balanced_accuracy":
                                        bal_acc = balanced_accuracy(
                                            y_val, preds)
                                        best_bal_acc_run, best_model_run = (
                                            bal_acc, gbm_estimator
                                        ) if bal_acc > best_bal_acc_run else (
                                            best_bal_acc_run, best_model_run)
                                        best_bal_acc, best_model, best_model_params = (
                                            bal_acc, gbm_estimator,
                                            model_params
                                        ) if bal_acc > best_bal_acc else (
                                            best_bal_acc, best_model,
                                            best_model_params)
                                    else:
                                        print(
                                            "Wrong model selection metric entered!"
                                        )

            filename = results_path + '/gbm_models/gbm_model_' + str(
                n) + '.sav'
            pickle.dump(best_model_run, open(filename, 'wb'))

        self.gbm_model = best_model
        filename = results_path + '/gbm_models/gbm_model.sav'
        pickle.dump(self.gbm_model, open(filename, 'wb'))
        f = open(results_path + '/gbm_models/best_params.txt', 'w')
        f.write(str(best_model_params))
        f.close()
        self.explainer = shap.TreeExplainer(best_model, x_train)
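# Illustrative usage of the trainer above. The class name GBMModel is an
# assumption (these snippets never show it), and the grid values are examples,
# not project defaults:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2,
                                                  random_state=0)
gbm_params = {
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 3],
    'max_depth': [3],
    'max_features': 'sqrt',
    'subsample': [0.8, 1.0],
    'model_selection_metric': 'balanced_accuracy',
}
model = GBMModel(x_train, y_train, x_val, y_val, gbm_params, n_runs=2,
                 results_path='results')  # 'results' dir must already exist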
    def record_scores(self, X_test, y_test, n_runs, metrics, feature_names,
                      max_display_features, results_path):
        models_scores_path = results_path + '/model_scores/'

        best_f1 = 0
        best_roc = 0
        best_auc = 0
        best_acc = 0
        best_bal_acc = 0
        best_model = None

        workbook = xlsxwriter.Workbook(models_scores_path +
                                       'logistic_regression_results.xlsx')
        worksheet = workbook.add_worksheet()

        row, column = 0, 0
        worksheet.write(row, column, "Model Name")

        f = open(models_scores_path + "metric.txt", "a")
        for n in range(n_runs):
            model_path = results_path + '/logistic_models/logistic_model_' + str(
                n) + '.sav'
            preds = self.predict(X_test, results_path, n)
            f.write("Logistic Regression Model " + str(n) + "\t")
            row = n + 1
            worksheet.write(row, 0,
                            "Logistic Regression Model " + str(n) + "\t")

            column = 0

            if metrics['f1']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "F1")
                f1_sc = f1(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_f1, best_model = (f1_sc, model) if f1_sc > best_f1 else (
                    best_f1, best_model)
                f.write("F1 score : " + str(f1_sc) + "\t")
                worksheet.write(row, column, f1_sc)
            if metrics['accuracy']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "Accuracy")
                acc = accuracy(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_acc, best_model = (acc, model) if acc > best_acc else (
                    best_acc, best_model)
                f.write("Accuracy : " + str(acc) + "\t")
                worksheet.write(row, column, acc)
            if metrics['balanced_accuracy']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "Balanced Accuracy")
                bal_acc = balanced_accuracy(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_bal_acc, best_model = (
                    bal_acc, model) if bal_acc > best_bal_acc else (best_bal_acc,
                                                                    best_model)
                f.write("Balanced Accuracy : " + str(bal_acc) + "\t")
                worksheet.write(row, column, bal_acc)
            if metrics['roc']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "ROC")
                roc_curve = roc(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_roc, best_model = (
                    roc_curve, model) if roc_curve > best_roc else (best_roc,
                                                                    best_model)
                f.write("ROC : " + str(roc_curve) + "\t")
                worksheet.write(row, column, roc_curve)
            if metrics['auc']:
                column += 1
                if n == 0:
                    worksheet.write(0, column, "AUC")
                auroc = auc(y_test, preds)
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
                best_auc, best_model = (
                    auroc, model) if auroc > best_auc else (best_auc,
                                                            best_model)
                f.write("Area under ROC : " + str(auroc) + "\t")
                worksheet.write(row, column, auroc)
            f.write("\n")
        f.close()

        filename = results_path + '/logistic_models/logistic_model.sav'
        pickle.dump(best_model, open(filename, 'wb'))
        self.feature_importances(results_path, feature_names.values)
        shap_values = self.explainer.shap_values(X_test)
        shap.summary_plot(shap_values,
                          X_test,
                          feature_names=feature_names,
                          show=False,
                          plot_type="bar",
                          max_display=max_display_features)
        plt.savefig(results_path + '/logistic_plots/features.png',
                    bbox_inches='tight')
        plt.close()
        shap.summary_plot(shap_values,
                          X_test,
                          feature_names=feature_names,
                          show=False,
                          max_display=max_display_features)
        plt.savefig(results_path + '/logistic_plots/features_summary.png',
                    bbox_inches='tight')
        plt.close()
        vals = np.abs(shap_values).mean(0)
        feature_importance = pd.DataFrame(
            list(zip(X_test.columns, vals)),
            columns=['col_name', 'feature_importance_vals'])
        feature_importance.sort_values(by=['feature_importance_vals'],
                                       ascending=False,
                                       inplace=True)
        feature_importance.to_csv(index=False,
                                  path_or_buf=results_path +
                                  "/logistic_plots/feature_importances.csv")
        workbook.close()
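    # The feature_importances helper called above is not included in these
    # snippets. A minimal sketch of one plausible implementation, assuming the
    # fitted estimator is stored on self.logistic_model, that importances are
    # read from its coefficients, and that numpy/matplotlib are imported as
    # np/plt as elsewhere in this file (all names here are assumptions):
    def feature_importances(self, results_path, feature_names):
        coefs = np.squeeze(self.logistic_model.coef_)  # one weight per feature
        order = np.argsort(np.abs(coefs))[::-1]        # largest magnitude first
        plt.barh(np.array(feature_names)[order], coefs[order])
        plt.xlabel('coefficient')
        plt.tight_layout()
        plt.savefig(results_path + '/logistic_plots/feature_coefficients.png')
        plt.close()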