def _ada_boost_classification_train(table, feature_cols, label_col, max_depth=1,
                                    n_estimators=50, learning_rate=1.0,
                                    algorithm='SAMME.R', random_state=None):
    """Fit an AdaBoost classifier of decision trees and wrap it in a model dict.

    Args:
        table: Data indexed by column name; ``table[feature_cols]`` is used as
            the training features and ``table[label_col]`` as the labels.
        feature_cols: Names of the feature columns.
        label_col: Name of the label column.
        max_depth: ``max_depth`` of the ``DecisionTreeClassifier`` base estimator.
        n_estimators: Forwarded to ``AdaBoostClassifier``.
        learning_rate: Forwarded to ``AdaBoostClassifier``.
        algorithm: Forwarded to ``AdaBoostClassifier``.
        random_state: Forwarded to ``AdaBoostClassifier``.

    Returns:
        ``{'model': model}`` where ``model`` is the dict created by
        ``_model_dict`` with these entries added: ``'parameters'`` (the
        classifier's ``get_params()``), ``'classifier'``, ``'params'`` (the
        training arguments plus feature importances), ``'_repr_brtc_'`` (the
        markdown report) and ``'feature_importance_table'``.
    """
    x_train = table[feature_cols]
    y_train = table[label_col]

    base_estimator = DecisionTreeClassifier(max_depth=max_depth)
    # Hyper-parameters are passed by keyword: newer scikit-learn releases only
    # accept the base estimator positionally.
    classifier = AdaBoostClassifier(base_estimator,
                                    n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    algorithm=algorithm,
                                    random_state=random_state)
    classifier.fit(x_train, y_train)

    # Read the importances once and reuse them for both outputs below.
    feature_importance = classifier.feature_importances_

    params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'algorithm': algorithm,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_classification_model')
    classifier_params = classifier.get_params()
    model['parameters'] = classifier_params
    model['classifier'] = classifier
    model['params'] = params

    fig_feature_importance = _plot_feature_importance(feature_cols, classifier)
    parameters_md = dict2MD(classifier_params)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Classification Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=parameters_md)))
    model['_repr_brtc_'] = rb.get()

    # One row per feature: [name, importance].
    feature_importance_table = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_cols, feature_importance)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def search_bestparam_AdaBoostClassifier(X, y, df_search_best_param):
    """Run ``search_bestparam`` for a default ``AdaBoostClassifier``.

    Prints a banner and the estimator's ``get_params()``, then hands the
    estimator, a fixed grid over ``n_estimators`` and ``algorithm``, and the
    three arguments straight to ``search_bestparam``. Returns ``None``.
    """
    print("Search best params for AdaBoostClassifier ...")

    estimator = AdaBoostClassifier()
    print("Supported params", estimator.get_params())

    grid = dict(
        n_estimators=[1, 10, 100, 1000],
        algorithm=['SAMME', 'SAMME.R'],
    )
    search_bestparam(estimator, grid, X, y, df_search_best_param)
class AdaBoost(ClassicalModel):
    """``ClassicalModel`` whose underlying model is an ``AdaBoostClassifier``."""

    def __init__(self, input_size, output_size, labels, class_weights=None, **kwargs):
        """Initialise the base class and build the classifier.

        ``input_size``, ``output_size``, ``labels`` and ``class_weights`` go to
        ``ClassicalModel.__init__``; every extra keyword argument is forwarded
        to ``AdaBoostClassifier``.
        """
        super().__init__(input_size, output_size, labels, class_weights)
        self.model = AdaBoostClassifier(**kwargs)
        # Display name: fixed header followed by the classifier's parameters.
        model_params = self.model.get_params()
        self.name = "AdaBoost Classifier: \n{}".format(str(model_params))
def test_folder_name(self):
    """``util.folder_name`` joins base, class name, category and parameter path."""
    classifier = AdaBoostClassifier(n_estimators=23)
    # NOTE(review): relies on `base_estimator` being an estimator object right
    # after construction — confirm against the scikit-learn version in use.
    classifier.base_estimator.max_depth = 42

    expected = os.path.join(
        "/hello/world/AdaBoostClassifier/testing/",
        util.params_to_path(classifier.get_params()))
    actual = util.folder_name("/hello/world/", "testing", classifier)

    self.assertEqual(actual, expected)
def adaboostClassifier(X_train, X_test, y_train, y_test):
    """Fit an AdaBoost classifier and print its test F1 score and parameters.

    The classifier is built with ``random_state=1`` and ``learning_rate=0.404``,
    fitted on the training split and scored with ``f1_score`` on the test
    split. Nothing is returned; all results go to stdout.
    """
    print("adaboost")

    booster = AdaBoostClassifier(random_state=1, learning_rate=0.404)
    booster.fit(X_train, y_train)

    predictions = booster.predict(X_test)
    print(f1_score(y_test, predictions))

    # Show the full parameter set the fitted classifier is using.
    print('Parameters currently in use:\n')
    print(booster.get_params())
class _AdaBoostClassifierImpl:
    """Wrapper that builds an ``SKLModel`` and delegates fit/predict calls to it.

    A Lale individual operator given as ``base_estimator`` is unwrapped to its
    implementation (or that implementation's ``_wrapped_model``) before being
    passed to ``SKLModel``; ``get_params`` still reports the original operator.
    """

    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        algorithm="SAMME.R",
        random_state=None,
    ):
        """Create the wrapped model.

        Raises:
            ValueError: if ``base_estimator`` is a Lale operator that is not an
                individual operator.
        """
        unwrapped = base_estimator
        if isinstance(base_estimator, lale.operators.Operator):
            if not isinstance(base_estimator, lale.operators.IndividualOp):
                raise ValueError(
                    "If base_estimator is a Lale operator, it needs to be an individual operator. "
                )
            unwrapped = base_estimator._impl_instance()
            inner_model = getattr(unwrapped, "_wrapped_model", None)
            if inner_model is not None:
                unwrapped = inner_model

        self._hyperparams = {
            "base_estimator": unwrapped,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "algorithm": algorithm,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        # Keep the caller's original object so get_params can hand it back.
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        """Return the wrapped model's parameters with the original ``base_estimator``."""
        params = self._wrapped_model.get_params(deep=deep)
        # Report the object the caller passed in, not the unwrapped implementation.
        params["base_estimator"] = self._hyperparams["base_estimator"]
        return params

    def fit(self, X, y=None):
        """Fit the wrapped model (``y`` is omitted from the call when ``None``); return ``self``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        """Delegate to the wrapped model's ``score``."""
        return self._wrapped_model.score(X, y, sample_weight)
def randomForest(train_bow_tf_idf, train_labels, bow_test_tf_idf, test_labels):
    """Fit an ``AdaBoostClassifier`` (100 estimators) and print train/test accuracy.

    Accuracy is the mean of element-wise equality between predictions and
    labels. Returns the fitted classifier.
    """
    # NOTE(review): the function name and printed labels say "Random Forest",
    # but the model built here is AdaBoost — confirm which one is intended.
    classifier = AdaBoostClassifier(n_estimators=100)
    classifier.fit(train_bow_tf_idf, train_labels)

    print()
    print('------- Random Forest -------')

    # Report the configuration and the accuracy on both splits.
    print('Default hyperparameters:')
    print(classifier.get_params())

    train_accuracy = (classifier.predict(train_bow_tf_idf) == train_labels).mean()
    print('Random Forest train accuracy = {}'.format(train_accuracy))

    test_accuracy = (classifier.predict(bow_test_tf_idf) == test_labels).mean()
    print('Random Forest test accuracy = {}'.format(test_accuracy))

    return classifier
class SklearnBDT(SklearnDT):
    """Boosted decision tree: an ``AdaBoostClassifier`` configured from the 'bdt' config."""

    def __init__(self):
        """Load the 'bdt' configuration and build the boosted classifier.

        Tree options are read from the ``'decision tree'`` section and boosting
        options from the ``'adaboost'`` section of the loaded configuration.
        """
        SklearnDT.__init__(self)
        self.boosting = 'adaptive'
        self.config = self.load_config('bdt')
        self.classifier = AdaBoostClassifier(
            dtree.DecisionTreeClassifier(
                criterion=self.config.get('decision tree', 'criterion'),
                splitter=self.config.get('decision tree', 'splitter'),
                max_depth=self.config.getint('decision tree', 'max_depth'),
                min_samples_split=self.config.getint('decision tree',
                                                     'min_samples_split'),
                min_samples_leaf=self.config.getint('decision tree',
                                                    'min_samples_leaf'),
                min_weight_fraction_leaf=0.0,
                max_features=None,
                random_state=None,
                max_leaf_nodes=None,
                min_impurity_decrease=0.0,
                min_impurity_split=None,
                class_weight=None,
                presort=False),
            algorithm=self.config.get('adaboost', 'algorithm'),
            n_estimators=int(self.config.getint('adaboost', 'n_estimators')),
            learning_rate=self.config.getfloat('adaboost', 'learning_rate'),
            random_state=None)

    def show(self):
        """Print every classifier parameter except the ``'base_estimator'`` entry."""
        dt_options = self.classifier.get_params()
        print('-' * gl.screenwidth)
        print('--- Boosted decision tree options: ' + self.boosting)
        print('-' * gl.screenwidth)
        for option in dt_options:
            # Compare by value: identity (`is not`) against a string literal
            # depends on interning and is not a reliable equality test.
            if option != 'base_estimator':
                print('--- {:50s} {:s}'.format(option, str(dt_options[option])))
        print('-' * gl.screenwidth)

    def eval(self, data):
        """Return the classifier's ``decision_function`` output for ``data``."""
        return self.classifier.decision_function(data)
def cross_validation(X, y):
    """Grid-search tree depth and min_samples_split for an AdaBoost ensemble.

    For every (depth, split) pair a 500-estimator ``AdaBoostClassifier`` over a
    ``DecisionTreeClassifier`` is fitted on the training split and scored on
    both splits; progress is printed along the way.

    Args:
        X: Samples; must have the same length as ``y``.
        y: Labels.

    Returns:
        ``get_params()`` of the model with the highest test score, or ``None``
        if no model scored above 0.

    Raises:
        AssertionError: if ``X`` and ``y`` differ in length.
    """
    assert(len(y) == len(X))

    # test_size=0.8: 20% of the data is used for training, 80% for testing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

    depth = [8, 16, 32, 64]
    # NOTE(review): newer scikit-learn rejects an integer min_samples_split
    # below 2 — confirm the value 1 is valid for the targeted version.
    split = [1, 2, 4, 8, 16, 32, 64]

    best_score = 0
    best_train_score = 0
    best_param = None
    for d in depth:
        for s in split:
            estimator = DecisionTreeClassifier(max_features='sqrt', max_depth=d, min_samples_split=s)
            model = AdaBoostClassifier(n_estimators=500, base_estimator=estimator)
            model = model.fit(X_train, y_train)

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("Depth: %d split: %d" % (d, s))
            print("Model trainning score:")
            score_train = model.score(X_train, y_train)
            print(score_train)
            print("Model test score:")
            score_test = model.score(X_test, y_test)
            print(score_test)

            # Keep the configuration with the best test score seen so far.
            if score_test > best_score:
                best_score = score_test
                best_train_score = score_train
                best_param = model.get_params()

    print("==================")
    print(best_train_score)
    print(best_score)
    print(best_param)
    return best_param
# Fragment of a larger script: compares several classifiers on a held-out
# validation set. `clf`, `clf_etree`, `X_train`, `y_train`, `X_val`, `y_val`,
# `reg`, `g` and `fe` are defined outside this fragment.
print "Validation set score: ERF " , clf_etree.score(X_val, y_val)

# AdaBoost over depth-1 decision trees, scored on the validation set.
clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),algorithm="SAMME", n_estimators=500, random_state=74494, learning_rate=0.8)
clf_boost.fit(X_train, y_train)
print "Validation set score: ABOOST " , clf_boost.score(X_val, y_val)

#clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2)
#clf_gboost.fit(X_train, y_train)
#print "Validation set score:LR " , clf_gboost.score(X_val, y_val)

# Dump each classifier together with its parameters.
print "Classifier:"
print clf, clf.get_params()
print clf_etree, clf_etree.get_params()
print clf_boost, clf_boost.get_params()

# NOTE(review): the original indentation was lost; everything below is assumed
# to belong to the `if` body because it all works on the filtered features —
# confirm against the original file.
if(fe==1):
    #L1 norm based feature elimination
    # Fit an L1-penalised logistic regression and keep only the columns whose
    # coefficient is non-zero, in both the training and validation matrices.
    clf_fe = LogisticRegression(C=1000,penalty='l1',random_state=0)
    clf_fe.fit(X_train, y_train)
    X_train = X_train[:,clf_fe.coef_.ravel()!=0]
    print "Xtrain.shape: ", X_train.shape
    X_val = X_val[:,clf_fe.coef_.ravel()!=0]

    # Linear and RBF SVMs retrained on the reduced feature set.
    clf2_l = svm.SVC(kernel='linear', C=reg)
    clf2_l.fit(X_train, y_train)
    print "Lasso Validation set score filtered coeff linear: " , clf2_l.score(X_val, y_val)
    clf2 = svm.SVC(kernel='rbf', C=reg, gamma=g)
    clf2.fit(X_train, y_train)
    print "Lasso Validation set score filtered coeff: " , clf2.score(X_val, y_val)
class RandomForestAdaRandSearch(object):
    '''Train, persist and evaluate an AdaBoost classifier of decision trees.

    Instantiating the class runs the whole workflow in this order:

    * reduce the input dataframe to the feature and label columns
    * split the data into a stratified training and test set
    * build the AdaBoost classifier with fixed hyperparameters and fit it,
      bootstrap a confidence interval, plot feature importances, pickle the
      classifier and log cross-validation scores
    * predict on the test set
    * analyse the predictions with graphs and stats

    Args:
        metrix: dataframe holding the columns "anomalousCC", "anomalousslope",
            "lowreslimit", "f", "diffF", "diffI" and "autobuild_success"
        output_dir: directory that receives all CSV files, plots and the pickle
    '''

    def __init__(self, metrix, output_dir):
        self.metrix = metrix
        self.output_dir = output_dir
        self.prepare_metrix_data()
        self.split_data()
        self.forest_best_params()
        self.predict()
        self.analysis()

    @staticmethod
    def _banner(message):
        '''Print message framed by two lines of 80 asterisks.'''
        print("*" * 80)
        print(message)
        print("*" * 80)

    @staticmethod
    def _timestamp():
        '''Return the current time as YYYYmmdd_HHMM, used in output file names.'''
        return datetime.strftime(datetime.now(), "%Y%m%d_%H%M")

    def prepare_metrix_data(self):
        '''Reduce the input dataframe to the columns used in this workflow.

        Input: self.metrix, the large dataframe
        Output: self.data, holding the six feature columns and the label column
        '''
        self._banner("* Preparing input dataframe")

        columns = [
            "anomalousCC", "anomalousslope", "lowreslimit", "f", "diffF",
            "diffI", "autobuild_success"
        ]
        self.data = self.metrix[columns]

        logging.info(f"Using dataframe with column labels {columns}")

    ###########################################################################
    #
    #  creating training and test set
    #
    ###########################################################################

    def split_data(self):
        '''Split the data into a training and a test set.

        Input: a dataframe that contains the features and labels in columns
               and the samples in rows
        Output: X_train, X_test, y_train, y_test stored on the instance, from a
                stratified 80/20 split; the test set is also written to
                y_test.csv and X_test.csv in the output directory
        '''
        self._banner("* Splitting data into test and training set with test=20%")

        y = self.metrix["autobuild_success"]
        X = self.data[[
            "anomalousCC", "anomalousslope", "lowreslimit", "f", "diffF",
            "diffI"
        ]]

        # stratified split of samples
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        y_test_csv = os.path.join(self.output_dir, "y_test.csv")
        np.savetxt(y_test_csv, self.y_test, delimiter=",")
        X_test_csv = os.path.join(self.output_dir, "X_test.csv")
        np.savetxt(X_test_csv, self.X_test, delimiter=",")

        X_train_shape = X_train.shape
        X_test_shape = X_test.shape
        y_train_shape = y_train.shape
        y_test_shape = y_test.shape

        logging.info(f"Shape of test data X_train {X_train_shape}")
        logging.info(f"Shape of test data X_test {X_test_shape}")
        logging.info(f"Shape of test data y_train {y_train_shape}")
        logging.info(f"Shape of test data y_test {y_test_shape}")

        # An optional over-/undersampling step (RandomOverSampler / SMOTE /
        # RandomUnderSampler) used to sit here but was disabled; the classes
        # are balanced through class_weight="balanced" on the base tree.

    ###########################################################################
    #
    #  creating classifier with best parameter from IUCrJ publication
    #
    ###########################################################################

    def forest_best_params(self):
        '''Build and fit the classifier, then assess and persist it.

        Steps: fit an AdaBoost classifier over a decision tree with fixed
        hyperparameters, bootstrap a 95% confidence interval for the test
        accuracy, plot feature importances, pickle the fitted classifier and
        log 3-fold cross-validation scores on the training data.
        '''
        self._banner(
            "* Building new forest based on best parameter combination and save as pickle"
        )

        # hyperparameters as were used for the classifier published in IUCrJ;
        # this was first run in deployment with really bad performance;
        # the saved model is named: 2019 calibrated_classifier_20190501_1115.pkl
        clf2 = DecisionTreeClassifier(criterion="entropy",
                                      max_depth=3,
                                      max_features=2,
                                      max_leaf_nodes=17,
                                      min_samples_leaf=8,
                                      min_samples_split=18,
                                      random_state=0,
                                      class_weight="balanced")
        self.tree_clf2_new_rand = AdaBoostClassifier(clf2,
                                                     learning_rate=0.6355,
                                                     n_estimators=5694,
                                                     algorithm="SAMME.R",
                                                     random_state=5)

        # hyperparameters for a new classifier; this one was found after adding
        # some user data from run1 2020 to the training data; this one is now
        # running in the automated data analysis pipelines; the saved model is
        # named: calibrated_classifier_20200408_1552.pkl
        #   tree:     criterion="entropy", max_depth=5, max_features=2,
        #             max_leaf_nodes=15, min_samples_leaf=5,
        #             min_samples_split=3, random_state=0,
        #             class_weight="balanced"
        #   AdaBoost: learning_rate=0.6846, n_estimators=4693,
        #             algorithm="SAMME.R", random_state=5

        classifier_params = self.tree_clf2_new_rand.get_params()
        print(classifier_params)

        self.tree_clf2_new_rand.fit(self.X_train, self.y_train)

        logging.info(
            f"Created classifier based on IUCrJ publication and fitted training data.\n"
            f"Classifier parameters: {classifier_params}")

        #######################################################################
        #
        #  Bootstrapping to find the 95% confidence interval
        #
        #######################################################################

        self._banner(
            "* Calculating confidence interval for best decisiontree with AdaBoost"
        )

        def bootstrap_calc(data_train, data_test, train_labels, test_labels,
                           found_model):
            '''Log a 95% bootstrap confidence interval for the test accuracy.'''
            # local import so this block does not depend on the file header
            from sklearn.base import clone

            # configure bootstrap
            n_iterations = 1000
            n_size = int(len(data_train))

            # run bootstrap
            stats = []
            for _ in range(n_iterations):
                # resample samples and labels together so every resampled row
                # keeps its own label
                train_boot, labels_boot = resample(data_train,
                                                   train_labels,
                                                   n_samples=n_size)
                # fit an unfitted copy; refitting found_model itself would
                # replace the classifier that was fitted on the training set
                model = clone(found_model)
                model.fit(train_boot, labels_boot)
                # evaluate model
                predictions = model.predict(data_test)
                stats.append(accuracy_score(test_labels, predictions))

            # plot scores
            plt.hist(stats)
            plt.savefig(os.path.join(self.output_dir,
                                     "bootstrap_hist_ada.png"),
                        dpi=600)
            plt.close()

            # confidence interval
            alpha = 0.95
            p = ((1.0 - alpha) / 2.0) * 100
            lower = max(0.0, np.percentile(stats, p))
            p = (alpha + ((1.0 - alpha) / 2.0)) * 100
            upper = min(1.0, np.percentile(stats, p))

            lower_boundary = round((lower * 100), 2)
            upper_boundary = round((upper * 100), 2)

            logging.info(
                f"Calculating 95% confidence interval from bootstrap exercise\n"
                f"Lower boundary: {lower_boundary}\n"
                f"Upper boundary: {upper_boundary}")

        bootstrap_calc(self.X_train, self.X_test, self.y_train, self.y_test,
                       self.tree_clf2_new_rand)

        #######################################################################
        #
        #  get feature importances for best tree and full classifier;
        #  plot feature importances for both
        #
        #######################################################################

        attr = [
            "anomalousCC", "anomalousslope", "lowreslimit", "f", "diffF",
            "diffI"
        ]

        feature_importances = self.tree_clf2_new_rand.feature_importances_
        feature_importances_ls = sorted(zip(feature_importances, attr),
                                        reverse=True)

        feature_importances_tree_mean = np.mean([
            tree.feature_importances_
            for tree in self.tree_clf2_new_rand.estimators_
        ],
                                                axis=0)
        feature_importances_tree_mean_ls = sorted(zip(
            feature_importances_tree_mean, attr),
                                                  reverse=True)

        logging.info(
            f"Feature importances, for best tree in classifier: {feature_importances_ls}\n"
            f"Plotting bar plot of feature importances for best tree in classifier\n"
            f"Feature importances, mean over all trees: {feature_importances_tree_mean_ls}\n"
            f"Plotting bar plot of feature importances with mean and std for classifier"
        )

        def feature_importances_best_estimator(feature_list, directory):
            '''Save a bar plot of (importance, name) pairs.'''
            datestring = self._timestamp()
            # NOTE(review): this orders the bars by feature name (x[1]), not
            # by importance — confirm this is intended.
            feature_list.sort(key=lambda x: x[1], reverse=True)
            score, feature = list(zip(*feature_list))[:2]
            x_pos = np.arange(len(feature))
            plt.bar(x_pos, score, align="center")
            plt.xticks(x_pos, feature, rotation=90, fontsize=18)
            plt.title(
                "Histogram of Feature Importances for best tree in best classifier"
            )
            plt.xlabel("Features")
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory,
                "feature_importances_besttree_bestclassifier_bar_plot_" +
                datestring + ".png"),
                        dpi=600)
            plt.close()

        feature_importances_best_estimator(feature_importances_ls,
                                           self.output_dir)

        def feature_importances_pandas(clf, X_train, directory):
            '''Save a bar plot of per-feature mean importance with std error bars.'''
            datestring = self._timestamp()
            feature_list = [tree.feature_importances_ for tree in clf.estimators_]

            df = pd.DataFrame(feature_list, columns=X_train.columns)
            df_mean = df[X_train.columns].mean(axis=0)
            df_std = df[X_train.columns].std(axis=0)
            df_mean.plot(kind="bar",
                         color="b",
                         yerr=[df_std],
                         align="center",
                         figsize=(20, 10),
                         rot=90,
                         fontsize=18)
            plt.title(
                "Histogram of Feature Importances over all trees in best classifier with std"
            )
            plt.xlabel('Features')
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory,
                "feature_importances_mean_std_bestclassifier_bar_plot_" +
                datestring + ".png"),
                        dpi=600)
            plt.close()

        feature_importances_pandas(self.tree_clf2_new_rand, self.X_train,
                                   self.output_dir)

        #######################################################################
        #
        #  save best classifier as pickle file for future use
        #
        #######################################################################

        def write_pickle(forest, directory):
            '''Dump the classifier with joblib under a time-stamped name.'''
            datestring = self._timestamp()
            joblib.dump(
                forest,
                os.path.join(directory,
                             "best_classifier_rand_ada_" + datestring + ".pkl"))

        write_pickle(self.tree_clf2_new_rand, self.output_dir)

        logging.info(f"Saving best classifier.")

        self._banner("* Getting basic stats for new forest")

        #######################################################################
        #
        #  get basic stats for 3-fold cross-validation on the training data
        #
        #######################################################################

        def basic_stats(forest, data_train, labels_train, directory):
            '''Log 3-fold cross-validation scores on the training data.'''

            def cv_scores(scoring):
                return cross_val_score(forest,
                                       data_train,
                                       labels_train,
                                       cv=3,
                                       scoring=scoring)

            # distribution --> accuracy; the mean reuses these fold scores
            # instead of running the same cross-validation a second time
            accuracy_each_cv = cv_scores("accuracy")
            accuracy_mean_cv = round(accuracy_each_cv.mean(), 4)
            # cross_val_scoring with different scoring functions for CV train set
            train_roc_auc = round(cv_scores("roc_auc").mean(), 4)
            train_recall = round(cv_scores("recall").mean(), 4)
            train_precision = round(cv_scores("precision").mean(), 4)
            train_f1 = round(cv_scores("f1").mean(), 4)

            logging.info(
                f"Get various cross_val_scores to evaluate clf performance for best parameters\n"
                f"Training accuracy for individual folds in 3-fold CV: {accuracy_each_cv}\n"
                f"Mean training accuracy over all folds in 3-fold CV: {accuracy_mean_cv}\n"
                f"Mean training recall for 3-fold CV: {train_recall}\n"
                f"Mean training precision for 3-fold CV: {train_precision}\n"
                f"Mean training ROC_AUC for 3-fold CV: {train_roc_auc}\n"
                f"Mean training F1 score for 3-fold CV: {train_f1}")

        basic_stats(self.tree_clf2_new_rand, self.X_train, self.y_train,
                    self.output_dir)

    ###########################################################################
    #
    #  predicting with test set
    #
    ###########################################################################

    def predict(self):
        '''Predict the test set with the fitted classifier and log basic stats.

        Stores y_pred, y_pred_proba, y_pred_proba_ones, y_pred_proba_zeros and
        biggest_class on the instance and writes y_pred.csv and
        y_pred_proba.csv to the output directory.
        '''
        self._banner("* Predict using new forest and test set")

        # try out how well the classifier works to predict from the test set
        self.y_pred = self.tree_clf2_new_rand.predict(self.X_test)
        self.y_pred_proba = self.tree_clf2_new_rand.predict_proba(self.X_test)
        self.y_pred_proba_ones = self.y_pred_proba[:, 1]  # test data to be class 1
        self.y_pred_proba_zeros = self.y_pred_proba[:, 0]  # test data to be class 0

        y_pred_csv = os.path.join(self.output_dir, "y_pred.csv")
        y_pred_proba_csv = os.path.join(self.output_dir, "y_pred_proba.csv")
        np.savetxt(y_pred_csv, self.y_pred, delimiter=",")
        np.savetxt(y_pred_proba_csv, self.y_pred_proba, delimiter=",")

        logging.info(
            f"Storing predictions for test set to y_pred.\n"
            f"Storing probabilities for predictions for the test set to y_pred_proba"
        )

        self._banner("* Calculate prediction stats")

        def prediction_stats(y_test, y_pred, directory):
            '''Log accuracy, class distribution and null accuracy of the test set.'''
            # calculate accuracy
            y_accuracy = accuracy_score(y_test, y_pred)

            # examine the class distribution of the testing set
            # (using a Pandas Series method)
            class_dist = self.y_test.value_counts()
            class_zero = class_dist[0]
            class_one = class_dist[1]

            # size of the larger class; used later as the colour-scale maximum
            # of the confusion-matrix heatmap
            if class_zero > class_one:
                self.biggest_class = class_zero
            else:
                self.biggest_class = class_one

            # because y_test only contains ones and zeros, the mean is the
            # fraction of ones
            ones = round(y_test.mean(), 4)
            zeros = round(1 - y_test.mean(), 4)

            # null accuracy; only for binary classification coded as 0/1
            null_acc = round(max(y_test.mean(), 1 - y_test.mean()), 4)

            logging.info(
                f"Accuracy score or agreement between y_test and y_pred: {y_accuracy}\n"
                f"Class distribution for y_test: {class_dist}\n"
                f"Percent 1s in y_test: {ones}\n"
                f"Percent 0s in y_test: {zeros}\n"
                f"Null accuracy in y_test: {null_acc}")

        prediction_stats(self.y_test, self.y_pred, self.output_dir)

    ###########################################################################
    #
    #  detailed analysis and stats
    #
    ###########################################################################

    def analysis(self):
        '''Detailed analysis of the test-set predictions.

        * create and plot a confusion matrix
        * derive accuracy, classification error, sensitivity, specificity,
          false-positive rate, precision and F1 score and plot a radar chart
        * histogram of the predicted probabilities
        * precision-recall curve against the classification threshold
        * ROC curve and ROC_AUC
        * sensitivity/specificity at several thresholds
        * copy training.log into the output directory

        Requires predict() to have run first (uses y_pred, y_pred_proba and
        biggest_class).
        '''
        self._banner("* Detailed analysis and plotting")

        #######################################################################
        #
        #  calculate and draw confusion matrix for test set predictions
        #
        #######################################################################

        # IMPORTANT: first argument is true values, second argument is
        # predicted values; this produces a 2x2 numpy array (matrix)
        conf_mat_test = confusion_matrix(self.y_test, self.y_pred)
        logging.info(f"confusion matrix using test set: {conf_mat_test}")

        def draw_conf_mat(matrix, directory):
            '''Save the confusion matrix as an annotated heatmap.'''
            datestring = self._timestamp()
            labels = ["0", "1"]
            ax = plt.subplot()
            sns.heatmap(matrix,
                        annot=True,
                        ax=ax,
                        annot_kws={"size": 18},
                        vmin=0,
                        vmax=self.biggest_class)
            plt.title("Confusion matrix of the classifier")
            ax.set_xticklabels(labels, fontdict={"fontsize": 18})
            ax.set_yticklabels(labels, fontdict={"fontsize": 18})
            plt.xlabel("Predicted", fontsize=20)
            plt.ylabel("True", fontsize=20)
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory,
                "confusion_matrix_for_test_set_predictions" + datestring +
                ".png"),
                        dpi=600)
            plt.close()

        draw_conf_mat(conf_mat_test, self.output_dir)

        #######################################################################
        #
        #  calculate stats for the test set using classification outcomes
        #
        #######################################################################

        TP = conf_mat_test[1, 1]
        TN = conf_mat_test[0, 0]
        FP = conf_mat_test[0, 1]
        FN = conf_mat_test[1, 0]

        logging.info(f"False-positives in predicting the test set: {FP}")
        logging.info(f"False-negatives in predicting the test set: {FN}")

        # calculate accuracy
        acc_score_man_test = round((TP + TN) / float(TP + TN + FP + FN), 4)
        acc_score_sklearn_test = round(
            accuracy_score(self.y_test, self.y_pred), 4)
        # classification error
        class_err_man_test = round((FP + FN) / float(TP + TN + FP + FN), 4)
        class_err_sklearn_test = round(
            1 - accuracy_score(self.y_test, self.y_pred), 4)
        # sensitivity/recall/true positive rate; correctly placed positive cases
        sensitivity_man_test = round(TP / float(FN + TP), 4)
        sensitivity_sklearn_test = round(
            recall_score(self.y_test, self.y_pred), 4)
        # specificity
        specificity_man_test = round(TN / (TN + FP), 4)
        # false positive rate
        false_positive_rate_man_test = round(FP / float(TN + FP), 4)
        # precision/confidence of placement
        precision_man_test = round(TP / float(TP + FP), 4)
        precision_sklearn_test = round(
            precision_score(self.y_test, self.y_pred), 4)
        # F1 score; uses precision and recall
        f1_score_sklearn_test = round(f1_score(self.y_test, self.y_pred), 4)

        logging.info(
            f"Detailed stats for the test set\n"
            f"Accuracy score:\n"
            f"accuracy score manual test: {acc_score_man_test}\n"
            f"accuracy score sklearn test: {acc_score_sklearn_test}\n"
            f"Classification error:\n"
            f"classification error manual test: {class_err_man_test}\n"
            f"classification error sklearn test: {class_err_sklearn_test}\n"
            f"Sensitivity/Recall/True positives:\n"
            f"sensitivity manual test: {sensitivity_man_test}\n"
            f"sensitivity sklearn test: {sensitivity_sklearn_test}\n"
            f"Specificity:\n"
            f"specificity manual test: {specificity_man_test}\n"
            f"False positive rate or 1-specificity:\n"
            f"false positive rate manual test: {false_positive_rate_man_test}\n"
            f"Precision or confidence of classification:\n"
            f"precision manual: {precision_man_test}\n"
            f"precision sklearn: {precision_sklearn_test}\n"
            f"F1 score:\n"
            f"F1 score sklearn test: {f1_score_sklearn_test}")

        data_dict = {
            "group": "prediction",
            "ACC (%)": (acc_score_man_test * 100),
            "Class Error (%)": (class_err_man_test * 100),
            "Sensitivity (%)": (sensitivity_man_test * 100),
            "Specificity (%)": (specificity_man_test * 100),
            "FPR (%)": (false_positive_rate_man_test * 100),
            "Precision (%)": (precision_man_test * 100),
            "F1 score (%)": (f1_score_sklearn_test * 100)
        }

        df = pd.DataFrame(data=data_dict, index=[0])

        def plot_radar_chart(df, directory):
            '''Save a radar chart of the percentage metrics in the first row of df.'''
            datestring = self._timestamp()

            # ------- PART 1: Create background

            # every column except the leading "group" column is one axis
            categories = list(df)[1:]
            print(categories)
            N = len(categories)

            # angle of each axis: the full circle divided by the number of axes
            angles = [n / float(N) * 2 * pi for n in range(N)]
            angles += angles[:1]

            # Initialise the spider plot
            fig = plt.figure(figsize=(7, 6))
            ax = fig.add_subplot(111, polar=True)

            # put the first axis on top
            ax.set_theta_offset(pi / 2)
            ax.set_theta_direction(-1)

            # draw one axis per variable and add the labels
            ax.set_xticks(angles[:-1])
            ax.set_xticklabels(categories, fontsize=20, wrap=True)

            # draw ylabels
            ax.set_rlabel_position(15)
            ax.set_yticks([20, 40, 60, 80, 100])
            ax.set_yticklabels(["20", "40", "60", "80", "100%"],
                               fontsize=20,
                               wrap=True)
            ax.set_ylim(0, 100)

            # ------- PART 2: Add plots
            values = df.loc[0].drop('group').values.flatten().tolist()
            print(values)
            # repeat the first value to close the polygon
            values += values[:1]
            ax.plot(angles,
                    values,
                    linewidth=2,
                    linestyle="solid",
                    label="Test set")
            ax.fill(angles, values, "b", alpha=0.1)
            plt.savefig(os.path.join(
                directory, "radar_chart_for_test_set_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_radar_chart(df, self.output_dir)

        #######################################################################
        #
        #  plot histogram of test set probabilities
        #
        #######################################################################

        def plot_hist_pred_proba(y_pred_proba, directory):
            '''Save histograms of the predicted probabilities of both classes.'''
            datestring = self._timestamp()
            # columns hold the classes (rows are samples), so select by column
            plt.hist(y_pred_proba[:, 1], bins=20, color="b", label="class 1")
            plt.hist(y_pred_proba[:, 0], bins=20, color="g", label="class 0")
            plt.xlim(0, 1)
            plt.title(
                "Histogram of predicted probabilities for class 1 in the test set"
            )
            plt.xlabel("Predicted probability of EP_success")
            plt.ylabel("Frequency")
            plt.legend(loc="best")
            plt.tight_layout()
            plt.savefig(os.path.join(directory,
                                     "hist_pred_proba_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_hist_pred_proba(self.y_pred_proba, self.output_dir)

        #######################################################################
        #
        #  plot precision-recall curve for class 1 samples in test set
        #
        #######################################################################

        precisions, recalls, thresholds = precision_recall_curve(
            self.y_test, self.y_pred_proba_ones)

        def plot_precision_recall_vs_threshold(precisions, recalls, thresholds,
                                               directory):
            '''Save precision and recall plotted against the threshold.'''
            datestring = self._timestamp()
            # the last precision/recall entry is dropped to match the length
            # of thresholds
            plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
            plt.plot(thresholds, recalls[:-1], "g--", label="Recall")
            plt.title("Precsion-Recall plot for classifier, test set, class 1")
            plt.xlabel("Threshold")
            plt.legend(loc="upper left")
            plt.ylim([0, 1])
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory, "Precision_Recall_class1_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_precision_recall_vs_threshold(precisions, recalls, thresholds,
                                           self.output_dir)

        #######################################################################
        #
        #  plot ROC curve, calculate AUC and explore thresholds for class 1
        #  samples in test set
        #
        #######################################################################

        # IMPORTANT: first argument is true values, second argument is
        # predicted probabilities; y_pred must not be used here because it
        # would give incorrect results without generating an error
        # roc_curve returns fpr (false positive rate), tpr (true positive
        # rate) and thresholds
        fpr_1, tpr_1, thresholds_1 = roc_curve(self.y_test,
                                               self.y_pred_proba_ones)
        AUC_test_class1 = round(
            roc_auc_score(self.y_test, self.y_pred_proba_ones), 4)
        logging.info(f"AUC score for class 1 in test set: {AUC_test_class1}")

        def plot_roc_curve(fpr, tpr, directory):
            '''Save the ROC curve annotated with the AUC value.'''
            datestring = self._timestamp()
            plt.plot(fpr, tpr, linewidth=2)
            plt.plot([0, 1], [0, 1], "k--")
            plt.axis([0, 1, 0, 1])
            plt.title("ROC curve for classifier, test set, class 1")
            plt.xlabel("False Positive Rate (1 - Specificity)")
            plt.ylabel("True Positive Rate (Sensitivity)")
            plt.grid(True)
            # f-string so the AUC value is shown instead of the placeholder
            plt.text(0.7, 0.1, f"AUC = {AUC_test_class1}")
            plt.tight_layout()
            plt.savefig(os.path.join(directory,
                                     "ROC_curve_class1_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_roc_curve(fpr_1, tpr_1, self.output_dir)

        def plot_roc_curve_skplot(y_test, y_proba, directory):
            '''Save the ROC curve drawn with the scikit-plot method.'''
            datestring = self._timestamp()
            skplt.metrics.plot_roc(y_test, y_proba, title="ROC curve")
            plt.tight_layout()
            plt.savefig(os.path.join(
                directory, "ROC_curve_skplt_class1_" + datestring + ".png"),
                        dpi=600)
            plt.close()

        plot_roc_curve_skplot(self.y_test, self.y_pred_proba, self.output_dir)

        def evaluate_threshold(tpr, fpr, thresholds, threshold):
            '''Log sensitivity and specificity at the given threshold.'''
            above = thresholds > threshold
            sensitivity = round(tpr[above][-1], 4)
            specificity = round(1 - fpr[above][-1], 4)

            logging.info(
                f"Sensitivity for class 1 at threshold {threshold}: {sensitivity}\n"
                f"Specificity for class 1 at threshold {threshold}: {specificity}"
            )

        for threshold in (0.7, 0.6, 0.5, 0.4, 0.3, 0.2):
            evaluate_threshold(tpr_1, fpr_1, thresholds_1, threshold)

        # Try to copy log file if it was created in training.log; this is
        # best effort and must never abort the analysis
        try:
            shutil.copy("training.log", self.output_dir)
        except FileNotFoundError:
            logging.warning("Could not find training.log to copy")
        except Exception:
            logging.warning("Could not copy training.log to output directory")
class adaBoost:
    """Train an AdaBoost classifier over decision trees and plot diagnostics.

    ``run()`` must be called before any of the ``plot*`` methods: it fits the
    classifier and caches the importances, ranking, scores and test-set
    outputs on the instance, and the plots read those cached attributes.
    """
    __all__=['run','plotFeatureRanking','plotScores']

    def __init__(self, foundVariables, trainingData, trainingClasses, trainingWeights, testingData, testingClasses, adaName, bkg_name):
        """Store the data sets and build the (unfitted) classifier.

        Keyword args:
        foundVariables -- The list of the names of found variables, can get using Sample_x.returnFoundVariables()
        trainingData -- The training data
        trainingClasses -- The training data classes
        trainingWeights -- per-sample weights handed to fit()
        testingData -- the testing data
        testingClasses -- the testing data classes
        adaName -- the name of the object (eg. sig+bkg_name)
        bkg_name -- stored on the instance; not used by the methods of this class
        """
        # NOTE(review): compute_importances follows the DecisionTreeClassifier
        # signature quoted below; confirm the installed scikit-learn accepts it.
        self.ada = AdaBoostClassifier(DecisionTreeClassifier(compute_importances=True,max_depth=4,min_samples_split=2,min_samples_leaf=100),n_estimators=400, learning_rate=0.5, algorithm="SAMME",compute_importances=True)
        #class sklearn.tree.DecisionTreeClassifier(criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_density=0.10000000000000001, max_features=None, compute_importances=False, random_state=None)
        self.foundVariables = foundVariables
        self.trainingData = trainingData
        self.trainingClasses = trainingClasses
        self.testingData = testingData
        self.testingClasses = testingClasses
        self.trainingWeights = trainingWeights
        self.name = adaName
        self.bkg_name = bkg_name
        self.elapsed = 0.0

    def returnName(self):
        """Return the name given at construction."""
        return self.name

    def run(self):
        """Run the fitting and testing, caching all results on the instance."""
        # start the fitting and time it
        start = clock()
        print('starting training on AdaBoostClassifier')
        self.ada.fit(self.trainingData, self.trainingClasses, self.trainingWeights)
        self.elapsed = clock() - start
        print('time taken for training: ' + str(self.elapsed))

        # list the importances of each variable in the bdt, get the score on the test data
        self.importancesada = self.ada.feature_importances_
        print('importances')
        print(self.importancesada)
        self.score = self.ada.score(self.testingData, self.testingClasses)
        self.params = self.ada.get_params()
        # spread of each feature's importance across the individual trees
        self.std_mat = np.std([tree.feature_importances_ for tree in self.ada.estimators_], axis=0)
        # feature indices ordered from most to least important
        self.indicesada = np.argsort(self.importancesada)[::-1]
        self.variableNamesSorted = [self.foundVariables[i] for i in self.indicesada]

        # Print the feature ranking: the top 12, or fewer when the data set
        # has fewer features (a fixed 12 used to raise IndexError there).
        print("Feature ranking:")
        for f in range(min(12, len(self.indicesada))):
            print("%d. feature %d (%f)" % (f + 1, self.indicesada[f], self.importancesada[self.indicesada[f]]) + " " + self.variableNamesSorted[f])

        self.twoclass_output = self.ada.decision_function(self.testingData)
        self.twoclass_output_train = self.ada.decision_function(self.trainingData)
        self.class_proba = self.ada.predict_proba(self.testingData)[:, -1]

    def plotFeatureRanking(self):
        """Show a bar chart of the ranked feature importances with error bars."""
        # We need this to run in batch because it complains about not being able to open display
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        # One tick per bar: tick positions and labels must have equal length.
        nFeatures = len(self.variableNamesSorted)
        positions = range(nFeatures)
        pl.figure()
        pl.title("Feature importances Ada")
        pl.bar(positions, self.importancesada[self.indicesada], color="r", yerr=self.std_mat[self.indicesada], align="center")
        pl.xticks(positions, self.variableNamesSorted)
        pl.xlim([-1, nFeatures])
        pl.show()

    def _rocCurves(self):
        """Compute the test-set ROC arrays shared by plotScores and plotROC.

        Returns four one-element lists (fpr, tpr, area, rejection); the
        list-of-folds layout is what callers exchange through ``rocInput``.
        """
        from sklearn.metrics import auc

        nTest = len(self.testingData)
        probas_ = self.ada.predict_proba(self.testingData[0:nTest])
        fpr, tpr, thresholds, rej = sc.roc_curve_rej(self.testingClasses[0:nTest], probas_[:, 1])
        # area under the (signal efficiency, background rejection) curve
        roc_auc = auc(tpr, rej)
        return [fpr], [tpr], [roc_auc], [rej]

    def plotScores(self, returnROC = False, rocInput = None):
        """Plot training points, class probabilities, decision scores and ROC.

        returnROC -- when true, return [fpr, tpr, area, rejection] lists
                     instead of showing the figure
        rocInput  -- optional lists in that same layout from another
                     instance, drawn as a second ROC curve
        """
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"
        class_names = "AB"

        # Plot the training points (first two columns of the training data)
        pl.subplot(131)
        for i, n, c in zip(range(2), class_names, plot_colors):
            idx = np.where(self.trainingClasses == i)
            pl.scatter(self.trainingData[idx, 0], self.trainingData[idx, 1], c=c, cmap=pl.cm.Paired, label="Class %s" % n)
        pl.axis("tight")
        pl.legend(loc='upper right')
        pl.xlabel("Decision Boundary")

        # Plot the class probabilities
        # NOTE(review): no subplot is selected here, so these histograms are
        # drawn into panel 131 on top of the scatter plot - confirm intended.
        for i, n, c in zip(range(2), class_names, plot_colors):
            pl.hist(self.class_proba[self.testingClasses == i], bins=50, range=(0, 1), facecolor=c, label='Class %s' % n)
        pl.legend(loc='upper center')
        pl.ylabel('Samples')
        pl.xlabel('Class Probability')

        # Plot the two-class decision scores/ bdt scores
        pl.subplot(133)
        for i, n, c in zip(range(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i], bins=50, range=(-1, 1), facecolor=c, label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('Two-class Decision Scores')
        pl.subplots_adjust(wspace=0.25)

        # ROC curve in the middle panel
        pl.subplot(132)
        fpr_arr, tpr_arr, roc_auc_arr, rej_arr = self._rocCurves()
        pl.plot(tpr_arr[0], rej_arr[0], lw=1, label='ROC fold %d (area = %0.2f)' % (0, roc_auc_arr[0]), color=plot_colors[0])
        if rocInput is not None and len(rocInput) > 0:
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1, label='ROC fold %d (area = %0.2f)' % (2, rocInput[2][0]), color=plot_colors[1])
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr]
        pl.show()

    def plotBDTScores(self):
        """Save normalised histograms of the test-set decision scores per class."""
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"
        alpha_h = [1.0, 0.7]
        class_names = ['Background', 'Signal']
        for i, n, c in zip(range(2), class_names, plot_colors):
            pl.hist(self.twoclass_output[self.testingClasses == i], bins=50, range=(-1, 1), facecolor=c, alpha=alpha_h[i], label='Class %s' % n, normed=True)
        pl.legend(loc='upper right')
        pl.ylabel('Samples')
        pl.xlabel('BDT Scores')
        pl.savefig('BDTScores'+self.name+'.png')

    def plotROC(self, returnROC = False, rocInput = None):
        """Plot signal efficiency against background rejection and save it.

        returnROC -- when true, return [fpr, tpr, area, rejection, names]
                     lists; this instance's own curve is then not drawn
        rocInput  -- optional lists in that same layout from another
                     instance, drawn as a second curve
        """
        from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
        import matplotlib.pyplot as plt
        import pylab as pl

        plot_colors = "rb"
        pl.xlabel("Signal Efficiency")
        pl.ylabel("Background Rejection")
        pl.title("ROC curves")

        fpr_arr, tpr_arr, roc_auc_arr, rej_arr = self._rocCurves()
        names = [self.name]

        if rocInput is not None and len(rocInput) > 0:
            label_bkg = rocInput[4][0]
            if '_A' in rocInput[4][0]:
                label_bkg = 'even event number'
            pl.plot(rocInput[1][0], rocInput[3][0], lw=1, label='ROC %s (area = %0.2f)' % (label_bkg, rocInput[2][0]), color=plot_colors[1])
        if not returnROC:
            label_bkg = self.name
            if '_B' in self.name:
                label_bkg = 'odd event number'
            pl.plot(tpr_arr[0], rej_arr[0], lw=1, label='ROC %s (area = %0.2f)' % (label_bkg, roc_auc_arr[0]), color=plot_colors[0])
        # NOTE(review): legend/savefig run even when returnROC is true -
        # confirm a figure file is wanted in that mode.
        pl.legend(loc='lower left')
        pl.savefig("roc_combined_"+self.name+".png")
        if returnROC:
            return [fpr_arr, tpr_arr, roc_auc_arr, rej_arr, names]
        pl.show()

    @staticmethod
    def _meshSamples(baseRow, xx, yy, idx1, idx2):
        """Build one sample per mesh point for evaluating the classifier.

        Every row starts as a copy of ``baseRow``; column ``idx1`` is replaced
        by the mesh x value and column ``idx2`` by the mesh y value.  Rows
        follow the row-major order of ``xx``/``yy``, so a prediction made on
        the result can be reshaped with ``.reshape(xx.shape)``.
        """
        samples = np.tile(np.asarray(baseRow, dtype=float), (xx.size, 1))
        samples[:, idx1] = xx.ravel()
        samples[:, idx2] = yy.ravel()
        return samples

    def plotDecisionBoundaries(self):
        """Draw the class-1 probability over the two most important features.

        All other features are held at their test-set mean.  Training and
        testing points are overlaid and the figure is saved and shown.
        """
        import numpy as np
        import pylab as pl
        from matplotlib.colors import ListedColormap

        cm = pl.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])

        # get most important variable indices
        idx1 = self.foundVariables.index(self.variableNamesSorted[0])
        idx2 = self.foundVariables.index(self.variableNamesSorted[1])

        # plotting window: union of the training and testing ranges plus a margin
        x_min, x_max = self.trainingData[:, idx1].min() - .1, self.trainingData[:, idx1].max() + .1
        y_min, y_max = self.trainingData[:, idx2].min() - .01, self.trainingData[:, idx2].max() + .01
        x_min2, x_max2 = self.testingData[:, idx1].min() - .1, self.testingData[:, idx1].max() + .1
        y_min2, y_max2 = self.testingData[:, idx2].min() - .01, self.testingData[:, idx2].max() + .01
        xmin = min(x_min, x_min2)
        xmax = max(x_max, x_max2)
        ymin = min(y_min, y_min2)
        ymax = max(y_max, y_max2)
        xx, yy = np.meshgrid(np.arange(xmin, xmax, float((xmax-xmin)/25.0)),
                             np.arange(ymin, ymax, float((ymax-ymin)/25.0)))

        # get mean values for other variables, then overwrite the two plotted
        # columns with the mesh coordinates.  The previous per-cell loop used
        # the row index (j+1)*(k+1)-1, which maps several cells to the same
        # row and never touches most rows.
        means = self._meshSamples(np.mean(self.testingData, axis=0), xx, yy, idx1, idx2)

        print('shape xx: ')
        print(xx.shape)
        print('shape yy: ')
        print(yy.shape)
        print('shape means: ')
        print(means.shape)

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        Z = self.ada.predict_proba(means)[:, 1]
        print('Z shape:')
        print(Z.shape)

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        ax = pl.axes()
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

        # Plot also the training points
        ax.scatter(self.trainingData[:, idx1], self.trainingData[:, idx2], c=self.trainingClasses[:], cmap=cm_bright)
        # and testing points
        ax.scatter(self.testingData[:, idx1], self.testingData[:, idx2], c=self.testingClasses[:], cmap=cm_bright, alpha=0.6)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title("adaBoost")
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % self.score).lstrip('0'), size=15, horizontalalignment='right')
        pl.savefig("adaBoostDecisionBoundaries"+self.name+".png")
        pl.show()
class AdaBoost(Classifier):
    r"""AdaBoost classifier wrapped for the Classifier interface.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        Y. Freund, R. Schapire, “A Decision-Theoretic Generalization of on-Line Learning and an Application to Boosting”, 1995.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'AdaBoost'

    def __init__(self, **kwargs):
        r"""Silence scikit-learn warnings and create the wrapped estimator.

        Keyword arguments are accepted but not used here.
        """
        silenced = (
            ChangedBehaviorWarning,
            ConvergenceWarning,
            DataConversionWarning,
            DataDimensionalityWarning,
            EfficiencyWarning,
            FitFailedWarning,
            NonBLASDotWarning,
            UndefinedMetricWarning,
        )
        for warning_type in silenced:
            warnings.filterwarnings(action='ignore', category=warning_type)

        # Search space for the tunable hyper-parameters.
        estimator_count = ParameterDefinition(MinMax(min=10, max=111), np.uint)
        boosting_variant = ParameterDefinition(['SAMME', 'SAMME.R'])
        self._params = {
            'n_estimators': estimator_count,
            'algorithm': boosting_variant,
        }
        self.__ada_boost = AdaBoostClassifier()

    def set_parameters(self, **kwargs):
        r"""Forward the given hyper-parameters to the wrapped estimator.
        """
        self.__ada_boost.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Train the wrapped estimator.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.
        """
        self.__ada_boost.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Predict a class for every sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        predicted = self.__ada_boost.predict(x)
        return predicted

    def to_string(self):
        r"""Build a user friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        template = Classifier.to_string(self)
        current = self.__ada_boost.get_params()
        rendered_args = self._parameters_to_string(current)
        return template.format(name=self.Name, args=rendered_args)
else: print( 'Low precision and recall, DecisionTree is not a good classifier with set parameters' ) print( '################### Try AdaBoostClassifier ###################################' ) # AdaBoostClassifier - 2 ab_clf_1 = AdaBoostClassifier() pipe = Pipeline([('feature_selection', SelectKBest(k=k)), ('classification', ab_clf_1)]) # Check the parameters that can be set for AdaBoostClassifier, and create a param_grid estimated = ab_clf_1.get_params().keys() print('param_keys########################', estimated) param_grid = {'classification__n_estimators': [10, 50, 100]} scorer = make_scorer(f1_score) ab_clf_1 = GridSearchCV(pipe, param_grid=param_grid, scoring=scorer) ab_clf_1.fit(X_training_features, y_train_poi) scores = sklearn.cross_validation.cross_val_score(ab_clf_1, features, labels) print(scores) print('AdaBoostClassifier mean score:', scores.mean()) clf_best = ab_clf_1.best_estimator_ y_poi_predicted = clf_best.predict(X_test_features)
index=["Predicted No", "Predicted Yes"], columns=["Actual No", "Actual Yes"]) plt.figure() sns.heatmap(df_cm, cmap="bwr", annot=True) print(classification_report(y_test, prediction_RF, target_names=["no", "yes"])) #AUC probs_RF = rf_random.predict_proba(x_test) preds_RF = probs_RF[:, 1] fprrfc, tprrfc, thresholdrfc = metrics.roc_curve(y_test, preds_RF) roc_aucrfc = metrics.auc(fprrfc, tprrfc) #%% 6.1.2. ADABOOST from sklearn.ensemble import AdaBoostClassifier from sklearn.tree import DecisionTreeClassifier ADA = AdaBoostClassifier() #explore the hyperparameters pprint(ADA.get_params()) #learning rate shrinks the contribution of each tree by learning_rate. learning_rate = [ round(float(x), 2) for x in np.linspace(start=0.2, stop=2, num=10) ] #algorithm =================================================================== # If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. # base_estimator must support calculation of class probabilities. # If ‘SAMME’ then use the SAMME discrete boosting algorithm. # The SAMME.R algorithm typically converges faster than SAMME, # achieving a lower test error with fewer boosting iterations. # ============================================================================= algorithm = ["SAMME", "SAMME.R"] n_estimators = [500] #The base estimator from which the boosted ensemble is built base_estimator = [
# Report the score of the LGB model on the validation split.
print(LGB.score(X_val, y_val))
from joblib import dump, load
# Persist the models to disk.
# NOTE(review): both dumps write to 'LGB.joblib', so the second call replaces
# the file written by the first - confirm whether LBM_1 should get its own file.
dump(LGB, 'LGB.joblib')
dump(LBM_1, 'LGB.joblib')
from sklearn.ensemble import AdaBoostClassifier
from pprint import pprint
# AdaBoost with all-default hyper-parameters: fit on the training split and
# print the test score followed by the parameters that were in effect.
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
print(ada.score(X_test, y_test))
print('Parameters currently in use:\n')
pprint(ada.get_params())
# XGBoost
# XGBoost and AdaBoost perform badly in multi-class classification. The reason can be found in the LightGBM literature review.
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score
# NOTE(review): this fits a regressor although the comment above talks about
# classification - confirm the estimator choice.
xgb = XGBRegressor()
xgb.fit(X_train, y_train)
# The score is computed but neither stored nor printed, so it is only visible
# in an interactive session.
xgb.score(X_test, y_test)
# Imports for the Keras experiments that follow.
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
# Cross-validated accuracy for the current setting `i`.
# `skf`, `clf`, `X`, `Y`, `i`, `start` and the open file `f` come from the
# enclosing code, which is not visible here.
# NOTE(review): despite the *_error names, both lists collect accuracy_score
# values (higher is better), not error rates.
test_error = []
train_error = []
for train_index, test_index in skf:
    print("for iteration {}".format(i))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    clf = clf.fit(X_train,y_train)
    # accuracy on the held-out fold
    y_pred = clf.predict(X_test)
    test_error.append(accuracy_score(y_pred,y_test))
    # accuracy on the fold the model was trained on
    y_pred = clf.predict(X_train)
    train_error.append(accuracy_score(y_pred,y_train))
print(clf.get_params())
# NOTE(review): time.clock() was removed in Python 3.8; `start` must be taken
# from the same clock, so any replacement has to change both places.
print('Time to fit the dataset of alpha = {} is {}'.format(i,time.clock()-start))
# y_pred = clf.predict(X)
# train_error = mean_absolute_error(y_pred,Y)
# y_pred = clf.predict(X_test)
# Collapse the per-fold lists to their means (the names are rebound from list
# to float here), then append one "i,train,test" CSV row and flush it.
test_error = sum(test_error)/len(test_error)
train_error = sum(train_error)/len(train_error)
f.write('{},{},{}\n'.format(i,train_error,test_error))
print('{},{},{}\n'.format(i,train_error,test_error))
f.flush()
class AdaBoost(Model):
    """AdaBoost regressor/classifier wrapper with persistence and metrics.

    ``type`` selects the estimator: ``'regressor'`` builds an
    ``AdaBoostRegressor``, anything else an ``AdaBoostClassifier``.  The
    metric helpers return a plain message string when they do not apply to
    the configured type.
    """

    # X represents the features, Y represents the labels
    X = None
    Y = None
    prediction = None  # kept for callers that read the old attribute name
    predictions = None  # result of the most recent predict() call
    model = None

    def __init__(self, X=None, Y=None, label_headers=None, n_estimators=100, type='regressor', cfg=False):
        """Create the estimator.

        X, Y          -- optional features / labels, stored for a later fit()
        label_headers -- label column names, stored on the instance
        n_estimators  -- number of boosting rounds passed to the estimator
        type          -- 'regressor' (default) or 'classifier'
        cfg           -- when true, save() also writes the estimator
                         parameters to 'adaboost_configs.txt'
        """
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y

        self.mapping_dict = None
        self.label_headers = label_headers
        self.type = type
        self.cfg = cfg

        if type == 'regressor':
            self.model = AdaBoostRegressor(n_estimators=n_estimators)
        else:
            self.model = AdaBoostClassifier(n_estimators=n_estimators)

    def fit(self, X=None, Y=None):
        """Fit on the given data, or on the data stored at construction.

        Returns the fitted estimator.
        """
        if X is not None:
            self.X = X
        if Y is not None:
            self.Y = Y

        if self.type == 'classifier':
            # Use the stored labels so that fit() without arguments maps the
            # labels supplied to __init__ instead of passing None.
            self.map_str_to_number(self.Y)

        print('AdaBoost Train started............')
        self.model.fit(self.X, self.Y)
        print('AdaBoost Train completed..........')

        return self.model

    def predict(self, test_features):
        """Predict for ``test_features``; the result is also kept on ``self.predictions``."""
        print('Prediction started............')
        self.predictions = self.model.predict(test_features)
        print('Prediction completed..........')
        return self.predictions

    def save(self, filename='adaboost_model.pkl'):
        """Pickle the estimator to ``filename`` (plus its parameters as JSON when ``cfg`` is set)."""
        if self.cfg:
            # NOTE(review): json.dumps raises TypeError if get_params()
            # contains a non-JSON value such as an estimator object.
            with open('adaboost_configs.txt', 'w') as config_file:
                config_file.write(json.dumps(self.model.get_params()))
        # Context managers close the handles even if serialisation fails.
        with open(filename, 'wb') as model_file:
            pickle.dump(self.model, model_file)

    def featureImportance(self):
        """Return the fitted estimator's ``feature_importances_`` array."""
        return self.model.feature_importances_

    def getAccuracy(self, test_labels, predictions, origin=0, hitmissr=0.8):
        """Return the fraction of predictions counted as correct.

        Classifier: a prediction is correct when it equals the label.
        Regressor: it is correct when ``1 - |pred - label| / |pred|`` is at
        least ``hitmissr``; a prediction of exactly zero counts only when the
        label is zero too, since the ratio is undefined there.
        Returns 0.0 when there are no predictions.  ``origin`` is unused.
        """
        predicted = pd.DataFrame(data=predictions.flatten()).values
        actual = test_labels.values
        total = len(predicted)
        if total == 0:
            return 0.0

        correct = 0
        if self.type == 'classifier':
            for i, value in enumerate(predicted):
                if value == actual[i]:
                    correct = correct + 1
        else:
            for i, value in enumerate(predicted):
                if abs(value) == 0:
                    # avoid the 0/0 and x/0 cases of the relative error
                    hit = value == actual[i]
                else:
                    hit = 1 - abs(value - actual[i]) / abs(value) >= hitmissr
                if hit:
                    correct = correct + 1
        return float(correct) / total

    def getConfusionMatrix(self, test_labels, predictions, label_headers):
        """Plot one normalised confusion matrix per label header (classifier only).

        Returns None for a classifier and a message string for a regressor.
        """
        if self.type != 'classifier':
            return 'No Confusion Matrix for Regression'

        df = pd.DataFrame(data=predictions.flatten())
        # NOTE(review): df has a single column, so only the first label header
        # can be selected positionally - confirm multi-label use is expected.
        for index, label_header in enumerate(label_headers):
            classes = test_labels[label_header].unique()
            title = 'Normalized confusion matrix for AdaBoost (' + label_header + ')'
            # .iloc replaces the positional use of .ix, which pandas removed
            self.plot_confusion_matrix(test_labels.iloc[:, index], df.iloc[:, index], classes=classes, normalize=True, title=title)

    def getRSquare(self, test_labels, predictions, mode='single'):
        """Return R^2 (variance-weighted when ``mode == 'multiple'``); regressor only."""
        if self.type != 'regressor':
            return 'No RSquare for Classification'
        df = pd.DataFrame(data=predictions.flatten())
        if mode == 'multiple':
            return r2_score(test_labels, df, multioutput='variance_weighted')
        return r2_score(test_labels, df)

    def getMSE(self, test_labels, predictions):
        """Return the mean squared error; regressor only."""
        if self.type != 'regressor':
            return 'No MSE for Classification'
        df = pd.DataFrame(data=predictions.flatten())
        return mean_squared_error(test_labels, df)

    def getMAPE(self, test_labels, predictions):
        """Return the mean absolute percentage error; regressor only."""
        if self.type != 'regressor':
            return 'No MAPE for Classification'
        df = pd.DataFrame(data=predictions.flatten())
        errors = np.mean(np.abs((test_labels - df.values) / test_labels)) * 100
        return errors.values[0]

    def getRMSE(self, test_labels, predictions):
        """Return the root mean squared error; regressor only."""
        if self.type != 'regressor':
            return 'No RMSE for Classification'
        df = pd.DataFrame(data=predictions.flatten())
        return sqrt(mean_squared_error(test_labels, df))
# n_estimators=50, # learning_rate=1.5, # algorithm="SAMME") # bdt_discrete.fit(X_train, Y1_train) print('finished fit') print(confusion_table(Y1_test, bdt_real.predict(X_test))) prob = bdt_real.predict_proba(X_test) pred = Y1_test.tolist() table = pd.DataFrame(prob) table = pd.concat([table, pd.DataFrame(pred)], 1) table.columns = [-1, 0, 1, 'true'] table.sort(1, ascending=False)[:1000] bdt_real.get_params() # draw trees in every round_count = 0 for trees in bdt_real.estimators_: dot_data = tree.export_graphviz(trees, out_file=None, filled=True, rounded=True, special_characters=True, feature_names=X_train.columns) graph = pydotplus.graph_from_dot_data(dot_data) graph.write_pdf(data_output_dir + "tree_round_" + str(round_count) + ".pdf") round_count += 1 real_test_errors = [] # discrete_test_errors = [] # for real_test_predict, discrete_train_predict in zip( # bdt_real.staged_predict(X_test), bdt_discrete.staged_predict(X_test)):
class _AdaBoostClassifierImpl:
    """Wrapper that builds and drives an ``SKLModel`` AdaBoost classifier.

    ``self._hyperparams["base_estimator"]`` always holds the operator the
    caller passed in; the ``_FitSpecProxy`` around it exists only inside the
    wrapped model.
    """

    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        algorithm="SAMME.R",
        random_state=None,
    ):
        """Create the wrapped model from the given hyper-parameters."""
        if base_estimator is None:
            estimator_impl = None
        else:
            estimator_impl = _FitSpecProxy(base_estimator)

        self._hyperparams = {
            "base_estimator": estimator_impl,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "algorithm": algorithm,
            "random_state": random_state,
        }
        self._wrapped_model = SKLModel(**self._hyperparams)
        # Remember the caller's operator rather than the proxy.
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        """Return the wrapped model's parameters with the original base estimator."""
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        For DataFrame input with a base estimator set, the wrapped model is
        rebuilt so that the base estimator is preceded by a step turning its
        input back into a DataFrame with ``X``'s columns.
        """
        base_estimator = self._hyperparams["base_estimator"]
        # Without a base estimator there is nothing to chain the transformer to.
        if isinstance(X, pd.DataFrame) and base_estimator is not None:
            columns = X.columns  # capture the labels only, not the whole frame
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=columns),
                inverse_func=None,
                check_inverse=False,
            )
            # Build the proxied pipeline in a copy: writing it back into
            # self._hyperparams made every refit wrap the previous wrapper
            # and made get_params() return the proxy.
            fit_hyperparams = dict(self._hyperparams)
            fit_hyperparams["base_estimator"] = _FitSpecProxy(
                feature_transformer >> base_estimator)
            self._wrapped_model = SKLModel(**fit_hyperparams)

        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        return self._wrapped_model.predict_proba(X)

    def predict_log_proba(self, X):
        """Delegate to the wrapped model's ``predict_log_proba``."""
        return self._wrapped_model.predict_log_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's ``decision_function``."""
        return self._wrapped_model.decision_function(X)

    def score(self, X, y, sample_weight=None):
        """Delegate to the wrapped model's ``score``."""
        return self._wrapped_model.score(X, y, sample_weight)
def main():
    """Compare plain and ensemble classifiers on the chess, iris and letter data.

    For each dataset an MLP, a decision tree and a k-NN model are fitted,
    followed by bagging, AdaBoost and random-forest ensembles.  Every model's
    test-split predictions are passed to ``display_similarity``; the AdaBoost
    runs also print the estimator parameters.
    """

    def load(splitter):
        # preprocess, then train, test, and split; four parts are required
        fit_x, eval_x, fit_y, eval_y = splitter()
        return fit_x, eval_x, fit_y, eval_y

    def assess(model, split, title):
        # fit on the training part, report predictions for the test part
        fit_x, eval_x, fit_y, eval_y = split
        model.fit(fit_x, fit_y)
        display_similarity(model.predict(eval_x), eval_y, title)
        return model

    chess = load(tts_chess_numeric)
    iris = load(tts_iris_numeric)
    letter = load(tts_letter_numeric)

    # For each dataset
    ## Try at least 3 different "regular" learning algorithms and note the results.

    ### DS1 - chess
    print("")
    assess(MLPClassifier(solver='adam', alpha=1e-5,
                         hidden_layer_sizes=(40, 30), random_state=1),
           chess, "Chess - Neural Network")
    assess(DecisionTreeClassifier(random_state=0), chess,
           "Chess - Decision Tree")
    assess(KNeighborsClassifier(n_neighbors=7), chess, "Chess - KNN")

    ### DS2 - iris
    print("")
    assess(MLPClassifier(solver='adam', alpha=1e-5,
                         hidden_layer_sizes=(10, 7), random_state=1),
           iris, "Iris - Neural Network")
    # A grid search over the iris MLP was tried and left disabled:
    # clf_iris_num_MLP_gs = MLPClassifier()
    # iris_param_grid = [
    #     {
    #         'activation' : ['identity', 'logistic', 'tanh', 'relu'],
    #         'solver' : ['lbfgs', 'sgd', 'adam'],
    #         'hidden_layer_sizes': [
    #          (9,1),(9,2),(9,3),(9,4),(9,5),(9,6),(9,7),(9,8),(9,10),(9,11),(9,12),
    #          (10,1),(10,2),(10,3),(10,4),(10,5),(10,6),(10,7),(10,8),(10,10),(10,11),(10,12),
    #          (11,1),(11,2),(11,3),(11,4),(11,5),(11,6),(11,7),(11,8),(11,10),(11,11),(11,12)
    #          ]
    #     }
    # ]
    # grid_clf = GridSearchCV(clf_iris_num_MLP_gs, iris_param_grid, cv=3,
    #                         scoring='accuracy')
    # grid_clf.fit(iris_num_datatrain, iris_num_targettrain)
    # print("the best parameters out of those chosen are: ")
    # print(grid_clf.best_params_)
    assess(DecisionTreeClassifier(), iris, "Iris - Decision Tree")
    assess(KNeighborsClassifier(n_neighbors=3), iris, "Iris - KNN")

    ### DS3
    print("")
    assess(MLPClassifier(solver='adam', alpha=1e-5,
                         hidden_layer_sizes=(40, 30), random_state=1),
           letter, "Letter - Neural Network")
    assess(DecisionTreeClassifier(), letter, "Letter - Decision Tree")
    assess(KNeighborsClassifier(n_neighbors=3), letter, "Letter - KNN")

    print("")
    ## Use Bagging and note the results. (Play around with a few different options)
    assess(BaggingClassifier(bootstrap=True, n_estimators=20), chess,
           "BAGGING - Chess")
    assess(BaggingClassifier(bootstrap=True), iris, "BAGGING - Iris")
    assess(BaggingClassifier(bootstrap=True, n_estimators=20), letter,
           "BAGGING - Letter")

    print("")
    ## Use AdaBoost and note the results. (Play around with a few different options)
    boosted = assess(AdaBoostClassifier(), chess, "ADABOOST - Chess")
    print(boosted.get_params())
    boosted = assess(AdaBoostClassifier(learning_rate=0.3), iris,
                     "ADABOOST - Iris")
    print(boosted.get_params())
    boosted = assess(AdaBoostClassifier(n_estimators=200), letter,
                     "ADABOOST - Letter")
    print(boosted.get_params())

    print("")
    ## Use a random forest and note the results. (Play around with a few different options)
    assess(RandomForestClassifier(criterion='entropy', bootstrap=False,
                                  n_estimators=30),
           chess, "RANDOM FOREST - Chess")
    assess(RandomForestClassifier(), iris, "RANDOM FOREST - Iris")
    assess(RandomForestClassifier(bootstrap=False), letter,
           "RANDOM FOREST - Letter")
score_grid[clf] = [accuracy, precision, recall, f1, f2] ### print out AdaBoost parameters tuning result ############################################################### print "\nAdaBoost Tuning:" print """Note: if 'min_samples_split' and 'max_featues' are None, base_estimator = None. Else, base_estimator = DecisionTreeClassifier(min_samples_split = min_samples_split, max_features = max_features)""" print "\n" print "{:^17}{:^16}{:^16}{:^16}{:^11}{:^11}{:^11}{:^11}{:^11}"\ .format("", "n_estimator","min_samples_split","max_features","accuracy", "precision", "recall", "f1", "f2") scoring_methods = ["accuracy", "precision", "recall", "f1", "f2"] for sm in scoring_methods: clf = find_best(sm, score_grid) ne = clf.get_params().get("n_estimators") msp = clf.get_params().get('base_estimator__min_samples_split') mf = clf.get_params().get('base_estimator__max_features') accuracy, precision, recall, f1, f2 = score_grid[clf] print "{:^17}{:^16}{:^16}{:^16}{:^11.3f}{:^11.3f}{:^11.3f}{:^11.3f}{:^11.3f}"\ .format("best_"+sm, ne, msp, mf, accuracy, precision, recall, f1, f2) ### try PCA, the best estimator above(based on f1 score) and sf3 to do classification ############################# clf_f1 = find_best("f1", score_grid) sf_in_use = sfs_dict["sf3"] print "\nPCA Analysis:\n" print "clf:", clf_f1, "\n" print "{:^15}{:^12}{:^12}{:^12}{:^12}{:^12}".format("n_components", "accuracy", "precision", "recall", "f1", "f2")
# Accuracy of Extratrees classifier on test set: 0.8295

#******************************************************************************

#******************************************************************************
# *** Applying Machine Learning Technique #6 ***
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over depth-3 decision trees, five boosting rounds.
Adab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=5)

from pprint import pprint
# Look at parameters used by the AdaBoost model
print('Parameters currently in use:\n')
pprint(Adab.get_params())

Adab.fit(X_train, y_train)
score_ABC = Adab.score(X_test, y_test)
# The message used to say "Extratrees" (copied from the previous technique)
# although the score belongs to the AdaBoost model fitted above.
print('Accuracy of AdaBoost classifier on test set: %0.04f' % (score_ABC))
# Accuracy of AdaBoost classifier on test set: 0.8224

#******************************************************************************

#******************************************************************************
# *** Applying Machine Learning Technique #7 ***
from sklearn.ensemble import RandomForestClassifier
Rando = RandomForestClassifier(n_estimators=5)
class Algorithm(object):
    """Build one of several scikit-learn classifiers from a name and a dict.

    Supported ``algorithm`` names are ``'decision tree'``,
    ``'support vector machine'``, ``'random forest'`` and ``'adaboost'``.
    With an empty ``parameters`` dict each branch uses its hard-coded
    defaults; otherwise the keys listed in that branch's comment are read
    from ``parameters``. For any other algorithm name no ``classifier``
    attribute is set.

    NOTE(review): ``max_features='auto'`` and the ``base_estimator`` keyword
    are deprecated or removed in recent scikit-learn releases -- confirm
    against the version this project pins.
    """

    # Initialize the chosen algorithm with parameters set by user
    def __init__(self, algorithm='decision tree', parameters=None):
        """Create the classifier selected by ``algorithm``.

        Args:
            algorithm: String naming the classifier to build.
            parameters: Dict ``{paramName: paramValue}``; ``None`` or an
                empty dict selects the built-in defaults.

        Raises:
            KeyError: if a non-empty ``parameters`` lacks a required key.
            ValueError: if ``parameters['base_estimator']`` names an
                unsupported base estimator for ``'adaboost'``.
        """
        # None replaces the former shared ``{}`` default so that no dict
        # object is shared between calls.
        if parameters is None:
            parameters = {}
        use_defaults = parameters == {}

        self.algorithm = algorithm
        if self.algorithm == 'decision tree':
            # needed parameter: max_depth
            if use_defaults:
                self.classifier = DecisionTreeClassifier(max_depth=2)
            else:
                self.classifier = DecisionTreeClassifier(
                    max_depth=parameters['max_depth'])
        elif self.algorithm == 'support vector machine':
            # needed parameters: penalty, tol, C, class_weight
            if use_defaults:
                self.classifier = LinearSVC(dual=False, penalty='l1',
                                            tol=0.00001, C=1.0,
                                            class_weight=None)
            else:
                self.classifier = LinearSVC(
                    dual=False,
                    penalty=parameters['penalty'],
                    tol=parameters['tol'],
                    C=parameters['C'],
                    class_weight=parameters['class_weight'])
        elif self.algorithm == 'random forest':
            # needed parameters: n_estimators, max_features,
            # min_samples_leaf, max_depth
            if use_defaults:
                self.classifier = RandomForestClassifier(
                    n_estimators=20, max_features='auto',
                    min_samples_leaf=3, max_depth=3)
            else:
                self.classifier = RandomForestClassifier(
                    n_estimators=parameters['n_estimators'],
                    max_features=parameters['max_features'],
                    min_samples_leaf=parameters['min_samples_leaf'],
                    max_depth=parameters['max_depth'])
        elif self.algorithm == 'adaboost':
            # needed parameters: base_estimator, n_estimators,
            # learning_rate, algorithm.
            print('adaboost start')
            if use_defaults:
                self.classifier = AdaBoostClassifier(
                    base_estimator=DecisionTreeClassifier(max_depth=2),
                    n_estimators=60, learning_rate=1, algorithm='SAMME')
            else:
                base_name = parameters['base_estimator']
                print('cls1 ' + base_name)
                if base_name == 'decision tree':
                    base_estimator = DecisionTreeClassifier(max_depth=2)
                elif base_name == 'support vector machine':
                    base_estimator = LinearSVC(dual=False, penalty='l1',
                                               tol=0.00001, C=1.0,
                                               class_weight=None)
                elif base_name == 'random forest':
                    base_estimator = RandomForestClassifier(
                        n_estimators=20, max_features='auto',
                        min_samples_leaf=3, max_depth=3)
                else:
                    print('cls2 ' + base_name)
                    # Previously execution fell through and crashed on an
                    # unbound local; fail with an explicit error instead.
                    raise ValueError(
                        'unsupported base_estimator for adaboost: %r'
                        % (base_name,))
                self.classifier = AdaBoostClassifier(
                    base_estimator=base_estimator,
                    n_estimators=parameters['n_estimators'],
                    learning_rate=parameters['learning_rate'],
                    algorithm=parameters['algorithm'])

    def getAlgorithm(self):
        """Return the algorithm name given at construction."""
        return self.algorithm

    def getParameters(self):
        """Return ``get_params()`` of the wrapped classifier."""
        return self.classifier.get_params()

    def fit(self, X, Y):
        """Fit the wrapped classifier on ``X`` and ``Y``."""
        self.classifier.fit(X, Y)

    def predict(self, X):
        """Return the wrapped classifier's predictions for ``X``."""
        return self.classifier.predict(X)

    def aggregation(self, Y, maxGap):
        """Fill short gaps between positive entries of ``Y`` in place.

        Scans the first ``len(Y) - maxGap`` entries. Each time an entry
        greater than 0.5 is found, the entries strictly between it and the
        previously found position are set to 1, provided the distance
        between the two positions is above 1.5 and below ``maxGap``. The
        "previous position" starts at index 0 regardless of ``Y[0]``.

        NOTE(review): the nesting below is reconstructed from a flattened
        source and the method may continue past the visible chunk (e.g. a
        ``return``) -- confirm against the original file.
        """
        loc1_1 = 0
        for i in range(len(Y) - maxGap):
            if Y[i] > 0.5:
                loc1_2 = loc1_1
                loc1_1 = i
                if 1.5 < loc1_1 - loc1_2 < maxGap:
                    # `fill_idx` instead of `iter`, which shadowed the builtin.
                    for fill_idx in range(loc1_2 + 1, loc1_1):
                        Y[fill_idx] = 1