def reportBestResult():
    """Print the best-scoring parameter and its cross-validation statistics.

    For each candidate parameter value the stored AUC history is loaded from
    ``results/ada_auc_<value>.bin`` and averaged; the value with the highest
    mean AUC is printed.  The confusion-matrix history stored for that value
    (``results/rfc_folds_confusion_<value>.bin``) is then loaded and printed,
    its mean and standard deviation along axis 2 are printed, and the mean
    matrix is handed to ``compute_performance_metrics``.  Finally the mean
    accuracy and mean recall of every candidate are collected.
    """
    candidates = [16, 32, 64, 128, 256, 512]

    # Pick the candidate with the highest mean AUC.  The comparison is
    # strict, so the first candidate wins when two means are equal.
    best_auc = 0
    best_param = 0
    for c in candidates:
        auc_history = load_np_array("results/ada_auc_" + str(c) + ".bin")
        mean_auc = auc_history.mean()
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_param = c
    print(best_param)

    # Report the confusion matrices of the *selected* parameter.  Using the
    # loop variable here would always read the file of the last candidate,
    # regardless of which one scored best.
    # NOTE(review): the AUC files use the "ada_" prefix while this file uses
    # "rfc_folds_" -- confirm both belong to the same experiment.
    confusion_matrix_history = load_np_array(
        "results/rfc_folds_confusion_" + str(best_param) + ".bin")
    print(confusion_matrix_history)

    # Reducing over axis 2 yields one mean / std per matrix cell.
    # Assumes the history is shaped (2, 2, n_folds) -- TODO confirm.
    mean_cm = np.mean(confusion_matrix_history, axis=2)
    std_cm = np.std(confusion_matrix_history, axis=2)
    print(mean_cm)
    print(std_cm)

    # The return value is discarded here.
    compute_performance_metrics(mean_cm)

    # NOTE(review): these per-candidate means are not consumed in the part
    # of the function visible here.
    acc_mean = []
    recall_mean = []
    for c in candidates:
        accuracy_history = load_np_array(
            "results/ada_accuracy_" + str(c) + ".bin")
        recall_history = load_np_array(
            "results/ada_recall_" + str(c) + ".bin")
        acc_mean.append(accuracy_history.mean())
        recall_mean.append(recall_history.mean())
# Train classifier.
# Runs one cross-validation pass for the current parameter value `c`: each
# fold is oversampled, fitted, scored, and its metrics appended to the running
# histories, which are written to disk after every fold.
# NOTE(review): the line structure was reconstructed from a collapsed line.
# The save calls and the plot are placed inside the fold loop because the plot
# label uses the per-fold index `i` -- confirm against the original layout.
print("\nTraining classifier param %d" % c)
for i, (train, test) in enumerate(cvs):
    # Oversample only the training part of the fold.
    sm = OverSampler(verbose=False, ratio=2.5)
    train_oversampled_x, train_oversampled_train_y = sm.fit_transform(
        homesite.train_x[train],
        homesite.train_y[train],
    )

    fitted = clf.fit(train_oversampled_x, train_oversampled_train_y)
    probas_ = fitted.predict_proba(homesite.train_x[test])

    fold_truth = homesite.train_y[test]
    positive_scores = probas_[:, 1]

    # ROC curve of this fold, accumulated onto the shared FPR grid.
    fpr, tpr, thresholds = roc_curve(fold_truth, positive_scores)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = compute_auc(fold_truth, positive_scores)

    # Rounded probabilities serve as hard labels for the confusion matrix,
    # which is stacked onto the history along the third axis.
    fold_cm = confusion_matrix(fold_truth, np.round(probas_)[:, 1])
    confusion_matrix_history = np.dstack((confusion_matrix_history, fold_cm))

    accuracy, precision, recall = compute_performance_metrics(fold_cm)
    mean_acc += accuracy
    mean_recall += recall
    mean_precision += precision

    accuracy_history.append(accuracy)
    precision_history.append(precision)
    recall_history.append(recall)
    auc_history.append(roc_auc)

    # Persist every history under a file name carrying the parameter value.
    suffix = str(c) + ".bin"
    save_np_array("../../results/random_forests/rf_accuracy_" + suffix,
                  np.array(accuracy_history))
    save_np_array("../../results/random_forests/rf_precision_" + suffix,
                  np.array(precision_history))
    save_np_array("../../results/random_forests/rf_recall_" + suffix,
                  np.array(recall_history))
    save_np_array("../../results/random_forests/rf_auc_" + suffix,
                  np.array(auc_history))
    save_np_array("../../results/random_forests/rf_confusion_matrix_" + suffix,
                  np.array(confusion_matrix_history))

    plt.plot(fpr, tpr, lw=1,
             label='ROC fold %d (area = %0.2f)' % (i, roc_auc))
''' oversampled_path = "resources/oversampled_normalized_data_ratio_2.bin" homesite = Data() homesite.load_sliptted_data(oversampled_path) del homesite.test_x # Deleted to save memory. print homesite.train_x.shape # Creating classifier. # clf = DecisionTreeClassifier() clf = RandomForestClassifier(max_features=100) # clf = AdaBoostClassifier(n_estimators = 10) # clf = svm.SVC(gamma = 0.00005) # clf = RandomForestClassifier() # clf = MultiplePLS(n_classifiers = 10, n_samples = 5000, n_positive_samples = 2500, threshold = 0.9, acc = 0.999) # clf = svm.LinearSVC() # Train classifier. print "Training classifier." clf.fit(homesite.train_x, homesite.train_y) # Test classifier. print 'Testing classifier.' predicted_labels = clf.predict_proba(homesite.validation_x)[:, 1] # Show final results. results = confusion_matrix(homesite.validation_y, np.round(predicted_labels)) accuracy, precision, recall = compute_performance_metrics(results) auc = compute_auc(homesite.validation_y, predicted_labels)