# Third-party imports used throughout this module. The project-local
# helpers (process_data, train_and_test_filter, train_and_test_partition,
# create_cv_folds, the logistic/Fisher fitting and prediction routines,
# quadratic_feature_mapping, cross_entropy_error, ...) are assumed to be
# provided by the project's own modules.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier


def logistic_fit_and_analysis_no_folds(inputs, targets, lambda_val, threshold):
    """Fit regularised logistic regression on a single 80/20 split and
    return the validation accuracy and cross-entropy error."""
    N = inputs.shape[0]
    training_filter, validation_filter = train_and_test_filter(
        N, test_fraction=0.20)
    training_data, training_targets, validation_data, validation_targets = train_and_test_partition(
        inputs, targets, training_filter, validation_filter)
    log_reg_weights = robust_logistic_regression_fit(
        training_data, training_targets, lambda_val, threshold=threshold)
    predicted = logistic_regression_predict(validation_data, log_reg_weights)
    predict_probs = logistic_regression_prediction_probs(
        validation_data, log_reg_weights)
    # Accuracy: fraction of validation points predicted correctly.
    accuracy = 0.0
    for i in range(len(validation_targets)):
        accuracy += (predicted[i] == validation_targets[i]
                     ).astype(int) / len(validation_targets)
        # Nudge probabilities of exactly 1 so that cross_entropy_error
        # never takes the log of 0.
        if predict_probs[i] == 1:
            predict_probs[i] -= 0.0001
    entropy = cross_entropy_error(validation_targets, predict_probs)
    return accuracy, entropy
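
# The epsilon nudge above only guards probabilities of exactly 1; a
# probability of exactly 0 would also make the (1 - t) * log(1 - p) term
# blow up. A minimal sketch of a symmetrically clipped cross-entropy
# (illustrative helper name, assuming targets in {0, 1} and the same
# mean-error convention as cross_entropy_error):
def clipped_cross_entropy(targets, probs, eps=1e-4):
    """Mean binary cross-entropy with probabilities clipped away from
    0 and 1, so neither log term can ever see a zero argument."""
    p = np.clip(probs, eps, 1 - eps)
    return -np.mean(targets * np.log(p) + (1 - targets) * np.log(1 - p))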
def Finding_most_optimum_k(inputs, targets, num_knn, cv_folds, num_folds):
    # Arrays of zeros to store the mean accuracy for k at index k - 1.
    mean_validation_accuracy = np.zeros(num_knn - 1)
    mean_train_accuracy = np.zeros(num_knn - 1)
    for f in range(num_folds):
        fold_filters = cv_folds[f]
        training_filter = fold_filters[0]
        validation_filter = fold_filters[1]
        training_data, training_targets, validation_data, validation_targets = train_and_test_partition(
            inputs, targets, training_filter, validation_filter)
        neighbors = np.arange(1, num_knn)
        train_accuracy = np.empty(len(neighbors))
        test_accuracy = np.empty(len(neighbors))
        # Loop over different values of k.
        for i, k in enumerate(neighbors):
            # Set up a k-NN classifier with k neighbours and fit it to
            # the training data.
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(training_data, training_targets)
            # Accuracy on the training and validation sets.
            train_accuracy[i] = knn.score(training_data, training_targets)
            test_accuracy[i] = knn.score(validation_data, validation_targets)
            mean_validation_accuracy[i] += test_accuracy[i] / num_folds
            mean_train_accuracy[i] += train_accuracy[i] / num_folds
    # Choose k by the mean validation accuracy across all folds, not by
    # the final fold alone.
    optimumKNN = int(np.argmax(mean_validation_accuracy)) + 1
    return optimumKNN, neighbors, mean_validation_accuracy, mean_train_accuracy
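
# For comparison only: scikit-learn's built-in cross-validation computes a
# mean validation accuracy per k much like the loop above. Its folds are
# partitioned internally rather than via create_cv_folds, so the numbers
# are comparable but not bit-identical. A minimal sketch (illustrative
# function name):
from sklearn.model_selection import cross_val_score

def mean_cv_accuracy_per_k(inputs, targets, num_knn, num_folds=5):
    """Mean cross-validated accuracy for each k in 1..num_knn-1."""
    ks = np.arange(1, num_knn)
    scores = np.array([
        cross_val_score(KNeighborsClassifier(n_neighbors=k),
                        inputs, targets, cv=num_folds).mean()
        for k in ks
    ])
    return ks, scores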
def logistic_fit_and_analysis(inputs, targets, cv_folds, num_folds,
                              lambda_val, threshold):
    """Fit regularised logistic regression on each cross-validation fold
    and return the mean validation accuracy and cross-entropy error."""
    accuracy_array = np.zeros(num_folds)
    entropy_array = np.zeros(num_folds)
    for f in range(num_folds):
        fold_filters = cv_folds[f]
        training_filter = fold_filters[0]
        validation_filter = fold_filters[1]
        training_data, training_targets, validation_data, validation_targets = train_and_test_partition(
            inputs, targets, training_filter, validation_filter)
        log_reg_weights = robust_logistic_regression_fit(
            training_data, training_targets, lambda_val, threshold=threshold)
        predicted = logistic_regression_predict(validation_data,
                                                log_reg_weights)
        predict_probs = logistic_regression_prediction_probs(
            validation_data, log_reg_weights)
        # Accuracy: fraction of validation points predicted correctly.
        for i in range(len(validation_targets)):
            accuracy_array[f] += (predicted[i] == validation_targets[i]
                                  ).astype(int) / len(validation_targets)
            # Nudge probabilities of exactly 1 so that cross_entropy_error
            # never takes the log of 0.
            if predict_probs[i] == 1:
                predict_probs[i] -= 0.0001
        entropy_array[f] = cross_entropy_error(validation_targets,
                                               predict_probs)
    accuracy = accuracy_array.mean()
    entropy = entropy_array.mean()
    return accuracy, entropy
def cross_validation_decision_boundary_fishers(inputs, targets, cv_folds,
                                               num_folds, decision_boundaries,
                                               num_decision_boundaries,
                                               robust=0):
    """Sweep candidate decision boundaries over Fisher-projected data,
    cross-validated, and return the per-boundary accuracies together with
    the boundary that maximises mean validation accuracy."""
    train_accuracy_array = np.zeros(num_decision_boundaries)
    test_accuracy_array = np.zeros(num_decision_boundaries)
    for f in range(num_folds):
        fold_filters = cv_folds[f]
        training_filter = fold_filters[0]
        validation_filter = fold_filters[1]
        training_data, training_targets, validation_data, validation_targets = train_and_test_partition(
            inputs, targets, training_filter, validation_filter)
        if robust == 0:
            fisher_weights = fisher_linear_discriminant_projection(
                training_data, training_targets)
        elif robust == 1:
            fisher_weights = robust_fisher_linear_discriminant_projection(
                training_data, training_targets, 1e-6)
        projected_inputs_train = project_data(training_data, fisher_weights)
        projected_inputs_test = project_data(validation_data, fisher_weights)
        # Sort the projected points and reorder the targets to match.
        new_ordering_train = np.argsort(projected_inputs_train)
        new_ordering_test = np.argsort(projected_inputs_test)
        projected_inputs_train = projected_inputs_train[new_ordering_train]
        projected_inputs_test = projected_inputs_test[new_ordering_test]
        training_targets = np.copy(training_targets[new_ordering_train])
        validation_targets = np.copy(validation_targets[new_ordering_test])
        predicted_train = np.empty(len(projected_inputs_train))
        predicted_test = np.empty(len(projected_inputs_test))
        for j in range(len(decision_boundaries)):
            train_accuracy_temp = 0.0
            test_accuracy_temp = 0.0
            for i in range(len(training_targets)):
                predicted_train[i] = (projected_inputs_train[i] >
                                      decision_boundaries[j]).astype(int)
                train_accuracy_temp += (
                    predicted_train[i] == training_targets[i]
                ).astype(int) / len(training_targets)
            for t in range(len(validation_targets)):
                predicted_test[t] = (projected_inputs_test[t] >
                                     decision_boundaries[j]).astype(int)
                test_accuracy_temp += (
                    predicted_test[t] == validation_targets[t]
                ).astype(int) / len(validation_targets)
            test_accuracy_array[j] += test_accuracy_temp / num_folds
            train_accuracy_array[j] += train_accuracy_temp / num_folds
    best_boundary = decision_boundaries[int(np.argmax(test_accuracy_array))]
    return train_accuracy_array, test_accuracy_array, best_boundary
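
# The inner threshold sweep above can be expressed with one broadcast
# comparison instead of two Python loops. A vectorised sketch of the same
# computation (illustrative helper name; assumes a 1-D array of projected
# values and 0/1 targets):
def threshold_accuracies(projected, targets, boundaries):
    """Accuracy of the rule (projected > boundary) for every boundary.

    preds has shape (num_boundaries, N): one row of 0/1 predictions per
    candidate boundary; averaging the matches along axis 1 gives the
    per-boundary accuracy.
    """
    preds = (projected[None, :] > boundaries[:, None]).astype(int)
    return (preds == targets[None, :]).mean(axis=1)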
def main(dataset):
    inputs, targets, label = process_data(dataset)
    N = inputs.shape[0]  # total number of data points
    num_knn = 30  # number of nearest neighbours + 1
    num_folds = 5  # number of folds

    # Partitioning train and test data.
    train_filter, test_filter = train_and_test_filter(N, test_fraction=0.20)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)
    cv_folds = create_cv_folds(N, num_folds)

    # Cross-validation to obtain the best parameter for kNN.
    optimumKNN, neighbors, mean_validation_accuracy, train_accuracy = Finding_most_optimum_k(
        inputs, targets, num_knn, cv_folds, num_folds)
    print("The best mean validation score = {} with number of nearest "
          "neighbours = {}".format(mean_validation_accuracy[optimumKNN - 1],
                                   optimumKNN))

    # Model fitting with the optimum number of neighbours.
    knn, fitting_accuracy, prediction_accuracy = fitting_best_k(
        optimumKNN, train_inputs, train_targets, test_inputs, test_targets)

    # Preparation for the ROC curve.
    fpr, tpr, AUC = ROC_values_and_AUC(test_inputs, test_targets, 100, knn)

    # Quadratic expansion.
    quadratic_inputs = quadratic_feature_mapping(inputs)
    quadratic_train_inputs = quadratic_feature_mapping(train_inputs)
    quadratic_test_inputs = quadratic_feature_mapping(test_inputs)
    quadratic_optimumKNN, neighbors, quadratic_mean_validation_accuracy, quadratic_train_accuracy = Finding_most_optimum_k(
        quadratic_inputs, targets, num_knn, cv_folds, num_folds)
    print("The best mean validation score = {} with number of nearest "
          "neighbours = {}".format(
              quadratic_mean_validation_accuracy[quadratic_optimumKNN - 1],
              quadratic_optimumKNN))

    # Model fitting with the optimum number of neighbours.
    quadratic_knn, quadratic_fitting_accuracy, quadratic_prediction_accuracy = fitting_best_k(
        quadratic_optimumKNN, quadratic_train_inputs, train_targets,
        quadratic_test_inputs, test_targets)

    # Preparation for the ROC curve.
    quad_fpr, quad_tpr, quad_AUC = ROC_values_and_AUC(
        quadratic_test_inputs, test_targets, 100, quadratic_knn)

    # Plotting testing and training accuracy against the number of
    # neighbours to see if the model overfits.
    fig = plt.figure()
    fig.suptitle("Accuracy for different K")
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.plot(neighbors, mean_validation_accuracy, label='Testing Accuracy')
    ax1.plot(neighbors, train_accuracy, label='Training Accuracy')
    ax1.set_xlabel('Number of Neighbors')
    ax1.set_ylabel('Accuracy')
    ax1.plot(neighbors, quadratic_mean_validation_accuracy,
             label='Quadratic Testing Accuracy')
    ax1.plot(neighbors, quadratic_train_accuracy,
             label='Quadratic Training Accuracy')
    ax1.legend()

    # Plotting the ROC curve.
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.plot(fpr, tpr, '-', color="b", label='Normal AUC = %0.4f' % AUC)
    ax2.plot(quad_fpr, quad_tpr, '-', color="r",
             label='Quadratic AUC = %0.4f' % quad_AUC)
    ax2.legend(loc='lower right')
    ax2.plot([0, 1], [0, 1], linestyle='--')
    ax2.set_xlabel("False Positive Rate")
    ax2.set_ylabel("True Positive Rate")
    ax2.set_aspect('equal', 'box')
    ax2.set_xlim([-0.01, 1.01])
    ax2.set_ylim([-0.01, 1.01])
    ax2.set_xticks([0, 0.5, 1])
    ax2.set_yticks([0, 0.5, 1])
    plt.tight_layout()

    # Constructing the confusion matrix.
    y_pred = knn.predict(test_inputs)
    y_actual = test_targets
    confusion_matrix = pd.crosstab(y_pred, y_actual).T.to_numpy()
    confusion_matrix_df = pd.DataFrame(confusion_matrix,
                                       index=['Negative', 'Positive'],
                                       columns=['Negative', 'Positive'])
    fig3, ax3 = plt.subplots(figsize=(5, 5))
    fig3.suptitle("Normal Confusion Matrix")
    sns.heatmap(confusion_matrix_df, annot=True, linewidths=0.3,
                linecolor="White", cbar=False, fmt=".0f", ax=ax3,
                cmap="Blues")
    ax3.set_xlabel("Predicted class")
    ax3.set_ylabel("Actual class")

    # Calculating performance of the model.
    TN = confusion_matrix[0, 0]
    FN = confusion_matrix[1, 0]
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    Precision = TP / (TP + FP)
    Sensitivity = TP / (TP + FN)
    Specificity = TN / (TN + FP)
    F1Score = 2 * ((Precision * Sensitivity) / (Precision + Sensitivity))
    print('Precision = ', Precision)
    print('Sensitivity = ', Sensitivity)
    print('Specificity = ', Specificity)
    print('F1 Score = ', F1Score)

    y_pred_quadratic = quadratic_knn.predict(quadratic_test_inputs)
    quadratic_confusion_matrix = pd.crosstab(y_pred_quadratic,
                                             y_actual).T.to_numpy()
    quadratic_confusion_matrix_df = pd.DataFrame(
        quadratic_confusion_matrix,
        index=['Negative', 'Positive'],
        columns=['Negative', 'Positive'])
    fig5, ax5 = plt.subplots(figsize=(5, 5))
    fig5.suptitle("Quadratic Confusion Matrix")
    sns.heatmap(quadratic_confusion_matrix_df, annot=True, linewidths=0.3,
                linecolor="White", cbar=False, fmt=".0f", ax=ax5,
                cmap="Blues")
    ax5.set_xlabel("Predicted class")
    ax5.set_ylabel("Actual class")

    # Calculating performance of the quadratic model.
    TN_quadratic = quadratic_confusion_matrix[0, 0]
    FN_quadratic = quadratic_confusion_matrix[1, 0]
    TP_quadratic = quadratic_confusion_matrix[1, 1]
    FP_quadratic = quadratic_confusion_matrix[0, 1]
    Precision_quadratic = TP_quadratic / (TP_quadratic + FP_quadratic)
    Sensitivity_quadratic = TP_quadratic / (TP_quadratic + FN_quadratic)
    Specificity_quadratic = TN_quadratic / (TN_quadratic + FP_quadratic)
    F1Score_quadratic = 2 * ((Precision_quadratic * Sensitivity_quadratic) /
                             (Precision_quadratic + Sensitivity_quadratic))
    print('Precision = ', Precision_quadratic)
    print('Sensitivity = ', Sensitivity_quadratic)
    print('Specificity = ', Specificity_quadratic)
    print('F1 Score = ', F1Score_quadratic)

    # Creating a figure with all the numbers.
    fig4 = plt.figure()
    ax4 = fig4.add_subplot(2, 2, 1)
    ax4.text(0, 1.0, 'Results', fontsize=12, fontweight='bold')
    ax4.text(0, 0.7,
             'Accuracy with no basis functions = {} with best k = {}'.format(
                 round(mean_validation_accuracy[optimumKNN - 1], 4),
                 optimumKNN))
    ax4.text(0, 0.6,
             'Accuracy with quadratic basis functions = {} with best k = {}'
             .format(
                 round(quadratic_mean_validation_accuracy[
                     quadratic_optimumKNN - 1], 4),
                 quadratic_optimumKNN))
    ax4.text(0, 0.4, 'Area Under Curve with no basis functions = {}'.format(
        round(AUC, 4)))
    ax4.text(0, 0.3,
             'Area Under Curve with quadratic basis functions = {}'.format(
                 round(quad_AUC, 4)))
    ax4.text(0, 0.1, 'Precision = {}'.format(round(Precision, 4)))
    ax4.text(0, 0, 'Sensitivity = {}'.format(round(Sensitivity, 4)))
    ax4.text(0, -0.1, 'Specificity = {}'.format(round(Specificity, 4)))
    ax4.text(0, -0.2, 'F1 Score = {}'.format(round(F1Score, 4)))
    ax4.text(0, -0.3, 'Quadratic Precision = {}'.format(
        round(Precision_quadratic, 4)))
    ax4.text(0, -0.4, 'Quadratic Sensitivity = {}'.format(
        round(Sensitivity_quadratic, 4)))
    ax4.text(0, -0.5, 'Quadratic Specificity = {}'.format(
        round(Specificity_quadratic, 4)))
    ax4.text(0, -0.6, 'Quadratic F1 Score = {}'.format(
        round(F1Score_quadratic, 4)))
    ax4.axis('off')

    # Showing all plots.
    plt.show()
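
# Hypothetical entry point for the kNN script above (each main in this
# section belongs to its own script). The default CSV path is an
# assumption for illustration, not a file shipped with the project.
if __name__ == '__main__':
    import sys
    main(sys.argv[1] if len(sys.argv) > 1 else 'abalone.csv')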
def main(dataset):
    num_folds = 5  # number of folds
    inputs, targets, label = process_data(dataset)
    N = inputs.shape[0]  # total number of data points
    if 'titanic' in dataset.lower():
        name = 'Titanic'
        num_decision_boundaries_normal = 50
        decision_boundaries_normal = np.linspace(
            -1, 1, num_decision_boundaries_normal)
    else:
        name = 'Abalone'
        num_decision_boundaries_normal = 30
        decision_boundaries_normal = np.linspace(
            0.1, 0.3, num_decision_boundaries_normal)

    # Partitioning train and test data.
    train_filter, test_filter = train_and_test_filter(N, test_fraction=0.3)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)
    cv_folds = create_cv_folds(N, num_folds)

    # Cross-validation.
    train_accuracy_array, test_accuracy_array, decision_boundary = cross_validation_decision_boundary_fishers(
        inputs, targets, cv_folds, num_folds, decision_boundaries_normal,
        num_decision_boundaries_normal, 0)
    weights = fisher_linear_discriminant_projection(train_inputs,
                                                    train_targets)
    predicted = predict(test_inputs, weights, decision_boundary)

    # Cross-validation for the quadratic expansion.
    pol2_inputs = quadratic_feature_mapping(inputs)
    pol2_train_inputs = quadratic_feature_mapping(train_inputs)
    pol2_test_inputs = quadratic_feature_mapping(test_inputs)
    num_decision_boundaries_robust = 30
    decision_boundaries_robust = np.linspace(-0.1, 0.1,
                                             num_decision_boundaries_robust)
    quadratic_train_accuracy_array, quadratic_test_accuracy_array, quadratic_decision_boundary = cross_validation_decision_boundary_fishers(
        pol2_inputs, targets, cv_folds, num_folds, decision_boundaries_robust,
        num_decision_boundaries_robust, 1)
    quadratic_weights = robust_fisher_linear_discriminant_projection(
        pol2_train_inputs, train_targets, 1e-6)
    quadratic_predicted = predict(pol2_test_inputs, quadratic_weights,
                                  quadratic_decision_boundary)

    # Preparation for the AUC plot.
    false_positive_rates, true_positive_rates, AUC = ROC_values_and_AUC(
        train_inputs, train_targets, test_inputs, test_targets, 0)
    pol2_false_positive_rates, pol2_true_positive_rates, pol2_AUC = ROC_values_and_AUC(
        pol2_train_inputs, train_targets, pol2_test_inputs, test_targets, 1)

    # Plotting testing and training accuracy against the decision boundary.
    # Normal data.
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(1, 1, 1)
    ax1.set_title('Fisher, changing decision boundaries, {}'.format(name))
    ax1.plot(-decision_boundaries_normal, test_accuracy_array,
             label='Testing Accuracy')
    ax1.plot(-decision_boundaries_normal, train_accuracy_array,
             label='Training Accuracy')
    ax1.legend()
    ax1.set_xlabel('Decision Boundary')
    ax1.set_ylabel('Accuracy')

    # Transformed data (quadratic).
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.set_title(
        'Fisher, changing decision boundaries, Quadratic, {}'.format(name))
    ax2.plot(decision_boundaries_robust, quadratic_test_accuracy_array,
             label='Testing Accuracy')
    ax2.plot(decision_boundaries_robust, quadratic_train_accuracy_array,
             label='Training Accuracy')
    ax2.legend()
    ax2.set_xlabel('Decision Boundary')
    ax2.set_ylabel('Accuracy')

    # Plotting the ROC curve.
    fig3 = plt.figure(figsize=(6, 6))
    ax3 = fig3.add_subplot(1, 1, 1)
    ax3.plot(false_positive_rates, true_positive_rates, '-', color="b",
             label='AUC normal = %0.2f' % AUC)
    ax3.plot(pol2_false_positive_rates, pol2_true_positive_rates, '-',
             color="r", label='AUC quadratic = %0.2f' % pol2_AUC)
    ax3.legend(loc='lower right')
    ax3.set_xlabel("False Positive Rate")
    ax3.set_ylabel("True Positive Rate")
    ax3.set_aspect('equal', 'box')
    ax3.plot([0, 1], [0, 1], linestyle='--')
    ax3.set_xlim([-0.01, 1.01])
    ax3.set_ylim([-0.01, 1.01])
    ax3.set_xticks([0, 0.5, 1])
    ax3.set_yticks([0, 0.5, 1])
    plt.tight_layout()
    print("The AUC with no basis function = ", AUC)
    print("The AUC with quadratic expansion = ", pol2_AUC)

    # Constructing the confusion matrix.
    y_pred = predicted
    y_actual = test_targets
    try:
        confusion_matrix = pd.crosstab(y_pred, y_actual).T.to_numpy()
        confusion_matrix_df = pd.DataFrame(confusion_matrix,
                                           index=['Negative', 'Positive'],
                                           columns=['Negative', 'Positive'])
    except Exception:
        print('Sorry, Pandas is acting weird (trust me), please run the '
              'program again.')
        exit(0)
    fig_cm, ax_cm = plt.subplots(figsize=(5, 5))
    fig_cm.suptitle("Normal Confusion Matrix")
    sns.heatmap(confusion_matrix_df, annot=True, linewidths=0.3,
                linecolor="White", cbar=False, fmt=".0f", ax=ax_cm,
                cmap="Blues")
    ax_cm.set_xlabel("Predicted class")
    ax_cm.set_ylabel("Actual class")

    # Calculating performance of the model.
    TN = confusion_matrix[0, 0]
    FN = confusion_matrix[1, 0]
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    Precision = TP / (TP + FP)
    Sensitivity = TP / (TP + FN)
    Specificity = TN / (TN + FP)
    F1Score = 2 * ((Precision * Sensitivity) / (Precision + Sensitivity))
    print('Precision = ', Precision)
    print('Sensitivity = ', Sensitivity)
    print('Specificity = ', Specificity)
    print('F1 Score = ', F1Score)

    y_pred_quadratic = quadratic_predicted
    quadratic_confusion_matrix = pd.crosstab(y_pred_quadratic,
                                             y_actual).T.to_numpy()
    quadratic_confusion_matrix_df = pd.DataFrame(
        quadratic_confusion_matrix,
        index=['Negative', 'Positive'],
        columns=['Negative', 'Positive'])
    fig5, ax5 = plt.subplots(figsize=(5, 5))
    fig5.suptitle("Quadratic Confusion Matrix")
    sns.heatmap(quadratic_confusion_matrix_df, annot=True, linewidths=0.3,
                linecolor="White", cbar=False, fmt=".0f", ax=ax5,
                cmap="Blues")
    ax5.set_xlabel("Predicted class")
    ax5.set_ylabel("Actual class")

    # Calculating performance of the quadratic model.
    TN_quadratic = quadratic_confusion_matrix[0, 0]
    FN_quadratic = quadratic_confusion_matrix[1, 0]
    TP_quadratic = quadratic_confusion_matrix[1, 1]
    FP_quadratic = quadratic_confusion_matrix[0, 1]
    Precision_quadratic = TP_quadratic / (TP_quadratic + FP_quadratic)
    Sensitivity_quadratic = TP_quadratic / (TP_quadratic + FN_quadratic)
    Specificity_quadratic = TN_quadratic / (TN_quadratic + FP_quadratic)
    F1Score_quadratic = 2 * ((Precision_quadratic * Sensitivity_quadratic) /
                             (Precision_quadratic + Sensitivity_quadratic))
    print('Precision = ', Precision_quadratic)
    print('Sensitivity = ', Sensitivity_quadratic)
    print('Specificity = ', Specificity_quadratic)
    print('F1 Score = ', F1Score_quadratic)

    # Creating a figure with the results.
    fig4 = plt.figure()
    ax4 = fig4.add_subplot(2, 2, 1)
    ax4.text(0, 0.7, 'Results', fontsize=12, fontweight='bold')
    ax4.text(0, 0.4, 'Area Under Curve with no basis functions = {}'.format(
        round(AUC, 4)))
    ax4.text(0, 0.3,
             'Area Under Curve with quadratic basis functions = {}'.format(
                 round(pol2_AUC, 4)))
    ax4.text(0, 0.1, 'Precision = {}'.format(round(Precision, 4)))
    ax4.text(0, 0, 'Sensitivity = {}'.format(round(Sensitivity, 4)))
    ax4.text(0, -0.1, 'Specificity = {}'.format(round(Specificity, 4)))
    ax4.text(0, -0.2, 'F1 Score = {}'.format(round(F1Score, 4)))
    ax4.text(0, -0.3, 'Quadratic Precision = {}'.format(
        round(Precision_quadratic, 4)))
    ax4.text(0, -0.4, 'Quadratic Sensitivity = {}'.format(
        round(Sensitivity_quadratic, 4)))
    ax4.text(0, -0.5, 'Quadratic Specificity = {}'.format(
        round(Specificity_quadratic, 4)))
    ax4.text(0, -0.6, 'Quadratic F1 Score = {}'.format(
        round(F1Score_quadratic, 4)))
    ax4.axis('off')

    # Showing all plots.
    plt.show()
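
# The precision/sensitivity/specificity/F1 block recurs in every main in
# this section; a small helper could factor it out. A minimal sketch
# (illustrative name; assumes 0/1 class labels so the crosstab rows and
# columns come out ordered Negative then Positive, matching the indexing
# convention used above):
def binary_metrics(y_pred, y_actual):
    """Precision, sensitivity, specificity and F1 score from a 2x2
    crosstab with rows = actual class and columns = predicted class."""
    cm = pd.crosstab(y_pred, y_actual).T.to_numpy()
    tn, fp = cm[0, 0], cm[0, 1]
    fn, tp = cm[1, 0], cm[1, 1]
    precision = tp / (tp + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = 2 * precision * sensitivity / (precision + sensitivity)
    return precision, sensitivity, specificity, f1
# Usage sketch: Precision, Sensitivity, Specificity, F1Score = \
#     binary_metrics(y_pred, y_actual)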
def main(ifname, input_cols=None, target_col=None, classes=None):
    """
    Imports the titanic data-set and plots ROC curves for each model,
    with and without a quadratic basis expansion.

    parameters
    ----------
    ifname -- filename/path of data file
    input_cols -- list of column names for the input data
    target_col -- column name of the target data
    classes -- list of the classes to plot
    """
    inputs, targets, field_names, classes = import_for_classification(
        ifname, input_cols=input_cols, target_col=target_col, classes=classes)
    N = inputs.shape[0]
    test_fraction = 0.2
    train_filter, test_filter = train_and_test_filter(N, test_fraction)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)

    # Without basis functions.
    print("WITHOUT BASIS FUNCTIONS")
    fig, ax = fit_and_plot_roc_logistic(train_inputs, train_targets,
                                        test_inputs, test_targets,
                                        fig_ax=None, colour='r',
                                        type='training')
    fit_and_plot_roc_generative(train_inputs, train_targets, test_inputs,
                                test_targets, fig_ax=(fig, ax), colour='b',
                                type='training')
    fit_and_plot_roc_fisher(train_inputs, train_targets, test_inputs,
                            test_targets, fig_ax=(fig, ax), colour='y',
                            type='training')
    ax.legend([
        "Logistic regression", "Shared covariance model",
        "Fisher's linear discriminant"
    ])
    fig.savefig('train_no_bf_roc')
    fig1, ax1 = fit_and_plot_roc_logistic(train_inputs, train_targets,
                                          test_inputs, test_targets,
                                          fig_ax=None, colour='r',
                                          type='testing')
    fit_and_plot_roc_generative(train_inputs, train_targets, test_inputs,
                                test_targets, fig_ax=(fig1, ax1), colour='b',
                                type='testing')
    fit_and_plot_roc_fisher(train_inputs, train_targets, test_inputs,
                            test_targets, fig_ax=(fig1, ax1), colour='y',
                            type='testing')
    ax1.legend([
        "Logistic regression", "Shared covariance model",
        "Fisher's linear discriminant"
    ])
    fig1.savefig('test_no_bf_roc')

    # With a quadratic basis function.
    print("WITH QUADRATIC BASIS FUNCTIONS")
    train_designmtx = quadratic_feature_mapping(train_inputs)
    test_designmtx = quadratic_feature_mapping(test_inputs)
    # train_designmtx = np.delete(train_designmtx, np.where(~train_designmtx.any(axis=0))[0], axis=1)
    # test_designmtx = np.delete(test_designmtx, np.where(~test_designmtx.any(axis=0))[0], axis=1)
    fig2, ax2 = fit_and_plot_roc_generative(train_designmtx, train_targets,
                                            test_designmtx, test_targets,
                                            fig_ax=None, colour='b',
                                            type='training')
    fit_and_plot_roc_fisher(train_designmtx, train_targets, test_designmtx,
                            test_targets, fig_ax=(fig2, ax2), colour='y',
                            type='training')
    ax2.legend(["Shared covariance model", "Fisher's linear discriminant"])
    fig2.savefig('train_quadratic_roc')
    fig3, ax3 = fit_and_plot_roc_generative(train_designmtx, train_targets,
                                            test_designmtx, test_targets,
                                            fig_ax=None, colour='b',
                                            type='testing')
    fit_and_plot_roc_fisher(train_designmtx, train_targets, test_designmtx,
                            test_targets, fig_ax=(fig3, ax3), colour='y',
                            type='testing')
    ax3.legend(["Shared covariance model", "Fisher's linear discriminant"])
    fig3.savefig('test_quadratic_roc')
    plt.show()
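
# Hypothetical invocation of the ROC-plotting script above. The column
# names follow the standard Titanic CSV but are assumptions for
# illustration, not values the project pins down.
if __name__ == '__main__':
    main('titanic.csv', input_cols=['Age', 'Fare'], target_col='Survived')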
def main(dataset):
    inputs, targets, label = process_data(dataset)
    quadratic_inputs = quadratic_feature_mapping(inputs)
    N = inputs.shape[0]  # total number of data points
    num_folds = 5  # number of folds

    # Fold filters for cross-validation.
    cv_folds = create_cv_folds(N, num_folds)

    # Cross-validation, skipped for the Titanic data.
    if 'titanic' in dataset.lower():
        print("No cross-validation is done on the Titanic data because its "
              "run time exceeds 45 minutes")
        print("Please wait...")
        normal_test_accuracy, normal_entropy = logistic_fit_and_analysis_no_folds(
            inputs, targets, 1e-5, 1e-3)
        quadratic_test_accuracy, quadratic_entropy = logistic_fit_and_analysis_no_folds(
            quadratic_inputs, targets, 1e-5, 1e-3)
    else:
        print("Cross-validation is running")
        print("Please wait...")
        normal_test_accuracy, normal_entropy = logistic_fit_and_analysis(
            inputs, targets, cv_folds, num_folds, 1e-6, 1e-6)
        quadratic_test_accuracy, quadratic_entropy = logistic_fit_and_analysis(
            quadratic_inputs, targets, cv_folds, num_folds, 1e-6, 1e-6)
    print("The accuracy = {} and cross entropy error = {} for no basis "
          "functions".format(normal_test_accuracy, normal_entropy))
    print("The accuracy = {} and cross entropy error = {} for quadratic "
          "basis functions".format(quadratic_test_accuracy,
                                   quadratic_entropy))

    train_filter, test_filter = train_and_test_filter(N, test_fraction=0.20)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)
    quadratic_train_inputs = quadratic_feature_mapping(train_inputs)
    quadratic_test_inputs = quadratic_feature_mapping(test_inputs)
    weights = robust_logistic_regression_fit(train_inputs, train_targets,
                                             1e-5, 1e-3)
    quadratic_weights = robust_logistic_regression_fit(quadratic_train_inputs,
                                                       train_targets, 1e-5,
                                                       1e-3)
    num_points = 500
    print("No Basis Function ROC")
    fpr, tpr, AUC = ROC_values_and_AUC(test_inputs, test_targets, weights,
                                       num_points)
    print("Quadratic Expansion Basis Function ROC")
    quad_fpr, quad_tpr, quad_AUC = ROC_values_and_AUC(
        quadratic_test_inputs, test_targets, quadratic_weights, num_points)

    # Plotting the ROC curve.
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.plot(fpr, tpr, '-', color="b", label='Normal AUC = %0.4f' % AUC)
    ax2.plot(quad_fpr, quad_tpr, '-', color="r",
             label='Quadratic AUC = %0.4f' % quad_AUC)
    ax2.legend(loc='lower right')
    ax2.plot([0, 1], [0, 1], linestyle='--')
    ax2.set_xlabel("False Positive Rate")
    ax2.set_ylabel("True Positive Rate")
    ax2.set_aspect('equal', 'box')
    ax2.set_xlim([-0.01, 1.01])
    ax2.set_ylim([-0.01, 1.01])
    ax2.set_xticks([0, 0.5, 1])
    ax2.set_yticks([0, 0.5, 1])
    plt.tight_layout()

    # Constructing the confusion matrix.
    y_pred = logistic_regression_predict(test_inputs, weights)
    y_actual = test_targets
    confusion_matrix = pd.crosstab(y_pred, y_actual).T.to_numpy()
    confusion_matrix_df = pd.DataFrame(confusion_matrix,
                                       index=['Negative', 'Positive'],
                                       columns=['Negative', 'Positive'])
    fig3, ax3 = plt.subplots(figsize=(5, 5))
    fig3.suptitle("Normal Confusion Matrix")
    sns.heatmap(confusion_matrix_df, annot=True, linewidths=0.3,
                linecolor="White", cbar=False, fmt=".0f", ax=ax3,
                cmap="Blues")
    ax3.set_xlabel("Predicted class")
    ax3.set_ylabel("Actual class")

    # Calculating performance of the model.
    TN = confusion_matrix[0, 0]
    FN = confusion_matrix[1, 0]
    TP = confusion_matrix[1, 1]
    FP = confusion_matrix[0, 1]
    Precision = TP / (TP + FP)
    Sensitivity = TP / (TP + FN)
    Specificity = TN / (TN + FP)
    F1Score = 2 * ((Precision * Sensitivity) / (Precision + Sensitivity))
    print('Precision = ', Precision)
    print('Sensitivity = ', Sensitivity)
    print('Specificity = ', Specificity)
    print('F1 Score = ', F1Score)

    y_pred_quadratic = logistic_regression_predict(quadratic_test_inputs,
                                                   quadratic_weights)
    quadratic_confusion_matrix = pd.crosstab(y_pred_quadratic,
                                             y_actual).T.to_numpy()
    quadratic_confusion_matrix_df = pd.DataFrame(
        quadratic_confusion_matrix,
        index=['Negative', 'Positive'],
        columns=['Negative', 'Positive'])
    fig5, ax5 = plt.subplots(figsize=(5, 5))
    fig5.suptitle("Quadratic Confusion Matrix")
    sns.heatmap(quadratic_confusion_matrix_df, annot=True, linewidths=0.3,
                linecolor="White", cbar=False, fmt=".0f", ax=ax5,
                cmap="Blues")
    ax5.set_xlabel("Predicted class")
    ax5.set_ylabel("Actual class")

    # Calculating performance of the quadratic model.
    TN_quadratic = quadratic_confusion_matrix[0, 0]
    FN_quadratic = quadratic_confusion_matrix[1, 0]
    TP_quadratic = quadratic_confusion_matrix[1, 1]
    FP_quadratic = quadratic_confusion_matrix[0, 1]
    Precision_quadratic = TP_quadratic / (TP_quadratic + FP_quadratic)
    Sensitivity_quadratic = TP_quadratic / (TP_quadratic + FN_quadratic)
    Specificity_quadratic = TN_quadratic / (TN_quadratic + FP_quadratic)
    F1Score_quadratic = 2 * ((Precision_quadratic * Sensitivity_quadratic) /
                             (Precision_quadratic + Sensitivity_quadratic))
    print('Precision = ', Precision_quadratic)
    print('Sensitivity = ', Sensitivity_quadratic)
    print('Specificity = ', Specificity_quadratic)
    print('F1 Score = ', F1Score_quadratic)

    # Creating a figure with the results.
    fig4 = plt.figure()
    ax4 = fig4.add_subplot(2, 2, 1)
    ax4.text(0, 1.2, 'Results', fontsize=12, fontweight='bold')
    if 'titanic' in dataset.lower():
        ax4.text(0, 1.0, 'Accuracy with no basis functions = {}'.format(
            round(normal_test_accuracy, 4)))
        ax4.text(0, 0.9,
                 'Accuracy with quadratic basis functions = {}'.format(
                     round(quadratic_test_accuracy, 4)))
    else:
        ax4.text(0, 1.0, 'Mean accuracy with no basis functions = {}'.format(
            round(normal_test_accuracy, 4)))
        ax4.text(0, 0.9,
                 'Mean accuracy with quadratic basis functions = {}'.format(
                     round(quadratic_test_accuracy, 4)))
    ax4.text(0, 0.7, 'Cross Entropy Error with no basis functions = {}'.format(
        round(normal_entropy, 4)))
    ax4.text(0, 0.6,
             'Cross Entropy Error with quadratic basis functions = {}'.format(
                 round(quadratic_entropy, 4)))
    ax4.text(0, 0.4, 'Area Under Curve with no basis functions = {}'.format(
        round(AUC, 4)))
    ax4.text(0, 0.3,
             'Area Under Curve with quadratic basis functions = {}'.format(
                 round(quad_AUC, 4)))
    ax4.text(0, 0.1, 'Precision = {}'.format(round(Precision, 4)))
    ax4.text(0, 0, 'Sensitivity = {}'.format(round(Sensitivity, 4)))
    ax4.text(0, -0.1, 'Specificity = {}'.format(round(Specificity, 4)))
    ax4.text(0, -0.2, 'F1 Score = {}'.format(round(F1Score, 4)))
    ax4.text(0, -0.3, 'Quadratic Precision = {}'.format(
        round(Precision_quadratic, 4)))
    ax4.text(0, -0.4, 'Quadratic Sensitivity = {}'.format(
        round(Sensitivity_quadratic, 4)))
    ax4.text(0, -0.5, 'Quadratic Specificity = {}'.format(
        round(Specificity_quadratic, 4)))
    ax4.text(0, -0.6, 'Quadratic F1 Score = {}'.format(
        round(F1Score_quadratic, 4)))
    ax4.axis('off')

    # Showing all plots.
    plt.show()
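
# Optional cross-check of the hand-rolled ROC_values_and_AUC using
# scikit-learn; a sketch assuming the scores passed in are the predicted
# probabilities of the positive class (as returned by
# logistic_regression_prediction_probs).
from sklearn.metrics import auc, roc_curve

def sklearn_roc(y_true, scores):
    """False/true positive rates and AUC computed by scikit-learn."""
    fpr, tpr, _ = roc_curve(y_true, scores)
    return fpr, tpr, auc(fpr, tpr)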
def main(ifname, input_cols=None, target_col=None, classes=None):
    """
    Imports the titanic data-set and plots training and testing accuracy
    for each model, with and without a quadratic basis expansion.

    parameters
    ----------
    ifname -- filename/path of data file
    input_cols -- list of column names for the input data
    target_col -- column name of the target data
    classes -- list of the classes to plot
    """
    inputs, targets, field_names, classes = import_for_classification(
        ifname, input_cols=input_cols, target_col=target_col, classes=classes)
    N = inputs.shape[0]
    test_fraction = 0.2
    train_filter, test_filter = train_and_test_filter(N, test_fraction)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)

    # Without basis functions.
    fig0, ax0 = fit_and_plot_accuracy_logistic(train_inputs, train_targets,
                                               test_inputs, test_targets,
                                               fig_ax=None, colour='r',
                                               type='training')
    fit_and_plot_accuracy_logistic(train_inputs, train_targets, test_inputs,
                                   test_targets, fig_ax=(fig0, ax0),
                                   colour='b', type='testing')
    ax0.legend(["Training", "Testing"])
    fig0.savefig('logistic_no_bf_accuracy.png')
    fig1, ax1 = fit_and_plot_accuracy_generative(train_inputs, train_targets,
                                                 test_inputs, test_targets,
                                                 fig_ax=None, colour='r',
                                                 type='training')
    fit_and_plot_accuracy_generative(train_inputs, train_targets, test_inputs,
                                     test_targets, fig_ax=(fig1, ax1),
                                     colour='b', type='testing')
    ax1.legend(["Training", "Testing"])
    fig1.savefig('generative_no_bf_accuracy.png')

    # With a quadratic basis function.
    train_designmtx = quadratic_feature_mapping(train_inputs)
    test_designmtx = quadratic_feature_mapping(test_inputs)
    # train_designmtx = np.delete(train_designmtx, np.where(~train_designmtx.any(axis=0))[0], axis=1)
    # test_designmtx = np.delete(test_designmtx, np.where(~test_designmtx.any(axis=0))[0], axis=1)
    print("WITH QUADRATIC BASIS FUNCTIONS")
    fig2, ax2 = fit_and_plot_accuracy_generative(train_designmtx,
                                                 train_targets,
                                                 test_designmtx, test_targets,
                                                 fig_ax=None, colour='r',
                                                 type='training')
    fit_and_plot_accuracy_generative(train_designmtx, train_targets,
                                     test_designmtx, test_targets,
                                     fig_ax=(fig2, ax2), colour='b',
                                     type='testing')
    ax2.legend(["Training", "Testing"])
    fig2.savefig('generative_quadratic_accuracy.png')
    # fig, ax0, ax1 = fit_and_plot_accuracy_logistic(train_designmtx, train_targets, test_designmtx, test_targets, fig_ax=None, colour='r', type='training')
    # fit_and_plot_accuracy_logistic(train_designmtx, train_targets, test_designmtx, test_targets, fig_ax=(fig, ax0, ax1), colour='b', type='testing')
    plt.show()