示例#1
0
def logistic_fit_and_analysis_no_folds(inputs, targets, lambda_val, threshold):
    """
    Fit logistic regression on one random hold-out split and score it.

    A single train/validation split is drawn with ``test_fraction=0.20``;
    the model is fitted on the training part and evaluated on the
    validation part.

    parameters
    ----------
    inputs -- data matrix; ``inputs.shape[0]`` is used as the sample count
    targets -- class labels, one per row of ``inputs``
    lambda_val -- third positional argument of
        ``robust_logistic_regression_fit`` (presumably the regularisation
        strength -- confirm against that function)
    threshold -- forwarded to ``robust_logistic_regression_fit`` as the
        ``threshold`` keyword

    returns
    -------
    accuracy -- fraction of validation samples whose predicted class equals
        the target (0.0 when the validation part is empty)
    entropy -- result of ``cross_entropy_error`` on the validation part
    """
    num_samples = inputs.shape[0]
    training_filter, validation_filter = train_and_test_filter(
        num_samples, test_fraction=0.20)
    (training_data, training_targets, validation_data,
     validation_targets) = train_and_test_partition(
         inputs, targets, training_filter, validation_filter)

    log_reg_weights = robust_logistic_regression_fit(
        training_data, training_targets, lambda_val, threshold=threshold)
    predicted = logistic_regression_predict(validation_data, log_reg_weights)
    predict_probs = logistic_regression_prediction_probs(
        validation_data, log_reg_weights)

    num_validation = len(validation_targets)
    num_correct = 0
    for i, (guess, truth) in enumerate(zip(predicted, validation_targets)):
        if guess == truth:
            num_correct += 1
        # Keep every probability strictly inside (0, 1) so that
        # cross_entropy_error never takes the log of 0.  The original code
        # only nudged an exact 1; an exact 0 is nudged too, on the assumption
        # that the error uses log(p) as well as log(1 - p) -- TODO confirm.
        if predict_probs[i] == 1:
            predict_probs[i] -= 0.0001
        elif predict_probs[i] == 0:
            predict_probs[i] += 0.0001

    # One division of an exact count avoids accumulating rounding error.
    accuracy = num_correct / num_validation if num_validation else 0.0
    entropy = cross_entropy_error(validation_targets, predict_probs)
    return accuracy, entropy
def Finding_most_optimum_k(inputs, targets, num_knn, cv_folds, num_folds):
    """
    Choose the k-NN neighbour count with the best cross-validated accuracy.

    Every k in ``1 .. num_knn - 1`` is fitted and scored on each fold; the
    k whose validation accuracy, averaged over all folds, is highest wins
    (the smallest such k on ties).

    parameters
    ----------
    inputs -- data matrix handed to ``train_and_test_partition``
    targets -- class labels handed to ``train_and_test_partition``
    num_knn -- one more than the largest k to try
    cv_folds -- indexable by fold; ``cv_folds[f][0]`` is the training filter
        and ``cv_folds[f][1]`` the validation filter of fold ``f``
    num_folds -- number of folds to use from ``cv_folds``

    returns
    -------
    optimumKNN -- the selected k (an int, 1-based)
    neighbors -- array of the candidate k values
    mean_validation_accuracy -- mean validation accuracy; the value for k is
        stored at index ``k - 1``
    mean_train_accuracy -- mean training accuracy, same indexing

    raises
    ------
    ValueError -- if ``num_folds < 1`` or ``num_knn < 2``
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")
    if num_knn < 2:
        raise ValueError("num_knn must be at least 2 so that k = 1 is tried")

    # The candidate list is the same for every fold, so build it once.
    neighbors = np.arange(1, num_knn)
    mean_validation_accuracy = np.zeros(len(neighbors))
    mean_train_accuracy = np.zeros(len(neighbors))

    for f in range(num_folds):
        fold_filters = cv_folds[f]
        (training_data, training_targets, validation_data,
         validation_targets) = train_and_test_partition(
             inputs, targets, fold_filters[0], fold_filters[1])

        for i, k in enumerate(neighbors):
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(training_data, training_targets)
            # Dividing by num_folds while accumulating yields the mean
            # once the last fold has been processed.
            mean_train_accuracy[i] += knn.score(
                training_data, training_targets) / num_folds
            mean_validation_accuracy[i] += knn.score(
                validation_data, validation_targets) / num_folds

    # Select on the mean across folds.  The previous code inspected the
    # per-fold scratch array, i.e. only the scores of the final fold.
    optimumKNN = int(neighbors[np.argmax(mean_validation_accuracy)])
    return optimumKNN, neighbors, mean_validation_accuracy, mean_train_accuracy
示例#3
0
def logistic_fit_and_analysis(inputs, targets, cv_folds, num_folds, lambda_val,
                              threshold):
    """
    Cross-validate logistic regression and average the fold scores.

    For each fold the model is fitted on the training part and evaluated on
    the validation part; accuracy and cross-entropy are then averaged over
    the folds.

    parameters
    ----------
    inputs -- data matrix handed to ``train_and_test_partition``
    targets -- class labels handed to ``train_and_test_partition``
    cv_folds -- indexable by fold; ``cv_folds[f][0]`` is the training filter
        and ``cv_folds[f][1]`` the validation filter of fold ``f``
    num_folds -- number of folds to use from ``cv_folds``
    lambda_val -- third positional argument of
        ``robust_logistic_regression_fit`` (presumably the regularisation
        strength -- confirm against that function)
    threshold -- forwarded to ``robust_logistic_regression_fit`` as the
        ``threshold`` keyword

    returns
    -------
    accuracy -- mean over folds of the fraction of correct predictions
    entropy -- mean over folds of ``cross_entropy_error``
    """
    accuracy_array = np.zeros(num_folds)
    entropy_array = np.zeros(num_folds)

    for f in range(num_folds):
        fold_filters = cv_folds[f]
        (training_data, training_targets, validation_data,
         validation_targets) = train_and_test_partition(
             inputs, targets, fold_filters[0], fold_filters[1])

        log_reg_weights = robust_logistic_regression_fit(
            training_data, training_targets, lambda_val, threshold=threshold)
        predicted = logistic_regression_predict(validation_data,
                                                log_reg_weights)
        predict_probs = logistic_regression_prediction_probs(
            validation_data, log_reg_weights)

        num_validation = len(validation_targets)
        num_correct = 0
        for i, (guess, truth) in enumerate(zip(predicted,
                                               validation_targets)):
            if guess == truth:
                num_correct += 1
            # Keep every probability strictly inside (0, 1) so that
            # cross_entropy_error never takes the log of 0.  The original
            # code only nudged an exact 1; an exact 0 is nudged too, on the
            # assumption that the error uses log(p) as well as log(1 - p)
            # -- TODO confirm.
            if predict_probs[i] == 1:
                predict_probs[i] -= 0.0001
            elif predict_probs[i] == 0:
                predict_probs[i] += 0.0001

        # An empty validation part leaves the fold accuracy at 0.0.
        if num_validation:
            accuracy_array[f] = num_correct / num_validation
        entropy_array[f] = cross_entropy_error(validation_targets,
                                               predict_probs)

    return accuracy_array.mean(), entropy_array.mean()
示例#4
0
def _fraction_correct_at_boundary(projected, labels, boundary):
    """Return the share of samples for which ``projected > boundary`` (as 0/1) equals the label."""
    labels = np.asarray(labels)
    if len(labels) == 0:
        return 0.0
    predicted = (np.asarray(projected) > boundary).astype(int)
    return np.count_nonzero(predicted == labels) / len(labels)


def cross_validation_decision_boundary_fishers(inputs,
                                               targets,
                                               cv_folds,
                                               num_folds,
                                               decision_boundaries,
                                               num_decision_boundaries,
                                               robust=0):
    """
    Cross-validate the decision boundary of a Fisher linear discriminant.

    For every fold the Fisher projection is fitted on the training part,
    both parts are projected with ``project_data``, and each candidate
    boundary is scored by classifying a sample as 1 when its projection is
    greater than the boundary and 0 otherwise.

    parameters
    ----------
    inputs -- data matrix handed to ``train_and_test_partition``
    targets -- class labels handed to ``train_and_test_partition``; they are
        compared against 0/1 predictions
    cv_folds -- indexable by fold; ``cv_folds[f][0]`` is the training filter
        and ``cv_folds[f][1]`` the validation filter of fold ``f``
    num_folds -- number of folds to use from ``cv_folds``
    decision_boundaries -- candidate boundary values
    num_decision_boundaries -- length of the returned accuracy arrays;
        expected to equal ``len(decision_boundaries)``
    robust -- 0 uses ``fisher_linear_discriminant_projection``, 1 uses
        ``robust_fisher_linear_discriminant_projection`` with ``1e-6``

    returns
    -------
    train_accuracy_array -- mean training accuracy per boundary
    test_accuracy_array -- mean validation accuracy per boundary
    best boundary -- the entry of ``decision_boundaries`` with the highest
        mean validation accuracy (the first one on ties)

    raises
    ------
    ValueError -- if ``robust`` is neither 0 nor 1
    """
    # Reject an unsupported flag up front; previously it surfaced later as
    # an unbound ``fisher_weights`` variable.
    if robust not in (0, 1):
        raise ValueError(
            "robust must be 0 (plain Fisher) or 1 (robust Fisher), "
            "got {!r}".format(robust))

    train_accuracy_array = np.zeros(num_decision_boundaries)
    test_accuracy_array = np.zeros(num_decision_boundaries)

    for f in range(num_folds):
        fold_filters = cv_folds[f]
        (training_data, training_targets, validation_data,
         validation_targets) = train_and_test_partition(
             inputs, targets, fold_filters[0], fold_filters[1])

        if robust == 0:
            fisher_weights = fisher_linear_discriminant_projection(
                training_data, training_targets)
        else:
            fisher_weights = robust_fisher_linear_discriminant_projection(
                training_data, training_targets, 1e-6)

        # Accuracy does not depend on sample order, so the projections are
        # scored as they come; no sorting is needed.
        projected_train = project_data(training_data, fisher_weights)
        projected_test = project_data(validation_data, fisher_weights)

        for j, boundary in enumerate(decision_boundaries):
            train_accuracy_array[j] += _fraction_correct_at_boundary(
                projected_train, training_targets, boundary) / num_folds
            test_accuracy_array[j] += _fraction_correct_at_boundary(
                projected_test, validation_targets, boundary) / num_folds

    best_index = int(np.argmax(test_accuracy_array))
    return (train_accuracy_array, test_accuracy_array,
            decision_boundaries[best_index])
def main(dataset):
    """
    Run the k-nearest-neighbours experiment on ``dataset`` and plot results.

    The same pipeline is run on the raw inputs and on their quadratic
    feature mapping: cross-validate k with ``Finding_most_optimum_k``, refit
    with the chosen k through ``fitting_best_k`` on a train/test split
    (``test_fraction=0.20``), compute ROC values and AUC on the test part,
    and report a confusion matrix with precision, sensitivity, specificity
    and F1 score.  All figures are displayed at the end by ``plt.show()``.

    parameters
    ----------
    dataset -- passed unchanged to ``process_data``, which returns the
        inputs, the targets and a label (the label is not used here)

    raises
    ------
    ValueError -- if targets and predictions together do not contain exactly
        two distinct class labels
    """
    # Function-scope import: only the confusion-matrix helper needs numpy.
    import numpy as np

    def confusion_counts(predicted, actual):
        """Return a 2x2 count table, rows = actual class, columns = predicted class."""
        predicted = np.ravel(predicted)
        actual = np.ravel(actual)
        # Labels come from both arrays, so a class that is never predicted
        # still gets an (all-zero) column and the table stays 2x2.
        labels = np.unique(np.concatenate((actual, predicted)))
        if len(labels) != 2:
            raise ValueError(
                "expected exactly two class labels, found {}".format(
                    len(labels)))
        return np.array([[
            np.count_nonzero((actual == truth) & (predicted == guess))
            for guess in labels
        ] for truth in labels])

    def plot_confusion(counts, title):
        """Draw ``counts`` as an annotated heatmap on a new figure."""
        frame = pd.DataFrame(counts,
                             index=['Negative', 'Positive'],
                             columns=['Negative', 'Positive'])
        figure, axes = plt.subplots(figsize=(5, 5))
        figure.suptitle(title)
        sns.heatmap(frame,
                    annot=True,
                    linewidths=0.3,
                    linecolor="White",
                    cbar=False,
                    fmt=".0f",
                    ax=axes,
                    cmap="Blues")
        axes.set_xlabel("Predicted class")
        axes.set_ylabel("Actual class")

    def report_metrics(counts):
        """Print and return precision, sensitivity, specificity and F1."""
        # Rows are the actual class, columns the predicted class.
        TN = counts[0, 0]
        FN = counts[1, 0]
        TP = counts[1, 1]
        FP = counts[0, 1]
        precision = TP / (TP + FP)
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
        f1_score = 2 * ((precision * sensitivity) / (precision + sensitivity))
        print('Precision   = ', precision)
        print('Sensitivity = ', sensitivity)
        print('Specificity = ', specificity)
        print('F1 Score    = ', f1_score)
        return precision, sensitivity, specificity, f1_score

    inputs, targets, label = process_data(dataset)
    N = inputs.shape[0]  # Total number of datasets
    num_knn = 30  # number of nearest neighbours + 1
    num_folds = 5  # number of folds

    #Partitioning train and test data
    train_filter, test_filter = train_and_test_filter(N, test_fraction=0.20)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)
    # NOTE(review): the folds cover all N samples, so the held-out test
    # samples also take part in choosing k -- confirm this is intended.
    cv_folds = create_cv_folds(N, num_folds)

    #Cross validations, obtaining best parameter for KNN
    optimumKNN, neighbors, mean_validation_accuracy, train_accuracy = Finding_most_optimum_k(
        inputs, targets, num_knn, cv_folds, num_folds)
    # The score for k is stored at index k - 1; indexing with k itself
    # reported the wrong score and overran the array for the largest k.
    best_validation_accuracy = mean_validation_accuracy[optimumKNN - 1]
    print(
        "The best mean validation score = {} with number of nearest neighbours = {}"
        .format(best_validation_accuracy, optimumKNN))

    #Model fitting with the optimum number of neighbors
    knn, fitting_accuracy, prediction_accuracy = fitting_best_k(
        optimumKNN, train_inputs, train_targets, test_inputs, test_targets)

    #Preparation for ROC curve
    fpr, tpr, AUC = ROC_values_and_AUC(test_inputs, test_targets, 100, knn)

    #QUADRATIC EXPANSION
    quadratic_inputs = quadratic_feature_mapping(inputs)
    quadratic_train_inputs = quadratic_feature_mapping(train_inputs)
    quadratic_test_inputs = quadratic_feature_mapping(test_inputs)
    quadratic_optimumKNN, neighbors, quadratic_mean_validation_accuracy, quadratic_train_accuracy = Finding_most_optimum_k(
        quadratic_inputs, targets, num_knn, cv_folds, num_folds)
    quadratic_best_validation_accuracy = quadratic_mean_validation_accuracy[
        quadratic_optimumKNN - 1]
    print(
        "The best mean validation score = {} with number of nearest neighbours = {}"
        .format(quadratic_best_validation_accuracy, quadratic_optimumKNN))

    #Model fitting with the optimum number of neighbors
    quadratic_knn, quadratic_fitting_accuracy, quadratic_prediction_accuracy = fitting_best_k(
        quadratic_optimumKNN, quadratic_train_inputs, train_targets,
        quadratic_test_inputs, test_targets)

    #Preparation for ROC curve
    quad_fpr, quad_tpr, quad_AUC = ROC_values_and_AUC(quadratic_test_inputs,
                                                      test_targets, 100,
                                                      quadratic_knn)

    #Plotting testing accuracy and training accuracy against no. of neighbors to see if the model overfit
    fig = plt.figure()
    fig.suptitle("Accuracy for different K")
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.plot(neighbors, mean_validation_accuracy, label='Testing Accuracy')
    ax1.plot(neighbors, train_accuracy, label='Training Accuracy')
    ax1.set_xlabel('Number of Neighbors')
    ax1.set_ylabel('Accuracy')
    ax1.plot(neighbors,
             quadratic_mean_validation_accuracy,
             label='Quadratic Testing Accuracy')
    ax1.plot(neighbors,
             quadratic_train_accuracy,
             label='Quadratic Training Accuracy')
    ax1.legend()

    #Plotting ROC curve
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.plot(fpr, tpr, '-', color="b", label='Normal AUC = %0.4f' % AUC)
    ax2.plot(quad_fpr,
             quad_tpr,
             '-',
             color="r",
             label='Quadratic AUC = %0.4f' % quad_AUC)
    ax2.legend(loc='lower right')
    ax2.plot([0, 1], [0, 1], linestyle='--')
    ax2.set_xlabel("False Positive Rate")
    ax2.set_ylabel("True Positive Rate")
    ax2.set_aspect('equal', 'box')
    ax2.set_xlim([-0.01, 1.01])
    ax2.set_ylim([-0.01, 1.01])
    ax2.set_xticks([0, 0.5, 1])
    ax2.set_yticks([0, 0.5, 1])
    plt.tight_layout()

    #Confusion matrix and performance of the model on the raw inputs
    y_actual = test_targets
    y_pred = knn.predict(test_inputs)
    confusion_matrix = confusion_counts(y_pred, y_actual)
    plot_confusion(confusion_matrix, "Normal Confusion Matrix")
    Precision, Sensitivity, Specificity, F1Score = report_metrics(
        confusion_matrix)

    #Confusion matrix and performance of the model on the quadratic inputs
    y_pred_quadratic = quadratic_knn.predict(quadratic_test_inputs)
    quadratic_confusion_matrix = confusion_counts(y_pred_quadratic, y_actual)
    plot_confusion(quadratic_confusion_matrix, "Quadratic Confusion Matrix")

    print("made it this far")
    print("made the confusion matrix")
    (Precision_quadratic, Sensitivity_quadratic, Specificity_quadratic,
     F1Score_quadratic) = report_metrics(quadratic_confusion_matrix)

    # Creating a figure with all the numbers
    fig4 = plt.figure()
    ax4 = fig4.add_subplot(2, 2, 1)
    ax4.text(0, 1.0, 'Results', fontsize=12, fontweight='bold')
    ax4.text(
        0, 0.7,
        'Accuracy with no basis functions = {} with best k = {}'.format(
            round(best_validation_accuracy, 4), optimumKNN))
    ax4.text(
        0, 0.6,
        'Accuracy with with quadratic basis functions = {} with best k = {}'.
        format(round(quadratic_best_validation_accuracy, 4),
               quadratic_optimumKNN))
    ax4.text(
        0, 0.4,
        'Area Under Curves with no basis functions = {}'.format(round(AUC, 4)))
    ax4.text(
        0, 0.3, 'Area Under Curves with quadratic basis functions = {}'.format(
            round(quad_AUC, 4)))
    ax4.text(0, 0.1, 'Precision   = {}'.format(round(Precision, 4)))
    ax4.text(0, 0, 'Sensitivity   = {}'.format(round(Sensitivity, 4)))
    ax4.text(0, -0.1, 'Specificity   = {}'.format(round(Specificity, 4)))
    ax4.text(0, -0.2, 'F1 Score   = {}'.format(round(F1Score, 4)))
    ax4.text(
        0, -0.3,
        'Quadratic Precision   = {}'.format(round(Precision_quadratic, 4)))
    ax4.text(
        0, -0.4,
        'Quadratic Sensitivity   = {}'.format(round(Sensitivity_quadratic, 4)))
    ax4.text(
        0, -0.5,
        'Quadratic Specificity   = {}'.format(round(Specificity_quadratic, 4)))
    ax4.text(0, -0.6,
             'Quadratic F1 Score   = {}'.format(round(F1Score_quadratic, 4)))
    ax4.axis('off')

    #Showing all plots
    plt.show()
示例#6
0
def main(dataset):
    """
    Run the Fisher linear discriminant experiment on ``dataset`` and plot.

    The same pipeline is run on the raw inputs (plain Fisher projection) and
    on their quadratic feature mapping (robust Fisher projection with
    ``1e-6``): cross-validate the decision boundary, fit on the training part
    of a train/test split (``test_fraction=0.3``), predict the test part,
    compute ROC values and AUC, and report a confusion matrix with
    precision, sensitivity, specificity and F1 score.  All figures are
    displayed at the end by ``plt.show()``.

    parameters
    ----------
    dataset -- passed unchanged to ``process_data``; when its lower-cased
        form contains ``'titanic'`` the Titanic boundary grid is used,
        otherwise the Abalone grid

    raises
    ------
    ValueError -- if targets and predictions together do not contain exactly
        two distinct class labels
    """

    def confusion_counts(predicted, actual):
        """Return a 2x2 count table, rows = actual class, columns = predicted class."""
        predicted = np.ravel(predicted)
        actual = np.ravel(actual)
        # Labels come from both arrays, so a class that is never predicted
        # still gets an (all-zero) column and the table stays 2x2.  A
        # mis-shaped table is presumably what the old "Pandas is acting
        # weird" fallback was hiding -- not verified.
        labels = np.unique(np.concatenate((actual, predicted)))
        if len(labels) != 2:
            raise ValueError(
                "expected exactly two class labels, found {}".format(
                    len(labels)))
        return np.array([[
            np.count_nonzero((actual == truth) & (predicted == guess))
            for guess in labels
        ] for truth in labels])

    def plot_confusion(counts, title):
        """Draw ``counts`` as an annotated heatmap on a new figure."""
        frame = pd.DataFrame(counts,
                             index=['Negative', 'Positive'],
                             columns=['Negative', 'Positive'])
        figure, axes = plt.subplots(figsize=(5, 5))
        figure.suptitle(title)
        sns.heatmap(frame,
                    annot=True,
                    linewidths=0.3,
                    linecolor="White",
                    cbar=False,
                    fmt=".0f",
                    ax=axes,
                    cmap="Blues")
        axes.set_xlabel("Predicted class")
        axes.set_ylabel("Actual class")

    def report_metrics(counts):
        """Print and return precision, sensitivity, specificity and F1."""
        # Rows are the actual class, columns the predicted class.
        TN = counts[0, 0]
        FN = counts[1, 0]
        TP = counts[1, 1]
        FP = counts[0, 1]
        precision = TP / (TP + FP)
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
        f1_score = 2 * ((precision * sensitivity) / (precision + sensitivity))
        print('Precision   = ', precision)
        print('Sensitivity = ', sensitivity)
        print('Specificity = ', specificity)
        print('F1 Score    = ', f1_score)
        return precision, sensitivity, specificity, f1_score

    num_folds = 5  # number of folds
    inputs, targets, label = process_data(dataset)
    N = inputs.shape[0]  # total number of datasets
    if 'titanic' in dataset.lower():
        name = 'Titanic'
        num_decision_boundaries_normal = 50
        decision_boundaries_normal = np.linspace(
            -1, 1, num_decision_boundaries_normal)
    else:
        name = 'Abalone'
        num_decision_boundaries_normal = 30
        decision_boundaries_normal = np.linspace(
            0.1, 0.3, num_decision_boundaries_normal)

    #Partitioning train and test data
    train_filter, test_filter = train_and_test_filter(N, test_fraction=0.3)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)
    cv_folds = create_cv_folds(N, num_folds)

    #Cross validation
    train_accuracy_array, test_accuracy_array, decision_boundary = cross_validation_decision_boundary_fishers(
        inputs, targets, cv_folds, num_folds, decision_boundaries_normal,
        num_decision_boundaries_normal, 0)
    # Weights and predictions are computed once here and reused for the
    # confusion matrix; the original repeated the same calls further down.
    weights = fisher_linear_discriminant_projection(train_inputs,
                                                    train_targets)
    predicted = predict(test_inputs, weights, decision_boundary)

    #Cross Validation for quadratic
    pol2_inputs = quadratic_feature_mapping(inputs)
    pol2_train_inputs = quadratic_feature_mapping(train_inputs)
    pol2_test_inputs = quadratic_feature_mapping(test_inputs)
    num_decision_boundaries_robust = 30
    decision_boundaries_robust = np.linspace(-0.1, 0.1,
                                             num_decision_boundaries_robust)

    quadratic_train_accuracy_array, quadratic_test_accuracy_array, quadratic_decision_boundary = cross_validation_decision_boundary_fishers(
        pol2_inputs, targets, cv_folds, num_folds, decision_boundaries_robust,
        num_decision_boundaries_robust, 1)
    quadratic_weights = robust_fisher_linear_discriminant_projection(
        pol2_train_inputs, train_targets, 1e-6)
    quadratic_predicted = predict(pol2_test_inputs, quadratic_weights,
                                  quadratic_decision_boundary)

    #Preparation for AUC plot
    false_positive_rates, true_positive_rates, AUC = ROC_values_and_AUC(
        train_inputs, train_targets, test_inputs, test_targets, 0)

    pol2_false_positive_rates, pol2_true_positive_rates, pol2_AUC = ROC_values_and_AUC(
        pol2_train_inputs, train_targets, pol2_test_inputs, test_targets, 1)

    #Plotting testing accuracy and training accuracy on changing decision boundaries
    #Normal data
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(1, 1, 1)
    ax1.set_title('Fisher, changing decision boundaries, {}'.format(name))
    ax1.plot(-decision_boundaries_normal,
             test_accuracy_array,
             label='Testing Accuracy')
    ax1.plot(-decision_boundaries_normal,
             train_accuracy_array,
             label='Training Accuracy')
    ax1.legend()
    ax1.set_xlabel('Decision Boundary')
    ax1.set_ylabel('Accuracy')

    #Transformed data (Quadratic)
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.set_title(
        'Fisher, changing decision boundaries, Quadratic, {}'.format(name))
    ax2.plot(decision_boundaries_robust,
             quadratic_test_accuracy_array,
             label='Testing Accuracy')
    ax2.plot(decision_boundaries_robust,
             quadratic_train_accuracy_array,
             label='Training Accuracy')
    ax2.legend()
    ax2.set_xlabel('Decision Boundary')
    ax2.set_ylabel('Accuracy')

    #Plotting ROC curve
    fig3 = plt.figure(figsize=(6, 6))
    ax3 = fig3.add_subplot(1, 1, 1)
    ax3.plot(false_positive_rates,
             true_positive_rates,
             '-',
             color="b",
             label='AUC normal = %0.2f' % AUC)
    ax3.plot(pol2_false_positive_rates,
             pol2_true_positive_rates,
             '-',
             color="r",
             label='AUC quadratic = %0.2f' % pol2_AUC)
    ax3.legend(loc='lower right')
    ax3.set_xlabel("False Positive Rate")
    ax3.set_ylabel("True Positive Rate")
    ax3.set_aspect('equal', 'box')
    ax3.plot([0, 1], [0, 1], linestyle='--')
    ax3.set_xlim([-0.01, 1.01])
    ax3.set_ylim([-0.01, 1.01])
    ax3.set_xticks([0, 0.5, 1])
    ax3.set_yticks([0, 0.5, 1])
    plt.tight_layout()
    print("The AUC with no basis function = ", AUC)
    print("The AUC with quadratic expansion = ", pol2_AUC)

    #Confusion matrix and performance of the model on the raw inputs
    # No blanket try/except here: the old bare ``except`` followed by
    # ``exit(0)`` hid every failure and still reported success to the shell.
    y_actual = test_targets
    confusion_matrix = confusion_counts(predicted, y_actual)
    plot_confusion(confusion_matrix, "Normal Confusion Matrix")
    Precision, Sensitivity, Specificity, F1Score = report_metrics(
        confusion_matrix)

    #Confusion matrix and performance of the model on the quadratic inputs
    quadratic_confusion_matrix = confusion_counts(quadratic_predicted,
                                                  y_actual)
    plot_confusion(quadratic_confusion_matrix, "Quadratic Confusion Matrix")

    print("made it this far")
    print("made the confusion matrix")
    (Precision_quadratic, Sensitivity_quadratic, Specificity_quadratic,
     F1Score_quadratic) = report_metrics(quadratic_confusion_matrix)

    #Creating a figure with the results
    fig4 = plt.figure()
    ax4 = fig4.add_subplot(2, 2, 1)
    ax4.text(0, 0.7, 'Results', fontsize=12, fontweight='bold')
    ax4.text(
        0, 0.4,
        'Area Under Curves with no basis functions = {}'.format(round(AUC, 4)))
    ax4.text(
        0, 0.3, 'Area Under Curves with quadratic basis functions = {}'.format(
            round(pol2_AUC, 4)))
    ax4.text(0, 0.1, 'Precision   = {}'.format(round(Precision, 4)))
    ax4.text(0, 0, 'Sensitivity   = {}'.format(round(Sensitivity, 4)))
    ax4.text(0, -0.1, 'Specificity   = {}'.format(round(Specificity, 4)))
    ax4.text(0, -0.2, 'F1 Score   = {}'.format(round(F1Score, 4)))
    ax4.text(
        0, -0.3,
        'Quadratic Precision   = {}'.format(round(Precision_quadratic, 4)))
    ax4.text(
        0, -0.4,
        'Quadratic Sensitivity   = {}'.format(round(Sensitivity_quadratic, 4)))
    ax4.text(
        0, -0.5,
        'Quadratic Specificity   = {}'.format(round(Specificity_quadratic, 4)))
    ax4.text(0, -0.6,
             'Quadratic F1 Score   = {}'.format(round(F1Score_quadratic, 4)))
    ax4.axis('off')

    #Showing all plots
    plt.show()
示例#7
0
def main(ifname, input_cols=None, target_col=None, classes=None):
    """
    Compare classifiers by their ROC curves, with and without a quadratic
    basis, and save one figure per comparison.

    The data is loaded with ``import_for_classification`` and split into
    train and test parts (test fraction 0.2).  Four figures are produced and
    saved: ``train_no_bf_roc`` and ``test_no_bf_roc`` (logistic regression,
    shared covariance model and Fisher's linear discriminant on the raw
    inputs), then ``train_quadratic_roc`` and ``test_quadratic_roc`` (shared
    covariance model and Fisher's linear discriminant on the quadratic
    feature mapping).  The ``type`` keyword handed to the plotting helpers is
    ``'training'`` or ``'testing'`` -- presumably the split the curve is
    evaluated on; confirm against those helpers.

    parameters
    ----------
    ifname -- filename/path of data file.
    input_cols -- list of column names for the input data
    target_col -- column name of the target data
    classes -- list of the classes to plot

    """
    inputs, targets, field_names, classes = import_for_classification(
        ifname, input_cols=input_cols, target_col=target_col, classes=classes)

    num_samples = inputs.shape[0]
    held_out_fraction = 0.2
    train_filter, test_filter = train_and_test_filter(num_samples,
                                                      held_out_fraction)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)

    # without basis functions
    print("WITHOUT BASIS FUNCTIONS")
    raw_split = (train_inputs, train_targets, test_inputs, test_targets)
    for mode, out_name in (('training', 'train_no_bf_roc'),
                           ('testing', 'test_no_bf_roc')):
        # The logistic fit opens the figure; the other two draw onto it.
        figure, axes = fit_and_plot_roc_logistic(*raw_split,
                                                 fig_ax=None,
                                                 colour='r',
                                                 type=mode)
        fit_and_plot_roc_generative(*raw_split,
                                    fig_ax=(figure, axes),
                                    colour='b',
                                    type=mode)
        fit_and_plot_roc_fisher(*raw_split,
                                fig_ax=(figure, axes),
                                colour='y',
                                type=mode)
        axes.legend([
            "Logistic regression", "Shared covariance model",
            "Fisher's linear discriminant"
        ])
        figure.savefig(out_name)

    # with quadratic basis function
    print("WITH QUADRATIC BASIS FUNCTIONS")
    train_designmtx = quadratic_feature_mapping(train_inputs)
    test_designmtx = quadratic_feature_mapping(test_inputs)
    quadratic_split = (train_designmtx, train_targets, test_designmtx,
                       test_targets)
    for mode, out_name in (('training', 'train_quadratic_roc'),
                           ('testing', 'test_quadratic_roc')):
        # Here the generative fit opens the figure.
        figure, axes = fit_and_plot_roc_generative(*quadratic_split,
                                                   fig_ax=None,
                                                   colour='b',
                                                   type=mode)
        fit_and_plot_roc_fisher(*quadratic_split,
                                fig_ax=(figure, axes),
                                colour='y',
                                type=mode)
        axes.legend(
            ["Shared covariance model", "Fisher's linear discriminant"])
        figure.savefig(out_name)

    plt.show()
示例#8
0
def _confusion_counts(predicted, actual):
    """Return the confusion counts as a plain 2-D array.

    ``pd.crosstab(predicted, actual)`` puts predictions on the rows and true
    labels on the columns; the transpose flips that so that rows are the
    actual class and columns are the predicted class.

    NOTE(review): if either argument contains only one distinct class the
    result is not 2x2 and the indexing done by the callers will fail --
    confirm that both classes always occur in the test split.
    """
    # ``.values`` replaces ``DataFrame.as_matrix()``, which was removed in
    # pandas 1.0; ``.values`` is available on old and new pandas alike.
    return pd.crosstab(predicted, actual).T.values


def _plot_confusion_matrix(counts, title):
    """Draw ``counts`` (rows: actual, columns: predicted) as an annotated heatmap."""
    labelled = pd.DataFrame(counts,
                            index=['Negative', 'Positive'],
                            columns=['Negative', 'Positive'])
    fig, ax = plt.subplots(figsize=(5, 5))
    fig.suptitle(title)
    sns.heatmap(labelled,
                annot=True,
                linewidths=0.3,
                linecolor="White",
                cbar=False,
                fmt=".0f",
                ax=ax,
                cmap="Blues")
    ax.set_xlabel("Predicted class")
    ax.set_ylabel("Actual class")


def _classification_metrics(counts):
    """Compute (precision, sensitivity, specificity, F1) from confusion counts.

    ``counts`` is indexed exactly as the original code indexed it:
    ``[0, 0]`` true negatives, ``[1, 0]`` false negatives,
    ``[1, 1]`` true positives and ``[0, 1]`` false positives.
    Denominators are not guarded against zero.
    """
    tn = counts[0, 0]
    fn = counts[1, 0]
    tp = counts[1, 1]
    fp = counts[0, 1]

    precision = tp / (tp + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1_score = 2 * ((precision * sensitivity) / (precision + sensitivity))
    return precision, sensitivity, specificity, f1_score


def _report_confusion(predicted, actual, title):
    """Plot the confusion matrix, print the derived metrics and return them."""
    counts = _confusion_counts(predicted, actual)
    _plot_confusion_matrix(counts, title)
    metrics = _classification_metrics(counts)
    precision, sensitivity, specificity, f1_score = metrics
    print('Precision   = ', precision)
    print('Sensitivity = ', sensitivity)
    print('Specificity = ', specificity)
    print('F1 Score    = ', f1_score)
    return metrics


def main(dataset):
    """Compare logistic regression with and without a quadratic feature mapping.

    Steps performed, in order:

    1. Load the data with ``process_data`` and build the quadratic features.
    2. Estimate accuracy and cross-entropy error for both feature sets --
       on a single hold-out split when the data-set name contains
       "titanic" (case-insensitive), otherwise via
       ``logistic_fit_and_analysis`` with the 5 cross-validation folds.
    3. Refit both models on a fresh 80/20 split and plot their ROC curves.
    4. Plot a confusion matrix per model and print precision, sensitivity,
       specificity and F1 score.
    5. Collect all numbers in a text-only summary figure and show every plot.

    parameters
    ----------
    dataset -- identifier handed to ``process_data``; it must be a string
        because ``.lower()`` is called on it.
    """
    inputs, targets, _ = process_data(dataset)
    quadratic_inputs = quadratic_feature_mapping(inputs)
    N = inputs.shape[0]  # total number of data points
    num_folds = 5
    is_titanic = "titanic" in dataset.lower()

    # Built before the branch, exactly as before, so that anything
    # create_cv_folds does (e.g. drawing random numbers -- not visible from
    # here) still happens in the same order for every data-set.
    cv_folds = create_cv_folds(N, num_folds)

    if is_titanic:
        print(
            "No cross validation is done on the titanic data because its run time exceeds 45 minutes"
        )
        print("Please wait...")
        # The trailing arguments are lambda_val and threshold.
        normal_test_accuracy, normal_entropy = logistic_fit_and_analysis_no_folds(
            inputs, targets, 1e-5, 1e-3)
        quadratic_test_accuracy, quadratic_entropy = logistic_fit_and_analysis_no_folds(
            quadratic_inputs, targets, 1e-5, 1e-3)
    else:
        print("Cross validation is running")
        print("Please wait...")
        normal_test_accuracy, normal_entropy = logistic_fit_and_analysis(
            inputs, targets, cv_folds, num_folds, 1e-6, 1e-6)
        quadratic_test_accuracy, quadratic_entropy = logistic_fit_and_analysis(
            quadratic_inputs, targets, cv_folds, num_folds, 1e-6, 1e-6)
    print(
        "The accuracy = {} and cross entropy error = {} for no basis functions"
        .format(normal_test_accuracy, normal_entropy))
    print(
        "The accuracy = {} and cross entropy error = {} for a quadratic basis functions"
        .format(quadratic_test_accuracy, quadratic_entropy))

    # Fresh hold-out split used for the ROC curves and confusion matrices.
    train_filter, test_filter = train_and_test_filter(N, test_fraction=0.20)
    train_inputs, train_targets, test_inputs, test_targets = train_and_test_partition(
        inputs, targets, train_filter, test_filter)

    quadratic_train_inputs = quadratic_feature_mapping(train_inputs)
    quadratic_test_inputs = quadratic_feature_mapping(test_inputs)

    weights = robust_logistic_regression_fit(train_inputs, train_targets, 1e-5,
                                             1e-3)
    quadratic_weights = robust_logistic_regression_fit(quadratic_train_inputs,
                                                       train_targets, 1e-5,
                                                       1e-3)
    num_points = 500
    print("No Basis Function ROC")
    fpr, tpr, AUC = ROC_values_and_AUC(test_inputs, test_targets, weights,
                                       num_points)
    print("Quadratic Expansion Basis Function ROC")
    quad_fpr, quad_tpr, quad_AUC = ROC_values_and_AUC(quadratic_test_inputs,
                                                      test_targets,
                                                      quadratic_weights,
                                                      num_points)

    # ROC curves of both models on one set of axes.
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    ax2.plot(fpr, tpr, '-', color="b", label='Normal AUC = %0.4f' % AUC)
    ax2.plot(quad_fpr,
             quad_tpr,
             '-',
             color="r",
             label='Quadratic AUC = %0.4f' % quad_AUC)
    # The legend is created before the diagonal is drawn; the diagonal has
    # no label, so it is not listed.
    ax2.legend(loc='lower right')
    ax2.plot([0, 1], [0, 1], linestyle='--')
    ax2.set_xlabel("False Positive Rate")
    ax2.set_ylabel("True Positive Rate")
    ax2.set_aspect('equal', 'box')
    ax2.set_xlim([-0.01, 1.01])
    ax2.set_ylim([-0.01, 1.01])
    ax2.set_xticks([0, 0.5, 1])
    ax2.set_yticks([0, 0.5, 1])
    plt.tight_layout()

    # Confusion matrix + metrics, first without then with the quadratic basis.
    y_actual = test_targets
    y_pred = logistic_regression_predict(test_inputs, weights)
    Precision, Sensitivity, Specificity, F1Score = _report_confusion(
        y_pred, y_actual, "Normal Confusion Matrix")

    y_pred_quadratic = logistic_regression_predict(quadratic_test_inputs,
                                                   quadratic_weights)
    (Precision_quadratic, Sensitivity_quadratic, Specificity_quadratic,
     F1Score_quadratic) = _report_confusion(y_pred_quadratic, y_actual,
                                            "Quadratic Confusion Matrix")

    # Text-only figure that summarises every number computed above.
    fig4 = plt.figure()
    ax4 = fig4.add_subplot(2, 2, 1)
    ax4.text(0, 1.2, 'Results', fontsize=12, fontweight='bold')
    # Only the cross-validated path reports a mean over folds.
    accuracy_kind = 'Accuracy' if is_titanic else 'Mean accuracy'
    summary_lines = [
        (1.0, '{} with no basis functions = {}'.format(
            accuracy_kind, round(normal_test_accuracy, 4))),
        (0.9, '{} with quadratic basis functions = {}'.format(
            accuracy_kind, round(quadratic_test_accuracy, 4))),
        (0.7, 'Cross Entropy Error with no basis functions = {}'.format(
            round(normal_entropy, 4))),
        (0.6, 'Cross Entropy Error with quadratic basis functions = {}'.format(
            round(quadratic_entropy, 4))),
        (0.4, 'Area Under Curves with no basis functions = {}'.format(
            round(AUC, 4))),
        (0.3, 'Area Under Curves with quadratic basis functions = {}'.format(
            round(quad_AUC, 4))),
        (0.1, 'Precision   = {}'.format(round(Precision, 4))),
        (0, 'Sensitivity   = {}'.format(round(Sensitivity, 4))),
        (-0.1, 'Specificity   = {}'.format(round(Specificity, 4))),
        (-0.2, 'F1 Score   = {}'.format(round(F1Score, 4))),
        (-0.3, 'Quadratic Precision   = {}'.format(
            round(Precision_quadratic, 4))),
        (-0.4, 'Quadratic Sensitivity   = {}'.format(
            round(Sensitivity_quadratic, 4))),
        (-0.5, 'Quadratic Specificity   = {}'.format(
            round(Specificity_quadratic, 4))),
        (-0.6, 'Quadratic F1 Score   = {}'.format(
            round(F1Score_quadratic, 4))),
    ]
    for y_pos, text in summary_lines:
        ax4.text(0, y_pos, text)
    ax4.axis('off')

    # Showing all plots
    plt.show()
示例#9
0
def main(ifname, input_cols=None, target_col=None, classes=None):
    """
    Loads a classification data-set, holds out 20% of it for testing and
    saves training-vs-testing accuracy figures for three set-ups: logistic
    regression on the raw inputs, the generative model on the raw inputs,
    and the generative model on quadratically mapped inputs.

    parameters
    ----------
    ifname -- filename/path of data file.
    input_cols -- list of column names for the input data
    target_col -- column name of the target data
    classes -- list of the classes to plot

    """
    inputs, targets, field_names, classes = import_for_classification(
        ifname, input_cols=input_cols, target_col=target_col, classes=classes)

    def compare_train_and_test(plotter, split, out_name):
        # First call creates the figure (training, red); the second call
        # draws onto the same axes (testing, blue).
        fig, ax = plotter(*split, fig_ax=None, colour='r', type='training')
        plotter(*split, fig_ax=(fig, ax), colour='b', type='testing')
        ax.legend(["Training", "Testing"])
        fig.savefig(out_name)

    num_samples = inputs.shape[0]
    held_out_share = 0.2
    fit_mask, eval_mask = train_and_test_filter(num_samples, held_out_share)
    fit_x, fit_t, eval_x, eval_t = train_and_test_partition(
        inputs, targets, fit_mask, eval_mask)
    raw_split = (fit_x, fit_t, eval_x, eval_t)

    # without basis functions
    compare_train_and_test(fit_and_plot_accuracy_logistic, raw_split,
                           'logistic_no_bf_accuracy.png')
    compare_train_and_test(fit_and_plot_accuracy_generative, raw_split,
                           'generative_no_bf_accuracy.png')

    # with quadratic basis function
    quadratic_split = (quadratic_feature_mapping(fit_x), fit_t,
                       quadratic_feature_mapping(eval_x), eval_t)

    print("WITH QUADRATIC BASIS FUNCTIONS")
    compare_train_and_test(fit_and_plot_accuracy_generative, quadratic_split,
                           'generative_quadratic_accuracy.png')

    # Logistic regression on the quadratic design matrices is not run here;
    # the corresponding calls were commented out in the original.

    plt.show()