def compare_naive_bayes(training, testing, pca_training, pca_testing):
    util.print_line_break()
    print "Comparing Naive Bayes accuracy with and without PCA"

    print "Without PCA:"
    print NaiveBayes(training).classify_all(testing).compute_accuracy()

    print "With PCA"
    print NaiveBayes(pca_training).classify_all(pca_testing).compute_accuracy()
def compare_knn(training, testing, pca_training, pca_testing):
    util.print_line_break()
    print "Comparing KNN accuracy with and without PCA"

    print "Without PCA:"
    print Knn(training, k=3).classify_all(testing).compute_accuracy()

    print "With PCA"
    print Knn(pca_training, k=3).classify_all(pca_testing).compute_accuracy()
def pca_find_important_features(dataset):
    # The weight matrix that is used to transform the original data 
    # to the reduced data can be used to see which features are most 
    # important.  The features with the largest magnitude weight have 
    # the largest impact on the reduced data.
    
    util.print_line_break()
    print "First principal component impacts (absolute value of weight):"
    print pca(dataset, 2).get_first_component_impacts()
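
# The comment in pca_find_important_features describes reading feature
# importance off the PCA weight matrix. The sketch below is a hedged
# illustration of that idea in plain NumPy; the function name and the
# NaN-free (n_samples, n_features) input are assumptions, not part of the
# project's pca() / get_first_component_impacts() API.
import numpy as np

def first_component_impacts_sketch(data):
    # np.cov centers the data itself, so no manual centering is needed.
    cov = np.cov(data, rowvar=False)
    # eigh returns eigenvalues in ascending order, so the last eigenvector
    # belongs to the largest eigenvalue: the first principal component.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    first_component = eigenvectors[:, -1]
    # The magnitude of each feature's weight is its impact on the reduced data.
    return np.abs(first_component)
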
def decision_tree_accuracy_tests(training, testing):
    util.print_line_break()
    print "Decision tree accuracy test:"
    
    # bin grades of 0-3 as low, 4-6 as mid, 7-9 as high
    training.bin("*", [4, 7], bin_names=["low", "mid", "high"])
    testing.bin("*", [4, 7], bin_names=["low", "mid", "high"])
    
    accuracy = DecisionTree(training).classify_all(testing).compute_accuracy()    
    print "%2.5f %%" % (100 * accuracy)
def knn_accuracy_tests(training, testing):
    util.print_line_break()
    print "KNN accuracy test:"
    print "k\tAccuracy"
    print "-\t--------"
    
    # Test the accuracy for k values of 3, 4, 5, 6, 7, 8, 9
    for k in range(3, 10):
        accuracy = Knn(training, k=k).classify_all(testing).compute_accuracy()
        print "%d\t%2.5f %%" % (k, 100 * accuracy)
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    cluster_3_groups(data.copy())
    cluster_pass_fail(data.copy())
    cluster_success_struggle(data.copy())
    
    util.print_line_break()
    
    print "Now with PCA:"
    cluster_3_groups_with_pca(data.copy())
    cluster_pass_fail_with_pca(data.copy())
    cluster_success_struggle_with_pca(data.copy())
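
# fill_missing_with_feature_means replaces each missing grade with the mean
# of that course across students. The sketch below shows the same idea with
# NumPy, assuming missing grades are stored as NaN (the real Dataset class
# may represent them differently).
import numpy as np

def fill_missing_with_feature_means_sketch(data):
    # data: (n_students, n_courses) float array with NaN for missing grades.
    course_means = np.nanmean(data, axis=0)
    missing_rows, missing_cols = np.where(np.isnan(data))
    data[missing_rows, missing_cols] = course_means[missing_cols]
    return data
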
def main():
    # The original data set.
    data = util.load_data()
    
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    
    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")
    
    util.print_line_break()
    print "Without PCA: %.5f" % get_knn_accuracy(data)
    
    util.print_line_break()
    print "With PCA:"
    print "\t".join(["PCs", "Accuracy"])
    for num_components in range(1, data.num_features()):
        accuracy = get_knn_accuracy(pca(data, num_components))
        print "%d\t%.5f" % (num_components, accuracy)
def naive_bayes_accuracy_tests(training, testing):
    util.print_line_break()
    print "Naive Bayes accuracy test:"
    
    accuracy = NaiveBayes(training).classify_all(testing).compute_accuracy()
    print "%2.5f %%" % (100 * accuracy)