def spam():
    train, target = load_spambase()

    normalize_columns = [55, 56]
    normalize(train, normalize_columns)
    train = append_new_column(train, 1.0, 0)

    # 10 fold cross validation
    train_size = len(train)
    k = 10
    test_index_generator = cross_validation.k_fold_cross_validation(train_size, k)
    fold = 0
    train_accuracy = 0
    test_accuracy = 0
    train_mse = 0
    test_mse = 0

    for start, end in test_index_generator:
        train_left = train[range(0, start)]
        train_right = train[range(end, train_size)]
        k_fold_train = np.vstack((train_left, train_right))
        test = train[range(start, end)]

        target_left = target[range(0, start)]
        target_right = target[range(end, train_size)]
        train_target = np.append(target_left, target_right)
        test_target = target[range(start, end)]

        cf = LinearRegression()
        cf = cf.fit(k_fold_train, train_target)

        print '=============Train Data Result============'
        predict_train = cf.predict(k_fold_train)
        cm = confusion_matrix(train_target, predict_train)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
        train_accuracy += acc
        print "mse: ", mse(predict_train, train_target), " rmse: ", rmse(predict_train, train_target), " mae: ", mae(
            predict_train,
            train_target)
        train_mse += mse(predict_train, train_target)

        print '=============Test Data Result============'
        predict_test = cf.predict(test)
        cm = confusion_matrix(test_target, predict_test)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
        test_accuracy += acc
        fold += 1
        print "mse: ", mse(predict_test, test_target), " rmse: ", rmse(predict_test, test_target), " mae: ", mae(
            predict_test,
            test_target)
        test_mse += mse(predict_test, test_target)

    print "Average train acc: %f, average test acc: %f" % (train_accuracy / fold, test_accuracy / fold)
    print "Average train mse: %f, average test mse: %f" % (train_mse / fold, test_mse / fold)
def gaussian_naive_bayes1():
    train, target = load_spambase()
    train, test, train_target, test_target = train_test_shuffle_split(train, target, len(train) / 10)
    cf = GaussianNaiveBayes()
    predicts = cf.fit(train, train_target).predict_class(test)
    cm = confusion_matrix(test_target, predicts)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
# Example #3
def spam():
    train, target = load_spambase()

    normalize_columns = [55, 56]
    normalize(train, normalize_columns)
    train = append_new_column(train, 1.0, 0)

    # 10 fold cross validation
    train_size = len(train)
    k = 10
    test_index_generator = cross_validation.k_fold_cross_validation(
        train_size, k)
    fold = 0
    train_accuracy = 0
    test_accuracy = 0
    train_mse = 0
    test_mse = 0

    for start, end in test_index_generator:
        train_left = train[range(0, start)]
        train_right = train[range(end, train_size)]
        k_fold_train = np.vstack((train_left, train_right))
        test = train[range(start, end)]

        target_left = target[range(0, start)]
        target_right = target[range(end, train_size)]
        train_target = np.append(target_left, target_right)
        test_target = target[range(start, end)]

        cf = LinearRegression()
        cf = cf.fit(k_fold_train, train_target)

        print '=============Train Data Result============'
        predict_train = cf.predict(k_fold_train)
        cm = confusion_matrix(train_target, predict_train)

        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        train_accuracy += acc
        print "mse: ", mse(predict_train, train_target), " rmse: ", rmse(
            predict_train,
            train_target), " mae: ", mae(predict_train, train_target)
        train_mse += mse(predict_train, train_target)

        print '=============Test Data Result============'
        predict_test = cf.predict(test)
        cm = confusion_matrix(test_target, predict_test)

        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        test_accuracy += acc
        fold += 1
        print "mse: ", mse(predict_test, test_target), " rmse: ", rmse(
            predict_test, test_target), " mae: ", mae(predict_test,
                                                      test_target)
        test_mse += mse(predict_test, test_target)

    print "Average train mse: %f, average test mse: %f" % (
        1.0 * train_mse / fold, 1.0 * test_mse / fold)
def decision_tree_all_data():
    train, target = load_spambase()
    cf = tree.DecisionTree()
    cf = cf.fit(train, target, 5)
    print_tree(cf.root)
    predicts = cf.predict(train)
    cm = confusion_matrix(target,predicts)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
def spambase(T=100):
    train, target = load_spambase()
    target = np.array(map(lambda v: -1.0 if v == 0 else 1.0, target))

    train, test, train_target, test_target = train_test_shuffle_split(train, target, len(train) / 10)
    boost = AdaBoost()
    start = timeit.default_timer()
    boost.boost(train, train_target, test, test_target, T)
    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
def spambase(T=100):
    train, target = load_spambase()
    target = np.array(map(lambda v: -1.0 if v == 0 else 1.0, target))

    train, test, train_target, test_target = train_test_shuffle_split(
        train, target,
        len(train) / 10)
    boost = AdaBoost()
    start = timeit.default_timer()
    boost.boost(train, train_target, test, test_target, T)
    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
# Example #7
def decision_tree_all_data():
    train, target = load_spambase()
    cf = tree.DecisionTree()
    cf = cf.fit(train, target, 5)
    print_tree(cf.root)
    predicts = cf.predict(train)
    cm = confusion_matrix(target, predicts)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
        cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr,
                                                              tpr)
def gaussian_naive_bayes1():
    train, target = load_spambase()
    train, test, train_target, test_target = train_test_shuffle_split(
        train, target,
        len(train) / 10)
    cf = GaussianNaiveBayes()
    predicts = cf.fit(train, train_target).predict_class(test)
    cm = confusion_matrix(test_target, predicts)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
        cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr,
                                                              tpr)
# Example #9
def spam(step, loop, converge):
    train, target = load_spambase()

    train, test, train_target, test_target = cross_validation.train_test_shuffle_split(train, target, len(train) / 10)
    scaler = normalize(train)
    train = append_new_column(train, 1.0, 0)
    scaler.scale_test(test)
    test = append_new_column(test, 1.0, 0)

    print '\n============== Logistic Regression - Stochastic Gradient Descending==============='
    spam_logistic(train, test, train_target, test_target, step, loop, converge)

    print '\n============== Linear Regression - Stochastic Gradient Descending ==============='
    spam_linear(train, test, train_target, test_target, step, loop, converge)

    print '\n============== Linear Regression - Normal Equation==============='
    spam_normal_equation(train, test, train_target, test_target)

    print '\n============== Decision Tree ===================================='
    spam_decision_tree(train, test, train_target, test_target)
def naive_bayes(c, on_train=False):
    """10-fold cross-validated evaluation of a naive Bayes variant.

    c: selector forwarded to get_classifier() -- TODO confirm accepted
       values against get_classifier's definition.
    on_train: when True, also print per-fold metrics on each fold's
       training portion.

    Prints per-fold confusion matrices, error/accuracy/FPR/TPR and AUC,
    plots every fold's ROC curve, and finally prints averages over the
    k folds. The first fold's ROC points are appended to the
    module-level `roc_data` list.
    """
    train, target = load_spambase()
    train, target = shuffle(train, target)

    k = 10
    train_size = len(train)
    test_index_generator = k_fold_cross_validation(train_size, k)

    fold = 1

    # Running totals over folds for the test-set metrics.
    overall_acc = 0
    overall_error = 0
    overall_auc = 0

    # Running totals for the optional train-set metrics.
    overall_train_acc = 0
    overall_train_error = 0

    for start, end in test_index_generator:
        # Rows [start, end) form the held-out fold; the remainder trains.
        k_fold_train = np.vstack((train[range(0, start)], train[range(end, train_size)]))
        test = train[range(start, end)]

        train_target = np.append(target[range(0, start)], target[range(end, train_size)])
        test_target = target[range(start, end)]

        cf = get_classifier(c)
        # fit() appears to return the classifier (fluent API); predict()
        # yields raw scores that predict_class() thresholds -- confirm.
        raw_predicts = cf.fit(k_fold_train, train_target).predict(test)
        predicts = cf.predict_class(raw_predicts)

        if on_train:
            raw_train_predicts = cf.predict(k_fold_train)
            train_predicts = cf.predict_class(raw_train_predicts)
            print '=============Fold %s Train==================' % fold
            cm = confusion_matrix(train_target, train_predicts)
            print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
            er, acc, fpr, tpr = confusion_matrix_analysis(cm)
            print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
            overall_train_acc += acc
            overall_train_error += er


        print '=============Fold %s Test==================' % fold
        cm = confusion_matrix(test_target, predicts)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)

        # ROC is built from the raw scores, not the thresholded classes.
        roc = ROC(test_target, raw_predicts, 0, "NaN")
        recs = roc.create_roc_records()
        roc.plot_roc_data(recs)
        auc = roc.auc()
        print 'AUC: %s' % auc

        # Keep only the first fold's ROC points for later plotting.
        if fold == 1:
            roc_data.append(roc.points)

        overall_acc += acc
        overall_error += er
        overall_auc += auc
        fold += 1

    print '--------------- Result-------------------'
    if on_train:
        print 'Overall Accuracy: %s, Overall Error: %s' % (overall_train_acc/k, overall_train_error/k)
    print 'Overall Accuracy: %s, Overall Error: %s, Overall AUC: %s\n' % (overall_acc/k, overall_error/k, overall_auc/k)
def naive_bayes(c, on_train=False):
    """10-fold cross-validated evaluation of a naive Bayes variant.

    c: selector forwarded to get_classifier() -- TODO confirm accepted
       values against get_classifier's definition.
    on_train: when True, also print per-fold metrics on each fold's
       training portion.

    Prints per-fold confusion matrices, error/accuracy/FPR/TPR and AUC,
    plots every fold's ROC curve, and finally prints averages over the
    k folds. The first fold's ROC points are appended to the
    module-level `roc_data` list.
    """
    train, target = load_spambase()
    train, target = shuffle(train, target)

    k = 10
    train_size = len(train)
    test_index_generator = k_fold_cross_validation(train_size, k)

    fold = 1

    # Running totals over folds for the test-set metrics.
    overall_acc = 0
    overall_error = 0
    overall_auc = 0

    # Running totals for the optional train-set metrics.
    overall_train_acc = 0
    overall_train_error = 0

    for start, end in test_index_generator:
        # Rows [start, end) form the held-out fold; the remainder trains.
        k_fold_train = np.vstack(
            (train[range(0, start)], train[range(end, train_size)]))
        test = train[range(start, end)]

        train_target = np.append(target[range(0, start)],
                                 target[range(end, train_size)])
        test_target = target[range(start, end)]

        cf = get_classifier(c)
        # fit() appears to return the classifier (fluent API); predict()
        # yields raw scores that predict_class() thresholds -- confirm.
        raw_predicts = cf.fit(k_fold_train, train_target).predict(test)
        predicts = cf.predict_class(raw_predicts)

        if on_train:
            raw_train_predicts = cf.predict(k_fold_train)
            train_predicts = cf.predict_class(raw_train_predicts)
            print '=============Fold %s Train==================' % fold
            cm = confusion_matrix(train_target, train_predicts)
            print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
                cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
            er, acc, fpr, tpr = confusion_matrix_analysis(cm)
            print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc,
                                                                      fpr, tpr)
            overall_train_acc += acc
            overall_train_error += er

        print '=============Fold %s Test==================' % fold
        cm = confusion_matrix(test_target, predicts)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr,
                                                                  tpr)

        # ROC is built from the raw scores, not the thresholded classes.
        roc = ROC(test_target, raw_predicts, 0, "NaN")
        recs = roc.create_roc_records()
        roc.plot_roc_data(recs)
        auc = roc.auc()
        print 'AUC: %s' % auc

        # Keep only the first fold's ROC points for later plotting.
        if fold == 1:
            roc_data.append(roc.points)

        overall_acc += acc
        overall_error += er
        overall_auc += auc
        fold += 1

    print '--------------- Result-------------------'
    if on_train:
        print 'Overall Accuracy: %s, Overall Error: %s' % (
            overall_train_acc / k, overall_train_error / k)
    print 'Overall Accuracy: %s, Overall Error: %s, Overall AUC: %s\n' % (
        overall_acc / k, overall_error / k, overall_auc / k)
# Example #12
def decision_tree():
    train, target = load_spambase()

    # 10 fold cross validation
    train_size = len(train)
    k = 10
    test_index_generator = cross_validation.k_fold_cross_validation(
        train_size, k)
    fold = 0
    train_accuracy = 0
    test_accuracy = 0
    train_mse = 0
    test_mse = 0

    for start, end in test_index_generator:
        train_left = train[range(0, start)]
        train_right = train[range(end, train_size)]
        k_fold_train = np.vstack((train_left, train_right))
        test = train[range(start, end)]

        target_left = target[range(0, start)]
        target_right = target[range(end, train_size)]
        train_target = np.append(target_left, target_right)
        test_target = target[range(start, end)]

        cf = tree.DecisionTree()
        cf = cf.fit(k_fold_train, train_target, 5)
        print "=========Tree=============="
        print_tree(cf.root)

        print '=============Train Data Result============'
        predict_train = cf.predict(k_fold_train)
        cm = confusion_matrix(train_target, predict_train)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr,
                                                                  tpr)
        train_accuracy += acc
        print "mse: ", mse(predict_train, train_target), " rmse: ", rmse(
            predict_train,
            train_target), " mae: ", mae(predict_train, train_target)
        train_mse += mse(predict_train, train_target)

        print '=============Test Data Result============'
        predict_test = cf.predict(test)
        cm = confusion_matrix(test_target, predict_test)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr,
                                                                  tpr)
        test_accuracy += acc
        print "mse: ", mse(predict_test, test_target), " rmse: ", rmse(
            predict_test, test_target), " mae: ", mae(predict_test,
                                                      test_target)
        test_mse += mse(predict_test, test_target)

        fold += 1

    print "Average train acc: %f, average test acc: %f" % (
        train_accuracy / fold, test_accuracy / fold)
    print "Average train mse: %f, average test mse: %f" % (train_mse / fold,
                                                           test_mse / fold)