def spam():
    """Evaluate linear regression on spambase with 10-fold cross validation."""
    train, target = load_spambase()
    normalize_columns = [55, 56]
    normalize(train, normalize_columns)
    # add a column of constant 1.0 values at index 0
    train = append_new_column(train, 1.0, 0)

    # 10 fold cross validation
    train_size = len(train)
    k = 10
    test_index_generator = cross_validation.k_fold_cross_validation(
        train_size, k)
    fold = 0
    train_accuracy = 0
    test_accuracy = 0
    train_mse = 0
    test_mse = 0
    for start, end in test_index_generator:
        # rows outside [start, end) form the training fold, rows inside it the test fold
        train_left = train[range(0, start)]
        train_right = train[range(end, train_size)]
        k_fold_train = np.vstack((train_left, train_right))
        test = train[range(start, end)]
        target_left = target[range(0, start)]
        target_right = target[range(end, train_size)]
        train_target = np.append(target_left, target_right)
        test_target = target[range(start, end)]

        cf = LinearRegression()
        cf = cf.fit(k_fold_train, train_target)

        print '=============Train Data Result============'
        predict_train = cf.predict(k_fold_train)
        cm = confusion_matrix(train_target, predict_train)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
        train_accuracy += acc
        print "mse: ", mse(predict_train, train_target), " rmse: ", rmse(
            predict_train, train_target), " mae: ", mae(predict_train, train_target)
        train_mse += mse(predict_train, train_target)

        print '=============Test Data Result============'
        predict_test = cf.predict(test)
        cm = confusion_matrix(test_target, predict_test)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
        test_accuracy += acc
        fold += 1
        print "mse: ", mse(predict_test, test_target), " rmse: ", rmse(
            predict_test, test_target), " mae: ", mae(predict_test, test_target)
        test_mse += mse(predict_test, test_target)

    print "Average train acc: %f, average test acc: %f" % (
        train_accuracy / fold, test_accuracy / fold)
    print "Average train mse: %f, average test mse: %f" % (
        train_mse / fold, test_mse / fold)
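
# k_fold_cross_validation() above is assumed to yield (start, end) index
# boundaries of the test slice for each of the k folds; a minimal sketch of
# that behaviour (the project's own helper in cross_validation may differ):
def k_fold_cross_validation_sketch(n, k):
    fold_size = n / k  # integer division under Python 2
    for i in range(k):
        start = i * fold_size
        # let the last fold absorb the remainder when n is not divisible by k
        end = n if i == k - 1 else start + fold_size
        yield start, end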
def gaussian_naive_bayes1():
    train, target = load_spambase()
    train, test, train_target, test_target = train_test_shuffle_split(
        train, target, len(train) / 10)
    cf = GaussianNaiveBayes()
    predicts = cf.fit(train, train_target).predict_class(test)
    cm = confusion_matrix(test_target, predicts)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
        cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
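
# GaussianNaiveBayes above exposes fit()/predict_class(); a minimal sketch of
# the underlying idea -- per-feature, per-class Gaussian log-likelihoods plus
# a class prior -- under assumptions that may differ from the real class:
class GaussianNaiveBayesSketch(object):
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.stats = {}
        for c in self.classes:
            Xc = X[y == c]
            # per-class feature means, (smoothed) variances, and class prior
            self.stats[c] = (Xc.mean(axis=0),
                             Xc.var(axis=0) + 1e-9,
                             1.0 * len(Xc) / len(X))
        return self

    def predict_class(self, X):
        scores = []
        for c in self.classes:
            mean, var, prior = self.stats[c]
            # log of the Gaussian density, summed over independent features
            log_lik = -0.5 * np.sum(np.log(2 * np.pi * var)
                                    + (X - mean) ** 2 / var, axis=1)
            scores.append(log_lik + np.log(prior))
        return self.classes[np.argmax(scores, axis=0)]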
def decision_tree_all_data():
    train, target = load_spambase()
    cf = tree.DecisionTree()
    cf = cf.fit(train, target, 5)
    print_tree(cf.root)
    predicts = cf.predict(train)
    cm = confusion_matrix(target, predicts)
    print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
        cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
    er, acc, fpr, tpr = confusion_matrix_analysis(cm)
    print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
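
# confusion_matrix() above is assumed to return a 2x2 array laid out as
# [[TN, FP], [FN, TP]] for 0/1 class labels, matching how it is indexed in
# the print statements; a minimal sketch of that convention:
def confusion_matrix_sketch(target, predicts):
    cm = np.zeros((2, 2), dtype=int)
    for t, p in zip(target, predicts):
        cm[int(t), int(p)] += 1  # rows are actual labels, columns are predictions
    return cm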
def spambase(T=100):
    train, target = load_spambase()
    # AdaBoost works with labels in {-1, +1}, so remap the 0/1 spam labels
    target = np.array(map(lambda v: -1.0 if v == 0 else 1.0, target))
    train, test, train_target, test_target = train_test_shuffle_split(
        train, target, len(train) / 10)
    boost = AdaBoost()
    start = timeit.default_timer()
    boost.boost(train, train_target, test, test_target, T)
    stop = timeit.default_timer()
    print "Total Run Time: %s secs" % (stop - start)
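
# AdaBoost.boost() above runs T rounds of boosting on {-1, +1} labels; a
# minimal sketch of the per-round weight update it is assumed to perform
# (the weak learner, alpha bookkeeping, and evaluation are omitted):
def adaboost_round_sketch(weights, predictions, labels):
    # weighted error of the current weak learner on the training set
    err = np.sum(weights * (predictions != labels))
    alpha = 0.5 * np.log((1.0 - err) / max(err, 1e-12))
    # up-weight misclassified examples, down-weight correctly classified ones
    weights = weights * np.exp(-alpha * labels * predictions)
    return weights / np.sum(weights), alpha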
def spam(step, loop, converge):
    train, target = load_spambase()
    train, test, train_target, test_target = cross_validation.train_test_shuffle_split(
        train, target, len(train) / 10)
    # fit the scaler on the training data only, then apply it to the test data
    scaler = normalize(train)
    train = append_new_column(train, 1.0, 0)
    scaler.scale_test(test)
    test = append_new_column(test, 1.0, 0)

    print '\n============== Logistic Regression - Stochastic Gradient Descent ==============='
    spam_logistic(train, test, train_target, test_target, step, loop, converge)
    print '\n============== Linear Regression - Stochastic Gradient Descent ==============='
    spam_linear(train, test, train_target, test_target, step, loop, converge)
    print '\n============== Linear Regression - Normal Equation ==============='
    spam_normal_equation(train, test, train_target, test_target)
    print '\n============== Decision Tree ===================================='
    spam_decision_tree(train, test, train_target, test_target)
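
# spam_logistic()/spam_linear() above are assumed to train by stochastic
# gradient descent controlled by (step, loop, converge) -- read here as
# learning rate, maximum epochs, and stopping tolerance -- and
# spam_normal_equation() by the closed-form solve w = (X^T X)^{-1} X^T y.
# Minimal sketches of both ideas under those assumptions:
def logistic_sgd_epoch_sketch(w, X, y, step):
    for i in range(len(X)):
        pred = 1.0 / (1.0 + np.exp(-np.dot(w, X[i])))  # sigmoid of the linear score
        w = w + step * (y[i] - pred) * X[i]            # per-sample gradient step
    return w

def normal_equation_sketch(X, y):
    # solve (X^T X) w = X^T y rather than forming the explicit inverse
    return np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y))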
def naive_bayes(c, on_train=False):
    train, target = load_spambase()
    train, target = shuffle(train, target)

    # 10 fold cross validation
    k = 10
    train_size = len(train)
    test_index_generator = k_fold_cross_validation(train_size, k)
    fold = 1
    overall_acc = 0
    overall_error = 0
    overall_auc = 0
    overall_train_acc = 0
    overall_train_error = 0
    for start, end in test_index_generator:
        k_fold_train = np.vstack(
            (train[range(0, start)], train[range(end, train_size)]))
        test = train[range(start, end)]
        train_target = np.append(target[range(0, start)],
                                 target[range(end, train_size)])
        test_target = target[range(start, end)]

        cf = get_classifier(c)
        raw_predicts = cf.fit(k_fold_train, train_target).predict(test)
        predicts = cf.predict_class(raw_predicts)

        if on_train:
            raw_train_predicts = cf.predict(k_fold_train)
            train_predicts = cf.predict_class(raw_train_predicts)
            print '=============Fold %s Train==================' % fold
            cm = confusion_matrix(train_target, train_predicts)
            print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
                cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
            er, acc, fpr, tpr = confusion_matrix_analysis(cm)
            print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
            overall_train_acc += acc
            overall_train_error += er

        print '=============Fold %s Test==================' % fold
        cm = confusion_matrix(test_target, predicts)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)

        roc = ROC(test_target, raw_predicts, 0, "NaN")
        recs = roc.create_roc_records()
        roc.plot_roc_data(recs)
        auc = roc.auc()
        print 'AUC: %s' % auc
        if fold == 1:
            roc_data.append(roc.points)

        overall_acc += acc
        overall_error += er
        overall_auc += auc
        fold += 1

    print '--------------- Result-------------------'
    if on_train:
        print 'Overall Accuracy: %s, Overall Error: %s' % (
            overall_train_acc / k, overall_train_error / k)
    print 'Overall Accuracy: %s, Overall Error: %s, Overall AUC: %s\n' % (
        overall_acc / k, overall_error / k, overall_auc / k)
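
# ROC(...).auc() above summarizes how well raw_predicts ranks the positive
# class; a minimal sketch of the usual construction -- sweep a threshold down
# the sorted scores, collect (FPR, TPR) points, and integrate with the
# trapezoid rule (the project's ROC class may differ in its details):
def auc_sketch(targets, scores):
    order = np.argsort(scores)[::-1]  # sort by predicted score, descending
    targets = np.asarray(targets)[order]
    pos = float(np.sum(targets == 1))
    neg = float(np.sum(targets != 1))
    tpr = np.concatenate(([0.0], np.cumsum(targets == 1) / pos))
    fpr = np.concatenate(([0.0], np.cumsum(targets != 1) / neg))
    return np.trapz(tpr, fpr)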
def decision_tree():
    train, target = load_spambase()

    # 10 fold cross validation
    train_size = len(train)
    k = 10
    test_index_generator = cross_validation.k_fold_cross_validation(
        train_size, k)
    fold = 0
    train_accuracy = 0
    test_accuracy = 0
    train_mse = 0
    test_mse = 0
    for start, end in test_index_generator:
        train_left = train[range(0, start)]
        train_right = train[range(end, train_size)]
        k_fold_train = np.vstack((train_left, train_right))
        test = train[range(start, end)]
        target_left = target[range(0, start)]
        target_right = target[range(end, train_size)]
        train_target = np.append(target_left, target_right)
        test_target = target[range(start, end)]

        cf = tree.DecisionTree()
        cf = cf.fit(k_fold_train, train_target, 5)
        print "=========Tree=============="
        print_tree(cf.root)

        print '=============Train Data Result============'
        predict_train = cf.predict(k_fold_train)
        cm = confusion_matrix(train_target, predict_train)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
        train_accuracy += acc
        print "mse: ", mse(predict_train, train_target), " rmse: ", rmse(
            predict_train, train_target), " mae: ", mae(predict_train, train_target)
        train_mse += mse(predict_train, train_target)

        print '=============Test Data Result============'
        predict_test = cf.predict(test)
        cm = confusion_matrix(test_target, predict_test)
        print "confusion matrix: TN: %s, FP: %s, FN: %s, TP: %s" % (
            cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1])
        er, acc, fpr, tpr = confusion_matrix_analysis(cm)
        print 'Error rate: %f, accuracy: %f, FPR: %f, TPR: %f' % (er, acc, fpr, tpr)
        test_accuracy += acc
        print "mse: ", mse(predict_test, test_target), " rmse: ", rmse(
            predict_test, test_target), " mae: ", mae(predict_test, test_target)
        test_mse += mse(predict_test, test_target)
        fold += 1

    print "Average train acc: %f, average test acc: %f" % (
        train_accuracy / fold, test_accuracy / fold)
    print "Average train mse: %f, average test mse: %f" % (train_mse / fold,
                                                           test_mse / fold)
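
# mse(), rmse(), and mae() above are assumed to be the standard regression
# error metrics; minimal sketches consistent with how they are called:
def mse_sketch(predicts, targets):
    diff = np.asarray(predicts) - np.asarray(targets)
    return np.mean(diff ** 2)  # mean squared error

def rmse_sketch(predicts, targets):
    return np.sqrt(mse_sketch(predicts, targets))  # root mean squared error

def mae_sketch(predicts, targets):
    return np.mean(np.abs(np.asarray(predicts) - np.asarray(targets)))  # mean absolute error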