def q1(): """ feature analysis with Adaboost """ #spamData = hw3u.pandas_to_data(hw3u.load_and_normalize_spambase()) spamData = utils.load_and_normalize_polluted_spam_data() k = 10 all_folds = hw3u.partition_folds(spamData, k) col_errs = [] kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) # We're not actually cross-validating anything -- we just want feature weights #X = np.concatenate([X, X_test], axis=0) #y = np.concatenate([y, y_test], axis=0) #adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='random')) adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='best')) #adaboost = adac.AdaboostOptimal(max_rounds=10, do_fast=False, learner=hw4u.TreeOptimal) adaboost.fit(X, y) margin_fractions = get_margin_fractions(adaboost, X[0]) #margin_fractions_v = hw5u.get_margin_fractions_validate(adaboost, X, y) #print col_errs ranked = rank(margin_fractions) print_ranks(ranked) pred = adaboost.predict(X_test) print 'Accuracy: {}'.format(accuracy_score(adaboost._check_y_not_zero(y_test), adaboost._check_y_not_zero(pred)))
def runDigits(n, skclf, myclf): mnsize = n df = hw6u.load_mnist_features(mnsize) data = utils.pandas_to_data(df) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False) y, X = np.asarray(y, dtype=np.float), np.asarray(X) y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False) y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float) print 'my fit' clf = OneVsRestClassifier(myclf).fit(X, y) print 'scikit fit' skclf = skclf.fit(X, y) print 'my predict' y_pred = clf.predict(X_test) myacc = accuracy_score(y_test, y_pred) print '({})'.format(myacc) print 'scikit predict' sk_pred = skclf.predict(X_test) print sk_pred print y_test print y_pred print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
def SpamClassifier(features, skclassifier, myclassifier): data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 if features != 'all': # Only use the features passed in the features array new = [] t = utils.transpose_array(data) for i in xrange(len(t)): if i in features: new.append(t[i]) data = utils.transpose_array(t) all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print 'start MyKNN' knn = hw7u.KNN(classifier=myclassifier) print 'start scikit' knnsci = hw7u.KNN(classifier=skclassifier) print 'start my pred' y_pred = knn.predict(X_test, X, y) print 'My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred))) print 'start sk pred' y_sci = knnsci.predict(X_test, X, y) print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred)))
def TreeTest(): spamDat = spamData() k = 10 all_folds = hw3.partition_folds(spamDat, k) num_in_fold = [] err_in_fold = [] for i in range(len(all_folds) - 1): spam = all_folds[i] num_in_fold.append(len(spam)) truth, f_data = decTree.split_truth_from_data(spam) tree = decTree.TreeOptimal(max_depth=2) #tree = decTree.TreeRandom() tree.fit(f_data, truth) print 'Prediction...\n' predict = tree.predict(f_data) print predict print truth error = 1. - hw3.get_accuracy(predict, truth) err_in_fold.append(error) print 'Tree error is: {}'.format(error) spam = all_folds[k -1] truth, f_data = decTree.split_truth_from_data(spam) tree = decTree.TreeOptimal(max_depth=2) #tree = decTree.TreeRandom() tree.fit(f_data, truth) predict = tree.predict(f_data) error = 1. - hw3.get_accuracy(predict, truth) sum_training_err = 0 for i in range(len(num_in_fold)): sum_training_err += err_in_fold[i] #sum_training_err += float(err_in_fold)/num_in_fold average_training_error = float(sum_training_err)/len(num_in_fold) print 'Average training error: {}\nAverage testing error: {}'.format(average_training_error, error)
def runDigitsDensity(n,_i, j): metric = ['minkowski', 'cosine', 'gaussian', 'poly2'] ma = hw7u.Kernel(ktype=metric[j]+'_sci').compute #skclf = KernelDensity(metric=ma) myclf = hw7u.MyKNN(metric=metric[j], density=True) mnsize = n df = hw6u.load_mnist_features(mnsize) data = utils.pandas_to_data(df) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False) y, X = np.asarray(y, dtype=np.float), np.asarray(X) y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False) y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=np.float) print 'my fit' clf = OneVsRestClassifier(myclf).fit(X, y) print 'scikit fit' #skclf = skclf.fit(X, y) print 'my predict' y_pred = clf.predict(X_test) myacc = accuracy_score(y_test, y_pred) print '({})'.format(myacc) #print 'scikit predict' #sk_pred = skclf.predict(X_test) #print sk_pred print y_test print y_pred #print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc) print 'My Accuracy: {}'.format(myacc)
def tests_radius(): i = 0 j = 0 k = 10 X, y = testData() #print X X = np.concatenate([X, y.reshape((len(y), 1))], axis=1) X = [list(x.ravel()) for x in X] radius = [3, 5, 7] radius = [1e-1,1e-2,1e-3] # for radius metric = ['minkowski', 'cosine', 'gaussian', 'poly2'] ma = speedy.Kernel(ktype=metric[j]).compute #ma = hw7u.Kernel(ktype=metric[j]).compute print 'spam radius is {}'.format(radius[i]) clf = hw7u.MyKNN(radius=radius[i], metric=metric[j], outlier_label=-1) skclf = RadiusNeighborsClassifier(radius=radius[i], algorithm='brute', metric="euclidean", p=2, outlier_label=.5) all_folds = hw3u.partition_folds(X, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print 'start scikit' knnsci = hw7u.KNN(classifier=skclf) print 'start MyKNN' knn = hw7u.KNN(classifier=clf) print 'start sk pred' y_sci = knnsci.predict(X_test, X, y) print 'start my pred' y_pred = knn.predict(X_test, X, y) print y_pred print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci)), accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred)))
def q1(): spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase()) k = 10 all_folds = hw3.partition_folds(spamData, k) tprs = [] fprs = [] for i in [0]: #range(len(all_folds)): kf_data, kf_test = dl.get_train_and_test(all_folds, i) y, X = hw4.split_truth_from_data(kf_data) y_test, X_test = hw4.split_truth_from_data(kf_test) adaboost = run_adaboost(X, y, X_test, y_test, i) predicted = adaboost.predict(X) print(roc_auc_score(y, predicted)) for i in range(len(adaboost.snapshots)): round_number = i + 1 ab = adaboost.snapshots[i] yt_pred = ab.predict(X_test) round_err = float(np.sum([1 if yt==yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test) adaboost.adaboost_error_test[round_number] = round_err print predicted[:20] print y[:20] name = 'q1' directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks' path = os.path.join(directory, name + 'hw4errors.pdf') tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf') print path plt.Errors([adaboost.local_errors]).plot_all_errors(path) plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath) roc = plt.ROC() #roc.add_tpr_fpr_arrays(adaboost.tpr.values(), adaboost.fpr.values()) get_tpr_fpr(adaboost, roc, X_test, y_test, 30) roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
def svm_q1(data, classifier=svm.SVC()): k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print 'length train: {} length test {}'.format(len(X), len(X_test)) clf = classifier clf.fit(X, y) y_pred = clf.predict(X_test) print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y), fix_y(clf.predict(X))), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
def q6(): """ Bagging - sample with replacement """ spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase()) y, X = hw4.split_truth_from_data(spamData) bagged = bag.Bagging(max_rounds=100, sample_size=1000, learner=lambda: DecisionTreeClassifier(max_depth=3)) bagged.fit(X, y) kf_fold = hw4.partition_folds(spamData, .4) test_y, test_X = hw4.split_truth_from_data(kf_fold[0]) test_pred = bagged.predict(test_X) test_y = bagged._check_y(test_y) test_pred = bagged._check_y(test_pred) test_error = float(sum([0 if py == ty else 1 for py, ty in zip(test_pred, test_y)]))/len(test_y) print 'Final testing error: {}'.format(test_error)
def q7(): h_test, h_train = utils.load_and_normalize_housing_set() housingData_test = hw3.pandas_to_data(h_test) housingData_train = hw3.pandas_to_data(h_train) y, X = hw4.split_truth_from_data(housingData_train) y_test, X_test = hw4.split_truth_from_data(housingData_test) #gb = GradientBoostingRegressor(learning_rate=.1, n_estimators=1, max_depth=1) gb = gradb.GradientBoostRegressor(learning_rate=.1, n_estimators=100, max_depth=1, learner=lambda: DecisionTreeRegressor(max_depth=1)) gb.fit(X, y) gb.print_stats() yhat = gb.predict(X_test) print y_test[:10] print yhat[:10] print 'MSE: {}'.format(hw4.compute_mse(y_test, yhat))
def q3(): """Run your code from PB1 on Spambase dataset to perform Active Learning. Specifically: - start with a training set of about 5% of the data (selected randomly) - iterate M episodes: train the Adaboost for T rounds; from the datapoints not in the training set, select the 2% ones that are closest to the separation surface (boosting score F(x) closest to ) and add these to the training set (with labels). Repeat until the size of the training set reaches 50% of the data. How is the performance improving with the training set increase? Compare the performance of the Adaboost algorithm on the c% randomly selected training set with c% actively-built training set for several values of c : 5, 10, 15, 20, 30, 50. """ spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase()) percent = .05 all_folds = hw4.partition_folds_q4(spamData, percent) kf_train = all_folds[0] kf_test = all_folds[1] left_over = all_folds[2] while len(kf_train) < len(spamData)/2: y, X = hw4.split_truth_from_data(kf_train) y_test, X_test = hw4.split_truth_from_data(kf_test) adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx') yt_pred = adaboost.predict(X_test) order = adaboost.rank(X_test) yt_pred = adaboost._check_y(yt_pred) y_test = adaboost._check_y(y_test) round_err = float(np.sum([1 if yt!=yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test) print 'Error {}'.format(round_err) shift_number = int(len(order) * .02) # number of items to switch into training set mask = [] for i in xrange(shift_number): mask.append(order[i]) kf_train.append(kf_test[order[i]]) new_test = [kf_test[i] for i in range(len(kf_test)) if i not in mask] for i in xrange(len(mask)): new_test.append(left_over[i]) left_over = left_over[len(mask):] kf_test = new_test[:] print 'test len {} train len {} leftover len {} shifting {}'.format(len(kf_test), len(kf_train), len(left_over), shift_number)
def fit(self, X, y):
    """Fit the gradient-boosting ensemble.

    X -- 2-D array-like of samples; y -- 1-D array-like of targets.

    Sets self.mean to the base prediction F_0 (the target mean —
    presumably self.predict adds it to the hypotheses' contributions;
    confirm against predict). Each round fits a fresh learner to the
    current residuals, then records the per-round residual MSE
    (self.local_error) and full-ensemble training MSE (self.training_error).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    self.mean = np.mean(y)
    # FIX: loop variable renamed from `round`, which shadowed the builtin.
    for round_num in xrange(self.max_rounds):
        # Residuals of the current ensemble; the new learner fits them.
        residual = [(yn - yl) for yn, yl in zip(y, self.predict(X))]
        hypothesis = self.learner().fit(X, residual)
        self.hypotheses.append(hypothesis)
        self.local_error.append(hw4.compute_mse(residual, hypothesis.predict(X)))
        pred_round = self.predict(X)
        self.train_score = hw4.compute_mse(pred_round, y)
        self.training_error.append(self.train_score)
def multiclassSVC(classifier, sz=2000): mnsize = sz df = hw6u.load_mnist_features(mnsize) data = utils.pandas_to_data(df) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False) y, X = np.asarray(y), np.asarray(X) y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False) y_test, X_test = np.asarray(y_test), np.asarray(X_test) print 'Beginning analysis: {}'.format(X.shape) #clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y) clf = OneVsOneClassifier(classifier).fit(X, y) #clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10, random_state=0).fit(np.asarray(X), y) y_pred = clf.predict(X) print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(y_pred), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))) print 'train acc: {} test acc: {}'.format(accuracy_score(fix_y(clf.predict(X)), fix_y(y)), accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test))))
def q2(): """Boosting on UCI datasets""" crx = dl.data_q3_crx() #crx = dl.data_q3_vote() num_points = len(crx) for i in xrange(5, 85, 5): percent = float(i)/100 all_folds = hw4.partition_folds(crx, percent) kf_train = all_folds[0] kf_test = all_folds[1] y, X = hw4.split_truth_from_data(kf_train) y_test, X_test = hw4.split_truth_from_data(kf_test) adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx') yt_pred = adaboost.predict(X_test) yt_pred = adaboost._check_y(yt_pred) y_test = adaboost._check_y(y_test) round_err = float(np.sum([1 if yt!=yp else 0 for yt, yp in zip(yt_pred, y_test)]))/len(y_test) last_round = adaboost.local_errors.keys()[-1] #print 'Error at {}%: Train: {} Test: {}'.format(percent, adaboost.adaboost_error[last_round], round_err) print 'Error at {}%: Test: {}'.format(percent, round_err)
def relief(n):
    """Relief-style feature weighting on one spam training fold.

    For every feature j and sample i, find the nearest same-class and
    nearest opposite-class neighbour by squared single-feature distance
    and accumulate weights[j] += (-closest_same + closest_opp); larger
    weights mean the feature separates the classes better.

    NOTE(review): the trailing `[:n][1]` returns only the SECOND
    (weight, index) pair of the top-n list, not the top n features —
    presumably `[:n]` (or the indices of the top n) was intended.
    Left as-is pending confirmation against callers.
    """
    # The following locals are assigned but never used below (likely
    # remnants of a KNN variant); kept to avoid behavior changes from the
    # Kernel constructor side effects.
    max_iters = 1
    j = 0
    i = 1
    n_neighbors = [1, 3, 7]
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    loops = 0
    weights = np.zeros(len(X[0]))  # one accumulated relevance weight per feature
    loops += 1
    n_features = len(X[0])
    n_samples = len(X)
    # O(n_features * n_samples^2) scan — slow but exact nearest-neighbour
    # search per feature.
    for j in range(n_features): #feature
        for i in range(n_samples): # data
            closest_same = None
            closest_opp = None
            for z_i in range(n_samples):
                if z_i == i:
                    continue
                # Squared distance along feature j only.
                diff = (X[z_i][j] - X[i][j]) ** 2
                if y[z_i] == y[i]: # same
                    if closest_same is None or diff < closest_same:
                        closest_same = diff
                else: # opp
                    if closest_opp is None or diff < closest_opp:
                        closest_opp = diff
            # NOTE(review): raises TypeError if a class has a single member
            # (closest_same stays None) — presumably never happens on spam data.
            weights[j] += (-closest_same + closest_opp)
            if i % 1000 == 0:
                print 'feature {} of {}, sample {} of {}'.format(j, n_features, i, n_samples)
    print weights
    return sorted(zip(weights, range(len(weights))), reverse=True)[:n][1]
def tests_density():
    """Kernel-density experiment on the toy test set.

    Grid-searches a KernelDensity bandwidth, fits on all but the last 10
    rows, and prints their log-density scores.

    NOTE(review): the bare `return` below makes everything after it
    unreachable — presumably a deliberate debug short-circuit of the
    fold-based KNN comparison; confirm before removing either part.
    """
    i = 0  # unused index placeholder (parity with tests_radius)
    j = 2  # selects 'gaussian' from `metric`
    k = 10  # fold count for the (currently unreachable) CV section
    X, y = testData()
    print X
    # Append labels as a trailing column so samples and labels stay together.
    X = np.concatenate([X, y.reshape((len(y), 1))], axis=1)
    X = [list(x.ravel()) for x in X]
    radius = [3, 5, 7]  # unused here; parity with tests_radius
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    ma = hw7u.Kernel(ktype=metric[j]).compute  # unused afterwards
    # Bandwidth grid search over 20 log-spaced values in [0.1, 10].
    params = {'bandwidth': np.logspace(-1, 1, 20)}
    grid = GridSearchCV(KernelDensity(), params)
    grid.fit(X)
    clf = hw7u.MyKNN(metric=metric[j], density=True)
    bw = grid.best_estimator_.bandwidth
    print("best bandwidth: {0}".format(bw))
    # use the best estimator to compute the kernel density estimate
    kde = grid.best_estimator_
    skclf = KernelDensity(bandwidth=bw, kernel='gaussian')
    skclf.fit(X[:-10], y[:-10])
    print skclf.score_samples(X[-10:])
    return
    # --- unreachable below this point (see NOTE in docstring) ---
    all_folds = hw3u.partition_folds(X, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print 'start scikit'
    knnsci = hw7u.KNN(classifier=skclf)
    print 'start MyKNN'
    knn = hw7u.KNN(classifier=clf)
    print 'start sk pred'
    y_sci = knnsci.predict(X_test, X, y)
    print 'start my pred'
    y_pred = knn.predict(X_test, X, y)
    print y_pred
    print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci)), accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred)))
def runSpamDensity(_i, j, features='all'): metric = ['gaussian', 'poly2', 'cosine_similarity', 'gaussian_density'] data = utils.pandas_to_data(utils.load_and_normalize_spam_data()) k = 10 all_folds = hw3u.partition_folds(data, k) kf_train, kf_test = dl.get_train_and_test(all_folds, 0) y, X = hw4u.split_truth_from_data(kf_train) y_test, X_test = hw4u.split_truth_from_data(kf_test) print(len(X)) print(len(X_test)) myclassifier = hw7u.MyKNN(metric=metric[j], density=True) print 'start MyKNN' myclassifier.fit(X, y) #print 'start scikit' #knnsci = skclassifier.fit(X, y) print 'start my pred' y_pred = myclassifier.predict(X_test) print(y_pred) #print 'start sk pred' #y_sci = knnsci.score(X_test) #print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_sci)), accuracy_score(fix_y(y_test), fix_y(y_pred))) print '2b: My Accuracy: {}'.format(accuracy_score(fix_y(y_test), fix_y(y_pred)))