def q2_plots():
    """Draw one ROC curve per Naive Bayes model type on the spambase data.

    Each of the four model types is trained on the first fold only and then
    scored on that same fold at ``num_points + 2`` evenly spaced thresholds
    running from 0 to 1.  Every prediction vector is printed and handed to
    the ROC accumulator, which is plotted to ``roc_<model_type>.pdf`` and
    summarised through ``print_info``.
    """
    model_names = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spam_rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    fold_count = 10
    num_points = 50
    folds = hw3.partition_folds(spam_rows, fold_count)

    for model_type, model_name in enumerate(model_names):
        roc = ROC.ROC()
        print('\nModel: {}'.format(model_name))
        # Accumulated exactly as before; the total is never reported.
        accuracy_total = 0
        # Only model type 0 receives a non-zero alpha.
        alpha = .001 if model_type == 0 else 0
        classifier = nb.NaiveBayes(model_type, alpha=alpha)
        labels, features, _col_mus, _label_mu = hw3.get_data_and_mus(folds[0])
        classifier.train(features, labels)
        for step in range(num_points + 2):
            theta = step * 1. / (num_points + 1)
            predicted = classifier.predict(features, theta)
            print(predicted)
            accuracy_total += hw3.get_accuracy(predicted, labels)
            roc.add_tp_tn(predicted, labels, theta)
            #print_plot_output(ki, accuracy, theta)
        roc.plot_ROC('/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks/roc_{}.pdf'.format(model_type))
        roc.print_info()
def TreeTest():
    """Fit a depth-2 ``TreeOptimal`` on every fold and report the errors.

    Each fold except the last gets its own tree, which is fitted and scored
    on that same fold; predictions, truth and error are printed and the
    errors are averaged into the "training" figure.  The fold at index
    ``k - 1`` is handled the same way, without the per-fold printing, and
    its error is reported as the "testing" figure.
    """
    def fold_error(fold, verbose):
        # Fit a fresh tree on `fold` and return 1 - accuracy on that fold.
        truth, f_data = decTree.split_truth_from_data(fold)
        tree = decTree.TreeOptimal(max_depth=2)
        #tree = decTree.TreeRandom()
        tree.fit(f_data, truth)
        if verbose:
            print('Prediction...\n')
        predict = tree.predict(f_data)
        if verbose:
            print(predict)
            print(truth)
        return 1. - hw3.get_accuracy(predict, truth)

    spam_rows = spamData()
    k = 10
    all_folds = hw3.partition_folds(spam_rows, k)

    training_errors = []
    for fold_index in range(len(all_folds) - 1):
        fold_err = fold_error(all_folds[fold_index], True)
        training_errors.append(fold_err)
        print('Tree error is: {}'.format(fold_err))

    testing_error = fold_error(all_folds[k - 1], False)
    average_training_error = float(sum(training_errors)) / len(training_errors)
    print('Average training error: {}\nAverage testing error: {}'.format(average_training_error, testing_error))
def testPdToDict():
    """Print a small spambase subset before and after ``hw3.pandas_to_data``.

    The subset is whatever ``utils.train_subset`` returns for the first
    three columns of the normalized frame with a size argument of 5.
    """
    frame = hw3.load_and_normalize_spambase()
    subset = utils.train_subset(frame, frame.columns[0:3], 5)
    print(subset)
    converted = hw3.pandas_to_data(subset)
    print(converted)
def model_bin_train(self, data_row, truth, num_bins=2):
    """Train the binned Naive Bayes model.

    Cutoffs are built per column from the column mean (``mu``) and the
    means of the values strictly below / strictly above it.  For each label
    in {0, 1} the rows carrying that label are transposed to columns and
    passed, with the cutoffs, to ``hw3.bins_per_column_by_col``.

    The earlier uniform-width binning (``hw3.bins_per_column`` over global
    min/max cutoffs) was computed and then overwritten on the very next
    line, so it has been removed.

    Args:
        data_row: feature rows (list of equal-length rows).
        truth: 0/1 labels aligned with ``data_row``.
        num_bins: 4 selects the four-cutoff layout; any other value selects
            the six-cutoff layout.

    Returns:
        dict mapping label -> result of ``hw3.bins_per_column_by_col``.

    Side effects:
        Sets ``self.y_prob`` (fraction of labels equal to 1, assuming 0/1
        labels) and ``self.cutoffs``.
    """
    #TODO add epsilon
    cutoffsc = [[] for _ in range(len(data_row[0]))]
    data_col = hw3.transpose_array(data_row)

    # NOTE(review): min()/max() over the list of columns compares columns
    # lexicographically, so these are the first element of the "smallest" /
    # "largest" column and are shared by every column.  Possibly
    # min(data_col[j]) / max(data_col[j]) was intended -- confirm before
    # changing, since trained models depend on it.
    four_cutoffs = num_bins == 4
    floor_value = min(data_col)[0]
    ceiling_value = None if four_cutoffs else max(data_col)[0]

    for j, column in enumerate(data_col):
        values = np.asarray(column)
        mu = values.mean()
        low_mu = values[values < mu].mean()
        high_mu = values[values > mu].mean()
        if four_cutoffs:
            cutoffsc[j] = [floor_value, low_mu, mu, high_mu]
        else:
            # NOTE(review): the in-between entries are half-differences,
            # not midpoints, and the list is not guaranteed to be sorted --
            # confirm against hw3.classify_x expectations.
            cutoffsc[j] = [floor_value,
                           (low_mu - floor_value) / 2,
                           mu,
                           (high_mu - mu) / 2,
                           high_mu,
                           (ceiling_value - high_mu) / 2]

    #epsilon = float(alpha * 1) / len(covar_matrix)
    model = {}
    for label in [0, 1]:
        # transpose to go by column
        sub_data = hw3.transpose_array(hw3.get_sub_at_value(data_row, truth, label))
        # probability of bin given label
        model[label] = hw3.bins_per_column_by_col(sub_data, cutoffsc)

    self.y_prob = float(sum(truth)) / len(truth)
    self.cutoffs = cutoffsc
    return model
def q1():
    """GDA: Gaussian Discriminant Analysis on the spambase folds.

    Assignment text: run Gaussian Discriminant Analysis on the spambase
    data using the k folds from the previous problem.  With 57 real-valued
    features, each of the two Gaussians (+ class and - class) has a
    57-component mean vector and either a shared 57x57 covariance matrix
    estimated from all training data, or a separate 57x57 covariance matrix
    per class.  Looking at training and testing performance, does the
    Gaussian assumption appear to hold for this dataset?

    As implemented here, a separate ``hw3.GDA`` is trained on each of the
    first k-1 folds (covariance from ``hw3.get_covar`` over that fold) and
    scored on the same fold; the score is reported through
    ``print_output``.
    """
    # pandas_to_data returns an array of arrays - this is by row
    spam_rows = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spam_rows, k)

    for ki in range(k - 1):
        X, truth = hw3.separate_X_and_y(k_folds[ki])
        fold_covariance = hw3.get_covar(X)

        gda = hw3.GDA()
        gda.p_y = float(sum(truth)) / len(truth)
        gda.train(X, fold_covariance, truth)

        predictions = gda.predict(X)
        #print predictions
        accuracy = mystats.get_error(predictions, truth, True)
        print_output(ki, accuracy)
        #print gda.prob
def q1():
    """Run AdaBoost on the first spambase train/test split and plot results.

    Steps:
      * train through ``run_adaboost`` and print the training ROC AUC;
      * for every stored snapshot (one per boosting round, 1-based) record
        the test error in ``adaboost.adaboost_error_test``;
      * write the local-error plot, the train/test error plot and the ROC
        plot as PDFs into the homework directory.

    Fix: the per-round test figure used to count *matching* predictions,
    so the "test error" curve actually showed accuracy.  It now counts
    mismatches.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    all_folds = hw3.partition_folds(spamData, k)

    # Only the first split is evaluated (was: for i in [0]).
    fold = 0
    kf_data, kf_test = dl.get_train_and_test(all_folds, fold)
    y, X = hw4.split_truth_from_data(kf_data)
    y_test, X_test = hw4.split_truth_from_data(kf_test)
    adaboost = run_adaboost(X, y, X_test, y_test, fold)
    predicted = adaboost.predict(X)
    print(roc_auc_score(y, predicted))

    for round_number, snapshot in enumerate(adaboost.snapshots, 1):
        yt_pred = snapshot.predict(X_test)
        # NOTE(review): assumes predictions and y_test use the same label
        # encoding -- confirm against hw4.split_truth_from_data.
        wrong = sum(1 for yp, yt in zip(yt_pred, y_test) if yp != yt)
        adaboost.adaboost_error_test[round_number] = float(wrong) / len(y_test)

    print(predicted[:20])
    print(y[:20])

    name = 'q1'
    directory = '/Users/Admin/Dropbox/ML/MachineLearning_CS6140/CS6140_A_MacLeay/Homeworks'
    path = os.path.join(directory, name + 'hw4errors.pdf')
    tterrpath = os.path.join(directory, name + 'hw4_errors_test_train.pdf')
    print(path)
    plt.Errors([adaboost.local_errors]).plot_all_errors(path)
    plt.Errors([adaboost.adaboost_error, adaboost.adaboost_error_test]).plot_all_errors(tterrpath)

    roc = plt.ROC()
    #roc.add_tpr_fpr_arrays(adaboost.tpr.values(), adaboost.fpr.values())
    get_tpr_fpr(adaboost, roc, X_test, y_test, 30)
    roc.plot_ROC(os.path.join(directory, name + 'hw4_roc.pdf'))
def testTransposeArray():
    """Print a small spambase subset and its ``hw3.transpose_array`` result.

    The subset comes from ``utils.train_subset`` over the first three
    columns of the normalized frame with a size argument of 5.
    """
    frame = hw3.load_and_normalize_spambase()
    subset = utils.train_subset(frame, frame.columns[0:3], 5)
    rows = hw3.pandas_to_data(subset)
    print(rows)
    print(hw3.transpose_array(rows))
def model_gaussian_rand_var_train(self, data, truth):
    """Collect per-label column statistics for the Gaussian model.

    For each label in {0, 1} the rows with that label are selected and
    ``hw3.get_mus`` / ``hw3.get_std_dev`` are recorded for them.

    Returns:
        ``[mus, std_dev, y_prob]`` where ``mus`` and ``std_dev`` are dicts
        keyed by label and ``y_prob`` is ``sum(truth) / len(truth)``.

    Side effects:
        Stores the same ``y_prob`` on ``self.y_prob``.
    """
    means_by_label = {}
    spreads_by_label = {}
    for label in (0, 1):
        label_rows = hw3.get_sub_at_value(data, truth, label)
        means_by_label[label] = hw3.get_mus(label_rows)
        spreads_by_label[label] = hw3.get_std_dev(label_rows)

    positive_fraction = float(sum(truth)) / len(truth)
    self.y_prob = positive_fraction
    return [means_by_label, spreads_by_label, positive_fraction]
def q6():
    """Bagging - sample with replacement.

    Fits a bagging ensemble (100 rounds, sample size 1000, depth-3 decision
    trees) on every spambase row, then prints the error on the first fold
    returned by ``hw4.partition_folds(spamData, .4)``.  That fold is drawn
    from the same rows the ensemble was fitted on.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    y, X = hw4.split_truth_from_data(spamData)

    def make_tree():
        return DecisionTreeClassifier(max_depth=3)

    bagged = bag.Bagging(max_rounds=100, sample_size=1000, learner=make_tree)
    bagged.fit(X, y)

    held_out = hw4.partition_folds(spamData, .4)[0]
    test_y, test_X = hw4.split_truth_from_data(held_out)
    test_pred = bagged.predict(test_X)
    # Pass both label vectors through the ensemble's own _check_y.
    test_y = bagged._check_y(test_y)
    test_pred = bagged._check_y(test_pred)

    misses = [0 if predicted == actual else 1
              for predicted, actual in zip(test_pred, test_y)]
    test_error = float(sum(misses)) / len(test_y)
    print('Final testing error: {}'.format(test_error))
def q2():
    """Train and evaluate the four Naive Bayes variants on spambase folds.

    For each model type a separate model is trained and scored on each of
    the first k-1 folds.  The per-fold models are then merged into one
    combined model (``aggregate_model`` for types 0 and 1,
    ``aggregate_model3`` for the binned types) which is scored on the last
    fold.  Per-fold and summary figures go through ``print_output`` and
    ``print_test_output``.
    """
    models = ['Bernoulli', 'Gaussian', '4-bins', '9-bins']
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    k = 10
    k_folds = hw3.partition_folds(spamData, k)

    for model_type, model_name in enumerate(models):
        print('\nModel: {}'.format(model_name))
        # Only model type 0 receives a non-zero alpha for the per-fold models.
        fold_alpha = .001 if model_type == 0 else 0
        train_acc_sum = 0
        fold_models = []

        for ki in range(k - 1):
            fold_model = nb.NaiveBayes(model_type, alpha=fold_alpha)
            truth_rows, data_rows, _mus, _y_mu = hw3.get_data_and_mus(k_folds[ki])
            fold_model.train(data_rows, truth_rows)
            predict = fold_model.predict(data_rows)
            print(predict)
            accuracy = hw3.get_accuracy(predict, truth_rows)
            train_acc_sum += accuracy
            print_output(ki, accuracy)
            fold_models.append(fold_model)

        nb_combined = nb.NaiveBayes(model_type, alpha=.001)
        if model_type >= 2:
            nb_combined.aggregate_model3(fold_models)
        else:
            nb_combined.aggregate_model(fold_models)

        truth_rows, data_rows, _mus, _y_mu = hw3.get_data_and_mus(k_folds[k - 1])
        test_predict = nb_combined.predict(data_rows)
        test_accuracy = hw3.get_accuracy(test_predict, truth_rows)
        print_test_output(test_accuracy, float(train_acc_sum) / (k - 1))

    #print len(k_folds[0])
    # Result unused; call retained from the original.
    truth_rows, data_rows, data_mus, y_mu = hw3.get_data_and_mus(spamData)
def q7():
    """Fit the gradient-boosting regressor on the housing set and print MSE.

    Uses 100 depth-1 regression trees with learning rate 0.1, prints the
    regressor's stats, the first ten true and predicted test targets, and
    the test MSE from ``hw4.compute_mse``.
    """
    h_test, h_train = utils.load_and_normalize_housing_set()
    test_rows = hw3.pandas_to_data(h_test)
    train_rows = hw3.pandas_to_data(h_train)
    y, X = hw4.split_truth_from_data(train_rows)
    y_test, X_test = hw4.split_truth_from_data(test_rows)

    def make_stump():
        return DecisionTreeRegressor(max_depth=1)

    #gb = GradientBoostingRegressor(learning_rate=.1, n_estimators=1, max_depth=1)
    gb = gradb.GradientBoostRegressor(learning_rate=.1, n_estimators=100,
                                      max_depth=1, learner=make_stump)
    gb.fit(X, y)
    gb.print_stats()

    yhat = gb.predict(X_test)
    print(y_test[:10])
    print(yhat[:10])
    mse = hw4.compute_mse(y_test, yhat)
    print('MSE: {}'.format(mse))
def q3():  # Got points off b/c I have 89 accuracy instead of 92
    """Logistic Regression on the polluted spam data, with a ridge baseline.

    Runs k-fold cross-validation (k = 10): for each fold the remaining
    folds are grouped into the training set, a scikit-learn
    ``LogisticRegression`` and the ``hw5u.Ridge`` model are fitted, and
    both are scored on the training set and on the held-out fold after
    thresholding predictions at 0.5.

    Fix: the closing summary passed four averages to a format string with
    three placeholders, so the ridge test average was silently dropped and
    the ridge train average was labelled just "HW2 ridge".  All four
    averages are now printed with explicit labels.
    """
    def to_labels(scores):
        # Threshold raw predictions at 0.5 into 0/1 labels.
        return [1 if p >= .5 else 0 for p in scores]

    data = utils.load_and_normalize_polluted_spam_data()
    k = 10
    k_folds = hw3u.partition_folds(data, k)
    train_acc = []
    test_acc = []
    hw2_train_acc = []
    hw2_test_acc = []

    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        y, X = utils.split_truth_from_data(grouped_fold)
        y_truth, X_test = utils.split_truth_from_data(k_folds[ki])

        clf = lm.LogisticRegression()  #penalty="l1")
        ridge_clf = hw5u.Ridge()
        #clf = lm.Lasso(alpha=.5)
        #clf = lm.RidgeClassifier(alpha=.1)
        clf.fit(X, y)
        ridge_clf.fit(X, y)

        y_train = to_labels(clf.predict(X))
        y_test = to_labels(clf.predict(X_test))
        yhat_ridge_train = to_labels(ridge_clf.predict(X))
        yhat_ridge_test = to_labels(ridge_clf.predict(X_test))

        train_acc.append(accuracy_score(y, y_train))
        test_acc.append(accuracy_score(y_truth, y_test))
        hw2_train_acc.append(accuracy_score(y, yhat_ridge_train))
        hw2_test_acc.append(accuracy_score(y_truth, yhat_ridge_test))
        print('Fold {} train acc: {} test acc: {} HW2 ridge train: {} HW2 ridge test: {}'.format(
            ki + 1, train_acc[-1], test_acc[-1], hw2_train_acc[-1], hw2_test_acc[-1]))

    print('Average acc - Train: {} Test: {} HW2 ridge train: {} HW2 ridge test: {}'.format(
        np.mean(train_acc), np.mean(test_acc), np.mean(hw2_train_acc), np.mean(hw2_test_acc)))
def SpamClassifier(features, skclassifier, myclassifier):
    """Compare a hand-written and a scikit-learn KNN classifier on spam data.

    Args:
        features: ``'all'`` to use every column, otherwise a collection of
            column indices to keep.
        skclassifier: scikit-learn classifier wrapped in ``hw7u.KNN``.
        myclassifier: project classifier wrapped in ``hw7u.KNN``.

    The data is split into 10 folds; the first train/test split is used.
    Accuracies of both classifiers on the test split are printed.

    Fix: the column filter used to collect the selected columns and then
    transpose the *unfiltered* columns back, so ``features`` had no effect.
    The selected columns are now used.
    """
    data = utils.pandas_to_data(utils.load_and_normalize_spam_data())
    k = 10
    if features != 'all':
        # Only use the features passed in the features array
        columns = utils.transpose_array(data)
        # NOTE(review): the last column is kept unconditionally on the
        # assumption that it holds the truth label consumed by
        # hw4u.split_truth_from_data below -- confirm.
        label_index = len(columns) - 1
        selected = [column for index, column in enumerate(columns)
                    if index in features or index == label_index]
        data = utils.transpose_array(selected)

    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print('start MyKNN')
    knn = hw7u.KNN(classifier=myclassifier)
    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclassifier)

    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    # Computed once and reused in the final comparison line.
    my_accuracy = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('My Accuracy: {}'.format(my_accuracy))

    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    sk_accuracy = accuracy_score(fix_y(y_test), fix_y(y_sci))
    print('SciKit Accuracy: {} My Accuracy: {}'.format(sk_accuracy, my_accuracy))
def runDigits(n, skclf, myclf):
    """Compare a project classifier with a scikit-learn one on MNIST features.

    Args:
        n: size argument forwarded to ``hw6u.load_mnist_features``.
        skclf: scikit-learn classifier, fitted directly.
        myclf: project classifier, fitted inside ``OneVsRestClassifier``.

    Uses the first of 10 train/test splits and prints predictions and both
    accuracies.

    Fix: ``np.float`` was only an alias of the builtin ``float`` and has
    been removed from recent NumPy releases; the builtin is used instead
    (identical dtype).
    """
    df = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    # replace_zeros=False: digit labels are passed through unchanged by the
    # splitter (presumably 0 is a legitimate class here -- confirm).
    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    skclf = skclf.fit(X, y)

    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))

    print('scikit predict')
    sk_pred = skclf.predict(X_test)
    print(sk_pred)
    print(y_test)
    print(y_pred)
    print('SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc))
def runDigitsDensity(n,_i, j):
    """Run the density-based ``MyKNN`` on MNIST features and print accuracy.

    Args:
        n: size argument forwarded to ``hw6u.load_mnist_features``.
        _i: unused; kept for call compatibility.
        j: index into ``['minkowski', 'cosine', 'gaussian', 'poly2']``
            selecting the metric.

    Uses the first of 10 train/test splits; the classifier is fitted inside
    ``OneVsRestClassifier``.

    Changes: ``np.float`` (an alias of the builtin ``float`` that has been
    removed from recent NumPy releases) is replaced by ``float``; the
    ``'<metric>_sci'`` kernel callable that only fed the commented-out
    scikit-learn ``KernelDensity`` path is no longer built.
    """
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    #skclf = KernelDensity(metric=ma)
    myclf = hw7u.MyKNN(metric=metric[j], density=True)

    df = hw6u.load_mnist_features(n)
    data = utils.pandas_to_data(df)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y, dtype=float), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test, dtype=float)

    print('my fit')
    clf = OneVsRestClassifier(myclf).fit(X, y)
    print('scikit fit')
    #skclf = skclf.fit(X, y)

    print('my predict')
    y_pred = clf.predict(X_test)
    myacc = accuracy_score(y_test, y_pred)
    print('({})'.format(myacc))
    #print 'scikit predict'
    #sk_pred = skclf.predict(X_test)
    #print sk_pred
    print(y_test)
    print(y_pred)
    #print 'SciKit Accuracy: {} My Accuracy: {}'.format(accuracy_score(y_test, sk_pred), myacc)
    print('My Accuracy: {}'.format(myacc))
def tests_radius():
    """Compare radius-based ``MyKNN`` with scikit-learn's radius classifier.

    Builds rows from ``testData()`` with the label appended as the last
    column, takes the first of 10 train/test splits and prints both
    accuracies.  The radius is the first entry of ``[1e-1, 1e-2, 1e-3]``
    and the project metric is ``'minkowski'``.
    """
    radius_index = 0
    metric_index = 0
    k = 10

    features, labels = testData()
    #print X
    # Append the label as the last column, then convert to plain lists.
    stacked = np.concatenate([features, labels.reshape((len(labels), 1))], axis=1)
    rows = [list(row.ravel()) for row in stacked]

    # Earlier candidate values were [3, 5, 7].
    radius = [1e-1, 1e-2, 1e-3]  # for radius
    metric = ['minkowski', 'cosine', 'gaussian', 'poly2']
    # Kernel callable is not used below; construction retained.
    ma = speedy.Kernel(ktype=metric[metric_index]).compute
    #ma = hw7u.Kernel(ktype=metric[j]).compute
    chosen_radius = radius[radius_index]
    print('spam radius is {}'.format(chosen_radius))

    clf = hw7u.MyKNN(radius=chosen_radius, metric=metric[metric_index], outlier_label=-1)
    skclf = RadiusNeighborsClassifier(radius=chosen_radius, algorithm='brute',
                                      metric="euclidean", p=2, outlier_label=.5)

    all_folds = hw3u.partition_folds(rows, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    print('start scikit')
    knnsci = hw7u.KNN(classifier=skclf)
    print('start MyKNN')
    knn = hw7u.KNN(classifier=clf)

    print('start sk pred')
    y_sci = knnsci.predict(X_test, X, y)
    print('start my pred')
    y_pred = knn.predict(X_test, X, y)
    print(y_pred)

    sk_accuracy = accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_sci))
    my_accuracy = accuracy_score(hw7.fix_y(y_test), hw7.fix_y(y_pred))
    print('SciKit Accuracy: {} My Accuracy: {}'.format(sk_accuracy, my_accuracy))
def q1():
    """Feature analysis with Adaboost.

    Fits ``adac.AdaboostOptimal`` (100 rounds of depth-1 trees with the
    'best' splitter) on the first training split of the polluted spam data,
    ranks the output of ``get_margin_fractions`` and prints the ranking,
    then prints the accuracy on the matching test split.
    """
    #spamData = hw3u.pandas_to_data(hw3u.load_and_normalize_spambase())
    spam_rows = utils.load_and_normalize_polluted_spam_data()
    k = 10
    all_folds = hw3u.partition_folds(spam_rows, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)

    # We're not actually cross-validating anything -- we just want feature weights
    #X = np.concatenate([X, X_test], axis=0)
    #y = np.concatenate([y, y_test], axis=0)

    def make_stump():
        return DecisionTreeClassifier(max_depth=1, splitter='best')

    #adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=lambda: DecisionTreeClassifier(max_depth=1, splitter='random'))
    #adaboost = adac.AdaboostOptimal(max_rounds=10, do_fast=False, learner=hw4u.TreeOptimal)
    adaboost = adac.AdaboostOptimal(max_rounds=100, do_fast=False, learner=make_stump)
    adaboost.fit(X, y)

    # Only the first training row is passed alongside the model.
    margin_fractions = get_margin_fractions(adaboost, X[0])
    #margin_fractions_v = hw5u.get_margin_fractions_validate(adaboost, X, y)
    print_ranks(rank(margin_fractions))

    pred = adaboost.predict(X_test)
    expected = adaboost._check_y_not_zero(y_test)
    actual = adaboost._check_y_not_zero(pred)
    print('Accuracy: {}'.format(accuracy_score(expected, actual)))
def test_NaiveBayes():
    """Train a type-2 ``NaiveBayes`` on the sample data and return it.

    Prints the sample rows from ``get_nb_data`` and the trained model.
    """
    sample_rows = get_nb_data()
    classifier = nb.NaiveBayes(2)
    print(sample_rows)
    labels, features, _col_mus, _label_mu = hw3.get_data_and_mus(sample_rows)
    classifier.train(features, labels)
    print(classifier.model)
    return classifier
def q3():
    """Active learning with AdaBoost on the Spambase dataset.

    Assignment text: start with a training set of about 5% of the data
    (selected randomly), then iterate M episodes: train Adaboost for T
    rounds; from the datapoints not in the training set, select the 2% that
    are closest to the separation surface (boosting score F(x) closest to
    0) and add them, with labels, to the training set.  Repeat until the
    training set reaches 50% of the data.  How does performance improve as
    the training set grows?  Compare Adaboost on a c% random training set
    with a c% actively-built one for c in 5, 10, 15, 20, 30, 50.

    Implementation: ``hw4.partition_folds_q4`` yields the initial training
    set, the current candidate/test pool and a reserve.  Each episode
    trains on the training set, prints the error on the pool, moves the
    first 2% of ``adaboost.rank(pool)`` into the training set and tops the
    pool up from the reserve.

    Fixes:
      * the loop now stops when 2% of the pool rounds down to zero rows
        (previously it would retrain forever without growing the set);
      * topping up from the reserve no longer raises IndexError when fewer
        rows remain than were moved;
      * membership of moved rows is tested against a set instead of a list.
    """
    spamData = hw3.pandas_to_data(hw3.load_and_normalize_spambase())
    percent = .05
    all_folds = hw4.partition_folds_q4(spamData, percent)
    kf_train = all_folds[0]
    kf_test = all_folds[1]
    left_over = all_folds[2]

    # Floor division keeps the Python 2 integer semantics of len(...)/2.
    target_size = len(spamData) // 2
    while len(kf_train) < target_size:
        y, X = hw4.split_truth_from_data(kf_train)
        y_test, X_test = hw4.split_truth_from_data(kf_test)
        adaboost = run_adaboost(X, y, X_test, y_test, 'q2_crx')
        yt_pred = adaboost.predict(X_test)
        order = adaboost.rank(X_test)

        yt_pred = adaboost._check_y(yt_pred)
        y_test = adaboost._check_y(y_test)
        wrong = sum(1 for yt, yp in zip(yt_pred, y_test) if yt != yp)
        round_err = float(wrong) / len(y_test)
        print('Error {}'.format(round_err))

        shift_number = int(len(order) * .02)  # number of items to switch into training set
        if shift_number == 0:
            # Nothing can be moved, so the training set cannot grow further.
            break

        moved = [order[i] for i in range(shift_number)]
        kf_train.extend(kf_test[index] for index in moved)

        moved_lookup = set(moved)
        new_test = [row for index, row in enumerate(kf_test)
                    if index not in moved_lookup]
        # Slicing tolerates a reserve shorter than the number of moved rows.
        new_test.extend(left_over[:len(moved)])
        left_over = left_over[len(moved):]
        kf_test = new_test
        print('test len {} train len {} leftover len {} shifting {}'.format(
            len(kf_test), len(kf_train), len(left_over), shift_number))
def test_NaiveBayes_predict():
    """Train a type-2 ``NaiveBayes`` on sample data and print predictions.

    Prints, in order: the training rows, the column means returned by
    ``hw3.get_data_and_mus``, the trained model, and the predictions for
    ``get_nb_test_data(5)``.
    """
    classifier = nb.NaiveBayes(2)
    train_rows = get_nb_data()
    test_rows = get_nb_test_data(5)
    print(train_rows)
    labels, features, col_mus, _label_mu = hw3.get_data_and_mus(train_rows)
    classifier.train(features, labels)
    print(col_mus)
    print(classifier.model)
    predictions = classifier.predict(test_rows)
    print(predictions)
def svm_q1(data, classifier=None):
    """Fit a classifier on the first of 10 train/test splits and print accuracy.

    Args:
        data: rows accepted by ``hw3u.partition_folds``.
        classifier: estimator with ``fit``/``predict``.  Defaults to a
            fresh ``svm.SVC()`` per call.

    Fixes:
      * the default used to be a single ``svm.SVC()`` instance created when
        the function was defined and therefore shared (fitted state and
        all) by every call that relied on the default;
      * test predictions were computed twice; they are now reused.
    """
    if classifier is None:
        classifier = svm.SVC()

    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)
    y, X = hw4u.split_truth_from_data(kf_train)
    y_test, X_test = hw4u.split_truth_from_data(kf_test)
    print('length train: {} length test {}'.format(len(X), len(X_test)))

    clf = classifier
    clf.fit(X, y)
    y_pred = clf.predict(X_test)
    train_acc = accuracy_score(fix_y(y), fix_y(clf.predict(X)))
    test_acc = accuracy_score(fix_y(y_test), fix_y(y_pred))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))
def model_average_predict(self, data_row, theta=.5):
    """Predict 0/1 labels with the above-average Bernoulli model.

    For each row calculate the probability that y is 1 and the probability
    that y is 0:

        P(Y|X) = ( P(X|Y) * P(Y) ) / ( P(X) )
        P(X|Y) = prob_over_given_c (probability that x is above the column
                 average when y = c)
        P(Y)   = prob_y1 (probability of y = 1)

    The two joint probabilities are normalized against each other, so P(X)
    is not needed.  A row is labelled 1 when the normalized probability of
    class 1 exceeds ``theta``.

    ``self.model`` is read as ``[prob_over_given_1, prob_over_given_0,
    prob_over, prob_y1]``.  Column averages are recomputed from
    ``data_row`` itself via ``hw3.get_mus``.

    Changes: the unused transpose of ``data_row`` and the unused
    ``prob_over`` lookups were removed, and a row whose joint probabilities
    are both zero no longer raises ZeroDivisionError -- with no evidence
    for either class the prior ``prob_y1`` is compared against ``theta``.

    Args:
        data_row: feature rows.
        theta: decision threshold on the normalized class-1 probability.

    Returns:
        list of 0/1 predictions, one per row.
    """
    mus = hw3.get_mus(data_row)
    prob_over_given_1 = self.model[0]
    prob_over_given_0 = self.model[1]
    prob_y1 = self.model[3]

    predict = []
    for row in data_row:
        prob_1 = 1
        prob_0 = 1
        for c, value in enumerate(row):
            if value > mus[c]:
                prob_1 = prob_1 * prob_over_given_1[c]
                prob_0 = prob_0 * prob_over_given_0[c]
            else:
                prob_1 = prob_1 * (1 - prob_over_given_1[c])
                prob_0 = prob_0 * (1 - prob_over_given_0[c])
        # P(X|Y) * P(Y)
        prob_1 = prob_1 * prob_y1
        prob_0 = prob_0 * (1 - prob_y1)

        evidence = prob_0 + prob_1
        if evidence == 0:
            # Both joint probabilities vanished: fall back on the prior.
            prob_norm = prob_y1
        else:
            prob_norm = float(prob_1) / evidence
        predict.append(1 if prob_norm > theta else 0)
    return predict
def GaussianNB(X, num_features=None):
    """Cross-validate a scikit-learn ``BernoulliNB`` over 10 folds.

    Args:
        X: data rows that still contain the truth value.
        num_features: when given, ``SelectKBest(k=num_features)`` reduces
            the features first and the truth is re-attached with
            ``utils.add_row``.

    For every fold the remaining folds are grouped into a training set;
    training and held-out accuracy are printed per fold and averaged at the
    end.

    NOTE(review): despite the function name the estimator instantiated here
    is ``BernoulliNB`` -- confirm this is intended.
    """
    k = 10
    train_acc_sum = 0
    test_acc_sum = 0

    if num_features is not None:
        y, X = utils.split_truth_from_data(X)
        selector = SelectKBest(k=num_features).fit(X, y)
        X = selector.transform(X)
        X = utils.add_row(X, y)

    k_folds = hw3u.partition_folds(X, k)
    for ki in range(k):
        grouped_fold = hw5u.group_fold(k_folds, ki)
        # Result only fed the commented-out project model; call retained.
        mask_cols = check_cols(grouped_fold)
        #nb_model = nb.NaiveBayes(model_type, alpha=alpha, ignore_cols=mask_cols)
        nb_model = BernoulliNB()
        print('len of kfolds {}'.format(len(grouped_fold)))

        #truth_rows, data_rows, data_mus, y_mu = hw3u.get_data_and_mus(grouped_fold)
        train_truth, train_rows = utils.split_truth_from_data(grouped_fold)
        print('len of data {}'.format(len(train_rows)))
        #nb_model.train(data_rows, truth_rows)
        nb_model.fit(train_rows, train_truth)

        train_accuracy = hw3u.get_accuracy(nb_model.predict(train_rows), train_truth)
        train_acc_sum += train_accuracy
        print_output(ki, train_accuracy)

        held_truth, held_rows = utils.split_truth_from_data(k_folds[ki])
        test_accuracy = hw3u.get_accuracy(nb_model.predict(held_rows), held_truth)
        test_acc_sum += test_accuracy
        print_output(ki, test_accuracy, 'test')

    print_test_output(float(train_acc_sum) / k, float(test_acc_sum) / k)
def model_average_train(self, data_row, truth):
    """Train the above-average Bernoulli model.

    Column means are taken over all rows; for every column the fraction of
    values above that mean is computed over all rows, over the rows with
    label 1 and over the rows with label 0.

    Returns:
        ``[prob_over_given_1, prob_over_given_0, prob_over, prob_y1]``

    Side effects:
        Stores ``prob_y1`` (``sum(truth) / len(truth)``) on ``self.y_prob``.
    """
    mus = hw3.get_mus(data_row)
    negatives = hw3.get_sub_at_value(data_row, truth, 0)
    positives = hw3.get_sub_at_value(data_row, truth, 1)

    prob_over = get_prob_over(data_row, mus)
    prob_over_given_1 = get_prob_over(positives, mus)
    prob_over_given_0 = get_prob_over(negatives, mus)

    len_0 = len(prob_over_given_0)
    len_1 = len(prob_over_given_1)
    if len_0 != len_1:
        # The shorter list is replaced by zeros, one per missing entry.
        zeros = [0] * abs(len_1 - len_0)
        if len_1 > len_0:
            prob_over_given_0 = zeros
        else:
            prob_over_given_1 = zeros

    prob_y1 = float(sum(truth)) / len(truth)
    self.y_prob = prob_y1
    return [prob_over_given_1, prob_over_given_0, prob_over, prob_y1]
def initialize(self, data, k=2):
    """Create the initial EM models and store them on ``self.models``.

    Each of the ``k`` models gets its mean from ``mu_cheat``; rows are then
    assigned labels with ``assign_labels`` and ``prevent_empty`` is run.
    Every model receives the covariance of the whole data set, a weight of
    0.5 and a likelihood from ``self.expectation``.

    Side effects:
        Sets ``self.k``, ``self.labels`` and ``self.models``.
    """
    # start with k = 2 and std_dev = 1
    self.k = k
    self.labels = list(range(self.k))
    candidates = [EMModel() for _ in range(self.k)]

    seeded_mus = mu_cheat(hw3.transpose_array(data), k)
    for index, candidate in enumerate(candidates):
        #models[ki].random_mus(data)
        candidate.mu = seeded_mus[index]

    self.labels = self.assign_labels(data, candidates)
    #self.labels = self.assign_labels2(data, model)
    self.prevent_empty(data)

    for index, candidate in enumerate(candidates):
        # Only needed by the commented-out alternatives below; call retained.
        sub_data = hw3.get_sub_at_value(data, self.labels, index)
        #models[ki].sigma = hw3.get_covar(sub_data)
        candidate.sigma = hw3.get_covar(data)
        #models[ki].weight = float(len(sub_data)) / len(data)
        candidate.weight = .5
        candidate.likelihood = self.expectation(data, candidate)  # multivarate_normal

    self.models = candidates
def model_gaussian_rand_var_predict(self, data, theta=.5):
    """Predict with the Gaussian model.

    ``self.model`` is read as ``[mus_by_label, std_dev_by_label, prob_y]``.
    For each label the per-row probabilities come from
    ``hw3.univariate_normal``; a label whose standard-deviation list is
    empty gets probability 0 for every row.  The result is handed to
    ``self.nb_predict`` together with ``theta``.
    """
    mus = self.model[0]
    std_devs = self.model[1]
    y_prob = self.model[2]

    probabilities = {}
    for label in (0, 1):
        if len(std_devs[label]) == 0:
            #print self.model
            #print 'Standard Deviations is empty!!!'
            probabilities[label] = [0] * len(data)
            continue
        if label == 1:
            prior = y_prob
        else:
            prior = 1 - y_prob
        probabilities[label] = hw3.univariate_normal(
            data, std_devs[label], mus[label], prior, .15,
            ignore_cols=self.ignore_cols)
    return self.nb_predict(probabilities, theta)
def get_prob_over(data_by_row, mus):
    """Return, per column, the fraction of rows whose value exceeds ``mus[col]``.

    Args:
        data_by_row: list of rows.
        mus: per-column thresholds, indexed like the columns.

    Returns:
        A flat list with one float per column (the previous docstring
        described an array of arrays, which the code never produced).

    Change: the per-column ``utils.variance`` call was dropped; its result
    was never used.
    """
    size = len(data_by_row)
    by_col = hw3.transpose_array(data_by_row)
    probability_above_mu = []
    for col, column in enumerate(by_col):
        threshold = mus[col]
        total_over = sum(1 for value in column if value > threshold)
        probability_above_mu.append(float(total_over) / size)
    return probability_above_mu
def multiclassSVC(classifier, sz=2000):
    """Fit ``classifier`` one-vs-one on MNIST features and print accuracy.

    Args:
        classifier: binary estimator wrapped in ``OneVsOneClassifier``.
        sz: size argument forwarded to ``hw6u.load_mnist_features``.

    Uses the first of 10 train/test splits.  The accuracy line is printed
    twice, the second time from freshly computed training predictions.
    """
    frame = hw6u.load_mnist_features(sz)
    data = utils.pandas_to_data(frame)
    k = 10
    all_folds = hw3u.partition_folds(data, k)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)
    print('Beginning analysis: {}'.format(X.shape))

    #clf = OneVsRestClassifier(classifier, n_jobs=4).fit(X, y)
    clf = OneVsOneClassifier(classifier).fit(X, y)
    #clf = OutputCodeClassifier(LinearSVC(random_state=0), code_size=10, random_state=0).fit(np.asarray(X), y)

    y_pred = clf.predict(X)
    train_acc = accuracy_score(fix_y(y_pred), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))

    train_acc = accuracy_score(fix_y(clf.predict(X)), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))
    print('train acc: {} test acc: {}'.format(train_acc, test_acc))
def run(self, data, weights):
    """Run one round: train per fold, test on the last fold, update weights.

    Every fold except the last is fitted and scored on itself; its error
    and summed weighted error are appended to ``self.training_errors`` and
    ``self.training_errors_weighted``.  The model from the final training
    fold is then applied to the last fold, filling ``self.err_matrix``,
    ``self.testing_error`` and ``self.testing_errors_weighted``, before the
    weight distribution and alpha are updated.

    NOTE(review): the training loop calls a bare ``split_truth_from_data``
    while the test step uses ``utils.split_truth_from_data`` -- confirm
    both resolve to the same helper.
    """
    k_folds = hw3.partition_folds(data, self.number_k_folds)

    for fold_index in range(self.number_k_folds - 1):
        truth, f_data = split_truth_from_data(k_folds[fold_index])
        model = self.fit(f_data)
        predicted = self.predict(model, f_data)  # {-1, 1}
        fold_errors = self.compute_error_matrix(truth, predicted)
        self.training_errors.append(self.get_error(fold_errors))
        weighted = self.weight_errors(fold_errors, weights)
        self.training_errors_weighted.append(sum(weighted))

    held_out = k_folds[self.number_k_folds - 1]
    truth, f_data = utils.split_truth_from_data(held_out)
    # `model` is the one fitted on the last training fold above.
    predicted = self.predict(model, f_data)

    # Error matrix for round computed from test data
    self.err_matrix = self.compute_error_matrix(truth, predicted)
    self.testing_error = self.get_error(self.err_matrix)
    self.testing_errors_weighted = self.weight_errors(self.err_matrix, weights)
    self.set_weight_distribution_and_total()  # Dt(x) and epsilon
    self.set_alpha()
def model_bin_predict(self, data_row, alpha=2.00001, theta=.5):
    """Predict with the binned model.

    For every row and label the product over columns of
    ``self.model[label][column][bin] + alpha / len(data_row)`` is formed,
    where the bin comes from ``hw3.classify_x`` with that column's cutoffs,
    and then multiplied by the label prior (``self.y_prob`` for label 1,
    its complement for label 0).

    The per-label lists of these row scores are passed to
    ``self.nb_predict`` with ``theta``:
        probability[0] = one score per row for y == 0
        probability[1] = one score per row for y == 1
    """
    row_count = len(data_row)
    probability = [[], []]
    for row in data_row:
        # hold probability per row
        score = [1, 1]
        for c, value in enumerate(row):
            xbin = hw3.classify_x(value, self.cutoffs[c])
            # model[0] = [col1: prob_bin1, prob_bin2 ...], [col2:...]
            for label in (0, 1):
                score[label] = score[label] * (self.model[label][c][xbin] + float(alpha) / row_count)
        for label in (0, 1):
            if label == 1:
                prior = self.y_prob
            else:
                prior = 1 - self.y_prob
            probability[label].append(score[label] * prior)
    return self.nb_predict(probability, theta=theta)