def pick_best_combo(k_arr, d_arr, s_arr):
    """Grid-search (k, depth, split) combinations, ranked by test error."""
    hyperparams = []
    for k in k_arr:
        # Call David's getTopicDistributions to get the four feature matrices
        # for this k (True -> training set, False -> testing set); these only
        # depend on k, so load them once per k rather than once per combo.
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        [test_pass, test_fail] = format.getTopicDistributions(k, False)
        for depth in d_arr:
            for split in s_arr:
                # Can't consider more features per split than there are topics.
                if split > k:
                    continue
                testForest = RandomForest(100, split, depth)
                testForest.buildForest(train_pass, train_fail)
                trainerror = testForest.classError(train_pass, train_fail)
                testerror = testForest.classError(test_pass, test_fail)
                print "trainerror", trainerror
                print "testerror", testerror
                hyperparams.append(HyperParams(k, depth, split, testerror))
    hyperparams.sort()  # lowest-error combination first
    return hyperparams
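# --- Hypothetical sketch, not part of the original source: HyperParams is
# constructed above as HyperParams(k, depth, split, error) (the CV variant
# further below also passes a standard deviation) and the list is sorted, so
# a minimal compatible container might look like this:
class HyperParams(object):
    """One grid-search result, ordered by classification error."""

    def __init__(self, k, depth, split, error, std=None):
        self.k = k          # number of LDA topics
        self.depth = depth  # maximum tree depth
        self.split = split  # features considered per split
        self.error = error  # classification error for this combination
        self.std = std      # error std-dev (only set by the CV variant)

    def __lt__(self, other):  # lets hyperparams.sort() rank best-first
        return self.error < other.error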
def RF_error(k, hyperparams):
    """Test error of our random forest for 10, 20, ..., 100 trees.

    hyperparams is the (split, depth) pair to use for every forest.
    """
    errors = numpy.zeros(10)
    # Alternative: partition the train_features/ directory bills into training
    # and testing sets instead of using the actual testing set in the
    # test_features/ directory:
    #   [passed, failed] = format.getTopicDistributions(k, True)
    #   [train_pass, train_fail, test_pass, test_fail] = partition(passed, failed)
    [train_pass, train_fail] = format.getTopicDistributions(k, True)
    [test_pass, test_fail] = format.getTopicDistributions(k, False)
    for num in range(1, 11):
        forest = RandomForest(10 * num, hyperparams[0], hyperparams[1])
        forest.buildForest(train_pass, train_fail)
        errors[num - 1] = forest.classError(test_pass, test_fail)
        print 'NumTrees: ' + str(10 * num) + '; Our Random Forest Error: ' + str(errors[num - 1])
    return errors
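# --- Hypothetical usage sketch (the grid values and entry point are assumed,
# not taken from the original source). Note that RF_error expects a plain
# (split, depth) pair rather than a HyperParams object:
if __name__ == '__main__':
    ranked = pick_best_combo([5, 8, 10, 15, 20], [2, 4, 8], [2, 3, 5])
    best = ranked[0]  # lowest test error after the sort
    tree_count_errors = RF_error(best.k, (best.split, best.depth))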
def sci_kit_error(k_arr, hyperparams):
    """Compare scikit-learn baselines against our tuned random forest for each k.

    hyperparams[i] is the tuned (split, depth) pair for k_arr[i].
    """
    errors = numpy.zeros(len(k_arr))  # error values using scikit-learn's decision tree
    errors_finetuned = numpy.zeros(len(k_arr))  # error values using our own random forest, with optimal hyperparameters
    std = numpy.zeros(len(k_arr))
    std_finetuned = numpy.zeros(len(k_arr))
    errors_LR = numpy.zeros(len(k_arr))
    std_LR = numpy.zeros(len(k_arr))
    index = 0
    for k in k_arr:
        # (Could instead partition the train_features/ bills with partition(),
        # as in RF_error above, rather than using the real testing set.)
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        [test_pass, test_fail] = format.getTopicDistributions(k, False)
        N = 2  # number of repeated runs per k
        error_vals = numpy.zeros(shape=(1, N))
        error_vals_finetuned = numpy.zeros(shape=(1, N))
        error_vals_LR = numpy.zeros(shape=(1, N))
        num_p = len(train_pass)
        num_f = len(train_fail)
        train_x = numpy.concatenate((train_pass, train_fail), axis=0)
        train_y = numpy.concatenate((numpy.ones(shape=(num_p, 1)),
                                     numpy.zeros(shape=(num_f, 1))), axis=0)
        trueVals = numpy.concatenate((numpy.ones(shape=(1, len(test_pass))),
                                      numpy.zeros(shape=(1, len(test_fail)))), axis=1)
        for j in range(0, N):
            # Run scikit-learn's decision tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(train_x, train_y.ravel())  # 1-d labels, as sklearn expects
            predictions = clf.predict(numpy.concatenate((test_pass, test_fail), axis=0))
            error_vals[0, j] = sum(abs(predictions - trueVals)[0]) / len(predictions)
        errors[index] = numpy.mean(error_vals[0])
        std[index] = numpy.std(error_vals[0])
        print "SKLearn's Decision Tree error: " + str(errors[index]) + \
            '; Standard deviation of each run: ' + str(std[index])
        for j in range(0, N):
            # Run scikit-learn's logistic regression
            LR_Model = LogisticRegression()
            LR_Model = LR_Model.fit(train_x, train_y.ravel())
            predictions = LR_Model.predict(numpy.concatenate((test_pass, test_fail), axis=0))
            error_vals_LR[0, j] = sum(abs(predictions - trueVals)[0]) / len(predictions)
        errors_LR[index] = numpy.mean(error_vals_LR[0])
        std_LR[index] = numpy.std(error_vals_LR[0])
        print 'Logistic Regression error: ' + str(errors_LR[index]) + \
            '; Standard deviation of each run: ' + str(std_LR[index])
        for j in range(0, N):
            # Our forest, with the tuned (split, depth) pair for this k.
            forest = RandomForest(100, hyperparams[index][0], hyperparams[index][1])
            forest.buildForest(train_pass, train_fail)
            ooberror = forest.OOBestimate(num_p, num_f)
            error_vals_finetuned[0, j] = forest.classError(test_pass, test_fail)
        errors_finetuned[index] = numpy.mean(error_vals_finetuned[0])
        std_finetuned[index] = numpy.std(error_vals_finetuned[0])
        print 'Our Random Forest error: ' + str(errors_finetuned[index]) + \
            '; Standard deviation of each run: ' + str(std_finetuned[index])
        print 'Our OOB error: ' + str(ooberror)  # from the last of the N runs
        index = index + 1
    return [errors, std, errors_LR, std_LR, errors_finetuned, std_finetuned]
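# --- Hypothetical sketch of the partition() helper referenced in the
# commented-out alternatives above; the assumed behavior is holding out a
# fraction of the training bills as a test set, per class:
def partition(passed, failed, test_frac=0.2):
    """Shuffle each class and split it row-wise into (train, test)."""
    passed = numpy.random.permutation(passed)
    failed = numpy.random.permutation(failed)
    cutP = int(len(passed) * (1 - test_frac))
    cutF = int(len(failed) * (1 - test_frac))
    return [passed[:cutP], failed[:cutF], passed[cutP:], failed[cutF:]]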
def pick_best_combo(k_arr, d_arr, s_arr):
    """Cross-validation variant: grid-search (k, depth, split) with N-fold CV
    on the training set, returning an unsorted list of HyperParams."""
    hyperparams = []
    for k in k_arr:
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        for depth in d_arr:
            for split in s_arr:
                if split > k:
                    continue
                # Shuffle so each fold holds out a random subset of bills.
                train_pass = numpy.random.permutation(train_pass)
                train_fail = numpy.random.permutation(train_fail)
                N = 2  # number of folds
                mP = len(train_pass)
                mF = len(train_fail)
                error_vals = numpy.zeros(shape=(1, N))
                for j in range(0, N):
                    # Fold j is held out for validation; train on the rest.
                    # (Slice indices must be ints; Python 2 integer division
                    # already floors, so int() preserves the original bounds.)
                    train_pass_cv = numpy.concatenate(
                        (train_pass[0:int(mP / N * j), :],
                         train_pass[int(mP / N * (j + 1)):mP, :]), axis=0)
                    train_fail_cv = numpy.concatenate(
                        (train_fail[0:int(mF / N * j), :],
                         train_fail[int(mF / N * (j + 1)):mF, :]), axis=0)
                    valid_pass_cv = train_pass[int(mP / N * j):int(mP / N * (j + 1)), :]
                    valid_fail_cv = train_fail[int(mF / N * j):int(mF / N * (j + 1)), :]
                    forest = RandomForest(100, split, depth)
                    forest.buildForest(train_pass_cv, train_fail_cv)
                    error_vals[0, j] = forest.classError(valid_pass_cv, valid_fail_cv)
                    # scikit-learn decision tree on the same folds, used as a
                    # comparison for debugging:
                    """num_p = len(train_pass_cv)
                    num_f = len(train_fail_cv)
                    train_x = numpy.concatenate((train_pass_cv, train_fail_cv), axis=0)
                    train_y = numpy.concatenate((numpy.ones(shape=(num_p, 1)),
                                                 numpy.zeros(shape=(num_f, 1))), axis=0)
                    clf = tree.DecisionTreeClassifier()
                    clf = clf.fit(train_x, train_y)
                    predictions = clf.predict(numpy.concatenate((valid_pass_cv, valid_fail_cv), axis=0))
                    trueVals = numpy.concatenate((numpy.ones(shape=(1, len(valid_pass_cv))),
                                                  numpy.zeros(shape=(1, len(valid_fail_cv)))), axis=1)
                    print 'Error HERE: ' + str(sum(abs(predictions - trueVals)[0]) / len(predictions))"""
                    print '(k, depth, split): ' + str(k) + ',' + str(depth) + \
                        ',' + str(split) + '; iteration: ' + str(j) + \
                        ', error:' + str(error_vals[0, j])
                hyperparams.append(HyperParams(k, depth, split,
                                               numpy.mean(error_vals),
                                               numpy.std(error_vals)))
    return hyperparams
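# --- Hypothetical usage sketch, not in the original source: unlike the earlier
# variant, this CV version returns an unsorted list, so a caller would pick the
# winner by mean CV error:
def best_by_cv_error(hyperparams_list):
    """Return the HyperParams entry with the lowest mean CV error."""
    return min(hyperparams_list, key=lambda hp: hp.error)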
""" pca_dimred.py Performs PCA dimensionality reduction on feature vectors, and writes compressed versions of feature vectors to ./k{5,8,10,15,20}_pca_passed.txt for each k """ import numpy as np from sklearn.decomposition import PCA import format_lda_to_python as format for k in [5, 8, 10, 15, 20]: [passed, failed] = format.getTopicDistributions(k, True) pca = PCA(n_components=3) total = np.concatenate((passed, failed), axis=0) pca.fit(total) f = open('./k' + str(k) + '_pca_passed.txt', 'w') pass_dim_red = pca.fit_transform(passed, y=None) for item in pass_dim_red: f.write(','.join(map(str, item)) + '\n') f.close() f = open('./k' + str(k) + '_pca_failed.txt', 'w') fail_dim_red = pca.fit_transform(failed, y=None) for item in fail_dim_red: f.write(','.join(map(str, item)) + '\n') f.close()
""" pca_dimred.py Performs PCA dimensionality reduction on feature vectors, and writes compressed versions of feature vectors to ./k{5,8,10,15,20}_pca_passed.txt for each k """ import numpy as np from sklearn.decomposition import PCA import format_lda_to_python as format for k in [5,8,10,15,20]: [passed, failed] = format.getTopicDistributions(k, True) pca = PCA(n_components=3) total = np.concatenate((passed,failed),axis=0) pca.fit(total) f = open('./k'+str(k)+'_pca_passed.txt','w') pass_dim_red = pca.fit_transform(passed, y=None) for item in pass_dim_red: f.write(','.join(map(str, item)) + '\n') f.close() f = open('./k'+str(k)+'_pca_failed.txt','w') fail_dim_red = pca.fit_transform(failed, y=None) for item in fail_dim_red: f.write(','.join(map(str, item)) + '\n') f.close()