Example #1
def pick_best_combo(k_arr,d_arr,s_arr):
	"""Grid-search over topic count k, tree depth, and split count; return HyperParams sorted by test error."""
	hyperparams = []
	for k in k_arr:
		for depth in d_arr:
			for split in s_arr:
				if split > k:
					continue
				# note: both fetches depend only on k, so they could be hoisted out of the depth/split loops
				[train_pass, train_fail] = format.getTopicDistributions(k, True)
				[test_pass, test_fail] = format.getTopicDistributions(k, False) # call David's function to get the four matrices
				
				testForest = RandomForest(100, split, depth)
				testForest.buildForest(train_pass, train_fail)
				trainerror = testForest.classError(train_pass, train_fail)
				testerror = testForest.classError(test_pass, test_fail)

				print "trainerror", trainerror
				print "testerror", testerror
				#print str(error) #number between 0 and 1
				hyperparams.append(HyperParams(k, depth, split, testerror))
	hyperparams.sort()
	return hyperparams
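
Since pick_best_combo finishes with hyperparams.sort(), the (unshown) HyperParams class must be orderable. A minimal sketch of what it presumably looks like, with test error as the sort key; the field names are assumptions, not the project's actual definition:

class HyperParams(object):
    """Stand-in for the project's HyperParams record; field names are assumptions."""
    def __init__(self, k, depth, split, error):
        self.k, self.depth, self.split, self.error = k, depth, split, error
    def __lt__(self, other):
        # lets hyperparams.sort() rank combos from lowest to highest test error
        return self.error < other.error
    def __repr__(self):
        return 'HyperParams(k=%s, depth=%s, split=%s, error=%.4f)' % (self.k, self.depth, self.split, self.error)
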
Example #2
def RF_error(k,hyperparams):
	"""Sweep the forest size from 10 to 100 trees at a fixed (split, depth); return the test error for each size."""
	errors = numpy.zeros(10)
	
	"""
	#Partition train_features/ directory bills into training and testing set instead of using the actual testing set in test_features/ directory
	[passed, failed] = format.getTopicDistributions(k, True)
	[train_pass, train_fail, test_pass, test_fail] = partition(passed, failed)
	"""

	[train_pass, train_fail] = format.getTopicDistributions(k, True)
	[test_pass, test_fail] = format.getTopicDistributions(k, False)

	for num in range(1,11):
		
		forest = RandomForest(10*num, hyperparams[0], hyperparams[1]) # hyperparams = (split, depth)
		forest.buildForest(train_pass, train_fail)
		errors[num-1] = forest.classError(test_pass, test_fail)
		
		print 'NumTrees: ' + str(10*num) + '; Our Random Forest Error: ' + str(errors[num-1])
	return errors
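
A hypothetical way to use the returned array, e.g. plotting test error against forest size; the k and (split, depth) values below are illustrative, not taken from the source:

import matplotlib.pyplot as plt

errors = RF_error(10, (4, 8))    # k=10 topics; hyperparams = (split, depth), values illustrative
num_trees = range(10, 101, 10)   # matches the 10, 20, ..., 100 sweep inside RF_error
plt.plot(num_trees, errors, marker='o')
plt.xlabel('Number of trees')
plt.ylabel('Test classification error')
plt.savefig('rf_error_vs_num_trees.png')
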
Example #3
def pick_best_combo(k_arr, d_arr, s_arr):
    """Grid-search over topic count k, tree depth, and split count; return HyperParams sorted by test error."""
    hyperparams = []
    for k in k_arr:
        for depth in d_arr:
            for split in s_arr:
                if split > k:
                    continue
                # note: both fetches depend only on k, so they could be hoisted out of the depth/split loops
                [train_pass, train_fail] = format.getTopicDistributions(k, True)
                [test_pass, test_fail] = format.getTopicDistributions(k, False)  # call David's function to get the four matrices

                testForest = RandomForest(100, split, depth)
                testForest.buildForest(train_pass, train_fail)
                trainerror = testForest.classError(train_pass, train_fail)
                testerror = testForest.classError(test_pass, test_fail)

                print "trainerror", trainerror
                print "testerror", testerror
                #print str(error) #number between 0 and 1
                hyperparams.append(HyperParams(k, depth, split, testerror))
    hyperparams.sort()
    return hyperparams
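
For comparison only (this is not the authors' code): a similar sweep using sklearn's GridSearchCV over its built-in random forest, where split roughly maps to max_features (the split > k guard suggests split counts candidate features per node) and depth to max_depth. Toy data stands in for the topic matrices:

import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

train_x = numpy.random.rand(100, 8)                   # stand-in for the topic distributions
train_y = (numpy.random.rand(100) > 0.5).astype(int)  # stand-in pass/fail labels
param_grid = {'max_depth': [5, 10, 20], 'max_features': [2, 4, 8]}
search = GridSearchCV(RandomForestClassifier(n_estimators=100), param_grid, cv=2)
search.fit(train_x, train_y)
print(search.best_params_)
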
Example #4
def sci_kit_error(k_arr, hyperparams):
    """Compare a sklearn decision tree, sklearn logistic regression, and our random forest across topic counts."""
    errors = numpy.zeros(len(k_arr))  # error values using sklearn's decision-tree baseline
    errors_finetuned = numpy.zeros(len(k_arr))  # errors from our own random forest with tuned hyperparameters
    std = numpy.zeros(len(k_arr))
    std_finetuned = numpy.zeros(len(k_arr))
    errors_LR = numpy.zeros(len(k_arr))
    std_LR = numpy.zeros(len(k_arr))

    index = 0
    for k in k_arr:
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        [test_pass, test_fail] = format.getTopicDistributions(k, False)

        # [passed, failed] = format.getTopicDistributions(k, True)
        # [train_pass, train_fail, test_pass, test_fail] = partition(passed, failed)

        # [test_pass, test_fail] = format.getTopicDistributions(k, False) #call David's function to get the four matrices
        N = 2  # repetitions per model, to get a mean and std across runs
        error_vals = numpy.zeros(shape=(1, N))
        error_vals_finetuned = numpy.zeros(shape=(1, N))
        error_vals_LR = numpy.zeros(shape=(1, N))

        num_p = len(train_pass)
        num_f = len(train_fail)

        train_x = numpy.concatenate((train_pass, train_fail), axis=0)
        train_y = numpy.concatenate((numpy.ones(shape=(num_p, 1)), numpy.zeros(shape=(num_f, 1))), axis=0)
        trueVals = numpy.concatenate(
            (numpy.ones(shape=(1, len(test_pass))), numpy.zeros(shape=(1, len(test_fail)))), axis=1
        )

        for j in range(0, N):
            # Run sci-kit learn's decision tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(train_x, train_y.ravel())  # ravel: sklearn expects a 1-D label array

            predictions = clf.predict(numpy.concatenate((test_pass, test_fail), axis=0))

            error_vals[0, j] = sum(abs(predictions - trueVals)[0]) / len(predictions)

        errors[index] = numpy.mean(error_vals[0])
        std[index] = numpy.std(error_vals[0])
        print "SKLearn's Random Forest error: " + str(errors[index]) + "; Standard deviation of each run: " + str(
            std[index]
        )

        for j in range(0, N):
            # Run sci-kit learn's logistic regression
            LR_Model = LogisticRegression()
            LR_Model = LR_Model.fit(train_x, train_y.ravel())  # ravel: sklearn expects a 1-D label array

            predictions = LR_Model.predict(numpy.concatenate((test_pass, test_fail), axis=0))
            error_vals_LR[0, j] = sum(abs(predictions - trueVals)[0]) / len(predictions)

        errors_LR[index] = numpy.mean(error_vals_LR[0])
        std_LR[index] = numpy.std(error_vals_LR[0])
        print "Logistic Regression error: " + str(errors_LR[index]) + "; Standard deviation of each run: " + str(
            std_LR[index]
        )

        #########
        for j in range(0, N):
            forest = RandomForest(100, hyperparams[index][0], hyperparams[index][1])  # (split, depth) tuned for this k
            forest.buildForest(train_pass, train_fail)
            ooberror = forest.OOBestimate(num_p, num_f)
            error_vals_finetuned[0, j] = forest.classError(test_pass, test_fail)
        errors_finetuned[index] = numpy.mean(error_vals_finetuned[0])
        std_finetuned[index] = numpy.std(error_vals_finetuned[0])
        print "Our Random Forest error: " + str(errors_finetuned[index]) + "; Standard deviation of each run: " + str(
            std_finetuned[index]
        )
        print "Our OOB error: " + str(ooberror)
        #########
        index = index + 1
    return [errors, std, errors_LR, std_LR, errors_finetuned, std_finetuned]
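
A small self-contained check that the hand-rolled error above, sum(abs(predictions - trueVals)[0]) / len(predictions), is just the misclassification rate, i.e. one minus sklearn's accuracy_score:

import numpy
from sklearn.metrics import accuracy_score

predictions = numpy.array([1, 0, 1, 1])
trueVals = numpy.array([[1, 1, 1, 0]])  # same (1, m) shape as in sci_kit_error
manual = sum(abs(predictions - trueVals)[0]) / float(len(predictions))
print(manual == 1.0 - accuracy_score(trueVals[0], predictions))  # True
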
Example #5
def sci_kit_error(k_arr, hyperparams):
    """Compare a sklearn decision tree, sklearn logistic regression, and our random forest across topic counts."""
    errors = numpy.zeros(len(k_arr))  #error values using sklearn's decision-tree baseline
    errors_finetuned = numpy.zeros(len(k_arr))  #errors from our own random forest with tuned hyperparameters
    std = numpy.zeros(len(k_arr))
    std_finetuned = numpy.zeros(len(k_arr))
    errors_LR = numpy.zeros(len(k_arr))
    std_LR = numpy.zeros(len(k_arr))

    index = 0
    for k in k_arr:
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        [test_pass, test_fail] = format.getTopicDistributions(k, False)

        #[passed, failed] = format.getTopicDistributions(k, True)
        #[train_pass, train_fail, test_pass, test_fail] = partition(passed, failed)

        #[test_pass, test_fail] = format.getTopicDistributions(k, False) #call David's function to get the four matrices
        N = 2  #repetitions per model, to get a mean and std across runs
        error_vals = numpy.zeros(shape=(1, N))
        error_vals_finetuned = numpy.zeros(shape=(1, N))
        error_vals_LR = numpy.zeros(shape=(1, N))

        num_p = len(train_pass)
        num_f = len(train_fail)

        train_x = numpy.concatenate((train_pass, train_fail), axis=0)
        train_y = numpy.concatenate(
            (numpy.ones(shape=(num_p, 1)), numpy.zeros(shape=(num_f, 1))), axis=0)
        trueVals = numpy.concatenate(
            (numpy.ones(shape=(1, len(test_pass))), numpy.zeros(shape=(1, len(test_fail)))), axis=1)

        for j in range(0, N):
            #Run sci-kit learn's decision tree
            clf = tree.DecisionTreeClassifier()
            clf = clf.fit(train_x, train_y.ravel())  #ravel: sklearn expects a 1-D label array

            predictions = clf.predict(
                numpy.concatenate((test_pass, test_fail), axis=0))

            error_vals[0, j] = sum(
                abs(predictions - trueVals)[0]) / len(predictions)

        errors[index] = numpy.mean(error_vals[0])
        std[index] = numpy.std(error_vals[0])
        print 'SKLearn\'s Decision Tree error: ' + str(errors[index]) + '; Standard deviation of each run: ' + str(std[index])

        for j in range(0, N):
            #Run sci-kit learn's logistic regression
            LR_Model = LogisticRegression()
            LR_Model = LR_Model.fit(train_x, train_y.ravel())  #ravel: sklearn expects a 1-D label array

            predictions = LR_Model.predict(
                numpy.concatenate((test_pass, test_fail), axis=0))
            error_vals_LR[0, j] = sum(
                abs(predictions - trueVals)[0]) / len(predictions)

        errors_LR[index] = numpy.mean(error_vals_LR[0])
        std_LR[index] = numpy.std(error_vals_LR[0])
        print 'Logistic Regression error: ' + str(errors_LR[index]) + '; Standard deviation of each run: ' + str(std_LR[index])

        #########
        for j in range(0, N):
            forest = RandomForest(100, hyperparams[index][0], hyperparams[index][1])  #(split, depth) tuned for this k
            forest.buildForest(train_pass, train_fail)
            ooberror = forest.OOBestimate(num_p, num_f)
            error_vals_finetuned[0, j] = forest.classError(test_pass, test_fail)
        errors_finetuned[index] = numpy.mean(error_vals_finetuned[0])
        std_finetuned[index] = numpy.std(error_vals_finetuned[0])
        print 'Our Random Forest error: ' + str(errors_finetuned[index]) + '; Standard deviation of each run: ' + str(std_finetuned[index])
        print 'Our OOB error: ' + str(ooberror)
        #########
        index = index + 1
    return [errors, std, errors_LR, std_LR, errors_finetuned, std_finetuned]
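
The sklearn baseline above is a single decision tree. A genuine sklearn random-forest baseline would look roughly like the sketch below (toy data, illustrative parameters), including sklearn's own out-of-bag score for comparison with forest.OOBestimate:

import numpy
from sklearn.ensemble import RandomForestClassifier

train_x = numpy.random.rand(200, 10)                  # stand-in for the topic distributions
train_y = (numpy.random.rand(200) > 0.5).astype(int)  # stand-in pass/fail labels
clf = RandomForestClassifier(n_estimators=100, oob_score=True)
clf.fit(train_x, train_y)
print('OOB error: ' + str(1.0 - clf.oob_score_))
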
Example #6
def pick_best_combo(k_arr, d_arr, s_arr):
    """Grid-search (k, depth, split) with N-fold cross-validation on the training set; return HyperParams with the mean and std of the fold errors."""
    hyperparams = []
    for k in k_arr:
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        for depth in d_arr:
            for split in s_arr:
                if split > k:
                    continue

                train_pass = numpy.random.permutation(train_pass)
                train_fail = numpy.random.permutation(train_fail)

                N = 2  # number of cross-validation folds
                mP = len(train_pass)
                mF = len(train_fail)

                error_vals = numpy.zeros(shape=(1, N))

                for j in range(0, N):

                    # fold j: rows [lo, hi) validate, the rest train; int() guards against the
                    # float indices math.floor returns under Python 2, and float() keeps the
                    # arithmetic consistent between Python 2 and 3
                    loP = int(math.floor(float(mP) / N * j))
                    hiP = int(math.floor(float(mP) / N * (j + 1)))
                    loF = int(math.floor(float(mF) / N * j))
                    hiF = int(math.floor(float(mF) / N * (j + 1)))

                    train_pass_cv = numpy.concatenate(
                        (train_pass[0:loP, :], train_pass[hiP:mP, :]), axis=0)

                    train_fail_cv = numpy.concatenate(
                        (train_fail[0:loF, :], train_fail[hiF:mF, :]), axis=0)

                    valid_pass_cv = train_pass[loP:hiP, :]

                    valid_fail_cv = train_fail[loF:hiF, :]

                    forest = RandomForest(100, split, depth)
                    forest.buildForest(train_pass_cv, train_fail_cv)
                    error_vals[0, j] = forest.classError(valid_pass_cv, valid_fail_cv)

                    ####sci kit learn, used as comparison for debugging
                    """num_p = len(train_pass_cv)
					num_f = len(train_fail_cv)

					train_x = numpy.concatenate((train_pass_cv,train_fail_cv),axis=0)
					train_y = numpy.concatenate((numpy.ones(shape=(num_p,1)), numpy.zeros(shape=(num_f,1)) ), axis=0)

					clf = tree.DecisionTreeClassifier()
					clf = clf.fit(train_x,train_y)

					predictions = clf.predict(numpy.concatenate((valid_pass_cv,valid_fail_cv),axis=0))
					trueVals = numpy.concatenate((\
					numpy.ones(shape=(1,len(valid_pass_cv))),\
					numpy.zeros(shape=(1,len(valid_fail_cv)))),axis=1)
				
					print 'Error HERE: ' + str(sum(abs(predictions-trueVals)[0])/len(predictions))"""
                    ####

                    print "(k, depth, split): " + str(k) + "," + str(depth) + "," + str(split) + "; iteration: " + str(
                        j
                    ) + ", error:" + str(error_vals[0, j])

                hyperparams.append(HyperParams(k, depth, split, numpy.mean(error_vals), numpy.std(error_vals)))
    return hyperparams
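
The fold construction above does the index arithmetic by hand; for comparison (not the source's code), equivalent contiguous folds via sklearn's KFold on toy data:

import numpy
from sklearn.model_selection import KFold

train_pass = numpy.random.rand(50, 10)  # stand-in for one topic-distribution matrix
kf = KFold(n_splits=2)
for train_idx, valid_idx in kf.split(train_pass):
    train_pass_cv = train_pass[train_idx]
    valid_pass_cv = train_pass[valid_idx]
    print('train %d rows, validate %d rows' % (len(train_pass_cv), len(valid_pass_cv)))
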
Example #7
"""
pca_dimred.py

Performs PCA dimensionality reduction on the feature vectors and writes the compressed vectors
to ./k{5,8,10,15,20}_pca_passed.txt and ./k{5,8,10,15,20}_pca_failed.txt for each k
"""

import numpy as np
from sklearn.decomposition import PCA
import format_lda_to_python as format

for k in [5, 8, 10, 15, 20]:
    [passed, failed] = format.getTopicDistributions(k, True)
    pca = PCA(n_components=3)
    total = np.concatenate((passed, failed), axis=0)
    pca.fit(total)
    f = open('./k' + str(k) + '_pca_passed.txt', 'w')
    pass_dim_red = pca.transform(passed)  # project with the PCA basis fit on the combined data
    for item in pass_dim_red:
        f.write(','.join(map(str, item)) + '\n')
    f.close()
    f = open('./k' + str(k) + '_pca_failed.txt', 'w')
    fail_dim_red = pca.transform(failed)  # same basis as the passed bills
    for item in fail_dim_red:
        f.write(','.join(map(str, item)) + '\n')
    f.close()
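
n_components=3 is hard-coded here; whether three components capture enough of the variance can be checked with PCA's explained_variance_ratio_ attribute, e.g. on stand-in data:

import numpy as np
from sklearn.decomposition import PCA

total = np.random.rand(100, 10)  # stand-in for the concatenated passed+failed matrix
pca = PCA(n_components=3).fit(total)
print('variance explained by 3 components: ' + str(pca.explained_variance_ratio_.sum()))
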
Example #8
"""
pca_dimred.py

Performs PCA dimensionality reduction on the feature vectors and writes the compressed vectors
to ./k{5,8,10,15,20}_pca_passed.txt and ./k{5,8,10,15,20}_pca_failed.txt for each k
"""

import numpy as np
from sklearn.decomposition import PCA
import format_lda_to_python as format

for k in [5,8,10,15,20]:
	[passed, failed] = format.getTopicDistributions(k, True)
	pca = PCA(n_components=3)
	total = np.concatenate((passed,failed),axis=0)
	pca.fit(total)
	f = open('./k'+str(k)+'_pca_passed.txt','w')
	pass_dim_red = pca.transform(passed) # project with the PCA basis fit on the combined data
	for item in pass_dim_red:
		f.write(','.join(map(str, item)) + '\n')
	f.close()
	f = open('./k'+str(k)+'_pca_failed.txt','w')
	fail_dim_red = pca.transform(failed) # same basis as the passed bills
	for item in fail_dim_red:
		f.write(','.join(map(str, item)) + '\n')
	f.close()
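
Since each output line is a comma-joined row of floats, the files written above can be read back with numpy.loadtxt, e.g.:

import numpy as np

pass_dim_red = np.loadtxt('./k5_pca_passed.txt', delimiter=',')
print(pass_dim_red.shape)  # (number of passed bills, 3)
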
Example #9
def pick_best_combo(k_arr, d_arr, s_arr):
    """Grid-search (k, depth, split) with N-fold cross-validation on the training set; return HyperParams with the mean and std of the fold errors."""
    hyperparams = []
    for k in k_arr:
        [train_pass, train_fail] = format.getTopicDistributions(k, True)
        for depth in d_arr:
            for split in s_arr:
                if split > k:
                    continue

                train_pass = numpy.random.permutation(train_pass)
                train_fail = numpy.random.permutation(train_fail)

                N = 2  #number of cross-validation folds
                mP = len(train_pass)
                mF = len(train_fail)

                error_vals = numpy.zeros(shape=(1, N))

                for j in range(0, N):

                    # fold j: rows [lo, hi) validate, the rest train; int() guards against the
                    # float indices math.floor returns under Python 2, and float() keeps the
                    # arithmetic consistent between Python 2 and 3
                    loP = int(math.floor(float(mP) / N * j))
                    hiP = int(math.floor(float(mP) / N * (j + 1)))
                    loF = int(math.floor(float(mF) / N * j))
                    hiF = int(math.floor(float(mF) / N * (j + 1)))

                    train_pass_cv = numpy.concatenate(
                        (train_pass[0:loP, :], train_pass[hiP:mP, :]), axis=0)

                    train_fail_cv = numpy.concatenate(
                        (train_fail[0:loF, :], train_fail[hiF:mF, :]), axis=0)

                    valid_pass_cv = train_pass[loP:hiP, :]

                    valid_fail_cv = train_fail[loF:hiF, :]

                    forest = RandomForest(100, split, depth)
                    forest.buildForest(train_pass_cv, train_fail_cv)
                    error_vals[0, j] = forest.classError(valid_pass_cv, valid_fail_cv)

                    ####sci kit learn, used as comparison for debugging
                    """num_p = len(train_pass_cv)
					num_f = len(train_fail_cv)

					train_x = numpy.concatenate((train_pass_cv,train_fail_cv),axis=0)
					train_y = numpy.concatenate((numpy.ones(shape=(num_p,1)), numpy.zeros(shape=(num_f,1)) ), axis=0)

					clf = tree.DecisionTreeClassifier()
					clf = clf.fit(train_x,train_y)

					predictions = clf.predict(numpy.concatenate((valid_pass_cv,valid_fail_cv),axis=0))
					trueVals = numpy.concatenate((\
					numpy.ones(shape=(1,len(valid_pass_cv))),\
					numpy.zeros(shape=(1,len(valid_fail_cv)))),axis=1)
				
					print 'Error HERE: ' + str(sum(abs(predictions-trueVals)[0])/len(predictions))"""
                    ####

                    print '(k, depth, split): ' + str(k) + ',' + str(depth) + ',' + str(split) + '; iteration: ' + str(j) + ', error:' + str(error_vals[0, j])

                hyperparams.append(
                    HyperParams(k, depth, split, numpy.mean(error_vals),
                                numpy.std(error_vals)))
    return hyperparams
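
To make the slice arithmetic concrete: with mP = 10 rows and N = 2 folds, the fold boundaries used above work out as follows (a standalone check):

import math

mP, N = 10, 2
for j in range(0, N):
    lo = int(math.floor(float(mP) / N * j))
    hi = int(math.floor(float(mP) / N * (j + 1)))
    print('fold %d: validate rows [%d:%d), train rows [0:%d) + [%d:%d)' % (j, lo, hi, lo, hi, mP))
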