def readDigitData(trainingSize=100, testSize=100): rootdata = "digitdata/" # loading digits data rawTrainingData = samples.loadDataFile( rootdata + "trainingimages", trainingSize, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT ) trainingLabels = samples.loadLabelsFile(rootdata + "traininglabels", trainingSize) rawValidationData = samples.loadDataFile( rootdata + "validationimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT ) validationLabels = samples.loadLabelsFile(rootdata + "validationlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("digitdata/testimages", testSize, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", testSize) try: print "Extracting features..." featureFunction = dataClassifier.basicFeatureExtractorDigit trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) except: display("An exception was raised while extracting basic features: \n %s" % getExceptionTraceBack()) return ( trainingData, trainingLabels, validationData, validationLabels, rawTrainingData, rawValidationData, testData, testLabels, rawTestData, )
def get_neuron_test_data():
    """Load the 1000-image digit test set.

    Returns (raw datums, featurized array, boolean labels), where a label
    is True exactly when the digit is a 3.
    """
    raw = samples.loadDataFile("digitdata/testimages", 1000, 28, 28)
    is_three = np.array(samples.loadLabelsFile("digitdata/testlabels", 1000)) == 3
    features = np.array([dcu.simple_image_featurization(img) for img in raw])
    return raw, features, is_three
def get_neuron_training_data():
    """Load `num_train_examples` digit training images (module-level count).

    Returns (raw datums, featurized array, boolean labels), where a label
    is True exactly when the digit is a 3.
    """
    raw = samples.loadDataFile("digitdata/trainingimages", num_train_examples, 28, 28)
    is_three = np.array(samples.loadLabelsFile("digitdata/traininglabels", num_train_examples)) == 3
    features = np.array([dcu.simple_image_featurization(img) for img in raw])
    return raw, features, is_three
def runClassifier(args, options):
    """Train and evaluate args['classifier'] on the face or digit data set.

    The full data files are loaded, then sub-sampled to options.training /
    options.test examples via randomSample.  The "linear_svm" classifier
    uses HOG features arranged as transposed numpy matrices; all others use
    the enhanced per-datum feature extractors.
    """
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    if(options.data=="faces"):
        print "loading face data set"
        # NOTE(review): in this variant loadDataFile/loadLabelsFile take no
        # count argument, unlike the other runClassifier variants in this
        # file -- confirm against the samples module actually used here.
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels")
        rawValidationData = samples.loadDataFile("facedata/facedatavalidation",FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatavalidationlabels")
        rawTestData = samples.loadDataFile("facedata/facedatatest", FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels")
        # down-sample to the requested number of training/test examples
        rawTrainingData,trainingLabels=randomSample(rawTrainingData,trainingLabels,numTraining)
        rawTestData,testLabels=randomSample(rawTestData,testLabels,numTest)
    else:
        print "loading digit data set"
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels")
        rawValidationData = samples.loadDataFile("digitdata/validationimages",DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels")
        rawTestData = samples.loadDataFile("digitdata/testimages",DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels")
        rawTrainingData, trainingLabels = randomSample(rawTrainingData, trainingLabels, numTraining)
        rawTestData, testLabels = randomSample(rawTestData, testLabels, numTest)
    print "Extracting features..."
    if (options.classifier == "linear_svm"):
        if (options.data == "faces"):
            featureFunction = HogFeatureFaceImg
        else:
            featureFunction=HogFeatureImgDigit
        # HOG path: build design matrices and transpose them
        trainingData = map(featureFunction, rawTrainingData)
        trainingData=np.array(trainingData).transpose()
        validationData=map(featureFunction, rawValidationData)
        validationData = np.array(validationData).transpose()
        testData=map(featureFunction, rawTestData)
        testData = np.array(testData).transpose()
    else:
        if (options.data == "faces"):
            featureFunction = enhancedFeatureExtractorFace
        else:
            featureFunction = enhancedFeatureExtractorDigit
        trainingData = map(featureFunction, rawTrainingData)
        validationData = map(featureFunction, rawValidationData)
        testData = map(featureFunction, rawTestData)
    print "Training..."
    start = timeit.default_timer()
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    stop = timeit.default_timer()
    print stop - start, " s"  # wall-clock training time
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
def runClassifier(args, options):
    """Train and evaluate args['classifier'] on faces or digits.

    Loads options.training training examples and TEST_SET_SIZE validation
    and test examples, extracts features with args['featureFunction'],
    trains, reports validation/test accuracy, then optionally prints
    high-odds features when options.odds is set.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    if(options.data=="faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        # NOTE(review): face validation data re-reads the *training* files
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    if((options.odds) & (options.classifier != "mostFrequent")):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1,label2)
        if(options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)
        print string3
        printImage(features_odds)
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training if(options.data=="faces"): rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining) rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE) else: rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE) # Extract features print "Extracting features..." trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels) print "Validating..." 
guesses = classifier.classify(validationData) correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True) print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) print "Testing..." guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
def run_digits_n_times(digit, iterations, sample_percentage, trainingpath, labelspath):
    """Train a one-vs-rest perceptron for `digit` several times on random
    sub-samples of the training data.

    Each of `iterations` rounds samples `sample_percentage` of the data,
    trains, validates, prints a progress line, and writes the weights plus
    timing/accuracy to a per-digit/percentage results file.
    """
    all_final_weights = []
    all_final_accuracies = []
    images = []
    labels = []
    # Load either digit training data or face training data
    n_images = 5000  # FIX: was `n_images == 5000` -- a no-op comparison that left n_images unbound
    images = samples.loadDataFile(trainingpath, n_images, 28, 28)
    labels = samples.loadLabelsFile(labelspath, n_images)
    # Compute weights / run perceptron n times, where n = iterations parameter
    for a in range(iterations):
        images_sample, labels_sample, visited = sample_digits(
            digit, sample_percentage, images, labels)
        featureslist = compute_features2(images_sample)
        weights = initialize_weights(28 * 28, 0)
        # Run Perceptron (timed)
        start = time.time()
        final_weights = compute_weights(weights, featureslist, labels_sample)
        elapsed = time.time() - start
        # Validate weights -- validated against `digit`, not a hard-coded 1
        # (FIX: stray note "re run digits with digit instead of 1" had lost
        # its comment marker and was a syntax error)
        accuracy = validate_weights(digit, final_weights)
        print(str(digit) + ': ' + str(elapsed) + ' ~~ ' + 'sample percent: ' +
              str(sample_percentage) + ' ~~ ' + str(accuracy) + '%')
        basepath = './TrainingDigitsResults120/TrainingDigitsResults' + str(
            digit) + '/' + str(sample_percentage) + '_percent.txt'
        # renamed from `file`, which shadowed the builtin
        with open(basepath, 'w') as out:
            # NOTE(review): this writes `weights`, not `final_weights`; that is
            # only correct if compute_weights mutates `weights` in place --
            # TODO confirm.
            for weight in range(len(weights)):
                if weight == len(weights) - 1:
                    out.write(str(weights[weight]) + '\n')
                else:
                    out.write(str(weights[weight]) + ' ')
            out.write(
                str(round(elapsed, 2)) + 's' + ' ' +
                str(round(float(accuracy) / float(100), 2)))
        # Record computed weights and accuracy for this training iteration
        all_final_weights.append(final_weights)
        all_final_accuracies.append(accuracy)
    # FIX: a trailing "record mean accuracy for all iterations" section ended
    # with an unterminated ''' marker (syntax error); removed.
def validate_digits120():
    """Score the saved 100%-training weight vectors (digits 0-9) on the
    1000-image validation set and print overall accuracy.

    For each image, every digit's weight vector is dotted with the image's
    feature vector; the digit whose vector yields the largest score is the
    guess, which is compared against the true label.
    """
    weights_vectors = []
    base_path = './TrainingDigitsResults120/TrainingDigitsResults'
    for i in range(0, 10):
        load_path = base_path + str(i) + '/100_percent.txt'
        # FIX: the original shadowed the builtin `file` and called a redundant
        # close() inside the `with` block; `with` already closes the file.
        with open(load_path, 'r') as fh:
            lines = fh.readlines()
        # first line of the results file holds the space-separated weights
        weights = [float(token) for token in lines[0].split()]
        print(weights)
        weights_vectors.append(weights)
    images = samples.loadDataFile('digitdata/validationimages', 1000, 28, 28)
    labels = samples.loadLabelsFile('digitdata/validationlabels', 1000)
    featureslist = compute_features2(images)
    # (guess, label) pairs for every validation image
    results = []
    for image in range(len(images)):
        features = featureslist[image]
        # one dot-product score per digit (FIX: no longer shadows builtins
        # `sum` and `max`)
        scores = [
            sum(vec[j] * features[j] for j in range(len(vec)))
            for vec in weights_vectors
        ]
        # argmax over the 10 digit scores; first maximum wins, as before
        guess = max(range(10), key=lambda z: scores[z])
        results.append((guess, labels[image]))
    correctcount = sum(1 for guess, label in results if guess == label)
    print('Digits accuracy: ' +
          str(round((float(correctcount) * 100 / float(len(labels))), 1)) + '%')
def runClassifier(args, options):
    """Train and evaluate args['classifier'] on data set D1 or D2.

    The test set doubles as the held-out set passed to train().  For the
    "1vr" classifier, training- and test-set accuracies are also appended
    to perceptron1vr_train.csv / perceptron1vr_test.csv.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    # Load data
    dataset = options.dataset
    numTraining = options.training
    numTest = options.test
    if dataset == 'd1':
        rawTrainingData = samples.loadDataFile("data/D1/training_data", numTraining)
        trainingLabels = samples.loadLabelsFile("data/D1/training_labels", numTraining)
        rawTestData = samples.loadDataFile("data/D1/test_data", numTest)
        testLabels = samples.loadLabelsFile("data/D1/test_labels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("data/D2/training_data", numTraining)
        trainingLabels = samples.loadLabelsFile("data/D2/training_labels", numTraining)
        rawTestData = samples.loadDataFile("data/D2/test_data", numTest)
        testLabels = samples.loadLabelsFile("data/D2/test_labels", numTest)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, testData, testLabels, options.validate)
    # training-set accuracy (how well the model fits its own training data)
    guesses = classifier.classify(trainingData)
    correct = [
        guesses[i] == trainingLabels[i] for i in range(len(trainingLabels))
    ].count(True)
    if (options.classifier == "1vr"):
        # append (num examples, train accuracy) for this run
        f = open("perceptron1vr_train.csv", "a")
        f.write(
            str(len(trainingData)) + "," +
            str(100 * correct / (1.0 * (len(trainingData)))) + '\n')
        f.close()
    print "Testing..."
    guesses = classifier.classify(testData)
    # for i in range(len(testLabels)):
    #     if guesses[i] != testLabels[i]:
    #         print trainingData[i], guesses[i], testLabels[i]
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    if (options.classifier == "1vr"):
        # append (num training examples, test accuracy) for this run
        f = open("perceptron1vr_test.csv", "a")
        f.write(
            str(len(trainingData)) + "," +
            str(100 * correct / (1.0 * (len(testData)))) + '\n')
        f.close()
import minicontest
import samples
import sys
import util
import pickle
from dataClassifier import DIGIT_DATUM_HEIGHT, DIGIT_DATUM_WIDTH, contestFeatureExtractorDigit

# number of test examples to classify, and where the contest output is pickled
TEST_SIZE = 1000
MINICONTEST_PATH = "minicontest_output.pickle"

if __name__ == '__main__':
    # Load the full 5000-example digit training set, 100 validation
    # examples, and TEST_SIZE unlabeled test images for the mini-contest.
    print "Loading training data"
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", 100, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", 100)
    rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    featureFunction = contestFeatureExtractorDigit
    legalLabels = range(10)  # digits 0-9
    classifier = minicontest.contestClassifier(legalLabels)
    print "Extracting features..."
    # NOTE(review): the script appears to continue beyond this chunk.
def runClassifier(args, options):
    """Learning-curve driver: trains on 10%, 20%, ..., 100% of the training
    data, timing each round and reporting validation/test accuracy.

    Fixes applied:
      * map() results wrapped in list() -- this block uses print() (Python 3
        style), and under Python 3 a map object cannot be copied with
        copy() or sliced with `del`; under Python 2, list(map(...)) is a
        harmless copy of the list map already returns.
      * floor division `//` for the slice bounds -- `/` yields a float on
        Python 3, which is not a valid slice index; `//` is identical for
        ints on Python 2.
      * `and` instead of bitwise `&` on the boolean option flags.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    # Extract features
    print("Extracting features...")
    # features for the training / validation / test data
    trainingData = list(map(featureFunction, rawTrainingData))
    validationData = list(map(featureFunction, rawValidationData))
    testData = list(map(featureFunction, rawTestData))
    # Conduct training and testing over growing fractions of the data
    copydata = copy(trainingData)
    copylabel = copy(trainingLabels)
    begin = time.time()
    for percentageOfData in range(10):
        start = time.time()
        # restore the full set, then keep only the first (p+1)*10 percent
        trainingData = copy(copydata)
        trainingLabels = copy(copylabel)
        del trainingData[numTraining // 10 * (percentageOfData + 1):numTraining]
        del trainingLabels[numTraining // 10 * (percentageOfData + 1):numTraining]
        print("Training: %d " % (numTraining // 10 * (percentageOfData + 1)))
        # basically trains the data
        classifier.train(trainingData, trainingLabels, validationData, validationLabels)
        print("Validating...")
        guesses = classifier.classify(validationData)
        correct = [
            guesses[i] == validationLabels[i]
            for i in range(len(validationLabels))
        ].count(True)
        print(str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
        print("Testing...")
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
        elapsed_time_fl = (time.time() - start)
        print("%d secs" % (elapsed_time_fl))
    totaltime = (time.time() - begin)
    print("Total time: %d" % (totaltime))
    # analysis(...) intentionally skipped in this timing-focused variant
    # do odds ratio computation if specified at command line
    if options.odds and (options.classifier == "naiveBayes" or options.classifier == "nb"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)
        print(string3)
        printImage(features_odds)
    if options.weights and options.classifier == "perceptron":
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
import numpy as np


def writeLabeledData(prefix, labeled_data):
    """Write (datum, label) pairs to two parallel files: <prefix>images and
    <prefix>labels, one str()-formatted entry per line."""
    datums, labels = zip(*labeled_data)
    with open(prefix + "images", 'w') as f:
        for datum in datums:
            f.write(str(datum) + "\n")
    f.close()  # redundant: the `with` block already closed the file
    with open(prefix + "labels", 'w') as f:
        for label in labels:
            f.write(str(label) + "\n")
    f.close()  # redundant: the `with` block already closed the file


# Combine all digit splits (5000 train + 1000 validation + 1000 test) into
# one labeled pool and prepare a random permutation of it.
rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)
rawValidationData = samples.loadDataFile("digitdata/validationimages", 1000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
validationLabels = samples.loadLabelsFile("digitdata/validationlabels", 1000)
rawTestData = samples.loadDataFile("digitdata/testimages", 1000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
testLabels = samples.loadLabelsFile("digitdata/testlabels", 1000)
all_data = rawTrainingData + rawValidationData + rawTestData
all_labels = trainingLabels + validationLabels + testLabels
# NOTE(review): len(zip(...)) assumes Python 2, where zip returns a list --
# on Python 3 this would raise TypeError; confirm the intended interpreter.
labeled_data = zip(all_data, all_labels)
perm = np.random.permutation(len(labeled_data))
permuted_data = []
def runClassifier(args, options):
    """Train/evaluate on the data/digitdata set; additionally scores the
    full 5000-example training set to report training accuracy."""
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    rawTrainingData = samples.loadDataFile("data/digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("data/digitdata/traininglabels", numTraining)
    # the complete training set is loaded separately so train-set accuracy
    # can be measured even when numTraining < 5000
    completeRawTrainingData = samples.loadDataFile(
        "data/digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    completeTrainingLabels = samples.loadLabelsFile(
        "data/digitdata/traininglabels", 5000)
    rawValidationData = samples.loadDataFile("data/digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile(
        "data/digitdata/validationlabels", numTest)
    rawTestData = samples.loadDataFile("data/digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("data/digitdata/testlabels", numTest)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    completeTrainingData = map(featureFunction, completeRawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    print "Testing training data..."
    guesses = classifier.classify(completeTrainingData)
    correct = [
        guesses[i] == completeTrainingLabels[i]
        for i in range(len(completeTrainingLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(completeTrainingLabels)) + " (%.1f%%).") % (100.0 * correct / len(completeTrainingLabels))
    # NOTE(review): at this point `guesses` holds the *training-set*
    # predictions, but analysis() is passed testLabels/testData -- the
    # lengths likely differ and the earlier test-set guesses were probably
    # intended.  TODO confirm against analysis()'s contract.
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    if ((options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options):
    """Train args['classifier'] on faces or digits, timing the training
    phase, then report validation/test accuracy and run analysis().

    When options.odds is set (naive Bayes only) the highest-odds features
    are displayed; when options.weights is set (perceptron only) the
    highest-weight features per label are displayed.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    if options.data == "faces":
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/trainingimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/traininglabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    # Extract features
    print("Extracting features...")
    trainingData = [featureFunction(datum) for datum in rawTrainingData]
    validationData = [featureFunction(datum) for datum in rawValidationData]
    testData = [featureFunction(datum) for datum in rawTestData]
    # Conduct training and testing
    print("Training...")
    trainTimeStart = time.time()
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print("Training completed in %s seconds." % (time.time() - trainTimeStart))
    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print(str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
    print("Testing...")
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    isNaiveBayes = options.classifier in ("naiveBayes", "nb")
    if options.odds and isNaiveBayes:
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if isNaiveBayes:
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)
        print(string3)
        printImage(features_odds)
    if options.weights and options.classifier == "perceptron":
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options):
    """Train/evaluate a classifier on the face or digit data set.

    With ``options.specialMode`` it runs a learning-curve experiment: for each
    10%..100% slice of the training set it trains four times on random
    subsets and reports mean accuracy, mean training time and the sample
    standard deviation, then exits.  Otherwise it performs a single
    train / validate / test run with optional odds/weights reporting.

    args: dict with 'featureFunction', 'classifier', 'printImage'.
    options: parsed command-line options.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    # numTraining = options.training
    # numTest = options.test
    if options.specialMode:
        # Fixed data-set sizes for the learning-curve experiment.
        numberOfTestPoints = 150 if options.data == "faces" else 1000
        numberOfValidationPoints = 301 if options.data == "faces" else 1000
        totalTrainData = 451 if options.data == "faces" else 5000
        numValidation = numberOfValidationPoints
        numTest = numberOfTestPoints
        numTraining = totalTrainData
        # Load Test Data Set
        if options.data == "faces":
            rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                               FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
            # NOTE(review): validation data is drawn from the *training* files.
            rawValidationData = samples.loadDataFile("facedata/facedatatrain", numValidation,
                                                     FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numValidation)
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,
                                                   FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numTraining)
        else:
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                               DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
            rawValidationData = samples.loadDataFile(
                "digitdata/validationimages", numValidation,
                DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "digitdata/validationlabels", numValidation)
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,
                                                   DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        print("Extracting features...")
        # BUG FIX: on Python 3 map() returns a one-shot iterator; these
        # collections are indexed (trainingData[indx]) and iterated/classified
        # repeatedly in the loops below, so they must be materialized as lists.
        testData = list(map(featureFunction, rawTestData))
        validationData = list(map(featureFunction, rawValidationData))
        trainingData = list(map(featureFunction, rawTrainingData))
        for percent in range(1, 11):
            acc = []    # test accuracy of each of the 4 runs at this percentage
            aTime = []  # training wall-clock time of each of the 4 runs
            for runCount in range(0, 4):
                # Extract features
                print("======================================")
                print("(" + str(runCount) + ")", "Building random", (percent * 10),
                      " percent of Training Data...")
                numSubTraining = int((percent / 10.0) * totalTrainData)
                # Random subset of training indices, without replacement.
                indexes = random.sample(range(0, totalTrainData), numSubTraining)
                subTrainingData = []
                subTrainingLabels = []
                for indx in indexes:
                    subTrainingData.append(trainingData[indx])
                    subTrainingLabels.append(trainingLabels[indx])
                # Validation is intentionally disabled for these timed runs.
                validationData = []
                validationLabels = []
                # Conduct training and testing
                start = time.time()
                print("(" + str(runCount) + ")", "Training", numSubTraining, "points ...")
                if options.classifier == 'nearestNeighbors':
                    classifier.train(trainingData, trainingLabels, testData,
                                     testLabels, options.k_number_of_neighbors)
                else:
                    classifier.train(subTrainingData, subTrainingLabels,
                                     validationData, validationLabels)
                end = time.time()
                elapsed = end - start
                print("Elapsed Time:", elapsed)
                aTime.append(elapsed)
                print("(" + str(runCount) + ")", "Testing...")
                guesses = classifier.classify(testData)
                correct = [
                    guesses[i] == testLabels[i] for i in range(len(testLabels))
                ].count(True)
                print(str(correct),
                      ("correct out of " + str(len(testLabels)) + " (%.1f%%).")
                      % (100.0 * correct / len(testLabels)))
                acc.append(100.0 * correct / len(testLabels))
            # Aggregate the 4 runs for this training-set percentage.
            mean = 0
            avgT = 0
            for q in range(0, len(acc)):
                mean += acc[q]
                avgT += aTime[q]
            mean = mean / len(acc)
            avgT = avgT / len(aTime)
            print("---------------")
            print("Average training time for", numSubTraining, "data points: ", avgT)
            print("Average accuracy of", (percent * 10), "percent data training: ", str(mean))
            # Sample standard deviation (n - 1 denominator) over the 4 runs.
            sd = 0
            for a in acc:
                tmp = a - mean
                sd += (tmp * tmp)
            sd = sd / (len(acc) - 1)
            sd = math.sqrt(sd)
            print("Standard Derivation in accuracy:", sd)
        # Experiment finished; terminate the process.
        sys.exit(1)
    else:
        numTraining = options.training
        numTest = options.test
        if options.data == "faces":
            # NOTE(review): validation set drawn from the training files.
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,
                                                   FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
            rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,
                                                     FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numTest)
            rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                               FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
        else:
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,
                                                   DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
            rawValidationData = samples.loadDataFile(
                "digitdata/validationimages", numTest,
                DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "digitdata/validationlabels", numTest)
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                               DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
        # Extract features
        print("Extracting features...")
        # BUG FIX: materialize the map() results (see note above); classify()
        # and the accuracy loops below consume these more than once.
        trainingData = list(map(featureFunction, rawTrainingData))
        validationData = list(map(featureFunction, rawValidationData))
        testData = list(map(featureFunction, rawTestData))
        if options.k_number_of_neighbors > 0:
            k = options.k_number_of_neighbors
        # Conduct training and testing
        print("Training...")
        if options.classifier == 'nearestNeighbors':
            classifier.train(trainingData, trainingLabels, testData,
                             testLabels, options.k_number_of_neighbors)
        else:
            classifier.train(trainingData, trainingLabels,
                             validationData, validationLabels)
        if options.classifier != 'nearestNeighbors':
            print("Validating...")
            guesses = classifier.classify(validationData)
            correct = [
                guesses[i] == validationLabels[i]
                for i in range(len(validationLabels))
            ].count(True)
            print(str(correct),
                  ("correct out of " + str(len(validationLabels)) + " (%.1f%%).")
                  % (100.0 * correct / len(validationLabels)))
        print("Testing...")
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        print(str(correct),
              ("correct out of " + str(len(testLabels)) + " (%.1f%%).")
              % (100.0 * correct / len(testLabels)))
        analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
        # NOTE(review): analysis() already ran unconditionally above; with
        # --analysis it runs a second time.  Preserved as-is.
        if options.analysis:
            analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
        # do odds ratio computation if specified at command line
        # (idiom fix: `and` instead of bitwise `&` on booleans)
        if options.odds and (options.classifier == "naiveBayes" or (options.classifier == "nb")):
            label1, label2 = options.label1, options.label2
            features_odds = classifier.findHighOddsFeatures(label1, label2)
            if options.classifier == "naiveBayes" or options.classifier == "nb":
                string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                    label1, label2)
            else:
                string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                    label1, label2)
            print(string3)
            printImage(features_odds)
        if options.weights and (options.classifier == "perceptron"):
            for l in classifier.legalLabels:
                features_weights = classifier.findHighWeightFeatures(l)
                print("=== Features with high weight for label %d ===" % l)
                printImage(features_weights)
def runClassifier(args, options):
    """Train, validate and test a classifier (Python 2 variant).

    args: dict with 'featureFunction', 'classifier', 'printImage'.
    options: parsed command-line options.
    """
    #print 'args: ', args
    #print 'options', options
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    if(options.data=="faces"):
        # NOTE(review): validation set is read from the *training* files.
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    # Extract features
    #print "Extracting features..."
    #print '#######type of rawTrainingData is', rawTrainingData.__class__ # list of Datum
    #print '#######type of rawTrainingData[0] is', rawTrainingData[0].__class__ # Datum
    # map() returns a plain list under Python 2.
    trainingData = map(featureFunction, rawTrainingData)
    #print '#######type of trainingData is', trainingData.__class__ # list of Counter
    #print '#######type of trainingData[0] is', trainingData[0].__class__ # Counter
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    # Count label matches between guesses and ground truth.
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    # `&` works here only because both operands are booleans.
    if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1,label2)
        if(options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)
        print string3
        printImage(features_odds)
    if((options.weights) & (options.classifier == "perceptron")):
        # Visualize the highest-weight features for each label.
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print ("=== Features with high weight for label %d ==="%l)
            printImage(features_weights)
def runClassifier(args, options):
    """Learning-curve experiment (Python 2 variant).

    Repeats `options.repetitions` times: for each of `options.sample`
    increasing fractions of the (shuffled) training data, trains the
    classifier, measures test accuracy, then reports per-fraction mean
    accuracy and standard deviation via np.std.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    thisWidth = 0
    thisHeight = 0
    num = 0            # number of legal labels passed to classifier.train (2 faces / 10 digits)
    sample = options.sample        # number of training-fraction steps
    reps = options.repetitions     # number of repetitions per step
    acc = []       # accumulated accuracy per fraction, summed over reps
    stdDev = []    # per-repetition lists of accuracies (one list per rep)
    for a in range(sample):
        acc.append(0)
    counter = 0    # NOTE(review): incremented below but never read.
    for r in range(reps):
        print("______Repetition " + str(r + 1) + " out of " + str(reps) + "______")
        counter += 1
        accTemp = []   # accuracies of this repetition, one per fraction
        for i in range(sample):
            print("__________" + str(i + 1) + " try with " + str(100 * (i + 1) / sample) + "% percent of random training data__________")
            # Shuffle training indices so each fraction is a random subset.
            arr = [j for j in range(numTraining)]
            random.shuffle(arr)
            if (options.data == "faces"):
                # NOTE(review): data is reloaded on every iteration — wasteful
                # but harmless; validation comes from the training files.
                rawTrainingData1 = samples.loadDataFile(
                    "facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
                trainingLabels1 = samples.loadLabelsFile(
                    "facedata/facedatatrainlabels", numTraining)
                rawValidationData = samples.loadDataFile(
                    "facedata/facedatatrain", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
                validationLabels = samples.loadLabelsFile(
                    "facedata/facedatatrainlabels", numTest)
                rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                                   FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
                testLabels = samples.loadLabelsFile(
                    "facedata/facedatatestlabels", numTest)
                thisWidth = FACE_DATUM_WIDTH
                thisHeight = FACE_DATUM_HEIGHT
                num = 2
            else:
                rawTrainingData1 = samples.loadDataFile(
                    "digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
                trainingLabels1 = samples.loadLabelsFile(
                    "digitdata/traininglabels", numTraining)
                rawValidationData = samples.loadDataFile(
                    "digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
                validationLabels = samples.loadLabelsFile(
                    "digitdata/validationlabels", numTest)
                rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                                   DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
                testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
                thisWidth = DIGIT_DATUM_WIDTH
                thisHeight = DIGIT_DATUM_HEIGHT
                num = 10
            # Take the first (i+1)/sample fraction of the shuffled indices.
            # (Python 2 integer division keeps the slice bound an int.)
            rawTrainingData = []
            trainingLabels = []
            for n in arr[:(numTraining * (1 + i) / sample)]:
                rawTrainingData.append(rawTrainingData1[n])
                trainingLabels.append(trainingLabels1[n])
            # Extract features
            print "Extracting features..."
            trainingData = map(featureFunction, rawTrainingData)
            validationData = map(featureFunction, rawValidationData)
            testData = map(featureFunction, rawTestData)
            # Conduct training and testing
            print "Training..."
            classifier.train(trainingData, trainingLabels, validationData,
                             validationLabels, thisWidth, thisHeight, num)
            print "Validating..."
            guesses = classifier.classify(validationData)
            correct = [
                guesses[i] == validationLabels[i] for i in range(len(validationLabels))
            ].count(True)
            print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (
                100.0 * correct / len(validationLabels))
            print "Testing..."
            guesses = classifier.classify(testData)
            correct = [
                guesses[i] == testLabels[i] for i in range(len(testLabels))
            ].count(True)
            percentCorrect = (100.0 * correct / len(testLabels))
            print str(correct), ("correct out of " + str(len(testLabels)) + " (" + str(percentCorrect) + "%).")
            accTemp.append(percentCorrect)
            #analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
        # Fold this repetition's accuracies into the running totals.
        for i in range(len(acc)):
            acc[i] += accTemp[i]
        stdDev.append(accTemp)
    # Transpose stdDev (rep-major) into stdDev2 (fraction-major) for np.std.
    accAvg = []
    stdDev2 = [[] for k in range(sample)]
    for i in range(sample):
        for j in range(reps):
            stdDev2[i].append(stdDev[j][i])
    for i in range(len(acc)):
        accAvg.append(acc[i] / reps)
        currSTD = np.std(stdDev2[i])
        print("Accuracy with " + str(100 * (i + 1) / sample) + "% of training data: " + str(accAvg[i]) + "%, Standard Dev: " + str(currSTD))
    print("Repetitions: " + str(reps))
    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)
        print string3
        printImage(features_odds)
    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options):
    """Train on a random percentage of the full training set and report
    test accuracy plus total wall-clock run time (Python 2 variant).

    args['percent'] is a percentage (0-100) of the full training pool
    (450 faces / 5000 digits) to sample for training.
    """
    start = timeit.default_timer()
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Fraction of the training pool to keep, e.g. 50 -> 0.5.
    percent = args['percent'] / 100.0
    # Load data
    numTraining = options.training   # NOTE(review): unused; sizes are hard-coded below.
    numTest = options.test           # NOTE(review): unused; sizes are hard-coded below.
    if (options.data == "faces"):
        face_test_size = 150
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", 450,
                                               FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", 450)
        # Randomly sample `percent` of (image, label) pairs, keeping pairs aligned.
        zipped = list(zip(rawTrainingData, trainingLabels));
        amount = int(450 * percent)
        randomed = sample(zipped, amount)
        unzipped = zip(*randomed)
        rawTrainingData = unzipped[0]
        trainingLabels = unzipped[1]
        # NOTE(review): validation set is read from the *training* files.
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", face_test_size,
                                                 FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", face_test_size)
        rawTestData = samples.loadDataFile("facedata/facedatatest", face_test_size,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", face_test_size)
    else:
        digit_test_size = 1000
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000,
                                               DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)
        zipped = list(zip(rawTrainingData,trainingLabels));
        amount = int(5000 * percent)
        randomed = sample(zipped, amount)
        unzipped = zip(*randomed)
        rawTrainingData = unzipped[0]
        trainingLabels = unzipped[1]
        print("Len of sampled data + ", len(rawTrainingData), len(trainingLabels))
        rawValidationData = samples.loadDataFile("digitdata/validationimages", digit_test_size,
                                                 DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", digit_test_size)
        rawTestData = samples.loadDataFile("digitdata/testimages", digit_test_size,
                                           DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", digit_test_size)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    """print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (
        100.0 * correct / len(validationLabels))"""
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)
        print string3
        printImage(features_odds)
    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print ("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
    stop = timeit.default_timer()
    print('Classifier Run Time (in seconds): ', stop - start)
# Ad-hoc exploration script: reads the raw digit test-image file and rebuilds
# one 28x28 Datum by hand to inspect the pixel conversion.
import samples
import util
import numpy as np
import os
from samples import Datum
from samples import readlines
from dataClassifier import DIGIT_DATUM_HEIGHT, DIGIT_DATUM_WIDTH, contestFeatureExtractorDigit
from samples import IntegerConversionFunction

featureFunction = contestFeatureExtractorDigit
# Load a single image/label via the normal loaders for comparison.
rawTrainingData = samples.loadDataFile("digitdata/testimages", 1, 28, 28)
trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 1)
# Raw text lines of the image file; reversed so pop() yields them in order.
fin = readlines("digitdata/testimages")
fin.reverse()
a = ['+', ' ', '#']
print(IntegerConversionFunction(a))
data = []
items = []
# NOTE(review): loop structure reconstructed from a flattened source —
# read as two sequential loops followed by a single Datum construction.
for j in range(28):
    data.append(list(fin.pop()))
for i in range(28):
    print(data[i])
items.append(Datum(data, 28, 28))
print(items[0].getPixels())
def runClassifier(args, options):
    """Train/evaluate a classifier, optionally on a random percentage of the
    training data (via randomData) or with nearestNeighbors (Python 2 variant).
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    percent = options.percentage   # fraction/percentage of training data to use (see randomData)
    if (options.data == "faces"):
        # NOTE(review): validation set is read from the *training* files.
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,
                                               FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,
                                                 FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,
                                               DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,
                                                 DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    if options.k_number_of_neighbors > 0:
        k = options.k_number_of_neighbors   # NOTE(review): assigned but not read here.
    # Conduct training and testing
    print "Training..."
    if (options.classifier == 'nearestNeighbors'):
        # kNN gets the test set directly plus k.
        classifier.train(trainingData, trainingLabels, testData, testLabels,
                         options.k_number_of_neighbors)
    else:
        # Train on a random `percent` subset and time the training call.
        randTrainingData, randTrainingLabels = randomData(
            trainingData, trainingLabels, percent)
        start = time.clock()
        classifier.train(randTrainingData, randTrainingLabels,
                         validationData, validationLabels)
        runTime = time.clock() - start
        print "training set runtime:\t" + str(runTime)
    if (options.classifier != 'nearestNeighbors'):
        print "Validating..."
        guesses = classifier.classify(validationData)
        correct = [
            guesses[i] == validationLabels[i]
            for i in range(len(validationLabels))
        ].count(True)
        print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [
        guesses[i] == testLabels[i] for i in range(len(testLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    # analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)
        print string3
        printImage(features_odds)
    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
import math
import samples

#Naive Bayes - Face Data
if __name__ == '__main__':
    print "Training Phase"
    #stores training data and appropriate labels for faces
    n = 450
    items = samples.loadDataFile("facedata/facedatatrain", n, 60, 70)
    labels = samples.loadLabelsFile("facedata/facedatatrainlabels", n)
    all_feature_vectors = []  #stores all quadrants of all sample images
    for k in range(n):
        #break up face data into 100 6x7 pixel quadrants for feature extraction
        feature_quadrants = []  #will be a list of lists
        temp_array = []
        i_start = 0
        i_end = 6
        j_start = 0
        j_end = 7
        # NOTE(review): in the code visible here, i_start/i_end/j_start/j_end
        # are never advanced inside this loop, so as written it would repeat
        # the same 6x7 quadrant forever.  The increment logic appears to have
        # been lost when this file was assembled — restore it before running.
        while i_end <= 60 and j_end <= 70:
            #parse through image and store pixels in a temporary array
            for i in range(i_start, i_end):
                for j in range(j_start, j_end):
                    temp_array.append(items[k].getPixel(i, j))
            #add temp_array to feature_quadrant array and reassign temp_array
            feature_quadrants.append(temp_array)
            temp_array = []
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training numTest = options.test if (options.data == "pacman"): agentToClone = args.get('agentToClone', None) trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get( agentToClone, (None, None, None)) trainingData = trainingData or args.get( 'trainingData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0] validationData = validationData or args.get( 'validationData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1] testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES[ 'ContestAgent'][2] rawTrainingData, trainingLabels = samples.loadPacmanData( trainingData, numTraining) rawValidationData, validationLabels = samples.loadPacmanData( validationData, numTest) rawTestData, testLabels = samples.loadPacmanData(testData, numTest) else: rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest) rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest) # Extract features print "Extracting features..." trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels) print "Validating..." 
guesses = classifier.classify(validationData) correct = [ guesses[i] == validationLabels[i] for i in range(len(validationLabels)) ].count(True) print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) print "Testing..." guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training numTest = options.test rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest) rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest) # Extract features print "Extracting features..." trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels, options.validate) print "Validating..." guesses = classifier.classify(validationData) correct = [ guesses[i] == validationLabels[i] for i in range(len(validationLabels)) ].count(True) print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) if (options.classifier == "perceptron"): f = open("perceptron_valid.csv", "a") f.write( str(len(trainingData)) + "," + str(100 * correct / (1.0 * (len(validationData)))) + '\n') f.close() print "Testing..." 
guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) analysis(classifier, guesses, testLabels, testData, rawTestData, printImage) if (options.classifier == "perceptron"): f = open("perceptron_test.csv", "a") f.write( str(len(trainingData)) + "," + str(100 * correct / (1.0 * (len(testData)))) + '\n') f.close()
def runClassifier(args, options):
    """Train/validate/test a classifier; for the "GDA" classifier the basic
    features are first projected onto 13 principal components computed from
    the full training pool (Python 2 variant).
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    # Extract features
    print "Extracting features..."
    if options.data == "faces":
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,
                                               FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatavalidation", TEST_SET_SIZE,
                                                 FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatavalidationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,
                                               DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,
                                                 DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,
                                           DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    if options.classifier == "GDA":
        if options.data == "faces":
            # PCA basis computed from the full 451-image face training pool,
            # then all three splits are projected onto it.
            dimension = 13
            principleComponents = getPrincipleComponents(
                map(
                    featureFunction,
                    samples.loadDataFile("facedata/facedatatrain", 451,
                                         FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)), dimension)
            trainingData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawTrainingData)), principleComponents)
            validationData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawValidationData)), principleComponents)
            testData = np.dot(
                basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)),
                principleComponents)
        else:
            # Same projection for digits, basis from the full 5000-image pool.
            dimension = 13
            principleComponents = getPrincipleComponents(
                map(
                    featureFunction,
                    samples.loadDataFile("digitdata/trainingimages", 5000,
                                         DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)), dimension)
            trainingData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawTrainingData)), principleComponents)
            validationData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawValidationData)), principleComponents)
            testData = np.dot(
                basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)),
                principleComponents)
    else:
        # Non-GDA classifiers consume the raw feature Counters directly.
        trainingData = map(featureFunction, rawTrainingData)
        validationData = map(featureFunction, rawValidationData)
        testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training if (options.data == "faces"): rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining) rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile( "facedata/facedatatrainlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE) else: rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE) # Extract features print "Extracting features..." trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels) print "Validating..." 
guesses = classifier.classify(validationData) print "len guess: %d valid: %d" % (len(guesses), len(validationLabels)) print guesses[0], guesses[10] correct = [ guesses[i] == validationLabels[i] for i in range(len(validationLabels)) ].count(True) print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) print "Testing..." guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) analysis(classifier, guesses, testLabels, testData, rawTestData, printImage) # do odds ratio computation if specified at command line if ((options.odds) & (options.classifier != "mostFrequent")): label1, label2 = options.label1, options.label2 features_odds = classifier.findHighOddsFeatures(label1, label2) if (options.classifier == "naiveBayes" or options.classifier == "nb"): string3 = "=== Features with highest odd ratio of label %d over label %d ===" % ( label1, label2) else: string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % ( label1, label2) print string3 printImage(features_odds)
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training numTest = options.test if(options.data=="faces"): rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining) rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest) rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest) else: rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest) rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest) # Extract features print "Extracting features..." trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Start training..." start = time.time() classifier.train(trainingData, trainingLabels, validationData, validationLabels) end = time.time() - start print "Traning time: " + str(end) print "Start validating..." 
guesses = classifier.classify(validationData) correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True) print "Validation result: ", str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) print "Start testing..." guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print "Testing result: ", str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) #analysis(classifier, guesses, testLabels, testData, rawTestData, printImage) # do odds ratio computation if specified at command line if((options.odds) & (options.classifier == NB) ): label1, label2 = options.label1, options.label2 features_odds = classifier.findHighOddsFeatures(label1,label2) if(options.classifier == NB): string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2) else: string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2) print string3 printImage(features_odds) if((options.weights) & (options.classifier == PT)): for l in classifier.legalLabels: features_weights = classifier.findHighWeightFeatures(l) print ("=== Features with high weight for label %d ==="%l) printImage(features_weights)
"""This file is in Beta and is not the real autograder.""" import data_classification_utils as dcu import samples import numpy as np training_data = samples.loadDataFile("digitdata/trainingimages", 1, 28, 28) features = dcu.simple_image_featurization(training_data[0]) expected = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 
0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) if not np.array_equal(features, expected): print("Error, featurization is incorrect. You reported: ") print(features)
def runClassifier():
    """
    Harness code for running different classifiers on the face or digit data.
    This is the main function for classification, and is designed to be
    invoked from the command line (outside the Python interpreter).

    Usage:
    > python dataClassifier.py
    OR
    > python dataClassifier.py <data> <classifierName>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples> <odds class1 class2>

    For example:
    > python dataClassifier.py digits naivebayes basic 1000
    would run the naive Bayes classifier on 1000 training examples using the
    basicFeatureExtractor function, and then test the classifier on the test data.
    """
    print "Doing classification"
    print "--------------------"
    # Assign default values for arguments if they are not provided.
    # Each check appends one positional default, so later checks see the
    # lengthened argv.
    if (len(sys.argv) == 1):
        print "No data specified; using digits."
        sys.argv.append("digits")
    if (len(sys.argv) == 2):
        print "No classifier specified; using default."
        sys.argv.append("mostfrequent")
    if (len(sys.argv) == 3):
        print "No feature extraction function specified; using default."
        sys.argv.append("basic")
    if (len(sys.argv) == 4):
        print "No training set size specified; using default."
        sys.argv.append("100")
    if (len(sys.argv) == 5):
        print "Not doing odds ratio computation."
        sys.argv.append("noodds")
    # Set up variables according to the command line input.
    print "data:\t\t" + sys.argv[1]
    print "classifier:\t\t" + sys.argv[2]
    print "feature extractor:\t" + sys.argv[3]
    print "training set size:\t" + sys.argv[4]
    # Pick the feature extractor from (data, extractor-name).  Note the
    # error message reports sys.argv[2] rather than sys.argv[3] -- likely a
    # copy/paste slip; it prints the classifier name, not the extractor.
    if ((sys.argv[1] == "digits") & (sys.argv[3] == "basic")):
        featureFunction = basicFeatureExtractorDigit
    elif ((sys.argv[1] == "faces") & (sys.argv[3] == "basic")):
        featureFunction = basicFeatureExtractorFace
    elif ((sys.argv[1] == "digits") & (sys.argv[3] == "enhanced")):
        featureFunction = enhancedFeatureExtractorDigit
    elif ((sys.argv[1] == "faces") & (sys.argv[3] == "enhanced")):
        featureFunction = enhancedFeatureExtractorFace
    else:
        print "Unknown feature function:", sys.argv[2]
        return
    if (sys.argv[1] == "digits"):
        # if digits detect
        legalLabels = range(10)
    else:
        # if face detect
        legalLabels = range(2)
    if (sys.argv[2] == "mostfrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif (sys.argv[2] == "naivebayes"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    elif (sys.argv[2] == "perceptron"):
        classifier = perceptron.PerceptronClassifier(legalLabels)
    else:
        print "Unknown classifier:", sys.argv[2]
        return
    # Load data.  NOTE(review): face "validation" data is read from the
    # training files -- no separate face validation split is loaded here.
    numTraining = int(sys.argv[4])
    if (sys.argv[1] == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    util.pause()
    # This variant calls a 4-argument analysis(); printImage below is not a
    # local -- presumably a module-level function taking (features, w, h).
    analysis(classifier, guesses, testLabels, rawTestData)
    # do odds ratio computation if specified at command line
    if ((sys.argv[5] == "odds") & (len(sys.argv) == 8)):
        features_class1, features_class2, features_odds = classifier.findHighOddsFeatures(int(sys.argv[6]), int(sys.argv[7]))
        if (sys.argv[1] == "faces"):
            printImage(features_class1, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            printImage(features_class2, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            printImage(features_odds, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        else:
            printImage(features_class1, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            printImage(features_class2, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            printImage(features_odds, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
def runClassifier(args, options):
    """Train, validate and test a classifier on the face or digit data.

    args: dict with 'featureFunction', 'classifier' and 'printImage'.
    options: a dict here (not an optparse object); reads 'train', 'data',
             'odds', 'classifier', 'class1' and 'class2'.

    NOTE(review): face "validation" data is the first TEST_SET_SIZE entries
    of the *training* files -- no separate face validation split is loaded.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options['train']
    if (options['data'] == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    util.pause()
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    # (bitwise `&` on booleans; equivalent in value to `and` but no short-circuit)
    if ((options['odds']) & (options['classifier'] != "mostfrequent")):
        class1, class2 = options['class1'], options['class2']
        features_class1, features_class2, features_odds = classifier.findHighOddsFeatures(class1, class2)
        if (options['classifier'] == "naivebayes"):
            string1 = "=== Features with max P(F_i = 1 | class = %d) ===" % class1
            string2 = "=== Features with max P(F_i = 1 | class = %d) ===" % class2
            string3 = "=== Features with highest odd ratio of class %d over class %d ===" % (class1, class2)
        else:
            string1 = "=== Features with largest weight for class %d ===" % class1
            string2 = "=== Features with largest weight for class %d ===" % class2
            string3 = "=== Features with for which weight(class %d)-weight(class %d) is biggest ===" % (class1, class2)
        print string1
        printImage(features_class1)
        print string2
        printImage(features_class2)
        print string3
        printImage(features_odds)
def get_digit_acc2():
    """Measure one-vs-rest perceptron digit accuracy vs. training-set size.

    For each 10%-increment of the 5000-image digit training set, draws a
    random sample that contains every digit 0-9, trains one weight vector
    per digit, then prints "<percent> <training-seconds> <accuracy>" using
    demo_digits() for the accuracy figure.
    """
    images = samples.loadDataFile('digitdata/trainingimages', 5000, 28, 28)
    labels = samples.loadLabelsFile('digitdata/traininglabels', 5000)

    def convertlabelslist(digit, labelslist):
        # One-vs-rest target: True exactly where the label equals `digit`.
        return [label == digit for label in labelslist]

    # For each percentage 10%, 20%, ..., 100%
    for i in range(10):
        sample_percentage = (i + 1) / 10.0
        sample_size = int(math.floor(sample_percentage * float(len(labels))))
        # Reshuffle until the sample contains all 10 digits, so every
        # one-vs-rest classifier sees at least one positive example.
        while True:
            joined = list(zip(images, labels))
            random.shuffle(joined)
            images_sample, labels_sample = zip(*joined)
            images_sample = images_sample[:sample_size]
            labels_sample = labels_sample[:sample_size]
            if all(digit in labels_sample for digit in range(10)):
                break
        # One boolean target list per digit 0-9 (replaces ten copy-pasted
        # convertlabelslist calls).
        all_labels = [convertlabelslist(d, labels_sample) for d in range(10)]
        start = time.time()
        # The features do not depend on the digit being trained, so compute
        # them once (the original recomputed them inside the per-digit loop).
        featureslist = perceptron.compute_features2(images_sample)
        # Compute weight vectors for all digits 0-9
        all_weight_vectors = []
        for j in range(10):
            weights = perceptron.initialize_weights(28 * 28, 0)
            computed_weights = perceptron.compute_weights(weights, featureslist, all_labels[j])
            all_weight_vectors.append(computed_weights)
        elapsed = round(time.time() - start, 2)
        acc = demo_digits(all_weight_vectors)
        print(str((i + 1) * 10) + ' ' + str(elapsed) + ' ' + str(acc))
def runTask(task):
    """Grade a single numbered task.

    Tasks 2 and 5 are manually graded, task 1 is hidden, task 3 ungraded.
    Tasks 4 and 6 train a one-vs-rest perceptron (3 iterations) on dataset
    D1 (10 classes) or D2 (4 classes, task 6) and award 0-3 marks from the
    resulting test accuracy.  Task 6 additionally requires at most 5
    features per datum.
    """
    print("Grading task " + str(task))
    if task == 2 or task == 5:
        print(
            "This is a manually graded task, write your answers in the pdf file"
        )
    elif task == 1:
        print("The solution cannot be exposed to you now. :)")
    elif task == 3:
        print("Ungraded task")
    else:
        # Tasks 4 and 6: train and score a classifier.
        if task == 6:
            numTraining = 800
            numTest = 200
            num_classes = 4
        else:
            numTraining = dataClassifier.TRAIN_SET_SIZE
            numTest = dataClassifier.TEST_SET_SIZE
            num_classes = 10
        if task == 6:
            # D2 dataset with the enhanced feature extractor.
            rawTrainingData = samples.loadDataFile("data/D2/training_data", numTraining)
            trainingLabels = samples.loadLabelsFile("data/D2/training_labels", numTraining)
            rawTestData = samples.loadDataFile("data/D2/test_data", numTest)
            testLabels = samples.loadLabelsFile("data/D2/test_labels", numTest)
            featureFunction = dataClassifier.enhancedFeatureExtractorDigit
        else:
            # D1 dataset with the basic feature extractor.
            rawTrainingData = samples.loadDataFile("data/D1/training_data", numTraining)
            trainingLabels = samples.loadLabelsFile("data/D1/training_labels", numTraining)
            rawTestData = samples.loadDataFile("data/D1/test_data", numTest)
            testLabels = samples.loadLabelsFile("data/D1/test_labels", numTest)
            featureFunction = dataClassifier.basicFeatureExtractorDigit
        legalLabels = range(num_classes)
        classifier = perceptron1vr.Perceptron1vrClassifier(legalLabels, 3)
        # Extract features
        print("Extracting features...")
        trainingData = map(featureFunction, rawTrainingData)
        testData = map(featureFunction, rawTestData)
        # Conduct training and testing; the test set doubles as validation.
        print("Training...")
        classifier.train(trainingData, trainingLabels, testData, testLabels, False)
        print("Testing...")
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        acc = 100 * correct / (1.0 * (len(testLabels)))
        if task == 4:
            marks = 0
            if (acc > 70):
                marks = 3
            elif (acc > 60):
                marks = 2
            elif (acc > 50):
                marks = 1
            print("Received Marks : " + str(marks) + "/3")
        elif task == 6:
            marks = 0
            print(acc)
            # Marks only awarded when at most 5 features were used.
            if len(testData[0]) <= 5:
                if (acc > 85):
                    marks = 3
                elif (acc > 65):
                    marks = 2
                elif (acc > 45):
                    marks = 1
            else:
                print("More than permissible features used")
            print("Received Marks : " + str(marks) + "/3")
    # Separator printed for every task (graded or not).
    print("--------------------------------------------------------")
def runClassifier(args, options):
    """Train, validate and test a classifier, sizing the training set by a
    fraction rather than an absolute count.

    args: dict with 'featureFunction', 'classifier' and 'printImage'.
    options: parsed options; .training is a *factor* (0..1] of the full
             training set (451 faces / 5000 digits); .run enables the
             "automatic" mode that repeats train/test five times on fresh
             random-with-replacement samples; also reads .data, .odds,
             .classifier, .label1, .label2.

    NOTE(review): this file mixes `print "..."` statements with
    `print(a, b)` calls; under Python 2 the latter prints a tuple --
    presumably tolerated, but confirm the intended interpreter.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    trainingFactor = options.training
    print "training factor {}".format(trainingFactor)
    if options.data == "faces":
        # Shadows the module-level TEST_SET_SIZE locally for faces.
        TEST_SET_SIZE = 150
        numTraining = int(451 * trainingFactor)
        print "using {} datapoints out of {} ({}%) for faces".format(
            numTraining, 451, 100 * (numTraining / float(451)))
        # NOTE(review): face validation is read from the training files.
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
    else:
        TEST_SET_SIZE = 1000
        numTraining = int(5000 * trainingFactor)
        # NOTE(review): the "out of" figure prints TEST_SET_SIZE (1000),
        # not 5000 -- looks like a reporting slip, left as-is here.
        print "using {} datapoints out of {} ({}%) for digits".format(
            numTraining, TEST_SET_SIZE, 100 * (numTraining / float(5000)))
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    # Extract features
    print("Extracting features...")
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    automatic = options.run
    if automatic:
        # Five runs, each on a fresh random sample (with replacement) of
        # numTraining examples drawn from the full training set.
        outcomes = {}
        for iterator in range(5):
            print("Training...")
            if options.data == "faces":
                rawTrainingData = samples.loadDataFile("facedata/facedatatrain", 451, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
                trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", 451)
                indices = []
                for x in range(numTraining):
                    indices.append(random.randint(0, 450))
                randomTrainingData = []
                randomTrainingLabels = []
                for index in indices:
                    randomTrainingData.append(rawTrainingData[index])
                    randomTrainingLabels.append(trainingLabels[index])
            else:
                rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
                trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)
                indices = []
                for x in range(numTraining):
                    indices.append(random.randint(0, 4999))
                randomTrainingData = []
                randomTrainingLabels = []
                for index in indices:
                    randomTrainingData.append(rawTrainingData[index])
                    randomTrainingLabels.append(trainingLabels[index])
            trainingData = map(featureFunction, randomTrainingData)
            start = time.time()
            classifier.train(trainingData, randomTrainingLabels, validationData, validationLabels)
            print("Validating...")
            guesses = classifier.classify(validationData)
            correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
            print(str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
            print("Testing...")
            guesses = classifier.classify(testData)
            correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
            print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
            analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
            # The timed interval covers train + validate + test + analysis.
            interval = time.time() - start
            print "Training and testing time: " + str(interval)
            outcomes[str(iterator)] = [
                "Training and testing time: {}".format(interval),
                "accuracy of training: {}%".format((100.0 * correct / len(testLabels)))
            ]
        print "outcomes: {}".format(outcomes)
    else:
        # Single run on the (non-random) prefix loaded above.
        print("Training...")
        start = time.time()
        classifier.train(trainingData, trainingLabels, validationData, validationLabels)
        interval = time.time() - start
        print "Training time: " + str(interval)
        print("Validating...")
        guesses = classifier.classify(validationData)
        correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
        print(str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
        print("Testing...")
        guesses = classifier.classify(testData)
        correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
        print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
        analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    if options.odds & (options.classifier != "mostFrequent"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if options.classifier == "naiveBayes" or options.classifier == "nb":
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)
        print(string3)
        printImage(features_odds)
def runClassifier(args, options):
    """Train, validate and test a classifier on pacman game data or digits.

    args: dict with 'featureFunction', 'classifier', 'printImage'; for
          pacman also optional 'agentToClone', 'trainingData',
          'validationData' keys.
    options: parsed options; reads .training, .test, .data, .odds,
             .weights, .classifier, .label1 and .label2.

    For options.data == "pacman", saved-game paths are resolved from
    MAP_AGENT_TO_PATH_OF_SAVED_GAMES (falling back to the ContestAgent
    entries) and loaded with samples.loadPacmanData.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    if (options.data == "pacman"):
        agentToClone = args.get('agentToClone', None)
        # Paths for the requested agent, or (None, None, None) if unknown.
        trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get(agentToClone, (None, None, None))
        # Each path falls back to an explicit override in args, then to the
        # ContestAgent's saved games.
        trainingData = trainingData or args.get('trainingData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0]
        validationData = validationData or args.get('validationData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1]
        testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][2]
        rawTrainingData, trainingLabels = samples.loadPacmanData(trainingData, numTraining)
        rawValidationData, validationLabels = samples.loadPacmanData(validationData, numTest)
        rawTestData, testLabels = samples.loadPacmanData(testData, numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # do odds ratio computation if specified at command line
    # (bitwise `&` on booleans; value-equivalent to `and`, no short-circuit)
    if ((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            # Unreachable given the enclosing naiveBayes/nb guard above.
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)
        print string3
        printImage(features_odds)
    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def demo_digits(n_images, datapath, labelspath, flag):
    """Classify n_images digit images using previously-trained perceptron
    weights (one stored weight vector per digit 0-9) and print accuracy.

    n_images   -- number of images/labels to load
    datapath   -- path to the image data file (28x28 digits)
    labelspath -- path to the matching labels file
    flag       -- truthy: print one random (guess, actual) pair;
                  falsy: print the full (guess, label) results list

    For each image the dot product of its feature vector with each of the
    ten weight vectors is computed; the index of the highest-scoring vector
    is the guessed digit, which is compared against the true label.
    """
    start = time.time()

    # Load the best stored weight vector for each digit 0-9.
    weights_vectors = []
    base_path = './TrainDigitsResults/TrainingDigitsResults'
    for digit in range(0, 10):
        load_path = base_path + str(digit) + '/100_percent_digit_train.txt'
        # 'with' closes the file automatically; the explicit close() the
        # original carried inside the with-block was redundant.
        with open(load_path, 'r') as weights_file:
            weights_vectors.append(choose_best_weights(weights_file))

    images = samples.loadDataFile(datapath, n_images, 28, 28)
    labels = samples.loadLabelsFile(labelspath, n_images)
    featureslist = compute_features2(images)

    # Score every image against all ten weight vectors and record the
    # (guessed digit, true label) pair.
    results = []
    for image in range(len(images)):
        scores = []
        for vec_idx in range(len(weights_vectors)):
            total = float(0)
            for w in range(len(weights_vectors[vec_idx])):
                total += (weights_vectors[vec_idx][w] *
                          featureslist[image][w])
            scores.append(total)
        # Argmax over the ten scores = guessed digit.  (Renamed from the
        # original's 'max'/'index'/'sum' locals, which shadowed builtins.)
        best_score = float('-inf')
        guess = -1
        for z in range(10):
            if scores[z] > best_score:
                best_score = scores[z]
                guess = z
        results.append((guess, labels[image]))

    correctcount = float(0)
    for t in results:
        if t[0] == t[1]:
            correctcount += float(1)

    if flag:
        # BUGFIX: random.randint's upper bound is inclusive, so the original
        # randint(0, len(results)) could pick len(results) and raise
        # IndexError; randrange excludes the end point.
        rand = random.randrange(len(results))
        print('Guessed: ' + str(results[rand][0]) + ', Actual: ' +
              str(results[rand][1]) + ' (line ' + str(rand) +
              ' of digit testlabel)')
    else:
        print(results)

    print('Digits accuracy: ' +
          str(round((float(correctcount) * 100 / float(len(labels))), 1)) + '%')
    print('Time elapsed: ' + str(round(time.time() - start, 2)) + 's')
def run_digits_n_times():
    """Sweep the digit training-set size from 10% to 100% in 10% steps; for
    each size run 5 train/evaluate iterations, print per-size mean accuracy
    and standard deviation, and plot the mean accuracies."""
    all_final_stddevs = []
    all_final_accs = []
    colors = ['r', 'b', 'g', 'orange', 'k', 'c', 'm', 'y', 'grey', 'pink']  # NOTE(review): unused
    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # NOTE(review): title says "Face" but this function sweeps digit data -- confirm.
    ax1.set_title('Face Training Runtimes')
    images = samples.loadDataFile('digitdata/trainingimages', 5000, 28, 28)
    labels = samples.loadLabelsFile('digitdata/traininglabels', 5000)
    # For each percentage
    for v in range(1, 11):
        sample_percentage = v * 10
        acc_list = []
        # For all 5 iterations
        for j in range(0, 5):
            weights_vectors = []
            # For each digit 0-9: train a one-vs-rest perceptron weight vector.
            for y in range(0, 10):
                digit = y
                images_sample, labels_sample, visited = perceptron.sample_digits(
                    digit, sample_percentage, images, labels)
                # Featureslist = compute_features(functions_list, images_sample, typeflag)
                featureslist = perceptron.compute_features2(images_sample)
                # Weights = initialize_weights(len(functions_list), 0)
                weights = perceptron.initialize_weights(28 * 28, 0)
                ''' Run Perceptron / Learn weights '''
                start = time.time()
                final_weights = compute_weights(weights, featureslist,
                                                labels_sample)
                # NOTE(review): elapsed is computed but never used.
                elapsed = time.time() - start
                weights_vectors.append(final_weights)
            # Test all weights (weight vectors 0-9), add acc to acc list
            # NOTE(review): the demo_digits visible elsewhere in this file
            # takes 4 args and returns None -- verify which demo_digits is
            # actually in scope here.
            acc = demo_digits(weights_vectors)
            acc_list.append(acc)
        # For percentage sample size v, get the mean accuracy and standard
        # deviation accuracy over the 5 runs.
        mean = statistics.mean(acc_list)
        stddev = statistics.stdev(acc_list)
        print('Mean accuracy for sample percent-' + str(v) + ' ~ ' + str(mean))
        print('STD Dev for sample percent-' + str(v) + ' ~ ' + str(stddev))
        all_final_stddevs.append(stddev)
        all_final_accs.append(mean)
        print(stddev)
    ax1.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             all_final_accs,
             color='blue',
             marker='.',
             linestyle='--')
    #ax1.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], all_final_stddevs, color='red',marker='.', linestyle='--')
    plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [
        '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%'
    ])
    plt.xlabel('Training Data Sample Size')
    # NOTE(review): the plotted series is mean accuracy, but the label says
    # standard deviation -- confirm which was intended.
    plt.ylabel('Standard Deviation of Accuracies')
    # plt.legend(loc='lower right', title='Face Train Runtime');
    plt.show()
def runClassifier(args, options):
    """Load face or digit data, train the classifier while timing the
    training phase, report validation/test accuracy, and append timing and
    accuracy results to the "training_time" log file."""
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    trainingTime = []  # collected as strings of elapsed seconds
    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        # NOTE(review): validation data is drawn from the *training* images,
        # not a separate validation split -- confirm this is intentional.
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Timing log, appended to and closed at the end of the function.
    # NOTE(review): the handle leaks if an exception is raised before the
    # final f.close() -- consider a with-block.
    f = open("training_time", "a")
    f.write("Classifier: {}\n".format(options.classifier))
    f.write("Data: {}\n".format(options.data))
    f.write("{} training\n".format(options.training))
    f.write("{} testing\n".format(options.test))

    # Extract features
    print("Extracting features...")
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and Testing
    print("Training...")
    start_stamp = time.time()
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    end_stamp = time.time()
    trainingTime.append(str(end_stamp - start_stamp))
    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print("Testing...")
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
    # print str(guesses)
    ###############################################################################################################################
    accuracy = str(100.0 * correct / len(testLabels))

    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier == "naiveBayes" or
                          (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)
        #print (string3)
        printImage(features_odds)

    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)

    # NOTE(review): under Python 2 (which the 'print str(...)' statements
    # above imply) this parenthesized two-item print emits a tuple repr --
    # confirm whether that output format is intended.
    print("training time", trainingTime)
    f.write("{}\n".format(trainingTime))
    f.write("{}\n".format(accuracy))
    f.write("\n")
    f.close()
def runClassifier(args, options): classifier = args['classifier'] # Load data if options.data == "pacman": agentToClone = args.get('agentToClone', None) trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get( agentToClone, (None, None, None)) trainingData = trainingData or args.get( 'trainingData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0] validationData = validationData or args.get( 'validationData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1] testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES[ 'ContestAgent'][2] trainingData, trainingLabels = samples.loadPacmanData( trainingData, options.training) validationData, validationLabels = samples.loadPacmanData( validationData, options.validation) testData, testLabels = samples.loadPacmanData(testData, None) elif options.data == "digits": if options.training is None: options.training = 2000 if options.validation is None: options.validation = 1000 numTest = 1000 trainingData = samples.loadDataFile("digitdata/trainingimages", options.training, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", options.training) validationData = samples.loadDataFile("digitdata/validationimages", options.validation, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", options.validation) testData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest) else: raise ValueError('unrecognized dataset %r' % options.data) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels) print "Testing..." 
guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
def runClassifier(args, options):
    """Load face or digit data, featurize, train, and report validation and
    test accuracy; optionally display odds-ratio or high-weight features."""
    #print 'args: ', args
    #print 'options', options
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load data
    numTraining = options.training
    numTest = options.test
    if(options.data=="faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        # NOTE(review): validation is drawn from the training images, not a
        # separate validation split -- confirm this is intentional.
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    #print "Extracting features..."
    #print '#######type of rawTrainingData is', rawTrainingData.__class__ # list of Datum
    #print '#######type of rawTrainingData[0] is', rawTrainingData[0].__class__ # Datum
    trainingData = map(featureFunction, rawTrainingData)
    #print '#######type of trainingData is', trainingData.__class__ # list of Counter
    #print '#######type of trainingData[0] is', trainingData[0].__class__ # Counter
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    print 'length of guesses is %d' % len(guesses)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    # NOTE(review): '&' is the bitwise operator; it works on bools here but
    # does not short-circuit -- 'and' would be the conventional choice.
    if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1,label2)
        if(options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)
        print string3
        printImage(features_odds)

    if((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print ("=== Features with high weight for label %d ==="%l)
            printImage(features_weights)
def runClassifier(): global TK_ROOT, SP_CANVAS, LOG_X, LOG_Y # Set up variables according to the command line inputs featureFunction = basicFeatureExtractorDigit legalLabels = range(10) # number of labels # Select classifier classifier = perceptron.PerceptronClassifier(legalLabels) # Load data numTraining = 1 loadImage() rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT, 'train', SP_CANVAS) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawTestData = samples.loadDataFile("digitdata/testingimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT, 'test', SP_CANVAS) testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE) # Extract features print rawTestData trainingData = map(basicFeatureExtractorDigit, rawTrainingData) print "cp3" testData = map(basicFeatureExtractorDigit, rawTestData) # Conduct auto training SP_CANVAS.create_text(LOG_X, LOG_Y, text="Auto Training...", anchor=NW, font=tkFont.Font(size=-14)) LOG_Y += 15 classifier.train(trainingData, trainingLabels, SP_CANVAS) # Auto Testing # print "Validating..." # guesses = classifier.classify(validationData) # correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True) # print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) # User Input Testing SP_CANVAS.create_text(LOG_X, LOG_Y, text="Recognizing...", anchor=NW, font=tkFont.Font(size=-14)) LOG_Y += 15 guesses = classifier.classify(testData, SP_CANVAS, "usr") # Completion Notify SP_CANVAS.create_text(LOG_X, LOG_Y + 30, text="Completed...", anchor=NW, font=tkFont.Font(size=-14)) LOG_Y += 15
def runClassifier(args, options):
    """Two modes.  With options.random: sweep 10%..100% random subsets of the
    full training set, 5 runs each, printing per-size average accuracy,
    training time, and standard deviation, then exit.  Otherwise: a single
    train/validate/test run with optional odds-ratio / high-weight output."""
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Load the data for testing, training and validation
    if(options.random):
        # Fixed split sizes per dataset (faces vs digits).
        numberOfTestPoints = 150 if options.data=="faces" else 1000
        numberOfValidationPoints = 301 if options.data=="faces" else 1000
        totalTrainData = 451 if options.data=="faces" else 5000
        numValidation = numberOfValidationPoints
        numTest = numberOfTestPoints
        numTraining = totalTrainData
        if(options.data=="faces"):
            rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
            # NOTE(review): validation is drawn from the training images --
            # confirm this is intentional.
            rawValidationData = samples.loadDataFile("facedata/facedatatrain", numValidation,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numValidation)
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        else:
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
            rawValidationData = samples.loadDataFile("digitdata/validationimages", numValidation,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numValidation)
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        # Featurize all three splits once, up front.
        print ("Extracting features...")
        trainingData = []
        validationData = []
        testData = []
        for datum1 in rawTestData:
            k = featureFunction(datum1)
            testData.append(k)
        for datum1 in rawValidationData:
            k = featureFunction(datum1)
            validationData.append(k)
        for datum1 in rawTrainingData:
            k = featureFunction(datum1)
            trainingData.append(k)
        # NOTE(review): mixed parenthesized multi-arg prints below emit
        # tuple reprs under Python 2 (which this file's bare 'print'
        # statement implies) -- confirm intended interpreter version.
        for percent in range(1,11):
            accuracy = []
            times = []
            print("\n")
            for runCount in range(0,5):
                # Extract features
                print("======================================\n")
                print ("("+str(runCount+1)+")" + " Extracting random " + str((percent * 10)) + "% of the training data...")
                # Random subset of the training pool, sized percent*10%.
                numSubTraining = int((percent / 10.0) * totalTrainData)
                indexes = random.sample(range(0, totalTrainData), numSubTraining)
                subTrainingData = []
                subTrainingLabels = []
                for indx in indexes:
                    subTrainingData.append(trainingData[indx])
                    subTrainingLabels.append(trainingLabels[indx])
                # Conduct training and testing
                start = time.time()
                print ("("+str(runCount + 1)+")", "Training on", numSubTraining, "data points...")
                classifier.train(subTrainingData, subTrainingLabels, validationData, validationLabels)
                end = time.time()
                elapsed = end - start
                print ("("+str(runCount + 1)+")" + " Training completed in %0.4f second(s)" % elapsed)
                times.append(elapsed)
                # Validation
                print ("("+str(runCount+1)+")", "Validating...")
                guesses = classifier.classify(validationData)
                correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
                print ("("+str(runCount + 1)+") " + str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
                # Testing
                print ("("+str(runCount+1)+")", "Testing...")
                guesses = classifier.classify(testData)
                correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
                print ("("+str(runCount + 1)+") " + str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) + "\n")
                accuracy.append(100.0 * correct / len(testLabels))
            # Mean accuracy and mean training time over the 5 runs.
            averageAccuracy = 0
            avg_time = 0
            for q in range(0, len(accuracy)):
                averageAccuracy += accuracy[q]
                avg_time += times[q]
            averageAccuracy = averageAccuracy/len(accuracy)
            avg_time = avg_time/len(times)
            print("=================\n")
            print ("Average training time for", numSubTraining, "data points: %0.4f" % avg_time)
            print ("Average accuracy of " + str(percent * 10) + ("% data training: "), str(averageAccuracy))
            # Sample standard deviation (n-1 denominator) of the accuracies.
            std_dev = 0
            for a in accuracy:
                temp = a - averageAccuracy
                std_dev += (temp*temp)
            std_dev = std_dev / (len(accuracy) - 1)
            std_dev = math.sqrt(std_dev)
            print ("Standard deviation of accuracy: %0.4f" % std_dev)
            print
        # NOTE(review): exits with status 1 even on success -- exit code 0
        # would be the convention for a normal completion.
        sys.exit(1)
    else:
        numTraining = options.training
        numTest = options.test
        classifier.extra = True
        if(options.data=="faces"):
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
            rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
            rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
        else:
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
            rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
        # Extract features
        print ("Extracting features...")
        trainingData = []
        validationData = []
        testData = []
        for datum1 in rawTestData:
            k = featureFunction(datum1)
            testData.append(k)
        # trainingData = map(featureFunction, rawValidationData)
        for datum1 in rawValidationData:
            k = featureFunction(datum1)
            validationData.append(k)
        for datum1 in rawTrainingData:
            k = featureFunction(datum1)
            trainingData.append(k)
        # Conduct training and testing
        print ("Training...")
        classifier.train(trainingData, trainingLabels, validationData, validationLabels)
        print ("Validating...")
        guesses = classifier.classify(validationData)
        correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
        print (str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
        print ("Testing...")
        guesses = classifier.classify(testData)
        correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
        print (str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
        analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
        # HighOddsFeatures
        if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
            label1, label2 = options.label1, options.label2
            featOdds = classifier.findHighOddsFeatures(label1,label2)
            if(options.classifier == "naiveBayes" or options.classifier == "nb"):
                feats = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
            else:
                feats = "=== Features for which weight (label %d) - weight (label %d) is largest ===" % (label1, label2)
            print (feats)
            printImage(featOdds)
        if((options.weights) & (options.classifier == "perceptron")):
            for l in classifier.legalLabels:
                featWeights = classifier.findHighWeightFeatures(l)
                print ("=== Features with high weight for label %d ===" % l)
                printImage(featWeights)
def runClassifier(args, options):
    """Load face or digit data, train the classifier on a random 10% sample
    of the training set (timing the training phase), and report validation
    and test accuracy.

    args    -- dict with 'featureFunction', 'classifier', 'printImage'
    options -- parsed command-line options (data, training, test, odds,
               weights, classifier, label1, label2, ...)
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        # NOTE(review): validation is drawn from the training images, not a
        # separate validation split -- confirm this is intentional.
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print("Extracting features...")
    # randomly choose 10% percentage of the dataset from the training set
    # (sorted indices keep the sample in original order).
    percentage = 0.1
    # CLEANUP: dropped the redundant identity comprehension the original
    # wrapped around sorted(...).
    rand_sample = sorted(random.sample(range(numTraining),
                                       int(numTraining * percentage)))
    sample_rawTrainingData = [rawTrainingData[i] for i in rand_sample]
    sample_trainingLabels = [trainingLabels[i] for i in rand_sample]
    trainingData = list(map(featureFunction, sample_rawTrainingData))
    validationData = list(map(featureFunction, rawValidationData))
    testData = list(map(featureFunction, rawTestData))

    # Conduct training and testing
    print("Training...")
    start = timeit.default_timer()
    classifier.train(trainingData, sample_trainingLabels, validationData,
                     validationLabels)
    stop = timeit.default_timer()
    print("training time is: ", stop - start)
    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i]
               for i in range(len(validationLabels))].count(True)
    print(str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
    print("Testing...")
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print(str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels)))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    # BUGFIX: use logical 'and' instead of bitwise '&' on booleans --
    # '&' always evaluates both operands and has surprising precedence.
    if options.odds and (options.classifier == "naiveBayes" or
                         options.classifier == "nb"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)
        print(string3)
        printImage(features_odds)

    if options.weights and options.classifier == "perceptron":
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training numTest = options.test if(options.data=="pacman"): agentToClone = args.get('agentToClone', None) trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get(agentToClone, (None, None, None)) trainingData = trainingData or args.get('trainingData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0] validationData = validationData or args.get('validationData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1] testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][2] rawTrainingData, trainingLabels = samples.loadPacmanData(trainingData, numTraining) rawValidationData, validationLabels = samples.loadPacmanData(validationData, numTest) rawTestData, testLabels = samples.loadPacmanData(testData, numTest) else: rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest) rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest) # Extract features print "Extracting features..." trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels) print "Validating..." 
guesses = classifier.classify(validationData) correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True) print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) print "Testing..." guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) analysis(classifier, guesses, testLabels, testData, rawTestData, printImage) # do odds ratio computation if specified at command line if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ): label1, label2 = options.label1, options.label2 features_odds = classifier.findHighOddsFeatures(label1,label2) if(options.classifier == "naiveBayes" or options.classifier == "nb"): string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2) else: string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2) print string3 printImage(features_odds) if((options.weights) & (options.classifier == "perceptron")): for l in classifier.legalLabels: features_weights = classifier.findHighWeightFeatures(l) print ("=== Features with high weight for label %d ==="%l) printImage(features_weights)
def runClassifier(args, options): featureFunction = args['featureFunction'] classifier = args['classifier'] printImage = args['printImage'] # Load data numTraining = options.training # Extract features print "Extracting features..." if options.data=="faces": rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining) rawValidationData = samples.loadDataFile("facedata/facedatavalidation", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("facedata/facedatavalidationlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE) else: rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining) rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE) rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT) testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE) if options.classifier == "GDA" or options.classifier == "LR": import os.path if os.path.isfile(options.data + '_' + str(numTraining) + '_pca.np'): f = open(options.data + '_' + str(numTraining) + '_pca.np', 'rb') principleComponents, trainingData, validationData, testData = cPickle.load(f) f.close() else: if options.data == "faces": dimension = 13 principleComponents = getPrincipleComponents(map(featureFunction, samples.loadDataFile("facedata/facedatatrain",451,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)), dimension) trainingData = 
np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTrainingData)), principleComponents) validationData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawValidationData)), principleComponents) testData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)), principleComponents) else: dimension = 13 principleComponents = getPrincipleComponents(map(featureFunction, samples.loadDataFile("digitdata/trainingimages",5000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)), dimension) trainingData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTrainingData)), principleComponents) validationData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawValidationData)), principleComponents) testData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)), principleComponents) f = open(options.data + '_' + str(numTraining) + '_pca.np', 'wb') cPickle.dump((principleComponents, trainingData, validationData, testData), f) f.close() elif options.classifier == "GPC": trainingData = basicFeatureDataToNumpyArray(map(featureFunction, rawTrainingData)) validationData = basicFeatureDataToNumpyArray(map(featureFunction, rawValidationData)) testData = basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)) else: trainingData = map(featureFunction, rawTrainingData) validationData = map(featureFunction, rawValidationData) testData = map(featureFunction, rawTestData) # Conduct training and testing print "Training..." classifier.train(trainingData, trainingLabels, validationData, validationLabels) print "Validating..." guesses = classifier.classify(validationData) correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True) print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)) print "Testing..." 
guesses = classifier.classify(testData) correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True) print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)