def readDigitData(trainingSize=100, testSize=100):
    rootdata = "digitdata/"
    # loading digits data
    rawTrainingData = samples.loadDataFile(
        rootdata + "trainingimages", trainingSize, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT
    )
    trainingLabels = samples.loadLabelsFile(rootdata + "traininglabels", trainingSize)
    rawValidationData = samples.loadDataFile(
        rootdata + "validationimages", TEST_SET_SIZE, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT
    )
    validationLabels = samples.loadLabelsFile(rootdata + "validationlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("digitdata/testimages", testSize, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", testSize)
    try:
        print "Extracting features..."
        featureFunction = dataClassifier.basicFeatureExtractorDigit
        trainingData = map(featureFunction, rawTrainingData)
        validationData = map(featureFunction, rawValidationData)
        testData = map(featureFunction, rawTestData)
    except:
        display("An exception was raised while extracting basic features: \n %s" % getExceptionTraceBack())
    return (
        trainingData,
        trainingLabels,
        validationData,
        validationLabels,
        rawTrainingData,
        rawValidationData,
        testData,
        testLabels,
        rawTestData,
    )
# 示例#2 (example 2 — separator from the scraped snippet collection)
# 0
def get_neuron_test_data():
    """Load 1000 test digits, featurize them, and build a boolean label
    vector that is True exactly where the digit is a 3."""
    raw_images = samples.loadDataFile("digitdata/testimages", 1000, 28, 28)
    raw_labels = samples.loadLabelsFile("digitdata/testlabels", 1000)
    is_three = np.array(raw_labels) == 3
    featurized = np.array(map(dcu.simple_image_featurization, raw_images))
    return raw_images, featurized, is_three
# 示例#3 (example 3 — separator from the scraped snippet collection)
# 0
def get_neuron_training_data():
    """Load num_train_examples training digits (module-level count),
    featurize them, and build a boolean label vector that is True exactly
    where the digit is a 3."""
    raw_images = samples.loadDataFile("digitdata/trainingimages", num_train_examples, 28, 28)
    raw_labels = samples.loadLabelsFile("digitdata/traininglabels", num_train_examples)
    is_three = np.array(raw_labels) == 3
    featurized = np.array(map(dcu.simple_image_featurization, raw_images))
    return raw_images, featurized, is_three
def runClassifier(args, options):
  """Load the face or digit data set (randomly subsampled to the requested
  sizes), extract features -- HOG for the linear SVM, 'enhanced' features
  otherwise -- then train (timed), validate and test the classifier.

  args: dict with 'classifier' and 'printImage'.
  options: parsed command-line options (data, classifier, training, test).
  """
  classifier = args['classifier']
  printImage = args['printImage']
  # Load data
  numTraining = options.training
  numTest = options.test
  if(options.data=="faces"):
    print "loading face data set"
    # NOTE(review): unlike the other loaders in this file, these calls pass
    # no sample-count argument before width/height -- confirm this matches
    # the samples.loadDataFile/loadLabelsFile signatures used here.
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain",FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels")
    rawValidationData = samples.loadDataFile("facedata/facedatavalidation",FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("facedata/facedatavalidationlabels")
    rawTestData = samples.loadDataFile("facedata/facedatatest", FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("facedata/facedatatestlabels")
    # Randomly subsample train/test down to the requested sizes.
    rawTrainingData,trainingLabels=randomSample(rawTrainingData,trainingLabels,numTraining)
    rawTestData,testLabels=randomSample(rawTestData,testLabels,numTest)
  else:
    print "loading digit data set"
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages",DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels")
    rawValidationData = samples.loadDataFile("digitdata/validationimages",DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels")
    rawTestData = samples.loadDataFile("digitdata/testimages",DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels")
    rawTrainingData, trainingLabels = randomSample(rawTrainingData, trainingLabels, numTraining)
    rawTestData, testLabels = randomSample(rawTestData, testLabels, numTest)
  print "Extracting features..."
  if (options.classifier == "linear_svm"):
        # HOG feature vectors, stacked column-wise via transpose --
        # presumably the layout the linear SVM implementation expects.
        if (options.data == "faces"):
            featureFunction = HogFeatureFaceImg
        else:
            featureFunction=HogFeatureImgDigit
        trainingData = map(featureFunction, rawTrainingData)
        trainingData=np.array(trainingData).transpose()
        validationData=map(featureFunction, rawValidationData)
        validationData = np.array(validationData).transpose()
        testData=map(featureFunction, rawTestData)
        testData = np.array(testData).transpose()
  else:
      if (options.data == "faces"):
          featureFunction = enhancedFeatureExtractorFace
      else:
          featureFunction = enhancedFeatureExtractorDigit
      trainingData = map(featureFunction, rawTrainingData)
      validationData = map(featureFunction, rawValidationData)
      testData = map(featureFunction, rawTestData)
  print "Training..."
  # Time the training phase only.
  start = timeit.default_timer()
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  stop = timeit.default_timer()
  print  stop - start, " s"
  print "Validating..."
  guesses = classifier.classify(validationData)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
# 示例#5 (example 5 — separator from the scraped snippet collection)
# 0
def runClassifier(args, options):

  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options.training

  if(options.data=="faces"):
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
    rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
  else:
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    
  
  # Extract features
  print "Extracting features..."
  trainingData = map(featureFunction, rawTrainingData)
  validationData = map(featureFunction, rawValidationData)
  testData = map(featureFunction, rawTestData)
  
  # Conduct training and testing
  print "Training..."
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print "Validating..."
  guesses = classifier.classify(validationData)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
  
  # do odds ratio computation if specified at command line
  if((options.odds) & (options.classifier != "mostFrequent")):
    label1, label2 = options.label1, options.label2
    features_odds = classifier.findHighOddsFeatures(label1,label2)
    if(options.classifier == "naiveBayes" or options.classifier == "nb"):
      string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
    else:
      string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)    
      
    print string3
    printImage(features_odds)
# 示例#6 (example 6 — separator from the scraped snippet collection)
# 0
def runClassifier(args, options):

  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options.training

  if(options.data=="faces"):
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
    rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
  else:
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    
  
  # Extract features
  print "Extracting features..."
  trainingData = map(featureFunction, rawTrainingData)
  validationData = map(featureFunction, rawValidationData)
  testData = map(featureFunction, rawTestData)
  
  # Conduct training and testing
  print "Training..."
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print "Validating..."
  guesses = classifier.classify(validationData)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
# 示例#7 (example 7 — separator from the scraped snippet collection)
# 0
def run_digits_n_times(digit, iterations, sample_percentage, trainingpath,
                       labelspath):
    """Train a one-vs-rest perceptron for `digit` `iterations` times, each
    time on a fresh random sample of `sample_percentage` of the training
    set; validate, log, and persist the result of every run.

    digit: the target digit (0-9).
    iterations: number of independent training runs.
    sample_percentage: fraction of the training set to sample per run.
    trainingpath, labelspath: files for samples.loadDataFile/loadLabelsFile.
    """
    all_final_weights = []
    all_final_accuracies = []

    # Load the full digit training set (5000 28x28 images).
    # BUG FIX: this previously read "n_images == 5000" -- a comparison whose
    # result was discarded -- leaving n_images undefined (NameError below).
    n_images = 5000
    images = samples.loadDataFile(trainingpath, n_images, 28, 28)
    labels = samples.loadLabelsFile(labelspath, n_images)

    # Compute weights / run the perceptron n times, n = iterations.
    for a in range(iterations):
        images_sample, labels_sample, visited = sample_digits(
            digit, sample_percentage, images, labels)
        featureslist = compute_features2(images_sample)
        # One weight per pixel, initialised to zero.
        weights = initialize_weights(28 * 28, 0)

        # Run the perceptron, timing the training.
        start = time.time()
        final_weights = compute_weights(weights, featureslist, labels_sample)
        elapsed = time.time() - start

        # Validate the learned weights for this digit.
        accuracy = validate_weights(digit, final_weights)

        print(
            str(digit) + ': ' + str(elapsed) + ' ~~ ' + 'sample percent: ' +
            str(sample_percentage) + ' ~~ ' + str(accuracy) + '%')
        basepath = './TrainingDigitsResults120/TrainingDigitsResults' + str(
            digit) + '/' + str(sample_percentage) + '_percent.txt'
        # (renamed handle: the original shadowed the builtin `file`)
        with open(basepath, 'w') as out:
            # NOTE(review): this writes `weights`, not `final_weights`.  If
            # compute_weights does not mutate its argument in place, this
            # records the untrained zero weights -- verify.
            for weight in range(len(weights)):
                if weight == len(weights) - 1:
                    out.write(str(weights[weight]) + '\n')
                else:
                    out.write(str(weights[weight]) + ' ')
            out.write(
                str(round(elapsed, 2)) + 's' + ' ' +
                str(round(float(accuracy) / float(100), 2)))

        # Record computed weights and accuracy for this training iteration.
        all_final_weights.append(final_weights)
        all_final_accuracies.append(accuracy)
    # (removed: a dangling unterminated ''' that made the original a
    # syntax error past this point)
# 示例#8 (example 8 — separator from the scraped snippet collection)
# 0
def validate_digits120():
    """Score the saved 100%-sample perceptron weights for digits 0-9 on the
    1000-image validation set and print the overall accuracy."""
    weights_vectors = []
    base_path = './TrainingDigitsResults120/TrainingDigitsResults'

    # Load the stored weight vector for each digit 0-9 (first line of each
    # per-digit results file).
    for i in range(0, 10):
        load_path = base_path + str(i) + '/100_percent.txt'
        # (fixed) the original shadowed the builtin `file` and also called
        # close() inside the `with` block, which already closes the handle.
        with open(load_path, 'r') as fh:
            lines = fh.readlines()
            weightsstrings = lines[0].split()
            weights = [float(weight) for weight in weightsstrings]
            print(weights)
            weights_vectors.append(weights)

    images = samples.loadDataFile('digitdata/validationimages', 1000, 28, 28)
    labels = samples.loadLabelsFile('digitdata/validationlabels', 1000)
    featureslist = compute_features2(images)

    # For each image, take the dot product of its feature vector with every
    # digit's weight vector; the index of the highest score is the guessed
    # digit.  Record (guess, label) pairs.
    results = []
    for image in range(len(images)):
        sums = []
        for weightsvector in range(len(weights_vectors)):
            # (fixed) the original shadowed the builtin `sum`.
            total = float(0)
            for weight in range(len(weights_vectors[weightsvector])):
                total += (weights_vectors[weightsvector][weight] *
                          featureslist[image][weight])
            sums.append(total)

        # Argmax over the ten per-digit scores.
        # (fixed) the original shadowed the builtin `max`.
        best = float('-inf')
        index = -1
        for z in range(10):
            if sums[z] > best:
                best = sums[z]
                index = z

        # Keep track of all guesses vs. labels as (guess, label) tuples.
        results.append((index, labels[image]))

    correctcount = float(0)
    for t in results:
        if t[0] == t[1]:
            correctcount += float(1)

    print('Digits accuracy: ' +
          str(round((float(correctcount) * 100 / float(len(labels))), 1)) +
          '%')
def runClassifier(args, options):
    featureFunction = args['featureFunction']
    classifier = args['classifier']

    # Load data
    dataset = options.dataset
    numTraining = options.training
    numTest = options.test

    if dataset == 'd1':
        rawTrainingData = samples.loadDataFile("data/D1/training_data",
                                               numTraining)
        trainingLabels = samples.loadLabelsFile("data/D1/training_labels",
                                                numTraining)
        rawTestData = samples.loadDataFile("data/D1/test_data", numTest)
        testLabels = samples.loadLabelsFile("data/D1/test_labels", numTest)

    else:
        rawTrainingData = samples.loadDataFile("data/D2/training_data",
                                               numTraining)
        trainingLabels = samples.loadLabelsFile("data/D2/training_labels",
                                                numTraining)
        rawTestData = samples.loadDataFile("data/D2/test_data", numTest)
        testLabels = samples.loadLabelsFile("data/D2/test_labels", numTest)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, testData, testLabels,
                     options.validate)

    guesses = classifier.classify(trainingData)
    correct = [
        guesses[i] == trainingLabels[i] for i in range(len(trainingLabels))
    ].count(True)

    if (options.classifier == "1vr"):
        f = open("perceptron1vr_train.csv", "a")
        f.write(
            str(len(trainingData)) + "," +
            str(100 * correct / (1.0 * (len(trainingData)))) + '\n')
        f.close()

    print "Testing..."
    guesses = classifier.classify(testData)
    # for i in range(len(testLabels)):
    #     if guesses[i] != testLabels[i]:
    #         print trainingData[i], guesses[i], testLabels[i]
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))

    if (options.classifier == "1vr"):
        f = open("perceptron1vr_test.csv", "a")
        f.write(
            str(len(trainingData)) + "," + str(100 * correct /
                                               (1.0 * (len(testData)))) + '\n')
        f.close()
# 示例#10 (example 10 — separator from the scraped snippet collection)
# 0
import minicontest
import samples
import sys
import util
import pickle
from dataClassifier import DIGIT_DATUM_HEIGHT, DIGIT_DATUM_WIDTH, contestFeatureExtractorDigit

# Number of test images to load.
TEST_SIZE = 1000

# Destination for pickled minicontest predictions.
# NOTE(review): not referenced in the visible part of this script --
# presumably used by the truncated remainder.
MINICONTEST_PATH = "minicontest_output.pickle"

if __name__ == '__main__':
    # Driver script for the minicontest classifier: load digit data, build
    # the classifier, and extract contest features.
    print "Loading training data"
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", 100,
                                             DIGIT_DATUM_WIDTH,
                                             DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                              100)
    rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SIZE,
                                       DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)

    featureFunction = contestFeatureExtractorDigit
    legalLabels = range(10)
    classifier = minicontest.contestClassifier(legalLabels)

    # NOTE(review): the snippet appears truncated here -- the feature
    # extraction announced below has no visible follow-up.
    print "Extracting features..."
# 示例#11 (example 11 — separator from the scraped snippet collection)
# 0
def runClassifier(args, options):
    """Train the classifier on growing fractions (10%, 20%, ..., 100%) of
    the training set, reporting validation/test accuracy and timing for
    each fraction; optionally print odds-ratio or high-weight features.

    args: dict with 'featureFunction', 'classifier' and 'printImage'.
    options: parsed command-line options (data, classifier, training, test,
        odds, weights, label1, label2).
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print("Extracting features...")
    # BUG FIX: under Python 3 (which this snippet targets, given its
    # print() calls), map() returns a one-shot iterator; the loop below
    # copies, del-slices and re-classifies these, so materialise lists.
    trainingData = list(map(featureFunction, rawTrainingData))
    validationData = list(map(featureFunction, rawValidationData))
    testData = list(map(featureFunction, rawTestData))

    # Conduct training and testing on 10%, 20%, ..., 100% of the data.
    copydata = copy(trainingData)
    copylabel = copy(trainingLabels)
    begin = time.time()
    for percentageOfData in range(10):
        start = time.time()
        trainingData = copy(copydata)
        trainingLabels = copy(copylabel)
        # Keep only the first (percentageOfData+1)*10% training examples.
        # BUG FIX: "/" yields a float under Python 3, and slice indices must
        # be integers -- use floor division.
        cutoff = numTraining // 10 * (percentageOfData + 1)
        del trainingData[cutoff:numTraining]
        del trainingLabels[cutoff:numTraining]
        print("Training: %d " % cutoff)
        classifier.train(trainingData, trainingLabels, validationData,
                         validationLabels)
        print("Validating...")
        guesses = classifier.classify(validationData)
        correct = [
            guesses[i] == validationLabels[i]
            for i in range(len(validationLabels))
        ].count(True)
        print(str(correct),
              ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") %
              (100.0 * correct / len(validationLabels)))
        print("Testing...")
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        print(str(correct),
              ("correct out of " + str(len(testLabels)) + " (%.1f%%).") %
              (100.0 * correct / len(testLabels)))
        elapsed_time_fl = (time.time() - start)
        print("%d secs" % (elapsed_time_fl))
    totaltime = (time.time() - begin)
    print("Total time: %d" % (totaltime))

    #analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    # BUG FIX: was "(options.odds) & (...)" -- a non-short-circuiting
    # bitwise AND; "and" is the correct boolean connective here.
    if (options.odds and (options.classifier == "naiveBayes" or
                          (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print(string3)
        printImage(features_odds)

    if (options.weights and (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
# 示例#12 (example 12 — separator from the scraped snippet collection)
# 0
import numpy as np

def writeLabeledData(prefix, labeled_data):
    """Write (datum, label) pairs to two parallel text files,
    <prefix>images and <prefix>labels, one str()-formatted entry per line.

    prefix: path prefix for the two output files.
    labeled_data: non-empty iterable of (datum, label) pairs (zip(*...) on
        an empty list would raise ValueError on unpacking).
    """
    datums, labels = zip(*labeled_data)

    # (fixed) the explicit f.close() calls inside the `with` blocks were
    # redundant: the context manager already closes each file.
    with open(prefix + "images", 'w') as f:
        for datum in datums:
            f.write(str(datum) + "\n")

    with open(prefix + "labels", 'w') as f:
        for label in labels:
            f.write(str(label) + "\n")

# Load the full digit corpus: 5000 training, 1000 validation and 1000 test
# images with their labels.
rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)
rawValidationData = samples.loadDataFile("digitdata/validationimages", 1000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
validationLabels = samples.loadLabelsFile("digitdata/validationlabels", 1000)
rawTestData = samples.loadDataFile("digitdata/testimages", 1000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
testLabels = samples.loadLabelsFile("digitdata/testlabels", 1000)


# Pool all 7000 examples into one labeled collection.
all_data = rawTrainingData + rawValidationData + rawTestData
all_labels = trainingLabels + validationLabels + testLabels

# NOTE(review): len(labeled_data) below requires zip() to return a list,
# i.e. Python 2 semantics; under Python 3 this needs list(zip(...)) --
# confirm the target interpreter.
labeled_data = zip(all_data, all_labels)

# Random ordering used to shuffle data and labels together.
perm = np.random.permutation(len(labeled_data))

permuted_data = []
# 示例#13 (example 13 — separator from the scraped snippet collection)
# 0
def runClassifier(args, options):
    """Train on numTraining digits, then report accuracy on the validation
    set, the test set, and the complete 5000-image training corpus;
    finally run the error analysis and, for the perceptron, print its
    highest-weight features per label.

    args: dict with 'featureFunction', 'classifier' and 'printImage'.
    options: parsed command-line options (training, test, classifier).
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    rawTrainingData = samples.loadDataFile("data/digitdata/trainingimages",
                                           numTraining, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("data/digitdata/traininglabels",
                                            numTraining)
    # The full training corpus is loaded separately so training-set
    # accuracy can be measured on all 5000 examples regardless of
    # numTraining.
    completeRawTrainingData = samples.loadDataFile(
        "data/digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH,
        DIGIT_DATUM_HEIGHT)
    completeTrainingLabels = samples.loadLabelsFile(
        "data/digitdata/traininglabels", 5000)
    rawValidationData = samples.loadDataFile("data/digitdata/validationimages",
                                             numTest, DIGIT_DATUM_WIDTH,
                                             DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile(
        "data/digitdata/validationlabels", numTest)
    rawTestData = samples.loadDataFile("data/digitdata/testimages", numTest,
                                       DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("data/digitdata/testlabels", numTest)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    completeTrainingData = map(featureFunction, completeRawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    print "Testing training data..."
    guesses = classifier.classify(completeTrainingData)
    correct = [
        guesses[i] == completeTrainingLabels[i]
        for i in range(len(completeTrainingLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(completeTrainingLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(completeTrainingLabels))

    # NOTE(review): `guesses` now holds training-set predictions, yet the
    # analysis below pairs them with testLabels/testData -- verify this is
    # intended rather than a leftover from reordering.
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)

    if ((options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options):
    """Train, validate and test a classifier on the faces or digits dataset.

    args:    dict with 'featureFunction', 'classifier' and 'printImage'
             (a callable that renders a feature map for display).
    options: parsed command-line options; this function reads .data,
             .training, .test, .classifier, .odds, .label1, .label2
             and .weights.

    Side effects: loads data files from facedata/ or digitdata/, prints
    timing and accuracy reports, and optionally prints odds-ratio and
    high-weight feature images.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if options.data == "faces":
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        # NOTE: validation data is drawn from the *training* images here;
        # there is no separate face validation set.
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        # NOTE: validation also re-reads the training images/labels here
        # (this version does not use digitdata/validationimages).
        rawValidationData = samples.loadDataFile("digitdata/trainingimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features. List comprehensions replace both the old
    # commented-out Python 2 map() calls and the manual append loops;
    # unlike Python 3's map() they produce real, re-iterable lists.
    print("Extracting features...")
    trainingData = [featureFunction(d) for d in rawTrainingData]
    validationData = [featureFunction(d) for d in rawValidationData]
    testData = [featureFunction(d) for d in rawTestData]

    # Conduct training and testing
    print("Training...")
    trainTimeStart = time.time()
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print("Training completed in %s seconds." % (time.time() - trainTimeStart))

    print("Validating...")
    guesses = classifier.classify(validationData)
    # sum() over booleans counts the matches (True == 1).
    correct = sum(guesses[i] == validationLabels[i]
                  for i in range(len(validationLabels)))
    print(str(correct),
          ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") %
          (100.0 * correct / len(validationLabels)))

    print("Testing...")
    guesses = classifier.classify(testData)
    correct = sum(guesses[i] == testLabels[i]
                  for i in range(len(testLabels)))
    print(str(correct),
          ("correct out of " + str(len(testLabels)) + " (%.1f%%).") %
          (100.0 * correct / len(testLabels)))
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)

    # do odds ratio computation if specified at command line.
    # 'and' replaces the old bitwise '&', which only worked by accident
    # on bools and does not short-circuit.
    if options.odds and (options.classifier == "naiveBayes" or
                         options.classifier == "nb"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if options.classifier == "naiveBayes" or options.classifier == "nb":
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print(string3)
        printImage(features_odds)

    # Visualise high-weight features per label for the perceptron.
    if options.weights and options.classifier == "perceptron":
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
# ===== Example #15 (scraped-sample separator; original marker "示例#15", score 0) =====
def runClassifier(args, options):
    """Train and evaluate a classifier on the faces or digits dataset.

    Two modes:
      * options.specialMode: learning-curve experiment -- train on random
        10%..100% subsets of the full training set, four runs per subset
        size, reporting mean accuracy, mean training time and the sample
        standard deviation; then exit.
      * normal mode: a single train / validate / test pass, plus the
        optional odds-ratio and perceptron-weight visualisations.

    args:    dict with 'featureFunction', 'classifier' and 'printImage'.
    options: parsed command-line options (.data, .training, .test,
             .specialMode, .classifier, .k_number_of_neighbors,
             .analysis, .odds, .label1, .label2, .weights).
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    if options.specialMode:
        # Fixed dataset sizes for the learning-curve experiment.
        numberOfTestPoints = 150 if options.data == "faces" else 1000
        numberOfValidationPoints = 301 if options.data == "faces" else 1000
        totalTrainData = 451 if options.data == "faces" else 5000
        numValidation = numberOfValidationPoints
        numTest = numberOfTestPoints
        numTraining = totalTrainData

        # Load Test Data Set
        if options.data == "faces":
            rawTestData = samples.loadDataFile("facedata/facedatatest",
                                               numTest, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                                numTest)
            rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                     numValidation,
                                                     FACE_DATUM_WIDTH,
                                                     FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numValidation)
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                                   numTraining,
                                                   FACE_DATUM_WIDTH,
                                                   FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numTraining)
        else:
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                               DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                                numTest)
            rawValidationData = samples.loadDataFile(
                "digitdata/validationimages", numValidation, DIGIT_DATUM_WIDTH,
                DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "digitdata/validationlabels", numValidation)
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                                   numTraining,
                                                   DIGIT_DATUM_WIDTH,
                                                   DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                    numTraining)

        print("Extracting features...")
        # BUG FIX: in Python 3, map() returns a one-shot iterator that
        # cannot be indexed; the subset sampling below indexes
        # trainingData[indx], so materialize real lists instead.
        testData = [featureFunction(datum) for datum in rawTestData]
        validationData = [featureFunction(datum) for datum in rawValidationData]
        trainingData = [featureFunction(datum) for datum in rawTrainingData]

        for percent in range(1, 11):
            acc = []
            aTime = []
            for runCount in range(0, 4):
                print("======================================")
                print("(" + str(runCount) + ")", "Building random",
                      (percent * 10), " percent of Training Data...")
                # Sample percent*10% of the training indices without
                # replacement.
                numSubTraining = int((percent / 10.0) * totalTrainData)
                indexes = random.sample(range(0, totalTrainData),
                                        numSubTraining)
                subTrainingData = []
                subTrainingLabels = []

                for indx in indexes:
                    subTrainingData.append(trainingData[indx])
                    subTrainingLabels.append(trainingLabels[indx])

                # Validation is deliberately skipped in this mode.
                validationData = []
                validationLabels = []

                # Conduct training and testing
                start = time.time()
                print("(" + str(runCount) + ")", "Training", numSubTraining,
                      "points ...")
                if options.classifier == 'nearestNeighbors':
                    # NOTE: kNN ignores the random subset and stores the
                    # full training set, matching the original behaviour.
                    classifier.train(trainingData, trainingLabels, testData,
                                     testLabels, options.k_number_of_neighbors)
                else:
                    classifier.train(subTrainingData, subTrainingLabels,
                                     validationData, validationLabels)
                elapsed = time.time() - start
                print("Elapsed Time:", elapsed)
                aTime.append(elapsed)

                print("(" + str(runCount) + ")", "Testing...")
                guesses = classifier.classify(testData)
                correct = sum(guesses[i] == testLabels[i]
                              for i in range(len(testLabels)))
                print(
                    str(correct),
                    ("correct out of " + str(len(testLabels)) + " (%.1f%%).") %
                    (100.0 * correct / len(testLabels)))
                acc.append(100.0 * correct / len(testLabels))

            # Aggregate the four runs: mean accuracy, mean training time,
            # and the sample standard deviation (n - 1 denominator).
            mean = sum(acc) / len(acc)
            avgT = sum(aTime) / len(aTime)
            print("---------------")
            print("Average training time for", numSubTraining, "data points: ",
                  avgT)
            print("Average accuracy of", (percent * 10),
                  "percent data training: ", str(mean))
            sd = 0
            for a in acc:
                tmp = a - mean
                sd += (tmp * tmp)
            sd = math.sqrt(sd / (len(acc) - 1))
            print("Standard Derivation in accuracy:", sd)

        # NOTE(review): exits with status 1 even on success; kept for
        # backward compatibility with anything that checks this code.
        sys.exit(1)

    else:
        numTraining = options.training
        numTest = options.test

        if options.data == "faces":
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                                   numTraining,
                                                   FACE_DATUM_WIDTH,
                                                   FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numTraining)
            # Validation is drawn from the training images (no separate
            # face validation set).
            rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                     numTest, FACE_DATUM_WIDTH,
                                                     FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "facedata/facedatatrainlabels", numTest)
            rawTestData = samples.loadDataFile("facedata/facedatatest",
                                               numTest, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                                numTest)
        else:
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                                   numTraining,
                                                   DIGIT_DATUM_WIDTH,
                                                   DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                    numTraining)
            rawValidationData = samples.loadDataFile(
                "digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH,
                DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile(
                "digitdata/validationlabels", numTest)
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                               DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                                numTest)

        # Extract features
        print("Extracting features...")
        # BUG FIX: materialize lists -- Python 3's map() is a one-shot
        # iterator that would be exhausted after the first pass over it.
        trainingData = [featureFunction(datum) for datum in rawTrainingData]
        validationData = [featureFunction(datum) for datum in rawValidationData]
        testData = [featureFunction(datum) for datum in rawTestData]

        # Conduct training and testing
        print("Training...")
        if options.classifier == 'nearestNeighbors':
            classifier.train(trainingData, trainingLabels, testData,
                             testLabels, options.k_number_of_neighbors)
        else:
            classifier.train(trainingData, trainingLabels, validationData,
                             validationLabels)

        guesses = None  # set only after a classify pass below
        if options.classifier != 'nearestNeighbors':
            print("Validating...")
            guesses = classifier.classify(validationData)
            correct = sum(guesses[i] == validationLabels[i]
                          for i in range(len(validationLabels)))
            print(str(correct),
                  ("correct out of " + str(len(validationLabels)) +
                   " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
            print("Testing...")
            guesses = classifier.classify(testData)
            correct = sum(guesses[i] == testLabels[i]
                          for i in range(len(testLabels)))
            print(str(correct),
                  ("correct out of " + str(len(testLabels)) + " (%.1f%%).") %
                  (100.0 * correct / len(testLabels)))
            analysis(classifier, guesses, testLabels, testData, rawTestData,
                     printImage)

        # BUG FIX: guard against the NameError the original raised when
        # --analysis was combined with nearestNeighbors ('guesses' was
        # never assigned on that path).
        if options.analysis and guesses is not None:
            analysis(classifier, guesses, testLabels, testData, rawTestData,
                     printImage)

        # do odds ratio computation if specified at command line.
        # 'and' replaces the accidental bitwise '&'.
        if options.odds and (options.classifier == "naiveBayes" or
                             options.classifier == "nb"):
            label1, label2 = options.label1, options.label2
            features_odds = classifier.findHighOddsFeatures(label1, label2)
            if options.classifier == "naiveBayes" or options.classifier == "nb":
                string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                    label1, label2)
            else:
                string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                    label1, label2)

            print(string3)
            printImage(features_odds)

        if options.weights and options.classifier == "perceptron":
            for l in classifier.legalLabels:
                features_weights = classifier.findHighWeightFeatures(l)
                print("=== Features with high weight for label %d ===" % l)
                printImage(features_weights)
# ===== Example #16 (scraped-sample separator; original marker "示例#16", score 0) =====
def runClassifier(args, options):
  """Load a dataset, train the classifier, then report validation and
  test accuracy (Python 2 print-statement version).

  args:    dict with 'featureFunction', 'classifier' and 'printImage'.
  options: parsed command-line options; reads .data, .training, .test,
           .classifier, .odds, .label1, .label2 and .weights.
  """
  #print 'args: ', args
  #print 'options', options
  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options.training
  numTest = options.test

  if(options.data=="faces"):
    # NOTE: validation data is drawn from the training images -- there
    # is no separate face validation file.
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
    rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
    rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
  else:
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
    rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    
  
  # Extract features
  #print "Extracting features..."
  #print '#######type of rawTrainingData is', rawTrainingData.__class__ # list of Datum
  #print '#######type of rawTrainingData[0] is', rawTrainingData[0].__class__ # Datum
  # map() returns a plain list under Python 2 (which the print
  # statements below require), so the results are re-iterable.
  trainingData = map(featureFunction, rawTrainingData)
  #print '#######type of trainingData is', trainingData.__class__ # list of Counter
  #print '#######type of trainingData[0] is', trainingData[0].__class__ # Counter
  validationData = map(featureFunction, rawValidationData)
  testData = map(featureFunction, rawTestData)
  
  # Conduct training and testing
  print "Training..."
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print "Validating..."
  guesses = classifier.classify(validationData)
  # Count the positions where the guess matches the gold label.
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
  
  # do odds ratio computation if specified at command line
  # NOTE(review): '&' is bitwise, not logical; it works here only
  # because both operands evaluate to booleans.
  if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
    label1, label2 = options.label1, options.label2
    features_odds = classifier.findHighOddsFeatures(label1,label2)
    if(options.classifier == "naiveBayes" or options.classifier == "nb"):
      string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
    else:
      string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)    
      
    print string3
    printImage(features_odds)

  # Visualise high-weight features per label for the perceptron.
  if((options.weights) & (options.classifier == "perceptron")):
    for l in classifier.legalLabels:
      features_weights = classifier.findHighWeightFeatures(l)
      print ("=== Features with high weight for label %d ==="%l)
      printImage(features_weights)
# ===== Example #17 (scraped-sample separator; original marker "示例#17", score 0) =====
def runClassifier(args, options):
    """Learning-curve experiment: for each of options.repetitions passes,
    train on growing random fractions (1/sample .. sample/sample) of the
    training set and record test accuracy, then report per-fraction mean
    accuracy and standard deviation (via numpy).

    args:    dict with 'featureFunction', 'classifier' and 'printImage'.
    options: parsed command-line options; reads .data, .training, .test,
             .sample, .repetitions, .classifier, .odds, .label1,
             .label2 and .weights.

    NOTE(review): this block mixes Python 2 print statements with
    print() calls, so it only runs under Python 2 (where the parenthesized
    calls print tuples); the integer division in the slice below also
    relies on Python 2 semantics -- confirm the target interpreter.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test
    thisWidth = 0
    thisHeight = 0
    num = 0  # number of classes passed to classifier.train (2 faces / 10 digits)
    sample = options.sample
    reps = options.repetitions
    acc = []      # accumulated accuracy per fraction, summed across reps
    stdDev = []   # one accuracy list per repetition
    for a in range(sample):
        acc.append(0)
    counter = 0
    for r in range(reps):
        print("______Repetition " + str(r + 1) + " out of " + str(reps) +
              "______")
        counter += 1
        accTemp = []
        for i in range(sample):
            print("__________" + str(i + 1) + " try with " +
                  str(100 * (i + 1) / sample) +
                  "% percent of random training data__________")
            # Shuffle all training indices; a prefix slice below picks
            # the random subset for this fraction.
            arr = [j for j in range(numTraining)]
            random.shuffle(arr)
            if (options.data == "faces"):
                rawTrainingData1 = samples.loadDataFile(
                    "facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH,
                    FACE_DATUM_HEIGHT)
                trainingLabels1 = samples.loadLabelsFile(
                    "facedata/facedatatrainlabels", numTraining)
                rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                         numTest, FACE_DATUM_WIDTH,
                                                         FACE_DATUM_HEIGHT)
                validationLabels = samples.loadLabelsFile(
                    "facedata/facedatatrainlabels", numTest)
                rawTestData = samples.loadDataFile("facedata/facedatatest",
                                                   numTest, FACE_DATUM_WIDTH,
                                                   FACE_DATUM_HEIGHT)
                testLabels = samples.loadLabelsFile(
                    "facedata/facedatatestlabels", numTest)
                thisWidth = FACE_DATUM_WIDTH
                thisHeight = FACE_DATUM_HEIGHT
                num = 2
            else:
                rawTrainingData1 = samples.loadDataFile(
                    "digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH,
                    DIGIT_DATUM_HEIGHT)
                trainingLabels1 = samples.loadLabelsFile(
                    "digitdata/traininglabels", numTraining)
                rawValidationData = samples.loadDataFile(
                    "digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH,
                    DIGIT_DATUM_HEIGHT)
                validationLabels = samples.loadLabelsFile(
                    "digitdata/validationlabels", numTest)
                rawTestData = samples.loadDataFile("digitdata/testimages",
                                                   numTest, DIGIT_DATUM_WIDTH,
                                                   DIGIT_DATUM_HEIGHT)
                testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                                    numTest)
                thisWidth = DIGIT_DATUM_WIDTH
                thisHeight = DIGIT_DATUM_HEIGHT
                num = 10
            rawTrainingData = []
            trainingLabels = []

            # Take the first (1+i)/sample fraction of the shuffled
            # indices (Python 2 integer division).
            for n in arr[:(numTraining * (1 + i) / sample)]:
                rawTrainingData.append(rawTrainingData1[n])
                trainingLabels.append(trainingLabels1[n])

            # Extract features
            print "Extracting features..."
            trainingData = map(featureFunction, rawTrainingData)
            validationData = map(featureFunction, rawValidationData)
            testData = map(featureFunction, rawTestData)

            # Conduct training and testing
            print "Training..."
            classifier.train(trainingData, trainingLabels, validationData,
                             validationLabels, thisWidth, thisHeight, num)
            print "Validating..."
            guesses = classifier.classify(validationData)
            # NOTE(review): the comprehension variable 'i' leaks in
            # Python 2 and clobbers the outer fraction index after this
            # point; harmless here because 'i' is not read again before
            # the loop advances, but fragile.
            correct = [
                guesses[i] == validationLabels[i]
                for i in range(len(validationLabels))
            ].count(True)
            print str(correct), ("correct out of " +
                                 str(len(validationLabels)) + " (%.1f%%).") % (
                                     100.0 * correct / len(validationLabels))
            print "Testing..."
            guesses = classifier.classify(testData)
            correct = [
                guesses[i] == testLabels[i] for i in range(len(testLabels))
            ].count(True)
            percentCorrect = (100.0 * correct / len(testLabels))
            print str(correct), ("correct out of " + str(len(testLabels)) +
                                 " (" + str(percentCorrect) + "%).")
            accTemp.append(percentCorrect)
            #analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
        for i in range(len(acc)):
            acc[i] += accTemp[i]
        stdDev.append(accTemp)
    accAvg = []
    # Transpose stdDev so stdDev2[i] holds all repetitions' accuracies
    # for fraction i.
    stdDev2 = [[] for k in range(sample)]
    for i in range(sample):
        for j in range(reps):
            stdDev2[i].append(stdDev[j][i])
    for i in range(len(acc)):
        accAvg.append(acc[i] / reps)
        # np.std uses the population standard deviation (ddof=0).
        currSTD = np.std(stdDev2[i])
        print("Accuracy with " + str(100 * (i + 1) / sample) +
              "% of training data: " + str(accAvg[i]) + "%, Standard Dev: " +
              str(currSTD))
    print("Repetitions: " + str(reps))

    # do odds ratio computation if specified at command line
    # NOTE(review): '&' is bitwise; works only because both operands
    # are booleans.
    if ((options.odds) & (options.classifier == "naiveBayes" or
                          (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print string3
        printImage(features_odds)

    # Visualise high-weight features per label for the perceptron.
    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options):
    """Train on a random percentage of the full training set and report
    test accuracy plus total wall-clock run time.

    args:    dict with 'featureFunction', 'classifier', 'printImage' and
             'percent' (integer percentage of training data to sample).
    options: parsed command-line options; reads .data, .training, .test,
             .classifier, .odds, .label1, .label2 and .weights.

    NOTE(review): this block mixes print statements with print() calls
    and subscripts the result of zip() (unzipped[0]), both of which
    require Python 2 -- confirm the target interpreter.
    """
    start = timeit.default_timer()
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    # Convert the caller-supplied percentage to a 0..1 fraction.
    percent  = args['percent'] / 100.0

    # Load data
    # NOTE(review): numTraining/numTest are read but not used below --
    # the sizes are hard-coded (450/150 faces, 5000/1000 digits).
    numTraining = options.training
    numTest = options.test

    if (options.data == "faces"):
        face_test_size = 150
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", 450, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", 450)

        # Pair each image with its label so sampling keeps them aligned.
        zipped = list(zip(rawTrainingData, trainingLabels));

        amount = int(450 * percent)

        # 'sample' here is presumably random.sample imported at module
        # level -- sampling without replacement; verify against imports.
        randomed = sample(zipped, amount)
        unzipped = zip(*randomed)
        rawTrainingData = unzipped[0]
        trainingLabels = unzipped[1]

        rawValidationData = samples.loadDataFile("facedata/facedatatrain", face_test_size, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", face_test_size)
        rawTestData = samples.loadDataFile("facedata/facedatatest", face_test_size, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", face_test_size)
    else:
        digit_test_size = 1000
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 5000)

        zipped = list(zip(rawTrainingData,trainingLabels));
        amount = int(5000 * percent)

        randomed = sample(zipped, amount)
        unzipped = zip(*randomed)
        rawTrainingData = unzipped[0]
        trainingLabels = unzipped[1]

        print("Len of sampled data + ", len(rawTrainingData), len(trainingLabels))

        rawValidationData = samples.loadDataFile("digitdata/validationimages", digit_test_size, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", digit_test_size)
        rawTestData = samples.loadDataFile("digitdata/testimages", digit_test_size, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", digit_test_size)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    """print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (
                100.0 * correct / len(validationLabels))"""
    print "Testing..."
    guesses = classifier.classify(testData)
    # Count positions where the guess matches the gold label.
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    # NOTE(review): '&' is bitwise; works only because both operands
    # are booleans.
    if ((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)

        print string3
        printImage(features_odds)

    # Visualise high-weight features per label for the perceptron.
    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print ("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)

    stop = timeit.default_timer()

    print('Classifier Run Time (in seconds): ', stop - start)
import samples
import util
import numpy as np
import os
from samples import Datum
from samples import readlines
from dataClassifier import DIGIT_DATUM_HEIGHT, DIGIT_DATUM_WIDTH, contestFeatureExtractorDigit
from samples import IntegerConversionFunction

# Small smoke-test script: load one digit image, echo the conversion of a
# sample character triple, rebuild the first 28x28 grid by hand from the
# raw file, and display it as a Datum.
featureFunction = contestFeatureExtractorDigit
rawTrainingData = samples.loadDataFile("digitdata/testimages", 1, 28, 28)
trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", 1)

# Reverse the raw lines so pop() yields them top-of-file first.
fin = readlines("digitdata/testimages")
fin.reverse()

a = ['+', ' ', '#']
print(IntegerConversionFunction(a))

# The first 28 popped lines form one 28x28 character grid.
data = [list(fin.pop()) for _ in range(28)]
items = []

for row in data:
    print(row)

items.append(Datum(data, 28, 28))
print(items[0].getPixels())
def runClassifier(args, options):

    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test
    percent = options.percentage

    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)
    if options.k_number_of_neighbors > 0:
        k = options.k_number_of_neighbors

    # Conduct training and testing
    print "Training..."
    if (options.classifier == 'nearestNeighbors'):
        classifier.train(trainingData, trainingLabels, testData, testLabels,
                         options.k_number_of_neighbors)
    else:

        randTrainingData, randTrainingLabels = randomData(
            trainingData, trainingLabels, percent)
        start = time.clock()
        classifier.train(randTrainingData, randTrainingLabels, validationData,
                         validationLabels)
        runTime = time.clock() - start
        print "training set runtime:\t" + str(runTime)
    if (options.classifier != 'nearestNeighbors'):
        print "Validating..."
        guesses = classifier.classify(validationData)
        correct = [
            guesses[i] == validationLabels[i]
            for i in range(len(validationLabels))
        ].count(True)
        print str(correct), ("correct out of " + str(len(validationLabels)) +
                             " (%.1f%%).") % (100.0 * correct /
                                              len(validationLabels))
        print "Testing..."
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        print str(correct), ("correct out of " + str(len(testLabels)) +
                             " (%.1f%%).") % (100.0 * correct /
                                              len(testLabels))
    # analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier == "naiveBayes" or
                          (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print string3
        printImage(features_odds)

    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
import math
import samples

#Naive Bayes - Face Data
# Script entry point: loads 450 face-training images and slices each 60x70
# image into 6x7-pixel quadrants to use as features for naive Bayes.
if __name__ == '__main__':
    print "Training Phase"
    #stores training data and appropriate labels for faces
    n = 450  # number of face training examples to load
    items = samples.loadDataFile("facedata/facedatatrain", n, 60, 70)
    labels = samples.loadLabelsFile("facedata/facedatatrainlabels", n)
    all_feature_vectors = []  #stores all quadrants of all sample images

    for k in range(n):
        #break up face data into 100 6x7 pixel quadrants for feature extraction
        feature_quadrants = []  #will be a list of lists
        temp_array = []
        # Quadrant window: rows [i_start, i_end), columns [j_start, j_end).
        i_start = 0
        i_end = 6
        j_start = 0
        j_end = 7

        # NOTE(review): within this visible excerpt the window bounds
        # (i_start/i_end/j_start/j_end) are never advanced, so as written the
        # loop would re-read the same 6x7 quadrant forever. The index-update
        # code appears to be truncated here — confirm against the full source.
        while i_end <= 60 and j_end <= 70:
            #parse through image and store pixels in a temporary array
            for i in range(i_start, i_end):
                for j in range(j_start, j_end):
                    temp_array.append(items[k].getPixel(i, j))

            #add temp_array to feature_quadrant array and reassign temp_array
            feature_quadrants.append(temp_array)
            temp_array = []
示例#22
0
def runClassifier(args, options):
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if (options.data == "pacman"):
        agentToClone = args.get('agentToClone', None)
        trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get(
            agentToClone, (None, None, None))
        trainingData = trainingData or args.get(
            'trainingData',
            False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0]
        validationData = validationData or args.get(
            'validationData',
            False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1]
        testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES[
            'ContestAgent'][2]
        rawTrainingData, trainingLabels = samples.loadPacmanData(
            trainingData, numTraining)
        rawValidationData, validationLabels = samples.loadPacmanData(
            validationData, numTest)
        rawTestData, testLabels = samples.loadPacmanData(testData, numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
def runClassifier(args, options):
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                           numTraining, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                            numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                             numTest, DIGIT_DATUM_WIDTH,
                                             DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                              numTest)
    rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                       DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels, options.validate)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))

    if (options.classifier == "perceptron"):
        f = open("perceptron_valid.csv", "a")
        f.write(
            str(len(trainingData)) + "," +
            str(100 * correct / (1.0 * (len(validationData)))) + '\n')
        f.close()

    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)

    if (options.classifier == "perceptron"):
        f = open("perceptron_test.csv", "a")
        f.write(
            str(len(trainingData)) + "," + str(100 * correct /
                                               (1.0 * (len(testData)))) + '\n')
        f.close()
def runClassifier(args, options):

    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training

    # Extract features
    print "Extracting features..."
    if options.data == "faces":
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatavalidation",
                                                 TEST_SET_SIZE,
                                                 FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatavalidationlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest",
                                           TEST_SET_SIZE, FACE_DATUM_WIDTH,
                                           FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 TEST_SET_SIZE,
                                                 DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages",
                                           TEST_SET_SIZE, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                            TEST_SET_SIZE)

    if options.classifier == "GDA":
        if options.data == "faces":
            dimension = 13
            principleComponents = getPrincipleComponents(
                map(
                    featureFunction,
                    samples.loadDataFile("facedata/facedatatrain", 451,
                                         FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)),
                dimension)
            trainingData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawTrainingData)),
                principleComponents)
            validationData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawValidationData)),
                principleComponents)
            testData = np.dot(
                basicFeatureDataToNumpyArray(map(featureFunction,
                                                 rawTestData)),
                principleComponents)
        else:
            dimension = 13
            principleComponents = getPrincipleComponents(
                map(
                    featureFunction,
                    samples.loadDataFile("digitdata/trainingimages", 5000,
                                         DIGIT_DATUM_WIDTH,
                                         DIGIT_DATUM_HEIGHT)), dimension)
            trainingData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawTrainingData)),
                principleComponents)
            validationData = np.dot(
                basicFeatureDataToNumpyArray(
                    map(featureFunction, rawValidationData)),
                principleComponents)
            testData = np.dot(
                basicFeatureDataToNumpyArray(map(featureFunction,
                                                 rawTestData)),
                principleComponents)
    else:
        trainingData = map(featureFunction, rawTrainingData)
        validationData = map(featureFunction, rawValidationData)
        testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)
示例#25
0
def runClassifier(args, options):
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training

    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 TEST_SET_SIZE,
                                                 FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest",
                                           TEST_SET_SIZE, FACE_DATUM_WIDTH,
                                           FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 TEST_SET_SIZE,
                                                 DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages",
                                           TEST_SET_SIZE, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                            TEST_SET_SIZE)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    print "len guess: %d valid: %d" % (len(guesses), len(validationLabels))
    print guesses[0], guesses[10]
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)

    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier != "mostFrequent")):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print string3
        printImage(features_odds)
示例#26
0
def runClassifier(args, options):

    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data    
    numTraining = options.training
    numTest = options.test

    if(options.data=="faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)


    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Start training..."
    start = time.time()
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    end = time.time() - start
    print "Traning time: " + str(end)
    print "Start validating..."
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print "Validation result: ", str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
    print "Start testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print "Testing result: ", str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
    #analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    if((options.odds) & (options.classifier == NB) ):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1,label2)
        if(options.classifier == NB):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)        

        print string3
        printImage(features_odds)

    if((options.weights) & (options.classifier == PT)):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print ("=== Features with high weight for label %d ==="%l)
            printImage(features_weights)
"""This file is in Beta and is not the real autograder."""

import data_classification_utils as dcu
import samples
import numpy as np

# Smoke test: featurize the first training digit and compare against a
# hard-coded expected feature vector.
training_data = samples.loadDataFile("digitdata/trainingimages", 1, 28, 28)
features = dcu.simple_image_featurization(training_data[0])

# Expected featurization of the first digit image (28x28 = 784 values).
# NOTE(review): presumably 0/1/2 encode background/edge/ink pixel classes —
# confirm against simple_image_featurization.
expected = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2., 
2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 2., 2., 2., 2., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
# Report a mismatch instead of raising, so the script always runs to completion.
if not np.array_equal(features, expected):
	print("Error, featurization is incorrect. You reported: ")
	print(features)
示例#28
0
def runClassifier():
    """
  Harness code for running different classifiers on the face or digit data.
  
  This is the main function for classification, and is designed
  to be invoked from the command line (outside the Python interpreter).
  
  Usage:
    > python dataClassifier.py 
    OR
    > python dataClassifier.py <data> <classifierName>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples>
    OR
    > python dataClassifier.py <data> <classifierName> <featureFunction> <numTrainingExamples> <odds class1 class2>
    
  For example:
    > python dataClassifier.py digits naivebayes basic 1000
    
  would run the naive Bayes classifier on 1000 training examples using the
  basicFeatureExtractor function, and then test the classifier on the test data.
  """
    print "Doing classification"
    print "--------------------"
    # Assign default values for arguments if they are not provided.
    if (len(sys.argv) == 1):
        print "No data specified; using digits."
        sys.argv.append("digits")
    if (len(sys.argv) == 2):
        print "No classifier specified; using default."
        sys.argv.append("mostfrequent")
    if (len(sys.argv) == 3):
        print "No feature extraction function specified; using default."
        sys.argv.append("basic")
    if (len(sys.argv) == 4):
        print "No training set size specified; using default."
        sys.argv.append("100")
    if (len(sys.argv) == 5):
        print "Not doing odds ratio computation."
        sys.argv.append("noodds")

    # Set up variables according to the command line input.
    print "data:\t\t" + sys.argv[1]
    print "classifier:\t\t" + sys.argv[2]
    print "feature extractor:\t" + sys.argv[3]
    print "training set size:\t" + sys.argv[4]
    if ((sys.argv[1] == "digits") & (sys.argv[3] == "basic")):
        featureFunction = basicFeatureExtractorDigit
    elif ((sys.argv[1] == "faces") & (sys.argv[3] == "basic")):
        featureFunction = basicFeatureExtractorFace
    elif ((sys.argv[1] == "digits") & (sys.argv[3] == "enhanced")):
        featureFunction = enhancedFeatureExtractorDigit
    elif ((sys.argv[1] == "faces") & (sys.argv[3] == "enhanced")):
        featureFunction = enhancedFeatureExtractorFace
    else:
        print "Unknown feature function:", sys.argv[2]
        return

    if (sys.argv[1] == "digits"):  # if digits detect
        legalLabels = range(10)
    else:  # if face detect
        legalLabels = range(2)

    if (sys.argv[2] == "mostfrequent"):
        classifier = mostFrequent.MostFrequentClassifier(legalLabels)
    elif (sys.argv[2] == "naivebayes"):
        classifier = naiveBayes.NaiveBayesClassifier(legalLabels)
    elif (sys.argv[2] == "perceptron"):
        classifier = perceptron.PerceptronClassifier(legalLabels)
    else:
        print "Unknown classifier:", sys.argv[2]
        return

    # Load data
    numTraining = int(sys.argv[4])

    if (sys.argv[1] == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 TEST_SET_SIZE,
                                                 FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest",
                                           TEST_SET_SIZE, FACE_DATUM_WIDTH,
                                           FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            TEST_SET_SIZE)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 TEST_SET_SIZE,
                                                 DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages",
                                           TEST_SET_SIZE, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                            TEST_SET_SIZE)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    util.pause()
    analysis(classifier, guesses, testLabels, rawTestData)

    # do odds ratio computation if specified at command line
    if ((sys.argv[5] == "odds") & (len(sys.argv) == 8)):
        features_class1, features_class2, features_odds = classifier.findHighOddsFeatures(
            int(sys.argv[6]), int(sys.argv[7]))
        if (sys.argv[1] == "faces"):
            printImage(features_class1, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            printImage(features_class2, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
            printImage(features_odds, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        else:
            printImage(features_class1, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            printImage(features_class2, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
            printImage(features_odds, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
示例#29
0
def runClassifier(args, options):

  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options['train']

  if(options['data']=="faces"):
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
    rawValidationData = samples.loadDataFile("facedata/facedatatrain", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
  else:
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)
    
  
  # Extract features
  print "Extracting features..."
  trainingData = map(featureFunction, rawTrainingData)
  validationData = map(featureFunction, rawValidationData)
  testData = map(featureFunction, rawTestData)
  
  # Conduct training and testing
  print "Training..."
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print "Validating..."
  guesses = classifier.classify(validationData)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  util.pause()
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
  
  # do odds ratio computation if specified at command line
  if((options['odds']) & (options['classifier'] != "mostfrequent")):
    class1, class2 = options['class1'], options['class2']
    features_class1,features_class2,features_odds = classifier.findHighOddsFeatures(class1,class2)
    if(options['classifier'] == "naivebayes"):
      string1 = "=== Features with max P(F_i = 1 | class = %d) ===" % class1
      string2 = "=== Features with max P(F_i = 1 | class = %d) ===" % class2
      string3 = "=== Features with highest odd ratio of class %d over class %d ===" % (class1, class2)
    else:
      string1 = "=== Features with largest weight for class %d ===" % class1
      string2 = "=== Features with largest weight for class %d ===" % class2
      string3 = "=== Features with for which weight(class %d)-weight(class %d) is biggest ===" % (class1, class2)    
      
    print string1
    printImage(features_class1)
    print string2
    printImage(features_class2)
    print string3
    printImage(features_odds)
示例#30
0
def get_digit_acc2():
    """Measure perceptron digit accuracy at 10%..100% of the training pool.

    For each sample percentage: draw a random sample that contains every
    digit 0-9, train one one-vs-rest weight vector per digit, then print
    "<percent> <train-seconds> <accuracy>" for the sample.
    """
    images = samples.loadDataFile('digitdata/trainingimages', 5000, 28, 28)
    labels = samples.loadLabelsFile('digitdata/traininglabels', 5000)

    def convertlabelslist(digit, labelslist):
        # One-vs-rest boolean mask: True exactly where the label is `digit`.
        return [label == digit for label in labelslist]

    required_digits = set(range(10))

    # For each percentage 10%, 20%, ..., 100%
    for i in range(10):
        sample_percentage = (i + 1) / 10.0
        sample_size = int(math.floor(sample_percentage * float(len(labels))))

        # Reshuffle until the sample contains all 10 digits, so every
        # one-vs-rest classifier sees at least one positive example.
        while True:
            joined = list(zip(images, labels))
            random.shuffle(joined)
            images_sample, labels_sample = zip(*joined)
            images_sample = images_sample[:sample_size]
            labels_sample = labels_sample[:sample_size]
            if required_digits.issubset(labels_sample):
                break

        # One boolean label list per digit 0-9.
        all_labels = [convertlabelslist(d, labels_sample) for d in range(10)]

        start = time.time()

        # Train one weight vector per digit.
        all_weight_vectors = []
        for j in range(10):
            # NOTE(review): featureslist looks loop-invariant; hoist it out of
            # this loop once compute_weights is confirmed not to mutate it.
            featureslist = perceptron.compute_features2(images_sample)
            weights = perceptron.initialize_weights(28 * 28, 0)
            computed_weights = perceptron.compute_weights(
                weights, featureslist, all_labels[j])
            all_weight_vectors.append(computed_weights)
        elapsed = round(time.time() - start, 2)

        acc = demo_digits(all_weight_vectors)
        print(str((i + 1) * 10) + ' ' + str(elapsed) + ' ' + str(acc))
示例#31
0
def runTask(task):
    """Grade one assignment task: load its dataset, train a one-vs-rest
    perceptron, evaluate on held-out data and print the marks earned."""
    print("Grading task " + str(task))
    if task in (2, 5):
        print(
            "This is a manually graded task, write your answers in the pdf file"
        )
    elif task == 1:
        print("The solution cannot be exposed to you now. :)")
    elif task == 3:
        print("Ungraded task")

    else:
        # Dataset parameters and feature extractor differ only for task 6.
        if task == 6:
            numTraining, numTest, num_classes = 800, 200, 4
            data_dir = "data/D2"
            featureFunction = dataClassifier.enhancedFeatureExtractorDigit
        else:
            numTraining = dataClassifier.TRAIN_SET_SIZE
            numTest = dataClassifier.TEST_SET_SIZE
            num_classes = 10
            data_dir = "data/D1"
            featureFunction = dataClassifier.basicFeatureExtractorDigit

        rawTrainingData = samples.loadDataFile(data_dir + "/training_data",
                                               numTraining)
        trainingLabels = samples.loadLabelsFile(data_dir + "/training_labels",
                                                numTraining)
        rawTestData = samples.loadDataFile(data_dir + "/test_data", numTest)
        testLabels = samples.loadLabelsFile(data_dir + "/test_labels", numTest)

        classifier = perceptron1vr.Perceptron1vrClassifier(
            range(num_classes), 3)

        # Convert raw data to feature dictionaries.
        print("Extracting features...")
        trainingData = map(featureFunction, rawTrainingData)
        testData = map(featureFunction, rawTestData)

        # Train (validation set = test set, no verbose flag), then evaluate.
        print("Training...")
        classifier.train(trainingData, trainingLabels, testData, testLabels,
                         False)

        print("Testing...")
        guesses = classifier.classify(testData)
        hits = [guesses[k] == testLabels[k]
                for k in range(len(testLabels))].count(True)
        acc = 100 * hits / (1.0 * (len(testLabels)))

        def marks_for(accuracy, thresholds):
            # First threshold strictly exceeded wins; otherwise 0 marks.
            for cutoff, score in thresholds:
                if accuracy > cutoff:
                    return score
            return 0

        if task == 4:
            marks = marks_for(acc, ((70, 3), (60, 2), (50, 1)))
            print("Received Marks : " + str(marks) + "/3")
        elif task == 6:
            marks = 0
            print(acc)
            # Task 6 also caps the number of features allowed.
            if len(testData[0]) <= 5:
                marks = marks_for(acc, ((85, 3), (65, 2), (45, 1)))
            else:
                print("More than permissible features used")
            print("Received Marks : " + str(marks) + "/3")
    print("--------------------------------------------------------")
示例#32
0
def runClassifier(args, options):
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    trainingFactor = options.training

    print "training factor {}".format(trainingFactor)

    if options.data == "faces":
        TEST_SET_SIZE = 150

        numTraining = int(451 * trainingFactor)

        print "using {} datapoints out of {} ({}%) for faces".format(
            numTraining, 451, 100 * (numTraining / float(451)))

        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 TEST_SET_SIZE,
                                                 FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("facedata/facedatatest",
                                           TEST_SET_SIZE, FACE_DATUM_WIDTH,
                                           FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            TEST_SET_SIZE)
    else:
        TEST_SET_SIZE = 1000

        numTraining = int(5000 * trainingFactor)

        print "using {} datapoints out of {} ({}%) for digits".format(
            numTraining, TEST_SET_SIZE, 100 * (numTraining / float(5000)))

        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 TEST_SET_SIZE,
                                                 DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  TEST_SET_SIZE)
        rawTestData = samples.loadDataFile("digitdata/testimages",
                                           TEST_SET_SIZE, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels",
                                            TEST_SET_SIZE)

    # Extract features
    print("Extracting features...")
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    automatic = options.run
    if automatic:
        outcomes = {}
        for iterator in range(5):
            print("Training...")

            if options.data == "faces":
                rawTrainingData = samples.loadDataFile(
                    "facedata/facedatatrain", 451, FACE_DATUM_WIDTH,
                    FACE_DATUM_HEIGHT)
                trainingLabels = samples.loadLabelsFile(
                    "facedata/facedatatrainlabels", 451)

                indices = []

                for x in range(numTraining):
                    indices.append(random.randint(0, 450))

                randomTrainingData = []
                randomTrainingLabels = []
                for index in indices:
                    randomTrainingData.append(rawTrainingData[index])
                    randomTrainingLabels.append(trainingLabels[index])

            else:
                rawTrainingData = samples.loadDataFile(
                    "digitdata/trainingimages", 5000, DIGIT_DATUM_WIDTH,
                    DIGIT_DATUM_HEIGHT)
                trainingLabels = samples.loadLabelsFile(
                    "digitdata/traininglabels", 5000)

                indices = []

                for x in range(numTraining):
                    indices.append(random.randint(0, 4999))

                randomTrainingData = []
                randomTrainingLabels = []

                for index in indices:
                    randomTrainingData.append(rawTrainingData[index])
                    randomTrainingLabels.append(trainingLabels[index])

            trainingData = map(featureFunction, randomTrainingData)

            start = time.time()
            classifier.train(trainingData, randomTrainingLabels,
                             validationData, validationLabels)
            print("Validating...")
            guesses = classifier.classify(validationData)
            correct = [
                guesses[i] == validationLabels[i]
                for i in range(len(validationLabels))
            ].count(True)
            print(str(correct),
                  ("correct out of " + str(len(validationLabels)) +
                   " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
            print("Testing...")
            guesses = classifier.classify(testData)
            correct = [
                guesses[i] == testLabels[i] for i in range(len(testLabels))
            ].count(True)
            print(str(correct),
                  ("correct out of " + str(len(testLabels)) + " (%.1f%%).") %
                  (100.0 * correct / len(testLabels)))
            analysis(classifier, guesses, testLabels, testData, rawTestData,
                     printImage)
            interval = time.time() - start
            print "Training and testing time: " + str(interval)
            outcomes[str(iterator)] = [
                "Training and testing time: {}".format(interval),
                "accuracy of training: {}%".format(
                    (100.0 * correct / len(testLabels)))
            ]

        print "outcomes: {}".format(outcomes)

    else:
        print("Training...")
        start = time.time()
        classifier.train(trainingData, trainingLabels, validationData,
                         validationLabels)
        interval = time.time() - start
        print "Training time: " + str(interval)
        print("Validating...")
        guesses = classifier.classify(validationData)
        correct = [
            guesses[i] == validationLabels[i]
            for i in range(len(validationLabels))
        ].count(True)
        print(str(correct),
              ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") %
              (100.0 * correct / len(validationLabels)))
        print("Testing...")
        guesses = classifier.classify(testData)
        correct = [
            guesses[i] == testLabels[i] for i in range(len(testLabels))
        ].count(True)
        print(str(correct),
              ("correct out of " + str(len(testLabels)) + " (%.1f%%).") %
              (100.0 * correct / len(testLabels)))
        analysis(classifier, guesses, testLabels, testData, rawTestData,
                 printImage)

    # do odds ratio computation if specified at command line
    if options.odds & (options.classifier != "mostFrequent"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if options.classifier == "naiveBayes" or options.classifier == "nb":
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print(string3)
        printImage(features_odds)
示例#33
0
def runClassifier(args, options):
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if (options.data == "pacman"):
        agentToClone = args.get('agentToClone', None)
        trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get(
            agentToClone, (None, None, None))
        trainingData = trainingData or args.get(
            'trainingData',
            False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0]
        validationData = validationData or args.get(
            'validationData',
            False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1]
        testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES[
            'ContestAgent'][2]
        rawTrainingData, trainingLabels = samples.loadPacmanData(
            trainingData, numTraining)
        rawValidationData, validationLabels = samples.loadPacmanData(
            validationData, numTest)
        rawTestData, testLabels = samples.loadPacmanData(testData, numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print "Extracting features..."
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Validating..."
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)

    # do odds ratio computation if specified at command line
    if ((options.odds) & (options.classifier == "naiveBayes" or
                          (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        print string3
        printImage(features_odds)

    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
示例#34
0
def demo_digits(n_images, datapath, labelspath, flag):
    """Classify digit images with pre-trained one-vs-rest weight vectors.

    Loads the best saved weights for each digit 0-9, scores each image under
    all ten vectors and guesses the highest-scoring digit.  With flag set,
    prints a single random guess/label pair; otherwise prints all results,
    the overall accuracy and the elapsed time.
    """
    start = time.time()

    # Load the best stored weight vector for each digit 0-9.
    weights_vectors = []
    base_path = './TrainDigitsResults/TrainingDigitsResults'

    for i in range(0, 10):
        load_path = base_path + str(i) + '/100_percent_digit_train.txt'
        # 'with' closes the file automatically; the original also shadowed
        # the 'file' builtin and called close() redundantly.
        with open(load_path, 'r') as weights_file:
            weights_vectors.append(choose_best_weights(weights_file))

    images = samples.loadDataFile(datapath, n_images, 28, 28)
    labels = samples.loadLabelsFile(labelspath, n_images)
    featureslist = compute_features2(images)

    # For each image: dot each weight vector with the feature vector, guess
    # the digit whose vector scores highest (its index IS the digit), and
    # record (guess, true label).
    results = []
    for image in range(len(images)):
        scores = []

        for vector in weights_vectors:
            total = float(0)
            for weight, feature in zip(vector, featureslist[image]):
                total += weight * feature
            scores.append(total)

        # First index with the strictly highest score wins ties.
        best_score = float('-inf')
        guess = -1
        for z in range(10):
            if scores[z] > best_score:
                best_score = scores[z]
                guess = z

        results.append((guess, labels[image]))

    correctcount = float(0)
    for t in results:
        if t[0] == t[1]:
            correctcount += float(1)

    if flag:
        # BUG FIX: randint's upper bound is inclusive, so randint(0,
        # len(results)) could return len(results) and raise IndexError.
        rand = random.randint(0, len(results) - 1)
        print('Guessed: ' + str(results[rand][0]) + ', Actual: ' +
              str(results[rand][1]) + ' (line ' + str(rand) +
              ' of digit testlabel)')
    else:
        print(results)
        print('Digits accuracy: ' +
              str(round((float(correctcount) * 100 / float(len(labels))), 1)) +
              '%')
        print('Time elapsed: ' + str(round(time.time() - start, 2)) + 's')
0
def run_digits_n_times():
    """Sweep training-set size for the perceptron digit classifier.

    For each sample percentage 10%..100% of the 5000-image digit training
    set, runs 5 independent train/test cycles (learning one one-vs-rest
    weight vector per digit 0-9), prints the mean and standard deviation
    of the test accuracies, and plots mean accuracy against sample size.
    """
    all_final_stddevs = []
    all_final_accs = []

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # Fixed label: this figure shows digit accuracy, not face runtimes.
    ax1.set_title('Digit Accuracy vs. Training Sample Size')

    images = samples.loadDataFile('digitdata/trainingimages', 5000, 28, 28)
    labels = samples.loadLabelsFile('digitdata/traininglabels', 5000)

    # For each percentage of the training data
    for v in range(1, 11):
        sample_percentage = v * 10
        acc_list = []
        # 5 independent runs per sample size
        for j in range(0, 5):

            weights_vectors = []
            # Train one one-vs-rest weight vector per digit 0-9
            for digit in range(0, 10):
                images_sample, labels_sample, visited = perceptron.sample_digits(
                    digit, sample_percentage, images, labels)

                featureslist = perceptron.compute_features2(images_sample)
                weights = perceptron.initialize_weights(28 * 28, 0)

                # Run perceptron / learn weights
                start = time.time()
                final_weights = compute_weights(weights, featureslist,
                                                labels_sample)
                elapsed = time.time() - start  # TODO: timing currently unused
                weights_vectors.append(final_weights)

            # Test all weight vectors (digits 0-9) together; acc is the
            # overall accuracy for this run
            acc = demo_digits(weights_vectors)
            acc_list.append(acc)

        # Mean accuracy and standard deviation at this sample size
        mean = statistics.mean(acc_list)
        stddev = statistics.stdev(acc_list)
        print('Mean accuracy for sample percent-' + str(v) + ' ~ ' + str(mean))
        print('STD Dev for sample percent-' + str(v) + ' ~ ' + str(stddev))

        all_final_stddevs.append(stddev)
        all_final_accs.append(mean)

    ax1.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
             all_final_accs,
             color='blue',
             marker='.',
             linestyle='--')
    plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [
        '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%'
    ])
    plt.xlabel('Training Data Sample Size')
    # Fixed label: the y-axis carries mean accuracies, not std deviations.
    plt.ylabel('Mean Test Accuracy (%)')
    plt.show()
def runClassifier(args, options):
    """Run one train/validate/test cycle for the configured classifier.

    Loads face or digit data (per ``options.data``), extracts features,
    trains the classifier while timing it, reports validation and test
    accuracy, and appends the timing/accuracy results to the
    ``training_time`` log file.

    args: dict providing 'featureFunction', 'classifier', 'printImage'.
    options: parsed command-line options (data, training, test,
        classifier, odds, weights, label1, label2).
    """

    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test
    trainingTime = []

    # Validation data is drawn from the *training* files (first numTest
    # items), not from a separate validation set, for faces.
    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain",
                                               numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain",
                                                 numTest, FACE_DATUM_WIDTH,
                                                 FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile(
            "facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,
                                           FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels",
                                            numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                               numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages",
                                                 numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest,
                                           DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Append-mode results log; stays open for the whole run.
    # NOTE(review): the handle leaks if an exception is raised before
    # f.close() below — consider a with-statement.
    f = open("training_time", "a")
    f.write("Classifier: {}\n".format(options.classifier))
    f.write("Data: {}\n".format(options.data))
    f.write("{} training\n".format(options.training))
    f.write("{} testing\n".format(options.test))

    # Extract features
    print("Extracting features...")
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

    # Conduct training and Testing
    print("Training...")
    start_stamp = time.time()
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    end_stamp = time.time()
    trainingTime.append(str(end_stamp - start_stamp))
    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = [
        guesses[i] == validationLabels[i] for i in range(len(validationLabels))
    ].count(True)
    print str(correct), ("correct out of " + str(len(validationLabels)) +
                         " (%.1f%%).") % (100.0 * correct /
                                          len(validationLabels))
    print("Testing...")
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
    analysis(classifier, guesses, testLabels, testData, rawTestData,
             printImage)
    # print str(guesses) ###############################################################################################################################
    accuracy = str(100.0 * correct / len(testLabels))
    # do odds ratio computation if specified at command line
    # NOTE(review): bitwise '&' is used as logical AND here — works for
    # booleans but does not short-circuit.
    if ((options.odds) & (options.classifier == "naiveBayes" or
                          (options.classifier == "nb"))):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (
                label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (
                label1, label2)

        #print (string3)
        printImage(features_odds)

    if ((options.weights) & (options.classifier == "perceptron")):
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)

    print("training time", trainingTime)
    f.write("{}\n".format(trainingTime))
    f.write("{}\n".format(accuracy))
    f.write("\n")
    f.close()
示例#37
0
def runClassifier(args, options):
    """Train and test a classifier on recorded Pacman games or digit images.

    For ``options.data == "pacman"``, resolves saved-game paths (falling
    back to the ContestAgent recordings) and loads state/action pairs; for
    ``"digits"``, loads image data with defaults of 2000 training / 1000
    validation / 1000 test examples. Prints test accuracy.

    Raises ValueError for an unrecognized ``options.data``.
    """
    classifier = args['classifier']

    # Load data
    if options.data == "pacman":
        agentToClone = args.get('agentToClone', None)
        # Each entry maps an agent name to (training, validation, test)
        # saved-game paths; any missing piece falls back to the
        # ContestAgent recordings via the or-chains below.
        trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get(
            agentToClone, (None, None, None))
        trainingData = trainingData or args.get(
            'trainingData',
            False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0]
        validationData = validationData or args.get(
            'validationData',
            False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1]
        testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES[
            'ContestAgent'][2]

        trainingData, trainingLabels = samples.loadPacmanData(
            trainingData, options.training)
        validationData, validationLabels = samples.loadPacmanData(
            validationData, options.validation)
        testData, testLabels = samples.loadPacmanData(testData, None)

    elif options.data == "digits":
        # Apply defaults when counts were not given on the command line.
        if options.training is None:
            options.training = 2000
        if options.validation is None:
            options.validation = 1000
        numTest = 1000

        trainingData = samples.loadDataFile("digitdata/trainingimages",
                                            options.training,
                                            DIGIT_DATUM_WIDTH,
                                            DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                                options.training)

        validationData = samples.loadDataFile("digitdata/validationimages",
                                              options.validation,
                                              DIGIT_DATUM_WIDTH,
                                              DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels",
                                                  options.validation)

        testData = samples.loadDataFile("digitdata/testimages", numTest,
                                        DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    else:
        raise ValueError('unrecognized dataset %r' % options.data)

    # Conduct training and testing
    print "Training..."
    classifier.train(trainingData, trainingLabels, validationData,
                     validationLabels)
    print "Testing..."
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i]
               for i in range(len(testLabels))].count(True)
    print str(correct), ("correct out of " + str(len(testLabels)) +
                         " (%.1f%%).") % (100.0 * correct / len(testLabels))
示例#38
0
def runClassifier(args, options):
  """Train, validate and test the configured classifier on face or digit
  data, then run post-hoc analysis and the optional odds-ratio / high
  weight feature inspections.

  args: dict providing 'featureFunction', 'classifier', 'printImage'.
  options: parsed command-line options (data, training, test, classifier,
      odds, weights, label1, label2).
  """
  #print 'args: ', args
  #print 'options', options
  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options.training
  numTest = options.test

  # Validation data is drawn from the *training* files (first numTest
  # items), not a held-out set, in the faces branch.
  if(options.data=="faces"):
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
    rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
    rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
  else:
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
    rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
    
  
  # Extract features
  #print "Extracting features..."
  #print '#######type of rawTrainingData is', rawTrainingData.__class__ # list of Datum
  #print '#######type of rawTrainingData[0] is', rawTrainingData[0].__class__ # Datum
  trainingData = map(featureFunction, rawTrainingData)
  #print '#######type of trainingData is', trainingData.__class__ # list of Counter
  #print '#######type of trainingData[0] is', trainingData[0].__class__ # Counter
  validationData = map(featureFunction, rawValidationData)
  testData = map(featureFunction, rawTestData)
  
  # Conduct training and testing
  print "Training..."
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print "Validating..."
  guesses = classifier.classify(validationData)
  print 'length of guesses is %d' % len(guesses)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)
  
  # do odds ratio computation if specified at command line
  # NOTE(review): bitwise '&' used as logical AND — works for booleans but
  # does not short-circuit.
  if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
    label1, label2 = options.label1, options.label2
    features_odds = classifier.findHighOddsFeatures(label1,label2)
    if(options.classifier == "naiveBayes" or options.classifier == "nb"):
      string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
    else:
      string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)    
      
    print string3
    printImage(features_odds)

  if((options.weights) & (options.classifier == "perceptron")):
    for l in classifier.legalLabels:
      features_weights = classifier.findHighWeightFeatures(l)
      print ("=== Features with high weight for label %d ==="%l)
      printImage(features_weights)
示例#39
0
def runClassifier():
    """Interactive Tk demo: train a perceptron on digit data and classify
    a user-drawn digit, logging progress messages onto the canvas.

    Uses module-level GUI state (TK_ROOT, SP_CANVAS, LOG_X, LOG_Y); LOG_Y
    is advanced after each status message so log lines stack vertically.
    """
    global TK_ROOT, SP_CANVAS, LOG_X, LOG_Y

    # Set up variables according to the command line inputs
    featureFunction = basicFeatureExtractorDigit

    legalLabels = range(10)  # number of labels

    # Select classifier
    classifier = perceptron.PerceptronClassifier(legalLabels)

    # Load data
    numTraining = 1

    loadImage()

    # NOTE(review): this samples.loadDataFile variant takes extra mode/canvas
    # arguments ('train'/'test', SP_CANVAS) — presumably to visualize loading;
    # confirm against this project's samples module.
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages",
                                           numTraining, DIGIT_DATUM_WIDTH,
                                           DIGIT_DATUM_HEIGHT, 'train',
                                           SP_CANVAS)

    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels",
                                            numTraining)

    # NOTE(review): path is "digitdata/testingimages"; other loaders in this
    # file use "digitdata/testimages" — confirm which file actually exists.
    rawTestData = samples.loadDataFile("digitdata/testingimages",
                                       TEST_SET_SIZE, DIGIT_DATUM_WIDTH,
                                       DIGIT_DATUM_HEIGHT, 'test', SP_CANVAS)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)

    # Extract features

    print rawTestData
    trainingData = map(basicFeatureExtractorDigit, rawTrainingData)
    print "cp3"
    testData = map(basicFeatureExtractorDigit, rawTestData)

    # Conduct auto training
    SP_CANVAS.create_text(LOG_X,
                          LOG_Y,
                          text="Auto Training...",
                          anchor=NW,
                          font=tkFont.Font(size=-14))
    LOG_Y += 15
    classifier.train(trainingData, trainingLabels, SP_CANVAS)

    # Auto Testing
    # print "Validating..."
    #  guesses = classifier.classify(validationData)
    #  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    # print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))

    # User Input Testing
    SP_CANVAS.create_text(LOG_X,
                          LOG_Y,
                          text="Recognizing...",
                          anchor=NW,
                          font=tkFont.Font(size=-14))
    LOG_Y += 15
    guesses = classifier.classify(testData, SP_CANVAS, "usr")

    # Completion Notify
    SP_CANVAS.create_text(LOG_X,
                          LOG_Y + 30,
                          text="Completed...",
                          anchor=NW,
                          font=tkFont.Font(size=-14))
    LOG_Y += 15
def runClassifier(args, options):
    """Train/validate/test the configured classifier on face or digit data.

    With ``options.random`` set: loads the full dataset, then for each
    sample percentage 10%..100% runs 5 training rounds on a random subset
    of that size, reporting per-round accuracy plus average training time,
    average accuracy, and sample standard deviation; exits when done.
    Otherwise: a single train/validate/test cycle followed by analysis and
    the optional odds-ratio / high-weight feature inspections.

    args: dict providing 'featureFunction', 'classifier', 'printImage'.
    options: parsed command-line options (random, data, training, test,
        classifier, odds, weights, label1, label2).
    """
    
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']
    
    # Load the data for testing, training and validation
    if(options.random):
        # Dataset sizes are hard-coded to the full face/digit corpora.
        numberOfTestPoints = 150 if options.data=="faces" else 1000
        numberOfValidationPoints = 301 if options.data=="faces" else 1000
        totalTrainData = 451 if options.data=="faces" else 5000
        numValidation = numberOfValidationPoints
        numTest = numberOfTestPoints
        numTraining = totalTrainData

        if(options.data=="faces"):
            rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
            rawValidationData = samples.loadDataFile("facedata/facedatatrain", numValidation,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numValidation)
            rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        else:
            rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
            rawValidationData = samples.loadDataFile("digitdata/validationimages", numValidation,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numValidation)
            rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
            trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)


        # Features are extracted once up front; each percentage round only
        # re-samples indices into these precomputed lists.
        print ("Extracting features...")
        trainingData = []
        validationData = []
        testData = []
        for datum1 in rawTestData:
            k = featureFunction(datum1)
            testData.append(k)
        for datum1 in rawValidationData:
            k = featureFunction(datum1)
            validationData.append(k)
        for datum1 in rawTrainingData:
            k = featureFunction(datum1)
            trainingData.append(k)
        for percent in range(1,11):
            accuracy = []
            times = []
            print("\n")
            for runCount in range(0,5):
                # Extract features
                print("======================================\n")
                print ("("+str(runCount+1)+")" +  " Extracting random " + str((percent * 10)) + "% of the training data...")
                numSubTraining = int((percent / 10.0) * totalTrainData)
                # Sample without replacement from the full training set.
                indexes = random.sample(range(0, totalTrainData), numSubTraining)
                subTrainingData = []
                subTrainingLabels = []

                for indx in indexes:
                    subTrainingData.append(trainingData[indx])
                    subTrainingLabels.append(trainingLabels[indx])

                # Conduct training and testing
                
                start = time.time()
                print ("("+str(runCount + 1)+")", "Training on", numSubTraining, "data points...")
                classifier.train(subTrainingData, subTrainingLabels, validationData, validationLabels)
                end = time.time()
                elapsed = end - start
                print ("("+str(runCount + 1)+")" + " Training completed in %0.4f second(s)" % elapsed)
                times.append(elapsed)
                
                # Validation
                print ("("+str(runCount+1)+")", "Validating...")
                guesses = classifier.classify(validationData)
                correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
                print ("("+str(runCount + 1)+") " + str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
                
                # Testing
                print ("("+str(runCount+1)+")", "Testing...")
                guesses = classifier.classify(testData)
                correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
                print ("("+str(runCount + 1)+") " + str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)) + "\n")
                accuracy.append(100.0 * correct / len(testLabels))

            averageAccuracy = 0
            avg_time = 0
            for q in range(0, len(accuracy)):
                averageAccuracy += accuracy[q]
                avg_time += times[q]
                
            averageAccuracy = averageAccuracy/len(accuracy)
            avg_time = avg_time/len(times)
            
            print("=================\n")
            print ("Average training time for", numSubTraining, "data points: %0.4f" % avg_time)
            print ("Average accuracy of " + str(percent * 10) + ("% data training: "), str(averageAccuracy))
            
            # Sample standard deviation (n-1 denominator) over the 5 runs.
            std_dev = 0
            for a in accuracy:
                temp = a - averageAccuracy
                std_dev += (temp*temp)
            std_dev = std_dev / (len(accuracy) - 1)
            std_dev = math.sqrt(std_dev)
            print ("Standard deviation of accuracy: %0.4f" % std_dev)
            print

        # NOTE(review): exits with status 1 even though the sweep succeeded —
        # consider sys.exit(0).
        sys.exit(1)
    else:
        numTraining = options.training
        numTest = options.test
        classifier.extra = True

        if(options.data=="faces"):
          rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
          trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
          rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
          validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
          rawTestData = samples.loadDataFile("facedata/facedatatest", numTest,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
          testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
        else:
          rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
          trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
          rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
          validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
          rawTestData = samples.loadDataFile("digitdata/testimages", numTest,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
          testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)
        
        # Extract features
        print ("Extracting features...")
        trainingData = []
        validationData = []
        testData = []
        for datum1 in rawTestData:
            k = featureFunction(datum1)
            testData.append(k)
        # trainingData = map(featureFunction, rawValidationData)
        for datum1 in rawValidationData:
            k = featureFunction(datum1)
            validationData.append(k)
        for datum1 in rawTrainingData:
            k = featureFunction(datum1)
            trainingData.append(k)

        # Conduct training and testing
        print ("Training...")
        classifier.train(trainingData, trainingLabels, validationData, validationLabels)
        print ("Validating...")
        guesses = classifier.classify(validationData)
        correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
        print (str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
        print ("Testing...")
        guesses = classifier.classify(testData)
        correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
        print (str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))

        analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

        # HighOddsFeatures
        # NOTE(review): bitwise '&' used as logical AND — works for booleans
        # but does not short-circuit.
        if((options.odds) & (options.classifier == "naiveBayes" or (options.classifier == "nb")) ):
            label1, label2 = options.label1, options.label2
            featOdds = classifier.findHighOddsFeatures(label1,label2)
            if(options.classifier == "naiveBayes" or options.classifier == "nb"):
                feats = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
            else:
                feats = "=== Features for which weight (label %d) - weight (label %d) is largest ===" % (label1, label2)

            print (feats)
            printImage(featOdds)

        if((options.weights) & (options.classifier == "perceptron")):
            for l in classifier.legalLabels:
                featWeights = classifier.findHighWeightFeatures(l)
                print ("=== Features with high weight for label %d ===" % l)
                printImage(featWeights)
示例#41
0
def runClassifier(args, options):
    """Train on a random 10% subsample of the training data, then validate
    and test, reporting accuracies and training time.

    Loads face or digit data per ``options.data``, extracts features from
    a random 10% of the training examples (and all validation/test
    examples), trains the classifier, prints validation/test accuracy,
    runs analysis, and optionally prints odds-ratio or high-weight
    features.

    args: dict providing 'featureFunction', 'classifier', 'printImage'.
    options: parsed command-line options (data, training, test,
        classifier, odds, weights, label1, label2).
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if (options.data == "faces"):
        rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining, FACE_DATUM_WIDTH,
                                               FACE_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
        rawValidationData = samples.loadDataFile("facedata/facedatatrain", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("facedata/facedatatrainlabels", numTest)
        rawTestData = samples.loadDataFile("facedata/facedatatest", numTest, FACE_DATUM_WIDTH, FACE_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("facedata/facedatatestlabels", numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH,
                                               DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH,
                                                 DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features
    print("Extracting features...")

    # Randomly choose 10% of the training set, without replacement; indices
    # are kept sorted so the sample preserves the original ordering.
    percentage = 0.1
    rand_sample = sorted(random.sample(range(numTraining), int(numTraining * percentage)))
    sample_rawTrainingData = [rawTrainingData[i] for i in rand_sample]
    sample_trainingLabels = [trainingLabels[i] for i in rand_sample]
    trainingData = list(map(featureFunction, sample_rawTrainingData))
    validationData = list(map(featureFunction, rawValidationData))
    testData = list(map(featureFunction, rawTestData))

    # Conduct training and testing
    print("Training...")
    start = timeit.default_timer()
    classifier.train(trainingData, sample_trainingLabels, validationData, validationLabels)
    stop = timeit.default_timer()
    print("training time is: ", stop - start)
    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print(str(correct),
          ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
    print("Testing...")
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # do odds ratio computation if specified at command line
    # Fixed: logical 'and' instead of bitwise '&' so the check short-circuits
    # and a None/int flag cannot raise TypeError.
    if options.odds and (options.classifier == "naiveBayes" or options.classifier == "nb"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        if (options.classifier == "naiveBayes" or options.classifier == "nb"):
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            # Unreachable given the outer condition; kept for parity with the
            # original control flow.
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)

        print(string3)
        printImage(features_odds)

    if options.weights and options.classifier == "perceptron":
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
def runClassifier(args, options):
    """Train, validate, and test a classifier on digit or Pacman data.

    Args:
        args: dict with keys 'featureFunction' (datum -> feature Counter),
            'classifier' (object with train/classify), 'printImage'
            (feature-list pretty-printer), and, for pacman data, optional
            'agentToClone' / 'trainingData' / 'validationData' overrides.
        options: parsed command-line options; reads .training, .test, .data,
            .classifier, .odds, .label1, .label2, .weights.

    Side effects: prints progress and accuracy to stdout, calls analysis(),
    and optionally prints odds-ratio / high-weight feature images.
    """
    featureFunction = args['featureFunction']
    classifier = args['classifier']
    printImage = args['printImage']

    # Load data
    numTraining = options.training
    numTest = options.test

    if options.data == "pacman":
        # Resolve saved-game paths: the per-agent mapping first, then explicit
        # args overrides, then the ContestAgent defaults as a last resort.
        agentToClone = args.get('agentToClone', None)
        trainingData, validationData, testData = MAP_AGENT_TO_PATH_OF_SAVED_GAMES.get(agentToClone, (None, None, None))
        trainingData = trainingData or args.get('trainingData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][0]
        validationData = validationData or args.get('validationData', False) or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][1]
        testData = testData or MAP_AGENT_TO_PATH_OF_SAVED_GAMES['ContestAgent'][2]
        rawTrainingData, trainingLabels = samples.loadPacmanData(trainingData, numTraining)
        rawValidationData, validationLabels = samples.loadPacmanData(validationData, numTest)
        rawTestData, testLabels = samples.loadPacmanData(testData, numTest)
    else:
        rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
        rawValidationData = samples.loadDataFile("digitdata/validationimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        validationLabels = samples.loadLabelsFile("digitdata/validationlabels", numTest)
        rawTestData = samples.loadDataFile("digitdata/testimages", numTest, DIGIT_DATUM_WIDTH, DIGIT_DATUM_HEIGHT)
        testLabels = samples.loadLabelsFile("digitdata/testlabels", numTest)

    # Extract features.  list(map(...)) (not a bare map) keeps the Python 2
    # list semantics: classifier.train may iterate the data more than once.
    print("Extracting features...")
    trainingData = list(map(featureFunction, rawTrainingData))
    validationData = list(map(featureFunction, rawValidationData))
    testData = list(map(featureFunction, rawTestData))

    # Conduct training and testing
    print("Training...")
    classifier.train(trainingData, trainingLabels, validationData, validationLabels)
    print("Validating...")
    guesses = classifier.classify(validationData)
    correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
    print(str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels)))
    print("Testing...")
    guesses = classifier.classify(testData)
    correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
    print(str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels)))
    analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)

    # Odds-ratio computation if requested at the command line.
    # Logical `and` replaces the original bitwise `&` on booleans.
    if options.odds and (options.classifier == "naiveBayes" or options.classifier == "nb"):
        label1, label2 = options.label1, options.label2
        features_odds = classifier.findHighOddsFeatures(label1, label2)
        # NOTE(review): the else branch is unreachable given the guard above;
        # kept to mirror the sibling implementations in this file.
        if options.classifier == "naiveBayes" or options.classifier == "nb":
            string3 = "=== Features with highest odd ratio of label %d over label %d ===" % (label1, label2)
        else:
            string3 = "=== Features for which weight(label %d)-weight(label %d) is biggest ===" % (label1, label2)

        print(string3)
        printImage(features_odds)

    if options.weights and options.classifier == "perceptron":
        for l in classifier.legalLabels:
            features_weights = classifier.findHighWeightFeatures(l)
            print("=== Features with high weight for label %d ===" % l)
            printImage(features_weights)
# 示例#43 (Example #43)
def runClassifier(args, options):

  featureFunction = args['featureFunction']
  classifier = args['classifier']
  printImage = args['printImage']
      
  # Load data  
  numTraining = options.training

  # Extract features
  print "Extracting features..."
  if options.data=="faces":
    rawTrainingData = samples.loadDataFile("facedata/facedatatrain", numTraining,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    trainingLabels  = samples.loadLabelsFile("facedata/facedatatrainlabels", numTraining)
    rawValidationData = samples.loadDataFile("facedata/facedatavalidation", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    validationLabels  = samples.loadLabelsFile("facedata/facedatavalidationlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("facedata/facedatatest", TEST_SET_SIZE,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)
    testLabels  = samples.loadLabelsFile("facedata/facedatatestlabels", TEST_SET_SIZE)
  else:
    rawTrainingData = samples.loadDataFile("digitdata/trainingimages", numTraining,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    trainingLabels = samples.loadLabelsFile("digitdata/traininglabels", numTraining)
    rawValidationData = samples.loadDataFile("digitdata/validationimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    validationLabels = samples.loadLabelsFile("digitdata/validationlabels", TEST_SET_SIZE)
    rawTestData = samples.loadDataFile("digitdata/testimages", TEST_SET_SIZE,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)
    testLabels = samples.loadLabelsFile("digitdata/testlabels", TEST_SET_SIZE)

  if options.classifier == "GDA" or options.classifier == "LR":
    import os.path
    if os.path.isfile(options.data + '_' + str(numTraining) + '_pca.np'):
      f = open(options.data + '_' + str(numTraining) + '_pca.np', 'rb')
      principleComponents, trainingData, validationData, testData = cPickle.load(f) 
      f.close()
    else:
      if options.data == "faces":
        dimension = 13
        principleComponents = getPrincipleComponents(map(featureFunction, samples.loadDataFile("facedata/facedatatrain",451,FACE_DATUM_WIDTH,FACE_DATUM_HEIGHT)), dimension)
        trainingData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTrainingData)), principleComponents)
        validationData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawValidationData)), principleComponents)
        testData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)), principleComponents)
      else:
        dimension = 13
        principleComponents = getPrincipleComponents(map(featureFunction, samples.loadDataFile("digitdata/trainingimages",5000,DIGIT_DATUM_WIDTH,DIGIT_DATUM_HEIGHT)), dimension)
        trainingData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTrainingData)), principleComponents)
        validationData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawValidationData)), principleComponents)
        testData = np.dot(basicFeatureDataToNumpyArray(map(featureFunction, rawTestData)), principleComponents)
      f = open(options.data + '_' + str(numTraining) + '_pca.np', 'wb')
      cPickle.dump((principleComponents, trainingData, validationData, testData), f)
      f.close()
  elif options.classifier == "GPC":
    trainingData = basicFeatureDataToNumpyArray(map(featureFunction, rawTrainingData))
    validationData = basicFeatureDataToNumpyArray(map(featureFunction, rawValidationData))
    testData = basicFeatureDataToNumpyArray(map(featureFunction, rawTestData))
  else:
    trainingData = map(featureFunction, rawTrainingData)
    validationData = map(featureFunction, rawValidationData)
    testData = map(featureFunction, rawTestData)

  # Conduct training and testing
  print "Training..."
  classifier.train(trainingData, trainingLabels, validationData, validationLabels)
  print "Validating..."
  guesses = classifier.classify(validationData)
  correct = [guesses[i] == validationLabels[i] for i in range(len(validationLabels))].count(True)
  print str(correct), ("correct out of " + str(len(validationLabels)) + " (%.1f%%).") % (100.0 * correct / len(validationLabels))
  print "Testing..."
  guesses = classifier.classify(testData)
  correct = [guesses[i] == testLabels[i] for i in range(len(testLabels))].count(True)
  print str(correct), ("correct out of " + str(len(testLabels)) + " (%.1f%%).") % (100.0 * correct / len(testLabels))
  analysis(classifier, guesses, testLabels, testData, rawTestData, printImage)