def classify(self, data):
    """
    Classify each datum by choosing the label whose prototype (weight)
    vector produces the highest activation, i.e. the largest dot product
    with the datum's feature values.

    Keyword Arguments:
    data -- the test data to classify
    """
    guesses = []
    progressBar = ProgressBar(100, len(data), "Classifying Data")
    for idx, datum in enumerate(data):
        progressBar.update(idx)
        scores = util.Counter()
        # Activation for a label = dot(weights[label], datum features).
        for label in self.legalLabels:
            scores[label] = sum(
                self.weights[label][feat] * val
                for feat, val in datum.items())
        # Predicted label is the argmax over activations.
        guesses.append(scores.argMax())
    progressBar.clear()
    return guesses
def trainingHelper(self, trainingData, trainingLabels, iterations):
    """
    Classify training data using the perceptron weights and update the
    weights whenever the perceptron's guess is wrong.

    Keyword Arguments:
    trainingData -- training data for the perceptron
    trainingLabels -- labels for the associated training data
    iterations -- desired number of iterations over the training dataset
    """
    for i in range(iterations):
        progressBar = ProgressBar(100, len(trainingData),
                                  "Learning Weights, Iteration {0} of {1}"
                                  .format(i + 1, iterations))
        for j in range(len(trainingData)):
            progressBar.update(j)
            datum = trainingData[j]
            values = util.Counter()
            # Compute each label's activation as the dot product of its
            # weight vector with the datum's feature values.
            for label in self.legalLabels:
                activation = 0
                for key, value in datum.items():
                    activation += self.weights[label][key] * value
                values[label] = activation
            # Fix: hoist argMax() out of the update loop — the original
            # recomputed values.argMax() once per feature even though
            # `values` never changes during the update.
            guess = values.argMax()
            correctLabel = trainingLabels[j]
            if guess != correctLabel:
                # Standard perceptron update: reinforce the true label's
                # weights and penalize the mistaken guess's weights.
                for key, value in datum.items():
                    self.weights[correctLabel][key] += value
                    self.weights[guess][key] -= value
        progressBar.clear()
def classify(self, testData):
    """
    Classify each data item in the input by finding the best prototype
    vector (the label with the highest log joint probability).

    Log posteriors are stored on self.posteriors for later data analysis.

    Keyword Arguments:
    testData -- the test data to classify
    """
    guesses = []
    self.posteriors = []  # Log posteriors are stored for later data analysis.
    # Fix: removed unused locals `counter` and `size` from the original.
    progressBar = ProgressBar(100, len(testData), "Classifying Data")
    for index, datum in enumerate(testData):
        progressBar.update(index)
        posterior = self.calculateLogJointProbabilities(datum)
        guesses.append(posterior.argMax())
        self.posteriors.append(posterior)
    progressBar.clear()
    return guesses
def defineFeatures(imageList, chop):
    """
    Build a simple binary feature per pixel: 0 if the pixel value is 0
    (white), 1 otherwise.  `chop` pixels are trimmed from every edge of
    the image for faster (but less accurate) classification.

    Returns a pair (featureList, features): one Counter per image keyed
    by (x, y) position, and the list of (x, y) feature positions (taken
    from the first image's dimensions) for use in the perceptron and
    Naive Bayes classifiers.
    """
    featureList = []
    features = []
    progressBar = ProgressBar(100, len(imageList), "Getting Features for Images")
    for index, image in enumerate(imageList):
        progressBar.update(index)
        imgFeature = Counter()
        for x in range(chop, len(image) - chop):
            row = image[x]
            for y in range(chop, len(row) - chop):
                # On/off feature: 0 for a zero pixel, 1 for anything else.
                imgFeature[(x, y)] = 0 if row[y] == 0 else 1
        featureList.append(imgFeature)
    progressBar.clear()
    # Derive the shared list of feature keys from the first image's
    # dimensions (same ranges the loop above used).
    if imageList:
        first = imageList[0]
        for x in range(chop, len(first) - chop):
            for y in range(chop, len(first[x]) - chop):
                features.append((x, y))
    return featureList, features
def trainAndTune(self, trainingData, trainingLabels, validationData, validationLabels, kgrid): """ trainAndTune() trains the classifier by collecting counts over the training data and choosing the smoothing parameter among the choices in kgrid by using the validation data. This method should store the right parameters as a side-effect and should return the best smoothing parameters. Keyword Arguments: trainingData -- training data for the perceptron trainingLabels -- labels for the associated training data validationData -- validation data for the perceptron tuning function validationLabels -- labels for the associated validation data kgrid -- a list of possible k values to try for smoothing """ # We begin by creating the prior probabilities for each of the labels # and the features based on the counts in the training data. countLabel = util.Counter() # in form k = label, v = numOfL countFeature = util.Counter() # We begin looking over the training data here. progressBar = ProgressBar(100, len(trainingData), "Counting Data") for i in range(len(trainingData)): # update our progress bar progressBar.update(i) label = trainingLabels[i] # Labels are counted at each point they are seen here. countLabel[label] += 1 # Then, if we haven't seen the label, we add it to the feature counter. if label not in countFeature: countFeature[label] = util.Counter() # Finally, we loop over the features for each datum and add each feature once # for each occurrence. for feature in trainingData[i]: countFeature[label][feature] += trainingData[i][feature] progressBar.clear() self.probLabel = copy.deepcopy(countLabel) self.probLabel.normalize() # At this point we have the counts, and we want to see what level of smoothing # increases our accuracy the most over the training set. Essentially, we just # create all of the probabilities from the feature counts while adding the smoothing # and classify the training data each time and pick whatever was most accurate. 
kClassifications = util.Counter() probForK = util.Counter() numCorrectK = util.Counter() print "Validation Accuracy" print "===================" for k in kgrid: # make counter for probabilities for each k probForK[k] = util.Counter() # make counters for probabilities for each label for label in self.legalLabels: probForK[k][label] = util.Counter() # find probability of each feature given each label progressBar = ProgressBar( 100, len(self.features), "Getting Probabilities for Features, Label {0}".format( label)) for index, feature in enumerate(self.features): progressBar.update(index) if countFeature[label] != 0: probForK[k][label][feature] = float( countFeature[label][feature] + k) / (countLabel[label] + k * len(self.features)) progressBar.clear() # set probabilities for features and classify validation data self.probFeature = probForK[k] classificationLabels = self.classify(validationData) # check how much of the data was classified correctly correct = 0 for i in range(len(classificationLabels)): if classificationLabels[i] == validationLabels[i]: correct += 1 # print accuracy for each k print "k = {0}, number of correct classifications = {1}".format( k, correct) # store the number of correct classifications for k value numCorrectK[k] = correct # pick k from our list of possible k values self.k = None for k in numCorrectK: # find k with the highest number of correct classifications # if there is a tie, use a lower k value if (self.k == None or numCorrectK[self.k] < numCorrectK[k] or (numCorrectK[self.k] == numCorrectK[k] and k < self.k)): self.k = k self.probFeature = probForK[self.k] # print final choice for k print "K chosen = {0}".format(self.k) return self.k