Example #1
    def classify(self, data):
        """
        classify() classifies each data item in the input by finding the best prototype vector.

        Arguments:
        data -- the test data to classify
        """
        guesses = []
        progressBar = ProgressBar(100, len(data), "Classifying Data")
        for index, entry in enumerate(data):
            progressBar.update(index)
            values = util.Counter()

            # for each label, compute activation
            for label in self.legalLabels:
                activation = 0
                # sum over the weights * values to get activation
                for key, value in entry.items():
                    activation += self.weights[label][key] * value
                values[label] = activation

            # add classification guess for data by getting the argmax
            guesses.append(values.argMax())
        progressBar.clear()
        return guesses
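The loop above implements the multiclass perceptron decision rule: each label's score is the dot product of its weight vector with the datum's feature vector, and the predicted label is the argmax. Below is a minimal standalone sketch of the same rule, using plain dicts in place of the project's util.Counter; the predict helper is hypothetical, not part of the original code.

def predict(weights, features, labels):
    """Return the label whose weight vector scores highest on `features`."""
    best_label, best_score = None, float("-inf")
    for label in labels:
        score = sum(weights[label].get(key, 0) * value
                    for key, value in features.items())
        if score > best_score:
            best_label, best_score = label, score
    return best_label

weights = {0: {"a": 1.0, "b": -2.0}, 1: {"a": -1.0, "b": 3.0}}
print(predict(weights, {"a": 1, "b": 1}, [0, 1]))  # -> 1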
Example #2
    def trainingHelper(self, trainingData, trainingLabels, iterations):
        """
        trainingHelper() classifies training data using the perceptron weights
        and updates the perceptron weights if the perceptron is incorrect.

        Arguments:
        trainingData -- training data for the perceptron
        trainingLabels -- labels for the associated training data
        iterations -- desired number of iterations over the training dataset.
        """
        for i in range(iterations):
            progressBar = ProgressBar(100, len(trainingData), "Learning Weights, Iteration {0} of {1}"
                .format(i + 1, iterations))
            for j in range(len(trainingData)):
                progressBar.update(j)

                values = util.Counter()

                # Go over each label, and create the value from the training data and current vectors
                for label in self.legalLabels:
                    activation = 0
                    for key, value in trainingData[j].items():
                        activation += self.weights[label][key] * value
                    values[label] = activation

                # Update the weight vectors when we reach an incorrect
                # conclusion: add the features to the correct label's weights
                # and subtract them from the guessed label's.
                guess = values.argMax()
                if guess != trainingLabels[j]:
                    for key, value in trainingData[j].items():
                        self.weights[trainingLabels[j]][key] += value
                        self.weights[guess][key] -= value
            progressBar.clear()
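On a mistake, the code applies the standard multiclass perceptron update: the misclassified datum's features are added to the correct label's weight vector and subtracted from the guessed label's, pulling future scores toward the right answer. A self-contained sketch of one such update, again with plain dicts standing in for util.Counter:

def perceptron_update(weights, features, true_label, guessed_label):
    """Apply one multiclass perceptron update after a wrong guess."""
    if guessed_label == true_label:
        return  # correct guess: the weights are left unchanged
    for key, value in features.items():
        weights[true_label][key] = weights[true_label].get(key, 0) + value
        weights[guessed_label][key] = weights[guessed_label].get(key, 0) - value

weights = {0: {}, 1: {}}
perceptron_update(weights, {"a": 1, "b": 2}, true_label=1, guessed_label=0)
print(weights)  # {0: {'a': -1, 'b': -2}, 1: {'a': 1, 'b': 2}}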
Example #3
    def classify(self, testData):
        """
        classify() classifies each data item in the input by finding the best prototype vector.

        Keyword Arguments:
        testData -- the test data to classify
        """
        guesses = []
        self.posteriors = []  # Log posteriors are stored for later data analysis.
        progressBar = ProgressBar(100, len(testData), "Classifying Data")
        for index, datum in enumerate(testData):
            progressBar.update(index)
            posterior = self.calculateLogJointProbabilities(datum)
            guesses.append(posterior.argMax())
            self.posteriors.append(posterior)
        progressBar.clear()
        return guesses
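calculateLogJointProbabilities is defined elsewhere in the project; for a naive Bayes classifier it returns, per label y, log P(y) plus the sum of log P(F_i = f_i | y) over all features. A hedged sketch of that computation, assuming binary features and probability tables shaped like the probLabel / probFeature structures built in Example #5 (all probabilities assumed strictly between 0 and 1):

import math

def log_joint(datum, labels, prob_label, prob_feature):
    """Sketch: log P(y) + sum_i log P(F_i = f_i | y) for each label y."""
    scores = {}
    for y in labels:
        total = math.log(prob_label[y])
        for feature, value in datum.items():
            p = prob_feature[y][feature]  # P(F = 1 | y), assumed in (0, 1)
            total += math.log(p) if value == 1 else math.log(1.0 - p)
        scores[y] = total
    return scores

scores = log_joint({"a": 1, "b": 0}, [0, 1], {0: 0.5, 1: 0.5},
                   {0: {"a": 0.2, "b": 0.7}, 1: {"a": 0.9, "b": 0.4}})
print(max(scores, key=scores.get))  # -> 1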
Example #4
def defineFeatures(imageList, chop):
    """
    defineFeatures() defines a simple feature of a pixel either being white (0)
    or not (1) for a list of images and pixel values

    chops off pixels on outside of image for faster (but less accurate) classification
    """
    featureList = []
    features = []
    progressBar = ProgressBar(100, len(imageList),
                              "Getting Features for Images")
    for index, image in enumerate(imageList):
        # update progress
        progressBar.update(index)

        # create feature of on/off for (x, y) positions in image
        imgFeature = Counter()
        for x in range(chop, len(image) - chop):
            for y in range(chop, len(image[x]) - chop):
                if image[x][y] == 0:
                    imgFeature[(x, y)] = 0
                else:
                    imgFeature[(x, y)] = 1

        featureList.append(imgFeature)

    progressBar.clear()

    # Here, we create a list of all of the features for use in the
    # perceptron and Naive Bayes classifiers.
    if len(imageList) > 0:
        image = imageList[0]
        for x in range(chop, len(image) - chop):
            for y in range(chop, len(image[x]) - chop):
                features.append((x, y))

    return featureList, features
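A quick usage sketch with a single tiny synthetic image (a 2-D list of pixel values), assuming defineFeatures and its Counter / ProgressBar dependencies are importable; with chop=1 the border pixels are dropped and only the inner 2x2 window is featurized:

# Hypothetical usage; the 4x4 "image" below is made up for illustration.
images = [
    [[0, 0, 0, 0],
     [0, 5, 0, 0],
     [0, 0, 7, 0],
     [0, 0, 0, 0]],
]
featureList, features = defineFeatures(images, chop=1)
print(features)                # [(1, 1), (1, 2), (2, 1), (2, 2)]
print(featureList[0][(1, 1)])  # 1 (non-white pixel)
print(featureList[0][(1, 2)])  # 0 (white pixel)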
Example #5
    def trainAndTune(self, trainingData, trainingLabels, validationData,
                     validationLabels, kgrid):
        """
        trainAndTune() trains the classifier by collecting counts over the training data and choosing the smoothing parameter among the choices in kgrid by
        using the validation data. This method should store the right parameters
        as a side-effect and should return the best smoothing parameters.

        Keyword Arguments:
        trainingData -- training data for the perceptron
        trainingLabels -- labels for the associated training data
        validationData -- validation data for the perceptron tuning function
        validationLabels -- labels for the associated validation data
        kgrid -- a list of possible k values to try for smoothing
        """

        # We begin by creating the prior probabilities for each of the labels
        # and the features based on the counts in the training data.
        countLabel = util.Counter()  # maps each label to its occurrence count
        countFeature = util.Counter()  # maps each label to per-feature counts
        # We begin looking over the training data here.
        progressBar = ProgressBar(100, len(trainingData), "Counting Data")
        for i in range(len(trainingData)):
            # update our progress bar
            progressBar.update(i)

            label = trainingLabels[i]
            # Labels are counted at each point they are seen here.
            countLabel[label] += 1
            # Then, if we haven't seen the label, we add it to the feature counter.
            if label not in countFeature:
                countFeature[label] = util.Counter()
            # Finally, we loop over the features for each datum and add each feature once
            # for each occurrence.
            for feature in trainingData[i]:
                countFeature[label][feature] += trainingData[i][feature]
        progressBar.clear()

        self.probLabel = copy.deepcopy(countLabel)
        self.probLabel.normalize()

        # At this point we have the counts, and we want to see which level of
        # smoothing yields the best accuracy on the validation set. For each k
        # we build all of the feature probabilities with that smoothing,
        # classify the validation data, and keep whichever k is most accurate.
        probForK = util.Counter()
        numCorrectK = util.Counter()
        print("Validation Accuracy")
        print("===================")
        for k in kgrid:
            # make counter for probabilities for each k
            probForK[k] = util.Counter()
            # make counters for probabilities for each label
            for label in self.legalLabels:
                probForK[k][label] = util.Counter()
                # find probability of each feature given each label
                progressBar = ProgressBar(
                    100, len(self.features),
                    "Getting Probabilities for Features, Label {0}".format(
                        label))
                for index, feature in enumerate(self.features):
                    progressBar.update(index)
                    # countFeature[label] is a Counter for labels seen in
                    # training and 0 (the Counter default) otherwise; skipping
                    # unseen labels avoids a zero denominator when k == 0.
                    if countFeature[label] != 0:
                        # Laplace (add-k) smoothing of P(feature = 1 | label).
                        probForK[k][label][feature] = (
                            float(countFeature[label][feature] + k)
                            / (countLabel[label] + k * len(self.features)))
                progressBar.clear()

            # set probabilities for features and classify validation data
            self.probFeature = probForK[k]
            classificationLabels = self.classify(validationData)

            # check how much of the data was classified correctly
            correct = 0
            for i in range(len(classificationLabels)):
                if classificationLabels[i] == validationLabels[i]:
                    correct += 1

            # print accuracy for each k
            print "k = {0}, number of correct classifications = {1}".format(
                k, correct)
            # store the number of correct classifications for k value
            numCorrectK[k] = correct

        # pick k from our list of possible k values
        self.k = None
        for k in numCorrectK:
            # find k with the highest number of correct classifications
            # if there is a tie, use a lower k value
            if (self.k is None or numCorrectK[self.k] < numCorrectK[k]
                    or (numCorrectK[self.k] == numCorrectK[k] and k < self.k)):
                self.k = k
        self.probFeature = probForK[self.k]

        # print final choice for k
        print "K chosen = {0}".format(self.k)

        return self.k
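The probabilities above use Laplace (add-k) smoothing with the denominator count(label) + k * |features|, and the final loop picks the k with the most correct validation classifications, preferring the lower k on ties. A minimal standalone sketch of both pieces (the helper names are hypothetical, not from the original code):

def smoothed_prob(feature_count, label_count, k, num_features):
    """Laplace-smoothed P(feature = 1 | label), matching the code above."""
    return float(feature_count + k) / (label_count + k * num_features)

def choose_k(num_correct_by_k):
    """Pick the k with the most correct classifications; break ties low."""
    best = None
    for k, correct in num_correct_by_k.items():
        if (best is None or correct > num_correct_by_k[best]
                or (correct == num_correct_by_k[best] and k < best)):
            best = k
    return best

print(smoothed_prob(3, 10, k=0.5, num_features=4))        # -> 0.2916...
print(choose_k({0.001: 412, 0.1: 430, 1: 430, 10: 401}))  # -> 0.1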