Example No. 1
import random

import numpy as np

import naiveBayes

# addUnique, getTopN, and getTopNFromList are helpers defined elsewhere in this module.


def runClassification(trainingVocabList, fullData, fullClassVec):
    # split into training and test data; copy so the caller's lists are not mutated
    trainingData = list(fullData)
    trainingClassVec = list(fullClassVec)
    TESTINGDATASIZE = 10
    testingData = []
    actualTestingVec = []
    for index in range(TESTINGDATASIZE):
        # move a randomly chosen document from the training set to the test set
        i = random.randrange(len(trainingData))
        testingData.append(trainingData[i])
        actualTestingVec.append(trainingClassVec[i])
        del trainingData[i]
        del trainingClassVec[i]

    (pC0, pWGivenC0), (pC1, pWGivenC1) = naiveBayes.trainData(trainingVocabList, trainingData, trainingClassVec)

    topPC0 = []
    topPC1 = []
    for testData in testingData:
        testDataVector = np.array(naiveBayes.bagOfWordsToVector(trainingVocabList, testData))
        # per-word class scores; the +1 keeps zero entries from becoming log(0) downstream
        pC0GivenData = testDataVector * pWGivenC0 * pC0 + 1
        pC1GivenData = testDataVector * pWGivenC1 * pC1 + 1
        topPC0 = addUnique(topPC0, getTopN(trainingVocabList, pC0GivenData, 30))  # keep a UNIQUE list of the most frequent words
        topPC1 = addUnique(topPC1, getTopN(trainingVocabList, pC1GivenData, 30))  # keep a UNIQUE list of the most frequent words

    return getTopNFromList(topPC0, 30), getTopNFromList(topPC1, 30)
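
Neither example shows the helpers it calls. The sketch below is a rough, self-contained illustration of what bagOfWordsToVector and the top-N selection are presumably doing; the helper bodies are inferred from how they are used above, not taken from the real naiveBayes module.

import numpy as np

def bagOfWordsToVector(vocabList, document):
    # count how often each vocabulary word appears in the document
    vector = [0] * len(vocabList)
    for word in document:
        if word in vocabList:
            vector[vocabList.index(word)] += 1
    return vector

def getTopN(vocabList, scores, n):
    # return the n vocabulary words with the highest scores
    topIndices = np.argsort(scores)[-n:][::-1]
    return [vocabList[i] for i in topIndices]

vocab = ['spam', 'ham', 'offer', 'meeting']
doc = ['offer', 'offer', 'spam']
scores = np.array(bagOfWordsToVector(vocab, doc), dtype=float)
print(getTopN(vocab, scores, 2))   # -> ['offer', 'spam']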
Example No. 2
import random

import numpy as np

import naiveBayes

# DEBUG is a module-level flag defined elsewhere in this module.


def runClassification(trainingData, trainingClassVec):
    # split training and test data (this removes the test documents from the caller's lists)
    TESTINGDATASIZE = 10
    testingData = []
    actualTestingVec = []
    for index in range(TESTINGDATASIZE):
        # move a randomly chosen document from the training set to the test set
        i = random.randrange(len(trainingData))
        testingData.append(trainingData[i])
        actualTestingVec.append(trainingClassVec[i])
        del trainingData[i]
        del trainingClassVec[i]

    trainingVocabList = naiveBayes.createVocabList(trainingData)
    (pC0, pWGivenC0), (pC1, pWGivenC1) = naiveBayes.trainData(trainingVocabList, trainingData, trainingClassVec)

    predictedTestingVec = []
    for testData in testingData:
        testDataVector = np.array(naiveBayes.bagOfWordsToVector(trainingVocabList, testData))
        # per-word class scores; the +1 keeps zero entries from becoming log(0) below
        pC0GivenData = testDataVector * pWGivenC0 * pC0 + 1
        pC1GivenData = testDataVector * pWGivenC1 * pC1 + 1
        # compare in log space to avoid underflow from multiplying many small numbers
        if sum(np.log(pC0GivenData)) > sum(np.log(pC1GivenData)):
            predictedTestingVec.append(0)
        else:
            predictedTestingVec.append(1)

    # count the misclassified test documents
    error = 0
    misClassified = []
    for i, predicted in enumerate(predictedTestingVec):
        if actualTestingVec[i] != predicted:
            error += 1
            misClassified.append(testingData[i])

    if DEBUG:
        print(predictedTestingVec)
        print(actualTestingVec)
        print('num errors: %d' % error)
        print('misclassified:')
        print(misClassified)

    return float(error) / TESTINGDATASIZE
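
For reference, below is a rough, self-contained sketch of the pipeline Example No. 2 depends on (vocabulary building, Laplace-smoothed training, log-space comparison) on toy data. The trainData body is an assumption about what the unseen naiveBayes module does, written only to match the (pC, pWGivenC) tuples unpacked above, and the classify step uses the standard naive Bayes log score rather than the "+ 1" workaround used in the examples.

import numpy as np

def createVocabList(dataSet):
    # union of all words seen in the training documents
    vocab = set()
    for document in dataSet:
        vocab |= set(document)
    return list(vocab)

def trainData(vocabList, trainingData, classVec):
    # per-class word probabilities with Laplace smoothing, plus the class priors
    results = []
    for cls in (0, 1):
        docs = [doc for doc, label in zip(trainingData, classVec) if label == cls]
        counts = np.ones(len(vocabList))          # Laplace smoothing: start every count at 1
        for doc in docs:
            for word in doc:
                if word in vocabList:
                    counts[vocabList.index(word)] += 1
        pWGivenC = counts / counts.sum()
        pC = len(docs) / float(len(trainingData))
        results.append((pC, pWGivenC))
    return results[0], results[1]

def classify(docVector, pC0, pWGivenC0, pC1, pWGivenC1):
    # compare class scores in log space to avoid underflow
    score0 = np.sum(docVector * np.log(pWGivenC0)) + np.log(pC0)
    score1 = np.sum(docVector * np.log(pWGivenC1)) + np.log(pC1)
    return 0 if score0 > score1 else 1

posts = [['free', 'offer', 'now'], ['team', 'meeting', 'today'],
         ['free', 'prize', 'offer'], ['project', 'meeting', 'notes']]
labels = [1, 0, 1, 0]                             # 1 = spam-like, 0 = normal
vocab = createVocabList(posts)
(pC0, pWGivenC0), (pC1, pWGivenC1) = trainData(vocab, posts, labels)
testDoc = ['free', 'offer', 'meeting']
testVec = np.array([testDoc.count(word) for word in vocab])
print(classify(testVec, pC0, pWGivenC0, pC1, pWGivenC1))   # -> 1 (classified as spam-like)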