示例#1
0
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two parsed feeds.

    The same number of entries (the shorter feed's entry count) is taken
    from each feed and tokenised with ``bayes.textParse``.  Every word
    reported by ``calcMostFreq`` is removed from the vocabulary, 20
    documents are held out at random as a test set, the classifier is
    trained on the remaining documents, and the error rate on the held-out
    documents is printed.

    Args:
        feed1: Mapping with an ``'entries'`` sequence whose items each have
            a ``'summary'`` text; its documents are labelled class 1.
        feed0: Same structure as ``feed1``; its documents are labelled
            class 0.

    Returns:
        A tuple ``(vocabList, pVectDict)``: the pruned vocabulary and the
        first of the two values returned by ``bayes.train``.

    Raises:
        ValueError: If the two feeds together yield fewer documents than
            the test set needs.  (Previously this surfaced as an opaque
            IndexError part-way through the sampling loop.)
    """
    numTest = 20  # number of documents held out for testing

    minLen = min(len(feed1['entries']), len(feed0['entries']))
    if 2 * minLen < numTest:
        raise ValueError(
            'need at least %d documents to draw the test set, got %d'
            % (numTest, 2 * minLen))

    docList = []
    classList = []
    fullText = []
    for i in range(minLen):
        # Interleave the feeds (feed1 first) so document 2*i is class 1
        # and document 2*i + 1 is class 0.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = bayes.textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)

    # Drop the words calcMostFreq reports (presumably the 30 most frequent
    # ones, going by the variable name); each item carries the word at
    # index 0.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # list(...) so the indices can be removed in place even where range()
    # does not return a list.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(numTest):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Train on every document that was not moved to the test set.
    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    pVectDict, pCateDict = bayes.train(array(trainMat), trainClasses)

    # Classify the held-out documents and count the misclassifications.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = bayes.classify(array(wordVector), pVectDict, pCateDict)
        if predicted != classList[docIndex]:
            errorCount += 1

    # Single-argument print(...) emits the same text as the former
    # two-item print statement and is also valid Python 3.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))

    return vocabList, pVectDict
示例#2
0
def spamTest():
    """Run a hold-out test of the naive Bayes classifier on the e-mail files.

    Reads ``email/spam/1.txt`` .. ``email/spam/25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``email/ham/25.txt`` (class 0), holds out 10 of
    the 50 documents at random, trains on the other 40, and prints every
    misclassified document followed by the overall error rate.

    Returns:
        None.  Results are reported on standard output only.
    """
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))

    docList = []
    classList = []
    for i in range(1, 26):
        # Spam first, then ham, so the documents stay interleaved by class.
        for pathTemplate, label in sources:
            # "with" closes the file as soon as it has been read; the
            # handles were previously left open.
            with open(pathTemplate % i) as handle:
                wordList = bayes.textParse(handle.read())
            docList.append(wordList)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)

    # list(...) so the indices can be removed in place even where range()
    # does not return a list.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Train on every document that was not moved to the test set.
    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    pVectDict, pCateDict = bayes.train(array(trainMat), trainClasses)

    # Classify the held-out documents and report each mistake.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = bayes.classify(array(wordVector), pVectDict, pCateDict)
        if predicted != classList[docIndex]:
            errorCount += 1
            # Single-argument print(...) emits the same text as the former
            # print statement and is also valid Python 3.
            print('classification error: %s' % (docList[docIndex],))
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))