Пример #1
0
def spamTest():
    """Train and evaluate a naive Bayes spam classifier on the email corpus.

    Reads 25 spam and 25 ham messages from ``email/spam`` and ``email/ham``,
    holds out 10 randomly chosen messages as a test set, trains on the
    remaining 40, and prints each misclassified document index and the
    overall error rate.
    """
    docList = []    # parsed word list per document
    classList = []  # class label per document (1 = spam, 0 = ham)
    fullText = []   # all words across all documents

    # 1. Parse and load the text files.
    for i in range(1, 26):
        # Spam messages (label 1).
        with open('email/spam/%d.txt' % i, 'r') as spamfile:
            wordList = textParse(spamfile.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        # Ham messages (label 0).  Fix: the original leaked the file handle
        # via open(...).read(); use a context manager like the spam branch.
        with open('email/ham/%d.txt' % i, 'r') as hamfile:
            wordList = textParse(hamfile.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = naiveBayes.createVocabList(docList)  # deduplicated vocabulary

    # 2. Randomly split the 50 documents into a training set and a
    #    10-document test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # 3. Build the bag-of-words training matrix.
    trainMat = []      # bag-of-words vectors for the training documents
    trainClasses = []  # matching class labels
    for docIndex in trainingSet:
        trainMat.append(
            naiveBayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    # 4. Train the classifier, then score the held-out test set.
    p0V, p1V, pSpam = naiveBayes.trainNB0(np.array(trainMat),
                                          np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = naiveBayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if naiveBayes.classifyNB(np.array(wordVector), p0V, p1V,
                                 pSpam) != classList[docIndex]:
            errorCount += 1
            print("当前分类错误,第{:}个邮件分类错误".format(docIndex))
    print('错误率为: ', errorCount / len(testSet))
Пример #2
0
def runClassification(trainingData, trainingClassVec):
    """Train a naive Bayes classifier and return its test-set error rate.

    Mutates *trainingData* / *trainingClassVec* in place: 10 randomly chosen
    samples are removed and used as the held-out test set; the remainder is
    used for training.  Returns ``float(errors) / TESTINGDATASIZE``.
    """
    # Fix: the original re-ran `import random` on every loop iteration;
    # hoist it out of the loop.
    import random

    # Split off a random test set of fixed size.
    TESTINGDATASIZE = 10
    testingData = []
    actualTestingVec = []
    for _ in range(TESTINGDATASIZE):
        i = int(random.uniform(0, len(trainingData)))
        testingData.append(trainingData[i])
        actualTestingVec.append(trainingClassVec[i])
        del trainingData[i]
        del trainingClassVec[i]

    trainingVocabList = naiveBayes.createVocabList(trainingData)
    (pC0, pWGivenC0), (pC1, pWGivenC1) = naiveBayes.trainData(
        trainingVocabList, trainingData, trainingClassVec)

    # Classify each held-out document.  The `+ 1` keeps np.log away from
    # the zero entries of the masked probability vector.
    predictedTestingVec = []
    for testData in testingData:
        testDataVector = np.array(
            naiveBayes.bagOfWordsToVector(trainingVocabList, testData))
        pC0GivenData = testDataVector * pWGivenC0 * pC0 + 1
        pC1GivenData = testDataVector * pWGivenC1 * pC1 + 1
        if sum(np.log(pC0GivenData)) > sum(np.log(pC1GivenData)):
            predictedTestingVec.append(0)
        else:
            predictedTestingVec.append(1)

    # Count misclassifications (enumerate replaces the manual counter).
    error = 0
    misClassified = []
    for i, predicted in enumerate(predictedTestingVec):
        if actualTestingVec[i] != predicted:
            error += 1
            misClassified.append(testingData[i])

    if DEBUG:
        # print() form works on both Python 2 and 3 for these single-value
        # calls; the original used Python-2 print statements.
        print(predictedTestingVec)
        print(actualTestingVec)
        print('num errors: %d' % error)
        print('misclassified:')
        print(misClassified)

    return float(error) / TESTINGDATASIZE
def test(bag=False, prt=True):
    """Evaluate spam classification with a set-of-words or bag-of-words model.

    Loads 25 spam and 25 ham messages, holds out 10 random documents as a
    test set, fits ``BNBModel`` (set-of-words) or ``MModel`` (bag-of-words,
    when *bag* is True), and returns ``(errorRate, errorClassList)`` where
    *errorClassList* holds the true labels of the misclassified documents.
    """
    docList = []
    classList = []
    fullText = []
    # Load and parse the spam/ham corpora (label 1 = spam, 0 = ham).
    # Fix: use context managers instead of leaking handles via
    # open(...).read().
    for i in range(1, 26):
        with open('./data/email/spam/%d.txt' % i) as f:
            wordList = textParse(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        with open('./data/email/ham/%d.txt' % i) as f:
            wordList = textParse(f.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Build the deduplicated vocabulary.
    vocabList = createVocabList(docList)

    # Hold out 10 random documents for testing.
    # Fix: range(50) is not a list on Python 3 and does not support item
    # deletion; wrap it in list().
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    # Choose the vectorizer/model pair once; the two original branches were
    # otherwise identical.
    if bag:
        vectorize, model = bagOfwords2VecMN, MModel
    else:
        vectorize, model = setOfwords2Vec, BNBModel

    # Build the training matrix and fit the model.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(vectorize(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    model.fit(trainMat, trainClasses)

    # Score the held-out test set.
    errorCount = 0
    errorClassList = []
    for docIndex in testSet:
        # reshape(1, -1): the model predicts on a single-sample 2-D array.
        wordVector = np.array(vectorize(vocabList,
                                        docList[docIndex])).reshape(1, -1)
        if model.predict(wordVector) != classList[docIndex]:
            errorClassList.append(classList[docIndex])
            errorCount += 1
    errorRate = float(errorCount) / len(testSet)
    if prt:
        print('the error rate is {}.'.format(errorRate))
    return errorRate, errorClassList
Пример #4
0
    minlen = min(len(ny),len(sf))
    fullData = []
    fullClassVec = []
    allWords = []
    for index in range(0,minlen):
        words = parse(ny['entries'][index]['summary'])
        fullData.append(words)
        allWords.extend(words)
        fullClassVec.append(1) # 1 is ny
        words = parse(sf['entries'][index]['summary'])
        fullData.append(words)
        allWords.extend(words)
        fullClassVec.append(0) # 0 is sf

    # remove the most frequent words (combined in both cities).
    trainingVocabList = naiveBayes.createVocabList(fullData)
    trainingVocabList = removeNMostFrequentWords(trainingVocabList, allWords, 30)

    NUMRUNS = 2
    topPC0 = []
    topPC1 = []
    for index in range(0,NUMRUNS):
        tPC0, tPC1 = runClassification(trainingVocabList, list(fullData), list(fullClassVec))
        topPC0 += tPC0
        topPC1 += tPC1

    topPC0 = getTopNFromList(topPC0, 30)
    topPC1 = getTopNFromList(topPC1, 30)
    print "Most common words for New York:"
    print '\n'.join([x for (x,y) in topPC0])
    print "\nMost common words for SF:"
Пример #5
0
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 22 20:27:04 2018

@author: zhe

E-mail: [email protected]
"""

import naiveBayes

# Demo script: build a vocabulary from the toy post data set, vectorize the
# posts, train naive Bayes, and print the learned probabilities.
listOPosts, listClasses = naiveBayes.loadDataSet()
myVocabList = naiveBayes.createVocabList(listOPosts)

# Fix: corrected the misspelled label "Vocubulary" -> "Vocabulary".
print("Vocabulary:", myVocabList)

# Set-of-words vector for the first post, as a sanity check.
wordVec = naiveBayes.setOfWords2Vec(myVocabList, listOPosts[0])

print("test word vector:", wordVec)

# Convert every post into a set-of-words vector.
trainMat = []

for postinDoc in listOPosts:
    trainMat.append(naiveBayes.setOfWords2Vec(myVocabList, postinDoc))

# Train: per-class word-probability vectors and P(class = 1).
p0V, p1V, PAb = naiveBayes.trainNB0(trainMat, listClasses)

print("Probability vector for 0 classification:", p0V)
print("Probability vector for 1 classification:", p1V)
print("Probability of being 0 classification:", 1 - PAb)
# Fix: added the missing ':' so the label matches the three above.
print("Probability of being 1 classification:", PAb)