Exemplo n.º 1
0
def testingNB():
    """Train the naive Bayes classifier on the toy post data set and print
    the predicted class for two hand-picked word lists."""
    posts, labels = bayes.loadDataSet()
    # Vocabulary of every unique word seen across the posts.
    vocab = bayes.createVocabList(posts)
    # One set-of-words vector per post.
    trainMatrix = [bayes.setOfWordsToVector(vocab, doc) for doc in posts]
    p0, p1, pAbusive = bayes.trainNB(array(trainMatrix), array(labels))
    # Classify each sample entry and report the result.
    for sample in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        sampleVec = array(bayes.setOfWordsToVector(vocab, sample))
        print(sample, 'classified as: ',
              bayes.classifyNB(sampleVec, p0, p1, pAbusive))
Exemplo n.º 2
0
def testingNB():
    """Build the vocabulary from the sample posts, train with trainNB0, and
    print classifications for two test word lists."""
    posts, labels = bayes.loadDataSet()
    # List containing every distinct word in the corpus.
    vocab = bayes.createVocabList(posts)
    trainMat = [bayes.setOfWords2Vec(vocab, doc) for doc in posts]

    p0V, p1V, pAb = bayes.trainNB0(trainMat, labels)
    # Run both sample entries through the trained classifier.
    for entry in (['love', 'my', 'dalmation'], ['garbage', 'stupid']):
        entryVec = array(bayes.setOfWords2Vec(vocab, entry))
        print(entry, 'classified as: ',
              bayes.classifyNB(entryVec, p0V, p1V, pAb))
Exemplo n.º 3
0
def spamTest():
    """Train a naive Bayes spam filter on 25 spam / 25 ham emails and print
    the error rate on a random 10-email hold-out set.

    Side effects: reads email/spam/1..25.txt and email/ham/1..25.txt and
    prints the error rate. The hold-out split is random, so the reported
    rate varies between runs.
    """
    docList = []    # one token list per document
    classList = []  # parallel labels: 1 = spam, 0 = ham
    fullList = []   # flat list of every token (kept for interface parity)
    for i in range(1, 26):
        # FIX: the original leaked every file handle (open() without close)
        # and opened the spam files without an encoding, which raises
        # UnicodeDecodeError on the stray non-ASCII bytes present in some
        # of the bundled emails. Use 'with' and the same lenient decoding
        # the ham branch already used.
        with open('email/spam/%d.txt' % i, encoding='gb18030',
                  errors='ignore') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullList.extend(wordList)
        classList.append(1)  # spam
        with open('email/ham/%d.txt' % i, encoding='gb18030',
                  errors='ignore') as fh:
            wordList = textParse(fh.read())
        docList.append(wordList)
        fullList.extend(wordList)
        classList.append(0)  # ham
    # Vocabulary over all 50 documents.
    vocabList = bayes.createVocabList(docList)
    # Indices 0-49 into docList; 10 of them are moved to the test set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # Pick a random remaining index, move it from training to test.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    # Build the training matrix and its label list.
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    # Score the hold-out documents against their true labels.
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
Exemplo n.º 4
0
def localWords(feed0, feed1):
    """Train a naive Bayes classifier on two RSS feeds (feed1 entries are
    class 1, feed0 entries class 0), print the hold-out error rate, and
    return the vocabulary and both conditional probability vectors."""
    import feedparser
    import numpy as np
    docList = []
    classList = []
    fullText = []
    # Take the same number of entries from each feed.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # feed1 entry first (label 1), then feed0 entry (label 0),
        # matching the interleaved order of the original.
        for feed, label in ((feed1, 1), (feed0, 0)):
            tokens = bayes.textParse(feed['entries'][i]['summary'])
            docList.append(tokens)
            fullText.extend(tokens)
            classList.append(label)
    vocabList = bayes.createVocabList(docList)
    # Remove the 30 most frequent words — they behave like stop words.
    for pairW in calculateMostFrequentValues(vocabList, fullText):
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Hold out 20 random documents for testing.
    trainingset = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        randIndex = int(np.random.uniform(0, len(trainingset)))
        testSet.append(trainingset[randIndex])
        del trainingset[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingset:
        trainMat.append(bayes.bagOfWordsToVetor(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0v, p1v, pSpam = bayes.trainNB(np.asarray(trainMat),
                                    np.asarray(trainClasses))
    # Count misclassifications on the hold-out set.
    errorCount = 0
    for docIndex in testSet:
        bagVector = bayes.bagOfWordsToVetor(vocabList, docList[docIndex])
        predicted = bayes.classifyNB(np.asarray(bagVector), p0v, p1v, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0v, p1v