def testingNB():
    """Train the naive Bayes model on the sample posts and classify two entries.

    Loads the posts and their class labels with ``loadDataSet()``, builds the
    vocabulary and the training matrix through the ``Bayes`` helpers, trains
    the classifier, and prints the predicted class of two hard-coded entries.

    Returns:
        None. The classification results are printed to standard output.
    """
    # The original bound the posts to ``listPosts`` but read ``listOPosts``,
    # which raised NameError on the very next line.
    listOPosts, listClasses = loadDataSet()
    vocabList = Bayes.createVocabList(listOPosts)
    trainMat = Bayes.words2Mat(vocabList, listOPosts)
    p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))

    testEntries = (
        ['love', 'my', 'dalmation'],
        ['stupid', 'my', 'garbage'],
    )
    for testEntry in testEntries:
        thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
        print(testEntry, 'classified as: ',
              Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
示例#2
0
def spamTest():
    """Hold-out evaluation of the naive Bayes spam classifier.

    Reads 25 numbered text files from each of the two e-mail directories
    (label 1 for the first directory, label 0 for ``ham``), builds a
    vocabulary, randomly holds out 10 documents as a test set, trains on the
    remaining documents and prints the error rate on the held-out ones.

    Returns:
        None. The error rate is printed to standard output.
    """
    # (path pattern, class label) in the order the documents are appended.
    # NOTE(review): 'span' looks like a typo for 'spam' -- confirm against the
    # actual directory layout before changing the path.
    sources = (
        ('testDemo/email/span/%d.txt', 1),
        ('testDemo/email/ham/%d.txt', 0),
    )
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in sources:
            # ``with`` closes the file handle that the original leaked.
            with open(pattern % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)

    vocabList = Bayes.createVocabList(docList)

    # A real list is required: ``range`` objects do not support item
    # deletion on Python 3.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Train once, after the split is complete. The original nested training
    # and evaluation inside the sampling loop, retraining on every iteration
    # and reporting error rates for partially built test sets.
    trainMat = [Bayes.setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    # NOTE(review): ``trainNBO`` ends in the letter O, not the digit 0 --
    # confirm this matches the name defined in the Bayes module.
    p0V, p1V, pSpam = Bayes.trainNBO(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        predicted = Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
示例#3
0
def spamTest():
    """Train on ham/spam e-mails and print predictions for 10 held-out ones.

    Loads the ham and spam e-mail collections with ``loadDataSet``, labels
    them 0 and 1 respectively, randomly removes 10 e-mails to use as test
    samples, trains the classifier on the rest, and prints the predicted and
    real class of every test sample.

    Returns:
        None. Results are printed to standard output.
    """
    # Raw strings: the Windows paths contain backslashes that are not valid
    # escape sequences, which newer Python versions warn about. The resulting
    # path values are unchanged.
    hamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\ham")
    spamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\spam")

    Allemail = list(hamemail) + list(spamemail)
    AllList = [0] * len(hamemail) + [1] * len(spamemail)
    VocalbList = Bayes.createVocabList(Allemail)

    # Move 10 random samples (and their labels) out of the training pool.
    # The two lists are popped at the same index so they stay aligned.
    testMat = []
    realclass = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(Allemail)))
        testMat.append(
            Bayes.bagOfWords2Vec(VocalbList, Allemail.pop(randIndex)))
        realclass.append(AllList.pop(randIndex))

    trainMat = [Bayes.bagOfWords2Vec(VocalbList, email) for email in Allemail]
    p0vect, p1vect, pA = Bayes.trainNB0(trainMat, AllList)

    for testVec, realLabel in zip(testMat, realclass):
        print("test_result=", Bayes.classifyNB(testVec, p0vect, p1vect, pA),
              ",real_result=", realLabel)
def spamTestOfbag():
    """Hold-out evaluation of the spam classifier using ``bagOfWord2VecMN``.

    Reads 25 spam and 25 ham text files, builds the vocabulary, randomly
    holds out 10 documents for testing, trains on the other 40 and prints
    the error rate measured on the held-out documents.

    Returns:
        None. The error rate is printed to standard output.
    """
    # (path pattern, class label): spam is class 1, ham is class 0.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in sources:
            # ``with`` closes the file handle that the original leaked.
            with open(pattern % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)

    vocabList = Bayes.createVocabList(docList)

    # Must be a list: ``range`` objects do not support ``del`` on Python 3.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [Bayes.bagOfWord2VecMN(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.bagOfWord2VecMN(vocabList, docList[docIndex])
        predicted = Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTestOfvoc():
    """Hold-out evaluation of the spam classifier using ``setOfWords2Vec``.

    Reads 50 files in total (25 spam, 25 ham), builds the vocabulary of all
    words, randomly holds out 10 documents for testing, trains on the other
    40 and prints the error rate measured on the held-out documents.

    Returns:
        None. The error rate is printed to standard output.
    """
    # (path pattern, class label): every sample under spam/ is class 1.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in sources:
            # ``with`` closes the file handle that the original leaked.
            with open(pattern % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)

    # Vocabulary built from every parsed document.
    vocabList = Bayes.createVocabList(docList)

    # Document ids 0-49. Must be a list: ``range`` objects do not support
    # ``del`` on Python 3.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # Random position in [0, len(trainingSet)); the chosen id moves from
        # the training set to the test set.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Training vectors and their matching class labels.
    trainMat = [Bayes.setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))

    # Classify the held-out documents and count the mistakes.
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        predicted = Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
示例#6
0
def testingNB():
    """Train on the sample posts and classify two hard-coded test entries.

    Documents are vectorised with ``Bayes.bagOfWords2Vec`` for both training
    and classification. The predicted class of each test entry is printed.

    Returns:
        None. Results are printed to standard output.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = Bayes.createVocabList(listOPosts)
    trainMat = [Bayes.bagOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClasses)

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
        predicted = Bayes.classifyNB(thisDoc, p0V, p1V, pAb)
        # A single pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function; the
        # original print statement was a SyntaxError on Python 3.
        print('%s classified as:  %s' % (testEntry, predicted))
示例#7
0
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    Summaries from ``feed1`` are labelled 1 and summaries from ``feed0`` are
    labelled 0. The 30 most frequent words (as reported by ``calcMostFreq``)
    are removed from the vocabulary, 20 random documents are held out for
    testing, the classifier is trained on the rest, and the error rate on
    the held-out documents is printed.

    Args:
        feed1: Feed object; ``feed1['entries'][i]['summary']`` supplies the
            text of class-1 samples.
        feed0: Feed object read the same way for class-0 samples.

    Returns:
        Tuple ``(vocabList, p0V, p1V)``: the pruned vocabulary and the two
        per-class word probability vectors returned by ``Bayes.trainNB0``.

    Raises:
        IndexError: If the feeds provide fewer than 20 documents in total,
            because the test-set sampling exhausts the document list.
    """
    docList = []    # one word list per sample
    classList = []  # class label of every sample
    fullText = []   # every word of every sample, flattened
    # Only as many entries as the shorter feed has are used from each feed.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = Bayes.textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = Bayes.createVocabList(docList)

    # A small share of the vocabulary accounts for most of the word
    # occurrences (redundant and structural words), so the most frequent
    # words are dropped. A common alternative is to remove words from a
    # predefined stop-word list instead.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # Must be a list: ``range`` objects do not support ``del`` on Python 3.
    trainingSet = list(range(2 * minLen))
    # Single pre-formatted arguments keep the printed text identical under
    # both the Python 2 print statement and the Python 3 print function.
    print('minLen : %d' % minLen)
    if minLen < 20:
        print('the len is too small.')

    # Move 20 random document ids from the training set to the test set.
    testSet = []
    for _ in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Word-count vectors and class labels of the training documents.
    trainMat = [Bayes.bagOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))

    # Classify every held-out document and count the mismatches.
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.bagOfWords2Vec(vocabList, docList[docIndex])
        predicted = Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V
示例#8
0
文件: TextParser.py 项目: baojiong/ml
def spamTest():
    """Hold-out evaluation of the naive Bayes spam classifier.

    Reads 25 spam and 25 ham text files, builds the vocabulary, randomly
    holds out 10 documents for testing, trains on the remaining 40 and
    prints the error rate measured on the held-out documents.

    Returns:
        None. The error rate is printed to standard output.
    """
    # (path pattern, class label): spam is class 1, ham is class 0.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    docList = []
    classList = []
    # The original also accumulated ``fullText`` via
    # ``fullText.extend(docList)``, which appended the whole nested document
    # list on every iteration and was never read; it has been removed.
    for i in range(1, 26):
        for pattern, label in sources:
            # ``with`` closes the file handle that the original leaked.
            with open(pattern % i) as handle:
                docList.append(textParse(handle.read()))
            classList.append(label)

    vocabList = bayes.createVocabList(docList)

    # Split the document ids 0..49 in two: 10 randomly drawn ids become the
    # test set and are removed from the training set, e.g.
    #   testSet     = [36, 3, 40, 31, 10, 42, 7, 37, 15, 34]
    #   trainingSet = the remaining 40 ids, still in ascending order
    # ``list(...)`` is required because ``range`` objects do not support
    # ``del`` on Python 3.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = [bayes.setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat),
                                     np.array(trainClasses))

    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        predicted = bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1

    # A single pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
示例#9
0
def spamTest():
    """Hold-out evaluation of the naive Bayes spam classifier.

    Parses the 25 text files under each of ``email/spam`` (class 1) and
    ``email/ham`` (class 0), builds the vocabulary, randomly holds out 10
    documents for testing, trains on the remaining 40 and prints the error
    rate. Each drawn random index and the full class list are also printed
    as diagnostics.

    Returns:
        None. Diagnostics and the error rate are printed to standard output.
    """
    # (path pattern, class label) in the order the documents are appended.
    sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in sources:
            # Read the file and parse its text into a word list; ``with``
            # closes the file handle that the original leaked.
            with open(pattern % i) as handle:
                docList.append(Bayes.textParse(handle.read()))
            classList.append(label)

    # Vocabulary of every word that appears in docList.
    vocabList = Bayes.createVocabList(docList)

    # Document ids 0-49, one per parsed file. Must be a list: ``range``
    # objects do not support ``del`` on Python 3.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        # random.uniform yields a real number between the bounds; truncating
        # it gives a position in the current training list.
        randIndex = int(random.uniform(0, len(trainingSet)))
        print(randIndex)
        # Move the chosen document id from the training set to the test set.
        testSet.append(trainingSet.pop(randIndex))

    # For the 40 training documents: presence vectors over the vocabulary
    # and the matching class labels.
    trainMat = [Bayes.setOfWords2Vec(vocabList, docList[docIndex])
                for docIndex in trainingSet]
    trainClasses = [classList[docIndex] for docIndex in trainingSet]
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    print(classList)

    # Classify the 10 test documents and count the wrong predictions.
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        predicted = Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1

    # A single pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
def localWord(feed0,feed1):
    """Train and evaluate a naive Bayes classifier on two parsed RSS feeds.

    Summaries from ``feed1`` are labelled 1 and summaries from ``feed0`` are
    labelled 0. The most frequent words reported by ``calMostFreq`` are
    removed from the vocabulary, 20 random documents are held out, the
    classifier is trained on the rest and the held-out documents are
    classified. The error count is computed but not reported.

    Args:
        feed0: Feed object; ``feed0['entries'][i]['summary']`` supplies the
            text of class-0 samples.
        feed1: Feed object read the same way for class-1 samples.

    Returns:
        Tuple ``(vocabList, p0V, p1V)``: the pruned vocabulary and the two
        per-class word probability vectors returned by ``Bayes.trainNB0``.
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    listOfPost = []
    classVec = []
    fullText = []
    for i in range(minLen):
        # feed1 is processed first, so even document ids are class 1.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = Bayes.textParse(feed['entries'][i]['summary'])
            listOfPost.append(wordList)
            fullText.extend(wordList)
            classVec.append(label)

    # NOTE(review): ``creatVocabList`` / ``calMostFreq`` are spelled without
    # the usual 'e' / 'c' -- confirm they match the helper definitions.
    vocabList = Bayes.creatVocabList(listOfPost)
    top30Words = calMostFreq(vocabList, fullText)

    for pairW in top30Words:
        # calMostFreq is not visible here: accept either bare words or
        # (word, count) pairs. Comparing a whole pair against the vocabulary
        # of words would never match, so nothing would be removed.
        word = pairW[0] if isinstance(pairW, (tuple, list)) else pairW
        if word in vocabList:
            vocabList.remove(word)

    # Must be a list: ``range`` objects do not support ``del`` on Python 3.
    trainingSet = list(range(2 * minLen))
    dataSet = []
    for _ in range(20):
        # Draw from [0, len). The original passed the length as the *low*
        # bound (high defaults to 1.0), so position 0 was never drawn and an
        # out-of-range position was possible.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # Store the document id, not its position in the shrinking list;
        # otherwise the test set points at the wrong documents.
        dataSet.append(trainingSet.pop(randIndex))

    trainMat = [Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])
                for docIndex in trainingSet]
    trainClass = [classVec[docIndex] for docIndex in trainingSet]
    p0V,p1V,pSpam = Bayes.trainNB0(np.array(trainMat),trainClass)

    # Evaluate on the held-out documents. The resulting count is not printed
    # or returned; reporting was already disabled in the original.
    errorCount = 0.0
    for docIndex in dataSet:
        dataMat = Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])
        predicted = Bayes.classifyNB(np.array(dataMat), p0V, p1V, pSpam)
        if predicted != classVec[docIndex]:
            errorCount += 1

    return vocabList,p0V,p1V