def testingNB():
    """Train a naive-Bayes classifier on the toy post data set and classify two sample entries.

    Fix: the original assigned the data to ``listPosts`` but then referenced
    ``listOPosts`` — a guaranteed NameError on first call.
    """
    listOPosts, listClasses = loadDataSet()  # toy posts + labels from elsewhere in this project
    vocabList = Bayes.createVocabList(listOPosts)
    trainMat = Bayes.words2Mat(vocabList, listOPosts)
    p0V, p1V, pAb = Bayes.trainNB(trainMat, np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'my', 'garbage']
    thisDoc = Bayes.setOfWords2Vec(vocabList, testEntry)
    print(testEntry, 'classified as: ', Bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def spamTest():
    """Hold-out validation of a naive-Bayes spam filter: 40 of 50 emails train, 10 test.

    Fixes: the spam directory was misspelled ``span``; ``trainingSet`` must be a
    list because Python 3's ``range`` object does not support ``del``; the
    trainer is ``trainNB0`` (digit zero) as in every sibling function, not
    ``trainNBO``.
    """
    docList = []     # per-email word lists
    classList = []   # 1 = spam, 0 = ham
    fullText = []    # flat list of every word seen
    for i in range(1, 26):
        wordList = textParse(open('testDemo/email/spam/%d.txt' % i).read())  # was 'span' — typo
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('testDemo/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)
    trainingSet = list(range(50))  # list() so del works under Python 3
    testSet = []
    for i in range(10):  # randomly hold out 10 document ids for testing
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # trainNB0 (digit zero) matches the helper every other test here calls.
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTest():
    """Load ham/spam email word lists, train naive Bayes, and spot-check 10 random held-out emails.

    Fix: the original Windows paths used plain string literals containing
    invalid escape sequences (``\\s``, ``\\h``, ``\\C``) — a SyntaxWarning
    today and a SyntaxError from Python 3.12.  Raw strings keep the path text
    byte-identical while being legal.
    """
    hamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\ham")
    hamclassList = [0] * len(hamemail)    # ham emails are class 0
    spamemail = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\spam")
    spamclassList = [1] * len(spamemail)  # spam emails are class 1
    Allemail = []
    Allemail.extend(hamemail)
    Allemail.extend(spamemail)
    AllList = []
    AllList.extend(hamclassList)
    AllList.extend(spamclassList)
    VocalbList = Bayes.createVocabList(Allemail)
    testMat = []
    realclass = []
    # Randomly move 10 emails (and their matching labels) out of the pool as
    # the test set; Allemail and AllList stay index-aligned because each is
    # deleted at the same randIndex.
    for i in range(10):
        randIndex = int(random.uniform(0, len(Allemail)))
        testMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[randIndex]))
        del Allemail[randIndex]
        realclass.append(AllList[randIndex])
        del AllList[randIndex]
    trainMat = []
    for i in range(len(Allemail)):  # everything left over is the training set
        trainMat.append(Bayes.bagOfWords2Vec(VocalbList, Allemail[i]))
    p0vect, p1vect, pA = Bayes.trainNB0(trainMat, AllList)
    for i in range(10):  # show predicted vs. actual class for each test email
        print("test_result=", Bayes.classifyNB(testMat[i], p0vect, p1vect, pA),
              ",real_result=", realclass[i])
def spamTestOfbag():
    """Bag-of-words naive-Bayes spam test: 40 random emails train, 10 held out for testing.

    Fix: ``trainingSet`` must be a list — ``del`` on Python 3's lazy ``range``
    object raises TypeError.
    """
    docList = []     # per-email word lists
    classList = []   # 1 = spam, 0 = ham
    fullText = []    # flat list of all words (extend() flattens each word list)
    for i in range(1, 26):  # 25 spam + 25 ham files
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)
    trainingSet = list(range(50))  # list() so del works under Python 3
    testSet = []
    for i in range(10):  # randomly hold out 10 document ids
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # word-count vectors + labels for the 40 training emails
        trainMat.append(Bayes.bagOfWord2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # measure the error rate on the held-out emails
        wordVector = Bayes.bagOfWord2VecMN(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTestOfvoc():
    """Set-of-words naive-Bayes spam test: 40 random emails train, 10 held out for testing.

    Fix: ``trainingSet`` must be a list — ``del`` on Python 3's lazy ``range``
    object raises TypeError.
    """
    docList = []     # per-email word lists
    classList = []   # 1 = spam, 0 = ham
    fullText = []    # flat list of all words seen
    for i in range(1, 26):  # 50 files total: 25 spam, 25 ham
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)      # one word-list sample per email
        fullText.extend(wordList)     # extend() flattens into individual words
        classList.append(1)           # spam/ samples are labelled 1
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)  # vocabulary of every word (feature) seen
    trainingSet = list(range(50))  # ids 0-49; list() so del works under Python 3
    testSet = []
    for i in range(10):  # hold out 10 samples for validation
        randIndex = int(random.uniform(0, len(trainingSet)))  # random index in [0, len)
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # remove the test id from the training set
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # collect training vectors and matching labels
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # evaluate and report the error rate
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def testingNB(): listOPosts, listClasses = loadDataSet() myVocabList = Bayes.createVocabList(listOPosts) trainMat = [] for postinDoc in listOPosts: #trainMat.append(Bayes.setOfWords2Vec(myVocabList, postinDoc)) trainMat.append(Bayes.bagOfWords2Vec(myVocabList, postinDoc)) p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClasses) testEntry = ['love', 'my', 'dalmation'] #thisDoc = array(Bayes.setOfWords2Vec(myVocabList, testEntry)) thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry)) print testEntry, 'classified as: ', Bayes.classifyNB( thisDoc, p0V, p1V, pAb) testEntry = ['stupid', 'garbage'] #thisDoc = array(Bayes.setOfWords2Vec(myVocabList, testEntry)) thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry)) print testEntry, 'classified as: ', Bayes.classifyNB( thisDoc, p0V, p1V, pAb)
def localWords(feed1, feed0): docList = [] #以二维数组形式存储所有样本的词汇表 classList = [] #存储所有样本的类别信息 fullText = [] #以一维数组形式存储所有样本的词汇表 minLen = min(len(feed1['entries']), len(feed0['entries'])) #获取两个RSS源的最小长度 for i in range(minLen): #解析feed1['entries'][i]['summary'],将长度大于2的单词提取出来,并全转换为小写 wordList = Bayes.textParse(feed1['entries'][i]['summary']) docList.append(wordList) #将该样本词汇添加到docList中 fullText.extend(wordList) #将该样本词汇追加到fullText中 classList.append(1) #将样本类别信息添加到classList wordList = Bayes.textParse(feed0['entries'][i]['summary']) docList.append(wordList) fullText.extend(wordList) classList.append(0) vocabList = Bayes.createVocabList(docList) #获取docList中所有不重复的单词列表 #由于语言中大部分都是冗余和结构辅助性内容,导致词汇表中一小部分单词却占据了所有文本用词的一大部分。需要去除冗余词汇。 #另一个常用的方法是不仅移除高频词,同时从某个预定词表中移除结构上的辅助词。该词表称为停用词表(stop word list)。 top30Words = calcMostFreq(vocabList, fullText) #获取在fullText中出现次数最多的30个词汇信息 for pairW in top30Words: #从词汇表vocabList中去除出现次数最多的30个单词 if pairW[0] in vocabList: vocabList.remove(pairW[0]) trainingSet = range(2 * minLen) #定义列表变量存储训练样本id print 'minLen : %d' % minLen if minLen < 20: print 'the len is too small.' 
testSet = [] #用于存储测试样本id for i in range(20): #从训练样本中随机获取20个样本信息作为测试样本集,并从训练样本中去除这些样本 randIndex = int(random.uniform(0, len(trainingSet))) testSet.append(trainingSet[randIndex]) del (trainingSet[randIndex]) trainMat = [] trainClasses = [] #从文本样本集中获取训练样本集,将相关文本样本的词汇出现次数信息存储到矩阵trainMat中,样本分类信息存储到trainClasses中 for docIndex in trainingSet: #获取样本docList[docIndex]在词汇表vocabList中各个单词出现次数情况 trainMat.append(Bayes.bagOfWords2Vec(vocabList, docList[docIndex])) #获取当前样本的分类信息classList[docIndex] trainClasses.append(classList[docIndex]) #通过贝叶斯分类器对训练样本进行学习 #获取两个类别各自单词的出现频率,以及样本集的概率 p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses)) errorCount = 0 #使用测试样本集对学习结果进行测试 for docIndex in testSet: #获取样本docList[docIndex]在词汇表vocabList中各个单词出现次数情况 wordVector = Bayes.bagOfWords2Vec(vocabList, docList[docIndex]) #对当前测试样本进行分类,判断是否与已知类型相同 if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]: errorCount += 1 print 'the error rate is: ', float(errorCount) / len(testSet) #打印出错误率 return vocabList, p0V, p1V #返回词汇表和各个词汇的出现概率
def spamTest():
    """Hold-out validation of a naive-Bayes spam filter: 40 of 50 emails train, 10 test.

    Fixes: ``fullText.extend(docList)`` (both branches) appended lists of
    lists instead of this email's words — should be ``wordList``;
    ``trainingSet`` must be a list since ``del`` on Python 3's ``range``
    object raises TypeError; the final print is made a function call to match
    the Python 3 usage (np.*) in the rest of the function.
    """
    docList = []     # per-email word lists
    classList = []   # 1 = spam, 0 = ham
    fullText = []    # flat list of every word seen
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)  # was extend(docList): nested lists, not words
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)  # same fix for the ham branch
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # Split ids 0..49 into 40 training ids and 10 randomly chosen test ids,
    # e.g. testSet = [36, 3, 40, 31, 10, 42, 7, 37, 15, 34] and trainingSet
    # keeps the remaining 40.
    trainingSet = list(range(50))  # list() so del works under Python 3
    testSet = []
    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTest():
    """Cross-validate a naive-Bayes spam filter on the 50 emails under email/spam and email/ham."""
    docList = []    # per-email word lists
    classList = []  # 1 = spam, 0 = ham
    fullText = []   # flat list of every word seen
    for i in range(1, 26):
        # Parse every text file under the spam and ham folders:
        # read the raw string and split it into a word list.
        wordList = Bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)   # add this text's word list to docList
        fullText.extend(wordList)  # append all of this text's words to fullText
        classList.append(1)        # record a class-1 (spam) label
        wordList = Bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)        # record a class-0 (ham) label
    vocabList = Bayes.createVocabList(docList)  # vocabulary of every word in docList
    trainingSet = range(50)  # ids 0-49, one per text file under spam/ and ham/
    testSet = []
    for i in range(10):  # draw 10 test-sample ids
        # random.uniform(x, y) returns a random real in [x, y);
        # here: a random index in [0, 50)
        randIndex = int(random.uniform(0, len(trainingSet)))
        print randIndex
        # move the chosen id from the training set into testSet
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    # Over the 40 remaining training samples, collect per-class word data.
    for docIndex in trainingSet:
        # presence/absence vector of this document's words over vocabList
        trainMat.append(Bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        # the document's class label
        trainClasses.append(classList[docIndex])
    # Per-class word probabilities plus the prior probability of class 1.
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    print classList
    errorCount = 0
    # Check the classifier against the 10 held-out samples.
    for docIndex in testSet:
        # presence/absence vector for the current test sample
        wordVector = Bayes.setOfWords2Vec(vocabList, docList[docIndex])
        # classify and compare with the known label
        if Bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)  # report the error rate
def localWord(feed0, feed1):
    """Train/test a naive-Bayes classifier on two RSS feeds (feed1 -> class 1, feed0 -> class 0).

    Returns (vocabList, p0V, p1V): the pruned vocabulary and per-class word
    probability vectors.

    Fixes: ``np.random.uniform(len(trainingSet))`` passed the length as the
    *low* bound, skewing the index draw — should be ``uniform(0, len)``;
    ``dataSet.append(randIndex)`` recorded raw positions that shift after each
    ``del``, so test ids could still appear in the training set — record the
    document id ``trainingSet[randIndex]`` before deleting; ``trainingSet``
    must be a list since ``del`` on Python 3's ``range`` raises TypeError.
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))  # shorter feed bounds the loop
    listOfPost = []  # per-sample word lists
    classVec = []    # sample labels
    fullText = []    # all words, flattened
    for i in range(minLen):
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])  # feed1 entries first
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(1)
        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(0)
    # NOTE(review): sibling code spells these createVocabList / calcMostFreq —
    # confirm the helper names in this module before renaming.
    vocabList = Bayes.creatVocabList(listOfPost)
    top30Words = calMostFreq(vocabList, fullText)
    for pairW in top30Words:  # prune the most frequent words from the vocabulary
        if pairW in vocabList:
            vocabList.remove(pairW)
    trainingSet = list(range(2 * minLen))  # list() so del works under Python 3
    dataSet = []  # held-out test document ids
    for i in range(20):
        randIndex = int(np.random.uniform(0, len(trainingSet)))  # was uniform(len): wrong bounds
        dataSet.append(trainingSet[randIndex])  # record the document id, not the shifting position
        del trainingSet[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainingSet:  # training vectors and labels
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex]))
        trainClass.append(classVec[docIndex])
    p0V, p1V, pSpam = Bayes.trainNB0(np.array(trainMat), trainClass)
    errorCount = 0.0
    for docIndex in dataSet:  # evaluate on the held-out samples
        dataMat = Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])
        if Bayes.classifyNB(np.array(dataMat), p0V, p1V, pSpam) != classVec[docIndex]:
            errorCount += 1
    # print "the error rate is :", errorCount / float(len(dataSet))
    return vocabList, p0V, p1V