import bayes
import feedparser

listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
# print(myVocabList)
# print(bayes.setOfWords2Vec(myVocabList, listOPosts[0]))
# trainMat = []
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
# p0v, p1v, pAb = bayes.trainNB0(trainMat, listClasses)
# print(p0v)
# print(p1v)
# print(pAb)
# bayes.testingNB()
# bayes.spamTest()

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = bayes.localWords(ny, sf)
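# The scripts in this section all lean on bayes.localWords, whose source is
# not shown; note they unpack either three or four return values, so the
# local copies of bayes.py evidently differ. A minimal sketch of the function
# follows, assuming it matches the Machine Learning in Action listing and
# that textParse, createVocabList, calcMostFreq, bagOfWords2VecMN, trainNB0
# and classifyNB exist in bayes.py as in the book: pair up entries from both
# feeds, drop the most frequent words, train naive Bayes on a random split,
# and report the holdout error rate.
import random
from numpy import array
from bayes import (textParse, createVocabList, calcMostFreq,
                   bagOfWords2VecMN, trainNB0, classifyNB)

def localWords(feed1, feed0):
    docList, classList, fullText = [], [], []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList); fullText.extend(wordList); classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList); fullText.extend(wordList); classList.append(0)
    vocabList = createVocabList(docList)
    # remove the most frequent words, which are mostly stop words
    for pairW in calcMostFreq(vocabList, fullText):
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # hold out 20 random documents as the test set
    # (the book's fixed 20; small feeds would need fewer)
    trainingSet = list(range(2 * minLen)); testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = [bagOfWords2VecMN(vocabList, docList[idx]) for idx in trainingSet]
    trainClasses = [classList[idx] for idx in trainingSet]
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: %f' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V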
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import bayes

# listOPosts, listClasses = bayes.loadDataSet()
# myVocabList = bayes.createVocabList(listOPosts)
# print myVocabList
# print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
# trainMat = []
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
# p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
# print pAb
# print sum(p0V)
# bayes.testingNB()

# import re
# regEx = re.compile('\\W*')
# emailText = open('email/ham/6.txt').read()
# listOfTokens = regEx.split(emailText)
# print listOfTokens

# bayes.spamTest()

import feedparser
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = bayes.localWords(ny, sf)
bayes.getTopWords(ny, sf)
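# The commented-out regex experiment above splits a raw email on non-word
# characters; in the book that experiment becomes the textParse helper.
# A minimal sketch, assuming the helper matches the book's behavior:
# drop tokens of two characters or fewer and lowercase the rest.
import re

def textParse(bigString):
    # r'\W+' splits on runs of non-word characters and avoids the
    # empty-match warnings the '\W*' pattern above triggers
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]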
#!/usr/bin/python
# -*- coding:utf-8 -*-
'''
Created on Oct 31, 2015

@author: yanruibo

Test the NBC (naive Bayes classifier) on the feedparser data set.
Because each run randomly holds out 20 documents as the test set and
trains on the rest, run the test `count` times and average the accuracy.
'''
import feedparser
import bayes

if __name__ == '__main__':
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    errorRateSum = 0.0
    count = 200
    for i in range(count):
        vocabList, p0V, p1V, errorRate = bayes.localWords(ny, sf)
        errorRateSum += errorRate
    averageAccuracy = 1 - errorRateSum / count
    print "average accuracy is: ", averageAccuracy
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import feedparser
import bayes

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print 'ny download over'
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
print 'sf download over'

# The test set is drawn at random, so run several times to average out the variance
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
# if __name__ == "__main__":
#     bayes.testingNB()

# if __name__ == "__main__":
#     bayes.spamTest()

import feedparser
import bayes

# ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
# sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# print(ny)
# print(sf)
# print(len(ny['feed']))
# vocabList, p0Vec, p1Vec = bayes.localWords(ny, sf)

if __name__ == "__main__":
    # testingNB()
    # load the RSS sources
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    vocabList, p0Vec, p1Vec = bayes.localWords(ny, sf)
    print(vocabList)
# Train on the book's sample posts
from numpy import *
import bayes
import feedparser

postList, classVec = bayes.loadDataSet()
vocabList = bayes.createVocabList(postList)
mat = []
for i in postList:
    mat.append(bayes.setOfWords2Vec(vocabList, i))
p0, p1, pAbusive = bayes.trainNB0(mat, classVec)

# Test using RSS feed documents
sci_env = feedparser.parse(
    'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
edu = feedparser.parse('http://feeds.bbci.co.uk/news/education/rss.xml')
rate = 0.0
for i in range(10):
    vocabList, p0, p1, erate = bayes.localWords(sci_env, edu)
    rate += erate
print "error rate: %f" % (rate / 10)

# Get the most frequently occurring words in each class's documents
bayes.getTopWords(sci_env, edu)
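# getTopWords itself is not shown in any of these scripts. A minimal sketch,
# assuming it follows the book's listing: print every token whose log
# conditional probability clears a fixed threshold, most probable first.
# It assumes a three-value localWords as sketched earlier; the four-value
# copies used above would need the extra errorRate ignored.
from bayes import localWords

def getTopWords(feed1, feed0):
    vocabList, p0V, p1V = localWords(feed1, feed0)
    topFeed0, topFeed1 = [], []
    for i in range(len(p0V)):
        # -6.0 is the log-probability threshold used in the book
        if p0V[i] > -6.0:
            topFeed0.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0:
            topFeed1.append((vocabList[i], p1V[i]))
    print('----- class 0 -----')
    for word, _ in sorted(topFeed0, key=lambda pair: pair[1], reverse=True):
        print(word)
    print('----- class 1 -----')
    for word, _ in sorted(topFeed1, key=lambda pair: pair[1], reverse=True):
        print(word)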
print "\n 第一个过滤器例子: 恶意留言区分" bayes.testingNB() #第二个例子 垃圾邮件区分 print "\n 第二个过滤器例子: 垃圾邮件区分" bayes.spamTest() #第三个例子 个人广告中录取区域倾向 #书中的RSS不能读取到信息,相关参数:书中的例子RSS len=60,将20个作为测试样本,其余40个作为训练样本,去掉的是频数前30个词。 # 本程序中使用的例子len=20,将5个个作为测试样本,其余15个作为训练样本,去掉频数为个位数是效果最好,这里暂时取3。 print "\n 第三个例子:个人广告RSS中录取区域倾向" nasa = feedparser.parse( 'http://www.nasa.gov/rss/dyn/image_of_the_day.rss') #len=60 NASA 航天新闻 ft = feedparser.parse( 'http://www.ftchinese.com/rss/news') #len=20,FT中文网(正式官方新闻)政治 经济 全球新闻 #sf = feedparser.parse('http://sports.yahoo.com/nba/teams/hou/rss.xml') #len=6 #sf = feedparser.parse('http://rss.yule.sohu.vocabSetcom/rss/yuletoutiao.xml') #搜狐娱乐(娱乐新闻)有时候len=30 有时却异常 ''' print "第一个的长度是:",len(nasa['entries']) print "第一个的内容是:",nasa['entries'] print "第二个的长度是:",len(ft['entries']) print "第二个的内容是:",ft['entries'] print "运行第一次的结果:" ''' #将两个RSS中的数据用来训练和预测 bayes.localWords(nasa, ft) #程序中已经完成了所有的操作,包括预测错误率的计算。 print "\n运行第二次的结果:" bayes.getTopWords(nasa, ft)