Exemplo n.º 1
0
def tests():
    """Call bayes.testingNB() and then bayes.spamTest(), in that order."""
    # Each attribute is looked up immediately before it is called, so the
    # lookup/call sequence matches two plain consecutive call statements.
    for entry_point in ("testingNB", "spamTest"):
        getattr(bayes, entry_point)()
Exemplo n.º 2
0
# -*- coding: utf-8 -*-

import bayes
from numpy import *

# Filter spam e-mail: invoke the spam test exposed by the bayes module.
bayes.spamTest()


Exemplo n.º 3
0
print(flagLab)

#得到抽取特征后的文档,以及文档所属的类别
listOPosts,listClasses = bayes.loadDataSet()

#构建一个包含所有词的列表
myVocabList = bayes.createVocabList(listOPosts)

#词向量构成的列表
trainMat=[]
for postinDoc in listOPosts:
	trainMat.append( bayes.setOfWords2Vec(myVocabList,postinDoc) )
	
#得到两个类别的概率向量,和侮辱性文档的概率
p0V,p1V,pAb = bayes.trainNB0(trainMat,listClasses)

print(pAb)
print(p0V)
print(p1V)
'''
# Check whether message-board posts are abusive.
bayes.testingNB()

# Filter spam e-mail: add up the value returned by ten spam-test runs,
# starting the accumulation from 0.0.
errRaiosum = sum((bayes.spamTest() for _ in range(10)), 0.0)

# The printed label is the Chinese text for "the average error rate is:".
print('平均错误率为:' + str(errRaiosum / 10))
	
import bayes

# Run the spam test once and print whatever it returns.  The single-argument
# call form prints the same text under Python 2 and is valid under Python 3,
# whereas the bare `print expr` statement is a SyntaxError on Python 3.
print(bayes.spamTest())
Exemplo n.º 5
0
    # myVocabList = bayes.createVocabList(listOPosts)
    # print myVocabList
    #
    # print bayes.setOfWords2Vect(myVocabList, listOPosts[0])
    # print bayes.setOfWords2Vect(myVocabList, listOPosts[3])
    #
    # trainMat = map(lambda postinDoc: bayes.setOfWords2Vect(myVocabList, postinDoc), listOPosts)
    # print
    # for item in trainMat:
    # 	print item
    #
    # p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    #
    # print p0V
    # print p1V
    # print pAb
    #
    # testEntry = ['love', 'my', 'dalmation']
    # thisDoc = np.array(bayes.setOfWords2Vect(myVocabList, testEntry))
    # # print thisDoc
    # print testEntry, 'classified as:', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    # testEntry = ['stupid', 'garbage']
    # thisDoc = np.array(bayes.setOfWords2Vect(myVocabList, testEntry))
    # # print thisDoc
    # print testEntry, 'classified as:', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    result = 0
    N = 1.0
    for i in range(int(N)):
        result += bayes.spamTest()
    print(result / N)
# myVocabList=bayes.createVocabList(listOPosts)
# print(myVocabList)
# # print(len(myVocabList))
# # print(bayes.setOfWords2Vec(myVocabList,listOPosts[0]))
# # print(bayes.setOfWords2Vec(myVocabList,['time']))
#
# trainMat=[]
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc))
#
# print(trainMat)
# p0V,p1V,pAb=bayes.trainNB0(trainMat,listClasses)
# # print(p0C,p1V,pAb)
# testEntry=['love','my','dalmation']
# thisDoc=np.array(bayes.setOfWords2Vec(myVocabList,testEntry))
# print(testEntry,'classified as:',bayes.classifyNB(thisDoc,p0V,p1V,pAb))
# testEntry=['stupid','garbage']
# thisDoc=np.array(bayes.setOfWords2Vec(myVocabList,testEntry))
# print(testEntry,'classified as:',bayes.classifyNB(thisDoc,p0V,p1V,pAb))

##---------------------------- Filtering spam e-mail with naive Bayes ----------------------------------------
# Directory handed to bayes.spamTest() at the end of this section.
# NOTE(review): this is an absolute path on the original author's machine;
# it must be adjusted before the script can run anywhere else.  Judging by
# the commented-out experiment below, it presumably contains an
# email/ham/ sub-directory -- confirm against bayes.spamTest.
filepath = '/Users/songhaiyue/Desktop/B01_python/Machine_Learning_in_Action/machinelearninginaction/Ch04/'

# filename=filepath+'email/ham/6.txt'
# emailText=open(filename,encoding ='unicode_escape').read()
# # print(emailText)
# listOfTokens=re.split('\s',emailText)
# print(listOfTokens)

# Run the spam test against the directory configured above.
bayes.spamTest(filepath)
Exemplo n.º 7
0
import bayes


# Accumulates the value returned by bayes.spamTest() over repeated runs and
# prints the mean.
total = 0.0

# Re-execute the already imported module (`reload` is a builtin in Python 2,
# which this fragment targets).
reload(bayes)

# The run schedule is kept exactly as before: range(1, 20) yields 19 runs.
runs = range(1, 20)
for i in runs:
    total += bayes.spamTest()

# Divide by the number of runs actually performed.  The previous divisor was
# a hard-coded 100, which did not match the 19 iterations of the loop and so
# under-reported the average.
print(total / len(runs))
Exemplo n.º 8
0
    for docIndex in trainingSet:
        trainMat.append(setOfWord2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0

    #3.(以下四行)对测试集分类
    for docIndex in testSet:
        wordVector = sefOfWord2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
        print('the error rate is: ',float(errorCount)/len(testSet))

# Test: run the spam test twice, printing the value returned by each run.
for _ in range(2):
    print(bayes.spamTest())

import feedparser
# Fetch and parse the New York Craigslist RSS feed.  The hostname previously
# read "craiglist.org" (missing the "s"), which is not the Craigslist domain.
# NOTE(review): whether this feed path is still served should be confirmed.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')

# Bare expressions carried over from an interactive session: they look up the
# parsed entries and their count, but display nothing when run as a script.
ny['entries']
len(ny['entries'])

#RSS源分类器及高频词去除函数
def calcMostFreq(vocabList, fullText, topN=30):
    """Return the most frequent vocabulary tokens found in *fullText*.

    Args:
        vocabList: Iterable of hashable tokens to count.
        fullText: Object whose ``count(token)`` method reports how many times
            a token occurs (for example, a list of tokens).
        topN: Maximum number of ``(token, count)`` pairs to return; ``None``
            returns every pair.

    Returns:
        A list of ``(token, count)`` tuples ordered by descending count.
        Tokens with equal counts keep their relative order from *vocabList*
        on Python 3.7+, where dicts preserve insertion order.
    """
    import operator

    freqDict = {token: fullText.count(token) for token in vocabList}
    # `items()` works on both Python 2 and 3 (the former `iteritems()` is
    # Python 2 only), and the descending flag is spelled `reverse=True`; the
    # earlier misspelled positional argument was a syntax error.
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    # NOTE(review): the earlier body stopped right after sorting and returned
    # nothing.  Returning the leading `topN` pairs (default 30) is an
    # assumption about the intended contract -- confirm against the callers.
    return sortedFreq[:topN]
Exemplo n.º 9
0
import bayes
from numpy import *
import re

# Unpack the two values returned by bayes.loadDataSet() -- presumably the
# posts and their class labels, going by the names (confirm in bayes).
# Within this excerpt only the commented-out section 4.5 code reads them.
listOPosts, listClasses = bayes.loadDataSet()

# 4.5
# myVocabList = bayes.createVocabList(listOPosts)
# # print(myVocabList)
# word2vec = bayes.setOfWords2Vec(myVocabList,listOPosts[0])

# trainMat = []
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList,postinDoc))
# p0V,p1V,pAb = bayes.trainNB0(trainMat,listClasses)
# print(pAb)
# print(p0V)
# print(p1V)

# bayes.testingNB()

# 4.6
# mySent = ''
# mySent.split()
# regEx = re.compile('\\W*')
# listOfTokens = regEx.split(mySent)
# Add up the value returned by 100 spam-test runs (the sum starts from the
# integer 0), then report the mean.
error_rate = sum(bayes.spamTest() for _ in range(100))
print("average error rate is:", float(error_rate / 100))