import random

import numpy as np

import naiveBayes


def spamTest():
    """Automatically classify spam email with the naive Bayes algorithm."""
    docList = []     # list of documents, each a list of words
    classList = []   # class label of each document
    fullText = []
    # 1. Parse and load the text files
    for i in range(1, 26):
        # First the files labeled spam
        spamfilename = 'email/spam/%d.txt' % i
        spamfile = open(spamfilename, 'r')
        text = spamfile.read()
        wordList = textParse(text)
        spamfile.close()
        docList.append(wordList)    # add the word list to the document list
        fullText.extend(wordList)   # extend fullText with the word list
        classList.append(1)         # label 1 = spam
        # Then the normal (ham) email
        hamfilename = 'email/ham/%d.txt' % i
        hamtext = open(hamfilename, 'r').read()
        wordList = textParse(hamtext)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)         # label 0 = ham
    vocabList = naiveBayes.createVocabList(docList)  # duplicate-free vocabulary
    # 2. Randomly build the training and test sets
    trainingSet = list(range(50))   # there are 50 email samples in total
    testSet = []
    for i in range(10):
        # Pick 10 samples at random for the test set and remove them
        # from the training set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    # 3. Build the word-vector matrix
    trainMat = []       # training matrix of word-count vectors
    trainClasses = []   # class labels of the training documents
    for docIndex in trainingSet:
        # Add each document's bag-of-words vector to the training matrix
        trainMat.append(
            naiveBayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])  # and its class label
    # 4. Compute the conditional probability vectors and the spam prior
    p0V, p1V, pSpam = naiveBayes.trainNB0(np.array(trainMat),
                                          np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # Classify every document in the test set
        wordVector = naiveBayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if naiveBayes.classifyNB(np.array(wordVector), p0V, p1V,
                                 pSpam) != classList[docIndex]:
            errorCount += 1
            print("Misclassified: email number {:} was classified "
                  "incorrectly".format(docIndex))
    print('Error rate: ', errorCount / len(testSet))
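# For reference, a minimal sketch of the helpers spamTest() leans on, in the
# style of "Machine Learning in Action". These are assumptions inferred from
# the call sites above; the real textParse/createVocabList/bagOfWords2VecMN/
# classifyNB in the naiveBayes module may differ in detail.
import re

import numpy as np


def textParse(bigString):
    # Split on non-word characters, drop short tokens, lowercase the rest
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def createVocabList(dataSet):
    # Union of all words over all documents, with duplicates removed
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def bagOfWords2VecMN(vocabList, inputSet):
    # Bag-of-words: count how many times each vocabulary word occurs
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # p0Vec/p1Vec hold log conditional probabilities, so products become
    # sums; return the class with the larger log posterior
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0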
import random

import numpy as np

import naiveBayes

DEBUG = False   # set True for verbose output below


def runClassification(trainingData, trainingClassVec):
    # Split off a random test set, mutating the training lists in place
    TESTINGDATASIZE = 10
    testingData = []
    actualTestingVec = []
    for index in range(TESTINGDATASIZE):
        i = int(random.uniform(0, len(trainingData)))
        testingData.append(trainingData[i])
        actualTestingVec.append(trainingClassVec[i])
        del trainingData[i]
        del trainingClassVec[i]

    trainingVocabList = naiveBayes.createVocabList(trainingData)
    (pC0, pWGivenC0), (pC1, pWGivenC1) = naiveBayes.trainData(
        trainingVocabList, trainingData, trainingClassVec)

    predictedTestingVec = []
    for testData in testingData:
        testDataVector = np.array(
            naiveBayes.bagOfWordsToVector(trainingVocabList, testData))
        # The "+ 1" keeps absent words from contributing log(0) = -inf
        pC0GivenData = testDataVector * pWGivenC0 * pC0 + 1
        pC1GivenData = testDataVector * pWGivenC1 * pC1 + 1
        if sum(np.log(pC0GivenData)) > sum(np.log(pC1GivenData)):
            predictedTestingVec.append(0)
        else:
            predictedTestingVec.append(1)

    i = 0
    error = 0
    misClassified = []
    for predicted in predictedTestingVec:
        if actualTestingVec[i] != predicted:
            error += 1
            misClassified.append(testingData[i])
        i += 1

    if DEBUG:
        print(predictedTestingVec)
        print(actualTestingVec)
        print('num errors: %d' % error)
        print('misclassified:')
        print(misClassified)
    return float(error) / TESTINGDATASIZE
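# Hypothetical sketches of the two naiveBayes helpers this variant expects,
# inferred purely from the call sites above; the real trainData and
# bagOfWordsToVector may be implemented differently.
import numpy as np


def bagOfWordsToVector(vocabList, words):
    # Set-of-words vector: 1 if the vocabulary word appears in the document
    vec = [0] * len(vocabList)
    for word in words:
        if word in vocabList:
            vec[vocabList.index(word)] = 1
    return vec


def trainData(vocabList, trainingData, trainingClassVec):
    # Returns ((P(C=0), P(w|C=0)), (P(C=1), P(w|C=1))) as unpacked above
    matrix = np.array([bagOfWordsToVector(vocabList, doc)
                       for doc in trainingData])
    labels = np.array(trainingClassVec)
    pC1 = labels.mean()
    # Per-word relative frequencies within each class; no smoothing here,
    # since the caller's "+ 1" before taking logs plays that role
    pWGivenC0 = matrix[labels == 0].sum(axis=0) / max(matrix[labels == 0].sum(), 1)
    pWGivenC1 = matrix[labels == 1].sum(axis=0) / max(matrix[labels == 1].sum(), 1)
    return (1.0 - pC1, pWGivenC0), (pC1, pWGivenC1)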
import random

import numpy as np
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# Assumption: BNBModel/MModel are the scikit-learn classifiers fit below
BNBModel = BernoulliNB()
MModel = MultinomialNB()


def test(bag=False, prt=True):
    docList = []
    classList = []
    fullText = []
    # Load the text files under the spam and ham folders and parse them
    # into word lists
    for i in range(1, 26):
        wordList = textParse(open('./data/email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./data/email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    # Build the vocabulary
    vocabList = createVocabList(docList)
    # Randomly pick 10 samples as the test set; the remaining 40 form
    # the training set
    trainingSet = list(range(50))   # list() so elements can be deleted below
    testSet = []
    for i in range(10):
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    if not bag:
        # Set-of-words vectors with a Bernoulli naive Bayes classifier
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:
            trainMat.append(setOfwords2Vec(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        BNBModel.fit(trainMat, trainClasses)
        errorCount = 0
        errorClassList = []
        for docIndex in testSet:
            wordVector = np.array(
                setOfwords2Vec(vocabList, docList[docIndex])).reshape(1, -1)
            predict = BNBModel.predict(wordVector)
            if predict != classList[docIndex]:
                errorClassList.append(classList[docIndex])
                errorCount += 1
        errorRate = float(errorCount) / len(testSet)
        if prt:
            print('the error rate is {}.'.format(errorRate))
        return errorRate, errorClassList
    else:
        # Bag-of-words vectors with a multinomial naive Bayes classifier
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:
            trainMat.append(bagOfwords2VecMN(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        MModel.fit(trainMat, trainClasses)
        errorCount = 0
        errorClassList = []
        for docIndex in testSet:
            wordVector = np.array(
                bagOfwords2VecMN(vocabList, docList[docIndex])).reshape(1, -1)
            predict = MModel.predict(wordVector)
            if predict != classList[docIndex]:
                errorClassList.append(classList[docIndex])
                errorCount += 1
        errorRate = float(errorCount) / len(testSet)
        if prt:
            print('the error rate is {}.'.format(errorRate))
        return errorRate, errorClassList
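# A single call to test() uses one random 10-sample split, so its error rate
# is noisy. A hypothetical driver (compareModels is not part of the original
# module) that averages both models over repeated runs:
def compareModels(numRuns=10):
    setErr = sum(test(bag=False, prt=False)[0] for _ in range(numRuns)) / numRuns
    bagErr = sum(test(bag=True, prt=False)[0] for _ in range(numRuns)) / numRuns
    print('set-of-words (Bernoulli NB) mean error: {:.3f}'.format(setErr))
    print('bag-of-words (multinomial NB) mean error: {:.3f}'.format(bagErr))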
# Assumes ny and sf are parsed RSS feeds for the two cities,
# e.g. ny = feedparser.parse(<NY feed URL>), sf = feedparser.parse(<SF feed URL>)
minlen = min(len(ny['entries']), len(sf['entries']))
fullData = []
fullClassVec = []
allWords = []
for index in range(minlen):
    words = parse(ny['entries'][index]['summary'])
    fullData.append(words)
    allWords.extend(words)
    fullClassVec.append(1)   # 1 is ny
    words = parse(sf['entries'][index]['summary'])
    fullData.append(words)
    allWords.extend(words)
    fullClassVec.append(0)   # 0 is sf

# Remove the most frequent words (counted across both cities combined).
trainingVocabList = naiveBayes.createVocabList(fullData)
trainingVocabList = removeNMostFrequentWords(trainingVocabList, allWords, 30)

NUMRUNS = 2
topPC0 = []
topPC1 = []
for index in range(NUMRUNS):
    tPC0, tPC1 = runClassification(trainingVocabList, list(fullData),
                                   list(fullClassVec))
    topPC0 += tPC0
    topPC1 += tPC1
topPC0 = getTopNFromList(topPC0, 30)
topPC1 = getTopNFromList(topPC1, 30)

print("Most common words for New York:")
print('\n'.join([x for (x, y) in topPC0]))
print("\nMost common words for SF:")
print('\n'.join([x for (x, y) in topPC1]))
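# Hypothetical helper implementations, inferred from the call sites above;
# the project's own removeNMostFrequentWords/getTopNFromList may differ.
from collections import Counter


def removeNMostFrequentWords(vocabList, allWords, n):
    # Drop the n most frequent words (stop-word style) from the vocabulary
    mostCommon = {word for word, count in Counter(allWords).most_common(n)}
    return [word for word in vocabList if word not in mostCommon]


def getTopNFromList(wordProbPairs, n):
    # Keep the n (word, probability) pairs with the highest probability
    return sorted(wordProbPairs, key=lambda pair: pair[1], reverse=True)[:n]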
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 22 20:27:04 2018

@author: zhe
E-mail: [email protected]
"""
import naiveBayes

listOPosts, listClasses = naiveBayes.loadDataSet()
myVocabList = naiveBayes.createVocabList(listOPosts)
print("Vocabulary:", myVocabList)

wordVec = naiveBayes.setOfWords2Vec(myVocabList, listOPosts[0])
print("Test word vector:", wordVec)

trainMat = []
for postinDoc in listOPosts:
    trainMat.append(naiveBayes.setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, PAb = naiveBayes.trainNB0(trainMat, listClasses)
print("Probability vector for class 0:", p0V)
print("Probability vector for class 1:", p1V)
print("Probability of class 0:", 1 - PAb)
print("Probability of class 1:", PAb)
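# For context, a sketch of trainNB0 in the style of "Machine Learning in
# Action" (an assumption; the actual naiveBayes.trainNB0 may differ). It
# shows why p0V/p1V printed above are log probabilities.
import numpy as np


def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)   # P(C=1)
    # Laplace smoothing: start counts at 1 and denominators at 2
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Log probabilities avoid underflow when many small factors multiply
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive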