def spamDict():
    """Train and evaluate a naive Bayes spam filter on one random hold-out split.

    Reads ``ham/1.txt`` .. ``ham/25.txt`` and ``spam/1.txt`` ..
    ``spam/25.txt`` (paths are relative, so they resolve against the current
    working directory), tokenizes each message with ``textParse`` and builds
    a vocabulary from all of them. Ten randomly chosen messages are held out
    as the test set, the model is trained on the remaining ones, and the
    fraction of held-out messages that were misclassified is printed.

    Labels: spam is 1, ham (normal mail) is 0.

    Returns:
        None. The error rate is written to standard output.
    """
    docList = []    # tokenized messages, e.g. [['his', 'xx'], ['xx', 'xx'], ...]
    classList = []  # label of each message: 1 = spam, 0 = ham (normal mail)
    for i in range(1, 26):
        with open('ham/%d.txt' % i) as f:
            docList.append(textParse(f.read()))
        classList.append(0)
        with open('spam/%d.txt' % i) as f:
            docList.append(textParse(f.read()))
        classList.append(1)

    vocabList = Bayes.createVocabList(docList)  # vocabulary built from every message

    # Randomly move 10 message indices into the test set; whatever is left
    # is the training set. A real list is required because entries are
    # deleted from it in place.
    trainingSet = list(range(len(docList)))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMatrix = []    # one word vector per training message
    trainingClass = []  # labels matching the rows of trainMatrix
    for docIndex in trainingSet:
        trainMatrix.append(Bayes.setOfWord2Vector(vocabList, docList[docIndex]))
        trainingClass.append(classList[docIndex])

    # NOTE(review): the unpacking order below and the argument order passed to
    # classifyNB are taken as-is from the original code -- confirm both against
    # the definitions of Bayes.TrainingNB1 and classifyNB.
    pAb, p1v, p0v = Bayes.TrainingNB1(array(trainMatrix), array(trainingClass))

    errorCount = 0
    for docIndex in testSet:
        thisDoc = array(Bayes.setOfWord2Vector(vocabList, docList[docIndex]))
        if classifyNB(thisDoc, p0v, p1v, pAb) != classList[docIndex]:
            errorCount += 1

    # Single-argument print(...) produces the same text on Python 2 and 3.
    print('the error rate is : %s' % (float(errorCount) / len(testSet)))