import random

import numpy as np

# sd, bf, of, and tt are aliases for helper modules from the accompanying
# project (text parsing, bag-of-words utilities, and the naive Bayes
# trainer/classifier).


def localWords(feed1, feed0):
    # train and test a naive Bayes classifier on the entries of two RSS feeds
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # print(minLen)
    for i in range(minLen):
        wordList = sd.textParse(feed1['entries'][i]['summary'])
        # print(wordList)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = sd.textParse(feed0['entries'][i]['summary'])
        # print(wordList)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bf.createVocabList(docList)  # build the vocabulary list
    # print(vocabList)
    top30Words = calcMostFreq(vocabList, fullText)  # find the 30 most frequent words (removed from the vocabulary below)
    # print(top30Words)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    # print(trainingSet)
    testSet = []  # build the test set
    for i in range(20):  # hold out 20 documents at random as the test set
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier with trainNB0 (get probabilities)
        trainMat.append(bf.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = of.trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out documents
        wordVector = bf.bagOfWords2VecMN(vocabList, docList[docIndex])
        if tt.classifyNB(np.array(wordVector), p0V, p1V,
                         pSpam) != classList[docIndex]:
            errorCount += 1
            print("misclassified test document:", docList[docIndex])
    print('error rate: %.2f%%' % (float(errorCount) / len(testSet) * 100))
    return vocabList, p0V, p1V
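

# calcMostFreq is called above but not defined in this file. A minimal sketch,
# assuming the usual helper from the accompanying text: count how often each
# vocabulary word appears in fullText and return the 30 most frequent as
# (word, count) pairs.
def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)  # occurrences of token in the corpus
    sortedFreq = sorted(freqDict.items(), key=lambda pair: pair[1], reverse=True)
    return sortedFreq[:30]  # the 30 most frequent (word, count) pairs
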
def spamTest():
    # train and test a naive Bayes spam classifier on the email/ data set
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # iterate over the 25 text files in each folder
        wordList = textParse(open('email/spam/%d.txt' % i,
                                  'r').read())  # read each spam email and split the string into a token list
        # print(wordList)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # label spam; 1 denotes spam
        wordList = textParse(open('email/ham/%d.txt' % i,
                                  'r').read())  # read each ham email and split the string into a token list
        # print(wordList)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # label ham; 0 denotes ham
    vocabList = bf.createVocabList(docList)  # build the vocabulary list (no duplicates)
    trainingSet = list(range(50))
    # print(vocabList)
    testSet = []  # index lists for the training set and the test set
    for i in range(10):  # from the 50 emails, randomly pick 40 for training and 10 for testing
        randIndex = int(random.uniform(0, len(trainingSet)))  # pick a random index
        testSet.append(trainingSet[randIndex])  # add it to the test-set indices
        del trainingSet[randIndex]  # and remove it from the training-set indices
    trainMat = []
    trainClasses = []  # training matrix and training class-label vector
    for docIndex in trainingSet:  # iterate over the training set
        trainMat.append(bf.setOfWords2Vec(
            vocabList, docList[docIndex]))  # add the set-of-words vector to the training matrix
        trainClasses.append(classList[docIndex])  # add the class label to the label vector
    p0V, p1V, pSpam = of.trainNB0(np.array(trainMat),
                                  np.array(trainClasses))  # train the naive Bayes model
    errorCount = 0  # misclassification counter
    for docIndex in testSet:  # iterate over the test set
        wordVector = bf.setOfWords2Vec(vocabList,
                                       docList[docIndex])  # set-of-words vector for the test document
        if tt.classifyNB(np.array(wordVector), p0V, p1V,
                         pSpam) != classList[docIndex]:  # if the prediction is wrong
            errorCount += 1  # increment the error count
            print("misclassified test document:", docList[docIndex])
    print('error rate: %.2f%%' % (float(errorCount) / len(testSet) * 100))
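

# textParse is called above but not defined in this file (localWords uses the
# sd.textParse variant). A minimal sketch, assuming the usual tokenizer from
# the accompanying text: split on runs of non-word characters, drop tokens of
# two characters or fewer, and lowercase the rest.
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on any non-alphanumeric run
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
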
def testingNB():
    # train on the toy posts from loadDataSet and classify two test entries
    listOPosts, listClasses = bf.loadDataSet()  # build the sample documents
    myVocabList = bf.createVocabList(listOPosts)  # build the vocabulary list
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bf.setOfWords2Vec(myVocabList, postinDoc))  # vectorize each sample document
    p0V, p1V, pAb = tt.trainNB0(np.array(trainMat),
                                np.array(listClasses))  # train the naive Bayes classifier
    testEntry = ['love', 'my', 'dalmation']  # test sample 1
    thisDoc = np.array(bf.setOfWords2Vec(myVocabList, testEntry))  # vectorize the test sample
    if classifyNB(thisDoc, p0V, p1V, pAb):  # classify and print the result
        print(testEntry, 'is classified as abusive')
    else:
        print(testEntry, 'is classified as not abusive')
    testEntry = ['stupid', 'garbage']  # test sample 2
    thisDoc = np.array(bf.setOfWords2Vec(myVocabList, testEntry))  # vectorize the test sample
    if classifyNB(thisDoc, p0V, p1V, pAb):  # classify and print the result
        print(testEntry, 'is classified as abusive')
    else:
        print(testEntry, 'is classified as not abusive')


def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)  # number of training documents
    numWords = len(trainMatrix[0])  # number of tokens in each document vector
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior probability that a document is abusive
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)  # initialize word counts to 1 (Laplace smoothing)
    p0Denom = 2.0
    p1Denom = 2.0  # initialize denominators to 2 (Laplace smoothing)
    for i in range(numTrainDocs):
        # accumulate the counts for the abusive-class conditional
        # probabilities, i.e. P(w0|1), P(w1|1), P(w2|1), ...
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:  # and for the non-abusive class, i.e. P(w0|0), P(w1|0), P(w2|0), ...
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num / p1Denom)  # take logs to guard against underflow
    p0Vect = np.log(p0Num / p0Denom)
    # return the conditional probability vector for each class and the prior P(abusive)
    return p0Vect, p1Vect, pAbusive
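

# classifyNB is called above (directly in testingNB and via the tt alias) but
# is not shown here. A minimal sketch of the usual log-space decision rule,
# assuming the vectors returned by trainNB0: compare log P(w|1) + log P(1)
# against log P(w|0) + log P(0) and return the winning class.
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)  # log-score for the abusive class
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)  # log-score for the non-abusive class
    return 1 if p1 > p0 else 0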


if __name__ == '__main__':
    postingList, classVec = bf.loadDataSet()
    myVocabList = bf.createVocabList(postingList)
    trainMat = []
    for postinDoc in postingList:
        trainMat.append(bf.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(trainMat, classVec)
    print('p0V:\n', p0V)
    print('p1V:\n', p1V)
    print('classVec:\n', classVec)
    print('pAb:\n', pAb)
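
    # the other entry points can be exercised the same way; a sketch, assuming
    # the email/spam and email/ham folders from the accompanying data set are
    # available next to this script:
    # spamTest()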