Example #1
def localWords(feed1, feed0):
    import feedparser
    import random
    import numpy
    import bayes
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # print('minlen=%d'%minLen)
    for i in range(minLen):
        # visit one rss source every time
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        # visit rss 0
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    # after all entries are collected, build the vocabulary
    # and remove the most frequent words
    vocabList = bayes.createVocabList(docList)
    top30words = calcMostFreq(vocabList, fullText)
    for pairW in top30words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # hold out 20 random entries as the test set
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIdx = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIdx])
        del trainingSet[randIdx]

    trainMat = []
    trainClasses = []
    for docIdx in trainingSet:
        # print("doc idx:%d, len=%d" %( docIdx, len(docList)))
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIdx]))
        trainClasses.append(classList[docIdx])

    p0V, p1V, pSpam = bayes.trainNB0(numpy.array(trainMat),
                                     numpy.array(trainClasses))

    errorCount = 0
    for docIdx in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIdx])
        if bayes.classifyNB(numpy.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIdx]:
            errorCount += 1
    print('the error rate is: %.2f' % (float(errorCount) / len(testSet)))

    return vocabList, p0V, p1V
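Every `localWords` variant on this page calls a `calcMostFreq` helper that is never shown. A minimal sketch of what it presumably does, following the version in *Machine Learning in Action*: count each vocabulary word's occurrences in `fullText` and return the 30 most frequent (word, count) pairs.

import operator

def calcMostFreq(vocabList, fullText):
    # count how many times each vocabulary word occurs in the full text
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    # sort by count, descending, and keep the top 30 (word, count) pairs
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]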
Example #2
import random
from numpy import array
import bayes
import st  # assumed: the snippet's helper module that provides textParse


def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = st.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        wordList = st.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = bayes.createVocabList(docList)

    #remove top frequent words
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])

    #build training and testing set
    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    #training
    p0V, p1V, pLocal = bayes.trainNB0(array(trainMat), array(trainClasses))

    #testing
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pLocal) != classList[docIndex]:
            errorCount += 1

    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
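For context, `feed1` and `feed0` are the dictionaries returned by `feedparser.parse`. A hedged usage sketch; the two Craigslist RSS URLs below are the ones the book used and are only placeholders, since they may no longer resolve:

import feedparser

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, p0V, p1V = localWords(ny, sf)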
Example #3
def localWords(feed1, feed0):
    import feedparser
    import random
    from numpy import array
    import bayes
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # visit one entry from each RSS source per iteration
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)

    # remove the most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    trainingSet = list(range(2 * minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1

    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
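All the heavy lifting is in `bayes.trainNB0`, which these snippets only call. A minimal sketch of the book's version, assuming Laplace smoothing (counts start at 1, denominators at 2) and log probabilities to avoid floating-point underflow:

import numpy as np

def trainNB0(trainMatrix, trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior P(class = 1)
    p0Num = np.ones(numWords)   # Laplace smoothing: start counts at 1 ...
    p1Num = np.ones(numWords)
    p0Denom = 2.0               # ... and denominators at 2
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # log space, so classifyNB can add instead of multiplying tiny numbers
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive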
Example #4
def test_bagOfWords2VecMN(self):  # method of a unittest.TestCase subclass
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    features = bayes.bagOfWords2VecMN(myVocabList, listOPosts[0])
    expected = [
        0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1
    ]
    self.assertEqual(features, expected)
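The method above tests `bagOfWords2VecMN` against a fixed 32-slot vector. A minimal sketch of what that function presumably looks like, following the version in *Machine Learning in Action*:

def bagOfWords2VecMN(vocabList, inputSet):
    # one counter per vocabulary word; bag-of-words counts repeats,
    # unlike the set-of-words variant, which only records presence
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec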
Example #5
import random
import numpy as np
import bayes


def localWords(feed1, feed0):  # feed1/feed0: two RSS feeds parsed by feedparser into dicts
    docList = []  # list of posts, each post split into its words
    classList = []  # class labels
    fullText = []  # every word of every post in one flat list
    # 'entries' holds the posts; minLen is the smaller post count, so we never index past the shorter feed
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])  # pull out the post text and split it into words
        docList.append(wordList)  # ['12','34'].append(['56','78']) ==> [ ['12','34'], ['56','78'] ]
        fullText.extend(wordList)  # ['12','34'].extend(['56','78']) ==> ['12','34','56','78']
        classList.append(1)  # New York posts are labeled 1
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # San Francisco posts are labeled 0

    vocabList = bayes.createVocabList(docList)  # build the vocabulary
    # find the 30 most frequent words in fullText and remove them from vocabList
    top30Words = calcMostFreq(vocabList, fullText)
    for (word, count) in top30Words:
        if word in vocabList:
            vocabList.remove(word)

    trainingSet = list(range(2 * minLen))
    testSet = []  # build the training and test sets
    for i in range(minLen // 10):  # randomly hold out 10% of the data as the test set
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))  # convert each training document into a bag-of-words vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))  # train the classifier

    # measure the classifier's accuracy on the held-out test data
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
Example #6
import random
from numpy import array
import bayes


def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        emailText = open('email/spam/%d.txt' % i).read()  # renamed from 'str' to avoid shadowing the builtin
        print("emailText")
        print(emailText)
        wordList = bayes.textParse(emailText)
        print("wordList")
        print(wordList)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    print "doclist"
    print len(docList)
    print docList
    print "fulllist"
    print len(fullText)
    print fullText
    print classList

    vocabList = bayes.createVocabList(docList)#create vocabulary
    trainingSet = list(range(50))
    testSet = []  # create the test set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
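`bayes.textParse` is the tokenizer every example depends on. A minimal sketch following the book's version, assuming the usual behavior: split on runs of non-alphanumeric characters, keep tokens longer than two characters, lowercase everything.

import re

def textParse(bigString):
    # split on runs of non-alphanumeric characters,
    # drop short tokens such as 'a' or 'of', and normalize case
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]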
Example #7
import random
from numpy import array
import bayes


def spamTest():
    """
    Parse the 25 emails in each of the 'spam' and 'ham' folders into word
    lists, then split the 50 documents: 10 picked at random form the test
    set and the remaining 40 form the training set (hold-out cross-validation).
    :return: vocabList, p0V, p1V
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = bayes.textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly pick 10 documents for the test set
        # random.uniform(x, y) returns a random float in the range [x, y]
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # walk every document in the training set
        trainMat.append(bayes.bagOfWords2VecMN(vocabList,
                                               docList[docIndex]))  # build the word vector
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat),
                                     array(trainClasses))  # estimate the probabilities needed for classification
    errorCount = 0
    for docIndex in testSet:  # walk the test set
        # featurize with the same bag-of-words encoding used for training
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
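The decision itself happens in `bayes.classifyNB`, which compares the two log-posteriors. A minimal sketch of the book's version:

import numpy as np

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # sum of log-likelihoods over the observed words, plus the log prior
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0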
Example #8
'''
@author: laiwei
'''
import bayes
listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)
print(myVocabList)
print(listOPosts)
print(listClasses)
print(bayes.setOfWords2Vec(myVocabList, listOPosts[0]))
print(bayes.bagOfWords2VecMN(myVocabList, listOPosts[0]))
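Example #8 prints both vectorizations side by side. For reference, `createVocabList` and `setOfWords2Vec` presumably look like the book's versions sketched below; the set-of-words variant records only presence, which is why its output can differ from the bag-of-words counts when a post repeats a word.

def createVocabList(dataSet):
    # union of the words from every document
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    # 1 if the word appears at all, no matter how many times
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec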