Пример #1
0
    def scikitNBClassfier(self):
        """Classify a test document with scikit-learn's Gaussian Naive Bayes.

        Loads the preprocessed corpus, builds a set-of-words feature matrix
        over the vocabulary, fits ``GaussianNB`` once, prints the predicted
        class of a sample text and the number of mislabeled training points.
        """
        dataMat, labels = self.loadProcessedData()
        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(dataMat)
        ## Build the set-of-words (bag-of-words) matrix.
        trainMat = []
        for postinDoc in dataMat:
            trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))

        from sklearn.naive_bayes import GaussianNB

        gnb = GaussianNB()
        X = array(trainMat)
        y = labels

        testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
        testEntry = self.testEntryProcess(testText)

        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        ## Fit once and reuse the model (the original fitted twice).
        gnb.fit(X, y)
        clabels = ['军事', '体育']
        ## sklearn's predict expects a 2-D array of samples; reshape the
        ## single document to (1, n_features) to avoid a ValueError.
        test_pred = gnb.predict(thisDoc.reshape(1, -1))
        print("classified as: %s" % clabels[int(test_pred[0])])
        y_pred = gnb.predict(X)
        print("Number of mislabeled points : %d" % (labels != y_pred).sum())
Пример #2
0
    def crossValidClassifier(self):
        """Hold-out validation of the hand-rolled naive Bayes classifier.

        Randomly holds out 10 of the 51 documents as a test set, trains on
        the remaining ones, then prints every misclassified document and the
        overall error rate.
        """
        dataMat, labels = self.loadProcessedData()
        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(dataMat)
        # range() is an immutable sequence on Python 3; materialize a list so
        # held-out indices can be deleted from it below.
        trainingSet = list(range(51))
        testSet = []  # create test set
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del trainingSet[randIndex]
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
            trainMat.append(
                bayesian.setOfWords2Vec(myVocabList, dataMat[docIndex]))
            trainClasses.append(labels[docIndex])
        p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))

        clabels = ['军事', '体育']
        data = self.getData('data/bayesian/rss/rss_junshi.txt') + self.getData(
            'data/bayesian/rss/rss_tiyu.txt')
        errorCount = 0
        for docIndex in testSet:  # classify the remaining items
            wordVector = bayesian.setOfWords2Vec(myVocabList,
                                                 dataMat[docIndex])
            # 'predicted' rather than 'type': do not shadow the builtin.
            predicted = bayesian.predict(array(wordVector), p0V, p1V, pSpam)
            if predicted != labels[docIndex]:
                errorCount += 1
                print("判断类型:", clabels[predicted])
                print("classification error", data[docIndex])
                print("---------------------------------------")
        print('the error rate is: ', float(errorCount) / len(testSet))
Пример #3
0
    def SingleClassifier(self):
        """Train naive Bayes on the full corpus and classify one test string.

        Builds the vocabulary and set-of-words matrix from the preprocessed
        data, fits the hand-rolled classifier, then prints the predicted
        class label for a sample text.
        """
        ## Load the RSS feeds and save them as text files.
        ## Only run this when regenerating the data.
        #juns_count = rss.loadRSS('http://mil.sohu.com/rss/junshi.xml','data/bayesian/rss/rss_junshi.txt')
        #tiyu_count = rss.loadRSS('http://rss.news.sohu.com/rss/sports.xml','data/bayesian/rss/rss_tiyu.txt' )
        #print juns_count
        #print tiyu_count

        dataMat, labels = self.loadProcessedData()

        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(dataMat)
        ## Build the set-of-words (bag-of-words) matrix.
        trainMat = []
        for postinDoc in dataMat:
            trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
        ## Estimate the class priors / word probabilities from the corpus.
        p0V, p1V, pAb = bayesian.fit(array(trainMat), array(labels))

        ## Evaluate the posterior for a test string.
        testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
        testEntry = self.testEntryProcess(testText)
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        clabels = ['军事', '体育']
        # Python-3 print() call (the original used a Python-2 print statement).
        print(testText, 'classified as: ', clabels[bayesian.predict(
            thisDoc, p0V, p1V, pAb)])
Пример #4
0
    def testingNB(self):
        """Smoke-test the naive Bayes classifier on the toy posting dataset.

        Trains on the built-in posting list, then prints the predicted class
        for two hand-picked word lists.
        """
        ## Load the built-in dataset.
        listOPosts, listClasses = self.loadDataSet()
        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(listOPosts)
        trainMat = []
        for postinDoc in listOPosts:
            trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
        ## Estimate the class priors / word probabilities from the corpus.
        p0V, p1V, pAb = bayesian.fit(array(trainMat), array(listClasses))

        ## Evaluate the posterior for two test word lists.
        # Python-3 print() calls (the originals were Python-2 print statements).
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, '被分类为: ', bayesian.predict(thisDoc, p0V, p1V, pAb))
        testEntry = ['stupid', 'garbage']
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        print(testEntry, '被分类为: ', bayesian.predict(thisDoc, p0V, p1V, pAb))
Пример #5
0
    def spamTest(self, bayesian):
        """Hold-out spam/ham test for the naive Bayes classifier.

        Reads 25 spam and 25 ham emails, randomly holds out 10 documents,
        trains on the remaining 40 and prints the misclassified documents
        and the test error rate.

        NOTE(review): the incoming ``bayesian`` argument is immediately
        replaced by a fresh ``Bayesian()`` below, matching the original
        behavior — confirm whether the parameter was meant to be used.
        """
        docList = []
        classList = []
        fullText = []
        for i in range(1, 26):
            # Context managers close the 50 files deterministically instead
            # of leaking handles as the bare open(...).read() calls did.
            with open('data/bayesian/email/spam/%d.txt' % i) as fh:
                wordList = self.textParse(fh.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            with open('data/bayesian/email/ham/%d.txt' % i) as fh:
                wordList = self.textParse(fh.read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)

        bayesian = Bayesian()
        vocabList = bayesian.createVocabList(docList)  # create vocabulary
        # range() is immutable on Python 3; materialize a list so held-out
        # indices can be deleted from it below.
        trainingSet = list(range(50))
        testSet = []  # create test set
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del trainingSet[randIndex]
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
            trainMat.append(
                bayesian.setOfWords2Vec(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))
        errorCount = 0
        for docIndex in testSet:  # classify the remaining items
            wordVector = bayesian.setOfWords2Vec(vocabList, docList[docIndex])
            if bayesian.predict(array(wordVector), p0V, p1V,
                                pSpam) != classList[docIndex]:
                errorCount += 1
                print("分类错误", docList[docIndex])
        print('错误率是: ', float(errorCount) / len(testSet))
        #return vocabList,fullText