Пример #1
0
	def scikitNBClassfier(self):
		"""Classify a test document with scikit-learn's GaussianNB.

		Loads the preprocessed corpus, builds a set-of-words matrix over the
		full vocabulary, fits a Gaussian naive Bayes model, classifies one
		hand-written test sentence, and reports the training-set error.
		"""
		dataMat, labels = self.loadProcessedData()
		bayesian = Bayesian()
		myVocabList = bayesian.createVocabList(dataMat)
		## Build the set-of-words (bag-of-words) matrix: one row per document.
		trainMat = [bayesian.setOfWords2Vec(myVocabList, postinDoc)
					for postinDoc in dataMat]

		from sklearn.naive_bayes import GaussianNB

		gnb = GaussianNB()
		X = array(trainMat)
		y = labels

		testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
		testEntry = self.testEntryProcess(testText)
		# Reuse the existing Bayesian instance (the original built a second
		# identical one here for no reason).
		thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))

		## Fit once and predict.  GaussianNB.predict expects a 2-D array of
		## shape (n_samples, n_features); the original passed the 1-D vector
		## directly, which modern scikit-learn rejects — and then discarded
		## the result, leaving `clabels` dead.
		gnb.fit(X, y)
		clabels = ['军事', '体育']
		test_pred = gnb.predict(thisDoc.reshape(1, -1))
		print("classified as: %s" % clabels[test_pred[0]])
		## Training-set error rate.
		y_pred = gnb.predict(X)
		print("Number of mislabeled points : %d" % (labels != y_pred).sum())
Пример #2
0
    def scikitNBClassfier(self):
        """Classify a test document with scikit-learn's GaussianNB.

        Loads the preprocessed corpus, builds a set-of-words matrix over the
        full vocabulary, fits a Gaussian naive Bayes model, classifies one
        hand-written test sentence, and reports the training-set error.
        """
        dataMat, labels = self.loadProcessedData()
        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(dataMat)
        ## Build the set-of-words (bag-of-words) matrix: one row per document.
        trainMat = [bayesian.setOfWords2Vec(myVocabList, postinDoc)
                    for postinDoc in dataMat]

        from sklearn.naive_bayes import GaussianNB

        gnb = GaussianNB()
        X = array(trainMat)
        y = labels

        testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
        testEntry = self.testEntryProcess(testText)
        # Reuse the existing Bayesian instance (the original built a second
        # identical one here for no reason).
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))

        ## Fit once and predict.  GaussianNB.predict expects a 2-D array of
        ## shape (n_samples, n_features); the original passed the 1-D vector
        ## directly, which modern scikit-learn rejects — and then discarded
        ## the result, leaving `clabels` dead.
        gnb.fit(X, y)
        clabels = ['军事', '体育']
        test_pred = gnb.predict(thisDoc.reshape(1, -1))
        print("classified as: %s" % clabels[test_pred[0]])
        ## Training-set error rate.
        y_pred = gnb.predict(X)
        print("Number of mislabeled points : %d" % (labels != y_pred).sum())
Пример #3
0
    def crossValidClassifier(self):
        dataMat, labels = self.loadProcessedData()
        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(dataMat)
        trainingSet = range(51)
        testSet = []  #create test set
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del (trainingSet[randIndex])
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:  #train the classifier (get probs) trainNB0
            trainMat.append(
                bayesian.setOfWords2Vec(myVocabList, dataMat[docIndex]))
            trainClasses.append(labels[docIndex])
        p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))

        clabels = ['军事', '体育']
        data = self.getData('data/bayesian/rss/rss_junshi.txt') + self.getData(
            'data/bayesian/rss/rss_tiyu.txt')
        errorCount = 0
        for docIndex in testSet:  #classify the remaining items
            wordVector = bayesian.setOfWords2Vec(myVocabList,
                                                 dataMat[docIndex])
            type = bayesian.predict(array(wordVector), p0V, p1V, pSpam)
            if type != labels[docIndex]:
                errorCount += 1
                print "判断类型:", clabels[type]
                print "classification error", data[docIndex]
                print "---------------------------------------"
        print 'the error rate is: ', float(errorCount) / len(testSet)
Пример #4
0
    def SingleClassifier(self):
        ## 加载RSS源并将其保存为文本文件
        ## 除非是生成新数据,否则不执行这段代码
        #juns_count = rss.loadRSS('http://mil.sohu.com/rss/junshi.xml','data/bayesian/rss/rss_junshi.txt')
        #tiyu_count = rss.loadRSS('http://rss.news.sohu.com/rss/sports.xml','data/bayesian/rss/rss_tiyu.txt' )
        #print juns_count
        #print tiyu_count

        dataMat, labels = self.loadProcessedData()

        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(dataMat)
        ## 建立bag of words 矩阵
        trainMat = []
        for postinDoc in dataMat:
            trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
        ## 计算已有数据集中的先验概率
        p0V, p1V, pAb = bayesian.fit(array(trainMat), array(labels))

        ## 测试不同字符串的后验概率
        testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
        testEntry = self.testEntryProcess(testText)
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        clabels = ['军事', '体育']
        print testText, 'classified as: ', clabels[bayesian.predict(
            thisDoc, p0V, p1V, pAb)]
Пример #5
0
	def crossValidClassifier(self):
		dataMat, labels = self.loadProcessedData()
		bayesian = Bayesian()
		myVocabList = bayesian.createVocabList(dataMat)
		trainingSet = range(51);
		testSet = []           #create test set
		for i in range(10):
			randIndex = int(random.uniform(0, len(trainingSet)))
			testSet.append(trainingSet[randIndex])
			del (trainingSet[randIndex])
		trainMat = [];
		trainClasses = []
		for docIndex in trainingSet:#train the classifier (get probs) trainNB0
			trainMat.append(bayesian.setOfWords2Vec(myVocabList, dataMat[docIndex]))
			trainClasses.append(labels[docIndex])
		p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))

		clabels = ['军事', '体育']
		data = self.getData('data/bayesian/rss/rss_junshi.txt') + self.getData('data/bayesian/rss/rss_tiyu.txt')
		errorCount = 0
		for docIndex in testSet:        #classify the remaining items
			wordVector = bayesian.setOfWords2Vec(myVocabList, dataMat[docIndex])
			type = bayesian.predict(array(wordVector), p0V, p1V, pSpam)
			if type != labels[docIndex]:
				errorCount += 1
				print "判断类型:", clabels[type]
				print "classification error", data[docIndex]
				print "---------------------------------------"
		print 'the error rate is: ', float(errorCount) / len(testSet)
Пример #6
0
	def SingleClassifier(self):
		## 加载RSS源并将其保存为文本文件
		## 除非是生成新数据,否则不执行这段代码
		#juns_count = rss.loadRSS('http://mil.sohu.com/rss/junshi.xml','data/bayesian/rss/rss_junshi.txt')
		#tiyu_count = rss.loadRSS('http://rss.news.sohu.com/rss/sports.xml','data/bayesian/rss/rss_tiyu.txt' )
		#print juns_count
		#print tiyu_count

		dataMat, labels = self.loadProcessedData()

		bayesian = Bayesian()
		myVocabList = bayesian.createVocabList(dataMat)
		## 建立bag of words 矩阵
		trainMat = []
		for postinDoc in dataMat:
			trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
		## 计算已有数据集中的先验概率
		p0V, p1V, pAb = bayesian.fit(array(trainMat), array(labels))

		## 测试不同字符串的后验概率
		testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
		testEntry = self.testEntryProcess(testText)
		thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
		clabels = ['军事', '体育']
		print testText, 'classified as: ', clabels[bayesian.predict(thisDoc, p0V, p1V, pAb)]
Пример #7
0
    def testingNB(self):
        ## 加载已有数据集
        listOPosts, listClasses = self.loadDataSet()
        bayesian = Bayesian()
        myVocabList = bayesian.createVocabList(listOPosts)
        trainMat = []
        for postinDoc in listOPosts:
            trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
        ## 计算已有数据集中的先验概率
        p0V, p1V, pAb = bayesian.fit(array(trainMat), array(listClasses))

        ## 测试不同字符串的后验概率
        testEntry = ['love', 'my', 'dalmation']
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        print testEntry, '被分类为: ', bayesian.predict(thisDoc, p0V, p1V, pAb)
        testEntry = ['stupid', 'garbage']
        thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
        print testEntry, '被分类为: ', bayesian.predict(thisDoc, p0V, p1V, pAb)
Пример #8
0
	def testingNB(self):
	## 加载已有数据集
		listOPosts, listClasses = self.loadDataSet()
		bayesian = Bayesian()
		myVocabList = bayesian.createVocabList(listOPosts)
		trainMat = []
		for postinDoc in listOPosts:
			trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
		## 计算已有数据集中的先验概率
		p0V, p1V, pAb = bayesian.fit(array(trainMat), array(listClasses))

		## 测试不同字符串的后验概率
		testEntry = ['love', 'my', 'dalmation']
		thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
		print testEntry, '被分类为: ', bayesian.predict(thisDoc, p0V, p1V, pAb)
		testEntry = ['stupid', 'garbage']
		thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
		print testEntry, '被分类为: ', bayesian.predict(thisDoc, p0V, p1V, pAb)
Пример #9
0
    def spamTest(self, bayesian):
        docList = []
        classList = []
        fullText = []
        for i in range(1, 26):
            wordList = self.textParse(
                open('data/bayesian/email/spam/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(1)
            wordList = self.textParse(
                open('data/bayesian/email/ham/%d.txt' % i).read())
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(0)

        bayesian = Bayesian()
        vocabList = bayesian.createVocabList(docList)  #create vocabulary
        trainingSet = range(50)
        testSet = []  #create test set
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet[randIndex])
            del (trainingSet[randIndex])
        trainMat = []
        trainClasses = []
        for docIndex in trainingSet:  #train the classifier (get probs) trainNB0
            trainMat.append(
                bayesian.setOfWords2Vec(vocabList, docList[docIndex]))
            trainClasses.append(classList[docIndex])
        p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))
        errorCount = 0
        for docIndex in testSet:  #classify the remaining items
            wordVector = bayesian.setOfWords2Vec(vocabList, docList[docIndex])
            if bayesian.predict(array(wordVector), p0V, p1V,
                                pSpam) != classList[docIndex]:
                errorCount += 1
                print "分类错误", docList[docIndex]
        print '错误率是: ', float(errorCount) / len(testSet)
        #return vocabList,fullText
Пример #10
0
	def spamTest(self, bayesian):
		docList = [];
		classList = [];
		fullText = []
		for i in range(1, 26):
			wordList = self.textParse(open('data/bayesian/email/spam/%d.txt' % i).read())
			docList.append(wordList)
			fullText.extend(wordList)
			classList.append(1)
			wordList = self.textParse(open('data/bayesian/email/ham/%d.txt' % i).read())
			docList.append(wordList)
			fullText.extend(wordList)
			classList.append(0)

		bayesian = Bayesian()
		vocabList = bayesian.createVocabList(docList)#create vocabulary
		trainingSet = range(50);
		testSet = []           #create test set
		for i in range(10):
			randIndex = int(random.uniform(0, len(trainingSet)))
			testSet.append(trainingSet[randIndex])
			del (trainingSet[randIndex])
		trainMat = [];
		trainClasses = []
		for docIndex in trainingSet:#train the classifier (get probs) trainNB0
			trainMat.append(bayesian.setOfWords2Vec(vocabList, docList[docIndex]))
			trainClasses.append(classList[docIndex])
		p0V, p1V, pSpam = bayesian.fit(array(trainMat), array(trainClasses))
		errorCount = 0
		for docIndex in testSet:        #classify the remaining items
			wordVector = bayesian.setOfWords2Vec(vocabList, docList[docIndex])
			if bayesian.predict(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
				errorCount += 1
				print "分类错误", docList[docIndex]
		print '错误率是: ', float(errorCount) / len(testSet)
		#return vocabList,fullText