Example #1
	def test_evaluate(self):
		# Data sources: UkrainianConflict posts as positives, AskReddit as
		# negatives, plus an unlabeled pool to annotate.
		ukr = self.source.find_clean({"subreddit": "UkrainianConflict"}, limit=2500, batch_size=1000)
		askr = self.source.find_clean({"subreddit": "AskReddit"}, limit=2500, batch_size=1000)
		alll = self.source.find_clean(limit=10000)
		tagger = ngrams.make_backoff_tagger()

		# Build (featureset, label) pairs and shuffle before splitting.
		featuresets = [(cls.ner_feature(doc, tagger=tagger), "YES") for doc in ukr]
		featuresets.extend([(cls.ner_feature(doc, tagger=tagger), "NO") for doc in askr])
		random.shuffle(featuresets)

		# Hold out the first 1250 shuffled examples for testing.
		trainset, testset = featuresets[1250:], featuresets[:1250]
		classifier = NaiveBayesClassifier.train(trainset)
		with open("./UkrainianConflictNVM", "w") as f:
			for doc in alll:
				del doc["_id"]
				# classify() returns the label string, so compare against
				# "YES" explicitly; any non-empty string (including "NO")
				# would be truthy.
				if classifier.classify(cls.ner_feature(doc, tagger=tagger)) == "YES":
					f.write(json.dumps(doc) + "\n")
		print(nltk.classify.accuracy(classifier, testset))
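
The flow above (build labeled featuresets, shuffle, split, train, keep documents the classifier labels "YES") is standard NLTK usage. A minimal self-contained sketch follows; the word-presence feature extractor and the toy sentences are hypothetical stand-ins for cls.ner_feature() and the Reddit corpus.

import random
from nltk.classify import NaiveBayesClassifier, accuracy

def word_features(text):
	# Simple bag-of-words presence features (hypothetical; the example
	# above uses the project's cls.ner_feature() instead).
	return {word.lower(): True for word in text.split()}

positive = ["ceasefire talks resume", "frontline report today"]
negative = ["what is your favorite movie", "best pizza topping ever"]

featuresets = [(word_features(t), "YES") for t in positive]
featuresets += [(word_features(t), "NO") for t in negative]
random.shuffle(featuresets)

classifier = NaiveBayesClassifier.train(featuresets)
# classify() returns the label string, hence the explicit comparison.
print(classifier.classify(word_features("ceasefire report")) == "YES")
print(accuracy(classifier, featuresets))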
Example #2
	def test_classbased(self):
		tagger = ngrams.make_backoff_tagger()
		params = {
			"corpora": self.source,
			# Callables defer the corpus queries until the annotater runs them.
			"labeled_set": lambda: self.source.find_clean({"subreddit": "fitness"}, batch_size=1000, limit=2000),
			"unlabeled_set": lambda: self.source.find_clean({"subreddit": "AskReddit"}, batch_size=1000, limit=2000),
			"feature": lambda x: cls.ner_feature(x, tagger=tagger),
			"exit": lambda self: self.corpora.exit()
		}

		pnb_a = annotate.PNBAnnotater(**params)
		pnb_a.train()
		pnb_a.describe()

		# Spot-check the first 10 annotations from the full corpus.
		ct = 0
		for doc, annotation in pnb_a.classify_iter(self.source.find_clean()):
			ct += 1
			# print(doc)
			print(annotation)
			if ct == 10:
				break
			print("------------")