Example #1
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
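		# flatten the per-sentence word tokens into a single list so the whole document can be tagged at once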
		chain = lambda x : list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			print(tagger.tag(chain(doc["cleansed_text"])))
			if ind == 10:
				break
Example #2
def ner_feature(document,tagger=None):
	if tagger is None:
		tagger = ngrams.make_backoff_tagger()
	sents = annotate.dirty_dict(document,tagger=tagger)
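	# emit one boolean "contains_noun(...)" feature per noun reported for the document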
	fts = {}
	for noun in sents["nouns"]:
		nn = "contains_noun(" + noun + ")"
		fts[nn] = True
	return fts
Example #3
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			sentences = pos.tokenize_sents(doc["cleansed_text"])
			tags = pos.tokenize_words(sentences)
			for sent in tags:
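				# POS-tag the sentence, chunk named entities, and convert the tree to CoNLL IOB triples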
				tagged_sent = tagger.tag(sent)
				d = ne_chunk(tagged_sent)
				chunks = tree2conlltags(d)
				print(chunks)
			if ind == 10:
				break
Example #4
 def test_interactive(self):
     docs = self.source.find_clean(batch_size=1000)
     tagger = ngrams.make_backoff_tagger()
     print()
     for ind, doc in enumerate(docs):
         sentences = pos.tokenize_sents(doc["cleansed_text"])
         tags = pos.tokenize_words(sentences)
         for sent in tags:
             tagged_sent = tagger.tag(sent)
             d = ne_chunk(tagged_sent)
             chunks = tree2conlltags(d)
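             # print the raw chunks plus the named entities and nouns extracted from them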
             print("CHUNKS" + str(chunks))
             print("NE" + str(cnll.get_ne(chunks)))
             print("NOUNS" + str(cnll.get_nouns(chunks)))
         if ind == 10:
             break
Example #5
	def test_classbased(self):
		tagger = ngrams.make_backoff_tagger()
		params = {
			"corpora" : self.source,
			"labeled_set" : lambda : self.source.find_clean({"subreddit" : "fitness"}, batch_size=1000, limit=2000),
			"unlabeled_set" : lambda : self.source.find_clean({"subreddit" : "AskReddit"}, batch_size=1000, limit=2000),
			"feature" : lambda x : cls.ner_feature(x,tagger=tagger),
			"exit" : lambda self : self.corpora.exit()
		}

		pnb_a = annotate.PNBAnnotater(**params)
		pnb_a.train()
		pnb_a.describe()
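		# spot-check the trained annotator on the first ten cleaned documents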
		ct = 0
		for doc, annotation in pnb_a.classify_iter(self.source.find_clean()):
			ct += 1
			# print(doc)
			print(annotation)
			if ct == 10:
				break
			print("------------")
Example #6
	def test_evaluate(self):
		# the data sources
		ukr = self.source.find_clean({"subreddit" : "UkrainianConflict"}, limit=2500, batch_size=1000)
		askr = self.source.find_clean({"subreddit" : "AskReddit"}, limit=2500, batch_size=1000)
		alll = self.source.find_clean(limit=10000)
		tagger = ngrams.make_backoff_tagger()

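		# build labelled feature sets: "YES" for UkrainianConflict posts, "NO" for AskReddit posts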
		featuresets = [(cls.ner_feature(doc,tagger=tagger), "YES") for doc in ukr]
		featuresets.extend([(cls.ner_feature(doc,tagger=tagger), "NO") for doc in askr])
		random.shuffle(featuresets)
		
		trainset, testset = featuresets[1250:], featuresets[:1250]
		classifier = NaiveBayesClassifier.train(trainset)
		f = open("./UkrainianConflictNVM","w")
		for doc in alll:
			del (doc["_id"])
			# keep only the documents the classifier labels "YES"
			truthiness = classifier.classify(cls.ner_feature(doc,tagger=tagger)) == "YES"
			if truthiness:
				f.write(json.dumps(doc) + "\n")
		f.close()
		print(nltk.classify.accuracy(classifier, testset))
Example #7
 def setUp(self):
     self.t = self.assertTrue
     self.inst = self.assertIsInstance
     self.source = source.Source(host="localhost", port=27017, database="reddit_stream_test", collection="combined")
     self.tagger = tagger.make_backoff_tagger()
import rdt.data.clean.html as clean
import rdt.data.mongo.source as rdtcorp
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
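    # annotate each cleaned document and collect the results in a buffer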
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del (doc["_id"])
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        if ind % 1000: