示例#1
0
	def train(self, training_file):
		"""Train a CRF tagger with Mallet's SimpleTagger.

		Unwraps ``training_file`` with ``WrapperTools``, hands the result to
		``self.featurize`` together with the name ``featurized_training``,
		and then runs Mallet in training mode on ``featurized_training``
		with ``trained_model`` as the model file. All paths are relative to
		the current working directory.

		Args:
			training_file: Path passed to ``WrapperTools.unwrap``.

		Raises:
			subprocess.CalledProcessError: If the Java process exits with a
				non-zero status.
		"""
		import subprocess

		wt = WrapperTools()
		words = wt.unwrap(training_file)
		self.featurize(words, "featurized_training")

		# os.pathsep yields ":" on POSIX (same as before) and ";" on Windows,
		# which is what the JVM expects for -cp on each platform.
		classpath = os.pathsep.join([
			"../mallet-2.0.8RC3/class",
			"../mallet-2.0.8RC3/lib/mallet-deps.jar",
		])
		# An argument list avoids shell parsing, and check_call surfaces a
		# failed training run instead of silently discarding the exit status.
		subprocess.check_call([
			"java", "-cp", classpath,
			"cc.mallet.fst.SimpleTagger",
			"--train", "true",
			"--model-file", "trained_model",
			"featurized_training",
		])
示例#2
0
 def train(self, training_file):
     """Train a CRF tagger with Mallet's SimpleTagger.

     Unwraps ``training_file`` with ``WrapperTools``, hands the result to
     ``self.featurize`` together with the name ``featurized_training``,
     and then runs Mallet in training mode on ``featurized_training``
     with ``trained_model`` as the model file. All paths are relative to
     the current working directory.

     Args:
         training_file: Path passed to ``WrapperTools.unwrap``.

     Raises:
         subprocess.CalledProcessError: If the Java process exits with a
             non-zero status.
     """
     import subprocess

     wt = WrapperTools()
     words = wt.unwrap(training_file)
     self.featurize(words, "featurized_training")

     # os.pathsep yields ":" on POSIX (same as before) and ";" on Windows,
     # which is what the JVM expects for -cp on each platform.
     classpath = os.pathsep.join(
         ["Mallet/class", "Mallet/lib/mallet-deps.jar"]
     )
     # An argument list avoids shell parsing, and check_call surfaces a
     # failed training run instead of silently discarding the exit status.
     subprocess.check_call(
         [
             "java",
             "-cp",
             classpath,
             "cc.mallet.fst.SimpleTagger",
             "--train",
             "true",
             "--model-file",
             "trained_model",
             "featurized_training",
         ]
     )
示例#3
0
	def test(self, test_file):
		"""Tag a test file with the trained Mallet model and print its score.

		Unwraps ``test_file`` with ``WrapperTools``, calls ``self.featurize``
		with the name ``featurized_test`` and ``False`` as third argument,
		runs Mallet's SimpleTagger with the model file ``trained_model`` and
		captures its standard output in ``tagged_test``. Finally prints the
		result of ``self.get_precision_and_recall(words, "tagged_test")``.

		Args:
			test_file: Path passed to ``WrapperTools.unwrap``.

		Raises:
			subprocess.CalledProcessError: If the Java process exits with a
				non-zero status; no metrics are printed in that case.
		"""
		import subprocess

		wt = WrapperTools()
		words = wt.unwrap(test_file)
		self.featurize(words, "featurized_test", False)

		# os.pathsep yields ":" on POSIX (same as before) and ";" on Windows,
		# which is what the JVM expects for -cp on each platform.
		classpath = os.pathsep.join([
			"../mallet-2.0.8RC3/class",
			"../mallet-2.0.8RC3/lib/mallet-deps.jar",
		])
		# Redirect stdout through a file handle rather than a shell ">" so no
		# shell is involved; check_call stops evaluation if tagging failed,
		# instead of scoring an empty or partial tagged_test.
		with open("tagged_test", "w") as tagged:
			subprocess.check_call([
				"java", "-cp", classpath,
				"cc.mallet.fst.SimpleTagger",
				"--model-file", "trained_model",
				"featurized_test",
			], stdout=tagged)

		# Post-processing path, currently disabled:
		#self.post_process("tagged_test", words)
		#print self.get_precision_and_recall(words,"tagged_test_postprocessed")
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print(self.get_precision_and_recall(words, "tagged_test"))
示例#4
0
 def test(self, test_file):
     """Tag a test file with the trained Mallet model and print its score.

     Unwraps ``test_file`` with ``WrapperTools``, calls ``self.featurize``
     with the name ``featurized_test`` and ``False`` as third argument,
     runs Mallet's SimpleTagger with the model file ``trained_model`` and
     captures its standard output in ``tagged_test``. Finally prints the
     result of ``self.get_precision_and_recall(words, "tagged_test")``.

     Args:
         test_file: Path passed to ``WrapperTools.unwrap``.

     Raises:
         subprocess.CalledProcessError: If the Java process exits with a
             non-zero status; no metrics are printed in that case.
     """
     import subprocess

     wt = WrapperTools()
     words = wt.unwrap(test_file)
     self.featurize(words, "featurized_test", False)

     # os.pathsep yields ":" on POSIX (same as before) and ";" on Windows,
     # which is what the JVM expects for -cp on each platform.
     classpath = os.pathsep.join(
         ["Mallet/class", "Mallet/lib/mallet-deps.jar"]
     )
     # Redirect stdout through a file handle rather than a shell ">" so no
     # shell is involved; check_call stops evaluation if tagging failed,
     # instead of scoring an empty or partial tagged_test.
     with open("tagged_test", "w") as tagged:
         subprocess.check_call(
             [
                 "java",
                 "-cp",
                 classpath,
                 "cc.mallet.fst.SimpleTagger",
                 "--model-file",
                 "trained_model",
                 "featurized_test",
             ],
             stdout=tagged,
         )
     # Single-argument print(...) behaves the same on Python 2 and 3.
     print(self.get_precision_and_recall(words, "tagged_test"))
示例#5
0
	# Load one POS tag per line of 'pos_output': the tag is whatever follows
	# the last underscore on the line, with surrounding whitespace removed.
	# NOTE(review): presumably 'pos_output' was produced by a tagger run
	# earlier in this function (not visible here) -- confirm.
	with open('pos_output', 'r') as outfile:
		pos_tags = []
		for line in outfile.readlines():
			pos_tags.append(line.split('_')[-1].strip())

	# word_ind is the running position in the flat pos_tags list; it is
	# advanced once per token and never reset between tweets.
	# NOTE(review): this assumes pos_tags has exactly one entry per token
	# across all tweets, in the same order -- otherwise the lookups below
	# misalign or raise IndexError.
	word_ind = 0
	for tweet in tweets:
		for i in range(len(tweet)):
			# Every feature is inserted at index 0, so the ones added later
			# end up in front of the ones added earlier.
			# NOTE(review): 'curr_pos' has no ':' separator, unlike the other
			# prefixes below -- confirm whether that is intentional.
			tweet[i].insert(0, 'curr_pos'+pos_tags[word_ind])
			# Tag of the previous token, only when one exists in this tweet.
			if i != 0:
				tweet[i].insert(0, 'prev_pos:'+pos_tags[word_ind-1])
			# Tag two tokens back, only from the third token onwards.
			if i > 1:
				tweet[i].insert(0, 'prevprev_pos:'+pos_tags[word_ind-2])

			# Tag of the following token, skipped for the tweet's last token.
			if i != (len(tweet)-1):
				tweet[i].insert(0, 'next_pos:'+pos_tags[word_ind+1])

			# Two-ahead feature, currently disabled:
			# if i < (len(tweet)-2):
			# 	tweet[i].insert(0, 'nextnext_pos:'+pos_tags[word_ind+2])
			word_ind += 1
	# Move the process working directory up one level.
	# NOTE(review): presumably undoes a chdir made earlier in this function.
	os.chdir('..')

	# The token lists inside tweets were modified in place above.
	return tweets			

if __name__ == "__main__":
	# Script entry point: train on the gold training set, then evaluate on
	# the dev and test gold sets in that order.
	# NOTE(review): wt / words are not used again in this block.
	wt = WrapperTools()
	words = wt.unwrap("./proj1-data/train.gold")

	NT = NER_Tagger()
	NT.train("./proj1-data/train.gold")
	for gold_path in ("./proj1-data/dev.gold", "./proj1-data/test.gold"):
		NT.test(gold_path)