def train(self, training_file):
    """Train a CRF tagger using Mallet's SimpleTagger.

    The training data is unwrapped with ``WrapperTools``, handed to
    ``self.featurize`` together with the file name ``featurized_training``,
    and Mallet is then run on ``featurized_training`` with
    ``--model-file trained_model``.  All file names are relative to the
    current working directory.

    Args:
        training_file: Path of the training data given to
            ``WrapperTools.unwrap``.

    Raises:
        RuntimeError: If the Mallet command returns a non-zero status.
    """
    words = WrapperTools().unwrap(training_file)
    self.featurize(words, "featurized_training")

    command = (
        "java -cp "
        "\"../mallet-2.0.8RC3/class:../mallet-2.0.8RC3/lib/mallet-deps.jar\" "
        "cc.mallet.fst.SimpleTagger --train true "
        "--model-file trained_model featurized_training"
    )
    # os.system returns the command's exit status; check it so that a failed
    # Java run is reported instead of silently leaving a missing or stale
    # trained_model behind.
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            "Mallet training failed (status %s): %s" % (status, command)
        )
def train(self, training_file):
    """Train a CRF tagger using Mallet's SimpleTagger.

    The training data is unwrapped with ``WrapperTools``, handed to
    ``self.featurize`` together with the file name ``featurized_training``,
    and Mallet is then run on ``featurized_training`` with
    ``--model-file trained_model``.  All paths (including the ``Mallet``
    directory on the classpath) are relative to the current working
    directory.

    Args:
        training_file: Path of the training data given to
            ``WrapperTools.unwrap``.

    Raises:
        RuntimeError: If the Mallet command returns a non-zero status.
    """
    words = WrapperTools().unwrap(training_file)
    self.featurize(words, "featurized_training")

    command = (
        "java -cp "
        '"Mallet/class:Mallet/lib/mallet-deps.jar" '
        "cc.mallet.fst.SimpleTagger "
        "--train true --model-file trained_model featurized_training"
    )
    # os.system returns the command's exit status; check it so that a failed
    # Java run is reported instead of silently leaving a missing or stale
    # trained_model behind.
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            "Mallet training failed (status %s): %s" % (status, command)
        )
def test(self, test_file):
    """Tag a test file with the trained Mallet model and print the scores.

    The test data is unwrapped with ``WrapperTools``, handed to
    ``self.featurize`` together with the file name ``featurized_test``, and
    Mallet's SimpleTagger is run on ``featurized_test`` using the model in
    ``trained_model``; its standard output is redirected to ``tagged_test``.
    The result of ``self.get_precision_and_recall(words, "tagged_test")`` is
    then printed.

    Args:
        test_file: Path of the test data given to ``WrapperTools.unwrap``.

    Raises:
        RuntimeError: If the Mallet command returns a non-zero status.
    """
    words = WrapperTools().unwrap(test_file)
    # The third argument differs from the call made during training;
    # presumably it switches featurize out of training mode -- confirm
    # against featurize's signature.
    self.featurize(words, "featurized_test", False)

    command = (
        "java -cp "
        "\"../mallet-2.0.8RC3/class:../mallet-2.0.8RC3/lib/mallet-deps.jar\" "
        "cc.mallet.fst.SimpleTagger "
        "--model-file trained_model featurized_test > tagged_test"
    )
    # The shell redirect creates/truncates tagged_test even when Java fails,
    # so check the exit status rather than scoring an empty or partial file.
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            "Mallet tagging failed (status %s): %s" % (status, command)
        )

    # NOTE: post-processing (self.post_process) is currently disabled, so the
    # scores below are computed on the raw tagger output in tagged_test.
    # Single-argument parenthesised print behaves the same under Python 2.
    print(self.get_precision_and_recall(words, "tagged_test"))
def test(self, test_file):
    """Tag a test file with the trained Mallet model and print the scores.

    The test data is unwrapped with ``WrapperTools``, handed to
    ``self.featurize`` together with the file name ``featurized_test``, and
    Mallet's SimpleTagger is run on ``featurized_test`` using the model in
    ``trained_model``; its standard output is redirected to ``tagged_test``.
    The result of ``self.get_precision_and_recall(words, "tagged_test")`` is
    then printed.  All paths (including the ``Mallet`` directory on the
    classpath) are relative to the current working directory.

    Args:
        test_file: Path of the test data given to ``WrapperTools.unwrap``.

    Raises:
        RuntimeError: If the Mallet command returns a non-zero status.
    """
    words = WrapperTools().unwrap(test_file)
    # The third argument differs from the call made during training;
    # presumably it switches featurize out of training mode -- confirm
    # against featurize's signature.
    self.featurize(words, "featurized_test", False)

    command = (
        "java -cp "
        '"Mallet/class:Mallet/lib/mallet-deps.jar" '
        "cc.mallet.fst.SimpleTagger "
        "--model-file trained_model featurized_test > tagged_test"
    )
    # The shell redirect creates/truncates tagged_test even when Java fails,
    # so check the exit status rather than scoring an empty or partial file.
    status = os.system(command)
    if status != 0:
        raise RuntimeError(
            "Mallet tagging failed (status %s): %s" % (status, command)
        )

    # Single-argument parenthesised print behaves the same under Python 2.
    print(self.get_precision_and_recall(words, "tagged_test"))
with open('pos_output', 'r') as outfile: pos_tags = [] for line in outfile.readlines(): pos_tags.append(line.split('_')[-1].strip()) word_ind = 0 for tweet in tweets: for i in range(len(tweet)): tweet[i].insert(0, 'curr_pos'+pos_tags[word_ind]) if i != 0: tweet[i].insert(0, 'prev_pos:'+pos_tags[word_ind-1]) if i > 1: tweet[i].insert(0, 'prevprev_pos:'+pos_tags[word_ind-2]) if i != (len(tweet)-1): tweet[i].insert(0, 'next_pos:'+pos_tags[word_ind+1]) # if i < (len(tweet)-2): # tweet[i].insert(0, 'nextnext_pos:'+pos_tags[word_ind+2]) word_ind += 1 os.chdir('..') return tweets if __name__ == "__main__": wt = WrapperTools() words = wt.unwrap("./proj1-data/train.gold") NT = NER_Tagger() NT.train("./proj1-data/train.gold") NT.test("./proj1-data/dev.gold") NT.test("./proj1-data/test.gold")