def no_backoff_taggers(test, train, corpus='floresta'):
    """Train uni-, bi- and trigram taggers without backoff and report accuracy.

    The corpus default tagger is evaluated first as a baseline. The three
    n-gram taggers are then all trained on ``train`` before any of them is
    evaluated on ``test``. Every tagger is passed to ``info`` and its
    accuracy is printed; nothing is returned.

    Args:
        test: Tagged sentences handed to each tagger's ``evaluate``.
        train: Tagged sentences used to train the n-gram taggers.
        corpus: Corpus name forwarded to ``default_tagger_corpus`` and used
            in the log message.
    """
    baseline = default_tagger_corpus(corpus)

    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')

    info(baseline)
    print('accuracy score: {}\n'.format(baseline.evaluate(test)))

    # Train every model up front, then report them in increasing n-gram order.
    trained = [
        tagger_cls(train)
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger)
    ]
    for tagger in trained:
        info(tagger)
        print('accuracy score: {}\n'.format(tagger.evaluate(test)))
def TrainTaggers(training, testing):
    """Train a unigram -> bigram -> trigram backoff chain and record its score.

    Each tagger backs off to the previous one; the unigram tagger backs off
    to the module-level ``default`` tagger. The trigram tagger's accuracy on
    ``testing`` is appended to the module-level ``results``.

    Args:
        training: Tagged sentences used to train all three taggers.
        testing: Tagged sentences passed to the trigram tagger's ``evaluate``.
    """
    global results

    unigram_tagger = UnigramTagger(training, backoff=default)
    print('unigram trained')

    bigram_tagger = BigramTagger(training, backoff=unigram_tagger)
    print('bigram trained')

    trigram_tagger = TrigramTagger(training, backoff=bigram_tagger)
    print('trigram trained')

    trigram_score = trigram_tagger.evaluate(testing)
    results += [trigram_score]
def backoff_taggers(test, train, save, corpus='floresta'):
    """Train uni-, bi- and trigram taggers with backoff and report accuracy.

    The chain is default -> unigram -> bigram -> trigram: each tagger backs
    off to the previous one. Every tagger is passed to ``info`` and its
    accuracy on ``test`` is printed. When ``save`` is truthy, the
    best-scoring n-gram tagger is pickled to
    ``<corpus>_<unigram|bigram|trigram>_tagger_backoff.pkl`` in the current
    working directory.

    Args:
        test: Tagged sentences handed to each tagger's ``evaluate``.
        train: Tagged sentences used to train the n-gram taggers.
        save: If falsy, return after reporting without writing a file.
        corpus: Corpus name forwarded to ``default_tagger_corpus`` and used
            in log messages and the output file name.

    Returns:
        None.
    """
    default_tagger = default_tagger_corpus(corpus)

    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # Build the whole backoff chain before evaluating anything.
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    info(uni_tagger_backoff)
    uni_backoff_score = uni_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(uni_backoff_score))

    info(bi_tagger_backoff)
    bi_backoff_score = bi_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(bi_backoff_score))

    info(tri_tagger_backoff)
    tri_backoff_score = tri_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(tri_backoff_score))

    if not save:
        return

    # Listed in tie-break order: on equal scores the simpler tagger wins.
    candidates = [
        ('unigram', uni_tagger_backoff, uni_backoff_score),
        ('bigram', bi_tagger_backoff, bi_backoff_score),
        ('trigram', tri_tagger_backoff, tri_backoff_score),
    ]
    best_score = max(score for _, _, score in candidates)
    best_name, best_tagger = next(
        (name, tagger)
        for name, tagger, score in candidates
        if score == best_score
    )

    tagger_file = '{}_{}_tagger_backoff.pkl'.format(corpus, best_name)
    # Every candidate goes through the same open/dump path; the trigram case
    # used to dump to a file object that had never been opened.
    with open(tagger_file, 'wb') as output:
        dump(best_tagger, output, -1)
    info('saving %s...\n', tagger_file)
def createModel(self):
    """Train a chain of backoff POS taggers and pickle the best-scoring one.

    The backoff chain is regexp -> unigram -> bigram -> trigram -> n-gram
    (order ``self.max_ngrams``). ``self.corpusSents`` is split into a
    training prefix and a test suffix according to ``self.training_portion``.
    When that portion is not 1, every tagger is evaluated on the test
    suffix and the one with the highest score is pickled under
    ``self.taggers_path``.

    Reads ``self.corpusSents``, ``self.training_portion``,
    ``self.regex_list``, ``self.config``, ``self.max_ngrams``,
    ``self.taggers_path`` and ``self.tagger_extension_file``; calls
    ``self.buildUnigrams()``.

    Any exception is reported on stdout and then drops into ``pdb``.
    """
    # NOTE(review): model_name is set on success but never returned in the
    # code visible here - confirm whether callers expect it as the result.
    model_name = None
    try:
        unigrams = self.buildUnigrams()
        N = len(self.corpusSents)
        # round() yields a float on Python 2, which is not a valid slice index.
        toTraining = int(round(self.training_portion * N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        def as_text(pattern):
            # Decode byte-string patterns; values that cannot be decoded
            # (already text, or not valid UTF-8) are used unchanged.
            try:
                return pattern.decode('utf-8')
            except (AttributeError, UnicodeError):
                return pattern

        # Built-in patterns first, then the ones from the config section.
        post_patterns = [
            (as_text(regex), post) for regex, post in self.regex_list
        ]
        post_patterns.extend(
            (as_text(regex), post)
            for regex, post in self.config.items('postaggers.regex'))

        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training,
                                      backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training,
                              backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" +
              str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" +
              str(len(unigrams)))
        print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el "
              "unitagger:" + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")
        for post_regex in post_patterns:
            print(post_regex)

        # Without held-out sentences there is nothing to score, so no model
        # is selected or saved.
        if self.training_portion != 1:
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]
            scores = [
                unigramTagger.evaluate(test),
                # Fixed 0.002 handicap on the bigram score; the reason is
                # not visible here - presumably to break near-ties in
                # favour of the other taggers.
                bigramTagger.evaluate(test) - 0.002,
                trigramTagger.evaluate(test),
                NTagger.evaluate(test),
            ]
            bestTagger_index = scores.index(max(scores))
            model = taggers[bestTagger_index]

            base = self.taggers_path + tagger_names[bestTagger_index]
            fname = base + self.tagger_extension_file
            if os.path.isfile(fname):
                # Avoid overwriting an earlier model: suffix the name with
                # the number of entries currently in the taggers directory.
                fname = (base + str(len(listdir(self.taggers_path))) +
                         self.tagger_extension_file)

            with open(fname, 'wb') as f:
                pickle.dump(model, f)
            print("Guardando el tagger :" + fname)
            model_name = fname
    except Exception as e:
        print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
        # NOTE(review): this blocks non-interactive runs; kept because the
        # existing behaviour is to enter the debugger on failure.
        pdb.set_trace()
BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50)) print("Default: ", defaultTB50.evaluate(brownT50)) print("Bigram Brown 90/10: ", BigramTagger(brownT90, backoff=defaultTB90).evaluate(brownT90)) print("Default: ", defaultTB90.evaluate(brownT90)) print("Unigram chat 50/50: ", UnigramTagger(chatT50, backoff=defaultTChat50).evaluate(chatT50)) print("Default: ", defaultTChat50.evaluate(chatT50)) print("Unigram chat 90/10: ", UnigramTagger(chatT90, backoff=defaultTChat90).evaluate(chatT90)) print("Default: ", defaultTChat90.evaluate(chatT90)) print("Trigram to Bigram to Unigram Brown 90/10: ", triB.evaluate(brownT50)) print("Default: ", defaultTB90.evaluate(brownT90)) print("Trigram to Bigram to Unigram with chat 50/50: ", triB.evaluate(chatT50)) print("Default: ", defaultTChat50.evaluate(chatT50)) #3 def lookupTagger(r, c): # r = range, c = corpus if (c == "brown"): fDist = ConditionalFreqDist(brownTW) freqDist = FreqDist(brown.words()) wordsR = freqDist.most_common(r) likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR) baseline_tagger = UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger("NN"))
def createModel(self):
    """Train a chain of backoff POS taggers and pickle the best-scoring one.

    The backoff chain is regexp -> unigram -> bigram -> trigram -> n-gram
    (order ``self.max_ngrams``). ``self.corpusSents`` is split into a
    training prefix and a test suffix according to ``self.training_portion``.
    When that portion is not 1, every tagger is evaluated on the test
    suffix and the one with the highest score is pickled under
    ``self.taggers_path``.

    Reads ``self.corpusSents``, ``self.training_portion``,
    ``self.regex_list``, ``self.config``, ``self.max_ngrams``,
    ``self.taggers_path`` and ``self.tagger_extension_file``; calls
    ``self.buildUnigrams()``.

    Any exception is reported on stdout and then drops into ``pdb``.
    """
    # NOTE(review): model_name is set on success but never returned in the
    # code visible here - confirm whether callers expect it as the result.
    model_name = None
    try:
        unigrams = self.buildUnigrams()
        N = len(self.corpusSents)
        # round() yields a float on Python 2, which is not a valid slice index.
        toTraining = int(round(self.training_portion * N))
        training = self.corpusSents[:toTraining]
        test = self.corpusSents[toTraining:]

        def as_text(pattern):
            # Decode byte-string patterns; values that cannot be decoded
            # (already text, or not valid UTF-8) are used unchanged.
            try:
                return pattern.decode('utf-8')
            except (AttributeError, UnicodeError):
                return pattern

        # Built-in patterns first, then the ones from the config section.
        post_patterns = [
            (as_text(regex), post) for regex, post in self.regex_list
        ]
        post_patterns.extend(
            (as_text(regex), post)
            for regex, post in self.config.items('postaggers.regex'))

        regexpTagger = RegexpTagger(post_patterns)
        unigramTagger = UnigramTagger(unigrams + training,
                                      backoff=regexpTagger)
        bigramTagger = BigramTagger(training, backoff=unigramTagger)
        trigramTagger = TrigramTagger(training, backoff=bigramTagger)
        NTagger = NgramTagger(self.max_ngrams, training,
                              backoff=trigramTagger)

        print("Sentencias de entrenamiento para n-taggers:" +
              str(len(training)))
        print("Sentencias de entrenamiento para unitaggers:" +
              str(len(unigrams)))
        print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el "
              "unitagger:" + str(len(unigrams)))
        print("Sentencias para testing:" + str(len(test)))
        print("Expresiones regulares para el Tagger:")
        for post_regex in post_patterns:
            print(post_regex)

        # Without held-out sentences there is nothing to score, so no model
        # is selected or saved.
        if self.training_portion != 1:
            tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
            taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]
            scores = [
                unigramTagger.evaluate(test),
                # Fixed 0.002 handicap on the bigram score; the reason is
                # not visible here - presumably to break near-ties in
                # favour of the other taggers.
                bigramTagger.evaluate(test) - 0.002,
                trigramTagger.evaluate(test),
                NTagger.evaluate(test),
            ]
            bestTagger_index = scores.index(max(scores))
            model = taggers[bestTagger_index]

            base = self.taggers_path + tagger_names[bestTagger_index]
            fname = base + self.tagger_extension_file
            if os.path.isfile(fname):
                # Avoid overwriting an earlier model: suffix the name with
                # the number of entries currently in the taggers directory.
                fname = (base + str(len(listdir(self.taggers_path))) +
                         self.tagger_extension_file)

            with open(fname, 'wb') as f:
                pickle.dump(model, f)
            print("Guardando el tagger :" + fname)
            model_name = fname
    except Exception as e:
        print("ERRPR EN POS TAGGER GENERATOR: " + str(e))
        # NOTE(review): this blocks non-interactive runs; kept because the
        # existing behaviour is to enter the debugger on failure.
        pdb.set_trace()