Пример #1
0
def no_backoff_taggers(test, train, corpus='floresta'):
    default_tagger = default_tagger_corpus(corpus)

    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # unigram tagger
    uni_tagger = UnigramTagger(train)
    # bigram tagger
    bi_tagger = BigramTagger(train)
    # trigram tagger
    tri_tagger = TrigramTagger(train)

    info(uni_tagger)
    uni_score = uni_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(uni_score))

    info(bi_tagger)
    bi_score = bi_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(bi_score))

    info(tri_tagger)
    tri_score = tri_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(tri_score))
Пример #2
0
def TrainTaggers(training, testing):
    global results
    Unigram = UnigramTagger(training, backoff = default)
    print('unigram trained')
    Bigram = BigramTagger(training, backoff = Unigram)
    print('bigram trained')
    Trigram = TrigramTagger(training, backoff = Bigram)
    print('trigram trained')
    results += [Trigram.evaluate(testing)]
Пример #3
0
def backoff_taggers(test, train, save, corpus='floresta'):
    default_tagger = default_tagger_corpus(corpus)
    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # UNIGRAM TAGGER WITH BACKOFF
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)

    # BIGRAM TAGGER WITH BACKOFF
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)

    # TRIGRAM TAGGER WITH BACKOFF
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    info(uni_tagger_backoff)
    uni_backoff_score = uni_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(uni_backoff_score))

    info(bi_tagger_backoff)
    bi_backoff_score = bi_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(bi_backoff_score))

    info(tri_tagger_backoff)
    tri_backoff_score = tri_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(tri_backoff_score))

    if not save:
        return

    accuracy_dict = {}
    accuracy_dict['uni'] = uni_backoff_score
    accuracy_dict['bi'] = bi_backoff_score
    accuracy_dict['tri'] = tri_backoff_score

    # Saving our Trigram-tagger with backoff
    if uni_backoff_score == max(accuracy_dict.values()):
        tagger_file = '{}_unigram_tagger_backoff.pkl'.format(corpus)
        output = open(tagger_file, 'wb')
        dump(uni_tagger_backoff, output, -1)
    elif bi_backoff_score == max(accuracy_dict.values()):
        tagger_file = '{}_bigram_tagger_backoff.pkl'.format(corpus)
        output = open(tagger_file, 'wb')
        dump(bi_tagger_backoff, output, -1)
    elif tri_backoff_score == max(accuracy_dict.values()):
        tagger_file = '{}_trigram_tagger_backoff.pkl'.format(corpus)
        dump(tri_tagger_backoff, output, -1)
    output.close()
    info('saving %s...\n', tagger_file)
Пример #4
0
    def createModel(self):

        model_name = None
        try:
            unigrams = self.buildUnigrams()

            N = len(self.corpusSents)
            toTraining = round(self.training_portion * N)

            #logging.info("Sentencias totales:" + str(N))

            training = self.corpusSents[:toTraining]
            test = self.corpusSents[toTraining:]

            post_patterns = []

            for regex, post in self.regex_list:
                try:
                    regex = regex.decode('utf-8')
                except:
                    pass

                post_patterns.append((regex, post))

            for regex, post in self.config.items('postaggers.regex'):
                post_patterns.append((regex.decode('utf-8'), post))

            regexpTagger = RegexpTagger(post_patterns)
            unigramTagger = UnigramTagger(unigrams + training,
                                          backoff=regexpTagger)
            bigramTagger = BigramTagger(training, backoff=unigramTagger)
            trigramTagger = TrigramTagger(training, backoff=bigramTagger)
            NTagger = NgramTagger(self.max_ngrams,
                                  training,
                                  backoff=trigramTagger)

            print("Sentencias de entrenamiento para n-taggers:" +
                  str(len(training)))
            print("Sentencias de entrenamiento para unitaggers:" +
                  str(len(unigrams)))
            print(
                "Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:"
                + str(len(unigrams)))
            print("Sentencias para testing:" + str(len(test)))
            print("Expresiones regulares para el Tagger:")

            for post_regex in post_patterns:
                print post_regex

            if self.training_portion != 1:

                score_ut = unigramTagger.evaluate(test)
                score_bt = bigramTagger.evaluate(test) - 0.002
                score_tt = trigramTagger.evaluate(test)
                score_nt = NTagger.evaluate(test)

                scores = [score_ut, score_bt, score_tt, score_nt]
                tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
                taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

                bestTagger_index = scores.index(max(scores))
                best_msg = max(scores), tagger_names[bestTagger_index]

            fname = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(fname + self.tagger_extension_file):
                fname = fname + str(len(listdir(
                    self.taggers_path))) + self.tagger_extension_file
            else:
                fname = self.taggers_path + tagger_names[
                    bestTagger_index] + self.tagger_extension_file

            model = taggers[bestTagger_index]

            f = open(fname, 'wb')
            pickle.dump(model, f)
            f.close()

            print("Guardando el tagger :" + fname)
            #logging.info("Guardando el mejor tagger :" + fname)

            model_name = fname

        except Exception, e:
            print "ERRPR EN POS TAGGER GENERATOR:", str(e)
            pdb.set_trace()
Пример #5
0
      BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50))
print("Default: ", defaultTB50.evaluate(brownT50))

print("Bigram Brown 90/10: ",
      BigramTagger(brownT90, backoff=defaultTB90).evaluate(brownT90))
print("Default: ", defaultTB90.evaluate(brownT90))

print("Unigram chat 50/50: ",
      UnigramTagger(chatT50, backoff=defaultTChat50).evaluate(chatT50))
print("Default: ", defaultTChat50.evaluate(chatT50))

print("Unigram chat 90/10: ",
      UnigramTagger(chatT90, backoff=defaultTChat90).evaluate(chatT90))
print("Default: ", defaultTChat90.evaluate(chatT90))

print("Trigram to Bigram to Unigram Brown 90/10: ", triB.evaluate(brownT50))
print("Default: ", defaultTB90.evaluate(brownT90))

print("Trigram to Bigram to Unigram with chat 50/50: ", triB.evaluate(chatT50))
print("Default: ", defaultTChat50.evaluate(chatT50))


#3
def lookupTagger(r, c):  # r = range, c = corpus
    if (c == "brown"):
        fDist = ConditionalFreqDist(brownTW)
        freqDist = FreqDist(brown.words())
        wordsR = freqDist.most_common(r)
        likely_tags = dict((word, fDist[word].max()) for (word, _) in wordsR)
        baseline_tagger = UnigramTagger(model=likely_tags,
                                        backoff=nltk.DefaultTagger("NN"))
Пример #6
0
	def createModel(self):

		
		model_name=None
		try:
			unigrams=self.buildUnigrams()
			
			N=len(self.corpusSents)
			toTraining=round(self.training_portion*N)
			
			#logging.info("Sentencias totales:" + str(N))

			training=self.corpusSents[:toTraining]
			test=self.corpusSents[toTraining:]
			
			post_patterns=[]

			for regex,post in self.regex_list:
				try:
					regex=regex.decode('utf-8')
				except:
					pass
				
				post_patterns.append((regex,post))


			
			for regex,post in self.config.items('postaggers.regex'):
				post_patterns.append((regex.decode('utf-8'),post))

		
			regexpTagger  = RegexpTagger(post_patterns)
			unigramTagger = UnigramTagger(unigrams+training,backoff=regexpTagger)	
			bigramTagger= BigramTagger(training, backoff=unigramTagger) 
			trigramTagger = TrigramTagger(training, backoff=bigramTagger)
			NTagger=NgramTagger(self.max_ngrams,training,backoff=trigramTagger)

			print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
			print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
			print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
			print("Sentencias para testing:" + str(len(test)))
			print("Expresiones regulares para el Tagger:")
			
			for post_regex in post_patterns:
				print post_regex
				
		
			if self.training_portion!=1:
		
				score_ut=unigramTagger.evaluate(test)
				score_bt=bigramTagger.evaluate(test)-0.002
				score_tt=trigramTagger.evaluate(test)
				score_nt=NTagger.evaluate(test)

			

				scores=[score_ut,score_bt,score_tt,score_nt]
				tagger_names=["uTagger","biTagger","triTagger","NTagger"]
				taggers=[unigramTagger,bigramTagger,trigramTagger,NTagger]

				bestTagger_index= scores.index(max(scores))
				best_msg=max(scores),tagger_names[bestTagger_index]
			
		
			fname=self.taggers_path + tagger_names[bestTagger_index]
			if os.path.isfile(fname+self.tagger_extension_file):
				fname=fname+str(len(listdir(self.taggers_path)))+self.tagger_extension_file
			else:
				fname=self.taggers_path + tagger_names[bestTagger_index]+self.tagger_extension_file
			
			model=taggers[bestTagger_index]

			f = open(fname,'wb')
			pickle.dump(model, f)
			f.close()
			
			print ("Guardando el tagger :" + fname)
			#logging.info("Guardando el mejor tagger :" + fname)
			
			model_name=fname
			
		except Exception,e:
			print "ERRPR EN POS TAGGER GENERATOR:",str(e)
			pdb.set_trace()