示例#1
0
 def __init__(self, mode, train_sents):
     """Build the underlying tagger selected by ``mode``.

     mode: ``TRIGRAM`` trains a unigram -> bigram -> trigram backoff chain;
         ``HDM`` trains a HiddenMarkovModelTagger.
     train_sents: training sentences, passed unchanged to the NLTK trainers.

     Raises ValueError for any other mode so that ``self.tagger`` is never
     left unset.
     """
     if mode == TRIGRAM:
         self.tagger = UnigramTagger(train_sents)
         self.tagger = BigramTagger(train_sents, backoff=self.tagger)
         self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
     # Compare against the requested mode; testing the bare constant made
     # this branch depend only on the truthiness of HDM.
     elif mode == HDM:
         self.tagger = HiddenMarkovModelTagger.train(train_sents)
     else:
         raise ValueError('unknown tagger mode: {!r}'.format(mode))
    def __init__(self, train_sents):
        """Train the tagger on (POS tag, IOB chunk tag) pairs.

        train_sents: iterable whose items are each passed to
            tree2conlltags; the word of every resulting triple is dropped.
        """
        train_data = []
        for sent in train_sents:
            pairs = [(pos, chunk) for _word, pos, chunk in tree2conlltags(sent)]
            train_data.append(pairs)

        # The trigram tagger is built without any backoff tagger.
        self.tagger = TrigramTagger(train_data)
示例#3
0
def no_backoff_taggers(test, train, corpus='floresta'):
    """Train unigram, bigram and trigram taggers without backoff and print
    the evaluation score of each on ``test``, after the default tagger's.

    test: data passed to every tagger's evaluate().
    train: data passed to every n-gram tagger constructor.
    corpus: corpus name used to pick the default tagger and in log output.
    """
    baseline = default_tagger_corpus(corpus)

    info('training {} taggers without backoff'.format(corpus))
    info('this may take a while...\n')

    info(baseline)
    print('accuracy score: {}\n'.format(baseline.evaluate(test)))

    # All three taggers are trained before any of them is evaluated.
    trained = [tagger_cls(train)
               for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger)]

    for tagger in trained:
        info(tagger)
        print('accuracy score: {}\n'.format(tagger.evaluate(test)))
示例#4
0
def TrainTaggers(training, testing):
    """Train a unigram -> bigram -> trigram backoff chain on ``training`` and
    append the trigram tagger's evaluation on ``testing`` to the global
    ``results``.

    The innermost backoff is the module-level ``default`` tagger.
    """
    global results
    stages = (
        (UnigramTagger, 'unigram trained'),
        (BigramTagger, 'bigram trained'),
        (TrigramTagger, 'trigram trained'),
    )
    tagger = default
    for tagger_cls, message in stages:
        # Each stage backs off to the tagger built in the previous stage.
        tagger = tagger_cls(training, backoff=tagger)
        print(message)
    results += [tagger.evaluate(testing)]
示例#5
0
    def __init__(self, train_sents, to_detect_list, n_gram=1):
        """Train an n-gram tagger over (tag, chunk) pairs.

        train_sents: sentences of (word, tag, chunk) triples; the word is
            dropped before training.
        to_detect_list: stored unchanged on the instance.
        n_gram: 1 keeps a unigram tagger, 2 adds a bigram tagger on top,
            3 or more additionally adds a trigram tagger.
        """
        self.to_detect_list = to_detect_list

        pairs = [[(tag, chunk) for _word, tag, chunk in sent]
                 for sent in train_sents]

        tagger = UnigramTagger(pairs)
        if n_gram > 1:
            tagger = BigramTagger(pairs, backoff=tagger)
        if n_gram > 2:
            tagger = TrigramTagger(pairs, backoff=tagger)
        self.tagger = tagger
示例#6
0
    def __init__(self, train_sents):
        """Train a trigram tagger backed by bigram, unigram and 'NN' default
        taggers.

        train_sents: training sentences which have already been tagged
        (the original note mentions the Brown, conll2000 and TreeBank
        corpora).
        """
        chain = DefaultTagger('NN')
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            # Every stage uses the previously built tagger as its backoff.
            chain = tagger_cls(train_sents, backoff=chain)
        self.tagger = chain
示例#7
0
def backoff_taggers(test, train, save, corpus='floresta'):
    """Train a unigram -> bigram -> trigram backoff chain and report scores.

    Args:
        test: data passed to every tagger's evaluate().
        train: data passed to every n-gram tagger constructor.
        save: when truthy, pickle the best-scoring tagger to
            '<corpus>_<order>_tagger_backoff.pkl' (relative path).
        corpus: corpus name; selects the default tagger and prefixes the
            pickle file name.

    Returns:
        None.
    """
    default_tagger = default_tagger_corpus(corpus)
    info('training {} taggers with backoff'.format(corpus))
    info('this may take a while...\n')

    info(default_tagger)
    default_score = default_tagger.evaluate(test)
    print('accuracy score: {}\n'.format(default_score))

    # Each tagger backs off to the one built just before it.
    uni_tagger_backoff = UnigramTagger(train, backoff=default_tagger)
    bi_tagger_backoff = BigramTagger(train, backoff=uni_tagger_backoff)
    tri_tagger_backoff = TrigramTagger(train, backoff=bi_tagger_backoff)

    info(uni_tagger_backoff)
    uni_backoff_score = uni_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(uni_backoff_score))

    info(bi_tagger_backoff)
    bi_backoff_score = bi_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(bi_backoff_score))

    info(tri_tagger_backoff)
    tri_backoff_score = tri_tagger_backoff.evaluate(test)
    print('accuracy score: {}\n'.format(tri_backoff_score))

    if not save:
        return

    # Listed in tie-break order: max() returns the first maximal entry, so a
    # lower-order tagger wins when scores are equal.
    candidates = [
        ('unigram', uni_tagger_backoff, uni_backoff_score),
        ('bigram', bi_tagger_backoff, bi_backoff_score),
        ('trigram', tri_tagger_backoff, tri_backoff_score),
    ]
    label, best_tagger, _ = max(candidates, key=lambda entry: entry[2])

    # One shared save path for all three cases: previously the trigram case
    # never opened the output file and failed on the dump.
    tagger_file = '{}_{}_tagger_backoff.pkl'.format(corpus, label)
    with open(tagger_file, 'wb') as output:
        dump(best_tagger, output, -1)
    # NOTE(review): info() is called with a format argument here but with a
    # pre-formatted string elsewhere - confirm it accepts both.
    info('saving %s...\n', tagger_file)
示例#8
0
def train_tagger(corpus_name, corpus):
	"""
	Train the taggers and saves them
	
	Args:
		corpus_name: 	name of the corpus used to create the tagger
		corpus: 		corpus for creating the tagger
	"""
	
	#List of n-gram taggers names
	complete_names = [corpus_name + '_' + x for x in N_GRAM_NAMES]
	
	# Training UnigramTagger
	tagger1 = UnigramTagger(corpus)
	utilities.save_pickle(tagger1, complete_names[0], TAGGER_EXTENSION, TAGGER_PATH)
	print "UnigramTagger trained with", corpus_name
	
	# Training BigramTagger
	tagger2 = BigramTagger(corpus)
	utilities.save_pickle(tagger2, complete_names[1], TAGGER_EXTENSION, TAGGER_PATH)
	print "BigramTagger trained with", corpus_name
	
	# Training TrigramTagger
	tagger3 = TrigramTagger(corpus)
	utilities.save_pickle(tagger3, complete_names[2], TAGGER_EXTENSION, TAGGER_PATH)
	print "TrigramTagger trained with", corpus_name
class NgramTagger(object):
    """Trigram tagger with a bigram / unigram / affix / 'NN' backoff chain."""

    def __init__(self):
        # Stays None until train() has been called.
        self.tagger = None

    def train(self, sentence_list):
        """Build the backoff chain from ``sentence_list`` and keep its head."""
        chain = DefaultTagger('NN')
        for tagger_cls in (AffixTagger, UnigramTagger,
                           BigramTagger, TrigramTagger):
            # Each tagger falls back to the one built before it.
            chain = tagger_cls(sentence_list, backoff=chain)
        self.tagger = chain

    def tag(self, words):
        """Return the trained tagger's output for ``words``.

        Raises Exception when no tagger has been trained yet.
        """
        if self.tagger:
            return self.tagger.tag(words)
        raise Exception("Trigram Tagger not trained.")
            
            
            
示例#10
0
def get_pos_tagger():
    """Build a POS tagger trained on the Brown corpus.

    The returned tagger first applies two word-level overrides
    ("a"/"an" -> 'ex_quant', "every"/"all" -> 'univ_quant'). Any other token
    falls back through trigram -> bigram -> unigram taggers trained on all
    Brown tagged sentences, and finally to a regular-expression tagger whose
    last pattern tags anything as 'NN'.

    Returns:
        The outermost nltk.RegexpTagger of that chain.
    """
    from nltk.corpus import brown
    regexp_tagger = nltk.RegexpTagger([
        # The decimal point is escaped so that only real decimals such as
        # "3.14" match; a bare "." would also accept tokens like "1a5".
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),  # adjectives
        (r'.*ness$', 'NN'),  # nouns formed from adjectives
        (r'.*ly$', 'RB'),  # adverbs
        (r'.*s$', 'NNS'),  # plural nouns
        (r'.*ing$', 'VBG'),  # gerunds
        (r'.*ed$', 'VBD'),  # past tense verbs
        (r'.*', 'NN')  # nouns (default)
    ])
    brown_train = brown.tagged_sents()
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words before the statistical taggers are consulted
    main_tagger = nltk.RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger)

    return main_tagger
示例#11
0
class SubjectTrigramTagger(object):
    """Trigram tagger backed by bigram, unigram and 'NN' default taggers."""

    def __init__(self, train_sents):
        """Train the whole backoff chain on ``train_sents``."""
        chain = DefaultTagger('NN')
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            chain = tagger_cls(train_sents, backoff=chain)
        self.tagger = chain

    def tag(self, tokens):
        """Return the trained tagger's output for ``tokens``."""
        tagged = self.tagger.tag(tokens)
        return tagged
示例#12
0
def create_tagger(sents,patterns=PATTERNS,maxngram=4):
    '''Train a backoff tagger on a corpus of sentences.

    The chain, from outermost to innermost, is: NgramTagger(maxngram) ->
    trigram -> bigram -> unigram -> RegexpTagger(patterns) ->
    DefaultTagger('NN').
    '''
    chain = RegexpTagger(patterns, backoff=DefaultTagger('NN'))
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        # Each tagger backs off to the one built in the previous step.
        chain = tagger_cls(sents, backoff=chain)
    return NgramTagger(maxngram, sents, backoff=chain)
示例#13
0
class Tagger(object):
    """Sentence tagger wrapping either an n-gram backoff chain or an HMM."""

    def __init__(self, mode, train_sents):
        """Build the underlying tagger selected by ``mode``.

        mode: ``TRIGRAM`` trains a unigram -> bigram -> trigram backoff
            chain; ``HDM`` trains a HiddenMarkovModelTagger.
        train_sents: training sentences, passed unchanged to the trainers.

        Raises ValueError for any other mode so that ``self.tagger`` is
        never left unset.
        """
        if mode == TRIGRAM:
            self.tagger = UnigramTagger(train_sents)
            self.tagger = BigramTagger(train_sents, backoff=self.tagger)
            self.tagger = TrigramTagger(train_sents, backoff=self.tagger)
        # Compare against the requested mode; testing the bare constant made
        # this branch depend only on the truthiness of HDM.
        elif mode == HDM:
            self.tagger = HiddenMarkovModelTagger.train(train_sents)
        else:
            raise ValueError('unknown tagger mode: {!r}'.format(mode))

    def tag(self, sentence):
        """Tokenize ``sentence`` with nltk.word_tokenize and tag the tokens."""
        sentence_tokens = nltk.word_tokenize(sentence)
        return self.tagger.tag(sentence_tokens)
 def train(self,sentence_list):
     """Train a trigram tagger with a bigram / unigram / affix / 'NN'
     backoff chain on ``sentence_list`` and store it on ``self.tagger``.
     """
     chain = DefaultTagger('NN')
     for tagger_cls in (AffixTagger, UnigramTagger,
                        BigramTagger, TrigramTagger):
         # Each tagger falls back to the one built before it.
         chain = tagger_cls(sentence_list, backoff=chain)
     self.tagger = chain
示例#15
0
class SimpleChunkParser(ChunkParserI):
    """Chunk parser that predicts a chunk tag for each POS tag with a
    trigram tagger (no backoff).
    """

    def __init__(self, trainingChunkedSents):
        """Train on the (POS tag, chunk tag) pairs of every sentence, as
        produced by tree2conlltags with the word dropped.
        """
        pairs_per_sentence = []
        for chunked in trainingChunkedSents:
            pairs_per_sentence.append(
                [(pos, bio) for _word, pos, bio in tree2conlltags(chunked)])
        self.tagger = TrigramTagger(pairs_per_sentence)

    def parse(self, sent):
        """Chunk ``sent``, a sequence of (word, POS tag) pairs, and return
        the result of conlltags2tree.
        """
        pos_sequence = [pos for _word, pos in sent]
        tagged = self.tagger.tag(pos_sequence)
        # Re-attach the words to the predicted chunk tags, position by position.
        triples = [(word, pos, bio)
                   for (word, pos), (_pos, bio) in zip(sent, tagged)]
        return conlltags2tree(triples)
class TrigramChunkParser(ChunkParserI):
    """Chunk parser driven by a trigram tagger over POS-tag sequences."""

    def __init__(self, train_sents):
        """Train on (POS tag, IOB chunk tag) pairs taken from
        tree2conlltags(sent) for every sentence in ``train_sents``.
        """
        train_data = []
        for sent in train_sents:
            train_data.append(
                [(pos, chunk) for _word, pos, chunk in tree2conlltags(sent)])
        self.tagger = TrigramTagger(train_data)

    def parse(self, sentence):
        """Chunk ``sentence``, a sequence of (word, POS tag) pairs, and
        return the result of conlltags2tree.
        """
        tagged = self.tagger.tag([pos for _word, pos in sentence])
        triples = []
        for (word, _sent_pos), (pos_tag, chunk_tag) in zip(sentence, tagged):
            # The POS tag is the one echoed back by the tagger.
            triples.append((word, pos_tag, chunk_tag))
        return conlltags2tree(triples)
示例#17
0
def trained_tagger():
    """Train a trigram tagger, pickle it and return it.

    The tagger is trained on the Brown, conll2000 and treebank tagged
    sentences and backs off to bigram, unigram and 'NN' default taggers.
    It is written to 'DataBase/trained_tagger.pkl' (relative path).

    Returns:
        The trained TrigramTagger.
    """
    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    # The context manager closes the file even when pickling fails.
    with open(r'DataBase/trained_tagger.pkl', 'wb') as handle:
        pickle.dump(trigram_tagger, handle)

    return trigram_tagger
class SubjectTrigramTagger(object):
    """Wraps an NLTK TrigramTagger whose backoff chain is a bigram tagger,
    a unigram tagger and a default tagger that labels every word 'NN'.
    """

    def __init__(self, train_sents):
        """
        train_sents: training sentences which have already been tagged
                (the original note mentions Brown, conll2000 and TreeBank).
        """
        chain = DefaultTagger('NN')
        for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            # Every stage uses the previously built tagger as its backoff.
            chain = tagger_cls(train_sents, backoff=chain)
        self.tagger = chain

    def tag(self, tokens):
        """Return the trained tagger's output for ``tokens``."""
        tagged = self.tagger.tag(tokens)
        return tagged
class TrigramChunkParser(ChunkParserI):
    """Chunk parser driven by a trigram tagger over POS-tag sequences."""

    def __init__(self, train_sents):
        """Train on (POS tag, IOB chunk tag) pairs taken from
        tree2conlltags(sent) for every sentence in ``train_sents``.
        """
        train_data = []
        for sent in train_sents:
            # Keep the POS tag and the chunk tag; drop the word.
            train_data.append(
                [(pos, chunk) for _word, pos, chunk in tree2conlltags(sent)])

        self.tagger = TrigramTagger(train_data)

    def parse(self, sentence):
        """Chunk ``sentence``, a sequence of (word, POS tag) pairs, and
        return the result of conlltags2tree.
        """
        # Predict one chunk tag per POS tag
        tagged = self.tagger.tag([pos for _word, pos in sentence])

        # Re-attach the words to build (word, pos, chunk) triples
        triples = []
        for (word, _sent_pos), (pos_tag, chunk_tag) in zip(sentence, tagged):
            triples.append((word, pos_tag, chunk_tag))

        return conlltags2tree(triples)
示例#20
0
def trained_tagger():
    """Train a trigram tagger and pickle it, unless the pickle already exists.

    The tagger is trained on the Brown, conll2000 and treebank tagged
    sentences and backs off to bigram, unigram and 'NN' default taggers.
    It is written to 'DataBase/trained_tagger.pkl' under the current working
    directory. When that file already exists a message is printed and
    nothing is trained.

    Returns:
        None in both cases.
    """
    # One path for both the existence check and the write.
    pickle_path = os.path.join(os.getcwd(), r"DataBase/trained_tagger.pkl")

    if os.path.exists(pickle_path):
        print("Trained Tagger File already Exists..")
        return

    # Aggregate trained sentences for N-Gram Taggers
    train_sents = nltk.corpus.brown.tagged_sents()
    train_sents += nltk.corpus.conll2000.tagged_sents()
    train_sents += nltk.corpus.treebank.tagged_sents()

    t0 = DefaultTagger('NN')
    t1 = UnigramTagger(train_sents, backoff=t0)
    t2 = BigramTagger(train_sents, backoff=t1)
    trigram_tagger = TrigramTagger(train_sents, backoff=t2)

    # The context manager closes the file even when pickling fails.
    with open(pickle_path, 'wb') as handle:
        pickle.dump(trigram_tagger, handle)
示例#21
0
文件: tag3.py 项目: mfrodl/pos-tagger
    def __init__(self, train_sents, load=False):
        if load:
            print 'Loading saved tagger...',
            self.load()
            print 'done.'
        else:
            time_start = time.time()

            print 'Training the tagger...'
            tag_counts = Counter([t for s in train_sents for w, t in s])
            default_tag = argmax(tag_counts)

            def_tgr = DefaultTagger(default_tag)
            af_tgr = AffixTagger(train_sents, affix_length=-3, backoff=def_tgr)
            uni_tgr = UnigramTagger(train_sents, backoff=af_tgr)
            bi_tgr = BigramTagger(train_sents, backoff=uni_tgr)
            tri_tgr = TrigramTagger(train_sents, backoff=bi_tgr)
            self.tgr = tri_tgr
            print 'Done.'

            time_stop = time.time()
            print 'Training time: {0:.2f}s'.format(time_stop - time_start)
示例#22
0
class Chunker(nltk.ChunkParserI):
    """Chunk parser that predicts chunk tags from POS tags with an n-gram
    tagger chain and can flatten the parse into nested lists and dicts.
    """

    def __init__(self, train_sents, to_detect_list, n_gram=1):
        """Train the tagger chain.

        train_sents: sentences of (word, tag, chunk) triples; the word is
            dropped before training.
        to_detect_list: stored unchanged on the instance.
        n_gram: 1 keeps a unigram tagger, 2 adds a bigram tagger on top,
            3 or more additionally adds a trigram tagger.
        """
        self.to_detect_list = to_detect_list

        pairs = [[(tag, chunk) for _word, tag, chunk in sent]
                 for sent in train_sents]

        tagger = UnigramTagger(pairs)
        if n_gram > 1:
            tagger = BigramTagger(pairs, backoff=tagger)
        if n_gram > 2:
            tagger = TrigramTagger(pairs, backoff=tagger)
        self.tagger = tagger

    def traverse_to_dic(self, t, dicc):
        """Append a representation of node ``t`` to the list ``dicc``.

        A node with a label() becomes {label: [children...]}; anything
        without one contributes only its first item.
        """
        try:
            label = t.label()
        except AttributeError:
            # No label() method: treat as a leaf and keep its first item.
            dicc.append(list(t)[0])
            return None

        children = []
        dicc.append({label: children})
        for child in t:
            self.traverse_to_dic(child, children)
        return None

    def parse(self, sentence):
        """Chunk ``sentence``, a sequence of (word, POS tag) pairs, and
        return the result of nltk.chunk.conlltags2tree.
        """
        pos_sequence = [pos for (_word, pos) in sentence]
        chunk_sequence = [chunk for (_pos, chunk)
                          in self.tagger.tag(pos_sequence)]
        triples = [(word, pos, chunk)
                   for ((word, pos), chunk) in zip(sentence, chunk_sequence)]
        return nltk.chunk.conlltags2tree(triples)

    def predict(self, sentence):
        """Parse ``sentence`` and return the flattened list built by
        traverse_to_dic.
        """
        collected = []
        self.traverse_to_dic(self.parse(sentence), collected)
        return collected
示例#23
0
def prepare_toolset():
    """Build the dictionary of NLP tools used by the caller.

    Returns:
        dict with keys:
            'tgr': RegexpTagger backed by trigram -> bigram -> unigram ->
                default taggers trained on the Brown 'learned', 'news' and
                'reviews' categories with the universal tagset;
            'sw': the English stop-word list;
            'lr': a WordNetLemmatizer instance;
            'wntg': mapping from universal tags to wordnet POS constants.
    """
    toolset = {}
    # NOTE(review): the first pattern's character class is [\.1-9], so
    # tokens containing the digit 0 are not tagged 'NUM' - confirm intent.
    patterns = [(r'^[\.1-9]+$', 'NUM'), (r'^[^a-zA-Z]+$', '.'),
                (r'^[^a-zA-Z]*[a-zA-Z]+[-\'][a-zA-Z]+[^a-zA-Z]*$', 'NOUN'),
                (r'^.*[a-zA-Z]+[^-a-zA-Z]+[a-zA-Z]+.*$', '.')]
    train_set = brown.tagged_sents(
        categories='learned', tagset='universal') + brown.tagged_sents(
            categories='news', tagset='universal') + brown.tagged_sents(
                categories='reviews', tagset='universal')
    # The training data uses the universal tagset, so the last-resort tag
    # must be the universal 'NOUN'; 'NN' is absent from the 'wntg' map below.
    utgr = UnigramTagger(train=train_set, backoff=DefaultTagger('NOUN'))
    btgr = BigramTagger(train=train_set, backoff=utgr)
    ttgr = TrigramTagger(train=train_set, backoff=btgr)
    toolset['tgr'] = RegexpTagger(regexps=patterns, backoff=ttgr)
    toolset['sw'] = stopwords.words('english')
    toolset['lr'] = WordNetLemmatizer()
    toolset['wntg'] = {
        'NOUN': wordnet.NOUN,
        'VERB': wordnet.VERB,
        'ADJ': wordnet.ADJ,
        'ADV': wordnet.ADV,
        'X': wordnet.NOUN
    }
    print('Tools Ready')
    return toolset
示例#24
0
    def createModel(self):

        model_name = None
        try:
            unigrams = self.buildUnigrams()

            N = len(self.corpusSents)
            toTraining = round(self.training_portion * N)

            #logging.info("Sentencias totales:" + str(N))

            training = self.corpusSents[:toTraining]
            test = self.corpusSents[toTraining:]

            post_patterns = []

            for regex, post in self.regex_list:
                try:
                    regex = regex.decode('utf-8')
                except:
                    pass

                post_patterns.append((regex, post))

            for regex, post in self.config.items('postaggers.regex'):
                post_patterns.append((regex.decode('utf-8'), post))

            regexpTagger = RegexpTagger(post_patterns)
            unigramTagger = UnigramTagger(unigrams + training,
                                          backoff=regexpTagger)
            bigramTagger = BigramTagger(training, backoff=unigramTagger)
            trigramTagger = TrigramTagger(training, backoff=bigramTagger)
            NTagger = NgramTagger(self.max_ngrams,
                                  training,
                                  backoff=trigramTagger)

            print("Sentencias de entrenamiento para n-taggers:" +
                  str(len(training)))
            print("Sentencias de entrenamiento para unitaggers:" +
                  str(len(unigrams)))
            print(
                "Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:"
                + str(len(unigrams)))
            print("Sentencias para testing:" + str(len(test)))
            print("Expresiones regulares para el Tagger:")

            for post_regex in post_patterns:
                print post_regex

            if self.training_portion != 1:

                score_ut = unigramTagger.evaluate(test)
                score_bt = bigramTagger.evaluate(test) - 0.002
                score_tt = trigramTagger.evaluate(test)
                score_nt = NTagger.evaluate(test)

                scores = [score_ut, score_bt, score_tt, score_nt]
                tagger_names = ["uTagger", "biTagger", "triTagger", "NTagger"]
                taggers = [unigramTagger, bigramTagger, trigramTagger, NTagger]

                bestTagger_index = scores.index(max(scores))
                best_msg = max(scores), tagger_names[bestTagger_index]

            fname = self.taggers_path + tagger_names[bestTagger_index]
            if os.path.isfile(fname + self.tagger_extension_file):
                fname = fname + str(len(listdir(
                    self.taggers_path))) + self.tagger_extension_file
            else:
                fname = self.taggers_path + tagger_names[
                    bestTagger_index] + self.tagger_extension_file

            model = taggers[bestTagger_index]

            f = open(fname, 'wb')
            pickle.dump(model, f)
            f.close()

            print("Guardando el tagger :" + fname)
            #logging.info("Guardando el mejor tagger :" + fname)

            model_name = fname

        except Exception, e:
            print "ERRPR EN POS TAGGER GENERATOR:", str(e)
            pdb.set_trace()
示例#25
0
 def __init__(self, trainingChunkedSents):
     """Train a trigram tagger (no backoff) on the (POS tag, chunk tag)
     pairs of every sentence, as produced by tree2conlltags with the word
     dropped.
     """
     pairs_per_sentence = []
     for chunked in trainingChunkedSents:
         pairs_per_sentence.append(
             [(pos, bio) for _word, pos, bio in tree2conlltags(chunked)])
     self.tagger = TrigramTagger(pairs_per_sentence)
示例#26
0
	def createModel(self):

		
		model_name=None
		try:
			unigrams=self.buildUnigrams()
			
			N=len(self.corpusSents)
			toTraining=round(self.training_portion*N)
			
			#logging.info("Sentencias totales:" + str(N))

			training=self.corpusSents[:toTraining]
			test=self.corpusSents[toTraining:]
			
			post_patterns=[]

			for regex,post in self.regex_list:
				try:
					regex=regex.decode('utf-8')
				except:
					pass
				
				post_patterns.append((regex,post))


			
			for regex,post in self.config.items('postaggers.regex'):
				post_patterns.append((regex.decode('utf-8'),post))

		
			regexpTagger  = RegexpTagger(post_patterns)
			unigramTagger = UnigramTagger(unigrams+training,backoff=regexpTagger)	
			bigramTagger= BigramTagger(training, backoff=unigramTagger) 
			trigramTagger = TrigramTagger(training, backoff=bigramTagger)
			NTagger=NgramTagger(self.max_ngrams,training,backoff=trigramTagger)

			print("Sentencias de entrenamiento para n-taggers:" + str(len(training)))
			print("Sentencias de entrenamiento para unitaggers:" + str(len(unigrams)))
			print("Cantidad de palabras ADICIONALES de DICCIONARIOS para el unitagger:" + str(len(unigrams)))
			print("Sentencias para testing:" + str(len(test)))
			print("Expresiones regulares para el Tagger:")
			
			for post_regex in post_patterns:
				print post_regex
				
		
			if self.training_portion!=1:
		
				score_ut=unigramTagger.evaluate(test)
				score_bt=bigramTagger.evaluate(test)-0.002
				score_tt=trigramTagger.evaluate(test)
				score_nt=NTagger.evaluate(test)

			

				scores=[score_ut,score_bt,score_tt,score_nt]
				tagger_names=["uTagger","biTagger","triTagger","NTagger"]
				taggers=[unigramTagger,bigramTagger,trigramTagger,NTagger]

				bestTagger_index= scores.index(max(scores))
				best_msg=max(scores),tagger_names[bestTagger_index]
			
		
			fname=self.taggers_path + tagger_names[bestTagger_index]
			if os.path.isfile(fname+self.tagger_extension_file):
				fname=fname+str(len(listdir(self.taggers_path)))+self.tagger_extension_file
			else:
				fname=self.taggers_path + tagger_names[bestTagger_index]+self.tagger_extension_file
			
			model=taggers[bestTagger_index]

			f = open(fname,'wb')
			pickle.dump(model, f)
			f.close()
			
			print ("Guardando el tagger :" + fname)
			#logging.info("Guardando el mejor tagger :" + fname)
			
			model_name=fname
			
		except Exception,e:
			print "ERRPR EN POS TAGGER GENERATOR:",str(e)
			pdb.set_trace()
示例#27
0
 def __init__(self, train_sents):
     """Train a trigram tagger backed by bigram, unigram and 'NN' default
     taggers on ``train_sents``.
     """
     chain = DefaultTagger('NN')
     for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
         # Every stage uses the previously built tagger as its backoff.
         chain = tagger_cls(train_sents, backoff=chain)
     self.tagger = chain
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default) ...
]

# Regular-expression tagger built from the `patterns` list.
rt = RegexpTagger(patterns)

# The score is computed but its return value is not stored.
rt.evaluate(test_data)

# Three independent n-gram taggers, each trained without a backoff.
ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Only the unigram tagger is evaluated here; the result is not stored.
ut.evaluate(test_data)


def combined_tagger(train_data, taggers, backoff=None):
    """Chain several tagger factories into one backoff sequence.

    Each entry of ``taggers`` is called as ``tagger(train_data, backoff=...)``
    with the previously built tagger as its backoff; the first one receives
    the ``backoff`` argument.

    Returns:
        The last tagger built, or ``backoff`` itself when ``taggers`` is
        empty.
    """
    chain = backoff
    for tagger_cls in taggers:
        chain = tagger_cls(train_data, backoff=chain)
    return chain


# Chain Unigram -> Bigram -> Trigram taggers, with the regexp tagger `rt`
# as the innermost backoff.
ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

# NOTE(review): parsetree and sentence are not defined in this excerpt.
tree = parsetree(sentence)
示例#29
0
#b)
#using regex from nltk.org/book/chp05.html, 4.2
patterns = [
    (r'.*ing$', 'VBG'),  #gerunds
    (r'.*ed$', 'VBD'),  #simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  #modal
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  #plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  #nouns (default)
]
regexp_tagger = RegexpTagger(patterns)

# Brown chain: trigram -> bigram -> unigram -> default
uniB = UnigramTagger(brownT90, backoff=defaultTB90)
biB = BigramTagger(brownT90, backoff=uniB)
triB = TrigramTagger(brownT90, backoff=biB)

# Chat chain, built the same way. The trigram tagger backs off to the
# bigram tagger; it previously skipped it and went straight to the unigram.
uniC = UnigramTagger(chatT50, backoff=defaultTChat50)
biC = BigramTagger(chatT50, backoff=uniC)
triC = TrigramTagger(chatT50, backoff=biC)

print("Regextag50/50: ", regexp_tagger.evaluate(brownT50))
print("Default: ", defaultTB90.evaluate(brownT50))

# NOTE(review): both bigram taggers below are evaluated on the same data
# they are trained on - confirm that this is intended.
print("Bigram Brown 50/50: ",
      BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50))
print("Default: ", defaultTB50.evaluate(brownT50))

print("Bigram Brown 90/10: ",
      BigramTagger(brownT90, backoff=defaultTB90).evaluate(brownT90))
print("Default: ", defaultTB90.evaluate(brownT90))
示例#30
0
    Template(Word([-1]), Word([1])),
]

#entrenamos el unigramtagger
from nltk import UnigramTagger
from nltk import BigramTagger
from nltk import TrigramTagger

# Build the unigram -> bigram -> trigram chain on top of the affix tagger.
print("unigram tagger")
unigram_tagger = UnigramTagger(wiki_train, backoff=affix_tagger)

print("bigram  tagger")
bigram_tagger = BigramTagger(wiki_train, backoff=unigram_tagger)

print("trigram tagger")
trigram_tagger = TrigramTagger(wiki_train, backoff=bigram_tagger)

# create the trainer for the Brill tagger
trainer = BrillTaggerTrainer(trigram_tagger, templates)

# train the Brill tagger
wiki_tagger = trainer.train(wiki_train, max_rules=200)

# save the Brill tagger to a file; the context manager closes it even if
# pickling fails
from pickle import dump
with open('wiki_tagger_u.pkl', 'wb') as output:
    dump(wiki_tagger, output, -1)

# evaluate the Brill tagger on the test corpus
wiki_eval = wiki_tagger.evaluate(wiki_test)
示例#31
0
import pickle
import nltk
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
import nltk.tag.sequential

# Aggregate the tagged sentences of the three corpora.
train_sents = nltk.corpus.brown.tagged_sents()
train_sents += nltk.corpus.conll2000.tagged_sents()
train_sents += nltk.corpus.treebank.tagged_sents()

# Trigram tagger backing off to bigram, unigram and 'NN' default taggers.
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train_sents, backoff=t0)
t2 = BigramTagger(train_sents, backoff=t1)
trigram_tagger = TrigramTagger(train_sents, backoff=t2)

# NOTE(review): the file name says "naivebayes" but the object stored is the
# trigram tagger; the name is kept in case other code loads this path.
# The context manager closes the file even if pickling fails.
with open("naivebayes.pickle", "wb") as save_classifier:
    pickle.dump(trigram_tagger, save_classifier)
示例#32
0
def train_brill_tagger(train_data):
    """Train a Brill tagger on top of a regexp/unigram/bigram/trigram chain.

    Args:
        train_data: training data passed to the n-gram taggers and to the
            Brill trainer.

    Returns:
        The tagger returned by BrillTaggerTrainer.train (max_rules=10).

    Side effects:
        Prints ``train_data`` and pickles the trigram tagger to 't2.pkl'.
    """
    # The brill tagger module in NLTK.
    from nltk.tag.brill_trainer import BrillTaggerTrainer
    from nltk import BigramTagger,UnigramTagger,TrigramTagger
    import nltk
    from pickle import dump
    templates=nltk.tag.brill.fntbl37()
    #Regular expression (Regex) Tagger as a default tagger
    default_tagger = nltk.RegexpTagger(
        [(r'^[Jj]ing', 'ABN'),
         (r'^[pP]yn', 'CAV'),
         (r'^[nN]ga$', '1PSG'),
         (r'^[pP]hi$', '2PG'),
         (r'^[pP]ha$', '2PF'),
         (r'^[mM]e$', '2PM'),
         (r'^[iI]$', '3PSG'),
         (r'^[bB]an$', 'INP'),
         (r'^[Kk]a$', '3PSF'),
         (r'^[uU]$', '3PSM'),
         (r'^[kK]i$', '3PPG'),
         (r'(sha|da|na|hapoh|halor|ha|naduh|shaduh|hapdeng|haduh)$', 'IN'),
         (r'(bad|ruh|namar|hynrei|tangba|katba|katta)$', 'COC'),
         (r'(lada|haba|khnang|ynda)$', 'SUC'),
         (r'(katkum|kat|pat|wat|tang|lang)$', 'AD'),
         (r'(bun|baroh)$', 'QNT'),
         # The decimal point is escaped so that only real decimals match;
         # a bare "." would also accept tokens such as "1a5".
         (r'^-?[0-9]+(\.[0-9]+)?$', 'CN'),
         (r'(dei|long|don)$', 'CO'),
         (r'^[jJ]ong$', 'POP'),
         (r'^[sS]hah$', 'PAV'),
         (r'^[lL]ah$', 'MOD'),
         (r'^[lL]a$', 'VST'),
         (r'(ym|em|khlem|nym|kam)$', 'NEG'),
         (r'^hi$', 'EM'),
         (r'.*lade$', 'RFP'),
         (r'(dang|nang)$', 'VPP'),
         (r'([uU]n|[kK]an|[kK]in|[sS]a|[yY]n|[nN]gin|[pP]hin)$', 'VFT'),
         (r'(.*ngut|.*tylli)$', 'ADJ'),
         (r'^[bB]a$', 'COM'),
         (r'^\W+$', 'SYM'),
         (r'[^a-z\W]a$', 'IN'),
         (r'([vV]ote|[bB]ye|[cC]onstituency|[sS]outh)$', 'FR'),
         (r'.*', 'CMN')

         ])
    t0 = default_tagger
    print(train_data)
    t1 = UnigramTagger(train_data,backoff=t0)
    t2 = BigramTagger(train_data,backoff=t1)
    t3 = TrigramTagger(train_data,backoff=t2)


    trainer = BrillTaggerTrainer(initial_tagger=t3,
                                   templates=templates, trace=3,
                                   deterministic=True)
    brill_tagger = trainer.train(train_data,max_rules=10)

    # Saving the Tagger for future use
    # NOTE(review): this pickles the trigram tagger t3 (not brill_tagger)
    # into a file named 't2.pkl' - confirm which tagger should be saved.
    with open('t2.pkl', 'wb') as output:
        dump(t3, output, -1)
    return brill_tagger
示例#33
0
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import treebank
from nltk.tag import hmm
from nltk import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
from nltk.corpus import brown
# Tagged sentences from three Brown categories.
brown_a = nltk.corpus.brown.tagged_sents(
    categories=['news', 'editorial', 'reviews'])
# First 500 tagged sentences of the Brown 'news' category.
text = brown.tagged_sents(categories='news')[:500]

# Backoff chain: trigram -> bigram -> unigram -> 'NN' default. The trigram
# tagger backs off to the bigram tagger; it previously skipped it and went
# straight to the unigram tagger.
t0 = DefaultTagger('NN')
t1 = UnigramTagger(text, backoff=t0)
t2 = BigramTagger(text, backoff=t1)
t3 = TrigramTagger(text, backoff=t2)
# default_tagger = nltk.data.load(nltk.tag._POS_TAGGER)

test_sent = brown.sents()[502]
# test_sent = [u'Noting', u'that', u'Plainfield', u'last', u'year', u'had', u'lost', u'the', u'Mack', u'Truck', u'Co.', u'plant', u',', u'he', u'said', u'industry', u'will', u'not', u'come', u'into', u'this', u'state', u'until', u'there', u'is', u'tax', u'reform', u'.']


def ie_preprocess(document):
    print document
    sentences = nltk.sent_tokenize(document)
    # print sentences
    trigram_tagger = nltk.TrigramTagger(brown_a, cutoff=0)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    print "\nDefault tagger"
    x = [t0.tag(sent) for sent in sentences]
    print x
    print "\nUnigram tagger"