Exemplo n.º 1
0
    def __init__(self, **kwargs):
        """Create the Brill trainer wrapped around the tagger given as ``tagger=``.

        Every keyword argument is forwarded to the parent initializer. The
        ``tagger`` entry is also stored as ``self.tagger`` and becomes the
        initial tagger of ``self.brillTrainer``.

        Raises:
            KeyError: if no ``tagger`` keyword argument was supplied.
        """
        super().__init__(**kwargs)
        self.tagger = kwargs["tagger"]

        # Relative positions each single-feature template looks at.
        windows = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )

        rule_templates = []
        # Tag-based templates are created first, then word-based ones, so the
        # creation order matches the explicit 18-entry listing.
        for feature in (brill.Pos, brill.Word):
            for window in windows:
                rule_templates.append(brill.Template(feature(list(window))))
            # Two-feature template: previous position together with the next one.
            rule_templates.append(brill.Template(feature([-1]), feature([1])))

        self.brillTrainer = BrillTaggerTrainer(
            self.tagger, rule_templates, deterministic=True
        )
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that starts from ``initial_tagger``.

    Eighteen rule templates are used: nine built on ``brill.Pos`` followed by
    nine built on ``brill.Word``, both covering the same relative positions.

    Args:
        initial_tagger: tagger given to ``BrillTaggerTrainer`` as the baseline.
        train_sents: training sentences handed to ``trainer.train``.
        **kwargs: forwarded unchanged to ``trainer.train``.

    Returns:
        The result of ``trainer.train``.
    """
    single_contexts = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )

    def build(feature):
        # One template per single context, then the previous-and-next pair.
        made = [brill.Template(feature(list(ctx))) for ctx in single_contexts]
        made.append(brill.Template(feature([-1]), feature([1])))
        return made

    templates = build(brill.Pos) + build(brill.Word)
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True
    )
    return trainer.train(train_sents, **kwargs)
Exemplo n.º 3
0
def train_brill_tagger(initial_tagger, training, **kwargs):
    """Train a Brill tagger, which learns rules that correct another tagger.

    Args:
        initial_tagger: the tagger whose output the learned rules correct.
        training: training sentences passed to ``trainer.train``.
        **kwargs: extra options forwarded unchanged to ``trainer.train``.

    Returns:
        The result of ``trainer.train``.
    """
    offsets = [
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    ]

    templates = []
    for make_feature in (brill.Pos, brill.Word):
        # Single-feature templates for every offset group.
        for group in offsets:
            templates.append(brill.Template(make_feature(list(group))))
        # Template combining the previous and the next position.
        before = make_feature([-1])
        after = make_feature([1])
        templates.append(brill.Template(before, after))

    return brill_trainer.BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
    ).train(training, **kwargs)
Exemplo n.º 4
0
 def get_brill_tagger(self):
     """Train and return a Brill tagger from ``tagged_input_sentences.txt``.

     The tagged sentences are read from the current directory with ``/`` as
     the word/tag separator. The pickled tagger loaded from
     ``taggers/maxent_treebank_pos_tagger/english.pickle`` is the initial
     tagger. Training runs with ``trace=3`` and ``max_rules=10``.
     """
     reader = TaggedCorpusReader('.',
                                 'tagged_input_sentences.txt',
                                 sep="/")
     tagged_sentences = list(reader.tagged_sents())
     base_tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

     position_sets = (
         (-1,), (1,), (-2,), (2,),
         (-2, -1), (1, 2),
         (-3, -2, -1), (1, 2, 3),
     )
     rule_templates = []
     # Pos templates first, then Word templates, each ending with the
     # previous-and-next pair.
     for feature_cls in (brill.Pos, brill.Word):
         rule_templates += [
             brill.Template(feature_cls(list(positions)))
             for positions in position_sets
         ]
         rule_templates.append(
             brill.Template(feature_cls([-1]), feature_cls([1])))

     trainer = BrillTaggerTrainer(base_tagger, templates=rule_templates, trace=3)
     return trainer.train(tagged_sentences, max_rules=10)
Exemplo n.º 5
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger on ``train_sents`` starting from ``initial_tagger``.

    The trainer is created with ``deterministic=True`` and ``trace=True``.
    Any ``**kwargs`` are forwarded unchanged to ``trainer.train``, whose result
    is returned.
    """
    neighbourhoods = (
        [-1], [1], [-2], [2],
        [-2, -1], [1, 2],
        [-3, -2, -1], [1, 2, 3],
    )

    def templates_for(feature):
        # Single-feature templates in listing order, then the -1/+1 pair.
        for positions in neighbourhoods:
            yield brill.Template(feature(list(positions)))
        yield brill.Template(feature([-1]), feature([1]))

    # list() drains each generator completely, so all Pos templates are
    # created before any Word template.
    templates = list(templates_for(brill.Pos)) + list(templates_for(brill.Word))

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=True
    )
    return trainer.train(train_sents, **kwargs)
    def train(self, data):
        """Train an HMM baseline on ``data``, then a Brill tagger on top of it.

        Sets three attributes:
        ``self.baseline_tagger`` is the ``tagger`` attribute of a freshly
        trained ``HMMTagger``; ``self.trainer`` is the ``BrillTaggerTrainer``
        built around that baseline; ``self.tagger`` is the result of
        ``self.trainer.train(data)``.
        """
        # Baseline tagger: an HMM tagger trained on the same data.
        baseline = HMMTagger()
        baseline.train(data)
        self.baseline_tagger = baseline.tagger

        # Rule templates for the Brill trainer: Pos-based first, then Word-based.
        spans = (
            (-1,), (1,), (-2,), (2,),
            (-2, -1), (1, 2),
            (-3, -2, -1), (1, 2, 3),
        )
        rule_templates = []
        for feature in (brill.Pos, brill.Word):
            rule_templates.extend(
                [brill.Template(feature(list(span))) for span in spans]
            )
            rule_templates.append(brill.Template(feature([-1]), feature([1])))

        self.trainer = BrillTaggerTrainer(
            self.baseline_tagger, templates=rule_templates
        )
        self.tagger = self.trainer.train(data)
Exemplo n.º 7
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Train a Brill tagger that corrects the output of ``initial_tagger``.

    The templates below let the trainer propose rules that change the POS tag
    of a word depending on:
        - the POS of the previous word
        - the POS of any of the two previous words
        - the POS of any of the three previous words
        - the POS of the previous word and the POS of the next word
        - the previous word
        - any of the two previous words
        - any of the three previous words
        - the previous word and the next word
    (and the mirrored variants looking at following positions).

    Args:
        initial_tagger: the baseline tagger whose mistakes the rules correct.
        train_sents: training sentences passed to ``trainer.train``.
        **kwargs: options forwarded to ``trainer.train``. ``max_rules``
            defaults to 100 and ``min_score`` to 2 when not supplied.

    Returns:
        The result of ``trainer.train``.
    """
    # Each Template describes which context a learned rule may condition on.
    templates = [
        brill.Template(brill.Pos([-1])),  # POS tag of the previous word
        brill.Template(brill.Pos([1])),  # POS tag of the next word
        brill.Template(brill.Pos([-2])),  # POS tag two positions back
        brill.Template(brill.Pos([2])),  # POS tag two positions ahead
        brill.Template(brill.Pos([-2, -1])),  # POS tag of any of the two previous words
        brill.Template(brill.Pos([1, 2])),
        brill.Template(brill.Pos([-3, -2, -1])),
        brill.Template(brill.Pos([1, 2, 3])),
        brill.Template(brill.Pos([-1]), brill.Pos([1])),  # previous AND next POS tag
        brill.Template(brill.Word([-1])),
        brill.Template(brill.Word([1])),
        brill.Template(brill.Word([-2])),
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]

    # BrillTaggerTrainer(initial_tagger, templates, trace, deterministic, ruleformat)
    #   initial_tagger: the baseline tagger
    #   templates:      list of templates used in training
    #   trace:          verbosity level
    #   deterministic:  if True, adjudicate ties deterministically
    #   ruleformat:     format of reported rules
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger, templates, deterministic=True, trace=True
    )

    # Honour caller-supplied training options instead of silently dropping
    # them; the previous hard-coded values remain the defaults.
    kwargs.setdefault("max_rules", 100)
    kwargs.setdefault("min_score", 2)
    return trainer.train(train_sents, **kwargs)
Exemplo n.º 8
0
 def test_pos_template(self):
     """Check Brill tagging when the only template is ``Pos([-1])``.

     A unigram tagger trained on the first 1000 treebank sentences is the
     baseline. The tokens ``foo`` and ``sentence`` are expected to come back
     with tag ``None``.
     """
     corpus = treebank.tagged_sents()[:1000]
     baseline = UnigramTagger(corpus)
     previous_tag_only = [brill.Template(brill.Pos([-1]))]
     learned = brill_trainer.BrillTaggerTrainer(
         baseline, previous_tag_only).train(corpus)
     # Example from https://github.com/nltk/nltk/issues/769
     tokens = 'This is a foo bar sentence'.split()
     actual = learned.tag(tokens)
     wanted_tags = ['DT', 'VBZ', 'DT', None, 'NN', None]
     self.assertEqual(actual, list(zip(tokens, wanted_tags)))
Exemplo n.º 9
0
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Build a deterministic Brill trainer and train it on ``train_sents``.

    Args:
        initial_tagger: baseline tagger given to ``BrillTaggerTrainer``.
        train_sents: training sentences passed to ``trainer.train``.
        **kwargs: forwarded unchanged to ``trainer.train``.

    Returns:
        The result of ``trainer.train``.
    """
    # Position groups shared by the tag-based and the word-based templates.
    # For example, (-1,) lets a rule use the previous tag (Pos) or word (Word).
    position_groups = (
        (-1,), (1,), (-2,), (2,),
        (-2, -1), (1, 2),
        (-3, -2, -1), (1, 2, 3),
    )

    templates = []
    for kind in (brill.Pos, brill.Word):
        singles = [brill.Template(kind(list(group))) for group in position_groups]
        both_sides = brill.Template(kind([-1]), kind([1]))
        templates.extend(singles)
        templates.append(both_sides)

    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger,
        templates,
        deterministic=True,
    )
    return trainer.train(train_sents, **kwargs)
Exemplo n.º 10
0
 def test_pos_template(self):
     """Tag a sample sentence with a Brill tagger using one ``Pos([-1])`` template.

     The baseline is a unigram tagger trained on the first 1000 treebank
     sentences, which are also the Brill training data.
     """
     sample = treebank.tagged_sents()[:1000]
     unigram = UnigramTagger(sample)
     only_template = brill.Template(brill.Pos([-1]))
     trainer = brill_trainer.BrillTaggerTrainer(unigram, [only_template])
     trained = trainer.train(sample)
     # Example from https://github.com/nltk/nltk/issues/769
     words = "This is a foo bar sentence".split()
     result = trained.tag(words)
     # "foo" and "sentence" are expected to come back with tag None.
     expected_tags = ("DT", "VBZ", "DT", None, "NN", None)
     expected = [(word, tag) for word, tag in zip(words, expected_tags)]
     self.assertEqual(result, expected)
                (r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*ness$', 'NN'),
                (r'.*ment$', 'NN'), (r'.*ful$', 'JJ'), (r'.*ious$', 'JJ'),
                (r'.*ble+$', 'JJ'), (r'.*ic$', 'JJ'), (r'.*ive$', 'JJ'),
                (r'.*ic$', 'JJ'), (r'.*est$', 'JJ'), (r'^a$', 'PREP'),
                (r'.*', 'NN')]

    regexp_tagger = nltk.RegexpTagger(patterns)
    affix_tagger = nltk.AffixTagger(tagged_posts, backoff=regexp_tagger)
    unigram_tagger = nltk.UnigramTagger(tagged_posts, backoff=affix_tagger)
    bigram_tagger = nltk.BigramTagger(tagged_posts, backoff=unigram_tagger)
    tbuar_tagger = nltk.TrigramTagger(tagged_posts, backoff=bigram_tagger)
    return tbuar_tagger


templates = [
    brill.Template(brill.Pos([-1])),
    brill.Template(brill.Pos([1])),
    brill.Template(brill.Pos([-2])),
    brill.Template(brill.Pos([2])),
    brill.Template(brill.Pos([-2, -1])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([-3, -2, -1])),
    brill.Template(brill.Pos([1, 2, 3])),
    brill.Template(brill.Pos([-1]), brill.Pos([1])),
    brill.Template(brill.Word([-1])),
    brill.Template(brill.Word([1])),
    brill.Template(brill.Word([-2])),
    brill.Template(brill.Word([2])),
    brill.Template(brill.Word([-2, -1])),
    brill.Template(brill.Word([1, 2])),
    brill.Template(brill.Word([-3, -2, -1])),
Exemplo n.º 12
0
# Evaluation and Brill-training section of the script.
# NOTE: each print call takes a single parenthesised argument, which prints the
# same text under Python 2 and is also valid syntax under Python 3.
# `evaulation_data` (sic) is defined outside this excerpt; the name is kept.
print("Unigram accuracy: ")
print(unigram_tagger.evaluate(evaulation_data))

# Bigram tagger, backing off to the unigram tagger
bigram_tagger = BigramTagger(training_data, backoff=unigram_tagger)
print("Bigram accuracy: ")
print(bigram_tagger.evaluate(evaulation_data))

# Trigram tagger, backing off to the bigram tagger
trigram_tagger = TrigramTagger(training_data, backoff=bigram_tagger)
print("Trigram accuracy: ")
print(trigram_tagger.evaluate(evaulation_data))

# Brill tagger templates
# NOTE(review): these lists look like (start, end) ranges carried over from an
# older API. Presumably Pos([1, 3]) now means positions 1 and 3 only, so a
# 1..3 window would need Pos([1, 2, 3]) - confirm the intent.
templates = [
    Template(brill.Pos([1, 1])),
    Template(brill.Pos([2, 2])),
    Template(brill.Pos([1, 2])),
    Template(brill.Pos([1, 3])),
    Template(brill.Word([1, 1])),
    Template(brill.Word([2, 2])),
    Template(brill.Word([1, 2])),
    Template(brill.Word([1, 3])),
    Template(brill.Pos([-1, -1]), brill.Pos([1, 1])),
    Template(brill.Word([-1, -1]), brill.Word([1, 1])),
]

# First iteration: the trigram tagger is the Brill trainer's initial tagger.
trainer = brill_trainer.BrillTaggerTrainer(trigram_tagger, templates)
brill_tagger = trainer.train(training_data,
                             max_rules=max_rules,
                             min_score=min_score)
print("Initial Brill accuracy:")
        word = tokens[index]
        return nltk.pos_tag([word])[0][1] if word != "" else None


# Module-level POSTagger instance; passed further down as the initial tagger
# of the Brill trainer.
custom_pos_tagger = POSTagger()

# In[93]:

import nltk
import nltk.tag
from nltk.tag import brill
from nltk.tag import UnigramTagger
from nltk.tag import BrillTaggerTrainer

# Rule templates for the Brill trainer.
# Each distinct template is listed once. The earlier version repeated the four
# Pos templates and the Word([-1, -1]) template verbatim, which only made the
# trainer generate the same candidate rules twice.
# NOTE(review): the repeated entries were presumably meant to be the Word
# counterparts of the Pos templates and a previous/next pair - confirm the
# intended template set before extending this list.
templates = [
    brill.Template(brill.Pos([1, 1])),
    brill.Template(brill.Pos([2, 2])),
    brill.Template(brill.Pos([1, 2])),
    brill.Template(brill.Pos([1, 3])),
    brill.Template(brill.Word([-1, -1])),
]

# Trainer that starts from the custom POS tagger. trace=3 requests verbose
# training output; deterministic=True makes tie-breaking between equally
# scored rules reproducible.
trainer_initial_pos = BrillTaggerTrainer(
    initial_tagger=custom_pos_tagger,
    templates=templates,
    trace=3,
    deterministic=True,
)