def get_tagger(num_sents=1000, train=0.8):
    """Train and return a Brill tagger over the Brown tagged sentences.

    The initial tagger is a bigram tagger that backs off to a regexp
    tagger (module-level ``patterns``) and finally to a default 'NN'
    tagger.

    :param num_sents: number of tagged sentences drawn from the corpus
        (default 1000, matching the previously hard-coded size)
    :param train: fraction of those sentences used for training
    :return: the trained Brill tagger
    """
    d_tagger = nltk.DefaultTagger('NN')
    re_tagger = nltk.RegexpTagger(patterns, backoff=d_tagger)

    print("Loading tagged data... ")
    tagged_data = brown_tagged_sents
    # train is the proportion of data used in training; the remainder of
    # the num_sents slice was previously set aside for testing but never
    # used, so the dead gold/testing locals have been removed.
    cutoff = int(num_sents * train)
    training_data = tagged_data[:cutoff]
    print("Done loading.")

    bigram_tagger = tag.BigramTagger(training_data, backoff=re_tagger)

    # Classic NLTK 2.x Brill template set; order is preserved because the
    # trainer's rule selection can depend on template order.
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]
    # Trailing 0 is the trace level (silent training).
    trainer = brill.FastBrillTaggerTrainer(bigram_tagger, templates, 0)
    brill_tagger = trainer.train(training_data, max_rules=100, min_score=3)
    return brill_tagger
def train(self, sentence_list):
    """Trains the tagger from the tagged sentences provided.

    Builds an affix -> unigram -> bigram -> trigram backoff chain
    (rooted at a default 'NN' tagger) and refines it with Brill
    transformation rules; the result is stored on ``self.tagger``.
    """
    backoff = DefaultTagger('NN')
    for tagger_class in (AffixTagger, UnigramTagger, BigramTagger, TrigramTagger):
        backoff = tagger_class(sentence_list, backoff=backoff)

    # Standard NLTK 2.x template set; built in the same order as the
    # usual hand-written list (tag templates, word templates, then the
    # two asymmetric ones).
    sym_bounds = ((1, 1), (2, 2), (1, 2), (1, 3))
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, bounds)
        for bounds in sym_bounds
    ]
    templates += [
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, bounds)
        for bounds in sym_bounds
    ]
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)))
    templates.append(
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)))

    trainer = brill.FastBrillTaggerTrainer(backoff, templates)
    self.tagger = trainer.train(sentence_list, max_rules=100, min_score=3)
def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    """Wrap *initial_tagger* in a deterministically-trained Brill tagger.

    Any extra keyword arguments (e.g. ``max_rules``, ``min_score``) are
    forwarded unchanged to ``FastBrillTaggerTrainer.train``.
    """
    rule_classes = (brill.ProximateTagsRule, brill.ProximateWordsRule)
    # Symmetric templates first (tags, then words), then the asymmetric
    # (-1,-1)/(1,1) pair — same order as the explicit literal list.
    templates = [
        brill.SymmetricProximateTokensTemplate(cls, (1, 1), (2, 2), (1, 2), (1, 3))
        for cls in rule_classes
    ]
    templates += [
        brill.ProximateTokensTemplate(cls, (-1, -1), (1, 1))
        for cls in rule_classes
    ]
    trainer = brill.FastBrillTaggerTrainer(initial_tagger, templates,
                                           deterministic=True)
    return trainer.train(train_sents, **kwargs)
def get_brill(train_sents=None):
    """Train and return a "braubt" tagger: a regexp/affix/unigram/bigram/
    trigram backoff chain refined by Brill transformation rules.

    :param train_sents: tagged training sentences; defaults to the
        concatenation of the conll2000 and brown tagged corpora.
    """
    import nltk
    import nltk.tag
    from nltk.tag import brill

    if train_sents is None:
        train_sents = (nltk.corpus.conll2000.tagged_sents()
                       + nltk.corpus.brown.tagged_sents())

    sequential_classes = [
        nltk.tag.AffixTagger,
        nltk.tag.UnigramTagger,
        nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger,
    ]
    raubt_tagger = backoff_tagger(train_sents, sequential_classes,
                                  backoff=nltk.tag.RegexpTagger(word_patterns))

    # Build the standard template set in the canonical order:
    # symmetric tag templates, symmetric word templates, asymmetric pair.
    templates = []
    for rule_class in (brill.ProximateTagsRule, brill.ProximateWordsRule):
        for bounds in ((1, 1), (2, 2), (1, 2), (1, 3)):
            templates.append(
                brill.SymmetricProximateTokensTemplate(rule_class, bounds))
    templates.append(brill.ProximateTokensTemplate(
        brill.ProximateTagsRule, (-1, -1), (1, 1)))
    templates.append(brill.ProximateTokensTemplate(
        brill.ProximateWordsRule, (-1, -1), (1, 1)))

    trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
    return trainer.train(train_sents, max_rules=100, min_score=3)
def __init__(self, msr_data=True, msr_data_location=params.msr_dataset):
    """Build a Bangla Brill tagger from the NLTK Indian corpus and,
    optionally, an MSR XML dataset.

    :param msr_data: if true, also read ``<sentence>`` elements from the
        XML files under *msr_data_location*
    :param msr_data_location: directory containing the MSR XML files

    Sentences that the UnigramTagger cannot ingest (malformed
    (word, tag) pairs raise ValueError) are filtered out; the trained
    tagger is stored on ``self.tagger``.
    """
    tagged_sents = nltk.corpus.indian.tagged_sents(fileids="bangla.pos")
    filtered_sents1 = []
    for sent in tagged_sents:
        # Probe each sentence individually so one bad sentence does not
        # discard the rest.
        try:
            nltk.tag.UnigramTagger([sent])
            filtered_sents1.append(sent)
        except ValueError:
            pass

    filtered_sents2 = []
    if msr_data:
        cwdir = os.getcwd()
        listing = os.listdir(msr_data_location)
        os.chdir(msr_data_location)
        try:
            for file_name in listing:
                xml_tree = parse(file_name)
                for node in xml_tree.getElementsByTagName('sentence'):
                    # Per-sentence try: previously one malformed sentence
                    # aborted every remaining sentence in the file.
                    try:
                        # Tokens are encoded as word\tag; keep only the
                        # coarse tag before the first ".".
                        pairs = node.childNodes[0].nodeValue.split()
                        pairs = [token.split("\\") for token in pairs]
                        # Unpacking (w, t) replaces the Python-2-only
                        # tuple-parameter lambda; a wrong-length token
                        # still raises ValueError and is skipped.
                        pairs = [(w.encode("utf-8"), t.encode().split(".")[0])
                                 for w, t in pairs]
                        nltk.tag.UnigramTagger([pairs])
                        filtered_sents2.append(pairs)
                    except ValueError:
                        pass
        finally:
            # Restore the working directory even if parsing raises.
            os.chdir(cwdir)

    total_set = filtered_sents1 + filtered_sents2
    aubt_tagger = [
        nltk.tag.AffixTagger,
        nltk.tag.UnigramTagger,
        nltk.tag.BigramTagger,
        nltk.tag.TrigramTagger,
    ]
    brill_templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]
    btrainer = nltk.tag.brill.FastBrillTaggerTrainer(
        self.backoff_tagger(total_set, aubt_tagger), brill_templates)
    self.tagger = btrainer.train(total_set, max_rules=300, min_score=3)
from nltk.tag import brill
from nltk.corpus import TaggedCorpusReader
from .Extras import carrega
from .AnotaCorpus import abre_etiquetador
from .ConstroiRUBT import EXEMPLO, SENTENCA

# Brill transformation templates shared by the training routines below:
# symmetric tag templates, symmetric word templates, then one asymmetric
# (-1,-1)/(1,1) template for tags and one for words.
TEMPLATES = [
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
    brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
    brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
    brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1))
]

# Default initial tagger, loaded from a pickle via carrega().
INICIAL = carrega("AeliusRUBT.pkl")


def treina(expressao_regular, etiquetador=INICIAL, destino="BRUBT.pkl",
           raiz=".", codificacao="utf-8", max_rules=100, min_score=3):
    """Train a Brill tagger from a tagged corpus matched by
    *expressao_regular* under root directory *raiz*, starting from
    *etiquetador* (a pickled initial tagger).

    NOTE(review): the body visible here only loads the tagger and the
    corpus; the function appears truncated at this chunk boundary — the
    training/saving steps (presumably using TEMPLATES, max_rules,
    min_score and destino) are not shown.
    """
    # Unpickle/resolve the initial tagger.
    inicial = abre_etiquetador(etiquetador)
    # Read the tagged training corpus with the given encoding.
    corpus = TaggedCorpusReader(raiz, expressao_regular, encoding=codificacao)
def demo(num_sents=100, max_rules=200, min_score=2,
         error_output="errors.out", rule_output="rules.out",
         randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration
    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}
    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}
    @param min_score: the minimum score for a rule in order for it to be considered
    @type min_score: L{int}
    @param error_output: the file where errors will be saved
    @type error_output: L{string}
    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}
    @param randomize: whether the training data should be a random subset of the corpus
    @type randomize: L{boolean}
    @param train: the fraction of the the corpus to be used for training (1=all)
    @type train: L{float}
    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    from nltk.corpus import treebank
    from nltk import tag
    from nltk.tag import brill

    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
                               (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is
    # reserved for testing.
    print("Loading tagged data...")
    sents = []
    for item in treebank.items:
        sents.extend(treebank.tagged(item))
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)
    # Flatten the first num_sents sentences into one token stream.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]
    testing_data = [t[0] for t in gold_data]

    # Unigram tagger
    print("Training unigram tagger:")
    u = tag.Unigram(backoff=NN_CD_tagger)
    # NB training and testing are required to use a list-of-lists
    # structure, so we wrap the flattened corpus data with the extra
    # list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]
    # brill.FastBrillTrainer is the faster alternative trainer here.
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    print()
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # open() + with replaces the Python-2-only file() builtin and fixes
    # the handle that was never closed.
    with open(rule_output, 'w') as rule_file:
        for rule in b.rules():
            print(str(rule))
            rule_file.write(str(rule) + "\n\n")

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    with open(error_output, 'w') as error_file:
        for e in el:
            error_file.write(e + "\n\n")

    print("Done; rules and errors saved to %s and %s." % (rule_output,
                                                          error_output))