def create_tagger(train_sents):
    """Build a CombinedTagger, tag a sample sentence and print its accuracy.

    The tagger is populated by calling ``unmarshal("tresoldi")`` rather than
    by training on ``train_sents``; the training call is kept below only as
    a commented-out alternative.

    @param train_sents: tagged data used as the gold standard for the
        accuracy report (it is wrapped in a one-element list before being
        handed to ``tag.accuracy``).
    @return: None; the tagged sample and the accuracy are printed.
    """
    tagger = CombinedTagger()
    # Alternative to unmarshalling: tagger.example_train(train_sents, True)
    tagger.unmarshal("tresoldi")

    # Smoke test on a short Portuguese sentence.
    sample_words = "Mauro viu o livro sobre a mesa".split()
    tagged_sample = list(tagger.tag(sample_words))
    print(tagged_sample)

    # Accuracy is reported as a percentage with two decimals.
    score = tag.accuracy(tagger, [train_sents])
    percent = 100 * score
    print('Accuracy = %4.2f%%' % percent)
def create_tagger(train_sents):
    """Build a CombinedTagger, tag a sample sentence and print its accuracy.

    The tagger is populated by calling ``unmarshal("tresoldi")`` rather than
    by training on ``train_sents``; the training call is kept below only as
    a commented-out alternative.

    @param train_sents: tagged data used as the gold standard for the
        accuracy report (it is wrapped in a one-element list before being
        handed to ``tag.accuracy``).
    @return: None; the tagged sample and the accuracy are printed.
    """
    ct = CombinedTagger()
    # ct.example_train(train_sents, True)
    ct.unmarshal("tresoldi")

    tokens = "Mauro viu o livro sobre a mesa".split()
    # Single-argument print(...) calls behave the same as a print statement
    # on Python 2 and are valid syntax on Python 3.
    print(list(ct.tag(tokens)))

    # tests
    acc = tag.accuracy(ct, [train_sents])
    print('Accuracy = %4.2f%%' % (100 * acc))
def _demo_tagger(tagger, gold):
    """Print the accuracy of ``tagger`` against ``gold`` as a percentage.

    @param tagger: the tagger to evaluate; passed straight to ``accuracy``.
    @param gold: the gold-standard data; passed straight to ``accuracy``.
    @return: None; the result is printed with one decimal place.
    """
    # Imported lazily so the dependency is only needed when the demo runs.
    from en.parser.nltk_lite.tag import accuracy

    percent = 100.0 * accuracy(tagger, gold)
    print('Accuracy = %4.1f%%' % percent)
def demo(num_sents=100, max_rules=200, min_score=2, error_output="errors.out",
         rule_output="rules.out", randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration

    Trains a unigram tagger (backing off to a regexp tagger) on part of the
    treebank data, trains a Brill tagger on top of it, prints both
    accuracies and the learned rules, and writes the rules and the tagging
    errors to files.

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}

    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}

    @param min_score: the minimum score for a rule in order for it to be
        considered
    @type min_score: L{int}

    @param error_output: the file where errors will be saved
    @type error_output: L{string}

    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}

    @param randomize: whether the sentences should be shuffled before the
        first C{num_sents} are taken (the shuffle is seeded with the corpus
        size, so it is repeatable)
    @type randomize: L{boolean}

    @param train: the fraction of the corpus to be used for training (1=all)
    @type train: L{float}

    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    import sys

    from en.parser.nltk_lite.corpora import treebank
    from en.parser.nltk_lite import tag
    from en.parser.nltk_lite.tag import brill

    # Last-resort backoff: number-like tokens get 'CD', everything else 'NN'.
    # NOTE(review): the '.' in the first pattern is unescaped, so any single
    # separator character is accepted (e.g. "1,000" as well as "1.5");
    # left unchanged on purpose.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the selected sentences into one token stream and split it.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]
    # Presumably each token is a (word, tag) pair, so t[0] is the bare word.
    testing_data = [t[0] for t in gold_data]

    # Unigram tagger
    # Written without a newline so the accuracy lands on the same line; this
    # replaces the Python 2 only "print ...," trailing-comma form.
    sys.stdout.write("Training unigram tagger: ")
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]

    #trainer = brill.FastBrillTrainer(u, templates, trace)
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    # Blank separator line; a bare "print" would be a no-op on Python 3.
    print("")
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # The context manager guarantees the rules file is closed (the original
    # never closed it) even if writing fails part-way through.
    with open(rule_output, 'w') as printRules:
        for rule in b.rules():
            print(str(rule))
            printRules.write(str(rule) + "\n\n")

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    with open(error_output, 'w') as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")

    print("Done; rules and errors saved to %s and %s." %
          (rule_output, error_output))
def demo(num_sents=100, max_rules=200, min_score=2, error_output="errors.out",
         rule_output="rules.out", randomize=False, train=.8, trace=3):
    """
    Brill Tagger Demonstration

    Trains a unigram tagger (backing off to a regexp tagger) on part of the
    treebank data, trains a Brill tagger on top of it, prints both
    accuracies and the learned rules, and writes the rules and the tagging
    errors to files.

    @param num_sents: how many sentences of training and testing data to use
    @type num_sents: L{int}

    @param max_rules: maximum number of rule instances to create
    @type max_rules: L{int}

    @param min_score: the minimum score for a rule in order for it to be
        considered
    @type min_score: L{int}

    @param error_output: the file where errors will be saved
    @type error_output: L{string}

    @param rule_output: the file where rules will be saved
    @type rule_output: L{string}

    @param randomize: whether the sentences should be shuffled before the
        first C{num_sents} are taken (the shuffle is seeded with the corpus
        size, so it is repeatable)
    @type randomize: L{boolean}

    @param train: the fraction of the corpus to be used for training (1=all)
    @type train: L{float}

    @param trace: the level of diagnostic tracing output to produce (0-3)
    @type trace: L{int}
    """
    import sys

    from en.parser.nltk_lite.corpora import treebank
    from en.parser.nltk_lite import tag
    from en.parser.nltk_lite.tag import brill

    # Last-resort backoff: number-like tokens get 'CD', everything else 'NN'.
    # NOTE(review): the '.' in the first pattern is unescaped, so any single
    # separator character is accepted (e.g. "1,000" as well as "1.5");
    # left unchanged on purpose.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    # train is the proportion of data used in training; the rest is reserved
    # for testing.
    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the selected sentences into one token stream and split it.
    tagged_data = [t for s in sents[:num_sents] for t in s]
    cutoff = int(len(tagged_data) * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]
    # Presumably each token is a (word, tag) pair, so t[0] is the bare word.
    testing_data = [t[0] for t in gold_data]

    # Unigram tagger
    # Written without a newline so the accuracy lands on the same line; this
    # replaces the Python 2 only "print ...," trailing-comma form.
    sys.stdout.write("Training unigram tagger: ")
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger
    templates = [
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
        brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
        brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
        brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1)),
    ]

    #trainer = brill.FastBrillTrainer(u, templates, trace)
    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    # Blank separator line; a bare "print" would be a no-op on Python 3.
    print("")
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # The context manager guarantees the rules file is closed (the original
    # never closed it) even if writing fails part-way through.
    with open(rule_output, 'w') as printRules:
        for rule in b.rules():
            print(str(rule))
            printRules.write(str(rule) + "\n\n")

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    with open(error_output, 'w') as errorFile:
        for e in el:
            errorFile.write(e + "\n\n")

    print("Done; rules and errors saved to %s and %s." %
          (rule_output, error_output))
def _demo_tagger(tagger, gold):
    """Print the accuracy of ``tagger`` against ``gold`` as a percentage.

    @param tagger: the tagger to evaluate; passed straight to ``accuracy``.
    @param gold: the gold-standard data; passed straight to ``accuracy``.
    @return: None; the result is printed with one decimal place.
    """
    from en.parser.nltk_lite.tag import accuracy

    acc = accuracy(tagger, gold)
    # A single-argument print(...) call gives the same output as the print
    # statement on Python 2 and is valid syntax on Python 3.
    print('Accuracy = %4.1f%%' % (100.0 * acc))