def demo2():
    """Compare TnT tagging with capitalisation handling off and on.

    Two ``tnt.Tnt`` taggers (``C=False`` and ``C=True``) are trained on the
    treebank sentences from index 1100 onwards.  Each of the first ten
    100-sentence slices is then tagged by both, and for every slice and
    tagger the following are printed: the accuracy, the fraction of known
    and unknown words (from the tagger's ``known``/``unknown`` counters),
    and the accuracy divided by the known fraction.
    """
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    import tnt

    sentences = list(treebank.tagged())
    plain = tnt.Tnt(N=1000, C=False)
    cased = tnt.Tnt(N=1000, C=True)

    train_from = 11 * 100
    plain.train(sentences[train_from:])
    cased.train(sentences[train_from:])

    runs = (('Capitalisation off:', plain), ('Capitalisation on:', cased))
    for fold in range(10):
        lo = fold * 100
        hi = (fold + 1) * 100
        for heading, tagger in runs:
            acc = tag.accuracy(tagger, sentences[lo:hi])
            seen = float(tagger.known + tagger.unknown)
            frac_unknown = float(tagger.unknown) / seen
            frac_known = float(tagger.known) / seen
            # Clear the counters so the next slice is measured on its own.
            tagger.unknown = 0
            tagger.known = 0

            print(heading)
            summary = (
                ('Accuracy:', acc),
                ('Percentage known:', frac_known),
                ('Percentage unknown:', frac_unknown),
            )
            for caption, value in summary:
                print('%s %s' % (caption, value))
            print('%s %s' % ('Accuracy over known words:', acc / frac_known))
def demo2():
    """Report TnT accuracy on treebank with and without capitalisation.

    Trains one ``tnt.Tnt`` tagger with ``C=False`` and one with ``C=True``
    on treebank sentences 1100 onwards, then evaluates both on sentences
    0-99, 100-199, ..., 900-999.  Per slice and per tagger it prints the
    accuracy, the known/unknown word fractions and the accuracy divided by
    the known fraction.
    """
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    import tnt

    corpus = list(treebank.tagged())
    caps_off = tnt.Tnt(N=1000, C=False)
    caps_on = tnt.Tnt(N=1000, C=True)
    caps_off.train(corpus[(11) * 100:])
    caps_on.train(corpus[(11) * 100:])

    def report(title, tagger, test_sents):
        # Score one tagger on one slice, then zero its word counters so
        # they only ever describe a single slice.
        score = tag.accuracy(tagger, test_sents)
        unknown_share = float(tagger.unknown) / float(tagger.known + tagger.unknown)
        known_share = float(tagger.known) / float(tagger.known + tagger.unknown)
        tagger.unknown = 0
        tagger.known = 0
        print(title)
        print('%s %s' % ('Accuracy:', score))
        print('%s %s' % ('Percentage known:', known_share))
        print('%s %s' % ('Percentage unknown:', unknown_share))
        print('%s %s' % ('Accuracy over known words:', (score / known_share)))

    for block in range(10):
        start = block * 100
        stop = (block + 1) * 100
        report('Capitalisation off:', caps_off, corpus[start:stop])
        report('Capitalisation on:', caps_on, corpus[start:stop])
def demo3():
    """Ten-fold cross-validate the TnT tagger on treebank and on brown.

    The first 1000 tagged sentences of each corpus are split into ten
    equal folds.  For every fold a fresh ``tnt.Tnt(N=1000, C=False)`` is
    trained on the other nine folds and scored on the held-out one.  Three
    quantities are accumulated per corpus: accuracy divided by the fraction
    of known words, overall accuracy, and the fraction of known words.

    Each printed figure is ``10 * sum``; with ten folds that equals the
    mean over the folds expressed as a percentage.

    Fixes relative to the earlier version:
      * the brown "accuracy over known words" was divided by the treebank
        known-word fraction (``tp_kn``) instead of its own (``sp_kn``);
      * the summary printed treebank results under the "brown:" label and
        vice versa.
    """
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    from nltk_lite.corpora import brown
    import tnt

    d = list(treebank.tagged())
    e = list(brown.tagged())
    d = d[:1000]
    e = e[:1000]

    # Fold sizes (one tenth of each corpus sample).
    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    # t* accumulators belong to the treebank tagger, s* to the brown one.
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = tnt.Tnt(N=1000, C=False)
        s = tnt.Tnt(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]
        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn

        sacc = tag.accuracy(s, etest)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn

        tknacc += (tacc / tp_kn)
        # Each tagger's accuracy is normalised by its OWN known fraction.
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

    # s was trained/tested on brown, t on treebank.
    print('%s %s' % ("brown: acc over words known:", 10 * sknacc))
    print('%s %s' % (" : overall accuracy:", 10 * sallacc))
    print('%s %s' % (" : words known:", 10 * sknown))
    print('%s %s' % ("treebank: acc over words known:", 10 * tknacc))
    print('%s %s' % (" : overall accuracy:", 10 * tallacc))
    print('%s %s' % (" : words known:", 10 * tknown))
def demo(numSents=100, max_rules=200, min_score=2, ruleFile="dump.rules",
         errorOutput="errors.out", ruleOutput="rules.out", randomize=False,
         train=.8, trace=3):
    """Train and evaluate a Brill tagger on treebank data.

    A unigram tagger (backed off to a regexp CD/NN tagger) is trained first
    and used as the initial tagger for ``brill.BrillTrainer``.  Accuracies
    of both taggers are printed, the learned rules are printed and written
    to ``ruleOutput``, and the tagging errors on the held-out data are
    written to ``errorOutput``.

    :param numSents: number of treebank sentences to use.
    :param max_rules: passed to ``BrillTrainer.train``.
    :param min_score: passed to ``BrillTrainer.train``.
    :param ruleFile: currently unused (the ``saveRules`` call is disabled).
    :param errorOutput: path of the file receiving the error list.
    :param ruleOutput: path of the file receiving the learned rules.
    :param randomize: if true, shuffle the sentences (seeded with the
        sentence count, so the shuffle is repeatable) before selecting.
    :param train: proportion of the tokens used for training; the rest is
        reserved for testing.
    :param trace: trace level handed to ``BrillTrainer``.
    """
    # Imported locally so the demo does not depend on module-level imports.
    import random
    import sys

    from nltk_lite.corpora import treebank
    from nltk_lite import tag
    from nltk_lite.tag import brill

    # NOTE(review): the '.' in the number pattern is unescaped, so it
    # matches any separator character (e.g. "1,000"), not just a decimal
    # point - confirm whether that is intended before tightening it.
    NN_CD_tagger = tag.Regexp([(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')])

    print("Loading tagged data...")
    sents = list(treebank.tagged())
    if randomize:
        random.seed(len(sents))
        random.shuffle(sents)

    # Flatten the selected sentences into one token list and split it.
    tagged_data = [t for s in sents[:numSents] for t in s]
    cutoff = int(len(tagged_data) * train)
    training_data = tagged_data[:cutoff]
    gold_data = tagged_data[cutoff:]
    testing_data = [t[0] for t in gold_data]

    # Unigram tagger
    # Written without a newline so the accuracy lands on the same line.
    sys.stdout.write("Training unigram tagger: ")
    u = tag.Unigram(backoff=NN_CD_tagger)

    # NB training and testing are required to use a list-of-lists structure,
    # so we wrap the flattened corpus data with the extra list structure.
    u.train([training_data])
    print("[accuracy: %f]" % tag.accuracy(u, [gold_data]))

    # Brill tagger
    rule_types = (brill.ProximateTagsRule, brill.ProximateWordsRule)
    windows = ((1, 1), (2, 2), (1, 2), (1, 3))
    templates = [
        brill.SymmetricProximateTokensTemplate(rule_type, window)
        for rule_type in rule_types
        for window in windows
    ]
    templates.extend(
        brill.ProximateTokensTemplate(rule_type, (-1, -1), (1, 1))
        for rule_type in rule_types
    )

    trainer = brill.BrillTrainer(u, templates, trace)
    b = trainer.train(training_data, max_rules, min_score)

    print("")
    print("Brill accuracy: %f" % tag.accuracy(b, [gold_data]))

    print("\nRules: ")
    # try/finally guarantees the file is closed even if a write fails.
    printRules = open(ruleOutput, 'w')
    try:
        for rule in b.rules():
            print(str(rule))
            printRules.write(str(rule) + "\n\n")
    finally:
        printRules.close()

    testing_data = list(b.tag(testing_data))
    el = errorList(gold_data, testing_data)
    errorFile = open(errorOutput, 'w')
    try:
        for e in el:
            errorFile.write(e + "\n\n")
    finally:
        errorFile.close()

    print("Done; rules and errors saved to %s and %s." % (ruleOutput, errorOutput))
def _demo_tagger(tagger, gold):
    """Print the accuracy of ``tagger`` on ``gold`` as a percentage.

    :param tagger: tagger passed through to ``nltk_lite.tag.accuracy``.
    :param gold: gold-standard data passed through to ``accuracy``.
    """
    from nltk_lite.tag import accuracy

    percent = 100.0 * accuracy(tagger, gold)
    print('Accuracy = %4.1f%%' % percent)
def demo3():
    """Ten-fold cross-validate the TnT tagger on treebank and on brown.

    Uses the first 1000 tagged sentences of each corpus, split into ten
    equal folds.  For each fold a new ``tnt.Tnt(N=1000, C=False)`` is
    trained on the remaining nine folds and evaluated on the held-out fold.
    Per corpus, the function sums the accuracy divided by the known-word
    fraction, the overall accuracy, and the known-word fraction.

    Every printed value is ``10 * sum``, i.e. the mean over the ten folds
    expressed as a percentage.

    Fixes relative to the earlier version:
      * brown's "accuracy over known words" used the treebank known-word
        fraction (``tp_kn``) as divisor; it now uses ``sp_kn``;
      * the "brown:" and "treebank:" labels were attached to each other's
        results.
    """
    from nltk_lite import tag
    from nltk_lite.corpora import treebank
    from nltk_lite.corpora import brown
    import tnt

    d = list(treebank.tagged())
    e = list(brown.tagged())
    d = d[:1000]
    e = e[:1000]

    # Size of one fold in each corpus sample.
    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    # t* totals track the treebank tagger, s* totals the brown tagger.
    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = tnt.Tnt(N=1000, C=False)
        s = tnt.Tnt(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]
        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn

        sacc = tag.accuracy(s, etest)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn

        tknacc += (tacc / tp_kn)
        # Normalise by the brown tagger's own known fraction.
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

    # s was trained/tested on brown, t on treebank.
    print('%s %s' % ("brown: acc over words known:", 10 * sknacc))
    print('%s %s' % (" : overall accuracy:", 10 * sallacc))
    print('%s %s' % (" : words known:", 10 * sknown))
    print('%s %s' % ("treebank: acc over words known:", 10 * tknacc))
    print('%s %s' % (" : overall accuracy:", 10 * tallacc))
    print('%s %s' % (" : words known:", 10 * tknown))