def TrainSenseTagger(Pcfg, CFDist):
    """Build a sense tagger: a unigram tagger backed by a default tagger.

    The unigram tagger's frequency table is seeded directly from the
    inverted conditional frequency distribution instead of being trained
    on tagged data.  Pcfg is accepted but not used in this function.
    """
    logger.info("Training unigram tagger:")
    unigram = UnigramTagger(TAG='SENSE', TEXT='STEM')
    # Install the precomputed model rather than calling unigram.train(...).
    # NOTE(review): this pokes the private _freqdist attribute -- confirm
    # it matches the internals of the UnigramTagger implementation in use.
    unigram._freqdist = invertConditionalFreqDist(CFDist)
    fallback = DefaultTagger('APPEAR', TAG='SENSE', TEXT='STEM')
    return BackoffTagger([unigram, fallback], TAG='SENSE', TEXT='STEM')
def TrainSenseTagger(Pcfg, CFDist):
    """Return a backoff sense tagger built from CFDist.

    The primary tagger is a unigram tagger whose frequency distribution
    is populated from invertConditionalFreqDist(CFDist); whenever it
    cannot tag a stem, a DefaultTagger assigns the sense "APPEAR".
    Pcfg is currently unused.
    """
    logger.info("Training unigram tagger:")
    sense_default = DefaultTagger("APPEAR", TAG="SENSE", TEXT="STEM")
    sense_unigram = UnigramTagger(TAG="SENSE", TEXT="STEM")
    # Equivalent in intent to sense_unigram.train(taggedData), but the
    # distribution is supplied precomputed (writes a private attribute).
    sense_unigram._freqdist = invertConditionalFreqDist(CFDist)
    backoff = BackoffTagger([sense_unigram, sense_default],
                            TAG="SENSE", TEXT="STEM")
    return backoff
def test( numFiles=100, max_rules=200, min_score=2, ruleFile="dump.rules", errorOutput="errors.out", ruleOutput="rules.out", randomize=False, train=0.8, trace=3, ): NN_CD_tagger = RegexpTagger([(r"^[0-9]+(.[0-9]+)?$", "CD"), (r".*", "NN")], TAG="POS") # train is the proportion of data used in training; the rest is reserved # for testing. print "Loading tagged data..." taggedData = getWSJTokens(numFiles, randomize) trainCutoff = int(len(taggedData) * train) trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff]) goldData = Token(SUBTOKENS=taggedData[trainCutoff:]) testingData = goldData.exclude("POS") # Unigram tagger print "Training unigram tagger:", u = UnigramTagger(TAG="POS") u.train(trainingData) backoff = BackoffTagger([u, NN_CD_tagger], TAG="POS") print ("[accuracy: %f]" % tagger_accuracy(backoff, [goldData])) # Brill tagger templates = [ SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)), # ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)), ] # trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS') trainer = BrillTaggerTrainer(backoff, templates, trace, TAG="POS") b = trainer.train(trainingData, max_rules, min_score) print print ("Brill accuracy: %f" % tagger_accuracy(b, [goldData])) print ("\nRules: ") printRules = file(ruleOutput, "w") for rule in b.rules(): print (str(rule)) printRules.write(str(rule) + "\n\n") # b.saveRules(ruleFile) b.tag(testingData) el = errorList(goldData, testingData) errorFile = file(errorOutput, "w") for e 
in el: errorFile.write(e + "\n\n") errorFile.close() print ("Done.") return b
def test(numFiles=100, max_rules=200, min_score=2, ruleFile="dump.rules", errorOutput="errors.out", ruleOutput="rules.out", randomize=False, train=.8, trace=3): NN_CD_tagger = RegexpTagger([(r'^[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')], TAG='POS') # train is the proportion of data used in training; the rest is reserved # for testing. print "Loading tagged data..." taggedData = getWSJTokens(numFiles, randomize) trainCutoff = int(len(taggedData) * train) trainingData = Token(SUBTOKENS=taggedData[0:trainCutoff]) goldData = Token(SUBTOKENS=taggedData[trainCutoff:]) testingData = goldData.exclude('POS') # Unigram tagger print "Training unigram tagger:", u = UnigramTagger(TAG='POS') u.train(trainingData) backoff = BackoffTagger([u, NN_CD_tagger], TAG='POS') print("[accuracy: %f]" % tagger_accuracy(backoff, [goldData])) # Brill tagger templates = [ SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 1)), SymmetricProximateTokensTemplate(ProximateTagsRule, (2, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 2)), SymmetricProximateTokensTemplate(ProximateTagsRule, (1, 3)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,1)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (2,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,2)), # SymmetricProximateTokensTemplate(ProximateWordsRule, (1,3)), ProximateTokensTemplate(ProximateTagsRule, (-1, -1), (1, 1)), # ProximateTokensTemplate(ProximateWordsRule, (-1, -1), (1,1)), ] #trainer = FastBrillTaggerTrainer(backoff, templates, trace, TAG='POS') trainer = BrillTaggerTrainer(backoff, templates, trace, TAG='POS') b = trainer.train(trainingData, max_rules, min_score) print print("Brill accuracy: %f" % tagger_accuracy(b, [goldData])) print("\nRules: ") printRules = file(ruleOutput, 'w') for rule in b.rules(): print(str(rule)) printRules.write(str(rule) + "\n\n") #b.saveRules(ruleFile) b.tag(testingData) el = errorList(goldData, testingData) errorFile = file(errorOutput, 'w') for e in el: 
errorFile.write(e + "\n\n") errorFile.close() print("Done.") return b