-
Notifications
You must be signed in to change notification settings - Fork 0
/
indivTaggers.py
92 lines (75 loc) · 3.54 KB
/
indivTaggers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Copyright (C) 2015 Kathrin Donandt
# For license information see LICENSE.txt
## functions for training individual taggers
from crf import CRFTagger
from create_reader import create_reader
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger, AffixTagger, tnt
from regextagger_tonal import Regexp as RegexpTonal
from regextagger_non_tonal import Regexp
from regextagger_non_tonal_SA import Regexp as RegexpSA
from regextagger_tonal_SA import Regexp as RegexpTonalSA
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.tag.hmm import LidstoneProbDist
from create_reader import dictionary, dictionary_backoff
def indivDefault(bambara):
    """Create the baseline tagger that assigns the tag 'n' to every token.

    The result of ``evaluate`` on ``bambara.test_sents`` is printed (without
    any label) and the tagger is returned.
    """
    baseline = DefaultTagger('n')
    score = baseline.evaluate(bambara.test_sents)
    print(score)
    return baseline
def indivCRF(bambara, tone, tag):
    """Train a CRF tagger on ``bambara.train_sents`` and print its test score.

    ``tone`` and ``tag`` are concatenated into the model file name
    ``Models/model.indivCRF<tone><tag>.tagger`` that is handed to ``train``,
    so both must be strings.

    Returns the trained tagger.
    """
    # Best training options found for the CRF; c1 and c2 follow the suggestion at
    # http://nbviewer.ipython.org/github/tpeng/python-crfsuite/blob/master/examples/CoNLL%202002.ipynb
    training_options = {
        'max_iterations': 100,
        'max_linesearch': 10,
        'c1': 0.0001,
        'c2': 1.0,
    }
    tagger = CRFTagger(training_opt=training_options)
    print("Training CRF Tagger...")
    model_path = "Models/model.indivCRF" + tone + tag + ".tagger"
    tagger.train(bambara.train_sents, model_path)
    score = tagger.evaluate(bambara.test_sents)
    print("CRF accuracy: ", score)
    return tagger
def indivTnT(bambara, backoff):
    """Train a TnT tagger on ``bambara.train_sents`` and print its test score.

    ``backoff`` is used as the tagger for unknown words; ``Trained=True``
    tells TnT that this tagger is already trained and must not be retrained.

    Returns the trained TnT tagger.
    """
    tagger = tnt.TnT(unk=backoff, Trained=True, N=100)
    tagger.train(bambara.train_sents)
    score = tagger.evaluate(bambara.test_sents)
    print("TnT accuracy: ", score)
    return tagger
def indivHMM(bambara):
    """Train a supervised HMM tagger on ``bambara.train_sents``.

    The state set (tags) and the symbol set (word forms) are collected from
    the training sentences, whose tokens are indexed as ``token[0]`` (word)
    and ``token[1]`` (tag). Probabilities are estimated with Lidstone
    smoothing (gamma = 0.1). The score on ``bambara.test_sents`` is printed.

    Returns the trained HMM tagger.
    """
    # Walk the training data once, keeping (word, tag) pairs in corpus order.
    word_tag_pairs = [
        (token[0], token[1])
        for sentence in bambara.train_sents
        for token in sentence
    ]
    tag_set = {tag for _, tag in word_tag_pairs}
    symbols = {word for word, _ in word_tag_pairs}

    def lidstone_estimator(fd, bins):
        # Lidstone smoothing with gamma = 0.1.
        return LidstoneProbDist(fd, 0.1, bins)

    trainer = HiddenMarkovModelTrainer(list(tag_set), list(symbols))
    tagger = trainer.train_supervised(
        bambara.train_sents, estimator=lidstone_estimator
    )
    print("HMM accuracy:", tagger.evaluate(bambara.test_sents))
    return tagger
def indivAffix(bambara, affix_length, backoff):
    """Train an affix tagger on ``bambara.train_sents`` and print its test score.

    ``affix_length`` is forwarded to ``AffixTagger`` unchanged and
    ``min_stem_length`` is fixed at 0; ``backoff`` is the fallback tagger.

    Returns the trained affix tagger.
    """
    tagger = AffixTagger(
        bambara.train_sents,
        min_stem_length=0,
        affix_length=affix_length,
        backoff=backoff,
    )
    score = tagger.evaluate(bambara.test_sents)
    print("Affix accuracy: ", score)
    return tagger
def indivUnigram(bambara, backoff):
    """Train a unigram tagger on ``bambara.train_sents`` with ``backoff``.

    Prints the score on ``bambara.test_sents`` and returns the tagger.
    """
    tagger = UnigramTagger(bambara.train_sents, backoff=backoff)
    score = tagger.evaluate(bambara.test_sents)
    print("Unigram accuracy: ", score)
    return tagger
def indivBigram(bambara, backoff):
    """Train a bigram tagger on ``bambara.train_sents`` with ``backoff``.

    Prints the score on ``bambara.test_sents`` and returns the tagger.
    """
    tagger = BigramTagger(bambara.train_sents, backoff=backoff)
    score = tagger.evaluate(bambara.test_sents)
    print("Bigram accuracy: ", score)
    return tagger
def indivTrigram(bambara, backoff):
    """Train a trigram tagger on ``bambara.train_sents`` with ``backoff``.

    Prints the score on ``bambara.test_sents`` and returns the tagger.
    """
    tagger = TrigramTagger(bambara.train_sents, backoff=backoff)
    score = tagger.evaluate(bambara.test_sents)
    print("Trigram accuracy: ", score)
    return tagger
def indivRegexp(bambara, option_tag, option_tones, backoff):
    """Build the regular-expression tagger matching the given options.

    Parameters:
        bambara: object exposing ``test_sents``, used for evaluation.
        option_tag: ``"POS"`` or ``"Affixes"``.
        option_tones: ``"tonal"`` or ``"nontonal"``.
        backoff: fallback tagger handed to the regexp tagger.

    Returns:
        The constructed regexp tagger; its score on ``bambara.test_sents``
        is printed first.

    Raises:
        ValueError: if ``option_tag`` or ``option_tones`` is not one of the
            supported values. Previously such a call fell through every
            branch and crashed with an UnboundLocalError at the print.
    """
    # Validate first so a typo in an option gives a clear error message
    # instead of an unbound local variable further down.
    if option_tones not in ("tonal", "nontonal") or option_tag not in ("Affixes", "POS"):
        raise ValueError(
            "unsupported regexp tagger options: option_tones=%r "
            "(expected 'tonal' or 'nontonal'), option_tag=%r "
            "(expected 'Affixes' or 'POS')" % (option_tones, option_tag)
        )

    tagger_classes = {
        ("tonal", "Affixes"): RegexpTonalSA,
        ("tonal", "POS"): RegexpTonal,
        ("nontonal", "Affixes"): RegexpSA,
        ("nontonal", "POS"): Regexp,
    }
    regex = tagger_classes[(option_tones, option_tag)](backoff=backoff)
    print("Regexp accuracy: ", regex.evaluate(bambara.test_sents))
    return regex
def indivDic(bambara, tone):
    """Build the dictionary tagger for ``tone`` via ``dictionary``.

    No backoff tagger is passed here. Prints the score on
    ``bambara.test_sents`` and returns the tagger.
    """
    tagger = dictionary(tone)
    score = tagger.evaluate(bambara.test_sents)
    print("Dictionary accuracy: ", score)
    return tagger
def indivDic_backoff(bambara, tone, backoff):
    """Build the dictionary tagger for ``tone`` via ``dictionary_backoff``.

    ``backoff`` is forwarded to ``dictionary_backoff``. Prints the score on
    ``bambara.test_sents`` and returns the tagger.
    """
    tagger = dictionary_backoff(tone, backoff)
    score = tagger.evaluate(bambara.test_sents)
    print("Dictionary with backoff accuracy: ", score)
    return tagger