-
Notifications
You must be signed in to change notification settings - Fork 0
/
dd_tagger_fst.py
107 lines (94 loc) · 2.92 KB
/
dd_tagger_fst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/python
'''
Dual decomposition for a tagger combined with an FST to avoid a
given sequence.
Created on Oct 15, 2013
@author: swabha
'''
from collections import defaultdict
import utils, cky, viterbi, fst_search, math
import sys
def init_dd_param(u, n, tagset):
    """Zero-initialise the dual decomposition parameters in place.

    After the call, ``u[i][t] == 0`` for every position ``i`` in
    ``0 .. n-1`` and every tag ``t`` in ``tagset``.  Existing entries of
    ``u`` at those positions are replaced; other keys are left untouched.

    Args:
        u: mutable mapping that receives one ``{tag: 0}`` dict per position.
        n: number of positions to initialise.
        tagset: iterable of tags.

    Returns:
        None; ``u`` is modified in place.
    """
    # Materialise once so that a one-shot iterable still fills every position.
    tags = list(tagset)
    # ``range`` (not ``xrange``) keeps this runnable on Python 2 and Python 3.
    for i in range(n):
        # A fresh dict per position: rows must not share state.
        u[i] = dict.fromkeys(tags, 0)
def run(sentence, tagset, hmm_prob, max_iterations=200):
    """Execute the dual decomposition algorithm.

    Each iteration decodes with ``viterbi.run`` (given the current
    parameters ``u``) and with ``fst_search.run``, and stops as soon as
    the two tag sequences agree.  Otherwise ``u`` is moved by a step of
    size ``1 / sqrt(k)`` along the difference of the two sequences'
    indicator vectors.

    Args:
        sentence: the input sequence; its length fixes the number of
            positions in ``u``.
        tagset: the tags considered at every position.
        hmm_prob: model parameters, passed through to ``viterbi.run``.
        max_iterations: maximum number of iterations (default 200, the
            value that used to be hard-coded).

    Returns:
        A tuple ``(best_tags, k, tags1, tags2)``.  ``best_tags`` is the
        Viterbi output of the first iteration; it is also what is handed
        to ``fst_search.run`` on every iteration (presumably the sequence
        the FST must avoid -- confirm against ``fst_search``).  ``k`` is
        the iteration at which both decoders agreed, or ``-1`` if they
        never did.  ``tags1`` and ``tags2`` are the last Viterbi and FST
        outputs.

    Raises:
        ValueError: if ``max_iterations`` is smaller than 1.
    """
    if max_iterations < 1:
        # Without at least one iteration there is nothing to return.
        raise ValueError("max_iterations must be at least 1")

    u = {}  # dual decomposition parameters: u[position][tag]
    init_dd_param(u, len(sentence), tagset)

    for k in range(1, max_iterations + 1):
        step_size = 1.0 / math.sqrt(k)  # step shrinks as iterations grow

        # Only the tag sequences are needed; the scores were used solely
        # for debugging output.
        tags1, _aug_hmm_score, _hmm_score = viterbi.run(
            sentence, tagset, hmm_prob, u)
        if k == 1:
            # u is all zeros on the first pass, so this is the output
            # decoded before any dual update has been applied.
            best_tags = tags1

        tags2, _fst_score = fst_search.run(best_tags, u, tagset)
        if agree(tags1, tags2):
            return best_tags, k, tags1, tags2  # converges in the kth iteration

        y = compute_indicators(tags1, tagset)
        z = compute_indicators(tags2, tagset)
        update(y, z, u, step_size)

    return best_tags, -1, tags1, tags2  # does not converge
# NOTE: can be made faster, e.g. by shallow-copying a zeroed template dict.
def compute_indicators(tags, labelset):
    """Build one-hot indicator vectors for a tag sequence.

    Args:
        tags: sequence of tags, one per position.
        labelset: iterable of all labels an indicator is produced for.

    Returns:
        A dict mapping each position ``i`` to a dict ``{label: 0 or 1}``,
        where the value is 1 exactly when ``tags[i] == label``.  A tag
        that does not occur in ``labelset`` yields an all-zero row.
    """
    # Materialise once so a one-shot iterable serves every position.
    labels = list(labelset)
    # Plain dicts: the former ``defaultdict()`` had no default factory and
    # therefore already behaved exactly like a dict.
    return {
        i: {label: 1 if tag == label else 0 for label in labels}
        for i, tag in enumerate(tags)
    }
def update(indi1, indi2, u, step_size):
    """Apply one dual decomposition update to ``u`` in place.

    For every position ``i`` in ``0 .. len(indi1)-1`` and every tag ``t``
    present in ``u[i]``::

        u[i][t] -= (indi2[i][t] - indi1[i][t]) * step_size

    Args:
        indi1: indicators of the first tag sequence, ``indi1[i][t]``.
        indi2: indicators of the second tag sequence, ``indi2[i][t]``.
        u: dual decomposition parameters, modified in place.
        step_size: multiplier applied to the indicator difference.

    Returns:
        None; ``u`` is modified in place.
    """
    # ``range`` and direct dict iteration replace the Python-2-only
    # ``xrange`` / ``iterkeys``.  Only values are reassigned, never keys
    # added or removed, so iterating the dict while updating it is safe.
    for i in range(len(indi1)):
        multipliers = u[i]
        for t in multipliers:
            multipliers[t] -= (indi2[i][t] - indi1[i][t]) * step_size
def agree(tags1, tags2):
    """Check if two tag sequences agree.

    Args:
        tags1: first tag sequence.
        tags2: second tag sequence.

    Returns:
        True when both sequences have the same length and equal tags at
        every position, False otherwise.
    """
    # Compare lengths first: walking only ``len(tags1)`` positions would
    # accept a longer ``tags2`` with a matching prefix and would raise
    # IndexError on a shorter one.
    if len(tags1) != len(tags2):
        return False
    return all(t1 == t2 for t1, t2 in zip(tags1, tags2))
if __name__ == "__main__":
    # Small manual smoke test: build the indicators of two toy tag
    # sequences, print them, and run a single update on zeroed parameters.
    labelset = ["a", "b", "c"]
    tags = ["a", "a", "c"]
    tags2 = ["c", "c", "c"]
    ind = compute_indicators(tags, labelset)
    ind2 = compute_indicators(tags2, labelset)
    u = {}
    init_dd_param(u, 3, labelset)

    # sys.stdout.write instead of the Python 2 ``print`` statement, which
    # is a SyntaxError on Python 3 and would stop the module from being
    # imported at all.  Output is unchanged: one space-separated row per
    # position, then a blank line after each table.
    for indicators in (ind, ind2):
        for i in range(len(tags)):
            row = " ".join(str(indicators[i][t]) for t in labelset)
            sys.stdout.write(row + "\n")
        sys.stdout.write("\n")

    update(ind, ind2, u, 10)