"""
You have to write the perc_train function that trains the feature weights using the perceptron algorithm for the CoNLL 2000 chunking task.
Each element of train_data is a (labeled_list, feat_list) pair.
Inside the perceptron training loop:
- Call perc_test to get the tagging based on the current feat_vec and compare it with the true output from the labeled_list
- If the output is incorrect then we have to update feat_vec (the weight vector)
- In the notation used in the paper we have w = w_0, w_1, ..., w_n corresponding to \phi_0(x,y), \phi_1(x,y), ..., \phi_n(x,y)
- Instead of indexing each feature with an integer we index each feature using a string we called feature_id
- The feature_id is constructed using the elements of feat_list (which correspond to x above) combined with the output tag (which correspond to y above)
- The function perc_test shows how the feature_id is constructed for each word in the input, including the bigram feature "B:" which is a special case
- feat_vec[feature_id] is the weight associated with feature_id
- This dictionary lookup lets us implement a sparse vector dot product where any feature_id not used in a particular example does not participate in the dot product
- To save space and time make sure you do not store zero values in the feat_vec dictionary which can happen if \phi(x_i,y_i) - \phi(x_i,y_{perc_test}) results in a zero value
- If you are going word by word to check if the predicted tag is equal to the true tag, there is a corner case where the bigram 'T_{i-1} T_i' is incorrect even though T_i is correct.
"""
import perc
import sys, optparse, os, copy
from collections import defaultdict
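# Illustrative sketch only (not called anywhere in this file): one way to read the
# sparse dot product described in the docstring, assuming feat_vec maps
# (feature_id, tag) keys to weights, e.g. {('U14:VBG', 'B-VP'): w1, ...}.
# The helper name and the example feature ids below are hypothetical.
#
#   def sparse_score(feat_vec, feature_ids, tag):
#       # only the (feature_id, tag) pairs that appear in this example contribute;
#       # every key missing from feat_vec is treated as a zero weight
#       return sum(feat_vec.get((fid, tag), 0) for fid in feature_ids)
#
#   # e.g. sparse_score(feat_vec, ['U14:VBG', 'B:B-NP'], 'B-VP') adds up the weights
#   # stored under ('U14:VBG', 'B-VP') and ('B:B-NP', 'B-VP')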
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # optionally warm-start from a previously saved model:
    # feat_vec = perc.perc_read_from_file('model_19')
    if len(tagset) <= 0:
        raise ValueError("Empty tagset")
    default_tag = tagset[0]
    for t in range(numepochs):
        print 'processing iteration', t
        for (labeled_list, feat_list) in train_data:
            labels = copy.deepcopy(labeled_list)
            # decode the sentence with the current weights
            output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag)
            # compare the current output with the true tags, word by word
            feat_index = 0
            for i, v in enumerate(output):
                # retrieve the features for the current word
                (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
                if len(feats) == 0:
                    print >>sys.stderr, " ".join(labels), " ".join(feat_list), "\n"
                    raise ValueError("features do not align with input sentence")
                fields = labels[i].split()
                label = fields[2]  # the true tag is the third field
                if i > 0:
                    label_pre = labels[i-1].split()[2]  # true tag of the previous word
                    for feat in feats:
                        if feat[0] == 'B':  # bigram feature
                            feat_out = feat + ":" + output[i-1]  # "B:<previous predicted tag>"
                            feat_lab = feat + ":" + label_pre    # "B:<previous true tag>"
                            if output[i-1] != label_pre and output[i] != label:
                                feat_vec[feat_out, output[i]] -= 1
                                feat_vec[feat_lab, output[i]] -= 1
                                feat_vec[feat_out, label] += 1
                                feat_vec[feat_lab, label] += 1
                            elif output[i-1] == label_pre and output[i] != label:
                                # feat_out == feat_lab here, so the two updates collapse into one of weight 2
                                feat_vec[feat_lab, output[i]] -= 2
                                feat_vec[feat_lab, label] += 2
                            elif output[i-1] != label_pre and output[i] == label:
                                # bigram context is wrong but the current tag is right: the symmetric
                                # updates above would cancel out, so skip them to avoid storing zero values
                                pass
                            elif output[i-1] == label_pre and output[i] == label:
                                # both the bigram context and the current tag are correct: no update
                                pass
                        else:  # unigram (U00 to U22) features
                            if output[i] != label:  # skip correct tags so no zero-valued entries are stored
                                feat_vec[feat, output[i]] -= 1
                                feat_vec[feat, label] += 1
                else:  # i == 0: the first word of the sentence has no previous tag
                    label_pre = 'B_-1'  # the previous label is denoted by B_-1
                    for feat in feats:
                        if feat[0] == 'B':  # bigram feature: attach the synthetic previous label
                            feat = feat + ":" + label_pre
                        if output[i] != label:  # skip correct tags so no zero-valued entries are stored
                            feat_vec[feat, output[i]] -= 1
                            feat_vec[feat, label] += 1
        if t % 5 == 0:
            # periodically checkpoint the weights and run the chunking evaluation
            perc.perc_write_to_file(feat_vec, 'model_' + str(t))
            perc.perc_write_to_file(feat_vec, 'model')
            os.system('python perc.py -m model | python score-chunks.py')
    # the number of training epochs is controlled by the numepochs argument (the -e option)
    return feat_vec
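# Illustrative sketch only: how a saved model could be loaded and applied to new data,
# using the same perc helpers that appear elsewhere in this file. The dev file paths
# below are assumptions and may not exist in your data/ directory.
#
#   feat_vec = perc.perc_read_from_file('data/default.model')
#   tagset = perc.read_tagset('data/tagset.txt')
#   dev_data = perc.read_labeled_data('data/dev.txt.gz', 'data/dev.feats.gz')
#   for (labeled_list, feat_list) in dev_data:
#       output = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, tagset[0])
#       print "\n".join(output)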
if __name__ == '__main__':
    optparser = optparse.OptionParser()
    optparser.add_option("-t", "--tagsetfile", dest="tagsetfile", default=os.path.join("data", "tagset.txt"), help="tagset that contains all the labels produced in the output, i.e. the y in \phi(x,y)")
    optparser.add_option("-i", "--trainfile", dest="trainfile", default=os.path.join("data", "train.txt.gz"), help="input data, i.e. the x in \phi(x,y)")
    optparser.add_option("-f", "--featfile", dest="featfile", default=os.path.join("data", "train.feats.gz"), help="precomputed features for the input data, i.e. the values of \phi(x,_) without y")
    optparser.add_option("-e", "--numepochs", dest="numepochs", default=int(10), help="number of epochs of training; in each epoch we iterate over all the training examples")
    optparser.add_option("-m", "--modelfile", dest="modelfile", default=os.path.join("data", "default.model"), help="weights for all features stored on disk")
    (opts, _) = optparser.parse_args()
    tagset = perc.read_tagset(opts.tagsetfile)
    print >>sys.stderr, "reading data ..."
    train_data = perc.read_labeled_data(opts.trainfile, opts.featfile)
    print >>sys.stderr, "done."
    # each entry in the feat_vec dictionary is key=feature_id, value=weight,
    # e.g. {('U14:VBG','B-VP'): w1, ...}
    feat_vec = perc_train(train_data, tagset, int(opts.numepochs))
    perc.perc_write_to_file(feat_vec, opts.modelfile)
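# Example invocation, assuming the default data/ layout given by the options above:
#
#   python default.py -t data/tagset.txt -i data/train.txt.gz -f data/train.feats.gz \
#       -e 10 -m data/default.model
#
# which trains for 10 epochs and writes the learned weights to data/default.model.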