# -*- coding: utf-8 -*-
'''
Created on 21-10-2013
@author: Katarzyna Krasnowska

Evaluates trained disambiguation weights against gold (disambiguated)
parses: for each active weights file, the packed parse solutions of every
sentence are ranked by weight, compared with the gold solutions, and
overall and per-sentence accuracy/cutoff statistics are written to CSV.
'''
import sys
import subprocess
#sys.path.insert(0, '/home/kasia/Dokumenty/Robota/LFG/stochastic/')
from conf import *
from ReadProlog import read_prolog_file
from FStructure import build_fs, fs_to_tex
sys.setrecursionlimit(1000)
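
# conf is expected to provide LFG_DIR, FEATURES_SUFFIX, PARSES_VERSION,
# PROB_SUFFIX and WEIGHTS_FILES_LIST, all used below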
# input: list of sentence parse file pairs, directory with per-sentence
# probability files, and a directory for (currently disabled) TeX output
parse_list = 'dev-list'
parse_dir = 'dev-parses%s/' % FEATURES_SUFFIX
tex_dir = LFG_DIR + 'stochastic/' + parse_dir + 'tex/'

output_overall = True
output_individual = True

#EPS = 0.00001

# candidate r values and training step values encoded in the names of the
# weights files (see the construction of wf below)
rs = (800, 600, 400, 300, 250, 200, 150, 100, 80, 60, 40, 20, 10, 8, 6, 4, 2, 1)
adds = (50,)

# clean the TeX output directory and copy the AVM style file into it
print 'rm %s*' % tex_dir
subprocess.call('rm %s*' % tex_dir, shell=True)
print tex_dir
subprocess.call('cp %s/parses/avm.sty %s' % (LFG_DIR, tex_dir), shell=True)
sentences, sentences_0, wfs = [], [], []
results_overall = {}
results_individual = {}

# each line of a list file holds two whitespace-separated paths: the full
# (ambiguous) parse file and its disambiguated counterpart
with open(LFG_DIR + '/stochastic/' + parse_list, 'r') as sfile:
    sentences = tuple(l.replace('\n', '').split() for l in sfile.readlines())
with open(LFG_DIR + '/stochastic/' + parse_list + '-0', 'r') as sfile:
    sentences_0 = tuple(l.replace('\n', '').split() for l in sfile.readlines())
with open(WEIGHTS_FILES_LIST + FEATURES_SUFFIX, 'r') as wf_list:
    active_wfs = tuple(l.replace('\n', '') for l in wf_list.readlines() if not l.startswith('#'))
#active_wfs = tuple((int(wf.replace('-0', '').replace('weights-file_r', '')), wf) for wf in wfs)
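
# collect the active weights files, grouped by training step value and
# keyed by the r value encoded in the file name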
wfs = {}
for a in adds:
    wfs[a] = []
    for r in rs:
        for list_0 in (True,):#, False):
            s = '-0' if list_0 else ''
            wf = 'step-%d%s/weights-file%s_r%d' % (a, FEATURES_SUFFIX, s, r)
            if wf in active_wfs:
                wfs[a].append((r, wf))
print active_wfs
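
# main evaluation loop: for every active weights file, rank each sentence's
# solutions by their summed choice weights and score them against the gold
# disambiguation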
for sents, list_0 in ((sentences, False),):#, (sentences_0, True)):
    for a in adds:
        for r, wf in sorted(wfs[a]):
            print r, wf, list_0
            individual = []
            sentences_ranks = {}
            #avg_first_rank = 0.0
            c = 0
            j = 0.0
            correct = 0.0
            half_correct = 0.0
            all_correct = 0.0
            cutoff = 0.0
            for s_full, s_disamb in sents:
                c += 1
                if c % 100 == 0:
                    print c, '/', len(sentences)
                s_name = s_full.replace(LFG_DIR + 'parses/' + PARSES_VERSION + '/', '')
                s_prob = LFG_DIR + 'stochastic/' + parse_dir + wf + '/' + s_name[:-3] + PROB_SUFFIX
                data = None
                try:
                    data = read_prolog_file(s_prob, True)
                except IOError:
                    #print 'IOError'
                    continue
                if not data.text:
                    #print 'Oh noes!'
                    continue
                text = data.text
                solutions = data.num_solutions
                #print '*', len(data.opts), data.num_solutions
                # group candidate solutions by total weight; the summed choice
                # weights are negated so that an ascending sort is best-first
                weighted_constrs = {}
                for opts in data.opts:
                    weight = 0
                    for o in opts:
                        if o in data.choices_weights:
                            weight -= data.choices_weights[o]
                    constrs = tuple(c for c, l_no in data.constraint_list if c.cond.intersection(opts))
                    if weight not in weighted_constrs:
                        weighted_constrs[weight] = set()
                    weighted_constrs[weight].add((constrs, tuple(opts)))
                #print '***', len(data.opts)
                # read the gold (disambiguated) parse and extract the
                # constraints active under each of its solutions
                disamb_constrs = []
                data = read_prolog_file(s_disamb, True)
                num_disamb = len(data.opts)
                for opts in data.opts:
                    constrs = tuple(c for c, l_no in data.constraint_list if c.cond.intersection(opts))
                    disamb_constrs.append((tuple(sorted(opts)), constrs))
                #print '*****', len(data.opts)
                i = 0
                j += len(weighted_constrs)
                tex_fs = []
                #found_first = False
                num_first = 0
                opts_found = set()
                sentences_ranks[s_name] = set()
                # walk the weight groups in ascending (best-first) order; a gold
                # solution matched in the i-th group gets rank i
                for w, cs_set in sorted(weighted_constrs.items()):
                    i += 1
                    for cs, opts in cs_set:
                        found = False
                        for d_opts, d_cs in disamb_constrs:
                            if cs == d_cs:
                                found = True
                                opts_found.add(d_opts)
                                if i == 1:
                                    #found_first = True
                                    num_first += 1
                        #fs, eqs = build_fs(cs)
                        #tex_fs.append((w, fs_to_tex(fs[0], fs, eqs)))
                        if found:
                            sentences_ranks[s_name].add(i)
                            #avg_first_rank += i
                # sanity check: every gold solution must be found among the
                # ranked candidates
                if len(opts_found) != num_disamb:
                    print '======================================'
                    print len(opts_found), num_disamb
                    print opts_found
                    print data.opts
                    raise RuntimeError
                # threshold: at least half of the gold solutions (at least one
                # when there are none)
                half_disamb = float(num_disamb) / 2
                if not half_disamb:
                    half_disamb = 1.0
                #print num_first, num_disamb / 2, bool(num_first), num_first >= num_disamb / 2
                # correct: some gold solution in the top-ranked group;
                # half_correct: at least half of the gold solutions ranked first
                if num_first:
                    correct += 1
                if num_first >= half_disamb:
                    half_correct += 1
                if half_correct > correct:
                    print half_correct, correct
                    raise RuntimeError
                # all_correct: gold solutions appear only in the top-ranked group
                if sentences_ranks[s_name] == set([1]):
                    all_correct += 1
                # fraction of all solutions discarded when keeping only the
                # best-weight group
                coff = 1.0 - float(len(sorted(weighted_constrs.items())[0][1])) / solutions
                cutoff += coff
                individual.append((s_name, solutions, num_first, coff))
                #tex_text = ('\\documentclass[8pt]{article}\n\\usepackage[utf8]{inputenc}\n' +
                #            '\\usepackage{polski}\n\\usepackage{synttree}\n' +
                #            '\\usepackage[landscape, a4paper, top=20pt, bottom=20pt, left=20pt, right=20pt]{geometry}\n' +
                #            '\\usepackage{avm}\n\\avmoptions{bottom}\n\\avmfont{\\sc\\tiny}\n\n' +
                #            '\\usepackage[usenames,dvipsnames]{xcolor}\n' +
                #            '\\begin{document}\n\n\\scriptsize{\n')
                #with open(tex_dir + s_name[:-3] + '.tex', 'w') as tf:
                #    tf.write(tex_text)
                #    for w, t in tex_fs:
                #        tf.write('\\subsection*{%.2f}' % w)
                #        tf.write('\\subsubsection*{\\textit{%s}}' % text.replace('-', ' '))
                #        tf.write(t + '\n')
                #        tf.write('\\newpage\n')
                #    tf.write('\\end{document}\n')
            # average best / worst / mean rank of the gold solutions per sentence
            avg_first_rank = 0.0
            avg_last_rank = 0.0
            avg_avg_rank = 0.0
            for s, ranks in sentences_ranks.items():
                avg_first_rank += sorted(ranks)[0]
                avg_last_rank += sorted(ranks)[-1]
                avg_avg_rank += float(sum(ranks)) / len(ranks)
            if sentences_ranks:
                avg_first_rank = avg_first_rank / len(sentences_ranks)
                avg_last_rank = avg_last_rank / len(sentences_ranks)
                avg_avg_rank = avg_avg_rank / len(sentences_ranks)
                print avg_first_rank, len(sentences_ranks), j, len(sentences_ranks)
                accuracy = correct / len(sentences_ranks)
                half_accuracy = half_correct / len(sentences_ranks)
                all_accuracy = all_correct / len(sentences_ranks)
                cutoff = cutoff / len(sentences_ranks)
                results_overall[(a, wf, list_0)] = (accuracy, half_accuracy, all_accuracy, cutoff,
                                                    avg_first_rank, avg_last_rank, avg_avg_rank,
                                                    j / len(sentences_ranks))
                results_individual[(a, wf, list_0)] = tuple(sorted(individual))
            print
# write one overall results CSV per step value: one row per weights file
# and test set combination
for a in adds:
    if output_overall:
        print LFG_DIR + 'stochastic/results%s/results_overall-n-step-%d.csv' % (FEATURES_SUFFIX, a)
        with open(LFG_DIR + 'stochastic/results%s/results_overall-n-step-%d.csv' % (FEATURES_SUFFIX, a), 'w') as f:
            f.write('TRAIN,R,ACCURACY,HALF.ACCURACY,ALL.ACCURACY,CUTOFF,AVG.F.RANK,AVG.L.RANK,AVG.A.RANK,AVGRANKS,TEST\n')
            for list_0 in (False, True):
                test = 'only0' if list_0 else 'all'
                for r, wf in sorted(wfs[a]):
                    if (a, wf, list_0) in results_overall:
                        wf_tmp = wf.replace('step-%d%s/' % (a, FEATURES_SUFFIX), '').replace('weights-file', '').replace('-0_r', 'only0,').replace('_r', 'all,')
                        form = (wf_tmp,) + results_overall[(a, wf, list_0)] + (test,)
                        f.write('%s,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%.3f,%s\n' % form)
    # write one per-sentence results CSV for every weights file
    if output_individual:
        for list_0 in (False, True):
            test = 'only0' if list_0 else 'all'
            for r, wf in sorted(wfs[a]):
                if (a, wf, list_0) in results_individual:
                    train = wf.replace('weights-file', '').replace('-0_r', 'only0') \
                              .replace('_r', 'all').replace('step-%d%s/' % (a, FEATURES_SUFFIX), '') \
                              .replace('%d' % r, '')
                    fname = 'results-step-%d-r-%d-train-%s-test-%s.csv' % (a, r, train, test)
                    with open(LFG_DIR + 'stochastic/results%s/%s' % (FEATURES_SUFFIX, fname), 'w') as f:
                        f.write('ID,SOLUTIONS,ACCURACY,CUTOFF\n')
                        for row in results_individual[(a, wf, list_0)]:
                            f.write('%s,%d,%d,%.3f\n' % row)
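
# assumed usage: run from LFG_DIR/stochastic/ with conf.py, ReadProlog.py
# and FStructure.py importable, after the weights files have been trained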