#!/usr/bin/python
# -*- coding:UTF-8 -*-
'''
Obtaining sentence vectors with a CNN:

Replace every word in a sentence with its word vector and reshape the result into
a matrix whose number of rows is the number of words and whose number of columns
is the word-vector dimension. Filters of several sizes are applied; in every case
the filter width == the word-vector dimension, so each filter slides only
vertically. Convolving one filter over a sentence yields a
(sequence_len - filter_height + 1, 1) vector, where sequence_len is the sentence
length and filter_height is the height of the filter.

Max (or mean) pooling is then applied to that vector with a pooling window of
(sequence_len - filter_height + 1, 1), reducing it to a single (1, 1) scalar.
That is, extracting a sentence with one filter produces just one number, which
describes one feature of the sentence with respect to that filter.

One number is not enough, so instead of a single filter we use num_filters
filters, giving num_filters scalars that are concatenated into a vector for the
sentence. Since one filter size alone may still be limiting, we use n different
filter sizes, obtain n vectors of dimension num_filters each, and concatenate
them end to end into one long vector that serves as the sentence vector.
***********************************************
Once sentence vectors are available, question/answer matching can be scored via
cosine similarity, word co-occurrence counts, idf-weighted word co-occurrence
(each word weighted by its idf value), Euclidean distance, and so on.
Cosine similarity is used here.
The evaluation metric is precision, precision = TP / (TP + FP); the training
loss is cross entropy over the (shifted) cosine similarity.
***********************************************
Input sentences arrive as raw strings; this program first converts them to id
sequences.
'''
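#A concrete walk-through of the shapes described above, using this file's own
#defaults (max_sequence_len = 236, embedding_size = 300, filter_sizes = [2,3,4],
#num_filters = 500):
#  input tensor          : (batch_size, 1, 236, 300)
#  conv with height 3    : (batch_size, 500, 236 - 3 + 1, 1) = (batch_size, 500, 234, 1)
#  max-pool over (234,1) : (batch_size, 500, 1, 1)   -> one scalar per filter
#  concat the 3 sizes    : (batch_size, 3 * 500)     -> a 1500-dimensional sentence vector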
from __future__ import print_function;
import os, sys, timeit, random, operator
import time
import numpy
import numpy as np
import cPickle
import theano
import theano.tensor as T
from theano.tensor.signal import pool
from theano.tensor.nnet import conv2d
theano.config.floatX = 'float64'
ISOTIMEFORMAT = "%Y - %m - %d %X"
def printf(strr):#strr: the string to append to the log file
    f = open("cnn_result.txt","a")#append mode, so earlier results are not overwritten
    print(time.strftime(ISOTIMEFORMAT,time.localtime()) , file = f);#log the timestamp
print(strr,file = f);
f.close();
filenames = [r"WikiQASent-dev",
r"WikiQASent-dev-filtered",
r"WikiQASent-test",
r"WikiQASent-test-filtered",
r"WikiQASent-train"]
folder = "data_cnn";
trainfile = "-train.txt"
validfile = "-dev.txt"
testfile = "-test.txt"
trainfile = os.path.join(folder,trainfile);
validfile = os.path.join(folder,validfile);
testfile = os.path.join(folder,testfile);
word_id = cPickle.load(open(os.path.join(folder,"word_id.pkl"),"rb"))
word_vec = cPickle.load(open(os.path.join(folder,"word_vec.pkl"),"rb"));
id_vec = cPickle.load(open(os.path.join(folder,"id_vec.pkl"),"rb"));
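#Inferred from the names and their usage in this file: word_id maps word -> id,
#and id_vec is the id -> embedding lookup table passed to QACNN below; word_vec
#(word -> embedding) is loaded but not referenced again here.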
def load_data_list(filename):#read every corpus line into a list
List = [];
with open(filename,"r") as f:
filelines = f.readlines();
for line in filelines:
        temp = line.strip("\n").strip().split("\t",5);#fields: que, ans, label, qid, wordco
List.append([ temp[0],temp[1],int(temp[2]),int(temp[3]) ]);
return List;
def encode_sent(string,size = 236):
    #Convert a sentence string into a fixed-length list of word ids. The longest
    #sentence has 236 tokens; this length includes punctuation such as , . ? ; !
    #which is represented by <UNK>. Padding positions hold <NULL>.
global word_id;
x = [];
for i in range(0,size):
x.append(word_id["<NULL>"])
words = string.strip().split();
for i in range(len(words)):
if words[i] in word_id:
x[i] = word_id[words[i]];
else:
x[i] = word_id["<UNK>"]
return x;
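#A minimal usage sketch (hypothetical tokens, assuming "how" and "are" are in
#word_id while "xyzzy" is not):
#  encode_sent("how are xyzzy", size = 5)
#  -> [word_id["how"], word_id["are"], word_id["<UNK>"], word_id["<NULL>"], word_id["<NULL>"]]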
def load_batch_data(List, start_pos, batch_size):#take batch_size items from List starting at start_pos; if fewer than batch_size remain, take List[-batch_size:]
que ,ans ,label = [], [], [];
total_len = len(List);
Temp = [];
if start_pos + batch_size <= total_len :
Temp = List[start_pos:start_pos + batch_size];
else :
Temp = List[ -batch_size :];
for i in range(batch_size):
q = Temp[i][0];
a = Temp[i][1];
l = int(Temp[i][2]);
que.append(encode_sent(q,236));
ans.append(encode_sent(a,236));
label.append(l);
return que,ans,label;
def clean_word(word):#strip surrounding punctuation from a token
    return word.strip(",").strip(".").strip("?").strip(":").strip(";").strip("!").strip("(").strip(")");
def validation(validate_model , test_List , batch_size):
score_list , index = [] , int(0);
avg_COST ,tot_COST = float(0) , float(0);
while True:
que , ans , label = load_batch_data(test_List ,index , batch_size);
        COST , batch_scores , _ , _ = validate_model(que,ans,label,1.0);#no dropout at validation time, so keep_prob is 1.0
tot_COST += COST;
        for score in batch_scores:#each score is the cosine similarity of one (question, answer) pair
score_list.append(score);
index += batch_size;
if index >= len(test_List):
break;
print("validating...",index);
avg_COST = tot_COST / len(test_List);
    #build sdict, which is used to compute MAP, MRR and precision
sdict , index = {} , int(0);
for items in test_List:#item = [que , ans ,label , qid]
qid = items[3];
if not qid in sdict:
sdict[qid] = [];
        sdict[qid].append((score_list[index] , items[2]))#append the (cosine score, label) tuple to the list sdict[qid]
index += 1;
    #compute precision (per question: does the top-ranked candidate have label 1?)
lev0 , lev1 = float(0) ,float(0)
for qid , cases in sdict.items():
        cases.sort(key = operator.itemgetter(0) , reverse = True)#sort by cosine score, descending
        score , flag = cases[0] #take the top score and its label
if flag == 1:
lev1 += 1;
if flag == 0:
lev0 += 1;
precision = lev1 /(lev0 + lev1)
    #compute MRR
    MRR , num_Q = float(0) , float(0);
    for qid ,cases in sdict.items():
        #cases are already sorted by score above
        reciprocal_rank = 0.0;#reset per question, so a question with no relevant answer is skipped
        for j in range(len(cases)):
            if cases[j][1] == 1:
                reciprocal_rank = 1.0 /(j + 1);#rank of the first relevant answer
                break;
        if reciprocal_rank != 0.0:
            MRR += reciprocal_rank ;
            num_Q += 1;
    if num_Q != 0.0:
        MRR /= num_Q;
    else :
        MRR = 0.0;
    #compute MAP
    MAP , num_Q , single_avg , single_num = float(0) , float(0) , float(0) , float(0);
    #single_avg: the average precision of a single question
for qid ,cases in sdict.items():
#cases.sort(key = operator.itemgetter(0) , reverse = True)
single_avg = 0.0;
single_num = 0.0;
for j in range(len(cases)):
if cases[j][1] == 1:
single_num += 1;
single_avg += single_num/(j + 1);
if single_num != 0.0 and single_num != len(cases):
single_avg /= single_num;
else:
single_avg = 0.0;
if single_avg != 0.0:
MAP += single_avg ;
num_Q += 1;
if num_Q != 0.0:
MAP /= num_Q;
else :
MAP = 0.0;
return tot_COST , avg_COST , MAP , MRR , precision;
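#A tiny worked example of the metrics above (illustrative labels, not from the
#data): if one question's candidates, sorted by score, carry labels [0, 1, 1],
#then reciprocal rank = 1/2 (first relevant answer at rank 2) and average
#precision = (1/2 + 2/3) / 2 ~= 0.583; the question counts as a miss for
#precision because its top-ranked candidate has label 0.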
class QACNN(object):
def __init__(self,que, ans, label, word_embeddings , batch_size,max_sequence_len,embedding_size , filter_sizes , num_filters, keep_prob):
        '''
        que, ans: each row is one id-encoded sentence; batch_size rows, max_sequence_len columns.
        max_sequence_len: length of the longest sentence.
        '''
rng = np.random.RandomState(23455);
self.params = [];
id_vec = theano.shared(name = "id_vec" , value = word_embeddings);
        que_matrix = id_vec[T.cast(que.flatten() , dtype = "int32")]#flatten() yields one row of batch_size * max_sequence_len ids; indexing id_vec with it yields batch_size * max_sequence_len rows of embedding_size columns
        ans_matrix = id_vec[T.cast(ans.flatten() , dtype = "int32")]#every max_sequence_len consecutive rows form one sentence
        que_x = que_matrix.reshape((batch_size , 1 , max_sequence_len , embedding_size))
        ans_x = ans_matrix.reshape((batch_size , 1 , max_sequence_len , embedding_size))#reshape into conv2d's layout [batch_size, num_feature_maps, img_height, img_width]
self.dbg_x1 = que_x;
que_vec , ans_vec = [] , [];
        #filter_sizes is a list of filter heights;
        #the different filter sizes run in parallel rather than cascaded: every filter's width == embedding_size, and each pooling window is (max_sequence_len - filter_height + 1, 1), so each filter size yields a num_filters-dimensional vector
for i,filter_size in enumerate(filter_sizes):
            filter_shape = (num_filters , 1 , filter_size , embedding_size)#conv2d(A,W) expects W as [output feature maps, input feature maps, filter_height, filter_width]
            img_shape =(batch_size , 1 , max_sequence_len , embedding_size)#and A as [batch_size, input feature maps, img_height, img_width]
            fan_in = np.prod(filter_shape[1:])#inputs per unit: input feature maps * filter_height * filter_width
            fan_out = np.prod(filter_shape[2:]) * filter_shape[0] // (max_sequence_len - filter_size + 1)#outputs per unit, divided by the pooling size
            W_bound = np.sqrt(6./(fan_in + fan_out))#Glorot/Xavier uniform initialization bound
W = theano.shared(
name = "W_%d"%i,
value =
np.array(
rng.uniform(low = - W_bound , high = W_bound , size = filter_shape),
dtype = theano.config.floatX
),
borrow = True)
b_values = np.zeros((filter_shape[0],),dtype = theano.config.floatX)
b = theano.shared(name = "b_%d"%i , value = b_values,borrow = True)
            #convolution + max pooling
conv_out = conv2d(input = que_x , filters = W , filter_shape = filter_shape , input_shape = img_shape)
            #after convolution: [batch_size, num_filters, max_sequence_len - filter_size + 1, 1]
pooled_out = pool.pool_2d(input = conv_out , ds = (max_sequence_len - filter_size + 1 , 1) , ignore_border = True , mode = "max")
            #after pooling: [batch_size, num_filters, 1, 1]
pooled_active = T.tanh(pooled_out + b.dimshuffle('x' , 0 , 'x' , 'x'))
que_vec.append(pooled_active)
conv_out = conv2d(input = ans_x , filters = W , filter_shape = filter_shape , input_shape = img_shape)
pooled_out = pool.pool_2d(input = conv_out , ds = (max_sequence_len - filter_size + 1 , 1) , ignore_border = True , mode = "max")
pooled_active = T.tanh(pooled_out + b.dimshuffle('x' , 0 , 'x' , 'x'))
ans_vec.append(pooled_active)
self.params += [W,b]
self.dbg_conv_out = conv_out.shape
num_filters_total = num_filters * len(filter_sizes)
self.dbg_outputs_que = que_vec[0].shape
que_vec_flat = T.reshape(T.concatenate(que_vec , axis = 1) , [batch_size , num_filters_total])
ans_vec_flat = T.reshape(T.concatenate(ans_vec , axis = 1) , [batch_size , num_filters_total])
        #the question/answer sentence vectors are complete at this point
drop_que = self._dropout(rng , que_vec_flat , keep_prob)
drop_ans = self._dropout(rng , ans_vec_flat , keep_prob)
        #dropout applied to both sentence vectors above
        #cosine similarity between question and answer
len_que = T.sqrt(T.sum(drop_que * drop_que , axis = 1))
len_ans = T.sqrt(T.sum(drop_ans * drop_ans , axis = 1))
cos_sim = T.sum(drop_que * drop_ans , axis = 1) / (len_que * len_ans)
self.cos_sim = cos_sim;
        #loss
        def loss(label , cos_sim , batch_size):
            cos_sim = (cos_sim + 1.0)/2.0 #shift and rescale from [-1,1] into [0,1]
            cos_sim = T.clip(cos_sim , 1e-10 , 1-1e-10)#the clip protects the 0 side; on the 1 side, 1 - 1e-10 can round back to exactly 1.0, handled by the switch below
cross_entropy_loss = T.nnet.binary_crossentropy(cos_sim, label)
ones = theano.shared(numpy.ones(batch_size , dtype = theano.config.floatX), borrow = True);
            cross_entropy_loss = T.switch(T.eq(cos_sim, ones),- ( label * T.log(cos_sim) + (1 - label) * T.log(1e-10) ),cross_entropy_loss)#when cos_sim is exactly 1, substitute log(1e-10) for log(1 - cos_sim) to avoid -inf
return cross_entropy_loss;
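        #A numeric sketch of the loss (illustrative values, not from the data):
        #cos_sim = 0.8 maps to (0.8 + 1)/2 = 0.9; with label = 1 the per-pair
        #loss is -log(0.9) ~= 0.105, with label = 0 it is -log(0.1) ~= 2.303.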
cross_entropy_loss = loss(label , cos_sim , batch_size);
self.cost = T.sum(cross_entropy_loss , acc_dtype = theano.config.floatX);
def _dropout(self , rng , layer , keep_prob):
srng = T.shared_randomstreams.RandomStreams(rng.randint(123456));
        mask = srng.binomial(n = 1, p = keep_prob , size = layer.shape)#binomial with n = 1 trial: a 0/1 mask keeping each unit with probability keep_prob
        output = layer * T.cast(mask,theano.config.floatX)
        output = output / keep_prob;#inverted dropout: rescale so the expected activation is unchanged
return output;
def save(self,folder = "save_my_params_cnn"):
if not os.path.exists(folder):
os.mkdir(folder);
for param in self.params:
numpy.save(os.path.join(folder,param.name + ".npy"),param.get_value());
def load(self,folder = "save_my_params_cnn"):
for param in self.params:
            param.set_value(numpy.load(os.path.join(folder , param.name + ".npy")));#load by path so the file is opened in binary mode
def Show(self):
for param in self.params:
print(param.name , param.get_value());
if __name__=="__main__":
params = {
"batch_size" : int(256),#一次训练的句子数
"filter_size" : [2,3,4],
"num_filters" : 500,
"embedding_size" : 300,#词向量的维度
"learning_rate" : 0.0005,#原来是0.001,感觉效果不好
"n_epoches" : 1000,
"valid_frequency" : 79,#在下面训练前会改为n_epoch
"keep_prob" : 0.5,#used for drop out
"max_sequence_len" : 236
};
printf("params = " + str(params));
    #defining the theano functions outside the class and the computation graph inside it seems cleaner to me
    que , ans , label = T.matrix("que") , T.matrix("ans") , T.vector("label");
    keep_prob = T.fscalar("keep_prob");#anything that participates in the graph must be a tensor
model = QACNN(que = que , ans = ans , label = label , keep_prob = keep_prob ,word_embeddings = id_vec ,
batch_size = params['batch_size'] , max_sequence_len = params['max_sequence_len'] ,
embedding_size = params['embedding_size'] ,filter_sizes = params["filter_size"] , num_filters = params["num_filters"])
dbg_x1 = model.dbg_x1;# = que_x
dbg_outputs_que = model.dbg_outputs_que # = que_vec[0].shape
    #The class only builds the computation graph; actually driving it -- binding inputs and outputs to trigger evaluation -- and the gradient back-propagation are still undefined here.
    #Back-propagation is tied to the graph through the theano.function definitions below.
cost , cos_sim = model.cost , model.cos_sim;
graph_params = model.params;
grads = T.grad(cost , graph_params);
learning_rate = T.dscalar("learning_rate");
updates = [(param_i , param_i - learning_rate * grad_i) for param_i , grad_i in zip(graph_params ,grads) ]
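    #A plain-SGD reading of `updates` above: each call to train_model replaces
    #every parameter theta with theta - learning_rate * d(cost)/d(theta).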
qt , at , lt = T.matrix("q1") , T.matrix("a1") , T.vector("l1");
prob = T.fscalar("prob");
train_model = theano.function(inputs = [ qt , at , lt , prob , learning_rate] ,
outputs = [ cost , dbg_x1 , dbg_outputs_que],
updates = updates,
givens = {
que : qt , ans : at ,label : lt , keep_prob : prob
})
qv, av , lv = T.matrix("qv") , T.matrix("av") , T.vector("lv");
validate_model = theano.function(inputs = [qv , av , lv , prob],
outputs = [cost , cos_sim , dbg_x1 , dbg_outputs_que],
#updates = updates,
givens = {
que : qv , ans : av , label : lv , keep_prob : prob
})
train_List = load_data_list(trainfile);
valid_List = load_data_list(validfile);
printf("train_start");
print("train_start")
n_train_batches = len(train_List) // params["batch_size"];
params["valid_frequency"] = n_train_batches + 1;# =78 + 1
params["patience"] = int((n_train_batches + 1)*1.5);
epoch = 0;
done_looping = False;
last_pos = 0;
best_MAP = -numpy.inf;
best_MRR = -numpy.inf;
best_COST = numpy.inf;
best_PREC = -numpy.inf;
index = 0;
valid_times = 0;
last_update_valid_time = 0;
while(epoch < params["n_epoches"] )and (not done_looping):
epoch = epoch + 1;
print("epoch = ",epoch);
for minibatch_index in range(n_train_batches + 1):
index += 1;
print("index = ",index);
train_que,train_ans,train_label = load_batch_data(train_List,last_pos,params['batch_size']);
last_pos = last_pos + params["batch_size"];
if last_pos >= len(train_List):
last_pos = last_pos % len(train_List);
cost , dbg_x1 , dbg_outputs_que = train_model(train_que, train_ans , train_label , params["keep_prob"],params['learning_rate'])
printf("epoch : " + str(epoch) + " index :" + str(index) + " cost = " + str(cost));
print("epoch : " + str(epoch) + "index :" + str(index) + " cost = " + str(cost))
if index % params["valid_frequency"] == 0:
valid_times += 1;
flag = False
printf("validation....")
print("validation....")
COST , avg_COST , MAP , MRR , PREC = validation(validate_model , valid_List , params["batch_size"])
printf("tot_COST=" + str(COST) + " avg_COST=" + str(avg_COST) + " MAP=" +str(MAP) + " MRR=" +str(MRR) + "prec="+str(PREC));
print("tot_COST=" + str(COST) + " avg_COST=" + str(avg_COST) + " MAP=" +str(MAP) + " MRR=" +str(MRR) + "prec="+str(PREC));
if MAP > best_MAP:
best_MAP = MAP;
flag = True;
if MRR > best_MRR:
best_MRR = MRR;
flag = True;
                if avg_COST < best_COST:#compare the average rather than the total cost, because the final batch may contain fewer fresh sentences
best_COST = avg_COST;
flag = True;
if PREC > best_PREC:
best_PREC = PREC;
flag = True;
if flag:
last_update_valid_time += 1;
printf("last_update_valid_time = " + str(last_update_valid_time) + " best_cost = "+ str(best_COST));
                if ( valid_times - last_update_valid_time ) > params["patience"] :#if nothing improves for a long time there is no point continuing at this rate; concretely: once a full pass over the training batches plus about half of the next epoch goes by with no improvement, halve the learning rate
                    params['learning_rate'] /= 2.0;
                    last_update_valid_time = valid_times;#after halving lr, reset the gap between the two counters to 0
if (params['learning_rate'] < 1e-8 ):
done_looping = True;
break;
printf("train_over");
print("train_over");
model.save();
    ##################
    # test the model #
    ##################
del train_List;
del valid_List;
test_List = load_data_list(testfile)
COST , avg_COST , MAP , MRR , PREC = validation(validate_model , test_List ,params["batch_size"]);
printf("test....");
print("test....");
printf("tot_COST=" + str(COST) + " avg_COST=" + str(avg_COST) + " MAP=" +str(MAP) + " MRR=" +str(MRR) + "prec="+str(PREC));
print("tot_COST=" + str(COST) + " avg_COST=" + str(avg_COST) + " MAP=" +str(MAP) + " MRR=" +str(MRR) + "prec="+str(PREC));
printf("test over")
print("test over")