# kpsattentionrnn.py
# Attention-based seq2seq translation (English -> French) with Keras.
import warnings
warnings.filterwarnings("ignore")
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Reshape
from keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
import keras.backend as K
import keras
import numpy as np
import random
import tqdm
import matplotlib.pyplot as plt
# English source data
with open("data/small_vocab_en", "r", encoding="utf-8") as f:
source_text = f.read()
# French target data
with open("data/small_vocab_fr", "r", encoding="utf-8") as f:
target_text = f.read()
view_sentence_range = (0, 10)
# Split the source text on whitespace to count roughly how many distinct words it contains
print('Dataset Stats')
print('Roughly the number of unique words: {}'.format(len(set(source_text.split()))))
# English corpus statistics
print("-" * 5 + "English Text" + "-" * 5)
sentences = source_text.split('\n')
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))
print('Max number of words in a sentence: {}'.format(np.max(word_counts)))
# French corpus statistics
print()
print("-" * 5 + "French Text" + "-" * 5)
sentences = target_text.split('\n')
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))
print('Max number of words in a sentence: {}'.format(np.max(word_counts)))
# Print the first 10 sentences of each corpus
print()
print('English sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(source_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))
print()
print('French sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))
# Build the English vocabulary
source_vocab = list(set(source_text.lower().split()))
# Build the French vocabulary
target_vocab = list(set(target_text.lower().split()))
print("The size of English vocab is : {}".format(len(source_vocab)))
print("The size of French vocab is : {}".format(len(target_vocab)))
# Add special tokens
SOURCE_CODES = ['<PAD>', '<UNK>']
TARGET_CODES = ['<PAD>', '<EOS>', '<UNK>', '<GO>']
# Build the word<->index lookup tables for the English corpus
source_vocab_to_int = {word: idx for idx, word in enumerate(SOURCE_CODES + source_vocab)}
source_int_to_vocab = {idx: word for idx, word in enumerate(SOURCE_CODES + source_vocab)}
# Build the word<->index lookup tables for the French corpus
target_vocab_to_int = {word: idx for idx, word in enumerate(TARGET_CODES + target_vocab)}
target_int_to_vocab = {idx: word for idx, word in enumerate(TARGET_CODES + target_vocab)}
print("The size of English Map is : {}".format(len(source_vocab_to_int)))
print("The size of French Map is : {}".format(len(target_vocab_to_int)))
def text_to_int(sentence, map_dict, max_length=20, is_target=False):
    """
    Encode a sentence as a list of integer indices.
    @param sentence: a full sentence, as a str
    @param map_dict: mapping from word to integer index
    @param max_length: maximum sentence length
    @param is_target: whether this is a target-language sentence;
                      target sentences get "<EOS>" appended at the end
    """
    text_to_idx = []
    # Integer indices of the special tokens
    unk_idx = map_dict.get("<UNK>")
    pad_idx = map_dict.get("<PAD>")
    eos_idx = map_dict.get("<EOS>")
    # Source sentences are encoded word by word
    if not is_target:
        for word in sentence.split():
            text_to_idx.append(map_dict.get(word, unk_idx))
    # Target sentences additionally get "<EOS>" appended
    else:
        for word in sentence.split():
            text_to_idx.append(map_dict.get(word, unk_idx))
        text_to_idx.append(eos_idx)
    # Truncate sentences that exceed the maximum length
    if len(text_to_idx) > max_length:
        return text_to_idx[:max_length]
    # Pad shorter sentences with "<PAD>"
    text_to_idx = text_to_idx + [pad_idx] * (max_length - len(text_to_idx))
    return text_to_idx
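# Quick illustration of text_to_int (the indices shown are hypothetical,
# since they depend on the order in which set() enumerated the vocabulary):
#   text_to_int("new jersey is sometimes quiet", source_vocab_to_int, 10)
#   -> e.g. [51, 903, 7, 213, 662, 0, 0, 0, 0, 0]   # trailing 0s are <PAD>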
# Encode the English corpus, capping English sentences at 20 tokens
Tx = 20
source_text_to_int = []
for sentence in tqdm.tqdm(source_text.split("\n")):
    source_text_to_int.append(text_to_int(sentence, source_vocab_to_int, Tx, is_target=False))
# Encode the French corpus, capping French sentences at 25 tokens
Ty = 25
target_text_to_int = []
for sentence in tqdm.tqdm(target_text.split("\n")):
    target_text_to_int.append(text_to_int(sentence, target_vocab_to_int, Ty, is_target=True))
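# Note: the target cap (Ty=25) is longer than the source cap (Tx=20) to leave
# room for the appended "<EOS>" token and for French translations, which tend
# to run a little longer than their English sources.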
random_index = 77
print("-" * 5 + "English example" + "-" * 5)
print(source_text.split("\n")[random_index])
print(source_text_to_int[random_index])
print()
print("-" * 5 + "French example" + "-" * 5)
print(target_text.split("\n")[random_index])
print(target_text_to_int[random_index])
X = np.array(source_text_to_int)
Y = np.array(target_text_to_int)
# One-hot encode X and Y
Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(source_vocab_to_int)), X)))
Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(target_vocab_to_int)), Y)))
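# Only Yoh (shape (m, Ty, vocab)) is consumed below, as one-hot training
# targets; the model feeds the integer-encoded X straight into an Embedding
# layer, so Xoh is never used.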
# Custom softmax that can normalize over an arbitrary axis
def softmax(x, axis=1):
    """
    Softmax activation function.
    """
    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
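# In this model the softmax is applied to attention energies of shape
# (m, Tx, 1), so the default axis=1 normalizes the weights over the Tx input
# positions rather than over the last axis, as Keras's built-in softmax would.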
# Shared layer objects, defined once at the top level so the same weights are
# reused every time one_step_attention is called
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis=-1)
densor_tanh = Dense(32, activation="tanh")
densor_relu = Dense(1, activation="relu")
activator = Activation(softmax, name='attention_weights')
dotor = Dot(axes=1)
def one_step_attention(a, s_prev):
    """
    One step of the attention mechanism; returns the weighted context vector.
    @param a: hidden states of the Bi-RNN encoder
    @param s_prev: hidden state of the decoder LSTM from the previous step
    Returns:
        context: the attention-weighted context vector
    """
    # Repeat s_prev Tx times, once per encoder position
    s_prev = repeator(s_prev)
    # Concatenate the Bi-RNN hidden states with s_prev
    concat = concatenator([a, s_prev])
    # Compute the energies
    e = densor_tanh(concat)
    energies = densor_relu(e)
    # Compute the attention weights
    alphas = activator(energies)
    # Weight the encoder states to obtain the context vector
    context = dotor([alphas, a])
    return context
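# Shape walk-through for one_step_attention, for a batch of size m:
#   a:        (m, Tx, 2*n_a)  Bi-LSTM hidden states
#   s_prev:   (m, n_s)     -> repeated to (m, Tx, n_s)
#   concat:   (m, Tx, 2*n_a + n_s)
#   energies: (m, Tx, 1)   -> alphas: softmax over the Tx axis
#   context:  (m, 1, 2*n_a)   from Dot(axes=1) of alphas and a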
# Load the pretrained GloVe word vectors
with open("/Users/mahaoyang/PycharmProjects/Enterprize/data/glove.6B.100d.txt", 'r') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
        line = line.strip().split()
        curr_word = line[0]
        words.add(curr_word)
        word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
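# Each line of the GloVe file has the form "word v1 v2 ... v100": the word
# itself followed by its 100 embedding values, all separated by spaces.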
def pretrained_embedding_layer(word_to_vec_map, source_vocab_to_int):
    """
    Build an Embedding layer and load the pretrained word vectors
    (100-dimensional here).
    @param word_to_vec_map: mapping from word to embedding vector
    @param source_vocab_to_int: mapping from word to integer index
    """
    vocab_len = len(source_vocab_to_int) + 1  # the Keras Embedding API requires +1
    emb_dim = word_to_vec_map["the"].shape[0]
    # Initialize the embedding matrix
    emb_matrix = np.zeros((vocab_len, emb_dim))
    # Fill the embedding matrix with the pretrained word vectors
    for word, index in source_vocab_to_int.items():
        word_vector = word_to_vec_map.get(word, np.zeros(emb_dim))
        emb_matrix[index, :] = word_vector
    # Define the Embedding layer and freeze its weights
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)
    # Build the layer, then load the pretrained weights
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])
    return embedding_layer
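# Words absent from GloVe, including the special tokens "<PAD>" and "<UNK>",
# fall back to all-zero vectors, and because the layer is frozen they stay
# zero throughout training.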
# Build the Embedding layer
embedding_layer = pretrained_embedding_layer(word_to_vec_map, source_vocab_to_int)
n_a = 32   # hidden size of the encoder Bi-LSTM
n_s = 128  # hidden size of the decoder LSTM
decoder_LSTM_cell = LSTM(n_s, return_state=True)
output_layer = Dense(len(target_vocab_to_int), activation=softmax)
# Layer objects shared across decoder timesteps (used inside the model function)
reshapor = Reshape((1, len(target_vocab_to_int)))
concator = Concatenate(axis=-1)
def model(Tx, Ty, n_a, n_s, source_vocab_size, target_vocab_size):
    """
    Build the model.
    @param Tx: length of the input sequence
    @param Ty: length of the output sequence
    @param n_a: number of hidden units in the encoder Bi-LSTM
    @param n_s: number of hidden units in the decoder LSTM
    @param source_vocab_size: vocabulary size of the input (English) corpus
    @param target_vocab_size: vocabulary size of the output (French) corpus
    """
    # Input layer
    X = Input(shape=(Tx,))
    # Embedding layer
    embed = embedding_layer(X)
    # Initial states of the decoder LSTM
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')
    # Initial input of the decoder LSTM (reshapor is applied inside the loop)
    out0 = Input(shape=(target_vocab_size,), name='out0')
    out = out0
    s = s0
    c = c0
    # List of model outputs, one predicted token distribution per step
    outputs = []
    # Encoder Bi-LSTM
    a = Bidirectional(LSTM(n_a, return_sequences=True))(embed)
    # Decoder: iterate Ty times, producing one output token per step
    for t in range(Ty):
        # Compute the context vector via attention
        context = one_step_attention(a, s)
        # Concatenate the context vector with the previous step's output
        context = concator([context, reshapor(out)])
        s, _, c = decoder_LSTM_cell(context, initial_state=[s, c])
        # Project the LSTM output through the dense softmax layer
        out = output_layer(s)
        # Collect the output of this step
        outputs.append(out)
    model = Model([X, s0, c0, out0], outputs)
    return model
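# Because the decoder loop is unrolled in plain Python, the shared layer
# objects (the attention layers, decoder_LSTM_cell, output_layer, reshapor,
# concator) are applied at every one of the Ty steps, so their weights are
# tied across timesteps; each step emits a softmax over the French vocabulary.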
model = model(Tx, Ty, n_a, n_s, len(source_vocab_to_int), len(target_vocab_to_int))
model.summary()
model.compile(optimizer=Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.001),
              metrics=['accuracy'],
              loss='categorical_crossentropy')
# Initialize the decoder's initial states and first input
m = X.shape[0]
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
out0 = np.zeros((m, len(target_vocab_to_int)))
# Yoh has shape (m, Ty, vocab); swapping the first two axes and listing the
# result gives Ty arrays of shape (m, vocab), one target per decoder output
outputs = list(Yoh.swapaxes(0, 1))
# Train the model
model.fit([X, s0, c0, out0], outputs, epochs=5, batch_size=128)
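# Training note: with a list of Ty outputs, Keras computes categorical
# crossentropy for each decoder step separately and sums them, so the
# reported loss is a total over the 25 output positions.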
# Save the trained weights
model.save_weights("pretrained_seq2seq_model.h5")
# Have a look at the source text
print(source_text.split("\n")[:100])
def make_prediction(sentence):
    """
    Translate the given sentence.
    """
    # Tokenize the sentence and map each word to its integer index
    unk_idx = source_vocab_to_int["<UNK>"]
    pad_idx = source_vocab_to_int["<PAD>"]
    word_idx = [source_vocab_to_int.get(word, unk_idx) for word in sentence.lower().split()]
    # Truncate to Tx tokens and pad the remainder with "<PAD>"
    word_idx = word_idx[:Tx]
    word_idx = np.array(word_idx + [pad_idx] * (Tx - len(word_idx)))
    # Fresh initial states and initial output for a single-sentence batch
    # (the global s0/c0/out0 are sized for the full training set)
    s0 = np.zeros((1, n_s))
    c0 = np.zeros((1, n_s))
    out0 = np.zeros((1, len(target_vocab_to_int)))
    # Predict; preds is a list of Ty arrays, one per output step
    preds = model.predict([word_idx.reshape(-1, Tx), s0, c0, out0])
    predictions = np.argmax(preds, axis=-1)
    # Convert the predicted indices back to words
    idx = [target_int_to_vocab.get(i[0], "<UNK>") for i in predictions]
    # Return the translated sentence
    return " ".join(idx)
your_sentence = input("Please input your sentence: ")
print(make_prediction(your_sentence))
import seaborn as sns
def plot_attention(sentence, Tx=20, Ty=25):
    """
    Visualize the attention weights.
    @param sentence: the sentence to translate, as a str
    @param Tx: input sequence length
    @param Ty: output sequence length
    """
    X = np.array(text_to_int(sentence, source_vocab_to_int, Tx))
    # Look the attention layer up by name rather than by positional index
    attention_layer = model.get_layer('attention_weights')
    f = K.function(model.inputs, [attention_layer.get_output_at(t) for t in range(Ty)])
    s0 = np.zeros((1, n_s))
    c0 = np.zeros((1, n_s))
    out0 = np.zeros((1, len(target_vocab_to_int)))
    r = f([X.reshape(-1, Tx), s0, c0, out0])
    attention_map = np.zeros((Ty, Tx))
    for t in range(Ty):
        for t_prime in range(Tx):
            attention_map[t][t_prime] = r[t][0, t_prime, 0]
    Y = make_prediction(sentence)
    source_list = sentence.split()[:Tx]
    target_list = Y.split()[:Ty]
    # Trim the map to the actual sentence lengths so the labels line up
    attention_map = attention_map[:len(target_list), :len(source_list)]
    fig, ax = plt.subplots(figsize=(20, 15))
    sns.heatmap(attention_map, xticklabels=source_list, yticklabels=target_list, cmap="YlGnBu")
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=15, rotation=90)
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=15)
    plt.show()
plot_attention("she likes mangoes , apples , and bananas .")
plot_attention("california is never pleasant during winter , and it is sometimes wonderful in december .")
from nltk.translate.bleu_score import sentence_bleu
# Store the model's translation of every sentence
fr_preds = []
# Translate every English sentence in the corpus
for sentence in tqdm.tqdm(source_text.split("\n")):
    fr_pred = make_prediction(sentence)
    # Store the translation
    fr_preds.append(fr_pred)
# Use the French sentences from the corpus as references
references = target_text.split("\n")
# Store the BLEU score of each sentence
bleu_score = []
for i in tqdm.tqdm(range(len(fr_preds))):
    # Strip the special tokens
    pred = fr_preds[i].replace("<EOS>", "").replace("<PAD>", "").rstrip()
    reference = references[i].lower()
    # Compute the BLEU score
    score = sentence_bleu([reference.split()], pred.split())
    bleu_score.append(score)
print("The BLEU score on our corpus is about {}".format(sum(bleu_score) / len(bleu_score)))