__author__ = 'Dan'

import random
import re
from collections import OrderedDict
from random import shuffle

import numpy as np
import theano
import theano.tensor as T
from nltk.tokenize import RegexpTokenizer


class DMN_SimplerGate(object):
    # We take as input a string of "facts".
    def __init__(self, num_fact_hidden_units, number_word_classes, dimension_fact_embeddings, num_episode_hidden_units, max_number_of_facts_read):
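        """Load the bAbI data, build the computation graph, and compile the
        training and classification functions.

        Note that most constructor arguments are overridden by the hard-coded
        values below (e.g. dimension_fact_embeddings and
        max_number_of_facts_read), so the values passed in are placeholders.
        """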
        dimension_fact_embeddings = 8
        print(" Starting DMN (no scan)... ")
        self.preprocess_babi_set_for_dmn()
        print(" Processing data... ")
        self.X_train, self.mask_sentences_train, self.fact_ordering_train, self.question_train, self.question_train_mask, self.Y_train, self.X_test, self.mask_sentences_test, self.fact_ordering_test, self.question_test, self.question_test_mask, self.Y_test, word2idx, self.idx2word, dimension_fact_embeddings, max_queslen, max_sentlen, total_sequence_length = self.process_data("embeddings")
        print(" Building model... ")
        number_word_classes = max(self.idx2word.keys(), key=int) + 1
        #max_fact_seqlen = max_article_len
        max_fact_seqlen = max_sentlen
        total_number_of_sentences_per_episode = int(total_sequence_length / max_fact_seqlen)
        max_number_of_facts_read = 1
        dimension_word_embeddings = 10
        max_number_of_episodes_read = 1
        self.initialization_randomization = 1
        nh = 8  # dimension of the hidden layer
        num_hidden_units = nh
        num_hidden_units_facts = num_hidden_units
        num_hidden_units_episodes = num_hidden_units_facts
        num_hidden_units_questions = num_hidden_units_episodes
        num_hidden_units_words = num_hidden_units_questions
        total_len_word_seq = total_sequence_length
        self.initialize_dmn_params(nh, num_hidden_units_words, num_hidden_units_facts, num_hidden_units_episodes, num_hidden_units_questions, dimension_word_embeddings, dimension_fact_embeddings, max_fact_seqlen, max_number_of_episodes_read, number_word_classes, total_number_of_sentences_per_episode)
        question_encoding = self.GRU_question(dimension_fact_embeddings, num_hidden_units_questions, num_hidden_units_episodes, max_queslen, dimension_word_embeddings)
        # Shape: (num_episodes_read, word_idx_in_sentence).
        word_idxs = T.lmatrix("word_indices")  # TODO: batching would need an extra batch dimension here.
        word_mask = T.lmatrix("word_mask")  # TODO: likewise a tensor for batching.
        facts_input = T.lvector("fact_indices")
        y_sentence = T.lscalar('y_sentence')

        def slice_w(x, n):
            return x[n * num_hidden_units_facts:(n + 1) * num_hidden_units_facts]

        # TODO: BATCH: the row dimension of h_prev would need one row per batch element.
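        # The word-level GRU below stacks the three gate weight matrices so a
        # single dot product computes the reset gate, update gate, and
        # candidate pre-activations at once; slice_w separates the blocks.
        # With reset gate r, update gate z, and candidate state h~, the step is
        #     h_t = (1 - z) * h_{t-1} + z * h~.
        # Note this implementation squashes r and z with tanh, whereas a
        # standard GRU uses the logistic sigmoid.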
        def gru_reader_step(x_cur_word, h_prev, w_mask):
            # Stacked weights: shape (dimension_word_embeddings, 3 * num_hidden_units_words).
            W_in_stacked = T.concatenate([self.W_word_reset_gate_x, self.W_word_update_gate_x, self.W_word_hidden_gate_x], axis=1)
            W_hid_stacked = T.concatenate([self.W_word_reset_gate_h, self.W_word_update_gate_h, self.W_word_hidden_gate_h], axis=1)
            input_n = T.dot(x_cur_word, W_in_stacked)  # TODO: BATCH: with batches this becomes (n_batches, 3 * num_hidden_units_words), and slice_w must slice columns rather than rows.
            hid_input = T.dot(h_prev, W_hid_stacked)
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = T.tanh(resetgate)
            updategate = T.tanh(updategate)
            hidden_update = slice_w(input_n, 2) + resetgate * slice_w(hid_input, 2)
            hidden_update = T.tanh(hidden_update)
            # Convex combination of the previous state and the candidate state.
            h_cur = (1 - updategate) * h_prev + updategate * hidden_update
            # Masked positions keep the previous hidden state.
            h_cur = w_mask * h_cur + (1 - w_mask) * h_prev
            return h_cur
        list_of_fact_softmaxes = []

        def brain_gru_reader(episode_reading_idx, h_prev_episode):
            h_cur = question_encoding  # initial state of the word-level GRU
            current_facts = None
            for idx in range(total_len_word_seq):
                # TODO: BATCH: one word of size (1, dim_word_embedding) is read
                # at a time; batching would pass (n_batches, dim_word_embedding).
                h_cur = gru_reader_step(self.emb[word_idxs[episode_reading_idx][idx]], h_cur, word_mask[episode_reading_idx][idx])
                if idx % max_fact_seqlen == 0:
                    output = T.tanh(T.dot(self.W_word_to_fact_vector, h_cur) + self.b_word_to_fact_vector)
                    if current_facts is None:
                        current_facts = [output]
                    else:
                        current_facts = T.concatenate((current_facts, [output]), axis=0)
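            # Attention gate over the candidate facts, in the spirit of the
            # DMN gating function g = sigmoid(W_2 tanh(W_1 z + b_1) + b_2),
            # except that z here concatenates only the question encoding and
            # the previous episode state. z_dmn is (2, 8) and G_dmn is (8, 1),
            # so dotting current_facts (2, 8) with G_dmn gives one score per
            # fact, which a softmax turns into fact weights.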
            z_dmn = T.concatenate(([question_encoding], [h_prev_episode]), axis=0)  # shape (2, 8)
            self.G_dmn = T.nnet.sigmoid(T.dot(self.W_dmn_2, T.tanh(T.dot(z_dmn, self.W_dmn_1)) + self.b_dmn_1) + self.b_dmn_2)  # shape (8, 1)
            self.cur_facts = current_facts
            # current_facts is (2, 8) and G_dmn is (8, 1), so the transposed product is (1, 2).
            self.t_dot_res = T.dot(current_facts, self.G_dmn).T
            result_of_gate = T.nnet.softmax(self.t_dot_res).T  # (2, 1): weight of each fact F_i
            list_of_fact_softmaxes.append(result_of_gate)
            brain_input = T.dot(current_facts.T, result_of_gate)  # (8, 1): gate-weighted sum of the facts
            # Stacked episode weights: each gate block is (8, 8), so the stack is (8, 24).
            W_in_stacked = T.concatenate([self.W_episode_reset_gate_x, self.W_episode_update_gate_x, self.W_episode_hidden_gate_x], axis=1)
            W_hid_stacked = T.concatenate([self.W_episode_reset_gate_h, self.W_episode_update_gate_h, self.W_episode_hidden_gate_h], axis=1)
            input_n = T.dot(brain_input.T, W_in_stacked)  # (1, 24)
            hid_input = T.dot(h_prev_episode, W_hid_stacked)  # (24,)
            self.output_in = input_n
            resetgate = slice_w(input_n[0], 0) + slice_w(hid_input, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n[0], 1)
            resetgate = T.tanh(resetgate)
            updategate = T.tanh(updategate)
            hidden_update = resetgate * slice_w(hid_input, 2) + slice_w(input_n[0], 2)
            hidden_update = T.tanh(hidden_update)
            # Convex combination of the previous episode state and the candidate.
            h_cur = (1 - updategate) * h_prev_episode + updategate * hidden_update
            return h_cur
        # Main episode ("brain") loop.
        state_episode_step = question_encoding
        for idx in range(max_number_of_facts_read):
            state_episode_step = brain_gru_reader(idx, state_episode_step)

        # W_episode_to_word has shape (number_word_classes, num_hidden_units_facts).
        output = T.nnet.softmax(T.dot(self.W_episode_to_word, state_episode_step) + self.b_episode_to_word)
        self.out_softmax = output
        p_y_given_x_sentence = output[0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=0)
        lr = T.scalar('lr')
        sentence_nll = -T.mean(T.log(p_y_given_x_sentence)[y_sentence])
        fact_nll = 0
        for idx in range(max_number_of_facts_read):
            fact_nll += -T.mean(T.log(list_of_fact_softmaxes[idx])[facts_input[idx]])
        fact_grads = T.grad(fact_nll, self.fact_params)
        fact_updates = OrderedDict((p, p - lr * g) for p, g in zip(self.fact_params, fact_grads))  # plain SGD step per parameter
        sentence_gradients = T.grad(sentence_nll, self.params, disconnected_inputs='warn')  # gradients of the NLL w.r.t. the params
        sentence_updates = OrderedDict((p, p - lr * g) for p, g in zip(self.params, sentence_gradients))  # plain SGD step per parameter
        print("Compiling fcns...")
        self.classify = theano.function(inputs=[word_idxs, word_mask, self.question_idxs, self.question_mask], outputs=y_pred)
        self.sentence_train = theano.function(inputs=[word_idxs, word_mask, self.question_idxs, self.question_mask, y_sentence, lr], outputs=sentence_nll, updates=sentence_updates)
        # Diagnostic functions, compiled without updates so that inspecting them does not take extra SGD steps.
        self.out_in = theano.function(inputs=[word_idxs, word_mask, self.question_idxs, self.question_mask, y_sentence, lr], outputs=self.output_in, on_unused_input='warn')
        self.sm = theano.function(inputs=[word_idxs, word_mask, self.question_idxs, self.question_mask, y_sentence, lr], outputs=self.out_softmax, on_unused_input='warn')
        self.fact_train = theano.function(inputs=[word_idxs, word_mask, facts_input, self.question_idxs, self.question_mask, lr], outputs=fact_nll, updates=fact_updates, on_unused_input='warn')
        #self.eval_fact_train = theano.function(inputs=[word_idxs, word_mask, facts_input, self.question_idxs, self.question_mask, lr], outputs=list_of_fact_softmaxes[0], updates=fact_updates, on_unused_input='warn')
        #self.get_gdmn = theano.function(inputs=[word_idxs, word_mask, facts_input, self.question_idxs, self.question_mask, lr], outputs=self.G_dmn, updates=fact_updates, on_unused_input='warn')
        #self.get_curf = theano.function(inputs=[word_idxs, word_mask, facts_input, self.question_idxs, self.question_mask, lr], outputs=self.cur_facts, updates=fact_updates, on_unused_input='warn')
        #self.t_dot = theano.function(inputs=[word_idxs, word_mask, facts_input, self.question_idxs, self.question_mask, lr], outputs=self.t_dot_res, updates=fact_updates, on_unused_input='warn')
        print("Done compiling!")

    def GRU_question(self, dimension_fact_embedding, num_hidden_units_questions, num_hidden_units_episodes, max_question_len, dimension_word_embeddings):
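        """Encode the question with a word-level GRU and project the final
        hidden state into the episode space.

        Returns a symbolic vector of size num_hidden_units_episodes.
        """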
        self.question_idxs = T.lmatrix("question_indices")  # one row per word of the question
        self.question_mask = T.lvector("question_mask")
        # q holds the embeddings of the question words, shape (question_len, dimension_word_embeddings).
        q = self.emb[self.question_idxs].reshape((self.question_idxs.shape[0], dimension_word_embeddings))

        def slice_w(x, n):
            return x[n * num_hidden_units_questions:(n + 1) * num_hidden_units_questions]

        def question_gru_recursion(x_cur, h_prev, q_mask):
            W_in_stacked = T.concatenate([self.W_question_reset_gate_x, self.W_question_update_gate_x, self.W_question_hidden_gate_x], axis=1)
            W_hid_stacked = T.concatenate([self.W_question_reset_gate_h, self.W_question_update_gate_h, self.W_question_hidden_gate_h], axis=1)
            input_n = T.dot(x_cur, W_in_stacked)
            hid_input = T.dot(h_prev, W_hid_stacked)
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = T.tanh(resetgate)
            updategate = T.tanh(updategate)
            hidden_update = slice_w(input_n, 2) + resetgate * slice_w(hid_input, 2)
            hidden_update = T.tanh(hidden_update)
            # Convex combination of the previous state and the candidate state.
            h_cur = (1 - updategate) * h_prev + updategate * hidden_update
            h_cur = q_mask * h_cur + (1 - q_mask) * h_prev
            return h_cur

        state = self.h0_questions
        for jdx in range(max_question_len):
            state = question_gru_recursion(q[jdx], state, self.question_mask[jdx])
        return T.tanh(T.dot(state, self.W_question_to_vector) + self.b_question_to_vector)

    def idx2sentence(self, x):
        cur_sent = ""
        for w in x:
            cur_sent += " " + self.idx2word[int(w)]
        return cur_sent

    def train(self):
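        """Run plain-SGD training: each epoch makes one pass over the fact-gate
        loss, one pass over the answer-word loss, and then reports accuracy on
        the training and test sets.
        """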
        lr = .01
        max_epochs = 20000
        print(" Starting training...")
        last_ll = 100000
        min_ll_seen = 10000
        best_correct_seen = 0
        for e in range(max_epochs):
            shuffled_idxs = [i for i in range(len(self.X_train))]
            shuffle(shuffled_idxs)
            ll = 0
            num_train_correct, tot_num_train = 0, 0
            ll_fact = 0
            for idx in shuffled_idxs:
                x, word_mask, fact_ordering, q, mask_question, y = self.X_train[idx], self.mask_sentences_train[idx], self.fact_ordering_train[idx], self.question_train[idx], self.question_train_mask[idx], self.Y_train[idx]
                ll_fact += self.fact_train([x], [word_mask], fact_ordering, q, mask_question, lr)
            print(" This is the ll fact: ", ll_fact)
            for idx in shuffled_idxs:
                x, word_mask, fact_ordering, q, mask_question, y = self.X_train[idx], self.mask_sentences_train[idx], self.fact_ordering_train[idx], self.question_train[idx], self.question_train_mask[idx], self.Y_train[idx]
                ll += self.sentence_train([x], [word_mask], q, mask_question, y, lr)
                # Diagnostic forward passes (values unused).
                outin = self.out_in([x], [word_mask], q, mask_question, y, lr)
                sm = self.sm([x], [word_mask], q, mask_question, y, lr)
            shuffle(shuffled_idxs)
            for idx in shuffled_idxs:
                x, word_mask, fact_ordering, q, mask_question, y = self.X_train[idx], self.mask_sentences_train[idx], self.fact_ordering_train[idx], self.question_train[idx], self.question_train_mask[idx], self.Y_train[idx]
                predictions_test = self.classify([x], [word_mask], q, mask_question)
                if predictions_test == y:
                    num_train_correct += 1
                tot_num_train += 1
            print(" ratio training data predicted correctly: ", num_train_correct / tot_num_train)
            correct = 0
            total_tests = 0
            for idx in range(len(self.X_test)):
                x, word_mask, fact_ordering, q, mask_question, y = self.X_test[idx], self.mask_sentences_test[idx], self.fact_ordering_test[idx], self.question_test[idx], self.question_test_mask[idx], self.Y_test[idx]
                predictions_test = self.classify([x], [word_mask], q, mask_question)
                if predictions_test == y:
                    correct += 1
                total_tests += 1
            print(" epoch: ", e, " training ll: ", ll, " ll improvement: ", last_ll - ll, " ratio correct: ", correct / total_tests)
            if ll < min_ll_seen:
                min_ll_seen = ll
            if correct / total_tests > best_correct_seen:
                best_correct_seen = correct / total_tests
            print(" best ll so far: ", min_ll_seen, " and best ratio: ", best_correct_seen)
            last_ll = ll
            # Possible learning-rate schedules, left disabled:
            # lr = 0.95 * lr
            # if e % 25 == 0:
            #     lr /= 2

    def preprocess_babi_set_for_dmn(self):
        filename_train = 'qa1_single-supporting-fact_train.txt'
        filename_test = 'qa1_single-supporting-fact_test.txt'
        filename_output_train = 'babi_train1.txt'
        filename_output_test = 'babi_test1.txt'
        self._write_file(filename_train, filename_output_train)
        self._write_file(filename_test, filename_output_test)

    def _write_file(self, filename_input, filename_output):
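        """Rewrite a raw bAbI qa1 file into this model's intermediate format.

        For each question, the two most recent sentences (one mentioning the
        asked-about person, one not) are written in random order, followed by
        "$ <flipped>" (1 if the pair was swapped), "@ <question>", and
        "? <answer>".
        """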
        max_babi_article_len = 15
        version = "simple"
        with open(filename_input, encoding='utf-8') as a, open(filename_output, 'w+') as b:
            cur_idx, cur_qa_set, cur_article = 0, [], []
            for line in a:
                if "?" not in line:
                    cur_article.append((''.join([i for i in line if not i.isdigit()]))[1:])
                else:
                    answer_question = re.split(r'\t+', line.strip())
                    cur_name = answer_question[0].split()[3][:-1]
                    if version == "simple":
                        written_sents = []
                        for sent in reversed(cur_article):
                            if cur_name in sent:
                                written_sents.append(sent)
                                break
                        for sent in reversed(cur_article):
                            if cur_name not in sent:
                                written_sents.append(sent)
                                break
                        flipped = 0
                        if len(written_sents) > 1 and random.random() < 0.5:
                            flipped = 1
                            written_sents[0], written_sents[1] = written_sents[1], written_sents[0]
                        for w in written_sents:
                            b.write(w)
                        b.write("$ " + str(flipped) + "\n")
                    else:
                        for sent in cur_article:
                            b.write(sent)
                    b.write("@ " + answer_question[0][2:-2] + "\n")
                    b.write("? " + answer_question[1] + "\n")
                    cur_idx += 1
                    if cur_idx == max_babi_article_len:
                        cur_idx, cur_qa_set, cur_article = 0, [], []

    def process_data(self, type_of_embedding):
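        """Read the intermediate bAbI files, build the vocabulary, and turn
        articles, questions, and answers into padded index arrays plus masks.

        Returns the train/test arrays along with the vocabulary maps and the
        sequence-length constants the model needs.
        """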
        filename_train = 'babi_train1.txt'
        filename_test = 'babi_test1.txt'
        X_train, mask_sentences_train, mask_articles_train, Question_train, Question_train_mask, Y_train, X_test, mask_sentences_test, mask_articles_test, Question_test, Question_test_mask, Y_test, max_queslen = [], [], [], [], [], [], [], [], [], [], [], [], 0
        fact_ordering_train, fact_ordering_test = [], []
        cur_idx = 1
        word2idx = {}
        idx2word = {}
        tokenizer = RegexpTokenizer(r'\w+')
        word2idx["<BLANK>"] = 0
        idx2word[0] = "<BLANK>"
        max_article_len = 0
        max_sentence_len = 0
        max_queslen = 0
        with open(filename_train, encoding='utf-8') as f:
            cur_article = []
            for line in f:
                if "?" not in line and "@" not in line and "$" not in line:
                    cur_sentence = []
                    for w in tokenizer.tokenize(line):
                        cur_sentence.append(w.strip().lower())
                    cur_article.append(cur_sentence)
                    if len(cur_sentence) > max_sentence_len:
                        max_sentence_len = len(cur_sentence)
                else:
                    fact_ordering_train.append(np.asarray([int(line[2:])], dtype='int32'))
                    question_phrase = next(f)[2:].split()
                    cur_question = []
                    if len(cur_article) > max_article_len:
                        max_article_len = len(cur_article)
                    if len(question_phrase) > max_queslen:
                        max_queslen = len(question_phrase)
                    for w in question_phrase:
                        cur_question.append(w.strip().lower())
                    Question_train.append(cur_question)
                    X_train.append(cur_article)
                    Y_train.append(next(f)[2:].strip())
                    cur_article = []
        with open(filename_test, encoding='utf-8') as f:
            cur_article = []
            for line in f:
                if "?" not in line and "@" not in line and "$" not in line:
                    cur_sentence = []
                    for w in tokenizer.tokenize(line):
                        cur_sentence.append(w.strip().lower())
                    cur_article.append(cur_sentence)
                    if len(cur_sentence) > max_sentence_len:
                        max_sentence_len = len(cur_sentence)
                else:
                    fact_ordering_test.append(np.asarray([int(line[2:])], dtype='int32'))
                    question_phrase = next(f)[2:].split()
                    cur_question = []
                    if len(cur_article) > max_article_len:
                        max_article_len = len(cur_article)
                    if len(question_phrase) > max_queslen:
                        max_queslen = len(question_phrase)
                    for w in question_phrase:
                        cur_question.append(w.strip().lower())
                    Question_test.append(cur_question)
                    X_test.append(cur_article)
                    Y_test.append(next(f)[2:].strip())
                    cur_article = []
        for l in Question_train:
            cur_mask = np.zeros(max_queslen, dtype='int32')
            cur_mask[0:len(l)] = 1
            Question_train_mask.append(cur_mask)
        for l in Question_test:
            cur_mask = np.zeros(max_queslen, dtype='int32')
            cur_mask[0:len(l)] = 1
            Question_test_mask.append(cur_mask)
        for l in X_train + X_test:
            for s in l:
                for w in s:
                    if w not in word2idx:
                        word2idx[w] = cur_idx
                        idx2word[cur_idx] = w
                        cur_idx += 1
        for y in Y_test + Y_train:
            if y not in word2idx:
                word2idx[y] = cur_idx
                idx2word[cur_idx] = y
                cur_idx += 1
        for q in Question_train + Question_test:
            for w in q:
                if w not in word2idx:
                    word2idx[w] = cur_idx
                    idx2word[cur_idx] = w
                    cur_idx += 1
        word2idx["<EOS>"] = cur_idx
        idx2word[cur_idx] = "<EOS>"
        cur_idx += 1
        X_train_vec, Question_train_vec, Y_train_vec, X_test_vec, Question_test_vec, Y_test_vec = [], [], [], [], [], []
        for article in X_train:
            cur_article = np.zeros((max_article_len, max_sentence_len), dtype='int32')
            for idx, fact in enumerate(article):
                for jdx, word in enumerate(fact):
                    cur_article[idx][jdx] = word2idx[word]
            X_train_vec.append(np.asarray(cur_article))
        for article in X_test:
            cur_article = np.zeros((max_article_len, max_sentence_len), dtype='int32')
            for idx, fact in enumerate(article):
                for jdx, word in enumerate(fact):
                    cur_article[idx][jdx] = word2idx[word]
            X_test_vec.append(np.asarray(cur_article))
        for y in Y_train:
            if type_of_embedding == "embeddings":
                new_vec = word2idx[y]
            else:
                new_vec = np.zeros((len(word2idx)))
                new_vec[word2idx[y]] = 1
            Y_train_vec.append(new_vec)
        for y in Y_test:
            if type_of_embedding == "embeddings":
                new_vec = word2idx[y]
            else:
                new_vec = np.zeros((len(word2idx)))
                new_vec[word2idx[y]] = 1
            Y_test_vec.append(new_vec)
        for q in Question_train:
            cur_question = []
            for w in q:
                if type_of_embedding == "embeddings":
                    new_vec = [word2idx[w]]
                else:
                    raise NotImplementedError("one-hot question encodings are not implemented")
                cur_question.append(new_vec)
            Question_train_vec.append(np.asmatrix(cur_question))
        for q in Question_test:
            cur_question = []
            for w in q:
                if type_of_embedding == "embeddings":
                    new_vec = [word2idx[w]]
                else:
                    raise NotImplementedError("one-hot question encodings are not implemented")
                cur_question.append(new_vec)
            Question_test_vec.append(np.asmatrix(cur_question))
        # Pad all questions to the same length.
        new_question_train_vec = []
        for q in Question_train_vec:
            for added_el in range(len(q), max_queslen):
                q = np.concatenate((q, [[0]]), axis=0)
            new_question_train_vec.append(q)
        Question_train_vec = new_question_train_vec
        new_question_test_vec = []
        for q in Question_test_vec:
            for added_el in range(len(q), max_queslen):
                q = np.concatenate((q, [[0]]), axis=0)
            new_question_test_vec.append(q)
        Question_test_vec = new_question_test_vec
        assert(len(X_test_vec) == len(Y_test_vec))
        # Append an <EOS> slot to every sentence and build the word-level masks.
        total_sequence_length = max_sentence_len * max_article_len + max_article_len
        max_sentence_len = max_sentence_len + 1
        for article in X_train:
            cur_mask_sentence = np.zeros(0, dtype='int32')
            for sentence in article:
                cur_mask_sentence_new = np.zeros(max_sentence_len, dtype='int32')
                cur_mask_sentence_new[0:len(sentence)] = 1
                cur_mask_sentence_new[-1] = 1  # keep the <EOS> position active
                cur_mask_sentence = np.concatenate((cur_mask_sentence, cur_mask_sentence_new), axis=0)
            if len(cur_mask_sentence) < total_sequence_length:
                # Pad a missing sentence slot; this hard-coded block assumes max_sentence_len == 7.
                cur_mask_sentence = np.concatenate((cur_mask_sentence, [0, 0, 0, 0, 0, 0, 1]), axis=0)
            mask_sentences_train.append(cur_mask_sentence)
        for article in X_test:
            cur_mask_sentence = np.zeros(0, dtype='int32')
            for sentence in article:
                cur_mask_sentence_new = np.zeros(max_sentence_len, dtype='int32')
                cur_mask_sentence_new[0:len(sentence)] = 1
                cur_mask_sentence_new[-1] = 1  # keep the <EOS> position active
                cur_mask_sentence = np.concatenate((cur_mask_sentence, cur_mask_sentence_new), axis=0)
            if len(cur_mask_sentence) < total_sequence_length:
                # Pad a missing sentence slot; this hard-coded block assumes max_sentence_len == 7.
                cur_mask_sentence = np.concatenate((cur_mask_sentence, [0, 0, 0, 0, 0, 0, 1]), axis=0)
            mask_sentences_test.append(cur_mask_sentence)
        X_train_vec_new = np.zeros((len(X_train_vec), total_sequence_length), dtype='int32')
        X_test_vec_new = np.zeros((len(X_test_vec), total_sequence_length), dtype='int32')
        for idx, x in enumerate(X_train_vec):
            cur_sentence = np.zeros(total_sequence_length, dtype='int32')
            for jdx, s in enumerate(x):
                res = [word2idx["<EOS>"]]
                cur_sentence[jdx * max_sentence_len:(jdx + 1) * max_sentence_len] = np.concatenate((s, res), axis=0)
            X_train_vec_new[idx, :] = cur_sentence
        for idx, x in enumerate(X_test_vec):
            cur_sentence = np.zeros(total_sequence_length, dtype='int32')
            for jdx, s in enumerate(x):
                res = [word2idx["<EOS>"]]
                cur_sentence[jdx * max_sentence_len:(jdx + 1) * max_sentence_len] = np.concatenate((s, res), axis=0)
            X_test_vec_new[idx, :] = cur_sentence
        X_train_vec = X_train_vec_new
        X_test_vec = X_test_vec_new
        assert(len(X_train_vec[0]) == total_sequence_length)
        assert(total_sequence_length == max_article_len * max_sentence_len)
        return X_train_vec, mask_sentences_train, fact_ordering_train, Question_train_vec, Question_train_mask, Y_train_vec, X_test_vec, mask_sentences_test, fact_ordering_test, Question_test_vec, Question_test_mask, Y_test_vec, word2idx, idx2word, len(word2idx), max_queslen, max_sentence_len, total_sequence_length

    def initialize_dmn_params(self, nh, num_hidden_units_words, num_hidden_units_facts, num_hidden_units_episodes, num_hidden_units_questions, dimension_word_embeddings, dimension_fact_embeddings, max_fact_seqlen, max_number_of_episodes_read, number_word_classes, total_number_of_sentences_per_episode):
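        """Create the shared variables for the word, episode, question, and
        gate parameters, all initialized uniformly in [-1, 1], and collect them
        into self.params (answer loss) and self.fact_params (gate loss).
        """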
        # Initializers
        self.emb = theano.shared(name='embeddings_prob', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (number_word_classes, dimension_word_embeddings)).astype(theano.config.floatX))
        #self.h0_facts_reading_1 = theano.shared(name='h0_facts', value=np.zeros(nh, dtype=theano.config.floatX))
        #self.h0_facts_reading_2 = theano.shared(name='h0_facts', value=np.zeros(nh, dtype=theano.config.floatX))
        #self.h0_facts = [self.h0_facts_reading_1]
        #self.h0_episodes = theano.shared(name='h0_episodes', value=np.zeros(num_hidden_units_episodes, dtype=theano.config.floatX))
        #self.emb_helper = theano.shared(name='embeddings_prob_helper', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (number_word_classes, dimension_word_embeddings)).astype(theano.config.floatX))
        #self.h0_facts_reading_1_helper = theano.shared(name='h0_facts_helper', value=np.zeros(nh, dtype=theano.config.floatX))
        #self.h0_facts_reading_2_helper = theano.shared(name='h0_facts_helper', value=np.zeros(nh, dtype=theano.config.floatX))
        #self.h0_facts_helper = [self.h0_facts_reading_1]
        #self.h0_episodes_helper = theano.shared(name='h0_episodes_helper', value=np.zeros(num_hidden_units_episodes, dtype=theano.config.floatX))
        #self.initialization_randomization_helper = 0

        # GRU word parameters
        self.W_word_reset_gate_h = theano.shared(name='W_word_reset_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_words, num_hidden_units_words)).astype(theano.config.floatX))
        self.W_word_reset_gate_x = theano.shared(name='W_word_reset_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (dimension_word_embeddings, num_hidden_units_words)).astype(theano.config.floatX))
        self.W_word_update_gate_h = theano.shared(name='W_word_update_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_words, num_hidden_units_words)).astype(theano.config.floatX))
        self.W_word_update_gate_x = theano.shared(name='W_word_update_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (dimension_word_embeddings, num_hidden_units_words)).astype(theano.config.floatX))
        self.W_word_hidden_gate_h = theano.shared(name='W_word_hidden_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_words, num_hidden_units_words)).astype(theano.config.floatX))
        self.W_word_hidden_gate_x = theano.shared(name='W_word_hidden_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (dimension_word_embeddings, num_hidden_units_words)).astype(theano.config.floatX))
        self.W_word_to_fact_vector = theano.shared(name='W_word_to_fact_vector', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_words, num_hidden_units_episodes)).astype(theano.config.floatX))
        self.b_word_to_fact_vector = theano.shared(name='b_word_to_fact_vector', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, num_hidden_units_episodes).astype(theano.config.floatX))
        dimension_fact_embeddings = 8

        # GRU episode parameters
        self.W_episode_reset_gate_h = theano.shared(name='W_episode_reset_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, num_hidden_units_episodes)).astype(theano.config.floatX))
        self.W_episode_reset_gate_x = theano.shared(name='W_episode_reset_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, dimension_fact_embeddings)).astype(theano.config.floatX))
        self.W_episode_update_gate_h = theano.shared(name='W_episode_update_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, num_hidden_units_episodes)).astype(theano.config.floatX))
        self.W_episode_update_gate_x = theano.shared(name='W_episode_update_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, dimension_fact_embeddings)).astype(theano.config.floatX))
        self.W_episode_hidden_gate_h = theano.shared(name='W_episode_hidden_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, num_hidden_units_episodes)).astype(theano.config.floatX))
        self.W_episode_hidden_gate_x = theano.shared(name='W_episode_hidden_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, dimension_fact_embeddings)).astype(theano.config.floatX))
        # W_episode_to_word has shape (number_word_classes, num_hidden_units_facts).
        self.W_episode_to_word = theano.shared(name='W_episode_to_word', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (number_word_classes, num_hidden_units_facts)).astype(theano.config.floatX))
        self.b_episode_to_word = theano.shared(name='b_episode_to_word', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, number_word_classes).astype(theano.config.floatX))

        # DMN gate parameters
        num_rows_z_dmn = 2
        inner_dmn_dimension = 8
        # W_dmn_1 is (8, 1), W_dmn_2 is (8, 2), b_dmn_1 and b_dmn_2 are scalars.
        self.W_dmn_1 = theano.shared(name='W_dmn_1', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_facts, 1)).astype(theano.config.floatX))
        self.W_dmn_2 = theano.shared(name='W_dmn_2', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_episodes, total_number_of_sentences_per_episode)).astype(theano.config.floatX))
        self.b_dmn_1 = theano.shared(name='b_dmn_1', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (1)).astype(theano.config.floatX))
        self.b_dmn_2 = theano.shared(name='b_dmn_2', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (1)).astype(theano.config.floatX))
        #self.W_dmn_b = theano.shared(name='W_dmn_b', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_facts, num_hidden_units_questions)).astype(theano.config.floatX))
        self.params = [self.W_word_reset_gate_h, self.W_word_reset_gate_x, self.W_word_update_gate_h, self.W_word_update_gate_x, self.W_word_hidden_gate_h, self.W_word_hidden_gate_x,
                       self.W_word_to_fact_vector, self.b_word_to_fact_vector, self.W_episode_reset_gate_h, self.W_episode_reset_gate_x, self.W_episode_update_gate_h, self.W_episode_update_gate_x,
                       self.W_episode_hidden_gate_h, self.W_episode_hidden_gate_x, self.W_episode_to_word, self.b_episode_to_word, self.W_dmn_1, self.W_dmn_2, self.b_dmn_1, self.b_dmn_2]

        # Question GRU parameters
        self.W_question_reset_gate_h = theano.shared(name='W_question_reset_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_questions, num_hidden_units_questions)).astype(theano.config.floatX))
        self.W_question_reset_gate_x = theano.shared(name='W_question_reset_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (dimension_word_embeddings, num_hidden_units_questions)).astype(theano.config.floatX))
        self.W_question_update_gate_h = theano.shared(name='W_question_update_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_questions, num_hidden_units_questions)).astype(theano.config.floatX))
        self.W_question_update_gate_x = theano.shared(name='W_question_update_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (dimension_word_embeddings, num_hidden_units_questions)).astype(theano.config.floatX))
        self.W_question_hidden_gate_h = theano.shared(name='W_question_hidden_gate_h', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_questions, num_hidden_units_questions)).astype(theano.config.floatX))
        self.W_question_hidden_gate_x = theano.shared(name='W_question_hidden_gate_x', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (dimension_word_embeddings, num_hidden_units_questions)).astype(theano.config.floatX))
        self.W_question_to_vector = theano.shared(name='W_question_to_vector', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, (num_hidden_units_questions, num_hidden_units_episodes)).astype(theano.config.floatX))
        self.b_question_to_vector = theano.shared(name='b_question_to_vector', value=self.initialization_randomization * np.random.uniform(-1.0, 1.0, num_hidden_units_episodes).astype(theano.config.floatX))
        self.h0_questions = theano.shared(name='h0_questions', value=np.zeros(num_hidden_units_questions, dtype=theano.config.floatX))
        self.params.extend((self.W_question_reset_gate_h, self.W_question_reset_gate_x, self.W_question_update_gate_h, self.W_question_update_gate_x, self.W_question_hidden_gate_h, self.W_question_hidden_gate_x,
                            self.W_question_to_vector, self.b_question_to_vector, self.h0_questions))
        self.fact_params = [self.W_question_reset_gate_x, self.W_question_reset_gate_h, self.W_question_update_gate_h, self.W_question_update_gate_x, self.W_question_hidden_gate_h, self.W_question_hidden_gate_x,
                            self.W_question_to_vector, self.b_question_to_vector, self.h0_questions, self.emb, self.W_dmn_1, self.W_dmn_2, self.b_dmn_1, self.b_dmn_2, self.W_word_to_fact_vector, self.b_word_to_fact_vector,
                            self.W_word_reset_gate_h, self.W_word_reset_gate_x, self.W_word_update_gate_h, self.W_word_update_gate_x, self.W_word_hidden_gate_h, self.W_word_hidden_gate_x]
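

if __name__ == "__main__":
    # Minimal usage sketch. Assumptions: the bAbI qa1 files named in
    # preprocess_babi_set_for_dmn() sit in the working directory, and the
    # constructor arguments are placeholders, since __init__ overrides them
    # with hard-coded values.
    dmn = DMN_SimplerGate(num_fact_hidden_units=8, number_word_classes=0,
                          dimension_fact_embeddings=8,
                          num_episode_hidden_units=8,
                          max_number_of_facts_read=1)
    dmn.train()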