def new_episode(self, mem, all_h=False):
    '''
    Create a new episode.

    Computes the attention gates g with the attention mechanism, then the
    episode states h_t^i of the memory update mechanism, and finally returns
    the episode e^i = h_{T_C}^i (see the DMN paper "Ask Me Anything:
    Dynamic Memory Networks for NLP").

    :param mem: current memory
    :param all_h: currently unused
    :return e[-1]: latest episode
    '''
    # The updates dictionary returned by scan is not needed here.
    g, g_updates = theano.scan(fn=self.new_attention_step,
                               sequences=self.inp_c,
                               non_sequences=[mem, self.q_q],
                               outputs_info=T.zeros_like(self.inp_c[0][0]))

    # Optionally normalize the attention gates with a softmax.
    if self.normalize_attention:
        g = nn_utils.softmax(g)

    # The episode scan's updates are unused as well.
    e, e_updates = theano.scan(fn=self.new_episode_step,
                               sequences=[self.inp_c, g],
                               outputs_info=T.zeros_like(self.inp_c[0]))

    return e[-1]
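# The two step functions handed to theano.scan above are not part of this
# listing. The sketch below is an assumption, not the repository's code: it
# follows the gate features of the DMN paper, which match the W_1 shape of
# (dim, 7 * dim + 2) used by the single-example variants further down (the
# batched variants use 7 * dim + 0, i.e. they drop the two W_b bilinear
# terms, and the multiple-choice variant extends the features to
# 10 * dim + 3 with the answer-choice vector).
def new_attention_step(self, ct, prev_g, mem, q_q):
    # Similarity features between the fact ct, the question q_q and the
    # current memory mem (hypothetical layout).
    cWq = T.stack([T.dot(T.dot(ct, self.W_b), q_q)])
    cWm = T.stack([T.dot(T.dot(ct, self.W_b), mem)])
    z = T.concatenate([ct, mem, q_q, ct * q_q, ct * mem,
                       T.abs_(ct - q_q), T.abs_(ct - mem), cWq, cWm])

    # Two-layer scoring network producing a scalar gate g_t.
    l_1 = T.tanh(T.dot(self.W_1, z) + self.b_1)
    l_2 = T.dot(self.W_2, l_1) + self.b_2
    return T.nnet.sigmoid(l_2)[0]

def new_episode_step(self, ct, g, prev_h):
    # Gated pass over the facts: the gate decides how much of fact ct
    # enters the episode summary.
    gru = nn_utils.GRU_update(prev_h, ct,
                              self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
                              self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                              self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)
    return g * gru + (1 - g) * prev_h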
def answer_step(prev_a, prev_y):
    a = nn_utils.GRU_update(
        prev_a, T.concatenate([prev_y, self.q_q, self.last_mem]),
        self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)
    y = nn_utils.softmax(T.dot(self.W_a, a))
    return [a, y]
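# A hedged sketch of how answer_step above could be unrolled for a fixed
# number of answer words; it mirrors the commented-out recurrent answer
# module further down in this listing, and self.answer_step_nbr, self.W_a
# and self.last_mem are assumed to exist on the instance.
dummy = theano.shared(np.zeros((self.vocab_size,), dtype=floatX))
results, updates = theano.scan(fn=answer_step,
                               outputs_info=[self.last_mem, T.zeros_like(dummy)],
                               n_steps=self.answer_step_nbr)
self.multiple_predictions = results[1]  # one softmax over the vocabulary per step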
def transformer(src, target, n):
    # High-level pseudocode for an n-layer encoder-decoder transformer.
    z = src
    for i in range(n):
        z = encoder(z)
    z_enc = z            # final encoder output, attended to by the decoder stack
    z = target
    for i in range(n):
        z = decoder(z, z_enc)
    z = linear(z)
    z = nn_utils.softmax(z)
    return z
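# A rough sketch of what one encoder layer could look like in terms of the
# attention() and feed_forward() helpers defined later in this listing.
# W_q, W_k, W_v, W1 and W2 are hypothetical projection matrices, and the
# residual connections and layer normalization of a full transformer layer
# are omitted.
def encoder(z):
    q, k, v = z.dot(W_q), z.dot(W_k), z.dot(W_v)
    z = attention(q, k, v, f=lambda a, b: a.dot(b), d_k=k.shape[-1])
    return feed_forward(z, W1, W2)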
def new_episode(self, mem):
    g, g_updates = theano.scan(fn=self.new_attention_step,
                               sequences=self.inp_c,
                               non_sequences=[mem, self.q_q, self.c_vecs],
                               outputs_info=T.zeros_like(self.inp_c[0][0]))

    if self.normalize_attention:
        g = nn_utils.softmax(g)

    e, e_updates = theano.scan(fn=self.new_episode_step,
                               sequences=[self.inp_c, g],
                               outputs_info=T.zeros_like(self.inp_c[0]))

    return e[-1]
def new_episode(self, mem):
    g, g_updates = theano.scan(fn=self.new_attention_step,
                               sequences=self.inp_c,
                               non_sequences=[mem, self.q_q],
                               outputs_info=T.zeros_like(self.inp_c[0][0]))

    if self.normalize_attention:
        g = nn_utils.softmax(g)

    e, e_updates = theano.scan(fn=self.new_episode_step,
                               sequences=[self.inp_c, g],
                               outputs_info=T.zeros_like(self.inp_c[0]))

    # Each example in the minibatch has its own number of facts, so take the
    # episode state at that example's last fact rather than e[-1].
    e_list = []
    for index in range(self.batch_size):
        e_list.append(e[self.fact_count_var[index] - 1, :, index])
    return T.stack(e_list).dimshuffle((1, 0))
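# nn_utils.softmax is used throughout but not shown here. A minimal sketch,
# assuming a numerically stable softmax over the first axis (which matches
# the (vocab_size, batch) layout of the predictions in the batched variant
# and reduces to an ordinary softmax for 1-D gate vectors):
def softmax(x):
    e_x = T.exp(x - x.max(axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0, keepdims=True)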
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.type = "batch" self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.tensor3( 'input_var') # (batch_size, seq_len, glove_dim) self.q_var = T.tensor3('question_var') # as self.input_var self.answer_var = T.ivector( 'answer_var') # answer of example in minibatch self.fact_count_var = T.ivector( 'fact_count_var') # number of facts in the example of minibatch self.input_mask_var = T.imatrix( 'input_mask_var') # (batch_size, indices) print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) input_var_shuffled = self.input_var.dimshuffle(1, 2, 0) inp_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=input_var_shuffled, outputs_info=T.zeros_like(inp_dummy)) inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1) inp_c_list = [] inp_c_mask_list = [] for batch_index in range(self.batch_size): taken = inp_c_history_shuffled[batch_index].take( self.input_mask_var[ batch_index, :self.fact_count_var[batch_index]], axis=0) inp_c_list.append( T.concatenate([ taken, T.zeros((self.input_mask_var.shape[1] - taken.shape[0], self.dim), floatX) ])) inp_c_mask_list.append( T.concatenate([ T.ones((taken.shape[0], ), np.int32), T.zeros((self.input_mask_var.shape[1] - taken.shape[0], ), np.int32) ])) self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0) inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0) q_var_shuffled = self.q_var.dimshuffle(1, 2, 0) q_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) q_q_history, _ = theano.scan(fn=self.input_gru_step, sequences=q_var_shuffled, outputs_info=T.zeros_like(q_dummy)) self.q_q = q_q_history[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, 
shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared( np.zeros((self.vocab_size, self.batch_size), dtype=floatX)) results, updates = theano.scan( fn=self.answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], #(last_mem, y) n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") self.prediction = self.prediction.dimshuffle(1, 0) self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, 
self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.answer_var).mean() if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var ], outputs=[self.prediction, self.loss])
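# GRU_update itself is not included in this listing. A hedged sketch of the
# standard GRU step that the parameter names suggest (res = reset gate,
# upd = update gate, hid = candidate state), written for the single-example
# variants; the batched variant above would broadcast the biases over the
# batch axis with dimshuffle(0, 'x'):
def GRU_update(h, x, W_res_in, W_res_hid, b_res,
               W_upd_in, W_upd_hid, b_upd,
               W_hid_in, W_hid_hid, b_hid):
    r = T.nnet.sigmoid(T.dot(W_res_in, x) + T.dot(W_res_hid, h) + b_res)
    z = T.nnet.sigmoid(T.dot(W_upd_in, x) + T.dot(W_upd_hid, h) + b_upd)
    _h = T.tanh(T.dot(W_hid_in, x) + r * T.dot(W_hid_hid, h) + b_hid)
    return z * _h + (1 - z) * h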
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, 
self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) self.attentions = T.stack(self.attentions) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss, self.attentions])
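# In this variant test_fn returns self.attentions, which is assumed to be
# filled inside new_episode. A hedged sketch of how the gates of each hop
# could be recorded (the rest matches the new_episode shown earlier):
def new_episode(self, mem):
    g, _ = theano.scan(fn=self.new_attention_step,
                       sequences=self.inp_c,
                       non_sequences=[mem, self.q_q],
                       outputs_info=T.zeros_like(self.inp_c[0][0]))
    if self.normalize_attention:
        g = nn_utils.softmax(g)
    self.attentions.append(g)   # stacked with T.stack before compiling test_fn
    e, _ = theano.scan(fn=self.new_episode_step,
                       sequences=[self.inp_c, g],
                       outputs_info=T.zeros_like(self.inp_c[0]))
    return e[-1]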
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, answer_step_nbr, input_mask_mode, memory_hops, l2, normalize_attention, max_input_size, **kwargs): ''' Build the DMN :param babi_train_raw: train dataset :param babi_test_raw: test dataset :param word2vec: a dictionary containing the word embeddings TODO: Check if right :param word_vector_size: dimension of the word embeddings (50,100,200,300) :param dim: number of hidden units in input module GRU :param mode: train or test mode :param answer_module: answer module type: feedforward or recurrent :param input_mask_mode: input_mask_mode: word or sentence :param memory_hops: memory GRU steps :param l2: L2 regularization :param normalize_attention: enable softmax on attention vector :param **kwargs: ''' print("==> not used params in DMN class:", kwargs.keys()) self.type = "pointer" self.vocab = {} self.ivocab = {} print(max_input_size) #save params self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim #number of hidden units in input layer GRU self.pointer_dim = max_input_size #maximal size for the input, used as hyperparameter self.mode = mode self.answer_module = answer_module self.answer_step_nbr = answer_step_nbr self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_pointers_s, self.train_pointers_e = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_pointers_s, self.test_pointers_e = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.ivector('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.pointers_s_var = T.ivector('pointers_s_var') self.pointers_e_var = T.ivector('pointer_e_var') print("==> building input module") self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #TODO why 3 different set of weights & bias? #This does some loop inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) #in case of multiple sentences, only keep the hidden states which index match the <eos> char self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) #This seems to be the memory. 
self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] #take only last elem print("==> creating parameters for memory module") self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #Attnetion mechanisms 2 layer FFNN weights & bias self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print( "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops) memory = [self.q_q.copy()] #So q_q is memory initialization for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( nn_utils.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) self.last_mem = memory[-1] print("==> building answer module") self.Ws_p = nn_utils.normal_param( std=0.1, shape=(self.pointer_dim, self.dim)) #shape must be size_input * mem_size = self.dim self.We_p = nn_utils.normal_param(std=0.1, shape=(self.pointer_dim, self.dim)) self.Wh_p = nn_utils.normal_param(std=0.1, shape=(self.pointer_dim, self.dim)) self.Ws_pr = nn_utils.normal_param( std=0.1, shape=(self.pointer_dim, self.dim)) #shape must be size_input * mem_size = self.dim self.We_pr = nn_utils.normal_param(std=0.1, shape=(self.pointer_dim, self.dim)) self.Wh_pr = nn_utils.normal_param(std=0.1, shape=(self.pointer_dim, self.dim)) self.Psp = nn_utils.softmax(T.dot( self.Ws_p, self.last_mem)) #size must be == size_input self.Pepr = nn_utils.softmax(T.dot(self.We_pr, self.last_mem)) #TODO: self.start_idx = T.argmax(self.Psp) self.end_idxr = T.argmax(self.Pepr) self.start_idx_state = inp_c_history[ self. 
start_idx] #must be hidden state idx idx_max_val(Psp) self.last_mem# self.end_idx_state = inp_c_history[self.end_idxr] #temp1 = T.dot(self.We_p, self.last_mem) #temp2 = T.dot(self.Wh_p, self.start_idx_state) #temp3 = temp1 + temp2 self.Pep = nn_utils.softmax( T.dot(self.We_p, self.last_mem) + T.dot( self.Wh_p, self.start_idx_state)) #size must be == size_input self.Pspr = nn_utils.softmax( T.dot(self.Ws_pr, self.last_mem) + T.dot(self.Wh_pr, self.end_idx_state)) Ps = (self.Psp + self.Pspr) / 2 Pe = (self.Pep + self.Pepr) / 2 self.start_idxr = T.argmax(self.Pspr) self.end_idx = T.argmax(self.Pep) self.start_idx_f = T.argmax(Ps) #(self.start_idx + self.start_idxr)/2 self.end_idx_f = T.argmax(Pe) #(self.end_idx + self.end_idxr)/2 #multiple_answers = [] #bboole = T.lt(self.start_idx_f, self.end_idx_f) #trange = ifelse(bboole, T.arange(self.start_idx_f, self.end_idx_f), T.arange(self.start_idx_f - 1, self.start_idx_f)) # self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) # # if self.answer_module == 'recurrent': # self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.dim + self.vocab_size)) # self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) # self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # # self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.dim + self.vocab_size)) # self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) # self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # # self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.dim + self.vocab_size)) # self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) # self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # # def answer_step(prev_a, prev_y): # a = nn_utils.GRU_update(prev_a, T.concatenate([prev_y, self.q_q, self.last_mem]), # self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, # self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, # self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) # # y = nn_utils.softmax(T.dot(self.W_a, a)) # return [a, y] # # # TODO: add conditional ending # dummy_ = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) # results, updates = theano.scan(fn=answer_step, # outputs_info=[self.last_mem, T.zeros_like(dummy_)], # n_steps=self.answer_step_nbr) # # self.multiple_predictions = results[1] #don't get the memory (i.e. 
a) # # # else: # raise Exception("invalid answer_module") print("==> collecting all parameters") self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.Ws_p, self.We_p, self.Wh_p, self.Ws_pr, self.We_pr, self.Wh_pr ] # if self.answer_module == 'recurrent': # self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, # self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, # self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid] # # print("==> building loss layer and computing updates") # def temp_loss(curr_pred, curr_ans, loss): # temp = T.nnet.categorical_crossentropy(curr_pred.dimshuffle("x",0),T.stack([curr_ans]))[0] # return loss + temp # # outputs, updates = theano.scan(fn=temp_loss, # sequences=[self.multiple_predictions, self.answer_var], # outputs_info = [np.float64(0.0)], # n_steps=self.answer_step_nbr) loss_start = T.nnet.categorical_crossentropy( Ps.dimshuffle("x", 0), T.stack([self.pointers_s_var[0]]))[0] loss_end = T.nnet.categorical_crossentropy( Pe.dimshuffle("x", 0), T.stack([self.pointers_e_var[0]]))[0] #loss_1 = Ps # def temp_loss(curr_idx, curr_ans, loss): # curr_pred = self.input_var[curr_idx] # temp = T.nnet.catergorical_crossentropy(curr_pred, curr_ans)[0] # return loss + temp # # outputs, udpates = theano.scan(fn=temp_loss, # sequences = [answers_range, self.answer_var], # outputs_info = [np.float64(0.0)], # n_steps = ???) # self.loss_ce = outputs[-1] #temp1 = (self.end_idx_f - self.pointers_e_var) #temp2 = T.abs_(temp1) #* temp1 #temp3 = (self.start_idx_f)# - self.pointers_s_var) #temp4 = T.abs_(temp3) #* temp3 self.loss_ce = loss_start + loss_end #(temp2 + temp4) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print("==> compiling train_fn") self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.input_mask_var, self.pointers_s_var, self.pointers_e_var ], outputs=[self.start_idx_f, self.end_idx_f, self.loss], updates=updates, allow_input_downcast=True) if self.mode != 'minitest': print("==> compiling test_fn") self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.input_mask_var, self.pointers_s_var, self.pointers_e_var ], outputs=[ self.start_idx_f, self.end_idx_f, self.loss, self.inp_c, self.q_q ], allow_input_downcast=True) if self.mode == 'minitest': print("==> compiling minitest_fn") self.minitest_fn = theano.function( inputs=[ self.input_var, self.q_var, self.input_mask_var, self.pointers_s_var, self.pointers_e_var ], outputs=[self.start_idx_f, self.end_idx_f])
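# A hedged usage sketch for the compiled pointer train_fn; dmn, the word
# counts and the concrete values are hypothetical, but the argument order
# and dtypes follow the symbolic variables declared above
# (allow_input_downcast handles the float precision).
inp  = np.random.randn(20, dmn.word_vector_size)   # story word vectors
q    = np.random.randn(6, dmn.word_vector_size)    # question word vectors
mask = np.arange(20, dtype=np.int32)               # word-level input mask
p_s  = np.array([3], dtype=np.int32)               # gold start pointer
p_e  = np.array([7], dtype=np.int32)               # gold end pointer
start_idx, end_idx, loss = dmn.train_fn(inp, q, mask, p_s, p_e)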
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): ''' Build the DMN :param babi_train_raw: train dataset :param babi_test_raw: test dataset :param word2vec: a dictionary containing the word embeddings TODO: Check if right :param word_vector_size: dimension of the word embeddings (50,100,200,300) :param dim: number of hidden units in input module GRU :param mode: train or test mode :param answer_module: answer module type: feedforward or recurrent :param input_mask_mode: input_mask_mode: word or sentence :param memory_hops: memory GRU steps :param l2: L2 regularization :param normalize_attention: enable softmax on attention vector :param **kwargs: ''' print("==> not used params in DMN class:", kwargs.keys()) self.vocab = {} self.ivocab = {} self.type = "basic" #save params self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print("==> building input module") #Input weights for first layer #TODO why is there an input layer?? self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) #Input weights for hidden layer self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #Input bias #TODO why constant? self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #TODO why 3 different set of weights & bias? #This does some loop inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) #This seems to be the memory. 
self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] #take only last elem print("==> creating parameters for memory module") self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #Attnetion mechanisms 2 layer FFNN weights & bias self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print( "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops) memory = [self.q_q.copy()] #So q_q is memory initialization for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( nn_utils.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem = memory[-1] print("==> building answer module") self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = nn_utils.GRU_update(prev_a, T.concatenate([prev_y, self.q_q ]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print("==> collecting all parameters") self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, 
self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print("==> building loss layer and computing updates") self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print("==> compiling train_fn") self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print("==> compiling test_fn") self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[ self.prediction, self.loss, self.inp_c, self.q_q, last_mem ]) if self.mode == 'train': print("==> computing gradients (for debugging)") gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=gradient)
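# input_gru_step is referenced by every scan over the input and question but
# is not shown in this listing. A minimal sketch, assuming it is a plain GRU
# step over word vectors that reuses the W_inp_* parameters created above:
def input_gru_step(self, x, prev_h):
    return nn_utils.GRU_update(prev_h, x,
                               self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
                               self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
                               self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid)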
def feed_forward(z, W1, W2):
    h = z.dot(W1)
    u = h.dot(W2)
    z = nn_utils.softmax(u)
    return z
def attention(Q, K, V, f, d_k=1):
    # Swap the last two axes of K so that f(Q, K^T) yields the score matrix,
    # then scale by sqrt(d_k) as in scaled dot-product attention.
    col = [i for i in range(len(K.shape))]
    col[-2], col[-1] = col[-1], col[-2]
    return nn_utils.softmax(f(Q, K.transpose(tuple(col))) / (d_k ** 0.5)).dot(V)
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops #self.batch_size = 1 self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_choices, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_choices, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = 4 # number of answer choices self.inp_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.ca_var = T.matrix('ca_var') self.cb_var = T.matrix('cb_var') self.cc_var = T.matrix('cc_var') self.cd_var = T.matrix('cd_var') self.ans_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print "==> building input module" self.W_inp_res_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_res_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_res = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_inp_upd_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_upd = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_inp_hid_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.word_vector_size)), borrow=True) self.W_inp_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_inp_hid = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.inp_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] self.c_vecs = [] for choice in [self.ca_var, self.cb_var, self.cc_var, self.cd_var]: history, _ = theano.scan(fn=self.input_gru_step, sequences=choice, outputs_info=T.zeros_like(self.b_inp_hid)) self.c_vecs.append(history[-1]) self.c_vecs = T.stack(self.c_vecs).transpose((1, 0)) # (dim, 4) self.inp_c = T.stack([self.inp_c] * 4).transpose( (1, 2, 0)) # (fact_cnt, dim, 4) self.q_q = T.stack([self.q_q] * 4).transpose((1, 0)) # (dim, 4) print "==> creating parameters for memory module" self.W_mem_res_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_res_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_res = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_mem_upd_in = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_mem_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_upd = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_mem_hid_in = theano.shared(lasagne.init.Normal(0.1).sample( 
(self.dim, self.dim)), borrow=True) self.W_mem_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.b_mem_hid = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.W_b = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, self.dim)), borrow=True) self.W_1 = theano.shared(lasagne.init.Normal(0.1).sample( (self.dim, 10 * self.dim + 3)), borrow=True) self.W_2 = theano.shared(lasagne.init.Normal(0.1).sample( (1, self.dim)), borrow=True) self.b_1 = theano.shared(lasagne.init.Constant(0.0).sample( (self.dim, )), borrow=True) self.b_2 = theano.shared(lasagne.init.Constant(0.0).sample((1, )), borrow=True) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] # (dim, 4) for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update_batch(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem = memory[-1].flatten() print "==> building answer module" self.W_a = theano.shared(lasagne.init.Normal(0.1).sample( (self.vocab_size, 4 * self.dim)), borrow=True) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.ans_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, self.cc_var, self.cd_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, self.cc_var, self.cd_var, self.input_mask_var ], outputs=[ self.prediction, self.loss, self.inp_c, self.q_q, last_mem ]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[ self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var, self.cc_var, self.cd_var, self.input_mask_var ], outputs=gradient)
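# GRU_update_batch is not shown either. A hedged sketch: the same GRU step
# assumed elsewhere in this listing, except that memory and episode here
# carry a trailing axis of size 4 (one column per answer choice), so the
# biases are broadcast over that axis; the final memory is then flattened to
# length 4 * dim and mapped to the four choice logits by W_a.
def GRU_update_batch(self, h, x, W_res_in, W_res_hid, b_res,
                     W_upd_in, W_upd_hid, b_upd,
                     W_hid_in, W_hid_hid, b_hid):
    r = T.nnet.sigmoid(T.dot(W_res_in, x) + T.dot(W_res_hid, h) + b_res.dimshuffle(0, 'x'))
    z = T.nnet.sigmoid(T.dot(W_upd_in, x) + T.dot(W_upd_hid, h) + b_upd.dimshuffle(0, 'x'))
    _h = T.tanh(T.dot(W_hid_in, x) + r * T.dot(W_hid_hid, h) + b_hid.dimshuffle(0, 'x'))
    return z * _h + (1 - z) * h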