def __init__(self, input, emb_mat, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', activation=T.tanh, params=None):
    input = input.dimshuffle(1, 0)

    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX), name='emb', borrow=True)
        self.W = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W', borrow=True)
        self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U', borrow=True)
        self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bh', borrow=True)
    else:
        self.emb, self.W, self.U, self.bh = params

    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)
    self.params = [self.emb, self.W, self.U, self.bh]

    def recurrence(x_t, h_tm_prev):
        h_t = activation(T.dot(self.emb[x_t], self.W) + T.dot(h_tm_prev, self.U) + self.bh)
        return h_t

    h, _ = theano.scan(
        fn=recurrence,
        sequences=input,
        outputs_info=T.alloc(self.h0, input.shape[1], hidden_dim)
    )

    # Only the hidden state at the last time-step is passed to the decoder;
    # the prediction at the last time-step is always 'eos' and is therefore ignored.
    self.h = h[-1]
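
# --- Usage sketch (hedged) ---
# A minimal check of the encoder above. 'RnnEnc' is a hypothetical name for this
# class and 'get' is the repo's initializer helper; dimensions are assumptions.
def demo_rnn_encoder():
    import numpy as np
    import theano
    import theano.tensor as T

    x = T.imatrix('x')  # (batch, time) token indices
    emb_mat = np.random.uniform(-0.1, 0.1, (5000, 50))  # toy embedding matrix
    enc = RnnEnc(input=x, emb_mat=emb_mat, emb_dim=50, hidden_dim=100)

    encode = theano.function(inputs=[x], outputs=enc.h)
    h_last = encode(np.random.randint(0, 5000, (8, 12)).astype('int32'))
    print(h_last.shape)  # -> (8, 100): one final hidden state per sequence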
def __init__(self, input, emb_mat, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None):
    input = input.dimshuffle(1, 0)

    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX), name='emb', borrow=True)
        # update gate
        self.W_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_z', borrow=True)
        self.U_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_z', borrow=True)
        self.b_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_z', borrow=True)
        # reset gate
        self.W_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_r', borrow=True)
        self.U_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_r', borrow=True)
        self.b_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_r', borrow=True)
        # hidden state
        self.W_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_h', borrow=True)
        self.U_h = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_h', borrow=True)
        self.b_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_h', borrow=True)
    else:
        self.emb, self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r, \
            self.W_h, self.U_h, self.b_h = params

    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)

    self.params = [self.emb,
                   self.W_z, self.U_z, self.b_z,
                   self.W_r, self.U_r, self.b_r,
                   self.W_h, self.U_h, self.b_h]

    def recurrence(x_t, h_tm_prev):
        x_z = T.dot(self.emb[x_t], self.W_z) + self.b_z
        x_r = T.dot(self.emb[x_t], self.W_r) + self.b_r
        x_h = T.dot(self.emb[x_t], self.W_h) + self.b_h

        z_t = inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
        r_t = inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
        hh_t = activation(x_h + T.dot(r_t * h_tm_prev, self.U_h))
        h_t = (T.ones_like(z_t) - z_t) * hh_t + z_t * h_tm_prev
        return h_t

    h, _ = theano.scan(
        fn=recurrence,
        sequences=input,
        outputs_info=T.alloc(self.h0, input.shape[1], hidden_dim)
    )

    # Only the hidden state at the last time-step is passed to the decoder;
    # the prediction at the last time-step is always 'eos' and is therefore ignored.
    self.h = h[-1]
def __init__(self, input, input_dim, hidden_dim, output_dim, mini_batch=False, params=None):
    self.mini_batch = mini_batch
    input_f = input
    if mini_batch:
        input_b = input[:, ::-1]  # reverse along the time axis
    else:
        input_b = input[::-1]

    if params is None:
        self.fwd_rnn = Rnn(input=input_f, input_dim=input_dim, hidden_dim=hidden_dim,
                           output_dim=output_dim, mini_batch=mini_batch)
        self.bwd_rnn = Rnn(input=input_b, input_dim=input_dim, hidden_dim=hidden_dim,
                           output_dim=output_dim, mini_batch=mini_batch)
        self.V_f = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='V_f', borrow=True)
        self.V_b = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='V_b', borrow=True)
        self.by = theano.shared(value=get('zero', shape=(output_dim,)), name='by', borrow=True)
    else:
        # Loading from persistent storage would require changing the current Rnn()
        # implementation and is therefore not supported. A cleaner approach would be
        # to implement BiRnn() without reusing Rnn(), which is straightforward.
        raise NotImplementedError

    # Since BiRnn does the actual classification, 'Rnn().V' and 'Rnn().by' are not part
    # of the computational graph (a separate logistic-regression layer is probably the
    # best way to handle this). Here's the ugly workaround:
    self.params = [self.fwd_rnn.W, self.fwd_rnn.U, self.fwd_rnn.bh,
                   self.bwd_rnn.W, self.bwd_rnn.U, self.bwd_rnn.bh,
                   self.V_f, self.V_b, self.by]

    # re-align the backward pass with the original time order
    if mini_batch:
        self.bwd_rnn.h_t = self.bwd_rnn.h_t[:, ::-1]
    else:
        self.bwd_rnn.h_t = self.bwd_rnn.h_t[::-1]

    # Take the weighted sum of the forward & backward rnn's hidden representations
    self.h_t = T.dot(self.fwd_rnn.h_t, self.V_f) + T.dot(self.bwd_rnn.h_t, self.V_b)

    if mini_batch:
        # T.nnet.softmax cannot operate on a tensor3; a simple reshape trick makes it work.
        h_t = self.h_t + self.by
        h_t_t = T.reshape(h_t, (h_t.shape[0] * h_t.shape[1], -1))
        y_t = T.nnet.softmax(h_t_t)
        self.y_t = T.reshape(y_t, h_t.shape)
        self.y = T.argmax(self.y_t, axis=2)
    else:
        self.y_t = T.nnet.softmax(self.h_t + self.by)
        self.y = T.argmax(self.y_t, axis=1)
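
# --- Illustrative note (hedged) ---
# Why the backward states are reversed before merging: the backward net indexes its
# states in reversed time, so one more reversal lines state t up with input position
# t. A pure-numpy analogy (not part of the model):
def demo_backward_alignment():
    import numpy as np

    x = np.array([10, 20, 30, 40])  # input positions 0..3
    h_b = x[::-1]                   # backward pass sees 40, 30, 20, 10
    aligned = h_b[::-1]             # aligned[t] was computed at input position t
    assert (aligned == x).all()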
def __init__(self, input, input_dim, hidden_dim, output_dim, params=None):
    self.input_f = input
    self.input_b = input[::-1]

    if params is None:
        self.fwd_lstm = Lstm(input=self.input_f, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
        self.bwd_lstm = Lstm(input=self.input_b, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
        self.V_f = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='V_f', borrow=True)
        self.V_b = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='V_b', borrow=True)
        self.by = theano.shared(value=get('zero', shape=(output_dim,)), name='by', borrow=True)
    else:
        # Loading from persistent storage would require changing the current Lstm()
        # implementation and is therefore not supported. A cleaner approach would be
        # to implement BiLstm() without reusing Lstm(), which is straightforward.
        raise NotImplementedError

    # Since BiLstm does the actual classification, 'Lstm().V' and 'Lstm().b_y' are not
    # part of the computational graph (a separate logistic-regression layer is probably
    # the best way to handle this). Here's the ugly workaround:
    self.params = [self.fwd_lstm.W_i, self.fwd_lstm.U_i, self.fwd_lstm.b_i,
                   self.fwd_lstm.W_f, self.fwd_lstm.U_f, self.fwd_lstm.b_f,
                   self.fwd_lstm.W_c, self.fwd_lstm.U_c, self.fwd_lstm.b_c,
                   self.fwd_lstm.W_o, self.fwd_lstm.U_o, self.fwd_lstm.b_o,
                   self.bwd_lstm.W_i, self.bwd_lstm.U_i, self.bwd_lstm.b_i,
                   self.bwd_lstm.W_f, self.bwd_lstm.U_f, self.bwd_lstm.b_f,
                   self.bwd_lstm.W_c, self.bwd_lstm.U_c, self.bwd_lstm.b_c,
                   self.bwd_lstm.W_o, self.bwd_lstm.U_o, self.bwd_lstm.b_o,
                   self.V_f, self.V_b, self.by]

    # re-align the backward pass with the original time order
    self.bwd_lstm.h_t = self.bwd_lstm.h_t[::-1]

    # Take the weighted sum of the forward & backward lstm's hidden representations
    self.h_t = T.dot(self.fwd_lstm.h_t, self.V_f) + T.dot(self.bwd_lstm.h_t, self.V_b)
    self.y_t = T.nnet.softmax(self.h_t + self.by)
    self.y = T.argmax(self.y_t, axis=1)
def __init__(self, input, input_dim, hidden_dim, output_dim, params=None):
    self.input_f = input
    self.input_b = input[::-1]

    if params is None:
        self.fwd_gru = Gru(input=self.input_f, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
        self.bwd_gru = Gru(input=self.input_b, input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
        self.V_f = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='V_f', borrow=True)
        self.V_b = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='V_b', borrow=True)
        self.by = theano.shared(value=get('zero', shape=(output_dim,)), name='by', borrow=True)
    else:
        # Loading from persistent storage would require changing the current Gru()
        # implementation and is therefore not supported. A cleaner approach would be
        # to implement BiGru() without reusing Gru(), which is straightforward.
        raise NotImplementedError

    # Since BiGru does the actual classification, 'Gru().V' and 'Gru().b_y' are not part
    # of the computational graph (a separate logistic-regression layer is probably the
    # best way to handle this). Here's the ugly workaround:
    self.params = [self.fwd_gru.W_z, self.fwd_gru.U_z, self.fwd_gru.b_z,
                   self.fwd_gru.W_r, self.fwd_gru.U_r, self.fwd_gru.b_r,
                   self.fwd_gru.W, self.fwd_gru.U, self.fwd_gru.b_h,
                   self.bwd_gru.W_z, self.bwd_gru.U_z, self.bwd_gru.b_z,
                   self.bwd_gru.W_r, self.bwd_gru.U_r, self.bwd_gru.b_r,
                   self.bwd_gru.W, self.bwd_gru.U, self.bwd_gru.b_h,
                   self.V_f, self.V_b, self.by]

    # re-align the backward pass with the original time order
    self.bwd_gru.h_t = self.bwd_gru.h_t[::-1]

    # Take the weighted sum of the forward & backward gru's hidden representations
    self.h_t = T.dot(self.fwd_gru.h_t, self.V_f) + T.dot(self.bwd_gru.h_t, self.V_b)
    self.y_t = T.nnet.softmax(self.h_t + self.by)
    self.y = T.argmax(self.y_t, axis=1)
def __init__(self, input, input_dim, output_dim, params=None):
    if params is None:
        self.W = theano.shared(value=get(identifier='uniform', shape=(input_dim, output_dim)), name='W', borrow=True)
        self.b = theano.shared(value=get(identifier='zero', shape=(output_dim,)), name='b', borrow=True)
    else:
        self.W, self.b = params

    self.params = [self.W, self.b]

    # clipping is needed to avoid nan in the negative log-likelihood
    self.p_y_given_x = T.clip(T.nnet.softmax(T.dot(input, self.W) + self.b), 0.0001, 0.9999)
    self.pred = T.argmax(self.p_y_given_x, axis=1)
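
# --- Usage sketch (hedged) ---
# Training the softmax layer above with negative log-likelihood + plain SGD; the
# class name 'LogisticRegression' and the hyper-parameters are assumptions.
def demo_logistic_regression():
    import theano
    import theano.tensor as T

    x = T.matrix('x')    # (batch, input_dim) features
    y = T.ivector('y')   # (batch,) integer labels
    clf = LogisticRegression(input=x, input_dim=100, output_dim=5, params=None)

    nll = -T.mean(T.log(clf.p_y_given_x)[T.arange(y.shape[0]), y])
    grads = T.grad(nll, clf.params)
    updates = [(p, p - 0.01 * g) for p, g in zip(clf.params, grads)]

    train_step = theano.function([x, y], nll, updates=updates)
    predict = theano.function([x], clf.pred)
    return train_step, predict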
def __init__(self, input, input_dim, hidden_dim, output_dim, activation=T.tanh,
             init='uniform', inner_init='orthonormal', mini_batch=False, params=None):
    self.activation = activation
    self.mini_batch = mini_batch
    if mini_batch:
        input = input.dimshuffle(1, 0, 2)

    if params is None:
        self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W', borrow=True)
        self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U', borrow=True)
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)), name='V', borrow=True)
        self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bh', borrow=True)
        self.by = theano.shared(value=get(identifier='zero', shape=(output_dim,)), name='by', borrow=True)
    else:
        self.W, self.U, self.V, self.bh, self.by = params

    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)
    self.params = [self.W, self.U, self.V, self.bh, self.by]

    if mini_batch:
        def recurrence(x_t, h_tm_prev):
            h_t = activation(T.dot(x_t, self.W) + T.dot(h_tm_prev, self.U) + self.bh)
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
            return h_t, y_t

        [self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[T.alloc(self.h0, input.shape[1], hidden_dim), None]
        )
        self.h_t = self.h_t.dimshuffle(1, 0, 2)
        self.y_t = self.y_t.dimshuffle(1, 0, 2)
        self.y = T.argmax(self.y_t, axis=2)
    else:
        def recurrence(x_t, h_tm_prev):
            h_t = activation(T.dot(x_t, self.W) + T.dot(h_tm_prev, self.U) + self.bh)
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.by)
            return h_t, y_t[0]

        [self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[self.h0, None]
        )
        self.y = T.argmax(self.y_t, axis=1)
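
# --- Usage sketch (hedged) ---
# Per-timestep cross-entropy training for the (non-mini-batch) Rnn above; the
# dimensions and learning rate are assumptions.
def demo_rnn_tagger():
    import theano
    import theano.tensor as T

    x = T.matrix('x')    # (time, input_dim) one sequence
    y = T.ivector('y')   # (time,) one label per time-step
    rnn = Rnn(input=x, input_dim=50, hidden_dim=100, output_dim=10)

    cost = -T.mean(T.log(rnn.y_t)[T.arange(y.shape[0]), y])
    grads = T.grad(cost, rnn.params)
    updates = [(p, p - 0.01 * g) for p, g in zip(rnn.params, grads)]
    return theano.function([x, y], cost, updates=updates)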
def pickle_w2vec(w2vec, dataset, emb_path, emb_dim=300):
    model = gensim.models.Word2Vec.load_word2vec_format(w2vec, binary=True)
    voc, _ = load_pickled_data(path=dataset)
    vocab, words_to_ix, _ = voc
    emb = [0] * len(vocab)
    vocab.remove('EOS')
    vocab.remove('UNKNOWN_TOKEN')

    # initialize randomly for 'EOS' & 'UNKNOWN_TOKEN'
    emb[words_to_ix['EOS']] = get(identifier='emb', shape=(emb_dim,), scale=np.sqrt(3))
    emb[words_to_ix['UNKNOWN_TOKEN']] = get(identifier='emb', shape=(emb_dim,), scale=np.sqrt(3))
    unk_count = 2

    for word in vocab:
        if word in model.vocab:
            emb[words_to_ix[word]] = model[word]
        else:
            unk_count += 1
            emb[words_to_ix[word]] = get(identifier='emb', shape=(emb_dim,), scale=np.sqrt(3))
    print('... embeddings initialized randomly for %d words' % unk_count)

    # pickle our mini embeddings
    with open(emb_path, 'wb') as f:
        pkl.dump(emb, f, pkl.HIGHEST_PROTOCOL)
    print('... %s created' % emb_path)
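
# --- Usage sketch (hedged) ---
# Reading the pickled embeddings back and handing them to an embedding-based
# encoder; 'emb.pkl' is a placeholder path and 'RnnEnc' a hypothetical class name.
def demo_load_embeddings():
    import pickle as pkl
    import theano.tensor as T

    with open('emb.pkl', 'rb') as f:
        emb_mat = pkl.load(f)

    x = T.imatrix('x')  # (batch, time) token indices
    return RnnEnc(input=x, emb_mat=emb_mat, emb_dim=300, hidden_dim=128)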
def __init__(self, input, input_dim, hidden_dim, output_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None):
    self.input = input
    self.inner_activation = inner_activation
    self.activation = activation

    if params is None:
        # input gate
        self.W_i = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W_i', borrow=True)
        self.U_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_i', borrow=True)
        self.b_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_i', borrow=True)
        # forget gate
        self.W_f = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W_f', borrow=True)
        self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_f', borrow=True)
        self.b_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='b_f', borrow=True)
        # memory
        self.W_c = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W_c', borrow=True)
        self.U_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_c', borrow=True)
        self.b_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_c', borrow=True)
        # output gate
        self.W_o = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W_o', borrow=True)
        self.U_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_o', borrow=True)
        self.b_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_o', borrow=True)
        # weights pertaining to the output neuron
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)), name='V', borrow=True)
        self.b_y = theano.shared(value=get(identifier='zero', shape=(output_dim,)), name='b_y', borrow=True)
    else:
        self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, \
            self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o, self.V, self.b_y = params

    self.c0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='c0', borrow=True)
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)

    self.params = [self.W_i, self.U_i, self.b_i,
                   self.W_f, self.U_f, self.b_f,
                   self.W_c, self.U_c, self.b_c,
                   self.W_o, self.U_o, self.b_o,
                   self.V, self.b_y]

    def recurrence(x_t, c_tm_prev, h_tm_prev):
        x_i = T.dot(x_t, self.W_i) + self.b_i
        x_f = T.dot(x_t, self.W_f) + self.b_f
        x_c = T.dot(x_t, self.W_c) + self.b_c
        x_o = T.dot(x_t, self.W_o) + self.b_o

        i_t = inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
        f_t = inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
        c_t = f_t * c_tm_prev + i_t * activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
        o_t = inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
        h_t = o_t * activation(c_t)  # actual hidden state
        y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
        return c_t, h_t, y_t[0]

    [_, self.h_t, self.y_t], _ = theano.scan(
        recurrence,
        sequences=self.input,
        outputs_info=[self.c0, self.h0, None]
    )
    self.y = T.argmax(self.y_t, axis=1)
def __init__(self, story, question, hidden_dim, output_dim, attn_dim, params=None, activation=T.tanh):
    story = story.dimshuffle(1, 0, 2)

    if params is None:
        self.W_att_story = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, attn_dim)), name='W_att_story', borrow=True)
        self.W_att_question = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, attn_dim)), name='W_att_question', borrow=True)
        # weight matrix for 'm_t' (see the original paper, page 5)
        self.W_m = theano.shared(value=get(identifier='uniform', shape=(attn_dim,)), name='W_m', borrow=True)
        self.W_rg = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='W_rg', borrow=True)
        self.W_ug = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)), name='W_ug', borrow=True)
        self.b = theano.shared(value=get(identifier='zero', shape=(output_dim,)), name='b', borrow=True)
    else:
        self.W_att_story, self.W_att_question, self.W_m, self.W_rg, self.W_ug, self.b = params

    self.params = [self.W_att_story, self.W_att_question, self.W_m, self.W_rg, self.W_ug, self.b]

    # applying attention, i.e. a weighted sum of story & question
    def step(token_t):
        m_t = activation(T.dot(token_t, self.W_att_story) + T.dot(question, self.W_att_question))
        # attention at time-step t (a scalar per example)
        s_t = T.dot(m_t, self.W_m)  # is 'W_m' even needed here?
        return s_t

    s, _ = theano.scan(step, sequences=story, outputs_info=None)
    s = s.dimshuffle(1, 0)

    # normalized attention
    s_norm = T.nnet.softmax(s)

    # embedding of 'story'
    def compute_batch_sum(story_, norm_):
        return story_.T * norm_

    r_t, _ = theano.scan(compute_batch_sum, sequences=[story, s_norm.dimshuffle(1, 0)], outputs_info=None)
    r = T.sum(r_t.dimshuffle(2, 0, 1), axis=1)

    # given 'r' & 'u', compute the final 'g' (where 'u' = encoding of the question)
    self.g = T.nnet.softmax(activation(T.dot(r, self.W_rg) + T.dot(question, self.W_ug) + self.b))
    self.pred = T.argmax(self.g, axis=1)
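
# --- Usage sketch (hedged) ---
# Wiring encoder outputs into the attention module above; 'Attention' is a
# hypothetical name for this class, and the dimensions are assumptions.
def demo_attention():
    import theano
    import theano.tensor as T

    story = T.tensor3('story')       # (batch, time, hidden_dim) encoded story tokens
    question = T.matrix('question')  # (batch, hidden_dim) encoded question
    attn = Attention(story=story, question=question, hidden_dim=100,
                     output_dim=20, attn_dim=50, params=None)
    return theano.function([story, question], attn.pred)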
def __init__(self, input, vocab_size, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None, merge_mode='concat'):
    input_f = input.dimshuffle(1, 0)
    input_b = input[:, ::-1].dimshuffle(1, 0)  # reverse along the time axis

    if params is None:
        self.emb = theano.shared(value=get(identifier=init, shape=(vocab_size, emb_dim), scale=np.sqrt(3)), name='emb', borrow=True)

        # Forward LSTM
        # input gate
        self.Wf_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_i', borrow=True)
        self.Uf_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_i', borrow=True)
        self.bf_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_i', borrow=True)
        # forget gate
        self.Wf_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_f', borrow=True)
        self.Uf_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_f', borrow=True)
        self.bf_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='bf_f', borrow=True)
        # memory
        self.Wf_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_c', borrow=True)
        self.Uf_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_c', borrow=True)
        self.bf_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_c', borrow=True)
        # output gate
        self.Wf_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_o', borrow=True)
        self.Uf_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_o', borrow=True)
        self.bf_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_o', borrow=True)

        # Backward LSTM
        # input gate
        self.Wb_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_i', borrow=True)
        self.Ub_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_i', borrow=True)
        self.bb_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_i', borrow=True)
        # forget gate
        self.Wb_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_f', borrow=True)
        self.Ub_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_f', borrow=True)
        self.bb_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='bb_f', borrow=True)
        # memory
        self.Wb_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_c', borrow=True)
        self.Ub_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_c', borrow=True)
        self.bb_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_c', borrow=True)
        # output gate
        self.Wb_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_o', borrow=True)
        self.Ub_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_o', borrow=True)
        self.bb_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_o', borrow=True)
    else:
        self.emb, self.Wf_i, self.Uf_i, self.bf_i, self.Wf_f, self.Uf_f, self.bf_f, \
            self.Wf_c, self.Uf_c, self.bf_c, self.Wf_o, self.Uf_o, self.bf_o, \
            self.Wb_i, self.Ub_i, self.bb_i, self.Wb_f, self.Ub_f, self.bb_f, \
            self.Wb_c, self.Ub_c, self.bb_c, self.Wb_o, self.Ub_o, self.bb_o = params

    self.cf = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='cf', borrow=True)
    self.hf = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hf', borrow=True)
    self.cb = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='cb', borrow=True)
    self.hb = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hb', borrow=True)

    self.params = [self.emb,
                   self.Wf_i, self.Uf_i, self.bf_i, self.Wf_f, self.Uf_f, self.bf_f,
                   self.Wf_c, self.Uf_c, self.bf_c, self.Wf_o, self.Uf_o, self.bf_o,
                   self.Wb_i, self.Ub_i, self.bb_i, self.Wb_f, self.Ub_f, self.bb_f,
                   self.Wb_c, self.Ub_c, self.bb_c, self.Wb_o, self.Ub_o, self.bb_o]

    # forward lstm
    def recurrence_f(xf_t, cf_tm, hf_tm):
        xf_i = T.dot(self.emb[xf_t], self.Wf_i) + self.bf_i
        xf_f = T.dot(self.emb[xf_t], self.Wf_f) + self.bf_f
        xf_c = T.dot(self.emb[xf_t], self.Wf_c) + self.bf_c
        xf_o = T.dot(self.emb[xf_t], self.Wf_o) + self.bf_o

        if_t = inner_activation(xf_i + T.dot(hf_tm, self.Uf_i))
        ff_t = inner_activation(xf_f + T.dot(hf_tm, self.Uf_f))
        cf_t = ff_t * cf_tm + if_t * activation(xf_c + T.dot(hf_tm, self.Uf_c))  # internal memory
        of_t = inner_activation(xf_o + T.dot(hf_tm, self.Uf_o))
        hf_t = of_t * activation(cf_t)  # actual hidden state
        return cf_t, hf_t

    [_, self.h_f], _ = theano.scan(
        recurrence_f,
        sequences=input_f,
        outputs_info=[T.alloc(self.cf, input_f.shape[1], hidden_dim),
                      T.alloc(self.hf, input_f.shape[1], hidden_dim)]
    )

    # backward lstm
    def recurrence_b(xb_t, cb_tm, hb_tm):
        xb_i = T.dot(self.emb[xb_t], self.Wb_i) + self.bb_i
        xb_f = T.dot(self.emb[xb_t], self.Wb_f) + self.bb_f
        xb_c = T.dot(self.emb[xb_t], self.Wb_c) + self.bb_c
        xb_o = T.dot(self.emb[xb_t], self.Wb_o) + self.bb_o

        ib_t = inner_activation(xb_i + T.dot(hb_tm, self.Ub_i))
        fb_t = inner_activation(xb_f + T.dot(hb_tm, self.Ub_f))
        cb_t = fb_t * cb_tm + ib_t * activation(xb_c + T.dot(hb_tm, self.Ub_c))  # internal memory
        ob_t = inner_activation(xb_o + T.dot(hb_tm, self.Ub_o))
        hb_t = ob_t * activation(cb_t)  # actual hidden state
        return cb_t, hb_t

    [_, self.h_b], _ = theano.scan(
        recurrence_b,
        sequences=input_b,
        outputs_info=[T.alloc(self.cb, input_b.shape[1], hidden_dim),
                      T.alloc(self.hb, input_b.shape[1], hidden_dim)]
    )

    if merge_mode == 'sum':
        self.y = self.h_f[-1] + self.h_b[-1]
    elif merge_mode == 'multiply':
        self.y = self.h_f[-1] * self.h_b[-1]
    elif merge_mode == 'average':
        self.y = (self.h_f[-1] + self.h_b[-1]) / 2
    elif merge_mode == 'concat':
        self.y = T.concatenate([self.h_f[-1], self.h_b[-1]], axis=1)
    else:
        print('Supported "merge_mode" for forward + backward lstm are: "sum", "multiply", "average" & "concat".')
        raise NotImplementedError
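
# --- Usage sketch (hedged) ---
# The classifier stacked on top must match the merged dimension: 'concat' yields
# 2 * hidden_dim features, while the other merge modes keep hidden_dim. 'BiLstmEnc'
# is a hypothetical name for the class above; LogisticRegression is defined earlier.
def demo_bilstm_classifier():
    import theano.tensor as T

    x = T.imatrix('x')  # (batch, time) token indices
    enc = BiLstmEnc(input=x, vocab_size=5000, emb_dim=50, hidden_dim=100,
                    merge_mode='concat')
    return LogisticRegression(input=enc.y, input_dim=2 * 100, output_dim=5, params=None)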
def __init__(self, input, vocab_size, emb_dim, hidden_dim, n_layers=2, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None):
    input = input.dimshuffle(1, 0)
    assert n_layers == 2  # the current implementation supports exactly two stacked layers

    if params is None:
        self.emb = theano.shared(value=get(identifier=init, shape=(vocab_size, emb_dim), scale=np.sqrt(3)), name='emb', borrow=True)

        # *** Layer 1 ***
        # input gate
        self.W_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_i', borrow=True)
        self.U_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_i', borrow=True)
        self.b_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_i', borrow=True)
        # forget gate
        self.W_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_f', borrow=True)
        self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_f', borrow=True)
        self.b_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='b_f', borrow=True)
        # memory
        self.W_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_c', borrow=True)
        self.U_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_c', borrow=True)
        self.b_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_c', borrow=True)
        # output gate
        self.W_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_o', borrow=True)
        self.U_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_o', borrow=True)
        self.b_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_o', borrow=True)

        # *** Layer 2 ***
        # input gate
        self.W_i_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_i_1', borrow=True)
        self.U_i_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_i_1', borrow=True)
        self.b_i_1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_i_1', borrow=True)
        # forget gate
        self.W_f_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_f_1', borrow=True)
        self.U_f_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_f_1', borrow=True)
        self.b_f_1 = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='b_f_1', borrow=True)
        # memory
        self.W_c_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_c_1', borrow=True)
        self.U_c_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_c_1', borrow=True)
        self.b_c_1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_c_1', borrow=True)
        # output gate
        self.W_o_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_o_1', borrow=True)
        self.U_o_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_o_1', borrow=True)
        self.b_o_1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_o_1', borrow=True)
    else:
        self.emb, self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, \
            self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o, \
            self.W_i_1, self.U_i_1, self.b_i_1, self.W_f_1, self.U_f_1, self.b_f_1, \
            self.W_c_1, self.U_c_1, self.b_c_1, self.W_o_1, self.U_o_1, self.b_o_1 = params

    self.c0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='c0', borrow=True)
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)
    self.c1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='c1', borrow=True)
    self.h1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h1', borrow=True)

    self.params = [self.emb,
                   self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f,
                   self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o,
                   self.W_i_1, self.U_i_1, self.b_i_1, self.W_f_1, self.U_f_1, self.b_f_1,
                   self.W_c_1, self.U_c_1, self.b_c_1, self.W_o_1, self.U_o_1, self.b_o_1]

    def recurrence(x_t, c_t1_prev, h_t1_prev, c_t2_prev, h_t2_prev):
        # Layer 1 computation
        x_i = T.dot(self.emb[x_t], self.W_i) + self.b_i
        x_f = T.dot(self.emb[x_t], self.W_f) + self.b_f
        x_c = T.dot(self.emb[x_t], self.W_c) + self.b_c
        x_o = T.dot(self.emb[x_t], self.W_o) + self.b_o

        i_t = inner_activation(x_i + T.dot(h_t1_prev, self.U_i))
        f_t = inner_activation(x_f + T.dot(h_t1_prev, self.U_f))
        c_t = f_t * c_t1_prev + i_t * activation(x_c + T.dot(h_t1_prev, self.U_c))  # internal memory
        o_t = inner_activation(x_o + T.dot(h_t1_prev, self.U_o))
        h_t = o_t * activation(c_t)  # actual hidden state

        # Layer 2 computation
        x_i_1 = T.dot(h_t, self.W_i_1) + self.b_i_1
        x_f_1 = T.dot(h_t, self.W_f_1) + self.b_f_1
        x_c_1 = T.dot(h_t, self.W_c_1) + self.b_c_1
        x_o_1 = T.dot(h_t, self.W_o_1) + self.b_o_1

        i_t_1 = inner_activation(x_i_1 + T.dot(h_t2_prev, self.U_i_1))
        f_t_1 = inner_activation(x_f_1 + T.dot(h_t2_prev, self.U_f_1))
        c_t_1 = f_t_1 * c_t2_prev + i_t_1 * activation(x_c_1 + T.dot(h_t2_prev, self.U_c_1))  # internal memory
        o_t_1 = inner_activation(x_o_1 + T.dot(h_t2_prev, self.U_o_1))
        h_t_1 = o_t_1 * activation(c_t_1)  # actual hidden state

        return c_t, h_t, c_t_1, h_t_1

    [_, h_1, _, h_2], _ = theano.scan(
        recurrence,
        sequences=input,
        outputs_info=[T.alloc(self.c0, input.shape[1], hidden_dim),
                      T.alloc(self.h0, input.shape[1], hidden_dim),
                      T.alloc(self.c1, input.shape[1], hidden_dim),
                      T.alloc(self.h1, input.shape[1], hidden_dim)]
    )

    # since every hidden layer is connected to the output
    self.y = T.concatenate([h_1[-1], h_2[-1]], axis=1)
def __init__(self, input, emb_mat, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None, merge_mode='sum'):
    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX), name='emb', borrow=True)

        # Forward LSTM
        # input gate
        self.Wf_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_i', borrow=True)
        self.Uf_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_i', borrow=True)
        self.bf_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_i', borrow=True)
        # forget gate
        self.Wf_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_f', borrow=True)
        self.Uf_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_f', borrow=True)
        self.bf_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='bf_f', borrow=True)
        # memory
        self.Wf_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_c', borrow=True)
        self.Uf_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_c', borrow=True)
        self.bf_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_c', borrow=True)
        # output gate
        self.Wf_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_o', borrow=True)
        self.Uf_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_o', borrow=True)
        self.bf_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_o', borrow=True)

        # Backward LSTM
        # input gate
        self.Wb_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_i', borrow=True)
        self.Ub_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_i', borrow=True)
        self.bb_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_i', borrow=True)
        # forget gate
        self.Wb_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_f', borrow=True)
        self.Ub_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_f', borrow=True)
        self.bb_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='bb_f', borrow=True)
        # memory
        self.Wb_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_c', borrow=True)
        self.Ub_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_c', borrow=True)
        self.bb_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_c', borrow=True)
        # output gate
        self.Wb_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_o', borrow=True)
        self.Ub_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_o', borrow=True)
        self.bb_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_o', borrow=True)
    else:
        self.emb, self.Wf_i, self.Uf_i, self.bf_i, self.Wf_f, self.Uf_f, self.bf_f, \
            self.Wf_c, self.Uf_c, self.bf_c, self.Wf_o, self.Uf_o, self.bf_o, \
            self.Wb_i, self.Ub_i, self.bb_i, self.Wb_f, self.Ub_f, self.bb_f, \
            self.Wb_c, self.Ub_c, self.bb_c, self.Wb_o, self.Ub_o, self.bb_o = params

    self.cf = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='cf', borrow=True)
    self.hf = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hf', borrow=True)
    self.cb = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='cb', borrow=True)
    self.hb = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hb', borrow=True)

    self.params = [self.emb,
                   self.Wf_i, self.Uf_i, self.bf_i, self.Wf_f, self.Uf_f, self.bf_f,
                   self.Wf_c, self.Uf_c, self.bf_c, self.Wf_o, self.Uf_o, self.bf_o,
                   self.Wb_i, self.Ub_i, self.bb_i, self.Wb_f, self.Ub_f, self.bb_f,
                   self.Wb_c, self.Ub_c, self.bb_c, self.Wb_o, self.Ub_o, self.bb_o]

    input_f = input.dimshuffle(1, 0)
    input_b = input[:, ::-1].dimshuffle(1, 0)  # reverse along the time axis

    # forward lstm
    def recurrence_f(xf_t, cf_tm, hf_tm):
        xf_i = T.dot(self.emb[xf_t], self.Wf_i) + self.bf_i
        xf_f = T.dot(self.emb[xf_t], self.Wf_f) + self.bf_f
        xf_c = T.dot(self.emb[xf_t], self.Wf_c) + self.bf_c
        xf_o = T.dot(self.emb[xf_t], self.Wf_o) + self.bf_o

        if_t = inner_activation(xf_i + T.dot(hf_tm, self.Uf_i))
        ff_t = inner_activation(xf_f + T.dot(hf_tm, self.Uf_f))
        cf_t = ff_t * cf_tm + if_t * activation(xf_c + T.dot(hf_tm, self.Uf_c))  # internal memory
        of_t = inner_activation(xf_o + T.dot(hf_tm, self.Uf_o))
        hf_t = of_t * activation(cf_t)  # actual hidden state
        return cf_t, hf_t

    [_, h_f], _ = theano.scan(
        fn=recurrence_f,
        sequences=input_f,
        outputs_info=[T.alloc(self.cf, input_f.shape[1], hidden_dim),
                      T.alloc(self.hf, input_f.shape[1], hidden_dim)]
    )

    # backward lstm
    def recurrence_b(xb_t, cb_tm, hb_tm):
        xb_i = T.dot(self.emb[xb_t], self.Wb_i) + self.bb_i
        xb_f = T.dot(self.emb[xb_t], self.Wb_f) + self.bb_f
        xb_c = T.dot(self.emb[xb_t], self.Wb_c) + self.bb_c
        xb_o = T.dot(self.emb[xb_t], self.Wb_o) + self.bb_o

        ib_t = inner_activation(xb_i + T.dot(hb_tm, self.Ub_i))
        fb_t = inner_activation(xb_f + T.dot(hb_tm, self.Ub_f))
        cb_t = fb_t * cb_tm + ib_t * activation(xb_c + T.dot(hb_tm, self.Ub_c))  # internal memory
        ob_t = inner_activation(xb_o + T.dot(hb_tm, self.Ub_o))
        hb_t = ob_t * activation(cb_t)  # actual hidden state
        return cb_t, hb_t

    [_, h_b], _ = theano.scan(
        fn=recurrence_b,
        sequences=input_b,
        outputs_info=[T.alloc(self.cb, input_b.shape[1], hidden_dim),
                      T.alloc(self.hb, input_b.shape[1], hidden_dim)]
    )

    if merge_mode == 'sum':
        h = h_f[-1] + h_b[-1]
    elif merge_mode == 'multiply':
        h = h_f[-1] * h_b[-1]
    elif merge_mode == 'average':
        h = (h_f[-1] + h_b[-1]) / 2
    elif merge_mode == 'concat':
        h = T.concatenate([h_f[-1], h_b[-1]], axis=1)  # last states, not the full sequences
    else:
        print('Supported "merge_mode" for forward + backward lstm are: "sum", "multiply", "average" & "concat".')
        raise NotImplementedError

    # Only the merged hidden state at the last time-step is passed to the decoder;
    # the prediction at the last time-step is always 'eos' and is therefore ignored.
    self.h = h
def __init__(self, input, emb_mat, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None):
    input = input.dimshuffle(1, 0)

    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX), name='emb', borrow=True)
        # input gate
        self.W_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_i', borrow=True)
        self.U_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_i', borrow=True)
        self.b_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_i', borrow=True)
        # forget gate
        self.W_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_f', borrow=True)
        self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_f', borrow=True)
        self.b_f = theano.shared(value=get(identifier='one', shape=(hidden_dim,)), name='b_f', borrow=True)
        # memory
        self.W_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_c', borrow=True)
        self.U_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_c', borrow=True)
        self.b_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_c', borrow=True)
        # output gate
        self.W_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_o', borrow=True)
        self.U_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_o', borrow=True)
        self.b_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_o', borrow=True)
    else:
        self.emb, self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, \
            self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o = params

    self.c0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='c0', borrow=True)
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)

    self.params = [self.emb,
                   self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f,
                   self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o]

    def recurrence(x_t, c_tm_prev, h_tm_prev):
        x_i = T.dot(self.emb[x_t], self.W_i) + self.b_i
        x_f = T.dot(self.emb[x_t], self.W_f) + self.b_f
        x_c = T.dot(self.emb[x_t], self.W_c) + self.b_c
        x_o = T.dot(self.emb[x_t], self.W_o) + self.b_o

        i_t = inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
        f_t = inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
        c_t = f_t * c_tm_prev + i_t * activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
        o_t = inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
        h_t = o_t * activation(c_t)  # actual hidden state
        return c_t, h_t

    [_, h], _ = theano.scan(
        fn=recurrence,
        sequences=input,
        outputs_info=[T.alloc(self.c0, input.shape[1], hidden_dim),
                      T.alloc(self.h0, input.shape[1], hidden_dim)]
    )

    # Only the hidden state at the last time-step is passed to the decoder;
    # the prediction at the last time-step is always 'eos' and is therefore ignored.
    self.h = h[-1]
def __init__(self, input, input_dim, hidden_dim, output_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None):
    self.input = input
    self.hidden_dim = hidden_dim
    self.activation = activation
    self.inner_activation = inner_activation

    if params is None:
        # update gate
        self.W_z = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W_z', borrow=True)
        self.U_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_z', borrow=True)
        self.b_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_z', borrow=True)
        # reset gate
        self.W_r = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W_r', borrow=True)
        self.U_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_r', borrow=True)
        self.b_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_r', borrow=True)
        # weights pertaining to input, hidden & output neurons (externally)
        self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)), name='W', borrow=True)
        self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U', borrow=True)
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)), name='V', borrow=True)
        self.b_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_h', borrow=True)
        self.b_y = theano.shared(value=get(identifier='zero', shape=(output_dim,)), name='b_y', borrow=True)
    else:
        self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r, \
            self.W, self.U, self.V, self.b_h, self.b_y = params

    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)

    self.params = [self.W_z, self.U_z, self.b_z,
                   self.W_r, self.U_r, self.b_r,
                   self.W, self.U, self.V, self.b_h, self.b_y]

    def recurrence(x_t, h_tm_prev):
        x_z = T.dot(x_t, self.W_z) + self.b_z
        x_r = T.dot(x_t, self.W_r) + self.b_r
        x_h = T.dot(x_t, self.W) + self.b_h

        z_t = inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
        r_t = inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
        hh_t = activation(x_h + T.dot(r_t * h_tm_prev, self.U))
        h_t = (T.ones_like(z_t) - z_t) * hh_t + z_t * h_tm_prev
        y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
        return h_t, y_t[0]

    [self.h_t, self.y_t], _ = theano.scan(
        recurrence,
        sequences=self.input,
        outputs_info=[self.h0, None]
    )
    self.y = T.argmax(self.y_t, axis=1)
def __init__(self, input, vocab_size, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None, merge_mode='concat'):
    input_f = input.dimshuffle(1, 0)
    input_b = input[:, ::-1].dimshuffle(1, 0)  # reverse along the time axis

    if params is None:
        self.emb = theano.shared(value=get(identifier=init, shape=(vocab_size, emb_dim), scale=np.sqrt(3)), name='emb', borrow=True)

        # Forward GRU
        # update gate
        self.Wf_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_z', borrow=True)
        self.Uf_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_z', borrow=True)
        self.bf_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_z', borrow=True)
        # reset gate
        self.Wf_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_r', borrow=True)
        self.Uf_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_r', borrow=True)
        self.bf_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_r', borrow=True)
        # hidden state
        self.Wf_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_h', borrow=True)
        self.Uf_h = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_h', borrow=True)
        self.bf_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_h', borrow=True)

        # Backward GRU
        # update gate
        self.Wb_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_z', borrow=True)
        self.Ub_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_z', borrow=True)
        self.bb_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_z', borrow=True)
        # reset gate
        self.Wb_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_r', borrow=True)
        self.Ub_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_r', borrow=True)
        self.bb_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_r', borrow=True)
        # hidden state
        self.Wb_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_h', borrow=True)
        self.Ub_h = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_h', borrow=True)
        self.bb_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_h', borrow=True)
    else:
        self.emb, self.Wf_z, self.Uf_z, self.bf_z, self.Wf_r, self.Uf_r, self.bf_r, self.Wf_h, self.Uf_h, \
            self.bf_h, self.Wb_z, self.Ub_z, self.bb_z, self.Wb_r, self.Ub_r, self.bb_r, self.Wb_h, \
            self.Ub_h, self.bb_h = params

    self.hf = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hf', borrow=True)
    self.hb = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hb', borrow=True)

    self.params = [self.emb,
                   self.Wf_z, self.Uf_z, self.bf_z, self.Wf_r, self.Uf_r, self.bf_r,
                   self.Wf_h, self.Uf_h, self.bf_h,
                   self.Wb_z, self.Ub_z, self.bb_z, self.Wb_r, self.Ub_r, self.bb_r,
                   self.Wb_h, self.Ub_h, self.bb_h]

    # forward gru
    def recurrence_f(xf_t, hf_tm):
        xf_z = T.dot(self.emb[xf_t], self.Wf_z) + self.bf_z
        xf_r = T.dot(self.emb[xf_t], self.Wf_r) + self.bf_r
        xf_h = T.dot(self.emb[xf_t], self.Wf_h) + self.bf_h

        zf_t = inner_activation(xf_z + T.dot(hf_tm, self.Uf_z))
        rf_t = inner_activation(xf_r + T.dot(hf_tm, self.Uf_r))
        hhf_t = activation(xf_h + T.dot(rf_t * hf_tm, self.Uf_h))
        hf_t = (T.ones_like(zf_t) - zf_t) * hhf_t + zf_t * hf_tm
        return hf_t

    self.h_f, _ = theano.scan(
        recurrence_f,
        sequences=input_f,
        outputs_info=T.alloc(self.hf, input_f.shape[1], hidden_dim)
    )

    # backward gru
    def recurrence_b(xb_t, hb_tm):
        xb_z = T.dot(self.emb[xb_t], self.Wb_z) + self.bb_z
        xb_r = T.dot(self.emb[xb_t], self.Wb_r) + self.bb_r
        xb_h = T.dot(self.emb[xb_t], self.Wb_h) + self.bb_h

        zb_t = inner_activation(xb_z + T.dot(hb_tm, self.Ub_z))
        rb_t = inner_activation(xb_r + T.dot(hb_tm, self.Ub_r))
        hhb_t = activation(xb_h + T.dot(rb_t * hb_tm, self.Ub_h))
        hb_t = (T.ones_like(zb_t) - zb_t) * hhb_t + zb_t * hb_tm
        return hb_t

    self.h_b, _ = theano.scan(
        recurrence_b,
        sequences=input_b,
        outputs_info=T.alloc(self.hb, input_b.shape[1], hidden_dim)
    )

    if merge_mode == 'sum':
        self.y = self.h_f[-1] + self.h_b[-1]
    elif merge_mode == 'multiply':
        self.y = self.h_f[-1] * self.h_b[-1]
    elif merge_mode == 'average':
        self.y = (self.h_f[-1] + self.h_b[-1]) / 2
    elif merge_mode == 'concat':
        self.y = T.concatenate([self.h_f[-1], self.h_b[-1]], axis=1)
    else:
        print('Supported "merge_mode" for forward + backward gru are: "sum", "multiply", "average" & "concat".')
        raise NotImplementedError
def __init__(self, input, emb_mat, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None, merge_mode='sum'):
    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX), name='emb', borrow=True)

        # Forward GRU
        # update gate
        self.Wf_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_z', borrow=True)
        self.Uf_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_z', borrow=True)
        self.bf_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_z', borrow=True)
        # reset gate
        self.Wf_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_r', borrow=True)
        self.Uf_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_r', borrow=True)
        self.bf_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_r', borrow=True)
        # hidden state
        self.Wf_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wf_h', borrow=True)
        self.Uf_h = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Uf_h', borrow=True)
        self.bf_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bf_h', borrow=True)

        # Backward GRU
        # update gate
        self.Wb_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_z', borrow=True)
        self.Ub_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_z', borrow=True)
        self.bb_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_z', borrow=True)
        # reset gate
        self.Wb_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_r', borrow=True)
        self.Ub_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_r', borrow=True)
        self.bb_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_r', borrow=True)
        # hidden state
        self.Wb_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='Wb_h', borrow=True)
        self.Ub_h = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='Ub_h', borrow=True)
        self.bb_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='bb_h', borrow=True)
    else:
        self.emb, self.Wf_z, self.Uf_z, self.bf_z, self.Wf_r, self.Uf_r, self.bf_r, \
            self.Wf_h, self.Uf_h, self.bf_h, self.Wb_z, self.Ub_z, self.bb_z, self.Wb_r, \
            self.Ub_r, self.bb_r, self.Wb_h, self.Ub_h, self.bb_h = params

    self.hf = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hf', borrow=True)
    self.hb = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='hb', borrow=True)

    self.params = [self.emb,
                   self.Wf_z, self.Uf_z, self.bf_z, self.Wf_r, self.Uf_r, self.bf_r,
                   self.Wf_h, self.Uf_h, self.bf_h,
                   self.Wb_z, self.Ub_z, self.bb_z, self.Wb_r, self.Ub_r, self.bb_r,
                   self.Wb_h, self.Ub_h, self.bb_h]

    input_f = input.dimshuffle(1, 0)
    input_b = input[:, ::-1].dimshuffle(1, 0)  # reverse along the time axis

    # forward gru
    def recurrence_f(xf_t, hf_tm):
        xf_z = T.dot(self.emb[xf_t], self.Wf_z) + self.bf_z
        xf_r = T.dot(self.emb[xf_t], self.Wf_r) + self.bf_r
        xf_h = T.dot(self.emb[xf_t], self.Wf_h) + self.bf_h

        zf_t = inner_activation(xf_z + T.dot(hf_tm, self.Uf_z))
        rf_t = inner_activation(xf_r + T.dot(hf_tm, self.Uf_r))
        hhf_t = activation(xf_h + T.dot(rf_t * hf_tm, self.Uf_h))
        hf_t = (T.ones_like(zf_t) - zf_t) * hhf_t + zf_t * hf_tm
        return hf_t

    h_f, _ = theano.scan(
        fn=recurrence_f,
        sequences=input_f,
        outputs_info=T.alloc(self.hf, input_f.shape[1], hidden_dim)
    )

    # backward gru
    def recurrence_b(xb_t, hb_tm):
        xb_z = T.dot(self.emb[xb_t], self.Wb_z) + self.bb_z
        xb_r = T.dot(self.emb[xb_t], self.Wb_r) + self.bb_r
        xb_h = T.dot(self.emb[xb_t], self.Wb_h) + self.bb_h

        zb_t = inner_activation(xb_z + T.dot(hb_tm, self.Ub_z))
        rb_t = inner_activation(xb_r + T.dot(hb_tm, self.Ub_r))
        hhb_t = activation(xb_h + T.dot(rb_t * hb_tm, self.Ub_h))
        hb_t = (T.ones_like(zb_t) - zb_t) * hhb_t + zb_t * hb_tm
        return hb_t

    h_b, _ = theano.scan(
        fn=recurrence_b,
        sequences=input_b,
        outputs_info=T.alloc(self.hb, input_b.shape[1], hidden_dim)
    )

    if merge_mode == 'sum':
        h = h_f[-1] + h_b[-1]
    elif merge_mode == 'multiply':
        h = h_f[-1] * h_b[-1]
    elif merge_mode == 'average':
        h = (h_f[-1] + h_b[-1]) / 2
    elif merge_mode == 'concat':
        h = T.concatenate([h_f[-1], h_b[-1]], axis=1)  # last states, not the full sequences
    else:
        print('Supported "merge_mode" for forward + backward gru are: "sum", "multiply", "average" & "concat".')
        raise NotImplementedError

    # Only the merged hidden state at the last time-step is passed to the decoder;
    # the prediction at the last time-step is always 'eos' and is therefore ignored.
    self.h = h
def __init__(self, input, vocab_size, emb_dim, hidden_dim, n_layers=2, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None):
    input = input.dimshuffle(1, 0)
    assert n_layers == 2  # the current implementation supports exactly two stacked layers

    if params is None:
        self.emb = theano.shared(value=get(identifier=init, shape=(vocab_size, emb_dim), scale=np.sqrt(3)), name='emb', borrow=True)

        # Layer 1
        # update gate
        self.W_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_z', borrow=True)
        self.U_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_z', borrow=True)
        self.b_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_z', borrow=True)
        # reset gate
        self.W_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_r', borrow=True)
        self.U_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_r', borrow=True)
        self.b_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_r', borrow=True)
        # hidden state
        self.W_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='W_h', borrow=True)
        self.U_h = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_h', borrow=True)
        self.b_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_h', borrow=True)

        # Layer 2
        # update gate
        self.W_z_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_z_1', borrow=True)
        self.U_z_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_z_1', borrow=True)
        self.b_z_1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_z_1', borrow=True)
        # reset gate
        self.W_r_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_r_1', borrow=True)
        self.U_r_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_r_1', borrow=True)
        self.b_r_1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_r_1', borrow=True)
        # hidden state
        self.W_h_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)), name='W_h_1', borrow=True)
        self.U_h_1 = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)), name='U_h_1', borrow=True)
        self.b_h_1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='b_h_1', borrow=True)

        # Skip-connections from the input (embedding) to layer 2
        self.s_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='s_z', borrow=True)
        self.s_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='s_r', borrow=True)
        self.s_h = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)), name='s_h', borrow=True)
    else:
        self.emb, self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r, self.W_h, self.U_h, \
            self.b_h, self.W_z_1, self.U_z_1, self.b_z_1, self.W_r_1, self.U_r_1, self.b_r_1, \
            self.W_h_1, self.U_h_1, self.b_h_1, self.s_z, self.s_r, self.s_h = params

    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h0', borrow=True)
    self.h1 = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)), name='h1', borrow=True)

    self.params = [self.emb,
                   self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r,
                   self.W_h, self.U_h, self.b_h,
                   self.W_z_1, self.U_z_1, self.b_z_1, self.W_r_1, self.U_r_1, self.b_r_1,
                   self.W_h_1, self.U_h_1, self.b_h_1,
                   self.s_z, self.s_r, self.s_h]

    def recurrence(x_t, h_t1_prev, h_t2_prev):
        # Layer 1
        x_z_1 = T.dot(self.emb[x_t], self.W_z) + self.b_z
        x_r_1 = T.dot(self.emb[x_t], self.W_r) + self.b_r
        x_h_1 = T.dot(self.emb[x_t], self.W_h) + self.b_h

        z_t_1 = inner_activation(x_z_1 + T.dot(h_t1_prev, self.U_z))
        r_t_1 = inner_activation(x_r_1 + T.dot(h_t1_prev, self.U_r))
        hh_t_1 = activation(x_h_1 + T.dot(r_t_1 * h_t1_prev, self.U_h))
        h_t_1 = (T.ones_like(z_t_1) - z_t_1) * hh_t_1 + z_t_1 * h_t1_prev

        # Layer 2; 's_*' are the skip-connections from the input (embedding)
        x_z_2 = T.dot(h_t_1, self.W_z_1) + T.dot(self.emb[x_t], self.s_z) + self.b_z_1
        x_r_2 = T.dot(h_t_1, self.W_r_1) + T.dot(self.emb[x_t], self.s_r) + self.b_r_1
        x_h_2 = T.dot(h_t_1, self.W_h_1) + T.dot(self.emb[x_t], self.s_h) + self.b_h_1

        z_t_2 = inner_activation(x_z_2 + T.dot(h_t2_prev, self.U_z_1))
        r_t_2 = inner_activation(x_r_2 + T.dot(h_t2_prev, self.U_r_1))
        hh_t_2 = activation(x_h_2 + T.dot(r_t_2 * h_t2_prev, self.U_h_1))
        h_t_2 = (T.ones_like(z_t_2) - z_t_2) * hh_t_2 + z_t_2 * h_t2_prev

        return h_t_1, h_t_2

    [h_1, h_2], _ = theano.scan(
        recurrence,
        sequences=input,
        outputs_info=[T.alloc(self.h0, input.shape[1], hidden_dim),
                      T.alloc(self.h1, input.shape[1], hidden_dim)]
    )

    # since every hidden layer is connected to the output
    self.y = T.concatenate([h_1[-1], h_2[-1]], axis=1)
def __init__(self, input, input_dim, hidden_dim, output_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, mini_batch=False, params=None):
    self.inner_activation = inner_activation
    self.activation = activation
    self.mini_batch = mini_batch
    if mini_batch:
        input = input.dimshuffle(1, 0, 2)
    if params is None:
        # input gate
        self.W_i = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                 name='W_i', borrow=True)
        self.U_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_i', borrow=True)
        self.b_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_i', borrow=True)
        # forget gate
        self.W_f = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                 name='W_f', borrow=True)
        self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_f', borrow=True)
        self.b_f = theano.shared(value=get(identifier='one', shape=(hidden_dim, )),
                                 name='b_f', borrow=True)
        # memory
        self.W_c = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                 name='W_c', borrow=True)
        self.U_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_c', borrow=True)
        self.b_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_c', borrow=True)
        # output gate
        self.W_o = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                 name='W_o', borrow=True)
        self.U_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_o', borrow=True)
        self.b_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_o', borrow=True)
        # weights pertaining to output neuron
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                               name='V', borrow=True)
        self.b_y = theano.shared(value=get(identifier='zero', shape=(output_dim,)),
                                 name='b_y', borrow=True)
    else:
        self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, \
            self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o, \
            self.V, self.b_y = params
    self.c0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='c0', borrow=True)
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='h0', borrow=True)
    self.params = [self.W_i, self.U_i, self.b_i,
                   self.W_f, self.U_f, self.b_f,
                   self.W_c, self.U_c, self.b_c,
                   self.W_o, self.U_o, self.b_o,
                   self.V, self.b_y]

    if mini_batch:
        def recurrence(x_t, c_tm_prev, h_tm_prev):
            x_i = T.dot(x_t, self.W_i) + self.b_i
            x_f = T.dot(x_t, self.W_f) + self.b_f
            x_c = T.dot(x_t, self.W_c) + self.b_c
            x_o = T.dot(x_t, self.W_o) + self.b_o

            i_t = inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
            f_t = inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
            c_t = f_t * c_tm_prev + i_t * activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
            o_t = inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
            h_t = o_t * activation(c_t)  # actual hidden state
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
            return c_t, h_t, y_t

        [_, self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[T.alloc(self.c0, input.shape[1], hidden_dim),
                          T.alloc(self.h0, input.shape[1], hidden_dim),
                          None]
        )
        self.h_t = self.h_t.dimshuffle(1, 0, 2)
        self.y_t = self.y_t.dimshuffle(1, 0, 2)
        self.y = T.argmax(self.y_t, axis=2)
    else:
        def recurrence(x_t, c_tm_prev, h_tm_prev):
            x_i = T.dot(x_t, self.W_i) + self.b_i
            x_f = T.dot(x_t, self.W_f) + self.b_f
            x_c = T.dot(x_t, self.W_c) + self.b_c
            x_o = T.dot(x_t, self.W_o) + self.b_o

            i_t = inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
            f_t = inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
            c_t = f_t * c_tm_prev + i_t * activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
            o_t = inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
            h_t = o_t * activation(c_t)  # actual hidden state
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
            return c_t, h_t, y_t[0]

        [_, self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[self.c0, self.h0, None]
        )
        self.y = T.argmax(self.y_t, axis=1)
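# ---------------------------------------------------------------------------
# Illustration (not part of the model): the same LSTM step as `recurrence`
# above, written in plain NumPy with sigmoid/tanh activations; all names are
# local to this example.
# ---------------------------------------------------------------------------
def _np_lstm_step(x, c_prev, h_prev,
                  W_i, U_i, b_i, W_f, U_f, b_f,
                  W_c, U_c, b_c, W_o, U_o, b_o):
    import numpy as np
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    i = sigmoid(np.dot(x, W_i) + np.dot(h_prev, U_i) + b_i)   # input gate
    f = sigmoid(np.dot(x, W_f) + np.dot(h_prev, U_f) + b_f)   # forget gate
    c = f * c_prev + i * np.tanh(np.dot(x, W_c) + np.dot(h_prev, U_c) + b_c)  # memory cell
    o = sigmoid(np.dot(x, W_o) + np.dot(h_prev, U_o) + b_o)   # output gate
    h = o * np.tanh(c)                                        # hidden state
    return c, h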
def __init__(self, input, input_dim, hidden_dim, output_dim, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, mini_batch=False, params=None):
    self.activation = activation
    self.inner_activation = inner_activation
    self.mini_batch = mini_batch
    if mini_batch:
        input = input.dimshuffle(1, 0, 2)
    if params is None:
        # update gate
        self.W_z = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                 name='W_z', borrow=True)
        self.U_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_z', borrow=True)
        self.b_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_z', borrow=True)
        # reset gate
        self.W_r = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                 name='W_r', borrow=True)
        self.U_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_r', borrow=True)
        self.b_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_r', borrow=True)
        # weights pertaining to input, hidden & output neurons (externally)
        self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                               name='W', borrow=True)
        self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                               name='U', borrow=True)
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                               name='V', borrow=True)
        self.b_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_h', borrow=True)
        self.b_y = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                 name='b_y', borrow=True)
    else:
        self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r, \
            self.W, self.U, self.V, self.b_h, self.b_y = params
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='h0', borrow=True)
    self.params = [self.W_z, self.U_z, self.b_z,
                   self.W_r, self.U_r, self.b_r,
                   self.W, self.U, self.V, self.b_h, self.b_y]

    if mini_batch:
        def recurrence(x_t, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W) + self.b_h
            z_t = inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
            hh_t = activation(x_h + T.dot(r_t * h_tm_prev, self.U))
            h_t = (T.ones_like(z_t) - z_t) * hh_t + z_t * h_tm_prev
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
            return h_t, y_t

        [self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[T.alloc(self.h0, input.shape[1], hidden_dim), None]
        )
        self.h_t = self.h_t.dimshuffle(1, 0, 2)
        self.y_t = self.y_t.dimshuffle(1, 0, 2)
        self.y = T.argmax(self.y_t, axis=2)
    else:
        def recurrence(x_t, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W) + self.b_h
            z_t = inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
            hh_t = activation(x_h + T.dot(r_t * h_tm_prev, self.U))
            h_t = (T.ones_like(z_t) - z_t) * hh_t + z_t * h_tm_prev
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
            return h_t, y_t[0]

        [self.h_t, self.y_t], _ = theano.scan(
            recurrence,
            sequences=input,
            outputs_info=[self.h0, None]
        )
        self.y = T.argmax(self.y_t, axis=1)
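# Note on `T.alloc(self.h0, input.shape[1], hidden_dim)` in the mini-batch
# branches above: `scan` needs one initial hidden state per sequence, so the
# shared (hidden_dim,) vector h0 is broadcast to a (batch, hidden_dim) matrix
# (after the dimshuffle, input.shape[1] is the batch size). A rough sketch of
# the idea; the names here are local to this example:
#
#   h0 = theano.shared(np.zeros(4, dtype=theano.config.floatX))
#   batched_h0 = T.alloc(h0, 3, 4)   # shape (3, 4): h0 repeated per row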
def __init__(self, enc_h, mask, emb_mat, vocab_size, emb_dim, hidden_dim,
             eos_token, batch_size, max_len, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None, max_response=100):
    self.enc_h = enc_h
    self.mask = mask
    self.eos_token = eos_token
    self.batch_size = batch_size
    self.inner_activation = inner_activation
    self.activation = activation
    self.max_response = max_response
    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX),
                                 name='emb', borrow=True)
        # update gate
        self.W_z = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_z', borrow=True)
        self.U_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_z', borrow=True)
        self.b_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_z', borrow=True)
        # reset gate
        self.W_r = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_r', borrow=True)
        self.U_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_r', borrow=True)
        self.b_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_r', borrow=True)
        # weights pertaining to input, hidden & output neurons (externally)
        self.W = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                               name='W', borrow=True)
        self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                               name='U', borrow=True)
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, vocab_size)),
                               name='V', borrow=True)
        self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                name='bh', borrow=True)
        self.by = theano.shared(value=get(identifier='zero', shape=(vocab_size, )),
                                name='by', borrow=True)
        # to weight 'context' from encoder
        self.c_h = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)),
                                 name='c_h', borrow=True)
        self.c_y = theano.shared(value=get(identifier=init, shape=(hidden_dim, vocab_size)),
                                 name='c_y', borrow=True)
        # to weight 'y_t-1' for decoder's 'y'
        self.y_t1 = theano.shared(value=get(identifier=init, shape=(emb_dim, vocab_size)),
                                  name='y_t1', borrow=True)
    else:
        self.emb, self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r, \
            self.W, self.U, self.V, self.bh, self.by, self.c_h, self.c_y, \
            self.y_t1 = params
    self.params = [self.emb, self.W_z, self.U_z, self.b_z,
                   self.W_r, self.U_r, self.b_r,
                   self.W, self.U, self.V, self.bh, self.by,
                   self.c_h, self.c_y, self.y_t1]
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='h0', borrow=True)
    # y(t-1) from the encoder will always be the 'eos' token
    self.y0 = theano.shared(value=np.asarray(np.full((batch_size, ), self.eos_token),
                                             dtype='int32'),
                            name='y0', borrow=True)

    # remember: for the decoder, both h_t and y_t are conditioned on 'enc_h' & 'y_t-1'
    def recurrence(msk, h_tm_prev, y_tm_prev):
        x_z = T.dot(self.emb[y_tm_prev], self.W_z) + self.b_z
        x_r = T.dot(self.emb[y_tm_prev], self.W_r) + self.b_r
        x_h = T.dot(self.emb[y_tm_prev], self.W) + T.dot(self.enc_h, self.c_h) + self.bh
        z_t = self.inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
        r_t = self.inner_activation(x_r + T.dot(h_tm_prev, self.U_r))
        hh_t = self.activation(x_h + T.dot(r_t * h_tm_prev, self.U))
        h_t = (T.ones_like(z_t) - z_t) * hh_t + z_t * h_tm_prev

        # distribution over the vocabulary; needed to back-propagate errors
        y_d_t = T.dot(h_t, self.V) + T.dot(self.enc_h, self.c_y) + \
            T.dot(self.emb[y_tm_prev], self.y_t1) + self.by
        # ignore padded tokens
        y_d_t = T.batched_dot(y_d_t, msk)
        y_d = T.clip(T.nnet.softmax(y_d_t), 0.0001, 0.9999)
        y_t = T.argmax(y_d, axis=1)
        return h_t, y_d, T.cast(y_t.flatten(), 'int32')

    [_, y_dist, y], _ = theano.scan(
        fn=recurrence,
        sequences=mask.dimshuffle(1, 0),  # ugly, but we have to go till the end
        outputs_info=[T.alloc(self.h0, self.enc_h.shape[0], hidden_dim),
                      None,
                      T.alloc(self.y0, self.enc_h.shape[0])],
        n_steps=max_len)
    self.y = y.dimshuffle(1, 0)
    self.y_dist = y_dist.dimshuffle(1, 0, 2)
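# Hypothetical usage sketch (the class names `Encoder` and `Decoder` are
# placeholders for this example, not necessarily the names used in this
# repository): the encoder's final hidden state is fed to the decoder as the
# fixed context 'enc_h', and the decoder re-feeds its own prediction as
# y(t-1) at every step.
#
#   x, mask = T.imatrix('x'), T.matrix('mask')
#   encoder = Encoder(input=x, emb_mat=E, emb_dim=64, hidden_dim=128)
#   decoder = Decoder(enc_h=encoder.h, mask=mask, emb_mat=E, vocab_size=V,
#                     emb_dim=64, hidden_dim=128, eos_token=0,
#                     batch_size=32, max_len=20)
#   # decoder.y_dist -> (batch, max_len, vocab); decoder.y -> predicted ids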
def __init__(self, input, emb_mat, emb_dim, hidden_dim, init='uniform',
             inner_init='orthonormal', activation=T.tanh, params=None,
             merge_mode='sum'):
    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX),
                                 name='emb', borrow=True)
        # weights for forward rnn
        self.W_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_f', borrow=True)
        self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_f', borrow=True)
        self.b_f = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)),
                                 name='b_f', borrow=True)
        # weights for backward rnn
        self.W_b = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_b', borrow=True)
        self.U_b = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_b', borrow=True)
        self.b_b = theano.shared(value=get(identifier='zero', shape=(hidden_dim,)),
                                 name='b_b', borrow=True)
    else:
        self.emb, self.W_f, self.U_f, self.b_f, self.W_b, self.U_b, self.b_b = params
    self.hf = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='hf', borrow=True)
    self.hb = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='hb', borrow=True)
    self.params = [self.emb, self.W_f, self.U_f, self.b_f,
                   self.W_b, self.U_b, self.b_b]

    input_f = input.dimshuffle(1, 0)
    # reverse each sequence along the *time* axis (axis 1), then go time-major;
    # 'input[::-1]' would have reversed the batch order instead
    input_b = input[:, ::-1].dimshuffle(1, 0)

    # forward rnn
    def recurrence_f(xf_t, hf_tm):
        hf_t = activation(T.dot(self.emb[xf_t], self.W_f) + T.dot(hf_tm, self.U_f) + self.b_f)
        return hf_t

    h_f, _ = theano.scan(
        fn=recurrence_f,
        sequences=input_f,
        outputs_info=T.alloc(self.hf, input_f.shape[1], hidden_dim)
    )

    # backward rnn
    def recurrence_b(xb_t, hb_tm):
        hb_t = activation(T.dot(self.emb[xb_t], self.W_b) + T.dot(hb_tm, self.U_b) + self.b_b)
        return hb_t

    h_b, _ = theano.scan(
        fn=recurrence_b,
        sequences=input_b,
        outputs_info=T.alloc(self.hb, input_b.shape[1], hidden_dim)
    )

    # merge the final states of the two directions
    if merge_mode == 'sum':
        h = h_f[-1] + h_b[-1]
    elif merge_mode == 'multiply':
        h = h_f[-1] * h_b[-1]
    elif merge_mode == 'average':
        h = (h_f[-1] + h_b[-1]) / 2
    elif merge_mode == 'concat':
        # concatenate along the feature axis, consistent with the other modes
        h = T.concatenate([h_f[-1], h_b[-1]], axis=1)
    else:
        raise NotImplementedError('Supported "merge_mode" for forward + backward '
                                  'rnn are: "sum", "multiply", "average" & "concat".')

    # the hidden state at the last time-step is passed to the decoder;
    # the prediction at the last time-step will always be 'eos' and is therefore ignored
    self.h = h
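# Why `input[:, ::-1]` above: with a (batch, time) matrix of token ids,
# `x[::-1]` reverses the batch order, while `x[:, ::-1]` reverses each
# sequence in time, which is what the backward rnn needs. Quick check
# with plain NumPy:
#
#   x = np.array([[1, 2, 3],
#                 [4, 5, 6]])
#   x[::-1]     # [[4, 5, 6], [1, 2, 3]]  -> batch reversed (wrong here)
#   x[:, ::-1]  # [[3, 2, 1], [6, 5, 4]]  -> time reversed (intended)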
def __init__(self, enc_h, mask, emb_mat, vocab_size, emb_dim, hidden_dim,
             eos_token, batch_size, max_len, init='uniform',
             inner_init='orthonormal', inner_activation=T.nnet.hard_sigmoid,
             activation=T.tanh, params=None, max_response=100):
    self.enc_h = enc_h
    self.mask = mask
    self.eos_token = eos_token
    self.batch_size = batch_size
    self.inner_activation = inner_activation
    self.activation = activation
    self.max_response = max_response
    if params is None:
        self.emb = theano.shared(value=np.asarray(emb_mat, dtype=theano.config.floatX),
                                 name='emb', borrow=True)
        # input gate
        self.W_i = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_i', borrow=True)
        self.U_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_i', borrow=True)
        self.b_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_i', borrow=True)
        # forget gate
        self.W_f = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_f', borrow=True)
        self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_f', borrow=True)
        self.b_f = theano.shared(value=get(identifier='one', shape=(hidden_dim, )),
                                 name='b_f', borrow=True)
        # memory
        self.W_c = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_c', borrow=True)
        self.U_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_c', borrow=True)
        self.b_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_c', borrow=True)
        # output gate
        self.W_o = theano.shared(value=get(identifier=init, shape=(emb_dim, hidden_dim)),
                                 name='W_o', borrow=True)
        self.U_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                 name='U_o', borrow=True)
        self.b_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                 name='b_o', borrow=True)
        # weights pertaining to output neuron
        self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, vocab_size)),
                               name='V', borrow=True)
        self.by = theano.shared(value=get(identifier='zero', shape=(vocab_size, )),
                                name='by', borrow=True)
        # to weight 'context' from encoder
        self.c_h = theano.shared(value=get(identifier=init, shape=(hidden_dim, hidden_dim)),
                                 name='c_h', borrow=True)
        self.c_y = theano.shared(value=get(identifier=init, shape=(hidden_dim, vocab_size)),
                                 name='c_y', borrow=True)
        # to weight 'y_t-1' for decoder's 'y'
        self.y_t1 = theano.shared(value=get(identifier=init, shape=(emb_dim, vocab_size)),
                                  name='y_t1', borrow=True)
    else:
        self.emb, self.W_i, self.U_i, self.b_i, self.W_f, self.U_f, self.b_f, \
            self.W_c, self.U_c, self.b_c, self.W_o, self.U_o, self.b_o, \
            self.V, self.by, self.c_h, self.c_y, self.y_t1 = params
    self.params = [self.emb, self.W_i, self.U_i, self.b_i,
                   self.W_f, self.U_f, self.b_f,
                   self.W_c, self.U_c, self.b_c,
                   self.W_o, self.U_o, self.b_o,
                   self.V, self.by, self.c_h, self.c_y, self.y_t1]
    self.c0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='c0', borrow=True)
    self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                            name='h0', borrow=True)
    # y(t-1) from the encoder will always be the 'eos' token
    self.y0 = theano.shared(value=np.asarray(np.full((batch_size, ), self.eos_token),
                                             dtype='int32'),
                            name='y0', borrow=True)

    # remember: for the decoder, both h_t and y_t are conditioned on 'enc_h' & 'y_t-1'
    def recurrence(msk, c_tm_prev, h_tm_prev, y_tm_prev):
        x_i = T.dot(self.emb[y_tm_prev], self.W_i) + self.b_i
        x_f = T.dot(self.emb[y_tm_prev], self.W_f) + self.b_f
        x_c = T.dot(self.emb[y_tm_prev], self.W_c) + self.b_c
        x_o = T.dot(self.emb[y_tm_prev], self.W_o) + T.dot(self.enc_h, self.c_h) + self.b_o

        i_t = self.inner_activation(x_i + T.dot(h_tm_prev, self.U_i))
        f_t = self.inner_activation(x_f + T.dot(h_tm_prev, self.U_f))
        c_t = f_t * c_tm_prev + i_t * self.activation(x_c + T.dot(h_tm_prev, self.U_c))  # internal memory
        o_t = self.inner_activation(x_o + T.dot(h_tm_prev, self.U_o))
        h_t = o_t * self.activation(c_t)  # actual hidden state

        # distribution over the vocabulary; needed to back-propagate errors
        y_d_t = T.dot(h_t, self.V) + T.dot(self.enc_h, self.c_y) + \
            T.dot(self.emb[y_tm_prev], self.y_t1) + self.by
        # ignore padded tokens
        y_d_t = T.batched_dot(y_d_t, msk)
        y_d = T.clip(T.nnet.softmax(y_d_t), 0.0001, 0.9999)
        y_t = T.argmax(y_d, axis=1)
        return c_t, h_t, y_d, T.cast(y_t.flatten(), 'int32')

    [_, _, y_dist, y], _ = theano.scan(
        fn=recurrence,
        sequences=mask.dimshuffle(1, 0),  # ugly, but we have to go till the end
        outputs_info=[T.alloc(self.c0, self.enc_h.shape[0], hidden_dim),
                      T.alloc(self.h0, self.enc_h.shape[0], hidden_dim),
                      None,
                      T.alloc(self.y0, self.enc_h.shape[0])],
        n_steps=max_len)
    self.y = y.dimshuffle(1, 0)
    self.y_dist = y_dist.dimshuffle(1, 0, 2)
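# Note on `T.clip(T.nnet.softmax(...), 0.0001, 0.9999)` used by both decoders:
# clipping keeps the predicted probabilities strictly inside (0, 1),
# presumably so that log(p) terms in a downstream cross-entropy loss never
# evaluate to log(0). A minimal sketch of the effect:
#
#   p = T.nnet.softmax(logits)           # may contain values ~0 or ~1
#   p_safe = T.clip(p, 0.0001, 0.9999)   # bounded away from 0 and 1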