    def __init__(self, input, input_dim, hidden_dim, output_dim, params=None):
        self.input_f = input
        self.input_b = input[::-1]

        if params is None:
            self.fwd_gru = GRU(input=self.input_f, input_dim=input_dim,
                               hidden_dim=hidden_dim, output_dim=output_dim)
            self.bwd_gru = GRU(input=self.input_b, input_dim=input_dim,
                               hidden_dim=hidden_dim, output_dim=output_dim)
            self.V_f = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                     name='V_f', borrow=True)
            self.V_b = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                     name='V_b', borrow=True)
            self.by = theano.shared(value=get('zero', shape=(output_dim, )),
                                    name='by', borrow=True)
        else:
            # Loading from persistent storage would require a change to the current GRU()
            # implementation and is therefore not supported yet. A cleaner approach would be
            # to implement BiGru() without reusing GRU().
            raise NotImplementedError

        # BiGru performs the actual classification, so GRU().V and GRU().b_y are not part of
        # the computational graph (a separate logistic-regression layer would be a cleaner way
        # to handle this). Workaround: list the trainable shared variables explicitly.
        self.params = [self.fwd_gru.W_z, self.fwd_gru.U_z, self.fwd_gru.b_z,
                       self.fwd_gru.W_r, self.fwd_gru.U_r, self.fwd_gru.b_r,
                       self.fwd_gru.W, self.fwd_gru.U, self.fwd_gru.b_h,
                       self.bwd_gru.W_z, self.bwd_gru.U_z, self.bwd_gru.b_z,
                       self.bwd_gru.W_r, self.bwd_gru.U_r, self.bwd_gru.b_r,
                       self.bwd_gru.W, self.bwd_gru.U, self.bwd_gru.b_h,
                       self.V_f, self.V_b, self.by]

        # reverse the backward pass so its hidden states align with the forward pass
        self.bwd_gru.h_t = self.bwd_gru.h_t[::-1]

        # weighted sum of the forward & backward GRU hidden representations
        self.h_t = T.dot(self.fwd_gru.h_t, self.V_f) + T.dot(self.bwd_gru.h_t, self.V_b)
        self.y_t = T.nnet.softmax(self.h_t + self.by)
        self.y = T.argmax(self.y_t, axis=1)
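    # Usage sketch (not in the original source): driving the bidirectional GRU above for
    # per-timestep classification. The class name `BiGru` is taken from the comments and
    # is an assumption; dimensions and input data are illustrative only. Kept as a comment
    # so nothing executes on import.
    #
    #   import numpy as np
    #   import theano
    #   import theano.tensor as T
    #
    #   x = T.matrix('x')                                   # (n_steps, input_dim)
    #   net = BiGru(input=x, input_dim=50, hidden_dim=32, output_dim=4)
    #   classify = theano.function(inputs=[x], outputs=net.y)
    #   labels = classify(np.random.randn(10, 50).astype(theano.config.floatX))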
    def __init__(self, input, input_dim, minibatch, hidden_dim, output_dim, params=None):
        self.in_fwd = input
        self.in_bwd = input[::-1]

        # create tuning parameters or use existing ones
        if params is None:
            self.fwd_lstm = LSTM(input=self.in_fwd, input_dim=input_dim, minibatch=minibatch,
                                 hidden_dim=hidden_dim, output_dim=output_dim)
            self.bwd_lstm = LSTM(input=self.in_bwd, input_dim=input_dim, minibatch=minibatch,
                                 hidden_dim=hidden_dim, output_dim=output_dim)

            # one output projection (V_f_k, V_b_k, by_k) per output slot
            self.V_f_0 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_f_0', borrow=True)
            self.V_f_1 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_f_1', borrow=True)
            self.V_f_2 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_f_2', borrow=True)
            self.V_f_3 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_f_3', borrow=True)
            self.V_b_0 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_b_0', borrow=True)
            self.V_b_1 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_b_1', borrow=True)
            self.V_b_2 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_b_2', borrow=True)
            self.V_b_3 = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_b_3', borrow=True)
            self.by_0 = theano.shared(value=get(identifier='zero', shape=(output_dim, )), name='by_0', borrow=True)
            self.by_1 = theano.shared(value=get(identifier='zero', shape=(output_dim, )), name='by_1', borrow=True)
            self.by_2 = theano.shared(value=get(identifier='zero', shape=(output_dim, )), name='by_2', borrow=True)
            self.by_3 = theano.shared(value=get(identifier='zero', shape=(output_dim, )), name='by_3', borrow=True)
        else:
            [fwd_lstm_Wi, fwd_lstm_Ui, fwd_lstm_bi,
             fwd_lstm_Wf, fwd_lstm_Uf, fwd_lstm_bf,
             fwd_lstm_Wo, fwd_lstm_Uo, fwd_lstm_bo,
             fwd_lstm_Wc, fwd_lstm_Uc, fwd_lstm_bc,
             bwd_lstm_Wi, bwd_lstm_Ui, bwd_lstm_bi,
             bwd_lstm_Wf, bwd_lstm_Uf, bwd_lstm_bf,
             bwd_lstm_Wo, bwd_lstm_Uo, bwd_lstm_bo,
             bwd_lstm_Wc, bwd_lstm_Uc, bwd_lstm_bc,
             V_f_0, V_f_1, V_f_2, V_f_3,
             V_b_0, V_b_1, V_b_2, V_b_3,
             by_0, by_1, by_2, by_3] = params

            # placeholders for the LSTM output parameters, which BiLSTM does not use
            void_M = theano.shared(value=np.zeros(1))
            fwd_param = [fwd_lstm_Wi, fwd_lstm_Ui, fwd_lstm_bi,
                         fwd_lstm_Wf, fwd_lstm_Uf, fwd_lstm_bf,
                         fwd_lstm_Wo, fwd_lstm_Uo, fwd_lstm_bo,
                         fwd_lstm_Wc, fwd_lstm_Uc, fwd_lstm_bc,
                         void_M, void_M, void_M, void_M,
                         void_M, void_M, void_M, void_M]
            bwd_param = [bwd_lstm_Wi, bwd_lstm_Ui, bwd_lstm_bi,
                         bwd_lstm_Wf, bwd_lstm_Uf, bwd_lstm_bf,
                         bwd_lstm_Wo, bwd_lstm_Uo, bwd_lstm_bo,
                         bwd_lstm_Wc, bwd_lstm_Uc, bwd_lstm_bc,
                         void_M, void_M, void_M, void_M,
                         void_M, void_M, void_M, void_M]

            self.fwd_lstm = LSTM(input=self.in_fwd, input_dim=input_dim, minibatch=minibatch,
                                 hidden_dim=hidden_dim, output_dim=output_dim, params=fwd_param)
            self.bwd_lstm = LSTM(input=self.in_bwd, input_dim=input_dim, minibatch=minibatch,
                                 hidden_dim=hidden_dim, output_dim=output_dim, params=bwd_param)

            self.V_f_0 = V_f_0
            self.V_f_1 = V_f_1
            self.V_f_2 = V_f_2
            self.V_f_3 = V_f_3
            self.V_b_0 = V_b_0
            self.V_b_1 = V_b_1
            self.V_b_2 = V_b_2
            self.V_b_3 = V_b_3
            self.by_0 = by_0
            self.by_1 = by_1
            self.by_2 = by_2
            self.by_3 = by_3

        # parameter list
        self.params = [self.fwd_lstm.W_i, self.fwd_lstm.U_i, self.fwd_lstm.b_i,
                       self.fwd_lstm.W_f, self.fwd_lstm.U_f, self.fwd_lstm.b_f,
                       self.fwd_lstm.W_o, self.fwd_lstm.U_o, self.fwd_lstm.b_o,
                       self.fwd_lstm.W_c, self.fwd_lstm.U_c, self.fwd_lstm.b_c,
                       self.bwd_lstm.W_i, self.bwd_lstm.U_i, self.bwd_lstm.b_i,
                       self.bwd_lstm.W_f, self.bwd_lstm.U_f, self.bwd_lstm.b_f,
                       self.bwd_lstm.W_o, self.bwd_lstm.U_o, self.bwd_lstm.b_o,
                       self.bwd_lstm.W_c, self.bwd_lstm.U_c, self.bwd_lstm.b_c,
                       self.V_f_0, self.V_f_1, self.V_f_2, self.V_f_3,
                       self.V_b_0, self.V_b_1, self.V_b_2, self.V_b_3,
                       self.by_0, self.by_1, self.by_2, self.by_3]

        # reverse the backward pass so its hidden states align with the forward pass
        self.bwd_lstm.h_t = self.bwd_lstm.h_t[::-1]

        # weighted sum of forward & backward hidden states, one projection per output slot
        self.y_t_0 = T.dot(self.fwd_lstm.h_t, self.V_f_0) + T.dot(self.bwd_lstm.h_t, self.V_b_0) + self.by_0
        self.y_t_1 = T.dot(self.fwd_lstm.h_t, self.V_f_1) + T.dot(self.bwd_lstm.h_t, self.V_b_1) + self.by_1
        self.y_t_2 = T.dot(self.fwd_lstm.h_t, self.V_f_2) + T.dot(self.bwd_lstm.h_t, self.V_b_2) + self.by_2
        self.y_t_3 = T.dot(self.fwd_lstm.h_t, self.V_f_3) + T.dot(self.bwd_lstm.h_t, self.V_b_3) + self.by_3

        self.y_temp = T.stack([self.y_t_0, self.y_t_1, self.y_t_2, self.y_t_3], axis=2)

        # flatten the leading dimensions so softmax can be applied row-wise
        self.y_t = T.reshape(self.y_temp,
                             [self.y_temp.shape[0] * self.y_temp.shape[1], self.y_temp.shape[2]])

        # softmax
        self.y_t = T.nnet.softmax(self.y_t)

        # class label with maximum probability
        self.y_label = T.argmax(self.y_t, axis=1)

        # restore the original layout of the outputs and labels
        self.y_t = T.reshape(self.y_t,
                             [self.y_temp.shape[0], self.y_temp.shape[1], self.y_temp.shape[2]])
        self.y_label = T.reshape(self.y_label,
                                 [self.y_temp.shape[0], self.y_temp.shape[1]])
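    # Usage sketch (not in the original source): the bidirectional LSTM above produces one
    # 4-way softmax per (timestep, output slot). The class name `BiLSTM` is an assumption,
    # as are the dimensions; minibatch=1 matches the vector-state branch of LSTM. Kept as a
    # comment so nothing executes on import.
    #
    #   import numpy as np
    #   import theano
    #   import theano.tensor as T
    #
    #   x = T.matrix('x')                                   # (n_steps, input_dim)
    #   net = BiLSTM(input=x, input_dim=50, minibatch=1, hidden_dim=32, output_dim=10)
    #   predict = theano.function(inputs=[x], outputs=net.y_label)
    #   labels = predict(np.random.randn(20, 50).astype(theano.config.floatX))
    #   # labels has shape (n_steps, output_dim), each entry a class index in {0..3}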
    def __init__(self, input, input_dim, minibatch, hidden_dim, output_dim,
                 init='uniform', inner_init='orthonormal',
                 gate_act=T.nnet.sigmoid, tanh_act=T.tanh, params=None):
        self.input = input
        self.gate_act = gate_act
        self.activation = tanh_act

        # create tuning parameters or use existing ones
        if params is None:
            # input gate
            self.W_i = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                     name='W_i', borrow=True)
            self.U_i = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                     name='U_i', borrow=True)
            self.b_i = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                     name='b_i', borrow=True)
            # forget gate (bias initialized to one so the gate starts open)
            self.W_f = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                     name='W_f', borrow=True)
            self.U_f = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                     name='U_f', borrow=True)
            self.b_f = theano.shared(value=get(identifier='one', shape=(hidden_dim, )),
                                     name='b_f', borrow=True)
            # output gate
            self.W_o = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                     name='W_o', borrow=True)
            self.U_o = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                     name='U_o', borrow=True)
            self.b_o = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                     name='b_o', borrow=True)
            # memory cell
            self.W_c = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                     name='W_c', borrow=True)
            self.U_c = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                     name='U_c', borrow=True)
            self.b_c = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                     name='b_c', borrow=True)
            # weights to the output neurons, one projection per output slot
            self.V_0 = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                     name='V_0', borrow=True)
            self.b_y_0 = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                       name='b_y_0', borrow=True)
            self.V_1 = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                     name='V_1', borrow=True)
            self.b_y_1 = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                       name='b_y_1', borrow=True)
            self.V_2 = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                     name='V_2', borrow=True)
            self.b_y_2 = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                       name='b_y_2', borrow=True)
            self.V_3 = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                     name='V_3', borrow=True)
            self.b_y_3 = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                       name='b_y_3', borrow=True)
        else:
            [self.W_i, self.U_i, self.b_i,
             self.W_f, self.U_f, self.b_f,
             self.W_o, self.U_o, self.b_o,
             self.W_c, self.U_c, self.b_c,
             self.V_0, self.b_y_0, self.V_1, self.b_y_1,
             self.V_2, self.b_y_2, self.V_3, self.b_y_3] = params

        # parameter list
        self.params = [self.W_i, self.U_i, self.b_i,
                       self.W_f, self.U_f, self.b_f,
                       self.W_o, self.U_o, self.b_o,
                       self.W_c, self.U_c, self.b_c,
                       self.V_0, self.b_y_0, self.V_1, self.b_y_1,
                       self.V_2, self.b_y_2, self.V_3, self.b_y_3]

        # initialize internal (cell) and hidden state
        if minibatch == 1:
            self.c0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                    name='c0', borrow=True)
            self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                    name='h0', borrow=True)
        else:
            self.c0 = theano.shared(value=get(identifier='zero', shape=(minibatch, hidden_dim)),
                                    name='c0', borrow=True)
            self.h0 = theano.shared(value=get(identifier='zero', shape=(minibatch, hidden_dim)),
                                    name='h0', borrow=True)

        def recurrence(x_t, c_tm_prev, h_tm_prev):
            # input, forget, and output gates
            i_t = gate_act(T.dot(x_t, self.W_i) + T.dot(h_tm_prev, self.U_i) + self.b_i)
            f_t = gate_act(T.dot(x_t, self.W_f) + T.dot(h_tm_prev, self.U_f) + self.b_f)
            o_t = gate_act(T.dot(x_t, self.W_o) + T.dot(h_tm_prev, self.U_o) + self.b_o)

            # internal memory
            x_c = T.dot(x_t, self.W_c) + self.b_c
            c_t = f_t * c_tm_prev + i_t * tanh_act(x_c + T.dot(h_tm_prev, self.U_c))

            # hidden state
            h_t = o_t * tanh_act(c_t)

            # output: one linear projection per output slot
            y_t_0 = T.dot(h_t, self.V_0) + self.b_y_0
            y_t_1 = T.dot(h_t, self.V_1) + self.b_y_1
            y_t_2 = T.dot(h_t, self.V_2) + self.b_y_2
            y_t_3 = T.dot(h_t, self.V_3) + self.b_y_3
            y_t = T.stack([y_t_0, y_t_1, y_t_2, y_t_3], axis=1)

            # softmax
            y_t = T.nnet.softmax(y_t)

            # class label with maximum probability
            y_label = T.argmax(y_t, axis=1)

            return c_t, h_t, y_t, y_label

        [self.c_t, self.h_t, self.y_t, self.y_label], _ = theano.scan(
            recurrence,
            sequences=self.input,
            outputs_info=[self.c0, self.h0, None, None])
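    # Usage sketch (not in the original source): the LSTM above emits, per timestep, a 4-way
    # softmax for each of `output_dim` output slots. Dimensions and data are illustrative;
    # minibatch=1 selects the vector-state initialization. Kept as a comment so nothing
    # executes on import.
    #
    #   import numpy as np
    #   import theano
    #   import theano.tensor as T
    #
    #   x = T.matrix('x')                                   # (n_steps, input_dim)
    #   lstm = LSTM(input=x, input_dim=50, minibatch=1, hidden_dim=32, output_dim=10)
    #   predict = theano.function(inputs=[x], outputs=lstm.y_label)
    #   labels = predict(np.random.randn(20, 50).astype(theano.config.floatX))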
    def __init__(self, input, input_dim, hidden_dim, output_dim,
                 init='uniform', inner_init='orthonormal',
                 inner_activation=T.nnet.hard_sigmoid, activation=T.tanh, params=None):
        self.input = input
        self.hidden_dim = hidden_dim
        self.activation = activation
        self.inner_activation = inner_activation

        if params is None:
            # update gate
            self.W_z = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                     name='W_z', borrow=True)
            self.U_z = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                     name='U_z', borrow=True)
            self.b_z = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                     name='b_z', borrow=True)
            # reset gate
            self.W_r = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                     name='W_r', borrow=True)
            self.U_r = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                     name='U_r', borrow=True)
            self.b_r = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                     name='b_r', borrow=True)
            # weights pertaining to input, hidden & output neurons (externally)
            self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                   name='W', borrow=True)
            self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                   name='U', borrow=True)
            self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                   name='V', borrow=True)
            self.b_h = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                     name='b_h', borrow=True)
            self.b_y = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                     name='b_y', borrow=True)
        else:
            self.W_z, self.U_z, self.b_z, self.W_r, self.U_r, self.b_r, \
                self.W, self.U, self.V, self.b_h, self.b_y = params

        # initial hidden state
        self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                name='h0', borrow=True)

        self.params = [self.W_z, self.U_z, self.b_z,
                       self.W_r, self.U_r, self.b_r,
                       self.W, self.U, self.V, self.b_h, self.b_y]

        def recurrence(x_t, h_tm_prev):
            x_z = T.dot(x_t, self.W_z) + self.b_z
            x_r = T.dot(x_t, self.W_r) + self.b_r
            x_h = T.dot(x_t, self.W) + self.b_h

            # update and reset gates
            z_t = inner_activation(x_z + T.dot(h_tm_prev, self.U_z))
            r_t = inner_activation(x_r + T.dot(h_tm_prev, self.U_r))

            # candidate hidden state, interpolated with the previous state
            hh_t = activation(x_h + T.dot(r_t * h_tm_prev, self.U))
            h_t = (T.ones_like(z_t) - z_t) * hh_t + z_t * h_tm_prev

            # per-timestep class probabilities
            y_t = T.nnet.softmax(T.dot(h_t, self.V) + self.b_y)
            return h_t, y_t[0]

        [self.h_t, self.y_t], _ = theano.scan(recurrence,
                                              sequences=self.input,
                                              outputs_info=[self.h0, None])
        self.y = T.argmax(self.y_t, axis=1)
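    # Usage sketch (not in the original source): per-timestep classification with the GRU
    # above. Dimensions and input data are illustrative only. Kept as a comment so nothing
    # executes on import.
    #
    #   import numpy as np
    #   import theano
    #   import theano.tensor as T
    #
    #   x = T.matrix('x')                                   # (n_steps, input_dim)
    #   gru = GRU(input=x, input_dim=50, hidden_dim=32, output_dim=4)
    #   classify = theano.function(inputs=[x], outputs=gru.y)
    #   labels = classify(np.random.randn(10, 50).astype(theano.config.floatX))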
    def __init__(self, input, input_dim, minibatch, hidden_dim, output_dim, params=None):
        self.in_fwd = input
        self.in_bwd = input[::-1]
        self.hidden_dim = hidden_dim

        # create tuning parameters or use existing ones
        if params is None:
            self.fwd_rnn = RNN(input=self.in_fwd, input_dim=input_dim, minibatch=minibatch,
                               hidden_dim=hidden_dim, output_dim=output_dim)
            self.bwd_rnn = RNN(input=self.in_bwd, input_dim=input_dim, minibatch=minibatch,
                               hidden_dim=hidden_dim, output_dim=output_dim)
            self.V_fwd = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_f', borrow=True)
            self.V_bwd = theano.shared(value=get(identifier='uniform', shape=(hidden_dim, output_dim)),
                                       name='V_b', borrow=True)
            self.by = theano.shared(value=get('zero', shape=(output_dim, )),
                                    name='by', borrow=True)
        else:
            [fwd_rnn_W, fwd_rnn_U, fwd_rnn_bh,
             bwd_rnn_W, bwd_rnn_U, bwd_rnn_bh,
             V_fwd, V_bwd, by] = params

            # placeholders for the RNN output parameters, which the bidirectional wrapper does not use
            void_M = theano.shared(value=np.zeros(1))
            fwd_param = [fwd_rnn_W, fwd_rnn_U, fwd_rnn_bh, void_M, void_M]
            bwd_param = [bwd_rnn_W, bwd_rnn_U, bwd_rnn_bh, void_M, void_M]

            self.fwd_rnn = RNN(input=self.in_fwd, input_dim=input_dim, minibatch=minibatch,
                               hidden_dim=hidden_dim, output_dim=output_dim, params=fwd_param)
            self.bwd_rnn = RNN(input=self.in_bwd, input_dim=input_dim, minibatch=minibatch,
                               hidden_dim=hidden_dim, output_dim=output_dim, params=bwd_param)
            self.V_fwd = V_fwd
            self.V_bwd = V_bwd
            self.by = by

        # parameter list
        self.params = [self.fwd_rnn.W, self.fwd_rnn.U, self.fwd_rnn.bh,
                       self.bwd_rnn.W, self.bwd_rnn.U, self.bwd_rnn.bh,
                       self.V_fwd, self.V_bwd, self.by]

        # reverse the backward pass so its hidden states align with the forward pass
        self.bwd_rnn.h_t = self.bwd_rnn.h_t[::-1]

        # weighted sum of forward & backward hidden states
        self.y_t = T.nnet.sigmoid(T.dot(self.fwd_rnn.h_t, self.V_fwd) +
                                  T.dot(self.bwd_rnn.h_t, self.V_bwd) + self.by)
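    # Usage sketch (not in the original source): the bidirectional RNN above exposes
    # per-timestep sigmoid activations in y_t rather than class labels. The class name
    # `BiRNN` is an assumption, as are the dimensions. Kept as a comment so nothing
    # executes on import.
    #
    #   import numpy as np
    #   import theano
    #   import theano.tensor as T
    #
    #   x = T.matrix('x')                                   # (n_steps, input_dim)
    #   net = BiRNN(input=x, input_dim=50, minibatch=1, hidden_dim=32, output_dim=4)
    #   activate = theano.function(inputs=[x], outputs=net.y_t)
    #   probs = activate(np.random.randn(10, 50).astype(theano.config.floatX))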
    def __init__(self, input, input_dim, minibatch, hidden_dim, output_dim,
                 init='uniform', inner_init='orthonormal', params=None):
        self.input = input
        self.hidden_dim = hidden_dim

        # create tuning parameters or use existing ones
        if params is None:
            self.W = theano.shared(value=get(identifier=init, shape=(input_dim, hidden_dim)),
                                   name='W', borrow=True)
            self.U = theano.shared(value=get(identifier=inner_init, shape=(hidden_dim, hidden_dim)),
                                   name='U', borrow=True)
            self.bh = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                    name='bh', borrow=True)
            self.V = theano.shared(value=get(identifier=init, shape=(hidden_dim, output_dim)),
                                   name='V', borrow=True)
            self.by = theano.shared(value=get(identifier='zero', shape=(output_dim, )),
                                    name='by', borrow=True)
        else:
            self.W, self.U, self.bh, self.V, self.by = params

        # parameter list
        self.params = [self.W, self.U, self.bh, self.V, self.by]

        # initialize hidden state
        if minibatch == 1:
            self.h0 = theano.shared(value=get(identifier='zero', shape=(hidden_dim, )),
                                    name='h0', borrow=True)
        else:
            self.h0 = theano.shared(value=get(identifier='zero', shape=(minibatch, hidden_dim)),
                                    name='h0', borrow=True)

        def recurrence(x_t, h_tm_prev):
            # hidden state
            h_t = T.tanh(T.dot(x_t, self.W) + T.dot(h_tm_prev, self.U) + self.bh)
            # output
            y_t = T.nnet.sigmoid(T.dot(h_t, self.V) + self.by)
            return h_t, y_t

        # recurrent propagation over the input sequence
        [self.h_t, self.y_t], _ = theano.scan(recurrence,
                                              sequences=input,
                                              outputs_info=[self.h0, None])
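    # Usage sketch (not in the original source): running the vanilla RNN above over a single
    # sequence. Dimensions and input data are illustrative only; minibatch=1 selects the
    # vector-shaped initial state. Kept as a comment so nothing executes on import.
    #
    #   import numpy as np
    #   import theano
    #   import theano.tensor as T
    #
    #   x = T.matrix('x')                                   # (n_steps, input_dim)
    #   rnn = RNN(input=x, input_dim=50, minibatch=1, hidden_dim=32, output_dim=4)
    #   forward = theano.function(inputs=[x], outputs=[rnn.h_t, rnn.y_t])
    #   h, y = forward(np.random.randn(10, 50).astype(theano.config.floatX))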