def auc_cost(self, y, kappa=0.9, tau=2):
    # Pairwise surrogate for AUC: with one-hot labels y, nonzero_values
    # extracts the scores of the positive and the negative examples.
    f_pos = T.nonzero_values(y * self.p_y_given_x)
    f_neg = T.nonzero_values((1 - y) * self.p_y_given_x)
    # all positive/negative score differences: (n_pos, n_neg)
    diff = f_pos.T.dimshuffle(0, 'x') - f_neg.T.dimshuffle('x', 0)
    # penalize every pair whose margin falls short of kappa
    r = (-(diff - kappa)) ** tau * (diff < kappa)
    auc = T.mean(r)
    return auc
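The same algebra can be replayed in NumPy to see what the cost rewards; a minimal sketch with made-up scores, using the default kappa and tau from above:

import numpy as np

f_pos = np.array([0.8, 0.6])                   # scores of positive examples
f_neg = np.array([0.3, 0.7])                   # scores of negative examples
diff = f_pos[:, None] - f_neg[None, :]         # every positive/negative pair
kappa, tau = 0.9, 2
r = (-(diff - kappa)) ** tau * (diff < kappa)  # only short-margin pairs contribute
print(r.mean())                                # shrinks as positives outrank negatives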
def _step2(diag_, state_, hs_, Cs_):
    hs, Cs = [], []
    # feed the decoder's own argmax prediction back in as the next input token
    token_idxs = tensor.cast(state_.argmax(axis=-1), "int32")
    msk_ = tensor.fill(tensor.zeros_like(token_idxs, dtype="float32"), 1)
    msk_ = msk_.dimshuffle('x', 0)
    state_below0 = self.de_lookuptable[token_idxs].reshape(
        (1, encoderInputs.shape[1], self.de_hidden_size))
    for i, lstm in enumerate(self.decoder_lstm_layers):
        h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind msk
        hs.append(h[-1])
        Cs.append(C[-1])
        state_below0 = h
    hs = tensor.as_tensor_variable(hs)
    Cs = tensor.as_tensor_variable(Cs)
    state_below0 = state_below0.reshape(
        (encoderInputs.shape[1], self.de_hidden_size))

    # attend only over the positions where diag_ is non-zero
    attn_index = tensor.nonzero(diag_, True)
    attn_value = tensor.nonzero_values(diag_)
    en_context = Encoder_shuffle[:, attn_index[0], :]
    attn_context = Encoder_shuffle_re[:, attn_index[0], :]
    attn_weight = tensor.batched_dot(attn_context, state_below0)
    attn_weight = tensor.nnet.softmax(attn_weight)
    #attn_weight *= (encoderMask.dimshuffle(1, 0))
    attn_weight *= attn_value.dimshuffle('x', 0)
    ##attn_weight = attn_weight / (tensor.sum(attn_weight, axis=1).dimshuffle(0, 'x'))
    # ctx_ : (b, h)
    ctx_ = tensor.sum(en_context * attn_weight[:, :, None], axis=1)
    state_below0 = tensor.concatenate([ctx_, state_below0], axis=1)

    newpred = tensor.dot(state_below0, self.linear) + self.linear_bias[None, :]
    state_below = tensor.nnet.softmax(newpred)
    # the probability of the beginning symbol is forced to 0
    extra_p = tensor.zeros_like(hs[:, :, 0])
    state_below = tensor.concatenate([state_below, extra_p.T], axis=1)
    return state_below, hs, Cs
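The attention restriction above hinges on `tensor.nonzero` and `tensor.nonzero_values` returning matching positions and weights. A self-contained sketch of that pairing, with a hypothetical mask vector:

import numpy as np
import theano
import theano.tensor as tensor

diag_ = tensor.fvector('diag')
idx = tensor.nonzero(diag_, True)    # index matrix of the non-zero positions
val = tensor.nonzero_values(diag_)   # the surviving mask weights, same order
f = theano.function([diag_], [idx, val])
print(f(np.array([0., 0.5, 0., 1.], dtype='float32')))
# roughly: [array([[1, 3]]), array([0.5, 1. ], dtype=float32)]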
if __name__ == '__main__':
    net = createNetwork()
    ## loading network parameters
    # params = pickle.load(open(loading_path, "rb"))
    # lasagne.layers.set_all_param_values(net['fc5'], params)
    # print "loading params successfully"
    ####
    input_X = T.tensor4('input_X')
    target_Y = T.vector("target_Y")
    action_input = T.matrix("action")

    pred_Y = lasagne.layers.get_output(net['fc5'], inputs=input_X)
    Action_Y_index = T.argmax(pred_Y, axis=1)
    # the one-hot action mask keeps only the Q-value of the action taken
    error_term = target_Y - T.nonzero_values(action_input * pred_Y)
    cost = T.mean(T.sqr(error_term))
    #scaled_error_term = lasagne.updates.norm_constraint(error_term, max_norm=1, norm_axes=0)
    #cost = T.mean(T.sqr(scaled_error_term))

    params = lasagne.layers.get_all_params(net['fc5'], trainable=True)
    updates = lasagne.updates.adam(cost, params,
                                   learning_rate=LEARNING_RATE,
                                   beta1=GRADIENT_MOMENTUM,
                                   beta2=SQUARED_GRADIENT_MOMENTUM,
                                   epsilon=MIN_SQUARED_GRADIENT)

    average_Q = T.mean(T.nonzero_values(action_input * pred_Y))
    Q_value = T.max(pred_Y, axis=1)

    train_fn = theano.function(inputs=[input_X, action_input, target_Y],
                               updates=updates,
                               outputs=[average_Q, cost])
    action_index_fn = theano.function(inputs=[input_X],
                                      outputs=[Action_Y_index])
    Q_value_fn = theano.function(inputs=[input_X],
                                 outputs=[Q_value])
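The masking pattern `T.nonzero_values(action_input * pred_Y)` selects one Q-value per row. A minimal sketch with hypothetical numbers, including a caveat worth knowing about this pattern:

import numpy as np
import theano
import theano.tensor as T

pred = T.matrix()   # Q-values, one row per state
act = T.matrix()    # one-hot rows: which action was taken
f = theano.function([pred, act], T.nonzero_values(act * pred))
q = np.array([[1., 2.], [3., 4.]], dtype=theano.config.floatX)
a = np.array([[0., 1.], [1., 0.]], dtype=theano.config.floatX)
print(f(q, a))  # -> [2. 3.]
# caveat: a selected Q-value that is exactly 0 would be dropped from the
# result, silently misaligning it with target_Y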
def __init__(self, nh, nc, nf, mb):
    '''
    nh :: dimension of the hidden layer
    nc :: number of classes
    nf :: input feature size
    mb :: mini-batch size
    '''
    # parameters of the model
    # first level, block input (z): input-to-hidden, recurrent, bias
    self.wx_z = generate_weight(nf, nh, 'wx_z')
    self.wh_z = generate_weight(nh, nh, 'wh_z')
    self.bh_z = generate_weight(1, nh, 'bh_z')
    # first level, input gate (i): input-to-hidden, recurrent, bias
    self.wx_i = theano.shared(name='wx_i',
                              value=0.2 * np.random.uniform(-1.0, 1.0, (nf, nh))
                              .astype(theano.config.floatX))
    self.wh_i = theano.shared(name='wh_i',
                              value=0.2 * np.random.uniform(-1.0, 1.0, (nh, nh))
                              .astype(theano.config.floatX))
    self.bh_i = theano.shared(name='bh_i',
                              value=np.zeros((1, nh), dtype=theano.config.floatX))
    # first level, forget gate (f): input-to-hidden, recurrent, bias
    self.wx_f = theano.shared(name='wx_f',
                              value=0.2 * np.random.uniform(-1.0, 1.0, (nf, nh))
                              .astype(theano.config.floatX))
    self.wh_f = theano.shared(name='wh_f',
                              value=0.2 * np.random.uniform(-1.0, 1.0, (nh, nh))
                              .astype(theano.config.floatX))
    self.bh_f = theano.shared(name='bh_f',
                              value=np.zeros((1, nh), dtype=theano.config.floatX))
    # first level, output gate (o): input-to-hidden, recurrent, bias
    self.wx_o = theano.shared(name='wx_o',
                              value=0.2 * np.random.uniform(-1.0, 1.0, (nf, nh))
                              .astype(theano.config.floatX))
    self.wh_o = theano.shared(name='wh_o',
                              value=0.2 * np.random.uniform(-1.0, 1.0, (nh, nh))
                              .astype(theano.config.floatX))
    self.bh_o = theano.shared(name='bh_o',
                              value=np.zeros((1, nh), dtype=theano.config.floatX))
    ## the peephole weights
    self.ph_o = theano.shared(name='ph_o',
                              value=np.zeros((1, nh), dtype=theano.config.floatX))
    self.ph_i = theano.shared(name='ph_i',
                              value=np.zeros((1, nh), dtype=theano.config.floatX))
    self.ph_f = theano.shared(name='ph_f',
                              value=np.zeros((1, nh), dtype=theano.config.floatX))
    # initial hidden state
    self.h0 = theano.shared(name='h0',
                            value=np.zeros((mb, nh), dtype=theano.config.floatX))
    # initial cell state
    self.c0 = theano.shared(name='c0',
                            value=np.zeros((mb, nh), dtype=theano.config.floatX))
    ## LAST level: hidden to output
    self.w = theano.shared(name='w',
                           value=0.2 * np.random.uniform(-1.0, 1.0, (nh, nc))
                           .astype(theano.config.floatX))
    self.b = theano.shared(name='b',
                           value=np.zeros((1, nc), dtype=theano.config.floatX))
    # all-ones column used to broadcast the (1, nh) biases over the batch
    self.I_mb = theano.shared(name='I',
                              value=np.ones((mb, 1), dtype=theano.config.floatX))

    # bundle
    self.params = [self.wx_z, self.wx_f, self.wx_i, self.wx_o,
                   self.wh_z, self.wh_f, self.wh_i, self.wh_o,
                   self.bh_z, self.bh_f, self.bh_i, self.bh_o,
                   self.ph_i, self.ph_o, self.ph_f,
                   self.w, self.b]

    lr = T.scalar('lr')
    idxs = T.tensor3()  # input; since batched, the dimension rises to 3
    x = idxs.astype(theano.config.floatX)
    yinput = T.tensor3()  # labels
    y_sentence = yinput.astype(theano.config.floatX)
    # no-batch version:
    #idxs = T.imatrix()
    #y_sentence = T.ivector()

    def recurrence(x_t, h_tm1, c_tm1):
        z_t = T.tanh(T.dot(x_t, self.wx_z) + T.dot(h_tm1, self.wh_z)
                     + T.dot(self.I_mb, self.bh_z))
        i_t = T.nnet.sigmoid(T.dot(x_t, self.wx_i) + T.dot(h_tm1, self.wh_i)
                             + T.dot(self.I_mb, self.ph_i) * c_tm1
                             + T.dot(self.I_mb, self.bh_i))
        f_t = T.nnet.sigmoid(T.dot(x_t, self.wx_f) + T.dot(h_tm1, self.wh_f)
                             + T.dot(self.I_mb, self.ph_f) * c_tm1
                             + T.dot(self.I_mb, self.bh_f))
        c_t = z_t * i_t + c_tm1 * f_t
        o_t = T.nnet.sigmoid(T.dot(x_t, self.wx_o) + T.dot(h_tm1, self.wh_o)
                             + T.dot(self.I_mb, self.ph_o) * c_t
                             + T.dot(self.I_mb, self.bh_o))
        h_t = T.tanh(c_t) * o_t
        s_t = T.nnet.softmax(T.dot(h_t, self.w) + T.dot(self.I_mb, self.b))
        '''no batch, raw math equations:
        z_t = T.tanh(T.dot(x_t, self.wx_z) + T.dot(h_tm1, self.wh_z) + self.bh_z)
        i_t = T.nnet.sigmoid(T.dot(x_t, self.wx_i) + T.dot(h_tm1, self.wh_i)
                             + self.bh_i + self.ph_i * c_tm1)
        f_t = T.nnet.sigmoid(T.dot(x_t, self.wx_f) + T.dot(h_tm1, self.wh_f)
                             + self.bh_f + self.ph_f * c_tm1)
        c_t = z_t * i_t + c_tm1 * f_t
        o_t = T.nnet.sigmoid(T.dot(x_t, self.wx_o) + T.dot(h_tm1, self.wh_o)
                             + self.bh_o + self.ph_o * c_t)
        h_t = T.tanh(c_t) * o_t
        s_t = T.nnet.softmax(T.dot(h_t, self.w) + self.b)
        '''
        return [h_t, c_t, s_t]

    [h, c, s], _ = theano.scan(fn=recurrence,
                               sequences=x,
                               outputs_info=[self.h0, self.c0, None],
                               n_steps=x.shape[0])

    p_y_given_x_sentence = s[:, :, :]  # size: len x mb x nc
    y_pred = T.argmax(p_y_given_x_sentence, axis=2)
    # no batch:
    #p_y_given_x_sentence = s[:, 0, :]
    #y_pred = T.argmax(p_y_given_x_sentence, axis=1)

    # cost, gradients and learning rate
    sentence_nll = -T.mean(
        T.log(T.nonzero_values(p_y_given_x_sentence * y_sentence))) * mb
    # no batch:
    #sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
    #                       [T.arange(x.shape[0]), y_sentence])
    sentence_gradients = T.grad(sentence_nll, self.params)
    sentence_updates = OrderedDict((p, p - lr * g)
                                   for p, g in zip(self.params, sentence_gradients))

    # theano functions to compile
    self.classify = theano.function(inputs=[idxs], outputs=y_pred)  #, mode=profmode)
    self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                          outputs=sentence_nll,
                                          updates=sentence_updates)
    # by default it is sgd
    self.optm = optimizers.sgd
    self.f_grad_shared, self.f_update = self.optm(
        lr,
        dict(zip([p.name for p in self.params], self.params)),
        sentence_gradients, x, y_sentence, sentence_nll)
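The masked cost above works because, with one-hot targets, `p_y_given_x_sentence * y_sentence` zeroes out everything except the probability of the true class at each step, and `nonzero_values` keeps exactly those survivors. A minimal standalone sketch with hypothetical numbers, plus a caveat:

import numpy as np
import theano
import theano.tensor as T

p = T.matrix()   # predicted probabilities, one row per example
y = T.matrix()   # one-hot targets
nll = -T.mean(T.log(T.nonzero_values(p * y)))
f = theano.function([p, y], nll)
probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]], dtype=theano.config.floatX)
onehot = np.array([[1., 0., 0.], [0., 1., 0.]], dtype=theano.config.floatX)
print(f(probs, onehot))  # = -(log 0.7 + log 0.8) / 2
# caveat: a true-class probability of exactly 0 is dropped by nonzero_values
# rather than contributing -log(0), silently shrinking the mean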
def __init__(self, nh, nc, nf, mb):
    '''
    nh :: dimension of the hidden layer
    nc :: number of classes
    nf :: number of features
    mb :: mini-batch size
    '''
    # parameters of the model
    self.wx = theano.shared(name='wx',
                            value=0.2 * numpy.random.uniform(-1.0, 1.0, (nf, nh))
                            .astype(theano.config.floatX))
    self.wh = theano.shared(name='wh',
                            value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nh))
                            .astype(theano.config.floatX))
    self.w = theano.shared(name='w',
                           value=0.2 * numpy.random.uniform(-1.0, 1.0, (nh, nc))
                           .astype(theano.config.floatX))
    self.bh = theano.shared(name='bh',
                            value=numpy.zeros((nh, 1), dtype=theano.config.floatX))
    self.b = theano.shared(name='b',
                           value=numpy.zeros((nc, 1), dtype=theano.config.floatX))
    self.h0 = theano.shared(name='h0',
                            value=numpy.zeros((mb, nh), dtype=theano.config.floatX))
    # all-ones column used to broadcast the biases over the batch
    self.I_mb = theano.shared(name='I',
                              value=numpy.ones((mb, 1), dtype=theano.config.floatX))

    # bundle
    self.params = [self.wx, self.wh, self.w, self.bh, self.b]

    lr = T.scalar('lr')
    idxs = T.tensor3()  # input; since batched, the dimension rises to 3
    x = idxs.astype(theano.config.floatX)
    yinput = T.tensor3()  # labels
    y_sentence = yinput.astype(theano.config.floatX)
    # no batch:
    #idxs = T.imatrix()
    #y_sentence = T.ivector('y_sentence')  # labels

    def recurrence(x_t, h_tm1):
        h_t = T.nnet.sigmoid(T.dot(x_t, self.wx) + T.dot(h_tm1, self.wh)
                             + T.dot(self.I_mb, self.bh.T))
        s_t = T.nnet.softmax(T.dot(h_t, self.w) + T.dot(self.I_mb, self.b.T))
        # trying for the sparse version? TODO: supposed to be much faster,
        # since both the cost and the gradient are sparse:
        ## Sparse.structured_dot(Sparse.csc_from_dense(x_t), self.wx)
        return [h_t, s_t]  # s_t is (mb, nc) per step; scan stacks to len x mb x nc

    [h, s], _ = theano.scan(fn=recurrence,
                            sequences=x,
                            outputs_info=[self.h0, None],
                            n_steps=x.shape[0])

    p_y_given_x_sentence = s[:, :, :]  # size: len x mb x nc
    y_pred = T.argmax(p_y_given_x_sentence, axis=2)
    # no batch:
    #p_y_given_x_sentence = s[:, 0, :]
    #y_pred = T.argmax(p_y_given_x_sentence, axis=1)

    # cost, gradients and learning rate
    # y is now a matrix (nlabel, batch) instead of a pure vector
    # TODO: figure out the proper way to compute the cost; a plain mean
    # over the dense tensor does not make sense here ....
    #sentence_nll = -T.mean(T.log(p_y_given_x_sentence) * y_sentence) * mb * 5
    sentence_nll = -T.mean(
        T.log(T.nonzero_values(p_y_given_x_sentence * y_sentence))) * mb
    # sparse version?
    # T.mean(T.log(T.nonzero_values(p_y_given_x_sentence * y_sentence)))
    # non-batch version:
    #sentence_nll = -T.mean(T.log(p_y_given_x_sentence)
    #                       [T.arange(x.shape[0]), y_sentence])
    sentence_gradients = T.grad(sentence_nll, self.params)
    sentence_updates = OrderedDict((p, p - lr * g)
                                   for p, g in zip(self.params, sentence_gradients))

    # theano functions to compile
    self.classify = theano.function(inputs=[idxs], outputs=y_pred)  # not used yet
    self.sentence_train = theano.function(inputs=[idxs, y_sentence, lr],
                                          outputs=sentence_nll,
                                          updates=sentence_updates)
    # by default it is rmsprop
    self.optm = optimizers.rmsprop
    self.f_grad_shared, self.f_update = self.optm(
        lr,
        dict(zip([p.name for p in self.params], self.params)),
        sentence_gradients, x, y_sentence, sentence_nll)
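On the TODO above: with one-hot `y_sentence`, the dense variant in the commented-out line and the `nonzero_values` variant average the same log-probabilities over different denominators, so they agree up to a factor of `nc`. A NumPy sketch with hypothetical numbers:

import numpy as np

logp_true = np.log(np.array([0.7, 0.8]))  # true-class log-probs, 2 examples
nc = 3                                    # number of classes
dense_mean = logp_true.sum() / (2 * nc)   # mean over the full (2, nc) tensor
nonzero_mean = logp_true.mean()           # mean over the non-zero entries only
print(dense_mean * nc, nonzero_mean)      # identical: the two costs differ by nc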
"""
b = T.ivector()
c = a[T.arange(b.shape[0]), b]
test = theano.function([a, b], c)
a0 = np.random.uniform(-0.01, 0.01, (3, 4, 5)).astype('float32')
b0 = np.random.randint(4, size=(3)).astype('int32')
c0 = test(a0, b0)
"""
a = T.fvector()
#a = T.fmatrix()
#b = T.ftensor3()
c = T.nonzero(a, True)
d = T.nonzero_values(a)
test = theano.function([a], [c, d])
#a0 = np.random.uniform(-0.01, 0.01, (2, 3)).astype('float32')
#b0 = np.random.uniform(-0.01, 0.01, (2, 3, 3)).astype('float32')
#b0 = np.random.randint(0, 4, size=(4)).astype('int32')
#c0 = np.random.randint(0, 4, size=(4)).astype('int32')
a0 = np.array([1, 2, 0, 0]).astype('float32')
c0, d0 = test(a0)
#print a0
#print b0
print c0, d0
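For `a0 = [1, 2, 0, 0]` the live part of the script should print, up to formatting, the index matrix from `T.nonzero(a, True)` followed by the surviving values; the expected result sketched from the documented semantics:

# expected output (approximately):
#   [[0 1]] [ 1.  2.]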
def __init__(self, nh_enc, nh_dec, nh_att, nx, ny, mb, lt, bidir,
             nonlstm_encode=False, restriction=None):
    '''
    nh_enc :: dimension of the hidden layer of the encoder
    nh_dec :: dimension of the hidden layer of the decoder
    nh_att :: dimension of the hidden layer of the attention
    ny :: number of classes
    nx :: input feature size
    mb :: mini-batch size
    lt :: length of input after padding (for attention)
    bidir :: 2 for bidirectional, 1 for unidirectional
    '''
    self.nh_enc = nh_enc
    self.nh_dec = nh_dec
    self.nh_att = nh_att
    self.nx = nx
    self.ny = ny
    self.lt = lt
    self.bidir = bidir

    # parameters of the model
    xhdim = nx + nh_enc * bidir

    # encoder, forward direction: input-to-hidden (W), recurrent (H) and
    # bias (b) weights for the z/i/f/o gates
    self.Wf_enc_z = generate_weight(nx, nh_enc, "Wf_enc_z")
    self.Wf_enc_i = generate_weight(nx, nh_enc, "Wf_enc_i")
    self.Wf_enc_f = generate_weight(nx, nh_enc, "Wf_enc_f")
    self.Wf_enc_o = generate_weight(nx, nh_enc, "Wf_enc_o")
    self.Hf_enc_z = generate_weight(nh_enc, nh_enc, "Hf_enc_z")
    self.Hf_enc_i = generate_weight(nh_enc, nh_enc, "Hf_enc_i")
    self.Hf_enc_f = generate_weight(nh_enc, nh_enc, "Hf_enc_f")
    self.Hf_enc_o = generate_weight(nh_enc, nh_enc, "Hf_enc_o")
    self.bf_enc_z = generate_weight(1, nh_enc, "bf_enc_z")
    self.bf_enc_i = generate_weight(1, nh_enc, "bf_enc_i")
    self.bf_enc_f = generate_weight(1, nh_enc, "bf_enc_f")
    self.bf_enc_o = generate_weight(1, nh_enc, "bf_enc_o")

    # encoder, backward direction:
    self.Wb_enc_z = generate_weight(nx, nh_enc, "Wb_enc_z")
    self.Wb_enc_i = generate_weight(nx, nh_enc, "Wb_enc_i")
    self.Wb_enc_f = generate_weight(nx, nh_enc, "Wb_enc_f")
    self.Wb_enc_o = generate_weight(nx, nh_enc, "Wb_enc_o")
    self.Hb_enc_z = generate_weight(nh_enc, nh_enc, "Hb_enc_z")
    self.Hb_enc_i = generate_weight(nh_enc, nh_enc, "Hb_enc_i")
    self.Hb_enc_f = generate_weight(nh_enc, nh_enc, "Hb_enc_f")
    self.Hb_enc_o = generate_weight(nh_enc, nh_enc, "Hb_enc_o")
    self.bb_enc_z = generate_weight(1, nh_enc, "bb_enc_z")
    self.bb_enc_i = generate_weight(1, nh_enc, "bb_enc_i")
    self.bb_enc_f = generate_weight(1, nh_enc, "bb_enc_f")
    self.bb_enc_o = generate_weight(1, nh_enc, "bb_enc_o")

    ## attention level:
    self.UV_att = generate_weight(xhdim, nh_att, "UV_att")
    self.W_att = generate_weight(nh_dec, nh_att, "W_att")
    self.v_att = generate_weight(nh_att, 1, "v_att")

    # decoder level: input-to-hidden, recurrent and bias weights
    self.W_dec_z = generate_weight(xhdim, nh_dec, "W_dec_z")
    self.W_dec_i = generate_weight(xhdim, nh_dec, "W_dec_i")
    self.W_dec_f = generate_weight(xhdim, nh_dec, "W_dec_f")
    self.W_dec_o = generate_weight(xhdim, nh_dec, "W_dec_o")
    self.H_dec_z = generate_weight(nh_dec, nh_dec, "H_dec_z")
    self.H_dec_i = generate_weight(nh_dec, nh_dec, "H_dec_i")
    self.H_dec_f = generate_weight(nh_dec, nh_dec, "H_dec_f")
    self.H_dec_o = generate_weight(nh_dec, nh_dec, "H_dec_o")
    self.b_dec_z = generate_weight(1, nh_dec, "b_dec_z")
    self.b_dec_i = generate_weight(1, nh_dec, "b_dec_i")
    self.b_dec_f = generate_weight(1, nh_dec, "b_dec_f")
    self.b_dec_o = generate_weight(1, nh_dec, "b_dec_o")
    # E is extra in the decoder, for the previous output
    self.E_dec_z = generate_weight(ny, nh_dec, "E_dec_z")
    self.E_dec_i = generate_weight(ny, nh_dec, "E_dec_i")
    self.E_dec_f = generate_weight(ny, nh_dec, "E_dec_f")
    self.E_dec_o = generate_weight(ny, nh_dec, "E_dec_o")

    ## LAST level: hidden to output
    self.W_y = generate_weight(xhdim, ny, "W_y")
    self.H_y = generate_weight(nh_dec, ny, "H_y")
    self.E_y = generate_weight(ny, ny, "E_y")
    self.b_y = generate_weight(1, ny, "b_y", 0.0)

    ## INTERMEDIATE values
    hf0 = theano.shared(name='hf0',
                        value=np.zeros((mb, nh_enc), dtype=config.floatX))  # forward
    cf0 = theano.shared(name='cf0',
                        value=np.zeros((mb, nh_enc), dtype=config.floatX))
    hb0 = theano.shared(name='hb0',
                        value=np.zeros((mb, nh_enc), dtype=config.floatX))  # backward
    cb0 = theano.shared(name='cb0',
                        value=np.zeros((mb, nh_enc), dtype=config.floatX))
    sd0 = theano.shared(name='sd0',
                        value=np.zeros((mb, nh_dec), dtype=config.floatX))
    cd0 = theano.shared(name='cd0',
                        value=np.zeros((mb, nh_dec), dtype=config.floatX))
    a = np.zeros((1, mb, ny), dtype=config.floatX)
    a[:, :, 0] = 1
    y0 = theano.shared(name='y0', value=a)  # one-hot eos start symbol
    # all-ones vector for the batch size; deprecated, broadcasting should
    # handle the matching automatically
    I_mb = theano.shared(name='I',
                         value=np.ones((mb, 1), dtype=config.floatX))

    WHb_f_enc = [self.Wf_enc_z, self.Hf_enc_z, self.bf_enc_z,
                 self.Wf_enc_i, self.Hf_enc_i, self.bf_enc_i,
                 self.Wf_enc_f, self.Hf_enc_f, self.bf_enc_f,
                 self.Wf_enc_o, self.Hf_enc_o, self.bf_enc_o]
    WHb_b_enc = [self.Wb_enc_z, self.Hb_enc_z, self.bb_enc_z,
                 self.Wb_enc_i, self.Hb_enc_i, self.bb_enc_i,
                 self.Wb_enc_f, self.Hb_enc_f, self.bb_enc_f,
                 self.Wb_enc_o, self.Hb_enc_o, self.bb_enc_o]
    WHEb_dec = [self.W_dec_z, self.E_dec_z, self.H_dec_z, self.b_dec_z,
                self.W_dec_i, self.E_dec_i, self.H_dec_i, self.b_dec_i,
                self.W_dec_f, self.E_dec_f, self.H_dec_f, self.b_dec_f,
                self.W_dec_o, self.E_dec_o, self.H_dec_o, self.b_dec_o]
    Wb_nonlstm_enc = [self.Wf_enc_z, self.bf_enc_z]

    # bundle; note: peepholes were removed from this definition
    self.params = [self.UV_att, self.W_att, self.v_att,
                   self.W_y, self.H_y, self.b_y, self.E_y] + WHEb_dec
    if not nonlstm_encode:
        self.params += WHb_f_enc
        if bidir == 2:
            self.params += WHb_b_enc
    else:
        ## special case: to test image captioning, just switch the encoder to be non-LSTM
        self.params += Wb_nonlstm_enc

    # used for dropout
    trng = RandomStreams(SEED)
    use_noise = theano.shared(numpy_floatX(0.))

    # input parameters defined below
    x_in = T.tensor3()  # input; since batched, the dimension rises to 3: len * mb * nx
    x = x_in.astype(config.floatX)
    y_in = T.tensor3()  # ground-truth labels, len * mb * ny
    y_target = y_in.astype(config.floatX)
    # decoder input labels, shifted right by one and starting with eos
    y_decinput = T.concatenate([y0, y_target], axis=0)[:-1, :, :].astype(config.floatX)
    lr = T.scalar('lr')

    def encode(x_t, h_tm1, c_tm1,
               W_enc_z, H_enc_z, b_enc_z,
               W_enc_i, H_enc_i, b_enc_i,
               W_enc_f, H_enc_f, b_enc_f,
               W_enc_o, H_enc_o, b_enc_o):
        g_t = T.tanh(T.dot(x_t, W_enc_z) + T.dot(h_tm1, H_enc_z)
                     + T.dot(I_mb, b_enc_z))
        i_t = T.nnet.sigmoid(T.dot(x_t, W_enc_i) + T.dot(h_tm1, H_enc_i)
                             + T.dot(I_mb, b_enc_i))  # + T.dot(I_mb, ph_i.T) * c_tm1
        f_t = T.nnet.sigmoid(T.dot(x_t, W_enc_f) + T.dot(h_tm1, H_enc_f)
                             + T.dot(I_mb, b_enc_f))  # + T.dot(I_mb, ph_f.T) * c_tm1
        c_t = g_t * i_t + c_tm1 * f_t
        o_t = T.nnet.sigmoid(T.dot(x_t, W_enc_o) + T.dot(h_tm1, H_enc_o)
                             + T.dot(I_mb, b_enc_o))  # + T.dot(I_mb, ph_o.T) * c_t
        h_t = T.tanh(c_t) * o_t
        return [h_t, c_t]

    def relu(x):
        return theano.tensor.switch(x < 0, 0, x)

    if nonlstm_encode:
        hf = relu(T.dot(x, self.Wf_enc_z) + T.dot(I_mb, self.bf_enc_z))  # len * mb * nh_enc
        # dim0 is the input length, so xh is len * mb * xhdim
        xh = T.concatenate([x, hf], axis=2)
    else:
        [hf, cf], _ = theano.scan(fn=encode,
                                  sequences=x,
                                  outputs_info=[hf0, cf0],
                                  non_sequences=WHb_f_enc,
                                  n_steps=x.shape[0])
        xh = T.concatenate([x, hf], axis=2)  # len * mb * xhdim
        if bidir == 2:
            [hb, cb], _ = theano.scan(fn=encode,
                                      sequences=x,
                                      outputs_info=[hb0, cb0],
                                      non_sequences=WHb_b_enc,
                                      go_backwards=True)
            # note: scan walks the input backwards, so its outputs come out in
            # the inverted order; [::-1] restores the forward alignment
            xh = T.concatenate([x, hf, hb[::-1]], axis=2)
    # attention preparation; computed once since it is the same at every step
    # (a .dimshuffle(1, 0) here actually would not matter)
    # with z = x + h dims: dot of (len, mb, z) with (z, a) -> (len, mb, a)
    UVxh = T.dot(xh, self.UV_att)

    if restriction is not None:
        restriction_matrix = theano.shared(name="restriction",
                                           value=restriction).astype(config.floatX)

    def stable_softmax(yin):
        e_yin = np.exp(yin - yin.max(axis=1, keepdims=True))
        return e_yin / e_yin.sum(axis=1, keepdims=True)

    def stable_softmax_nonzero(yin, zerosout):
        e_yin = np.exp(yin - yin.max(axis=1, keepdims=True))
        return e_yin / e_yin.sum(axis=1, keepdims=True) * zerosout
        #return T.nnet.softmax(yin - yin.max(axis=1, keepdims=True)) * zerosout

    def decode(y_tm1, sd_tm1, cd_tm1, xh, UVxh, I_mb):
        # the dimension mismatch is fine: (mb, a) broadcasts against (len, mb, a)
        beta_st = T.dot(sd_tm1, self.W_att) + UVxh
        beta_t = T.dot(beta_st, self.v_att)  # v_att is (a, 1) => len * mb * 1
        alpha_t = stable_softmax(beta_t.dimshuffle(1, 0, 2))
        z_t = T.batched_dot(xh.dimshuffle(1, 2, 0), alpha_t).flatten(2)
        g_t = T.tanh(T.dot(z_t, self.W_dec_z) + T.dot(sd_tm1, self.H_dec_z)
                     + T.dot(I_mb, self.b_dec_z) + T.dot(y_tm1, self.E_dec_z))
        i_t = T.nnet.sigmoid(T.dot(z_t, self.W_dec_i) + T.dot(sd_tm1, self.H_dec_i)
                             + T.dot(I_mb, self.b_dec_i) + T.dot(y_tm1, self.E_dec_i))
        f_t = T.nnet.sigmoid(T.dot(z_t, self.W_dec_f) + T.dot(sd_tm1, self.H_dec_f)
                             + T.dot(I_mb, self.b_dec_f) + T.dot(y_tm1, self.E_dec_f))
        cd_t = g_t * i_t + cd_tm1 * f_t
        o_t = T.nnet.sigmoid(T.dot(z_t, self.W_dec_o) + T.dot(sd_tm1, self.H_dec_o)
                             + T.dot(I_mb, self.b_dec_o) + T.dot(y_tm1, self.E_dec_o))
        sd_t = T.tanh(cd_t) * o_t
        #sd_t = dropout(sd_t, use_noise, trng)
        if restriction is None:
            y_t = stable_softmax(T.dot(z_t, self.W_y) + T.dot(sd_t, self.H_y)
                                 + T.dot(y_tm1, self.E_y) + T.dot(I_mb, self.b_y))
        else:
            # zero out the outputs the previous label cannot transition to
            restriction_perbatch = restriction_matrix[T.argmax(y_tm1, axis=1)]
            y_t = stable_softmax_nonzero(
                T.dot(z_t, self.W_y) + T.dot(sd_t, self.H_y)
                + T.dot(y_tm1, self.E_y) + T.dot(I_mb, self.b_y),
                restriction_perbatch)
        return [sd_t, cd_t, y_t]

    [sd_dec, cd_dec, y_dec], _ = theano.scan(
        fn=decode,
        sequences=y_decinput,  # dict(input=y_decinput, taps=[0])
        outputs_info=[dict(initial=sd0, taps=[-1]),
                      dict(initial=cd0, taps=[-1]),
                      None],  # dict(initial=y0, taps=[-1])
        non_sequences=[xh, UVxh, I_mb],
        n_steps=y_decinput.shape[0])

    p_y_given_x_sentence = y_dec[:, :, :]  # size: len x mb x ny
    y_pred = T.argmax(p_y_given_x_sentence, axis=2)

    # cost, gradients and learning rate
    sentence_cost = -T.mean(T.log(
        T.nonzero_values(p_y_given_x_sentence * y_target[:, :, :])
        + np.float32(1e-8)))
    sentence_gradients = T.grad(sentence_cost, self.params)
    sentence_updates = OrderedDict((p, p - lr * g)
                                   for p, g in zip(self.params, sentence_gradients))

    # theano functions to compile
    self.classify = theano.function(inputs=[x_in, y_target], outputs=y_pred)
    self.sentence_train = theano.function(inputs=[x_in, y_target, lr],
                                          outputs=sentence_cost,
                                          updates=sentence_updates)
    self.only_encode = theano.function(inputs=[x_in], outputs=[xh, UVxh])
    self.only_decode_step = decode
    # by default it is sgd
    self.optm = optimizers.sgd
    self.f_grad_shared, self.f_update = self.optm(
        lr,
        dict(zip([p.name for p in self.params], self.params)),
        sentence_gradients, x, y_target, sentence_cost)
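A hypothetical usage sketch of the class this `__init__` belongs to; the class name `AttentionSeq2Seq` is invented here, and `generate_weight`/`optimizers` are assumed to be importable from the surrounding module:

import numpy as np
import theano

nh_enc, nh_dec, nh_att, nx, ny, mb, lt = 16, 16, 8, 10, 5, 4, 7
model = AttentionSeq2Seq(nh_enc, nh_dec, nh_att, nx, ny, mb, lt, bidir=2)

x = np.random.randn(lt, mb, nx).astype(theano.config.floatX)  # len * mb * nx
y = np.zeros((lt, mb, ny), dtype=theano.config.floatX)        # one-hot targets
y[np.arange(lt)[:, None], np.arange(mb)[None, :],
  np.random.randint(ny, size=(lt, mb))] = 1.0

cost = model.sentence_train(x, y, 0.01)  # one SGD step, returns the masked NLL
pred = model.classify(x, y)              # len * mb matrix of predicted label ids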
def test4(x3):
    return TT.nonzero_values(x3)
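A sketch of exercising `test4` through a compiled function, assuming the snippet's `TT` alias for `theano.tensor`:

import numpy as np
import theano
import theano.tensor as TT

x3 = TT.matrix()
f = theano.function([x3], test4(x3))
m = np.array([[0., 3.], [2., 0.]], dtype=theano.config.floatX)
print(f(m))  # row-major scan of the non-zeros -> [3. 2.]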