def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    # CRF transition matrix: 25 tags plus one start/end state
    trans = np.random.uniform(-0.01, 0.01, (26, 26)).astype('float32')
    transition = theano.shared(trans)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden,
                                            mask_input=l_mask_word, backwards=True)

    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(
        l_reshape_concat, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

    local_energy = lasagne.layers.get_output(
        l_local, {l_in_word: input_var, l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * mask_var[:, :, None]

    # add the transition score into the end symbol at each sequence's last position
    end_term = transition[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    length_index = T.sum(mask_var, axis=1)  # per-sentence lengths (unused below)

    loss_train = crf_loss0(local_energy, transition, target_var, mask_var).mean()
    prediction, corr = crf_accuracy0(local_energy, transition, target_var, mask_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(transition)
    print network_params
    self.network_params = network_params

    loss_train = loss_train + params.l2 * sum(
        lasagne.regularization.l2(x) for x in network_params)

    updates = lasagne.updates.sgd(loss_train, network_params, params.eta)
    updates = lasagne.updates.apply_momentum(updates, network_params, momentum=0.9)

    self.train_fn = theano.function(
        [input_var, target_var, mask_var, mask_var1, length],
        [loss_train, corr_train, num_tokens, local_energy],
        updates=updates, on_unused_input='ignore')
    self.eval_fn = theano.function(
        [input_var, target_var, mask_var, mask_var1, length],
        [loss_train, corr_train, num_tokens, prediction],
        on_unused_input='ignore')
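# Hedged usage sketch (not part of the original file; class and variable names are
# assumptions for illustration). The trainer above expects int32 id matrices plus
# float32 masks, where mask_var1 is nonzero only at each sentence's final token
# (it gates the end-transition term):
#
#   tagger = CRFTagger(We_initial, params)              # hypothetical class name
#   loss, n_correct, n_tokens, energies = tagger.train_fn(
#       word_ids,   # (batch, maxlen) int32
#       tag_ids,    # (batch, maxlen) int32
#       mask,       # (batch, maxlen) float32, 1.0 at real tokens
#       end_mask,   # (batch, maxlen) float32, 1.0 only at the last real token
#       maxlen)     # int32 scalar: the padded sentence length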
def __init__(self, params, data):
    self.get_pos_map(data)
    self.cap = params.cap
    self.lowercase = params.lowercase
    self.featuretype = params.featuretype
    chardim = params.chardim  # dimension of character network layer
    worddim = params.worddim  # dimension of character embedding and word LSTM layer

    if not params.nntype == "charagram":
        self.chars = self.get_character_dict(data)
        Ce = lasagne.init.Uniform(range=0.5 / len(self.chars))
        Ce_np = Ce.sample((len(self.chars), params.worddim))
        Ce = theano.shared(np.asarray(Ce_np, dtype=config.floatX))

    char = T.imatrix()
    charmask = T.matrix()
    word = T.imatrix()
    wordmask = T.matrix()
    idxs = T.ivector()
    Y = T.matrix()

    l_in_char = lasagne.layers.InputLayer((None, None))

    if params.nntype == "charlstm":
        l_mask_char = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_char = lasagne.layers.EmbeddingLayer(
            l_in_char, input_size=Ce.get_value().shape[0],
            output_size=Ce.get_value().shape[1], W=Ce)
        if params.outgate:
            l_lstm_char = lasagne.layers.LSTMLayer(
                l_emb_char, chardim, peepholes=True, learn_init=False,
                mask_input=l_mask_char)
        else:
            l_lstm_char = lasagne_lstm_nooutput(
                l_emb_char, chardim, peepholes=True, learn_init=False,
                mask_input=l_mask_char)
        # take the last hidden state as the word representation
        l_We = lasagne.layers.SliceLayer(l_lstm_char, -1, 1)
        We = lasagne.layers.get_output(l_We, {l_in_char: char, l_mask_char: charmask})
    elif params.nntype == "charagram":
        char = T.matrix()
        self.featuremap = self.get_feature_map(data, params.featuretype,
                                               params.cutoff, params.lowercase)
        print "Number of features: ", len(self.featuremap)
        l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap) + 1))
        if self.cap:
            l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap) + 2))
        if params.numlayers == 1:
            l_We = lasagne.layers.DenseLayer(l_in_char, chardim, nonlinearity=params.act)
        elif params.numlayers == 2:
            l_1 = lasagne.layers.DenseLayer(l_in_char, chardim, nonlinearity=params.act)
            l_We = lasagne.layers.DenseLayer(l_1, chardim, nonlinearity=params.act)
        else:
            raise ValueError('Only 1-2 layers are supported currently.')
        We = lasagne.layers.get_output(l_We, {l_in_char: char})
    elif params.nntype == "charcnn":
        l_emb_char = lasagne.layers.EmbeddingLayer(
            l_in_char, input_size=Ce.get_value().shape[0],
            output_size=Ce.get_value().shape[1], W=Ce)
        emb = lasagne.layers.DimshuffleLayer(l_emb_char, (0, 2, 1))
        if params.conv_type == 1:
            conv_params = [(175, 2), (175, 3), (175, 4)]
        else:
            conv_params = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
        layers = []
        for num_filters, filter_size in conv_params:
            conv = lasagne.layers.Conv1DLayer(emb, num_filters, filter_size,
                                              nonlinearity=params.act)
            pl = lasagne.layers.GlobalPoolLayer(conv, theano.tensor.max)
            pl = lasagne.layers.FlattenLayer(pl)
            layers.append(pl)
        concat = lasagne.layers.ConcatLayer(layers)
        l_We = lasagne.layers.DenseLayer(concat, num_units=chardim,
                                         nonlinearity=params.act)
        We = lasagne.layers.get_output(l_We, {l_in_char: char})
    else:
        l_We = None
        We = None

    # word-level BiLSTM over the character-derived word embeddings
    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word = lasagne_embedding_layer_2(l_in_word, chardim, We)
    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True,
                                            learn_init=False, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True,
                                            learn_init=False, mask_input=l_mask_word,
                                            backwards=True)
    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, worddim))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, worddim))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    l_emb = lasagne.layers.DenseLayer(concat2, num_units=worddim,
                                      nonlinearity=lasagne.nonlinearities.tanh)
    l_out = lasagne.layers.DenseLayer(l_emb, num_units=len(self.tags),
                                      nonlinearity=lasagne.nonlinearities.softmax)

    embg = lasagne.layers.get_output(l_out, {l_in_word: word, l_mask_word: wordmask})
    embg = embg[idxs]
    prediction = T.argmax(embg, axis=1)

    self.all_params = (lasagne.layers.get_all_params(l_out, trainable=True)
                       + lasagne.layers.get_all_params(l_We, trainable=True))
    reg = 0.5 * params.LC * sum(lasagne.regularization.l2(x) for x in self.all_params)
    cost = T.nnet.categorical_crossentropy(embg, Y)
    cost = T.mean(cost) + reg

    self.feedforward_function = None
    self.scoring_function = None
    self.cost_function = None
    self.train_function = None

    if params.nntype == "charlstm":
        self.feedforward_function = theano.function(
            [char, charmask, word, wordmask, idxs], embg)
        self.scoring_function = theano.function(
            [char, charmask, word, wordmask, idxs], prediction)
        self.cost_function = theano.function(
            [char, charmask, word, wordmask, idxs, Y], cost)
        grads = theano.gradient.grad(cost, self.all_params)
        updates = lasagne.updates.momentum(grads, self.all_params, 0.2,
                                           momentum=0.95)  # same as Ling et al.
        self.train_function = theano.function(
            [char, charmask, word, wordmask, idxs, Y], cost, updates=updates)
    elif params.nntype == "charcnn" or params.nntype == "charagram":
        self.feedforward_function = theano.function(
            [char, word, wordmask, idxs], embg)
        self.scoring_function = theano.function(
            [char, word, wordmask, idxs], prediction)
        self.cost_function = theano.function(
            [char, word, wordmask, idxs, Y], cost)
        grads = theano.gradient.grad(cost, self.all_params)
        updates = lasagne.updates.momentum(grads, self.all_params, 0.2,
                                           momentum=0.95)  # same as Ling et al.
        self.train_function = theano.function(
            [char, word, wordmask, idxs, Y], cost, updates=updates)
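# Hedged sketch (an assumption, not the original helper): lasagne_embedding_layer_2
# is called throughout with (incoming, output_size, W), where W is either a shared
# matrix or a symbolic expression built by another network. A minimal custom layer
# consistent with those call sites, which indexes W without registering it via
# add_param, might look like this:
class lasagne_embedding_layer_2(lasagne.layers.Layer):
    def __init__(self, incoming, output_size, W, **kwargs):
        super(lasagne_embedding_layer_2, self).__init__(incoming, **kwargs)
        self.output_size = output_size
        self.W = W  # reused as-is; gradients flow into W, but it is not a layer param

    def get_output_shape_for(self, input_shape):
        return input_shape + (self.output_size,)

    def get_output_for(self, input, **kwargs):
        return self.W[input]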
def __init__(self, We_initial, char_embedd_table_initial, params):
    self.eta = params.eta
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)

    g = T.imatrix()
    gmask = T.fmatrix()
    y = T.ivector()
    idxs = T.ivector()
    length = T.iscalar()
    char_input_var = T.itensor3(name='char-inputs')

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(
        shape=(None, None, Max_Char_Length),
        input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(
        layer_char, input_size=char_dic_size, output_size=char_embedd_dim,
        W=char_embedd_table, name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # convolution hyperparameters
    conv_window = 3
    num_filters = params.num_filters

    # dropout before the character CNN
    if params.dropout:
        layer_char = lasagne.layers.DropoutLayer(layer_char, p=0.5)

    # construct the character convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(
        layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
        nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size so pooling spans all time steps of the CNN output
    _, _, pool_size = cnn_layer.output_shape
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape to match the LSTM incoming layer:
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # concatenate the character CNN features with the word embeddings
    l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)
    if params.dropout:
        l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)

    if params.inf == 0:
        # BiLSTM encoder
        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden,
                                                mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden,
                                                mask_input=l_mask_word, backwards=True)
        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    elif params.inf == 1:
        # CNN encoder with window sizes 1 and 3
        l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
        l_cnn_1 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad='same')
        l_cnn_3 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same')
        l_cnn = lasagne.layers.ConcatLayer([l_cnn_1, l_cnn_3], axis=1)
        concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
        concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, 2 * hidden))
    else:
        # single CNN encoder followed by a dense layer
        l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
        l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same')
        concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
        concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, hidden))
        concat2 = lasagne.layers.DenseLayer(concat2, num_units=hidden)

    if params.dropout:
        concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

    l_out = lasagne.layers.DenseLayer(
        concat2, num_units=params.num_labels,
        nonlinearity=lasagne.nonlinearities.softmax)

    output = lasagne.layers.get_output(l_out, {
        l_in_word: g, l_mask_word: gmask, layer_char_input: char_input_var})
    output_1 = output[idxs]
    test_output = lasagne.layers.get_output(l_out, {
        l_in_word: g, l_mask_word: gmask, layer_char_input: char_input_var},
        deterministic=True)
    test_output_1 = test_output[idxs]

    model_params = lasagne.layers.get_all_params(l_out, trainable=True)
    self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)

    reg = sum(lasagne.regularization.l2(x) for x in model_params)
    cost = lasagne.objectives.categorical_crossentropy(output_1, y)
    cost = T.mean(cost) + params.L2 * reg

    final_pred = T.argmax(test_output_1, axis=1)
    self.acc_function = theano.function(
        [g, char_input_var, gmask, y, idxs, length], final_pred,
        on_unused_input='warn')

    #updates = lasagne.updates.adam(cost, model_params, self.eta)
    updates = lasagne.updates.sgd(cost, model_params, self.eta)
    updates = lasagne.updates.apply_momentum(updates, model_params, momentum=0.9)
    self.train_function = theano.function(
        [g, char_input_var, gmask, y, idxs, length], cost,
        updates=updates, on_unused_input='warn')
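# Hedged usage sketch (names are assumptions, not from the original file):
# char_ids holds one row of character ids per word, padded to Max_Char_Length,
# and `positions` selects the flattened (batch * maxlen) rows that carry
# supervised tokens:
#
#   cost = model.train_function(
#       word_ids,    # (batch, maxlen) int32
#       char_ids,    # (batch, maxlen, Max_Char_Length) int32
#       mask,        # (batch, maxlen) float32
#       labels,      # (n_selected,) int32
#       positions,   # (n_selected,) int32 indices into batch * maxlen
#       maxlen)
#   pred = model.acc_function(word_ids, char_ids, mask, labels, positions, maxlen)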
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    self.num_labels = params.num_labels
    self.de_hidden_size = params.de_hidden_size
    self.en_hidden_size = params.en_hidden_size
    print params.de_hidden_size, hidden, params.num_labels
    self.lstm_layers_num = 1

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    target_var_in = T.imatrix(name='in_targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    length0 = T.iscalar()
    t_t = T.fscalar()
    t_t0 = T.fscalar()

    # CRF pairwise potentials: num_labels tags plus one start/end state
    Wyy0 = np.random.uniform(
        -0.02, 0.02,
        (self.num_labels + 1, self.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, 512, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, 512,
                                            mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * 512))
    l_local = lasagne.layers.DenseLayer(
        l_reshape_concat, num_units=self.num_labels,
        nonlinearity=lasagne.nonlinearities.linear)

    # load the pretrained CRF energy-network weights
    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)
    f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    self.params = []
    self.hos = []
    self.Cos = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []

    # placeholders; decoderInputs0 carries the distribution over the start symbol
    ei, di, dt = T.imatrices(3)
    decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)

    # the last row of the decoder lookup table is the start symbol
    self.de_lookuptable = theano.shared(
        name="Decoder LookUpTable",
        value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size),
        borrow=True)
    self.linear = theano.shared(
        name="Linear",
        value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size,
                                  self.num_labels),
        borrow=True)
    self.linear_bias = theano.shared(
        name="Hidden to Bias",
        value=np.asarray(np.random.randn(self.num_labels, ) * 0.,
                         dtype=theano.config.floatX),
        borrow=True)

    input_var_shuffle = input_var.dimshuffle(1, 0)
    mask_var_shuffle = mask_var.dimshuffle(1, 0)
    target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
    target_var_shuffle = target_var.dimshuffle(1, 0)

    self.params += [self.linear, self.linear_bias, self.de_lookuptable]

    # encoder input: [sent_length, batch, embsize]
    state_below = We[input_var_shuffle.flatten()].reshape(
        (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))
    enclstm_f = LSTM(embsize, self.en_hidden_size)
    enclstm_b = LSTM(embsize, self.en_hidden_size, True)
    self.encoder_lstm_layers.append(enclstm_f)
    self.encoder_lstm_layers.append(enclstm_b)
    self.params += enclstm_f.params + enclstm_b.params

    hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
    hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)
    hs = T.concatenate([hs_f, hs_b], axis=2)
    Cs = T.concatenate([Cs_f, Cs_b], axis=2)
    hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
    Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
    # the decoder starts from zero states
    self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX),
                        input_var_shuffle.shape[1], self.de_hidden_size),
    Encoder = hs

    self.encoder_function = theano.function(
        inputs=[ei, em], outputs=Encoder,
        givens={input_var: ei, mask_var: em})

    state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape(
        (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1],
         self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,
        self.params += declstm.params
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)

    decoder_lstm_outputs = T.concatenate([Encoder, state_below], axis=2)
    linear_outputs = T.dot(decoder_lstm_outputs, self.linear) \
        + self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(
        fn=lambda x: T.nnet.softmax(x),
        sequences=[linear_outputs])

    def _NLL(pred, y, m):  # (unused helper)
        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

    def _step2(ctx_, state_, hs_, Cs_):
        # one greedy decoding step: feed back the argmax of the previous output
        hs, Cs = [], []
        token_idxs = T.cast(state_.argmax(axis=-1), "int32")
        msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape(
            (1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
        state_below0 = T.concatenate([ctx_, state_below0], axis=1)
        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = T.nnet.softmax(newpred)
        # the probability of the start symbol is zero
        extra_p = T.zeros_like(hs[:, :, 0])
        state_below = T.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    ctx_0, state_0 = T.fmatrices(2)
    hs_0 = T.ftensor3()
    Cs_0 = T.ftensor3()
    state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0)
    self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0],
                                  [state_below_tmp, hs_tmp, Cs_tmp],
                                  name='f_next')

    hs0, Cs0 = (T.as_tensor_variable(self.hos, name="hs0"),
                T.as_tensor_variable(self.Cos, name="Cs0"))
    train_outputs, _ = theano.scan(fn=_step2,
                                   sequences=[Encoder],
                                   outputs_info=[decoderInputs0, hs0, Cs0],
                                   n_steps=input_var_shuffle.shape[0])

    predy = train_outputs[0].dimshuffle(1, 0, 2)
    predy = predy[:, :, :-1] * mask_var[:, :, None]
    predy0 = predy.reshape((-1, self.num_labels))

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(
        l_local, {l_in_word: input_var, l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, self.num_labels))
    local_energy = local_energy * mask_var[:, :, None]
    # transition into the end symbol at each sequence's last position
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, self.num_labels)
    A = A.reshape((-1, length, self.num_labels))

    # energy of the soft predictions
    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(
        T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    # energy of the one-hot (argmax) predictions
    targets_shuffled0 = A.dimshuffle(1, 0, 2)
    target_time00 = targets_shuffled0[0]
    initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
    initials0 = [target_time00, initial_energy00]
    [_, target_energies0], _ = theano.scan(
        fn=inner_function, outputs_info=initials0,
        sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
    cost110 = target_energies0[-1] + T.sum(
        T.sum(local_energy * A, axis=2) * mask_var, axis=1)

    y_f = target_var.flatten()

    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * (1 - 0.01 * t_t)

    if params.regutype == 0:
        # cross-entropy regularizer toward the gold labels
        ce_hinge = lasagne.objectives.categorical_crossentropy(predy0 + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
        cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
    else:
        # entropy regularizer on the predicted distributions
        entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * mask_var, axis=1)
        cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

    from momentum import momentum
    updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

    if params.regutype == 0:
        self.train_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, t_t0, di0],
            outputs=[cost, ce_hinge],
            updates=updates_a,
            on_unused_input='ignore',
            givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                    length: length0, t_t: t_t0, decoderInputs0: di0})
    else:
        self.train_fn = theano.function(
            inputs=[ei, dt, em, em1, length0, t_t0, di0],
            outputs=[cost, entropy_term],
            updates=updates_a,
            on_unused_input='ignore',
            givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                    length: length0, t_t: t_t0, decoderInputs0: di0})

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        inputs=[ei, dt, em, em1, length0, di0],
        outputs=[cost11, cost110, corr_train, num_tokens, prediction],
        on_unused_input='ignore',
        givens={input_var: ei, target_var: dt, mask_var: em, mask_var1: em1,
                length: length0, decoderInputs0: di0})
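# Hedged sketch of the local "momentum" module imported above ("from momentum
# import momentum"); only its (cost, params, eta, momentum) call site is shown, so
# this assumes it mirrors plain SGD with classical momentum over an explicit
# parameter list, returning an updates dict in the lasagne style:
from collections import OrderedDict

def momentum(cost, params, learning_rate, momentum=0.9):
    grads = T.grad(cost, params)
    updates = OrderedDict()
    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # one velocity buffer per parameter, initialized to zero
        velocity = theano.shared(np.zeros(value.shape, dtype=value.dtype))
        new_velocity = momentum * velocity - learning_rate * grad
        updates[velocity] = new_velocity
        updates[param] = param + new_velocity
    return updates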
def __init__(self, We_initial, char_embedd_table_initial, params):
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    # the relaxed label sequence optimized directly by gradient-based inference
    input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, 25)).astype('float32')
    self.input_init = theano.shared(input_init)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    t_t = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
    Wyy = theano.shared(Wyy0)

    char_input_var = T.itensor3()
    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(
        shape=(None, None, Max_Char_Length),
        input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(
        layer_char, input_size=char_dic_size, output_size=char_embedd_dim,
        W=char_embedd_table, name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    conv_window = 3
    num_filters = params.num_filters

    cnn_layer = lasagne.layers.Conv1DLayer(
        layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
        nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # pool over all time steps of the CNN output
    _, _, pool_size = cnn_layer.output_shape
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # concatenate the character CNN features with the word embeddings
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden,
                                            mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(
        l_reshape_concat, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

    # load the pretrained BiLSTM-CNN-CRF energy network
    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    f = open('POS_Bilstm_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.002_0.030_emb_1_hidden_100.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(
        l_local, {l_in_word: input_var, l_mask_word: mask_var,
                  layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * mask_var[:, :, None]
    # transition into the end symbol at each sequence's last position
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy_init = self.input_init[:, :length, :]
    a_params = [self.input_init]

    predy = T.nnet.softmax(predy_init.reshape((-1, 25)))
    predy = predy.reshape((-1, length, 25))
    prediction = T.argmax(predy_init, axis=2)
    predy = predy * mask_var[:, :, None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(
        T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    # annealed regularization weight (computed for parity with the regularized
    # variants; the cost below uses only the negative energy)
    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * (1 - 0.01 * t_t)

    cost = T.mean(-cost11)

    updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
    updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

    # one gradient step on the relaxed label sequence
    self.inf_fn = theano.function(
        [input_var, char_input_var, mask_var, mask_var1, length],
        cost, updates=updates_a)

    self.eval_fn = theano.function(
        [input_var, char_input_var, mask_var, mask_var1, length],
        [prediction, -cost11],
        on_unused_input='ignore')

    if params.WarmStart:
        # an inference network used to warm-start the relaxed labels
        hidden_inf = params.hidden_inf
        We_inf = theano.shared(We_initial)
        char_embedd_table_inf = theano.shared(char_embedd_table_initial)

        l_in_word_a = lasagne.layers.InputLayer((None, None))
        l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_word_a = lasagne.layers.EmbeddingLayer(
            l_in_word_a, input_size=We_initial.shape[0], output_size=embsize,
            W=We_inf, name='inf_word_embedding')

        layer_char_input_a = lasagne.layers.InputLayer(
            shape=(None, None, Max_Char_Length),
            input_var=char_input_var, name='char-input')
        layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2]))
        layer_char_embedding_a = lasagne.layers.EmbeddingLayer(
            layer_char_a, input_size=char_dic_size, output_size=char_embedd_dim,
            W=char_embedd_table_inf, name='char_embedding')
        layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a,
                                                      pattern=(0, 2, 1))
        conv_window = 3
        num_filters = params.num_filters
        cnn_layer_a = lasagne.layers.Conv1DLayer(
            layer_char_a, num_filters=num_filters, filter_size=conv_window,
            pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
        pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a, pool_size=pool_size)
        output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a, (-1, length, [1]))

        # concatenate the character CNN features with the word embeddings
        l_emb_word_a = lasagne.layers.concat([output_cnn_layer_a, l_emb_word_a], axis=2)

        l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1))
        l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 1, 1, pad='same')
        l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 3, 1, pad='same')
        l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1)
        concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
        concat2_a = lasagne.layers.ReshapeLayer(concat2_a, (-1, 2 * hidden_inf))

        # output logit scores before the softmax operation, not probabilities
        l_local_a = lasagne.layers.DenseLayer(
            concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

        predy_inf = lasagne.layers.get_output(
            l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var,
                        layer_char_input_a: char_input_var})
        predy_inf = predy_inf.reshape((-1, length, 25))

        # load the pretrained inference network
        a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
        f = open('CRF_Inf_POS_num_filters_30_dropout_1_LearningRate_0.002_1.0_emb_1_inf_1_hidden_inf_300.pickle', 'r')
        data = pickle.load(f)
        f.close()
        for idx, p in enumerate(a_params):
            p.set_value(data[idx])

        self.start_fn = theano.function(
            [input_var, char_input_var, mask_var, length],
            predy_inf, on_unused_input='ignore')
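# Hedged usage sketch (loop structure, step count, and names are assumptions):
# inference here is iterated gradient descent on the relaxed label scores held in
# self.input_init, optionally warm-started from the pretrained inference network.
# Note the warm-start logits must be padded to the fixed (10, MAX_lENGTH, 25)
# buffer shape before set_value.
#
#   if params.WarmStart:
#       warm_logits = model.start_fn(word_ids, char_ids, mask, maxlen)
#       model.input_init.set_value(pad_to_buffer(warm_logits))   # hypothetical helper
#   for _ in range(num_inference_steps):   # e.g. a few dozen steps
#       energy = model.inf_fn(word_ids, char_ids, mask, end_mask, maxlen)
#   pred, neg_energy = model.eval_fn(word_ids, char_ids, mask, end_mask, maxlen)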
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    # start/end label embeddings for the label language model
    start0 = np.random.uniform(-0.02, 0.02, (1, 25)).astype('float32')
    end0 = np.random.uniform(-0.02, 0.02, (1, 25)).astype('float32')
    start = theano.shared(start0)
    end = theano.shared(end0)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden,
                                            mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(
        l_reshape_concat, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

    # load the pretrained CRF energy network
    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    f = open('LF_LIFU_Simple_CRF_lstm_pretrain.Batchsize_10_dropout_0_LearningRate_0.1_1e-050_emb_0.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    # the inference network: a BiLSTM tagger trained to minimize the energy
    l_in_word_a = lasagne.layers.InputLayer((None, None))
    l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize, We)
    if params.dropout:
        l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)
    l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden,
                                              mask_input=l_mask_word_a)
    l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden,
                                              mask_input=l_mask_word_a, backwards=True)
    l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden))
    l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden))
    concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
    l_local_a = lasagne.layers.DenseLayer(
        concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)
    a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
    self.a_params = a_params

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(
        l_local, {l_in_word: input_var, l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * mask_var[:, :, None]
    # transition into the end symbol at each sequence's last position
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy0 = lasagne.layers.get_output(
        l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, 25)
    A = A.reshape((-1, length, 25))

    predy = predy0.reshape((-1, length, 25))
    predy = predy * mask_var[:, :, None]

    # energy of the soft predictions
    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(
        T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    # energy of the one-hot (argmax) predictions
    targets_shuffled0 = A.dimshuffle(1, 0, 2)
    target_time00 = targets_shuffled0[0]
    initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
    initials0 = [target_time00, initial_energy00]
    [_, target_energies0], _ = theano.scan(
        fn=inner_function, outputs_info=initials0,
        sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
    cost110 = target_energies0[-1] + T.sum(
        T.sum(local_energy * A, axis=2) * mask_var, axis=1)

    predy_f = predy.reshape((-1, 25))
    y_f = target_var.flatten()

    ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f, y_f)
    ce_hinge = ce_hinge.reshape((-1, length))
    ce_hinge = T.sum(ce_hinge * mask_var, axis=1) / mask_var.sum(axis=1)

    entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
    entropy_term = entropy_term.reshape((-1, length))
    entropy_term = T.sum(entropy_term * mask_var, axis=1)

    # label sequence language model score for each sequence
    l_LM_in = lasagne.layers.InputLayer((None, None, 25))
    l_LM_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_LM_lstm = lasagne.layers.LSTMLayer(l_LM_in, hidden, mask_input=l_LM_mask)
    l_reshape_LM = lasagne.layers.ReshapeLayer(l_LM_lstm, (-1, hidden))
    l_LM = lasagne.layers.DenseLayer(
        l_reshape_LM, num_units=26, nonlinearity=lasagne.nonlinearities.softmax)
    LM_params = lasagne.layers.get_all_params(l_LM, trainable=True)
    LM_params.append(start)
    LM_params.append(end)

    # load the pretrained label language model
    f = open('Label_LM.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(LM_params):
        p.set_value(data[idx])

    # wrap the predicted label sequence with start/end symbols
    predy_tmp = predy[:, 0, :].reshape((-1, 1, 25))
    tmp = T.ones_like(predy_tmp)
    sos = tmp * (start.dimshuffle('x', 0, 1))
    eos = tmp * (end.dimshuffle('x', 0, 1))
    y_lm_in = T.concatenate([sos, predy], axis=1)
    y_lm_out = T.concatenate([predy, eos], axis=1)
    lm_mask_var = T.concatenate([tmp[:, 0, 0].reshape((-1, 1)), mask_var], axis=1)

    LM_out = lasagne.layers.get_output(l_LM, {l_LM_in: y_lm_in, l_LM_mask: lm_mask_var})
    LM_out = LM_out.reshape((-1, length + 1, 26))
    LM_cost = T.sum(
        T.log(T.sum(LM_out[:, :-1, :-1] * y_lm_out[:, :-1, :], axis=2) + eps) * mask_var,
        axis=1)

    cost = (T.mean(-cost11) - params.l3 * T.mean(entropy_term)
            - params.lm * T.mean(LM_cost))

    updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
    updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

    self.train_fn = theano.function(
        [input_var, target_var, mask_var, mask_var1, length],
        [cost, T.mean(entropy_term), T.mean(LM_cost)],
        updates=updates_a, on_unused_input='ignore')

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        [input_var, target_var, mask_var, mask_var1, length],
        [cost11, cost110, corr_train, num_tokens, prediction],
        on_unused_input='ignore')
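# Hedged sketch: `eps` is used above for numerical smoothing inside the logs and
# cross-entropies but is not defined in this excerpt; a module-level constant of
# the usual magnitude is assumed:
eps = np.float32(1e-6)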
def __init__(self, We_initial, char_embedd_table_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    We_inf = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    hidden_inf = params.hidden_inf

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    t_t = T.fscalar()

    # CRF pairwise potentials: 17 NER labels plus one start/end state
    Wyy0 = np.random.uniform(-0.02, 0.02, (18, 18)).astype('float32')
    Wyy = theano.shared(Wyy0)

    char_input_var = T.itensor3()
    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)
    char_embedd_table_inf = theano.shared(char_embedd_table_initial)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize,
            W=We, name='word_embedding')
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(
        shape=(None, None, Max_Char_Length),
        input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(
        layer_char, input_size=char_dic_size, output_size=char_embedd_dim,
        W=char_embedd_table, name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    conv_window = 3
    num_filters = params.num_filters

    cnn_layer = lasagne.layers.Conv1DLayer(
        layer_char, num_filters=num_filters, filter_size=conv_window, pad='full',
        nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # pool over all time steps of the CNN output
    _, _, pool_size = cnn_layer.output_shape
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # concatenate the character CNN features with the word embeddings
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden,
                                            mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(
        l_reshape_concat, num_units=17, nonlinearity=lasagne.nonlinearities.linear)

    # load the pretrained BiLSTM-CNN-CRF energy network
    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)
    f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    # ---- the inference network ----
    l_in_word_a = lasagne.layers.InputLayer((None, None))
    l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word_a = lasagne.layers.EmbeddingLayer(
        l_in_word_a, input_size=We_initial.shape[0], output_size=embsize,
        W=We_inf, name='inf_word_embedding')

    layer_char_input_a = lasagne.layers.InputLayer(
        shape=(None, None, Max_Char_Length),
        input_var=char_input_var, name='char-input')
    layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2]))
    layer_char_embedding_a = lasagne.layers.EmbeddingLayer(
        layer_char_a, input_size=char_dic_size, output_size=char_embedd_dim,
        W=char_embedd_table_inf, name='char_embedding')
    layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a,
                                                  pattern=(0, 2, 1))

    conv_window = 3
    num_filters = params.num_filters

    # dropout before the character CNN
    if params.dropout:
        layer_char_a = lasagne.layers.DropoutLayer(layer_char_a, p=0.5)

    cnn_layer_a = lasagne.layers.Conv1DLayer(
        layer_char_a, num_filters=num_filters, filter_size=conv_window, pad='full',
        nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a, pool_size=pool_size)
    # [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a, (-1, length, [1]))

    # concatenate the character CNN features with the word embeddings
    l_emb_word_a = lasagne.layers.concat([output_cnn_layer_a, l_emb_word_a], axis=2)
    if params.dropout:
        l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

    if params.inf == 0:
        # two stacked BiLSTM layers
        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf,
                                                  mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf,
                                                  mask_input=l_mask_word_a,
                                                  backwards=True)
        l_emb_word_a1 = lasagne.layers.concat([l_lstm_wordf_a, l_lstm_wordb_a], axis=2)
        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a1, hidden_inf,
                                                  mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a1, hidden_inf,
                                                  mask_input=l_mask_word_a,
                                                  backwards=True)
        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden_inf))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden_inf))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
    else:
        # CNN encoder with window sizes 1 and 3
        l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1))
        l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 3, 1, pad='same')
        l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 1, 1, pad='same')
        l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1)
        concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
        concat2_a = lasagne.layers.ReshapeLayer(concat2_a, (-1, 2 * hidden_inf))

    if params.dropout:
        concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

    l_local_a = lasagne.layers.DenseLayer(
        concat2_a, num_units=17, nonlinearity=lasagne.nonlinearities.softmax)
    a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
    self.a_params = a_params

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(
        l_local, {l_in_word: input_var, l_mask_word: mask_var,
                  layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 17))
    local_energy = local_energy * mask_var[:, :, None]
    # transition into the end symbol at each sequence's last position
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy0 = lasagne.layers.get_output(
        l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var,
                    layer_char_input_a: char_input_var})
    predy_inf = lasagne.layers.get_output(
        l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var,
                    layer_char_input_a: char_input_var},
        deterministic=True)
    predy_inf = predy_inf.reshape((-1, length, 17))

    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, 17)
    A = A.reshape((-1, length, 17))

    predy = predy0.reshape((-1, length, 17))
    predy = predy * mask_var[:, :, None]

    # energy of the soft predictions
    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(
        fn=inner_function, outputs_info=initials,
        sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(
        T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    # energy of the one-hot (argmax) predictions
    targets_shuffled0 = A.dimshuffle(1, 0, 2)
    target_time00 = targets_shuffled0[0]
    initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
    initials0 = [target_time00, initial_energy00]
    [_, target_energies0], _ = theano.scan(
        fn=inner_function, outputs_info=initials0,
        sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
    cost110 = target_energies0[-1] + T.sum(
        T.sum(local_energy * A, axis=2) * mask_var, axis=1)

    predy_f = predy.reshape((-1, 17))
    y_f = target_var.flatten()

    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * np.e ** (-0.01 * t_t)

    if params.regutype == 0:
        # cross-entropy regularizer toward the gold labels
        ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
        cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
    else:
        # entropy regularizer on the predicted distributions
        entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * mask_var, axis=1)
        cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

    updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
    updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

    if params.regutype == 0:
        self.train_fn = theano.function(
            [input_var, char_input_var, target_var, mask_var, mask_var1, length, t_t],
            [cost, ce_hinge], updates=updates_a, on_unused_input='ignore')
    else:
        self.train_fn = theano.function(
            [input_var, char_input_var, target_var, mask_var, mask_var1, length, t_t],
            [cost, entropy_term], updates=updates_a, on_unused_input='ignore')

    prediction = T.argmax(predy_inf, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        [input_var, char_input_var, target_var, mask_var, mask_var1, length],
        [corr_train, num_tokens, prediction], on_unused_input='ignore')
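# Hedged usage sketch (the epoch loop and variable names are assumptions): t_t
# feeds the annealing schedule above, so a natural driver passes the epoch (or
# update) counter as a float32 on each call:
#
#   for epoch in range(n_epochs):
#       for word_ids, char_ids, tag_ids, mask, end_mask, maxlen in batches:
#           cost, reg_term = model.train_fn(word_ids, char_ids, tag_ids,
#                                           mask, end_mask, maxlen,
#                                           np.float32(epoch))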
def __init__(self, We_initial, char_embedd_table_initial, params):
    We = theano.shared(We_initial)
    # initial embedding for the InfNet
    We_inf = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    self.en_hidden_size = params.hidden_inf
    self.num_labels = 17
    self.de_hidden_size = params.de_hidden_size

    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)
    char_embedd_table_inf = theano.shared(char_embedd_table_initial)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    target_var_in = T.imatrix(name='targets_in')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    char_input_var = T.itensor3(name='char-inputs')
    length = T.iscalar()
    length0 = T.iscalar()
    t_t = T.fscalar()
    t_t0 = T.fscalar()
    use_dropout = T.fscalar()
    use_dropout0 = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02, (self.num_labels + 1, self.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We, name='word_embedding')
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer: [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # finally, concatenate the two incoming layers together
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=self.num_labels + 1, nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)

    # load the pretrained BiLSTM-CNN-CRF energy parameters
    f = open('NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    self.params = []
    self.hos = []
    self.Cos = []
    self.encoder_lstm_layers = []
    self.decoder_lstm_layers = []
    self.lstm_layers_num = 1

    ei, di, dt = T.imatrices(3)  # place holders
    decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6)
    ci = T.itensor3()

    # the last row of the lookup table is the start symbol
    self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform(self.num_labels + 1, self.de_hidden_size), borrow=True)
    self.linear = theano.shared(name="Linear", value=init_xavier_uniform(self.de_hidden_size + 2 * self.en_hidden_size, self.num_labels), borrow=True)
    self.linear_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, ) * 0., dtype=theano.config.floatX), borrow=True)
    #self.hidden_decode = theano.shared(name="Hidden to Decode", value=init_xavier_uniform(2 * hidden, self.de_hidden_size), borrow=True)
    #self.hidden_bias = theano.shared(name="Hidden to Bias", value=np.asarray(np.random.randn(self.de_hidden_size, ) * 0., dtype=theano.config.floatX), borrow=True)

    input_var_shuffle = input_var.dimshuffle(1, 0)
    mask_var_shuffle = mask_var.dimshuffle(1, 0)
    target_var_in_shuffle = target_var_in.dimshuffle(1, 0)
    target_var_shuffle = target_var.dimshuffle(1, 0)

    self.params += [We_inf, self.linear, self.de_lookuptable, self.linear_bias]

    # [batch, sent_length, embsize]
    state_below = We_inf[input_var_shuffle.flatten()].reshape((input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize))

    # character-level word embedding for the inference network
    layer_char_input_inf = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input')
    layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2]))
    layer_char_embedding_inf = lasagne.layers.EmbeddingLayer(layer_char_inf, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table_inf, name='char_embedding_inf')
    layer_char_inf = lasagne.layers.DimshuffleLayer(layer_char_embedding_inf, pattern=(0, 2, 1))
    #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5)

    cnn_layer_inf = lasagne.layers.Conv1DLayer(layer_char_inf, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn_inf')
    pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size)
    output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1]))

    char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True)
    self.params += char_params

    # [batch, sent_length, num_filters]
    #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf: char_input_var})
    char_state_below = lasagne.layers.get_output(output_cnn_layer_inf)
    char_state_below = dropout_layer(char_state_below, use_dropout, trng)
    char_state_shuff = char_state_below.dimshuffle(1, 0, 2)
    state_below = T.concatenate([state_below, char_state_shuff], axis=2)
    state_below = dropout_layer(state_below, use_dropout, trng)

    enclstm_f = LSTM(embsize + num_filters, self.en_hidden_size)
    enclstm_b = LSTM(embsize + num_filters, self.en_hidden_size, True)
    self.encoder_lstm_layers.append(enclstm_f)
    self.encoder_lstm_layers.append(enclstm_b)
    self.params += enclstm_f.params + enclstm_b.params

    hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle)
    hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle)
    hs = T.concatenate([hs_f, hs_b], axis=2)
    Cs = T.concatenate([Cs_f, Cs_b], axis=2)
    hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1)
    Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1)
    #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias),
    #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias),
    self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size),
    self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size),
    Encoder = hs

    state_below = self.de_lookuptable[target_var_in_shuffle.flatten()].reshape((target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1], self.de_hidden_size))
    for i in range(self.lstm_layers_num):
        declstm = LSTM(self.de_hidden_size, self.de_hidden_size)
        self.decoder_lstm_layers += declstm,  # append
        self.params += declstm.params  # concatenate
        ho, Co = self.hos[i], self.Cos[i]
        state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co)

    decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2)
    linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :]
    softmax_outputs, updates = theano.scan(fn=lambda x: T.nnet.softmax(x), sequences=[linear_outputs])

    def _NLL(pred, y, m):
        return -m * T.log(pred[T.arange(input_var.shape[0]), y])

    """
    costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, target_var_shuffle, mask_var_shuffle])
    #loss = costs.sum() / mask_var.sum() + params.L2 * sum(lasagne.regularization.l2(x) for x in self.params)
    loss = costs.sum() / mask_var.sum()

    updates = lasagne.updates.sgd(loss, self.params, self.eta)
    updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9)

    ###################################################
    #### using the ground truth when training
    ##################################################
    self._train = theano.function(
        inputs=[ei, em, di, dm, dt],
        outputs=[loss, softmax_outputs],
        updates=updates,
        givens={input_var: ei, mask_var: em, target_var_in: di, decoderMask: dm, target_var: dt})
    """

    def _step2(ctx_, state_, hs_, Cs_):
        hs, Cs = [], []
        token_idxs = T.cast(state_.argmax(axis=-1), "int32")
        msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1.)
        msk_ = msk_.dimshuffle('x', 0)
        state_below0 = self.de_lookuptable[token_idxs].reshape((1, ctx_.shape[0], self.de_hidden_size))
        for i, lstm in enumerate(self.decoder_lstm_layers):
            h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i])  # mind the mask
            hs += h[-1],
            Cs += C[-1],
            state_below0 = h
        hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs)
        state_below0 = state_below0.reshape((ctx_.shape[0], self.de_hidden_size))
        state_below0 = T.concatenate([ctx_, state_below0], axis=1)
        newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :]
        state_below = T.nnet.softmax(newpred)
        # the probability of the start symbol is 0
        extra_p = T.zeros_like(hs[:, :, 0])
        state_below = T.concatenate([state_below, extra_p.T], axis=1)
        return state_below, hs, Cs

    hs0, Cs0 = T.as_tensor_variable(self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0")
    train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=input_var_shuffle.shape[0])

    predy = train_outputs[0].dimshuffle(1, 0, 2)
    predy = predy[:, :, :-1] * mask_var[:, :, None]
    predy0 = predy.reshape((-1, 17))

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 17))
    local_energy = local_energy * mask_var[:, :, None]

    #####################
    # for the end symbol of a sequence
    ####################
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, 17)
    A = A.reshape((-1, length, 17))
    #predy = predy0.reshape((-1, length, 25))
    #predy = predy * mask_var[:, :, None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)
    cost = T.mean(-cost11)

    from momentum import momentum
    updates_a = momentum(cost, self.params, params.eta, momentum=0.9)

    self.train_fn = theano.function(
        inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
        outputs=[cost],
        updates=updates_a,
        on_unused_input='ignore',
        givens={input_var: ei, char_input_var: ci, mask_var: em, mask_var1: em1, length: length0, decoderInputs0: di0, use_dropout: use_dropout0})

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function(
        inputs=[ei, ci, em, em1, length0, di0, use_dropout0],
        outputs=[prediction, -cost11],
        on_unused_input='ignore',
        givens={input_var: ei, char_input_var: ci, mask_var: em, mask_var1: em1, length: length0, decoderInputs0: di0, use_dropout: use_dropout0})
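# Usage sketch for the functions compiled above. Everything here is illustrative:
# `model`, the batch shapes, and the padding scheme are assumptions, not part of
# the original source; only the argument order matches the theano.function calls above.
import numpy as np

batch, L = 10, 40
word_ids = np.zeros((batch, L), dtype='int32')                    # padded word indices
char_ids = np.zeros((batch, L, Max_Char_Length), dtype='int32')   # padded character indices
mask = np.ones((batch, L), dtype='float32')                       # 1.0 at real tokens
mask1 = np.zeros((batch, L), dtype='float32')
mask1[:, L - 1] = 1.0                                             # 1.0 only at each final token
d0 = np.zeros((batch, 18), dtype='float32')
d0[:, -1] = 1.0  # argmax selects index 17, the start-symbol row of de_lookuptable

cost, = model.train_fn(word_ids, char_ids, mask, mask1, L, d0, 1.0)           # dropout on
pred, energies = model.eval_fn(word_ids, char_ids, mask, mask1, L, d0, 0.0)   # dropout off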
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    l_local = lasagne.layers.DenseLayer(concat2, num_units=25, b=None, nonlinearity=lasagne.nonlinearities.linear)
    # the layers above form the unary-term energy function

    """
    if params.emb == 1:
        f = open('F.pickle')
    else:
        f = open('F0_new.pickle')
    para = pickle.load(f)
    f.close()
    """
    f_params = lasagne.layers.get_all_params(l_local, trainable=True)
    """
    for idx, p in enumerate(f_params):
        p.set_value(para[idx])
    """

    Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
    Wyy = theano.shared(Wyy0)
    d_params = lasagne.layers.get_all_params(l_local, trainable=True)
    d_params.append(Wyy)
    self.d_params = d_params

    l_in_word_a = lasagne.layers.InputLayer((None, None))
    l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize, l_emb_word.W)
    #l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a, input_size=We_initial.shape[0], output_size=embsize, W=We)
    if params.dropout:
        l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

    l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden, mask_input=l_mask_word_a)
    l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden, mask_input=l_mask_word_a, backwards=True)
    l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden))
    l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden))
    concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
    if params.dropout:
        concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

    # cost-augmented inference network
    l_local_a = lasagne.layers.DenseLayer(concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)

    #a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
    #self.a_params = a_params
    """
    if params.emb == 1:
        f = open('F.pickle')
    else:
        f = open('F0_new.pickle')
    PARA = pickle.load(f)
    f.close()
    for idx, p in enumerate(a_params):
        p.set_value(PARA[idx])
    """

    # test-time inference network (shares the BiLSTM features with l_local_a)
    l_local_a_inf = lasagne.layers.DenseLayer(concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)

    y_in = T.ftensor3()
    y = T.imatrix()
    g = T.imatrix()
    gmask = T.fmatrix()
    y_mask = T.fmatrix()
    length = T.iscalar()

    predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: g, l_mask_word_a: gmask})
    predy = predy0.reshape((-1, length, 25))
    predy0_inf = lasagne.layers.get_output(l_local_a_inf, {l_in_word_a: g, l_mask_word_a: gmask})
    predy_inf = predy0_inf.reshape((-1, length, 25))
    #predy = predy * gmask[:, :, None]
    #newpredy = T.concatenate([predy, y0], axis=2)  # n, L, 46, 46
    # predy0: n, L, 25

    # energy of a (soft) label sequence under the CRF
    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    # Input is provided as (n_batch, n_time_steps, num_labels), but scan requires
    # the iterable dimension to be first, so we dimshuffle to
    # (n_time_steps, n_batch, num_labels)
    local_energy = lasagne.layers.get_output(l_local, {l_in_word: g, l_mask_word: gmask})
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * gmask[:, :, None]

    targets_shuffled = y_in.dimshuffle(1, 0, 2)
    masks_shuffled = gmask.dimshuffle(1, 0)
    target_time0 = targets_shuffled[0]
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

    length_index = T.sum(gmask, axis=1) - 1
    length_index = T.cast(length_index, 'int32')

    """ground-truth energy"""
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    pos_end_target = y_in[T.arange(length_index.shape[0]), length_index]
    pos_cost = target_energies[-1] + T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1) + T.dot(pos_end_target, Wyy[:-1, -1])
    check = T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1)

    """energy of the cost-augmented InfNet output"""
    negtargets_shuffled = predy.dimshuffle(1, 0, 2)
    negtarget_time0 = negtargets_shuffled[0]
    neginitial_energy0 = T.dot(negtarget_time0, Wyy[-1, :-1])
    neginitials = [negtarget_time0, neginitial_energy0]
    [_, negtarget_energies], _ = theano.scan(fn=inner_function, outputs_info=neginitials, sequences=[negtargets_shuffled[1:], masks_shuffled[1:]])
    neg_end_target = predy[T.arange(length_index.shape[0]), length_index]
    neg_cost = negtarget_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * gmask, axis=1) + T.dot(neg_end_target, Wyy[:-1, -1])

    """energy of the test-time InfNet output"""
    negtargets_inf_shuffled = predy_inf.dimshuffle(1, 0, 2)
    negtarget_inf_time0 = negtargets_inf_shuffled[0]
    neginitial_inf_energy0 = T.dot(negtarget_inf_time0, Wyy[-1, :-1])
    neginitials_inf = [negtarget_inf_time0, neginitial_inf_energy0]
    [_, negtarget_inf_energies], _ = theano.scan(fn=inner_function, outputs_info=neginitials_inf, sequences=[negtargets_inf_shuffled[1:], masks_shuffled[1:]])
    neg_inf_end_target = predy_inf[T.arange(length_index.shape[0]), length_index]
    neg_inf_cost = negtarget_inf_energies[-1] + T.sum(T.sum(local_energy * predy_inf, axis=2) * gmask, axis=1) + T.dot(neg_inf_end_target, Wyy[:-1, -1])

    y_f = y.flatten()
    predy_f = predy.reshape((-1, 25))
    ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f + eps, y_f)
    ce_hinge = ce_hinge.reshape((-1, length))
    ce_hinge = T.sum(ce_hinge * gmask, axis=1)

    predy_inf_f = predy_inf.reshape((-1, 25))
    ce_hinge_inf = lasagne.objectives.categorical_crossentropy(predy_inf_f + eps, y_f)
    ce_hinge_inf = ce_hinge_inf.reshape((-1, length))
    ce_hinge_inf = T.sum(ce_hinge_inf * gmask, axis=1)

    entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
    entropy_term = entropy_term.reshape((-1, length))
    entropy_term = T.sum(entropy_term * gmask, axis=1)

    delta0 = T.sum(abs(y_in - predy), axis=2) * gmask
    delta0 = T.sum(delta0, axis=1)

    hinge_cost_inf = neg_inf_cost - pos_cost
    if params.margin_type == 1:
        hinge_cost0 = 1 + neg_cost - pos_cost
    elif params.margin_type == 2:
        hinge_cost0 = neg_cost - pos_cost
    elif params.margin_type == 0:
        hinge_cost0 = delta0 + neg_cost - pos_cost
    elif params.margin_type == 3:
        hinge_cost0 = delta0 * (1.0 + neg_cost - pos_cost)

    #g_cost = T.mean(T.maximum(-hinge_cost0, 0.0))
    if params.regu_type == 0:
        g_cost = T.mean(-hinge_cost0) + 10 * T.mean(-hinge_cost_inf) + T.mean(ce_hinge) + T.mean(ce_hinge_inf)
    else:
        g_cost = T.mean(-hinge_cost0) + 10 * T.mean(-hinge_cost_inf)
    #g_cost_later = T.mean(-hinge_cost0)

    d_cost = T.mean(T.maximum(hinge_cost0, 0.0)) + params.Lambda * T.mean(T.maximum(hinge_cost_inf, 0.0))

    # earlier variants kept for reference:
    #hinge_cost = hinge_cost0 * T.gt(hinge_cost0, 0)
    #d_cost = T.sum(hinge_cost)
    #d_cost0 = d_cost
    ###l2_term = sum(lasagne.regularization.l2(x - PARA[index]) for index, x in enumerate(a_params))
    #hinge_cost_g = hinge_cost0 * T.lt(hinge_cost0, 0)
    #d_cost0_g = T.mean(hinge_cost_g)
    """select a different regularizer"""
    ###g_cost = -d_cost0 + params.l2 * sum(lasagne.regularization.l2(x) for x in a_params) + params.l3 * T.mean(ce_hinge)
    #g_cost = -d_cost0_g
    #g_cost_final = -T.mean(hinge_cost_g) + params.l2 * sum(lasagne.regularization.l2(x) for x in a_params)
    #g_cost = -T.mean(hinge_cost_g)
    #d_cost = T.mean(hinge_cost0)

    a_params = lasagne.layers.get_all_params([l_local_a, l_local_a_inf], trainable=True)
    updates_g = lasagne.updates.sgd(g_cost, a_params, params.eta)
    updates_g = lasagne.updates.apply_momentum(updates_g, a_params, momentum=0.9)
    #updates_g = lasagne.updates.adam(g_cost, a_params, 0.001)
    #updates_g_later = lasagne.updates.adam(g_cost_later, a_params, 0.0006)

    self.train_g = theano.function([g, gmask, y, y_in, length], [g_cost, d_cost, pos_cost, neg_cost, delta0, check], updates=updates_g, on_unused_input='ignore')
    #self.train_g_later = theano.function([g, gmask, y, y_in, length], [g_cost, d_cost, pos_cost, neg_cost, delta0, check], updates=updates_g_later, on_unused_input='ignore')

    #updates_d = lasagne.updates.sgd(d_cost, d_params, params.eta)
    #updates_d = lasagne.updates.apply_momentum(updates_d, d_params, momentum=0.9)
    updates_d = lasagne.updates.adam(d_cost, d_params, 0.001)
    self.train_d = theano.function([g, gmask, y, y_in, length], [d_cost, g_cost, pos_cost, neg_cost, delta0, check], updates=updates_d, on_unused_input='ignore')

    """build the function for test-time inference"""
    pred = T.argmax(predy_inf, axis=2)
    pg = T.eq(pred, y)
    pg = pg * gmask
    acc_inf = 1.0 * T.sum(pg) / T.sum(gmask)

    pred = T.argmax(predy, axis=2)
    pg = T.eq(pred, y)
    pg = pg * gmask
    acc_cost = 1.0 * T.sum(pg) / T.sum(gmask)

    self.test_time = theano.function([g, gmask, y, length], [acc_inf, acc_cost])
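# The two functions above implement a minimax game: train_d lowers the energy of
# the gold sequence relative to the inference networks' outputs, and train_g
# updates the inference networks against the current energy. A hypothetical
# alternating loop (`get_batches` and the batch unpacking are placeholders, not
# from the original source):
for epoch in range(nepochs):
    for x, m, y_gold, y_onehot, L in get_batches(train_data):
        # energy (D) step on the hinge losses
        d_cost, g_cost, pos, neg, delta, check = model.train_d(x, m, y_gold, y_onehot, L)
        # inference-network (G) step on the negated hinge plus cross-entropy terms
        g_cost, d_cost, pos, neg, delta, check = model.train_g(x, m, y_gold, y_onehot, L)
    acc_inf, acc_cost = model.test_time(dev_x, dev_m, dev_y, dev_L)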
def __init__(self, We_initial, char_embedd_table_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    char_embedd_dim = params.char_embedd_dim
    char_dic_size = len(params.char_dic)
    char_embedd_table = theano.shared(char_embedd_table_initial)

    trans = np.random.uniform(-0.01, 0.01, (18, 18)).astype('float32')
    transition = theano.shared(trans)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    char_input_var = T.itensor3(name='char-inputs')

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We, name='word_embedding')
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input')
    layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2]))
    layer_char_embedding = lasagne.layers.EmbeddingLayer(layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding')
    layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1))

    # first get some necessary dimensions or parameters
    conv_window = 3
    num_filters = params.num_filters
    #_, sent_length, _ = incoming2.output_shape

    # dropout before the cnn
    if params.dropout:
        layer_char = lasagne.layers.DropoutLayer(layer_char, p=0.5)

    # construct convolution layer
    cnn_layer = lasagne.layers.Conv1DLayer(layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn')
    # infer the pool size for pooling (pool size should go through all time steps of cnn)
    _, _, pool_size = cnn_layer.output_shape
    # construct max pool layer
    pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size)
    # reshape the layer to match the lstm incoming layer: [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters]
    output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1]))

    # finally, concatenate the two incoming layers together
    incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2)
    if params.dropout:
        incoming = lasagne.layers.DropoutLayer(incoming, p=0.5)

    l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, grad_clipping=5.)
    l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, grad_clipping=5., backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    if params.dropout:
        concat = lasagne.layers.DropoutLayer(concat, p=0.5)

    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=17, nonlinearity=lasagne.nonlinearities.linear)
    #bi_lstm_crf = CRFLayer(concat, params.num_labels, mask_input=l_mask_word)

    local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var})
    local_energy = local_energy.reshape((-1, length, 17))
    local_energy = local_energy * mask_var[:, :, None]
    end_term = transition[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    # deterministic pass (dropout disabled) for evaluation
    local_energy_eval = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var}, deterministic=True)
    local_energy_eval = local_energy_eval.reshape((-1, length, 17))
    local_energy_eval = local_energy_eval * mask_var[:, :, None]
    local_energy_eval = local_energy_eval + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    #energies_train = lasagne.layers.get_output(bi_lstm_crf, {l_in_word: input_var, l_mask_word: mask_var})
    loss_train = crf_loss0(local_energy, transition, target_var, mask_var).mean()
    prediction, corr = crf_accuracy0(local_energy_eval, transition, target_var, mask_var)
    ##loss_train = crf_loss(energies_train, target_var, mask_var).mean()
    ##prediction, corr = crf_accuracy(energies_train, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(transition)
    print network_params
    self.network_params = network_params

    loss_train = loss_train + params.L2 * sum(lasagne.regularization.l2(x) for x in network_params)
    #updates = lasagne.updates.adam(loss_train, network_params, params.eta)
    updates = lasagne.updates.sgd(loss_train, network_params, params.eta)
    updates = lasagne.updates.apply_momentum(updates, network_params, momentum=0.9)

    self.train_fn = theano.function([input_var, char_input_var, target_var, mask_var, mask_var1, length], loss_train, updates=updates, on_unused_input='ignore')
    self.eval_fn = theano.function([input_var, char_input_var, target_var, mask_var, mask_var1, length], [corr_train, num_tokens, prediction], on_unused_input='ignore')
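# mask_var marks every real token while mask_var1 marks only each sentence's last
# token, which is where the end-transition score transition[:-1, -1] is added. A
# sketch of how the two masks could be built from the true sentence lengths
# (`lengths` is a placeholder, not from the original source):
import numpy as np

def build_masks(lengths):
    batch, L = len(lengths), max(lengths)
    mask = np.zeros((batch, L), dtype='float32')
    mask1 = np.zeros((batch, L), dtype='float32')
    for i, n in enumerate(lengths):
        mask[i, :n] = 1.0      # all real tokens
        mask1[i, n - 1] = 1.0  # final token only: receives the end-transition term
    return mask, mask1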
def __init__(self, We_initial, params):
    self.eta = params.eta
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    g = T.imatrix()
    gmask = T.fmatrix()
    y = T.ivector()
    idxs = T.ivector()

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)
    if params.dropout:
        l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    if params.dropout:
        concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

    l_out = lasagne.layers.DenseLayer(concat2, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)

    output = lasagne.layers.get_output(l_out, {l_in_word: g, l_mask_word: gmask})
    output_1 = output[idxs]
    test_output = lasagne.layers.get_output(l_out, {l_in_word: g, l_mask_word: gmask}, deterministic=True)
    test_output_1 = test_output[idxs]

    model_params = lasagne.layers.get_all_params(l_out, trainable=True)
    self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)
    reg = sum(lasagne.regularization.l2(x) for x in model_params)
    cost = lasagne.objectives.categorical_crossentropy(output_1, y)
    cost = T.mean(cost) + params.L2 * reg

    final_pred = T.argmax(test_output_1, axis=1)
    y1 = T.ones_like(y)
    SUM = T.sum(y1)
    acc = 1.0 * T.sum(T.eq(final_pred, y)) / SUM
    self.acc_function = theano.function([g, gmask, y, idxs], acc, on_unused_input='warn')

    #updates = lasagne.updates.adam(cost, model_params, self.eta)
    updates = lasagne.updates.sgd(cost, model_params, self.eta)
    updates = lasagne.updates.apply_momentum(updates, model_params, momentum=0.9)
    self.train_function = theano.function([g, gmask, y, idxs], [cost, acc], updates=updates, on_unused_input='warn')
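# The reshape layers flatten (batch, L) to batch*L rows, so `idxs` selects the
# flattened positions of real tokens and `y` holds the gold tags at those
# positions. A sketch of preparing the arguments (`mask`, `tags`, `word_ids`,
# and `model` are placeholders, not from the original source):
import numpy as np

flat_mask = mask.reshape(-1)
idxs = np.where(flat_mask > 0)[0].astype('int32')
y_flat = tags.reshape(-1)[idxs].astype('int32')

cost, acc = model.train_function(word_ids, mask, y_flat, idxs)
acc = model.acc_function(word_ids, mask, y_flat, idxs)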
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden
    hidden_inf = params.hidden_inf

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    t_t = T.fscalar()

    Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    print len(network_params)

    # load the pretrained BiLSTM-CRF energy parameters
    f = open('POS_CRF_lstm_pretrain.Batchsize_10_dropout_0_LearningRate_0.1_1e-050_emb_0.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    l_in_word_a = lasagne.layers.InputLayer((None, None))
    l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
    ##if params.small:
    ##    We_small_init = np.random.uniform(-0.1, 0.1, (We_initial.shape[0], hidden_inf)).astype('float32')
    ##    We_small = theano.shared(We_small_init)
    ##    l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a, input_size=We_small_init.shape[0], output_size=hidden_inf, W=We_small)
    ##else:
    l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize, l_emb_word.W)

    if params.inf == 0:
        l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a)
        l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a, backwards=True)
        l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden_inf))
        l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden_inf))
        concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
    else:
        l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1))
        l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 1, 1, pad='same')
        l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 3, 1, pad='same')
        l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1)
        #l_cnn_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad='same')
        concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1))
        #concat2_a = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis=2)
        concat2_a = lasagne.layers.ReshapeLayer(concat2_a, (-1, 2 * hidden_inf))

    if params.dropout:
        concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)

    l_local_a = lasagne.layers.DenseLayer(concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)
    a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
    self.a_params = a_params

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * mask_var[:, :, None]
    #####################
    # for the end symbol of a sequence
    ####################
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: input_var, l_mask_word_a: mask_var})
    predy_in = T.argmax(predy0, axis=1)
    A = T.extra_ops.to_one_hot(predy_in, 25)
    A = A.reshape((-1, length, 25))

    predy = predy0.reshape((-1, length, 25))
    predy = predy * mask_var[:, :, None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    # energy of the discretized (one-hot argmax) prediction
    targets_shuffled0 = A.dimshuffle(1, 0, 2)
    target_time00 = targets_shuffled0[0]
    initial_energy00 = T.dot(target_time00, Wyy[-1, :-1])
    initials0 = [target_time00, initial_energy00]
    [_, target_energies0], _ = theano.scan(fn=inner_function, outputs_info=initials0, sequences=[targets_shuffled0[1:], masks_shuffled[1:]])
    cost110 = target_energies0[-1] + T.sum(T.sum(local_energy * A, axis=2) * mask_var, axis=1)

    predy_f = predy.reshape((-1, 25))
    y_f = target_var.flatten()

    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * (1 - 0.01 * t_t)

    if params.regutype == 0:
        ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f + eps, y_f)
        ce_hinge = ce_hinge.reshape((-1, length))
        ce_hinge = T.sum(ce_hinge * mask_var, axis=1)
        cost = T.mean(-cost11) + lamb * T.mean(ce_hinge)
    else:
        entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
        entropy_term = entropy_term.reshape((-1, length))
        entropy_term = T.sum(entropy_term * mask_var, axis=1)
        cost = T.mean(-cost11) - lamb * T.mean(entropy_term)

    """
    f = open('F0_simple.pickle')
    PARA = pickle.load(f)
    f.close()
    l2_term = sum(lasagne.regularization.l2(x - PARA[index]) for index, x in enumerate(a_params))
    cost = T.mean(-cost11) + params.L2 * l2_term
    """

    #from adam import adam
    #updates_a = adam(cost, a_params, params.eta)
    updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
    updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

    if params.regutype == 0:
        self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates=updates_a, on_unused_input='ignore')
    else:
        self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates=updates_a, on_unused_input='ignore')

    prediction = T.argmax(predy, axis=2)
    corr = T.eq(prediction, target_var)
    corr_train = (corr * mask_var).sum(dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)

    self.eval_fn = theano.function([input_var, target_var, mask_var, mask_var1, length], [cost11, cost110, corr_train, num_tokens, prediction], on_unused_input='ignore')
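# When params.annealing == 1, the regularizer weight decays as
# lamb = L3 * (1 - 0.01 * t_t), so t_t is typically the epoch counter. A
# hypothetical training loop passing it in (`get_batches` and the batch
# unpacking are placeholders, not from the original source):
for epoch in range(nepochs):
    for x, y, m, m1, L in get_batches(train_data):
        cost, reg_term = model.train_fn(x, y, m, m1, L, float(epoch))
    energies, energies_argmax, correct, total, pred = model.eval_fn(x, y, m, m1, L)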
def __init__(self, We_initial, params):
    self.eta = params.eta
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    g = T.imatrix()
    gmask = T.fmatrix()
    y = T.ivector()
    idxs = T.ivector()

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)
    #l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    #l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    if params.dropout:
        l_emb_word = lasagne.layers.DropoutLayer(l_emb_word, p=0.5)

    if params.inf == 0:
        # BiLSTM inference network
        l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
        l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
        l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
        l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
        concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    elif params.inf == 1:
        # CNN inference network with window sizes 1 and 3
        l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
        l_cnn_1 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad='same')
        l_cnn_3 = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same')
        l_cnn = lasagne.layers.ConcatLayer([l_cnn_1, l_cnn_3], axis=1)
        #l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 1, 1, pad='same')
        concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
        #concat2 = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis=2)
        concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, 2 * hidden))
    else:
        # single CNN followed by a dense layer
        l_cnn_input = lasagne.layers.DimshuffleLayer(l_emb_word, (0, 2, 1))
        l_cnn = lasagne.layers.Conv1DLayer(l_cnn_input, hidden, 3, 1, pad='same')
        concat2 = lasagne.layers.DimshuffleLayer(l_cnn, (0, 2, 1))
        concat2 = lasagne.layers.ReshapeLayer(concat2, (-1, hidden))
        concat2 = lasagne.layers.DenseLayer(concat2, num_units=hidden)

    if params.dropout:
        concat2 = lasagne.layers.DropoutLayer(concat2, p=0.5)

    #l_emb = lasagne.layers.DenseLayer(concat2, num_units=hidden, nonlinearity=lasagne.nonlinearities.tanh)
    l_out = lasagne.layers.DenseLayer(concat2, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.softmax)

    output = lasagne.layers.get_output(l_out, {l_in_word: g, l_mask_word: gmask})
    output_1 = output[idxs]
    test_output = lasagne.layers.get_output(l_out, {l_in_word: g, l_mask_word: gmask}, deterministic=True)
    test_output_1 = test_output[idxs]

    model_params = lasagne.layers.get_all_params(l_out, trainable=True)
    self.model_p = lasagne.layers.get_all_params(l_out, trainable=True)
    reg = sum(lasagne.regularization.l2(x) for x in model_params)
    cost = lasagne.objectives.categorical_crossentropy(output_1, y)
    cost = T.mean(cost) + params.L2 * reg

    #pred = T.argmax(output_1, axis=1)
    final_pred = T.argmax(test_output_1, axis=1)
    y1 = T.ones_like(y)
    SUM = T.sum(y1)
    acc = T.sum(T.eq(final_pred, y))
    ###acc = 1.0 * T.sum(T.eq(final_pred, y)) / SUM
    self.acc_function = theano.function([g, gmask, y, idxs], [acc, SUM], on_unused_input='warn')

    ##from adam import adam
    ##updates = adam(cost, model_params, self.eta)
    #updates = lasagne.updates.adam(cost, model_params, self.eta)
    updates = lasagne.updates.sgd(cost, model_params, self.eta)
    updates = lasagne.updates.apply_momentum(updates, model_params, momentum=0.9)
    self.train_function = theano.function([g, gmask, y, idxs], [cost, acc], updates=updates, on_unused_input='warn')
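# Unlike the previous model, acc_function here returns an unnormalized correct
# count together with the token count, so accuracy must be aggregated across
# batches. A sketch (`get_batches` and the batch unpacking are placeholders,
# not from the original source):
correct, total = 0.0, 0.0
for x, m, y_flat, idxs in get_batches(dev_data):
    c, n = model.acc_function(x, m, y_flat, idxs)
    correct += c
    total += n
print "dev accuracy:", correct / total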
def __init__(self, We_initial, params):
    #self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    # relaxed label variables that gradient-based inference updates directly
    input_init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, params.num_labels)).astype('float32')
    self.input_init = theano.shared(input_init)

    input_var = T.imatrix(name='inputs')
    target_var = T.imatrix(name='targets')
    mask_var = T.fmatrix(name='masks')
    mask_var1 = T.fmatrix(name='masks1')
    length = T.iscalar()
    t_t = T.fscalar()

    # transition matrix; square (num_labels + 1) so the last row/column hold the
    # start/end scores, matching the Wyy[:-1, :-1], Wyy[-1, :-1], Wyy[:-1, -1] slices below
    Wyy0 = np.random.uniform(-0.02, 0.02, (params.num_labels + 1, params.num_labels + 1)).astype('float32')
    Wyy = theano.shared(Wyy0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
    concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2)
    l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden))
    l_local = lasagne.layers.DenseLayer(l_reshape_concat, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.linear)

    network_params = lasagne.layers.get_all_params(l_local, trainable=True)
    network_params.append(Wyy)
    #print len(network_params)

    # load the pretrained CRF energy parameters
    f = open('ccctag_CRF_Bilstm_Viterbi_.Batchsize_10_dropout_0_LearningRate_0.01_0.0512_tagversoin_2.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(network_params):
        p.set_value(data[idx])

    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    local_energy = lasagne.layers.get_output(l_local, {l_in_word: input_var, l_mask_word: mask_var})
    local_energy = local_energy.reshape((-1, length, params.num_labels))
    local_energy = local_energy * mask_var[:, :, None]
    #####################
    # for the end symbol of a sequence
    ####################
    end_term = Wyy[:-1, -1]
    local_energy = local_energy + end_term.dimshuffle('x', 'x', 0) * mask_var1[:, :, None]

    predy_init = self.input_init[:, :length, :]
    a_params = [self.input_init]

    predy = T.nnet.softmax(predy_init.reshape((-1, params.num_labels)))
    predy = predy.reshape((-1, length, params.num_labels))
    prediction = T.argmax(predy_init, axis=2)
    predy = predy * mask_var[:, :, None]

    targets_shuffled = predy.dimshuffle(1, 0, 2)
    target_time0 = targets_shuffled[0]
    masks_shuffled = mask_var.dimshuffle(1, 0)
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    cost11 = target_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * mask_var, axis=1)

    predy_f = predy.reshape((-1, params.num_labels))
    y_f = target_var.flatten()

    if params.annealing == 0:
        lamb = params.L3
    elif params.annealing == 1:
        lamb = params.L3 * (1 - 0.01 * t_t)

    cost = T.mean(-cost11)

    #from adam import adam
    #updates_a = adam(cost, a_params, params.eta)
    updates_a = lasagne.updates.sgd(cost, a_params, params.eta)
    updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9)

    self.inf_fn = theano.function([input_var, mask_var, mask_var1, length], cost, updates=updates_a)
    self.eval_fn = theano.function([input_var, mask_var, mask_var1, length], [prediction, -cost11], on_unused_input='ignore')
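# Inference in this model is gradient descent on the relaxed label variables
# self.input_init rather than a forward pass, so each batch re-initializes them
# and runs several inf_fn steps before decoding. A sketch (the step count,
# `num_labels`, `get_batches`, and `model` are assumptions, not from the
# original source):
import numpy as np

init = np.random.uniform(-0.1, 0.1, (10, MAX_lENGTH, num_labels)).astype('float32')
for x, m, m1, L in get_batches(test_data):
    model.input_init.set_value(init)  # restart the relaxation for this batch
    for step in range(20):            # a few SGD-with-momentum steps
        c = model.inf_fn(x, m, m1, L)
    pred, energies = model.eval_fn(x, m, m1, L)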
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    # start / end distributions for the label language model
    start0 = np.random.uniform(-0.02, 0.02, (1, 26)).astype('float32')
    end0 = np.zeros((1, 26)).astype('float32')
    end0[0, -1] = 1.0
    start = theano.shared(start0)
    end = theano.shared(end0)

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True)
    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    l_local = lasagne.layers.DenseLayer(concat2, num_units=25, nonlinearity=lasagne.nonlinearities.linear)

    f_params = lasagne.layers.get_all_params(l_local, trainable=True)

    Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
    Wyy = theano.shared(Wyy0)
    d_params = lasagne.layers.get_all_params(l_local, trainable=True)
    d_params.append(Wyy)
    self.d_params = d_params

    l_in_word_a = lasagne.layers.InputLayer((None, None))
    l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize, l_emb_word.W)
    if params.dropout:
        l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

    l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden, mask_input=l_mask_word_a)
    l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden, mask_input=l_mask_word_a, backwards=True)
    l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden))
    l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden))
    concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
    if params.dropout:
        concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)
    l_local_a = lasagne.layers.DenseLayer(concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)

    a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
    self.a_params = a_params

    y_in = T.ftensor3()
    y = T.imatrix()
    g = T.imatrix()
    gmask = T.fmatrix()
    y_mask = T.fmatrix()
    length = T.iscalar()
    # shape: n, L, 1
    #y1 = T.ftensor3()  # shape: n, 1, 46

    predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a: g, l_mask_word_a: gmask})
    predy = predy0.reshape((-1, length, 25))
    predy = predy * gmask[:, :, None]
    #newpredy = T.concatenate([predy, y0], axis=2)  # n, L, 46, 46
    # predy0: n, L, 25

    # energy of a (soft) label sequence under the CRF
    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy_t = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy)
        return [targets_one_step, tg_energy_t]

    # Input is provided as (n_batch, n_time_steps, num_labels), but scan requires
    # the iterable dimension to be first, so we dimshuffle to
    # (n_time_steps, n_batch, num_labels)
    local_energy = lasagne.layers.get_output(l_local, {l_in_word: g, l_mask_word: gmask})
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * gmask[:, :, None]

    targets_shuffled = y_in.dimshuffle(1, 0, 2)
    masks_shuffled = gmask.dimshuffle(1, 0)
    target_time0 = targets_shuffled[0]
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])
    length_index = T.sum(gmask, axis=1) - 1
    length_index = T.cast(length_index, 'int32')

    # pretrained label language model
    l_LM_in = lasagne.layers.InputLayer((None, None, 26))
    l_LM_mask = lasagne.layers.InputLayer(shape=(None, None))
    l_LM_lstm = lasagne.layers.LSTMLayer(l_LM_in, 2 * hidden, mask_input=l_LM_mask)
    l_reshape_LM = lasagne.layers.ReshapeLayer(l_LM_lstm, (-1, 2 * hidden))
    l_LM = lasagne.layers.DenseLayer(l_reshape_LM, num_units=26, nonlinearity=lasagne.nonlinearities.softmax)
    LM_params = lasagne.layers.get_all_params(l_LM, trainable=True)
    LM_params.append(start)
    f = open('Label_LM.pickle', 'r')
    data = pickle.load(f)
    f.close()
    for idx, p in enumerate(LM_params):
        p.set_value(data[idx])

    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    pos_end_target = y_in[T.arange(length_index.shape[0]), length_index]

    """add the LM cost of the ground-truth labels"""
    pos_predy_tmp0 = y_in[:, :, 0].reshape((-1, length, 1))
    pos_tmp0 = T.zeros_like(pos_predy_tmp0)
    pos_predy_lm = T.concatenate([y_in, pos_tmp0], axis=2)
    pos_predy_tmp = pos_predy_lm[:, 0, :].reshape((-1, 1, 26))
    pos_tmp = T.ones_like(pos_predy_tmp)
    sos = pos_tmp * (start.dimshuffle('x', 0, 1))
    eos = pos_tmp * (end.dimshuffle('x', 0, 1))
    pos_y_lm_in = T.concatenate([sos, pos_predy_lm], axis=1)
    pos_y_lm_out = T.concatenate([pos_predy_lm, eos], axis=1)
    pos_lm_mask_var = T.concatenate([pos_tmp[:, 0, 0].reshape((-1, 1)), gmask], axis=1)
    pos_LM_out = lasagne.layers.get_output(l_LM, {l_LM_in: pos_y_lm_in, l_LM_mask: pos_lm_mask_var})
    pos_LM_out = pos_LM_out.reshape((-1, length + 1, 26))
    pos_LM_cost = T.sum(T.log(T.sum(pos_LM_out[:, :-1, :] * pos_y_lm_out[:, :-1, :], axis=2) + eps) * gmask, axis=1)

    pos_cost = target_energies[-1] + T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1) + T.dot(pos_end_target, Wyy[:-1, -1]) + params.lm * pos_LM_cost
    check = T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1)

    negtargets_shuffled = predy.dimshuffle(1, 0, 2)
    negtarget_time0 = negtargets_shuffled[0]
    neginitial_energy0 = T.dot(negtarget_time0, Wyy[-1, :-1])

    """LM cost of the predicted labels"""
    neg_predy_tmp0 = predy[:, :, 0].reshape((-1, length, 1))
    neg_tmp0 = T.zeros_like(neg_predy_tmp0)
    neg_predy_lm = T.concatenate([predy, neg_tmp0], axis=2)
    neg_predy_tmp = neg_predy_lm[:, 0, :].reshape((-1, 1, 26))
    neg_tmp = T.ones_like(neg_predy_tmp)
    sos = neg_tmp * (start.dimshuffle('x', 0, 1))
    eos = neg_tmp * (end.dimshuffle('x', 0, 1))
    neg_y_lm_in = T.concatenate([sos, neg_predy_lm], axis=1)
    neg_y_lm_out = T.concatenate([neg_predy_lm, eos], axis=1)
    neg_lm_mask_var = T.concatenate([neg_tmp[:, 0, 0].reshape((-1, 1)), gmask], axis=1)
    neg_LM_out = lasagne.layers.get_output(l_LM, {l_LM_in: neg_y_lm_in, l_LM_mask: neg_lm_mask_var})
    neg_LM_out = neg_LM_out.reshape((-1, length + 1, 26))
    neg_LM_cost = T.sum(T.log(T.sum(neg_LM_out[:, :-1, :] * neg_y_lm_out[:, :-1, :], axis=2) + eps) * gmask, axis=1)

    neginitials = [negtarget_time0, neginitial_energy0]
    [_, negtarget_energies], _ = theano.scan(fn=inner_function, outputs_info=neginitials, sequences=[negtargets_shuffled[1:], masks_shuffled[1:]])
    neg_end_target = predy[T.arange(length_index.shape[0]), length_index]
    neg_cost = negtarget_energies[-1] + T.sum(T.sum(local_energy * predy, axis=2) * gmask, axis=1) + T.dot(neg_end_target, Wyy[:-1, -1]) + params.lm * neg_LM_cost

    y_f = y.flatten()
    predy_f = predy.reshape((-1, 25))
    ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f + eps, y_f)
    ce_hinge = ce_hinge.reshape((-1, length))
    ce_hinge = T.sum(ce_hinge * gmask, axis=1)

    entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
    entropy_term = entropy_term.reshape((-1, length))
    entropy_term = T.sum(entropy_term * gmask, axis=1)

    delta0 = T.sum(abs(y_in - predy), axis=2) * gmask
    delta0 = T.sum(delta0, axis=1)

    hinge_cost = delta0 + neg_cost - pos_cost
    hinge_cost = hinge_cost * T.gt(hinge_cost, 0)
    d_cost = T.mean(hinge_cost)
    d_cost0 = d_cost

    """select a regularizer"""
    g_cost = -d_cost0 + params.l2 * sum(lasagne.regularization.l2(x) for x in a_params) + params.l3 * T.mean(ce_hinge)
    ###g_cost = -d_cost0 + params.L2 * sum(lasagne.regularization.l2(x) for x in a_params) - params.L31 * T.mean(entropy_term)
    d_cost = d_cost0 + params.l2 * sum(lasagne.regularization.l2(x) for x in d_params)

    self.a_params = a_params

    updates_g = lasagne.updates.sgd(g_cost, a_params, params.eta)
    updates_g = lasagne.updates.apply_momentum(updates_g, a_params, momentum=0.9)
    self.train_g = theano.function([g, gmask, y, y_in, length], [g_cost, d_cost0, pos_cost, neg_cost, delta0, check], updates=updates_g, on_unused_input='ignore')

    updates_d = lasagne.updates.adam(d_cost, d_params, 0.001)
    self.train_d = theano.function([g, gmask, y, y_in, length], [d_cost, d_cost0, pos_cost, neg_cost, delta0, check], updates=updates_d, on_unused_input='ignore')

    # test the model and retune the inference network
    predy_test = lasagne.layers.get_output(l_local_a, {l_in_word_a: g, l_mask_word_a: gmask}, deterministic=True)
    predy_test = predy_test.reshape((-1, length, 25))
    pred = T.argmax(predy_test, axis=2)
    pg = T.eq(pred, y)
    pg = pg * gmask
    acc = 1.0 * T.sum(pg) / T.sum(gmask)

    negtargets_shuffled_test = predy_test.dimshuffle(1, 0, 2)
    negtarget_time0_test = negtargets_shuffled_test[0]
    neginitial_energy0_test = T.dot(negtarget_time0_test, Wyy[-1, :-1])
    neginitials_test = [negtarget_time0_test, neginitial_energy0_test]
    [_, negtarget_energies_test], _ = theano.scan(fn=inner_function, outputs_info=neginitials_test, sequences=[negtargets_shuffled_test[1:], masks_shuffled[1:]])
    end_test_target = predy_test[T.arange(length_index.shape[0]), length_index]
    neg_cost_test = negtarget_energies_test[-1] + T.sum(T.sum(local_energy * predy_test, axis=2) * gmask, axis=1) + T.dot(end_test_target, Wyy[:-1, -1])

    test_cost = -T.mean(neg_cost_test) + params.l3 * T.mean(ce_hinge) - params.lm * T.mean(neg_LM_cost)
    test_updates = lasagne.updates.sgd(test_cost, a_params, params.eta)
    test_updates = lasagne.updates.apply_momentum(test_updates, a_params, momentum=0.9)
    self.test_time_turning = theano.function([g, gmask, y, length], test_cost, updates=test_updates, on_unused_input='ignore')

    self.test_time1 = theano.function([g, gmask, y, y_in, length], [acc, T.mean(neg_cost), T.mean(pos_cost), params.l3 * T.mean(ce_hinge)], on_unused_input='ignore')
    self.test_time = theano.function([g, gmask, y, length], acc)
    self.test_time2 = theano.function([g, gmask, length], pred)
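# After minimax training, test_time_turning lets the inference network keep
# minimizing the energy (plus the label-LM and cross-entropy terms) before
# accuracy is measured. A hypothetical retuning loop (`tuning_steps`,
# `get_batches`, and `model` are assumptions, not from the original source):
for x, m, y_gold, L in get_batches(test_data):
    for step in range(tuning_steps):
        tc = model.test_time_turning(x, m, y_gold, L)
    acc = model.test_time(x, m, y_gold, L)
    pred = model.test_time2(x, m, L)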
def __init__(self, params, data):
    self.get_pos_map(data)
    self.cap = params.cap
    self.lowercase = params.lowercase
    self.featuretype = params.featuretype
    chardim = params.chardim  # dimension of character network layer
    worddim = params.worddim  # dimension of character embedding and word LSTM layer

    if not params.nntype == "charagram":
        self.chars = self.get_character_dict(data)
        Ce = lasagne.init.Uniform(range=0.5 / len(self.chars))
        Ce_np = Ce.sample((len(self.chars), params.worddim))
        Ce = theano.shared(np.asarray(Ce_np, dtype=config.floatX))

    char = T.imatrix()
    charmask = T.matrix()
    word = T.imatrix()
    wordmask = T.matrix()
    idxs = T.ivector()
    Y = T.matrix()

    l_in_char = lasagne.layers.InputLayer((None, None))

    if params.nntype == "charlstm":
        l_mask_char = lasagne.layers.InputLayer(shape=(None, None))
        l_emb_char = lasagne.layers.EmbeddingLayer(l_in_char, input_size=Ce.get_value().shape[0], output_size=Ce.get_value().shape[1], W=Ce)
        l_lstm_char = lasagne.layers.LSTMLayer(l_emb_char, chardim, peepholes=True, learn_init=False, mask_input=l_mask_char)
        if not params.outgate:
            l_lstm_char = lasagne_lstm_nooutput(l_emb_char, chardim, peepholes=True, learn_init=False, mask_input=l_mask_char)
        l_We = lasagne.layers.SliceLayer(l_lstm_char, -1, 1)
        We = lasagne.layers.get_output(l_We, {l_in_char: char, l_mask_char: charmask})
    elif params.nntype == "charagram":
        char = T.matrix()
        self.featuremap = self.get_feature_map(data, params.featuretype, params.cutoff, params.lowercase)
        print "Number of features: ", len(self.featuremap)
        l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap) + 1))
        if self.cap:
            l_in_char = lasagne.layers.InputLayer((None, len(self.featuremap) + 2))
        l_1 = lasagne.layers.DenseLayer(l_in_char, chardim, nonlinearity=params.act)
        if params.numlayers == 1:
            l_We = lasagne.layers.DenseLayer(l_in_char, chardim, nonlinearity=params.act)
        elif params.numlayers == 2:
            l_We = lasagne.layers.DenseLayer(l_1, chardim, nonlinearity=params.act)
        else:
            raise ValueError('Only 1-2 layers are supported currently.')
        We = lasagne.layers.get_output(l_We, {l_in_char: char})
    elif params.nntype == "charcnn":
        l_emb_char = lasagne.layers.EmbeddingLayer(l_in_char, input_size=Ce.get_value().shape[0], output_size=Ce.get_value().shape[1], W=Ce)
        emb = lasagne.layers.DimshuffleLayer(l_emb_char, (0, 2, 1))
        conv_params = None
        if params.conv_type == 1:
            conv_params = [(175, 2), (175, 3), (175, 4)]
        else:
            conv_params = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
        layers = []
        for num_filters, filter_size in conv_params:
            conv = lasagne.layers.Conv1DLayer(emb, num_filters, filter_size, nonlinearity=params.act)
            pl = lasagne.layers.GlobalPoolLayer(conv, theano.tensor.max)
            pl = lasagne.layers.FlattenLayer(pl)
            layers.append(pl)
        concat = lasagne.layers.ConcatLayer(layers)
        l_We = lasagne.layers.DenseLayer(concat, num_units=chardim, nonlinearity=params.act)
        We = lasagne.layers.get_output(l_We, {l_in_char: char})
    else:
        l_We = None
        We = None

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word = lasagne_embedding_layer_2(l_in_word, chardim, We)
    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True, learn_init=False, mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, worddim, peepholes=True, learn_init=False, mask_input=l_mask_word, backwards=True)
    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, worddim))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, worddim))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    l_emb = lasagne.layers.DenseLayer(concat2, num_units=worddim, nonlinearity=lasagne.nonlinearities.tanh)
    l_out = lasagne.layers.DenseLayer(l_emb, num_units=len(self.tags), nonlinearity=lasagne.nonlinearities.softmax)

    embg = lasagne.layers.get_output(l_out, {l_in_word: word, l_mask_word: wordmask})
    embg = embg[idxs]
    prediction = T.argmax(embg, axis=1)

    self.all_params = lasagne.layers.get_all_params(l_out, trainable=True) + lasagne.layers.get_all_params(l_We, trainable=True)
    reg = 0.5 * params.LC * sum(lasagne.regularization.l2(x) for x in self.all_params)
    cost = T.nnet.categorical_crossentropy(embg, Y)
    cost = T.mean(cost) + reg

    self.feedforward_function = None
    self.scoring_function = None
    self.cost_function = None
    self.train_function = None

    if params.nntype == "charlstm":
        self.feedforward_function = theano.function([char, charmask, word, wordmask, idxs], embg)
        self.scoring_function = theano.function([char, charmask, word, wordmask, idxs], prediction)
        self.cost_function = theano.function([char, charmask, word, wordmask, idxs, Y], cost)
        grads = theano.gradient.grad(cost, self.all_params)
        updates = lasagne.updates.momentum(grads, self.all_params, 0.2, momentum=0.95)  # same as Ling et al.
        self.train_function = theano.function([char, charmask, word, wordmask, idxs, Y], cost, updates=updates)
    elif params.nntype == "charcnn" or params.nntype == "charagram":
        self.feedforward_function = theano.function([char, word, wordmask, idxs], embg)
        self.scoring_function = theano.function([char, word, wordmask, idxs], prediction)
        self.cost_function = theano.function([char, word, wordmask, idxs, Y], cost)
        grads = theano.gradient.grad(cost, self.all_params)
        updates = lasagne.updates.momentum(grads, self.all_params, 0.2, momentum=0.95)  # same as Ling et al.
        self.train_function = theano.function([char, word, wordmask, idxs, Y], cost, updates=updates)
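# Usage sketch for the charlstm variant: character inputs produce word
# representations, which feed the word-level BiLSTM tagger. All names below are
# illustrative placeholders, not from the original source:
for epoch in range(nepochs):
    for char_x, char_m, word_x, word_m, idxs, Y in get_batches(train_data):
        c = model.train_function(char_x, char_m, word_x, word_m, idxs, Y)
    preds = model.scoring_function(dev_char_x, dev_char_m, dev_word_x, dev_word_m, dev_idxs)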