def build_model1(self):
    # LookupTable to Embedding
    src_embedding_layer = EmbeddingLayer(input_dim=self.n_src_vocab, output_dim=self.src_embed_dim, name='src_embedding')
    tgt_embedding_layer = EmbeddingLayer(input_dim=self.n_tgt_vocab, output_dim=self.tgt_embed_dim, name='tgt_embedding')

    # LSTMs
    src_lstm_forward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
    src_lstm_backward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
    tgt_lstm = LSTM(input_dim=self.tgt_embed_dim, output_dim=self.tgt_lstm_op_dim)
    sys.stderr.write(str(tgt_lstm.params) + "\n")  # TODO

    # From target LSTM to target word indexes
    # Input: target LSTM output dim + attention from BiLSTM
    proj_layer = FullyConnectedLayer(input_dim=self.tgt_lstm_op_dim + 2 * self.src_lstm_op_dim, output_dim=self.n_tgt_vocab, activation='softmax')

    params = src_embedding_layer.params + tgt_embedding_layer.params + src_lstm_forward.params + src_lstm_backward.params + tgt_lstm.params[:-1] + proj_layer.params

    # Declare input variables
    src_ip = T.ivector()
    tgt_ip = T.ivector()
    tgt_op = T.ivector()

    # Lookup table -> embedding
    src_embed_ip = src_embedding_layer.fprop(src_ip)
    tgt_embed_ip = tgt_embedding_layer.fprop(tgt_ip)

    # Embedding -> source BiLSTM
    src_lstm_forward.fprop(src_embed_ip)
    src_lstm_backward.fprop(src_embed_ip[::-1, :])

    # Concatenate forward/backward (flip backward again so each row lines up with the same source word)
    encoderh = T.concatenate((src_lstm_forward.h, src_lstm_backward.h[::-1, :]), axis=1)

    # End of source BiLSTM -> target LSTM
    tgt_lstm.h_0 = encoderh[-1]
    tgt_lstm.fprop(tgt_embed_ip)

    # Attention
    # Read http://arxiv.org/abs/1508.04025
    attention = tgt_lstm.h.dot(encoderh.transpose())
    attention = attention.dot(encoderh)

    # Order preference?
    decoderh = T.concatenate((attention, tgt_lstm.h), axis=1)

    # LSTM output -> target word
    proj_op = proj_layer.fprop(decoderh)

    # Cost + regularization
    # beta: weight of the recurrent-state regularizer (see http://arxiv.org/abs/1511.08400)
    cost = T.nnet.categorical_crossentropy(proj_op, tgt_op).mean()
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    return {'cost': cost, 'src_ip': src_ip, 'tgt_ip': tgt_ip, 'tgt_op': tgt_op, 'params': params, 'proj_op': proj_op}
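
# A minimal numpy sketch of the dot-product (Luong-style) attention that the graph
# above approximates; shapes and names here are illustrative only. Note that the
# cited paper (http://arxiv.org/abs/1508.04025) normalizes the alignment scores with
# a softmax, which build_model1 omits.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

T_src, T_tgt, d = 7, 5, 4                        # toy lengths and hidden size
encoder_h = np.random.randn(T_src, 2 * d)        # BiLSTM states, one row per source word
decoder_h = np.random.randn(T_tgt, 2 * d)        # target LSTM states, one row per target word

scores = decoder_h.dot(encoder_h.T)              # (T_tgt, T_src) alignment scores
weights = softmax(scores, axis=1)                # normalize over source positions
context = weights.dot(encoder_h)                 # (T_tgt, 2d) context vectors
decoder_with_ctx = np.concatenate([context, decoder_h], axis=1)  # fed to the softmax projection
print(decoder_with_ctx.shape)                    # (5, 16)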
logging.info('Number of training sentence-pairs : %d ' % (len(train_src)))
logging.info('Number of validation sentence-pairs : %d ' % (len(dev_src)))

# Create symbolic variables
src_inp = T.imatrix()
src_lens = T.ivector()
tgt_mask = T.fmatrix()
index = T.scalar()

# Create synthetic data to test the computation graph
src_inp_t = np.random.randint(low=1, high=100, size=(5, 10)).astype(np.int32)
src_lens_t = np.random.randint(low=1, high=9, size=(5,)).astype(np.int32)
tgt_mask_t = np.float32(np.random.rand(5, 9).astype(np.float32) > 0.5)

src_embedding_layer = EmbeddingLayer(input_dim=src_embedding.shape[0], output_dim=src_embedding.shape[1], pretrained=src_embedding, name='src_embedding')
tgt_embedding_layer = EmbeddingLayer(input_dim=src_embedding.shape[0], output_dim=src_embedding.shape[1], name='tgt_embedding')

tgt_lstm_0 = FastLSTM(input_dim=1024, output_dim=1024, name='tgt_lstm_0')
tgt_lstm_1 = FastLSTM(input_dim=1024, output_dim=1024, name='tgt_lstm_1')
tgt_lstm_2 = FastLSTM(input_dim=1024, output_dim=1024, name='tgt_lstm_2')

tgt_lstm_h_to_vocab = FullyConnectedLayer(
    input_dim=1024,
    output_dim=tgt_embedding_layer.input_dim,
    activation='softmax',            # assumed completion: the original snippet is truncated here;
    name='tgt_lstm_h_to_vocab')      # a softmax projection mirrors the other models in this repo
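
# Hedged sketch (not from the snippet above, which is truncated before the mask is
# used): how a binary tgt_mask of shape (batch, time) is typically applied so padded
# positions do not contribute to the averaged per-token cross-entropy. Names are
# illustrative stand-ins.
import numpy as np

batch, time = 5, 9
per_token_nll = np.random.rand(batch, time).astype(np.float32)        # stand-in for -log p(correct word)
tgt_mask_t = np.float32(np.random.rand(batch, time) > 0.5)            # 1 = real token, 0 = padding
masked_cost = (per_token_nll * tgt_mask_t).sum() / tgt_mask_t.sum()   # mean over non-padded tokens only
print(masked_cost)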
tgt_inp_t = np.random.randint(low=1, high=100, size=(5, 15)).astype(np.int32)
tgt_op_t = np.random.randint(low=1, high=200, size=(5, 15)).astype(np.int32)
src_lens_t = np.random.randint(low=1, high=9, size=(5,)).astype(np.int32)
tgt_mask_t = np.float32(np.random.rand(5, 15).astype(np.float32) > 0.5)

# Embedding lookup tables
if args.pretrained_src != 'none':
    logging.info('Reading src pretrained embeddings ...')
    src_embedding_layer, src_word2ind, src_ind2word \
        = get_pretrained_embedding_layer(args.pretrained_src, src_vocab, 'src')
    n_src = len(src_word2ind)  # number of words in the source language
else:
    n_src = len(src_word2ind)  # number of words in the source language
    src_embedding_layer = EmbeddingLayer(input_dim=n_src, output_dim=src_emb_dim, name='src_embedding')

if args.pretrained_tgt != 'none':
    logging.info('Reading tgt pretrained embeddings ...')
    tgt_embedding_layer, tgt_word2ind, tgt_ind2word \
        = get_pretrained_embedding_layer(args.pretrained_tgt, tgt_vocab, 'tgt')
    n_tgt = len(tgt_word2ind)  # number of words in the target language
else:
    n_tgt = len(tgt_word2ind)  # number of words in the target language
    tgt_embedding_layer = EmbeddingLayer(input_dim=n_tgt, output_dim=tgt_emb_dim, name='tgt_embedding')

logging.info('Source vocabulary size : %d ' % (src_embedding_layer.input_dim))
logging.info('Target vocabulary size : %d ' % (tgt_embedding_layer.input_dim))
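
# Illustrative only: a minimal way to build word <-> index tables for a vocabulary
# and map a tokenized sentence to int32 ids for an EmbeddingLayer. This is a generic
# sketch, not the repo's create_word_table / get_pretrained_embedding_layer.
import numpy as np

def build_tables(vocab):
    word2ind = {w: i for i, w in enumerate(vocab)}
    ind2word = {i: w for w, i in word2ind.items()}
    return word2ind, ind2word

word2ind, ind2word = build_tables(['<unk>', '<s>', '</s>', 'the', 'cat'])
sentence = ['<s>', 'the', 'cat', 'sat', '</s>']
ids = np.array([word2ind.get(w, word2ind['<unk>']) for w in sentence]).astype(np.int32)
print(ids)  # unknown words ('sat') fall back to the <unk> index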
def main():
    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256                        # source word embedding dimension
    tgt_emb_dim = 256                        # target word embedding dimension
    src_lstm_hid_dim = 512                   # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5                            # dropout rate
    n_src = len(source_word2idx)             # number of words in the source language
    n_tgt = len(target_word2idx)             # number of words in the target language

    # Parameters
    params = []

    # Source + target word embedding layers
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')  # lookup table for source words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')  # lookup table for target words
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name='proj_layer1', activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500  # weight of the norm-stabilization regularizer

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])
    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)

    tgt_lstm.h_0 = src_context[-1]
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0), tgt_sentence_emb.shape[0], axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)
    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)

    prediction = proj_layer1.link(transition_last)
    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(
        inputs=[src_sentence, tgt_sentence, tgt_gold],
        outputs=cost,
        updates=updates,
    )
    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None
    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10

    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][1:]).astype(np.int32))
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(source_word2idx, target_word2idx, target_idx2word, f_eval, mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(source_word2idx, target_word2idx, target_idx2word, f_eval, mode="test")
                    print 'Found new best validation score %f ' % (best_valid_score)
                log.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, bleu))

        # Store the best test predictions after each epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()

    log.close()
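
# Numpy sketch of the recurrent-state regularizer added to the cost above (a variant
# of the norm stabilizer from http://arxiv.org/abs/1511.08400): it penalizes changes
# in the squared hidden activations between consecutive decoder steps. Stand-in
# values only.
import numpy as np

beta = 500
T_tgt, hid = 6, 8
h = np.random.randn(T_tgt, hid)                            # stand-in for tgt_lstm.h, shape (time, hidden)
penalty = beta * np.mean((h[:-1] ** 2 - h[1:] ** 2) ** 2)  # compare step t with step t+1
print(penalty)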
def __init__(self):
    logging.info('Start Building Model...')
    logging.info("config.floatX" + " " + config.floatX)

    TparamD = OrderedDict()
    trg_vocab_size = 20
    src_vocab_size = 20
    hidden_size = 20
    embedding_size = 5

    self.x = T.matrix("x", dtype="int64")
    self.x0 = self.x
    x = self.x
    inp = self.x.dimshuffle((1, 0))

    self.layer1 = EmbeddingLayer(inp, vocab_size=src_vocab_size, embedding_size=embedding_size, prefix="Embed01")
    self.forward1 = theano.function([x], self.layer1.output)

    self.layer2 = SimpleRNNLayer(inpSeq=self.layer1.output, mask=None, in_dim=embedding_size, hidden_dim=hidden_size, bias=1, prefix="RNN01")
    self.forward2 = theano.function([x], self.layer2.output[0])
    self.current_output = self.layer2.output[0]

    self.layer3 = TimeDistributedDenseLayer(inpSeq=self.current_output, mask=None, in_dim=hidden_size, out_dim=trg_vocab_size, activation="softmax", prefix="TimeDense01")
    self.forward = theano.function([x], self.layer3.output)

    TparamD = OrderedDict()
    TparamD.update(self.layer1.Tparam)
    TparamD.update(self.layer2.Tparam)
    TparamD.update(self.layer3.Tparam)
    print TparamD
    self.TparamD = TparamD

    self.load("model.json")

    # Training step
    """
    Batch of probs for each time step, e.g. (3 steps, 2 batches, 4 classes):
    [ [(....),(....)],
      [(....),(....)],
      [(....),(....)] ]
    """
    self.y0 = T.matrix("y", dtype="int64")
    self.y = self.y0.dimshuffle((1, 0))
    probs = self.layer3.output        # layer3.output is already softmax-normalized
    y_flat = self.y.flatten()         # flatten ids
    y_flat_idx = T.arange(y_flat.shape[0]) * trg_vocab_size + y_flat  # shift the ids to index into the flattened probs
    cost = -T.log(probs.flatten()[y_flat_idx])  # take the log-prob of the picked ids only
    cost = cost.reshape([self.y.shape[0], self.y.shape[1]])
    cost = cost.sum(0)
    cost = cost.mean()

    # Add regularization HERE !!

    logging.info("Building Gradient...")
    # Inputs must be the root variables (self.y is a dimshuffle of self.y0)
    self.train = theano.function([self.x0, self.y0], [probs, cost])

    UpdateParams = TparamD
    grads = T.grad(cost, wrt=list(UpdateParams.values()))
    f_grad = theano.function([self.x0, self.y0], grads, name='f_grad')

    lr = T.scalar(name='lr')
    optimizer = adadelta
    self.f_grad_shared, self.f_update = optimizer(lr, UpdateParams, grads, self.x0, self.y0, cost)

    logging.info('Building Model Completed.')
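
# Numpy sketch of the flattened-index trick used for the cost above: picking
# p(correct word) out of a (time, batch, vocab) softmax without fancy indexing.
# Toy shapes only.
import numpy as np

time, batch, vocab = 3, 2, 4
probs = np.random.rand(time, batch, vocab)
probs /= probs.sum(axis=-1, keepdims=True)                  # pretend these are softmax outputs
y = np.random.randint(0, vocab, size=(time, batch))         # target ids, (time, batch)

y_flat = y.flatten()
y_flat_idx = np.arange(y_flat.shape[0]) * vocab + y_flat    # offset each position into the flat probs
picked = probs.flatten()[y_flat_idx]                        # same as probs[t, b, y[t, b]]
cost = (-np.log(picked)).reshape(time, batch).sum(0).mean() # sum over time, mean over batch
print(cost)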
def __init__(self, load=None):
    logging.info('Start Building Model...')
    logging.info("config.floatX" + " " + config.floatX)

    TparamD = OrderedDict()
    trg_vocab_size = 8
    src_vocab_size = 8
    hidden_size = 60
    embedding_size = 5
    self.hidden_size = hidden_size

    self.x = T.matrix("x", dtype="int64")
    self.x0 = self.x
    x = self.x
    inp = self.x.dimshuffle((1, 0))

    self.layer1 = EmbeddingLayer(inp, vocab_size=src_vocab_size, embedding_size=embedding_size, prefix="Embed01")
    self.forward1 = theano.function([x], self.layer1.output)

    self.layer2 = GRULayer(inpSeq=self.layer1.output, mask=None, in_dim=embedding_size, hidden_dim=hidden_size, bias=1, prefix="RNN01")
    self.forward2 = theano.function([x], self.layer2.output[0])
    self.current_output = self.layer2.output[0]

    # Dropout at the hidden states
    self.current_output = _dropout_from_layer(self.current_output, 0.2)

    self.layer3 = TimeDistributedDenseLayer(inpSeq=self.current_output, mask=None, in_dim=hidden_size, out_dim=trg_vocab_size, activation="softmax", prefix="TimeDense01")
    self.forward = theano.function([x], self.layer3.output)

    self.one_step_state = np.zeros((hidden_size,), dtype="float32")

    TparamD = OrderedDict()
    TparamD.update(self.layer1.Tparam)
    TparamD.update(self.layer2.Tparam)
    TparamD.update(self.layer3.Tparam)
    print TparamD
    self.TparamD = TparamD

    if load is not None:
        self.load(load)

    isTrain = True
    if isTrain:
        # Training step
        """
        Batch of probs for each time step, e.g. (3 steps, 2 batches, 4 classes):
        [ [(....),(....)],
          [(....),(....)],
          [(....),(....)] ]
        """
        # Language-model targets: y is x shifted one position to the left
        self.y0 = T.ones_like(self.x0)
        self.y = T.set_subtensor(self.y0[:, 0:-1], self.x0[:, 1:])
        self.fy = theano.function([self.x0], self.y)

        probs = self.layer3.output        # layer3.output is already softmax-normalized
        y_flat = self.y.flatten()         # flatten ids
        y_flat_idx = T.arange(y_flat.shape[0]) * trg_vocab_size + y_flat  # shift the ids to index into the flattened probs
        cost = -T.log(probs.flatten()[y_flat_idx])  # take the log-prob of the picked ids only
        cost = cost.reshape([self.y.shape[0], self.y.shape[1]])
        cost = cost.sum(0)
        cost = cost.mean()

        # Add regularization HERE !!

        logging.info("Building Gradient...")
        self.train = theano.function([self.x], [probs, cost])

        UpdateParams = TparamD
        grads = T.grad(cost, wrt=list(UpdateParams.values()))
        f_grad = theano.function([self.x0], grads, name='f_grad')

        lr = T.scalar(name='lr')
        optimizer = adadelta_lm
        self.f_grad_shared, self.f_update = optimizer(lr, UpdateParams, grads, self.x0, cost)

    logging.info('Building Model Completed.')
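
# Numpy sketch of the target construction above: for language modelling, y is x
# shifted one step to the left, so the model at position t predicts the token at
# t + 1; the last position keeps the filler value (1 here, from ones_like).
import numpy as np

x = np.array([[5, 3, 7, 2],
              [4, 6, 1, 8]])   # (batch, time) token ids
y = np.ones_like(x)
y[:, 0:-1] = x[:, 1:]          # same effect as T.set_subtensor(y0[:, 0:-1], x0[:, 1:])
print(y)
# [[3 7 2 1]
#  [6 1 8 1]]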