output_matrix = tf.get_variable("output", [conf.num_hidden_state, conf.vocab_size], tf.float32, initializer=xavier_initializer()) output_bias = tf.get_variable("bias", [conf.vocab_size], tf.float32, initializer=xavier_initializer()) # embedding lookup word_embeddings = tf.nn.embedding_lookup(embedding_matrix, data) # shape: (64, 29, 1, 100) word_embeddings = tf.reshape(word_embeddings, [conf.batch_size, conf.seq_length -1, conf.embed_size]) #shape: (64, 29, 100) assert word_embeddings.shape == (conf.batch_size, conf.seq_length - 1, conf.embed_size) # RNN unrolling print("creating RNN") lstm_outputs = [] with tf.variable_scope("rnn") as scope: cell = LSTMCell(conf.num_hidden_state) state = cell.zero_state(conf.batch_size, tf.float32) for i in range(conf.seq_length - 1): if i > 0: scope.reuse_variables() lstm_output, state = cell(word_embeddings[:, i, :], state) lstm_outputs.append(lstm_output) # stack the outputs together, reshape, multiply lstm_outputs = tf.stack(lstm_outputs, axis = 1) lstm_outputs = tf.reshape(lstm_outputs, [conf.batch_size * (conf.seq_length - 1), conf.num_hidden_state]) assert lstm_outputs.shape == (conf.batch_size * (conf.seq_length - 1), conf.num_hidden_state) predictions = tf.matmul(lstm_outputs, output_matrix) + output_bias # reshape the labels labels = tf.reshape(next_word, [conf.batch_size * (conf.seq_length - 1)])
# tf.matmul(convW, encoder_final_state)
# Decoder
if mode == 1:
    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        decoder_hidden_units, attention_states,
        memory_sequence_length=enc_seqLen)
    decoder_cell = LSTMCell(decoder_hidden_units)
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell, attention_mechanism,
        attention_layer_size=2 * decoder_hidden_units)
    encoder_final_state = decoder_cell.zero_state(dtype=tf.float32, batch_size=batchSize)
else:
    if encoder_choice == 0:
        decoder_cell = LSTMCell(2 * decoder_hidden_units)
    else:
        decoder_cell = LSTMCell(decoder_hidden_units)

projection_layer = layers_core.Dense(dec_vocab_size, use_bias=False,
                                     name="output_projection")

# Attention Is All You Need
# Training helper
helper_1 = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embedded_input,
                                             sequence_length=dec_seqLen)
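# A hedged sketch of how such a decoder is typically finished (this part is
# not in the snippet above): wrap the cell and helper in a BasicDecoder and
# unroll it with dynamic_decode. The choice of initial state is an assumption.
decoder = tf.contrib.seq2seq.BasicDecoder(
    decoder_cell, helper_1, initial_state=encoder_final_state,
    output_layer=projection_layer)
decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder)
logits = decoder_outputs.rnn_output  # (batch, time, dec_vocab_size)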
running time. The output shape is the same for every value of early_stop, so
it is not clear what the output contains for the timesteps that were not
computed.
"""
import tensorflow as tf
from tensorflow.contrib.rnn import static_rnn
from tensorflow.contrib.rnn import LSTMCell
from time import time
import numpy as np

initializer = tf.random_uniform_initializer(-1, 1)

seq_input = tf.placeholder(shape=[200, 10, 2048], dtype=tf.float32)
inputs = [ts[0] for ts in tf.split(seq_input, 200, axis=0)]
early_stop = tf.placeholder(dtype=tf.int32)

cell = LSTMCell(2048, initializer=initializer)
initial_state = cell.zero_state(10, dtype=tf.float32)
outputs, states = static_rnn(cell, inputs, initial_state=initial_state,
                             sequence_length=early_stop)

sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()

input_values = np.random.uniform(size=(200, 10, 2048)).astype('float32')
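# A minimal sketch of the timing experiment the docstring describes, assuming
# the point is to compare wall-clock time and outputs across early_stop values;
# the feed values below are illustrative.
for stop in (10, 100, 200):
    start = time()
    out_vals = sess.run(outputs, {seq_input: input_values,
                                  early_stop: np.full(10, stop, np.int32)})
    print('early_stop=%d: %.3fs' % (stop, time() - start))
    # With sequence_length set, static_rnn copies states through and zeros out
    # the outputs past the given length, so timesteps beyond `stop` are all
    # zero rather than undefined.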
def build_inference_graph(params_dict):
    # TODO: check whether loading GloVe from disk is faster than importing it
    # from the embedding module
    # glove = np.load('GloVe/glove.npy')

    # Build the embedding layer + placeholders
    keep_prob = tf.placeholder(tf.float32)
    embedding = tf.get_variable("embedding", initializer=glove, trainable=True)
    document_tokens = tf.placeholder(tf.int32, shape=[None, None], name="document_tokens")
    document_emb = tf.nn.embedding_lookup(embedding, document_tokens)
    answer_masks = tf.placeholder(tf.float32, shape=[None, None, None], name="answer_masks")
    encoder_lengths = tf.placeholder(tf.int32, shape=[None], name="encoder_lengths")

    projection = Dense(embedding.shape[0], use_bias=False)
    helper = seq2seq.GreedyEmbeddingHelper(embedding,
                                           tf.fill([batch_size], START_TOKEN),
                                           END_TOKEN)

    # Build the encoder
    encoder_inputs = tf.matmul(answer_masks, document_emb, name="encoder_inputs")
    output = encoder_inputs
    for n in range(params_dict["num_encoder_layers"]):
        cell_fw = LSTMCell(params_dict["lstm_units"], forget_bias=1.0, state_is_tuple=True)
        cell_bw = LSTMCell(params_dict["lstm_units"], forget_bias=1.0, state_is_tuple=True)
        cell_fw = DropoutWrapper(cell_fw, output_keep_prob=keep_prob)
        cell_bw = DropoutWrapper(cell_bw, output_keep_prob=keep_prob)
        state_fw = cell_fw.zero_state(params_dict["batch_size"], tf.float32)
        state_bw = cell_bw.zero_state(params_dict["batch_size"], tf.float32)
        (output_fw, output_bw), encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, output,
            initial_state_fw=state_fw, initial_state_bw=state_bw,
            sequence_length=encoder_lengths, dtype=tf.float32,
            scope='encoder_rnn_' + str(n))
        output = tf.concat([output_fw, output_bw], axis=2)

    encoder_final_output = output
    # concatenate the forward and backward final states into one LSTM state
    encoder_state_c = tf.concat((encoder_state[0][0], encoder_state[1][0]), -1)
    encoder_state_h = tf.concat((encoder_state[0][1], encoder_state[1][1]), -1)
    encoder_final_state = LSTMStateTuple(encoder_state_c, encoder_state_h)

    # Attention mechanism
    attention_mechanism = seq2seq.LuongAttention(
        num_units=params_dict["lstm_units"] * 2,
        memory=encoder_final_output,
        memory_sequence_length=encoder_lengths)

    # Build the decoder
    temp_cell = LSTMCell(params_dict["lstm_units"] * 2, forget_bias=1.0)
    temp_cell = DropoutWrapper(temp_cell, output_keep_prob=keep_prob)
    decoder_cell = seq2seq.AttentionWrapper(
        cell=temp_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=params_dict["lstm_units"] * 2)
    decoder = seq2seq.BasicDecoder(
        cell=decoder_cell,
        helper=helper,
        initial_state=decoder_cell.zero_state(
            params_dict["batch_size"], tf.float32).clone(cell_state=encoder_final_state),
        output_layer=projection)
    decoder_outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=16)
    decoder_outputs = decoder_outputs.rnn_output

    # turn the logits into per-token probabilities with softmax
    prob_logits = tf.nn.softmax(decoder_outputs, axis=-1)

    return {
        "keep_prob": keep_prob,
        "document_tokens": document_tokens,
        "answer_masks": answer_masks,
        "encoder_lengths": encoder_lengths,
        "decoder_outputs": decoder_outputs,
        "prob_logits": prob_logits
    }
def build_train_graph(params_dict):
    # Build the embedding layer + placeholders
    keep_prob = tf.placeholder(tf.float32)
    embedding = tf.get_variable("embedding", initializer=glove, trainable=True)
    document_tokens = tf.placeholder(tf.int32, shape=[None, None], name="document_tokens")
    document_emb = tf.nn.embedding_lookup(embedding, document_tokens)
    answer_masks = tf.placeholder(tf.float32, shape=[None, None, None], name="answer_masks")
    decoder_inputs = tf.placeholder(tf.int32, shape=[None, None], name="decoder_inputs")
    decoder_labels = tf.placeholder(tf.int32, shape=[None, None], name="decoder_labels")
    decoder_lengths = tf.placeholder(tf.int32, shape=[None], name="decoder_lengths")
    encoder_lengths = tf.placeholder(tf.int32, shape=[None], name="encoder_lengths")
    decoder_emb = tf.nn.embedding_lookup(embedding, decoder_inputs)
    question_mask = tf.sequence_mask(decoder_lengths, dtype=tf.float32)

    projection = Dense(embedding.shape[0], use_bias=False)
    training_helper = seq2seq.TrainingHelper(inputs=decoder_emb,
                                             sequence_length=decoder_lengths,
                                             time_major=False)

    # Build the encoder
    encoder_inputs = tf.matmul(answer_masks, document_emb, name="encoder_inputs")
    output = encoder_inputs
    for n in range(params_dict["num_encoder_layers"]):
        cell_fw = LSTMCell(params_dict["lstm_units"], forget_bias=1.0, state_is_tuple=True)
        cell_bw = LSTMCell(params_dict["lstm_units"], forget_bias=1.0, state_is_tuple=True)
        cell_fw = DropoutWrapper(cell_fw, output_keep_prob=keep_prob)
        cell_bw = DropoutWrapper(cell_bw, output_keep_prob=keep_prob)
        state_fw = cell_fw.zero_state(params_dict["batch_size"], tf.float32)
        state_bw = cell_bw.zero_state(params_dict["batch_size"], tf.float32)
        (output_fw, output_bw), encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw, cell_bw, output,
            initial_state_fw=state_fw, initial_state_bw=state_bw,
            sequence_length=encoder_lengths, dtype=tf.float32,
            scope='encoder_rnn_' + str(n))
        output = tf.concat([output_fw, output_bw], axis=2)

    encoder_final_output = output
    # concatenate the forward and backward final states into one LSTM state
    encoder_state_c = tf.concat((encoder_state[0][0], encoder_state[1][0]), -1)
    encoder_state_h = tf.concat((encoder_state[0][1], encoder_state[1][1]), -1)
    encoder_final_state = LSTMStateTuple(encoder_state_c, encoder_state_h)

    # Attention mechanism
    attention_mechanism = seq2seq.LuongAttention(
        num_units=params_dict["lstm_units"] * 2,
        memory=encoder_final_output,
        memory_sequence_length=encoder_lengths)

    # Build the decoder
    temp_cell = LSTMCell(params_dict["lstm_units"] * 2, forget_bias=1.0)
    temp_cell = DropoutWrapper(temp_cell, output_keep_prob=keep_prob)
    decoder_cell = seq2seq.AttentionWrapper(
        cell=temp_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=params_dict["lstm_units"] * 2)
    training_decoder = seq2seq.BasicDecoder(
        cell=decoder_cell,
        helper=training_helper,
        initial_state=decoder_cell.zero_state(
            params_dict["batch_size"], tf.float32).clone(cell_state=encoder_final_state),
        output_layer=projection)
    training_decoder_output, _, _ = seq2seq.dynamic_decode(
        decoder=training_decoder,
        impute_finished=True,
        maximum_iterations=tf.reduce_max(decoder_lengths))
    training_logits = training_decoder_output.rnn_output

    # turn the logits into per-token probabilities with softmax
    prob_logits = tf.nn.softmax(training_logits, axis=-1)

    loss = seq2seq.sequence_loss(logits=training_logits,
                                 targets=decoder_labels,
                                 weights=question_mask,
                                 name="loss")

    return {
        "keep_prob": keep_prob,
        "document_tokens": document_tokens,
        "answer_masks": answer_masks,
        "encoder_lengths": encoder_lengths,
        "decoder_inputs": decoder_inputs,
        "decoder_labels": decoder_labels,
        "decoder_lengths": decoder_lengths,
        "training_logits": training_logits,
        "prob_logits": prob_logits,
        "loss": loss
    }
def runMoreLstm(path=None, epochs=10, saveResult=True):
    trainData, validData, testData, wordId = loadWordIdsFromFiles()
    trainData = np.array(trainData, np.float32)
    # validData = np.array(validData, np.float32)
    testData = np.array(testData, np.float32)
    vocabSz = len(wordId)

    info = loadInfo('lstm_ped', path)
    learnRate = info['learning rate']
    batchSz = info['batch size']
    embedSz = info['embed size']
    rnnSz = info['rnn size']
    winSz = info['win size']

    numWin = (trainData.shape[0] - 1) // (batchSz * winSz)
    # each batch has winSz * numWin words
    batchLen = winSz * numWin
    testNumWin = (testData.shape[0] - 1) // (batchSz * winSz)
    testBatchLen = winSz * testNumWin

    inp = tf.placeholder(tf.int32, shape=[batchSz, winSz])
    ans = tf.placeholder(tf.int32, shape=[batchSz * winSz])

    E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1))
    embed = tf.nn.embedding_lookup(E, inp)

    rnn = LSTMCell(rnnSz)
    initialState = rnn.zero_state(batchSz, tf.float32)
    output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState)
    output = tf.reshape(output, [batchSz * winSz, rnnSz])

    W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1))
    B = tf.Variable(tf.random_normal([vocabSz], stddev=.1))
    logits = tf.matmul(output, W) + B

    ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans)
    loss = tf.reduce_sum(ents)
    train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss)

    trainPerp = np.zeros(epochs + 1, dtype=np.float32)
    trainPerp[0] = info['train perplexity']
    testPerp = np.zeros(epochs + 1, dtype=np.float32)
    testPerp[0] = info['test perplexity']

    with tf.Session() as sess:
        loadSession(sess, 'lstm_ped', path)
        startTime = time.time()
        epoch = 0
        print('epoch:', end=' ')
        while epoch < epochs:
            epoch += 1
            win = 0
            state = sess.run(initialState)
            testState = sess.run(initialState)
            # print(state, testState)
            winStart, winEnd = 0, winSz
            while win < numWin:
                inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd]
                                  for i in range(batchSz)])
                inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1:i * batchLen + winEnd + 1]
                                             for i in range(batchSz)]), batchSz * winSz)
                _, state, outLoss = sess.run([train, nextState, loss],
                                             {inp: inInp, ans: inAns, nextState: state})
                trainPerp[epoch] += outLoss
                if win < testNumWin:
                    inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd]
                                      for i in range(batchSz)])
                    inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1:i * testBatchLen + winEnd + 1]
                                                 for i in range(batchSz)]), batchSz * winSz)
                    testState, testOutLoss = sess.run([nextState, loss],
                                                      {inp: inInp, ans: inAns, nextState: testState})
                    testPerp[epoch] += testOutLoss
                winStart, winEnd = winEnd, winEnd + winSz
                win += 1
            print(epoch + info['epochs'], end=' ')
        trainPerp[1:] = np.exp(trainPerp[1:] / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen)))
        testPerp[1:] = np.exp(testPerp[1:] / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen)))
        print(f'\nelapsed: {time.time() - startTime}')
        print('train perplexity:', trainPerp[-1])
        print('test perplexity:', testPerp[-1])

        info['epochs'] += epochs
        info['train perplexity'] = trainPerp[-1]
        info['test perplexity'] = testPerp[-1]
        if saveResult:
            save(sess, info)
        drawPerplexity(trainPerp, testPerp, info['epochs'] - epochs)
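# Note on the perplexity bookkeeping above: the loop accumulates the *summed*
# (not averaged) cross-entropy, so perplexity is recovered at the end as
# exp(total_loss / N), where N is the number of predicted tokens:
#   per_word_nll = total_loss / num_tokens
#   perplexity   = np.exp(per_word_nll)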
def create_critic_network(self, Scope):
    inputs = tf.placeholder(shape=[1, self.max_lenth], dtype=tf.int32, name="inputs")
    action = tf.placeholder(shape=[1, self.max_lenth], dtype=tf.int32, name="action")
    action_pos = tf.placeholder(shape=[1, None], dtype=tf.int32, name="action_pos")
    lenth = tf.placeholder(shape=[1], dtype=tf.int32, name="lenth")
    lenth_up = tf.placeholder(shape=[1], dtype=tf.int32, name="lenth_up")

    # Lower network
    if Scope[-1] == 'e':
        vec = tf.nn.embedding_lookup(self.wordvector, inputs)
        print("active")
    else:
        vec = tf.nn.embedding_lookup(self.target_wordvector, inputs)
        print("target")

    cell = LSTMCell(self.dim, initializer=self.init, state_is_tuple=False)
    self.state_size = cell.state_size
    actions = tf.to_float(action)
    h = cell.zero_state(1, tf.float32)
    print('h:', h)
    embedding = []
    for step in range(self.max_lenth):
        with tf.variable_scope("Lower/" + Scope, reuse=True):
            o, h = cell(vec[:, step, :], h)
            embedding.append(o[0])
            # zero the state whenever the action at this step fires
            h = h * (1.0 - actions[0, step])

    # Upper network
    embedding = tf.stack(embedding)
    embedding = tf.gather(embedding, action_pos, name="Upper_input")
    with tf.variable_scope("Upper", reuse=True):
        out, _ = tf.nn.bidirectional_dynamic_rnn(cell, cell, embedding, lenth_up,
                                                 dtype=tf.float32, scope=Scope)
    if self.isAttention:
        out = tf.concat(out, 2)
        out = out[0, :, :]
        tmp = tflearn.fully_connected(out, self.dim, scope=Scope, name="att")
        tmp = tflearn.tanh(tmp)
        with tf.variable_scope(Scope):
            v_T = tf.get_variable("v_T", dtype=tf.float32, shape=[self.dim, 1],
                                  trainable=True)
        a = tflearn.softmax(tf.matmul(tmp, v_T))
        out = tf.reduce_sum(out * a, 0)
        out = tf.expand_dims(out, 0)
    else:
        # out = embedding[:, -1, :]
        out = tf.concat((out[0][:, -1, :], out[1][:, 0, :]), 1)
    out = tflearn.dropout(out, self.keep_prob)
    out = tflearn.fully_connected(out, self.grained, scope=Scope + "/pred",
                                  name="get_pred")
    return inputs, action, action_pos, lenth, lenth_up, out
def core_rnn_net(config, images_ph):
    """
    Inputs:
    :param config: dict of model hyperparameters
    :param images_ph: (batch_size, image_size, image_size, num_channels)
    Returns:
    :return outputs: a list of length num_glimpses with the rnn outputs /
        hidden states (batch_size, cell_dim)
    :return loc_means: a list of length num_glimpses with the coordinate means
        emitted by loc_net
    :return loc_samples: a list of length num_glimpses with the sampled
        next_loc values from loc_net
    """
    cell_dim = config.get("cell_dim", None)
    num_glimpses = config.get("num_glimpses", None)
    loc_dim = config.get("loc_dim", None)
    batch_size = tf.shape(images_ph)[0]

    with tf.variable_scope("glimpse_net", reuse=None):
        glimpse_net = GlimpseNet(config, images_ph)
    with tf.variable_scope("loc_net", reuse=None):
        loc_net = LocNet(config)

    #################################################
    # TODO: the core rnn network, using an LSTM cell #
    #################################################
    # First, set up the initial variables. For example,
    #   init_loc: sampled randomly and uniformly from -1 to 1
    #             (you can use tf.random_uniform here)
    #   init_g:   the first input into the rnn core network
    #
    # Then define an LSTM cell with tf.contrib.rnn.LSTMCell as in task 1.
    # You also need to initialize the state values; you can use the zero_state
    # method. See
    # https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/LSTMCell
    # to get familiar with this class.
    #
    # Finally, create an RNN with time_steps = num_glimpses.
    # Be careful: in this RNN you must feed the output next_loc back into the
    # input of the glimpse net.
    # Hint: there are many ways to feed the output back into the input of the
    # rnn network:
    #   1. a for loop
    #   2. the tf.contrib.legacy_seq2seq modules, defining a "loop function"
    #      that generates the next input from the output of the current step.
    #      This method is recommended, since it introduces some advanced RNN
    #      modules provided by tensorflow. In detail, a loop function looks like:
    #        def loop_fn(h_t, t):
    #            loc_mean, next_loc = loc_net(h_t)
    #            next_glimpse = glimpse_net(next_loc)
    #            return next_glimpse
    #
    # Also cache loc_mean and next_loc in lists, because they are used later
    # in the loss computation.
    _loc = tf.random_uniform(shape=[batch_size, loc_dim], minval=-1, maxval=1)
    cell = LSTMCell(num_units=cell_dim)
    _state = cell.zero_state(batch_size, dtype=tf.float32)
    outputs = []
    loc_means = []
    loc_samples = []
    for i in range(num_glimpses):
        _g = glimpse_net(_loc)
        _h, _state = cell(_g, _state)
        _loc_mean, _next_loc = loc_net(_h)
        _loc = _next_loc
        outputs.append(_h)
        loc_means.append(_loc_mean)
        loc_samples.append(_next_loc)
    return outputs, loc_means, loc_samples
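# A hedged sketch of the alternative recommended in the hint above: the same
# feedback loop via tf.contrib.legacy_seq2seq.rnn_decoder, whose loop_function
# builds each step's input from the previous output. Collecting loc_mean /
# next_loc through the closure is an assumption about usage, not the original
# author's code.
loc_means, loc_samples = [], []

def loop_fn(prev_h, i):
    # called for steps 1..num_glimpses-1; inputs after the first are ignored
    loc_mean, next_loc = loc_net(prev_h)
    loc_means.append(loc_mean)
    loc_samples.append(next_loc)
    return glimpse_net(next_loc)

first_glimpse = glimpse_net(_loc)
rnn_outputs, _ = tf.contrib.legacy_seq2seq.rnn_decoder(
    [first_glimpse] * num_glimpses, _state, cell, loop_function=loop_fn)
# Note: loop_function does not run after the final step, so the location for
# the last hidden state, if needed, still requires one more
# loc_net(rnn_outputs[-1]) call.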
def stacked_bidirectional_rnn(num_units, num_layers, inputs, seq_lengths,
                              batch_size, is_train, output_keep_prob, reuse=False):
    """
    Multi-layer bidirectional rnn.
    :param num_units: int, number of hidden units per RNN cell
    :param num_layers: int, the number of layers
    :param inputs: Tensor, the input sequence,
        shape: [batch_size, max_time_step, num_feature]
    :param seq_lengths: list or 1-D Tensor of per-example sequence lengths;
        its length is batch_size
    :param batch_size: int
    :return: the output of the last bidirectional rnn layer, with the forward
        and backward outputs concatenated

    A few TF features are used here:
    1. tf.variable_scope(None, default_name="bidirectional-rnn"): with
       default_name, TF resolves naming conflicts automatically.
    """
    # TODO: add a time_major parameter, use batch_size = tf.shape(inputs)[0],
    # and add more asserts
    _inputs = inputs
    if len(_inputs.get_shape().as_list()) != 3:
        raise ValueError("the inputs must be a 3-dimensional Tensor")

    for i in range(num_layers):
        with tf.variable_scope("Layer%d" % i, reuse=reuse):
            rnn_cell_fw = LSTMCell(num_units)
            rnn_cell_bw = LSTMCell(num_units)
            rnn_cell_fw = SwitchableDropoutWrapper(rnn_cell_fw, is_train,
                                                   output_keep_prob=output_keep_prob)
            rnn_cell_bw = SwitchableDropoutWrapper(rnn_cell_bw, is_train,
                                                   output_keep_prob=output_keep_prob)
            initial_state_fw = rnn_cell_fw.zero_state(batch_size, dtype=tf.float32)
            initial_state_bw = rnn_cell_bw.zero_state(batch_size, dtype=tf.float32)
            (output, state) = tf.nn.bidirectional_dynamic_rnn(
                rnn_cell_fw, rnn_cell_bw, _inputs, seq_lengths,
                initial_state_fw, initial_state_bw, dtype=tf.float32)
            # feed the concatenated fw/bw outputs into the next layer
            _inputs = tf.concat(output, 2)
    return _inputs
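# A hedged usage sketch; the shapes, names, and hyperparameters below are
# illustrative, not part of the function above.
batch_size, max_time, num_feature = 32, 100, 40
inputs = tf.placeholder(tf.float32, [batch_size, max_time, num_feature])
seq_lengths = tf.placeholder(tf.int32, [batch_size])
is_train = tf.placeholder(tf.bool)
top_outputs = stacked_bidirectional_rnn(num_units=128, num_layers=3,
                                        inputs=inputs, seq_lengths=seq_lengths,
                                        batch_size=batch_size, is_train=is_train,
                                        output_keep_prob=0.8)
# top_outputs has shape (batch_size, max_time, 2 * num_units)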
def __init__(self, is_testing):
    super().__init__()
    self.is_testing = is_testing

    print("Preparing data...")
    # Load and encode data (disk -> memory); see encode_data() for details.
    # Also see data_loader(), the next processing stage.
    self.train, self.valid, self.test, self.vocab = self.encode_data(bAbI('en-valid-10k'))

    print("Creating graph...")
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # regularizer applied to the fully connected networks
        regularizer = layers.l2_regularizer(1e-4)
        # allow_soft_placement=True: if a specific device cannot be found,
        # let TF choose one
        self.session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        self.global_step = tf.Variable(initial_value=0, trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)

        self.facts_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs*#facts, seq)
        self.facts_pos_ph = tf.placeholder(tf.int32, shape=(None,))  # (bs*#facts, )
        self.question_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs, seq)
        self.answers_ph = tf.placeholder(tf.int32, shape=(None,))  # (bs, )
        self.edge_indices_ph = tf.placeholder(tf.int32, shape=(None, 2))
        self.fact_segments_ph = tf.placeholder(tf.int32, shape=(None,))
        self.edge_segments_ph = tf.placeholder(tf.int32, shape=(None,))
        self.q_seq_length_ph = tf.placeholder(tf.int32, shape=(None,))
        self.f_seq_length_ph = tf.placeholder(tf.int32, shape=(None,))
        self.task_indices_ph = tf.placeholder(tf.int32, shape=(None,))
        self.edge_keep_prob_ph = tf.placeholder(tf.float32, shape=())
        self.is_training_ph = tf.placeholder(tf.bool)  # device: CPU:0

        placeholders = [self.facts_ph, self.facts_pos_ph, self.question_ph,
                        self.answers_ph, self.edge_indices_ph,
                        self.fact_segments_ph, self.edge_segments_ph,
                        self.q_seq_length_ph, self.f_seq_length_ph,
                        self.task_indices_ph, self.edge_keep_prob_ph]

        # each element of train_queue is a training batch
        self.train_queue = tf.FIFOQueue(self.qsize, [ph.dtype for ph in placeholders],
                                        name='train-queue')
        # each element of val_queue is a validation batch
        self.val_queue = tf.FIFOQueue(self.qsize, [ph.dtype for ph in placeholders],
                                      name='val-queue')

        self.train_enqueue_op = self.train_queue.enqueue(placeholders)
        self.train_qsize_op = self.train_queue.size()
        # record the size of the train_queue every batch
        tf.summary.scalar('queues/train', self.train_qsize_op)

        self.val_enqueue_op = self.val_queue.enqueue(placeholders)
        self.val_qsize_op = self.val_queue.size()
        # record the size of the val_queue every batch
        tf.summary.scalar('queues/val', self.val_qsize_op)

        def avg_n(x):
            return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

        towers = []
        with tf.variable_scope(tf.get_variable_scope()):
            for device_nr, device in enumerate(self.devices):
                with tf.device('/cpu:0'):
                    if self.is_testing:
                        (facts_ph, facts_pos_ph, question_ph, answers_ph,
                         edge_indices_ph, fact_segments_ph, edge_segments_ph,
                         q_seq_length_ph, f_seq_length_ph, task_indices_ph,
                         edge_keep_prob) = placeholders
                    else:
                        (facts_ph, facts_pos_ph, question_ph, answers_ph,
                         edge_indices_ph, fact_segments_ph, edge_segments_ph,
                         q_seq_length_ph, f_seq_length_ph, task_indices_ph,
                         edge_keep_prob) = tf.cond(
                            self.is_training_ph,
                            true_fn=lambda: self.train_queue.dequeue(),
                            false_fn=lambda: self.val_queue.dequeue(),
                        )
                    # device: CPU:0, CPU:0, CPU:0 (on a 3-GPU machine these
                    # placeholders exist in triplicate)
                    vars = (facts_ph, facts_pos_ph, question_ph, answers_ph,
                            edge_indices_ph, fact_segments_ph, edge_segments_ph,
                            q_seq_length_ph, f_seq_length_ph, task_indices_ph,
                            edge_keep_prob)
                    for v, ph in zip(vars, placeholders):
                        v.set_shape(ph.get_shape())

                    # device: CPU:0, CPU:0, CPU:0
                    facts_emb = layers.embed_sequence(facts_ph, self.vocab.size(),
                                                      self.emb_size,
                                                      scope='word-embeddings')
                    questions_emb = layers.embed_sequence(question_ph, self.vocab.size(),
                                                          self.emb_size,
                                                          scope='word-embeddings',
                                                          reuse=True)

                with tf.device(device), tf.name_scope("device-%s" % device_nr):
                    # a 4-layer fully connected network
                    def mlp(x, scope, n_hidden):
                        with tf.variable_scope(scope):
                            for i in range(3):
                                x = layers.fully_connected(x, n_hidden,
                                                           weights_regularizer=regularizer)
                            return layers.fully_connected(x, n_hidden,
                                                          weights_regularizer=regularizer,
                                                          activation_fn=None)

                    # final hidden state for the sentences (facts);
                    # f_encoding shape: (bs*#facts, state_size)
                    _, (_, f_encoding) = tf.nn.dynamic_rnn(
                        tf.nn.rnn_cell.LSTMCell(32), facts_emb, dtype=tf.float32,
                        sequence_length=f_seq_length_ph, scope='fact-encoder')

                    # shape: (bs, ), the same as answers_ph; values are drawn
                    # uniformly from [0, num_facts)
                    random_pos_offsets = tf.random_uniform(tf.shape(answers_ph),
                                                           minval=0,
                                                           maxval=self.num_facts,
                                                           dtype=tf.int32)
                    # generate a random offset; within one task the offset is shared
                    fact_pos = facts_pos_ph + tf.gather(random_pos_offsets,
                                                        fact_segments_ph)
                    # with the offset, the depth of the positional one-hot
                    # encoding must be 2*num_facts
                    facts_pos_encoding = tf.one_hot(fact_pos, 2 * self.num_facts)
                    # concatenate the content and position encodings;
                    # device: GPU:0, GPU:1, GPU:2
                    f_encoding = tf.concat([f_encoding, facts_pos_encoding], axis=1)

                    # questions need no positional encoding, only content features;
                    # q_encoding shape: (bs, state_size); device: GPU:0, GPU:1, GPU:2
                    _, (_, q_encoding) = tf.nn.dynamic_rnn(
                        tf.nn.rnn_cell.LSTMCell(32), questions_emb, dtype=tf.float32,
                        sequence_length=q_seq_length_ph, scope='question-encoder')

                    # 3-layer MLP used to process the output of the graph; the
                    # last layer has vocab.size() outputs, yielding the logits
                    def graph_fn(x):
                        with tf.variable_scope('graph-fn'):
                            x = layers.fully_connected(x, self.n_hidden,
                                                       weights_regularizer=regularizer)
                            x = layers.fully_connected(x, self.n_hidden,
                                                       weights_regularizer=regularizer)
                            return layers.fully_connected(x, self.vocab.size(),
                                                          activation_fn=None,
                                                          weights_regularizer=regularizer)

                    # concatenate the fact encoding and the question encoding
                    x = tf.concat([f_encoding, tf.gather(q_encoding, fact_segments_ph)], 1)
                    # x0 represents the "fact embedding given the question"
                    # (obtained by concatenating the question embedding onto the facts)
                    # device: GPU:0, GPU:1, GPU:2
                    x0 = mlp(x, 'pre', self.n_hidden)

                    # the question encoding for every edge;
                    # edge_features shape: (bs*(#facts**2), LSTM state_size)
                    edge_features = tf.gather(q_encoding, edge_segments_ph)

                    x = x0
                    outputs = []
                    log_losses = []
                    with tf.variable_scope('steps'):
                        lstm_cell = LSTMCell(self.n_hidden)
                        state = lstm_cell.zero_state(tf.shape(x)[0], tf.float32)
                        for step in range(self.n_steps):
                            x = message_passing(x, edge_indices_ph, edge_features,
                                                lambda x: mlp(x, 'message-fn', self.n_hidden),
                                                edge_keep_prob)
                            x = mlp(tf.concat([x, x0], axis=1), 'post-fn', self.n_hidden)
                            # x = hidden state, state = <cell state, hidden state>
                            # device: (GPU:0)*5, (GPU:1)*5, (GPU:2)*5 (5 = n_steps)
                            x, state = lstm_cell(x, state)

                            with tf.variable_scope('graph-sum'):
                                # at every step, sum the node output vectors per
                                # task (graph); graph_sum shape: (bs, n_hidden)
                                graph_sum = tf.segment_sum(x, fact_segments_ph)
                                out = graph_fn(graph_sum)  # shape: (bs, vocab_size)
                                outputs.append(out)
                                # softmax loss, a scalar Tensor
                                log_loss = tf.reduce_mean(
                                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                                        labels=answers_ph, logits=out))
                                # log_losses holds one scalar loss per time step
                                log_losses.append(log_loss)

                            # reuse the LSTM variables across time steps
                            tf.get_variable_scope().reuse_variables()

                    # scalar Tensor: the sum of all regularization losses
                    reg_loss = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
                    # avg_n(log_losses) is the mean loss over steps, so `loss`
                    # is a scalar Tensor; device: GPU:0, GPU:1, GPU:2
                    loss = avg_n(log_losses) + reg_loss

                    # device: GPU:0, GPU:1, GPU:2
                    stat = {
                        'loss': loss,                         # scalar Tensor
                        'grads': self.optimizer.compute_gradients(loss),
                        'log_losses': tf.stack(log_losses),   # (n_steps, )
                        'answers': answers_ph,                # (batch_size, )
                        'outputs': tf.stack(outputs),         # (n_steps, batch_size, vocab_size)
                        'task_indices': task_indices_ph       # (batch_size, )
                    }
                    towers.append(stat)

                    print('line 159: ')
                    print('"' + tf.get_variable_scope().name + '"')
                    # reuse the embedding, encoder, and MLP variables across devices
                    tf.get_variable_scope().reuse_variables()

        # the following four variables live on CPU:0
        self.loss = avg_n([t['loss'] for t in towers])
        self.out = tf.concat([t['outputs'] for t in towers], axis=1)
        self.answers = tf.concat([t['answers'] for t in towers], axis=0)
        self.task_indices = tf.concat([t['task_indices'] for t in towers], axis=0)

        tf.summary.scalar('losses/total', self.loss)
        tf.summary.scalar('losses/reg', reg_loss)
        log_losses = avg_n([t['log_losses'] for t in towers])
        for i in range(self.n_steps):
            tf.summary.scalar('steps/%d/losses/log' % i, log_losses[i])

        avg_gradients = util.average_gradients([t['grads'] for t in towers])
        # global_step increases by 1 after each gradient update
        self.train_step = self.optimizer.apply_gradients(avg_gradients,
                                                         global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        util.print_vars(tf.trainable_variables())

        self.train_writer = tf.summary.FileWriter(
            '/tmp/tensorboard/bAbI/%s/train/%s' % (self.revision, self.name),
            self.session.graph)
        self.test_writer = tf.summary.FileWriter(
            '/tmp/tensorboard/bAbI/%s/test/%s' % (self.revision, self.name),
            self.session.graph)
        self.summaries = tf.summary.merge_all()

        print("Starting data loaders...")
        train_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
        val_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
        # After the data has been loaded from disk (done in
        # `self.encode_data(bAbI('en-valid-10k'))`), use 4+1=5 processes to
        # construct batches, encode them, and enqueue them onto the
        # corresponding queue; see random_batch() and encode_batch().
        data_loader_processes = [mp.Process(target=self.data_loader,
                                            args=(train_mp_queue, True))
                                 for i in range(4)]
        val_data_loader_processes = [mp.Process(target=self.data_loader,
                                                args=(val_mp_queue, False))
                                     for i in range(1)]
        # start the processes
        for p in data_loader_processes + val_data_loader_processes:
            p.daemon = True
            p.start()

        # Use 2 threads to transfer data from train_mp_queue (val_mp_queue) to
        # train_queue (val_queue). Batches in train_mp_queue are numpy
        # ndarrays; these threads convert each batch to Tensors and enqueue it
        # onto train_queue. See the placeholders defined above for the format
        # of each batch.
        queue_putter_threads = [
            threading.Thread(target=self.queue_putter,
                             args=(train_mp_queue, self.train_enqueue_op, 'train', 1000)),
            threading.Thread(target=self.queue_putter,
                             args=(val_mp_queue, self.val_enqueue_op, 'val', 1)),
        ]
        # start the data transfer
        for t in queue_putter_threads:
            t.daemon = True
            t.start()

        train_qsize, val_qsize = 0, 0
        print("Waiting for queue to fill...")
        while train_qsize < self.qsize or val_qsize < self.qsize:
            # poll the sizes of the training and validation queues
            train_qsize = self.session.run(self.train_qsize_op)
            val_qsize = self.session.run(self.val_qsize_op)
            print('train_qsize: %d, val_qsize: %d' % (train_qsize, val_qsize), flush=True)
            time.sleep(1)
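# `message_passing` is referenced above but not shown. A hedged sketch of a
# typical implementation under the usual graph-network assumptions: gather the
# two endpoint node states per edge, run the message MLP, drop whole edges
# with the given keep probability, then sum the incoming messages per node.
# The real helper may differ in detail.
def message_passing(nodes, edge_indices, edge_features, message_fn, edge_keep_prob):
    # nodes: (n_nodes, n_hidden); edge_indices: (n_edges, 2)
    from_nodes = tf.gather(nodes, edge_indices[:, 0])
    to_nodes = tf.gather(nodes, edge_indices[:, 1])
    messages = message_fn(tf.concat([from_nodes, to_nodes, edge_features], axis=1))
    # noise_shape (n_edges, 1) drops entire edges rather than single units
    messages = tf.nn.dropout(messages, edge_keep_prob,
                             noise_shape=tf.stack([tf.shape(messages)[0], 1]))
    n_nodes = tf.shape(nodes)[0]
    # sum the messages arriving at each target node
    return tf.unsorted_segment_sum(messages, edge_indices[:, 1], n_nodes)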
def __init__(self, is_testing):
    super().__init__()
    self.is_testing = is_testing

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        regularizer = layers.l2_regularizer(1e-4)
        self.name = "%s %s" % (self.revision, self.message)
        self.train, self.valid, self.test = self.encode_data(sudoku())

        print("Building graph...")
        self.session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        self.global_step = tf.Variable(initial_value=0, trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)
        self.mode = tf.placeholder(tf.string)

        if self.edges == 'sudoku':
            edges = self.sudoku_edges()
        elif self.edges == 'full':
            edges = [(i, j) for i in range(81) for j in range(81) if not i == j]
        else:
            raise ValueError('edges must be sudoku or full')

        edge_indices = tf.constant(
            [(i + (b * 81), j + (b * 81))
             for b in range(self.batch_size)
             for i, j in edges], tf.int32)
        n_edges = tf.shape(edge_indices)[0]
        edge_features = tf.zeros((n_edges, 1), tf.float32)

        positions = tf.constant(
            [[(i, j) for i in range(9) for j in range(9)]
             for b in range(self.batch_size)], tf.int32)  # (bs, 81, 2)
        rows = layers.embed_sequence(positions[:, :, 0], 9, self.emb_size,
                                     scope='row-embeddings', unique=True)  # bs, 81, emb_size
        cols = layers.embed_sequence(positions[:, :, 1], 9, self.emb_size,
                                     scope='cols-embeddings', unique=True)  # bs, 81, emb_size

        def avg_n(x):
            return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

        towers = []
        with tf.variable_scope(tf.get_variable_scope()):
            for device_nr, device in enumerate(self.devices):
                with tf.device('/cpu:0'):
                    if self.is_testing:
                        (quizzes, answers), edge_keep_prob = self.test.get_next(), 1.0
                    else:
                        (quizzes, answers), edge_keep_prob = tf.cond(
                            tf.equal(self.mode, "train"),
                            true_fn=lambda: (self.train.get_next(), self.edge_keep_prob),
                            false_fn=lambda: (self.valid.get_next(), 1.0))

                    x = layers.embed_sequence(quizzes, 10, self.emb_size,
                                              scope='nr-embeddings',
                                              unique=True)  # bs, 81, emb_size
                    x = tf.concat([x, rows, cols], axis=2)
                    x = tf.reshape(x, (-1, 3 * self.emb_size))

                with tf.device(device), tf.name_scope("device-%s" % device_nr):
                    def mlp(x, scope):
                        with tf.variable_scope(scope):
                            for i in range(3):
                                x = layers.fully_connected(x, self.n_hidden,
                                                           weights_regularizer=regularizer)
                            return layers.fully_connected(x, self.n_hidden,
                                                          weights_regularizer=regularizer,
                                                          activation_fn=None)

                    x = mlp(x, 'pre-fn')
                    x0 = x
                    n_nodes = tf.shape(x)[0]

                    outputs = []
                    log_losses = []
                    with tf.variable_scope('steps'):
                        lstm_cell = LSTMCell(self.n_hidden)
                        state = lstm_cell.zero_state(n_nodes, tf.float32)
                        for step in range(self.n_steps):
                            x = message_passing(x, edge_indices, edge_features,
                                                lambda x: mlp(x, 'message-fn'),
                                                edge_keep_prob)
                            x = mlp(tf.concat([x, x0], axis=1), 'post-fn')
                            x, state = lstm_cell(x, state)

                            with tf.variable_scope('graph-sum'):
                                out = tf.reshape(
                                    layers.fully_connected(x, num_outputs=10,
                                                           activation_fn=None),
                                    (-1, 81, 10))
                                outputs.append(out)
                                log_losses.append(tf.reduce_mean(
                                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                                        labels=answers, logits=out)))

                            tf.get_variable_scope().reuse_variables()

                    reg_loss = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
                    loss = avg_n(log_losses) + reg_loss

                    towers.append({
                        'loss': loss,
                        'grads': self.optimizer.compute_gradients(loss),
                        'log_losses': tf.stack(log_losses),  # (n_steps, 1)
                        'quizzes': quizzes,                  # (bs, 81, 10)
                        'answers': answers,                  # (bs, 81, 10)
                        'outputs': tf.stack(outputs)         # (n_steps, bs, 81, 10)
                    })

                    tf.get_variable_scope().reuse_variables()

        self.loss = avg_n([t['loss'] for t in towers])
        self.out = tf.concat([t['outputs'] for t in towers], axis=1)  # (n_steps, bs, 81, 10)
        self.predicted = tf.cast(tf.argmax(self.out, axis=3), tf.int32)
        self.answers = tf.concat([t['answers'] for t in towers], axis=0)
        self.quizzes = tf.concat([t['quizzes'] for t in towers], axis=0)

        tf.summary.scalar('losses/total', self.loss)
        tf.summary.scalar('losses/reg', reg_loss)
        log_losses = avg_n([t['log_losses'] for t in towers])

        for step in range(self.n_steps):
            equal = tf.equal(self.answers, self.predicted[step])
            digit_acc = tf.reduce_mean(tf.to_float(equal))
            tf.summary.scalar('steps/%d/digit-acc' % step, digit_acc)
            puzzle_acc = tf.reduce_mean(tf.to_float(tf.reduce_all(equal, axis=1)))
            tf.summary.scalar('steps/%d/puzzle-acc' % step, puzzle_acc)
            tf.summary.scalar('steps/%d/losses/log' % step, log_losses[step])

        avg_gradients = util.average_gradients([t['grads'] for t in towers])
        self.train_step = self.optimizer.apply_gradients(avg_gradients,
                                                         global_step=self.global_step)

        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        util.print_vars(tf.trainable_variables())
        self.train_writer = tf.summary.FileWriter(
            '/tmp/tensorboard/sudoku/%s/train/%s' % (self.revision, self.name),
            self.session.graph)
        self.test_writer = tf.summary.FileWriter(
            '/tmp/tensorboard/sudoku/%s/test/%s' % (self.revision, self.name),
            self.session.graph)
        self.summaries = tf.summary.merge_all()
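# `self.sudoku_edges()` is referenced above but not shown. A hedged sketch of
# the usual sudoku constraint graph: two cells are connected iff they share a
# row, a column, or a 3x3 box; the actual helper may differ in detail.
def sudoku_edges(self):
    def cross(a):
        return [(i, j) for i in a for j in a if not i == j]

    idx = np.arange(81).reshape(9, 9)
    rows, columns, squares = [], [], []
    for i in range(9):
        rows += cross(idx[i, :])
        columns += cross(idx[:, i])
    for i in range(3):
        for j in range(3):
            squares += cross(idx[i * 3:(i + 1) * 3, j * 3:(j + 1) * 3].flatten())
    return list(set(rows + columns + squares))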
def startLstm(epochs=10, saveResult=True):
    trainData, validData, testData, wordId = loadWordIdsFromFiles()
    trainData = np.array(trainData, np.float32)
    # validData = np.array(validData, np.float32)
    testData = np.array(testData, np.float32)
    vocabSz = len(wordId)

    learnRate = 0.001
    embedSz = 128
    rnnSz, batchSz, winSz = 512, 10, 5

    numWin = (trainData.shape[0] - 1) // (batchSz * winSz)
    # each batch has winSz * numWin words
    batchLen = winSz * numWin
    testNumWin = (testData.shape[0] - 1) // (batchSz * winSz)
    testBatchLen = winSz * testNumWin

    inp = tf.placeholder(tf.int32, shape=[batchSz, winSz])
    # ans = tf.placeholder(tf.int32, shape=[batchSz * winSz])
    ans = tf.placeholder(tf.int32, shape=[batchSz, winSz])

    E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1))
    embed = tf.nn.embedding_lookup(E, inp)

    rnn = LSTMCell(rnnSz)
    initialState = rnn.zero_state(batchSz, tf.float32)
    output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState)
    # output = tf.reshape(output, [batchSz * winSz, rnnSz])

    W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1))
    B = tf.Variable(tf.random_normal([vocabSz], stddev=.1))
    # logits = tf.matmul(output, W) + B
    logits = tf.tensordot(output, W, [[2], [0]]) + B

    ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans)
    loss = tf.reduce_sum(ents)
    train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss)

    trainPerp = np.zeros(epochs, dtype=np.float32)
    testPerp = np.zeros(epochs, dtype=np.float32)
    with tf.Session() as sess:
        startTime = time.time()
        sess.run(tf.global_variables_initializer())
        epoch = 0
        print('epoch:', end=' ')
        while epoch < epochs:
            win = 0
            inState = sess.run(initialState)
            testState = sess.run(initialState)
            # print(inState, testState)
            winStart, winEnd = 0, winSz
            while win < numWin:
                inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd]
                                  for i in range(batchSz)])
                # inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1:i * batchLen + winEnd + 1]
                #                              for i in range(batchSz)]), batchSz * winSz)
                inAns = np.array([trainData[i * batchLen + winStart + 1:i * batchLen + winEnd + 1]
                                  for i in range(batchSz)])
                _, inState, outLoss = sess.run([train, nextState, loss],
                                               {inp: inInp, ans: inAns, nextState: inState})
                trainPerp[epoch] += outLoss
                if win < testNumWin:
                    inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd]
                                      for i in range(batchSz)])
                    # inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1:i * testBatchLen + winEnd + 1]
                    #                              for i in range(batchSz)]), batchSz * winSz)
                    inAns = np.array([testData[i * testBatchLen + winStart + 1:i * testBatchLen + winEnd + 1]
                                      for i in range(batchSz)])
                    testState, testOutLoss = sess.run([nextState, loss],
                                                      {inp: inInp, ans: inAns, nextState: testState})
                    testPerp[epoch] += testOutLoss
                winStart, winEnd = winEnd, winEnd + winSz
                win += 1
            epoch += 1
            print(epoch, end=' ')
        trainPerp = np.exp(trainPerp / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen)))
        testPerp = np.exp(testPerp / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen)))
        print(f'\nelapsed: {time.time() - startTime}')
        print('train perplexity:', trainPerp[-1])
        print('test perplexity:', testPerp[-1])

        info = {'style': 'lstm', 'batch size': batchSz, 'embed size': embedSz,
                'rnn size': rnnSz, 'win size': winSz, 'learning rate': learnRate,
                'epochs': epochs,
                'train perplexity': trainPerp[-1], 'test perplexity': testPerp[-1]}
        if saveResult:
            save(sess, info)
        drawPerplexity(trainPerp, testPerp)
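# The tensordot above is the 3-D equivalent of the commented-out
# reshape-then-matmul: a minimal sketch showing the two produce the same
# values (the shapes here are illustrative).
out3d = tf.random_normal([10, 5, 512])
W_ = tf.random_normal([512, 7])
a = tf.tensordot(out3d, W_, [[2], [0]])                            # (10, 5, 7)
b = tf.reshape(tf.matmul(tf.reshape(out3d, [50, 512]), W_), [10, 5, 7])
# a and b are element-wise equal up to float round-off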