def __init__(self, rnn_size, num_layers, batch_size, seq_length, vocab_size, grad_clip,
             infer=False):
    """
    Build the TensorFlow graph of a stacked-GRU language model.

    @param rnn_size: Number of units per recurrent cell; also the embedding width
    @param num_layers: Number of times the cell is repeated inside the MultiRNNCell
    @param batch_size: Number of sequences per batch (overridden to 1 when infer is true)
    @param seq_length: Number of time steps per sequence (overridden to 1 when infer is true)
    @param vocab_size: Number of symbols in the vocabulary
    @param grad_clip: Global norm the gradients are clipped to before the Adam update
    @param infer: When true, the placeholders are sized for a single symbol and the
                  decoder is handed the greedy feedback function defined below
    """
    #TODO: During training, (and when sampling), the input to the RNN should be
    #      the list of ingredients that goes with that recipe text.
    if infer:
        batch_size, seq_length = 1, 1

    gru = rnn_cell.GRUCell(rnn_size)
    stacked = rnn_cell.MultiRNNCell([gru] * num_layers)
    self.cell = stacked

    placeholder_shape = [batch_size, seq_length]
    self.input_data = tf.placeholder(tf.int32, placeholder_shape)
    self.targets = tf.placeholder(tf.int32, placeholder_shape)
    self.initial_state = stacked.zero_state(batch_size, tf.float32)

    with tf.variable_scope("rnnlm"):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One [batch, rnn_size] tensor per time step.
            inputs = [tf.squeeze(step, [1])
                      for step in tf.split(1, seq_length, embedded)]

    def feed_back_argmax(prev_output, _):
        # Project the previous output onto the vocabulary and embed the
        # highest-scoring symbol; no gradient flows through the arg-max.
        scores = tf.matmul(prev_output, softmax_w) + softmax_b
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked,
        loop_function=feed_back_argmax if infer else None,
        scope="rnnlm")
    flat_output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])

    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([batch_size * seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [uniform_weights], vocab_size)
    # Mean loss per symbol.
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state

    # Learning rate lives in a non-trainable variable so it can be assigned later.
    self.lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable), grad_clip)
    adam = tf.train.AdamOptimizer(self.lr)
    self.train_op = adam.apply_gradients(zip(clipped, trainable))
def build_graph(self, test):
    """
    Builds an LSTM graph in TensorFlow.

    When `test` is truthy the batch size and sequence length stored on the
    instance are overwritten with 1 and the decoder is given the greedy
    feedback function; otherwise the decoder receives no loop function.
    """
    if test:
        self.batch_size = 1
        self.seq_len = 1

    base_cell = rnn_cell.BasicLSTMCell(self.cell_size)
    self.cell = rnn_cell.MultiRNNCell([base_cell] * self.num_layers)

    io_shape = [self.batch_size, self.seq_len]
    self.inputs = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

    with tf.variable_scope('lstm_vars'):
        self.ws = tf.get_variable('ws', [self.cell_size, self.vocab_size])
        self.bs = tf.get_variable('bs', [self.vocab_size])
        with tf.device('/cpu:0'):
            self.embeddings = tf.get_variable(
                'embeddings', [self.vocab_size, self.cell_size])
            embedded = tf.nn.embedding_lookup(self.embeddings, self.inputs)
            # One tensor per time step, with the length-1 time axis removed.
            step_inputs = [tf.squeeze(step, [1])
                           for step in tf.split(1, self.seq_len, embedded)]

    def feed_back_argmax(prev_output, _):
        # Embed the arg-max symbol of the previous step's projected output.
        scores = tf.matmul(prev_output, self.ws) + self.bs
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(self.embeddings, best_symbol)

    loop_fn = feed_back_argmax if test else None
    step_outputs, self.final_state = seq2seq.rnn_decoder(
        step_inputs, self.initial_state, self.cell,
        loop_function=loop_fn, scope='lstm_vars')
    flat_outputs = tf.reshape(tf.concat(1, step_outputs), [-1, self.cell_size])

    logits = tf.matmul(flat_outputs, self.ws) + self.bs
    self.probs = tf.nn.softmax(logits)

    flat_targets = tf.reshape(self.targets, [-1])
    weights = tf.ones([self.batch_size * self.seq_len])
    per_example_loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [weights], self.vocab_size)
    # Mean loss per symbol.
    self.loss = tf.reduce_sum(per_example_loss) / self.batch_size / self.seq_len

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.optimizer = tf.train.AdamOptimizer(learning_rate=c.L_RATE, name='optimizer')
    self.train_op = self.optimizer.minimize(
        self.loss, global_step=self.global_step, name='train_op')
def build_decoder_rnn(self, first_step):
    """
    Build a single decoder step inside the "rnnlm" variable scope.

    With `first_step` truthy the step input is the projected `self.fc7`
    feature and the state starts from zeros. Otherwise the input is the
    embedding of `self.decoder_prev_word` and the state is fed through
    `self.decoder_initial_state`; both placeholders are created here.

    Returns a two-element list: [softmax probabilities, new decoder state].
    """
    with tf.variable_scope("rnnlm"):
        if first_step:
            step_input = tf.matmul(self.fc7, self.encode_img_W) + self.encode_img_b
        else:
            self.decoder_prev_word = tf.placeholder(tf.int32, [None])
            step_input = tf.nn.embedding_lookup(self.Wemb, self.decoder_prev_word)

        # Batch size is taken from the step input at run time.
        self.batch_size = tf.shape(step_input)[0]

        tf.get_variable_scope().reuse_variables()
        self.decoder_cell = rnn_cell.MultiRNNCell(
            [self.basic_cell] * self.opt.num_layers, state_is_tuple=False)
        state_size = self.decoder_cell.state_size

        if first_step:
            start_state = self.decoder_cell.zero_state(self.batch_size, tf.float32)
        else:
            start_state = tf.placeholder(tf.float32, [None, state_size])
            self.decoder_initial_state = start_state

        step_outputs, next_state = seq2seq.rnn_decoder(
            [step_input], start_state, self.decoder_cell)
        logits = tf.matmul(step_outputs[0], self.embed_word_W) + self.embed_word_b
        decoder_probs = tf.reshape(
            tf.nn.softmax(logits), [self.batch_size, self.vocab_size + 1])

        return [decoder_probs, next_state]
def build_generator(self):
    """
    Build the greedy generation graph inside the "rnnlm" variable scope.

    The decoder is fed the projected image feature at step 0 and zero
    tensors afterwards; from step 2 on, the loop function replaces the input
    with the embedding of the previous step's arg-max word. Results are
    stored on `self.g_output`, `self.g_logits`, `self.g_probs` and
    `self.generator`.
    """
    with tf.variable_scope("rnnlm"):
        image_emb = tf.matmul(self.fc7, self.encode_img_W) + self.encode_img_b

        n_steps = self.seq_length + 1
        blank = tf.zeros([self.batch_size, n_steps, self.input_encoding_size])
        blank_steps = [tf.squeeze(step, [1]) for step in tf.split(1, n_steps, blank)]
        rnn_inputs = [image_emb] + blank_steps

        initial_state = self.cell.zero_state(self.batch_size, tf.float32)

        # Always pick the word with largest probability as the input of next time step
        def pick_best_word(prev_output, step):
            if step == 1:
                # Step 1 keeps the prepared input instead of feeding back.
                return rnn_inputs[1]
            scores = tf.matmul(prev_output, self.embed_word_W) + self.embed_word_b
            best_word = tf.stop_gradient(tf.argmax(scores, 1))
            return tf.nn.embedding_lookup(self.Wemb, best_word)

        tf.get_variable_scope().reuse_variables()
        outputs, last_state = seq2seq.rnn_decoder(
            rnn_inputs, initial_state, self.cell, loop_function=pick_best_word)

        # outputs[1:], because we don't calculate loss on time 0.
        output = tf.reshape(tf.concat(1, outputs[1:]), [-1, self.rnn_size])
        self.g_output = output

        logits = tf.matmul(output, self.embed_word_W) + self.embed_word_b
        self.g_logits = logits

        probs = tf.reshape(
            tf.nn.softmax(logits),
            [self.batch_size, self.seq_length + 1, self.vocab_size + 1])
        self.g_probs = probs

        self.generator = tf.argmax(probs, 2)
def _create_encoder(self, args):
    """
    Build the LSTM encoder and its linear head.

    Reads `args.encoder_size`, `args.num_encoder_layers`, `args.batch_size`
    and `args.z_dim`. Sets `self.full_lstm`, `self.lstm_state`,
    `self.final_state`, `self.z_mean` and `self.z_logstd`.
    """
    # Recurrent part: one LSTM cell repeated num_encoder_layers times.
    base_cell = rnn_cell.LSTMCell(args.encoder_size,
                                  state_is_tuple=True,
                                  initializer=initializers.xavier_initializer())
    self.full_lstm = rnn_cell.MultiRNNCell(
        [base_cell] * args.num_encoder_layers, state_is_tuple=True)
    self.lstm_state = self.full_lstm.zero_state(args.batch_size, tf.float32)

    # Single-step forward pass over the concatenated states and actions.
    encoder_input = tf.concat(1, [self.states_encode, self.actions_encode])
    step_outputs, self.final_state = seq2seq.rnn_decoder(
        [encoder_input], self.lstm_state, self.full_lstm)
    features = tf.reshape(tf.concat(1, step_outputs), [-1, args.encoder_size])

    # Linear layer producing 2 * z_dim values per row.
    latent_w = tf.get_variable("latent_w",
                               [args.encoder_size, 2 * args.z_dim],
                               initializer=initializers.xavier_initializer())
    latent_b = tf.get_variable("latent_b", [2 * args.z_dim])
    latent_params = tf.nn.xw_plus_b(features, latent_w, latent_b)

    # First half is the mean, second half the log standard deviation.
    self.z_mean, self.z_logstd = tf.split(1, 2, latent_params)
def _create_lstm_policy(self, args):
    """
    Build the LSTM policy network.

    Reads `args.policy_size`, `args.num_policy_layers`, `args.batch_size`
    and `args.action_dim`. Sets `self.full_lstm`, `self.lstm_state`,
    `self.final_state`, `self.a_mean` and `self.a_logstd`.
    """
    # Recurrent part: one LSTM cell repeated num_policy_layers times.
    base_cell = rnn_cell.LSTMCell(args.policy_size,
                                  state_is_tuple=True,
                                  initializer=initializers.xavier_initializer())
    self.full_lstm = rnn_cell.MultiRNNCell(
        [base_cell] * args.num_policy_layers, state_is_tuple=True)
    self.lstm_state = self.full_lstm.zero_state(args.batch_size, tf.float32)

    # Single-step forward pass over the current states.
    step_outputs, self.final_state = seq2seq.rnn_decoder(
        [self.states], self.lstm_state, self.full_lstm)
    features = tf.reshape(tf.concat(1, step_outputs), [-1, args.policy_size])

    # Linear layer mapping the LSTM output to the action mean.
    lstm_w = tf.get_variable("lstm_w",
                             [args.policy_size, args.action_dim],
                             initializer=initializers.xavier_initializer())
    lstm_b = tf.get_variable("lstm_b", [args.action_dim])
    self.a_mean = tf.nn.xw_plus_b(features, lstm_w, lstm_b)

    # Log standard deviation is a free variable initialised to zeros.
    self.a_logstd = tf.Variable(np.zeros(args.action_dim),
                                name="a_logstd", dtype=tf.float32)
def __init__(self, args, infer=False):
    """
    Build the graph of a character/word-level RNN language model.

    `args` supplies `model` ('rnn', 'gru' or 'lstm'), `rnn_size`,
    `num_layers`, `batch_size`, `seq_length`, `vocab_size`, `dropout` and
    `grad_clip`. `args.dropout` is a drop probability: the cell wrapper keeps
    outputs with probability `1.0 - args.dropout`, and the embedded inputs
    now use the same keep probability.

    When `infer` is true, `args.batch_size` and `args.seq_length` are
    overwritten with 1 (the caller's `args` object is mutated), dropout is
    disabled, the greedy feedback function is passed to the decoder and no
    training ops are created.

    Raises Exception for an unsupported `args.model`.
    """
    self.args = args
    training = not infer
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    use_dropout = training and args.dropout > 0
    keep_prob = 1.0 - args.dropout

    cell = cell_fn(args.rnn_size)
    if use_dropout:
        cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            self.embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
            if use_dropout:
                # tf.nn.dropout takes the KEEP probability; the original
                # passed the drop probability here by mistake.
                inputs = tf.nn.dropout(inputs, keep_prob)
            inputs = tf.split(1, args.seq_length, inputs)
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Embed the arg-max symbol of the previous step's projected output.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # Mean loss per symbol.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    if training:
        # Learning rate lives in a non-trainable variable so it can be assigned later.
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                          args.grad_clip)
        optimizer = tf.train.AdamOptimizer(self.lr)
        self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """
    Build the graph of an RNN language model with an optional attention decoder.

    `args` supplies `rnncell` ('rnn', 'gru' or 'lstm'), `rnn_size`,
    `num_layers`, `batch_size`, `seq_length`, `vocab_size`, `embedding_size`,
    `attention`, `grad_clip` and `log_dir`. When `infer` is true,
    `args.batch_size` and `args.seq_length` are overwritten with 1 (the
    caller's `args` object is mutated) and the greedy feedback function is
    passed to the decoder.

    Raises Exception for an unsupported `args.rnncell`.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.rnncell == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.rnncell == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.rnncell == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("rnncell type not supported: {}".format(args.rnncell))

    cell = cell_fn(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    # Fixed shape of the attention memory fed through a placeholder.
    self.attn_length = 5
    self.attn_size = 32
    self.attention_states = tf.placeholder(
        tf.float32, [args.batch_size, self.attn_length, self.attn_size])

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size], name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        word_embedding = build_weight([args.vocab_size, args.embedding_size],
                                      name='word_embedding')
        inputs_list = tf.split(1, args.seq_length,
                               tf.nn.embedding_lookup(word_embedding, self.input_data))
        inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

    def loop(prev, _):
        # Embed the arg-max symbol of the previous step's projected output.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # The original referenced an undefined name `embedding` here, which
        # raised NameError as soon as the decoder invoked the loop function.
        return tf.nn.embedding_lookup(word_embedding, prev_symbol)

    loop_function = loop if infer else None
    if not args.attention:
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list, self.initial_state, self.cell,
            loop_function=loop_function, scope='rnnlm')
    else:
        outputs, last_state = attention_decoder(
            inputs_list, self.initial_state, self.attention_states, self.cell,
            loop_function=loop_function, scope='rnnlm')

    self.final_state = last_state
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # average loss for each word of each timestep
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    # Learning rate lives in a non-trainable variable so it can be assigned later.
    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()

    # Log file name: log_dir followed by a timestamp with spaces and slashes
    # stripped from the timestamp part only.
    self.logfile = args.log_dir+str(datetime.datetime.strftime(datetime.datetime.now(),'%Y-%m-%d %H:%M:%S')+'.txt').replace(' ','').replace('/','')
    self.var_op = tf.global_variables()
    self.saver = tf.train.Saver(self.var_op, max_to_keep=4,
                                keep_checkpoint_every_n_hours=1)
def basic_rnn_seq2seq_with_loop_function(
        encoder_inputs, decoder_inputs, cell, dtype=dtypes.float32,
        loop_function=None, scope=None):
    """Basic RNN sequence-to-sequence model with a decoder loop function.

    Runs `rnn.rnn` over `encoder_inputs`, then starts `rnn_decoder` from the
    encoder's final state using the same `cell`, forwarding `loop_function`.

    Returns whatever `rnn_decoder` returns.
    """
    scope_name = scope or "basic_rnn_seq2seq_with_loop_function"
    with variable_scope.variable_scope(scope_name):
        _unused_outputs, final_encoder_state = rnn.rnn(
            cell, encoder_inputs, dtype=dtype)
        decoded = rnn_decoder(decoder_inputs, final_encoder_state, cell,
                              loop_function=loop_function)
        return decoded
def build_model(self):
    """
    Build the training graph: decoder, loss, and two Adam optimizers.

    One optimizer updates the trainable variables collected under scope
    'rnnlm' with learning rate `self.lr`; the other updates those under
    'vgg16' with `self.cnn_lr`. Gradients are clipped element-wise to
    [-grad_clip, grad_clip]. Scalar summaries for the loss and both learning
    rates are merged into `self.summaries`.
    """
    with tf.name_scope("batch_size"):
        # Batch size is taken from the image tensor at run time.
        self.batch_size = tf.shape(self.images)[0]

    n_steps = self.seq_length + 1
    clip = self.opt.grad_clip

    with tf.variable_scope("rnnlm"):
        image_emb = tf.matmul(self.fc7, self.encode_img_W) + self.encode_img_b

        # Replicate self.seq_per_img times for each image embedding
        tiled = tf.tile(tf.expand_dims(image_emb, 1), [1, self.seq_per_img, 1])
        image_emb = tf.reshape(
            tiled,
            [self.batch_size * self.seq_per_img, self.input_encoding_size])

        word_emb = tf.nn.embedding_lookup(self.Wemb, self.labels[:, :n_steps])
        word_steps = [tf.squeeze(step, [1]) for step in tf.split(1, n_steps, word_emb)]
        rnn_inputs = [image_emb] + word_steps

        initial_state = self.cell.zero_state(
            self.batch_size * self.seq_per_img, tf.float32)

        outputs, last_state = seq2seq.rnn_decoder(
            rnn_inputs, initial_state, self.cell, loop_function=None)

        # Step 0 (the image input) is skipped.
        self.logits = [tf.matmul(step_out, self.embed_word_W) + self.embed_word_b
                       for step_out in outputs[1:]]

    with tf.variable_scope("loss"):
        # self.labels[:,1:] is the target
        target_steps = [tf.squeeze(label, [1])
                        for label in tf.split(1, n_steps, self.labels[:, 1:])]
        mask_steps = [tf.squeeze(mask, [1])
                      for mask in tf.split(1, n_steps, self.masks[:, 1:])]
        loss = seq2seq.sequence_loss_by_example(self.logits, target_steps, mask_steps)
        self.cost = tf.reduce_mean(loss)

    self.final_state = last_state
    # Learning rates live in non-trainable variables so they can be assigned later.
    self.lr = tf.Variable(0.0, trainable=False)
    self.cnn_lr = tf.Variable(0.0, trainable=False)

    # Collect the rnn variables, and create the optimizer of rnn
    rnn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='rnnlm')
    rnn_optimizer = tf.train.AdamOptimizer(self.lr, beta1=0.8)
    rnn_grads = rnn_optimizer.compute_gradients(self.cost, rnn_vars)
    rnn_grads_clipped = [(tf.clip_by_value(grad, -clip, clip), var)
                         for grad, var in rnn_grads if grad is not None]
    self.train_op = rnn_optimizer.apply_gradients(rnn_grads_clipped)

    # Collect the cnn variables, and create the optimizer of cnn
    cnn_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='vgg16')
    cnn_optimizer = tf.train.AdamOptimizer(self.cnn_lr, beta1=0.8)
    cnn_grads = cnn_optimizer.compute_gradients(self.cost, cnn_vars)
    cnn_grads_clipped = [(tf.clip_by_value(grad, -clip, clip), var)
                         for grad, var in cnn_grads if grad is not None]
    self.cnn_train_op = cnn_optimizer.apply_gradients(cnn_grads_clipped)

    tf.scalar_summary('training loss', self.cost)
    tf.scalar_summary('learning rate', self.lr)
    tf.scalar_summary('cnn learning rate', self.cnn_lr)

    self.summaries = tf.merge_all_summaries()
def decoder(cell, dec_outputs, states, scope):
    """
    Run `seq2seq.rnn_decoder` once per entry of `states` and pool the outputs.

    Every run receives the same `dec_outputs` and `cell`; variables created
    by the first run are reused by the later ones. Returns one flat list with
    the per-step outputs of all runs, in order of `states`.
    """
    collected = []
    with variable_scope.variable_scope(scope):
        for position, start_state in enumerate(states):
            if position > 0:
                # Share the variables created by the first decoder run.
                variable_scope.get_variable_scope().reuse_variables()
            step_outputs, _ = seq2seq.rnn_decoder(dec_outputs, start_state, cell)
            collected.extend(step_outputs)
    return collected
def discriminate_wv(self, input_data_wv):
    """
    Run the 'DISC' recurrent network over `input_data_wv` and project the
    final step's output through `self.fc_layer`.

    Variables are reused on every call after the first, tracked through
    `self.has_init_seq2seq`. Returns the projected tensor.
    """
    reuse_existing = self.has_init_seq2seq
    with tf.variable_scope('DISC', reuse=reuse_existing) as disc_scope:
        self.has_init_seq2seq = True
        step_outputs, _final_state = seq2seq.rnn_decoder(
            input_data_wv, self.initial_state, self.cell, scope=disc_scope)
        last_output = step_outputs[-1]
        predicted_classes_wv = tf.matmul(last_output, self.fc_layer)
    return predicted_classes_wv
def generate(self):
    """
    Build the 'GEN' decoder and return its L2-normalised per-step outputs.

    The embedded input is split per time step and L2-normalised along axis 1
    before decoding. The loop function feeds each step's output straight
    back as the next input. Variables are reused on every call after the
    first, tracked through `self.has_init_seq2seq`.

    Returns a list of tensors, which is also stored on `self.outputs`.

    Raises Exception when `self.args.num_layers` is neither 1 nor 2.
    """
    embedded = tf.nn.embedding_lookup(self.embedding, self.input_data)
    squeezed = [tf.squeeze(input_, [1])
                for input_ in tf.split(1, self.args.seq_length, embedded)]
    # Real lists instead of map(): on Python 3 map() yields a one-shot
    # iterator, so the value stored on self.outputs and the returned value
    # would have shared a single exhaustible iterator.
    inputs = [tf.nn.l2_normalize(step, 1) for step in squeezed]

    def loop(prev, i):
        # Feed the previous output back unchanged.
        return prev

    with tf.variable_scope('GEN', reuse=self.has_init_seq2seq) as scope:
        self.has_init_seq2seq = True
        if self.args.num_layers == 1:
            initial_state = [self.initial_state1]
        elif self.args.num_layers == 2:
            initial_state = [self.initial_state1, self.initial_state2]
        else:
            raise Exception('Unsupported number of layers. Use 1 or 2 layers for now..')
        outputs, last_state = seq2seq.rnn_decoder(
            inputs, initial_state, self.cell, loop_function=loop, scope=scope)

    outputs = [tf.nn.l2_normalize(o, 1) for o in outputs]
    self.outputs = outputs
    return outputs
def build_network(self):
    """
    Build a recurrent variational auto-encoder graph.

    Encoder: runs `self._enc_cell` over `self.inputs` and maps the final
    state to `self.z_mean` and `self.z_log_var`; `self.z` is sampled as
    mean + sqrt(exp(log_var)) * eps with eps drawn from a standard normal.

    Decoder: runs `self._dec_cell` twice with shared variables, once from
    `self.z` over `self.inputs` (`self.output`) and once from the
    placeholder `self.z_gen` over constant 0.5 inputs (`self.gen_output`).

    Finally sets `self.inp`, `self.train_loss` and the Adam step `self.train`.
    """
    float32 = tf.float32

    with tf.variable_scope('encoder'):
        enc_state_size = self._enc_cell.state_size
        z_mean_w = tf.Variable(self.initializer([enc_state_size, self.n_latent]))
        z_mean_b = tf.Variable(tf.zeros([self.n_latent], dtype=float32))
        z_logvar_w = tf.Variable(self.initializer([enc_state_size, self.n_latent]))
        z_logvar_b = tf.Variable(tf.zeros([self.n_latent], dtype=float32))

        _, enc_state = rnn.rnn(self._enc_cell, self.inputs, dtype=float32)

        self.z_mean = tf.add(tf.matmul(enc_state, z_mean_w), z_mean_b)
        self.z_log_var = tf.add(tf.matmul(enc_state, z_logvar_w), z_logvar_b)

        noise = tf.random_normal((self.batch_size, self.n_latent), 0, 1, dtype=float32)
        std = tf.sqrt(tf.exp(self.z_log_var))
        self.z = tf.add(self.z_mean, tf.mul(std, noise))

    with tf.variable_scope('decoder') as scope:
        dec_state_size = self._dec_cell.state_size
        dec_in_w = tf.Variable(
            self.initializer([self.n_latent, dec_state_size], dtype=float32))
        dec_in_b = tf.Variable(tf.zeros([dec_state_size], dtype=float32))
        dec_out_w = tf.Variable(
            self.initializer([self.n_hidden, self.elem_num], dtype=float32))
        dec_out_b = tf.Variable(tf.zeros([self.elem_num], dtype=float32))

        # Reconstruction path: decoder state initialised from the sampled z.
        train_state = self.transfer_func(
            tf.add(tf.matmul(self.z, dec_in_w), dec_in_b))
        train_steps, _ = seq2seq.rnn_decoder(self.inputs, train_state, self._dec_cell)
        if self.reverse:
            train_steps = train_steps[::-1]
        # Stack the steps and move the batch axis to the front.
        train_stacked = tf.transpose(tf.pack(train_steps), [1, 0, 2])

        # Output weights tiled once per batch element for batch_matmul.
        batch_dec_out_w = tf.tile(tf.expand_dims(dec_out_w, 0),
                                  [self.batch_size, 1, 1])
        self.output = tf.nn.sigmoid(
            tf.batch_matmul(train_stacked, batch_dec_out_w) + dec_out_b)

        # Generation path shares the decoder variables created above.
        scope.reuse_variables()
        gen_inputs = [0.5 * tf.ones([self.batch_size, self.elem_num], dtype=float32)
                      for _ in range(self.step_num)]
        self.z_gen = tf.placeholder(float32, [self.batch_size, self.n_latent])
        gen_state = self.transfer_func(
            tf.add(tf.matmul(self.z_gen, dec_in_w), dec_in_b))
        gen_steps, _ = seq2seq.rnn_decoder(gen_inputs, gen_state, self._dec_cell)
        if self.reverse:
            gen_steps = gen_steps[::-1]
        gen_stacked = tf.transpose(tf.pack(gen_steps), [1, 0, 2])
        self.gen_output = tf.nn.sigmoid(
            tf.batch_matmul(gen_stacked, batch_dec_out_w) + dec_out_b)

    self.inp = tf.transpose(tf.pack(self.inputs), [1, 0, 2])
    self.train_loss = self.get_loss()
    self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.train_loss)
def __init__(self, args, infer=False):
    """
    Build the graph of an RNN language model with an entropy summary.

    `args` supplies `model` ('rnn', 'gru' or 'lstm'), `rnn_size`,
    `num_layers`, `batch_size`, `seq_length`, `vocab_size` and `grad_clip`.
    When `infer` is true, `args.batch_size` and `args.seq_length` are
    overwritten with 1 (the caller's `args` object is mutated) and the
    greedy feedback function is passed to the decoder.

    Raises Exception for an unsupported `args.model`.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Only the LSTM cell takes `state_is_tuple`; the original passed it to
    # every cell class, so 'rnn' and 'gru' failed with TypeError.
    if args.model == 'rnn':
        cell = rnn_cell.BasicRNNCell(args.rnn_size)
    elif args.model == 'gru':
        cell = rnn_cell.GRUCell(args.rnn_size)
    elif args.model == 'lstm':
        cell = rnn_cell.BasicLSTMCell(args.rnn_size, state_is_tuple=True)
    else:
        raise Exception("model type not supported: {}".format(args.model))

    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers,
                                             state_is_tuple=True)

    self.input_data = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32, [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            inputs = tf.split(1, args.seq_length,
                              tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Embed the arg-max symbol of the previous step's projected output.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # Mean loss per symbol.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    # Dividing by ln(2) expresses the cost in bits.
    self.empirical_entropy = self.cost/np.log(2)
    tf.summary.scalar('Empircal_Entropy', self.empirical_entropy)
    self.final_state = last_state

    # Learning rate lives in a non-trainable variable so it can be assigned later.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.merged_summaries = tf.summary.merge_all()
def __init__(self, args, infer=False):
    """
    Build the graph of a stacked-LSTM language model.

    `args` supplies `rnn_size`, `num_layers`, `batch_size`, `seq_length`,
    `vocab_size` and `grad_clip`. When `infer` is true, `args.batch_size`
    and `args.seq_length` are overwritten with 1 (the caller's `args` object
    is mutated) and the greedy feedback function is passed to the decoder.
    """
    self.args = args
    if infer:
        # When we sample, the batch size and sequence length are both 1
        args.batch_size = 1
        args.seq_length = 1

    # Internal cell structure: one BasicLSTMCell repeated num_layers times
    lstm = rnn_cell.BasicLSTMCell(args.rnn_size, state_is_tuple=True)
    stacked = rnn_cell.MultiRNNCell([lstm] * args.num_layers, state_is_tuple=True)
    self.cell = stacked

    # Input and target placeholders, plus an all-zero starting state
    io_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, io_shape)
    self.targets = tf.placeholder(tf.int32, io_shape)
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        # Output projection weights and bias
        softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, with the length-1 time axis removed
            inputs = [tf.squeeze(step, [1])
                      for step in tf.split(1, args.seq_length, embedded)]

    def feed_back_argmax(prev_output, _):
        # Embed the arg-max symbol of the previous step's projected output
        scores = tf.matmul(prev_output, softmax_w) + softmax_b
        best_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, best_symbol)

    loop_fn = feed_back_argmax if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, stacked, loop_function=loop_fn, scope='rnnlm')
    flat_output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])

    self.logits = tf.matmul(flat_output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [weights], args.vocab_size)
    # Mean loss per symbol
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate lives in a non-trainable variable so it can be assigned later
    self.lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, trainable), args.grad_clip)
    adam = tf.train.AdamOptimizer(self.lr)
    self.train_op = adam.apply_gradients(zip(clipped, trainable))
def __init__(self, rnn_size, num_layers, batch_size, seq_length, vocabulary_size,
             gradient_clip, sample=False):
    """
    Build the graph of a stacked-LSTM language model.

    rnn_size:        number of units per LSTM cell; also the embedding width
    num_layers:      number of times the cell is repeated in the MultiRNNCell
    batch_size:      number of sequences per batch
    seq_length:      number of time steps per sequence
    vocabulary_size: number of symbols in the vocabulary
    gradient_clip:   global norm the gradients are clipped to
    sample:          when true, the decoder is given the greedy feedback
                     function defined below
    """
    lstm_cell = rnn_cell.BasicLSTMCell(num_units=rnn_size)

    # create the RNN cell, that is constructed from multiple lstm cells,
    # by duplicating the lstm cell
    self.cell = rnn_cell.MultiRNNCell([lstm_cell] * num_layers)

    # Initial state is a matrix of zeros
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    # Placeholders for the input symbols and the target symbols
    self.input_data = tf.placeholder(tf.int32, [batch_size, seq_length])
    self.targets = tf.placeholder(tf.int32, [batch_size, seq_length])

    # variable_scope is tensorflow best practice that allows us to recycle
    # variables names with different scopes
    with tf.variable_scope(VARIABLE_SCOPE):
        softmax_w = tf.get_variable("softmax_w", [rnn_size, vocabulary_size])
        softmax_b = tf.get_variable("softmax_b", [vocabulary_size])

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocabulary_size, rnn_size])
            inputs = tf.split(1, seq_length,
                              tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop_function(prev, _):
        # Embed the arg-max symbol of the previous step's projected output.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        stop_gradient = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, stop_gradient)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, self.cell,
        loop_function=loop_function if sample else None,
        scope=VARIABLE_SCOPE)
    # `tf.result_sentencehape` in the original was a mangled `tf.reshape`
    # and raised AttributeError during graph construction.
    output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])

    # Calculate the logits and probabilities for the tensor
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probabilities = tf.nn.softmax(self.logits)

    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([batch_size * seq_length])],
        vocabulary_size)
    # Mean loss per symbol.
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state

    # Learning rate lives in a non-trainable variable so it can be assigned later.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars), gradient_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def _create_lstm_policy(self, args):
    """
    LSTM policy conditioned on sampled latent variables (currently disabled).

    The method raises NotImplementedError unconditionally; everything below
    the raise is an unreachable draft kept for reference.
    """
    raise NotImplementedError

    # ---- unreachable draft implementation ----
    n_rows = args.batch_size * args.sample_size

    # Recurrent part: one LSTM cell repeated num_policy_layers times.
    base_cell = rnn_cell.LSTMCell(args.policy_size,
                                  state_is_tuple=True,
                                  initializer=initializers.xavier_initializer())
    self.policy_lstm = rnn_cell.MultiRNNCell(
        [base_cell] * args.num_policy_layers, state_is_tuple=True)
    self.policy_state = self.policy_lstm.zero_state(n_rows, tf.float32)

    # Get samples from standard normal distribution, transform to match z-distribution
    unit_samples = tf.random_normal(
        [args.sample_size, args.batch_size, args.z_dim], name="z_samples")
    self.z_samples = unit_samples * tf.exp(self.z_logstd) + self.z_mean
    self.z_samples = tf.transpose(self.z_samples, perm=[1, 0, 2])

    # Construct policy input: states concatenated with the latent samples.
    policy_input = tf.concat(2, [self.states, self.z_samples])
    policy_input = tf.reshape(policy_input,
                              [n_rows, args.state_dim + args.z_dim],
                              name="policy_input")

    # Forward pass
    with tf.variable_scope("policy"):
        step_outputs, self.final_policy_state = seq2seq.rnn_decoder(
            [policy_input], self.policy_state, self.policy_lstm)
    features = tf.reshape(tf.concat(1, step_outputs), [-1, args.policy_size])

    # Linear layer mapping the LSTM output to the action mean.
    lstm_w = tf.get_variable("lstm_w",
                             [args.policy_size, args.action_dim],
                             initializer=initializers.xavier_initializer())
    lstm_b = tf.get_variable("lstm_b", [args.action_dim])
    flat_mean = tf.nn.xw_plus_b(features, lstm_w, lstm_b)
    self.a_mean = tf.reshape(
        flat_mean, [args.batch_size, args.sample_size, args.action_dim],
        name="a_mean")

    # Initialize logstd
    self.a_logstd = tf.Variable(np.zeros(args.action_dim),
                                name="a_logstd", dtype=tf.float32)
def basic_decoder(batch_input_shape, cells, code, keep_prob, **kwargs):
    """
    Build a one-cell recurrent decoder that reconstructs a sequence from `code`.

    batch_input_shape: (batch_size, timestep, feature) triple
    cells:             list holding exactly one RNN cell
    code:              tensor fed (after dropout) as the first decoder input
    keep_prob:         keep probability for every dropout op
    kwargs['peek']:    when truthy, the dropped-out code is added to the
                       previous output that is fed back at each step

    Returns the stacked per-step projections, transposed with perm [1, 0, 2].
    """
    # Receive arguments
    batch_size, timestep, feature = batch_input_shape
    peek = kwargs['peek']
    assert len(cells) == 1, "One cell needed!"
    (de_cell,) = cells

    hidden_dim = de_cell.output_size

    # Code with dropout applied; its second dimension sizes the placeholders.
    code_dropout = tf.nn.dropout(code, keep_prob)
    code_dim = int(code_dropout.get_shape()[1])

    # The remaining decoder inputs are placeholders; the decoder overrides
    # them with the loop function's return value.
    placeholder_inputs = [
        tf.placeholder(tf.float32, shape=[batch_size, code_dim])
        for _ in range(timestep - 1)
    ]
    decoder_inputs = [code_dropout]
    decoder_inputs += [tf.nn.dropout(inp, keep_prob) for inp in placeholder_inputs]

    def feed_back(prev, i):
        # Output as input, optionally "peeking" at the code every step.
        return (prev + code_dropout) if peek else prev

    start_state = de_cell.zero_state(batch_size, tf.float32)
    decoder_outputs, decoder_state = seq2seq.rnn_decoder(
        decoder_inputs, start_state, de_cell, loop_function=feed_back)

    W_out = tf.get_variable("W_out",
                            shape=[hidden_dim, feature],
                            initializer=tf.contrib.layers.xavier_initializer())
    # NOTE(review): b_out is created but never added to the projection below.
    b_out = tf.Variable(tf.zeros([feature]))

    projected_steps = [tf.matmul(tf.nn.dropout(step_out, keep_prob), W_out)
                       for step_out in decoder_outputs]
    return tf.transpose(tf.pack(projected_steps), perm=[1, 0, 2])
def __init__(self, args):
    """
    Build the graph of an LSTM sequence classifier.

    `args` supplies `hidden`, `num_layers`, `batch_size`, `seq_length`,
    `seq_dim` and `num_classes`. The logits are computed from the last
    decoder output only. Gradients are clipped to a global norm of 5.
    """
    self.args = args
    self.dropout = tf.Variable(trainable=False, dtype=tf.float32, initial_value=0)

    lstm = rnn_cell.LSTMCell(args.hidden, state_is_tuple=True)
    stacked = rnn_cell.MultiRNNCell([lstm] * args.num_layers, state_is_tuple=True)
    # NOTE(review): the dropout-wrapped cell is only stored on self.cell; the
    # decoder below runs the unwrapped stack. Confirm this is intended.
    self.cell = tf.nn.rnn_cell.DropoutWrapper(stacked, output_keep_prob=self.dropout)

    self.input_data = tf.placeholder(
        tf.float32, [args.batch_size, args.seq_length, args.seq_dim])
    self.output_data = tf.placeholder(tf.int32, [args.batch_size])
    self.initial_state = stacked.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnn_audio'):
        rnn_weights = tf.get_variable("rnn_weights", [args.hidden, args.num_classes])
        rnn_bias = tf.get_variable("rnn_bias", [args.num_classes])
        with tf.device("/cpu:0"):
            # One tensor per time step, with the length-1 time axis removed.
            step_inputs = [tf.squeeze(step, [1])
                           for step in tf.split(1, args.seq_length, self.input_data)]

    step_outputs, last_state = seq2seq.rnn_decoder(
        step_inputs, self.initial_state, stacked, scope='rnn_audio')
    final_output = step_outputs[-1]

    self.logits = tf.matmul(final_output, rnn_weights) + rnn_bias
    self.probabilities = tf.nn.softmax(self.logits)

    weights = tf.ones([args.batch_size])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [self.output_data], [weights], args.num_classes)
    self.cost = tf.reduce_mean(loss)
    self.final_state = last_state

    # Learning rate lives in a non-trainable variable so it can be assigned later.
    self.lr = tf.Variable(0.0, trainable=False)
    trainable = tf.trainable_variables()
    clipped, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable), 5)
    adam = tf.train.AdamOptimizer(self.lr)
    self.train_op = adam.apply_gradients(zip(clipped, trainable))
def val_loss(self):
    """
    Build the validation loss with the decoder feeding back its own predictions.

    Variables of the current scope are reused. Each decoder output is
    projected with `self.W_out` / `self.b_out` and compared by mean squared
    error against the decoder input one step ahead. Returns the sum of the
    per-step losses.
    """
    # Share the variables already created in this scope.
    tf.get_variable_scope().reuse_variables()

    n_steps = self._opts.fake_sequence_length

    def loop_function(prev, i):
        # Linear projection of the previous output; used both as the next
        # decoder input and as the prediction for that step.
        return tf.matmul(prev, self.W_out) + self.b_out

    outputs, states = seq2seq.rnn_decoder(
        self.decoder_inputs, self.enc_state, self.cell, loop_function)

    # The projected outputs are treated as the direct predictions; no
    # softmax is applied.
    predictions = [loop_function(outputs[step], step) for step in range(n_steps)]

    # Targets are the decoder inputs shifted ahead by one step.
    # assumes fake_sequence_length <= len(decoder_inputs) - 1 — TODO confirm
    targets = list(self.decoder_inputs[1:])

    # Squared error per step, averaged within the step.
    step_losses = [
        tf.reduce_mean(tf.square(targets[step] - predictions[step]))
        for step in range(n_steps)
    ]

    # Cumulative loss over all steps.
    return tf.add_n(step_losses)
def prepare_graph(self):
    """
    Build the convolutional encoder / LSTM / decoder graph.

    For every time step an input and a target image placeholder are created,
    each input is passed through one shared convolution + ReLU and a 2x2 max
    pooling, the convolution outputs are fed to an LSTM encoder, and the
    pooled outputs are fed to the decoder starting from the encoder state.

    Reads `self.seq_length`, `self.image_shape` and `self.lstm_hidden`;
    sets `self.W_conv` and `self.b_conv`.
    """
    with tf.name_scope("encoder/decoder/convNet"):
        # One placeholder per time step; inputs and targets are both images.
        image_dims = (None, self.image_shape[0], self.image_shape[1])
        inputs = []
        targets = []
        for i in xrange(self.seq_length):
            inputs.append(tf.placeholder(dtype=tf.float32, shape=image_dims))
            targets.append(tf.placeholder(dtype=tf.float32, shape=image_dims))

        # Every time step shares the same convolution kernel and bias.
        self.W_conv = init_W(shape=[20, 20, 1, 32])
        self.b_conv = init_bias(shape=[32])

        encoder_conv = []
        encoder_max = []
        for frame in inputs:
            activated = tf.nn.relu(conv2d(frame, self.W_conv) + self.b_conv)
            encoder_conv.append(activated)
            encoder_max.append(maxpooling_2x2(activated))

        with variable_scope.variable_scope("LSTM-CovolutionSeq2Seq"):
            cell = tf.nn.rnn_cell.BasicLSTMCell(self.lstm_hidden)
            _, enc_state = rnn.rnn(cell, encoder_conv, dtype=tf.float32)
            # NOTE(review): `feed_previous` is not a parameter of TensorFlow's
            # seq2seq.rnn_decoder; confirm the `rnn_decoder` imported in this
            # file accepts it.
            decoders, state = rnn_decoder(encoder_max, enc_state, cell,
                                          feed_previous=True)

        # Up-sampling with the transposed encoder kernel. The keyword of
        # tf.transpose is `perm`; the previous spelling `premu` raised a
        # TypeError as soon as this loop ran.
        # NOTE(review): the result of each conv2d call is not kept -- the
        # decoder outputs are presumably meant to be collected; confirm.
        for decoder in decoders:
            conv2d(decoder, tf.transpose(self.W_conv, perm=[2, 3, 0, 1]))
def build(self):
    """
    Build a GRU encoder/decoder over embedded symbol sequences.

    `self.Xs` is embedded and encoded; `self.ts_go` is embedded and used as
    decoder input. The boolean tensor `self.feedback` selects, per decoder
    step after the first, between feeding back the embedding of the
    decoder's own arg-max prediction and feeding the step from `ts_go`.

    Sets `self.embeddings`, `self.out` (list of per-step logits) and
    `self.out_tensor` (the same logits stacked and transposed with
    perm=[1, 0, 2]).
    """
    print(' Building model')

    self.embeddings = tf.Variable(
        tf.random_normal([self.alphabet_size, self.embedd_dims], stddev=0.1),
        name='embeddings')
    X_embedded = tf.gather(self.embeddings, self.Xs, name='embed_X')
    t_embedded = tf.gather(self.embeddings, self.ts_go, name='embed_t')

    def to_step_list(scope_name, embedded, num_steps):
        # Cut the embedded batch into one tensor per time step.
        with tf.variable_scope(scope_name):
            steps = tf.split(split_dim=1, num_split=num_steps, value=embedded)
            steps = [tf.squeeze(step) for step in steps]
            # squeeze() without axes loses the static shape; pin it again.
            for step in steps:
                step.set_shape([None, self.embedd_dims])
        return steps

    X_list = to_step_list('split_X_inputs', X_embedded, self.max_x_seq_len)
    t_list = to_step_list('split_t_inputs', t_embedded, self.max_t_seq_len)

    with tf.variable_scope('dense_out'):
        W_out = tf.get_variable('W_out', [self.rnn_units, self.alphabet_size])
        b_out = tf.get_variable('b_out', [self.alphabet_size])

    cell = rnn_cell.GRUCell(self.rnn_units)

    # Encoder
    enc_outputs, enc_state = rnn.rnn(cell=cell,
                                     inputs=X_list,
                                     dtype=tf.float32,
                                     sequence_length=self.X_len,
                                     scope='rnn_encoder')
    tf.histogram_summary('final_encoder_state', enc_state)

    def decoder_loop_function(prev, i):
        # Chooses the next decoder input at graph run time.
        def feedback_on():
            # Feed the embedding of the previous step's arg-max symbol.
            scores = tf.matmul(prev, W_out) + b_out
            return tf.gather(self.embeddings, tf.argmax(scores, 1))

        def feedback_off():
            # Feed the i-th step of the provided target sequence.
            return t_list[i]

        return tf.cond(self.feedback, feedback_on, feedback_off)

    # Decoder, started from the final encoder state.
    dec_out, dec_state = seq2seq.rnn_decoder(
        decoder_inputs=t_list,
        initial_state=enc_state,
        cell=cell,
        loop_function=decoder_loop_function)

    self.out = [tf.matmul(step_out, W_out) + b_out for step_out in dec_out]

    # For debugging the network (NOTE: should live outside of build).
    stacked_out = tf.pack(self.out)
    self.out_tensor = tf.transpose(stacked_out, perm=[1, 0, 2])

    # TensorBoard summaries for all variables.
    tf.contrib.layers.summarize_variables()
def build(self):
    """
    Build the dynamic-memory-network graph: input, question, episodic memory
    and answer modules plus loss, accuracy and the training op.

    Reads sizes and hyper-parameters from `self.params` and the answer
    vocabulary size from `self.words.vocab_size`. Stores the placeholders
    (`x`, `q`, `y`, `mask`, `is_training`) and the tensors `total_loss`,
    `num_corrects`, `accuracy` and `opt_op` on the instance.
    """
    params = self.params
    N, L, Q, F = (params.batch_size, params.max_sent_size,
                  params.max_ques_size, params.max_fact_count)
    V, d, A = params.glove_size, params.hidden_size, self.words.vocab_size

    # Placeholders
    story = tf.placeholder(tf.float32, shape=[N, L, V], name='x')      # [num_batch, sentence_len, glove_dim]
    question = tf.placeholder(tf.float32, shape=[N, Q, V], name='q')   # [num_batch, sentence_len, glove_dim]
    answer = tf.placeholder(tf.int64, shape=[N], name='y')             # [num_batch] - one word answer
    input_mask = tf.placeholder(tf.bool, shape=[N, L], name='x_mask')  # [num_batch, sentence_len]
    is_training = tf.placeholder(tf.bool)

    # A single GRU cell object is used by every module below.
    gru = rnn_cell.GRUCell(d)

    # Input module
    with tf.variable_scope('input') as scope:
        input_list = self.make_decoder_batch_input(story)
        input_states, _ = seq2seq.rnn_decoder(
            input_list, gru.zero_state(N, tf.float32), gru)

        # Question module: reuse the variables created by the input module.
        scope.reuse_variables()
        ques_list = self.make_decoder_batch_input(question)
        questions, _ = seq2seq.rnn_decoder(
            ques_list, gru.zero_state(N, tf.float32), gru)
        question_vec = questions[-1]  # output of the last question step

    # Masking: keep only the states selected by the mask, then zero-pad
    # every example up to F facts (details in the paper).
    input_states = tf.transpose(tf.pack(input_states), [1, 0, 2])  # [N, L, D]
    facts = []
    for n in range(N):
        filtered = tf.boolean_mask(input_states[n, :, :], input_mask[n, :])  # [?, D]
        padding = tf.zeros(tf.pack([F - tf.shape(filtered)[0], d]))
        facts.append(tf.concat(0, [filtered, padding]))  # [F, D]

    # Pack to a single tensor so the batch and fact axes can be swapped.
    packed_facts = tf.pack(facts)
    facts = tf.unpack(tf.transpose(packed_facts, [1, 0, 2]), num=F)  # F x [N, D]

    # Episodic memory
    with tf.variable_scope('episodic') as scope:
        episode = EpisodeModule(d, question_vec, facts)
        memory = tf.identity(question_vec)
        for t in range(params.memory_step):
            memory = gru(episode.new(memory), memory)[0]
            # Later passes share the variables created by the first one.
            scope.reuse_variables()

    # Regularizations
    if params.batch_norm:
        memory = batch_norm(memory, is_training=is_training)
    memory = dropout(memory, params.keep_prob, is_training)

    with tf.name_scope('Answer'):
        # Feed-forward answer module (the answer is a single word).
        w_a = weight('w_a', [d, A])
        logits = tf.matmul(memory, w_a)  # [N, A]

    with tf.name_scope('Loss'):
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, answer)
        loss = tf.reduce_mean(cross_entropy)
        total_loss = loss + params.weight_decay * tf.add_n(tf.get_collection('l2'))

    with tf.variable_scope('Accuracy'):
        # tf.equal needs both operands to share a dtype. `answer` is int64,
        # so the predictions are cast to the placeholder's dtype instead of
        # the previous hard-coded 'int32', which did not match.
        predicts = tf.cast(tf.argmax(logits, 1), answer.dtype)
        corrects = tf.equal(predicts, answer)
        num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
        accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

    # Training
    optimizer = tf.train.AdadeltaOptimizer(params.learning_rate)
    opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

    # Placeholders
    self.x = story
    self.q = question
    self.y = answer
    self.mask = input_mask
    self.is_training = is_training

    # Tensors
    self.total_loss = total_loss
    self.num_corrects = num_corrects
    self.accuracy = accuracy
    self.opt_op = opt_op
def _init_tensorflow(self, infer: bool = False):
    """
    Import TensorFlow on demand and (re)build the model graph.

    The graph differs between training and inference, so it is rebuilt
    whenever the requested mode differs from the mode of the graph built
    last. The import is deferred to this call because it is slow and not
    every code path needs it.

    Arguments:
        infer (bool): If True, build the graph for inference (batch size
            and sequence length of 1). If False, build it for training.

    Returns:
        module: imported TensorFlow module

    Raises:
        clgen.UserError: If `self.model_type` is not "lstm", "gru" or "rnn".
    """
    import tensorflow as tf
    from tensorflow.python.ops import rnn_cell
    from tensorflow.python.ops import seq2seq

    # self.tensorflow_state holds the `infer` flag of the graph built last.
    # The attribute does not exist until the first build has finished.
    never_built = object()
    if getattr(self, "tensorflow_state", never_built) == infer:
        return tf

    cell_types = {
        "lstm": rnn_cell.BasicLSTMCell,
        "gru": rnn_cell.GRUCell,
        "rnn": rnn_cell.BasicRNNCell
    }
    self.cell_fn = cell_types.get(self.model_type)
    if self.cell_fn is None:
        raise clgen.UserError("Unrecognized model type")

    # Start from an empty graph when switching between modes.
    tf.reset_default_graph()

    # Corpus info: sampling handles one symbol of one sequence at a time.
    if infer:
        batch_size = 1
        seq_length = 1
    else:
        batch_size = self.corpus.batch_size
        seq_length = self.corpus.seq_length
    vocab_size = self.corpus.vocab_size

    fs.mkdir(self.cache.path)

    # NOTE(review): state_is_tuple is passed to every cell type; confirm
    # that GRUCell / BasicRNNCell accept it in the TensorFlow version used.
    layer = self.cell_fn(self.rnn_size, state_is_tuple=True)
    self.cell = cell = rnn_cell.MultiRNNCell([layer] * self.num_layers,
                                             state_is_tuple=True)

    batch_shape = [batch_size, seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = self.cell.zero_state(batch_size, tf.float32)

    scope_name = 'rnnlm'
    with tf.variable_scope(scope_name):
        softmax_w = tf.get_variable("softmax_w", [self.rnn_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [vocab_size, self.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One slice per time step, with the singleton axis removed.
            inputs = [tf.squeeze(piece, [1])
                      for piece in tf.split(1, seq_length, embedded)]

    def loop(prev, _):
        # Project the previous output to vocabulary scores, take the
        # arg-max symbol (no gradient through it) and embed it again.
        scores = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # The loop function is only handed to the decoder in inference mode.
    loop_function = loop if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop_function, scope=scope_name)

    output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([batch_size * seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], vocab_size)
    self.cost = tf.reduce_sum(loss) / batch_size / seq_length
    self.final_state = last_state

    self.learning_rate = tf.Variable(0.0, trainable=False)
    self.epoch = tf.Variable(0, trainable=False)

    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, self.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Remember which mode the graph was built for.
    self.tensorflow_state = infer

    return tf
# Alternatives that were tried before settling on ss.rnn_decoder, kept for
# reference:
#
#   plain static RNN:
#     outputs, finstate = rnn.rnn(neurons, inputs, init_state)
#
#   hand made seq2seq (run one RNN, swap the two halves of its final state,
#   then run a second RNN on the first one's outputs):
#     outputs_le, finstate = rnn.rnn(neurons, inputs, init_state)
#     inp_state = array_ops.slice(finstate, [0, 0], [batch_size, input_size])
#     le_state = array_ops.slice(finstate, [0, input_size], [batch_size, le_size])
#     finstate = array_ops.concat(1, [le_state, inp_state])
#     outputs, finstate = rnn.rnn(neurons_out, outputs_le, finstate, scope="out")

# Training graph.
outputs, finstate = ss.rnn_decoder(inputs, state, neurons)

# Sum of the per-step L2 losses, scaled down by the number of steps, the
# batch size and the network size.
step_losses = [
    tf.nn.l2_loss(target - output)
    for output, target in zip(outputs, targets)
]
loss = tf.add_n(step_losses) / bptt_steps / batch_size / net_size

# Test graph: same cell and variables, but with a batch size of one.
test_inputs = []
for idx in xrange(bptt_steps):
    test_inputs.append(
        tf.placeholder(tf.float32, shape=(1, input_size),
                       name="TestInput{}".format(idx)))
test_state = tf.placeholder(tf.float32, shape=(1, state_size), name="TestState")

# Reuse the decoder variables created for the training graph above.
variable_scope.get_variable_scope().reuse_variables()
test_outputs, test_finstate = ss.rnn_decoder(test_inputs, test_state, neurons)

# Learning rate starts at 0.0 and is not trainable.
lrate_var = tf.Variable(0.0, trainable=False)
def __init__(self, args, infer=False):
    """
    Assemble the character-level RNN language-model graph.

    @param args: hyper-parameter namespace. Reads `model`, `rnn_size`,
                 `num_layers`, `batch_size`, `seq_length`, `vocab_size`,
                 `learning_rate` and `grad_clip`. When `infer` is true its
                 `batch_size` and `seq_length` are overwritten with 1.
    @param infer: True when the graph is built for sampling.
    """
    self.args = args
    if infer:
        # Sampling handles one character at a time: no batching, no BPTT.
        args.batch_size = 1
        args.seq_length = 1

    # Map the requested model name onto its cell class.
    supported_cells = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in supported_cells:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = supported_cells[args.model]

    # One layer of rnn_size units, stacked num_layers times. The list holds
    # the same cell object num_layers times.
    # NOTE(review): state_is_tuple is passed to every cell type; confirm
    # that BasicRNNCell / GRUCell accept it in the TensorFlow version used.
    layer = cell_fn(args.rnn_size, state_is_tuple=True)
    self.cell = cell = rnn_cell.MultiRNNCell([layer] * args.num_layers,
                                             state_is_tuple=True)

    # Integer (not float) placeholders of shape [batch_size, seq_length]:
    # input_data receives the input batch, targets what the loss compares
    # the predictions against.
    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)

    # All-zero state that can be fed in to reset the network.
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        # Output projection: [rnn_size, vocab_size] weights and a
        # [vocab_size] bias.
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        # The embedding and its lookup are pinned to the CPU (the original
        # author left an open TODO asking why).
        with tf.device("/cpu:0"):
            # Input counterpart of softmax_w: one rnn_size vector per
            # vocabulary entry.
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            # [batch_size, seq_length, rnn_size] lookup result ...
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # ... cut into seq_length slices of [batch_size, 1, rnn_size] ...
            step_slices = tf.split(1, args.seq_length, embedded)
            # ... each squeezed to [batch_size, rnn_size].
            inputs = [tf.squeeze(piece, [1]) for piece in step_slices]

    def loop(prev, _):
        # Only passed to the decoder when sampling. The original author
        # notes that with seq_length == 1 the decoder never gets to call it,
        # since it only applies to steps after the first.
        #
        # prev: [batch_size, cell.output_size]; the return value is the
        # embedded arg-max character, i.e. greedy rather than sampled.
        char_logits = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(char_logits, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # Build the recurrent part. `outputs` has one [batch_size, rnn_size]
    # tensor per time step, not yet projected to character space. Per the
    # original author's notes this call is also where the cell's weights
    # and biases get created.
    loop_function = loop if infer else None
    outputs, self.final_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop_function, scope='rnnlm')

    # Concatenate along the feature axis to
    # [batch_size, seq_length * rnn_size], then reshape to
    # [batch_size * seq_length, rnn_size]. Rows end up ordered
    # (batch 0, step 0), (batch 0, step 1), ..., (batch 1, step 0), ...
    joined = tf.concat(1, outputs)
    output = tf.reshape(joined, [-1, args.rnn_size])

    # Project every row to vocabulary scores:
    # [batch_size * seq_length, vocab_size].
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    # Same shape as logits; the training ops below do not depend on it.
    self.probs = tf.nn.softmax(self.logits)

    # Per-position cross-entropy. Targets are flattened in the same
    # (batch-major) row order as the logits; every position has weight 1.
    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)

    # Mean loss per character -- the quantity the optimizer minimizes.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    tf.scalar_summary("cost", self.cost)

    # Non-trainable bookkeeping variables.
    self.lr = tf.Variable(args.learning_rate, trainable=False)
    self.global_epoch_fraction = tf.Variable(0.0, trainable=False)
    self.global_seconds_elapsed = tf.Variable(0.0, trainable=False)

    # One gradient per trainable variable, clipped by global norm.
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)

    # apply_gradients takes (gradient, variable) pairs; running train_op
    # performs one Adam update at the current learning rate.
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.summary_op = tf.merge_all_summaries()
def __init__(self, args):
    """
    Build an RNN that maps a float feature sequence to per-step class logits.

    @param args: hyper-parameter namespace. Reads `model`, `rnn_size`,
                 `num_layers`, `batch_size`, `seq_length`, `char_size`
                 (input feature width), `phvocab_size` (number of output
                 classes) and `grad_clip`.
    """
    self.args = args

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    cell = cell_fn(args.rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    # Input batch: (batch_size, seq_length, char_size) floats.
    self.input_data = tf.placeholder(
        tf.float32, [args.batch_size, args.seq_length, args.char_size])

    # Target batch: (batch_size, seq_length) class ids. The output
    # dimension is not part of this shape.
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])

    # All-zero initial state for a batch of batch_size sequences.
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Output projection; this is where the output dimension is fixed.
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.phvocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.phvocab_size])

        # Unroll the input into one tensor per time step and drop the
        # singleton time axis of each.
        inputs = tf.split(1, args.seq_length, self.input_data)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Plain RNN decoder (no attention). last_state is the state after the
    # final time step.
    outputs, last_state = seq2seq.rnn_decoder(inputs, self.initial_state,
                                              cell, scope='rnnlm')

    # outputs holds one (batch_size, rnn_size) tensor per time step; lay
    # them out as (batch_size * seq_length, rnn_size) rows.
    outconcat = tf.concat(1, outputs)
    output = tf.reshape(outconcat, [-1, args.rnn_size])

    # Final logit layer: x * W + b, one row per (example, time step).
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # Cost function. The flattened targets must be a tensor: a trailing
    # comma on this assignment used to wrap it in a 1-tuple.
    reshaped_target = tf.reshape(self.targets, [-1])
    seq_weight = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [reshaped_target], [seq_weight], args.phvocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Adam optimizer with gradient clipping by global norm. The learning
    # rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def build_graph(self, test):
    """
    Builds the stacked-LSTM language-model graph in TensorFlow.

    @param test: when true, `self.batch_size` and `self.seq_len` are
                 overwritten with 1 and the decoder is given a loop
                 function that feeds back the embedding of its own arg-max
                 prediction.
    """
    if test:
        self.batch_size = 1
        self.seq_len = 1

    ##
    # LSTM cells: the same cell object, listed num_layers times.
    ##

    layer = rnn_cell.BasicLSTMCell(self.cell_size)
    self.cell = rnn_cell.MultiRNNCell([layer] * self.num_layers)

    ##
    # Data: inputs and targets are [batch_size, seq_len] integer tensors.
    ##

    batch_shape = [self.batch_size, self.seq_len]
    self.inputs = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

    ##
    # Variables
    ##

    with tf.variable_scope('lstm_vars'):
        self.ws = tf.get_variable('ws', [self.cell_size, self.vocab_size])
        self.bs = tf.get_variable('bs', [self.vocab_size])  # TODO: initializer?

        # The embedding table and its lookup are pinned to the CPU.
        with tf.device('/cpu:0'):
            self.embeddings = tf.get_variable(
                'embeddings', [self.vocab_size, self.cell_size])

            # Embeddings for every input word.
            input_embeddings = tf.nn.embedding_lookup(self.embeddings,
                                                      self.inputs)

            # Cut into seq_len tensors of [batch_size, 1, cell_size] and
            # squeeze the singleton axis away. Element t of the list then
            # holds the embedding of word t for every batch element.
            step_slices = tf.split(1, self.seq_len, input_embeddings)
            inputs_split = [tf.squeeze(piece, [1]) for piece in step_slices]

    def loop(prev, _):
        # Greedy feedback: score, pick the arg-max word, embed it.
        scores = tf.matmul(prev, self.ws) + self.bs
        prev_symbol = tf.stop_gradient(tf.argmax(scores, 1))
        return tf.nn.embedding_lookup(self.embeddings, prev_symbol)

    loop_function = loop if test else None
    lstm_outputs_split, self.final_state = seq2seq.rnn_decoder(
        inputs_split,
        self.initial_state,
        self.cell,
        loop_function=loop_function,
        scope='lstm_vars')

    # lstm_outputs_split is laid out like inputs_split (one tensor per time
    # step). After concat + reshape there is one row per (batch element,
    # time step), grouped by batch element:
    #   (elt 0, step 0), ..., (elt 0, step seq_len - 1), (elt 1, step 0), ...
    joined_outputs = tf.concat(1, lstm_outputs_split)
    lstm_outputs = tf.reshape(joined_outputs, [-1, self.cell_size])

    logits = tf.matmul(lstm_outputs, self.ws) + self.bs
    self.probs = tf.nn.softmax(logits)

    ##
    # Train
    ##

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([self.batch_size * self.seq_len])
    total_loss = seq2seq.sequence_loss_by_example(
        [logits], [flat_targets], [unit_weights], self.vocab_size)
    self.loss = tf.reduce_sum(total_loss) / self.batch_size / self.seq_len

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.optimizer = tf.train.AdamOptimizer(learning_rate=c.L_RATE,
                                            name='optimizer')
    self.train_op = self.optimizer.minimize(self.loss,
                                            global_step=self.global_step,
                                            name='train_op')
def forward(self):
    """
    Build the encoder/decoder graph and return the per-step predictions.

    Creates the input placeholders, an LSTM encoder, the output projection
    (`self.W_out`, `self.b_out`) and a decoder that starts from the encoder
    state. The decoder is built without a loop function here, so every step
    reads its input from the decoder placeholders.

    Side effects: sets `self.cell`, `self.enc_state`, `self.W_out`,
    `self.b_out`, `self.encoder_inputs` and `self.decoder_inputs`.

    @return: list with one prediction tensor per time step
             (`fake_sequence_length` entries).
    """
    opts = self._opts
    seq_length = opts.fake_sequence_length
    batch_size = opts.batch_size
    input_dim = opts.fake_input_dim
    num_hidden = opts.num_hidden

    def new_step_input(name_template, idx):
        # Symbolic input for one time step.
        return tf.placeholder(tf.float32, shape=(batch_size, input_dim),
                              name=name_template.format(idx))

    encoder_inputs = []
    decoder_inputs = []
    for idx in range(seq_length):
        encoder_inputs.append(new_step_input('encoder_inputs_{}', idx))
        decoder_inputs.append(new_step_input('decoder_inputs_{}', idx))
    # The decoder sequence is one step longer because it carries a <START>
    # token at the front.
    decoder_inputs.append(new_step_input('decoder_inputs_{}', seq_length))

    # Encoder
    self.cell = rnn.rnn_cell.BasicLSTMCell(num_hidden)
    _, self.enc_state = rnn.rnn(self.cell, encoder_inputs, dtype=tf.float32)

    # Projection from the num_hidden-wide decoder output back to the input
    # width, so an output can serve as a prediction or as the next input.
    self.W_out = tf.Variable(tf.random_uniform([num_hidden, input_dim], -1, 1),
                             name="sm_w")
    self.b_out = tf.Variable(tf.zeros([input_dim]), name="sm_b")

    def project(prev, i):
        return tf.matmul(prev, self.W_out) + self.b_out

    # Decoder
    outputs, _ = seq2seq.rnn_decoder(decoder_inputs, self.enc_state, self.cell)

    # The projected outputs are used directly as predictions (a softmax
    # could turn them into distributions, but is not applied).
    predictions = [project(outputs[idx], idx) for idx in range(seq_length)]

    # The placeholders are needed for every train step, so they are kept on
    # the object; the predictions are only needed while defining the graph
    # and are just returned.
    self.encoder_inputs = encoder_inputs
    self.decoder_inputs = decoder_inputs

    return predictions
with tf.variable_scope("COMPUTATION", reuse=None):
    # Single recurrent layer (a GRU cell) of width state_dim ...
    cell = GRUCell(state_dim)

    # ... stacked num_layers times.
    cell = rnn_cell.MultiRNNCell([cell] * num_layers)

    # Zero state for the stack. Note that initial_state will be a *list*
    # of tensors!
    initial_state = cell.zero_state(batch_size, tf.float32)

    # Linear output layer from state_dim to vocab_size.
    softmax_w = tf.get_variable("softmax_w", [state_dim, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])

    # Run the stack over the input sequence.
    outputs, last_state = seq2seq.rnn_decoder(inputs, initial_state, cell)

    # Lay the per-step outputs out as rows of width state_dim and project
    # every row to logits.
    joined_outputs = tf.concat(1, outputs)
    output = tf.reshape(joined_outputs, [-1, state_dim])
    logits = tf.matmul(output, softmax_w) + softmax_b
    probs = tf.nn.softmax(logits)

    # Sequence loss over the flattened targets, every position weighted 1.
    # NOTE(review): the result is divided by batch_size and sequence_length
    # below; confirm whether seq2seq.sequence_loss already averages in the
    # TensorFlow version used.
    flat_targets = tf.reshape(targets, [-1])
    unit_weights = tf.ones([batch_size * sequence_length])
    loss = seq2seq.sequence_loss([logits], [flat_targets], [unit_weights],
                                 vocab_size)
    cost = tf.reduce_sum(loss) / batch_size / sequence_length

    final_state = last_state

    # Learning rate starts at 0.0 and is not trainable.
    lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
def __init__(self, args, infer=False):
    """Build the character-level RNN graph, optionally with dropout cells.

    Args:
        args: options object; this constructor reads ``model``,
            ``batch_size``, ``seq_length``, ``num_layers``, ``rnn_size``,
            ``vocab_size``, ``dropout``, ``learn_input_embedding`` and
            ``grad_clip``.
        infer: when true, the graph is built for batch size 1 and sequence
            length 1, and the decoder is given a loop function that feeds
            the arg-max symbol of the previous step back in.

    Raises:
        Exception: if ``args.model`` is not one of ``rnn``, ``gru``,
            ``lstm``, ``dropgru`` or ``droprnn``.

    Side effects: creates the placeholders, cost, train op and TensorBoard
    summary and stores them as attributes on ``self``.
    """
    self.args = args
    if infer:
        self.batch_size = 1
        self.seq_length = 1
    else:
        self.batch_size = args.batch_size
        self.seq_length = args.seq_length

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    elif args.model in ('dropgru', 'droprnn'):
        # Dropout variants build their own cell stack below.
        pass
    else:
        raise Exception("model type not supported: {}".format(args.model))

    if args.model.startswith('drop'):
        layer_cls = DropoutGRUCell
        if args.model != 'dropgru':
            print("additional layers will be basic RNN")
            layer_cls = DropoutBasicRNNCell

        # NOTE: an experimental variant that put a small RNN in front of
        # the stack as a context-dependent embedding was hard-disabled
        # (`if False and ...`) and has been removed as unreachable code.
        cells = []
        for ii in range(args.num_layers):
            if ii == 0:
                # The first layer consumes vocab_size-wide embeddings.  Its
                # input dropout is only enabled when the embedding is
                # learned.
                layer_input_size = args.vocab_size
                input_drop = (args.dropout
                              if args.learn_input_embedding else 0.0)
            else:
                layer_input_size = args.rnn_size
                input_drop = args.dropout
            cells.append(layer_cls(args.rnn_size,
                                   input_size=layer_input_size,
                                   probofdrop_st=args.dropout,
                                   probofdrop_in=input_drop))
        self.cell = rnn_cell.MultiRNNCell(cells)
        self.cellusesdropout = True
    else:
        print("building basic non-dropout model")
        c1 = cell_fn(args.rnn_size)
        self.cell = rnn_cell.MultiRNNCell([c1] * args.num_layers)
        self.cellusesdropout = False

    self.input_data = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="x_input_data")
    self.targets = tf.placeholder(
        tf.int32, [self.batch_size, self.seq_length], name="y_targets")
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

    # The embedding is either a trainable variable or a matrix supplied by
    # the caller through a placeholder.
    if args.learn_input_embedding:
        self.embedding = tf.get_variable(
            "embedding", [args.vocab_size, args.vocab_size])
    else:
        self.embedding = tf.placeholder(
            tf.float32, [args.vocab_size, args.vocab_size],
            name="embedding")

    if self.cellusesdropout:
        # Mask multiplied into the flattened RNN outputs before the softmax
        # layer; fed by the caller.
        self._dropMaskOutput = tf.placeholder(
            dtype=tf.float32,
            shape=[self.batch_size * self.seq_length, args.rnn_size],
            name="dropout_output_mask")
    self._latest_mask_output = None

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("top_softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("top_softmax_b", [args.vocab_size])
        # One [batch_size, vocab_size] embedded input per time step.
        inputs = tf.split(
            1, self.seq_length,
            tf.nn.embedding_lookup(self.embedding, self.input_data))
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Turn the previous output into logits, pick the arg-max symbol and
        # return its embedding as the next input.
        if self.cellusesdropout:
            assert(prev.get_shape() == self._dropMaskOutput.get_shape())
            prev = tf.matmul(tf.mul(prev, self._dropMaskOutput),
                             softmax_w) + softmax_b
        else:
            prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embedding, prev_symbol)

    self.temperature = tf.placeholder(tf.float32, 1, name="temperature")

    # With a loop_function the decoder generates its own next input;
    # with None the next input comes from the `inputs` sequence.
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, self.cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(1, outputs),
                        [self.batch_size * self.seq_length, args.rnn_size])
    if self.cellusesdropout:
        assert(output.get_shape() == self._dropMaskOutput.get_shape())
        self.logits = tf.matmul(tf.mul(output, self._dropMaskOutput),
                                softmax_w) + softmax_b
    else:
        self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    self.probswithtemp = tf.nn.softmax(self.logits / self.temperature)

    # 1 / ln(2): converts the cost from "nats" to "bits".
    nats_to_bits = 1.44269504088896340736
    self.cost = seq2seq.sequence_loss(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([self.batch_size * self.seq_length])]) * nats_to_bits
    self.pred_entropy = tf.reduce_sum(
        tf.mul(self.probs, tf.log(self.probs + 1e-12)), 1) * (-nats_to_bits)
    self.final_state = last_state

    self.lr = tf.Variable(0.0, trainable=False, name="learningrate")
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, and
    # apply_gradients would exhaust it before the summary loop below runs.
    zipgradvars = list(zip(grads, tvars))
    self.train_op = optimizer.apply_gradients(zipgradvars)

    # TensorBoard summaries: cost, mean prediction entropy, and a histogram
    # of every trainable variable and its clipped gradient.
    tb_cost = tf.scalar_summary('cost_train', self.cost)
    tb_predent = tf.scalar_summary('prediction_entropy_train',
                                   tf.reduce_mean(self.pred_entropy))
    mergethese = [tb_cost, tb_predent]
    for grad, var in zipgradvars:
        mergethese.append(tf.histogram_summary(var.name + '_value', var))
        mergethese.append(tf.histogram_summary(var.name + '_grad', grad))
    self.tbsummary = tf.merge_summary(mergethese)
def build_network(self):
    """Assemble the recurrent variational auto-encoder graph.

    Encoder: runs ``self._enc_cell`` over ``self.inputs`` and maps the final
    state to ``self.z_mean`` / ``self.z_log_var``; ``self.z`` is sampled as
    ``z_mean + sqrt(exp(z_log_var)) * eps`` with standard-normal ``eps``.

    Decoder: maps ``self.z`` to an initial decoder state, decodes with
    ``seq2seq.rnn_decoder`` fed by ``self.inputs`` and stores the sigmoid of
    the projected outputs in ``self.output``.  A second decoder pass, with
    variables reused, is driven by the ``self.z_gen`` placeholder and
    constant 0.5 inputs, and produces ``self.gen_output``.

    Finally sets ``self.inp``, ``self.train_loss`` (from ``self.get_loss()``)
    and the Adam training op ``self.train``.
    """

    def affine(x, weight, bias):
        return tf.add(tf.matmul(x, weight), bias)

    def zero_bias(size):
        return tf.Variable(tf.zeros([size], dtype=tf.float32))

    def steps_to_batch_major(steps):
        # Stack the list of per-step tensors and swap the first two axes.
        return tf.transpose(tf.pack(steps), [1, 0, 2])

    with tf.variable_scope('encoder'):
        enc_state_size = self._enc_cell.state_size
        z_mean_w = tf.Variable(
            self.initializer([enc_state_size, self.n_latent]))
        z_mean_b = zero_bias(self.n_latent)
        z_logvar_w = tf.Variable(
            self.initializer([enc_state_size, self.n_latent]))
        z_logvar_b = zero_bias(self.n_latent)

        _, enc_state = rnn.rnn(self._enc_cell, self.inputs,
                               dtype=tf.float32)
        self.z_mean = affine(enc_state, z_mean_w, z_mean_b)
        self.z_log_var = affine(enc_state, z_logvar_w, z_logvar_b)

        # Reparameterised sample of the latent code.
        eps = tf.random_normal((self.batch_size, self.n_latent), 0, 1,
                               dtype=tf.float32)
        std = tf.sqrt(tf.exp(self.z_log_var))
        self.z = tf.add(self.z_mean, tf.mul(std, eps))

    with tf.variable_scope('decoder') as scope:
        dec_state_size = self._dec_cell.state_size
        dec_in_w = tf.Variable(
            self.initializer([self.n_latent, dec_state_size],
                             dtype=tf.float32))
        dec_in_b = zero_bias(dec_state_size)
        dec_out_w = tf.Variable(
            self.initializer([self.n_hidden, self.elem_num],
                             dtype=tf.float32))
        dec_out_b = zero_bias(self.elem_num)

        # --- reconstruction path -------------------------------------
        initial_dec_state = self.transfer_func(
            affine(self.z, dec_in_w, dec_in_b))
        dec_out, _ = seq2seq.rnn_decoder(self.inputs, initial_dec_state,
                                         self._dec_cell)
        if self.reverse:
            dec_out = dec_out[::-1]
        dec_output = steps_to_batch_major(dec_out)

        # One copy of the output weights per batch element, for
        # batch_matmul.
        batch_dec_out_w = tf.tile(tf.expand_dims(dec_out_w, 0),
                                  [self.batch_size, 1, 1])
        self.output = tf.nn.sigmoid(
            tf.batch_matmul(dec_output, batch_dec_out_w) + dec_out_b)

        # --- generation path (shares the decoder variables) ----------
        scope.reuse_variables()
        dec_gen_input = []
        for _ in range(self.step_num):
            dec_gen_input.append(
                0.5 * tf.ones([self.batch_size, self.elem_num],
                              dtype=tf.float32))
        self.z_gen = tf.placeholder(tf.float32,
                                    [self.batch_size, self.n_latent])
        dec_gen_state = self.transfer_func(
            affine(self.z_gen, dec_in_w, dec_in_b))
        dec_gen_out, _ = seq2seq.rnn_decoder(dec_gen_input, dec_gen_state,
                                             self._dec_cell)
        if self.reverse:
            dec_gen_out = dec_gen_out[::-1]
        dec_gen_output = steps_to_batch_major(dec_gen_out)
        self.gen_output = tf.nn.sigmoid(
            tf.batch_matmul(dec_gen_output, batch_dec_out_w) + dec_out_b)

    self.inp = steps_to_batch_major(self.inputs)
    self.train_loss = self.get_loss()
    adam = tf.train.AdamOptimizer(self.learning_rate)
    self.train = adam.minimize(self.train_loss)
def __init__(self, args, infer=False):
    """Build an RNN that emits one real-valued output per time step.

    Unlike the character model this variant has no embedding: the float
    placeholder ``input_data`` is split along the time axis and fed to the
    decoder directly, and ``args.vocab_size`` is forced to 1.

    Args:
        args: options object; reads ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length`` and ``grad_clip``.  It is mutated:
            ``vocab_size`` is set to 1, and with ``infer`` the batch size and
            sequence length are set to 1 as well.
        infer: build the graph for single-step, single-example use.

    Raises:
        Exception: if ``args.model`` is not ``rnn``, ``gru`` or ``lstm``.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    cell_types = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in cell_types:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = cell_types[args.model]

    single_layer = cell_fn(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([single_layer] * args.num_layers)
    self.cell = cell

    io_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.float32, io_shape, name="input")
    self.targets = tf.placeholder(tf.int32, io_shape, name="targets")
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # One [batch_size, 1] slice per time step; no embedding lookup.
    inputs_data = tf.split(1, args.seq_length, self.input_data)

    args.vocab_size = 1
    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

    # Always fed from the inputs: no loop function is passed, even when
    # `infer` is set.
    outputs, last_state = seq2seq.rnn_decoder(inputs_data,
                                              self.initial_state, cell)
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # L2 penalty on the output weights.
    self.reg_cost = tf.reduce_sum(1e-1 * (tf.nn.l2_loss(softmax_w)))

    target = tf.cast(self.targets, tf.float32)
    self.target_vector = tf.reshape(target, [-1])
    # NOTE(review): logits has a trailing dimension of 1 while target_vector
    # is rank 1, so this division presumably broadcasts to an N x N matrix;
    # the loss is also a squared *ratio*, not a squared difference.  Confirm
    # both are intended before relying on the cost value.
    loss = tf.pow(self.logits / self.target_vector, 2)
    data_cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.cost = data_cost + self.reg_cost
    self.final_state = last_state

    # Non-trainable learning rate; Adam on gradients clipped by global norm.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, n_input, model, rnn_size, rnn_num_layers, n_outputs,
             batch_size, input_seq_length, grad_clip, infer=True):
    """Build a multi-layer RNN with a sigmoid output layer and MSE loss.

    Args:
        n_input: size of the last axis of the input placeholder.
        model: cell type, one of ``'rnn'``, ``'gru'`` or ``'lstm'``.
        rnn_size: number of units per layer.
        rnn_num_layers: number of stacked layers.
        n_outputs: size of the last axis of the target placeholder and of
            the output layer.
        batch_size: batch size (forced to 1 when ``infer`` is true).
        input_seq_length: number of time steps (forced to 1 when ``infer``
            is true).
        grad_clip: global-norm threshold for gradient clipping.
        infer: build the graph for single-step use and pass a loop function
            to the decoder.

    Raises:
        Exception: if ``model`` is not a supported cell type.
    """
    if infer:
        batch_size = 1
        input_seq_length = 1

    if model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(model))

    # Only the LSTM cell takes `state_is_tuple`; passing it to the basic
    # RNN / GRU constructors raises a TypeError.
    if model == 'lstm':
        cell = cell_fn(rnn_size, state_is_tuple=True)
    else:
        cell = cell_fn(rnn_size)
    self.cell = cell = rnn_cell.MultiRNNCell([cell] * rnn_num_layers,
                                             state_is_tuple=True)

    self.n_input = n_input
    self.input_data = tf.placeholder(
        tf.int32, [batch_size, input_seq_length, n_input])
    self.targets = tf.placeholder(
        tf.int32, [batch_size, input_seq_length, n_outputs])
    self.initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.variable_scope('rnn_model'):
        output_w = tf.get_variable("output_w", [rnn_size, n_outputs])
        output_b = tf.get_variable("output_b", [n_outputs])
        # The placeholders stay int32 for callers, but the cells compute in
        # float32 (see zero_state above), so cast before feeding the RNN.
        float_inputs = tf.cast(self.input_data, tf.float32)
        inputs = tf.split(1, input_seq_length, float_inputs)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Feed the (pre-sigmoid) projection of the previous output back in.
        prev = tf.matmul(prev, output_w) + output_b
        return prev

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnn_model')

    # (batch_size * input_seq_length, rnn_size)
    output = tf.reshape(tf.concat(1, outputs), [-1, rnn_size])
    # (batch_size * input_seq_length, n_outputs)
    self.logits = tf.matmul(output, output_w) + output_b
    self.output = tf.sigmoid(self.logits)

    # Mean squared error over every output element.  The int32 targets are
    # cast because TensorFlow does not mix int32 and float32 operands.
    flat_targets = tf.cast(tf.reshape(self.targets, [-1, n_outputs]),
                           tf.float32)
    self.loss = tf.reduce_sum((flat_targets - self.output) ** 2) / \
        (batch_size * input_seq_length * n_outputs)
    self.final_state = last_state

    # Non-trainable learning rate; Adam on gradients clipped by global norm.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def decoder_rnn(conv_encoder, rnn_encoder, decoder_inputs, decoder_hidden,
                weigth_generation, n_steps, bias_generation, batch_size,
                keep_prob, encoder_states, defendant, embedding, sample_rate,
                lstm_layer=1, is_train=True):
    """Decode a word sequence with an LSTM conditioned on ``conv_encoder``.

    Every decoder input is the embedding of a token concatenated with
    ``conv_encoder``.  After the first step the next input is produced by a
    loop function: at inference time it is always the embedding of the
    arg-max word of the previous step; at training time it is chosen per
    example between that and the ground-truth input, the model's own
    prediction being used where ``sample_rate`` exceeds a uniform random
    draw.

    Args:
        conv_encoder: tensor concatenated (axis 1) to every decoder input.
        rnn_encoder: unused here.
        decoder_inputs: token ids, looked up in ``embedding``.
        decoder_hidden: number of LSTM units.
        weigth_generation: output projection weights.
        n_steps: length of the returned list.
        bias_generation: output projection bias.
        batch_size: batch size, used for the sampling mask.
        keep_prob: dropout keep probability applied to the decoder outputs.
        encoder_states: initial state of the decoder.
        defendant: unused here.
        embedding: word embedding matrix.
        sample_rate: threshold compared against the uniform draw.
        lstm_layer: number of stacked LSTM layers.
        is_train: selects the training loop function described above.

    Returns:
        tuple: ``(res, state)`` where ``res`` is a list of ``n_steps``
        per-step word scores and ``state`` is the final decoder state.
    """
    with tf.name_scope('decoder_rnn'):
        lstm_cell = rnn_cell.BasicLSTMCell(decoder_hidden, forget_bias=1.0,
                                           state_is_tuple=True)
        if lstm_layer > 1:
            # The wrapped cells use tuple states, so the wrapper must too;
            # otherwise MultiRNNCell rejects them on releases where the
            # flag defaults to False.
            lstm_cell = rnn_cell.MultiRNNCell([lstm_cell] * lstm_layer,
                                              state_is_tuple=True)

        # Embed the tokens, make the sequence time-major, split it into one
        # tensor per step, and append conv_encoder to each step.
        embedded = tf.nn.embedding_lookup(embedding, decoder_inputs)
        embedded = tf.transpose(embedded, [1, 0, 2])
        batch_decoder_inputs = [
            tf.concat(1, [step_input, conv_encoder])
            for step_input in tf.unpack(embedded)
        ]

        def greedy_feedback(prev):
            # Next input built from the arg-max word of the previous step.
            words_prob = tf.nn.bias_add(
                tf.matmul(prev, weigth_generation), bias_generation)
            sample = tf.argmax(words_prob, 1)
            prev_word = tf.nn.embedding_lookup(embedding, sample)
            return tf.concat(1, [prev_word, conv_encoder])

        if is_train:
            def func(prev, i):
                prev_outputs = greedy_feedback(prev)
                # Per-example choice between prediction and ground truth.
                prob = tf.random_uniform(minval=0, maxval=1,
                                         shape=(batch_size, ))
                mask = tf.cast(tf.greater(sample_rate, prob), tf.float32)
                mask = tf.expand_dims(mask, 1)
                mask = tf.tile(
                    mask, [1, prev_outputs.get_shape().as_list()[-1]])
                return (mask * prev_outputs
                        + (1 - mask) * batch_decoder_inputs[i])
        else:
            def func(prev, i):
                return greedy_feedback(prev)

        outputs, state = seq2seq.rnn_decoder(
            decoder_inputs=batch_decoder_inputs,
            initial_state=encoder_states,
            cell=lstm_cell,
            loop_function=func,
            scope='rnn_decoder')

        # tf.nn.dropout is given the list of step outputs as one value, so
        # unpack the result back into per-step tensors afterwards.
        outputs = tf.nn.dropout(outputs, keep_prob)
        outputs = tf.unpack(outputs)

        # Project every step onto word scores.
        res = [0] * n_steps
        for i, step_output in enumerate(outputs):
            res[i] = tf.nn.bias_add(
                tf.matmul(step_output, weigth_generation), bias_generation)
        return res, state
def __init__(self, args, infer=False):
    """Build the character-level RNN language model with scalar summaries.

    Args:
        args: options object; reads ``model``, ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
            With ``infer`` its ``batch_size`` and ``seq_length`` are
            overwritten with 1.
        infer: build the graph for sampling: one symbol at a time, with the
            arg-max prediction fed back through a loop function.

    Raises:
        Exception: if ``args.model`` is not ``rnn``, ``gru`` or ``lstm``.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    supported_cells = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.model not in supported_cells:
        raise Exception("model type not supported: {}".format(args.model))
    cell_fn = supported_cells[args.model]

    layer = cell_fn(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([layer] * args.num_layers)
    self.cell = cell

    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Bookkeeping variables stored in the graph (none of them trainable).
    self.batch_pointer = tf.Variable(0, name="batch_pointer",
                                     trainable=False, dtype=tf.int32)
    self.inc_batch_pointer_op = tf.assign(self.batch_pointer,
                                          self.batch_pointer + 1)
    self.epoch_pointer = tf.Variable(0, name="epoch_pointer",
                                     trainable=False)
    self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
    tf.summary.scalar("time_batch", self.batch_time)

    def variable_summaries(var):
        """Attach mean / max / min scalar summaries to a tensor."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        variable_summaries(softmax_w)
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        variable_summaries(softmax_b)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One [batch_size, rnn_size] tensor per time step.
            inputs = tf.split(1, args.seq_length, embedded)
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Project the previous output to logits and feed back the embedding
        # of the arg-max symbol; no gradient flows through the choice.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    loop_function = loop if infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop_function, scope='rnnlm')

    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    # Average loss per predicted symbol.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    tf.summary.scalar("cost", self.cost)
    self.final_state = last_state

    # Non-trainable learning rate; Adam on gradients clipped by global norm.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build the word-level LSTM language model.

    Args:
        args: options object; reads ``rnn_size``, ``num_layers``,
            ``batch_size``, ``seq_length``, ``vocab_size`` and ``grad_clip``.
            With ``infer`` its ``batch_size`` and ``seq_length`` are
            overwritten with 1.
        infer: whether the model is built for inference instead of training.
            For inference the graph consumes one word at a time and the
            decoder is given a loop function that feeds the previous
            prediction into the next step (with gradients stopped).
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # rnn_cell.GRUCell is a possible alternative to experiment with.
    cell_constructor = rnn_cell.BasicLSTMCell
    one_layer = cell_constructor(args.rnn_size)
    cell = rnn_cell.MultiRNNCell([one_layer] * args.num_layers)
    self.cell = cell

    # During training the targets are the inputs shifted by one word.
    token_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)

    # The hidden state starts as all zeros.
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        # logits = output * w + b, i.e.
        # [rows, rnn_size] x [rnn_size, vocab_size] + [vocab_size]
        softmax_w = tf.get_variable('softmax_w',
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable('softmax_b', args.vocab_size)

        # The embedding variable and lookup are pinned to the CPU.
        with tf.device("/cpu:0"):
            # Trainable matrix with one rnn_size-wide row per word.
            embedding = tf.get_variable('embedding',
                                        [args.vocab_size, args.rnn_size])
            # [batch_size, seq_length, rnn_size]
            embedding_lookup = tf.nn.embedding_lookup(embedding,
                                                      self.input_data)
            # rnn_decoder wants a list with one 2-D tensor per time step:
            # split along the time axis, then drop that axis.
            per_step = tf.split(1, args.seq_length, embedding_lookup)
            inputs = [tf.squeeze(input_, [1]) for input_ in per_step]

    def loop(prev, _):
        # Used at inference only: map the previous output to logits, take
        # the arg-max word and return its embedding as the next input.
        # The second argument (the step index) is not needed.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, symbol)

    # `outputs` is a list with one tensor per time step; `last_state` is the
    # decoder state after the final step.
    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')

    # A list cannot be multiplied by softmax_w, so concatenate the steps and
    # reshape to rows of width rnn_size first.
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # The learning rate lives in a non-trainable variable so it can be
    # changed (e.g. decayed) during training without being optimised itself.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build the seq2seq language model.

    The model is assembled in three steps:

    1. choose the recurrent cell (``rnn``, ``gru`` or ``lstm``) and stack it
       ``args.num_layers`` times in a ``MultiRNNCell``;
    2. convert the input batch into the list-of-steps format expected by
       ``seq2seq.rnn_decoder`` / ``seq2seq.attention_decoder`` and obtain
       the outputs and the last hidden state;
    3. push the outputs through a softmax layer, define the loss and the
       gradient-descent optimizer.

    :param args: options object; reads ``rnncell``, ``rnn_size``,
        ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
        ``embedding_size``, ``attention``, ``grad_clip`` and ``log_dir``.
        With ``infer`` its ``batch_size`` and ``seq_length`` are set to 1.
    :param infer: build the graph for generation; the decoder then feeds
        its own previous prediction back in through a loop function.
    :raises Exception: if ``args.rnncell`` is not a supported cell type.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    cell_by_name = {
        'rnn': rnn_cell.BasicRNNCell,
        'gru': rnn_cell.GRUCell,
        'lstm': rnn_cell.BasicLSTMCell,
    }
    if args.rnncell not in cell_by_name:
        raise Exception("rnncell type not supported: {}".format(
            args.rnncell))
    cell_fn = cell_by_name[args.rnncell]

    cell = cell_fn(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)

    token_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    # rnn_cell provides the state initialiser, so it is called directly.
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                 name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        # The raw input is [batch_size, seq_length]; the per-step input
        # dimension the decoder needs comes from this embedding lookup.
        # NOTE(review): presumably build_weight returns a trainable
        # variable, which would make the embedding learned - confirm.
        word_embedding = build_weight(
            [args.vocab_size, args.embedding_size], name='word_embedding')
        embedded = tf.nn.embedding_lookup(word_embedding, self.input_data)
        # The decoders take a list of seq_length tensors, one per step.
        inputs_list = tf.split(1, args.seq_length, embedded)
        inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

    def loop(prev, _):
        # Generation only: the previous step's output becomes the next
        # step's input (embedding of the arg-max symbol).
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(word_embedding, prev_symbol)

    loop_function = loop if infer else None

    # Build the decoder: plain rnn_decoder or attention_decoder.
    if not args.attention:
        # rnn_decoder arguments: the list of per-step inputs, the initial
        # cell state, the cell itself, and the optional loop function.
        # `outputs` holds the output of every step and is used for the
        # objective; `last_state` is the cell state after the last step and
        # serves as the next state when sampling.
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list, self.initial_state, self.cell,
            loop_function=loop_function, scope='rnnlm')
    else:
        self.attn_length = 5
        self.attn_size = 32
        self.attention_states = build_weight(
            [args.batch_size, self.attn_length, self.attn_size])
        outputs, last_state = seq2seq.attention_decoder(
            inputs_list, self.initial_state, self.attention_states,
            self.cell, loop_function=loop_function, scope='rnnlm')

    self.final_state = last_state

    # Forward layer + softmax over the flattened outputs.
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # sequence_loss_by_example computes the weighted cross-entropy of the
    # logits against the flattened targets; all weights are 1 here.
    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits], [flat_targets], [unit_weights], args.vocab_size)

    # Average loss for each word of each time step.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, self.var_trainable_op)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)

    # train_op is the op to run for each training step.
    self.train_op = optimizer.apply_gradients(
        zip(grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(tf.global_variables(),
                                max_to_keep=5,
                                keep_checkpoint_every_n_hours=1)

    # Log file named after the current time, with spaces (and any '/')
    # stripped from the file name.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log_name = str(timestamp + '.txt').replace(' ', '').replace('/', '')
    self.logfile = args.log_dir + log_name
    self.var_op = tf.global_variables()
def build(self):
    """Build the GRU encoder/decoder graph.

    Embeds ``self.Xs`` and ``self.ts_go`` with a shared embedding matrix,
    encodes the X sequence with ``rnn.rnn`` and decodes with
    ``seq2seq.rnn_decoder`` starting from the final encoder state.  The
    decoder's next input is chosen at run time by ``self.feedback``: the
    embedding of its own arg-max prediction when true, the target input
    ``t_list[i]`` otherwise.

    Side effects: sets ``self.embeddings``, ``self.out`` (list of per-step
    logits) and ``self.out_tensor`` (the same, stacked batch-major), and
    registers TensorBoard summaries.
    """
    print(' Building model')
    self.embeddings = tf.Variable(
        tf.random_normal([self.alphabet_size, self.embedd_dims],
                         stddev=0.1),
        name='embeddings')
    X_embedded = tf.gather(self.embeddings, self.Xs, name='embed_X')
    t_embedded = tf.gather(self.embeddings, self.ts_go, name='embed_t')

    # Split both sequences into one tensor per time step.  Only the time
    # axis (1) is squeezed: an unrestricted tf.squeeze would also drop the
    # batch axis for a batch of one (or the feature axis if embedd_dims
    # were 1) and break the [batch, embedd_dims] shape asserted below.
    with tf.variable_scope('split_X_inputs'):
        X_list = tf.split(split_dim=1, num_split=self.max_x_seq_len,
                          value=X_embedded)
        X_list = [tf.squeeze(X, [1]) for X in X_list]
        for X in X_list:
            X.set_shape([None, self.embedd_dims])

    with tf.variable_scope('split_t_inputs'):
        t_list = tf.split(split_dim=1, num_split=self.max_t_seq_len,
                          value=t_embedded)
        t_list = [tf.squeeze(t, [1]) for t in t_list]
        for t in t_list:
            t.set_shape([None, self.embedd_dims])

    with tf.variable_scope('dense_out'):
        W_out = tf.get_variable('W_out',
                                [self.rnn_units, self.alphabet_size])
        b_out = tf.get_variable('b_out', [self.alphabet_size])

    cell = rnn_cell.GRUCell(self.rnn_units)

    # Encoder.
    enc_outputs, enc_state = rnn.rnn(cell=cell,
                                     inputs=X_list,
                                     dtype=tf.float32,
                                     sequence_length=self.X_len,
                                     scope='rnn_encoder')
    tf.histogram_summary('final_encoder_state', enc_state)

    def decoder_loop_function(prev, i):
        """Provide the decoder input for step ``i``."""

        def feedback_on():
            # Feed the decoder with its own previous (arg-max) output.
            prev_1 = tf.matmul(prev, W_out) + b_out
            return tf.gather(self.embeddings, tf.argmax(prev_1, 1))

        def feedback_off():
            # Feed the decoder with the target sequence.
            return t_list[i]

        return tf.cond(self.feedback, feedback_on, feedback_off)

    # Decoder (reuses the encoder's cell object).
    dec_out, dec_state = seq2seq.rnn_decoder(
        decoder_inputs=t_list,
        initial_state=enc_state,
        cell=cell,
        loop_function=decoder_loop_function)

    self.out = [tf.matmul(d, W_out) + b_out for d in dec_out]

    # For debugging the network (NOTE: should live outside of build).
    out_packed = tf.pack(self.out)
    out_packed = tf.transpose(out_packed, perm=[1, 0, 2])
    self.out_tensor = out_packed

    # Add TensorBoard summaries for all variables.
    tf.contrib.layers.summarize_variables()
def __init__(self, config):
    """Create the model from the provided configuration.

    Args:
        config (dict): Model configuration. Must contain:
            rnn_size: size of the RNN hidden state
            num_layers: number of RNN layers
            rnn_type: "lstm", "rnn" or "gru"
            batch_size: batch size
            seq_length: sequence length
            grad_clip: global-norm threshold for gradient clipping
            vocab_size: size of the vocabulary
            infer: if True, the predicted output is fed back to the RNN
                instead of the gold target output, and batch size and
                sequence length are forced to 1
            is_train: if False, the training ops (learning rate, gradients,
                optimizer) are not created
            reuse: passed as ``reuse`` to the model's variable scope

    An unsupported ``rnn_type`` is logged and terminates the process via
    ``sys.exit``.
    """
    logger.info("Create model with options: \n{}".format(pprint.pformat(config)))

    self.rnn_size = config["rnn_size"]
    self.num_layers = config["num_layers"]
    self.rnn_type = config["rnn_type"]
    self.batch_size = config["batch_size"]
    self.seq_length = config["seq_length"]
    self.grad_clip = config["grad_clip"]
    self.vocab_size = config["vocab_size"]
    self.infer = config["infer"]
    self.is_train = config["is_train"]
    self.reuse = config["reuse"]

    if self.infer:
        self.batch_size = 1
        self.seq_length = 1

    cell_types = {
        "rnn": rnn_cell.BasicRNNCell,
        "gru": rnn_cell.GRUCell,
        "lstm": rnn_cell.LSTMCell,
    }
    if self.rnn_type not in cell_types:
        msg = "Rnn type should be either rnn, gru or lstm"
        logger.error(msg)
        sys.exit(msg)
    cell_fn = cell_types[self.rnn_type]

    # A single layer, repeated num_layers times in a multi-layer cell.
    base_cell = cell_fn(self.rnn_size)
    cell = rnn_cell.MultiRNNCell([base_cell] * self.num_layers)
    self.cell = cell

    token_shape = [self.batch_size, self.seq_length]
    self.input_data = tf.placeholder(tf.int32, token_shape)
    self.targets = tf.placeholder(tf.int32, token_shape)
    self.initial_state = cell.zero_state(self.batch_size, tf.float32)

    with tf.variable_scope(MODEL_SCOPE, reuse=self.reuse):
        softmax_w = tf.get_variable("softmax_w",
                                    [self.rnn_size, self.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [self.vocab_size])

        # The embedding and its lookup are placed on DEVICE_SCOPE.
        with tf.device(DEVICE_SCOPE):
            embeddings = tf.get_variable("embeddings",
                                         [self.vocab_size, self.rnn_size])
            embedded = tf.nn.embedding_lookup(embeddings, self.input_data)
            # tf.split works like numpy.split: along dimension 1 it yields
            # a list of step inputs, each (batch_size, 1, rnn_size).
            inputs = tf.split(1, self.seq_length, embedded)
            # The size-1 time dimension is not needed; squeeze it away.
            inputs = [tf.squeeze(_input, [1]) for _input in inputs]

    # seq2seq.rnn_decoder is used instead of wiring the network by hand.
    # At test time the predicted output is fed back to the RNN instead of
    # the gold target output used at training time.
    def loop(prev, _):
        prev = tf.matmul(prev, softmax_w) + softmax_b
        # No gradient flows through the arg-max choice.
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embeddings, prev_symbol)

    loop_function = loop if self.infer else None
    outputs, last_state = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=loop_function,
        scope=MODEL_SCOPE
    )

    # Concatenate the step outputs and flatten to
    # (batch_size x seq_length) x rnn_size.
    output = tf.reshape(tf.concat(1, outputs), [-1, self.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    unit_weights = tf.ones([self.batch_size * self.seq_length])
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [flat_targets],
        [unit_weights]
    )
    self.cost = tf.reduce_sum(loss) / (self.batch_size * self.seq_length)
    self.final_state = last_state

    # Everything below is only needed for training.
    if not self.is_train:
        return

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, self.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
####################
# Hand-made seq2seq (disabled alternative, kept for reference):
# outputs_le, finstate = rnn.rnn(neurons, inputs, init_state)
# inp_state = array_ops.slice(finstate, [0, 0], [batch_size, input_size])
# le_state = array_ops.slice(finstate, [0, input_size], [batch_size, le_size])
# finstate = array_ops.concat(1, [le_state, inp_state])
# outputs, finstate = rnn.rnn(neurons_out, outputs_le, finstate, scope="out")
####################
# Official seq2seq (perfect regression): encode `inputs`, then decode with
# the same cell, starting from the encoder's final state and fed with the
# targets.
_, enc_state = rnn.rnn(neurons, inputs, initial_state=state)
outputs, finstate = ss.rnn_decoder(targets, enc_state, neurons)

# Sum of the per-step L2 losses between targets and decoder outputs,
# normalised by the number of steps, the batch size and the network size.
step_losses = [
    tf.nn.l2_loss(target - output)
    for output, target in zip(outputs, targets)
]
loss = tf.add_n(step_losses) / bptt_steps / batch_size / net_size

# Non-trainable learning rate; gradients are clipped to a global norm of 5.
lr = tf.Variable(0.0, trainable=False)
tvars = tf.trainable_variables()
grads_raw = tf.gradients(loss, tvars)
grads, _ = tf.clip_by_global_norm(grads_raw, 5.0)

# Adam is the optimizer in use; the others are disabled alternatives.
# optimizer = tf.train.GradientDescentOptimizer(lr)
# optimizer = tf.train.AdagradOptimizer(lr)
optimizer = tf.train.AdamOptimizer(lr)
# optimizer = tf.train.RMSPropOptimizer(lr)
# optimizer = tf.train.AdadeltaOptimizer(lr)