def __init__(self, args, infer=False):
    """Build the RNN language-model graph (training or sampling).

    Args:
        args: Hyper-parameter object. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``
            and ``grad_clip``.
        infer: When true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 on the caller's object and the decoder feeds
            the argmax of its previous output back in as the next input.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    cell_kwargs = {}
    if args.model == 'rnn':
        cell_fn = core_rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = core_rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = core_rnn_cell.BasicLSTMCell
        # Only the LSTM cell takes state_is_tuple; passing it to the
        # RNN/GRU constructors raises TypeError.
        cell_kwargs['state_is_tuple'] = True
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # One cell object per layer: reusing a single instance for every layer
    # is rejected (or silently shares weights) on newer TensorFlow 1.x.
    layers = [cell_fn(args.rnn_size, **cell_kwargs)
              for _ in range(args.num_layers)]
    self.cell = cell = core_rnn_cell.MultiRNNCell(layers, state_is_tuple=True)

    self.input_data = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length], name="input_data")
    self.targets = tf.placeholder(
        tf.int32, [args.batch_size, args.seq_length], name="targets")
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # Debug output, written so it prints the same text under
            # Python 2 and Python 3.
            print("seq_length =  %s embedding_lookup =  %s"
                  % (args.seq_length, embedded))
            # One tensor per time step, split along axis 1.
            inputs = tf.split(embedded, args.seq_length, 1)
            print("inputs 1: %s" % (inputs,))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
            print("inputs 2: %s" % (inputs,))

    def loop(prev, _):
        # Project the previous output to vocabulary logits and embed the
        # most likely symbol as the next input (used only when sampling).
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits, name="prob_results")

    # The original passed args.vocab_size as a 4th positional argument,
    # which this API reads as average_across_timesteps (truthy -> True,
    # the default), so it is omitted.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate is a non-trainable variable so it can be assigned from
    # outside the graph.
    self.lr = tf.Variable(0.0, trainable=False, name="LR_")
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, vocabularySize, config_param):
    """Build a multi-layer basic-RNN language-model graph.

    Args:
        vocabularySize: Number of distinct symbols; sizes the embedding
            table and the softmax layer.
        config_param: Configuration object. Reads ``batch_size``,
            ``sequence_size``, ``embeddingSize``, ``hidden_size`` and
            ``num_layers``.
    """
    self.vocabularySize = vocabularySize
    self.config = config_param

    self._inputX = tf.placeholder(
        tf.int32, [self.config.batch_size, self.config.sequence_size],
        "InputsX")
    self._inputTargetsY = tf.placeholder(
        tf.int32, [self.config.batch_size, self.config.sequence_size],
        "InputTargetsY")

    # Convert the input ids to embedded form. The embedding table and its
    # lookup are pinned to the CPU (not a GPU).
    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            "embedding", [self.vocabularySize, self.config.embeddingSize])
        embeddingLookedUp = tf.nn.embedding_lookup(embedding, self._inputX)
        # One tensor per time step, split along axis 1.
        inputs = tf.split(axis=1,
                          num_or_size_splits=self.config.sequence_size,
                          value=embeddingLookedUp)
        inputTensorsAsList = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Define the RNN. Each layer gets its own cell object: reusing one
    # instance for every layer is rejected (or silently shares weights) on
    # newer TensorFlow 1.x.
    layers = [rnn_cell.BasicRNNCell(self.config.hidden_size)
              for _ in range(self.config.num_layers)]
    self.multilayerRNN = rnn_cell.MultiRNNCell(layers)
    self._initial_state = self.multilayerRNN.zero_state(
        self.config.batch_size, tf.float32)

    # Define the logits.
    hidden_layer_output, last_state = rnn.static_rnn(
        self.multilayerRNN, inputTensorsAsList,
        initial_state=self._initial_state)
    hidden_layer_output = tf.reshape(
        tf.concat(axis=1, values=hidden_layer_output),
        [-1, self.config.hidden_size])
    self._logits = tf.nn.xw_plus_b(
        hidden_layer_output,
        tf.get_variable("softmax_w",
                        [self.config.hidden_size, self.vocabularySize]),
        tf.get_variable("softmax_b", [self.vocabularySize]))
    self._predictionSoftmax = tf.nn.softmax(self._logits)

    # Define the loss. The original passed vocabularySize as a 4th
    # positional argument, which this API reads as
    # average_across_timesteps (truthy -> True, the default), so it is
    # omitted.
    loss = seq2seq.sequence_loss_by_example(
        [self._logits],
        [tf.reshape(self._inputTargetsY, [-1])],
        [tf.ones([self.config.batch_size * self.config.sequence_size])])
    # Summed loss divided by batch size only (not by sequence length).
    self._cost = tf.div(tf.reduce_sum(loss), self.config.batch_size)
    self._final_state = last_state
def testSequenceLossByExample(self):
    """Check per-example sequence loss with and without timestep averaging."""
    num_steps = 3
    batch = 2
    output_classes = 5
    with self.test_session() as sess:
        # At every step the logits are one constant value across all
        # classes, so each step contributes ln(5) ~= 1.609438 per example.
        logits = []
        for step in range(num_steps):
            logits.append(
                constant_op.constant(step + 0.5,
                                     shape=[batch, output_classes]))
        targets = []
        for step in range(num_steps):
            targets.append(
                constant_op.constant(step, dtypes.int32, shape=[batch]))
        weights = []
        for _ in range(num_steps):
            weights.append(constant_op.constant(1.0, shape=[batch]))

        averaged = seq2seq_lib.sequence_loss_by_example(
            logits, targets, weights, average_across_timesteps=True)
        averaged_value = sess.run(averaged)
        self.assertAllClose(np.asarray([1.609438, 1.609438]), averaged_value)

        # Without averaging the three steps are summed: 3 * ln(5).
        summed = seq2seq_lib.sequence_loss_by_example(
            logits, targets, weights, average_across_timesteps=False)
        summed_value = sess.run(summed)
        self.assertAllClose(np.asarray([4.828314, 4.828314]), summed_value)
def testSequenceLossByExample(self):
    """Verify sequence_loss_by_example for averaged and summed timesteps."""
    with self.test_session() as sess:
        output_classes = 5
        steps = range(3)
        logits = [constant_op.constant(t + 0.5, shape=[2, output_classes])
                  for t in steps]
        targets = [constant_op.constant(t, dtypes.int32, shape=[2])
                   for t in steps]
        weights = [constant_op.constant(1.0, shape=[2]) for _ in steps]

        # (average_across_timesteps, expected per-example loss); the
        # averaged case is checked first, then the summed case.
        cases = (
            (True, [1.609438, 1.609438]),
            (False, [4.828314, 4.828314]),
        )
        for average, expected in cases:
            loss_op = seq2seq_lib.sequence_loss_by_example(
                logits, targets, weights, average_across_timesteps=average)
            res = sess.run(loss_op)
            self.assertAllClose(np.asarray(expected), res)
def compute_cost(self):
    """Build ``self.cost`` and register a scalar summary for it.

    ``self.prediction`` and ``self.ys`` are flattened and scored with
    ``sequence_loss_by_example`` using ``self.ms_error`` as the element-wise
    loss and unit weights. The summed losses divided by ``self.batch_size``
    are stored in ``self.cost``.
    """
    # sequence_loss_by_example is imported via:
    # from tensorflow.contrib.legacy_seq2seq.python.ops.seq2seq import sequence_loss_by_example
    flat_prediction = tf.reshape(self.prediction, [-1], name='reshape_pred')
    flat_target = tf.reshape(self.ys, [-1], name='reshape_target')
    unit_weights = tf.ones([self.batch_size * self.n_steps],
                           dtype=tf.float32)

    per_element_loss = sequence_loss_by_example(
        [flat_prediction],
        [flat_target],
        [unit_weights],
        average_across_timesteps=True,
        softmax_loss_function=self.ms_error,
        name='losses')

    total_loss = tf.reduce_sum(per_element_loss, name='losses_sum')
    self.cost = tf.div(total_loss, self.batch_size, name='average_cost')
    tf.summary.scalar('cost', self.cost)
def train_neural_network():
    """Train the network from ``recurrent_neural_network()`` and checkpoint it.

    Relies on names defined elsewhere in this file: ``output_targets``,
    ``input_data``, ``x_inputs``, ``y_labels``, ``n_chunk``, ``label_size``
    and ``batch_size``. Runs 1000 epochs. From epoch 35 on, every epoch
    divisible by 7 is saved to 'ticket.module' at the progress milestones.
    """
    logits, last_state, _, _, _ = recurrent_neural_network()
    targets = tf.reshape(output_targets, [-1])
    loss = seq2seq.sequence_loss_by_example(
        logits=[logits],
        targets=[targets],
        weights=[tf.ones_like(targets, dtype=tf.float32)])
    print(logits.get_shape())
    print(targets.get_shape())
    cost = tf.reduce_mean(loss)

    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 6)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Apply the clipped gradients. The original called
    # optimizer.minimize(cost), which recomputes unclipped gradients and
    # leaves `grads` unused.
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Build the learning-rate update once; creating a new tf.assign op on
    # every epoch keeps adding nodes to the graph.
    new_lr = tf.placeholder(tf.float32, shape=[])
    lr_update = tf.assign(learning_rate, new_lr)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        lr = 0.1
        lr_decay = 0.002
        for epoch in range(1000):
            # Divide lr by ten each epoch, but never go below a slowly
            # decaying floor; the floor itself drops 10x every 55 epochs.
            mini_lr = lr_decay * (0.97 ** (epoch * 1.0 / 1001))
            lr = max(lr * 1.0 / 10, mini_lr)
            if epoch > 0 and epoch % 55 == 0:
                lr_decay /= 10.0
            sess.run(lr_update, feed_dict={new_lr: lr})

            batches = n_chunk - label_size - batch_size
            # Step counts at which progress is reported. Floor division
            # matches the Python 2 behaviour of the original `/`.
            milestones = (1, batches // 4, batches * 2 // 4,
                          batches * 3 // 4, batches - 1)
            for batche in range(batches):
                train_loss, _, _ = sess.run(
                    [cost, last_state, train_op],
                    feed_dict={input_data: x_inputs[batche],
                               output_targets: y_labels[batche]})
                step = batche + 1
                if step in milestones:
                    print(epoch, batche, train_loss)
                    print(lr)
                    if epoch > 34 and epoch % 7 == 0:
                        saver.save(sess, 'ticket.module', global_step=epoch)
def train_neural_network(total_train, total_regions, total_asfmap, wordtoix):
    """Train the network built by ``neural_network`` and checkpoint it.

    Args:
        total_train: Sequence of per-split arrays; column 4 of each row is
            used as an index into the matching ``total_regions`` entry.
        total_regions: Sequence of region collections; each region provides
            a ``'phrase'`` entry that is passed to ``sen2ix``.
        total_asfmap: Sequence of per-split image-feature collections,
            indexed by row and reshaped to ``(1, 2048)``.
        wordtoix: Word-to-index mapping passed through to ``sen2ix``.

    Runs 500 epochs with batch size 1 and saves to 'RNN_model/test.module'
    every 50 epochs.
    """
    keep = 0.5
    image_feat_size = 2048

    input_image_feature = tf.placeholder(tf.float32, [1, image_feat_size])
    input_data = tf.placeholder(tf.int64, [1, None])
    keep_prob = tf.placeholder(tf.float32)
    output_targets = tf.placeholder(tf.int64, [1, None])

    logits, last_state, _, _, _, _, _, _, _ = neural_network(
        input_image_feature, input_data, keep_prob)
    targets = tf.reshape(output_targets, [-1])
    # The original passed a vocabulary size (3000) as a 4th positional
    # argument, which this API reads as average_across_timesteps (truthy ->
    # True, the default), so it is omitted.
    loss = seq2seq.sequence_loss_by_example(
        [logits], [targets], [tf.ones_like(targets, dtype=tf.float32)])
    cost = tf.reduce_mean(loss)

    learning_rate = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), 5)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

    # Build the learning-rate update once; creating a new tf.assign op on
    # every epoch keeps adding nodes to the graph.
    new_lr = tf.placeholder(tf.float32, shape=[])
    lr_update = tf.assign(learning_rate, new_lr)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(tf.global_variables())

        for epoch in range(500):
            # Step decay: x0.9 every 100 epochs. Floor division keeps the
            # Python 2 integer behaviour of the original `/` on Python 3.
            sess.run(lr_update,
                     feed_dict={new_lr: 0.001 * (0.9 ** (epoch // 100))})

            for k in range(len(total_train)):
                train_data = total_train[k]
                regions = total_regions[k]
                asfmap = total_asfmap[k]
                for i in range(train_data.shape[0]):
                    region = regions[train_data[i, 4].astype('int32')]
                    # sen2ix returns the pair fed to input_data and
                    # output_targets respectively.
                    model_input, model_target = sen2ix(region['phrase'],
                                                       wordtoix)
                    train_loss, _, _ = sess.run(
                        [cost, last_state, train_op],
                        feed_dict={
                            input_data: model_input,
                            input_image_feature:
                                asfmap[i].reshape(1, image_feat_size),
                            output_targets: model_target,
                            keep_prob: keep,
                        })

            if (epoch + 1) % 50 == 0:
                print(epoch, train_loss)
                saver.save(sess, 'RNN_model/test.module')
        print("train end!")
def __init__(self, is_training, config, debug=False):
    """Build a CudnnLSTM language-model graph with optional mixed dtypes.

    Args:
        is_training: When false, only the forward/cost graph is built and
            the constructor returns before any optimizer is created.
        config: Configuration object. Reads ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``num_layers``, ``keep_prob``,
            ``init_scale`` and ``max_grad_norm``.
        debug: When true, extra summaries are recorded through
            ``variable_summaries`` and ``tf.summary.scalar``.

    Also reads the module-level ``FLAGS.cost_function``, ``FLAGS.reg_term``
    and ``FLAGS.optimizer``.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    vocab_size = config.vocab_size
    self.num_layers = config.num_layers

    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    embedding = tf.get_variable("embedding", [vocab_size, size],
                                dtype=data_type(is_lstm_layer=False))
    inputs = tf.nn.embedding_lookup(embedding, self._input_data,
                                    name="inputs_to_rnn")
    if debug:
        variable_summaries(inputs, "inputs_to_rnn")
    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # CudnnLSTM's `dropout` is the probability of DROPPING a unit (0
    # disables it), so convert from the keep probability used elsewhere.
    rnn = CudnnLSTM(config.num_layers, size, size,
                    input_mode='linear_input',
                    direction='unidirectional',
                    dropout=1.0 - config.keep_prob,
                    seed=0, seed2=0)
    params_size_t = rnn.params_size()

    self._initial_input_h = tf.placeholder(
        data_type(is_lstm_layer=True),
        shape=[config.num_layers, batch_size, size])
    self._initial_input_c = tf.placeholder(
        data_type(is_lstm_layer=True),
        shape=[config.num_layers, batch_size, size])

    # The parameter buffer's size is only known as a tensor, hence
    # validate_shape=False.
    self.params = tf.Variable(
        tf.random_uniform([params_size_t],
                          minval=-config.init_scale,
                          maxval=config.init_scale,
                          dtype=data_type(is_lstm_layer=True)),
        validate_shape=False)
    self.params_size_t = params_size_t

    # NOTE(review): self.input_h / self.input_c are not assigned in this
    # method; presumably they are properties exposing _initial_input_h and
    # _initial_input_c -- confirm against the class.
    # The embedded inputs are transposed to [num_steps, batch, size] for
    # the cuDNN call.
    outputs, output_h, output_c = rnn(
        is_training=is_training,
        input_data=tf.transpose(
            tf.cast(inputs, dtype=data_type(is_lstm_layer=True)),
            [1, 0, 2]),
        input_h=self.input_h,
        input_c=self.input_c,
        params=self.params)
    self._output_h = output_h
    self._output_c = output_c

    # Transpose back and flatten to [-1, size].
    output = tf.reshape(
        tf.concat(values=tf.transpose(outputs, [1, 0, 2]), axis=1),
        [-1, size])
    if debug:
        variable_summaries(output, 'multiRNN_output')

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                dtype=data_type(is_lstm_layer=False))
    softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                dtype=data_type(is_lstm_layer=False))
    # Cast the LSTM output only when its dtype differs from the softmax
    # layer's dtype.
    if output.dtype != data_type(is_lstm_layer=False):
        output = tf.cast(output, data_type(is_lstm_layer=False))
    logits = tf.matmul(output, softmax_w) + softmax_b
    if debug:
        variable_summaries(logits, 'logits')

    loss = sequence_loss_by_example(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.ones([batch_size * num_steps],
                 dtype=data_type(is_lstm_layer=False))])
    self._cost = cost = tf.reduce_sum(loss) / batch_size

    if FLAGS.cost_function == 'avg':
        cost_to_optimize = tf.reduce_mean(loss)
    else:
        cost_to_optimize = cost

    # L2 regularisation over every trainable variable.
    tvars = tf.trainable_variables()
    for v in tvars:
        cost_to_optimize += (
            FLAGS.reg_term
            * tf.cast(tf.nn.l2_loss(v), dtype=data_type(False))
            / (batch_size * config.num_steps))
    self._cost_to_optimize = cost_to_optimize

    if debug:
        tf.summary.scalar('cost no regularization', cost)
        tf.summary.scalar('cost_to_optimize', cost_to_optimize)

    if not is_training:
        self.merged = tf.summary.merge_all()
        return

    self._lr = tf.Variable(0.0, trainable=False,
                           dtype=data_type(is_lstm_layer=False))

    print("**************************")
    print("Trainable Variables")
    print("**************************")
    for var in tvars:
        print('Variable name: %s. With dtype: %s and shape: %s'
              % (var.name, var.dtype, var.get_shape()))

    print("**************************")
    print("Gradients Variables")
    print("**************************")
    _grads = tf.gradients(cost_to_optimize, tvars)

    # Group (gradient, variable) PAIRS by gradient dtype so each dtype can
    # be clipped separately. Keeping the pairs together avoids re-matching
    # two independently ordered dicts, which the original relied on.
    pairs_by_dtype = {}
    dtype_order = []
    for grad, var in zip(_grads, tvars):
        if grad is None:
            # The variable does not influence the cost; there is nothing
            # to clip or apply for it.
            continue
        print('Gradient name: %s. With dtype: %s' % (grad.name, grad.dtype))
        if grad.dtype not in pairs_by_dtype:
            pairs_by_dtype[grad.dtype] = []
            dtype_order.append(grad.dtype)
        pairs_by_dtype[grad.dtype].append((grad, var))

    clipped_pairs = []
    for dtype in dtype_order:
        group_grads = [pair[0] for pair in pairs_by_dtype[dtype]]
        group_vars = [pair[1] for pair in pairs_by_dtype[dtype]]
        clipped, _ = tf.clip_by_global_norm(group_grads,
                                            config.max_grad_norm)
        clipped_pairs.extend(zip(clipped, group_vars))

    if debug:
        for clipped_gradient, variable in clipped_pairs:
            variable_summaries(clipped_gradient,
                               "clipped_dcost/d" + variable.name)
            variable_summaries(variable, variable.name)

    if FLAGS.optimizer == 'MomentumOptimizer':
        optimizer = tf.train.MomentumOptimizer(learning_rate=self._lr,
                                               momentum=0.9)
    elif FLAGS.optimizer == 'AdamOptimizer':
        # NOTE(review): Adam is built with its default learning rate, so
        # updates to self._lr have no effect for this optimizer -- confirm
        # this is intentional.
        optimizer = tf.train.AdamOptimizer()
    elif FLAGS.optimizer == 'RMSPropOptimizer':
        optimizer = tf.train.RMSPropOptimizer(learning_rate=self._lr)
    elif FLAGS.optimizer == 'AdagradOptimizer':
        optimizer = tf.train.AdagradOptimizer(learning_rate=self._lr)
    else:
        optimizer = tf.train.GradientDescentOptimizer(self._lr)

    self._train_op = optimizer.apply_gradients(clipped_pairs)

    self._new_lr = tf.placeholder(dtype=data_type(False), shape=[],
                                  name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
    self.merged = tf.summary.merge_all()
def __init__(self, args, infer=False):
    """Build the RNN language-model graph, including grid-RNN variants.

    Args:
        args: Hyper-parameter object. Reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``
            and ``grad_clip``.
        infer: When true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 on the caller's object and the decoder feeds
            the argmax of its previous output back in as the next input.

    Raises:
        Exception: If ``args.model`` is not one of 'rnn', 'gru', 'lstm',
            'gridlstm' or 'gridgru'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    additional_cell_args = {}
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    elif args.model == 'gridlstm':
        cell_fn = grid_rnn.Grid2LSTMCell
        additional_cell_args.update({
            'use_peepholes': True,
            'forget_bias': 1.0,
        })
    elif args.model == 'gridgru':
        cell_fn = grid_rnn.Grid2GRUCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # One cell object per layer: reusing a single instance for every layer
    # is rejected (or silently shares weights) on newer TensorFlow 1.x.
    layers = [cell_fn(args.rnn_size, **additional_cell_args)
              for _ in range(args.num_layers)]
    self.cell = cell = rnn_cell.MultiRNNCell(layers)

    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            # One tensor per time step, split along axis 1.
            inputs = tf.split(
                axis=1,
                num_or_size_splits=args.seq_length,
                value=tf.nn.embedding_lookup(embedding, self.input_data))
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    def loop(prev, _):
        # Project the previous output to vocabulary logits and embed the
        # most likely symbol as the next input (used only when sampling).
        prev = tf.nn.xw_plus_b(prev, softmax_w, softmax_b)
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    outputs, last_state = seq2seq.rnn_decoder(
        inputs, self.initial_state, cell,
        loop_function=loop if infer else None, scope='rnnlm')
    output = tf.reshape(tf.concat(axis=1, values=outputs),
                        [-1, args.rnn_size])
    self.logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
    self.probs = tf.nn.softmax(self.logits)

    # The original passed args.vocab_size as a 4th positional argument,
    # which this API reads as average_across_timesteps (truthy -> True,
    # the default), so it is omitted.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    self.final_state = last_state

    # Learning rate is a non-trainable variable so it can be assigned from
    # outside the graph.
    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, is_training, batch_size, num_steps):
    """Build a multi-layer LSTM language-model graph.

    Args:
        is_training: When true, dropout is applied and the training op is
            built; otherwise the constructor returns after the cost.
        batch_size: Number of sequences per batch.
        num_steps: Number of unrolled time steps.

    Reads the module-level constants ``HIDDEN_SIZE``, ``KEEP_PRO``,
    ``NUM_LAYERS``, ``VOCAB_SIZE``, ``MAX_GRAD_NORM`` and ``LEARNING_RATE``.
    """
    self.batch_size = batch_size
    self.num_steps = num_steps
    self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

    # Build the LSTM stack, covering dropout, multiple layers and whether
    # this is the training graph.
    def make_layer():
        # A fresh cell per layer: reusing one cell object for every layer
        # is rejected (or silently shares weights) on newer TensorFlow.
        layer = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
        if is_training:
            # TODO: note: why is the output_keep_prob probability used here?
            layer = tf.nn.rnn_cell.DropoutWrapper(
                layer, output_keep_prob=KEEP_PRO)
        return layer

    cell = tf.nn.rnn_cell.MultiRNNCell(
        [make_layer() for _ in range(NUM_LAYERS)])

    # TODO: why are there batch_size initial states?
    # The state must be floating point to match the LSTM's float32
    # weights; the original requested tf.int32 here.
    self.initial_state = cell.zero_state(batch_size, tf.float32)

    # TODO: word2vec technique -- why this dimensionality?
    embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])
    inputs = tf.nn.embedding_lookup(embedding, self.input_data)
    if is_training:
        inputs = tf.nn.dropout(inputs, KEEP_PRO)

    # Collects the output of every one of the num_steps steps; used to
    # compute the loss.
    outputs = []
    # Tracks the state across loop iterations, starting from this object's
    # own initial state.
    state = self.initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            cell_output, state = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    # TODO: reshape the outputs into this form for the following ops; why
    # this form?
    # NOTE(review): tf.concat(1, outputs) is the pre-1.0 argument order
    # (axis first); on TensorFlow >= 1.0 it must be tf.concat(outputs, 1)
    # -- confirm the targeted version.
    output = tf.reshape(tf.concat(1, outputs), [-1, HIDDEN_SIZE])

    # Fully connected layer; note that no initializer is given here.
    weights = tf.get_variable("weights", [HIDDEN_SIZE, VOCAB_SIZE],
                              tf.float32)
    biases = tf.get_variable("biases", [VOCAB_SIZE])
    logits = tf.matmul(output, weights) + biases

    # Cross-entropy loss, combined with per-position weights.
    loss = sequence_loss_by_example(
        [logits],
        [tf.reshape(self.targets, [-1])],
        # Note the weights: every position counts equally.
        [tf.ones([batch_size * num_steps], dtype=tf.float32)])

    # Average loss per sequence in the batch.
    self.cost = tf.reduce_sum(loss) / batch_size
    self.final_state = state  # keep the final updated state

    # When not training, return without building the optimizer.
    if not is_training:
        return

    training_variables = tf.trainable_variables()
    # Note that this clips by the global norm.
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, training_variables), MAX_GRAD_NORM)

    # Define the optimization method.
    optimizer = tf.train.GradientDescentOptimizer(LEARNING_RATE)
    self.train_op = optimizer.apply_gradients(
        zip(grads, training_variables))
def __init__(self, is_training, config, filename):
    """Build a keyword-attention LSTM language model fed from a TFRecord file.

    Args:
        is_training: When true, dropout is applied and the training op is
            built; otherwise ``self._sample`` is set and the constructor
            returns after the cost.
        config: Configuration object. Reads ``batch_size``, ``num_steps``,
            ``hidden_size``, ``vocab_size``, ``num_keywords``,
            ``keep_prob``, ``num_layers``, ``word_embedding_size`` and
            ``max_grad_norm``.
        filename: Path of the TFRecord file to read examples from.

    Reads the module-level ``word_vec`` to initialise the embedding table.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    vocab_size = config.vocab_size

    filename_queue = tf.train.string_input_producer([filename],
                                                    num_epochs=None)
    # Unlike the TFRecordWriter, the TFRecordReader is symbolic.
    reader = tf.TFRecordReader()
    # One can read a single serialized example from a filename;
    # serialized_example is a Tensor of type string.
    _, serialized_example = reader.read(filename_queue)
    # The serialized example is converted back to actual values. One needs
    # to describe the format of the objects to be returned.
    features = tf.parse_single_example(
        serialized_example,
        features={
            # We know the length of both fields. If not, the
            # tf.VarLenFeature could be used.
            'input_data': tf.FixedLenFeature([batch_size * num_steps],
                                             tf.int64),
            'target': tf.FixedLenFeature([batch_size * num_steps],
                                         tf.int64),
            'mask': tf.FixedLenFeature([batch_size * num_steps],
                                       tf.float32),
            'key_words': tf.FixedLenFeature(
                [batch_size * config.num_keywords], tf.int64),
        })

    self._input_data = tf.cast(features['input_data'], tf.int32)
    self._targets = tf.cast(features['target'], tf.int32)
    self._input_word = tf.cast(features['key_words'], tf.int32)
    self._init_output = tf.placeholder(tf.float32, [batch_size, size])
    self._mask = tf.cast(features['mask'], tf.float32)

    # Each record stores a whole flattened batch; restore the batch axis.
    self._input_data = tf.reshape(self._input_data, [batch_size, -1])
    self._targets = tf.reshape(self._targets, [batch_size, -1])
    self._input_word = tf.reshape(self._input_word, [batch_size, -1])
    self._mask = tf.reshape(self._mask, [batch_size, -1])

    def make_layer():
        # A fresh cell per layer: reusing one cell object for every layer
        # is rejected (or silently shares weights) on newer TensorFlow.
        layer = tf.nn.rnn_cell.LSTMCell(size, forget_bias=0.0,
                                        state_is_tuple=False)
        if is_training and config.keep_prob < 1:
            layer = tf.nn.rnn_cell.DropoutWrapper(
                layer, output_keep_prob=config.keep_prob)
        return layer

    cell = tf.nn.rnn_cell.MultiRNNCell(
        [make_layer() for _ in range(config.num_layers)],
        state_is_tuple=False)
    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            'word_embedding', [vocab_size, config.word_embedding_size],
            trainable=True,
            initializer=tf.constant_initializer(word_vec))
        # Returns a tensor of shape
        # (batch_size, num_steps, word_embedding_size).
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)
        keyword_inputs = tf.nn.embedding_lookup(embedding, self._input_word)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # gate starts at one per keyword and is reduced by the attention given
    # to that keyword at each step; atten_sum accumulates the masked
    # attention per keyword.
    gate = tf.ones([batch_size, config.num_keywords])
    atten_sum = tf.zeros([batch_size, config.num_keywords])

    with tf.variable_scope("coverage"):
        u_f = tf.get_variable("u_f", [
            config.num_keywords * config.word_embedding_size,
            config.num_keywords
        ])
        res1 = tf.sigmoid(
            tf.matmul(tf.reshape(keyword_inputs, [batch_size, -1]), u_f))
        # Per-keyword target: mask sum (unmasked length) times the sigmoid
        # gate above.
        phi_res = tf.reduce_sum(self._mask, 1, keep_dims=True) * res1
        self.output1 = phi_res

    outputs = []
    output_state = self._init_output
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            vs = []
            for s2 in range(config.num_keywords):
                with tf.variable_scope("RNN_attention"):
                    if time_step > 0 or s2 > 0:
                        tf.get_variable_scope().reuse_variables()
                    u = tf.get_variable("u", [size, 1])
                    w1 = tf.get_variable("w1", [size, size])
                    w2 = tf.get_variable(
                        "w2", [config.word_embedding_size, size])
                    b = tf.get_variable("b1", [size])

                    # Additive attention score of keyword s2 against the
                    # previous cell output.
                    vi = tf.matmul(
                        tf.tanh(
                            tf.add(
                                tf.add(
                                    tf.matmul(output_state, w1),
                                    tf.matmul(keyword_inputs[:, s2, :],
                                              w2)),
                                b)),
                        u)
                    vs.append(vi * gate[:, s2:s2 + 1])

            self.attention_vs = tf.concat(vs, axis=1)
            prob_p = tf.nn.softmax(self.attention_vs)
            gate = gate - (prob_p / phi_res)
            atten_sum += prob_p * self._mask[:, time_step:time_step + 1]

            # Attention-weighted sum of the keyword embeddings.
            mt = tf.add_n([
                prob_p[:, i:i + 1] * keyword_inputs[:, i, :]
                for i in range(config.num_keywords)
            ])

            with tf.variable_scope("RNN_sentence"):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(
                    tf.concat([inputs[:, time_step, :], mt], axis=1),
                    state)
                outputs.append(cell_output)
                output_state = cell_output

        self._end_output = cell_output

    self.output2 = atten_sum
    output = tf.reshape(tf.concat(outputs, axis=1), [-1, size])

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    # Older TensorFlow exposes the loss under tf.nn.seq2seq; otherwise fall
    # back to the imported function. Only the attribute lookup is guarded,
    # so real errors inside the loss call are no longer swallowed.
    try:
        loss_fn = tf.nn.seq2seq.sequence_loss_by_example
    except AttributeError:
        loss_fn = sequence_loss_by_example
    loss = loss_fn(
        [logits],
        [tf.reshape(self._targets, [-1])],
        [tf.reshape(self._mask, [-1])],
        average_across_timesteps=False)

    self.cost1 = tf.reduce_sum(loss)
    # Squared gap between the per-keyword target and the attention
    # actually accumulated.
    self.cost2 = tf.reduce_sum((phi_res - atten_sum)**2)
    self._cost = cost = (self.cost1 + 0.1 * self.cost2) / batch_size
    self._final_state = state
    self._prob = tf.nn.softmax(logits)

    if not is_training:
        self._sample = tf.argmax(self._prob, 1)
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # Use the variable assigned above; the original read `self.lr`, which
    # this method never sets.
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, embedding, initial_state, attention_states, size,
             num_layers, max_length, num_samples=512, feed_previous=False,
             update_embedding_for_previous=True, dtype=dtypes.float32,
             scope=None, initial_state_attention=False, **kwargs):
    """Build an embedding attention decoder with its loss and decoded outputs.

    Args:
        embedding: Embedding matrix; its first dimension is taken as the
            number of output symbols.
        initial_state: Initial decoder state passed to ``attention_decoder``.
        attention_states: States to attend over, passed to
            ``attention_decoder``.
        size: Number of units of each GRU cell.
        num_layers: Number of stacked GRU layers.
        max_length: Maximum sequence length; 2 is added for _GO and _EOS.
        num_samples: Sample count for sampled softmax. Sampled softmax is
            used only when ``0 < num_samples < num_symbols``.
        feed_previous: When true, the decoder feeds its own previous
            prediction as the next input.
        update_embedding_for_previous: Passed to the loop-function factory.
        dtype: dtype the projection weights are converted to.
        scope: Variable scope name; defaults to
            "embedding_attention_decoder".
        initial_state_attention: Passed to ``attention_decoder``.
        **kwargs: Optional ``lengths``, ``inputs`` and ``weights`` tensors
            to use instead of newly created placeholders.
    """
    # Account for _GO and _EOS.
    self.max_length = max_length + 2

    # Create a placeholder only when the caller did not supply the tensor;
    # kwargs.get(key, tf.placeholder(...)) would build the default
    # placeholders even when they are not used.
    if 'lengths' in kwargs:
        self.lengths = kwargs['lengths']
    else:
        self.lengths = tf.placeholder(tf.int32, shape=[None],
                                      name="decoder_lengths")
    if 'inputs' in kwargs:
        self.inputs = kwargs['inputs']
    else:
        self.inputs = [
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder_input{0}".format(i))
            for i in range(self.max_length)
        ]
    if 'weights' in kwargs:
        self.weights = kwargs['weights']
    else:
        self.weights = [
            tf.placeholder(tf.float32, shape=[None],
                           name="decoder_weight{0}".format(i))
            for i in range(self.max_length)
        ]

    # Targets are the inputs shifted left by one step, padded with a final
    # all-zero step.
    self.targets = [self.inputs[i + 1]
                    for i in range(len(self.inputs) - 1)]
    self.targets.append(tf.zeros_like(self.inputs[-1]))

    num_symbols = embedding.get_shape()[0].value
    output_projection = None
    loss_function = None

    self.num_layers = num_layers
    if self.num_layers > 1:
        # One GRU cell per layer: reusing a single cell object for every
        # layer is rejected (or silently shares weights) on newer
        # TensorFlow 1.x.
        self.cell = tf.contrib.rnn.MultiRNNCell(
            [GRUCell(size) for _ in range(self.num_layers)])
    else:
        # Alternative cell: tf.contrib.rnn.LayerNormBasicLSTMCell(size)
        self.cell = GRUCell(size)

    self.feed_previous = feed_previous

    if num_samples > 0 and num_samples < num_symbols:
        w = tf.get_variable('proj_w', [self.cell.output_size, num_symbols])
        w_t = tf.transpose(w)
        b = tf.get_variable('proj_b', [num_symbols])
        output_projection = (w, b)

        def sampled_loss(labels, inputs):
            # Sampled softmax is computed in float32.
            labels = tf.reshape(labels, [-1, 1])
            local_w_t = tf.cast(w_t, tf.float32)
            local_b = tf.cast(b, tf.float32)
            local_inputs = tf.cast(inputs, tf.float32)
            return tf.nn.sampled_softmax_loss(
                weights=local_w_t, biases=local_b, labels=labels,
                inputs=local_inputs, num_sampled=num_samples,
                num_classes=num_symbols)

        loss_function = sampled_loss

    output_size = None
    if output_projection is None:
        # No sampled softmax: project to the vocabulary inside the cell.
        self.cell = OutputProjectionWrapper(self.cell, num_symbols)
        output_size = num_symbols
    if output_size is None:
        output_size = self.cell.output_size

    if output_projection is not None:
        proj_weights = ops.convert_to_tensor(output_projection[0],
                                             dtype=dtype)
        proj_weights.get_shape().assert_is_compatible_with(
            [self.cell.output_size, num_symbols])
        proj_biases = ops.convert_to_tensor(output_projection[1],
                                            dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(
            scope or "embedding_attention_decoder"):
        # Alternative factory: self._extract_grumble_softmax_embed
        loop_fn_factory = self._extract_argmax_and_embed
        loop_function = loop_fn_factory(
            embedding, output_projection,
            update_embedding_for_previous) if feed_previous else None

        emb_inp = [embedding_ops.embedding_lookup(embedding, i)
                   for i in self.inputs]
        self.outputs, self.state = attention_decoder(
            emb_inp, self.lengths, initial_state, attention_states,
            self.cell, output_size=output_size,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)

    targets = self.targets

    # Loss for each instance in the batch.
    self.instance_loss = sequence_loss_by_example(
        self.outputs, targets, self.weights,
        softmax_loss_function=loss_function)

    # Aggregated average loss per instance for the batch.
    self.loss = tf.reduce_sum(self.instance_loss) / math_ops.cast(
        array_ops.shape(targets[0])[0], self.instance_loss.dtype)

    if output_projection is not None:
        self.projected_output = [
            tf.matmul(o, output_projection[0]) + output_projection[1]
            for o in self.outputs
        ]
        stacked_logits = tf.stack(self.projected_output)
    else:
        stacked_logits = tf.stack(self.outputs)
    self.decoded_outputs = tf.unstack(tf.argmax(stacked_logits, 2))
    self.decoded_output_prob = tf.reduce_max(
        tf.nn.softmax(stacked_logits), 2)

    # Sum of sign(symbol id) over steps, i.e. the count of non-zero ids.
    # The attribute name keeps its original spelling for compatibility.
    self.decoded_lenghts = tf.reduce_sum(
        tf.sign(tf.transpose(tf.stack(self.decoded_outputs))), 1)
    self.decoded_batch = tf.transpose(tf.stack(self.decoded_outputs))
    self.decoded_batch_probs = tf.transpose(
        tf.stack(self.decoded_output_prob))
def __init__(self, args, infer=False):
    """Build the RNN language-model graph, optionally with attention.

    Args:
        args: Hyper-parameter object. Reads ``rnncell``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``,
            ``embedding_size``, ``attention``, ``grad_clip`` and
            ``log_dir``.
        infer: When true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 on the caller's object and the decoder feeds
            the argmax of its previous output back in as the next input.

    Raises:
        Exception: If ``args.rnncell`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    if args.rnncell == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.rnncell == 'gru':
        cell_fn = GRUCell
    elif args.rnncell == 'lstm':
        cell_fn = core_rnn_cell_impl.BasicLSTMCell
    else:
        raise Exception("rnncell type not supported: {}".format(
            args.rnncell))

    # One cell object per layer: reusing a single instance for every layer
    # is rejected (or silently shares weights) on newer TensorFlow 1.x.
    layers = [cell_fn(args.rnn_size) for _ in range(args.num_layers)]
    self.cell = MultiRNNCell(layers)

    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    # Fixed attention window: 5 states of width 32.
    self.attn_length = 5
    self.attn_size = 32
    self.attention_states = tf.placeholder(
        tf.float32, [args.batch_size, self.attn_length, self.attn_size])

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                 name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        self.word_embedding = build_weight(
            [args.vocab_size, args.embedding_size], name='word_embedding')
        # One tensor per time step, split along axis 1.
        inputs_list = tf.split(
            tf.nn.embedding_lookup(self.word_embedding, self.input_data),
            args.seq_length, 1)
        inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

    def loop(prev, _):
        # Project the previous output to vocabulary logits and embed the
        # most likely symbol as the next input (used only when sampling).
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.word_embedding, prev_symbol)

    if not args.attention:
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list, self.initial_state, self.cell,
            loop_function=loop if infer else None, scope='rnnlm')
    else:
        outputs, last_state = attention_decoder(
            inputs_list, self.initial_state, self.attention_states,
            self.cell, loop_function=loop if infer else None,
            scope='rnnlm')

    self.final_state = last_state
    output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    # The original passed args.vocab_size as a 4th positional argument,
    # which this API reads as average_across_timesteps (truthy -> True,
    # the default), so it is omitted.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])
    # Average loss for each word of each timestep.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(
        zip(grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()

    # The log file name is a timestamp with the space between date and
    # time removed, appended directly to args.log_dir (no separator is
    # inserted).
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d%H:%M:%S')
    self.logfile = args.log_dir + timestamp + '.txt'

    self.var_op = tf.global_variables()
    self.saver = tf.train.Saver(self.var_op, max_to_keep=4,
                                keep_checkpoint_every_n_hours=1)
def __init__(self, is_training, config, input_):
    """Build a multi-layer LSTM language model unrolled over time.

    Args:
        is_training: When true, dropout is applied (if
            ``config.keep_prob < 1``) and the training ops are created.
            When false the method returns right after the cost is built.
        config: Reads ``hidden_size``, ``vocab_size``, ``keep_prob``,
            ``num_layers`` and ``max_grad_norm``.
        input_: Reads ``batch_size``, ``num_steps``, ``input_data`` and
            ``targets``.
    """
    self._input = input_

    batch_size = input_.batch_size
    # The unroll length must come from num_steps; reading batch_size here
    # made the time loop index the wrong number of steps.
    num_steps = input_.num_steps
    size = config.hidden_size
    vocab_size = config.vocab_size

    def lstm_cell():
        return tf.nn.rnn_cell.BasicLSTMCell(size,
                                            forget_bias=0.0,
                                            state_is_tuple=True)

    attn_cell = lstm_cell
    if is_training and config.keep_prob < 1:

        def attn_cell():
            # Output dropout is applied by DropoutWrapper; MultiRNNCell
            # accepts neither a bare cell nor a keep-prob argument.
            return tf.nn.rnn_cell.DropoutWrapper(
                lstm_cell(), output_keep_prob=config.keep_prob)

    cell = tf.nn.rnn_cell.MultiRNNCell(
        [attn_cell() for _ in range(config.num_layers)],
        state_is_tuple=True)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    embedding = tf.get_variable("embedding", [vocab_size, size],
                                dtype=tf.float32)
    inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Manual unroll: variables are created on the first step and reused
    # on every later one.
    outputs = []
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            if time_step > 0:
                tf.get_variable_scope().reuse_variables()
            (cell_output, state) = cell(inputs[:, time_step, :], state)
            outputs.append(cell_output)

    output = tf.reshape(tf.concat(outputs, 1), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                dtype=tf.float32)
    softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                dtype=tf.float32)
    logits = tf.matmul(output, softmax_w) + softmax_b
    loss = sequence_loss_by_example(
        [logits],
        [tf.reshape(input_.targets, [-1])],
        [tf.ones([batch_size * num_steps], dtype=tf.float32)])
    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(
        zip(grads, tvars),
        global_step=tf.contrib.framework.get_or_create_global_step())

    # Placeholder + assign op so the learning rate can be changed from
    # outside the graph.
    self._new_lr = tf.placeholder(tf.float32,
                                  shape=[],
                                  name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
def __init__(self, mode, is_training, filename):
    """Build the keyword-attention LSTM generator with a coverage gate.

    Hyper-parameters are read from the module-level ``config`` object
    (``batch_size``, ``num_steps``, ``hidden_size``, ``vocab_size``,
    ``num_keywords``, ``num_layers``, ``keep_prob``, ``movie``, ``score``,
    ``word_embedding_size``, ``init_scale``, ``max_grad_norm``).

    Args:
        mode: "v1" or "v3"; selects which initial tensors are created
            before the recurrent loop.
        is_training: Enables dropout and creates the training ops. When
            false, a greedy ``self._sample`` op is created instead.
        filename: TFRecord file the input queue reads from.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    vocab_size = config.vocab_size

    filename_queue = tf.train.string_input_producer([filename],
                                                    num_epochs=None)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    # Each record stores a whole flattened batch per feature.
    features = tf.parse_single_example(
        serialized_example,
        features={
            'input_data':
            tf.FixedLenFeature([batch_size * num_steps], tf.int64),
            'target':
            tf.FixedLenFeature([batch_size * num_steps], tf.int64),
            'mask':
            tf.FixedLenFeature([batch_size * num_steps], tf.float32),
            'key_words':
            tf.FixedLenFeature([batch_size * config.num_keywords],
                               tf.int64)
        })

    self._input_data = tf.cast(features['input_data'], tf.int32)
    self._targets = tf.cast(features['target'], tf.int32)
    self._mask = tf.cast(features['mask'], tf.float32)
    self._key_words = tf.cast(features['key_words'], tf.int32)

    self._init_output = tf.placeholder(tf.float32, [batch_size, size])

    self._input_data = tf.reshape(self._input_data, [batch_size, -1])
    self._targets = tf.reshape(self._targets, [batch_size, -1])
    self._mask = tf.reshape(self._mask, [batch_size, -1])
    self._key_words = tf.reshape(self._key_words, [batch_size, -1])

    def single_cell_fn(unit_type, num_units, dropout, mode,
                       forget_bias=1.0):
        """Create an instance of a single RNN cell."""
        # ``mode`` here is the training flag; dropout is off otherwise.
        dropout = dropout if mode is True else 0.0
        if unit_type == "lstm":
            c = rnn_cell.LSTMCell(num_units,
                                  forget_bias=forget_bias,
                                  state_is_tuple=False)
        elif unit_type == "gru":
            c = rnn_cell.GRUCell(num_units)
        else:
            raise ValueError("Unknown unit type %s!" % unit_type)
        if dropout > 0.0:
            c = rnn_cell.DropoutWrapper(cell=c,
                                        input_keep_prob=(1.0 - dropout))
        return c

    cell_list = [
        single_cell_fn(unit_type="lstm",
                       num_units=size,
                       dropout=1 - config.keep_prob,
                       mode=is_training)
        for _ in range(config.num_layers)
    ]
    cell = rnn_cell.MultiRNNCell(cell_list, state_is_tuple=False)

    self._initial_state = cell.zero_state(batch_size, tf.float32)

    embedding_keyword = tf.get_variable(
        'keyword_embedding',
        [config.movie + config.score, config.word_embedding_size],
        trainable=True,
        initializer=tf.random_uniform_initializer(-config.init_scale,
                                                  config.init_scale))
    embedding = tf.get_variable(
        'word_embedding', [vocab_size, config.word_embedding_size],
        trainable=True,
        initializer=tf.random_uniform_initializer(-config.init_scale,
                                                  config.init_scale))

    inputs = tf.nn.embedding_lookup(embedding, self._input_data)
    keyword_inputs = tf.nn.embedding_lookup(embedding_keyword,
                                            self._key_words)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    outputs = []
    # NOTE(review): only the "v3" branch creates gate / atten_sum /
    # phi_res, all of which the loop below reads; with this layout "v1"
    # would fail with a NameError there -- confirm the intended nesting.
    if mode == "v1":
        output_state = self._init_output
    elif mode == "v3":
        gate = tf.ones([batch_size, config.num_keywords])
        atten_sum = tf.zeros([batch_size, config.num_keywords])
        with tf.variable_scope("coverage"):
            # u_f is a trainable parameter. It is multiplied with the
            # flattened keyword embeddings and squashed by a sigmoid into
            # 0~1, giving every control signal an initial proportion
            # (the original author notes a softmax would arguably be more
            # reasonable). sen_len sums the mask per sample, so each
            # keyword starts with a budget of proportion * sen_len.
            u_f = tf.get_variable("u_f", [
                config.num_keywords * config.word_embedding_size,
                config.num_keywords
            ])
            res1 = tf.sigmoid(
                tf.matmul(tf.reshape(keyword_inputs, [batch_size, -1]),
                          u_f))
            sen_len = tf.reduce_sum(self._mask, -1, keepdims=True)
            phi_res = sen_len * res1
            self.output1 = phi_res
        output_state = self._init_output

    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            # vs collects, for this time step, the gate-attenuated score
            # relating the previous output to each keyword.
            vs = []
            for kw_i in range(config.num_keywords):
                with tf.variable_scope("RNN_attention"):
                    if time_step > 0 or kw_i > 0:
                        tf.get_variable_scope().reuse_variables()
                    u = tf.get_variable("u", [size, 1])
                    w1 = tf.get_variable("w1", [size, size])
                    w2 = tf.get_variable(
                        "w2", [config.word_embedding_size, size])
                    b = tf.get_variable("b1", [size])

                    # Linear transform of the previous output.
                    prev_proj = tf.matmul(output_state, w1)
                    # Embedding of keyword kw_i, linearly transformed.
                    kw_vec = keyword_inputs[:, kw_i, :]
                    kw_proj = tf.matmul(kw_vec, w2)
                    # Sum, add the bias, apply the non-linearity.
                    summed = tf.add(prev_proj, kw_proj)
                    biased = tf.add(summed, b)
                    hidden = tf.tanh(biased)
                    # Project down to a single score.
                    vi = tf.matmul(hidden, u)
                    # Gate column for this keyword; starts at 1 (no
                    # attenuation) and shrinks as attention is spent.
                    kw_gate = gate[:, kw_i:kw_i + 1]
                    vs.append(vi * kw_gate)

            self.attention_vs = tf.concat(vs, axis=1)
            # Attention distribution of the previous output over keywords.
            prob_p = tf.nn.softmax(self.attention_vs)
            gate = gate - (prob_p / phi_res)

            # Where the mask is 0 at this step, the sample contributes
            # nothing to the accumulated attention.
            step_mask = self._mask[:, time_step:time_step + 1]
            atten_sum += prob_p * step_mask

            # Attention-weighted sum of all keyword embeddings.
            mt = tf.add_n([
                prob_p[:, i:i + 1] * keyword_inputs[:, i, :]
                for i in range(config.num_keywords)
            ])

            with tf.variable_scope("RNN_sentence"):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                step_input = inputs[:, time_step, :]
                # mt depends on the previous output and the keywords.
                joined = tf.concat([step_input, mt], axis=1)
                # Project so the cell input width equals the hidden size.
                cell_input = tf.layers.dense(inputs=joined, units=size)
                (cell_output, state) = cell(cell_input, state)
                outputs.append(cell_output)
                # Carried to the next step's attention computation.
                output_state = cell_output

        self._end_output = cell_output

    output = tf.reshape(tf.concat(outputs, axis=1), [-1, size])
    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    # Per-token loss over the flattened batch, weighted by the mask.
    loss = sequence_loss_by_example([logits],
                                    [tf.reshape(self._targets, [-1])],
                                    [tf.reshape(self._mask, [-1])])

    self.cost1 = tf.reduce_sum(loss)
    # Penalise the gap between each keyword's budget and the attention
    # actually spent on it.
    self.cost2 = tf.reduce_sum((phi_res - atten_sum)**2)
    mask_sum = tf.reduce_sum(self._mask)
    self._cost = cost = (self.cost1 + 0.1 * self.cost2) / mask_sum
    self._final_state = state
    self._prob = tf.nn.softmax(logits)

    if not is_training:
        # Greedy prediction; reuses the softmax already stored above.
        self._sample = tf.argmax(self._prob, 1)
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # Use the variable assigned just above; this method never sets
    # ``self.lr`` itself.
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, is_training, word_embedding, config, filename):
    """Build the semantically-conditioned LSTM generator graph.

    Args:
        is_training: Enables dropout and creates the training ops. When
            false, only ``self._prob`` is added after the cost.
        word_embedding: Initial values for the word embedding matrix
            (passed to ``tf.constant_initializer``).
        config: Reads ``batch_size``, ``num_steps``, ``hidden_size``,
            ``vocab_size``, ``key_words_voc_size``, ``keep_prob``,
            ``num_layers``, ``word_embedding_size`` and ``max_grad_norm``.
        filename: TFRecord file the input queue reads from.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    vocab_size = config.vocab_size
    key_words_voc_size = config.key_words_voc_size

    # Fixed weight on the hidden-state contribution to the reading gate.
    alpha = tf.constant(0.5)

    filename_queue = tf.train.string_input_producer([filename],
                                                    num_epochs=None)
    # Unlike TFRecordWriter, TFRecordReader is symbolic: reading yields a
    # string tensor holding one serialized example.
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    # Field lengths are known, so fixed-length features are used
    # (tf.VarLenFeature would be the alternative).
    features = tf.parse_single_example(
        serialized_example,
        features={
            'input_data':
            tf.FixedLenFeature([batch_size * num_steps], tf.int64),
            'target':
            tf.FixedLenFeature([batch_size * num_steps], tf.int64),
            'mask':
            tf.FixedLenFeature([batch_size * num_steps], tf.float32),
            'key_words':
            tf.FixedLenFeature([batch_size * key_words_voc_size],
                               tf.float32),
        })

    # Model inputs (x) and targets (y).
    self._input_data = tf.cast(features['input_data'], tf.int32)
    self._targets = tf.cast(features['target'], tf.int32)
    self._mask = tf.cast(features['mask'], tf.float32)
    self._key_words = tf.cast(features['key_words'], tf.float32)

    self._input_word = tf.reshape(self._key_words, [batch_size, -1])
    self._input_data = tf.reshape(self._input_data, [batch_size, -1])
    self._targets = tf.reshape(self._targets, [batch_size, -1])
    self._mask = tf.reshape(self._mask, [batch_size, -1])

    LSTM_cell = SC_LSTM(key_words_voc_size,
                        size,
                        forget_bias=0.0,
                        state_is_tuple=False)
    if is_training and config.keep_prob < 1:
        LSTM_cell = SC_DropoutWrapper(LSTM_cell,
                                      output_keep_prob=config.keep_prob)
    # NOTE(review): the same cell object is repeated for every layer;
    # whether SC_MultiRNNCell expects distinct instances is not visible
    # from here -- confirm.
    cell = SC_MultiRNNCell([LSTM_cell] * config.num_layers,
                           state_is_tuple=False)

    self._initial_state = cell.zero_state(batch_size, tf.float32)
    self._init_output = tf.zeros([batch_size, size * config.num_layers],
                                 tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            'word_embedding', [vocab_size, config.word_embedding_size],
            trainable=True,
            initializer=tf.constant_initializer(word_embedding))
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    sc_vec = self._input_word

    outputs = []
    output_state = self._init_output
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            with tf.variable_scope("RNN_sentence"):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()

                # Reading gate r_t: computed from the current word and
                # each layer's slice of the previous output.
                sc_wr = tf.get_variable(
                    'sc_wr',
                    [config.word_embedding_size, key_words_voc_size])
                res_wr = tf.matmul(inputs[:, time_step, :], sc_wr)

                res_hr = tf.zeros_like(res_wr, dtype=tf.float32)
                for layer_id in range(config.num_layers):
                    sc_hr = tf.get_variable('sc_hr_%d' % layer_id,
                                            [size, key_words_voc_size])
                    res_hr += alpha * tf.matmul(
                        tf.slice(output_state, [0, size * layer_id],
                                 [-1, size]), sc_hr)
                r_t = tf.sigmoid(res_wr + res_hr)
                # The keyword vector is scaled down element-wise by the
                # gate at every step.
                sc_vec = r_t * sc_vec

                (cell_output, state,
                 cell_outputs) = cell(inputs[:, time_step, :], state,
                                      sc_vec)
                outputs.append(cell_outputs)
                output_state = cell_outputs

        self._end_output = output_state

    # The projection consumes the outputs of all layers, hence the width
    # size * num_layers.
    output = tf.reshape(tf.concat(outputs, 1),
                        [-1, size * config.num_layers])

    softmax_w = tf.get_variable("softmax_w",
                                [size * config.num_layers, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    loss = sequence_loss_by_example([logits],
                                    [tf.reshape(self._targets, [-1])],
                                    [tf.reshape(self._mask, [-1])],
                                    average_across_timesteps=False)

    self._cost = cost = tf.reduce_sum(loss) / batch_size
    self._final_state = state

    if not is_training:
        # Keep the distribution reachable; it used to be bound to a local
        # that was dropped on return.
        self._prob = tf.nn.softmax(logits)
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # Use the variable assigned just above; this method never sets
    # ``self.lr`` itself.
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, args, infer=False):
    """Build the character-level RNN language-model graph.

    Args:
        args: Hyper-parameter namespace. This method reads ``model``,
            ``rnn_size``, ``num_layers``, ``batch_size``, ``seq_length``,
            ``vocab_size``, ``grad_clip`` and ``learning_rate``.
        infer: Set to true during sampling. ``args.batch_size`` and
            ``args.seq_length`` are then overwritten with 1 (``args`` is
            mutated in place): one character at a time, no batching or
            BPTT.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    # Pick the basic cell type: RNN, GRU or LSTM.
    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    def make_cell():
        """Create one layer of rnn_size units of the chosen type."""
        # Only the LSTM cell takes state_is_tuple; passing it to the
        # basic RNN cell (as was done before) raises a TypeError.
        if args.model == 'lstm':
            return cell_fn(args.rnn_size, state_is_tuple=True)
        return cell_fn(args.rnn_size)

    # Stack num_layers cells. Each layer gets its own cell object:
    # ``[cell] * n`` would put the same instance in every slot, which
    # newer TF 1.x releases either reject or treat as weight sharing.
    self.cell = cell = rnn_cell.MultiRNNCell(
        [make_cell() for _ in range(args.num_layers)],
        state_is_tuple=True)

    # Two int32 (NOT float) placeholders of shape batch_size x seq_length.
    # input_data receives the input batches; targets is what the output
    # is compared against to compute the loss.
    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])

    # All-zero state that can be swapped in to reset the network. With
    # state_is_tuple=True this is a tuple with one entry per layer.
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        # Output projection from the top layer (rnn_size) to the
        # vocabulary (vocab_size), plus its bias vector.
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        # TODO: the CPU pin mirrors the TF translation tutorial; the
        # reason for it is not documented here.
        with tf.device("/cpu:0"):
            # Trainable input embedding: conceptually the inverse of
            # softmax_w, mapping a character id to an rnn_size vector.
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            # The lookup is batch_size x seq_length x rnn_size; split it
            # along axis 1 into seq_length tensors of
            # batch_size x 1 x rnn_size.
            inputs = tf.split(
                tf.nn.embedding_lookup(embedding, self.input_data),
                args.seq_length,
                axis=1)
            # Drop the degenerate middle axis: each element becomes
            # batch_size x rnn_size.
            inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

    # Passed to rnn_decoder only when sampling. As the original author
    # observed, seq_length is 1 in that case and the decoder applies the
    # loop function only to items after the first, so in practice it is
    # never invoked.
    def loop(prev, _):
        """Map the previous output to the embedding of its argmax char.

        prev is [batch_size x cell.output_size]; the result is
        [batch_size x cell.input_size].
        """
        # Convert the top-layer output into character logits.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        # Greedy pick (argmax), not probabilistic sampling.
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # Re-embed that symbol as the next step's input.
        return tf.nn.embedding_lookup(embedding, prev_symbol)

    # Build the decoder. outputs is a list (length seq_length) of
    # [batch_size x rnn_size] tensors: raw top-layer outputs, not yet
    # projected into character space. The recurrent weights and biases
    # are created by this call.
    outputs, self.final_state = seq2seq.rnn_decoder(
        inputs,
        self.initial_state,
        cell,
        loop_function=loop if infer else None,
        scope='rnnlm')

    # Concatenating along axis 1 gives
    # [batch_size x (seq_length * rnn_size)]; reshaping then yields
    # [(batch_size * seq_length) x rnn_size] with rows ordered
    # (batch 0, seq 0), (batch 0, seq 1), ..., (batch 1, seq 0), ...
    # Projecting the whole batch-concatenated sequence at once follows
    # the efficiency note in rnn_cell.py about avoiding a per-step
    # projection wrapper.
    output = tf.reshape(tf.concat(outputs, axis=1), [-1, args.rnn_size])

    # Logits: [(batch_size * seq_length) x vocab_size].
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    # Probabilities, same shape as logits ([1 x vocab_size] during
    # sampling). Not needed for training.
    self.probs = tf.nn.softmax(self.logits)

    # Per-position loss against the flattened targets, which use the
    # same (batch, seq) row order as the logits. The result is a 1-D
    # float tensor of size batch_size * seq_length.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],  # 1-item list of 2-D logits
        [tf.reshape(self.targets, [-1])],  # 1-item list of 1-D int32 ids
        [tf.ones([args.batch_size * args.seq_length])],  # uniform weights
        args.vocab_size)

    # Mean loss over every position; this is what the optimizer
    # minimises.
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    tf.summary.scalar("cost", self.cost)

    # Non-trainable bookkeeping variables (learning rate and progress).
    self.lr = tf.Variable(args.learning_rate, trainable=False)
    self.global_epoch_fraction = tf.Variable(0.0, trainable=False)
    self.global_seconds_elapsed = tf.Variable(0.0, trainable=False)

    # All trainable variables, and one clipped gradient per variable.
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                                      args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    # apply_gradients takes (gradient, variable) pairs. This is the op a
    # training session should run.
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
    self.summary_op = tf.summary.merge_all()
def __init__(self, args, infer=False):
    """Build a single-cell RNN language model with TensorBoard summaries.

    Args:
        args: Hyper-parameter namespace. This method reads ``model``,
            ``rnn_size``, ``batch_size``, ``seq_length``, ``vocab_size``
            and ``grad_clip``. ``num_layers`` is not used: the cell is
            not stacked.
        infer: When true, ``args.batch_size`` and ``args.seq_length`` are
            overwritten with 1 and the decoder is given the greedy
            feedback loop function.

    Raises:
        Exception: If ``args.model`` is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        args.batch_size = 1
        args.seq_length = 1

    model = args.model
    if model not in ('rnn', 'gru', 'lstm'):
        raise Exception("model type not supported: {}".format(args.model))
    if model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif model == 'gru':
        cell_fn = rnn_cell.GRUCell
    else:
        cell_fn = rnn_cell.LayerNormBasicLSTMCell

    # A single cell is used directly (no multi-layer wrapper).
    cell = cell_fn(args.rnn_size)
    self.cell = cell

    batch_shape = [args.batch_size, args.seq_length]
    self.input_data = tf.placeholder(tf.int32, batch_shape)
    self.targets = tf.placeholder(tf.int32, batch_shape)
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)

    # Progress counters kept in the graph.
    self.batch_pointer = tf.Variable(0,
                                     name="batch_pointer",
                                     trainable=False,
                                     dtype=tf.int32)
    next_pointer = self.batch_pointer + 1
    self.inc_batch_pointer_op = tf.assign(self.batch_pointer,
                                          next_pointer)
    self.epoch_pointer = tf.Variable(0,
                                     name="epoch_pointer",
                                     trainable=False)
    self.batch_time = tf.Variable(0.0, name="batch_time", trainable=False)
    tf.summary.scalar("time_batch", self.batch_time)

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.name_scope('summaries'):
            tf.summary.scalar('mean', tf.reduce_mean(var))
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))

    with tf.variable_scope('rnnlm'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        variable_summaries(softmax_w)
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        variable_summaries(softmax_b)
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding",
                                        [args.vocab_size, args.rnn_size])
            embedded = tf.nn.embedding_lookup(embedding, self.input_data)
            # One tensor per time step, singleton time axis removed.
            steps = tf.split(embedded, args.seq_length, 1)
            inputs = [tf.squeeze(step, [1]) for step in steps]

    def loop(prev, _):
        """Embed the argmax symbol of the projected previous output."""
        logits = tf.matmul(prev, softmax_w) + softmax_b
        best = tf.stop_gradient(tf.argmax(logits, 1))
        return tf.nn.embedding_lookup(embedding, best)

    feedback = loop if infer else None
    outputs, self.final_state = seq2seq.rnn_decoder(inputs,
                                                    self.initial_state,
                                                    cell,
                                                    loop_function=feedback,
                                                    scope='rnnlm')

    # Flatten to rows of width rnn_size, then project to the vocabulary.
    joined = tf.concat(outputs, 1)
    output = tf.reshape(joined, [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    flat_targets = tf.reshape(self.targets, [-1])
    uniform_weights = tf.ones([args.batch_size * args.seq_length])
    loss = seq2seq.sequence_loss_by_example([self.logits], [flat_targets],
                                            [uniform_weights],
                                            args.vocab_size)
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    tf.summary.scalar("cost", self.cost)

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    raw_grads = tf.gradients(self.cost, tvars)
    grads, _ = tf.clip_by_global_norm(raw_grads, args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def __init__(self, is_training, batch_size, num_steps, batch_counts):
    """Build an LSTM model that predicts the last TERM_SIZE tokens.

    Sizes and rates come from module-level constants (TERM_SIZE,
    HIDDEN_SIZE, KEEP_PROB, NUM_LAYERS, VOCAB_SIZE, LEARNING_RATE,
    LEARNING_RATE_DECAY, REGULARIZATION_RATE, MAX_GRAD_NORM).

    Args:
        is_training: Enables dropout and creates ``self.train_op``. When
            false the method returns once the cost is built.
        batch_size: Number of sequences per batch.
        num_steps: Number of time steps the network is unrolled for.
        batch_counts: Divided by ``batch_size`` to obtain the decay
            interval of the learning-rate schedule.
    """
    self.batch_size = batch_size
    self.num_steps = num_steps

    self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self.targets = tf.placeholder(tf.int32, [batch_size, TERM_SIZE])

    def make_cell():
        """Create one LSTM layer, with output dropout when training."""
        layer = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
        if is_training:
            layer = tf.nn.rnn_cell.DropoutWrapper(
                layer, output_keep_prob=KEEP_PROB)
        return layer

    # Each layer gets its own cell object: ``[cell] * n`` would put the
    # same instance in every slot, which newer TF 1.x releases either
    # reject or treat as weight sharing.
    rnn_cell_fw = tf.nn.rnn_cell.MultiRNNCell(
        [make_cell() for _ in range(NUM_LAYERS)])

    self.initial_state_fw = rnn_cell_fw.zero_state(batch_size, tf.float32)
    embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])

    inputs = tf.nn.embedding_lookup(embedding, self.input_data)

    if is_training:
        inputs = tf.nn.dropout(inputs, KEEP_PROB)

    with tf.variable_scope('bidirectional_rnn'):
        outputs_fw = []
        state_fw = self.initial_state_fw
        with tf.variable_scope("rnn_fw"):
            for time_step in range(num_steps):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                cell_output_fw, state_fw = rnn_cell_fw(
                    inputs[:, time_step, :], state_fw)
                # Only the last TERM_SIZE steps are scored.
                if time_step >= num_steps - TERM_SIZE:
                    outputs_fw.append(cell_output_fw)

    output = tf.reshape(tf.concat(outputs_fw, 1), [-1, HIDDEN_SIZE])

    weight = tf.get_variable("weight", [HIDDEN_SIZE, VOCAB_SIZE])
    bias = tf.get_variable("bias", [VOCAB_SIZE])
    logits = tf.matmul(output, weight) + bias

    loss = seq2seq.sequence_loss_by_example(
        [logits], [tf.reshape(self.targets, [-1])],
        weights=[tf.ones([batch_size * TERM_SIZE], dtype=tf.float32)])
    self.cost = tf.reduce_mean(loss)
    self.final_state_fw = state_fw

    self.predictions = tf.cast(tf.argmax(logits, 1), tf.int32)
    self.correct_prediction = tf.equal(self.predictions,
                                       tf.reshape(self.targets, [-1]))
    self.accuracy = tf.reduce_mean(
        tf.cast(self.correct_prediction, tf.float32))

    self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False)
    self.learning_rate = tf.train.exponential_decay(
        LEARNING_RATE,
        self.global_step,
        batch_counts / self.batch_size,
        LEARNING_RATE_DECAY,
        staircase=True)

    trainable_variables = tf.trainable_variables()
    # L2 penalty on the output projection only.
    regularization_cost = tf.nn.l2_loss(weight) + tf.nn.l2_loss(bias)
    self.cost = self.cost + REGULARIZATION_RATE * regularization_cost

    if not is_training:
        return

    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
    # Apply the clipped gradients. Calling minimize() here recomputed
    # unclipped gradients and silently ignored the clipping above.
    optimizer = tf.train.AdamOptimizer(self.learning_rate)
    self.train_op = optimizer.apply_gradients(
        zip(grads, trainable_variables), global_step=self.global_step)
for logit in logits] #[steps, batch, vocab] # print(predictions) ''' why use logists as cost, is because logists can provide semi-one-hot vector with each entry some value. Then with one-hot y, all values are error except y's 1 entry, before softmax, it is kind of continues, softmax makes it not continue ''' y_as_list = [ tf.squeeze(i, squeeze_dims=[1]) for i in tf.split(y_, num_of_steps, axis=1) ] # y_as_list = [steps, batch] # print(y_as_list) # print(y_one_hot_) loss_weights = [tf.ones([batch_size]) for i in range(num_of_steps)] # print(loss_weights) losses = sequence_loss_by_example( logits, y_as_list, loss_weights ) # this is calculated step by step so, step should go as first index total_loss = tf.reduce_mean(losses) train_step = tf.train.AdagradOptimizer(learn_rate).minimize(total_loss) # run prediction init = tf.global_variables_initializer() sess = tf.Session() sess.run(init) # predicted result for i in range(epoch_size): _, cost_val, w_val, b_val, final_state_val = sess.run( [train_step, total_loss, W, b, final_state], { x_: input_x, y_: input_target
def __init__(self, is_training, filename):
    """Build the keyword-attention LSTM graph fed from a TFRecord file.

    Hyper-parameters are read from the module-level ``config`` and the word
    embedding table is initialised from the module-level ``word_vec``.

    Args:
        is_training: When true, dropout is enabled and the gradient-clipped
            Adam training op ``self._train_op`` is created. When false, an
            argmax ``self._sample`` op is created and construction stops
            after the loss.
        filename: TFRecord file handed to ``tf.train.string_input_producer``;
            each record holds the flattened ``input_data``, ``target``,
            ``mask`` and ``key_words`` features.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps
    self.size = size = config.hidden_size
    vocab_size = config.vocab_size

    filename_queue = tf.train.string_input_producer([filename],
                                                    num_epochs=None)
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            # We know the length of both fields.
            # If not the tf.VarLenFeature could be used
            'input_data': tf.FixedLenFeature([batch_size * num_steps],
                                             tf.int64),
            'target': tf.FixedLenFeature([batch_size * num_steps],
                                         tf.int64),
            'mask': tf.FixedLenFeature([batch_size * num_steps],
                                       tf.float32),
            'key_words': tf.FixedLenFeature(
                [batch_size * config.num_keywords], tf.int64)
        })

    self._input_data = tf.cast(features['input_data'], tf.int32)
    self._targets = tf.cast(features['target'], tf.int32)
    self._input_word = tf.cast(features['key_words'], tf.int32)
    self._mask = tf.cast(features['mask'], tf.float32)
    # Previous hidden output fed to the attention at the first time step.
    self._init_output = tf.placeholder(tf.float32, [batch_size, size])

    # Each record stores a whole batch flattened; restore the batch axis.
    self._input_data = tf.reshape(self._input_data, [batch_size, -1])
    self._targets = tf.reshape(self._targets, [batch_size, -1])
    self._input_word = tf.reshape(self._input_word, [batch_size, -1])
    self._mask = tf.reshape(self._mask, [batch_size, -1])

    def single_cell_fn(unit_type, num_units, dropout, mode,
                       forget_bias=1.0):
        """Create an instance of a single RNN cell."""
        # Dropout is only active when `mode` (the training flag) is set.
        if not mode:
            dropout = 0.0

        if unit_type != "lstm":
            raise ValueError("Unknown unit type %s!" % unit_type)
        single_cell = rnn_cell.LSTMCell(num_units,
                                        forget_bias=forget_bias,
                                        state_is_tuple=False)

        if dropout > 0.0:
            single_cell = rnn_cell.DropoutWrapper(
                cell=single_cell, input_keep_prob=(1.0 - dropout))
        return single_cell

    cell_list = [
        single_cell_fn(unit_type="lstm",
                       num_units=size,
                       dropout=1 - config.keep_prob,
                       mode=is_training)
        for _ in range(config.num_layers)
    ]
    cell = rnn_cell.MultiRNNCell(cell_list, state_is_tuple=False)
    self._initial_state = cell.zero_state(batch_size, tf.float32)

    with tf.device("/cpu:0"):
        embedding = tf.get_variable(
            'word_embedding', [vocab_size, config.word_embedding_size],
            trainable=True,
            initializer=tf.constant_initializer(word_vec))
        # Shape (batch_size, num_steps, word_embedding_size).
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)
        keyword_inputs = tf.nn.embedding_lookup(embedding, self._input_word)

    if is_training and config.keep_prob < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob)

    # Per-keyword gate (starts fully open) and accumulated attention mass.
    gate = tf.ones([batch_size, config.num_keywords])
    atten_sum = tf.zeros([batch_size, config.num_keywords])

    with tf.variable_scope("coverage"):
        u_f = tf.get_variable("u_f", [
            config.num_keywords * config.word_embedding_size,
            config.num_keywords
        ])
        keyword_score = tf.sigmoid(
            tf.matmul(tf.reshape(keyword_inputs, [batch_size, -1]), u_f))
        # Sum of the mask over the time axis, kept as a column.
        mask_total = tf.reduce_sum(self._mask, 1, keepdims=True)
        phi_res = mask_total * keyword_score
        self.output1 = phi_res

    outputs = []
    output_state = self._init_output
    state = self._initial_state
    with tf.variable_scope("RNN"):
        for time_step in range(num_steps):
            # vs collects, for this time step, the gate-attenuated score
            # relating the previous hidden output to each keyword (topic).
            vs = []
            for s2 in range(config.num_keywords):
                with tf.variable_scope("RNN_attention"):
                    if time_step > 0 or s2 > 0:
                        tf.get_variable_scope().reuse_variables()
                    u = tf.get_variable("u", [size, 1])
                    w1 = tf.get_variable("w1", [size, size])
                    w2 = tf.get_variable(
                        "w2", [config.word_embedding_size, size])
                    b = tf.get_variable("b1", [size])

                    # Linear transform of the previous hidden output.
                    hidden_proj = tf.matmul(output_state, w1)
                    # Embedding of keyword s2, linearly transformed.
                    topic_vec = keyword_inputs[:, s2, :]
                    topic_proj = tf.matmul(topic_vec, w2)
                    # Combine, add the bias, squash, project to a scalar.
                    combined = tf.add(hidden_proj, topic_proj)
                    biased = tf.add(combined, b)
                    activated = tf.tanh(biased)
                    vi = tf.matmul(activated, u)
                    # Gate for keyword s2; every gate starts at 1.
                    topic_gate = gate[:, s2:s2 + 1]
                    vs.append(vi * topic_gate)

            self.attention_vs = tf.concat(vs, axis=1)
            # Attention distribution over the keywords given the previous
            # hidden output.
            prob_p = tf.nn.softmax(self.attention_vs)
            gate = gate - (prob_p / phi_res)

            # Where a sample's mask is 0 at this step, none of its keyword
            # weights are accumulated.
            step_mask = self._mask[:, time_step:time_step + 1]
            atten_sum += prob_p * step_mask

            # Attention-weighted sum of all keyword embeddings.
            mt = tf.add_n([
                prob_p[:, i:i + 1] * keyword_inputs[:, i, :]
                for i in range(config.num_keywords)
            ])

            with tf.variable_scope("RNN_sentence"):
                if time_step > 0:
                    tf.get_variable_scope().reuse_variables()
                step_input = inputs[:, time_step, :]
                # mt is derived from the previous hidden output together
                # with the keyword information.
                cell_input = tf.concat([step_input, mt], axis=1)
                # The cell input width must equal the number of hidden
                # units. The explicit name makes every time step resolve to
                # the same `dense/kernel` and `dense/bias` under the reused
                # scope instead of depending on auto-generated layer names.
                cell_input = tf.layers.dense(inputs=cell_input,
                                             units=size,
                                             name="dense")
                (cell_output, state) = cell(cell_input, state)
                outputs.append(cell_output)
                output_state = cell_output

        self._end_output = cell_output

    self.output2 = atten_sum
    output = tf.reshape(tf.concat(outputs, axis=1), [-1, size])

    softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
    softmax_b = tf.get_variable("softmax_b", [vocab_size])
    logits = tf.matmul(output, softmax_w) + softmax_b

    # Older TensorFlow exposes the loss under tf.nn.seq2seq; otherwise fall
    # back to the module-level import. Only the lookup is guarded, so a real
    # graph-construction error is no longer swallowed and retried.
    try:
        seq_loss_fn = tf.nn.seq2seq.sequence_loss_by_example
    except AttributeError:
        seq_loss_fn = sequence_loss_by_example
    loss = seq_loss_fn([logits],
                       [tf.reshape(self._targets, [-1])],
                       [tf.reshape(self._mask, [-1])],
                       average_across_timesteps=False)

    self.cost1 = tf.reduce_sum(loss)
    # Penalise the gap between the keyword score phi_res and the attention
    # mass actually accumulated per keyword.
    self.cost2 = tf.reduce_sum((phi_res - atten_sum)**2)
    self._cost = cost = (self.cost1 + 0.1 * self.cost2) / batch_size
    self._final_state = state
    self._prob = tf.nn.softmax(logits)

    if not is_training:
        self._sample = tf.argmax(self._prob, 1)
        return

    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    # Use the variable created just above; the original read `self.lr`,
    # which only works if a property of that name is defined elsewhere.
    optimizer = tf.train.AdamOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
def build_graph(self, test):
    """
    Builds the TensorFlow graph for the multi-layer LSTM language model.

    Args:
        test: When true, batch_size and seq_len are forced to 1 and `loop`
            (which embeds the argmax of the previous output) is passed to the
            decoder as its loop_function. When false, the decoder always
            consumes the provided inputs.
    """
    if test:
        self.batch_size = 1
        self.seq_len = 1

    ##
    # Cells
    ##

    # One fresh cell per layer: repeating a single cell object inside
    # MultiRNNCell either raises or silently ties the layers' weights,
    # depending on the TensorFlow 1.x release.
    layers = [rnn_cell.BasicLSTMCell(self.cell_size)
              for _ in range(self.num_layers)]
    self.cell = rnn_cell.MultiRNNCell(layers)

    ##
    # Data
    ##

    # inputs and targets are 2D tensors of shape [batch_size, seq_len]
    self.inputs = tf.placeholder(tf.int32, [self.batch_size, self.seq_len])
    self.targets = tf.placeholder(tf.int32, [self.batch_size, self.seq_len])
    self.initial_state = self.cell.zero_state(self.batch_size, tf.float32)

    ##
    # Variables
    ##

    with tf.variable_scope('lstm_vars'):
        self.ws = tf.get_variable('ws', [self.cell_size, self.vocab_size])
        self.bs = tf.get_variable('bs', [self.vocab_size])  # TODO: initializer?
        with tf.device('/cpu:0'):  # put on CPU to parallelize for faster training/
            self.embeddings = tf.get_variable(
                'embeddings', [self.vocab_size, self.cell_size])

            # get embeddings for all input words
            input_embeddings = tf.nn.embedding_lookup(self.embeddings,
                                                      self.inputs)
            # The split cuts this tensor along axis 1 into a seq_len long
            # list of 3D tensors of shape [batch_size, 1, cell_size]. The
            # squeeze then removes that size-1 axis from each tensor.
            inputs_split = tf.split(input_embeddings, self.seq_len, 1)
            inputs_split = [tf.squeeze(input_, [1])
                            for input_ in inputs_split]

    def loop(prev, _):
        # Project the previous output to vocabulary logits and embed the
        # most likely symbol; no gradient flows through the argmax.
        prev = tf.matmul(prev, self.ws) + self.bs
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(self.embeddings, prev_symbol)

    lstm_outputs_split, self.final_state = seq2seq.rnn_decoder(
        inputs_split,
        self.initial_state,
        self.cell,
        loop_function=loop if test else None,
        scope='lstm_vars')
    lstm_outputs = tf.reshape(tf.concat(lstm_outputs_split, 1),
                              [-1, self.cell_size])

    logits = tf.matmul(lstm_outputs, self.ws) + self.bs
    self.probs = tf.nn.softmax(logits)

    ##
    # Train
    ##

    # The vocabulary size used to be passed as a fourth positional argument,
    # where it landed in the boolean `average_across_timesteps` slot. Being
    # truthy it selected the default behaviour, so dropping it changes
    # nothing numerically.
    total_loss = seq2seq.sequence_loss_by_example(
        [logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([self.batch_size * self.seq_len])])
    self.loss = tf.reduce_sum(total_loss) / self.batch_size / self.seq_len

    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.optimizer = tf.train.AdamOptimizer(learning_rate=c.L_RATE,
                                            name='optimizer')
    self.train_op = self.optimizer.minimize(self.loss,
                                            global_step=self.global_step,
                                            name='train_op')
def __init__(self, args, embedding):
    """Build a recurrent language model on top of an externally supplied embedding.

    Two decoders share the same weights under the 'STAND' scope: one that
    always consumes the given inputs (``self.probs``) and one that feeds its
    own argmax prediction back in (``self.self_feed_probs``).

    Args:
        args: Settings object; this method reads ``model``, ``rnn_size``,
            ``num_layers``, ``batch_size``, ``seq_length``, ``vocab_size``
            and ``grad_clip``.
        embedding: Embedding table passed to ``tf.nn.embedding_lookup``; it
            is stored on the instance and not created here.

    Raises:
        Exception: If ``args.model`` is not one of 'rnn', 'gru' or 'lstm'.
    """
    self.args = args

    if args.model == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.model == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.model == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("model type not supported: {}".format(args.model))

    # One fresh cell per layer: repeating a single cell object inside
    # MultiRNNCell either raises or silently ties the layers' weights,
    # depending on the TensorFlow 1.x release.
    layers = [cell_fn(args.rnn_size) for _ in range(args.num_layers)]
    self.cell = cell = rnn_cell.MultiRNNCell(layers)

    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length],
                                     name='STAND_input')
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length],
                                  name='STAND_targets')
    self.initial_state = cell.zero_state(args.batch_size, tf.float32)
    self.embedding = embedding

    with tf.variable_scope('STAND'):
        softmax_w = tf.get_variable("softmax_w",
                                    [args.rnn_size, args.vocab_size])
        softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
        # TensorFlow >= 1.0 argument order (value, num_splits, axis), the
        # same order the other models in this file use.
        inputs = tf.split(
            tf.nn.embedding_lookup(self.embedding, self.input_data),
            args.seq_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
        # A real list, not map(): both decoders below iterate over it, and a
        # Python 3 map object would be exhausted after the first one.
        inputs = [tf.nn.l2_normalize(i, 1) for i in inputs]

    def loop(prev, i):
        # Embed (and normalise) the most likely symbol of the previous
        # output; no gradient flows through the argmax.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.l2_normalize(
            tf.nn.embedding_lookup(embedding, prev_symbol), 1)

    o, _ = seq2seq.rnn_decoder(inputs,
                               self.initial_state,
                               cell,
                               loop_function=None,
                               scope='STAND')
    with tf.variable_scope('STAND', reuse=True) as scope:
        sf_o, _ = seq2seq.rnn_decoder(inputs,
                                      self.initial_state,
                                      cell,
                                      loop_function=loop,
                                      scope=scope)

    output = tf.reshape(tf.concat(o, 1), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)

    sf_output = tf.reshape(tf.concat(sf_o, 1), [-1, args.rnn_size])
    self_feed_logits = tf.matmul(sf_output, softmax_w) + softmax_b
    self.self_feed_probs = tf.nn.softmax(self_feed_logits)

    # The vocabulary size used to be passed as a fourth positional argument,
    # where it landed in the boolean `average_across_timesteps` slot. Being
    # truthy it selected the default behaviour, so dropping it changes
    # nothing numerically.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])])
    self.loss = tf.reduce_sum(loss) / args.batch_size / args.seq_length

    self.lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, tvars),
                                      args.grad_clip)
    # List the variables that will be trained (valid on Python 2 and 3).
    for v in tvars:
        print(v.name)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))