def attention_decoder_with_embedding(decoder_inputs,
                                     initial_state,
                                     attention_states,
                                     cell,
                                     embedding,
                                     num_heads=1,
                                     output_size=None,
                                     dtype=dtypes.float32,
                                     scope=None,
                                     initial_state_attention=False):
    """Embed the decoder inputs with a caller-supplied table, then decode.

    No output projection is used because a sampled softmax is NOT used.

    Parameters
    ----------
    decoder_inputs
        Iterable of id tensors; each one is looked up in ``embedding``.
    initial_state
        Forwarded unchanged to ``attention_decoder``.
    attention_states
        Forwarded unchanged to ``attention_decoder``.
    cell
        Forwarded to ``attention_decoder``; its ``output_size`` is the
        fallback when ``output_size`` is None.
    embedding
        Embedding passed in from outside (it is not created here).
    num_heads
        Forwarded unchanged to ``attention_decoder``.
    output_size
        Forwarded to ``attention_decoder``; None means ``cell.output_size``.
    dtype
        Accepted for signature compatibility; not referenced in the body.
    scope
        Variable scope to open; defaults to
        "attention_decoder_with_embedding".
    initial_state_attention
        Forwarded unchanged to ``attention_decoder``.

    Returns
    -------
    Whatever ``attention_decoder`` returns for the embedded inputs.
    """
    output_size = cell.output_size if output_size is None else output_size

    scope_name = scope or "attention_decoder_with_embedding"
    with vs.variable_scope(scope_name):
        embedded_inputs = [
            embedding_ops.embedding_lookup(embedding, token_ids)
            for token_ids in decoder_inputs
        ]
        # loop_function is always None: the given inputs are fed at every step.
        decoded = attention_decoder(
            embedded_inputs,
            initial_state,
            attention_states,
            cell,
            output_size=output_size,
            num_heads=num_heads,
            loop_function=None,
            initial_state_attention=initial_state_attention)
        return decoded
def attention_decoder_with_embedding(decoder_inputs,
                                     initial_state,
                                     attention_states,
                                     cell,
                                     embedding,
                                     num_heads=1,
                                     output_size=None,
                                     dtype=dtypes.float32,
                                     scope=None,
                                     initial_state_attention=False):
    """Run ``attention_decoder`` on inputs embedded with an external table.

    An output projection is not used because a sampled softmax is NOT used.

    Parameters
    ----------
    decoder_inputs
        Sequence of id tensors, each looked up in ``embedding``.
    initial_state
        Passed through to ``attention_decoder``.
    attention_states
        Passed through to ``attention_decoder``.
    cell
        Passed through to ``attention_decoder``; supplies the default
        ``output_size``.
    embedding
        Outside embedding passed in by the caller.
    num_heads
        Passed through to ``attention_decoder``.
    output_size
        Passed through; when None, ``cell.output_size`` is used.
    dtype
        Not referenced in the body (kept for signature compatibility).
    scope
        Variable scope; "attention_decoder_with_embedding" when falsy.
    initial_state_attention
        Passed through to ``attention_decoder``.

    Returns
    -------
    The return value of ``attention_decoder``.
    """
    if output_size is None:
        output_size = cell.output_size

    with vs.variable_scope(scope or "attention_decoder_with_embedding"):
        # One lookup per decoder step, in the order the steps were given.
        looked_up = []
        for step_ids in decoder_inputs:
            looked_up.append(
                embedding_ops.embedding_lookup(embedding, step_ids))

        return attention_decoder(
            looked_up,
            initial_state,
            attention_states,
            cell,
            output_size=output_size,
            num_heads=num_heads,
            loop_function=None,
            initial_state_attention=initial_state_attention)
def simple_attentional_rnn(rnn_input, attention_state_list, initial_state=None):
    """Run an attention decoder (dropout-wrapped LSTM cell) over `rnn_input`.

    Args:
      rnn_input: List of tensors of sizes [-1, sentembed_size]
      attention_state_list: List of tensors of sizes [-1, sentembed_size]
      initial_state: Optional initial decoder state. When None, a zero state
        is built from the cell, sized by the batch dimension of rnn_input[0].

    Returns:
      outputs, state
    """
    # Reshape attention_state_list to tensor
    attention_states = reshape_list2tensor(attention_state_list,
                                           len(attention_state_list),
                                           FLAGS.sentembed_size)

    # Setup cell
    cell = get_lstm_cell()

    # Apply dropout; a keep probability of 1.0 leaves that side untouched.
    in_prob = FLAGS.dropout if FLAGS.use_dropout else 1.0
    out_prob = FLAGS.dropout if FLAGS.use_dropout_outatt else 1.0
    cell = tf.nn.rnn_cell.DropoutWrapper(cell,
                                         input_keep_prob=in_prob,
                                         output_keep_prob=out_prob)

    # Setup attentional RNNs
    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32

    # The decoder feeds initial_state straight into the cell, so the default
    # of None must be replaced by a real state before the call.
    if initial_state is None:
        batch_size = tf.shape(rnn_input[0])[0]
        initial_state = cell.zero_state(batch_size, dtype)

    rnn_outputs, rnn_state = seq2seq.attention_decoder(
        rnn_input,
        initial_state,
        attention_states,
        cell,
        output_size=None,
        num_heads=1,
        loop_function=None,
        dtype=dtype,
        scope=None,
        initial_state_attention=False)
    return rnn_outputs, rnn_state
def attention_decoder(batch_input_shape, cells, code, annotation, keep_prob,
                      **kwargs):
    """Build an attention decoder that reconstructs a sequence from `code`.

    Only the first decoder input (`code` after dropout) is a real input; the
    loop function feeds each step's output back in as the next input.

    Args:
      batch_input_shape: (batch_size, timestep, feature) triple.
      cells: List holding exactly one RNN cell (asserted).
      code: Tensor used as the first decoder input; its second dimension is
        read as the decoder input width.
      annotation: List of tensors that is packed and transposed with
        perm [1, 0, 2] to form the attention states.
        (assumes a time-major list of [batch, dim] tensors -- TODO confirm)
      keep_prob: Keep probability handed to every tf.nn.dropout call.
      **kwargs: Accepted and ignored.

    Returns:
      recX: relu of the per-step projections `dropout(out) * W_out + b_out`,
        packed and transposed with perm [1, 0, 2].
    """
    # Receive arguments
    batch_size, timestep, feature = batch_input_shape
    assert len(cells) == 1, "One cell needed!"
    de_cell = cells[0]
    hidden_dim = de_cell.output_size

    # Start building graph
    code_dropout = tf.nn.dropout(code, keep_prob)
    code_dim = int(code_dropout.get_shape()[1])

    # Placeholders only pad the input list to `timestep` entries; with a
    # loop function the decoder replaces every input after the first.
    rest_of_decoder_inputs = [
        tf.placeholder(tf.float32, shape=[batch_size, code_dim])
        for _ in range(timestep - 1)
    ]
    decoder_inputs_dropout = [code_dropout] + [
        tf.nn.dropout(inp, keep_prob) for inp in rest_of_decoder_inputs
    ]

    def loop(prev, i):
        return prev  # Output as input

    packed_annotation = tf.transpose(tf.pack(annotation), perm=[1, 0, 2])
    decoder_outputs, decoder_state = seq2seq.attention_decoder(
        decoder_inputs_dropout,
        de_cell.zero_state(batch_size, tf.float32),
        packed_annotation,
        de_cell,
        loop_function=loop)

    W_out = tf.get_variable(
        "W_out",
        shape=[hidden_dim, feature],
        initializer=tf.contrib.layers.xavier_initializer())
    b_out = tf.Variable(tf.zeros([feature]))

    # b_out used to be created but never applied. It starts at zero, so
    # adding it does not change the output of a freshly initialised model.
    unpacked_reconstruction = [
        tf.matmul(tf.nn.dropout(out, keep_prob), W_out) + b_out
        for out in decoder_outputs
    ]
    recX = tf.nn.relu(
        tf.transpose(tf.pack(unpacked_reconstruction), perm=[1, 0, 2]))
    return recX
def __init__(self, args, infer=False):
    """Build the seq2seq language-model graph.

    Once data preprocessing is done, the next step is to build the seq2seq
    model. Building the model takes three main steps:

    1. Decide the cell structure of the encoder and decoder, i.e. which
       recurrent unit, how many neurons and how many recurrent layers;
    2. Convert the input data into the format required by TensorFlow's
       seq2seq.rnn_decoder and obtain the final outputs and the last
       hidden state;
    3. Pass the outputs through a softmax layer to get a probability
       distribution, define the loss and choose the gradient-descent
       optimizer.

    TensorFlow provides three kinds of rnncell (RNN, GRU and LSTM), so the
    same three choices are offered here, and each can be stacked into
    several layers with MultiRNNCell.

    :param args: configuration object; this method reads batch_size,
        seq_length, rnncell, rnn_size, num_layers, vocab_size,
        embedding_size, attention, grad_clip and log_dir from it.
    :param infer: when True, args.batch_size and args.seq_length are
        overwritten with 1 and the decoder is given `loop` as its
        loop_function.
    :raises Exception: if args.rnncell is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        # NOTE: this mutates the caller's args object.
        args.batch_size = 1
        args.seq_length = 1

    if args.rnncell == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.rnncell == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.rnncell == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("rnncell type not supported: {}".format(
            args.rnncell))

    cell = cell_fn(args.rnn_size)
    # The same cell object is repeated num_layers times.
    self.cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                 name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        word_embedding = build_weight(
            [args.vocab_size, args.embedding_size], name='word_embedding')
        # Split the embedded batch along axis 1 into seq_length pieces and
        # drop that axis from each piece.
        inputs_list = tf.split(
            1, args.seq_length,
            tf.nn.embedding_lookup(word_embedding, self.input_data))
        inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

    def loop(prev, _):
        # Project the previous output to vocabulary scores, take the argmax
        # (no gradient flows through it) and embed that symbol.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        return tf.nn.embedding_lookup(word_embedding, prev_symbol)

    # Functions used to build the seq2seq model: rnn_decoder and
    # attention_decoder
    if not args.attention:
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list, self.initial_state, self.cell,
            loop_function=loop if infer else None, scope='rnnlm')
        # rnn_decoder takes four main arguments.
        # decoder_inputs is the input data; it must be a list whose tensors
        # have shape [batch_size, input_size], so the list length is
        # seq_length. Our raw input has shape
        # [args.batch_size, args.seq_length], which seems to be missing the
        # input_size dimension; that dimension is the word_embedding size
        # (the word2vec size). The embedding lookup is done by hand here,
        # and the embedding matrix is a learnable parameter.
        # initial_state is the cell's initial state, with shape
        # [batch_size, cell.state_size]; the rnn_cell module provides a
        # state-initialisation function, so it is called directly.
        # cell is the decoder/encoder cell built above.
        # The last argument is loop_function: at generation time the
        # decoder's output at the previous step must be fed in as the next
        # input, and this function has to be written by us.
        # outputs has the same dimensions as decoder_inputs, i.e. the
        # output at every time step;
        # last_state has shape [batch_size, cell.state_size], i.e. the
        # state of all cells at the last time step.
        # outputs is used next to define the objective, while last_state
        # serves as the next-step state for the sampling/generation
        # function.
    else:
        self.attn_length = 5
        self.attn_size = 32
        # The attention states are built by build_weight here, not computed
        # from an encoder.
        self.attention_states = build_weight(
            [args.batch_size, self.attn_length, self.attn_size])
        outputs, last_state = seq2seq.attention_decoder(
            inputs_list, self.initial_state, self.attention_states,
            self.cell, loop_function=loop if infer else None,
            scope='rnnlm')

    self.final_state = last_state
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    # NOTE(review): args.vocab_size is the fourth positional argument;
    # confirm which parameter that is in the installed TensorFlow version.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # TensorFlow provides sequence_loss_by_example to compute the weighted
    # cross-entropy of every word in the sequence; it returns the
    # log-perplexity of each sequence. To use it, outputs is first passed
    # through a feed-forward layer, and a softmax probability distribution
    # is obtained as well.
    # average loss for each word of each timestep
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    # Learning rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    # train_op is the op to run during training
    self.train_op = optimizer.apply_gradients(
        zip(grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5,
                                keep_checkpoint_every_n_hours=1)
    # Log file name: log_dir + timestamp + '.txt', with spaces and slashes
    # removed from the timestamp part.
    self.logfile = args.log_dir + str(
        datetime.datetime.strftime(datetime.datetime.now(),
                                   '%Y-%m-%d %H:%M:%S') + '.txt').replace(
                                       ' ', '').replace('/', '')
    self.var_op = tf.global_variables()
def __init__(self, config, mode='TRAIN', loaded_word_embed=None):
    """Builds the computing graph and initializes all variables.

    Args:
      config: Configuration object contains all model configuration.
      mode: String from {'TRAIN', 'EVAL', 'INFER'}.
      loaded_word_embed: A numpy array of pretrained word embedding.

    Raises:
      ValueError: If mode is 'TRAIN' and config.opt_method is not one of
        'SGD', 'AdaDelta' or 'Adam'.
    """
    # Initializes model parameters.
    self.batch_size = batch_size = config.batch_size
    self.vocab_size = vocab_size = config.vocab_size
    self.embed_dim = embed_dim = config.embed_dim
    self.hidden_dim = hidden_dim = config.hidden_dim
    self.num_hiddens = num_hiddens = config.num_hiddens
    self.num_modes = num_modes = config.num_modes
    self.mode_dim = mode_dim = config.mode_dim
    self.cmt_seq_len = cmt_seq_len = config.cmt_seq_len
    self.reply_seq_len = reply_seq_len = config.reply_seq_len
    # Objective weight for reply language modeling.
    self.alpha = alpha = config.alpha

    # Initializes placeholders for inputs.
    self.comment_inputs = []
    self.comment_weights = []
    self.reply_inputs = []
    self.reply_weights = []
    self._lr = tf.Variable(0.0, trainable=False)
    for i in xrange(cmt_seq_len):
        self.comment_inputs.append(
            tf.placeholder(tf.int32, name='comment_input_{0}'.format(i),
                           shape=[batch_size]))
        self.comment_weights.append(
            tf.placeholder(tf.float32, name='comment_weight_{0}'.format(i),
                           shape=[batch_size]))
    for i in xrange(reply_seq_len):
        self.reply_inputs.append(
            tf.placeholder(tf.int32, name='reply_input_{0}'.format(i),
                           shape=[batch_size]))
        self.reply_weights.append(
            tf.placeholder(tf.float32, name='reply_weight_{0}'.format(i),
                           shape=[batch_size]))

    self.comment_embeds = []
    self.mix_mode_embeds = []
    self.mode_probs = []
    self.init_reply_embed = []

    # Initialize mode_rnn. Output dropout is only applied in TRAIN mode.
    if mode == 'TRAIN' and config.keep_prob < 1.0:
        mode_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(
                    hidden_dim, forget_bias=config.forget_bias,
                    state_is_tuple=True),
                output_keep_prob=config.keep_prob)
            for _ in xrange(num_hiddens)], state_is_tuple=True)
    else:
        mode_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.BasicLSTMCell(
                hidden_dim, forget_bias=config.forget_bias,
                state_is_tuple=True)
            for _ in xrange(num_hiddens)], state_is_tuple=True)

    # Defines the modes.
    batch_mode_inds = tf.constant([range(num_modes)
                                   for _ in range(batch_size)])

    # Defines the embeddings on CPU.
    # NOTE(review): only the variable creation is pinned to the CPU here;
    # confirm that this matches the intended device placement.
    with tf.device('/cpu:0'):
        mode_embedding = tf.get_variable(
            'mode_embedding', [num_modes, mode_dim], dtype=tf.float32)
    att_mode_vecs = tf.nn.embedding_lookup(
        mode_embedding, batch_mode_inds)
    att_states = tf.reshape(
        att_mode_vecs, [-1, num_modes, 1, mode_dim])
    att_mode_weight = tf.get_variable('att_mode_weight',
                                      [1, 1, mode_dim, hidden_dim])
    # A 1x1 convolution projects every mode vector to hidden_dim.
    mode_feat = tf.nn.conv2d(
        att_states, att_mode_weight, [1, 1, 1, 1], 'SAME')
    att_v = tf.get_variable('att_v', [hidden_dim])

    def single_attention(query):
        """Return (softmax scores over modes, score-weighted mode sum)."""
        with tf.variable_scope('attention_mlp'):
            y = linear(query, hidden_dim, True)
            y = tf.reshape(y, [-1, 1, 1, hidden_dim])
            s = tf.reduce_sum(att_v * tf.tanh(mode_feat + y), [2, 3])
            a_score = tf.nn.softmax(s)
            weighted_sum = tf.reduce_sum(
                tf.reshape(a_score, [-1, num_modes, 1, 1]) * att_states,
                [1, 2])
            a_score = tf.reshape(a_score, [-1, num_modes])
            weighted_sum = tf.reshape(weighted_sum, [-1, mode_dim])
        return a_score, weighted_sum

    with tf.device('/cpu:0'):
        if loaded_word_embed is None:
            embed_weight = tf.get_variable('word_embedding',
                                           [vocab_size, embed_dim])
        else:
            pretrain_word_embed = tf.constant(loaded_word_embed)
            embed_weight = tf.get_variable('word_embedding',
                                           initializer=pretrain_word_embed)

    cmt_state = mode_rnn.zero_state(batch_size, tf.float32)
    # Only layer 0's state is unpacked and carried forward, so this
    # presumably assumes num_hiddens == 1 -- TODO confirm.
    c_prev, cell_output = cmt_state[0]

    # Computes the residual value of content and global modes.
    att_proj_weight = tf.get_variable('att_proj_weight',
                                      [mode_dim, hidden_dim])
    att_probs, attns = single_attention(cell_output)
    cell_output += tf.matmul(attns, att_proj_weight)
    cmt_state = [tf.nn.rnn_cell.LSTMStateTuple(c_prev, cell_output)]

    mode_rnn_cell_output = []
    mode_probs = []
    lm_logits = []
    with tf.variable_scope('mode_rnn'):
        for i, cmt_in in enumerate(self.comment_inputs):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            cmt_embeds = tf.reshape(
                tf.nn.embedding_lookup(embed_weight, cmt_in),
                [batch_size, embed_dim])
            cell_output, cmt_state = mode_rnn(cmt_embeds, cmt_state)
            # The raw cell output is stored before the attention residual
            # is added to it.
            mode_rnn_cell_output.append(cell_output)
            att_probs, attns = single_attention(cell_output)
            c_prev, _ = cmt_state[0]
            cell_output += tf.matmul(attns, att_proj_weight)
            cmt_state = [tf.nn.rnn_cell.LSTMStateTuple(c_prev, cell_output)]
            with tf.variable_scope('attention_projection'):
                attention_proj = linear(cell_output, vocab_size, True)
            lm_logits.append(attention_proj)
            mode_probs.append(att_probs)
            if mode == 'INFER':
                self.mix_mode_embeds.append(attns)

    if mode == 'INFER':
        self.comment_embeds = mode_rnn_cell_output
        self.mode_probs = mode_probs

    top_states = [tf.reshape(e, [-1, 1, mode_rnn.output_size])
                  for e in mode_rnn_cell_output]
    states_for_reply_rnn = tf.concat(1, top_states)
    # The last reply token is never an input; it is only a target.
    reply_embeds = [
        tf.reshape(tf.nn.embedding_lookup(embed_weight, reply_i),
                   [batch_size, embed_dim])
        for reply_i in self.reply_inputs[:-1]]

    # Initialize reply_rnn.
    if mode == 'TRAIN' and config.keep_prob < 1.0:
        reply_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(
                    hidden_dim, forget_bias=config.forget_bias,
                    state_is_tuple=True),
                output_keep_prob=config.keep_prob)
            for _ in xrange(num_hiddens)], state_is_tuple=True)
    else:
        reply_rnn = tf.nn.rnn_cell.MultiRNNCell([
            tf.nn.rnn_cell.BasicLSTMCell(
                hidden_dim, forget_bias=config.forget_bias,
                state_is_tuple=True)
            for _ in xrange(num_hiddens)], state_is_tuple=True)

    reply_rnn_output, reply_rnn_final_state = attention_decoder(
        reply_embeds, cmt_state, states_for_reply_rnn, reply_rnn)
    if mode == 'INFER':
        self.init_reply_embed = reply_rnn_output[0]

    # Computes the language model loss for the comment.
    # Targets are the inputs shifted by one step.
    comment_targets = [cc for cc in self.comment_inputs[1:]]
    lm_loss = tf.reduce_sum(sequence_loss_by_example(
        lm_logits[:-1], comment_targets, self.comment_weights[1:]))

    gen_logits = []
    with tf.variable_scope('gen_logit_projection'):
        for i, rnn_out in enumerate(reply_rnn_output):
            if i > 0:
                tf.get_variable_scope().reuse_variables()
            logits = linear(rnn_out, vocab_size, True)
            gen_logits.append(logits)

    # Computes the language model loss for the reply.
    reply_targets = [tt for tt in self.reply_inputs[1:]]
    gen_loss = tf.reduce_sum(sequence_loss_by_example(
        gen_logits, reply_targets, self.reply_weights[1:]))

    loss = lm_loss + alpha * gen_loss
    self.total_loss = loss
    self.saver = tf.train.Saver(tf.all_variables())

    # Everything below builds the training op and is skipped otherwise.
    if mode != 'TRAIN':
        return

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    if config.opt_method == 'SGD':
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
    elif config.opt_method == 'AdaDelta':
        optimizer = tf.train.AdadeltaOptimizer(self._lr)
    elif config.opt_method == 'Adam':
        optimizer = tf.train.AdamOptimizer(self._lr)
    else:
        # Without `raise` the error was built and dropped, and the next
        # line failed on an unbound `optimizer` instead.
        raise ValueError('Unknown optimizer {}'.format(config.opt_method))
    self.train_op = optimizer.apply_gradients(zip(grads, tvars))
def build(self):
    """Build the char-to-word encoder and attention decoder graph.

    Reads self.Xs, self.ts_go, self.X_len, self.X_spaces, self.X_spaces_len,
    self.feedback and the size attributes (alphabet_size, embedd_dims,
    max_x_seq_len, max_t_seq_len, char_enc_units, word_enc_units,
    dec_units). Sets self.embeddings, self.out (list of per-step logits)
    and self.out_tensor (self.out packed and transposed with perm
    [1, 0, 2]).
    """
    print('Building model')
    self.embeddings = tf.Variable(
        tf.random_uniform([self.alphabet_size, self.embedd_dims]),
        name='embeddings')

    X_embedded = tf.gather(self.embeddings, self.Xs, name='embed_X')
    t_embedded = tf.gather(self.embeddings, self.ts_go, name='embed_t')

    with tf.variable_scope('split_X_inputs'):
        X_list = tf.split(
            split_dim=1, num_split=self.max_x_seq_len, value=X_embedded)
        # Squeeze only the split axis. A bare tf.squeeze would also drop a
        # batch or embedding dimension of size 1 and break set_shape below.
        X_list = [tf.squeeze(X, [1]) for X in X_list]
        for X in X_list:
            X.set_shape([None, self.embedd_dims])

    with tf.variable_scope('split_t_inputs'):
        t_list = tf.split(
            split_dim=1, num_split=self.max_t_seq_len, value=t_embedded)
        t_list = [tf.squeeze(t, [1]) for t in t_list]
        for t in t_list:
            t.set_shape([None, self.embedd_dims])

    with tf.variable_scope('dense_out'):
        W_out = tf.get_variable('W_out',
                                [self.dec_units, self.alphabet_size])
        b_out = tf.get_variable('b_out', [self.alphabet_size])

    # char encoder
    char_cell = rnn_cell.GRUCell(self.char_enc_units)
    char_enc_outputs, char_enc_state = rnn.rnn(
        cell=char_cell,
        inputs=X_list,
        dtype=tf.float32,
        sequence_length=self.X_len,
        scope='rnn_char_encoder')

    # char2word: pack the per-step outputs, transpose with perm [1, 0, 2],
    # pick entries with _grid_gather using self.X_spaces, then transpose
    # back and unpack into a list again.
    char2word = tf.transpose(tf.pack(char_enc_outputs), perm=[1, 0, 2])
    char2word = _grid_gather(char2word, self.X_spaces)
    char2word = tf.unpack(tf.transpose(char2word, perm=[1, 0, 2]))
    for t in char2word:
        t.set_shape([None, self.char_enc_units])

    # word encoder
    word_cell = rnn_cell.GRUCell(self.word_enc_units)
    word_enc_outputs, word_enc_state = rnn.rnn(
        cell=word_cell,
        inputs=char2word,
        dtype=tf.float32,
        sequence_length=self.X_spaces_len,
        scope='rnn_word_encoder')

    # The loop function provides inputs to the decoder:
    def decoder_loop_function(prev, i):
        def feedback_on():
            prev_1 = tf.matmul(prev, W_out) + b_out
            # feedback is on, so feed the decoder with the previous output
            return tf.gather(self.embeddings, tf.argmax(prev_1, 1))

        def feedback_off():
            # feedback is off, so just feed the decoder with t's
            return t_list[i]

        return tf.cond(self.feedback, feedback_on, feedback_off)

    # decoder
    att_states = tf.transpose(tf.pack(word_enc_outputs), perm=[1, 0, 2])
    dec_cell = rnn_cell.GRUCell(self.dec_units)
    dec_out, dec_state = seq2seq.attention_decoder(
        decoder_inputs=t_list,
        initial_state=word_enc_state,
        attention_states=att_states,
        cell=dec_cell,
        loop_function=decoder_loop_function,
        scope='attention_decoder')

    self.out = [tf.matmul(d, W_out) + b_out for d in dec_out]

    # for debugging network (should write this outside of build)
    out_packed = tf.pack(self.out)
    out_packed = tf.transpose(out_packed, perm=[1, 0, 2])
    self.out_tensor = out_packed

    # add TensorBoard summaries for all variables
    tf.contrib.layers.summarize_variables()
def decoder_rnn(conv_encoder, rnn_encoder, decoder_inputs, decoder_hidden,
                weigth_generation, weigth_copy, n_steps, bias_generation,
                bias_copy, batch_size, keep_prob, defendant, embedding,
                sample_rate, lstm_layer=1, is_train=True):
    """Attention decoder whose word scores add a generation and a copy term.

    Args:
      conv_encoder: Tensor concatenated (axis 1) onto every decoder input.
      rnn_encoder: Attention states; also projected for the copy term.
        (assumes [batch, source_len, dim] -- TODO confirm)
      decoder_inputs: Ids looked up in `embedding`.
        (assumes [batch, steps] -- TODO confirm)
      decoder_hidden: Number of units of the LSTM cell.
      weigth_generation, bias_generation: Projection giving the generation
        scores from a decoder output.
      weigth_copy, bias_copy: Projection applied (with tanh) to every
        encoder step before the copy scores are computed.
      n_steps: Length of the returned list.
      batch_size: Batch size for the zero state and the sampling mask.
      keep_prob: Keep probability of the dropout applied to the outputs.
      defendant: Ids that are reversed and one-hot encoded to map copy
        scores onto the vocabulary.
      embedding: Embedding table; its first dimension is the one-hot depth.
      sample_rate: A fed-back prediction replaces the given input where
        sample_rate is greater than a uniform draw (training only).
      lstm_layer: When > 1, the cell is repeated that many times.
      is_train: Selects the sampling loop function over the greedy one.

    Returns:
      (res, state): `res` is a list of n_steps entries, one score tensor
      per decoder output (any trailing entries stay 0); `state` is the
      decoder's final state.
    """
    with tf.name_scope('decoder_rnn'):
        cell = rnn_cell.BasicLSTMCell(decoder_hidden, forget_bias=1.0,
                                      state_is_tuple=True)
        if lstm_layer > 1:
            cell = rnn_cell.MultiRNNCell([cell] * lstm_layer)
        zero_state = cell.zero_state(batch_size, tf.float32)

        # Embed, transpose with perm [1, 0, 2] and split into per-step
        # inputs, each extended with conv_encoder.
        embedded = tf.nn.embedding_lookup(embedding, decoder_inputs)
        embedded = tf.transpose(embedded, [1, 0, 2])
        step_inputs = [tf.concat(1, [step, conv_encoder])
                       for step in tf.unpack(embedded)]

        one_hot = tf.one_hot(
            indices=tf.reverse(defendant, dims=[False, True]),
            depth=embedding.get_shape().as_list()[0],
            on_value=1., off_value=0., axis=-1)

        encoder_steps = tf.unpack(tf.transpose(rnn_encoder, [1, 0, 2]))
        projected = [
            tf.tanh(tf.nn.bias_add(tf.matmul(step, weigth_copy), bias_copy))
            for step in encoder_steps
        ]
        projected = tf.transpose(tf.pack(projected), [1, 0, 2])

        def copy_net(decoder_out):
            """Score source positions, then map them onto the vocabulary."""
            with tf.variable_scope('copy_net'):
                column = tf.reshape(decoder_out, [-1, decoder_hidden, 1])
                source_prob = tf.batch_matmul(projected, column)
                source_len = source_prob.get_shape().as_list()[1]
                source_prob = tf.reshape(source_prob, [-1, 1, source_len])
                voc_prob = tf.batch_matmul(source_prob, one_hot)
                vocab_width = voc_prob.get_shape().as_list()[-1]
                return tf.reshape(voc_prob, [-1, vocab_width])

        def word_scores(hidden):
            """Generation scores plus copy scores for one decoder output."""
            generation_prob = tf.nn.bias_add(
                tf.matmul(hidden, weigth_generation), bias_generation)
            copy_prob = copy_net(hidden)
            return tf.add(generation_prob, copy_prob)

        def predicted_input(prev):
            """Embed the argmax word of `prev` and append conv_encoder."""
            sample = tf.argmax(word_scores(prev), 1)
            prev_word = tf.nn.embedding_lookup(embedding, sample)
            return tf.concat(1, [prev_word, conv_encoder])

        def sampling_step(prev, i):
            fed_back = predicted_input(prev)
            # select from prev_outputs and ground truth
            prob = tf.random_uniform(minval=0, maxval=1,
                                     shape=(batch_size,))
            mask = tf.cast(tf.greater(sample_rate, prob), tf.float32)
            mask = tf.expand_dims(mask, 1)
            mask = tf.tile(mask, [1, fed_back.get_shape().as_list()[-1]])
            return mask * fed_back + (1 - mask) * step_inputs[i]

        def greedy_step(prev, i):
            return predicted_input(prev)

        loop_fn = sampling_step if is_train else greedy_step
        outputs, state = seq2seq.attention_decoder(
            decoder_inputs=step_inputs,
            initial_state=zero_state,
            attention_states=rnn_encoder,
            cell=cell,
            num_heads=1,
            loop_function=loop_fn,
            scope='rnn_decoder',
            initial_state_attention=False)

        outputs = tf.unpack(tf.nn.dropout(outputs, keep_prob))
        res = [0] * n_steps
        for i, step_out in enumerate(outputs):
            res[i] = word_scores(step_out)
        return res, state
def decoder_rnn(conv_encoder, rnn_encoder, decoder_inputs, decoder_hidden,
                weigth_generation, weigth_copy, n_steps, bias_generation,
                bias_copy, batch_size, keep_prob, defendant, embedding,
                sample_rate, lstm_layer=1, is_train=True):
    """Decode with attention; each step's scores are generation + copy.

    Args:
      conv_encoder: Tensor appended (axis 1) to every decoder input.
      rnn_encoder: Attention states, also used (after a tanh projection)
        for the copy scores.
      decoder_inputs: Ids embedded with `embedding`.
      decoder_hidden: LSTM cell size.
      weigth_generation, bias_generation: Generation projection.
      weigth_copy, bias_copy: Copy projection applied per encoder step.
      n_steps: Length of the returned score list.
      batch_size: Batch size for the zero state and the sampling mask.
      keep_prob: Dropout keep probability applied to the decoder outputs.
      defendant: Ids reversed and one-hot encoded for the copy mapping.
      embedding: Embedding table; its first dimension is the one-hot depth.
      sample_rate: Threshold compared against a uniform draw to choose
        between the fed-back prediction and the given input (training).
      lstm_layer: Number of stacked copies of the cell when > 1.
      is_train: True selects the sampling loop function, False the greedy.

    Returns:
      (res, state): list of n_steps per-step scores (entries beyond the
      number of decoder outputs remain 0) and the final decoder state.
    """
    with tf.name_scope('decoder_rnn'):
        decoder_cell = rnn_cell.BasicLSTMCell(
            decoder_hidden, forget_bias=1.0, state_is_tuple=True)
        if lstm_layer > 1:
            decoder_cell = rnn_cell.MultiRNNCell(
                [decoder_cell] * lstm_layer)
        start_state = decoder_cell.zero_state(batch_size, tf.float32)

        looked_up = tf.nn.embedding_lookup(embedding, decoder_inputs)
        per_step = tf.unpack(tf.transpose(looked_up, [1, 0, 2]))
        inputs_with_conv = [
            tf.concat(1, [step_emb, conv_encoder]) for step_emb in per_step
        ]

        one_hot = tf.one_hot(
            indices=tf.reverse(defendant, dims=[False, True]),
            depth=embedding.get_shape().as_list()[0],
            on_value=1.,
            off_value=0.,
            axis=-1)

        source_steps = tf.unpack(tf.transpose(rnn_encoder, [1, 0, 2]))
        copy_keys = []
        for source_step in source_steps:
            affine = tf.nn.bias_add(
                tf.matmul(source_step, weigth_copy), bias_copy)
            copy_keys.append(tf.tanh(affine))
        copy_keys = tf.transpose(tf.pack(copy_keys), [1, 0, 2])

        def copy_net(decoder_out):
            """Copy scores over the vocabulary for one decoder output."""
            with tf.variable_scope('copy_net'):
                as_column = tf.reshape(decoder_out,
                                       [-1, decoder_hidden, 1])
                source_prob = tf.batch_matmul(copy_keys, as_column)
                n_source = source_prob.get_shape().as_list()[1]
                source_prob = tf.reshape(source_prob, [-1, 1, n_source])
                voc_prob = tf.batch_matmul(source_prob, one_hot)
                n_vocab = voc_prob.get_shape().as_list()[-1]
                voc_prob = tf.reshape(voc_prob, [-1, n_vocab])
                return voc_prob

        def combined_scores(decoder_out):
            """Sum of the generation scores and the copy scores."""
            generation_prob = tf.nn.bias_add(
                tf.matmul(decoder_out, weigth_generation), bias_generation)
            copy_prob = copy_net(decoder_out)
            return tf.add(generation_prob, copy_prob)

        def feed_back(prev):
            """Next input built from the argmax word of `prev`."""
            best_word = tf.argmax(combined_scores(prev), 1)
            word_vec = tf.nn.embedding_lookup(embedding, best_word)
            return tf.concat(1, [word_vec, conv_encoder])

        if is_train:
            def func(prev, i):
                prev_outputs = feed_back(prev)
                # select from prev_outputs and ground truth
                draw = tf.random_uniform(minval=0, maxval=1,
                                         shape=(batch_size,))
                use_prev = tf.cast(tf.greater(sample_rate, draw),
                                   tf.float32)
                use_prev = tf.expand_dims(use_prev, 1)
                width = prev_outputs.get_shape().as_list()[-1]
                use_prev = tf.tile(use_prev, [1, width])
                return (use_prev * prev_outputs
                        + (1 - use_prev) * inputs_with_conv[i])
        else:
            def func(prev, i):
                return feed_back(prev)

        outputs, state = seq2seq.attention_decoder(
            decoder_inputs=inputs_with_conv,
            initial_state=start_state,
            attention_states=rnn_encoder,
            cell=decoder_cell,
            num_heads=1,
            loop_function=func,
            scope='rnn_decoder',
            initial_state_attention=False)

        outputs = tf.nn.dropout(outputs, keep_prob)
        outputs = tf.unpack(outputs)

        res = [0] * n_steps
        for position, decoder_out in enumerate(outputs):
            res[position] = combined_scores(decoder_out)
        return res, state
def __init__(self, args, infer=False):
    """Build the RNN language-model graph (plain or attention decoder).

    :param args: configuration object; this method reads batch_size,
        seq_length, rnncell, rnn_size, num_layers, vocab_size,
        embedding_size, attention, grad_clip and log_dir from it.
    :param infer: when True, args.batch_size and args.seq_length are
        overwritten with 1 and the decoder is given `loop` as its
        loop_function.
    :raises Exception: if args.rnncell is not 'rnn', 'gru' or 'lstm'.
    """
    self.args = args
    if infer:
        # NOTE: this mutates the caller's args object.
        args.batch_size = 1
        args.seq_length = 1

    if args.rnncell == 'rnn':
        cell_fn = rnn_cell.BasicRNNCell
    elif args.rnncell == 'gru':
        cell_fn = rnn_cell.GRUCell
    elif args.rnncell == 'lstm':
        cell_fn = rnn_cell.BasicLSTMCell
    else:
        raise Exception("rnncell type not supported: {}".format(
            args.rnncell))

    cell = cell_fn(args.rnn_size)
    self.cell = rnn_cell.MultiRNNCell([cell] * args.num_layers)
    self.input_data = tf.placeholder(tf.int32,
                                     [args.batch_size, args.seq_length])
    self.targets = tf.placeholder(tf.int32,
                                  [args.batch_size, args.seq_length])
    self.initial_state = self.cell.zero_state(args.batch_size, tf.float32)

    with tf.variable_scope('rnnlm'):
        softmax_w = build_weight([args.rnn_size, args.vocab_size],
                                 name='soft_w')
        softmax_b = build_weight([args.vocab_size], name='soft_b')
        word_embedding = build_weight(
            [args.vocab_size, args.embedding_size], name='word_embedding')
        # Split the embedded batch along axis 1 into seq_length pieces and
        # drop that axis from each piece.
        inputs_list = tf.split(
            1, args.seq_length,
            tf.nn.embedding_lookup(word_embedding, self.input_data))
        inputs_list = [tf.squeeze(input_, [1]) for input_ in inputs_list]

    def loop(prev, _):
        # Project the previous output to vocabulary scores, take the argmax
        # (no gradient flows through it) and embed that symbol.
        prev = tf.matmul(prev, softmax_w) + softmax_b
        prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
        # The table is `word_embedding`; the old name `embedding` is not
        # defined in this method and failed when the loop function ran.
        return tf.nn.embedding_lookup(word_embedding, prev_symbol)

    if not args.attention:
        outputs, last_state = seq2seq.rnn_decoder(
            inputs_list, self.initial_state, self.cell,
            loop_function=loop if infer else None, scope='rnnlm')
    else:
        self.attn_length = 5
        self.attn_size = 32
        # The attention states are built by build_weight here, not computed
        # from an encoder.
        self.attention_states = build_weight(
            [args.batch_size, self.attn_length, self.attn_size])
        outputs, last_state = seq2seq.attention_decoder(
            inputs_list, self.initial_state, self.attention_states,
            self.cell, loop_function=loop if infer else None,
            scope='rnnlm')

    self.final_state = last_state
    output = tf.reshape(tf.concat(1, outputs), [-1, args.rnn_size])
    self.logits = tf.matmul(output, softmax_w) + softmax_b
    self.probs = tf.nn.softmax(self.logits)
    # NOTE(review): args.vocab_size is the fourth positional argument;
    # confirm which parameter that is in the installed TensorFlow version.
    loss = seq2seq.sequence_loss_by_example(
        [self.logits],
        [tf.reshape(self.targets, [-1])],
        [tf.ones([args.batch_size * args.seq_length])],
        args.vocab_size)
    # average loss for each word of each timestep
    self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
    # Learning rate starts at 0.0 and is not trainable.
    self.lr = tf.Variable(0.0, trainable=False)
    self.var_trainable_op = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, self.var_trainable_op), args.grad_clip)
    optimizer = tf.train.AdamOptimizer(self.lr)
    self.train_op = optimizer.apply_gradients(
        zip(grads, self.var_trainable_op))
    self.initial_op = tf.global_variables_initializer()
    # Log file name: log_dir + timestamp + '.txt', with spaces and slashes
    # removed from the timestamp part.
    self.logfile = args.log_dir + str(
        datetime.datetime.strftime(datetime.datetime.now(),
                                   '%Y-%m-%d %H:%M:%S') + '.txt').replace(
                                       ' ', '').replace('/', '')
    self.var_op = tf.global_variables()
    self.saver = tf.train.Saver(self.var_op, max_to_keep=5,
                                keep_checkpoint_every_n_hours=1)
def decoder_rnn(conv_encoder, rnn_encoder, decoder_inputs, decoder_hidden,
                weigth_generation, n_steps, bias_generation, batch_size,
                keep_prob, defendant, embedding, sample_rate, lstm_layer=1,
                is_train=True):
    """Attention decoder with a single generation projection (no copy term).

    Args:
      conv_encoder: Tensor concatenated (axis 1) onto every decoder input.
      rnn_encoder: Attention states handed to the decoder.
      decoder_inputs: Ids looked up in `embedding`.
      decoder_hidden: Number of units of the LSTM cell.
      weigth_generation, bias_generation: Projection from a decoder output
        to word scores.
      n_steps: Length of the returned list.
      batch_size: Batch size for the zero state and the sampling mask.
      keep_prob: Keep probability of the dropout applied to the outputs.
      defendant: Not referenced in this variant.
      embedding: Embedding table.
      sample_rate: A fed-back prediction replaces the given input where
        sample_rate is greater than a uniform draw (training only).
      lstm_layer: When > 1, the cell is repeated that many times.
      is_train: Selects the sampling loop function over the greedy one.

    Returns:
      (res, state): `res` is a list of n_steps entries, one score tensor
      per decoder output (any trailing entries stay 0); `state` is the
      decoder's final state.
    """
    with tf.name_scope('decoder_rnn'):
        cell = rnn_cell.BasicLSTMCell(decoder_hidden, forget_bias=1.0,
                                      state_is_tuple=True)
        if lstm_layer > 1:
            cell = rnn_cell.MultiRNNCell([cell] * lstm_layer)
        zero_state = cell.zero_state(batch_size, tf.float32)

        embedded = tf.nn.embedding_lookup(embedding, decoder_inputs)
        embedded = tf.transpose(embedded, [1, 0, 2])
        step_inputs = [tf.concat(1, [step, conv_encoder])
                       for step in tf.unpack(embedded)]

        def word_scores(hidden):
            """Project one decoder output to word scores."""
            return tf.nn.bias_add(tf.matmul(hidden, weigth_generation),
                                  bias_generation)

        def predicted_input(prev):
            """Embed the argmax word of `prev` and append conv_encoder."""
            sample = tf.argmax(word_scores(prev), 1)
            prev_word = tf.nn.embedding_lookup(embedding, sample)
            return tf.concat(1, [prev_word, conv_encoder])

        def sampling_step(prev, i):
            fed_back = predicted_input(prev)
            # select from prev_outputs and ground truth
            prob = tf.random_uniform(minval=0, maxval=1,
                                     shape=(batch_size,))
            mask = tf.cast(tf.greater(sample_rate, prob), tf.float32)
            mask = tf.expand_dims(mask, 1)
            mask = tf.tile(mask, [1, fed_back.get_shape().as_list()[-1]])
            return mask * fed_back + (1 - mask) * step_inputs[i]

        def greedy_step(prev, i):
            return predicted_input(prev)

        loop_fn = sampling_step if is_train else greedy_step
        outputs, state = seq2seq.attention_decoder(
            decoder_inputs=step_inputs,
            initial_state=zero_state,
            attention_states=rnn_encoder,
            cell=cell,
            num_heads=1,
            loop_function=loop_fn,
            scope='rnn_decoder',
            initial_state_attention=False)

        outputs = tf.unpack(tf.nn.dropout(outputs, keep_prob))
        res = [0] * n_steps
        for i, step_out in enumerate(outputs):
            res[i] = word_scores(step_out)
        return res, state
def embedding_attention_decoder(decoder_inputs, initial_state,
                                attention_states, cell, num_symbols,
                                embedding_size, num_heads=1, output_size=None,
                                output_projection=None, feed_previous=False,
                                update_embedding_for_previous=True,
                                dtype=dtypes.float32, scope=None,
                                initial_state_attention=False,
                                embedding=None):
    """RNN decoder with attention that uses an externally supplied embedding.

    Unlike a decoder that creates its own "embedding" variable, this function
    looks decoder inputs up in the `embedding` passed by the caller, so the
    same matrix can be shared with other parts of the model.

    Args:
        decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder
            inputs).
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function.
        num_symbols: Integer, number of output symbols; only used here to
            check the shape of the projection biases.
        embedding_size: Integer; unused by this function (the embedding is
            supplied by the caller), kept for interface compatibility.
        num_heads: Number of attention heads that read from attention_states.
        output_size: Size of the output vectors; if None, use
            cell.output_size.
        output_projection: None or a pair (W, B) of output projection weights
            and biases; B must be compatible with shape [num_symbols]. It is
            forwarded to the loop function when feed_previous=True.
        feed_previous: Boolean; if True, a loop function built by
            `_extract_sample_and_embed` generates every decoder input after
            the first from the previous output. If False, decoder_inputs are
            used as given (the standard decoder case).
        update_embedding_for_previous: Boolean forwarded to
            `_extract_sample_and_embed`; has no effect if feed_previous=False.
        dtype: The dtype used to convert the projection biases
            (default: tf.float32).
        scope: VariableScope for the created subgraph; defaults to
            "embedding_attention_decoder".
        initial_state_attention: Forwarded unchanged to `attention_decoder`.
        embedding: The embedding matrix to look decoder inputs up in.
            Required, despite the None default that keeps the signature
            backward compatible.

    Returns:
        Whatever `attention_decoder` returns for the embedded inputs: a tuple
        of the form (outputs, state).

    Raises:
        ValueError: When `embedding` is None, or when the biases of
            output_projection have the wrong shape.
    """
    # Fail early with a clear message: the embedding variable is no longer
    # created here, so a missing one would otherwise surface as an obscure
    # error deep inside embedding_lookup.
    if embedding is None:
        raise ValueError(
            "embedding must be provided to embedding_attention_decoder")

    if output_size is None:
        output_size = cell.output_size
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(
            scope or "embedding_attention_decoder"):
        loop_function = None
        if feed_previous:
            loop_function = _extract_sample_and_embed(
                embedding, output_projection, update_embedding_for_previous)
        emb_inp = [embedding_ops.embedding_lookup(embedding, i)
                   for i in decoder_inputs]
        return attention_decoder(
            emb_inp,
            initial_state,
            attention_states,
            cell,
            output_size=output_size,
            num_heads=num_heads,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)
def embedding_attention_decoder(decoder_inputs, initial_state,
                                attention_states, cell, num_symbols,
                                embedding_size, num_heads=1, output_size=None,
                                output_projection=None, feed_previous=False,
                                update_embedding_for_previous=True,
                                dtype=dtypes.float32, scope=None,
                                initial_state_attention=False,
                                embedding=None):
    """Attention RNN decoder whose inputs are embedded with a given matrix.

    The caller supplies `embedding`; this function does not create an
    embedding variable of its own. Decoder inputs are embedded and handed to
    `attention_decoder`.

    Args:
        decoder_inputs: A list of 1D batch-sized int32 Tensors (decoder
            inputs).
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function.
        num_symbols: Integer, number of output symbols; only used to validate
            the shape of the projection biases.
        embedding_size: Integer; not used by this function because the
            embedding is passed in. Kept so existing callers keep working.
        num_heads: Number of attention heads that read from attention_states.
        output_size: Size of the output vectors; if None, use
            cell.output_size.
        output_projection: None or a pair (W, B) of output projection weights
            and biases; B must be compatible with shape [num_symbols]. Passed
            on to the loop function when feed_previous=True.
        feed_previous: Boolean; if True, decoder inputs after the first are
            produced from the previous output by the loop function returned
            by `_extract_sample_and_embed`. If False, decoder_inputs are used
            as given.
        update_embedding_for_previous: Boolean passed on to
            `_extract_sample_and_embed`; ignored if feed_previous=False.
        dtype: The dtype used when converting the projection biases to a
            tensor (default: tf.float32).
        scope: VariableScope for the created subgraph; defaults to
            "embedding_attention_decoder".
        initial_state_attention: Passed through to `attention_decoder`.
        embedding: Embedding matrix used for the lookups. Must not be None;
            the None default only preserves the original signature.

    Returns:
        The (outputs, state) tuple returned by `attention_decoder`.

    Raises:
        ValueError: When `embedding` is None, or when the biases of
            output_projection have the wrong shape.
    """
    # The embedding used to be created inside this scope; now it must come
    # from the caller, so reject a missing one explicitly instead of letting
    # embedding_lookup fail with an unrelated-looking error.
    if embedding is None:
        raise ValueError(
            "embedding must be provided to embedding_attention_decoder")

    if output_size is None:
        output_size = cell.output_size
    if output_projection is not None:
        proj_biases = ops.convert_to_tensor(output_projection[1], dtype=dtype)
        proj_biases.get_shape().assert_is_compatible_with([num_symbols])

    with variable_scope.variable_scope(
            scope or "embedding_attention_decoder"):
        if feed_previous:
            loop_function = _extract_sample_and_embed(
                embedding, output_projection, update_embedding_for_previous)
        else:
            loop_function = None
        emb_inp = [embedding_ops.embedding_lookup(embedding, i)
                   for i in decoder_inputs]
        return attention_decoder(
            emb_inp,
            initial_state,
            attention_states,
            cell,
            output_size=output_size,
            num_heads=num_heads,
            loop_function=loop_function,
            initial_state_attention=initial_state_attention)
def build(self):
    """Build the char encoder -> word encoder -> attention decoder graph.

    Reads the input tensors `self.Xs`, `self.ts_go`, `self.X_len`,
    `self.X_spaces`, `self.X_spaces_len` and `self.feedback` together with
    the size attributes (`alphabet_size`, `embedd_dims`, `max_x_seq_len`,
    `max_t_seq_len`, `char_enc_units`, `word_enc_units`, `dec_units`).

    Sets:
        self.embeddings: The shared embedding variable.
        self.out: List with one score tensor per decoder step
            (decoder output times W_out plus b_out).
        self.out_tensor: `self.out` packed and transposed with perm
            [1, 0, 2] (presumably batch-major -- confirm with caller).
    """
    print('Building model')
    self.embeddings = tf.Variable(
        tf.random_uniform([self.alphabet_size, self.embedd_dims]),
        name='embeddings')

    X_embedded = tf.gather(self.embeddings, self.Xs, name='embed_X')
    t_embedded = tf.gather(self.embeddings, self.ts_go, name='embed_t')

    with tf.variable_scope('split_X_inputs'):
        X_list = tf.split(
            split_dim=1, num_split=self.max_x_seq_len, value=X_embedded)
        # Squeeze only the split axis: a bare squeeze would also drop a
        # batch (or embedding) dimension that happens to be 1.
        X_list = [tf.squeeze(X, [1]) for X in X_list]
        for X in X_list:
            X.set_shape([None, self.embedd_dims])

    with tf.variable_scope('split_t_inputs'):
        t_list = tf.split(
            split_dim=1, num_split=self.max_t_seq_len, value=t_embedded)
        # Same reasoning as for X_list: remove the time axis only.
        t_list = [tf.squeeze(t, [1]) for t in t_list]
        for t in t_list:
            t.set_shape([None, self.embedd_dims])

    with tf.variable_scope('dense_out'):
        W_out = tf.get_variable(
            'W_out', [self.dec_units, self.alphabet_size])
        b_out = tf.get_variable('b_out', [self.alphabet_size])

    # char encoder
    char_cell = rnn_cell.GRUCell(self.char_enc_units)
    char_enc_outputs, _ = rnn.rnn(
        cell=char_cell,
        inputs=X_list,
        dtype=tf.float32,
        sequence_length=self.X_len,
        scope='rnn_char_encoder')

    # char2word: presumably picks the char-encoder outputs at the positions
    # in self.X_spaces -- confirm against _grid_gather.
    char2word = tf.transpose(tf.pack(char_enc_outputs), perm=[1, 0, 2])
    char2word = _grid_gather(char2word, self.X_spaces)
    char2word = tf.unpack(tf.transpose(char2word, perm=[1, 0, 2]))
    for word_input in char2word:
        word_input.set_shape([None, self.char_enc_units])

    # word encoder
    word_cell = rnn_cell.GRUCell(self.word_enc_units)
    word_enc_outputs, word_enc_state = rnn.rnn(
        cell=word_cell,
        inputs=char2word,
        dtype=tf.float32,
        sequence_length=self.X_spaces_len,
        scope='rnn_word_encoder')

    # The loop function provides inputs to the decoder:
    def decoder_loop_function(prev, i):
        def feedback_on():
            # feedback is on, so feed the decoder with the previous output
            prev_1 = tf.matmul(prev, W_out) + b_out
            return tf.gather(self.embeddings, tf.argmax(prev_1, 1))

        def feedback_off():
            # feedback is off, so just feed the decoder with t's
            return t_list[i]

        return tf.cond(self.feedback, feedback_on, feedback_off)

    # decoder
    att_states = tf.transpose(tf.pack(word_enc_outputs), perm=[1, 0, 2])
    dec_cell = rnn_cell.GRUCell(self.dec_units)
    dec_out, _ = seq2seq.attention_decoder(
        decoder_inputs=t_list,
        initial_state=word_enc_state,
        attention_states=att_states,
        cell=dec_cell,
        loop_function=decoder_loop_function,
        scope='attention_decoder')

    self.out = [tf.matmul(d, W_out) + b_out for d in dec_out]

    # for debugging network (should write this outside of build)
    self.out_tensor = tf.transpose(tf.pack(self.out), perm=[1, 0, 2])

    # add TensorBoard summaries for all variables
    tf.contrib.layers.summarize_variables()