def Decoder(args, mode, enc_rnn_out, enc_rnn_state, X, emb_Y, emb_out):
    """Build the decoder sub-graph for training or beam-search inference.

    Args:
        args: Hyper-parameter object; this function reads
            ``output_vocab_size``, ``beam_width`` and ``maxlen`` from it
            (``_decoder`` receives it too and may read more).
        mode: ``"train"`` selects teacher-forced decoding; any other value
            selects beam-search decoding.
        enc_rnn_out: Encoder outputs. The size of axis 0 is used as the
            batch size; the tensor itself is forwarded to ``_decoder``.
        enc_rnn_state: Encoder final state, forwarded to ``_decoder``.
        X: Extra input forwarded unchanged to the decoder classes through
            their ``X`` keyword.
        emb_Y: Embedded target inputs fed to the ``TrainingHelper``
            (training mode only).
        emb_out: Embedding handed to ``BeamSearchDecoder`` (inference
            mode only).

    Returns:
        Tuple ``(logits, sample_ids)``. In training mode these are
        ``outputs.rnn_output`` and ``outputs.sample_id``. In inference mode
        ``logits`` is a ``tf.no_op()`` stand-in and ``sample_ids`` is
        ``outputs.predicted_ids``.
    """
    with tf.variable_scope("Decoder") as decoder_scope:
        out_layer = Dense(args.output_vocab_size)  # projection W*X+b
        beam_width = args.beam_width
        batch_size = tf.shape(enc_rnn_out)[0]

        cell, initial_state = _decoder(
            args, enc_rnn_out, enc_rnn_state, mode, beam_width, batch_size)

        if mode == "train":
            # Every example is decoded for exactly args.maxlen steps.
            seq_len = tf.tile(
                tf.constant([args.maxlen], dtype=tf.int32), [batch_size])
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=emb_Y, sequence_length=seq_len)
            decoder = BasicDecoder(
                cell=cell,
                helper=helper,
                initial_state=initial_state,
                X=X,
                output_layer=out_layer)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=decoder,
                maximum_iterations=args.maxlen,
                scope=decoder_scope)
            logits = outputs.rnn_output
            sample_ids = outputs.sample_id
        else:
            start_tokens = tf.tile(
                tf.constant([_GO], dtype=tf.int32), [batch_size])
            end_token = _END
            my_decoder = BeamSearchDecoder(
                cell=cell,
                embedding=emb_out,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=initial_state,
                beam_width=beam_width,
                X=X,
                output_layer=out_layer,
                length_penalty_weight=0.0)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                my_decoder,
                maximum_iterations=args.maxlen,
                scope=decoder_scope)
            # No logits are produced by beam search; keep the return arity.
            logits = tf.no_op()
            sample_ids = outputs.predicted_ids
    return logits, sample_ids
def _build_decoder(self, dec_scope_name, encoder_output, encoder_state, target_data, target_seq_len):
    """Create the attention decoder and decode in training and greedy modes.

    Args:
        dec_scope_name: Base name; used as the name scope and as the prefix
            of the ``_train`` / ``_predict`` variable scopes.
        encoder_output: Encoder outputs, handed to ``self._build_attention``.
        encoder_state: Encoder final state, handed to
            ``self._build_attention``.
        target_data: Target token ids (layout ``<GO> ... <PAD>``), looked up
            in the decoder embedding table for teacher forcing.
        target_seq_len: Per-example target lengths for the training helper.

    Returns:
        Tuple ``(train_decoder_output, predict_decoder_output)``: the first
        element returned by ``dynamic_decode`` for each of the two decoders.
    """
    with tf.name_scope(dec_scope_name):
        embedding_table = tf.Variable(
            tf.random_uniform([self.tgt_vocab_size, self.embedding_size]))

        # Stacked GRU layers, subsequently wrapped with attention.
        gru_layers = []
        for _ in range(self.num_layers):
            gru_layers.append(self.get_gru_cell(self.rnn_size, self.dropout))
        stacked_cell = tf.contrib.rnn.MultiRNNCell(gru_layers)
        attn_cell, initial_state = self._build_attention(
            encoder_output, encoder_state, stacked_cell)

        # Bias-free projection onto the target vocabulary.
        projection = Dense(
            self.tgt_vocab_size,
            use_bias=False,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

        with tf.variable_scope(dec_scope_name + '_train'):
            # Teacher forcing: feed the embedded ground-truth tokens.
            # Result of the lookup: [batch_size, max_time, embed_size], float32.
            embedded_targets = tf.nn.embedding_lookup(embedding_table, target_data)
            teacher_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=embedded_targets,
                sequence_length=target_seq_len,
                time_major=False)
            teacher_decoder = tf.contrib.seq2seq.BasicDecoder(
                attn_cell, teacher_helper, initial_state, projection)
            train_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                teacher_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_len)

        with tf.variable_scope(dec_scope_name + '_predict', reuse=True):
            # Greedy decoding: start from '<go>', stop at '<eos>'.
            start_tokens = tf.fill(
                [self.batch_size], self.start_vocab.index('<go>'))
            eos_id = self.start_vocab.index('<eos>')
            greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding_table, start_tokens, eos_id)
            greedy_decoder = tf.contrib.seq2seq.BasicDecoder(
                attn_cell, greedy_helper, initial_state, projection)
            predict_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                greedy_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_len)

    return train_decoder_output, predict_decoder_output
def _project_lstm_state_tuple(state_tuple, num_units):
    r"""
    Merge several `LSTMStateTuple` objects into one of size `num_units`.

    The `c` members of all states are concatenated along the last axis, and
    likewise the `h` members; both concatenations are then passed through
    the same bias-free dense layer (named 'state_projection').

    Args:
        state_tuple: an iterable of `LSTMStateTuple` objects
        num_units: output dimension of the projection
    Returns:
        projected_state: a single `LSTMStateTuple` whose `c` and `h` are the
            projected concatenations
    """
    projection = Dense(num_units, name='state_projection', use_bias=False)

    cell_parts = []
    for item in state_tuple:
        cell_parts.append(item.c)
    merged_c = tf.concat(cell_parts, axis=-1)

    hidden_parts = []
    for item in state_tuple:
        hidden_parts.append(item.h)
    merged_h = tf.concat(hidden_parts, axis=-1)

    # One shared layer is applied to both members, c first.
    new_c = projection(merged_c)
    new_h = projection(merged_h)

    projected_state = tf.contrib.rnn.LSTMStateTuple(c=new_c, h=new_h)
    print('projected_state', projected_state)
    return projected_state
def add_decoder_for_training(self):
    """Build the teacher-forced decoder and store its logits.

    Calls ``self.add_attention_for_training()`` first, then creates the
    ``decoder_embedding`` variable, decodes with a ``TrainingHelper`` over
    ``self.processed_decoder_input()`` and assigns the decoder's
    ``rnn_output`` to ``self.training_logits``.
    """
    self.add_attention_for_training()

    embedding_matrix = tf.get_variable(
        'decoder_embedding',
        [len(self.Y_word2idx), self.decoder_embedding_dim],
        tf.float32,
        tf.random_uniform_initializer(-1.0, 1.0))

    embedded_inputs = tf.nn.embedding_lookup(
        embedding_matrix, self.processed_decoder_input())
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=embedded_inputs,
        sequence_length=self.Y_seq_len,
        time_major=False)

    # Start from the wrapper's zero state with the encoder state swapped in.
    zero_state = self.decoder_cell.zero_state(self.batch_size, tf.float32)
    start_state = zero_state.clone(cell_state=self.encoder_state)
    vocab_projection = Dense(len(self.Y_word2idx))

    decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=self.decoder_cell,
        helper=helper,
        initial_state=start_state,
        output_layer=vocab_projection)

    longest_target = tf.reduce_max(self.Y_seq_len)
    decoded, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=decoder,
        impute_finished=True,
        maximum_iterations=longest_target)

    self.training_logits = decoded.rnn_output
def build_encoder(self):
    """Build the encoder sub-graph under the ``encoder`` variable scope.

    Creates the source embedding table, projects the embedded inputs to
    ``self.encoder_hidden_units`` with a dense layer, and runs the encoder
    cell over them with ``tf.nn.dynamic_rnn``.

    Sets ``self.encoder_cell``, ``self.encoder_embeddings``,
    ``self.encoder_inputs_embedded``, ``self.encoder_outputs`` and
    ``self.encoder_last_state``.
    """
    # Call form of print: valid on Python 3 and prints the same on Python 2.
    print('Building Encoder')
    with tf.variable_scope('encoder'):
        self.encoder_cell = self.build_encoder_cell()

        # Uniform on [-sqrt(3), sqrt(3)] has variance 1.
        initializer = tf.random_uniform_initializer(
            -math.sqrt(3), math.sqrt(3), dtype=tf.float32)
        self.encoder_embeddings = tf.get_variable(
            "encoder_embeddings",
            [self.src_vocab_size, self.input_embedding_size],
            initializer=initializer,
            dtype=tf.float32)

        # [batch_size, time_step, embedding_size]
        self.encoder_inputs_embedded = tf.nn.embedding_lookup(
            params=self.encoder_embeddings, ids=self.encoder_inputs)

        # Input projection layer to feed embedded inputs to the cell.
        input_layer = Dense(self.encoder_hidden_units, dtype=tf.float32)
        self.encoder_inputs_embedded = input_layer(
            self.encoder_inputs_embedded)

        # Encode input sequences into context vectors:
        # encoder_outputs: [batch_size, max_time_step, cell_output_size]
        # encoder_last_state: final state of the cell; its structure
        #   depends on what build_encoder_cell() returns.
        self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn(
            cell=self.encoder_cell,
            inputs=self.encoder_inputs_embedded,
            sequence_length=self.encoder_inputs_length,
            dtype=tf.float32,
            time_major=False)
def decode(helper, memory, scope, enc_state, reuse=None, maximum_iterations=27):
    """Run an attention GRU decoder driven by ``helper``.

    Args:
        helper: A seq2seq helper that supplies the decoder inputs.
        memory: Tensor attended over by the Luong attention mechanism.
        scope: Variable scope (name or object) the decoder is built in.
        enc_state: Encoder state; element ``[0]`` is used as the initial
            cell state of the attention wrapper.
        reuse: Passed to ``tf.variable_scope`` to control variable reuse.
        maximum_iterations: Upper bound on the number of decoding steps.
            Defaults to 27, the value that used to be hard-coded.

    Returns:
        The value returned by ``tf.contrib.seq2seq.dynamic_decode``
        (final outputs, final state, final sequence lengths).
    """
    with tf.variable_scope(scope, reuse=reuse):
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units=cfg.RNN_UNITS, memory=memory)
        cell = tf.contrib.rnn.GRUCell(num_units=cfg.RNN_UNITS)
        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell,
            attention_mechanism,
            attention_layer_size=cfg.RNN_UNITS,
            output_attention=True)
        output_layer = Dense(units=cfg.VOCAB_SIZE)

        # The batch size is fixed by configuration, not taken from `memory`.
        initial_state = attn_cell.zero_state(
            dtype=tf.float32,
            batch_size=cfg.BATCH_SIZE).clone(cell_state=enc_state[0])
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=attn_cell,
            helper=helper,
            initial_state=initial_state,
            output_layer=output_layer)

        outputs = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=maximum_iterations)
        return outputs
def decoder(decoder_embed_input, decoder_y, target_length, max_target_length, encode_state, keep_prob, reuse=False):
    """Build a teacher-forced LSTM decoder inside the ``decoder`` scope.

    Relies on module-level names ``n_hidden``, ``n_input``, ``batch_size``,
    ``vocab_to_int`` and ``dic_embeddings``.

    Args:
        decoder_embed_input: Integer token ids of the target sequence; a
            ``<GO>`` column is prepended before the embedding lookup.
        decoder_y: Extra features concatenated to the embedded inputs on
            axis 2.
        target_length: Per-example target lengths.
        max_target_length: Maximum number of decoding steps; also the width
            of the returned mask.
        encode_state: Encoder state used as the decoder's initial state.
        keep_prob: Output keep probability for the dropout wrapper.
        reuse: Passed to ``tf.variable_scope``.

    Returns:
        Tuple ``(output, predicting_logits, training_logits, masks,
        target)``. ``predicting_logits`` is ``output.sample_id`` (named
        'predictions'), ``training_logits`` is ``output.rnn_output`` (named
        'logits'), and ``target`` is ``decoder_embed_input`` unchanged.
    """
    with tf.variable_scope("decoder", reuse=reuse):
        lstm = tf.contrib.rnn.LSTMCell(
            n_hidden, forget_bias=1.0, state_is_tuple=True)
        cell_with_dropout = tf.contrib.rnn.DropoutWrapper(
            lstm, output_keep_prob=keep_prob)
        start_state = encode_state
        projection = Dense(n_input)  # TOTAL_SIZE

        # Prepend a <GO> column to the target ids.
        go_column = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
        ids_with_go = tf.concat([go_column, decoder_embed_input], 1)
        embedded = tf.nn.embedding_lookup(dic_embeddings, ids_with_go)
        # Append the extra per-step features along the feature axis.
        step_inputs = tf.concat([embedded, decoder_y], 2)

        helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=step_inputs, sequence_length=target_length)
        basic_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell_with_dropout, helper, start_state, projection)
        output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            basic_decoder,
            impute_finished=True,
            maximum_iterations=max_target_length)

        predicting_logits = tf.identity(output.sample_id, name='predictions')
        training_logits = tf.identity(output.rnn_output, 'logits')
        masks = tf.sequence_mask(
            target_length, max_target_length, dtype=tf.float32, name='masks')

        # The targets are the raw (un-shifted) input ids.
        target = decoder_embed_input
        return output, predicting_logits, training_logits, masks, target
def _build_word_projections(self):
    """Create the decoder output projection and the word embedding map.

    Builds a ``Dense`` layer named 'output_projection' for inputs of size
    ``rnn_size`` and stores it in ``self.decoder_output_layer``. Then
    creates the 'embedding_map' variable of shape
    ``[softmax_size, word_size]``; it is pinned to '/cpu:0' when the
    configured token type is 'word'.

    Returns:
        The embedding variable (also stored in ``self._word_embed_map``).
    """
    config = self._config
    rnn_size = config.rnn_size
    word_size = config.rnn_word_size
    softmax_size = self._softmax_size
    on_cpu = config.token_type == 'word'

    projection = Dense(softmax_size, name='output_projection')
    projection.build(rnn_size)
    self.decoder_output_layer = projection

    print('INFO: Building separate embedding matrix.')

    def _create_embedding_map():
        # Single place that defines the embedding variable.
        return tf.get_variable(
            name='embedding_map',
            shape=[softmax_size, word_size],
            dtype=tf.float32,
            trainable=True)

    if not on_cpu:
        self._word_embed_map = _create_embedding_map()
    else:
        with tf.device('/cpu:0'):
            self._word_embed_map = _create_embedding_map()
    return self._word_embed_map
def _decoder_inference(self, init_state):
    """Decode with beam search and return one beam per example.

    Reads hyper-parameters from the module-level ``args``
    (``beam_width``, ``rnn_size``, ``decoder_layers``, ``vocab_size``).

    Args:
        init_state: Initial decoder state; it is tiled ``beam_width`` times
            with ``tile_batch`` before being handed to the decoder.

    Returns:
        ``predicted_ids[:, :, 0]`` of the beam-search output, i.e. index 0
        along the last axis (presumably the top-scoring beam -- confirm
        against the BeamSearchDecoder in use).
    """
    beam_width = args.beam_width

    # Give every beam its own copy of the latent vector z.
    z_per_beam = tf.tile(tf.expand_dims(self.z, 1), [1, beam_width, 1])

    layers = []
    for _ in range(args.decoder_layers):
        layers.append(self._rnn_cell(args.rnn_size, reuse=True))
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell(layers)

    start_ids = tf.tile(
        tf.constant([self._word2idx['<start>']], dtype=tf.int32),
        [self._batch_size])
    end_id = self._word2idx['<end>']
    tiled_state = tf.contrib.seq2seq.tile_batch(init_state, beam_width)
    projection = Dense(args.vocab_size, _reuse=True)

    beam_decoder = BeamSearchDecoder(
        cell=stacked_cell,
        embedding=self.tied_embedding,
        start_tokens=start_ids,
        end_token=end_id,
        initial_state=tiled_state,
        beam_width=beam_width,
        output_layer=projection,
        length_penalty_weight=0.0,
        z=z_per_beam)

    decoded, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder=beam_decoder,
        impute_finished=False,
        maximum_iterations=self.gen_seq_length)
    return decoded.predicted_ids[:, :, 0]
def attention_keras_test():
    """Demo: train a tiny attention decoder built on a Keras SimpleRNNCell.

    Builds a toy graph (3 sequences of length 6 over a 6-token vocabulary),
    wraps ``tf.keras.layers.SimpleRNNCell`` in an ``AttentionWrapper`` with
    Bahdanau attention over random stand-in encoder outputs, runs 100 Adam
    steps, and prints the loss, the decoder outputs, the final attention
    state and the output-projection weights. Returns nothing.

    NOTE(review): the ``tf.InteractiveSession`` opened here is never closed.
    """
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5

    x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],
                       [SOS_token, 3, 4, 2, 3, 1],
                       [SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    y_data = np.array([[3, 1, 4, 3, 2, EOS_token],
                       [3, 4, 2, 3, 1, EOS_token],
                       [1, 3, 2, 2, 1, EOS_token]], dtype=np.int32)
    print("data shape: ", x_data.shape)
    sess = tf.InteractiveSession()

    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    seq_length = x_data.shape[1]
    embedding_dim = 8

    init = np.arange(vocab_size * embedding_dim).reshape(vocab_size, -1)

    train_mode = True
    # When True, initial_state / last_state must not be passed to sess.run
    # (original author's note: because alignment_history is a function).
    alignment_history_flag = True

    with tf.variable_scope('test', reuse=tf.AUTO_REUSE):
        # Base recurrent cell.
        cell = tf.keras.layers.SimpleRNNCell(units=hidden_dim)

        embedding = tf.get_variable(
            "embedding", initializer=init.astype(np.float32), dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, x_data)  # batch_size x seq_length x embedding_dim

        Y = tf.convert_to_tensor(y_data)

        # Stand-in encoder outputs: 20 = encoder sequence length,
        # 30 = encoder hidden dim.
        encoder_outputs = tf.convert_to_tensor(
            np.random.normal(0, 1, [batch_size, 20, 30]).astype(np.float32))

        # Valid encoder lengths per example, so that padded encoder
        # positions receive no attention.
        input_lengths = [5, 10, 20]

        # Attention mechanism; num_units = Na = 11.
        # Alternatives the author tried: BahdanauMonotonicAttention with the
        # same arguments, or LuongAttention -- for LuongAttention num_units
        # cannot be arbitrary, it must equal the decoder's hidden_dim.
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=11,
            memory=encoder_outputs,
            memory_sequence_length=input_lengths,
            normalize=False)

        # output_attention=True (default): the attention vector is emitted
        # as the output; False: the cell output is emitted.
        # attention_layer_size = N_l
        attention_initial_state = [
            cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)]
        cell = tf.contrib.seq2seq.AttentionWrapper(
            cell,
            attention_mechanism,
            attention_layer_size=13,
            initial_cell_state=attention_initial_state,
            alignment_history=alignment_history_flag,
            output_attention=True)

        # zero_state picks up the attention_initial_state given to the
        # wrapper above, i.e. AttentionWrapperState.cell_state holds it.
        initial_state = cell.zero_state(batch_size, tf.float32)  # AttentionWrapperState

        if train_mode:
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs, np.array([seq_length] * batch_size, dtype=np.int32))
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding,
                start_tokens=tf.tile([SOS_token], [batch_size]),
                end_token=EOS_token)

        output_layer = Dense(output_dim, name='output_projection')
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=initial_state,
            output_layer=output_layer)

        # Without maximum_iterations, inference would loop forever if the
        # EOS token is never produced.
        outputs, last_state, last_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=10)

        weights = tf.ones(shape=[batch_size, seq_length])
        loss = tf.contrib.seq2seq.sequence_loss(
            logits=outputs.rnn_output, targets=Y, weights=weights)

        opt = tf.train.AdamOptimizer(0.01).minimize(loss)

        sess.run(tf.global_variables_initializer())
        for i in range(100):
            loss_, _ = sess.run([loss, opt])
            print("{} loss: = {}".format(i, loss_))

        if not alignment_history_flag:
            print("initial_state: ", sess.run(initial_state))
        print("\n\noutputs: ", outputs)
        o = sess.run(outputs.rnn_output)  # batch_size, seq_length, outputs
        o2 = sess.run(tf.argmax(outputs.rnn_output, axis=-1))
        print("\n", o, o2)  # batch_size, seq_length, outputs

        print("\n\nlast_state: ", last_state)
        if not alignment_history_flag:
            print(sess.run(last_state))  # batch_size, hidden_dim
        else:
            # Evaluate the state members one by one instead of the whole
            # AttentionWrapperState.
            print("alignment_history: ", last_state.alignment_history.stack())
            alignment_history_ = sess.run(last_state.alignment_history.stack())
            print(alignment_history_)
            print("alignment_history sum: ", np.sum(alignment_history_, axis=-1))

            print("cell_state: ", sess.run(last_state.cell_state))
            print("attention: ", sess.run(last_state.attention))
            print("time: ", sess.run(last_state.time))

            alignments_ = sess.run(last_state.alignments)
            print("alignments: ", alignments_)
            print('alignments sum: ', np.sum(alignments_, axis=1))  # check that alignments sum to 1
            print("attention_state: ", sess.run(last_state.attention_state))

        print("\n\nlast_sequence_lengths: ", last_sequence_lengths)
        print(sess.run(last_sequence_lengths))  # [seq_length]*batch_size

        print("kernel(weight)", sess.run(output_layer.trainable_weights[0]))  # kernel(weight)
        print("bias", sess.run(output_layer.trainable_weights[1]))  # bias

        if train_mode:
            # Cross-check the graph loss against a NumPy computation.
            p = sess.run(tf.nn.softmax(outputs.rnn_output)).reshape(-1, output_dim)
            print("loss: {:20.6f}".format(sess.run(loss)))
            print("manual cal. loss: {:0.6f} ".format(
                np.average(-np.log(p[np.arange(y_data.size), y_data.flatten()]))))
def attention_multicell_test():
    """Demo: attention decoder on top of a stacked (multi-layer) RNN cell.

    Stacks ``BasicRNNCell`` layers into a ``MultiRNNCell``, wraps the stack
    in an ``AttentionWrapper`` with Bahdanau attention over constant
    stand-in encoder outputs, decodes the toy data once (no training step)
    and prints the initial state, outputs, final state, sequence lengths,
    projection weights and the loss. Returns nothing.

    Original author's note: with a multi-cell, attention is applied to the
    bottom layer.

    NOTE(review): the ``tf.InteractiveSession`` opened here is never closed.
    """
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5

    x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],
                       [SOS_token, 3, 4, 2, 3, 1],
                       [SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    y_data = np.array([[3, 1, 4, 3, 2, EOS_token],
                       [3, 4, 2, 3, 1, EOS_token],
                       [1, 3, 2, 2, 1, EOS_token]], dtype=np.int32)
    print("data shape: ", x_data.shape)
    sess = tf.InteractiveSession()

    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    num_layers = 2
    seq_length = x_data.shape[1]
    embedding_dim = 8

    init = np.arange(vocab_size * embedding_dim).reshape(vocab_size, -1)

    train_mode = True
    with tf.variable_scope('test', reuse=tf.AUTO_REUSE):
        # Multi-layer RNN cell.
        cells = [tf.contrib.rnn.BasicRNNCell(num_units=hidden_dim)
                 for _ in range(num_layers)]
        cell = tf.contrib.rnn.MultiRNNCell(cells)

        embedding = tf.get_variable(
            "embedding", initializer=init.astype(np.float32), dtype=tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, x_data)  # batch_size x seq_length x embedding_dim

        Y = tf.convert_to_tensor(y_data)

        encoder_outputs = tf.ones([batch_size, 20, 30])
        input_lengths = [20] * batch_size

        # Attention mechanism.
        attention_initial_state = cell.zero_state(batch_size, tf.float32)  # other values could be supplied too
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=11,
            memory=encoder_outputs,
            memory_sequence_length=input_lengths)
        cell = tf.contrib.seq2seq.AttentionWrapper(
            cell,
            attention_mechanism,
            initial_cell_state=attention_initial_state,
            attention_layer_size=13)

        # Returns an AttentionWrapperState object whose cell state is
        # (batch_size x hidden_dim) x number of layers.
        initial_state = cell.zero_state(batch_size, tf.float32)

        if train_mode:
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs, np.array([seq_length] * batch_size, dtype=np.int32))
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding,
                start_tokens=tf.tile([SOS_token], [batch_size]),
                end_token=EOS_token)

        output_layer = Dense(output_dim, name='output_projection')
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=cell,
            helper=helper,
            initial_state=initial_state,
            output_layer=output_layer)

        # Without maximum_iterations, inference would loop forever if the
        # EOS token is never produced.
        outputs, last_state, last_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=10)

        weights = tf.ones(shape=[batch_size, seq_length])
        loss = tf.contrib.seq2seq.sequence_loss(
            logits=outputs.rnn_output, targets=Y, weights=weights)

        sess.run(tf.global_variables_initializer())
        print("initial_state: ", sess.run(initial_state))
        print("\n\noutputs: ", outputs)
        o = sess.run(outputs.rnn_output)  # batch_size, seq_length, outputs
        o2 = sess.run(tf.argmax(outputs.rnn_output, axis=-1))
        print("\n", o, o2)  # batch_size, seq_length, outputs

        print("\n\nlast_state: ", last_state)
        print(sess.run(last_state))  # batch_size, hidden_dim

        print("\n\nlast_sequence_lengths: ", last_sequence_lengths)
        print(sess.run(last_sequence_lengths))  # [seq_length]*batch_size

        print("kernel(weight)", sess.run(output_layer.trainable_weights[0]))  # kernel(weight)
        print("bias", sess.run(output_layer.trainable_weights[1]))  # bias

        if train_mode:
            # Cross-check the graph loss against a NumPy computation.
            p = sess.run(tf.nn.softmax(outputs.rnn_output)).reshape(-1, output_dim)
            print("loss: {:20.6f}".format(sess.run(loss)))
            print("manual cal. loss: {:0.6f} ".format(
                np.average(-np.log(p[np.arange(y_data.size), y_data.flatten()]))))
def __init__(self, data, args, embed):
    """Build the full graph of a GRU sentence VAE (inputs, losses, training).

    Args:
        data: Dataset object; ``vocab_size``, ``go_id`` and ``eos_id`` are
            read from it.
        args: Hyper-parameters; this method reads ``embedding_size``, ``lr``,
            ``lr_decay``, ``eh_size``, ``dh_size``, ``z_dim``,
            ``full_kl_step``, ``min_kl``, ``name``, ``momentum``,
            ``grad_clip`` and ``checkpoint_max_to_keep`` (and passes
            ``args`` on to ``self.create_summary``).
        embed: Initial value for the embedding table, or ``None`` to let
            TensorFlow initialise a ``[vocab_size, embedding_size]`` table.
    """
    with tf.variable_scope("input"):
        with tf.variable_scope("embedding"):
            # Embedding table: random when no pre-trained vectors are given.
            if embed is not None:
                self.embed = tf.get_variable(
                    'embed', dtype=tf.float32, initializer=embed)
            else:
                self.embed = tf.get_variable(
                    'embed',
                    [data.vocab_size, args.embedding_size],
                    tf.float32)

        self.sentence = tf.placeholder(
            tf.int32, (None, None), 'sen_inps')  # batch*len
        self.sentence_length = tf.placeholder(
            tf.int32, (None, ), 'sen_lens')  # batch
        self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

        batch_size = tf.shape(self.sentence)[0]
        batch_len = tf.shape(self.sentence)[1]
        self.decoder_max_len = batch_len - 1

        self.encoder_input = tf.nn.embedding_lookup(
            self.embed, self.sentence)  # batch*len*unit
        self.encoder_len = self.sentence_length

        # Decoder inputs drop the last column (no eos_id); targets drop the
        # first column (no go_id).
        input_ids = tf.split(
            self.sentence, [self.decoder_max_len, 1], 1)[0]
        self.decoder_input = tf.nn.embedding_lookup(
            self.embed, input_ids)  # batch*(len-1)*unit
        self.decoder_target = tf.split(
            self.sentence, [1, self.decoder_max_len], 1)[1]  # batch*(len-1)
        self.decoder_len = self.sentence_length - 1
        self.decoder_mask = tf.sequence_mask(
            self.decoder_len,
            self.decoder_max_len,
            dtype=tf.float32)  # batch*(len-1)

    # Training-process state: learning rate (with decay op) and step counter.
    self.learning_rate = tf.Variable(
        float(args.lr), trainable=False, dtype=tf.float32)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * args.lr_decay)
    self.global_step = tf.Variable(0, trainable=False)

    # Recurrent cells for the encoder and the decoder.
    encoder_cell = tf.nn.rnn_cell.GRUCell(args.eh_size)
    decoder_cell = tf.nn.rnn_cell.GRUCell(args.dh_size)

    # Encoder: only the final state is used below.
    with tf.variable_scope('encoder'):
        _, encoder_state = dynamic_rnn(
            encoder_cell,
            self.encoder_input,
            self.encoder_len,
            dtype=tf.float32,
            scope="encoder_rnn")

    with tf.variable_scope('recognition_net'):
        # Posterior parameters predicted from the encoder state.
        self.recog_mu = tf.layers.dense(
            inputs=encoder_state,
            units=args.z_dim,
            activation=None,
            name='recog_mu')
        self.recog_logvar = tf.layers.dense(
            inputs=encoder_state,
            units=args.z_dim,
            activation=None,
            name='recog_logvar')

        # Reparameterised sample: z = mu + sigma * noise.
        noise = tf.random_normal(tf.shape(self.recog_logvar), name="epsilon")
        sigma = tf.exp(0.5 * self.recog_logvar)
        self.recog_z = tf.add(
            self.recog_mu, tf.multiply(sigma, noise), name='recog_z')

        # KL divergence to the standard normal, averaged over the batch.
        kld_per_example = 0.5 * tf.reduce_sum(
            tf.exp(self.recog_logvar)
            + self.recog_mu * self.recog_mu
            - self.recog_logvar
            - 1,
            axis=-1)
        self.kld = tf.reduce_mean(kld_per_example)

        self.prior_z = tf.random_normal(
            tf.shape(self.recog_logvar), name="prior_z")
        # Sample from the prior or from the posterior, chosen at run time.
        latent_sample = tf.cond(
            self.use_prior,
            lambda: self.prior_z,
            lambda: self.recog_z,
            name='latent_sample')
        dec_init_state = tf.layers.dense(
            inputs=latent_sample, units=args.dh_size, activation=None)

    with tf.variable_scope("output_layer",
                           initializer=tf.orthogonal_initializer()):
        self.output_layer = Dense(
            data.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
            use_bias=True)

    with tf.variable_scope("decode",
                           initializer=tf.orthogonal_initializer()):
        # Teacher-forced decoding for the training losses.
        teacher_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=self.decoder_input, sequence_length=self.decoder_len)
        teacher_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=decoder_cell,
            helper=teacher_helper,
            initial_state=dec_init_state,
            output_layer=self.output_layer)
        teacher_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=teacher_decoder,
            maximum_iterations=self.decoder_max_len,
            impute_finished=True)
        logits = teacher_output.rnn_output

        token_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.decoder_target, logits=logits)
        # Masked sum over all tokens in the batch.
        total_xent = tf.reduce_sum(token_xent * self.decoder_mask)
        self.sen_loss = total_xent / tf.to_float(batch_size)
        self.ppl_loss = total_xent / tf.reduce_sum(self.decoder_mask)

        self.decoder_distribution_teacher = tf.nn.log_softmax(logits)

    with tf.variable_scope("decode", reuse=True):
        # Greedy decoding that reuses the variables created above.
        greedy_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            self.embed, tf.fill([batch_size], data.go_id), data.eos_id)
        greedy_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=decoder_cell,
            helper=greedy_helper,
            initial_state=dec_init_state,
            output_layer=self.output_layer)
        greedy_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=greedy_decoder,
            maximum_iterations=self.decoder_max_len,
            impute_finished=True)
        self.decoder_distribution = greedy_output.rnn_output
        # Argmax over the vocabulary with the first two ids excluded
        # (original note: "for removing UNK"); +2 restores the id offset.
        without_first_two = tf.split(
            self.decoder_distribution, [2, data.vocab_size - 2], 2)[1]
        self.generation_index = tf.argmax(without_first_two, 2) + 2

    # KL weight grows linearly with the global step up to 1.0.
    self.kl_weights = tf.minimum(
        tf.to_float(self.global_step) / args.full_kl_step, 1.0)
    self.kl_loss = self.kl_weights * tf.maximum(self.kld, args.min_kl)
    self.loss = self.sen_loss + self.kl_loss

    # Optimise only the trainable variables whose name contains args.name.
    selected = []
    for variable in tf.trainable_variables():
        if args.name in variable.name:
            selected.append(variable)
    self.params = selected

    optimizer = tf.train.MomentumOptimizer(
        learning_rate=self.learning_rate, momentum=args.momentum)
    raw_gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        raw_gradients, args.grad_clip)
    self.update = optimizer.apply_gradients(
        zip(clipped_gradients, self.params), global_step=self.global_step)

    # Checkpoint savers: a rolling "latest" saver and a single "best" slot.
    self.latest_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=args.checkpoint_max_to_keep,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)
    self.best_saver = tf.train.Saver(
        write_version=tf.train.SaverDef.V2,
        max_to_keep=1,
        pad_step_number=True,
        keep_checkpoint_every_n_hours=1.0)

    # Summaries for TensorBoard.
    self.create_summary(args)
def cnnlstm(features, labels, mode, params):
    """
    Model function for a tf.estimator: character-CNN word embeddings, an RNN
    encoder, and an attention RNN decoder that predicts word ids.

    Pipeline:
        - Character ids are embedded, and every word is convolved with one
          kernel per entry of `kernels`; tanh + max over the word gives one
          feature vector per word. (TODO from the original author: add a
          highway network.)
        - The per-word vectors are fed to a multi-layer RNN encoder.
        - A multi-layer RNN decoder with Luong attention over the encoder
          outputs starts from the encoder state and emits word logits.

    Args:
        - features: a dict with the keys read here:
            - sequence: character ids; per the original documentation a
              tensor of shape [batch_size, max_sentence_length, max_word_size]
              padded with 0
            - sequence_length: original length of each sequence, [batch_size]
        - labels: a dict, only read when mode is not PREDICT:
            - sequence_input: word ids fed to the decoder (teacher forcing)
            - sequence_output: word ids the decoder should predict
            - sequence_length: length of each target sequence
        - mode: the mode of the model (given by the estimator)
        - params: a dict with the following keys:
            - char_vocab_size, char_embedding_size: character embedding table
            - word_vocab_size, word_embedding_size: word embedding table
            - kernels, kernel_features: kernel sizes and number of filters,
              zipped pairwise
            - hidden_size, network_depth, dropout: passed to create_cell /
              used as attention size
            - start_token, end_token: ids used for greedy decoding
            - learning_rate, decay_steps: used by the optimizer schedule

    Returns:
        A tf.estimator.EstimatorSpec: predictions + export outputs in
        PREDICT mode, loss + train_op in TRAIN mode, loss + eval metrics
        otherwise.
    """
    with tf.variable_scope('ModelParams'):
        batch_size = tf.shape(features['sequence'])[0]
        timesteps = tf.shape(features['sequence'])[1]
        maxwordlength = tf.shape(features['sequence'])[2]
        c_embed_s = params['char_embedding_size']
        dropout = params['dropout']
        hidden_size = params['hidden_size']
        network_depth = params['network_depth']
        kernels = params['kernels']
        kernel_features = params['kernel_features']

    with tf.variable_scope('Convolution'):
        ###########
        # ENCODER #
        ###########
        # Character embeddings matrix: each character id (int) is associated
        # with a vector of size char_embedding_size.
        embeddings_c = tf.Variable(tf.random_uniform([params['char_vocab_size'],
                                                      c_embed_s], -1.0, 1.0))

        # Embed every char id. Goes from
        # [batch_size, max_sequence_length, max_word_size] to
        # [batch_size, max_sequence_length, max_word_size, char_embedding_size]
        embedded_chars = tf.nn.embedding_lookup(embeddings_c, features['sequence'])

        # Treat every word as its own example: reshape from
        # [batch, timesteps, wordlength, embedsize] to
        # [batch*timesteps, wordlength, embedsize]
        cnn_inputs = tf.reshape(embedded_chars, [batch_size*timesteps, maxwordlength, c_embed_s])

        # Insert a singleton axis at position 1 for the 2-D convolution.
        cnn_inputs = tf.expand_dims(cnn_inputs, 1)

        # Holds the pooled result of every convolution.
        layers = []
        # One convolution per (kernel size, number of filters) pair.
        for kernel_size, kernel_feature_size in zip(kernels, kernel_features):

            # Apply the convolution on all of the inputs for this kernel.
            conv = conv2d(cnn_inputs, kernel_feature_size, 1,
                          kernel_size, name="kernel_%d" % kernel_size)

            # tanh, then max over axis 2, then drop the two singleton axes.
            pool = tf.reduce_max(tf.tanh(conv), 2, keep_dims=True)

            layers.append(tf.squeeze(pool, [1, 2]))

        # Concatenate the features of all kernels and restore the
        # [batch, timesteps, features] layout for the RNN.
        cnn_output = tf.concat(layers, 1)
        rnn_inputs = tf.reshape(cnn_output, [batch_size, timesteps, sum(kernel_features)])

    with tf.variable_scope('RNN_Encoder'):
        # The actual encoder: the CNN word embeddings computed above are fed
        # to a multi-layer RNN built from create_cell.
        # TODO: apply dropout
        cell_list = [create_cell(mode, dropout, hidden_size) for _ in range(network_depth)]
        cell = tf.contrib.rnn.MultiRNNCell(cell_list)

        # Run the cell over every timestep. Returns the output at every step
        # and the last hidden state.
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(cell=cell,
                                                           dtype=tf.float32,
                                                           inputs=rnn_inputs,
                                                           sequence_length=features['sequence_length'])

    with tf.variable_scope('Decoder'):
        ###########
        # DECODER #
        ###########
        # Word embeddings matrix: every word id (int) in the vocab is
        # associated with a vector of size word_embedding_size.
        embeddings_w = tf.Variable(tf.random_uniform([params['word_vocab_size'],
                                                      params['word_embedding_size']], -1.0, 1.0))

        # Decoder cell: a second multi-layer cell built from create_cell.
        # NOTE: this statement also rebinds the local name `cell`.
        cell_list_dec = [create_cell(mode, dropout, hidden_size) for _ in range(network_depth)]
        decoder_cell = cell = tf.contrib.rnn.MultiRNNCell(cell_list_dec)

        # Attention mechanism over the encoder outputs.
        attention_mechanism = LuongAttention(num_units=hidden_size,
                                             memory=encoder_outputs,
                                             memory_sequence_length=features['sequence_length'])
        # Attention wrapper; its initial state carries the encoder state.
        attn_cell = AttentionWrapper(decoder_cell, attention_mechanism)
        initial_decoder_state = attn_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)

        # Projection layer: maps the decoder cell output onto the word
        # vocabulary dimension.
        projection_layer = Dense(params['word_vocab_size'], use_bias=False)

        # Outside of inference, feed the ground-truth decoder inputs at each
        # time step.
        if mode != tf.estimator.ModeKeys.PREDICT:
            # Decoder outputs, i.e., what we are trying to predict.
            decoder_o = tf.cast(labels['sequence_output'], tf.int32)
            # Embed the decoder input.
            decoder_i = tf.nn.embedding_lookup(embeddings_w, labels['sequence_input'])

            # Helper that feeds the true input at each time step, whatever
            # the decoder computed earlier.
            output_sequence_length = tf.cast(labels['sequence_length'], tf.int32)
            helper = TrainingHelper(decoder_i, output_sequence_length)
        else:
            # At inference time there are no true inputs, so the helper
            # embeds the previously generated output with the decoder
            # embeddings and feeds it back.
            start_token = tf.fill([batch_size], params['start_token'])
            end_token = tf.cast(params['end_token'], tf.int32)
            helper = GreedyEmbeddingHelper(embeddings_w, start_token, end_token)

        # The final decoder, with its cell, its initial state, its helper
        # function, and its projection layer.
        decoder = BasicDecoder(attn_cell, helper,initial_decoder_state,
                               output_layer=projection_layer)

        # dynamic_decode drives the decoder step by step and collects the
        # outputs until the decoder is done. The decoder passes inputs to
        # the RNN, samples from its output and computes the next input via
        # the helper: TrainingHelper feeds the provided decoder inputs,
        # GreedyEmbeddingHelper embeds the sampled output (argmax of the
        # logits) and feeds that back. The result is a BasicDecoderOutput
        # holding the logits and the sample ids.
        if mode != tf.estimator.ModeKeys.PREDICT:
            outputs, state, sequence_lengths = dynamic_decode(decoder)
        else:
            # Cap inference at twice the longest input sequence.
            max_iterations = tf.cast(tf.reduce_max(features['sequence_length'])*2, tf.int32)
            outputs, state, sequence_lengths = dynamic_decode(decoder,
                                                              maximum_iterations=max_iterations)

    with tf.variable_scope('Prediction'):
        # Unpack the decoder output.
        logits = outputs.rnn_output  # output of the projection layer
        sample_id = outputs.sample_id  # argmax of the logits

        # Inference only.
        if mode == tf.estimator.ModeKeys.PREDICT:
            # Return a dict with the sampled word ids.
            predictions = {"sequence": sample_id}
            export_outputs = {
                'prediction': tf.estimator.export.PredictOutput(predictions)
            }
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions,
                                              export_outputs=export_outputs)

    with tf.variable_scope('Loss'):
        # Not at inference time: compute the cross entropy.
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=decoder_o,
                                                                  logits=logits)
        # Mask that zeroes the loss where the sentences are finished.
        target_w = tf.sequence_mask(labels['sequence_length'], dtype=logits.dtype)
        # Apply the mask, sum the loss across all dimensions and normalise.
        # NOTE(review): the denominator is batch_size + timesteps, although
        # the original comment said "divide it by the batch size" -- confirm
        # which normalisation is intended before changing it.
        batch_size_32 = tf.cast(batch_size, tf.float32)
        timesteps_32 = tf.cast(timesteps, tf.float32)
        loss = (tf.reduce_sum(crossent * target_w) / (batch_size_32+timesteps_32))

    with tf.variable_scope('Train'):
        # At train time only.
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Adam with a staircase exponential learning-rate decay.
            learning_rate = tf.train.exponential_decay(params['learning_rate'],
                                                       tf.train.get_global_step(),
                                                       params['decay_steps'],
                                                       0.96, staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # Apply gradient clipping (global norm 5.0).
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(gradients, variables),
                                                 global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    with tf.variable_scope('Evaluate'):
        # Accuracy of the sampled ids against the targets, weighted by the
        # sequence mask so padded positions do not count.
        eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels=decoder_o,
                                                           predictions=sample_id,
                                                           weights=target_w)}
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                          eval_metric_ops=eval_metric_ops)
def build_model(self):
    """Build the seq2seq graph: placeholders, encoder, attention decoder, saver.

    In ``'train'`` mode this defines ``self.loss``, ``self.summary_op`` and
    ``self.train_op``. In ``'decode'`` mode it defines
    ``self.decoder_predict_decode``. ``self.saver`` is created in every mode.
    """
    print('building model... ...')

    # ================================= 1. Placeholders
    self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                         name='encoder_inputs')
    self.encoder_inputs_length = tf.placeholder(
        tf.int32, [None], name='encoder_inputs_length')
    self.batch_size = tf.placeholder(tf.int32, [], name='batch_size')
    self.keep_prob_placeholder = tf.placeholder(
        tf.float32, name='keep_prob_placeholder')
    self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                          name='decoder_targets')
    self.decoder_targets_length = tf.placeholder(
        tf.int32, [None], name='decoder_targets_length')

    # Take the longest target length in the batch and build a 0/1 mask of that
    # width from the per-example lengths. Example of tf.sequence_mask:
    #   tf.sequence_mask([1, 3, 2], 5)
    #   [[True, False, False, False, False],
    #    [True, True,  True,  False, False],
    #    [True, True,  False, False, False]]
    self.max_target_sequence_length = tf.reduce_max(
        self.decoder_targets_length, name='max_target_len')
    self.mask = tf.sequence_mask(self.decoder_targets_length,
                                 self.max_target_sequence_length,
                                 dtype=tf.float32,
                                 name='masks')

    # ================================= 2. Encoder
    with tf.variable_scope('encoder'):
        # Cell is produced by _create_rnn_cell (original note: two LSTM
        # layers + dropout).
        encoder_cell = self._create_rnn_cell()
        # Embedding matrix shared by the encoder and the decoder.
        embedding = tf.get_variable('embedding',
                                    [self.vocab_size, self.embedding_size])
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            embedding, self.encoder_inputs)
        # encoder_outputs is the attention memory; encoder_state initialises
        # the decoder state.
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell,
            encoder_inputs_embedded,
            sequence_length=self.encoder_inputs_length,
            dtype=tf.float32)

    # ================================= 3. Decoder
    with tf.variable_scope('decoder'):
        # Beam-search tiling is only meaningful for inference. Tiling while
        # training would make the attention memory batch (batch * beam)
        # disagree with the TrainingHelper inputs (batch).
        use_beam = self.mode == 'decode' and self.beam_search

        encoder_inputs_length = self.encoder_inputs_length
        if use_beam:
            # Beam search needs every encoder tensor replicated beam_size
            # times along the batch axis.
            print("use beamsearch decoding..")
            encoder_outputs = tf.contrib.seq2seq.tile_batch(
                encoder_outputs, multiplier=self.beam_size)
            encoder_state = nest.map_structure(
                lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size),
                encoder_state)
            encoder_inputs_length = tf.contrib.seq2seq.tile_batch(
                self.encoder_inputs_length, multiplier=self.beam_size)

        # Attention mechanism over the encoder outputs.
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units=self.rnn_size,
            memory=encoder_outputs,
            memory_sequence_length=encoder_inputs_length)

        # Decoder cell wrapped with attention.
        decoder_cell = self._create_rnn_cell()
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attention_mechanism,
            attention_layer_size=self.rnn_size,
            name='Attention_Wrapper')

        # With beam search the effective batch is batch_size * beam_size,
        # because the encoder tensors were tiled above.
        batch_size = (self.batch_size * self.beam_size
                      if use_beam else self.batch_size)

        # Start from a zero attention state and copy the encoder's final
        # state into the wrapped cell state.
        decoder_initial_state = decoder_cell.zero_state(
            batch_size=batch_size,
            dtype=tf.float32).clone(cell_state=encoder_state)

        # Projection from decoder output to vocabulary logits.
        output_layer = Dense(
            self.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))

        if self.mode == 'train':
            # Decoder input = targets with <go> prepended and the last column
            # dropped, then embedded.
            ending = tf.strided_slice(self.decoder_targets, [0, 0],
                                      [self.batch_size, -1], [1, 1])
            decoder_input = tf.concat([
                tf.fill([self.batch_size, 1], self.word_to_idx['<go>']),
                ending
            ], 1)
            decoder_inputs_embedded = tf.nn.embedding_lookup(
                embedding, decoder_input)

            # Teacher forcing: TrainingHelper + BasicDecoder.
            training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=decoder_inputs_embedded,
                sequence_length=self.decoder_targets_length,
                time_major=False,
                name='training_helper')
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=training_helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)

            # decoder_outputs is a namedtuple (rnn_output, sample_id);
            # rnn_output holds the per-step vocabulary logits used for the
            # loss.
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)

            self.decoder_logits_train = tf.identity(
                decoder_outputs.rnn_output)
            self.decoder_predict_train = tf.argmax(
                self.decoder_logits_train,
                axis=-1,
                name='decoder_pred_train')

            # Masked sequence loss: padded positions get zero weight.
            self.loss = tf.contrib.seq2seq.sequence_loss(
                logits=self.decoder_logits_train,
                targets=self.decoder_targets,
                weights=self.mask)

            # Training summary for the current batch loss.
            tf.summary.scalar('loss', self.loss)
            self.summary_op = tf.summary.merge_all()

            # Adam with global-norm gradient clipping.
            optimizer = tf.train.AdamOptimizer(self.learning_rate)
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(self.loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.max_gradient_norm)
            self.train_op = optimizer.apply_gradients(
                zip(clip_gradients, trainable_params))

        elif self.mode == 'decode':
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * self.word_to_idx['<go>']
            end_token = self.word_to_idx['<eos>']

            # Beam search uses BeamSearchDecoder (which has its own helper);
            # otherwise decode greedily with GreedyEmbeddingHelper +
            # BasicDecoder.
            if self.beam_search:
                inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=embedding,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_size,
                    output_layer=output_layer)
            else:
                decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=embedding,
                    start_tokens=start_tokens,
                    end_token=end_token)
                inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=decoding_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)

            # NOTE(review): decoding is capped at 10 steps by this literal.
            decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=inference_decoder, maximum_iterations=10)

            # Greedy: decoder_outputs has (rnn_output, sample_id); sample_id
            # gets a trailing axis so both paths return a rank-3 tensor.
            # Beam search: decoder_outputs.predicted_ids already carries the
            # beam axis last.
            if self.beam_search:
                self.decoder_predict_decode = decoder_outputs.predicted_ids
            else:
                self.decoder_predict_decode = tf.expand_dims(
                    decoder_outputs.sample_id, -1)

    # ================================= 4. Saver
    self.saver = tf.train.Saver(tf.global_variables())
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers,
                   rnn_size, target_sequence_length,
                   max_target_sequence_length, encoder_state, decoder_input):
    '''Build the training and greedy-inference decoders over shared weights.

    :param target_letter_to_int: mapping from target symbol to integer id;
        must contain '<GO>' and '<EOS>'
    :param decoding_embedding_size: size of the target embedding vectors
    :param num_layers: number of stacked RNN cells
    :param rnn_size: number of hidden units in each RNN cell
    :param target_sequence_length: lengths of the target sequences
    :param max_target_sequence_length: maximum target sequence length, used
        as the decoding step limit
    :param encoder_state: encoder state used as the decoder's initial state
    :param decoder_input: integer ids fed to the decoder during training
    :return: (training_decoder_output, predicting_decoder_output)
    '''
    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(
        tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,
                                                 decoder_input)

    # 2. Stacked LSTM cells for the decoder
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell

    cell = tf.contrib.rnn.MultiRNNCell(
        [get_decoder_cell(rnn_size) for _ in range(num_layers)])

    # 3. Fully connected output layer (projection to the vocabulary)
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(
                             mean=0.0, stddev=0.1))

    # 4. Training decoder
    with tf.variable_scope("decode"):
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=decoder_embed_input,
            sequence_length=target_sequence_length,
            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, training_helper, encoder_state, output_layer)
        # dynamic_decode returns (outputs, state) on TF 1.1 and
        # (outputs, state, lengths) on later releases; taking element 0
        # works with both instead of failing on a fixed-arity unpack.
        training_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)[0]

    # 5. Predicting decoder, sharing parameters with the training decoder
    with tf.variable_scope("decode", reuse=True):
        # NOTE(review): `batch_size` is not a parameter of this function;
        # presumably a module-level value - confirm against the caller.
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']],
                                           dtype=tf.int32), [batch_size],
                               name='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            decoder_embeddings, start_tokens, target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, predicting_helper, encoder_state, output_layer)
        predicting_decoder_output = tf.contrib.seq2seq.dynamic_decode(
            predicting_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)[0]

    return training_decoder_output, predicting_decoder_output
def BuildNetwork(self, learningRate):
    """Build the BLSTM encoder / LSTM decoder graph and its training op.

    Creates the input placeholders, a bidirectional multi-layer LSTM encoder,
    an attention summary from ``self.firstAttention``, a TrainingHelper-driven
    decoder, a cross-entropy cost and an Adam training op
    (``self.trainEncoderDecoder``). Finally calls ``self.DBLSTM_Structure``.

    NOTE(review): the original line breaks were lost, so the extent of each
    ``with`` scope below was reconstructed - confirm against a checkpoint's
    variable names before relying on it.

    :param learningRate: learning rate for the Adam optimizer; also forwarded
        to ``self.DBLSTM_Structure``.
    """
    #############################################################################
    # Input Data
    #############################################################################
    self.dataInput = tensorflow.placeholder(
        dtype=tensorflow.float32,
        shape=[None, None, self.featureShape],
        name='dataInput')
    self.dataLenInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                               shape=[None],
                                               name='dataLenInput')
    self.labelInputSR = tensorflow.placeholder(dtype=tensorflow.int32,
                                               shape=[None, None],
                                               name='labelInput')
    self.labelLenInputSR = tensorflow.placeholder(dtype=tensorflow.int32,
                                                  shape=[None],
                                                  name='labelLenInput')
    self.labelInputDR = tensorflow.placeholder(dtype=tensorflow.float32,
                                               shape=None,
                                               name='labelInputDR')

    #############################################################################
    # Batch Parameters
    #############################################################################
    self.parameters['BatchSize'], self.parameters[
        'TimeStep'], _ = tensorflow.unstack(
            tensorflow.shape(input=self.dataInput, name='DataShape'))
    self.parameters['LabelStep'] = tensorflow.shape(
        input=self.labelInputSR, name='LabelShape')[1]

    #############################################################################
    # Encoder
    #############################################################################
    with tensorflow.variable_scope('Encoder'):
        self.parameters[
            'Encoder_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters[
            'Encoder_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Encoder_Output'], self.parameters['Encoder_FinalState'] = \
            tensorflow.nn.bidirectional_dynamic_rnn(
                cell_fw=self.parameters['Encoder_Cell_Forward'],
                cell_bw=self.parameters['Encoder_Cell_Backward'],
                inputs=self.dataInput,
                sequence_length=self.dataLenInput,
                dtype=tensorflow.float32)

    self.attentionList = self.firstAttention(
        dataInput=self.parameters['Encoder_Output'],
        scopeName=self.firstAttentionName,
        hiddenNoduleNumber=2 * self.hiddenNodules,
        attentionScope=self.firstAttentionScope,
        blstmFlag=True)

    # bidirectional_dynamic_rnn returns (forward_states, backward_states),
    # each a per-layer tuple of LSTMStateTuple. Index direction first and
    # layer second so layer i of the decoder receives the forward and
    # backward hidden state of encoder layer i (the previous [index][0] /
    # [index][1] indexing had the two axes swapped).
    encoder_fw_state, encoder_bw_state = self.parameters['Encoder_FinalState']
    decoder_initial_state = []
    for index in range(self.rnnLayers):
        layer_state = rnn.LSTMStateTuple(
            c=self.attentionList['FinalResult'],
            h=tensorflow.concat(
                [encoder_fw_state[index].h, encoder_bw_state[index].h],
                axis=1))
        self.parameters['Encoder_Cell_Layer%d' % index] = layer_state
        decoder_initial_state.append(layer_state)
    self.parameters['Decoder_InitalState'] = tuple(decoder_initial_state)

    #############################################################################
    # Decoder Label Pretreatment
    #############################################################################
    self.parameters['DecoderEmbedding'] = tensorflow.Variable(
        initial_value=tensorflow.truncated_normal(
            shape=[VOCABULAR, self.hiddenNodules * 2],
            stddev=0.1,
            name='DecoderEmbedding'))
    self.parameters[
        'DecoderEmbeddingResult'] = tensorflow.nn.embedding_lookup(
            params=self.parameters['DecoderEmbedding'],
            ids=self.labelInputSR,
            name='DecoderEmbeddingResult')

    #############################################################################
    # Decoder
    #############################################################################
    self.parameters['Decoder_Helper'] = seq2seq.TrainingHelper(
        inputs=self.parameters['DecoderEmbeddingResult'],
        sequence_length=self.labelLenInputSR,
        name='Decoder_Helper')
    with tensorflow.variable_scope('Decoder'):
        self.parameters['Decoder_FC'] = Dense(VOCABULAR)
        # Decoder width is 2 * hiddenNodules to match the concatenated
        # forward/backward encoder state used as its initial state.
        self.parameters[
            'Decoder_Cell'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                cells=[
                    rnn.LSTMCell(num_units=self.hiddenNodules * 2)
                    for _ in range(self.rnnLayers)
                ],
                state_is_tuple=True)
        self.parameters['Decoder'] = seq2seq.BasicDecoder(
            cell=self.parameters['Decoder_Cell'],
            helper=self.parameters['Decoder_Helper'],
            initial_state=self.parameters['Decoder_InitalState'],
            output_layer=self.parameters['Decoder_FC'])
        self.parameters['Decoder_Logits'], self.parameters[
            'Decoder_FinalState'], self.parameters[
                'Decoder_FinalSeq'] = seq2seq.dynamic_decode(
                    decoder=self.parameters['Decoder'])

    with tensorflow.name_scope('Loss'):
        # Flatten labels and logits so every time step is one example.
        # NOTE(review): padded label positions are not masked here - confirm
        # that is intended.
        self.parameters['TargetsReshape'] = tensorflow.reshape(
            tensor=self.labelInputSR, shape=[-1], name='TargetsReshape')
        self.parameters['Decoder_Reshape'] = tensorflow.reshape(
            self.parameters['Decoder_Logits'].rnn_output, [-1, VOCABULAR],
            name='Decoder_Reshape')
        self.parameters[
            'Cost'] = tensorflow.losses.sparse_softmax_cross_entropy(
                labels=self.parameters['TargetsReshape'],
                logits=self.parameters['Decoder_Reshape'])

    self.trainEncoderDecoder = tensorflow.train.AdamOptimizer(
        learning_rate=learningRate).minimize(self.parameters['Cost'])

    #############################################################################
    self.DBLSTM_Structure(learningRate=learningRate)
def model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` for an LSTM encoder / LSTM decoder model.

    Args:
        features: dict read for 'encoder_inputs', 'encoder_input_lengths'
            and, in TRAIN mode, 'decoder_inputs' and 'decoder_input_lengths'.
        labels: dict read for 'target_sequences' (not used in PREDICT mode).
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict read for 'word2index', 'embedding_vectors', 'vocab',
            'vocab_size' and 'output_max_length'; also passed to
            ``get_forget_bias``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the given mode.
    """
    # particular to this project
    word2index = params['word2index']

    GPUs = get_available_gpus()
    GPU = {'titan': GPUs[1], 'sidekick': GPUs[0]}

    # Only the embedding vectors are used here; the lookup table is ignored.
    _, emb_vectors = load_embeddings(params['embedding_vectors'],
                                     params['vocab'])
    embedded_enc_input = tf.nn.embedding_lookup(emb_vectors,
                                                features['encoder_inputs'])
    forget_bias = get_forget_bias(params, mode)

    num_units = [2048, 2048]
    init = tf.initializers.truncated_normal(0.0, 0.01)

    # Encoder model
    with tf.device(GPU['titan']):
        encoder_cells = [
            tf.nn.rnn_cell.LSTMCell(num_units=num,
                                    forget_bias=forget_bias,
                                    initializer=init) for num in num_units
        ]
        encoder_stacked_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(encoder_cells)
        enc_outputs, enc_final_state = tf.nn.dynamic_rnn(
            encoder_stacked_rnn_cell,
            embedded_enc_input,
            sequence_length=features['encoder_input_lengths'],
            dtype=tf.float32)

    # Decoder model
    # NOTE(review): the original line breaks were lost; the extent of this
    # device block was reconstructed - confirm the intended placement.
    with tf.device(GPU['sidekick']):
        partial_embedding_helper = partial(embedding_helper,
                                           emb_vectors=emb_vectors)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Teacher forcing with the provided decoder inputs.
            embed_dec_inputs = tf.nn.embedding_lookup(
                emb_vectors, features['decoder_inputs'])
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=embed_dec_inputs,
                sequence_length=features['decoder_input_lengths'],
            )
        else:
            # NOTE(review): EVAL also decodes greedily, so the logits' time
            # dimension may differ from the labels' - confirm this is
            # intended.
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=partial_embedding_helper,
                start_tokens=tf.tile(
                    [word2index['<GO>']],
                    [tf.shape(features['encoder_inputs'])[0]]),
                end_token=word2index['<EOS>'])

        dec_cell = tf.nn.rnn_cell.LSTMCell(
            num_units=num_units[-1],  # needs to match size of last layer of encoder
            forget_bias=forget_bias,
            initializer=init)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=dec_cell,
            helper=helper,
            initial_state=enc_final_state[-1],
            output_layer=Dense(params['vocab_size'], use_bias=False))
        dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=params['output_max_length'])
        logits = tf.identity(dec_outputs.rnn_output, 'logits')

    if mode == tf.estimator.ModeKeys.PREDICT:
        indices = predict_words(logits)
        predictions = {'sentence_tokens': indices}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    training_labels = labels['target_sequences']
    # Weight 1.0 for real tokens, 0.0 for <PAD> positions.
    weights = tf.cast(
        tf.not_equal(training_labels, tf.constant(word2index['<PAD>'])),
        tf.float32)
    sequence_loss = tf.contrib.seq2seq.sequence_loss(logits=logits,
                                                     targets=training_labels,
                                                     weights=weights)
    tf.summary.scalar('sequence_loss', sequence_loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        # Estimator adds the mean of `loss` under the metric name 'loss' on
        # its own. Supplying eval_metric_ops={'loss': <tensor>} is rejected:
        # values must be (metric_value, update_op) tuples and the name
        # 'loss' is reserved.
        return tf.estimator.EstimatorSpec(mode, loss=sequence_loss)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(sequence_loss,
                                  global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode,
                                      loss=sequence_loss,
                                      train_op=train_op)
def __init__(self, tfFLAGS, embed=None):
    """Build the full graph: inputs, encoders, latent networks, decoder, saver.

    Hyper-parameters are copied from ``tfFLAGS``. ``embed`` is forwarded to
    ``self._make_input``.

    NOTE(review): the original line breaks were lost, so which statements sit
    inside each ``tf.variable_scope`` was reconstructed - confirm against an
    existing checkpoint's variable names.
    """
    self.vocab_size = tfFLAGS.vocab_size
    self.embed_size = tfFLAGS.embed_size
    self.num_units = tfFLAGS.num_units
    self.num_layers = tfFLAGS.num_layers
    self.beam_width = tfFLAGS.beam_width
    self.use_lstm = tfFLAGS.use_lstm
    self.attn_mode = tfFLAGS.attn_mode
    self.train_keep_prob = tfFLAGS.keep_prob
    self.max_decode_len = tfFLAGS.max_decode_len
    self.bi_encode = tfFLAGS.bi_encode
    self.recog_hidden_units = tfFLAGS.recog_hidden_units
    self.prior_hidden_units = tfFLAGS.prior_hidden_units
    self.z_dim = tfFLAGS.z_dim
    self.full_kl_step = tfFLAGS.full_kl_step

    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.max_gradient_norm = 5.0

    if tfFLAGS.opt == 'SGD':
        # Learning rate is a variable so it can be decayed via the assign op.
        self.learning_rate = tf.Variable(float(tfFLAGS.learning_rate),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * tfFLAGS.learning_rate_decay_factor)
        self.opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif tfFLAGS.opt == 'Momentum':
        # Set the attribute here too so it exists for every optimizer choice.
        self.learning_rate = tfFLAGS.learning_rate
        self.opt = tf.train.MomentumOptimizer(
            learning_rate=tfFLAGS.learning_rate, momentum=tfFLAGS.momentum)
    else:
        self.learning_rate = tfFLAGS.learning_rate
        # NOTE(review): Adam is built with its default learning rate;
        # self.learning_rate is not passed to it - confirm this is intended.
        self.opt = tf.train.AdamOptimizer()

    self._make_input(embed)

    with tf.variable_scope("output_layer"):
        self.output_layer = Dense(
            self.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

    with tf.variable_scope("encoders",
                           initializer=tf.orthogonal_initializer()):
        self.enc_post_outputs, self.enc_post_state = self._build_encoder(
            scope='post_encoder',
            inputs=self.enc_post,
            sequence_length=self.post_len)
        self.enc_ref_outputs, self.enc_ref_state = self._build_encoder(
            scope='ref_encoder',
            inputs=self.enc_ref,
            sequence_length=self.ref_len)
        self.enc_response_outputs, self.enc_response_state = self._build_encoder(
            scope='resp_encoder',
            inputs=self.enc_response,
            sequence_length=self.response_len)

        self.post_state = self._get_representation_from_enc_state(
            self.enc_post_state)
        self.ref_state = self._get_representation_from_enc_state(
            self.enc_ref_state)
        self.response_state = self._get_representation_from_enc_state(
            self.enc_response_state)
        # Conditioning vector: post and reference representations.
        self.cond_embed = tf.concat([self.post_state, self.ref_state],
                                    axis=-1)

    with tf.variable_scope("RecognitionNetwork"):
        # Sees the condition and the response.
        recog_input = tf.concat([self.cond_embed, self.response_state],
                                axis=-1)
        recog_hidden = tf.layers.dense(inputs=recog_input,
                                       units=self.recog_hidden_units,
                                       activation=tf.nn.tanh)
        recog_mulogvar = tf.layers.dense(inputs=recog_hidden,
                                         units=self.z_dim * 2,
                                         activation=None)
        # First half is the mean, second half the log-variance.
        recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=-1)

    with tf.variable_scope("PriorNetwork"):
        # Sees the condition only.
        prior_input = self.cond_embed
        prior_hidden = tf.layers.dense(inputs=prior_input,
                                       units=self.prior_hidden_units,
                                       activation=tf.nn.tanh)
        prior_mulogvar = tf.layers.dense(inputs=prior_hidden,
                                         units=self.z_dim * 2,
                                         activation=None)
        prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=-1)

    with tf.variable_scope("GenerationNetwork"):
        # Sample z from the prior or the recognition distribution depending
        # on self.use_prior.
        latent_sample = tf.cond(
            self.use_prior,
            lambda: sample_gaussian(prior_mu, prior_logvar),
            lambda: sample_gaussian(recog_mu, recog_logvar),
            name='latent_sample')

        gen_input = tf.concat([self.cond_embed, latent_sample], axis=-1)
        # One separately parameterised projection of gen_input per layer.
        if self.use_lstm:
            self.dec_init_state = tuple([
                tf.contrib.rnn.LSTMStateTuple(
                    c=tf.layers.dense(inputs=gen_input,
                                      units=self.num_units,
                                      activation=None),
                    h=tf.layers.dense(inputs=gen_input,
                                      units=self.num_units,
                                      activation=None))
                for _ in range(self.num_layers)
            ])
            # Call form prints the same on Python 2 and is valid on Python 3.
            print(self.dec_init_state)
        else:
            self.dec_init_state = tuple([
                tf.layers.dense(inputs=gen_input,
                                units=self.num_units,
                                activation=None)
                for _ in range(self.num_layers)
            ])

    kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
    self.avg_kld = tf.reduce_mean(kld)
    # KL weight ramps linearly from 0 to 1 over full_kl_step global steps.
    self.kl_weights = tf.minimum(
        tf.to_float(self.global_step) / self.full_kl_step, 1.0)
    self.kl_loss = self.kl_weights * self.avg_kld

    self._build_decoder()

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=1,
                                pad_step_number=True,
                                keep_checkpoint_every_n_hours=1.0)
    for var in tf.trainable_variables():
        print(var)
def build_decoder(encoder_outputs, encoder_state, input_sequence_length,
                  char_ids, batch_size, num_classes, num_decoder_layers,
                  maximum_iterations):
    """Build the training decoder with scheduled sampling.

    The cell and its initial state come from ``build_decoder_cell``. Inputs
    are embedded ``char_ids`` (300-dimensional, with dropout), fed through a
    ``ScheduledEmbeddingTrainingHelper`` with sampling probability 0.5.

    Returns:
        (logits, sample_id, final_context_state) from ``dynamic_decode``.
    """
    # Projection from decoder output to one logit per class.
    projection = Dense(num_classes, name='output_projection')

    with tf.variable_scope("decoder") as decoder_scope:
        dec_cell, dec_init_state = build_decoder_cell(
            encoder_outputs, encoder_state, input_sequence_length,
            num_decoder_layers, batch_size)

        # Target-side embedding table and lookup.
        embedding = tf.get_variable('embedding',
                                    shape=[num_classes, 300],
                                    dtype=tf.float32)
        embedded_chars = tf.nn.embedding_lookup(embedding,
                                                char_ids,
                                                name='char_embedding')
        dropped_chars = tf.nn.dropout(embedded_chars,
                                      0.986,
                                      name='char_embedding_dropout')

        # NOTE(review): the helper's sequence_length is the same
        # input_sequence_length handed to build_decoder_cell - confirm it is
        # the right length for the char_ids sequence.
        sampling_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs=dropped_chars,
            sequence_length=input_sequence_length,
            embedding=embedding,
            sampling_probability=0.5,
            time_major=False)

        basic_decoder = tf.contrib.seq2seq.BasicDecoder(
            dec_cell,
            sampling_helper,
            dec_init_state,
            output_layer=projection)

        decoded, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
            basic_decoder,
            output_time_major=False,
            maximum_iterations=maximum_iterations,
            swap_memory=False,
            impute_finished=True,
            scope=decoder_scope)

    return decoded.rnn_output, decoded.sample_id, final_context_state
def _create_decoder(cells,
                    batch_size,
                    encoder_outputs,
                    encoder_state,
                    encoder_lengths,
                    decoding_inputs,
                    decoding_lengths,
                    embed_matrix,
                    target_vocab_size,
                    scope,
                    max_sequence_size,
                    use_attention=True):
    """Build a teacher-forced training decoder and a greedy inference decoder.

    Both decoders share the same cell, initial state and output projection;
    ``scope.reuse_variables()`` is called between them so the inference
    decoder reuses the training decoder's weights.

    Parameters
    ----------
    cells : RNN cell
        Decoder cell; wrapped in an ``AttentionWrapper`` when
        ``use_attention`` is true.
    batch_size : int or scalar tensor
        Batch size for the attention zero state and the start tokens.
    encoder_outputs : tensor
        Attention memory (only used when ``use_attention`` is true).
    encoder_state : cell state
        Encoder final state; becomes the decoder's initial cell state.
    encoder_lengths : tensor
        Memory sequence lengths for the attention mechanism.
    decoding_inputs : tensor
        Batch-major inputs for the ``TrainingHelper``.
    decoding_lengths : tensor
        Sequence lengths for the ``TrainingHelper``.
    embed_matrix : tensor or callable
        Embedding passed to ``GreedyEmbeddingHelper``.
    target_vocab_size : int
        Number of units of the output projection.
    scope : tf.VariableScope
        Scope on which ``reuse_variables()`` is called before inference.
    max_sequence_size : int or scalar tensor
        ``maximum_iterations`` for both decodes.
    use_attention : bool, optional
        Whether to wrap ``cells`` with Luong attention.

    Returns
    -------
    (train_logits, infer_logits)
        ``train_logits`` is the training ``rnn_output``; ``infer_logits`` is
        the inference ``sample_id`` (token ids, despite the name).
    """
    from tensorflow.python.layers.core import Dense

    # Output projection
    output_layer = Dense(target_vocab_size, name='output_projection')

    # Setup Attention
    if use_attention:
        attn_mech = tf.contrib.seq2seq.LuongAttention(cells.output_size,
                                                      encoder_outputs,
                                                      encoder_lengths,
                                                      scale=True)
        cells = tf.contrib.seq2seq.AttentionWrapper(
            cell=cells,
            attention_mechanism=attn_mech,
            attention_layer_size=cells.output_size,
            alignment_history=False)
        # The wrapper state carries attention fields; start from zeros and
        # copy the encoder state into its cell_state slot.
        initial_state = cells.zero_state(dtype=tf.float32,
                                         batch_size=batch_size)
        initial_state = initial_state.clone(cell_state=encoder_state)
    else:
        # Without the wrapper there is no attention state to clone into, so
        # the encoder state is the decoder's initial state directly.
        initial_state = encoder_state

    # Setup training and build decoder
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=decoding_inputs,
        sequence_length=decoding_lengths,
        time_major=False)
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells,
        helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    train_logits = tf.identity(train_outputs.rnn_output, name='train_logits')

    # Setup inference and build decoder
    scope.reuse_variables()
    start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32),
                           [batch_size])
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=embed_matrix,
        start_tokens=start_tokens,
        end_token=EOS_ID)
    infer_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells,
        helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        infer_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    infer_logits = tf.identity(infer_outputs.sample_id, name='infer_logits')

    return train_logits, infer_logits
def Decoder(self, encoder_output, encoder_final_state):
    """Build the attention decoder for the current ``self.mode``.

    'train' decodes with scheduled sampling (probability 0.2). 'test'
    decodes with beam search when ``self.beam`` is set and greedily
    otherwise.

    NOTE(review): the original line breaks were lost; only the embedding
    variable is placed inside the 'embedding' scope here - confirm.

    Args:
        encoder_output: attention memory from the encoder.
        encoder_final_state: encoder state copied into the decoder's
            initial cell state.

    Returns:
        The first element returned by ``dynamic_decode``.

    Raises:
        ValueError: if ``self.mode`` is neither 'train' nor 'test'.
    """
    # Fail early with a clear message; previously an unknown mode only
    # surfaced as an unbound `outputs` at the return statement.
    if self.mode not in ('train', 'test'):
        raise ValueError(
            "Decoder: unsupported mode %r (expected 'train' or 'test')" %
            (self.mode,))

    with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
        emb_w = tf.get_variable("embedding",
                                shape=[self.voc_size, self.dim_hidden])

    # Equality (not truthiness) is kept to preserve the original semantics
    # for non-boolean flag values.
    use_beam = self.mode == 'test' and self.beam == True  # noqa: E712

    if use_beam:
        # Beam search needs the encoder tensors replicated beam_size times.
        print("use beamsearch decoding..")
        encoder_output = tf.contrib.seq2seq.tile_batch(
            encoder_output, multiplier=self.beam_size)
        encoder_final_state = tf.contrib.seq2seq.tile_batch(
            encoder_final_state, multiplier=self.beam_size)

    attention_output = tf.contrib.seq2seq.LuongAttention(
        self.dim_hidden, encoder_output)
    decoder_cell = tf.nn.rnn_cell.MultiRNNCell([
        self.get_a_cell(self.dim_hidden) for _ in range(self.lstm_num_layer)
    ])
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
        decoder_cell, attention_output, attention_layer_size=self.dim_hidden)
    projection_layer = Dense(self.voc_size, use_bias=False)

    if self.mode == 'train':
        # Decoder input is the target sequence without its last column.
        decoder_input = tf.nn.embedding_lookup(emb_w, self.ys[:, :-1])
        decoder_seq_length = [self.input_timestep] * self.batch_size
        decoder_init_state = decoder_cell.zero_state(
            self.batch_size,
            self.dtype).clone(cell_state=encoder_final_state)
        helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            decoder_input, decoder_seq_length, emb_w, 0.2, time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell,
            helper,
            decoder_init_state,
            output_layer=projection_layer)
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=training_decoder, maximum_iterations=self.max_len)
    else:  # self.mode == 'test'
        # NOTE(review): start token id 1 and end token id 2 are hard-coded;
        # presumably they match the vocabulary - confirm.
        start_tokens = tf.ones([self.batch_size], tf.int32)
        end_token = 2
        if use_beam:
            decoder_init_state = decoder_cell.zero_state(
                self.batch_size * self.beam_size,
                self.dtype).clone(cell_state=encoder_final_state)
            inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell=decoder_cell,
                embedding=emb_w,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=decoder_init_state,
                beam_width=self.beam_size,
                output_layer=projection_layer)
        else:
            decoder_init_state = decoder_cell.zero_state(
                self.batch_size,
                self.dtype).clone(cell_state=encoder_final_state)
            decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=emb_w,
                start_tokens=start_tokens,
                end_token=end_token)
            inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=decoding_helper,
                initial_state=decoder_init_state,
                output_layer=projection_layer)
        outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=inference_decoder, maximum_iterations=self.max_len)

    return outputs
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    """Build a bidirectional-LSTM encoder with an attention LSTM decoder.

    Args:
        embedding_dim: size of the source and target word embeddings.
        encoder_size: hidden size of each encoder direction.
        decoder_size: hidden size of the decoder LSTM.
        source_dict_dim: source vocabulary size.
        target_dict_dim: target vocabulary size.
        is_generating: build the beam-search inference graph when true,
            otherwise the training graph.
        beam_size: beam width (used only when generating).
        max_generation_length: maximum decode steps (used only when
            generating).

    Returns:
        ``(feeds, loss)`` when training, or ``(feeds, predicted_ids)`` when
        generating, where ``feeds`` maps names to the input placeholders.
    """
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[
        None,
    ])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights,
                                           src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to size of decoder lstm.
    # Each direction emits encoder_size features, so the concatenation is
    # 2 * encoder_size wide (embedding_dim was only right when the two sizes
    # happened to be equal).
    encoded_proj = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(encoded_vec, shape=[-1, encoder_size * 2]),
        num_outputs=decoder_size,
        activation_fn=None,
        biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H: first time step of the backward
    # outputs (width encoder_size), passed through a tanh layer.
    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(
        inputs=tf.reshape(backword_first, shape=[-1, encoder_size]),
        num_outputs=decoder_size,
        activation_fn=tf.nn.tanh,
        biases_initializer=None)

    # prepare the initial state for decoder lstm (zero cell state)
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell; encoder tensors are tiled per beam when
    # generating
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec
        if not is_generating else seq2seq.tile_batch(encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else seq2seq.tile_batch(
            encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else seq2seq.tile_batch(
            src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[
            None,
        ])

        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')
        training_decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                                helper=training_helper,
                                                initial_state=initial_state,
                                                output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(decoder_logits_train,
                                       axis=-1,
                                       name='decoder_pred_train')
        masks = tf.sequence_mask(lengths=trg_sequence_length,
                                 maxlen=max_decoder_length,
                                 dtype=tf.float32,
                                 name='masks')

        # place holder of label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss
        loss = seq2seq.sequence_loss(logits=decoder_logits_train,
                                     targets=lbl_word_idx,
                                     weights=masks,
                                     average_across_timesteps=True,
                                     average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([
            tf.shape(src_word_idx)[0],
        ], tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(
                trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        # impute_finished is left at its default; the original author noted
        # that enabling it raised an error here.
        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids
def build_decoder(self, encoder_outputs, encoder_state):
    """Build the decoder sub-graph for training or inference.

    When ``self.mode`` is not ``'INFER'`` a scheduled-sampling training
    decoder is built.  Otherwise a beam-search decoder is used when
    ``self.beam_width > 0`` and a greedy decoder when it is not.

    Args:
        encoder_outputs: Encoder outputs, forwarded unchanged to
            ``self.build_decoder_cell``.
        encoder_state: Final encoder state, forwarded unchanged to
            ``self.build_decoder_cell``.

    Returns:
        A ``(logits, sample_id, final_context_state)`` tuple.  In training
        ``logits`` is the decoder's ``rnn_output``; in inference it is a
        ``tf.no_op()`` stand-in.  ``sample_id`` is ``predicted_ids`` for
        beam search and the decoder's ``sample_id`` otherwise.
    """
    seq2seq = tf.contrib.seq2seq

    go_id = tf.cast(self.char2ind[self.sos], tf.int32)
    stop_id = tf.cast(self.char2ind[self.eos], tf.int32)
    self.output_layer = Dense(self.vocab_size, name='output_projection')

    with tf.variable_scope("decoder") as decoder_scope:
        cell, decoder_initial_state = self.build_decoder_cell(
            encoder_outputs, encoder_state, self.audio_sequence_lengths)

        if self.mode == 'INFER':
            # ---- Inference: beam search or greedy decoding ----
            first_inputs = tf.fill([self.batch_size], go_id)
            beam_search = self.beam_width > 0

            if beam_search:
                my_decoder = seq2seq.BeamSearchDecoder(
                    cell=cell,
                    embedding=self.embedding,
                    start_tokens=first_inputs,
                    end_token=stop_id,
                    initial_state=decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.output_layer,
                )
            else:
                greedy_helper = seq2seq.GreedyEmbeddingHelper(
                    self.embedding, first_inputs, stop_id)
                my_decoder = seq2seq.BasicDecoder(
                    cell,
                    greedy_helper,
                    decoder_initial_state,
                    output_layer=self.output_layer)

            # Decoding is only bounded when inference targets are set.
            step_limit = (self.maximum_iterations
                          if self.inference_targets else None)

            outputs, final_context_state, _ = seq2seq.dynamic_decode(
                my_decoder,
                maximum_iterations=step_limit,
                output_time_major=False,
                impute_finished=False,
                swap_memory=False,
                scope=decoder_scope)

            # No logits are exposed at inference time.
            logits = tf.no_op()
            if beam_search:
                sample_id = outputs.predicted_ids
            else:
                sample_id = outputs.sample_id
        else:
            # ---- Training: scheduled sampling with a fixed probability ----
            sampling_helper = seq2seq.ScheduledEmbeddingTrainingHelper(
                inputs=self.char_embedding,
                sequence_length=self.char_sequence_lengths,
                embedding=self.embedding,
                sampling_probability=0.5,
                time_major=False)
            my_decoder = seq2seq.BasicDecoder(
                cell,
                sampling_helper,
                decoder_initial_state,
                output_layer=self.output_layer)

            outputs, final_context_state, _ = seq2seq.dynamic_decode(
                my_decoder,
                output_time_major=False,
                maximum_iterations=self.maximum_iterations,
                swap_memory=False,
                impute_finished=True,
                scope=decoder_scope)

            sample_id = outputs.sample_id
            logits = outputs.rnn_output

    return logits, sample_id, final_context_state
def __init__(self, num_emb, batch_size, emb_dim, encoder_num_units, emb_data,
             ques_length, ans_length, start_token, gen_filter_sizes,
             gen_num_filters, learning_rate=0.01, reward_gamma=0.95):
    """Store hyper-parameters and build the generator graph.

    The constructor creates the input placeholders and the embedding matrix,
    calls ``add_encoder_layer`` and ``getCnnEncoder``, builds the training
    and inference decoders in a shared ``'decode'`` variable scope, adds an
    attention image summary, and sets up the pre-training loss and its
    clipped-gradient update op.

    Args:
        num_emb: Number of embedding rows; also the output layer width.
        batch_size: Static batch size used for all placeholders.
        emb_dim: Embedding width.
        encoder_num_units: Stored as ``self.encoder_num_units``.
        emb_data: Stored as ``self.emb_data``.
        ques_length: Static length of the question placeholder ``self.x``.
        ans_length: Static length of the ``self.response`` placeholder.
        start_token: Token id repeated ``batch_size`` times in
            ``self.start_token``.
        gen_filter_sizes: Forwarded to ``getCnnEncoder``.
        gen_num_filters: Forwarded to ``getCnnEncoder``.
        learning_rate: Initial value of the non-trainable learning-rate
            variable.
        reward_gamma: Stored as ``self.reward_gamma``.
    """
    # ---- plain hyper-parameters ----
    self.num_emb = num_emb
    self.batch_size = batch_size
    self.emb_dim = emb_dim
    self.emb_data = emb_data
    self.encoder_num_units = encoder_num_units
    self.max_ques_length = ques_length
    self.max_ans_length = ans_length
    self.start_token = tf.constant([start_token] * self.batch_size,
                                   dtype=tf.int32)
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
    self.reward_gamma = reward_gamma
    self.gen_filter_sizes = gen_filter_sizes
    self.gen_num_filters = gen_num_filters
    self.grad_clip = 5.0
    self.seq_start_token = None
    self.seq_end_token = None
    self.rnn_size = 50
    self.layer_size = 2
    self.beam_width = 10
    self.atten_depth = 50  # depth of the attention query mechanism

    # ---- embedding matrix and input placeholders ----
    embedding_init = self.init_matrix([self.num_emb, self.emb_dim])
    self.g_embeddings = tf.Variable(embedding_init)

    # Question token ids.
    self.x = tf.placeholder(
        tf.int32, shape=[self.batch_size, self.max_ques_length])
    # Response token ids.
    self.response = tf.placeholder(
        tf.int32, shape=[self.batch_size, self.max_ans_length])
    self.target_sequence_length = tf.placeholder(
        tf.int32, [self.batch_size], name='target_sequence_length')
    self.target_response_length = tf.placeholder(
        tf.int32, [self.batch_size], name='target_response_length')
    self.max_response_length_per_batch = tf.placeholder(tf.int32, shape=())

    with tf.device("/cpu:0"):
        self.processed_x = tf.nn.embedding_lookup(self.g_embeddings, self.x)
        self.processed_response = tf.nn.embedding_lookup(
            self.g_embeddings, self.response)
    print("processed_x shape: ", self.processed_x.shape)
    print("processed_response shape: ", self.processed_response.shape)

    # ---- encoders ----
    self.add_encoder_layer()
    self.getCnnEncoder(self.gen_filter_sizes, self.gen_num_filters)

    # ---- decoders (training and inference share the 'decode' scope) ----
    projection_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.1)
    self.output_layer = Dense(self.num_emb,
                              kernel_initializer=projection_init)
    with tf.variable_scope('decode'):
        training_decoder_output = self.add_decoder_for_training()
    with tf.variable_scope('decode', reuse=True):
        inference_result = self.add_decoder_for_inference()
        predicting_decoder_output, final_context_state = inference_result

    # ---- attention visualisation ----
    alignments = final_context_state.alignment_history.stack()
    print("attention_images shape: ", alignments.shape)
    # Move the stacked (first) axis to the end, then add a channel axis.
    alignments = tf.transpose(alignments, [1, 2, 0])
    alignments = tf.expand_dims(alignments, -1)
    # Scale to range [0, 255].
    alignments = alignments * 255
    self.infer_summary = tf.summary.image("attention_images", alignments)

    # ---- pre-training loss ----
    self.g_pretrain_predictions = training_decoder_output.rnn_output
    self.g_pretrain_sample = training_decoder_output.sample_id
    print("self.g_pretrain_predictions: ", self.g_pretrain_predictions)

    # NOTE(review): the mask is built from target_sequence_length although
    # the targets are the response tokens; confirm this should not be
    # target_response_length.
    masks = tf.sequence_mask(self.target_sequence_length,
                             self.max_response_length_per_batch,
                             dtype=tf.float32,
                             name='masks')
    clipped_targets = self.response[:, 0:self.max_response_length_per_batch]
    self.pretrain_loss = tf.contrib.seq2seq.sequence_loss(
        self.g_pretrain_predictions, clipped_targets, masks)

    # ---- pre-training update op with element-wise gradient clipping ----
    pretrain_opt = self.g_optimizer(self.learning_rate)
    grads_and_vars = pretrain_opt.compute_gradients(self.pretrain_loss)
    self.pretrain_grad_zip = []
    for gradient, variable in grads_and_vars:
        if gradient is None:
            # Variables the loss does not depend on are left untouched.
            continue
        self.pretrain_grad_zip.append(
            (tf.clip_by_value(gradient, -5., 5.), variable))
    self.pretrain_updates = pretrain_opt.apply_gradients(
        self.pretrain_grad_zip)

    self.g_samples = predicting_decoder_output.predicted_ids
def _init_decoder(self, forward_only):
    """Build the Luong-attention decoder for greedy inference or training.

    Args:
        forward_only: If truthy, build the greedy inference decoder and set
            ``self.predictions``.  Otherwise build the teacher-forced
            training decoder and set ``self.logits``, ``self.targets``,
            ``self.masks`` and ``self.loss``.

    Side effects:
        Always sets ``self.batch_size``, ``self.attn_mech``,
        ``self.dec_cell``, ``self.initial_state`` and ``self.output_layer``.
    """
    seq2seq = tf.contrib.seq2seq

    with tf.variable_scope("decoder"):
        self.batch_size = tf.shape(self.encoder_inputs)[0]

        # Attention over the encoder outputs, masked by the source lengths.
        self.attn_mech = seq2seq.LuongAttention(
            num_units=self.dec_hidden_size,
            memory=self.encoder_outputs,
            memory_sequence_length=self.encoder_inputs_length,
            normalize=False,
            name='LuongAttention')

        self.dec_cell = seq2seq.DynamicAttentionWrapper(
            cell=self.decoder_cell,
            attention_mechanism=self.attn_mech,
            attention_size=self.dec_hidden_size,
            name='Attention_Wrapper')

        # Start from the encoder state with a zero attention vector.
        self.initial_state = seq2seq.DynamicAttentionWrapperState(
            cell_state=self.encoder_state,
            attention=_zero_state_tensors(self.dec_hidden_size,
                                          self.batch_size, tf.float32))

        self.output_layer = Dense(self.target_vocab_size + 2,
                                  name='output_projection')

        if forward_only:
            # NOTE(review): decoding starts from PAD_ID rather than a
            # dedicated GO id; confirm this matches how training inputs
            # are prefixed.
            start_tokens = tf.tile(
                tf.constant([model_config.PAD_ID], dtype=tf.int32),
                [self.batch_size],
                name='start_tokens')

            inference_helper = seq2seq.GreedyEmbeddingHelper(
                embedding=self.dec_embedding_matrix,
                start_tokens=start_tokens,
                end_token=model_config.EOS_ID)

            inference_decoder = seq2seq.BasicDecoder(
                cell=self.dec_cell,
                helper=inference_helper,
                initial_state=self.initial_state,
                output_layer=self.output_layer)

            # NOTE(review): the step limit is the vocabulary size, not a
            # sentence length; confirm this is intended.
            # NOTE(review): two-value unpacking matches the early contrib
            # API that also ships DynamicAttentionWrapper; later releases
            # return three values.  Confirm the pinned TF version.
            infer_dec_outputs, infer_dec_last_state = seq2seq.dynamic_decode(
                inference_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.target_vocab_size)

            # [batch_size x dec_sentence_length], tf.int32
            self.predictions = tf.identity(infer_dec_outputs.sample_id,
                                           name='predictions')
        else:
            # Every decoder sequence is one step longer than its stored
            # length (the original comment attributes this to the GO symbol).
            padded_lengths = self.decoder_inputs_length + 1

            # Maximum number of unrollings in the current batch.
            self.max_dec_len = tf.reduce_max(padded_lengths,
                                             name='max_dec_len')

            self.training_helper = seq2seq.TrainingHelper(
                inputs=self.decoder_inputs_embedded,
                sequence_length=padded_lengths,
                time_major=False,
                name='training_helper')

            self.training_decoder = seq2seq.BasicDecoder(
                cell=self.dec_cell,
                helper=self.training_helper,
                initial_state=self.initial_state,
                output_layer=self.output_layer)

            self.decoder_outputs, self.decoder_state = seq2seq.dynamic_decode(
                self.training_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.max_dec_len)

            # logits: [batch_size x max_dec_len x target_vocab_size + 2]
            self.logits = tf.identity(self.decoder_outputs.rnn_output,
                                      name='logits')

            # targets: [batch_size x max_dec_len] token ids, cut to the
            # longest sequence of the batch.
            self.targets = tf.slice(self.decoder_inputs, [0, 0],
                                    [-1, self.max_dec_len], 'targets')

            # masks: [batch_size x max_dec_len]; steps past each sequence's
            # padded length get weight 0 in the loss.
            self.masks = tf.sequence_mask(padded_lengths,
                                          self.max_dec_len,
                                          dtype=tf.float32,
                                          name='masks')

            self.loss = seq2seq.sequence_loss(logits=self.logits,
                                              targets=self.targets,
                                              weights=self.masks,
                                              name='batch_loss')
def __init__(self, vocab_size, hidden_size, dropout, num_layers,
             max_gradient_norm, batch_size, learning_rate, lr_decay_factor,
             max_target_length, max_source_length, decoder_mode=False):
    '''
    Build an attention seq2seq graph: BiLSTM encoder, attentive LSTM decoder.

    vocab_size: number of vocab tokens (shared by encoder and decoder)
    hidden_size: dimension of hidden layers; also the embedding width
    dropout: value passed as ``input_keep_prob`` to the DropoutWrappers
    num_layers: number of stacked layers in encoder and decoder
    max_gradient_norm: gradients are clipped element-wise to
        [-max_gradient_norm, max_gradient_norm]
    batch_size: static batch size used for the decoder's zero state
    learning_rate: stored on the instance
        NOTE(review): it is not forwarded to AdamOptimizer, which runs with
        its default rate -- confirm whether that is intended.
    lr_decay_factor: accepted but not used by this constructor
    max_target_length: logits are padded to this length for the loss; it
        also bounds the number of inference decoding steps
    max_source_length: stored on the instance
    decoder_mode: when True build the inference graph (beam search, no
        loss or training ops); when False build the training graph
    '''
    GO_ID = config.GO_ID
    EOS_ID = config.EOS_ID

    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
    self.vocab_size = vocab_size
    self.batch_size = batch_size
    self.global_step = tf.Variable(0, trainable=False)
    self.learning_rate = learning_rate

    self.encoder_inputs = tf.placeholder(shape=(None, None),
                                         dtype=tf.int32,
                                         name='encoder_inputs')
    self.source_lengths = tf.placeholder(shape=(None, ),
                                         dtype=tf.int32,
                                         name='source_lengths')
    self.decoder_targets = tf.placeholder(shape=(None, None),
                                          dtype=tf.int32,
                                          name='decoder_targets')
    self.target_lengths = tf.placeholder(shape=(None, ),
                                         dtype=tf.int32,
                                         name="target_lengths")

    with tf.variable_scope('embeddings'):
        # One embedding matrix is shared by source and target tokens.
        embeddings = tf.Variable(tf.random_uniform(
            [vocab_size, hidden_size], -1.0, 1.0), dtype=tf.float32)
        encoder_inputs_embedded = tf.nn.embedding_lookup(
            embeddings, self.encoder_inputs)
        targets_embedding = tf.nn.embedding_lookup(embeddings,
                                                   self.decoder_targets)

    with tf.variable_scope('encoder'):
        encoder_cell = rnn.LSTMCell(hidden_size)
        encoder_cell = rnn.DropoutWrapper(encoder_cell,
                                          input_keep_prob=dropout)
        # NOTE(review): the same cell object is repeated for every layer
        # and for both directions.  Left unchanged to keep existing
        # checkpoints loadable; confirm the sharing is intended.
        encoder_cell = tf.nn.rnn_cell.MultiRNNCell(
            [encoder_cell for _ in range(num_layers)], state_is_tuple=True)
        encoder_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=encoder_cell,
            cell_bw=encoder_cell,
            sequence_length=self.source_lengths,
            inputs=encoder_inputs_embedded,
            dtype=tf.float32,
            time_major=False)
        # Only the forward outputs are used as attention memory below.
        encoder_output = encoder_outputs[0]
        encoder_outputs = tf.concat(encoder_outputs, 2)

    # Forward-direction final state; it seeds the decoder in both modes.
    forward_state = encoder_state[0]

    with tf.variable_scope('decoder'):
        decoder_cell = rnn.LSTMCell(hidden_size)
        decoder_cell = rnn.DropoutWrapper(decoder_cell,
                                          input_keep_prob=dropout)
        decoder_cell = tf.nn.rnn_cell.MultiRNNCell(
            [decoder_cell for _ in range(num_layers)], state_is_tuple=True)

        attn_mech = seq2seq.BahdanauAttention(
            num_units=hidden_size,  # depth of the query mechanism
            memory=encoder_output,
            memory_sequence_length=self.source_lengths,
            name='BahdanauAttentiion')
        attn_cell = seq2seq.AttentionWrapper(
            cell=decoder_cell,
            attention_mechanism=attn_mech,
            attention_layer_size=hidden_size,  # depth of attention tensor
            name='attention_wrapper')

        if decoder_mode:
            # NOTE(review): raising beam_width above 1 presumably also
            # requires tile_batch on the attention memory and the state.
            beam_width = 1
            attn_zero = attn_cell.zero_state(
                batch_size=(batch_size * beam_width), dtype=tf.float32)
            # Seed with the forward encoder state, exactly as the training
            # branch does; the full (fw, bw) pair does not match the
            # decoder cell's state structure.
            init_state = attn_zero.clone(cell_state=forward_state)
            decoder = seq2seq.BeamSearchDecoder(
                cell=attn_cell,
                embedding=embeddings,
                # One start token per batch entry, not a single token.
                start_tokens=tf.tile([GO_ID], [batch_size]),
                end_token=EOS_ID,
                initial_state=init_state,
                beam_width=beam_width,
                output_layer=Dense(vocab_size))
            # Bound the loop so decoding stops even if EOS never appears.
            final_outputs, final_state, final_sequence_lengths = \
                seq2seq.dynamic_decode(
                    decoder=decoder,
                    maximum_iterations=self.max_target_length)
            self.logits = final_outputs.predicted_ids
        else:
            helper = seq2seq.TrainingHelper(
                inputs=targets_embedding,
                sequence_length=self.target_lengths)
            decoder = seq2seq.BasicDecoder(
                cell=attn_cell,
                helper=helper,
                initial_state=attn_cell.zero_state(
                    batch_size, tf.float32).clone(cell_state=forward_state),
                output_layer=Dense(vocab_size))
            final_outputs, final_state, final_sequence_lengths = \
                seq2seq.dynamic_decode(decoder=decoder)
            self.logits = final_outputs.rnn_output

    if not decoder_mode:
        with tf.variable_scope("loss"):
            # dynamic_decode stops at the longest sequence of the batch, so
            # pad the logits back to max_target_length to match the targets.
            pad_size = self.max_target_length - tf.reduce_max(
                final_sequence_lengths)
            self.logits = tf.pad(self.logits,
                                 [[0, 0], [0, pad_size], [0, 0]])
            weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                       maxlen=self.max_target_length,
                                       dtype=tf.float32,
                                       name='weights')
            x_entropy_loss = seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.decoder_targets,
                weights=weights)
            self.loss = tf.reduce_mean(x_entropy_loss)

        optimizer = tf.train.AdamOptimizer()
        gradients = optimizer.compute_gradients(x_entropy_loss)
        # compute_gradients yields None for variables the loss does not
        # depend on (the backward encoder here); clipping None would raise.
        capped_grads = [
            (tf.clip_by_value(grad, -max_gradient_norm, max_gradient_norm),
             var)
            for grad, var in gradients
            if grad is not None
        ]
        self.train_op = optimizer.apply_gradients(
            capped_grads, global_step=self.global_step)

    self.saver = tf.train.Saver(tf.global_variables())
def dynamic_decode_test():
    """Build a tiny ``BasicDecoder`` graph, run it and print what it yields.

    The function is a self-contained demo: it hard-codes three short token
    sequences, builds an RNN cell, a helper (training or greedy, selected by
    the local ``train_mode`` flag) and an output projection, decodes with
    ``dynamic_decode`` and prints the outputs, final state, sequence lengths
    and projection weights.  When ``train_mode`` is on it also prints the
    sequence loss next to a manual NumPy computation of the same value.
    """
    seq2seq = tf.contrib.seq2seq
    contrib_rnn = tf.contrib.rnn

    # ---- toy data ----
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5
    x_data = np.array(
        [[SOS_token, 3, 1, 4, 3, 2],
         [SOS_token, 3, 4, 2, 3, 1],
         [SOS_token, 1, 3, 2, 2, 1]],
        dtype=np.int32)
    y_data = np.array(
        [[3, 1, 4, 3, 2, EOS_token],
         [3, 4, 2, 3, 1, EOS_token],
         [1, 3, 2, 2, 1, EOS_token]],
        dtype=np.int32)
    print("data shape: ", x_data.shape)

    sess = tf.InteractiveSession()

    # ---- configuration switches ----
    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    num_layers = 2
    seq_length = x_data.shape[1]
    embedding_dim = 8
    state_tuple_mode = True
    init_state_flag = 0
    init = np.arange(vocab_size * embedding_dim).reshape(vocab_size, -1)
    train_mode = False
    use_projection_wrappers = False

    with tf.variable_scope('test', reuse=tf.AUTO_REUSE):
        # ---- RNN cell ----
        method = 1
        if method == 0:
            # Other cell types (BasicLSTMCell, GRUCell) can be swapped in
            # here; GRUCell must be used with init_state_flag == 0.
            cells = [contrib_rnn.BasicRNNCell(num_units=hidden_dim)
                     for _ in range(num_layers)]
            cell = contrib_rnn.MultiRNNCell(cells)
        else:
            cell = contrib_rnn.LSTMCell(num_units=hidden_dim, num_proj=7)

        embedding = tf.get_variable("embedding",
                                    initializer=init.astype(np.float32),
                                    dtype=tf.float32)
        # batch_size x seq_length x embedding_dim
        inputs = tf.nn.embedding_lookup(embedding, x_data)
        Y = tf.convert_to_tensor(y_data)

        # OutputProjectionWrapper has the effect of appending one more
        # fully connected layer; it is applied before the Dense layer used
        # below.  Because Dense is present, either one can do the
        # projection, but stacking several FC layers requires
        # OutputProjectionWrapper.
        if use_projection_wrappers:
            cell = contrib_rnn.OutputProjectionWrapper(
                cell, 13, activation=tf.nn.relu)
            cell = contrib_rnn.OutputProjectionWrapper(cell, 17)

        # ---- initial state ----
        if init_state_flag == 0:
            # (batch_size x hidden_dim) x number of layers
            initial_state = cell.zero_state(batch_size, tf.float32)
        elif state_tuple_mode:
            h0 = tf.random_normal([batch_size, hidden_dim])
            # First layer: c = 0, h = h0; remaining layers: c = 0, h = 0.
            first_layer = contrib_rnn.LSTMStateTuple(tf.zeros_like(h0), h0)
            other_layer = contrib_rnn.LSTMStateTuple(tf.zeros_like(h0),
                                                     tf.zeros_like(h0))
            initial_state = (first_layer,) + (other_layer,) * (num_layers - 1)
        else:
            h0 = tf.random_normal([batch_size, hidden_dim])
            first_layer = tf.concat((tf.zeros_like(h0), h0), axis=1)
            other_layer = tf.concat((tf.zeros_like(h0), tf.zeros_like(h0)),
                                    axis=1)
            initial_state = (first_layer,) + (other_layer,) * (num_layers - 1)

        # ---- helper ----
        if train_mode:
            step_counts = np.array([seq_length] * batch_size, dtype=np.int32)
            helper = seq2seq.TrainingHelper(inputs, step_counts)
        else:
            helper = seq2seq.GreedyEmbeddingHelper(
                embedding,
                start_tokens=tf.tile([SOS_token], [batch_size]),
                end_token=EOS_token)

        output_layer = Dense(output_dim, name='output_projection')
        decoder = seq2seq.BasicDecoder(cell=cell,
                                       helper=helper,
                                       initial_state=initial_state,
                                       output_layer=output_layer)

        # Without maximum_iterations, inference loops forever if the EOS
        # token is never produced.  last_state has one entry per layer.
        outputs, last_state, last_sequence_lengths = seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=10)

    weights = tf.ones(shape=[batch_size, seq_length])
    loss = seq2seq.sequence_loss(logits=outputs.rnn_output,
                                 targets=Y,
                                 weights=weights)

    sess.run(tf.global_variables_initializer())
    print("initial_state: ", sess.run(initial_state))

    print("\n\noutputs: ", outputs)
    o = sess.run(outputs.rnn_output)  # batch_size, seq_length, outputs
    o2 = sess.run(tf.argmax(outputs.rnn_output, axis=-1))
    print("\n", o, o2)

    print("\n\nlast_state: ", last_state)
    print(sess.run(last_state))

    print("\n\nlast_sequence_lengths: ", last_sequence_lengths)
    print(sess.run(last_sequence_lengths))

    if output_layer is not None:
        kernel, bias = (output_layer.trainable_weights[0],
                        output_layer.trainable_weights[1])
        print("kernel(weight)", sess.run(kernel))
        print("bias", sess.run(bias))

    if train_mode:
        # Rows: batch_size * seq_length, columns: output_dim.
        p = sess.run(tf.nn.softmax(outputs.rnn_output)).reshape(
            -1, output_dim)
        print("loss: {:20.6f}".format(sess.run(loss)))
        picked = p[np.arange(y_data.size), y_data.flatten()]
        print("manual cal. loss: {:0.6f} ".format(
            np.average(-np.log(picked))))
def add_decoder(self):
    """Build the attention decoder for the current ``self.mode``.

    In ``'training'`` mode a teacher-forced decoder is built and
    ``self.batch_loss`` and ``self.valid_predictions`` are set.  In
    ``'inference'`` mode a greedy decoder is built and ``self.predictions``
    is set.  Any other mode only creates the shared pieces (embedding,
    attention cell, initial state, projection layer).
    """
    seq2seq = tf.contrib.seq2seq

    with tf.variable_scope('Decoder'):
        # The embedding matrix is pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding_shape = [self.dec_vocab_size + 2, self.dec_emb_size]
            self.dec_Wemb = tf.get_variable(
                'embedding',
                initializer=tf.random_uniform(embedding_shape),
                dtype=tf.float32)

        batch_size = tf.shape(self.enc_inputs)[0]

        # Base cell wrapped with Luong attention over the encoder outputs.
        base_cell = self.cell(self.hidden_size)
        attention = seq2seq.LuongAttention(
            num_units=self.attn_size,
            memory=self.enc_outputs,
            memory_sequence_length=self.enc_sequence_length,
            name='LuongAttention')
        dec_cell = seq2seq.AttentionWrapper(
            cell=base_cell,
            attention_mechanism=attention,
            attention_layer_size=self.attn_size,
            name='Attention_Wrapper')

        # Zero attention state whose cell state is the encoder's last state.
        zero_state = dec_cell.zero_state(dtype=tf.float32,
                                         batch_size=batch_size)
        initial_state = zero_state.clone(cell_state=self.enc_last_state)

        output_layer = Dense(self.dec_vocab_size + 2,
                             name='output_projection')

        if self.mode == 'training':
            # Sequence lengths are used with one extra step throughout.
            max_dec_len = tf.reduce_max(self.dec_sequence_length + 1,
                                        name='max_dec_len')
            dec_emb_inputs = tf.nn.embedding_lookup(self.dec_Wemb,
                                                    self.dec_inputs,
                                                    name='emb_inputs')

            teacher_helper = seq2seq.TrainingHelper(
                inputs=dec_emb_inputs,
                sequence_length=self.dec_sequence_length + 1,
                time_major=False,
                name='training_helper')
            teacher_decoder = seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=teacher_helper,
                initial_state=initial_state,
                output_layer=output_layer)
            train_dec_outputs, _, _ = seq2seq.dynamic_decode(
                teacher_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=max_dec_len)

            logits = tf.identity(train_dec_outputs.rnn_output, name='logits')
            # Targets are the decoder inputs cut to the batch's longest
            # sequence; the mask zeroes out steps past each length + 1.
            targets = tf.slice(self.dec_inputs, [0, 0], [-1, max_dec_len],
                               'targets')
            masks = tf.sequence_mask(self.dec_sequence_length + 1,
                                     max_dec_len,
                                     dtype=tf.float32,
                                     name='masks')

            self.batch_loss = seq2seq.sequence_loss(logits=logits,
                                                    targets=targets,
                                                    weights=masks,
                                                    name='batch_loss')
            self.valid_predictions = tf.identity(
                train_dec_outputs.sample_id, name='valid_preds')

        elif self.mode == 'inference':
            start_tokens = tf.tile(
                tf.constant([self.start_token], dtype=tf.int32),
                [batch_size],
                name='start_tokens')

            greedy_helper = seq2seq.GreedyEmbeddingHelper(
                embedding=self.dec_Wemb,
                start_tokens=start_tokens,
                end_token=self.end_token)
            greedy_decoder = seq2seq.BasicDecoder(
                cell=dec_cell,
                helper=greedy_helper,
                initial_state=initial_state,
                output_layer=output_layer)
            infer_dec_outputs, _, _ = seq2seq.dynamic_decode(
                greedy_decoder,
                output_time_major=False,
                impute_finished=True,
                maximum_iterations=self.dec_sentence_length)

            self.predictions = tf.identity(infer_dec_outputs.sample_id,
                                           name='predictions')
def _match_model_fn_v6(features, labels, mode, params):
    '''
    Estimator ``model_fn``: encoder plus an attentive LSTM cause decoder.

    Per the original author's note, this version uses the original seq2seq
    with an LSTM that merges the cause and word embedding tables, and uses
    the input embedding as the attention query.

    features: dict providing 'content' and 'content_length'
    labels: dict providing 'cause_label' and 'cause_length'
    mode: a ``tf.estimator.ModeKeys`` value
    params: dict of hyper-parameters; the keys read here are
        'embedding_size', 'num_units', 'dropout_keep_prob' (TRAIN only),
        'beam_width', 'EOS', 'SOS', 'max_sequence_length',
        'max_cause_length', 'vocab_size', 'word2vec_model', 'cause_table'
        and 'hie'

    Returns a ``tf.estimator.EstimatorSpec`` for the given mode.
    '''
    with tf.device('/gpu:0'), tf.variable_scope('model',
                                                reuse=tf.AUTO_REUSE):
        # ---- hyper-parameters ----
        embedding_size = params['embedding_size']
        num_units = params['num_units']
        # Dropout is only active while training.
        if mode == tf.estimator.ModeKeys.TRAIN:
            keep_prob = params['dropout_keep_prob']
        else:
            keep_prob = 1
        beam_width = params['beam_width']
        eos_id = params['EOS']
        sos_id = params['SOS']

        max_sequence_length = params['max_sequence_length']
        max_cause_length = params['max_cause_length']
        vocab_size = params['vocab_size']
        num_causes = eos_id + 1
        # Number of decoding steps used by both decoders and the loss.
        decode_steps = max_cause_length - 1

        # ---- inputs and targets ----
        content = tf.reshape(features['content'], [-1, max_sequence_length])
        batch_size = tf.shape(content)[0]
        content_length = tf.reshape(features['content_length'], [batch_size])
        cause_label = tf.reshape(labels['cause_label'],
                                 [batch_size, max_cause_length])
        cause_length = tf.reshape(labels['cause_length'], [batch_size])

        content = tf.cast(content, dtype=tf.int32)
        content_length = tf.cast(content_length, dtype=tf.int32)
        cause_label = tf.cast(cause_label, dtype=tf.int32)
        cause_length = tf.cast(cause_length, dtype=tf.int32)

        # ---- embeddings ----
        embeddings_word = load_embedding(params['word2vec_model'],
                                         vocab_size, embedding_size)
        embedded_input = gen_array_ops.gather_v2(embeddings_word, content,
                                                 axis=0)
        cause_encoder = CauseEncoder(word_embeddings=embeddings_word,
                                     params=params)
        embedded_cause = cause_encoder.apply(cause_label)
        cause_table = tf.constant(params['cause_table'], dtype=tf.int32)

        encoder_output = encoders(embedded_input, content_length, params,
                                  mode)

        # ---- training decoder ----
        lstm = rnn.LayerNormBasicLSTMCell(num_units=num_units,
                                          reuse=tf.AUTO_REUSE,
                                          dropout_keep_prob=keep_prob)
        attention_mechanism = MyBahdanauAttention(
            num_units=embedding_size,
            memory=encoder_output.attention_values,
            memory_sequence_length=encoder_output.attention_values_length)
        # Both c and h of the LSTM start from the encoder's initial_state.
        initial_state = rnn.LSTMStateTuple(encoder_output.initial_state,
                                           encoder_output.initial_state)
        cell = MyAttentionWrapper_v2(lstm,
                                     attention_mechanism,
                                     sot=sos_id,
                                     output_attention=False,
                                     name="MyAttentionWrapper")
        cell_state = cell.zero_state(dtype=tf.float32, batch_size=batch_size)
        cell_state = cell_state.clone(cell_state=initial_state,
                                      attention=encoder_output.final_state)

        # Projects an RNN output onto the cause classes.
        project_dense = Dense(num_causes,
                              _reuse=tf.AUTO_REUSE,
                              _scope='project_dense_scope',
                              name='project_dense')

        train_helper = MyTrainingHelper(embedded_cause, cause_label,
                                        cause_length)
        train_decoder = MyBasicDecoder(cell,
                                       train_helper,
                                       cell_state,
                                       lookup_table=cause_table,
                                       output_layer=project_dense,
                                       hie=params['hie'])
        train_result = dynamic_decode(train_decoder,
                                      maximum_iterations=decode_steps,
                                      parallel_iterations=64,
                                      scope='decoder')
        decoder_output_train, decoder_state_train, decoder_len_train = \
            train_result

        # ---- beam-search decoder (inputs tiled beam_width times) ----
        tiled_memory_sequence_length = tile_batch(
            encoder_output.attention_values_length, multiplier=beam_width)
        tiled_memory = tile_batch(encoder_output.attention_values,
                                  multiplier=beam_width)
        tiled_encoder_state = tile_batch(encoder_output.initial_state,
                                         multiplier=beam_width)
        tiled_initial_state = rnn.LSTMStateTuple(tiled_encoder_state,
                                                 tiled_encoder_state)
        tiled_first_attention = tile_batch(encoder_output.final_state,
                                           multiplier=beam_width)

        attention_mechanism = MyBahdanauAttention(
            num_units=embedding_size,
            memory=tiled_memory,
            memory_sequence_length=tiled_memory_sequence_length)
        cell = MyAttentionWrapper_v2(lstm,
                                     attention_mechanism,
                                     sot=sos_id,
                                     output_attention=False,
                                     name="MyAttentionWrapper")
        cell_state = cell.zero_state(dtype=tf.float32,
                                     batch_size=batch_size * beam_width)
        cell_state = cell_state.clone(cell_state=tiled_initial_state,
                                      attention=tiled_first_attention)

        infer_decoder = MyBeamSearchDecoder(
            cell,
            embedding=cause_encoder,
            sots=tf.fill([batch_size], sos_id),
            start_tokens=tf.fill([batch_size], sos_id),
            end_token=eos_id,
            initial_state=cell_state,
            beam_width=beam_width,
            output_layer=project_dense,
            lookup_table=cause_table,
            length_penalty_weight=0.7,
            hie=params['hie'])
        infer_result = dynamic_decode(infer_decoder,
                                      parallel_iterations=64,
                                      maximum_iterations=decode_steps,
                                      scope='decoder')
        cause_output_infer, cause_state_infer, cause_length_infer = \
            infer_result

        # ---- loss ----
        mask_for_cause = tf.sequence_mask(cause_length - 1,
                                          decode_steps,
                                          dtype=tf.float32)
        # Pad the time axis of the training logits up to decode_steps.
        train_logits = decoder_output_train.rnn_output
        missing_steps = decode_steps - tf.shape(train_logits)[1]
        tmp_padding = tf.pad(train_logits,
                             [[0, 0], [0, missing_steps], [0, 0]],
                             constant_values=0)
        loss = _compute_loss(tmp_padding, cause_label, mask_for_cause,
                             batch_size)

        # ---- predictions stacked with the labels, for monitoring ----
        # Swap the last two axes of predicted_ids so the beam axis comes
        # before the time axis.
        predicted_and_cause_ids = tf.transpose(
            cause_output_infer.predicted_ids,
            perm=[0, 2, 1],
            name='predicted_cause_ids')
        # Labels without their first token, as one extra "beam" row.
        cause_label_expanded = tf.reshape(cause_label[:, 1:],
                                          [-1, 1, decode_steps])
        # Pad short predictions with the end-of-sequence id.
        short_by = decode_steps - tf.shape(predicted_and_cause_ids)[2]
        predicted_and_cause_ids = tf.pad(predicted_and_cause_ids,
                                         [[0, 0], [0, 0], [0, short_by]],
                                         constant_values=eos_id)
        predicted_and_cause_ids = tf.concat(
            [predicted_and_cause_ids, cause_label_expanded],
            axis=1,
            name='predicted_and_cause_ids')
        predicted_and_cause_ids = tf.reshape(
            predicted_and_cause_ids, [-1, beam_width + 1, decode_steps])

        predicted_and_cause_ids_train = tf.concat(
            [decoder_output_train.sample_id, cause_label[:, 1:]],
            axis=1,
            name='predicted_and_cause_ids_train')

        predictions = {
            'predicted_and_cause_ids': predicted_and_cause_ids,
        }

        # NOTE(review): ``labels`` is indexed unconditionally above; if the
        # Estimator passes labels=None in PREDICT mode this function fails
        # before reaching the return below.  Confirm how PREDICT is driven.
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        if mode == tf.estimator.ModeKeys.TRAIN:
            optimizer = tf.train.AdamOptimizer()
            # Element-wise gradient clipping; variables without a gradient
            # are passed through untouched.
            clipped_gvs = []
            for grad_and_var in optimizer.compute_gradients(loss):
                if grad_and_var[0] is None:
                    clipped_gvs.append(grad_and_var)
                else:
                    clipped_gvs.append(
                        (tf.clip_by_value(grad_and_var[0], -0.1, 0.1),
                         grad_and_var[1]))
            train_op = optimizer.apply_gradients(
                clipped_gvs, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # Evaluation: the stacked ids are exported so that the metrics can
        # be computed outside the estimator.
        eval_metric_ops = {
            'predicted_and_cause_ids':
            tf.contrib.metrics.streaming_concat(predicted_and_cause_ids),
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)
def build_model(self, grad_clip, is_train=1):
    """Build the variational sequence auto-encoder graph.

    Args:
        grad_clip: Global-norm threshold passed to
            ``tf.clip_by_global_norm`` (training only).
        is_train: ``1`` builds the training graph, ``0`` the inference
            graph.

    Returns:
        For ``is_train == 0``: ``(predict, loss)`` where ``loss`` holds the
        softmax of the decoder outputs reshaped to
        ``[-1, self.shape[0]]`` and ``predict`` is its argmax along axis 1.
        For ``is_train == 1``: ``(train_op, loss, data, train_data,
        train_label, z_0, epsilon)``; the last five are the placeholders
        to feed.

    Raises:
        ValueError: If ``is_train`` is neither 0 nor 1.
    """
    # Fail before any graph node is created; previously an unsupported
    # value surfaced later as a NameError on the missing helper.
    if is_train not in (0, 1):
        raise ValueError(
            "is_train must be 0 (inference) or 1 (training), got %r" %
            (is_train, ))

    data = tf.placeholder(tf.int32, shape=[1, None], name="input_id")
    train_data = tf.placeholder(tf.int32, shape=[1, None], name="train_id")
    train_label = tf.placeholder(tf.int32,
                                 shape=[1, None],
                                 name="trian_label")
    z_0 = tf.placeholder(tf.float32, shape=[1],
                         name="prior_selection")  # 1 or 0
    decoder_input = tf.nn.embedding_lookup(self.embed, train_data)

    with tf.variable_scope("encoder"):
        encoder = self._get_simple_lstm(lstm_size, lstm_layer)
        words = tf.nn.embedding_lookup(self.embed, data)
        _, encoder_state = tf.nn.dynamic_rnn(encoder,
                                             words,
                                             dtype=tf.float32)

    # Variational approximation: both heads read element [1] of the last
    # layer's final state.
    epsilon = tf.placeholder(tf.float32, shape=[1], name="epsilon")
    with tf.variable_scope("encoder_approx"):
        mean_encode_layer_1 = Dense(1)
        var_encode_layer = Dense(1)
        last_layer_state = encoder_state[lstm_layer - 1][1]
        mean_approx_1 = mean_encode_layer_1(last_layer_state)
        var_approx = var_encode_layer(last_layer_state)
        # p(Z) = z_0 * N(1, 1) + (1 - z_0) * N(-1, 1)
        self.Z = (2 * z_0 - 1) * mean_approx_1 + epsilon * var_approx

    if is_train == 0:
        # Inference: the latent code is fed directly instead of encoded.
        self.Z = tf.placeholder(tf.float32, shape=[1, 1], name="Z_input")
        self.start_tokens = tf.placeholder(tf.int32,
                                           shape=[1],
                                           name='start_tokens')
        self.end_tokens = tf.placeholder(tf.int32,
                                         shape=(),
                                         name="end_tokens")
        helper = GreedyEmbeddingHelper(self.embed, self.start_tokens,
                                       self.end_tokens)
    else:
        self.decoder_seq_length = tf.placeholder(tf.int32,
                                                 shape=[None],
                                                 name='decoder_seq_length')
        # NOTICE: since this is an auto-encoder, the TrainingHelper input
        # is the first n-1 words and the expected output is the last n-1
        # words; otherwise the model would learn an identity mapping.
        helper = TrainingHelper(decoder_input, self.decoder_seq_length)

    with tf.variable_scope("decoder"):
        # The latent variable produces the decoder LSTM's initial hidden
        # and cell states through two separate dense layers.
        fc_rec = Dense(lstm_size)
        fc_rec2 = Dense(lstm_size)
        decoder_h = fc_rec(self.Z)
        decoder_c = fc_rec2(self.Z)
        fc_layer = Dense(self.shape[0])
        decoder_cell = self._get_simple_lstm(lstm_size, lstm_layer)
        d_i_s = tf.contrib.rnn.LSTMStateTuple(decoder_c, decoder_h)
        # NOTE(review): the initial state is a one-element tuple, which
        # presumably only matches the decoder cell when lstm_layer == 1.
        decoder = BasicDecoder(decoder_cell, helper, (d_i_s, ), fc_layer)
        logits, _, _ = dynamic_decode(decoder, maximum_iterations=LENGTH)

    if is_train == 0:
        # Despite its name, ``loss`` holds per-step output probabilities.
        loss = tf.reshape(tf.nn.softmax(logits.rnn_output),
                          [-1, self.shape[0]])
        # Output shouldn't contain SOS.
        predict = tf.argmax(loss, axis=1)
        return predict, loss

    # ---- training ----
    targets = tf.reshape(train_label, [-1])
    logits_flatten = tf.reshape(logits.rnn_output, [-1, self.shape[0]])
    cross_ent = tf.losses.sparse_softmax_cross_entropy(
        targets, logits_flatten)
    DL_loss = -(0.5 * (tf.log(tf.square(var_approx)) -
                       tf.square(mean_approx_1) - tf.square(var_approx)) +
                mean_approx_1)
    loss = DL_loss + cross_ent  # negative ELBO, per the original author

    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
    optimizer = tf.train.AdamOptimizer(pretrain_lr)
    train_op = optimizer.apply_gradients(zip(grads, tvars))
    return train_op, loss, data, train_data, train_label, z_0, epsilon