Example #1
def Decoder(args, mode, enc_rnn_out, enc_rnn_state, X, emb_Y, emb_out):
    
    with tf.variable_scope("Decoder") as decoder_scope:

        mem_units = 2 * args.dim
        out_layer = Dense(args.output_vocab_size) # projection W*X+b
        beam_width = args.beam_width
        batch_size = tf.shape(enc_rnn_out)[0]

        cell, initial_state = _decoder(args, enc_rnn_out, enc_rnn_state, mode, beam_width, batch_size)

        if mode == "train":
            seq_len = tf.tile(tf.constant([args.maxlen], dtype=tf.int32), [batch_size])
            helper = tf.contrib.seq2seq.TrainingHelper(inputs=emb_Y, sequence_length=seq_len)
            decoder = BasicDecoder(cell=cell, helper=helper, initial_state=initial_state, X=X, output_layer=out_layer) 
            outputs, final_state, _= tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=args.maxlen, scope=decoder_scope)
            logits = outputs.rnn_output
            sample_ids = outputs.sample_id
        else:
            start_tokens = tf.tile(tf.constant([_GO], dtype=tf.int32), [batch_size])
            end_token = _END
            my_decoder = BeamSearchDecoder(cell=cell,
                                           embedding=emb_out,
                                           start_tokens=start_tokens,
                                           end_token=end_token,
                                           initial_state=initial_state,
                                           beam_width=beam_width,
                                           X=X,
                                           output_layer=out_layer,
                                           length_penalty_weight=0.0)
                      
            outputs, final_state, final_lengths = tf.contrib.seq2seq.dynamic_decode(my_decoder, maximum_iterations=args.maxlen, scope=decoder_scope)
            logits = tf.no_op()  # beam search yields no per-step logits; return a placeholder op
            sample_ids = outputs.predicted_ids
        
    return logits, sample_ids
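In train mode the returned logits feed straight into a masked sequence loss; in infer mode only sample_ids (shape [batch, time, beam_width]) are populated. A minimal sketch of the training-side use, assuming Y holds the padded target ids and seq_len the true lengths (illustrative names, not from the source):

logits, sample_ids = Decoder(args, "train", enc_rnn_out, enc_rnn_state, X, emb_Y, emb_out)
weights = tf.sequence_mask(seq_len, args.maxlen, dtype=tf.float32)  # zero the loss on padding
loss = tf.contrib.seq2seq.sequence_loss(logits=logits, targets=Y, weights=weights)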
Example #2
    def _build_decoder(self, dec_scope_name, encoder_output, encoder_state, target_data, target_seq_len):
        with tf.name_scope(dec_scope_name):
            decoder_embeddings = tf.Variable(tf.random_uniform([self.tgt_vocab_size, self.embedding_size]))
            # cell
            cell = tf.contrib.rnn.MultiRNNCell([self.get_gru_cell(self.rnn_size, self.dropout)
                                                for _ in range(self.num_layers)])
            # attention-model
            cell, decoder_initial_state = self._build_attention(encoder_output, encoder_state, cell)
            # output_layer
            output_layer = Dense(self.tgt_vocab_size, use_bias=False,
                                 kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

            with tf.variable_scope(dec_scope_name + '_train'):
                # Data format of target_data: <GO>...<PAD>
                # Tensor: [batch_size, max_time, embed_size], type: float32.
                decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, target_data)
                train_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input,
                                                                 sequence_length=target_seq_len, time_major=False)
                train_decoder = tf.contrib.seq2seq.BasicDecoder(cell, train_helper,
                                                                decoder_initial_state, output_layer)
                train_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(train_decoder, impute_finished=True,
                                                                               maximum_iterations=self.max_target_len)

            with tf.variable_scope(dec_scope_name + '_predict', reuse=True):
                # start_tokens = tf.tile(tf.constant([self.start_vocab.index('<go>')], dtype=tf.int32),
                #                        [self.batch_size], name='start_tokens')
                predict_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    decoder_embeddings,
                    tf.fill([self.batch_size], self.start_vocab.index('<go>')),
                    self.start_vocab.index('<eos>'))
                predict_decoder = tf.contrib.seq2seq.BasicDecoder(cell, predict_helper,
                                                                  decoder_initial_state, output_layer)
                predict_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predict_decoder, impute_finished=True,
                                                                                 maximum_iterations=self.max_target_len)

        return train_decoder_output, predict_decoder_output
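get_gru_cell is referenced above but not shown; a plausible sketch, inferred from the call site and assuming the dropout argument is a keep probability:

    def get_gru_cell(self, rnn_size, keep_prob):
        # Assumed helper: a GRU cell with dropout applied to its outputs.
        cell = tf.contrib.rnn.GRUCell(rnn_size)
        return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=keep_prob)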
Example #3
def _project_lstm_state_tuple(state_tuple, num_units):
    r"""
    Concatenates all the `c` and `h` members from a list of `LSTMStateTuple`
      and projects them to a space of dimension `num_units`
    Args:
        state_tuple: a list of `LSTMStateTuple` objects
        num_units: output dimension

    Returns:
        projected_state: a single `LSTMStateTuple` with `c` and `h` of dimension `num_units`
    """
    state_proj_layer = Dense(num_units,
                             name='state_projection',
                             use_bias=False)

    cat_c = tf.concat([state.c for state in state_tuple], axis=-1)
    cat_h = tf.concat([state.h for state in state_tuple], axis=-1)

    proj_c = state_proj_layer(cat_c)
    proj_h = state_proj_layer(cat_h)

    projected_state = tf.contrib.rnn.LSTMStateTuple(c=proj_c, h=proj_h)
    print('projected_state', projected_state)
    return projected_state
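A typical use is bridging a stacked LSTM encoder to a decoder of a different width; a short usage sketch under that assumption (the encoder names are illustrative):

# encoder_states: tuple of LSTMStateTuple, one per layer of the MultiRNNCell.
encoder_cell = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(256) for _ in range(2)])
_, encoder_states = tf.nn.dynamic_rnn(encoder_cell, encoder_inputs_embedded, dtype=tf.float32)
decoder_initial_state = _project_lstm_state_tuple(encoder_states, num_units=128)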
Example #4
    def add_decoder_for_training(self):
        self.add_attention_for_training()
        decoder_embedding = tf.get_variable(
            'decoder_embedding',
            [len(self.Y_word2idx), self.decoder_embedding_dim], tf.float32,
            tf.random_uniform_initializer(-1.0, 1.0))

        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=tf.nn.embedding_lookup(decoder_embedding,
                                          self.processed_decoder_input()),
            sequence_length=self.Y_seq_len,
            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=self.decoder_cell,
            helper=training_helper,
            initial_state=self.decoder_cell.zero_state(
                self.batch_size,
                tf.float32).clone(cell_state=self.encoder_state),
            output_layer=Dense(len(self.Y_word2idx)))
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=training_decoder,
            impute_finished=True,
            maximum_iterations=tf.reduce_max(self.Y_seq_len))
        self.training_logits = training_decoder_output.rnn_output
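processed_decoder_input() is not shown; it presumably drops the last target token and prepends the <GO> id, as the other examples here do. A sketch of that assumption (self.Y is the assumed raw target tensor):

    def processed_decoder_input(self):
        # Assumed: strip the final token and prepend <GO> (cf. Example #14).
        main = tf.strided_slice(self.Y, [0, 0], [self.batch_size, -1], [1, 1])
        return tf.concat([tf.fill([self.batch_size, 1], self.Y_word2idx['<GO>']), main], 1)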
Example #5
    def build_encoder(self):

        print('Building Encoder')

        with tf.variable_scope('encoder'):
            self.encoder_cell = self.build_encoder_cell()

            # Initialize encoder_embeddings to have variance=1
            initializer = tf.random_uniform_initializer(-math.sqrt(3),
                                                        math.sqrt(3),
                                                        dtype=tf.float32)
            self.encoder_embeddings = tf.get_variable(
                "encoder_embeddings",
                [self.src_vocab_size, self.input_embedding_size],
                initializer=initializer,
                dtype=tf.float32)

            # [batch_size, time_step, embedding_size]
            self.encoder_inputs_embedded = tf.nn.embedding_lookup(
                params=self.encoder_embeddings, ids=self.encoder_inputs)

            # Input projection layer to feed embedded inputs to the cell
            input_layer = Dense(self.encoder_hidden_units, dtype=tf.float32)

            self.encoder_inputs_embedded = input_layer(
                self.encoder_inputs_embedded)

            # Encode input sequences into context vectors:
            # encoder_outputs: [batch_size, max_time_step, cell_output_size]
            # encoder_state: [batch_size, cell_output_size]
            self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn(
                cell=self.encoder_cell,
                inputs=self.encoder_inputs_embedded,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32,
                time_major=False)
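build_encoder_cell is referenced but not included in this snippet; a minimal sketch, assuming a dropout-wrapped LSTM stack (self.depth and self.keep_prob are assumed attributes):

    def build_encoder_cell(self):
        # Assumed helper: depth-stacked LSTM cells with output dropout.
        cells = [tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(self.encoder_hidden_units),
                                               output_keep_prob=self.keep_prob)
                 for _ in range(self.depth)]
        return tf.contrib.rnn.MultiRNNCell(cells)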
Example #6
def decode(helper, memory, scope, enc_state, reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units=cfg.RNN_UNITS, memory=memory)
        cell = tf.contrib.rnn.GRUCell(num_units=cfg.RNN_UNITS)
        attn_cell = tf.contrib.seq2seq.AttentionWrapper(
            cell,
            attention_mechanism,
            attention_layer_size=cfg.RNN_UNITS,
            output_attention=True)
        output_layer = Dense(units=cfg.VOCAB_SIZE)

        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=attn_cell,
            helper=helper,
            initial_state=attn_cell.zero_state(
                dtype=tf.float32,
                batch_size=cfg.BATCH_SIZE).clone(cell_state=enc_state[0]),
            output_layer=output_layer)
        outputs = tf.contrib.seq2seq.dynamic_decode(decoder=decoder,
                                                    output_time_major=False,
                                                    impute_finished=True,
                                                    maximum_iterations=27)
        return outputs
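decode() leaves the helper to the caller, so one graph can serve both training and inference. A sketch of the two helpers it expects (the embedding matrix and the cfg token ids are assumptions, not from the source):

train_helper = tf.contrib.seq2seq.TrainingHelper(decoder_embedded_inputs, target_lengths)
infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
    embedding_matrix,
    start_tokens=tf.fill([cfg.BATCH_SIZE], cfg.GO_ID),  # cfg.GO_ID / cfg.EOS_ID are hypothetical
    end_token=cfg.EOS_ID)
train_outputs = decode(train_helper, memory, 'decoder', enc_state)
infer_outputs = decode(infer_helper, memory, 'decoder', enc_state, reuse=True)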
Example #7
def decoder(decoder_embed_input,decoder_y,target_length,max_target_length,encode_state,keep_prob,reuse=False):
    with tf.variable_scope("decoder",reuse=reuse):
        decode_lstm = tf.contrib.rnn.LSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
        decode_cell = tf.contrib.rnn.DropoutWrapper(decode_lstm, output_keep_prob=keep_prob)
        decoder_initial_state = encode_state
        output_layer = Dense(n_input) #TOTAL_SIZE
        decoder_input_ = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), decoder_embed_input],
                                   1)  # prepend <GO> to each target sequence
        decoder_input = tf.nn.embedding_lookup(dic_embeddings, decoder_input_)
        decoder_input=tf.concat([decoder_input,decoder_y],2)
        # # input_=tf.transpose(decoder_input,[1,0,2])
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_input,
                                                            sequence_length=target_length)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(decode_cell, training_helper, decoder_initial_state,
                                                           output_layer)
        output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                         impute_finished=True,
                                                         maximum_iterations=max_target_length)
        predicting_logits = tf.identity(output.sample_id, name='predictions')  # sampled token ids, not logits
        training_logits = tf.identity(output.rnn_output, 'logits')
        masks = tf.sequence_mask(target_length, max_target_length, dtype=tf.float32, name='masks')
        #target = tf.concat([target_input, tf.fill([batch_size, 1], vocab_to_int['<EOS>'])], 1)  #
        target = decoder_embed_input
        return output,predicting_logits,training_logits,masks,target
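The returned tuple maps directly onto a masked loss; a sketch, assuming decoder_embed_input holds target ids padded to max_target_length (despite its name, it is only looked up in dic_embeddings inside the function):

output, predicting_ids, training_logits, masks, target = decoder(
    decoder_embed_input, decoder_y, target_length, max_target_length, encode_state, keep_prob)
loss = tf.contrib.seq2seq.sequence_loss(logits=training_logits, targets=target, weights=masks)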
Example #8
    def _build_word_projections(self):
        """Helper to update word embedding and output projection variables."""
        c = self._config
        rnn_size = c.rnn_size
        word_size = c.rnn_word_size
        softmax_size = self._softmax_size
        token_type = c.token_type
        place_var_on_cpu = token_type == 'word'

        #with tf.variable_scope('decoder/rnn_decoder', reuse=tf.AUTO_REUSE):
        dec_out_layer = Dense(softmax_size, name='output_projection')
        dec_out_layer.build(rnn_size)
        self.decoder_output_layer = dec_out_layer
        print('INFO: Building separate embedding matrix.')
        kwargs = dict(name='embedding_map',
                      shape=[softmax_size, word_size],
                      dtype=tf.float32,
                      trainable=True)
        if place_var_on_cpu:
            with tf.device('/cpu:0'):
                self._word_embed_map = tf.get_variable(**kwargs)
        else:
            self._word_embed_map = tf.get_variable(**kwargs)
        return self._word_embed_map
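A sketch of how the pre-built layer and embedding map are typically consumed later in the model (input_ids and rnn_outputs are illustrative names):

word_vectors = tf.nn.embedding_lookup(self._word_embed_map, input_ids)  # ids -> embeddings
logits = self.decoder_output_layer(rnn_outputs)  # project to [batch, time, softmax_size]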
Example #9
    def _decoder_inference(self, init_state):
        tiled_z = tf.tile(tf.expand_dims(self.z, 1), [1, args.beam_width, 1])

        decoder = BeamSearchDecoder(
            cell=tf.nn.rnn_cell.MultiRNNCell([
                self._rnn_cell(args.rnn_size, reuse=True)
                for _ in range(args.decoder_layers)
            ]),
            embedding=self.tied_embedding,
            start_tokens=tf.tile(
                tf.constant([self._word2idx['<start>']], dtype=tf.int32),
                [self._batch_size]),
            end_token=self._word2idx['<end>'],
            initial_state=tf.contrib.seq2seq.tile_batch(
                init_state, args.beam_width),
            beam_width=args.beam_width,
            output_layer=Dense(args.vocab_size, _reuse=True),
            length_penalty_weight=0.0,
            z=tiled_z)
        decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            impute_finished=False,
            maximum_iterations=self.gen_seq_length)
        return decoder_output.predicted_ids[:, :, 0]
Example #10
def attention_keras_test():
    # Using tf.keras.layers.SimpleRNNCell
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5
    
    x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],[SOS_token, 3, 4, 2, 3, 1],[SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    y_data = np.array([[3, 1, 4, 3, 2,EOS_token],[3, 4, 2, 3, 1,EOS_token],[1, 3, 2, 2, 1,EOS_token]],dtype=np.int32)
    print("data shape: ", x_data.shape)
    sess = tf.InteractiveSession()
    
    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    seq_length = x_data.shape[1]
    embedding_dim = 8
    state_tuple_mode = True

    init = np.arange(vocab_size*embedding_dim).reshape(vocab_size,-1)
    
    train_mode = True
    alignment_history_flag = True   # if True, do not sess.run the initial_state or last state, because alignment_history is not a plain tensor (it must be stack()ed first)
    with tf.variable_scope('test',reuse=tf.AUTO_REUSE) as scope:
        # Make rnn cell
        cell = tf.keras.layers.SimpleRNNCell(units=hidden_dim)
    
        embedding = tf.get_variable("embedding", initializer=init.astype(np.float32),dtype = tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, x_data) # batch_size  x seq_length x embedding_dim
    
        Y = tf.convert_to_tensor(y_data)
    
        #encoder_outputs = tf.ones([batch_size,20,30])
        encoder_outputs = tf.convert_to_tensor(np.random.normal(0,1,[batch_size,20,30]).astype(np.float32)) # 20: encoder sequence length, 30: encoder hidden dim
        
        #input_lengths = [20]*batch_size
        input_lengths = [5,10,20]  # when the encoder side contains padding, keep attention off the padded steps
        
        # attention mechanism  # num_units = Na = 11
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=11, memory=encoder_outputs,memory_sequence_length=input_lengths,normalize=False)
        #attention_mechanism = tf.contrib.seq2seq.BahdanauMonotonicAttention(num_units=11, memory=encoder_outputs,memory_sequence_length=input_lengths)
        
        # For LuongAttention, num_units cannot be arbitrary; it must equal the decoder's hidden_dim
        #attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units=hidden_dim, memory=encoder_outputs,memory_sequence_length=input_lengths)
        
        
        # output_attention = True (default) ==> the attention vector is emitted as the output; if False, the cell's output is emitted
        # attention_layer_size = N_l
        
        attention_initial_state = [cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)]
        
        cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism, attention_layer_size=13,initial_cell_state=attention_initial_state,
                                                   alignment_history=alignment_history_flag,output_attention=True)

        # Calling zero_state here picks up the attention_initial_state handed to the AttentionWrapper above, i.e. AttentionWrapperState.cell_state holds the supplied value.
        initial_state = cell.zero_state(batch_size, tf.float32) # AttentionWrapperState
 
        if train_mode:
            helper = tf.contrib.seq2seq.TrainingHelper(inputs, np.array([seq_length]*batch_size,dtype=np.int32))
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding, start_tokens=tf.tile([SOS_token], [batch_size]), end_token=EOS_token)
     
        output_layer = Dense(output_dim, name='output_projection')
        decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,helper=helper,initial_state=initial_state,output_layer=output_layer)    
        # without maximum_iterations, inference loops forever if the EOS token is never produced
        outputs, last_state, last_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder=decoder,output_time_major=False,impute_finished=True,maximum_iterations=10)
     
        weights = tf.ones(shape=[batch_size,seq_length])
        loss =   tf.contrib.seq2seq.sequence_loss(logits=outputs.rnn_output, targets=Y, weights=weights)
     
     
        
        
        opt = tf.train.AdamOptimizer(0.01).minimize(loss)
        
        sess.run(tf.global_variables_initializer())
        for i in range(100):
            loss_,_ =sess.run([loss,opt])
            print("{} loss: = {}".format(i,loss_))
        
        if alignment_history_flag ==False:
            print("initial_state: ", sess.run(initial_state))
        print("\n\noutputs: ",outputs)
        o = sess.run(outputs.rnn_output)  #batch_size, seq_length, outputs
        o2 = sess.run(tf.argmax(outputs.rnn_output,axis=-1))
        print("\n",o,o2) #batch_size, seq_length, outputs
     
        print("\n\nlast_state: ",last_state)
        if alignment_history_flag == False:
            print(sess.run(last_state)) # batch_size, hidden_dim
        else:
            print("alignment_history: ", last_state.alignment_history.stack())
            alignment_history_ = sess.run(last_state.alignment_history.stack())
            print(alignment_history_)
            print("alignment_history sum: ",np.sum(alignment_history_,axis=-1))
            
            print("cell_state: ", sess.run(last_state.cell_state))
            print("attention: ", sess.run(last_state.attention))
            print("time: ", sess.run(last_state.time))
            
            alignments_ = sess.run(last_state.alignments)
            print("alignments: ", alignments_)
            print('alignments sum: ', np.sum(alignments_,axis=1))   # check that each alignment distribution sums to 1
            print("attention_state: ", sess.run(last_state.attention_state))

     
        print("\n\nlast_sequence_lengths: ",last_sequence_lengths)
        print(sess.run(last_sequence_lengths)) #  [seq_length]*batch_size    
         
        print("kernel(weight)",sess.run(output_layer.trainable_weights[0]))  # kernel(weight)
        print("bias",sess.run(output_layer.trainable_weights[1]))  # bias
     
        if train_mode:
            p = sess.run(tf.nn.softmax(outputs.rnn_output)).reshape(-1,output_dim)
            print("loss: {:20.6f}".format(sess.run(loss)))
            print("manual cal. loss: {:0.6f} ".format(np.average(-np.log(p[np.arange(y_data.size),y_data.flatten()]))) )    
Example #11
def attention_multicell_test():
    # Stack BasicRNNCells into a multi-layer cell and apply attention; with a multi-layer cell, attention is fed to the bottom layer
    vocab_size = 6
    SOS_token = 0
    EOS_token = 5
    
    x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],[SOS_token, 3, 4, 2, 3, 1],[SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    y_data = np.array([[3, 1, 4, 3, 2,EOS_token],[3, 4, 2, 3, 1,EOS_token],[1, 3, 2, 2, 1,EOS_token]],dtype=np.int32)
    print("data shape: ", x_data.shape)
    sess = tf.InteractiveSession()
    
    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim =7
    num_layers = 2
    seq_length = x_data.shape[1]
    embedding_dim = 8
    state_tuple_mode = True
    init = np.arange(vocab_size*embedding_dim).reshape(vocab_size,-1)
    
    train_mode = True
    with tf.variable_scope('test',reuse=tf.AUTO_REUSE) as scope:
        # Make multi-rnn cell
        cells = []
        for _ in range(num_layers):
            cell = tf.contrib.rnn.BasicRNNCell(num_units=hidden_dim)
            cells.append(cell)
        cell = tf.contrib.rnn.MultiRNNCell(cells)
    
        embedding = tf.get_variable("embedding", initializer=init.astype(np.float32),dtype = tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, x_data) # batch_size  x seq_length x embedding_dim
    
        Y = tf.convert_to_tensor(y_data)
    
        encoder_outputs = tf.ones([batch_size,20,30])
        input_lengths = [20]*batch_size
        # attention mechanism
        attention_initial_state = cell.zero_state(batch_size, tf.float32)  # a different initial state could be supplied here
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units=11, memory=encoder_outputs,memory_sequence_length=input_lengths)
        cell = tf.contrib.seq2seq.AttentionWrapper(cell, attention_mechanism,initial_cell_state=attention_initial_state, attention_layer_size=13)  # the wrapped cell's state is an AttentionWrapperState


        initial_state = cell.zero_state(batch_size, tf.float32) # (batch_size x hidden_dim) per layer ==> returns an AttentionWrapperState object
  
        if train_mode:
            helper = tf.contrib.seq2seq.TrainingHelper(inputs, np.array([seq_length]*batch_size,dtype=np.int32))
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding, start_tokens=tf.tile([SOS_token], [batch_size]), end_token=EOS_token)
      
        output_layer = Dense(output_dim, name='output_projection')
        decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,helper=helper,initial_state=initial_state,output_layer=output_layer)    
        # without maximum_iterations, inference loops forever if the EOS token is never produced
        outputs, last_state, last_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder=decoder,output_time_major=False,impute_finished=True,maximum_iterations=10)
      
        weights = tf.ones(shape=[batch_size,seq_length])
        loss =   tf.contrib.seq2seq.sequence_loss(logits=outputs.rnn_output, targets=Y, weights=weights)
      
      
        sess.run(tf.global_variables_initializer())
        print("initial_state: ", sess.run(initial_state))
        print("\n\noutputs: ",outputs)
        o = sess.run(outputs.rnn_output)  #batch_size, seq_length, outputs
        o2 = sess.run(tf.argmax(outputs.rnn_output,axis=-1))
        print("\n",o,o2) #batch_size, seq_length, outputs
      
        print("\n\nlast_state: ",last_state)
        print(sess.run(last_state)) # batch_size, hidden_dim
      
        print("\n\nlast_sequence_lengths: ",last_sequence_lengths)
        print(sess.run(last_sequence_lengths)) #  [seq_length]*batch_size    
          
        print("kernel(weight)",sess.run(output_layer.trainable_weights[0]))  # kernel(weight)
        print("bias",sess.run(output_layer.trainable_weights[1]))  # bias
      
        if train_mode:
            p = sess.run(tf.nn.softmax(outputs.rnn_output)).reshape(-1,output_dim)
            print("loss: {:20.6f}".format(sess.run(loss)))
            print("manual cal. loss: {:0.6f} ".format(np.average(-np.log(p[np.arange(y_data.size),y_data.flatten()]))) ) 
Example #12
    def __init__(self, data, args, embed):

        with tf.variable_scope("input"):
            with tf.variable_scope("embedding"):
                # build the embedding table and embedding input
                if embed is None:
                    # initialize the embedding randomly
                    self.embed = tf.get_variable(
                        'embed', [data.vocab_size, args.embedding_size],
                        tf.float32)
                else:
                    # initialize the embedding by pre-trained word vectors
                    self.embed = tf.get_variable('embed',
                                                 dtype=tf.float32,
                                                 initializer=embed)

            self.sentence = tf.placeholder(tf.int32, (None, None),
                                           'sen_inps')  # batch*len
            self.sentence_length = tf.placeholder(tf.int32, (None, ),
                                                  'sen_lens')  # batch
            self.use_prior = tf.placeholder(dtype=tf.bool, name="use_prior")

            batch_size, batch_len = tf.shape(self.sentence)[0], tf.shape(
                self.sentence)[1]
            self.decoder_max_len = batch_len - 1

            self.encoder_input = tf.nn.embedding_lookup(
                self.embed, self.sentence)  # batch*len*unit
            self.encoder_len = self.sentence_length

            decoder_input = tf.split(self.sentence, [self.decoder_max_len, 1],
                                     1)[0]  # no eos_id
            self.decoder_input = tf.nn.embedding_lookup(
                self.embed, decoder_input)  # batch*(len-1)*unit
            self.decoder_target = tf.split(self.sentence,
                                           [1, self.decoder_max_len],
                                           1)[1]  # no go_id, batch*(len-1)
            self.decoder_len = self.sentence_length - 1
            self.decoder_mask = tf.sequence_mask(
                self.decoder_len, self.decoder_max_len,
                dtype=tf.float32)  # batch*(len-1)

        # initialize the training process
        self.learning_rate = tf.Variable(float(args.lr),
                                         trainable=False,
                                         dtype=tf.float32)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * args.lr_decay)
        self.global_step = tf.Variable(0, trainable=False)

        # build rnn_cell
        cell_enc = tf.nn.rnn_cell.GRUCell(args.eh_size)
        cell_dec = tf.nn.rnn_cell.GRUCell(args.dh_size)

        # build encoder
        with tf.variable_scope('encoder'):
            encoder_output, encoder_state = dynamic_rnn(cell_enc,
                                                        self.encoder_input,
                                                        self.encoder_len,
                                                        dtype=tf.float32,
                                                        scope="encoder_rnn")

        with tf.variable_scope('recognition_net'):
            recog_input = encoder_state
            self.recog_mu = tf.layers.dense(inputs=recog_input,
                                            units=args.z_dim,
                                            activation=None,
                                            name='recog_mu')
            self.recog_logvar = tf.layers.dense(inputs=recog_input,
                                                units=args.z_dim,
                                                activation=None,
                                                name='recog_logvar')

            epsilon = tf.random_normal(tf.shape(self.recog_logvar),
                                       name="epsilon")
            std = tf.exp(0.5 * self.recog_logvar)
            self.recog_z = tf.add(self.recog_mu,
                                  tf.multiply(std, epsilon),
                                  name='recog_z')

            self.kld = tf.reduce_mean(0.5 * tf.reduce_sum(
                tf.exp(self.recog_logvar) + self.recog_mu * self.recog_mu -
                self.recog_logvar - 1,
                axis=-1))
            self.prior_z = tf.random_normal(tf.shape(self.recog_logvar),
                                            name="prior_z")
            latent_sample = tf.cond(self.use_prior,
                                    lambda: self.prior_z,
                                    lambda: self.recog_z,
                                    name='latent_sample')
            dec_init_state = tf.layers.dense(inputs=latent_sample,
                                             units=args.dh_size,
                                             activation=None)

        with tf.variable_scope("output_layer",
                               initializer=tf.orthogonal_initializer()):
            self.output_layer = Dense(
                data.vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                use_bias=True)

        with tf.variable_scope("decode",
                               initializer=tf.orthogonal_initializer()):
            train_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=self.decoder_input, sequence_length=self.decoder_len)
            train_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=cell_dec,
                helper=train_helper,
                initial_state=dec_init_state,
                output_layer=self.output_layer)
            train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=train_decoder,
                maximum_iterations=self.decoder_max_len,
                impute_finished=True)
            logits = train_output.rnn_output

            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.decoder_target, logits=logits)
            crossent = tf.reduce_sum(crossent * self.decoder_mask)
            self.sen_loss = crossent / tf.to_float(batch_size)
            self.ppl_loss = crossent / tf.reduce_sum(self.decoder_mask)

            self.decoder_distribution_teacher = tf.nn.log_softmax(logits)

        with tf.variable_scope("decode", reuse=True):
            infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.embed, tf.fill([batch_size], data.go_id), data.eos_id)
            infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=cell_dec,
                helper=infer_helper,
                initial_state=dec_init_state,
                output_layer=self.output_layer)
            infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=infer_decoder,
                maximum_iterations=self.decoder_max_len,
                impute_finished=True)
            self.decoder_distribution = infer_output.rnn_output
            self.generation_index = tf.argmax(
                tf.split(self.decoder_distribution, [2, data.vocab_size - 2],
                         2)[1], 2) + 2  # for removing UNK

        self.kl_weights = tf.minimum(
            tf.to_float(self.global_step) / args.full_kl_step, 1.0)
        self.kl_loss = self.kl_weights * tf.maximum(self.kld, args.min_kl)
        self.loss = self.sen_loss + self.kl_loss

        # calculate the gradient of parameters and update
        self.params = [
            k for k in tf.trainable_variables() if args.name in k.name
        ]
        opt = tf.train.MomentumOptimizer(learning_rate=self.learning_rate,
                                         momentum=args.momentum)
        gradients = tf.gradients(self.loss, self.params)
        clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
            gradients, args.grad_clip)
        self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                          global_step=self.global_step)

        # save checkpoint
        self.latest_saver = tf.train.Saver(
            write_version=tf.train.SaverDef.V2,
            max_to_keep=args.checkpoint_max_to_keep,
            pad_step_number=True,
            keep_checkpoint_every_n_hours=1.0)
        self.best_saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                         max_to_keep=1,
                                         pad_step_number=True,
                                         keep_checkpoint_every_n_hours=1.0)

        # create summary for tensorboard
        self.create_summary(args)
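create_summary is called above but not included; a minimal sketch that records the scalar losses defined in this constructor:

    def create_summary(self, args):
        # Assumed helper: TensorBoard scalars for the losses built above.
        tf.summary.scalar('loss/sentence', self.sen_loss)
        tf.summary.scalar('loss/kl', self.kl_loss)
        tf.summary.scalar('kl_weight', self.kl_weights)
        self.summary_op = tf.summary.merge_all()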
Example #13
def cnnlstm(features, labels, mode, params):
    """
    Model to be used in the tf.estimator. Basically the machine learning model.
    Simple RNN model that:
        - Takes a sentence represented like ['This', 'is', 'a', 'sentence']
          where each character in a word is represented by a integer and each
          word in a batch has the same length (zero padded)
        - One word at a time, each word is embedded using a CNN and a Highway
          network. (TODO: add the highway network)
        - This embedding is given to a RNN
        - The last state is given to another RNN (+ Attention over the previous
          hidden state) that predicts the next word.

    Args:
        - features: a dict:
            - sequence: a tensor of shape [batch_size, max_sentence_length,
                                           max_word_size]
            filled with the character ids, and padded with 0
            - sequence_length: a tensor of shape [batch_size] with the original
              length of the sequences.
            - max_word_size: the maximum length of each word in the batch
        - labels: a dict:
            - sequence: a tensor of shape [batch_size, max_sentence_length]
            filled with the words ids of each sentence and padded with 0.
            - sequence_length: a tensor of shape [batch_size] with the original
             length of the sequences.
        - mode: the mode of the model (given by the estimator)
        - params: a dict with the following keys:
            - vocab_size: the size of the character vocabulary used
            - embedding_size: the size of the embeddings
            - dropout: 1 - dropout probability (the keep probability)
    """
    with tf.variable_scope('ModelParams'):
        batch_size = tf.shape(features['sequence'])[0]
        timesteps = tf.shape(features['sequence'])[1]
        maxwordlength = tf.shape(features['sequence'])[2]
        c_embed_s = params['char_embedding_size']
        dropout = params['dropout']
        hidden_size = params['hidden_size']
        network_depth = params['network_depth']
        kernels = params['kernels']
        kernel_features = params['kernel_features']

    with tf.variable_scope('Convolution'):
        ###########
        # ENCODER #
        ###########
        # Characters embeddings matrix. Basically each character id (int)
        # is associated a vector [char_embedding_size]
        embeddings_c = tf.Variable(tf.random_uniform([params['char_vocab_size'],
                                   c_embed_s], -1.0, 1.0))
        # Embed every char id into their embedding. Will go from this dimension
        # [batch_size, max_sequence_length, max_word_size] to this dimension
        # [batch_size, max_sequence_length, max_word_size, char_embedding_size]
        embedded_chars = tf.nn.embedding_lookup(embeddings_c, features['sequence'])
        # Change the dimension and bring every word as an example.

        # Reshape the inputs to have words as second and third dimension
        # from [batch, timesteps, wordlength, embedsize] to
        # [batch*timesteps, wordlength, embedisze]
        cnn_inputs = tf.reshape(embedded_chars, [batch_size*timesteps,
                                                 maxwordlength,
                                                 c_embed_s])
        # Expand the second dimension for convolution purposes
        cnn_inputs = tf.expand_dims(cnn_inputs, 1)
        # Layer to hold all of the convolution results
        layers = []
        # For each kernel, tuple of [kernel size, num filters]
        for kernel_size, kernel_feature_size in zip(kernels, kernel_features):
            # Apply the convolution on all of the inputs for this kernel
            conv = conv2d(cnn_inputs, kernel_feature_size, 1, kernel_size, name="kernel_%d" % kernel_size)
            pool = tf.reduce_max(tf.tanh(conv), 2, keep_dims=True)
            layers.append(tf.squeeze(pool, [1, 2]))
        cnn_output = tf.concat(layers, 1)
        rnn_inputs = tf.reshape(cnn_output, [batch_size, timesteps, sum(kernel_features)])

    with tf.variable_scope('RNN_Encoder'):
        # Create the actual encoder, which applies a convolution on the char input
        # to have an embedding for each word. This embedding is then fed to the
        # classical LSTM RNN.
        # TODO: apply dropout
        cell_list = [create_cell(mode, dropout, hidden_size) for _ in range(network_depth)]
        cell = tf.contrib.rnn.MultiRNNCell(cell_list)
        # Loop over the inputs and apply the previously created cell at every
        # timestep. Returns the output at every step and last hidden state.
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(cell=cell, dtype=tf.float32,
                                                 inputs=rnn_inputs,
                                                 sequence_length=features['sequence_length'])

    with tf.variable_scope('Decoder'):
        ###########
        # DECODER #
        ###########
        # Words embeddings matrix. Basically every word id (int) in the vocab
        # is associated a vector [char_embedding_size]
        embeddings_w = tf.Variable(tf.random_uniform([params['word_vocab_size'],
                                   params['word_embedding_size']], -1.0, 1.0))
        # Decoder cell. Basic LSTM cell that will do the decoding.
        cell_list_dec = [create_cell(mode, dropout, hidden_size) for _ in range(network_depth)]
        decoder_cell = tf.contrib.rnn.MultiRNNCell(cell_list_dec)
        # Attention mechanism
        attention_mechanism = LuongAttention(num_units=hidden_size,
                                    memory=encoder_outputs,
                                    memory_sequence_length=features['sequence_length'])
        # Attention Wrapper
        attn_cell = AttentionWrapper(decoder_cell, attention_mechanism)
        initial_decoder_state = attn_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)
        # Projection layer. Layer that takes the output of the decoder cell
        # and projects it on the word vocab dimension.
        projection_layer = Dense(params['word_vocab_size'], use_bias=False)
        # If not in inference mode, feed the true decoder inputs
        # (teacher forcing) at each time step.
        if mode != tf.estimator.ModeKeys.PREDICT:
            # Decoder outputs, i.e., what we are trying to predict.
            decoder_o = tf.cast(labels['sequence_output'], tf.int32)
            # Embed the decoder input
            decoder_i = tf.nn.embedding_lookup(embeddings_w, labels['sequence_input'])
            # Helper method. Basically a function that "helps" the decoder
            # at each time step by giving it the true input, whatever it computed
            # earlier.
            output_sequence_length = tf.cast(labels['sequence_length'], tf.int32)
            helper = TrainingHelper(decoder_i, output_sequence_length)
        else:
            # Helper method. At inference time it is different, we do not have the
            # true inputs, so this function will take the previously generated output
            # and embbed it with the decoder embeddings.
            start_token = tf.fill([batch_size], params['start_token'])
            end_token = tf.cast(params['end_token'], tf.int32)
            helper = GreedyEmbeddingHelper(embeddings_w, start_token, end_token)
        # The final decoder, with its cell, its initial state, its helper function,
        # and its projection layer.
        decoder = BasicDecoder(attn_cell, helper, initial_decoder_state, output_layer=projection_layer)
        # Use this decoder to perform a dynamic decode.
        # Dynamic Decoder: controls the flow of operations and mainly store the outputs
        # and keeps decoding until the decoder is done.
        # Decoder: kind of the cell of the dynamic decoder. It passes the inputs
        # to the RNN, samples the output of the RNN and computes the next input.
        # To sample and compute the next inputs, the decoder uses a Helper function.
        # During training it is a TrainingHelper and during inference it is GreedyEmbeddingHelper
        # In our case the sampling is simply taking the argmax of the output logit.
        # The main difference between the two helpers is on the way they "compute"
        # the next input. TrainingHelper will use the decoder inputs provided while
        # the GreedyEmbeddingHelper will use the sampled RNN output and give it to
        # an embedding function to give it at as the next input.
        # Outputs of the BasicDecoder is a BasicDecoderOutput which holds the logits
        # and the sample_ids.
        if mode != tf.estimator.ModeKeys.PREDICT:
            outputs, state, sequence_lengths = dynamic_decode(decoder)
        else:
            max_iterations = tf.cast(tf.reduce_max(features['sequence_length'])*2, tf.int32)
            outputs, state, sequence_lengths = dynamic_decode(decoder,
                                                    maximum_iterations=max_iterations)
    with tf.variable_scope('Prediction'):
        # Unpack the decoder outputs:
        logits = outputs.rnn_output # output of the projection layer
        sample_id = outputs.sample_id # argmax of the logits
        # If we are INFER time only
        if mode == tf.estimator.ModeKeys.PREDICT:
            # Return a dict with the sample word ids.
            predictions = {"sequence": sample_id}
            export_outputs = {
                'prediction': tf.estimator.export.PredictOutput(predictions)
            }
            return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions,
                                              export_outputs=export_outputs)
    with tf.variable_scope('Loss'):
        # We are not at INFER time. We compute the cross entropy.
        crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=decoder_o,
                                                                  logits=logits)
        # Here we create a mask to "erase" the loss where the sentences are finished
        target_w = tf.sequence_mask(labels['sequence_length'], dtype=logits.dtype)
        # We apply the mask, sum the loss across all the dimensions, and divide it
        # by the batch size to make it independent of the batch size.
        batch_size_32 = tf.cast(batch_size, tf.float32)
        loss = tf.reduce_sum(crossent * target_w) / batch_size_32
    with tf.variable_scope('Train'):
        # At train time only.
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Initialize an optimize that has for goal to minimize the loss
            learning_rate = tf.train.exponential_decay(params['learning_rate'],
                                                       tf.train.get_global_step(),
                                                       params['decay_steps'],
                                                       0.96, staircase=True)
            optimizer = tf.train.AdamOptimizer(learning_rate)
            # Apply gradient clipping
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            train_op = optimizer.apply_gradients(zip(gradients, variables),
                                                 global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    with tf.variable_scope('Evaluate'):
        # Compute the accuracy of the model (the number of sequences that the model
        # got right)
        eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels=decoder_o,
                                                           predictions=sample_id,
                                                           weights=target_w)}
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
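create_cell (like conv2d) is imported from elsewhere; a plausible sketch, assuming dropout (a keep probability, per the docstring) is applied only at train time:

def create_cell(mode, dropout, hidden_size):
    # Assumed helper: LSTM cell with output dropout during training only.
    cell = tf.contrib.rnn.BasicLSTMCell(hidden_size)
    if mode == tf.estimator.ModeKeys.TRAIN:
        cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=dropout)
    return cell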
Example #14
    def build_model(self):
        print('building model... ...')
        #================================= 1. Define the model placeholders
        self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                             name='encoder_inputs')
        self.encoder_inputs_length = tf.placeholder(
            tf.int32, [None], name='encoder_inputs_length')

        self.batch_size = tf.placeholder(tf.int32, [], name='batch_size')
        self.keep_prob_placeholder = tf.placeholder(
            tf.float32, name='keep_prob_placeholder')

        self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                              name='decoder_targets')
        self.decoder_targets_length = tf.placeholder(
            tf.int32, [None], name='decoder_targets_length')
        # Take the max of the target sequence lengths, then use it to build a sequence-length mask.
        # An example of what sequence_mask does:
        #  tf.sequence_mask([1, 3, 2], 5)
        #  [[True, False, False, False, False],
        #   [True, True, True, False, False],
        #   [True, True, False, False, False]]
        self.max_target_sequence_length = tf.reduce_max(
            self.decoder_targets_length, name='max_target_len')
        self.mask = tf.sequence_mask(self.decoder_targets_length,
                                     self.max_target_sequence_length,
                                     dtype=tf.float32,
                                     name='masks')

        #================================= 2. Define the encoder
        with tf.variable_scope('encoder'):
            # Create the LSTM cells: two layers + dropout
            encoder_cell = self._create_rnn_cell()
            # Build the embedding matrix, shared by the encoder and the decoder
            embedding = tf.get_variable('embedding',
                                        [self.vocab_size, self.embedding_size])
            encoder_inputs_embedded = tf.nn.embedding_lookup(
                embedding, self.encoder_inputs)
            # Run the LSTM with dynamic_rnn to encode the inputs into hidden vectors.
            # encoder_outputs feeds attention: batch_size * encoder_inputs_length * rnn_size
            # encoder_state initializes the decoder: batch_size * rnn_size
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,
                encoder_inputs_embedded,
                sequence_length=self.encoder_inputs_length,
                dtype=tf.float32)

        # ================================= 3. Define the decoder
        with tf.variable_scope('decoder'):
            encoder_inputs_length = self.encoder_inputs_length
            if self.beam_search:
                # With beam search, the encoder outputs must be passed through tile_batch, i.e. copied beam_size times.
                print("use beamsearch decoding..")
                encoder_outputs = tf.contrib.seq2seq.tile_batch(
                    encoder_outputs, multiplier=self.beam_size)
                encoder_state = nest.map_structure(
                    lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_size),
                    encoder_state)
                encoder_inputs_length = tf.contrib.seq2seq.tile_batch(
                    self.encoder_inputs_length, multiplier=self.beam_size)

            # Define the attention mechanism to use.
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
                num_units=self.rnn_size,
                memory=encoder_outputs,
                memory_sequence_length=encoder_inputs_length)
            #attention_mechanism = tf.contrib.seq2seq.LuongAttention(num_units=self.rnn_size, memory=encoder_outputs, memory_sequence_length=encoder_inputs_length)
            # Define the decoder-stage LSTM cell, then wrap it with the attention wrapper
            decoder_cell = self._create_rnn_cell()
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=self.rnn_size,
                name='Attention_Wrapper')
            # With beam search, batch_size = self.batch_size * self.beam_size, since everything was tiled above
            batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size
            # Initialize the decoder state directly from the encoder's final hidden state
            decoder_initial_state = decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_state)
            #TODO here i DONT CHANGE anything i think
            # output_layer = tf.layers.Dense(self.vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
            output_layer = Dense(
                self.vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                   stddev=0.1))

            if self.mode == 'train':
                # Decoder inputs: prepend <go> to the targets, drop the trailing <end>, then embed.
                # decoder_inputs_embedded has shape [batch_size, decoder_targets_length, embedding_size]
                ending = tf.strided_slice(self.decoder_targets, [0, 0],
                                          [self.batch_size, -1], [1, 1])
                decoder_input = tf.concat([
                    tf.fill([self.batch_size, 1], self.word_to_idx['<go>']),
                    ending
                ], 1)
                decoder_inputs_embedded = tf.nn.embedding_lookup(
                    embedding, decoder_input)
                # Training uses the standard TrainingHelper + BasicDecoder combination; a custom Helper class could be written for special behaviour
                training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=decoder_inputs_embedded,
                    sequence_length=self.decoder_targets_length,
                    time_major=False,
                    name='training_helper')
                training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=training_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)
                # Call dynamic_decode; decoder_outputs is a namedtuple with two fields (rnn_output, sample_id)
                # rnn_output: [batch_size, decoder_targets_length, vocab_size], per-step word scores, used to compute the loss
                # sample_id: [batch_size, decoder_targets_length], tf.int32, the decoded ids, i.e. the final answer
                decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_sequence_length)
                # Compute the loss and gradients from the outputs, and define the AdamOptimizer and train_op that perform the update
                self.decoder_logits_train = tf.identity(
                    decoder_outputs.rnn_output)
                self.decoder_predict_train = tf.argmax(
                    self.decoder_logits_train,
                    axis=-1,
                    name='decoder_pred_train')
                # Compute the loss with sequence_loss, passing in the mask defined earlier
                self.loss = tf.contrib.seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets,
                    weights=self.mask)

                # Training summary for the current batch_loss
                tf.summary.scalar('loss', self.loss)
                self.summary_op = tf.summary.merge_all()

                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                trainable_params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, trainable_params)
                clip_gradients, _ = tf.clip_by_global_norm(
                    gradients, self.max_gradient_norm)
                self.train_op = optimizer.apply_gradients(
                    zip(clip_gradients, trainable_params))
            elif self.mode == 'decode':
                start_tokens = tf.ones([
                    self.batch_size,
                ], tf.int32) * self.word_to_idx['<go>']
                end_token = self.word_to_idx['<eos>']
                # In decode mode the setup depends on whether beam search is used:
                # if so, call BeamSearchDecoder directly (it implements its own helper);
                # if not, use the GreedyEmbeddingHelper + BasicDecoder combination for greedy decoding
                if self.beam_search:
                    inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                        cell=decoder_cell,
                        embedding=embedding,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.beam_size,
                        output_layer=output_layer)
                else:
                    decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                        embedding=embedding,
                        start_tokens=start_tokens,
                        end_token=end_token)
                    inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                        cell=decoder_cell,
                        helper=decoding_helper,
                        initial_state=decoder_initial_state,
                        output_layer=output_layer)
                decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=inference_decoder, maximum_iterations=10)
                # dynamic_decode returns decoder_outputs, a namedtuple.
                # Without beam search it has two fields (rnn_output, sample_id):
                # rnn_output: [batch_size, decoder_targets_length, vocab_size]
                # sample_id: [batch_size, decoder_targets_length], tf.int32

                # With beam search it has two fields (predicted_ids, beam_search_decoder_output):
                # predicted_ids: [batch_size, decoder_targets_length, beam_size], holding the output ids
                # beam_search_decoder_output: a BeamSearchDecoderOutput instance, namedtuple(scores, predicted_ids, parent_ids)
                # So returning predicted_ids or sample_id, respectively, is enough to recover the final result
                if self.beam_search:
                    self.decoder_predict_decode = decoder_outputs.predicted_ids
                else:
                    self.decoder_predict_decode = tf.expand_dims(
                        decoder_outputs.sample_id, -1)
        # ================================= 4. Save the model
        self.saver = tf.train.Saver(tf.global_variables())
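A training-step sketch against the placeholders defined above (the session and batch variables are illustrative):

feed = {model.encoder_inputs: enc_batch,
        model.encoder_inputs_length: enc_lens,
        model.decoder_targets: dec_batch,
        model.decoder_targets_length: dec_lens,
        model.batch_size: len(enc_batch),
        model.keep_prob_placeholder: 0.5}
_, loss, summary = sess.run([model.train_op, model.loss, model.summary_op], feed_dict=feed)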
Example #15
def decoding_layer(target_letter_to_int, decoding_embedding_size, num_layers,
                   rnn_size, target_sequence_length,
                   max_target_sequence_length, encoder_state, decoder_input):
    '''
    :param target_letter_to_int: mapping table for the target data
    :param decoding_embedding_size: embedding vector size
    :param num_layers: number of stacked RNN layers
    :param rnn_size: number of hidden units in each RNN cell
    :param target_sequence_length: lengths of the target sequences
    :param max_target_sequence_length: maximum target sequence length
    :param encoder_state: state vector produced by the encoder
    :param decoder_input: decoder-side input
    '''

    # 1. Embedding
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(
        tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings,
                                                 decoder_input)

    # 2. Build the RNN cells for the decoder
    def get_decoder_cell(rnn_size):
        decoder_cell = tf.contrib.rnn.LSTMCell(
            rnn_size,
            initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
        return decoder_cell

    cell = tf.contrib.rnn.MultiRNNCell(
        [get_decoder_cell(rnn_size) for _ in range(num_layers)])

    # 3. Fully connected output layer
    output_layer = Dense(target_vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(
                             mean=0.0, stddev=0.1))

    # 4. Training decoder
    with tf.variable_scope("decode"):
        # Build the helper object
        training_helper = tf.contrib.seq2seq.TrainingHelper(
            inputs=decoder_embed_input,
            sequence_length=target_sequence_length,
            time_major=False)
        # Build the decoder
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, training_helper, encoder_state, output_layer)
        # dynamic_decode returns three values (outputs, final_state, final_sequence_lengths)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            training_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)

    # 5. Predicting decoder
    # Shares parameters with the training decoder
    with tf.variable_scope("decode", reuse=True):
        # Create a constant tensor and tile it to batch_size
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']],
                                           dtype=tf.int32), [batch_size],
                               name='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
            decoder_embeddings, start_tokens, target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(
            cell, predicting_helper, encoder_state, output_layer)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
            predicting_decoder,
            impute_finished=True,
            maximum_iterations=max_target_sequence_length)

    return training_decoder_output, predicting_decoder_output
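
# A small sketch (assumed names, not from the code above) of the parameter
# sharing used here: two decoders built in the same "decode" scope, the
# second with reuse=True, share a single set of weights.
import tensorflow as tf

def proj(x):
    return tf.layers.dense(x, 8, name='proj')

with tf.variable_scope('decode'):
    train_out = proj(tf.zeros([2, 4]))
with tf.variable_scope('decode', reuse=True):
    infer_out = proj(tf.ones([2, 4]))

print(len(tf.trainable_variables()))  # 2: one shared kernel and one shared bias

Example #16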
    def BuildNetwork(self, learningRate):
        #############################################################################
        # Input Data
        #############################################################################

        self.dataInput = tensorflow.placeholder(
            dtype=tensorflow.float32,
            shape=[None, None, self.featureShape],
            name='dataInput')
        self.dataLenInput = tensorflow.placeholder(dtype=tensorflow.int32,
                                                   shape=[None],
                                                   name='dataLenInput')

        self.labelInputSR = tensorflow.placeholder(dtype=tensorflow.int32,
                                                   shape=[None, None],
                                                   name='labelInput')
        self.labelLenInputSR = tensorflow.placeholder(dtype=tensorflow.int32,
                                                      shape=[None],
                                                      name='labelLenInput')

        self.labelInputDR = tensorflow.placeholder(dtype=tensorflow.float32,
                                                   shape=None,
                                                   name='labelInputDR')

        #############################################################################
        # Batch Parameters
        #############################################################################

        self.parameters['BatchSize'], self.parameters[
            'TimeStep'], _ = tensorflow.unstack(
                tensorflow.shape(input=self.dataInput, name='DataShape'))
        self.parameters['LabelStep'] = tensorflow.shape(
            input=self.labelInputSR, name='LabelShape')[1]

        ###################################################################################################
        # Encoder
        ###################################################################################################

        with tensorflow.variable_scope('Encoder'):
            self.parameters[
                'Encoder_Cell_Forward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)
            self.parameters[
                'Encoder_Cell_Backward'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Encoder_Output'], self.parameters['Encoder_FinalState'] = \
                tensorflow.nn.bidirectional_dynamic_rnn(
                    cell_fw=self.parameters['Encoder_Cell_Forward'], cell_bw=self.parameters['Encoder_Cell_Backward'],
                    inputs=self.dataInput, sequence_length=self.dataLenInput, dtype=tensorflow.float32)

        self.attentionList = self.firstAttention(
            dataInput=self.parameters['Encoder_Output'],
            scopeName=self.firstAttentionName,
            hiddenNoduleNumber=2 * self.hiddenNodules,
            attentionScope=self.firstAttentionScope,
            blstmFlag=True)
        self.parameters['Decoder_InitalState'] = []
        for index in range(self.rnnLayers):
            self.parameters[
                'Encoder_Cell_Layer%d' % index] = rnn.LSTMStateTuple(
                    c=self.attentionList['FinalResult'],
                    h=tensorflow.concat([
                        self.parameters['Encoder_FinalState'][index][0].h,
                        self.parameters['Encoder_FinalState'][index][1].h
                    ],
                                        axis=1))
            self.parameters['Decoder_InitalState'].append(
                self.parameters['Encoder_Cell_Layer%d' % index])
        self.parameters['Decoder_InitalState'] = tuple(
            self.parameters['Decoder_InitalState'])

        #############################################################################
        # Decoder Label Pretreatment
        #############################################################################

        self.parameters['DecoderEmbedding'] = tensorflow.Variable(
            initial_value=tensorflow.truncated_normal(
                shape=[VOCABULAR, self.hiddenNodules * 2],
                stddev=0.1,
                name='DecoderEmbedding'))

        self.parameters[
            'DecoderEmbeddingResult'] = tensorflow.nn.embedding_lookup(
                params=self.parameters['DecoderEmbedding'],
                ids=self.labelInputSR,
                name='DecoderEmbeddingResult')

        #############################################################################
        # Decoder
        #############################################################################

        self.parameters['Decoder_Helper'] = seq2seq.TrainingHelper(
            inputs=self.parameters['DecoderEmbeddingResult'],
            sequence_length=self.labelLenInputSR,
            name='Decoder_Helper')
        with tensorflow.variable_scope('Decoder'):
            self.parameters['Decoder_FC'] = Dense(VOCABULAR)

            self.parameters[
                'Decoder_Cell'] = tensorflow.nn.rnn_cell.MultiRNNCell(
                    cells=[
                        rnn.LSTMCell(num_units=self.hiddenNodules * 2)
                        for _ in range(self.rnnLayers)
                    ],
                    state_is_tuple=True)

            self.parameters['Decoder'] = seq2seq.BasicDecoder(
                cell=self.parameters['Decoder_Cell'],
                helper=self.parameters['Decoder_Helper'],
                initial_state=self.parameters['Decoder_InitalState'],
                output_layer=self.parameters['Decoder_FC'])

            self.parameters['Decoder_Logits'], self.parameters[
                'Decoder_FinalState'], self.parameters[
                    'Decoder_FinalSeq'] = seq2seq.dynamic_decode(
                        decoder=self.parameters['Decoder'])

        with tensorflow.name_scope('Loss'):
            self.parameters['TargetsReshape'] = tensorflow.reshape(
                tensor=self.labelInputSR, shape=[-1], name='TargetsReshape')
            self.parameters['Decoder_Reshape'] = tensorflow.reshape(
                self.parameters['Decoder_Logits'].rnn_output, [-1, VOCABULAR],
                name='Decoder_Reshape')
            self.parameters[
                'Cost'] = tensorflow.losses.sparse_softmax_cross_entropy(
                    labels=self.parameters['TargetsReshape'],
                    logits=self.parameters['Decoder_Reshape'])

            self.trainEncoderDecoder = tensorflow.train.AdamOptimizer(
                learning_rate=learningRate).minimize(self.parameters['Cost'])

        #############################################################################
        self.DBLSTM_Structure(learningRate=learningRate)
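
# A toy numpy sketch (sizes illustrative, not from the model above) of the
# state fusion used in BuildNetwork: the decoder's h is the concatenation of
# the forward and backward final h of the bidirectional encoder.
import numpy as np

h_fw = np.ones((2, 3))    # [batch, hidden]
h_bw = np.zeros((2, 3))   # [batch, hidden]
h_dec = np.concatenate([h_fw, h_bw], axis=1)
print(h_dec.shape)        # (2, 6) == [batch, 2 * hidden]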
Example #17
def model_fn(features, labels, mode, params):
    # particular to this project
    word2index = params['word2index']
    # index2word = params['index2word']

    GPUs = get_available_gpus()
    GPU = {
        'titan': GPUs[1],
        'sidekick': GPUs[0]}

    lookup_table, emb_vectors = load_embeddings(params['embedding_vectors'], params['vocab'])
    embedded_enc_input = tf.nn.embedding_lookup(emb_vectors, features['encoder_inputs'])
    forget_bias = get_forget_bias(params, mode)

    num_units = [2048, 2048]
    init = tf.initializers.truncated_normal(0.0, 0.01)

    with tf.device(GPU['titan']):
        encoder_cells = [tf.nn.rnn_cell.LSTMCell(num_units=num, forget_bias=forget_bias, initializer=init) for num in num_units]
        encoder_stacked_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(encoder_cells)

        enc_outputs, enc_final_state = tf.nn.dynamic_rnn(encoder_stacked_rnn_cell,
                                                         embedded_enc_input,
                                                         sequence_length=features['encoder_input_lengths'],
                                                         dtype=tf.float32)
    # Decoder model
    with tf.device(GPU['sidekick']):
        partial_embedding_helper = partial(embedding_helper, emb_vectors=emb_vectors)
        if mode == tf.estimator.ModeKeys.TRAIN:
            embed_dec_inputs = tf.nn.embedding_lookup(emb_vectors, features['decoder_inputs'])
            helper = tf.contrib.seq2seq.TrainingHelper(
                inputs=embed_dec_inputs,
                sequence_length=features['decoder_input_lengths'],
            )
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=partial_embedding_helper,
                start_tokens=tf.tile([word2index['<GO>']],
                                     [tf.shape(features['encoder_inputs'])[0]]),
                end_token=word2index['<EOS>'])

        dec_cell = tf.nn.rnn_cell.LSTMCell(num_units=num_units[-1],  # needs to match size of last layer of encoder
                                           forget_bias=forget_bias,
                                           initializer=init)

        decoder = tf.contrib.seq2seq.BasicDecoder(
            cell=dec_cell,
            helper=helper,
            initial_state=enc_final_state[-1],
            output_layer=Dense(params['vocab_size'], use_bias=False))
        dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
            decoder=decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=params['output_max_length'])
    logits = tf.identity(dec_outputs.rnn_output, 'logits')

    if mode == tf.estimator.ModeKeys.PREDICT:
        indices = predict_words(logits)
        predictions = {'sentence_tokens': indices}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    training_labels = labels['target_sequences']
    weights = tf.cast(tf.not_equal(training_labels, tf.constant(word2index['<PAD>'])), tf.float32)
    sequence_loss = tf.contrib.seq2seq.sequence_loss(logits=logits, targets=training_labels, weights=weights)

    tf.summary.scalar('sequence_loss', sequence_loss)

    if mode == tf.estimator.ModeKeys.EVAL:
        metrics = {'sequence_loss': tf.metrics.mean(sequence_loss)}
        return tf.estimator.EstimatorSpec(mode, loss=sequence_loss, eval_metric_ops=metrics)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train_op = optimizer.minimize(sequence_loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(mode, loss=sequence_loss, train_op=train_op)
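
# A short sketch (pad id and targets are illustrative) of the loss mask built
# above: positions equal to <PAD> receive weight 0 in sequence_loss.
import tensorflow as tf

targets = tf.constant([[5, 3, 0, 0], [2, 0, 0, 0]])  # 0 plays the role of <PAD>
weights = tf.cast(tf.not_equal(targets, 0), tf.float32)
with tf.Session() as sess:
    print(sess.run(weights))  # [[1. 1. 0. 0.], [1. 0. 0. 0.]]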
Example #18
    def __init__(self, tfFLAGS, embed=None):
        self.vocab_size = tfFLAGS.vocab_size
        self.embed_size = tfFLAGS.embed_size
        self.num_units = tfFLAGS.num_units
        self.num_layers = tfFLAGS.num_layers
        self.beam_width = tfFLAGS.beam_width
        self.use_lstm = tfFLAGS.use_lstm
        self.attn_mode = tfFLAGS.attn_mode
        self.train_keep_prob = tfFLAGS.keep_prob
        self.max_decode_len = tfFLAGS.max_decode_len
        self.bi_encode = tfFLAGS.bi_encode
        self.recog_hidden_units = tfFLAGS.recog_hidden_units
        self.prior_hidden_units = tfFLAGS.prior_hidden_units
        self.z_dim = tfFLAGS.z_dim
        self.full_kl_step = tfFLAGS.full_kl_step

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.max_gradient_norm = 5.0
        if tfFLAGS.opt == 'SGD':
            self.learning_rate = tf.Variable(float(tfFLAGS.learning_rate),
                                             trainable=False,
                                             dtype=tf.float32)
            self.learning_rate_decay_op = self.learning_rate.assign(
                self.learning_rate * tfFLAGS.learning_rate_decay_factor)
            self.opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        elif tfFLAGS.opt == 'Momentum':
            self.opt = tf.train.MomentumOptimizer(
                learning_rate=tfFLAGS.learning_rate, momentum=tfFLAGS.momentum)
        else:
            self.learning_rate = tfFLAGS.learning_rate
            self.opt = tf.train.AdamOptimizer()

        self._make_input(embed)

        with tf.variable_scope("output_layer"):
            self.output_layer = Dense(
                self.vocab_size,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))

        with tf.variable_scope("encoders",
                               initializer=tf.orthogonal_initializer()):
            self.enc_post_outputs, self.enc_post_state = self._build_encoder(
                scope='post_encoder',
                inputs=self.enc_post,
                sequence_length=self.post_len)
            self.enc_ref_outputs, self.enc_ref_state = self._build_encoder(
                scope='ref_encoder',
                inputs=self.enc_ref,
                sequence_length=self.ref_len)
            self.enc_response_outputs, self.enc_response_state = self._build_encoder(
                scope='resp_encoder',
                inputs=self.enc_response,
                sequence_length=self.response_len)

            self.post_state = self._get_representation_from_enc_state(
                self.enc_post_state)
            self.ref_state = self._get_representation_from_enc_state(
                self.enc_ref_state)
            self.response_state = self._get_representation_from_enc_state(
                self.enc_response_state)
            self.cond_embed = tf.concat([self.post_state, self.ref_state],
                                        axis=-1)

        with tf.variable_scope("RecognitionNetwork"):
            recog_input = tf.concat([self.cond_embed, self.response_state],
                                    axis=-1)
            recog_hidden = tf.layers.dense(inputs=recog_input,
                                           units=self.recog_hidden_units,
                                           activation=tf.nn.tanh)
            recog_mulogvar = tf.layers.dense(inputs=recog_hidden,
                                             units=self.z_dim * 2,
                                             activation=None)
            # recog_mulogvar = tf.layers.dense(inputs=recog_input, units=self.z_dim * 2, activation=None)
            recog_mu, recog_logvar = tf.split(recog_mulogvar, 2, axis=-1)

        with tf.variable_scope("PriorNetwork"):
            prior_input = self.cond_embed
            prior_hidden = tf.layers.dense(inputs=prior_input,
                                           units=self.prior_hidden_units,
                                           activation=tf.nn.tanh)
            prior_mulogvar = tf.layers.dense(inputs=prior_hidden,
                                             units=self.z_dim * 2,
                                             activation=None)
            prior_mu, prior_logvar = tf.split(prior_mulogvar, 2, axis=-1)

        with tf.variable_scope("GenerationNetwork"):
            latent_sample = tf.cond(
                self.use_prior,
                lambda: sample_gaussian(prior_mu, prior_logvar),
                lambda: sample_gaussian(recog_mu, recog_logvar),
                name='latent_sample')

            gen_input = tf.concat([self.cond_embed, latent_sample], axis=-1)
            if self.use_lstm:
                self.dec_init_state = tuple([
                    tf.contrib.rnn.LSTMStateTuple(
                        c=tf.layers.dense(inputs=gen_input,
                                          units=self.num_units,
                                          activation=None),
                        h=tf.layers.dense(inputs=gen_input,
                                          units=self.num_units,
                                          activation=None))
                    for _ in range(self.num_layers)
                ])
                print(self.dec_init_state)
            else:
                self.dec_init_state = tuple([
                    tf.layers.dense(inputs=gen_input,
                                    units=self.num_units,
                                    activation=None)
                    for _ in range(self.num_layers)
                ])

            kld = gaussian_kld(recog_mu, recog_logvar, prior_mu, prior_logvar)
            self.avg_kld = tf.reduce_mean(kld)
            self.kl_weights = tf.minimum(
                tf.to_float(self.global_step) / self.full_kl_step, 1.0)
            self.kl_loss = self.kl_weights * self.avg_kld

        self._build_decoder()
        self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                    max_to_keep=1,
                                    pad_step_number=True,
                                    keep_checkpoint_every_n_hours=1.0)
        for var in tf.trainable_variables():
            print(var)
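
# A quick sketch of the linear KL annealing schedule used above, with an
# assumed full_kl_step of 1000: the weight ramps from 0 to 1, then saturates.
full_kl_step = 1000.0
for step in (0, 250, 500, 1000, 2000):
    kl_weight = min(step / full_kl_step, 1.0)
    print(step, kl_weight)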
Example #19
def build_decoder(encoder_outputs, encoder_state, input_sequence_length,
                  char_ids, batch_size, num_classes, num_decoder_layers,
                  maximum_iterations):

    vocab_size = num_classes
    out_layer = Dense(vocab_size, name='output_projection')

    # Decoder.
    with tf.variable_scope("decoder") as decoder_scope:

        cell, decoder_initial_state = build_decoder_cell(
            encoder_outputs, encoder_state, input_sequence_length,
            num_decoder_layers, batch_size)

        # Train
        # if mode != 'INFER':
        # char_ids = tf.placeholder(tf.int32,
        #                                shape=[None, None],
        #                                name='ids_target')
        embedding = tf.get_variable(
            'embedding',
            shape=[vocab_size, 300],  # embedding dimension of 300
            dtype=tf.float32)

        char_embedding_lookup = tf.nn.embedding_lookup(embedding,
                                                       char_ids,
                                                       name='char_embedding')
        char_embedding = tf.nn.dropout(char_embedding_lookup,
                                       0.986,
                                       name='char_embedding_dropout')

        helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs=char_embedding,
            sequence_length=input_sequence_length,
            embedding=embedding,
            sampling_probability=0.5,
            time_major=False)

        # Decoder
        my_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                     helper,
                                                     decoder_initial_state,
                                                     output_layer=out_layer)

        # Dynamic decoding
        outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
            my_decoder,
            output_time_major=False,
            maximum_iterations=maximum_iterations,
            swap_memory=False,
            impute_finished=True,
            scope=decoder_scope)

        sample_id = outputs.sample_id
        logits = outputs.rnn_output

        # Inference
        # else:
        #     start_tokens = tf.fill([batch_size], sos_id_2)
        #     end_token = eos_id_2

        #     # Beam search
        #     if beam_width > 0:
        #         my_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
        #             cell=cell,
        #             embedding=embedding,
        #             start_tokens=start_tokens,
        #             end_token=end_token,
        #             initial_state=decoder_initial_state,
        #             beam_width=beam_width,
        #             output_layer=output_layer,
        #         )

        #     # Greedy
        #     else:
        #         helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding,
        #                                                           start_tokens,
        #                                                           end_token)

        #         my_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
        #                                                      helper,
        #                                                      decoder_initial_state,
        #                                                      output_layer=output_layer)
        #     if inference_targets:
        #         maximum_iterations = maximum_iterations
        #     else:
        #         maximum_iterations = None

        #     # Dynamic decoding
        #     outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
        #         my_decoder,
        #         maximum_iterations=maximum_iterations,
        #         output_time_major=False,
        #         impute_finished=False,
        #         swap_memory=False,
        #         scope=decoder_scope)

        #     if beam_width > 0:
        #         logits = tf.no_op()
        #         sample_id = outputs.predicted_ids
        #     else:
        #         logits = tf.no_op()
        #         sample_id = outputs.sample_id

    return logits, sample_id, final_context_state
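
# An illustrative numpy sketch (not the helper's actual implementation) of
# what sampling_probability=0.5 means in ScheduledEmbeddingTrainingHelper:
# at each step, with probability 0.5 the decoder is fed its own sampled token
# instead of the ground-truth token.
import numpy as np

rng = np.random.RandomState(0)
feed_model_token = rng.rand(10) < 0.5  # one coin flip per time step
print(feed_model_token)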
Example #20
def _create_decoder(cells,
                    batch_size,
                    encoder_outputs,
                    encoder_state,
                    encoder_lengths,
                    decoding_inputs,
                    decoding_lengths,
                    embed_matrix,
                    target_vocab_size,
                    scope,
                    max_sequence_size,
                    use_attention=True):
    """Summary

    Parameters
    ----------
    cells : TYPE
        Description
    batch_size : TYPE
        Description
    encoder_outputs : TYPE
        Description
    encoder_state : TYPE
        Description
    encoder_lengths : TYPE
        Description
    decoding_inputs : TYPE
        Description
    decoding_lengths : TYPE
        Description
    embed_matrix : TYPE
        Description
    target_vocab_size : TYPE
        Description
    scope : TYPE
        Description
    max_sequence_size : TYPE
        Description
    use_attention : bool, optional
        Description

    Returns
    -------
    TYPE
        Description
    """
    from tensorflow.python.layers.core import Dense

    # Output projection
    output_layer = Dense(target_vocab_size, name='output_projection')

    # Setup Attention
    if use_attention:
        attn_mech = tf.contrib.seq2seq.LuongAttention(cells.output_size,
                                                      encoder_outputs,
                                                      encoder_lengths,
                                                      scale=True)
        cells = tf.contrib.seq2seq.AttentionWrapper(
            cell=cells,
            attention_mechanism=attn_mech,
            attention_layer_size=cells.output_size,
            alignment_history=False)
        initial_state = cells.zero_state(dtype=tf.float32,
                                         batch_size=batch_size)
        initial_state = initial_state.clone(cell_state=encoder_state)
    else:
        # Without attention, decode directly from the encoder's final state.
        initial_state = encoder_state

    # Set up training and build the decoder
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=decoding_inputs,
        sequence_length=decoding_lengths,
        time_major=False)
    train_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells,
        helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    train_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        train_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    train_logits = tf.identity(train_outputs.rnn_output, name='train_logits')

    # Set up inference and build the decoder
    scope.reuse_variables()
    start_tokens = tf.tile(tf.constant([GO_ID], dtype=tf.int32), [batch_size])
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embedding=embed_matrix, start_tokens=start_tokens, end_token=EOS_ID)
    infer_decoder = tf.contrib.seq2seq.BasicDecoder(
        cell=cells,
        helper=helper,
        initial_state=initial_state,
        output_layer=output_layer)
    infer_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        infer_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_sequence_size)
    infer_logits = tf.identity(infer_outputs.sample_id, name='infer_logits')

    return train_logits, infer_logits
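
# A numpy sketch of the multiplicative (Luong) attention score set up above:
# score_t = h_dec . (W h_enc_t), softmaxed into weights, then a context
# vector. All sizes here are illustrative.
import numpy as np

rng = np.random.RandomState(1)
h_dec = rng.randn(4)           # decoder query
H_enc = rng.randn(6, 4)        # six encoder states
W = rng.randn(4, 4)
scores = H_enc.dot(W).dot(h_dec)
alpha = np.exp(scores) / np.exp(scores).sum()  # attention weights
context = alpha.dot(H_enc)                     # weighted sum of encoder states
print(alpha.sum(), context.shape)              # ~1.0, (4,)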
Example #21
    def Decoder(self, encoder_output, encoder_final_state):
        with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
            emb_w = tf.get_variable("embedding",
                                    shape=[self.voc_size, self.dim_hidden])

        if self.mode == 'test' and self.beam:
            print("use beam search decoding...")
            encoder_output = tf.contrib.seq2seq.tile_batch(
                encoder_output, multiplier=self.beam_size)
            encoder_final_state = tf.contrib.seq2seq.tile_batch(
                encoder_final_state, multiplier=self.beam_size)

        attention_output = tf.contrib.seq2seq.LuongAttention(
            self.dim_hidden, encoder_output)
        decoder_cell = tf.nn.rnn_cell.MultiRNNCell([
            self.get_a_cell(self.dim_hidden)
            for _ in range(self.lstm_num_layer)
        ])
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell,
            attention_output,
            attention_layer_size=self.dim_hidden)
        projection_layer = Dense(self.voc_size, use_bias=False)

        if self.mode == 'train':
            decoder_input = tf.nn.embedding_lookup(emb_w, self.ys[:, :-1])
            decoder_seq_length = [self.input_timestep] * self.batch_size
            decoder_init_state = decoder_cell.zero_state(
                self.batch_size,
                self.dtype).clone(cell_state=encoder_final_state)
            helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                decoder_input,
                decoder_seq_length,
                emb_w,
                0.2,
                time_major=False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                decoder_cell,
                helper,
                decoder_init_state,
                output_layer=projection_layer)

            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=training_decoder, maximum_iterations=self.max_len)
        elif self.mode == 'test':
            start_tokens = tf.ones([self.batch_size], tf.int32)
            end_token = 2
            if self.beam:
                decoder_init_state = decoder_cell.zero_state(
                    self.batch_size * self.beam_size,
                    self.dtype).clone(cell_state=encoder_final_state)
                inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                    cell=decoder_cell,
                    embedding=emb_w,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=decoder_init_state,
                    beam_width=self.beam_size,
                    output_layer=projection_layer)
            else:

                decoder_init_state = decoder_cell.zero_state(
                    self.batch_size,
                    self.dtype).clone(cell_state=encoder_final_state)
                decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=emb_w,
                    start_tokens=start_tokens,
                    end_token=end_token)
                inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=decoding_helper,
                    initial_state=decoder_init_state,
                    output_layer=projection_layer)

            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder=inference_decoder, maximum_iterations=self.max_len)

        return outputs
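
# A small sketch of tile_batch as used above: each batch entry is repeated
# beam_size times, so the beam search decoder sees memories of shape
# [batch_size * beam_size, ...]. Values are illustrative.
import tensorflow as tf

x = tf.constant([[1, 2], [3, 4]])
tiled = tf.contrib.seq2seq.tile_batch(x, multiplier=3)
with tf.Session() as sess:
    print(sess.run(tiled))  # [[1 2] [1 2] [1 2] [3 4] [3 4] [3 4]]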
Example #22
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size,
                   max_generation_length):
    src_word_idx = tf.placeholder(tf.int32, shape=[None, None])
    src_sequence_length = tf.placeholder(tf.int32, shape=[
        None,
    ])

    src_embedding_weights = tf.get_variable("source_word_embeddings",
                                            [source_dict_dim, embedding_dim])
    src_embedding = tf.nn.embedding_lookup(src_embedding_weights, src_word_idx)

    src_forward_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    src_reversed_cell = tf.nn.rnn_cell.BasicLSTMCell(encoder_size)
    # no peephole
    encoder_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=src_forward_cell,
        cell_bw=src_reversed_cell,
        inputs=src_embedding,
        sequence_length=src_sequence_length,
        dtype=tf.float32)

    # concat the forward outputs and backward outputs
    encoded_vec = tf.concat(encoder_outputs, axis=2)

    # project the encoder outputs to size of decoder lstm
    encoded_proj = tf.contrib.layers.fully_connected(inputs=tf.reshape(
        encoded_vec, shape=[-1, embedding_dim * 2]),
                                                     num_outputs=decoder_size,
                                                     activation_fn=None,
                                                     biases_initializer=None)
    encoded_proj_reshape = tf.reshape(
        encoded_proj, shape=[-1, tf.shape(encoded_vec)[1], decoder_size])

    # get init state for decoder lstm's H
    backword_first = tf.slice(encoder_outputs[1], [0, 0, 0], [-1, 1, -1])
    decoder_boot = tf.contrib.layers.fully_connected(inputs=tf.reshape(
        backword_first, shape=[-1, embedding_dim]),
                                                     num_outputs=decoder_size,
                                                     activation_fn=tf.nn.tanh,
                                                     biases_initializer=None)

    # prepare the initial state for decoder lstm
    cell_init = tf.zeros(tf.shape(decoder_boot), tf.float32)
    initial_state = LSTMStateTuple(cell_init, decoder_boot)

    # create decoder lstm cell
    decoder_cell = LSTMCellWithSimpleAttention(
        decoder_size,
        encoded_vec if not is_generating else seq2seq.tile_batch(
            encoded_vec, beam_size),
        encoded_proj_reshape if not is_generating else seq2seq.tile_batch(
            encoded_proj_reshape, beam_size),
        src_sequence_length if not is_generating else seq2seq.tile_batch(
            src_sequence_length, beam_size),
        forget_bias=0.0)

    output_layer = Dense(target_dict_dim, name='output_projection')

    if not is_generating:
        trg_word_idx = tf.placeholder(tf.int32, shape=[None, None])
        trg_sequence_length = tf.placeholder(tf.int32, shape=[
            None,
        ])
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])
        trg_embedding = tf.nn.embedding_lookup(trg_embedding_weights,
                                               trg_word_idx)

        training_helper = seq2seq.TrainingHelper(
            inputs=trg_embedding,
            sequence_length=trg_sequence_length,
            time_major=False,
            name='training_helper')

        training_decoder = seq2seq.BasicDecoder(cell=decoder_cell,
                                                helper=training_helper,
                                                initial_state=initial_state,
                                                output_layer=output_layer)

        # get the max length of target sequence
        max_decoder_length = tf.reduce_max(trg_sequence_length)

        decoder_outputs_train, _, _ = seq2seq.dynamic_decode(
            decoder=training_decoder,
            output_time_major=False,
            impute_finished=True,
            maximum_iterations=max_decoder_length)

        decoder_logits_train = tf.identity(decoder_outputs_train.rnn_output)
        decoder_pred_train = tf.argmax(decoder_logits_train,
                                       axis=-1,
                                       name='decoder_pred_train')
        masks = tf.sequence_mask(lengths=trg_sequence_length,
                                 maxlen=max_decoder_length,
                                 dtype=tf.float32,
                                 name='masks')

        # placeholder for the label sequence
        lbl_word_idx = tf.placeholder(tf.int32, shape=[None, None])

        # compute the loss
        loss = seq2seq.sequence_loss(logits=decoder_logits_train,
                                     targets=lbl_word_idx,
                                     weights=masks,
                                     average_across_timesteps=True,
                                     average_across_batch=True)

        # return feeding list and loss operator
        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length,
            'trg_word_idx': trg_word_idx,
            'trg_sequence_length': trg_sequence_length,
            'lbl_word_idx': lbl_word_idx
        }, loss
    else:
        start_tokens = tf.ones([
            tf.shape(src_word_idx)[0],
        ], tf.int32) * START_TOKEN_IDX
        # share the same embedding weights with target word
        trg_embedding_weights = tf.get_variable(
            "target_word_embeddings", [target_dict_dim, embedding_dim])

        inference_decoder = beam_search_decoder.BeamSearchDecoder(
            cell=decoder_cell,
            embedding=lambda tokens: tf.nn.embedding_lookup(
                trg_embedding_weights, tokens),
            start_tokens=start_tokens,
            end_token=END_TOKEN_IDX,
            initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                tf.contrib.seq2seq.tile_batch(initial_state[0], beam_size),
                tf.contrib.seq2seq.tile_batch(initial_state[1], beam_size)),
            beam_width=beam_size,
            output_layer=output_layer)

        decoder_outputs_decode, _, _ = seq2seq.dynamic_decode(
            decoder=inference_decoder,
            output_time_major=False,
            # impute_finished=True,  # raises an error with beam search
            maximum_iterations=max_generation_length)

        predicted_ids = decoder_outputs_decode.predicted_ids

        return {
            'src_word_idx': src_word_idx,
            'src_sequence_length': src_sequence_length
        }, predicted_ids
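
# A brief sketch of tf.sequence_mask as used for the loss weights above
# (lengths are illustrative).
import tensorflow as tf

lengths = tf.constant([2, 4])
masks = tf.sequence_mask(lengths=lengths, maxlen=4, dtype=tf.float32)
with tf.Session() as sess:
    print(sess.run(masks))  # [[1. 1. 0. 0.], [1. 1. 1. 1.]]

Example #23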
    def build_decoder(self, encoder_outputs, encoder_state):

        sos_id_2 = tf.cast(self.char2ind[self.sos], tf.int32)
        eos_id_2 = tf.cast(self.char2ind[self.eos], tf.int32)
        self.output_layer = Dense(self.vocab_size, name='output_projection')

        # Decoder.
        with tf.variable_scope("decoder") as decoder_scope:

            cell, decoder_initial_state = self.build_decoder_cell(
                encoder_outputs, encoder_state, self.audio_sequence_lengths)

            # Train
            if self.mode != 'INFER':

                helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
                    inputs=self.char_embedding,
                    sequence_length=self.char_sequence_lengths,
                    embedding=self.embedding,
                    sampling_probability=0.5,
                    time_major=False)

                # Decoder
                my_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell,
                    helper,
                    decoder_initial_state,
                    output_layer=self.output_layer)

                # Dynamic decoding
                outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    my_decoder,
                    output_time_major=False,
                    maximum_iterations=self.maximum_iterations,
                    swap_memory=False,
                    impute_finished=True,
                    scope=decoder_scope)

                sample_id = outputs.sample_id
                logits = outputs.rnn_output

            # Inference
            else:
                start_tokens = tf.fill([self.batch_size], sos_id_2)
                end_token = eos_id_2

                # Beam search
                if self.beam_width > 0:
                    my_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                        cell=cell,
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.output_layer,
                    )

                # Greedy
                else:
                    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                        self.embedding, start_tokens, end_token)

                    my_decoder = tf.contrib.seq2seq.BasicDecoder(
                        cell,
                        helper,
                        decoder_initial_state,
                        output_layer=self.output_layer)
                if self.inference_targets:
                    maximum_iterations = self.maximum_iterations
                else:
                    maximum_iterations = None

                # Dynamic decoding
                outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    my_decoder,
                    maximum_iterations=maximum_iterations,
                    output_time_major=False,
                    impute_finished=False,
                    swap_memory=False,
                    scope=decoder_scope)

                if self.beam_width > 0:
                    logits = tf.no_op()
                    sample_id = outputs.predicted_ids
                else:
                    logits = tf.no_op()
                    sample_id = outputs.sample_id

        return logits, sample_id, final_context_state
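
# An illustrative numpy sketch of GreedyEmbeddingHelper's per-step behavior
# as used above: argmax over the logits, then feed that token's embedding to
# the next step (names and sizes are assumptions).
import numpy as np

emb = np.eye(5, dtype=np.float32)             # toy embedding matrix
logits = np.array([0.1, 2.0, 0.3, 0.0, -1.0])
next_token = int(np.argmax(logits))           # greedy choice -> 1
next_input = emb[next_token]                  # embedding fed to the next step
print(next_token, next_input)

Example #24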
    def __init__(self,
                 num_emb,
                 batch_size,
                 emb_dim,
                 encoder_num_units,
                 emb_data,
                 ques_length,
                 ans_length,
                 start_token,
                 gen_filter_sizes,
                 gen_num_filters,
                 learning_rate=0.01,
                 reward_gamma=0.95):
        self.num_emb = num_emb
        self.batch_size = batch_size
        self.emb_dim = emb_dim
        self.emb_data = emb_data
        self.encoder_num_units = encoder_num_units
        self.max_ques_length = ques_length
        self.max_ans_length = ans_length
        self.start_token = tf.constant([start_token] * self.batch_size,
                                       dtype=tf.int32)
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.reward_gamma = reward_gamma
        self.gen_filter_sizes = gen_filter_sizes
        self.gen_num_filters = gen_num_filters
        self.grad_clip = 5.0

        self.seq_start_token = None
        self.seq_end_token = None
        self.rnn_size = 50
        self.layer_size = 2
        self.beam_width = 10
        self.atten_depth = 50  # depth of the attention query mechanism

        self.g_embeddings = tf.Variable(
            self.init_matrix([self.num_emb, self.emb_dim]))

        self.x = tf.placeholder(tf.int32,
                                shape=[
                                    self.batch_size, self.max_ques_length
                                ])  # sequence of tokens generated by generator
        self.response = tf.placeholder(
            tf.int32, shape=[self.batch_size, self.max_ans_length
                             ])  # get from rollout policy and discriminator
        self.target_sequence_length = tf.placeholder(
            tf.int32, [self.batch_size], name='target_sequence_length')
        self.target_response_length = tf.placeholder(
            tf.int32, [self.batch_size], name='target_response_length')
        self.max_response_length_per_batch = tf.placeholder(tf.int32, shape=())

        with tf.device("/cpu:0"):
            #self.processed_x = tf.transpose(tf.nn.embedding_lookup(self.g_embeddings, self.x), perm=[1, 0, 2])  # seq_length x batch_size x emb_dim
            self.processed_x = tf.nn.embedding_lookup(self.g_embeddings,
                                                      self.x)
            self.processed_response = tf.nn.embedding_lookup(
                self.g_embeddings, self.response)
            print("processed_x shape: ", self.processed_x.shape)
            print("processed_response shape: ", self.processed_response.shape)

        self.add_encoder_layer()
        self.getCnnEncoder(self.gen_filter_sizes, self.gen_num_filters)
        self.output_layer = Dense(
            self.num_emb,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))
        with tf.variable_scope('decode'):
            training_decoder_output = self.add_decoder_for_training()
        with tf.variable_scope('decode', reuse=True):
            predicting_decoder_output, final_context_state = self.add_decoder_for_inference(
            )

        # attention visualization
        attention_images = (final_context_state.alignment_history.stack())
        print("attention_images shape: ", attention_images.shape)
        # Reshape to (batch, src_seq_len, tgt_seq_len,1)
        attention_images = tf.expand_dims(
            tf.transpose(attention_images, [1, 2, 0]), -1)
        # Scale to range [0, 255]
        attention_images *= 255
        self.infer_summary = tf.summary.image("attention_images",
                                              attention_images)

        # encoder_output, encoder_state = self.get_encoder_layer(self.processed_x, self.encode_rnn_size, self.encode_layer_size, self.target_sequence_length) #sourse seqlenth

        # training_decoder_output, predicting_decoder_output = self.decoding_layer(
        #     self.decode_layer_size,
        #     self.decode_rnn_size,
        #     self.target_response_length,
        #     self.max_ans_length,
        #     encoder_state,
        #     encoder_output,
        #     self.x)

        #######################################################################################################
        #  Training
        #######################################################################################################
        self.g_pretrain_predictions = training_decoder_output.rnn_output
        self.g_pretrain_sample = training_decoder_output.sample_id
        print("self.g_pretrain_predictions: ", self.g_pretrain_predictions)
        masks = tf.sequence_mask(self.target_sequence_length,
                                 self.max_response_length_per_batch,
                                 dtype=tf.float32,
                                 name='masks')
        self.pretrain_loss = tf.contrib.seq2seq.sequence_loss(
            self.g_pretrain_predictions,
            self.response[:, 0:self.max_response_length_per_batch], masks)
        # training updates
        pretrain_opt = self.g_optimizer(self.learning_rate)

        pre_gradients = pretrain_opt.compute_gradients(self.pretrain_loss)
        self.pretrain_grad_zip = [(tf.clip_by_value(grad, -5., 5.), var)
                                  for grad, var in pre_gradients
                                  if grad is not None]
        self.pretrain_updates = pretrain_opt.apply_gradients(
            self.pretrain_grad_zip)

        self.g_samples = predicting_decoder_output.predicted_ids
Example #25
    def _init_decoder(self, forward_only):
        with tf.variable_scope("decoder") as scope:

            def output_fn(outputs):
                return tf.contrib.layers.linear(outputs,
                                                self.target_vocab_size,
                                                scope=scope)

            # attention_states: size [batch_size, max_time, num_units]
            #attention_states = tf.transpose(self.encoder_outputs, [1, 0, 2])
            self.batch_size = tf.shape(self.encoder_inputs)[0]

            self.attn_mech = tf.contrib.seq2seq.LuongAttention(
                num_units=self.dec_hidden_size,
                memory=self.encoder_outputs,
                memory_sequence_length=self.encoder_inputs_length,
                normalize=False,
                name='LuongAttention')
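
            # Note: DynamicAttentionWrapper and DynamicAttentionWrapperState are
            # the pre-TF-1.2 names of AttentionWrapper / AttentionWrapperState.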

            self.dec_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(
                cell=self.decoder_cell,
                attention_mechanism=self.attn_mech,
                attention_size=self.dec_hidden_size,
                # attention_history=False (in ver 1.2)
                name='Attention_Wrapper')

            self.initial_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(
                cell_state=self.encoder_state,
                attention=_zero_state_tensors(self.dec_hidden_size,
                                              self.batch_size, tf.float32))

            self.output_layer = Dense(self.target_vocab_size + 2,
                                      name='output_projection')

            if forward_only:
                start_tokens = tf.tile(tf.constant([model_config.PAD_ID],
                                                   dtype=tf.int32),
                                       [self.batch_size],
                                       name='start_tokens')

                inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=self.dec_embedding_matrix,
                    start_tokens=start_tokens,
                    end_token=model_config.EOS_ID)

                inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=self.dec_cell,
                    helper=inference_helper,
                    initial_state=self.initial_state,
                    output_layer=self.output_layer)

                infer_dec_outputs, infer_dec_last_state = tf.contrib.seq2seq.dynamic_decode(
                    inference_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.target_vocab_size)

                # [batch_size x dec_sentence_length], tf.int32
                self.predictions = tf.identity(infer_dec_outputs.sample_id,
                                               name='predictions')
            else:
                # maximum unrollings in the current batch = max(dec_sent_len) + 1 (GO symbol)
                self.max_dec_len = tf.reduce_max(self.decoder_inputs_length +
                                                 1,
                                                 name='max_dec_len')

                self.training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=self.decoder_inputs_embedded,
                    sequence_length=self.decoder_inputs_length + 1,
                    time_major=False,
                    name='training_helper')

                self.training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=self.dec_cell,
                    helper=self.training_helper,
                    initial_state=self.initial_state,
                    output_layer=self.output_layer)

                self.decoder_outputs, self.decoder_state = tf.contrib.seq2seq.dynamic_decode(
                    self.training_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.max_dec_len)

                # logits: [batch_size x max_dec_len x dec_vocab_size+2]
                self.logits = tf.identity(self.decoder_outputs.rnn_output,
                                          name='logits')

                # targets: [batch_size x max_dec_len], tf.int32
                self.targets = tf.slice(self.decoder_inputs, [0, 0],
                                        [-1, self.max_dec_len], 'targets')

                # masks: [batch_size x max_dec_len]
                # => ignore outputs after `dec_sequence_length+1` when calculating loss
                self.masks = tf.sequence_mask(self.decoder_inputs_length + 1,
                                              self.max_dec_len,
                                              dtype=tf.float32,
                                              name='masks')

                # internal: `tf.nn.sparse_softmax_cross_entropy_with_logits`
                self.loss = tf.contrib.seq2seq.sequence_loss(
                    logits=self.logits,
                    targets=self.targets,
                    weights=self.masks,
                    name='batch_loss')
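
# A toy sketch of the GO/EOS shifting implied by `decoder_inputs_length + 1`
# above (token ids are illustrative):
GO, EOS = 0, 1
sent = [5, 6, 7]
dec_inputs = [GO] + sent        # what the decoder reads: [GO, w1, w2, w3]
dec_targets = sent + [EOS]      # what the loss compares against: [w1, w2, w3, EOS]
print(dec_inputs, dec_targets)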
Example #26
    def __init__(self,
                 vocab_size,
                 hidden_size,
                 dropout,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 lr_decay_factor,
                 max_target_length,
                 max_source_length,
                 decoder_mode=False):
        '''
        vocab_size: number of vocab tokens
        hidden_size: dimension of hidden layers
        dropout: input keep probability for dropout
        num_layers: number of hidden layers
        max_gradient_norm: maximum gradient magnitude
        batch_size: number of training examples fed to network at once
        learning_rate: starting learning rate of network
        lr_decay_factor: amount by which to decay learning rate
        max_target_length: maximum target sequence length
        max_source_length: maximum source sequence length
        decoder_mode: whether to build the beam-search inference graph instead of the training graph
        '''
        GO_ID = config.GO_ID
        EOS_ID = config.EOS_ID
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.global_step = tf.Variable(0, trainable=False)
        self.learning_rate = learning_rate
        self.encoder_inputs = tf.placeholder(shape=(None, None),
                                             dtype=tf.int32,
                                             name='encoder_inputs')
        self.source_lengths = tf.placeholder(shape=(None, ),
                                             dtype=tf.int32,
                                             name='source_lengths')

        self.decoder_targets = tf.placeholder(shape=(None, None),
                                              dtype=tf.int32,
                                              name='decoder_targets')
        self.target_lengths = tf.placeholder(shape=(None, ),
                                             dtype=tf.int32,
                                             name="target_lengths")

        with tf.variable_scope('embeddings') as scope:
            embeddings = tf.Variable(tf.random_uniform(
                [vocab_size, hidden_size], -1.0, 1.0),
                                     dtype=tf.float32)
            encoder_inputs_embedded = tf.nn.embedding_lookup(
                embeddings, self.encoder_inputs)
            targets_embedding = tf.nn.embedding_lookup(embeddings,
                                                       self.decoder_targets)

        with tf.variable_scope('encoder') as scope:

            # build a fresh cell per layer; reusing one cell object across
            # layers makes the layers share (or clash over) variables
            def make_encoder_cell():
                cell = rnn.LSTMCell(hidden_size)
                return rnn.DropoutWrapper(cell, input_keep_prob=dropout)

            encoder_cell = tf.nn.rnn_cell.MultiRNNCell(
                [make_encoder_cell() for _ in range(num_layers)],
                state_is_tuple=True)

            encoder_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=encoder_cell,
                cell_bw=encoder_cell,
                sequence_length=self.source_lengths,
                inputs=encoder_inputs_embedded,
                dtype=tf.float32,
                time_major=False)  #BiLSTM encoder
            encoder_output = encoder_outputs[0]
            encoder_outputs = tf.concat(encoder_outputs, 2)

        with tf.variable_scope('decoder') as scope:
            def make_decoder_cell():
                cell = rnn.LSTMCell(hidden_size)
                return rnn.DropoutWrapper(cell, input_keep_prob=dropout)

            decoder_cell = tf.nn.rnn_cell.MultiRNNCell(
                [make_decoder_cell() for _ in range(num_layers)],
                state_is_tuple=True)

            #TODO add attention
            #attention_mechanism= seq2seq.BahdanauAttention(num_units=hidden_size,memory=encoder_outputs)

            #decoder_cell = seq2seq.AttentionWrapper(cell=decoder_cell,
            #                                       attention_mechanism=)

            attn_mech = seq2seq.BahdanauAttention(
                num_units=hidden_size,  #depth of query mechanism
                memory=encoder_output,  #out of RNN hidden states
                memory_sequence_length=self.source_lengths,
                name='BahdanauAttention')
            attn_cell = seq2seq.AttentionWrapper(
                cell=decoder_cell,  #same as encoder
                attention_mechanism=attn_mech,
                attention_layer_size=hidden_size,  #depth of attention tensor
                name='attention_wrapper')  #attention layer

        if decoder_mode:
            beam_width = 1

            attn_zero = attn_cell.zero_state(batch_size=(batch_size *
                                                         beam_width),
                                             dtype=tf.float32)
            init_state = attn_zero.clone(cell_state=encoder_state)
            decoder = seq2seq.BeamSearchDecoder(
                cell=attn_cell,
                embedding=embeddings,
                start_tokens=tf.tile([GO_ID], [batch_size]),
                end_token=EOS_ID,
                initial_state=init_state,
                beam_width=beam_width,
                output_layer=Dense(vocab_size))  #BeamSearch in Decoder
            final_outputs, final_state, final_sequence_lengths =\
                            seq2seq.dynamic_decode(decoder=decoder)
            self.logits = final_outputs.predicted_ids
        else:
            helper = seq2seq.TrainingHelper(
                inputs=targets_embedding, sequence_length=self.target_lengths)
            decoder = seq2seq.BasicDecoder(
                cell=attn_cell,
                helper=helper,
                #initial_state=attn_cell.zero_state(batch_size, tf.float32),
                initial_state=attn_cell.zero_state(
                    batch_size, tf.float32).clone(cell_state=encoder_state[0]),
                output_layer=Dense(vocab_size))
            final_outputs, final_state, final_sequence_lengths =\
                            seq2seq.dynamic_decode(decoder=decoder)

            self.logits = final_outputs.rnn_output

        if not decoder_mode:
            with tf.variable_scope("loss") as scope:
                # dynamic_decode can stop early, so pad the logits to match
                # the targets' time dimension before computing the loss
                pad_size = self.max_target_length - tf.reduce_max(
                    final_sequence_lengths)
                self.logits = tf.pad(self.logits,
                                     [[0, 0], [0, pad_size], [0, 0]])

                weights = tf.sequence_mask(lengths=final_sequence_lengths,
                                           maxlen=self.max_target_length,
                                           dtype=tf.float32,
                                           name='weights')

                x_entropy_loss = seq2seq.sequence_loss(
                    logits=self.logits,
                    targets=self.decoder_targets,
                    weights=weights)  #cross-entropy loss function

                self.loss = tf.reduce_mean(x_entropy_loss)

            optimizer = tf.train.AdamOptimizer()  # Adam optimizer
            gradients = optimizer.compute_gradients(x_entropy_loss)
            # tf.clip_by_value fails on None gradients (variables not on the
            # loss path), so those pairs are passed through unchanged
            capped_grads = [(grad, var) if grad is None else
                            (tf.clip_by_value(grad, -max_gradient_norm,
                                              max_gradient_norm), var)
                            for grad, var in gradients]
            self.train_op = optimizer.apply_gradients(
                capped_grads, global_step=self.global_step)
            self.saver = tf.train.Saver(tf.global_variables())
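
# A note on the beam-search branch above: it works only because beam_width == 1.
# For beam_width > 1, every tensor feeding the attention wrapper must be repeated
# beam_width times along the batch axis with tile_batch, and zero_state must then
# be built with batch_size * beam_width. A minimal sketch of that recipe, using a
# hypothetical helper (not part of the original model) with the same tensor
# names as the code above:
def _tiled_beam_search_inputs(encoder_output, encoder_state, source_lengths,
                              hidden_size, beam_width):
    from tensorflow.contrib import seq2seq
    tiled_memory = seq2seq.tile_batch(encoder_output, multiplier=beam_width)
    tiled_lengths = seq2seq.tile_batch(source_lengths, multiplier=beam_width)
    tiled_state = seq2seq.tile_batch(encoder_state, multiplier=beam_width)
    attn_mech = seq2seq.BahdanauAttention(num_units=hidden_size,
                                          memory=tiled_memory,
                                          memory_sequence_length=tiled_lengths)
    return attn_mech, tiled_state
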
def dynamic_decode_test():

    vocab_size = 6
    SOS_token = 0
    EOS_token = 5
    
    x_data = np.array([[SOS_token, 3, 1, 4, 3, 2],[SOS_token, 3, 4, 2, 3, 1],[SOS_token, 1, 3, 2, 2, 1]], dtype=np.int32)
    y_data = np.array([[3, 1, 4, 3, 2,EOS_token],[3, 4, 2, 3, 1,EOS_token],[1, 3, 2, 2, 1,EOS_token]],dtype=np.int32)
    print("data shape: ", x_data.shape)
    sess = tf.InteractiveSession()
    
    output_dim = vocab_size
    batch_size = len(x_data)
    hidden_dim = 7
    num_layers = 2
    seq_length = x_data.shape[1]
    embedding_dim = 8
    state_tuple_mode = True
    init_state_flag = 0
    init = np.arange(vocab_size*embedding_dim).reshape(vocab_size,-1)
    
    train_mode = False
    with tf.variable_scope('test',reuse=tf.AUTO_REUSE) as scope:
        # Make rnn
        
        method = 1
        if method == 0:
            cells = []
            for _ in range(num_layers):
                cell = tf.contrib.rnn.BasicRNNCell(num_units=hidden_dim)
                #cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_dim,state_is_tuple=state_tuple_mode)
                #cell = tf.contrib.rnn.GRUCell(num_units=hidden_dim)  # requires init_state_flag == 0
                cells.append(cell)
            cell = tf.contrib.rnn.MultiRNNCell(cells)    
        else:
            #cell = tf.contrib.rnn.BasicRNNCell(num_units=hidden_dim)
            cell = tf.contrib.rnn.LSTMCell(num_units=hidden_dim,num_proj=7)
    
        embedding = tf.get_variable("embedding", initializer=init.astype(np.float32),dtype = tf.float32)
        inputs = tf.nn.embedding_lookup(embedding, x_data) # batch_size  x seq_length x embedding_dim
    
        Y = tf.convert_to_tensor(y_data)
    
    
        # tf.contrib.rnn.OutputProjectionWrapper adds one more FC layer at the end; it is applied before the Dense layer defined below. Since a Dense output layer is available, either OutputProjectionWrapper or Dense can handle the projection.
        # To stack multiple FC layers, OutputProjectionWrapper must be used.
        if False:
            cell = tf.contrib.rnn.OutputProjectionWrapper(cell,13,activation=tf.nn.relu)
            cell = tf.contrib.rnn.OutputProjectionWrapper(cell,17)
    
        if init_state_flag==0:
            initial_state = cell.zero_state(batch_size, tf.float32) # (batch_size x hidden_dim) per layer
        else:
            if state_tuple_mode:
                h0 = tf.random_normal([batch_size,hidden_dim]) #h0 = tf.cast(np.random.randn(batch_size,hidden_dim),tf.float32)
                # first layer: c=0, h=h0; every later layer: c=0, h=0
                initial_state=(tf.contrib.rnn.LSTMStateTuple(tf.zeros_like(h0), h0),) + (tf.contrib.rnn.LSTMStateTuple(tf.zeros_like(h0), tf.zeros_like(h0)),)*(num_layers-1)
                
            else:
                h0 = tf.random_normal([batch_size,hidden_dim]) #h0 = tf.cast(np.random.randn(batch_size,hidden_dim),tf.float32)
                initial_state = (tf.concat((tf.zeros_like(h0),h0), axis=1),) + (tf.concat((tf.zeros_like(h0),tf.zeros_like(h0)), axis=1),) * (num_layers-1)
        if train_mode:
            helper = tf.contrib.seq2seq.TrainingHelper(inputs, np.array([seq_length]*batch_size,dtype=np.int32))
            #helper = tf.contrib.seq2seq.TrainingHelper(inputs, np.array([[2],[4],[6]]).reshape(-1))
        else:
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding, start_tokens=tf.tile([SOS_token], [batch_size]), end_token=EOS_token)
    
        output_layer = Dense(output_dim, name='output_projection')
        #output_layer = None
        
        
        decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,helper=helper,initial_state=initial_state,output_layer=output_layer)    
        # Without maximum_iterations, inference falls into an infinite loop if the EOS token is never produced.
        # last_state contains one entry per layer (num_layers of them).
        outputs, last_state, last_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(decoder=decoder,output_time_major=False,impute_finished=True,maximum_iterations=10)
    
        weights = tf.ones(shape=[batch_size, seq_length])
        loss = tf.contrib.seq2seq.sequence_loss(logits=outputs.rnn_output, targets=Y, weights=weights)
    
    
        sess.run(tf.global_variables_initializer())
        print("initial_state: ", sess.run(initial_state))
        print("\n\noutputs: ",outputs)
        o = sess.run(outputs.rnn_output)  #batch_size, seq_length, outputs
        o2 = sess.run(tf.argmax(outputs.rnn_output,axis=-1))
        print("\n",o,o2) #batch_size, seq_length, outputs
    
        print("\n\nlast_state: ",last_state)
        print(sess.run(last_state)) # batch_size, hidden_dim
    
        print("\n\nlast_sequence_lengths: ",last_sequence_lengths)
        print(sess.run(last_sequence_lengths)) #  [seq_length]*batch_size    
        if output_layer is not None:
            print("kernel(weight)",sess.run(output_layer.trainable_weights[0]))  # kernel(weight)
            print("bias",sess.run(output_layer.trainable_weights[1]))  # bias
    
        if train_mode:
            p = sess.run(tf.nn.softmax(outputs.rnn_output)).reshape(-1,output_dim)   # (18, 6) = (batch_size * seq_length, vocab_size)
            print("loss: {:20.6f}".format(sess.run(loss)))
            print("manual cal. loss: {:0.6f} ".format(np.average(-np.log(p[np.arange(y_data.size),y_data.flatten()]))) )
Example #28
0
    def add_decoder(self):
        with tf.variable_scope('Decoder') as scope:
            with tf.device('/cpu:0'):
                self.dec_Wemb = tf.get_variable('embedding',
                                                initializer=tf.random_uniform([
                                                    self.dec_vocab_size + 2,
                                                    self.dec_emb_size
                                                ]),
                                                dtype=tf.float32)

            batch_size = tf.shape(self.enc_inputs)[0]

            dec_cell = self.cell(self.hidden_size)

            attn_mech = tf.contrib.seq2seq.LuongAttention(
                num_units=self.attn_size,
                memory=self.enc_outputs,
                memory_sequence_length=self.enc_sequence_length,
                name='LuongAttention')

            dec_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell=dec_cell,
                attention_mechanism=attn_mech,
                attention_layer_size=self.attn_size,
                name='Attention_Wrapper')

            initial_state = dec_cell.zero_state(
                dtype=tf.float32,
                batch_size=batch_size).clone(cell_state=self.enc_last_state)

            output_layer = Dense(self.dec_vocab_size + 2,
                                 name='output_projection')

            if self.mode == 'training':

                max_dec_len = tf.reduce_max(self.dec_sequence_length + 1,
                                            name='max_dec_len')

                dec_emb_inputs = tf.nn.embedding_lookup(self.dec_Wemb,
                                                        self.dec_inputs,
                                                        name='emb_inputs')

                training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=dec_emb_inputs,
                    sequence_length=self.dec_sequence_length + 1,
                    time_major=False,
                    name='training_helper')

                training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=dec_cell,
                    helper=training_helper,
                    initial_state=initial_state,
                    output_layer=output_layer)

                train_dec_outputs, train_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    training_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=max_dec_len)

                logits = tf.identity(train_dec_outputs.rnn_output,
                                     name='logits')

                targets = tf.slice(self.dec_inputs, [0, 0], [-1, max_dec_len],
                                   name='targets')

                masks = tf.sequence_mask(self.dec_sequence_length + 1,
                                         max_dec_len,
                                         dtype=tf.float32,
                                         name='masks')

                self.batch_loss = tf.contrib.seq2seq.sequence_loss(
                    logits=logits,
                    targets=targets,
                    weights=masks,
                    name='batch_loss')

                self.valid_predictions = tf.identity(
                    train_dec_outputs.sample_id, name='valid_preds')

            elif self.mode == 'inference':

                start_tokens = tf.tile(tf.constant([self.start_token],
                                                   dtype=tf.int32),
                                       [batch_size],
                                       name='start_tokens')

                inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                    embedding=self.dec_Wemb,
                    start_tokens=start_tokens,
                    end_token=self.end_token)

                inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=dec_cell,
                    helper=inference_helper,
                    initial_state=initial_state,
                    output_layer=output_layer)

                infer_dec_outputs, infer_dec_last_state, _ = tf.contrib.seq2seq.dynamic_decode(
                    inference_decoder,
                    output_time_major=False,
                    impute_finished=True,
                    maximum_iterations=self.dec_sentence_length)

                self.predictions = tf.identity(infer_dec_outputs.sample_id,
                                               name='predictions')
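
# The masks built with tf.sequence_mask above zero out every position past a
# target's true length, so padding never contributes to the batch loss. A tiny
# standalone illustration (the lengths [2, 4] are assumed purely for the demo):
import tensorflow as tf

with tf.Session() as sess:
    print(sess.run(tf.sequence_mask([2, 4], maxlen=4, dtype=tf.float32)))
    # [[1. 1. 0. 0.]
    #  [1. 1. 1. 1.]]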
Example #29
0
def _match_model_fn_v6(features, labels, mode, params):
    '''
    This version uses the original seq2seq, but adds an LSTM that merges the cause and word embedding tables.

    This version also uses the input embedding as the attention query.
    '''
    '''set parameters'''
    with tf.device('/gpu:0'), tf.variable_scope('model',
                                                reuse=tf.AUTO_REUSE) as scope:
        # set hyper parameters
        embedding_size = params['embedding_size']
        num_units = params['num_units']
        if mode == tf.estimator.ModeKeys.TRAIN:
            dropout_keep_prob = params['dropout_keep_prob']
        else:
            dropout_keep_prob = 1
        beam_width = params['beam_width']
        EOS = params['EOS']
        SOS = params['SOS']
        # set training parameters
        max_sequence_length = params['max_sequence_length']
        max_cause_length = params['max_cause_length']
        vocab_size = params['vocab_size']
        num_causes = EOS + 1
        '''process input and target'''
        # input layer
        input = tf.reshape(features['content'], [-1, max_sequence_length])
        batch_size = tf.shape(input)[0]
        input_length = tf.reshape(features['content_length'], [batch_size])
        cause_label = tf.reshape(labels['cause_label'],
                                 [batch_size, max_cause_length])
        cause_length = tf.reshape(labels['cause_length'], [batch_size])

        # necessary cast
        input = tf.cast(input, dtype=tf.int32)
        input_length = tf.cast(input_length, dtype=tf.int32)
        cause_label = tf.cast(cause_label, dtype=tf.int32)
        cause_length = tf.cast(cause_length, dtype=tf.int32)

        # word embedding layer
        embeddings_word = load_embedding(params['word2vec_model'], vocab_size,
                                         embedding_size)

        embedded_input = gen_array_ops.gather_v2(embeddings_word,
                                                 input,
                                                 axis=0)
        # cause-label embedding layer
        cause_encoder = CauseEncoder(word_embeddings=embeddings_word,
                                     params=params)
        embedded_cause = cause_encoder.apply(cause_label)

        # cause lookup table
        cause_table = tf.constant(params['cause_table'], dtype=tf.int32)
        encoder_output = encoders(embedded_input, input_length, params, mode)
        '''hierarchical multilabel decoder'''
        # build lstm cell with attention
        lstm = rnn.LayerNormBasicLSTMCell(num_units=num_units,
                                          reuse=tf.AUTO_REUSE,
                                          dropout_keep_prob=dropout_keep_prob)
        # lstm = rnn.DropoutWrapper(lstm, output_keep_prob=dropout_keep_prob)
        # the subtraction at the end of the line is an element-wise subtraction supported by tensorflow

        attention_mechanism = MyBahdanauAttention(
            num_units=embedding_size,
            memory=encoder_output.attention_values,
            memory_sequence_length=encoder_output.attention_values_length)
        initial_state = rnn.LSTMStateTuple(encoder_output.initial_state,
                                           encoder_output.initial_state)
        cell = MyAttentionWrapper_v2(lstm,
                                     attention_mechanism,
                                     sot=SOS,
                                     output_attention=False,
                                     name="MyAttentionWrapper")
        cell_state = cell.zero_state(dtype=tf.float32, batch_size=batch_size)
        cell_state = cell_state.clone(cell_state=initial_state,
                                      attention=encoder_output.final_state)

        # extra dense layer to project a rnn output into a classification
        project_dense = Dense(num_causes,
                              _reuse=tf.AUTO_REUSE,
                              _scope='project_dense_scope',
                              name='project_dense')

        # train_decoder
        train_helper = MyTrainingHelper(embedded_cause, cause_label,
                                        cause_length)
        train_decoder = MyBasicDecoder(cell,
                                       train_helper,
                                       cell_state,
                                       lookup_table=cause_table,
                                       output_layer=project_dense,
                                       hie=params['hie'])

        decoder_output_train, decoder_state_train, decoder_len_train = dynamic_decode(
            train_decoder,
            maximum_iterations=max_cause_length - 1,
            parallel_iterations=64,
            scope='decoder')

        # beam_width = 1
        tiled_memory_sequence_length = tile_batch(
            encoder_output.attention_values_length, multiplier=beam_width)
        tiled_memory = tile_batch(encoder_output.attention_values,
                                  multiplier=beam_width)
        tiled_encoder_output_initial_state = tile_batch(
            encoder_output.initial_state, multiplier=beam_width)
        tiled_initial_state = rnn.LSTMStateTuple(
            tiled_encoder_output_initial_state,
            tiled_encoder_output_initial_state)
        tiled_first_attention = tile_batch(encoder_output.final_state,
                                           multiplier=beam_width)

        attention_mechanism = MyBahdanauAttention(
            num_units=embedding_size,
            memory=tiled_memory,
            memory_sequence_length=tiled_memory_sequence_length)

        cell = MyAttentionWrapper_v2(lstm,
                                     attention_mechanism,
                                     sot=SOS,
                                     output_attention=False,
                                     name="MyAttentionWrapper")
        cell_state = cell.zero_state(dtype=tf.float32,
                                     batch_size=batch_size * beam_width)
        cell_state = cell_state.clone(cell_state=tiled_initial_state,
                                      attention=tiled_first_attention)
        infer_decoder = MyBeamSearchDecoder(cell,
                                            embedding=cause_encoder,
                                            sots=tf.fill([batch_size], SOS),
                                            start_tokens=tf.fill([batch_size],
                                                                 SOS),
                                            end_token=EOS,
                                            initial_state=cell_state,
                                            beam_width=beam_width,
                                            output_layer=project_dense,
                                            lookup_table=cause_table,
                                            length_penalty_weight=0.7,
                                            hie=params['hie'])

        cause_output_infer, cause_state_infer, cause_length_infer = dynamic_decode(
            infer_decoder,
            parallel_iterations=64,
            maximum_iterations=max_cause_length - 1,
            scope='decoder')

        # loss
        mask_for_cause = tf.sequence_mask(cause_length - 1,
                                          max_cause_length - 1,
                                          dtype=tf.float32)
        # loss = sequence_loss(logits=padded_train_output, targets=cause_label, weights=mask_for_cause, name='loss')
        tmp_padding = tf.pad(decoder_output_train.rnn_output,
                             [[0, 0],
                              [
                                  0, max_cause_length - 1 -
                                  tf.shape(decoder_output_train.rnn_output)[1]
                              ], [0, 0]],
                             constant_values=0)
        loss = _compute_loss(tmp_padding, cause_label, mask_for_cause,
                             batch_size)
        # predicted_ids: [batch_size, max_cause_length, beam_width]

        predicted_and_cause_ids = tf.transpose(
            cause_output_infer.predicted_ids,
            perm=[0, 2, 1],
            name='predicted_cause_ids')

        # for monitoring
        cause_label_expanded = tf.reshape(cause_label[:, 1:],
                                          [-1, 1, max_cause_length - 1])
        predicted_and_cause_ids = tf.pad(
            predicted_and_cause_ids,
            [[0, 0], [0, 0],
             [0, max_cause_length - 1 - tf.shape(predicted_and_cause_ids)[2]]],
            constant_values=EOS)
        predicted_and_cause_ids = tf.concat(
            [predicted_and_cause_ids, cause_label_expanded],
            axis=1,
            name='predicted_and_cause_ids')
        predicted_and_cause_ids = tf.reshape(
            predicted_and_cause_ids,
            [-1, beam_width + 1, max_cause_length - 1])
        predicted_and_cause_ids_train = tf.concat(
            [decoder_output_train.sample_id, cause_label[:, 1:]],
            axis=1,
            name='predicted_and_cause_ids_train')

        predictions = {
            'predicted_and_cause_ids': predicted_and_cause_ids,
        }
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=predictions)

        if mode == tf.estimator.ModeKeys.TRAIN:
            # warm_up_constant = params['warm_up_steps'] ** (-1.5)
            # embedding_constant = embedding_size ** (-0.5)
            # global_step = tf.to_float(tf.train.get_global_step())
            # learning_rate = tf.minimum(1 / tf.sqrt(global_step),
            #                            warm_up_constant * global_step) * embedding_constant
            # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.98, epsilon=1e-9)
            optimizer = tf.train.AdamOptimizer()
            # # train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
            # '''using gradient clipping'''
            # loss = tf.Print(loss, [loss, 'to be clear, this is the loss'])
            grads_and_vars = optimizer.compute_gradients(loss)
            clipped_gvs = [
                ele if ele[0] is None else
                (tf.clip_by_value(ele[0], -0.1, 0.1), ele[1])
                for ele in grads_and_vars
            ]
            train_op = optimizer.apply_gradients(
                clipped_gvs, global_step=tf.train.get_global_step())
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # predicted_cause_ids shape = [batch_size, cause_length]
        # cause_label = [batch_size, cause_length]
        #  select the predicted cause with the highest possibility
        # todo: evaluation
        # bi_predicted_cause_ids = binarizer(predicted_cause_ids[:, 0, :], num_causes)
        # bi_cause_label = binarizer(cause_label, num_causes)

        # todo: for now the evaluation work has to be done outside the estimator
        eval_metric_ops = {
            'predicted_and_cause_ids':
            tf.contrib.metrics.streaming_concat(predicted_and_cause_ids),
            # 'precision': tf.metrics.precision(bi_cause_label, bi_predicted_cause_ids),
            # 'recall': tf.metrics.recall(bi_cause_label, bi_predicted_cause_ids),
            # 'f1-score': f_score(bi_cause_label, bi_predicted_cause_ids),
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metric_ops)
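
# Both this example (tmp_padding above) and the first one pad the decoder's
# dynamic-length logits to a fixed maximum before applying a masked loss,
# because dynamic_decode stops at the longest sequence actually produced. A
# hedged restatement of the pattern (hypothetical helper; assumes logits of
# shape [batch, time, depth]):
import tensorflow as tf

def pad_time_axis(logits, max_len):
    pad = max_len - tf.shape(logits)[1]
    return tf.pad(logits, [[0, 0], [0, pad], [0, 0]])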
Example #30
0
    def build_model(self, grad_clip, is_train=1):
        data = tf.placeholder(tf.int32, shape=[1, None], name="input_id")
        train_data = tf.placeholder(tf.int32, shape=[1, None], name="train_id")
        train_label = tf.placeholder(tf.int32,
                                     shape=[1, None],
                                     name="train_label")
        z_0 = tf.placeholder(tf.float32, shape=[1],
                             name="prior_selection")  # 1 or 0

        decoder_input = tf.nn.embedding_lookup(self.embed, train_data)

        with tf.variable_scope("encoder"):
            encoder = self._get_simple_lstm(lstm_size, lstm_layer)
            words = tf.nn.embedding_lookup(self.embed, data)
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(encoder,
                                                           words,
                                                           dtype=tf.float32)

        # define the variational approximation
        epsilon = tf.placeholder(tf.float32, shape=[1], name="epsilon")
        with tf.variable_scope("encoder_approx"):
            mean_encode_layer_1 = Dense(1)  # mean for the N(1, 1) prior component
            #mean_encode_layer_2 = Dense(1) # mean for the N(-1, 1) prior component
            var_encode_layer = Dense(1)
        mean_approx_1 = mean_encode_layer_1(encoder_state[lstm_layer - 1][1])
        #mean_approx_2 = mean_encode_layer_2(encoder_state[0][1])
        var_approx = var_encode_layer(encoder_state[lstm_layer - 1][1])
        # p(Z) = z_0 * N(1, 1) + (1-z_0) * N(-1, 1)
        self.Z = (2 * z_0 - 1) * mean_approx_1 + epsilon * var_approx

        if is_train == 0:
            # do inference
            self.Z = tf.placeholder(tf.float32, shape=[1, 1], name="Z_input")
            self.start_tokens = tf.placeholder(tf.int32,
                                               shape=[1],
                                               name='start_tokens')
            self.end_tokens = tf.placeholder(tf.int32,
                                             shape=(),
                                             name="end_tokens")
            helper = GreedyEmbeddingHelper(self.embed, self.start_tokens,
                                           self.end_tokens)
        elif is_train == 1:
            self.decoder_seq_length = tf.placeholder(tf.int32,
                                                     shape=[None],
                                                     name='decoder_seq_length')
            '''
            NOTICE: since this is an auto-encoder, the input of the TrainingHelper
                  is the first n-1 words and the target is the last n-1 words.
                  Otherwise it would just be an identity transformation.
            '''
            # decoder_input shape: [1, sen_length, embed_dim]
            helper = TrainingHelper(decoder_input, self.decoder_seq_length)
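            # e.g. for a sentence [SOS, w1, w2, EOS]:
            #   decoder_input (train_data): [SOS, w1, w2]   -- first n-1 tokens
            #   train_label               : [w1, w2, EOS]   -- last n-1 tokens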

        with tf.variable_scope("decoder"):
            # decoder, use the latent variable to compute the new initial hidden state
            # and the cell state for the decoding lstm model.
            fc_rec = Dense(lstm_size)
            fc_rec2 = Dense(lstm_size)
            decoder_h = fc_rec(self.Z)
            decoder_c = fc_rec2(self.Z)
            fc_layer = Dense(self.shape[0])
            decoder_cell = self._get_simple_lstm(lstm_size, lstm_layer)
            d_i_s = tf.contrib.rnn.LSTMStateTuple(decoder_c, decoder_h)
            decoder = BasicDecoder(decoder_cell, helper, (d_i_s, ), fc_layer)

        logits, final_state, final_sequence_lengths = dynamic_decode(
            decoder, maximum_iterations=LENGTH)

        if is_train == 0:
            probs = tf.reshape(tf.nn.softmax(logits.rnn_output),
                               [-1, self.shape[0]])  # per-token distributions; output shouldn't contain SOS
            predict = tf.argmax(probs, axis=1)
            return predict, probs

        elif is_train == 1:
            # train
            targets = tf.reshape(train_label, [-1])
            logits_flatten = tf.reshape(logits.rnn_output, [-1, self.shape[0]])
            cross_ent = tf.losses.sparse_softmax_cross_entropy(
                targets, logits_flatten)
            #DL_loss = -0.5 * (2 * tf.log(var_approx) - z_0 * tf.square(mean_approx_1)
            #                - (1-z_0) * tf.square(mean_approx_2) + tf.square(var_approx)
            #                + z_0 * mean_approx_1 - (1-z_0) * mean_approx_2)
            DL_loss = -(0.5 *
                        (tf.log(tf.square(var_approx)) -
                         tf.square(mean_approx_1) - tf.square(var_approx)) +
                        mean_approx_1)
            loss = DL_loss + cross_ent  # negative ELBO
            tvars = tf.trainable_variables()
            grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars),
                                              grad_clip)
            optimizer = tf.train.AdamOptimizer(pretrain_lr)
            train_op = optimizer.apply_gradients(zip(
                grads, tvars))  # minimize the loss
            return train_op, loss, data, train_data, train_label, z_0, epsilon
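
# A note on DL_loss above: for the z_0 = 1 branch it matches the closed-form KL
# divergence between the approximate posterior N(mu, s^2) (mu = mean_approx_1,
# s = var_approx) and the prior component N(1, 1):
#   KL(N(mu, s^2) || N(1, 1)) = 0.5 * (s^2 + (mu - 1)^2 - 1 - log s^2)
#                             = -(0.5 * (log s^2 - mu^2 - s^2) + mu)
# A quick numpy sanity check of that identity (values chosen arbitrarily):
import numpy as np

mu, s = 0.3, 1.7
kl_closed = 0.5 * (s ** 2 + (mu - 1.0) ** 2 - 1.0 - np.log(s ** 2))
dl_loss = -(0.5 * (np.log(s ** 2) - mu ** 2 - s ** 2) + mu)
assert np.isclose(kl_closed, dl_loss)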