Exemplo n.º 1
0
 def build_decoder(self, encoder_outputs, encoder_final_state):
     """Assemble the full decoder graph.

     In 'train' mode this wires a TrainingHelper/BasicDecoder pair and
     computes ``self.loss``; otherwise it builds a beam-search inference
     decoder whose predictions land in ``self.decoder_pred_decode``.
     """
     with tf.variable_scope("decode"):
         decoder_cell, decoder_initial_state = self.build_decoder_cell(
             encoder_outputs, encoder_final_state, self.hidden_size,
             self.cell_type, self.layer_size)
         # Projection from cell outputs to vocabulary logits.
         output_projection = layers.Dense(
             self.decoder_vocab_size,
             dtype=tf.float32,
             use_bias=False,
             kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                                stddev=0.1),
             name='decoder_output_projection')
         if self.mode == 'train':
             # Teacher-forced training branch.
             embedded_targets = tf.nn.embedding_lookup(
                 self.decoder_embeddings, self.decoder_inputs_train)
             helper = TrainingHelper(
                 inputs=embedded_targets,
                 sequence_length=self.decoder_inputs_length,
                 name='training_helper')
             train_decoder = BasicDecoder(decoder_cell, helper,
                                          decoder_initial_state,
                                          output_projection)
             max_len = tf.reduce_max(self.decoder_inputs_length)
             train_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                 train_decoder, maximum_iterations=max_len)
             # Padded steps get zero weight in the loss via this mask.
             self.masks = tf.sequence_mask(self.decoder_inputs_length,
                                           maxlen=max_len,
                                           dtype=tf.float32,
                                           name='masks')
             self.loss = tf.contrib.seq2seq.sequence_loss(
                 logits=train_output.rnn_output,
                 targets=self.decoder_inputs,
                 weights=self.masks,
                 average_across_timesteps=True,
                 average_across_batch=True)
         else:
             # Beam-search inference branch.
             infer_decoder = BeamSearchDecoder(
                 cell=decoder_cell,
                 embedding=lambda ids: tf.nn.embedding_lookup(
                     self.decoder_embeddings, ids),
                 start_tokens=[DataUnit.START_INDEX] * self.batch_size,
                 end_token=DataUnit.END_INDEX,
                 initial_state=decoder_initial_state,
                 beam_width=self.beam_width,
                 output_layer=output_projection)
             infer_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                 infer_decoder, maximum_iterations=self.max_decode_step)
             # predicted_ids is [batch, time, beam]; expose [batch, beam, time].
             self.decoder_pred_decode = tf.transpose(
                 infer_output.predicted_ids, perm=[0, 2, 1])
Exemplo n.º 2
0
def decoder(x, decoder_inputs, keep_prob, sequence_length, memory,
            memory_length, first_attention):
    """Build a training decoder and a tiled beam-search inference decoder.

    Both paths share one layer-normalised LSTM cell and one output
    projection; the inference path tiles every memory tensor by
    ``beam_width`` so each beam sees its own copy.
    """
    with tf.variable_scope("Decoder") as scope:
        label_embeddings = tf.get_variable(name="embeddings",
                                           shape=[n_classes, embedding_size],
                                           dtype=tf.float32)
        embedded_inputs = tf.nn.embedding_lookup(label_embeddings,
                                                 decoder_inputs)
        lstm_cell = rnn.LayerNormBasicLSTMCell(n_hidden,
                                               dropout_keep_prob=keep_prob)
        projection = layers_core.Dense(n_classes, use_bias=True)

        # ---- training path ----
        train_encoder_state = rnn.LSTMStateTuple(x, x)
        train_attention = BahdanauAttention(
            embedding_size,
            memory=memory,
            memory_sequence_length=memory_length)
        train_cell = AttentionWrapper(lstm_cell,
                                      train_attention,
                                      output_attention=False)
        train_state = train_cell.zero_state(dtype=tf.float32,
                                            batch_size=train_batch_size)
        # Seed the attention cell with the encoder state and attention.
        train_state = train_state.clone(cell_state=train_encoder_state,
                                        attention=first_attention)
        helper = TrainingHelper(embedded_inputs, sequence_length)
        train_decoder = BasicDecoder(train_cell,
                                     helper,
                                     train_state,
                                     output_layer=projection)
        decoder_outputs_train, decoder_state_train, decoder_seq_train = dynamic_decode(
            train_decoder, impute_finished=True)

        # ---- beam-search inference path ----
        tiled_memory = tile_batch(memory, multiplier=beam_width)
        tiled_memory_length = tile_batch(memory_length,
                                         multiplier=beam_width)
        tiled_attention0 = tile_batch(first_attention,
                                      multiplier=beam_width)
        infer_attention = BahdanauAttention(
            embedding_size,
            memory=tiled_memory,
            memory_sequence_length=tiled_memory_length)
        tiled_x = tile_batch(x, beam_width)
        infer_encoder_state = rnn.LSTMStateTuple(tiled_x, tiled_x)
        infer_cell = AttentionWrapper(lstm_cell,
                                      infer_attention,
                                      output_attention=False)
        infer_state = infer_cell.zero_state(
            dtype=tf.float32, batch_size=test_batch_size * beam_width)
        infer_state = infer_state.clone(cell_state=infer_encoder_state,
                                        attention=tiled_attention0)
        infer_decoder = BeamSearchDecoder(infer_cell,
                                          embedding=label_embeddings,
                                          start_tokens=[GO] * test_len,
                                          end_token=EOS,
                                          initial_state=infer_state,
                                          beam_width=beam_width,
                                          output_layer=projection)
        decoder_outputs_infer, decoder_state_infer, decoder_seq_infer = dynamic_decode(
            infer_decoder, maximum_iterations=4)
        return decoder_outputs_train, decoder_outputs_infer, decoder_state_infer
Exemplo n.º 3
0
    def build_validation_graph(self, validation_data, beam_width=1):
        """Mirror the training graph on held-out data and score its output.

        :param validation_data: tuple of (subject, len_subject, content,
            len_content, target_input, target_output, len_target)
        :param beam_width: beam width parameter
        :return: (avg_score, greedy_score, extreme_score, val_seq_len,
            validation_outputs)
        """
        (subject, len_subject, content, len_content,
         target_input, target_output, len_target) = validation_data

        # Keep only the best reference answer per question.
        target_output = target_output[:, 0, :]
        len_target = tf.reshape(tf.to_int32(len_target[:, 0]), [-1])

        # The encoder consumes subject and content concatenated along time.
        concat_inputs = tf.map_fn(
            self.concat_seqs, [subject, len_subject, content, len_content])[0]
        concat_lengths = len_subject + len_content

        # Reuse the training variables when building the decoder cell and
        # attention zero state.
        decoder_cell, attn_zero_state = self.build_initial_graph(
            concat_inputs, concat_lengths, reuse=True, beam_width=beam_width)

        decoder = BeamSearchDecoder(decoder_cell,
                                    self.embeddings_english,
                                    self.start_token,
                                    self.end_token,
                                    attn_zero_state,
                                    beam_width,
                                    output_layer=self.projection_layer)

        # Same scope as the train decoder so dynamic_decode reuses variables.
        with tf.variable_scope("train_decoder", reuse=True):
            outputs, _, val_seq_len = dynamic_decode(
                decoder,
                output_time_major=False,
                maximum_iterations=self.max_seq_len)

        # predicted_ids is [batch, time, beam]; keep only the top beam.
        beams_first = tf.transpose(outputs.predicted_ids, [2, 0, 1])
        validation_outputs = tf.reshape(beams_first[0, :, :],
                                        [self.batch_size, -1])
        val_seq_len = tf.transpose(val_seq_len)
        val_seq_len = tf.reshape(val_seq_len[0, :], [-1])

        # Score predictions against the reference answers.
        avg_score, greedy_score, extreme_score = self.metrics_module(
            validation_outputs, val_seq_len, target_output, len_target)

        return avg_score, greedy_score, extreme_score, val_seq_len, validation_outputs
Exemplo n.º 4
0
    def build_infer_graph(self, beam_width=1, reuse=False):
        """Create placeholders and a beam-search decoder for unseen data.

        :param beam_width: beam width for the search decoder
        :param reuse: whether to reuse variables from an existing graph
        :return: ([subject_ph, content_ph, len_subject_ph, len_content_ph],
            beam_outputs) where beam_outputs is [beam, batch, time]
        """
        # Feed points for the encoder inputs and their lengths.
        subject_ph = tf.placeholder(shape=(self.batch_size, None),
                                    dtype=tf.int32,
                                    name='subject')
        content_ph = tf.placeholder(shape=(self.batch_size, None),
                                    dtype=tf.int32,
                                    name='content')
        len_subject_ph = tf.placeholder(shape=(None, ),
                                        dtype=tf.int32,
                                        name='sub_len')
        len_content_ph = tf.placeholder(shape=(None, ),
                                        dtype=tf.int32,
                                        name='cont_len')

        # The encoder consumes subject and content concatenated along time;
        # lengths simply add up.
        concat_inputs = tf.map_fn(
            self.concat_seqs,
            [subject_ph, len_subject_ph, content_ph, len_content_ph])[0]
        concat_lengths = len_subject_ph + len_content_ph

        decoder_cell, attn_zero_state = self.build_initial_graph(
            concat_inputs, concat_lengths, beam_width=beam_width, reuse=reuse)

        decoder = BeamSearchDecoder(decoder_cell,
                                    self.embeddings_english,
                                    self.start_token,
                                    self.end_token,
                                    attn_zero_state,
                                    beam_width,
                                    output_layer=self.projection_layer)

        # Same scope name as the train decoder so variables can be shared.
        with tf.variable_scope("train_decoder", reuse=reuse):
            outputs, _, _ = dynamic_decode(decoder,
                                           output_time_major=False,
                                           maximum_iterations=self.max_seq_len)

        # Reorder predicted_ids from [batch, time, beam] to [beam, batch, time].
        beam_outputs = tf.transpose(outputs.predicted_ids, [2, 0, 1])

        return [subject_ph, content_ph, len_subject_ph,
                len_content_ph], beam_outputs
Exemplo n.º 5
0
 def setup_decoder(self):
     """Create the beam-search decoder and its zero initial state."""
     # The initial state must cover batch_size * beam_width tiled beams.
     self.dec_init_state = self.cell.zero_state(
         self.batch_size * self.beam_width, dtype=tf.float32)
     # NOTE(review): end_token=-1 never matches a vocabulary id, so decoding
     # only stops at maximum_iterations — confirm this is intended.
     self.decoder = BeamSearchDecoder(
         cell=self.cell,
         embedding=self.embedding,
         start_tokens=tf.tile([0], [self.batch_size]),
         end_token=-1,
         initial_state=self.dec_init_state,
         beam_width=self.beam_width,
         output_layer=tf.layers.Dense(self.vocab_size))
Exemplo n.º 6
0
 def decoder(self, encoder_outputs, encoder_states):
     """Build the decoder: training loss or beam-search prediction.

     In 'train' mode a TrainingHelper/BasicDecoder pair produces logits and
     ``self.loss``; otherwise a BeamSearchDecoder fills
     ``self.decoder_output`` with ids of shape [batch, beam, time].
     """
     decoder_cell, decoder_init_state = self.add_decoder_cell(
         encoder_outputs, encoder_states, self.hidden_size, self.cell_type,
         self.num_layers)
     # Projects cell outputs to target-vocabulary logits.
     output_proj = tf.layers.Dense(
         self.tgt_vcb_size,
         dtype=tf.float32,
         use_bias=False,
         kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
         name='output_proj')
     if self.mode == 'train':
         # Teacher-forced training branch.
         embedded_targets = tf.nn.embedding_lookup(self.decoder_embeddings,
                                                   self.decoder_input_train)
         helper = TrainingHelper(embedded_targets,
                                 self.target_len,
                                 name='training_helper')
         train_decoder = BasicDecoder(decoder_cell, helper,
                                      decoder_init_state, output_proj)
         max_dec_len = tf.reduce_max(self.target_len)
         output, _, _ = tf.contrib.seq2seq.dynamic_decode(
             train_decoder, maximum_iterations=max_dec_len)
         # Mask so padded steps contribute nothing to the loss.
         self.d_masks = tf.sequence_mask(self.target_len,
                                         max_dec_len,
                                         dtype=tf.float32,
                                         name='d_masks')
         self.prob = output.rnn_output
         self.loss = tf.contrib.seq2seq.sequence_loss(
             logits=self.prob,
             targets=self.target,
             weights=self.d_masks,
             average_across_timesteps=True,
             average_across_batch=True)
     else:
         # Beam-search inference branch.
         infer_decoder = BeamSearchDecoder(
             cell=decoder_cell,
             embedding=lambda ids: tf.nn.embedding_lookup(
                 self.decoder_embeddings, ids),
             start_tokens=[DataUnit.START_INDEX] * self.batch_size,
             end_token=DataUnit.END_INDEX,
             initial_state=decoder_init_state,
             beam_width=self.beam_size,
             output_layer=output_proj)
         output, _, _ = tf.contrib.seq2seq.dynamic_decode(
             infer_decoder, maximum_iterations=self.max_decode_step)
         # predicted_ids: [batch, time, beam] -> [batch, beam, time].
         self.decoder_output = tf.transpose(output.predicted_ids,
                                            perm=[0, 2, 1])
def inference_decode(enc_outputs, seq_len, embeddings, out_dim):
    """Assemble a beam-search decoding graph over tiled encoder outputs.

    Returns the raw outputs structure produced by ``dynamic_decode``.
    """
    # Each beam gets its own copy of the encoder memory and lengths.
    tiled_memory = tf.contrib.seq2seq.tile_batch(enc_outputs, hp.beam_width)
    tiled_lengths = tf.contrib.seq2seq.tile_batch(seq_len, hp.beam_width)

    tiled_batch = tf.shape(tiled_memory)[0]
    # One start token per (untiled) batch element.
    start_tokens = tf.tile([hp.START_TOKEN],
                           [tiled_batch // hp.beam_width])

    prenet_cell = DecoderPrenetWrapper(GRUCell(hp.embed_size),
                                       is_training=False,
                                       prenet_sizes=hp.embed_size,
                                       dropout_prob=hp.dropout)
    attention = BahdanauAttention(
        hp.embed_size,
        tiled_memory,
        normalize=True,
        memory_sequence_length=tiled_lengths,
        probability_fn=tf.nn.softmax)
    attn_cell = AttentionWrapper(prenet_cell,
                                 attention,
                                 alignment_history=True,
                                 output_attention=False)
    concat_cell = ConcatOutputAndAttentionWrapper(attn_cell)
    # Stack: projection to embed size, then two residual GRU layers.
    stacked_cell = MultiRNNCell([
        OutputProjectionWrapper(concat_cell, hp.embed_size),
        ResidualWrapper(GRUCell(hp.embed_size)),
        ResidualWrapper(GRUCell(hp.embed_size)),
    ], state_is_tuple=True)

    output_cell = OutputProjectionWrapper(stacked_cell, out_dim)
    initial_state = output_cell.zero_state(batch_size=tiled_batch,
                                           dtype=tf.float32)

    decoder = BeamSearchDecoder(cell=output_cell,
                                embedding=embeddings,
                                start_tokens=start_tokens,
                                end_token=hp.END_TOKEN,
                                initial_state=initial_state,
                                beam_width=hp.beam_width)
    outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder, maximum_iterations=hp.max_len)
    return outputs
Exemplo n.º 8
0
 def _build_infer(self, config):
   """Build the beam-search inference decoder.

   Tiles the encoder memory, sequence lengths and initial attention by
   ``config.beam_width``, wraps the shared LSTM in Bahdanau attention,
   decodes with BeamSearchDecoder, and stores predicted ids in
   ``self.preds`` and their beam log-probabilities in ``self.scores``.
   Skipped for the flat baseline model.
   """
   # Every memory tensor is tiled so each beam sees its own copy.
   tiled_inputs = tile_batch(self.xx_context, multiplier=config.beam_width)
   tiled_sequence_length = tile_batch(self.x_seq_length, multiplier=config.beam_width)
   tiled_first_attention = tile_batch(self.first_attention, multiplier=config.beam_width)
   attention_mechanism = BahdanauAttention(config.decode_size, memory=tiled_inputs, memory_sequence_length=tiled_sequence_length)
   tiled_xx_final = tile_batch(self.xx_final, config.beam_width)
   encoder_state2 = rnn.LSTMStateTuple(tiled_xx_final, tiled_xx_final)
   cell = AttentionWrapper(self.lstm, attention_mechanism, output_attention=False)
   # Zero state sized for batch * beam, then seeded with the tiled encoder
   # state and the tiled initial attention.
   cell_state = cell.zero_state(dtype=tf.float32, batch_size = config.test_batch_size * config.beam_width)
   cell_state = cell_state.clone(cell_state=encoder_state2, attention=tiled_first_attention)
   infer_decoder = BeamSearchDecoder(cell, embedding=self.label_embeddings, start_tokens=[config.GO]*config.test_batch_size, end_token=config.EOS,
                                 initial_state=cell_state, beam_width=config.beam_width, output_layer=self.output_l)
   decoder_outputs_infer, decoder_state_infer, decoder_seq_infer = dynamic_decode(infer_decoder, maximum_iterations=config.max_seq_length)
   self.preds = decoder_outputs_infer.predicted_ids
   self.scores = decoder_state_infer.log_probs
Exemplo n.º 9
0
    def decoder_decode(self, decoder_cell, decoder_initial_state,
                       output_layer):
        """Run inference decoding, with or without beam search.

        Returns predicted token ids shaped [batch, steps, beam] when beam
        search is enabled, else [batch, steps, 1].
        """
        # Every sequence starts with <GO> and terminates at <EOS>.
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.word_to_idx['<GO>']
        end_token = self.word_to_idx['<EOS>']

        if self.beam_search:
            # Beam search explores beam_size hypotheses in parallel.
            inference_decoder = BeamSearchDecoder(
                cell=decoder_cell,
                embedding=self.embedding,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=decoder_initial_state,
                beam_width=self.beam_size,
                output_layer=output_layer)
        else:
            # Greedy decoding: feed the argmax token back as the next input.
            greedy_helper = GreedyEmbeddingHelper(embedding=self.embedding,
                                                  start_tokens=start_tokens,
                                                  end_token=end_token)
            inference_decoder = BasicDecoder(
                cell=decoder_cell,
                helper=greedy_helper,
                initial_state=decoder_initial_state,
                output_layer=output_layer)

        # Decoding stops at <EOS> or after 50 steps, whichever comes first.
        decoder_outputs, _, _ = dynamic_decode(decoder=inference_decoder,
                                               maximum_iterations=50)
        if self.beam_search:
            # predicted_ids already carries a trailing beam dimension.
            decoder_predict_decode = decoder_outputs.predicted_ids
        else:
            # Append a beam axis of size 1 so both branches share one rank.
            decoder_predict_decode = tf.expand_dims(decoder_outputs.sample_id,
                                                    -1)
        return decoder_predict_decode
Exemplo n.º 10
0
    def build_predict_decoder(self):
        """Build the inference decoder and return predicted token ids.

        Uses beam search when ``self.beam_search`` is set, otherwise greedy
        decoding; output is [batch, steps, beam] or [batch, steps, 1].
        """
        # <GO> seeds every sequence; <EOS> ends decoding.
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.word_to_id['<GO>']
        end_token = self.word_to_id['<EOS>']

        decoder_cell, initial_state = self.build_decoder_cell()
        # Vocabulary projection applied at every decoder step.
        output_layer = tf.layers.Dense(
            self.vocab_size,
            kernel_initializer=tf.truncated_normal_initializer(mean=0.0,
                                                               stddev=0.1))

        if self.beam_search:
            inference_decoder = BeamSearchDecoder(
                cell=decoder_cell,
                embedding=self.embedding,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=initial_state,
                beam_width=self.beam_size,
                output_layer=output_layer)
        else:
            greedy_helper = GreedyEmbeddingHelper(embedding=self.embedding,
                                                  start_tokens=start_tokens,
                                                  end_token=end_token)
            inference_decoder = BasicDecoder(
                cell=decoder_cell,
                helper=greedy_helper,
                initial_state=initial_state,
                output_layer=output_layer)

        decoder_outputs, _, _ = dynamic_decode(decoder=inference_decoder,
                                               maximum_iterations=50)

        if self.beam_search:
            decoder_predict_decode = decoder_outputs.predicted_ids
        else:
            # Append a beam axis so both branches return the same rank.
            decoder_predict_decode = tf.expand_dims(decoder_outputs.sample_id,
                                                    -1)

        return decoder_predict_decode
Exemplo n.º 11
0
    def _decode():
        """Run beam-search decoding; return outputs, final state, lengths."""
        # `maximum_iterations` is owned by this wrapper; callers must use
        # `max_decoding_length` instead.
        if 'maximum_iterations' in kwargs:
            raise ValueError('Use `max_decoding_length` to set the maximum '
                             'allowed number of decoding steps.')

        beam_decoder = BeamSearchDecoder(
            cell=cell,
            embedding=embedding,
            start_tokens=start_tokens,
            end_token=end_token,
            initial_state=initial_state,
            beam_width=beam_width,
            output_layer=output_layer,
            length_penalty_weight=length_penalty_weight)

        outputs, final_state, _ = dynamic_decode(
            decoder=beam_decoder,
            output_time_major=output_time_major,
            maximum_iterations=max_decoding_length,
            **kwargs)

        return outputs, final_state, final_state.lengths
Exemplo n.º 12
0
    def build_predict_decoder(self):
        """Construct the inference decoder and store its predictions.

        Sets ``self.decoder_predict_decode`` to predicted token ids:
        [batch, steps, beam] with beam search, else [batch, steps, 1].
        """
        print('Building predict decoder...')

        # <GO> seeds every sequence; <EOS> ends decoding.
        start_tokens = tf.ones([self.batch_size, ], tf.int32) * self.word_to_id['<GO>']
        end_token = self.word_to_id['<EOS>']

        if self.beam_search:
            inference_decoder = BeamSearchDecoder(
                cell=self.decoder_cell,
                embedding=self.embedding,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=self.decoder_initial_state,
                beam_width=self.beam_size,
                output_layer=self.output_layer
            )
        else:
            # GreedyEmbeddingHelper treats each step's output as logits,
            # takes the argmax and embeds it as the next decoder input.
            greedy_helper = GreedyEmbeddingHelper(
                embedding=self.embedding,
                start_tokens=start_tokens,
                end_token=end_token
            )
            inference_decoder = BasicDecoder(
                cell=self.decoder_cell,
                helper=greedy_helper,
                initial_state=self.decoder_initial_state,
                output_layer=self.output_layer
            )

        decoder_outputs, _, _ = dynamic_decode(decoder=inference_decoder, maximum_iterations=50)

        if self.beam_search:
            # predicted_ids: [batch, steps, beam], beams ordered best-first.
            self.decoder_predict_decode = decoder_outputs.predicted_ids
        else:
            # Append a beam axis so shapes match the beam-search branch.
            self.decoder_predict_decode = tf.expand_dims(decoder_outputs.sample_id, -1)
Exemplo n.º 13
0
 def interface_beamsearch(self, enc_outputs, enc_state, e_size):
     """Build and return a BeamSearchDecoder over an attention-wrapped cell.

     Tiles the encoder outputs, the (possibly nested) encoder state and
     the sequence lengths by ``self.beam_width`` before wiring attention.
     """
     beam_width = self.beam_width
     # Tile every encoder tensor so each beam has its own memory copy.
     enc_outputs = tf.contrib.seq2seq.tile_batch(enc_outputs,
                                                 multiplier=self.beam_width)
     enc_state = nest.map_structure(
         lambda s: tf.contrib.seq2seq.tile_batch(s, self.beam_width),
         enc_state)
     e_size = tf.contrib.seq2seq.tile_batch(e_size,
                                            multiplier=self.beam_width)
     attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
         self.embedding_dim, enc_outputs, memory_sequence_length=e_size)
     attention_cell = tf.contrib.seq2seq.AttentionWrapper(
         self.dec_cell,
         attention_mechanism,
         attention_layer_size=self.embedding_dim,
         cell_input_fn=self.cell_input_fn)
     # Zero state sized for batch * beam, seeded with the tiled encoder state.
     init_state = attention_cell.zero_state(
         self.dec_batch_size * self.beam_width,
         tf.float32).clone(cell_state=enc_state)
     return BeamSearchDecoder(attention_cell, self.decoder_embedding,
                              self.dec_batch_inputs, self.end, init_state,
                              beam_width, self.dense_before_softmax)
Exemplo n.º 14
0
    def build_predict_decoder(self):
        """Build the inference decoder (beam search or greedy).

        Stores predicted token ids in ``self.decoder_predict_decode``.
        """
        print('Building predict decoder...')

        # <GO> seeds every sequence; <EOS> ends decoding.
        start_tokens = tf.ones([
            self.batch_size,
        ], tf.int32) * self.word_to_id['<GO>']
        end_token = self.word_to_id['<EOS>']

        if self.beam_search:
            inference_decoder = BeamSearchDecoder(
                cell=self.decoder_cell,
                embedding=self.embedding_t,
                start_tokens=start_tokens,
                end_token=end_token,
                initial_state=self.decoder_initial_state,
                beam_width=self.beam_size,
                output_layer=self.output_layer)
        else:
            # Greedy decoding: feed the argmax token back as next input.
            greedy_helper = GreedyEmbeddingHelper(embedding=self.embedding_t,
                                                  start_tokens=start_tokens,
                                                  end_token=end_token)
            inference_decoder = BasicDecoder(
                cell=self.decoder_cell,
                helper=greedy_helper,
                initial_state=self.decoder_initial_state,
                output_layer=self.output_layer)

        decoder_outputs, _, _ = dynamic_decode(decoder=inference_decoder,
                                               maximum_iterations=50)

        if self.beam_search:
            self.decoder_predict_decode = decoder_outputs.predicted_ids
        else:
            # Append a beam axis so shapes match the beam-search branch.
            self.decoder_predict_decode = tf.expand_dims(
                decoder_outputs.sample_id, -1)
Exemplo n.º 15
0
    def __init__(self,
                 vocab_size,
                 learning_rate,
                 encoder_size,
                 max_length,
                 embedding_size,
                 sos_token,
                 eos_token,
                 unk_token,
                 beam_size=5):
        self.vocab_size = vocab_size
        self.lr = learning_rate
        self.encoder_size = encoder_size
        self.max_length = max_length
        self.embedding_size = embedding_size
        self.SOS_token = sos_token
        self.EOS_token = eos_token
        self.UNK_token = unk_token
        self.beam_search_size = beam_size
        with tf.variable_scope('placeholder_and_embedding'):
            self.query = tf.placeholder(shape=(None, None), dtype=tf.int32)
            self.query_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
            self.reply = tf.placeholder(shape=(None, None), dtype=tf.int32)
            self.reply_length = tf.placeholder(shape=(None, ), dtype=tf.int32)
            self.decoder_inputs = tf.placeholder(shape=(None, None),
                                                 dtype=tf.int32)
            self.decoder_target = tf.placeholder(shape=(None, None),
                                                 dtype=tf.int32)
            self.decoder_length = tf.placeholder(shape=(None, ),
                                                 dtype=tf.int32)
            self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)
            self.embedding_pl = tf.placeholder(dtype=tf.float32,
                                               shape=(self.vocab_size,
                                                      embedding_size),
                                               name='embedding_source_pl')
            word_embedding = tf.get_variable(name='word_embedding',
                                             shape=(self.vocab_size,
                                                    embedding_size),
                                             dtype=tf.float32,
                                             trainable=True)
            self.init_embedding = word_embedding.assign(self.embedding_pl)
            self.max_target_sequence_length = tf.reduce_max(
                self.decoder_length, name='max_target_len')
            self.mask = tf.sequence_mask(self.decoder_length,
                                         self.max_target_sequence_length,
                                         dtype=tf.float32,
                                         name='masks')

        with tf.variable_scope("query_encoder"):
            self.query_encoder = deep_components.gru_encoder(
                word_embedding, self.encoder_size)
            query_out, query_state = self.query_encoder(
                seq_index=self.query, seq_len=self.query_length)
        with tf.variable_scope("reply_encoder"):
            self.reply_encoder = deep_components.gru_encoder(
                word_embedding, self.encoder_size)
            reply_out, reply_state = self.reply_encoder(
                seq_index=self.reply, seq_len=self.reply_length)
        with tf.variable_scope("decoder"):
            combined_encoder_state = tf.concat([query_state, reply_state],
                                               axis=1)
            tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
                combined_encoder_state, multiplier=self.beam_search_size)
            tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
                query_out, multiplier=self.beam_search_size)
            tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
                self.query_length, multiplier=self.beam_search_size)
            decoder_cell = deep_components.AttentionGRUCell(
                memory=tiled_encoder_outputs,
                memory_size=self.encoder_size,
                attention_size=self.encoder_size,
                embedding_dims=self.embedding_size,
                rnn_units=self.encoder_size * 2)
            '''decoder_gru = GRUCell(self.encoder_size * 2)
            attention_mechanism = BahdanauAttention(
                num_units=self.encoder_size,
                memory=tiled_encoder_outputs,
                memory_sequence_length=tiled_sequence_length)
            attention_cell = AttentionWrapper(decoder_gru, attention_mechanism,
                                              attention_layer_size=self.encoder_size)
            decoder_initial_state_beam = attention_cell.zero_state(
                dtype=tf.float32, batch_size=tf.cast(self.batch_size * self.beam_search_size,dtype=tf.int32)).clone(
                cell_state=tiled_encoder_final_state)'''
            #############################
            #attention_cell=decoder_gru
            #decoder_initial_state_beam = tiled_encoder_final_state
            ##############################
            decode_out_layer = tf.layers.Dense(self.vocab_size,
                                               name='output_layer',
                                               _reuse=tf.AUTO_REUSE)
        with tf.variable_scope("seq2seq-train"):
            # train
            self.tiled_d_in = tile_batch(self.decoder_inputs,
                                         multiplier=self.beam_search_size)
            self.tiled_d_tgt = tile_batch(self.decoder_target,
                                          multiplier=self.beam_search_size)
            train_helper = TrainingHelper(
                tf.contrib.seq2seq.tile_batch(
                    tf.nn.embedding_lookup(word_embedding,
                                           self.decoder_inputs),
                    multiplier=self.beam_search_size),
                sequence_length=tile_batch(self.decoder_length,
                                           multiplier=self.beam_search_size),
                name="train_helper")
            train_decoder = BasicDecoder(
                decoder_cell,
                train_helper,
                initial_state=tiled_encoder_final_state,
                output_layer=decode_out_layer)
            self.dec_output, _, self.gen_len = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=self.max_target_sequence_length)
            #self.gen_max_len=tf.reduce_max(self.gen_len)
            #self.padding=tf.zeros(shape=(self.batch_size,self.max_length-self.gen_max_len,self.vocab_size),dtype=tf.float32)
            #self.padding=tile_batch(self.padding,multiplier=self.beam_search_size)
            self.dec_logits = tf.identity(self.dec_output.rnn_output)
            #self.dec_logits = tf.concat((self.dec_logits,self.padding),axis=1)
            self.decoder_target_mask = tile_batch(
                self.mask, multiplier=self.beam_search_size)
            self.cost = sequence_loss(
                self.dec_logits,
                tile_batch(self.decoder_target,
                           multiplier=self.beam_search_size),
                self.decoder_target_mask)
            self.optimizer = tf.train.AdamOptimizer(
                learning_rate=self.lr).minimize(self.cost)
        with tf.variable_scope("seq2seq_beam_search_generate"):
            start_tokens = tf.ones([
                self.batch_size,
            ], tf.int32) * self.SOS_token
            beam_infer_decoder = BeamSearchDecoder(
                decoder_cell,
                embedding=word_embedding,
                end_token=self.EOS_token,
                start_tokens=start_tokens,
                initial_state=tiled_encoder_final_state,
                beam_width=self.beam_search_size,
                output_layer=decode_out_layer)
            self.bs_outputs, _, _ = dynamic_decode(
                beam_infer_decoder, maximum_iterations=self.max_length)
        with tf.variable_scope("greedy_generate"):
            decoding_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                embedding=word_embedding,
                start_tokens=start_tokens,
                end_token=self.EOS_token)
            inference_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=decoding_helper,
                initial_state=tiled_encoder_final_state,
                output_layer=decode_out_layer)
            self.greedy_outputs, _, _ = dynamic_decode(
                inference_decoder, maximum_iterations=self.max_length)
Exemplo n.º 16
0
    def build_decoder(self, encoder_outputs, encoder_state):
        """Build the decoder sub-graph.

        In ``train`` mode this produces the masked sequence losses
        (``self.loss``, ``self.loss_rewards``, ``self.loss_add``); in
        ``decode`` mode it builds either a greedy or a beam-search
        inference decoder and stores predictions in
        ``self.decoder_pred_decode``.

        :param encoder_outputs: encoder outputs, consumed by the attention
            mechanism inside ``build_decoder_cell``.
        :param encoder_state: final encoder state used to initialise the
            decoder cell.
        """
        with tf.variable_scope('decoder') as decoder_scope:
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell(
                 encoder_outputs, encoder_state)

            # Decoder embedding matrix: shared with the encoder, loaded from
            # a pretrained matrix via placeholder+assign, or freshly trained.
            with tf.device(_get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrained_embedding:

                    # Zero-initialised; real values are fed through the
                    # placeholder and assigned by decoder_embeddings_init.
                    self.decoder_embeddings = tf.Variable(tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                                                          trainable=True,
                                                          name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = \
                        self.decoder_embeddings.assign(
                            self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            # Dense projection from RNN output to vocabulary logits.
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection')

            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)
                inputs = self.decoder_inputs_embedded

                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper')

                # output_layer is deliberately NOT applied inside the decoder
                # during training: projecting at every time step is slow, so
                # the full output tensor is projected once after
                # dynamic_decode. NOTE: this trick only works if
                # dynamic_decode is given the `scope` parameter used here.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

                (
                    outputs,
                    self.final_state,  # contain attention
                    _  # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                # Single projection of all time steps at once (see note above).
                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output)

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # The variables below support reward-weighted training:
                # custom per-token rewards are applied by scaling the masks.
                # train_entropy = cross entropy
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,
                        logits=decoder_logits_train)

                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss_add = self.loss + self.add_loss

            elif self.mode == 'decode':
                # Inference mode (no training).

                start_tokens = tf.tile([WordSequence.START], [self.batch_size])
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """Wrapper projecting token ids to embedding vectors.
                    """
                    return tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  inputs)

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection)
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection,
                    )

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # Default: decode up to 4x the longest input length.
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _  # self.decoder_outputs_length_decode
                ) = (
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=self.time_major,
                        # impute_finished=True,	# error occurs
                        maximum_iterations=max_decode_step,
                        parallel_iterations=self.parallel_iterations,
                        swap_memory=True,
                        scope=decoder_scope))

                if not self.use_beamsearch_decode:

                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    # [batch, time, beam] -> [batch, beam, time]
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Exemplo n.º 17
0
    def build_decoder(self, encoder_output, encoder_state):
        """Build the decoder sub-graph.

        In ``train`` mode this produces the masked sequence losses
        (``self.loss``, ``self.loss_rewards``) and adds ``self.loss`` into
        ``self.add_loss``; in ``decode`` mode it builds either a greedy or
        a beam-search inference decoder and stores predictions in
        ``self.decoder_pred_decode``.

        :param encoder_output: encoder outputs, consumed by the attention
            mechanism inside ``build_decoder_cell``.
        :param encoder_state: final encoder state used to initialise the
            decoder cell.
        :return: None; results are stored as attributes on ``self``.
        """
        with tf.variable_scope('decoder') as decoder_scope:
            (
                self.decoder_cell,
                self.decoder_initial_state
            ) = self.build_decoder_cell(encoder_output, encoder_state)

            # Decoder embedding matrix: shared with the encoder, loaded from
            # a pretrained matrix via placeholder+assign, or freshly trained.
            with tf.device(get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrain_embedding:
                    # Zero-initialised; real values are fed through the
                    # placeholder and assigned by decoder_embeddings_init.
                    self.decoder_embeddings = tf.Variable(
                        tf.constant(
                            0.0,
                            shape=(self.target_vocab_size,
                                   self.embedding_size)
                        ),
                        trainable=True,
                        name='embeddings'
                    )
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        dtype=tf.float32,
                        shape=(self.target_vocab_size, self.embedding_size),
                    )
                    self.decoder_embeddings_init = \
                        self.decoder_embeddings.assign(
                            self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32
                    )

            # Output projection (a fully connected layer without bias) that
            # maps the RNN output to vocabulary logits.
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection'
            )

            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train
                )

                inputs = self.decoder_inputs_embedded
                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                # seq2seq helper that feeds gold tokens (teacher forcing).
                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper'
                )

                # No output_layer here: the projection is applied once to the
                # whole output tensor after dynamic_decode, which is faster
                # than projecting at every time step.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state
                )

                # Maximum decoder time steps in the current batch.
                max_decoder_length = tf.reduce_max(
                    self.decoder_inputs_length
                )

                (
                    outputs,
                    self.final_state,
                    _final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope
                )

                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output
                )

                # Mask distinguishing real tokens from padding; passed as the
                # `weights` argument of sequence_loss below.
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks'
                )

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    # BUG FIX: original code called the tensor itself,
                    # `tf.transpose(decoder_logits_train(1, 0, 2))`, which
                    # raises a TypeError whenever time_major is set.
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                # Greedy per-step predictions. The attribute name (and the
                # op name typo) are kept as-is for caller compatibility.
                self.decoder_pre_train = tf.argmax(
                    decoder_logits_train,
                    axis=-1,
                    name='deocder_pred_train'
                )

                # Per-token cross entropy (attribute name kept as-is).
                self.tran_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,
                        logits=decoder_logits_train
                    )

                # Reward-weighted masks for reward-scaled training.
                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True
                )

                # Accumulate the decoder loss into the model's add_loss.
                self.add_loss = self.add_loss + self.loss

            elif self.mode == 'decode':
                start_tokens = tf.tile(
                    [WordSequence.START],
                    [self.batch_size]
                )
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """Map token ids to embedding vectors."""
                    return tf.nn.embedding_lookup(
                        self.decoder_embeddings,
                        inputs
                    )

                if not self.use_beamsearch_decode:
                    # Greedy decoding: next input is the argmax of the output.
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj
                    )
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection
                    )
                else:
                    # BeamSearchDecoder expects an initial_state that is
                    # already tiled beam_width times (see build_decoder_cell).
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection
                    )

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # Default: decode up to 4x the longest input length.
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4
                    )

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    # BUG FIX: original passed
                    # maximum_iterations=self.parallel_iterations, ignoring
                    # the max_decode_step computed above and misusing
                    # parallel_iterations as a step limit.
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope
                )

                if not self.use_beamsearch_decode:
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.sample_id
                    if self.time_major:
                        # Consistency fix: undo time-major layout as the
                        # beam-search branch already does.
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))
                else:
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids
                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2)
                        )

                    # [batch, time, beam] -> [batch, beam, time]
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode,
                        perm=[0, 2, 1]
                    )

                    self.beam_prob = self.decoder_outputs_decode \
                        .beam_search_decoder_output.scores
Exemplo n.º 18
0
 def build_decoder(self, encoder_outputs, encoder_final_state):
     """Assemble the complete decoder.

     In training mode the graph ends in a padding-masked sequence loss
     stored in ``self.loss``; otherwise a beam-search decoder is built and
     its predictions are stored in ``self.decoder_pred_decode``.
     :return:
     """
     with tf.variable_scope("decode"):
         cell, initial_state = self.build_decoder_cell(
             encoder_outputs, encoder_final_state, self.hidden_size,
             self.cell_type, self.layer_size)

         # Dense projection from RNN output to vocabulary logits.
         output_projection = layers.Dense(
             self.decoder_vocab_size,
             dtype=tf.float32,
             use_bias=False,
             kernel_initializer=tf.truncated_normal_initializer(
                 mean=0.0, stddev=0.1),
             name='decoder_output_projection')

         if self.mode == 'train':
             # Teacher forcing: the helper feeds the gold previous token
             # (from decoder_inputs_train) at every time step.
             embedded_targets = tf.nn.embedding_lookup(
                 self.decoder_embeddings, self.decoder_inputs_train)
             helper = TrainingHelper(
                 inputs=embedded_targets,
                 sequence_length=self.decoder_inputs_length,
                 name='training_helper')
             train_decoder = BasicDecoder(cell, helper, initial_state,
                                          output_projection)

             # Longest target sequence in the current batch bounds decoding.
             longest = tf.reduce_max(self.decoder_inputs_length)
             decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                 train_decoder, maximum_iterations=longest)

             # 1.0 for real tokens, 0.0 for padding — keeps padded positions
             # out of the loss.
             self.masks = tf.sequence_mask(self.decoder_inputs_length,
                                           maxlen=longest,
                                           dtype=tf.float32,
                                           name='masks')
             self.loss = tf.contrib.seq2seq.sequence_loss(
                 logits=decoder_output.rnn_output,
                 targets=self.decoder_inputs,
                 weights=self.masks,
                 average_across_timesteps=True,
                 average_across_batch=True)
         else:
             # Inference: beam search starting from the START token.
             start_token = [DataUnit.START_INDEX] * self.batch_size
             end_token = DataUnit.END_INDEX
             inference_decoder = BeamSearchDecoder(
                 cell=cell,
                 embedding=lambda ids: tf.nn.embedding_lookup(
                     self.decoder_embeddings, ids),
                 start_tokens=start_token,
                 end_token=end_token,
                 initial_state=initial_state,
                 beam_width=self.beam_width,
                 output_layer=output_projection)
             inference_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                 inference_decoder, maximum_iterations=self.max_decode_step)

             # predicted_ids: [batch, time, beam] -> [batch, beam, time]
             self.decoder_pred_decode = inference_output.predicted_ids
             self.decoder_pred_decode = tf.transpose(
                 self.decoder_pred_decode, perm=[0, 2, 1])
    def predict(self, encoder_output, encoder_state):
        '''
        开始预测
        :param encoder_output:
        :param encoder_state:
        :return:
        '''
        with tf.variable_scope('decoder') as decoder_scope:  # 这里是为了调试方便,将参数折叠成一个层。

            #####解码器的单元 解码器的初始化状态########
            print(decoder_scope, 'decoder_scope')
            (
                self.decoder_cell,
                self.decoder_initial_state
            ) = self.build_decoder_cell(encoder_output, encoder_state)
            #####解码器的单元 解码器的初始化状态########

            with tf.device(get_embed_device(self.target_vocab_size)):
                ##############################################加载解码器的embedding##################################################
                ###编码器和解码器是否共享embedding matrix###
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                ###编码器和解码器是否共享embedding matrix###

                ###是否加载预先训练好的embedding matrix###
                elif self.pretrain_embedding:
                    self.decoder_embeddings = tf.Variable(
                        tf.constant(
                            0.0,
                            shape=(self.target_vocab_size,
                                   self.embedding_size)
                        ),
                        trainable=True,
                        name='embeddings'
                    )

                    self.decoder_embeddings_placeholder = tf.placeholder(
                        dtype=tf.float32,
                        shape=(self.target_vocab_size, self.embedding_size),
                    )

                    self.decoder_embeddings_init = self.decoder_embeddings.assign(self.decoder_embeddings_placeholder)
                    # 运行时通过placeholder传入embedding matrix,通过assign的形式进行赋值。
                ###是否加载预先训练好的embedding matrix###
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32
                    )
                    # 不加载预先训练好的embedding matrix的情况,声明一个decoder_embeddings matrix,使用-0.5到0.5的均匀分布进行初始化
                ##############################################加载解码器的embedding##################################################

            # 定义输出的projection,实际上就是全连接层
            ##################################解码器的映射############################################
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection'
            )
            ##################################解码器的映射############################################
            start_tokens = tf.tile(
                [WordSequence.START],
                [self.batch_size]
            )
            end_token = WordSequence.END

            def embed_and_input_proj(inputs):
                return tf.nn.embedding_lookup(
                    self.decoder_embeddings,
                    inputs
                )

            ################################### 没有使用beamsearch的情况 ################################
            if not self.use_beamsearch_decode:
                decoding_helper = seq2seq.GreedyEmbeddingHelper(
                    start_tokens=start_tokens,
                    end_token=end_token,
                    embedding=embed_and_input_proj  # 这里embedding参数作用是获得embedding vector的id
                )

                # 这个时候使用的decoding_helper 就是贪婪模式下的Helper
                '''
                对output使用argmax(treated as logits)并且送入到embedding matrix中查询embedding vector,
                得到下一个输入值

             '''

                inference_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=decoding_helper,
                    initial_state=self.decoder_initial_state,
                    output_layer=self.decoder_output_projection
                )
            ################################### 没有使用beamsearch的情况 ################################

            else:

                ##################################beamsearch 的inference_decoder##############################################
                # 这里的BeamSearchDecoder 传入的initial_state是经过变换,成了原来的beam_width 这么多倍。
                inference_decoder = BeamSearchDecoder(
                    cell=self.decoder_cell,
                    embedding=embed_and_input_proj,
                    start_tokens=start_tokens,
                    end_token=end_token,
                    initial_state=self.decoder_initial_state,
                    beam_width=self.beam_width,
                    output_layer=self.decoder_output_projection
                )
                ##################################beamsearch 的inference_decoder##############################################

            if self.max_decode_step is not None:
                max_decoder_step = self.max_decode_step
            else:
                max_decoder_step = tf.round(
                    tf.reduce_max(self.encoder_inputs_length) * 4
                )

            ###############################解码开始####################################

            (
                self.decoder_outputs_decode,
                self.final_state,
                final_sequence_lengths
            ) = (seq2seq.dynamic_decode(
                decoder=inference_decoder,
                output_time_major=self.time_major,
                impute_finished=False,
                maximum_iterations=100,
                swap_memory=True,
                scope=decoder_scope
            ))
            ###############################解码开始####################################

            ##############################没有使用beamsearch 解码的情况############################
            if not self.use_beamsearch_decode:
                dod = self.decoder_outputs_decode
                self.decoder_pred_decode = dod.sample_id
                # self.decoder_pred_decode = tf.transpose(
                #     self.decoder_pred_decode, (1, 0)
                # )
                return self.decoder_pred_decode, final_sequence_lengths
            ##############################没有使用beamsearch 解码的情况############################
            else:

                ##############################使用beamsearch 解码的情况############################
                self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids
                if self.time_major:
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, (1, 0, 2)
                    )

                self.decoder_pred_decode = tf.transpose(self.decoder_pred_decode, (0, 2, 1))

                dod = self.decoder_outputs_decode
                self.beam_prob = dod.beam_search_decoder_output.scores
                ##############################使用beamsearch 解码的情况############################
                return self.decoder_pred_decode, self.beam_prob, final_sequence_lengths
Exemplo n.º 20
0
    def build_graph(self):
        """Build the TensorFlow graph for training or beam-search prediction.

        Inside a fresh ``tf.Graph`` this constructs:
          * placeholders for the multi-turn conversation history, per-turn
            emotion features, decoder input/target, and all length tensors;
          * a word-level bidirectional GRU encoder applied once per history
            turn (cells shared across turns via AUTO_REUSE);
          * a GRU encoder over the per-turn emotion features;
          * an utterance-level-attention GRU decoder, run with teacher
            forcing when ``opts.mode == 'TRAIN'`` or with beam search when
            ``opts.mode == 'PREDICT'``.

        Side effects: the created tensors/ops are attached to ``self``
        (``enc_input``, ``loss``, ``predicted_ids``, ...), and ``self.graph``
        and ``self.saver`` are created.
        """
        print('Building the TensorFlow graph...')
        opts = self.options

        self.graph = tf.Graph()
        with self.graph.as_default():
            # Conversation history: one [batch, max_uttr_len] token-id matrix
            # per history turn (history-major layout).
            self.enc_input = tf.placeholder(
                tf.int32,
                shape=[opts.max_hist_len, opts.batch_size, opts.max_uttr_len])
            # Per-turn emotion feature vectors (batch-major).
            self.enc_input_e = tf.placeholder(
                tf.float32,
                shape=[opts.batch_size, opts.max_hist_len, opts.n_emot])
            # Decoder input/target are one token longer than an utterance,
            # presumably to make room for a GO/EOS token — TODO confirm
            # against the data pipeline.
            self.dec_input = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1])
            self.target = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len + 1])

            # True (unpadded) lengths used for dynamic RNNs and loss masking.
            self.enc_input_len = tf.placeholder(
                tf.int32, shape=[opts.max_hist_len, opts.batch_size])
            self.dec_input_len = tf.placeholder(tf.int32,
                                                shape=[opts.batch_size])
            # Number of valid history turns per example.
            self.hist_len = tf.placeholder(tf.int32, shape=[opts.batch_size])

            with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
                # word_embeddings = tf.Variable(tf.random_uniform([opts.vocab_size, opts.word_embed_size], -1.0, 1.0),
                #     name = 'word_embeddings')
                # Embedding matrix initialised from pretrained vectors
                # supplied via the options; trainable.
                word_embeddings = tf.Variable(opts.word_embeddings,
                                              name='word_embeddings')
                enc_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.enc_input)
                dec_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.dec_input)

            with tf.variable_scope('word_level_encoding', reuse=tf.AUTO_REUSE):
                # Encode each history turn with the same fw/bw GRU pair
                # (weights shared across turns through AUTO_REUSE).
                outputs_enc = []
                cell_fw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s)
                cell_bw = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_s)
                for i in range(opts.max_hist_len):
                    outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw,
                        cell_bw,
                        inputs=enc_input_embed[i, :, :, :],
                        sequence_length=self.enc_input_len[i, :],
                        dtype=tf.float32)
                    # Concatenate forward/backward outputs on the feature axis.
                    outputs_enc.append(tf.concat(outputs, 2))
                # Stack back to history-major:
                # [max_hist_len, batch, max_uttr_len, 2 * n_hidden_units_enc_s]
                outputs_enc = tf.stack(outputs_enc)

            with tf.variable_scope('emotion_encoding', reuse=tf.AUTO_REUSE):
                # Project raw emotion features before feeding the emotion GRU.
                emot_input_layer = tf.layers.Dense(
                    opts.emot_input_layer_size,
                    activation=tf.sigmoid,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.1),
                    name='emot_input_layer')
                enc_input_e = emot_input_layer(self.enc_input_e)

                cell_emot = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc_e)
                _, final_state = tf.nn.dynamic_rnn(
                    cell_emot,
                    inputs=enc_input_e,
                    sequence_length=self.hist_len,
                    dtype=tf.float32)
                # beta scales how strongly the emotion vector influences
                # decoding.
                emot_vector = final_state * opts.beta

            if opts.mode == 'PREDICT':
                # Beam search keeps beam_width hypotheses per example, so all
                # encoder-side tensors are tiled along the batch axis.
                # outputs_enc is history-major; tile_batch tiles axis 0, so
                # transpose to batch-major first, tile, then transpose back.
                outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3])
                outputs_enc = tile_batch(outputs_enc,
                                         multiplier=opts.beam_width)
                outputs_enc = tf.transpose(outputs_enc, perm=[1, 0, 2, 3])
                # Same batch-major round-trip for the [hist, batch] lengths.
                tiled_enc_input_len = tile_batch(tf.transpose(
                    self.enc_input_len),
                                                 multiplier=opts.beam_width)
                tiled_enc_input_len = tf.transpose(tiled_enc_input_len)
                tiled_hist_len = tile_batch(self.hist_len,
                                            multiplier=opts.beam_width)
                tiled_emot_vector = tile_batch(emot_vector,
                                               multiplier=opts.beam_width)
            else:
                # Training: no tiling needed.
                tiled_enc_input_len = self.enc_input_len
                tiled_hist_len = self.hist_len
                tiled_emot_vector = emot_vector

            with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
                # Custom two-level (word + utterance) attention over the
                # stacked per-turn encoder outputs.
                attn_mechanism = UttrLevelAttentionMechanism(
                    word_level_num_units=opts.word_level_attn_depth,
                    uttr_level_num_units=opts.uttr_level_attn_depth,
                    n_hidden_units=opts.n_hidden_units_enc_s,
                    memory=outputs_enc,
                    memory_sequence_length=tiled_enc_input_len,
                    hist_length=tiled_hist_len)
                cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
                # Project wrapper feeds the emotion vector into the decoder
                # cell alongside the attention context.
                cell_dec = MyAttentionWrapper(cell_dec, attn_mechanism,
                                              tiled_emot_vector)
                # NOTE(review): output layer has vocab_size - 1 units —
                # presumably one token (e.g. PAD or GO) is never predicted;
                # confirm against the vocabulary layout.
                output_layer = tf.layers.Dense(
                    units=opts.vocab_size - 1,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.1),
                    name='output_layer')

                # Train
                if opts.mode == 'TRAIN':
                    # Teacher forcing: run the decoder cell directly over the
                    # embedded ground-truth inputs.
                    outputs_dec, _ = tf.nn.dynamic_rnn(
                        cell=cell_dec,
                        inputs=dec_input_embed,
                        sequence_length=self.dec_input_len,
                        initial_state=cell_dec.zero_state(
                            opts.batch_size, tf.float32),
                        dtype=tf.float32,
                        scope=vs)
                    logits = output_layer.apply(outputs_dec)
                    # Mask out padded positions in the loss.
                    weights = tf.sequence_mask(self.dec_input_len,
                                               maxlen=opts.max_uttr_len + 1,
                                               dtype=tf.float32)
                    self.loss = sequence_loss(logits, self.target, weights)
                    # Per-example loss, used e.g. for reporting/sampling.
                    self.loss_batch = sequence_loss(logits,
                                                    self.target,
                                                    weights,
                                                    average_across_batch=False)
                    self.optimizer = tf.train.AdamOptimizer(
                        opts.learning_rate).minimize(self.loss)
                    # NOTE(review): self.init is only defined in TRAIN mode.
                    self.init = tf.global_variables_initializer()

                # Predict
                if opts.mode == 'PREDICT':
                    start_tokens = tf.constant(opts.go_index,
                                               dtype=tf.int32,
                                               shape=[opts.batch_size])
                    bs_decoder = BeamSearchDecoder(
                        cell=cell_dec,
                        embedding=word_embeddings,
                        start_tokens=start_tokens,
                        end_token=opts.eos_index,
                        initial_state=cell_dec.zero_state(
                            opts.batch_size * opts.beam_width, tf.float32),
                        beam_width=opts.beam_width,
                        output_layer=output_layer)
                    final_outputs, final_state, _ = dynamic_decode(
                        bs_decoder,
                        impute_finished=False,
                        maximum_iterations=opts.max_uttr_len + 1,
                        scope=vs)
                    self.predicted_ids = final_outputs.predicted_ids
                    self.scores = final_outputs.beam_search_decoder_output.scores
                    # Stacked attention histories from the custom wrapper's
                    # state (utterance-level and word-level alignments).
                    self.uttr_level_alignments = final_state[
                        0].alignment_history_ul.stack()
                    self.word_level_alignments = final_state[
                        0].alignment_history_wl.stack()
                    # NOTE(review): relies on the positional layout of the
                    # (custom) beam-search state tuple — confirm index 3 is
                    # the per-beam sequence lengths.
                    self.final_sequence_lengths = final_state[3]

            self.tvars = tf.trainable_variables()
            self.saver = tf.train.Saver(max_to_keep=100)
Exemplo n.º 21
0
    def build_model(self):
        """Build the PCGN seq2seq graph for training or inference.

        Depending on ``self.mode`` this creates either the teacher-forced
        training branch (masked cross-entropy loss, perplexity CE, and
        ``self.train_op``) or the beam-search inference branch
        (``self.infer_outputs``).  Optional user-profile conditioning (gated
        feature memory, description co-attention, external personality
        expression) is wired in when the corresponding ``use_*`` flags are
        set.

        Side effects: all created tensors and ops are attached to ``self``;
        ``self.saver`` is created last.
        """
        print('building model... ...')
        with tf.variable_scope('seq2seq_placeholder'):
            self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                                 name="encoder_inputs")
            self.decoder_inputs = tf.placeholder(tf.int32, [None, None],
                                                 name="decoder_inputs")
            self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                                  name="decoder_targets")
            # Boolean mask selecting the non-padding target positions.
            self.decoder_targets_masks = tf.placeholder(tf.bool, [None, None],
                                                        name="mask")
            self.encoder_length = tf.placeholder(tf.int32, [None],
                                                 name="encoder_length")
            self.decoder_length = tf.placeholder(tf.int32, [None],
                                                 name="decoder_length")

            # PCGN placeholders: per-user feature vector, user-description
            # token ids, and the description lengths.
            self.user_feat = tf.placeholder(tf.float32, [None, self.feat_dim],
                                            name="user_feat")
            self.user_desc = tf.placeholder(tf.int32, [None, None],
                                            name="user_desc")
            self.desc_length = tf.placeholder(tf.int32, [None],
                                              name="user_desc_length")
            # Fixed decoding horizon; an alternative would be
            # tf.reduce_max(self.decoder_length, name='max_target_len').
            self.max_target_sequence_length = tf.constant(
                value=self.target_max_length, name='max_target_len')

        with tf.variable_scope('seq2seq_embedding'):
            self.embedding = self.init_embedding(self.vocab_size,
                                                 self.embedding_size)

        with tf.variable_scope('seq2seq_encoder'):
            encoder_outputs, encoder_states = build_encoder(
                self.embedding,
                self.encoder_inputs,
                self.encoder_length,
                self.encode_num_layers,
                self.encode_num_units,
                self.encode_cell_type,
                bidir=self.encode_bidir)

        if self.use_user_desc or self.use_user_feat:
            with tf.variable_scope('user_profile_encoder'):
                desc_initializer = tf.contrib.layers.xavier_initializer()
                # Projection of the user features into the gated-memory space.
                self.user_feat_mem_embedding = tf.layers.Dense(
                    self.user_feat_mem_unit,
                    use_bias=False,
                    activation=tf.nn.relu,
                    kernel_initializer=desc_initializer,
                    name="user_feat_mem_layer")
                self.user_feats, self.user_embs, self.user_desc_encode = self.build_user_embedding(
                    self.user_feat, self.user_desc, self.desc_length,
                    self.user_feat_unit, self.desc_rnn_unit, self.embedding,
                    self.use_user_desc, self.use_user_feat)

                if self.use_external_desc_express:
                    # Bilinear interaction between the decoder output and the
                    # encoded user description, plus the final user mapping.
                    # dim1 doubles when blog/user co-attention concatenates
                    # two attention contexts.
                    dim2 = self.desc_rnn_unit
                    dim1 = self.decode_num_units
                    if self.use_blog_user_coattn:
                        dim1 = dim1 * 2
                    self.blog_desc_inetract = tf.Variable(
                        desc_initializer(shape=(dim1, dim2)),
                        name="blog_desc_inetraction_layer",
                        dtype=tf.float32)
                    if self.use_external_feat_express:
                        dim2 = dim2 + self.user_feat_unit
                    self.user_map_layer = tf.Variable(
                        desc_initializer(shape=(dim2, self.user_map_unit)),
                        name="user_map_layer",
                        dtype=tf.float32)

        with tf.variable_scope('seq2seq_decoder'):
            encoder_length = self.encoder_length
            if self.use_user_desc or self.use_user_feat:
                user_feats = self.user_feats
                user_embs = self.user_embs
                if self.use_user_desc:
                    desc_length = self.desc_length
                    user_desc_encode = self.user_desc_encode
            if self.beam_search:
                # With beam search, every encoder-side tensor must be tiled
                # beam_size times along the batch axis.
                print("use beamsearch decoding..")
                encoder_outputs = tile_batch(encoder_outputs,
                                             multiplier=self.beam_size)
                encoder_states = tile_batch(encoder_states,
                                            multiplier=self.beam_size)
                encoder_length = tile_batch(encoder_length,
                                            multiplier=self.beam_size)
                if self.use_user_desc or self.use_user_feat:
                    user_feats = tile_batch(user_feats,
                                            multiplier=self.beam_size)
                    user_embs = tile_batch(user_embs,
                                           multiplier=self.beam_size)
                    if self.use_user_desc:
                        desc_length = tile_batch(desc_length,
                                                 multiplier=self.beam_size)
                        user_desc_encode = tile_batch(
                            user_desc_encode, multiplier=self.beam_size)

            # Attention over the blog (encoder) outputs.
            attention_mechanism = BahdanauAttention(
                num_units=self.attn_num_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_length)

            if self.use_blog_user_coattn:
                # Second attention head over the encoded user description.
                attention_mechanism_desc = BahdanauAttention(
                    num_units=self.desc_attn_num_units,
                    memory=user_desc_encode,
                    memory_sequence_length=desc_length)

            decoder_cell = create_rnn_cell(self.decode_num_layers,
                                           self.decode_num_units,
                                           self.decode_cell_type)

            if self.use_blog_user_coattn:
                _attention_mechanism = (attention_mechanism,
                                        attention_mechanism_desc)
                _attention_layer_size = [
                    self.decode_num_units, self.decode_num_units
                ]
            else:
                _attention_mechanism = attention_mechanism
                _attention_layer_size = self.decode_num_units

            if self.use_user_feat:
                if self.use_gate_memory:
                    # Read/write gates for the internal user-feature memory.
                    _read_g = tf.layers.Dense(self.user_feat_mem_unit,
                                              use_bias=False,
                                              name="internal_read_gate")

                    _write_g = tf.layers.Dense(self.user_feat_mem_unit,
                                               use_bias=False,
                                               name="internal_write_gate")
                    if self.use_blog_user_coattn:
                        _read_atten_gate = tf.layers.Dense(
                            2 * self.desc_attn_num_units,
                            use_bias=False,
                            name="internal_read_attn_gate")
                    else:
                        _read_atten_gate = None
                else:
                    _read_g = None
                    _write_g = None
                    _read_atten_gate = None
                decoder_cell = PCGNWrapper(
                    cell=decoder_cell,
                    attention_mechanism=_attention_mechanism,
                    user_feats=user_feats,
                    user_embs=user_embs,
                    user_feat_mem_units=self.user_feat_mem_unit,
                    # memory size
                    user_feat_mem_embedding=self.user_feat_mem_embedding,
                    read_gate=_read_g,
                    write_gate=_write_g,
                    use_gate_memory=self.use_gate_memory,
                    attention_layer_size=_attention_layer_size,
                    read_atten_gate=_read_atten_gate,
                    name='PCGNWrapper')

            else:
                decoder_cell = AttentionWrapper(
                    cell=decoder_cell,
                    attention_mechanism=_attention_mechanism,
                    attention_layer_size=_attention_layer_size,
                    name='Attention_Wrapper')

            # Beam search multiplies the effective batch size.
            batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size

            decoder_initial_state = decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_states)

            output_layer = tf.layers.Dense(self.vocab_size,
                                           use_bias=False,
                                           name='output_projection')

            if self.mode == 'train':
                decoder_inputs_embedded = tf.nn.embedding_lookup(
                    self.embedding, self.decoder_inputs)
                # TrainingHelper feeds the ground-truth decoder inputs at
                # each step (teacher forcing) instead of the previous step's
                # prediction.
                training_helper = TrainingHelper(
                    inputs=decoder_inputs_embedded,
                    sequence_length=self.decoder_length,
                    name='training_helper')

                training_decoder = BasicDecoder(
                    cell=decoder_cell,
                    helper=training_helper,
                    initial_state=decoder_initial_state)

                self.decoder_outputs, self.final_state, self.final_sequence_length = dynamic_decode(
                    decoder=training_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_sequence_length)

                self.decoder_logits_train = tf.identity(
                    self.decoder_outputs.rnn_output)

                if self.use_external_desc_express:
                    if self.use_external_feat_express:
                        _user_feats = user_embs
                    else:
                        _user_feats = None
                    # Mix the user description (and optionally features) into
                    # the decoder outputs before the vocabulary projection.
                    self.decoder_logits_train = self.external_personality_express(
                        self.decoder_logits_train,
                        user_desc_encode,
                        self.blog_desc_inetract,
                        user_feats=_user_feats,
                        use_external_feat_express=self.use_external_feat_express,
                        user_map=self.user_map_layer)
                with tf.variable_scope('decoder'):
                    # Logits over the generic vocabulary.
                    self.generic_logits = output_layer(
                        self.decoder_logits_train)

                    if self.use_gate_memory:
                        # Final state of the user-feature memory.
                        self.feat_mem = self.final_state.user_feat_mem

                with tf.variable_scope('loss'):
                    self.g_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=self.generic_logits,
                        labels=self.decoder_targets
                    )  # - tf.log(1 - self.alphas)
                    # Average only over non-padding target positions.
                    losses = tf.boolean_mask(self.g_losses,
                                             self.decoder_targets_masks)
                    self.loss = tf.reduce_mean(losses)

                    if self.use_gate_memory:
                        # Regularise the final internal memory towards zero.
                        self.int_mem_reg = tf.reduce_mean(
                            tf.norm(self.feat_mem + 1e-7, axis=1))
                        self.loss += self.int_mem_reg  # + self.alpha_reg

                # Perplexity term: cross entropy recomputed from the
                # log-probability distribution.  tf.nn.log_softmax is the
                # numerically stable form of tf.log(tf.nn.softmax(...)):
                # since softmax(log p) == p when p sums to 1, the resulting
                # CE still equals -log p(target).
                train_log_probs = tf.nn.log_softmax(self.generic_logits)
                CE = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=train_log_probs, labels=self.decoder_targets)
                # decoder_targets_masks is already tf.bool; no cast needed.
                CE = tf.boolean_mask(CE, self.decoder_targets_masks)
                self.CE = tf.reduce_mean(CE)

                # optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
                optimizer = tf.train.AdamOptimizer(
                    self.learning_rate)  # beta1=0.5,beta2=0.9
                trainable_params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, trainable_params)
                # Global-norm gradient clipping for stability.
                clip_gradients, _ = tf.clip_by_global_norm(
                    gradients, self.max_gradient_norm)
                self.train_op = optimizer.apply_gradients(
                    zip(clip_gradients, trainable_params))

            elif self.mode == 'infer':
                start_tokens = tf.ones([
                    self.batch_size,
                ], tf.int32) * SOS_ID
                end_token = EOS_ID
                if self.use_user_feat or self.use_user_desc:
                    if self.use_external_desc_express:
                        _embed_desc = user_desc_encode
                        _blog_desc_inetract = self.blog_desc_inetract
                        _user_map = self.user_map_layer
                        if self.use_external_feat_express:
                            _feat_embed = user_embs
                        else:
                            _feat_embed = None
                    else:
                        _embed_desc = None
                        _blog_desc_inetract = None
                        _user_map = None
                        _feat_embed = None

                    inference_decoder = PCGNBeamSearchDecoder(
                        cell=decoder_cell,
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.beam_size,
                        output_layer=output_layer,
                        use_external_desc_express=self.use_external_desc_express,
                        embed_desc=_embed_desc,
                        blog_desc_inetract=_blog_desc_inetract,
                        feat_embed=_feat_embed,
                        use_external_feat_express=self.use_external_feat_express,
                        user_map=_user_map)

                else:
                    inference_decoder = BeamSearchDecoder(
                        cell=decoder_cell,
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.beam_size,
                        output_layer=output_layer)
                decoder_outputs, _, _ = dynamic_decode(
                    decoder=inference_decoder,
                    maximum_iterations=self.infer_max_iter)

                infer_outputs = decoder_outputs.predicted_ids  # [batch_size, decoder_targets_length, beam_size]
                self.infer_outputs = tf.transpose(
                    infer_outputs, [0, 2, 1], name='infer_outputs'
                )  # [batch_size, beam_size, decoder_targets_length]

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=self.max_to_keep)
Exemplo n.º 22
0
    def build_graph(self):
        # build_graph-train vs validate-train
        print('Building the TensorFlow graph...')
        opts = self.options

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.enc_input = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len_enc])
            self.dec_input = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])
            self.target = tf.placeholder(
                tf.int32, shape=[opts.batch_size, opts.max_uttr_len_dec])

            self.enc_input_len = tf.placeholder(tf.int32,
                                                shape=[opts.batch_size])
            self.dec_input_len = tf.placeholder(tf.int32,
                                                shape=[opts.batch_size])

            self.VAD = tf.placeholder(tf.float32, shape=[opts.corpus_size, 3])
            self.termfreq = tf.placeholder(tf.float32,
                                           shape=[opts.corpus_size, 1])
            self.VAD_loss = tf.placeholder(tf.float32,
                                           shape=[opts.corpus_size, 1])

            with tf.variable_scope('embedding', reuse=tf.AUTO_REUSE):
                # how to get input_embed for encoder and decoder
                word_embeddings = tf.Variable(tf.random_uniform(
                    [opts.corpus_size, opts.word_embed_size], -1.0, 1.0),
                                              name='embedding')
                #                 word_embeddings = tf.constant(opts.word_embeddings, name = 'word_embeddings')

                enc_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.enc_input)
                dec_input_embed = tf.nn.embedding_lookup(
                    word_embeddings, self.dec_input)

                enc_input_VAD = tf.nn.embedding_lookup(self.VAD,
                                                       self.enc_input)
                target_VAD = tf.nn.embedding_lookup(self.VAD, self.target)

                enc_input_tf = tf.nn.embedding_lookup(self.termfreq,
                                                      self.enc_input)
                target_tf = tf.nn.embedding_lookup(self.termfreq, self.target)

                target_VAD_loss = tf.nn.embedding_lookup(
                    self.VAD_loss, self.target)
                target_VAD_loss = tf.squeeze(target_VAD_loss)

            with tf.variable_scope('encoding', reuse=tf.AUTO_REUSE):
                cell_enc = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_enc)
                # bi-directional?
                enc_outputs, _ = tf.nn.dynamic_rnn(
                    cell_enc,
                    enc_input_embed,
                    sequence_length=self.enc_input_len,
                    dtype=tf.float32)

            if opts.mode == 'PREDICT':
                enc_outputs = tile_batch(enc_outputs,
                                         multiplier=opts.beam_width)
                enc_input_embed = tile_batch(enc_input_embed,
                                             multiplier=opts.beam_width)
                enc_input_VAD = tile_batch(enc_input_VAD,
                                           multiplier=opts.beam_width)
                enc_input_tf = tile_batch(enc_input_tf,
                                          multiplier=opts.beam_width)
                tiled_enc_input_len = tile_batch(self.enc_input_len,
                                                 multiplier=opts.beam_width)
            else:
                tiled_enc_input_len = self.enc_input_len

#             with tf.variable_scope('attention', reuse = tf.AUTO_REUSE) as attention_layer:
#                 attention_Wb = tf.layers.Dense(units=3,
#                                              use_bias=False,
#                                              kernel_initializer = tf.truncated_normal_initializer(stddev = 0.1),
#                                              name='attention_Wb')

            with tf.variable_scope('decoding', reuse=tf.AUTO_REUSE) as vs:
                # attn_mechanism: alpha_<t,t'>
                attn_mechanism = MyBahdanauAttention(
                    num_units=opts.attn_depth,
                    memory=enc_outputs,
                    memory_sequence_length=tiled_enc_input_len,
                    enc_input_embed=enc_input_embed,
                    enc_input_VAD=enc_input_VAD,
                    enc_input_tf=enc_input_tf,
                    VAD_mode=opts.VAD_mode)
                cell_dec = tf.nn.rnn_cell.GRUCell(opts.n_hidden_units_dec)
                # AttentionWrapper: c?
                cell_dec = AttentionWrapper(cell_dec,
                                            attn_mechanism,
                                            output_attention=False)
                output_layer = tf.layers.Dense(
                    units=opts.corpus_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=0.1))

                # Train
                if opts.mode == 'TRAIN':
                    dec_initial_state = cell_dec.zero_state(
                        opts.batch_size, tf.float32)
                    attention = compute_attention(
                        attn_mechanism, dec_initial_state.cell_state)  #(1,256)
                    dec_initial_state = dec_initial_state.clone(
                        attention=attention)
                    outputs_dec, _ = tf.nn.dynamic_rnn(
                        cell=cell_dec,
                        inputs=dec_input_embed,
                        sequence_length=self.dec_input_len,
                        initial_state=dec_initial_state,
                        dtype=tf.float32,
                        scope=vs)
                    # logits: `[batch_size, sequence_length, num_decoder_symbols]`
                    # The logits correspond to the prediction across all classes at each timestep.
                    logits = output_layer.apply(outputs_dec)
                    # batch size * max sentence length; binary; 0 for non-word in orignal sentence; mask
                    sequence_mask = tf.sequence_mask(
                        self.dec_input_len,
                        maxlen=opts.max_uttr_len_dec,
                        dtype=tf.float32)
                    if opts.VAD_mode:
                        weights = sequence_mask * target_VAD_loss  # affective objective function
                    else:
                        weights = sequence_mask
                    # sequence_mask: [batch_size, max_len]
                    # target: [batch_size, max_len] VAD_loss: [batch_size,max_len]
                    # softmax_loss_function(labels=targets, logits=logits_flat) 默认为sparse_softmax_cross_entropy_with_logits
                    self.loss = sequence_loss(logits, self.target, weights)
                    self.loss_batch = sequence_loss(logits,
                                                    self.target,
                                                    weights,
                                                    average_across_batch=False)
                    self.optimizer = tf.train.AdamOptimizer(
                        opts.learning_rate).minimize(self.loss)
                    self.init = tf.global_variables_initializer()

                # Predict
                if opts.mode == 'PREDICT':
                    dec_initial_state = cell_dec.zero_state(
                        opts.batch_size * opts.beam_width, tf.float32)
                    attention = compute_attention(attn_mechanism,
                                                  dec_initial_state.cell_state)
                    dec_initial_state = dec_initial_state.clone(
                        attention=attention)
                    start_tokens = tf.constant(opts.go_index,
                                               dtype=tf.int32,
                                               shape=[opts.batch_size])
                    bs_decoder = BeamSearchDecoder(
                        cell=cell_dec,
                        embedding=word_embeddings,
                        start_tokens=start_tokens,
                        end_token=opts.eos_index,
                        initial_state=dec_initial_state,
                        beam_width=opts.beam_width,
                        output_layer=output_layer)
                    final_outputs, final_state, _ = dynamic_decode(
                        bs_decoder,
                        impute_finished=False,
                        maximum_iterations=opts.max_uttr_len_dec,
                        scope=vs)
                    self.predicted_ids = final_outputs.predicted_ids
                    #                     self.scores = final_outputs.scores # 'FinalBeamSearchDecoderOutput' object has no attribute 'scores'
                    self.prob = final_state.log_probs
                    # log_probs: The log probabilities with shape `[batch_size, beam_width, vocab_size]`.
                    #  logits: Logits at the current time step. A tensor of shape `[batch_size, beam_width, vocab_size]`
                    # step_log_probs = nn_ops.log_softmax(logits) # logsoftmax = logits - log(reduce_sum(exp(logits), axis))
                    # step_log_probs = _mask_probs(step_log_probs, end_token, previously_finished)
                    # total_probs = array_ops.expand_dims(beam_state.log_probs, 2) + step_log_probs
                    #  final_outputs.scores #[batch_size, length, beam_width]

                if opts.mode == 'POST_PREDICT':
                    dec_initial_state = cell_dec.zero_state(
                        opts.batch_size, tf.float32)
                    attention = compute_attention(
                        attn_mechanism, dec_initial_state.cell_state)  #(1,256)
                    dec_initial_state = dec_initial_state.clone(
                        attention=attention)
                    outputs_dec, _ = tf.nn.dynamic_rnn(
                        cell=cell_dec,
                        inputs=dec_input_embed,
                        sequence_length=self.dec_input_len,
                        initial_state=dec_initial_state,
                        dtype=tf.float32,
                        scope=vs)
                    logits = output_layer.apply(outputs_dec)
                    sequence_mask = tf.sequence_mask(
                        self.dec_input_len,
                        maxlen=opts.max_uttr_len_dec,
                        dtype=tf.float32)
                    score = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.target, logits=logits)
                    self.prob = -1 * tf.reduce_sum(score * sequence_mask)

            self.tvars = tf.trainable_variables()
            self.saver = tf.train.Saver(max_to_keep=100)
Exemplo n.º 23
0
    def build_train_graph(self, train_examples):
        """Build the training graph for the generator.

        Wires up a teacher-forced training decoder and a parallel
        beam-search decoder that share the same encoder/attention
        variables.

        :param train_examples: Tuple of (subject, len_subject, content,
            len_content, target_input, target_output, len_target)
            tensors taken from the training data.
        :return: Tuple of (target_output, final_outputs, final_seq_len,
            generator_params, decoder_cell, attn_zero_state,
            beam_outputs, beam_out_len).
        """
        # Unpack question (subject + content) and answer tensors
        (subject, len_subject, content, len_content,
         target_input, target_output, len_target) = train_examples

        # Keep only the first (best) answer for each question
        target_input = target_input[:, 0, :]
        target_output = target_output[:, 0, :]
        len_target = tf.to_int32(len_target[:, 0])

        # Embed the decoder input tokens
        decoder_inputs_embedded = tf.nn.embedding_lookup(
            self.embeddings_english, target_input)

        # Encoder input is the subject concatenated with the content
        sub_cont_concat_op = tf.map_fn(
            self.concat_seqs, [subject, len_subject, content, len_content])[0]
        len_both = len_subject + len_content

        # Two copies of the initial graph: one plain (training) and one
        # tiled for beam search, sharing weights via reuse=True
        decoder_cell, attn_zero_state = self.build_initial_graph(
            sub_cont_concat_op, len_both)
        decoder_cell_beam, attn_zero_state_beam = self.build_initial_graph(
            sub_cont_concat_op, len_both, reuse=True,
            beam_width=self.beam_width)

        # Teacher-forced training decoder
        train_helper = TrainingHelper(decoder_inputs_embedded, len_target,
                                      time_major=False)
        train_decoder = BasicDecoder(decoder_cell, train_helper,
                                     attn_zero_state,
                                     output_layer=self.projection_layer)

        # Beam-search decoder used to sample free-running outputs
        beam_search_decoder = BeamSearchDecoder(
            decoder_cell_beam, self.embeddings_english, self.start_token,
            self.end_token, attn_zero_state_beam, self.beam_width,
            output_layer=self.projection_layer)

        # Both decoders run under the same variable scope so the beam
        # decoder reuses the training decoder's variables
        with tf.variable_scope("train_decoder"):
            final_outputs, _, final_seq_len = dynamic_decode(
                train_decoder, output_time_major=False)
        with tf.variable_scope("train_decoder", reuse=True):
            beam_outputs, _, beam_out_len = dynamic_decode(
                beam_search_decoder, output_time_major=False,
                maximum_iterations=self.max_seq_len)

        # Trim targets to the length actually decoded during training
        target_output = target_output[:, :tf.shape(final_outputs.sample_id)[1]]

        # Keep only the top beam: [batch, time, beam] -> [batch, time]
        beam_outputs = tf.transpose(beam_outputs.predicted_ids, [2, 0, 1])
        beam_outputs = tf.reshape(beam_outputs[0, :, :], [self.batch_size, -1])
        beam_out_len = tf.reshape(tf.transpose(beam_out_len)[0, :], [-1])

        # Every trainable variable outside the discriminator belongs to
        # the generator
        generator_params = [
            param for param in tf.trainable_variables()
            if "discriminator" not in param.name
        ]

        return (target_output, final_outputs, final_seq_len,
                generator_params, decoder_cell, attn_zero_state,
                beam_outputs, beam_out_len)
Exemplo n.º 24
0
    def build_decoder(self, encoder_outputs, encoder_state):
        """Build the decoder.

        Depending on ``self.mode`` this constructs either the training
        branch (teacher forcing plus sequence losses) or the inference
        branch (greedy or beam-search decoding).

        :param encoder_outputs: outputs of the encoder RNN
        :param encoder_state: final state of the encoder RNN
        """
        with tf.variable_scope('decoder') as decoder_scope:
            # Create the decoder cell and its initial state
            (self.decoder_cell,self.decoder_initial_state)\
            = self.build_decoder_cell(encoder_outputs, encoder_state)

            # Decoder embedding; the device (CPU vs GPU) is chosen
            # based on the target vocabulary size
            with tf.device(_get_embed_device(self.target_vocab_size)):
                # Reuse the encoder embedding when shared; otherwise use
                # a pretrained matrix (fed via a placeholder) or a
                # freshly initialized variable trained from scratch
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                # Pretrained embedding case
                elif self.pretrained_embedding:

                    self.decoder_embeddings = tf.Variable(
                        tf.constant(
                            0.0,
                            shape=(self.target_vocab_size,self.embedding_size)),
                            trainable=True,# allow fine-tuning the pretrained weights
                            name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = self.decoder_embeddings.assign(
                            self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32
                    )
            
            # Output projection of the decoder
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,       # one logit per vocabulary entry
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection'
            )

            if self.mode == 'train':
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train   # placeholder set up at init time
                )
                inputs = self.decoder_inputs_embedded
                
                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))
                
                # TrainingHelper feeds the ground-truth inputs step by
                # step (teacher forcing) during training
                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper'
                )

                # The output_layer is NOT applied here during training:
                # projecting at every time step is slow. Note this trick
                # only works if the scope argument is passed to
                # dynamic_decode.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,  # start from the state built above
                )

                # Maximum number of decoder time steps in this batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)
                
                # Run dynamic decoding
                (outputs,self.final_state,_)\
                 = seq2seq.dynamic_decode(    # dynamic decoding loop
                    decoder=training_decoder,
                    output_time_major=self.time_major, # True: time-major layout; False: batch-major
                    impute_finished=True,         # zero the outputs of already-finished sequences
                    maximum_iterations=max_decoder_length,# max number of tokens the decoder may emit
                    parallel_iterations=self.parallel_iterations,# parallelism of the underlying while_loop
                    swap_memory=True, # on OOM, swap tensors from GPU memory to host memory
                    scope=decoder_scope)
                # Project all time steps through the dense layer in one
                # pass (faster than projecting per step)
                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output     # decoder outputs produced above
                )

                # masks: masking for valid and padded time steps,
                # tf.sequence_mask builds a boolean mask from lengths:
                """
                tf.sequence_mask([1,2], 4)
                -->
                [[ True False False False]
                 [ True  True False False]]
                """
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32, 
                    name='masks'
                )

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(decoder_logits_train,
                                            (1,0,2))
                # Training-time predictions: the argmax over the vocab
                # dimension of the logits gives the predicted token ids
                self.decoder_pred_train = tf.argmax(
                    decoder_logits_train, 
                    axis=-1,
                    name='decoder_pred_train')

                # The variables below support special training schemes:
                # custom rewards are applied by re-weighting the masks
                # train_entropy = cross entropy
                # NOTE(review): labels here are self.decoder_inputs —
                # confirm this tensor holds the shifted decoder targets
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,# targets
                        logits=decoder_logits_train)# predictions

                self.masks_rewards = self.masks * self.rewards
                
                # seq2seq loss: weighted average of the per-time-step
                # cross entropy; positions past each sequence length get
                # zero weight through the mask
                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,    # reward-scaled weights (differ from below)
                    average_across_timesteps=True,  # divide by the total weight
                    average_across_batch=True,     # divide by the batch size
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss_add = self.loss + self.add_loss

            elif self.mode == 'decode':
                # Inference mode (not training)

                # Tile the start token across the batch; reference:
                # https://blog.csdn.net/tsyccnh/article/details/82459859
                start_tokens = tf.tile(
                    [WordSequence.START],
                    [self.batch_size]
                )
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """Input projection wrapper.

                    Maps token ids to their embedding vectors.
                    """
                    return tf.nn.embedding_lookup(
                        self.decoder_embeddings,
                        inputs
                    )
                
                # Without beam search, use a greedy helper and decoder
                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,# start token
                        end_token=end_token,    # end token
                        embedding=embed_and_input_proj  # maps ids to embeddings
                    )
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection
                    )
                else:
                    # Beam-search decoding
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection,
                    )
                
                
                # Pick the maximum decode length
                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # default: decode up to 4x the input length
                    max_decode_step = tf.round(tf.reduce_max(
                        self.encoder_inputs_length) * 4)

                
                (    self.decoder_outputs_decode,# outputs
                    self.final_state,        # final state
                    _ # self.decoder_outputs_length_decode
                ) = seq2seq.dynamic_decode(
                    decoder=inference_decoder,            # carries the chosen decoding strategy
                    output_time_major=self.time_major,
                    # impute_finished=True,	# error occurs
                    maximum_iterations=max_decode_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope
                )
                
                # Greedy path (no beam search):
                # decoder_outputs_decode is a namedtuple with two fields
                # (rnn_outputs, sample_id)
                # rnn_output: [batch_size, decoder_targets_length, vocab_size],
                #     per-step token probabilities, usable for a loss
                # sample_id: [batch_size], tf.int32, the decoded token ids
                
                if not self.use_beamsearch_decode:

                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id # the final answer

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))
                # Beam-search path; reference:
                # https://blog.csdn.net/liuchonge/article/details/79021938
                # decoder_outputs_decode contains (predicted_ids, beam_search_decoder_output)
                # predicted_ids: [batch_size, decoder_targets_length, beam_size], the decoded ids
                # beam_search_decoder_output: BeamSearchDecoderOutput instance namedtuple(scores, predicted_ids, parent_ids)
                # returning predicted_ids (or sample_id) gives the final result
                else:
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode,
                        perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Exemplo n.º 25
0
    def build_decoder(self, encoder_outputs, encoder_state):
        """构建解码器
        """
        with tf.variable_scope('decoder') as decoder_scope:
            # Building decoder_cell and decoder_initial_state
            (self.decoder_cell,
             self.decoder_initial_state) = self.build_decoder_cell(
                 encoder_outputs, encoder_state)

            # 解码器embedding
            with tf.device(_get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrained_embedding:

                    self.decoder_embeddings = tf.Variable(tf.constant(
                        0.0,
                        shape=(self.target_vocab_size, self.embedding_size)),
                                                          trainable=True,
                                                          name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = \
                        self.decoder_embeddings.assign(
                            self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embeddings',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection')

            if self.mode == 'train':
                # decoder_inputs_embedded:
                # [batch_size, max_time_step + 1, embedding_size]
                self.decoder_inputs_embedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)

                # Helper to feed inputs for training:
                # read inputs from dense ground truth vectors
                inputs = self.decoder_inputs_embedded

                if self.time_major:
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper')

                # 训练的时候不在这里应用 output_layer
                # 因为这里会每个 time_step 的进行 output_layer 的投影计算,比较慢
                # 注意这个trick要成功必须设置 dynamic_decode 的 scope 参数
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state,
                    # output_layer=self.decoder_output_projection
                )

                # Maximum decoder time_steps in current batch
                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

                # decoder_outputs_train: BasicDecoderOutput
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_train.rnn_output:
                #     if output_time_major=False:
                #         [batch_size, max_time_step + 1, num_decoder_symbols]
                #     if output_time_major=True:
                #         [max_time_step + 1, batch_size, num_decoder_symbols]
                # decoder_outputs_train.sample_id: [batch_size], tf.int32

                (
                    outputs,
                    self.final_state,  # contain attention
                    _  # self.final_sequence_lengths
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                # More efficient to do the projection
                # on the batch-time-concatenated tensor
                # logits_train:
                # [batch_size, max_time_step + 1, num_decoder_symbols]
                # 训练的时候一次性对所有的结果进行 output_layer 的投影运算
                # 官方NMT库说这样能提高10~20%的速度
                # 实际上我提高的速度会更大
                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output)

                # masks: masking for valid and padded time steps,
                # [batch_size, max_time_step + 1]
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                # Computes per word average cross-entropy over a batch
                # Internally calls
                # 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default

                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(decoder_logits_train,
                                                    axis=-1,
                                                    name='decoder_pred_train')

                # 下面的一些变量用于特殊的学习训练
                # 自定义rewards,其实我这里是修改了masks
                # train_entropy = cross entropy
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,
                        logits=decoder_logits_train)

                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True,
                )

                self.loss_add = self.loss + self.add_loss

            elif self.mode == 'decode':
                # 预测模式,非训练

                start_tokens = tf.tile([WordSequence.START], [self.batch_size])
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    """输入层的投影层wrapper
                    """
                    return tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  inputs)

                if not self.use_beamsearch_decode:
                    # Helper to feed inputs for greedy decoding:
                    # uses the argmax of the output
                    decoding_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_tokens,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    # Basic decoder performs greedy decoding at each time step
                    # print("building greedy decoder..")
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoding_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection)
                else:
                    # Beamsearch is used to approximately
                    # find the most likely translation
                    # print("building beamsearch decoder..")
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection,
                    )

                # For GreedyDecoder, return
                # decoder_outputs_decode: BasicDecoderOutput instance
                #     namedtuple(rnn_outputs, sample_id)
                # decoder_outputs_decode.rnn_output:
                # if output_time_major=False:
                #     [batch_size, max_time_step, num_decoder_symbols]
                # if output_time_major=True
                #     [max_time_step, batch_size, num_decoder_symbols]
                # decoder_outputs_decode.sample_id:
                # if output_time_major=False
                #     [batch_size, max_time_step], tf.int32
                # if output_time_major=True
                #     [max_time_step, batch_size], tf.int32

                # For BeamSearchDecoder, return
                # decoder_outputs_decode: FinalBeamSearchDecoderOutput instance
                #     namedtuple(predicted_ids, beam_search_decoder_output)
                # decoder_outputs_decode.predicted_ids:
                # if output_time_major=False:
                #     [batch_size, max_time_step, beam_width]
                # if output_time_major=True
                #     [max_time_step, batch_size, beam_width]
                # decoder_outputs_decode.beam_search_decoder_output:
                #     BeamSearchDecoderOutput instance
                #     namedtuple(scores, predicted_ids, parent_ids)

                # 官方文档提到的一个潜在的最大长度选择
                # 我这里改为 * 4
                # maximum_iterations = tf.round(tf.reduce_max(source_sequence_length) * 2)
                # https://www.tensorflow.org/tutorials/seq2seq

                if self.max_decode_step is not None:
                    max_decode_step = self.max_decode_step
                else:
                    # 默认 4 倍输入长度的输出解码
                    max_decode_step = tf.round(
                        tf.reduce_max(self.encoder_inputs_length) * 4)

                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _  # self.decoder_outputs_length_decode
                ) = (
                    seq2seq.dynamic_decode(
                        decoder=inference_decoder,
                        output_time_major=self.time_major,
                        # impute_finished=True,	# error occurs
                        maximum_iterations=max_decode_step,
                        parallel_iterations=self.parallel_iterations,
                        swap_memory=True,
                        scope=decoder_scope))

                if not self.use_beamsearch_decode:
                    # decoder_outputs_decode.sample_id:
                    #     [batch_size, max_time_step]
                    # Or use argmax to find decoder symbols to emit:
                    # self.decoder_pred_decode = tf.argmax(
                    #     self.decoder_outputs_decode.rnn_output,
                    #     axis=-1, name='decoder_pred_decode')

                    # Here, we use expand_dims to be compatible with
                    # the result of the beamsearch decoder
                    # decoder_pred_decode:
                    #     [batch_size, max_time_step, 1] (output_major=False)

                    # self.decoder_pred_decode = tf.expand_dims(
                    #     self.decoder_outputs_decode.sample_id,
                    #     -1
                    # )

                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))

                else:
                    # Use beam search to approximately
                    # find the most likely translation
                    # decoder_pred_decode:
                    # [batch_size, max_time_step, beam_width] (output_major=False)
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))

                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode, perm=[0, 2, 1])
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Exemplo n.º 26
0
    def buildModel(self):
        """Build the full seq2seq graph: training and beam-search inference.

        Constructs the placeholders, (optionally pretrained) embedding table,
        a multi-layer BiLSTM encoder, a Luong-attention training decoder, a
        tied beam-search inference decoder, the masked cross-entropy loss,
        an Adam train op with global-norm gradient clipping, and summaries.

        Returns:
            Tuple of (x, y, x_len, y_len, logits, loss, prediction,
            beam_decoder_result_ids, global_step, train_op, summaries).
        """
        # Shorthand for the hyper-parameters carried on self.args.
        T_in = self.args.T_in
        T_out = self.args.T_out
        D_in = self.args.D_in
        D_out = self.args.D_out
        E = self.args.embedding_dim
        H = self.args.hidden_dim
        SOS = self.args.SOS
        EOS = self.args.EOS
        PAD = self.args.PAD
        beam_width = 3

        # Input
        with tf.name_scope('input'):
            # N, T_in: encoder token ids
            x = tf.placeholder(shape=(None, T_in),
                               dtype=tf.int32,
                               name='encoder_inputs')
            # N, T_out
            y = tf.placeholder(shape=(None, T_out),
                               dtype=tf.int32,
                               name='decoder_inputs')
            # N
            x_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # N
            y_len = tf.placeholder(shape=(None, ), dtype=tf.int32)
            # dynamic sample num
            batch_size = tf.shape(x)[0]

            # symbol mask: one column of SOS/EOS/PAD ids per sample
            sos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * SOS
            eos = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * EOS
            pad = tf.ones(shape=(batch_size, 1), dtype=tf.int32) * PAD

            # input mask
            x_mask = tf.sequence_mask(x_len, T_in, dtype=tf.float32)
            y_with_sos_mask = tf.sequence_mask(y_len,
                                               T_out + 1,
                                               dtype=tf.float32)
            y_with_pad = tf.concat([y, pad], axis=1)
            # Place an EOS id exactly at position y_len of each row.
            eos_mask = tf.one_hot(y_len, depth=T_out + 1, dtype=tf.int32) * EOS

            # masked inputs: decoder targets end with EOS, decoder inputs
            # start with SOS (standard teacher-forcing shift).
            y_with_eos = y_with_pad + eos_mask
            y_with_sos = tf.concat([sos, y], axis=1)

        ## Embedding
        with tf.name_scope('embedding'):
            if self.args.use_pretrained:
                # Pretrained matrix is loaded frozen (trainable=False).
                embedding_pretrained = np.fromfile(self.args.pretrained_file,
                                                   dtype=np.float32).reshape(
                                                       (-1, E))
                embedding = tf.Variable(embedding_pretrained, trainable=False)
            else:
                embedding = tf.get_variable(name='embedding',
                                            shape=(D_in, E),
                                            dtype=tf.float32,
                                            initializer=xavier_initializer())
            e_x = tf.nn.embedding_lookup(embedding, x)
            e_y = tf.nn.embedding_lookup(embedding, y_with_sos)
            if self.args.mode == 'train':
                # Dropout only on encoder inputs, only at training time.
                e_x = tf.nn.dropout(e_x, self.args.keep_prob)

        ## Encoder
        with tf.name_scope('encoder'):
            ## Multi-BiLSTM
            fw_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            bw_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            bi_encoder_output, bi_encoder_state = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                e_x,
                sequence_length=x_len,
                dtype=tf.float32,
                time_major=False,
                scope=None)
            # Merge directions by summation (keeps dimension H).
            encoder_output = bi_encoder_output[0] + bi_encoder_output[1]
            # NOTE(review): only the forward-direction final state is kept
            # as the decoder's initial state — confirm this is intended.
            encoder_final_state = bi_encoder_state[0]

        ## Decoder
        with tf.name_scope('decoder'):
            decoder_cell = rnn.MultiRNNCell([
                rnn.BasicLSTMCell(num_units=H)
                for i in range(self.args.layer_size)
            ])
            # Every sample decodes for the full T_out + 1 steps.
            decoder_lengths = tf.ones(shape=[batch_size],
                                      dtype=tf.int32) * (T_out + 1)

            ## Trainning decoder
            with tf.variable_scope('attention'):
                attention_mechanism = LuongAttention(
                    num_units=H,
                    memory=encoder_output,
                    memory_sequence_length=x_len,
                    name='attention_fn')
            projection_layer = Dense(units=D_out,
                                     kernel_initializer=xavier_initializer())

            train_decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=H)
            train_decoder_init_state = train_decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_final_state)
            training_helper = TrainingHelper(e_y,
                                             decoder_lengths,
                                             time_major=False)
            train_decoder = BasicDecoder(
                cell=train_decoder_cell,
                helper=training_helper,
                initial_state=train_decoder_init_state,
                output_layer=projection_layer)
            train_decoder_outputs, _, _ = dynamic_decode(
                train_decoder,
                impute_finished=True,
                maximum_iterations=T_out + 1)
            # N, T_out+1, D_out
            # `ln` is presumably a layer-normalization helper defined
            # elsewhere in this file — TODO confirm.
            train_decoder_outputs = ln(train_decoder_outputs.rnn_output)

            ## Beam_search decoder
            # Encoder results must be tiled beam_width times for beam search.
            beam_memory = tile_batch(encoder_output, beam_width)
            beam_memory_state = tile_batch(encoder_final_state, beam_width)
            beam_memory_length = tile_batch(x_len, beam_width)

            with tf.variable_scope('attention', reuse=True):
                beam_attention_mechanism = LuongAttention(
                    num_units=H,
                    memory=beam_memory,
                    memory_sequence_length=beam_memory_length,
                    name='attention_fn')
            # NOTE(review): attention_layer_size=None here but H in the
            # training wrapper, so the inference cell's output layout
            # differs from training — verify this mismatch is intentional.
            beam_decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=beam_attention_mechanism,
                attention_layer_size=None)
            beam_decoder_init_state = beam_decoder_cell.zero_state(
                batch_size=batch_size * beam_width,
                dtype=tf.float32).clone(cell_state=beam_memory_state)
            start_tokens = tf.ones((batch_size), dtype=tf.int32) * SOS
            beam_decoder = BeamSearchDecoder(
                cell=beam_decoder_cell,
                embedding=embedding,
                start_tokens=start_tokens,
                end_token=EOS,
                initial_state=beam_decoder_init_state,
                beam_width=beam_width,
                output_layer=projection_layer)
            beam_decoder_outputs, _, _ = dynamic_decode(
                beam_decoder,
                scope=tf.get_variable_scope(),
                maximum_iterations=T_out + 1)
            beam_decoder_result_ids = beam_decoder_outputs.predicted_ids

        with tf.name_scope('loss'):
            # NOTE: despite the name, `logits` holds *probabilities*
            # (softmax already applied); sparse_categorical_crossentropy
            # defaults to from_logits=False, so the two are consistent.
            logits = tf.nn.softmax(train_decoder_outputs)
            cross_entropy = tf.keras.losses.sparse_categorical_crossentropy(
                y_with_eos, logits)
            # Mask covers y_len real tokens plus the appended EOS.
            loss_mask = tf.sequence_mask(y_len + 1,
                                         T_out + 1,
                                         dtype=tf.float32)
            # Sum over tokens, averaged over the batch (not over tokens).
            loss = tf.reduce_sum(cross_entropy * loss_mask) / tf.cast(
                batch_size, dtype=tf.float32)
            prediction = tf.argmax(logits, 2)

        ## train_op
        with tf.name_scope('train'):
            global_step = tf.train.get_or_create_global_step()
            # Noam warmup/decay schedule for the learning rate.
            lr = noam_scheme(self.args.lr, global_step, self.args.warmup_steps)
            optimizer = tf.train.AdamOptimizer(lr)

            ## gradient clips
            trainable_params = tf.trainable_variables()
            gradients = tf.gradients(loss, trainable_params)
            clip_gradients, _ = tf.clip_by_global_norm(
                gradients, self.args.gradient_clip_num)
            train_op = optimizer.apply_gradients(zip(clip_gradients,
                                                     trainable_params),
                                                 global_step=global_step)

        # Summary
        with tf.name_scope('summary'):
            tf.summary.scalar('lr', lr)
            tf.summary.scalar('loss', loss)
            tf.summary.scalar('global_step', global_step)
            summaries = tf.summary.merge_all()
        return x, y, x_len, y_len, logits, loss, prediction, beam_decoder_result_ids, global_step, train_op, summaries
Exemplo n.º 27
0
    def build_model(self):
        """Build the attention seq2seq graph for training or inference.

        Creates placeholders, the shared embedding, an encoder (via
        build_encoder), a Bahdanau-attention decoder, and then — depending
        on self.mode — either the training loss/train op ('train') or a
        greedy/beam-search inference decoder ('infer'). A Saver is created
        in both modes.
        """
        print('building model... ...')
        with tf.variable_scope('seq2seq_placeholder'):
            self.encoder_inputs = tf.placeholder(tf.int32, [None, None],
                                                 name="encoder_inputs")
            self.decoder_inputs = tf.placeholder(tf.int32, [None, None],
                                                 name="decoder_inputs")
            self.decoder_targets = tf.placeholder(tf.int32, [None, None],
                                                  name="decoder_targets")
            # Float mask weighting each target position in the loss.
            self.decoder_targets_masks = tf.placeholder(tf.float32,
                                                        [None, None],
                                                        name="mask")
            self.encoder_length = tf.placeholder(tf.int32, [None],
                                                 name="encoder_length")
            self.decoder_length = tf.placeholder(tf.int32, [None],
                                                 name="decoder_length")
            self.max_target_sequence_length = tf.reduce_max(
                self.decoder_length, name='max_target_len')

        with tf.variable_scope('seq2seq_embedding'):
            self.embedding = self.init_embedding(self.vocab_size,
                                                 self.embedding_size)

        with tf.variable_scope('seq2seq_encoder'):
            encoder_outputs, encoder_states = build_encoder(
                self.embedding,
                self.encoder_inputs,
                self.encoder_length,
                self.enc_num_layers,
                self.enc_num_units,
                self.enc_cell_type,
                bidir=self.enc_bidir)

        with tf.variable_scope('seq2seq_decoder'):
            encoder_length = self.encoder_length
            if self.beam_search:
                print("use beamsearch decoding..")
                # Tile encoder results beam_size times so each beam has
                # its own copy of the attention memory.
                encoder_outputs = tile_batch(encoder_outputs,
                                             multiplier=self.beam_size)
                encoder_states = tile_batch(encoder_states,
                                            multiplier=self.beam_size)
                encoder_length = tile_batch(encoder_length,
                                            multiplier=self.beam_size)

            attention_mechanism = BahdanauAttention(
                num_units=self.attn_num_units,
                memory=encoder_outputs,
                memory_sequence_length=encoder_length)

            decoder_cell = create_rnn_cell(self.dec_num_layers,
                                           self.dec_num_units,
                                           self.dec_cell_type)
            decoder_cell = AttentionWrapper(
                cell=decoder_cell,
                attention_mechanism=attention_mechanism,
                attention_layer_size=self.dec_num_units,
                name='Attention_Wrapper')

            # With beam search the effective batch is batch_size * beam_size.
            batch_size = self.batch_size if not self.beam_search else self.batch_size * self.beam_size

            decoder_initial_state = decoder_cell.zero_state(
                batch_size=batch_size,
                dtype=tf.float32).clone(cell_state=encoder_states)

            # Projection from decoder output to vocabulary logits.
            output_layer = tf.layers.Dense(self.vocab_size,
                                           use_bias=False,
                                           name='output_projection')

            if self.mode == 'train':
                decoder_inputs_embedded = tf.nn.embedding_lookup(
                    self.embedding, self.decoder_inputs)
                # TrainingHelper feeds the given decoder inputs at each
                # time step (teacher forcing) rather than the previous
                # step's own output.
                training_helper = tf.contrib.seq2seq.TrainingHelper(
                    inputs=decoder_inputs_embedded,
                    sequence_length=self.decoder_length,
                    name='training_helper')

                training_decoder = tf.contrib.seq2seq.BasicDecoder(
                    cell=decoder_cell,
                    helper=training_helper,
                    initial_state=decoder_initial_state,
                    output_layer=output_layer)

                decoder_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    impute_finished=True,
                    maximum_iterations=self.max_target_sequence_length)

                self.decoder_logits_train = decoder_outputs.rnn_output

                # Masked, averaged cross-entropy over target tokens.
                self.loss = tf.contrib.seq2seq.sequence_loss(
                    logits=self.decoder_logits_train,
                    targets=self.decoder_targets,
                    weights=self.decoder_targets_masks)

                optimizer = tf.train.AdamOptimizer(self.learning_rate)
                trainable_params = tf.trainable_variables()
                gradients = tf.gradients(self.loss, trainable_params)
                clip_gradients, _ = tf.clip_by_global_norm(
                    gradients, self.max_gradient_norm)
                self.train_op = optimizer.apply_gradients(
                    zip(clip_gradients, trainable_params))

            elif self.mode == 'infer':
                start_tokens = tf.ones([
                    self.batch_size,
                ], tf.int32) * SOS_ID  # batch_size here must NOT be tiled; BeamSearchDecoder tiles internally
                end_token = EOS_ID

                if self.beam_search:
                    inference_decoder = BeamSearchDecoder(
                        cell=decoder_cell,
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token,
                        initial_state=decoder_initial_state,
                        beam_width=self.beam_size,
                        output_layer=output_layer)
                else:
                    decoding_helper = GreedyEmbeddingHelper(
                        embedding=self.embedding,
                        start_tokens=start_tokens,
                        end_token=end_token)

                    inference_decoder = BasicDecoder(
                        cell=decoder_cell,
                        helper=decoding_helper,
                        initial_state=decoder_initial_state,
                        output_layer=output_layer)

                decoder_outputs, _, _ = dynamic_decode(
                    decoder=inference_decoder,
                    maximum_iterations=self.infer_max_iter)
                if self.beam_search:
                    infer_outputs = decoder_outputs.predicted_ids  # [batch_size, decoder_targets_length, beam_size]
                    self.infer_outputs = tf.transpose(
                        infer_outputs,
                        [0, 2, 1
                         ])  # [batch_size, beam_size, decoder_targets_length]
                else:
                    self.infer_outputs = decoder_outputs.sample_id  # [batch_size, decoder_targets_length]

        self.saver = tf.train.Saver(tf.global_variables(),
                                    max_to_keep=self.max_to_keep)
Exemplo n.º 28
0
    def _test_beam_search(self,
                          decoder,
                          initial_state=None,
                          tiled_initial_state=None,
                          tf_initial_state=None,
                          beam_width_1=1,
                          initiated=False):
        """Check beam_search_decode against TF's built-in BeamSearchDecoder.

        Runs the library's beam_search_decode and a hand-built
        tf.contrib.seq2seq.BeamSearchDecoder with the same cell/embedding,
        and asserts their outputs and final states match element-wise.
        Also verifies output shapes in batch-major and time-major mode.
        """
        # Compare with tf built-in BeamSearchDecoder
        outputs, final_state, _ = beam_search_decode(decoder_or_cell=decoder,
                                                     embedding=self._embedding,
                                                     start_tokens=[1] *
                                                     self._batch_size,
                                                     end_token=2,
                                                     beam_width=beam_width_1,
                                                     max_decoding_length=20)

        self.assertIsInstance(outputs,
                              tf.contrib.seq2seq.FinalBeamSearchDecoderOutput)
        self.assertIsInstance(final_state,
                              tf.contrib.seq2seq.BeamSearchDecoderState)

        # A second (greedy) decode must reuse variables, i.e. it must not
        # create any new trainable variables.
        num_trainable_variables = len(tf.trainable_variables())
        _ = decoder(decoding_strategy='infer_greedy',
                    embedding=self._embedding,
                    start_tokens=[1] * self._batch_size,
                    end_token=2,
                    max_decoding_length=20)
        self.assertEqual(num_trainable_variables,
                         len(tf.trainable_variables()))

        if tf_initial_state is None:
            tf_initial_state = decoder.cell.zero_state(
                self._batch_size * beam_width_1, tf.float32)
        beam_decoder = BeamSearchDecoder(cell=decoder.cell,
                                         embedding=self._embedding,
                                         start_tokens=[1] * self._batch_size,
                                         end_token=2,
                                         initial_state=tf_initial_state,
                                         beam_width=beam_width_1,
                                         output_layer=decoder.output_layer)

        outputs_1, final_state_1, _ = dynamic_decode(decoder=beam_decoder,
                                                     maximum_iterations=20)

        ## Tests time major
        outputs_2, _, _ = beam_search_decode(
            decoder_or_cell=decoder,
            embedding=self._embedding,
            start_tokens=[1] * self._batch_size,
            end_token=2,
            beam_width=self._beam_width,
            initial_state=initial_state,
            tiled_initial_state=tiled_initial_state,
            max_decoding_length=21)
        outputs_3, _, _ = beam_search_decode(
            decoder_or_cell=decoder,
            embedding=self._embedding,
            start_tokens=[1] * self._batch_size,
            end_token=2,
            beam_width=self._beam_width,
            initial_state=initial_state,
            tiled_initial_state=tiled_initial_state,
            max_decoding_length=21,
            output_time_major=True)

        with self.test_session() as sess:
            if not initiated:
                sess.run(tf.global_variables_initializer())

            outputs_, final_state_, outputs_1_, final_state_1_ = sess.run(
                [outputs, final_state, outputs_1, final_state_1],
                feed_dict={
                    context.global_mode(): tf.estimator.ModeKeys.PREDICT
                })

            # Library output must match TF's decoder exactly, field by field.
            np.testing.assert_array_equal(outputs_.predicted_ids,
                                          outputs_1_.predicted_ids)
            np.testing.assert_array_equal(
                outputs_.beam_search_decoder_output.scores,
                outputs_1_.beam_search_decoder_output.scores)
            np.testing.assert_array_equal(
                outputs_.beam_search_decoder_output.predicted_ids,
                outputs_1_.beam_search_decoder_output.predicted_ids)
            np.testing.assert_array_equal(
                outputs_.beam_search_decoder_output.parent_ids,
                outputs_1_.beam_search_decoder_output.parent_ids)
            np.testing.assert_array_equal(final_state_.log_probs,
                                          final_state_1_.log_probs)
            np.testing.assert_array_equal(final_state_.lengths,
                                          final_state_1_.lengths)

            outputs_2_, outputs_3_ = sess.run([outputs_2, outputs_3],
                                              feed_dict={
                                                  context.global_mode():
                                                  tf.estimator.ModeKeys.PREDICT
                                              })
            # NOTE(review): the hard-coded 11 presumably equals
            # self._beam_width from the test fixture — confirm.
            self.assertEqual(outputs_2_.predicted_ids.shape,
                             tuple([self._batch_size, 21, 11]))
            self.assertEqual(outputs_3_.predicted_ids.shape,
                             tuple([21, self._batch_size, 11]))
Exemplo n.º 29
0
    def build_decoder(self, encoder_outputs, encoder_state):
        """Build the decoder sub-graph.

        In 'train' mode, wires a teacher-forcing (TrainingHelper) decoder
        plus the plain and reward-weighted sequence losses. In 'decode'
        mode, wires either a greedy or a beam-search inference decoder and
        exposes predictions as self.decoder_pred_decode.

        Args:
            encoder_outputs: encoder outputs fed to build_decoder_cell.
            encoder_state: final encoder state used to derive the decoder's
                initial state.
        """
        with tf.variable_scope('decoder') as decoder_scope:
            (
                self.decoder_cell,
                self.decoder_initial_state
            ) = self.build_decoder_cell(encoder_outputs, encoder_state)

            # Decoder embedding table: shared with the encoder, loaded from
            # a pretrained matrix via a placeholder/assign pair, or trained
            # from scratch.
            with tf.device(_get_embed_device(self.target_vocab_size)):
                if self.share_embedding:
                    self.decoder_embeddings = self.encoder_embeddings
                elif self.pretrained_embedding:
                    self.decoder_embeddings = tf.Variable(
                        tf.constant(0.0,
                                    shape=(self.target_vocab_size,
                                           self.embedding_size)),
                        trainable=True,
                        name='embeddings')
                    self.decoder_embeddings_placeholder = tf.placeholder(
                        tf.float32,
                        (self.target_vocab_size, self.embedding_size))
                    self.decoder_embeddings_init = \
                        self.decoder_embeddings.assign(
                            self.decoder_embeddings_placeholder)
                else:
                    self.decoder_embeddings = tf.get_variable(
                        name='embedding',
                        shape=(self.target_vocab_size, self.embedding_size),
                        initializer=self.initializer,
                        dtype=tf.float32)

            # Projection from decoder hidden state to target vocabulary.
            self.decoder_output_projection = layers.Dense(
                self.target_vocab_size,
                dtype=tf.float32,
                use_bias=False,
                name='decoder_output_projection')

            if self.mode == 'train':
                # Attribute keeps the original (misspelled) name for
                # backward compatibility with existing callers.
                self.decoder_inputs_embdedded = tf.nn.embedding_lookup(
                    params=self.decoder_embeddings,
                    ids=self.decoder_inputs_train)

                inputs = self.decoder_inputs_embdedded
                if self.time_major:
                    # TrainingHelper expects [time, batch, dim] when
                    # time_major=True.
                    inputs = tf.transpose(inputs, (1, 0, 2))

                training_helper = seq2seq.TrainingHelper(
                    inputs=inputs,
                    sequence_length=self.decoder_inputs_length,
                    time_major=self.time_major,
                    name='training_helper')

                # No output_layer here: the projection is applied manually
                # below so the raw rnn_output stays available.
                training_decoder = seq2seq.BasicDecoder(
                    cell=self.decoder_cell,
                    helper=training_helper,
                    initial_state=self.decoder_initial_state)

                max_decoder_length = tf.reduce_max(self.decoder_inputs_length)

                (
                    outputs,
                    self.final_state,
                    _
                ) = seq2seq.dynamic_decode(
                    decoder=training_decoder,
                    output_time_major=self.time_major,
                    impute_finished=True,
                    maximum_iterations=max_decoder_length,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                self.decoder_logits_train = self.decoder_output_projection(
                    outputs.rnn_output)

                # 1.0 for real target positions, 0.0 for padding.
                self.masks = tf.sequence_mask(
                    lengths=self.decoder_inputs_length,
                    maxlen=max_decoder_length,
                    dtype=tf.float32,
                    name='masks')

                # Losses are computed in batch-major layout.
                decoder_logits_train = self.decoder_logits_train
                if self.time_major:
                    decoder_logits_train = tf.transpose(
                        decoder_logits_train, (1, 0, 2))

                self.decoder_pred_train = tf.argmax(
                    decoder_logits_train, axis=-1, name='decoder_pred_train')

                # Per-token cross entropy, kept unreduced for inspection.
                self.train_entropy = \
                    tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=self.decoder_inputs,
                        logits=decoder_logits_train)

                # Reward-weighted mask for policy-gradient style training.
                self.masks_rewards = self.masks * self.rewards

                self.loss_rewards = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks_rewards,
                    average_across_timesteps=True,
                    average_across_batch=True)

                self.loss = seq2seq.sequence_loss(
                    logits=decoder_logits_train,
                    targets=self.decoder_inputs,
                    weights=self.masks,
                    average_across_timesteps=True,
                    average_across_batch=True)

                # NOTE(review): assumes self.add_loss was initialized
                # before build_decoder runs — confirm in __init__.
                self.add_loss = self.loss + self.add_loss

            elif self.mode == 'decode':
                start_token = tf.tile(
                    [WordSequence.START],
                    [self.batch_size])
                end_token = WordSequence.END

                def embed_and_input_proj(inputs):
                    # Embedding lookup used by both inference decoders.
                    return tf.nn.embedding_lookup(self.decoder_embeddings,
                                                  inputs)

                if not self.use_beamsearch_decode:
                    decoder_helper = seq2seq.GreedyEmbeddingHelper(
                        start_tokens=start_token,
                        end_token=end_token,
                        embedding=embed_and_input_proj)
                    inference_decoder = seq2seq.BasicDecoder(
                        cell=self.decoder_cell,
                        helper=decoder_helper,
                        initial_state=self.decoder_initial_state,
                        output_layer=self.decoder_output_projection)
                else:
                    inference_decoder = BeamSearchDecoder(
                        cell=self.decoder_cell,
                        embedding=embed_and_input_proj,
                        start_tokens=start_token,
                        end_token=end_token,
                        initial_state=self.decoder_initial_state,
                        beam_width=self.beam_width,
                        output_layer=self.decoder_output_projection)

                if self.max_decode_step is not None:
                    max_decoder_step = self.max_decode_step
                else:
                    # Default: decode up to 4x the longest input length.
                    max_decoder_step = tf.round(tf.reduce_max(
                        self.encoder_inputs_length) * 4)

                # BUG FIX: dynamic_decode returns three values
                # (outputs, final_state, final_sequence_lengths); the
                # original two-element unpack raised a ValueError.
                (
                    self.decoder_outputs_decode,
                    self.final_state,
                    _
                ) = seq2seq.dynamic_decode(
                    decoder=inference_decoder,
                    output_time_major=self.time_major,
                    maximum_iterations=max_decoder_step,
                    parallel_iterations=self.parallel_iterations,
                    swap_memory=True,
                    scope=decoder_scope)

                if not self.use_beamsearch_decode:
                    # BUG FIX: the original read self.decoder_pred_decode
                    # before it was ever assigned (and stored the result in
                    # decoder_pred_train). Greedy predictions come from
                    # sample_id: [batch, time], or [time, batch] when
                    # time-major, which we transpose back to batch-major.
                    dod = self.decoder_outputs_decode
                    self.decoder_pred_decode = dod.sample_id
                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0))
                else:
                    # predicted_ids: [batch, time, beam] when batch-major.
                    self.decoder_pred_decode = \
                        self.decoder_outputs_decode.predicted_ids

                    if self.time_major:
                        self.decoder_pred_decode = tf.transpose(
                            self.decoder_pred_decode, (1, 0, 2))
                    # Reorder to [batch, beam, time] for downstream use.
                    self.decoder_pred_decode = tf.transpose(
                        self.decoder_pred_decode,
                        perm=[0, 2, 1])
                    # BUG FIX: beam scores live on the decode *outputs*
                    # namedtuple, not on the transposed prediction tensor
                    # the original dereferenced.
                    dod = self.decoder_outputs_decode
                    self.beam_prob = dod.beam_search_decoder_output.scores
Exemplo n.º 30
0
 def __init__(self,
              vocab_size,
              learning_rate,
              encoder_size,
              max_length,
              embedding_size,
              sos_token,
              eos_token,
              unk_token,
              beam_size=5):
     """Build a dual-encoder GRU seq2seq graph (TF1 graph mode).

     Two GRU encoders read `query` and `reply`; their final states are
     concatenated to form the decoder's initial state.  The constructor
     wires three decode paths over the same decoder weights:
       * teacher-forced training (`self.cost`, `self.optimizer`),
       * step-wise greedy generation (`self.generate_outputs`),
       * beam-search generation (`self.bs_outputs`).

     Args:
         vocab_size: vocabulary size shared by encoders and decoder.
         learning_rate: Adam learning rate.
         encoder_size: hidden size of each GRU encoder; the decoder is
             built with `2 * encoder_size` because the two encoder final
             states are concatenated.
         max_length: fixed number of decoder time steps (train/generate).
         embedding_size: word-embedding dimensionality.
         sos_token: start-of-sequence id (beam-search start token).
         eos_token: end-of-sequence id (beam-search end token).
         unk_token: unknown-word id (stored only; not used below).
         beam_size: beam width for the beam-search path.
     """
     self.vocab_size = vocab_size
     self.lr = learning_rate
     self.encoder_size = encoder_size
     self.max_length = max_length
     self.embedding_size = embedding_size
     self.SOS_token = sos_token
     self.EOS_token = eos_token
     self.UNK_token = unk_token
     self.beam_search_size = beam_size
     with tf.variable_scope('placeholder_and_embedding'):
         # Sequence placeholders are (batch, time) int64 id matrices;
         # the matching *_length placeholders hold (batch,) true lengths.
         self.query = tf.placeholder(shape=(None, None), dtype=tf.int64)
         self.query_length = tf.placeholder(shape=(None, ), dtype=tf.int64)
         self.reply = tf.placeholder(shape=(None, None), dtype=tf.int64)
         self.reply_length = tf.placeholder(shape=(None, ), dtype=tf.int64)
         # Decoder-side tensors are fixed-width (batch, max_length).
         self.decoder_inputs = tf.placeholder(shape=(None, self.max_length),
                                              dtype=tf.int64)
         self.decoder_target = tf.placeholder(shape=(None, self.max_length),
                                              dtype=tf.int64)
         self.decoder_length = tf.placeholder(shape=(None, ),
                                              dtype=tf.int64)
         # Scalar batch size; needed to build beam-search start tokens.
         self.batch_size = tf.placeholder(shape=(), dtype=tf.int32)
         self.embedding_pl = tf.placeholder(dtype=tf.float32,
                                            shape=(self.vocab_size,
                                                   embedding_size),
                                            name='embedding_source_pl')
         # Embeddings are frozen (trainable=False) and loaded externally:
         # feed `embedding_pl` and run `self.init_embedding` once.
         word_embedding = tf.get_variable(name='word_embedding',
                                          shape=(self.vocab_size,
                                                 embedding_size),
                                          dtype=tf.float32,
                                          trainable=False)
         self.init_embedding = word_embedding.assign(self.embedding_pl)
     with tf.variable_scope("query_encoder"):
         # Distinct variable scopes give the two encoders separate weights,
         # though both share the same frozen embedding table.
         self.query_encoder = deep_components.gru_encoder(
             word_embedding, self.encoder_size)
         query_out, query_state = self.query_encoder(
             seq_index=self.query, seq_len=self.query_length)
     with tf.variable_scope("reply_encoder"):
         self.reply_encoder = deep_components.gru_encoder(
             word_embedding, self.encoder_size)
         reply_out, reply_state = self.reply_encoder(
             seq_index=self.reply, seq_len=self.reply_length)
     with tf.variable_scope("decoder"):
         # Decoder state width is 2*encoder_size to match the concatenated
         # encoder final states below.
         self.decoder = deep_components.decoder(word_embedding,
                                                self.encoder_size * 2,
                                                self.vocab_size)
     with tf.variable_scope("seq2seq-train"):
         # Teacher forcing: feed the ground-truth token at every step.
         encoder_state = tf.concat([query_state, reply_state], axis=1)
         decoder_outputs = []
         decoder_state = encoder_state
         for i in range(0, self.max_length):
             word_indices = self.decoder_inputs[:, i]
             decoder_out, decoder_state = self.decoder(
                 word_indices, decoder_state)
             decoder_outputs.append(decoder_out)  # per step: (b, 1, vocab)
         decoder_outputs = tf.concat(decoder_outputs,
                                     1)  # (b, max_length, vocab)
     with tf.variable_scope("cost"):
         # Mask positions beyond each sequence's true length so padding
         # contributes no loss.
         decoder_target_mask = tf.sequence_mask(self.decoder_length,
                                                maxlen=self.max_length,
                                                dtype=tf.float32)
         self.cost = sequence_loss(decoder_outputs, self.decoder_target,
                                   decoder_target_mask)
         self.optimizer = tf.train.AdamOptimizer(
             learning_rate=self.lr).minimize(self.cost)
     with tf.variable_scope("seq2seq-generate"):
         # Greedy decoding: feed the argmax token back in at each step.
         self.generate_outputs = []
         decoder_state = encoder_state
         word_indices = self.decoder_inputs[:, 0]  # SOS column
         for i in range(0, self.max_length):
             decoder_out, decoder_state = self.decoder(
                 word_indices, decoder_state)
             # NOTE(review): slicing [:, 0, :] assumes decoder_out is
             # (b, 1, vocab) — consistent with the axis-1 concat in the
             # training path, but confirm against deep_components.decoder.
             softmax_out = tf.nn.softmax(decoder_out[:, 0, :])
             # NOTE(review): tf.arg_max is the deprecated alias of tf.argmax.
             word_indices = tf.cast(tf.arg_max(softmax_out, -1),
                                    dtype=tf.int64)  # shape (b,)
             self.generate_outputs.append(
                 tf.expand_dims(word_indices, axis=1))
         self.generate_outputs = tf.concat(self.generate_outputs,
                                           1)  # (b, max_length)
     with tf.variable_scope("seq2seq_beam_search_generate"):
         # Beam search: the initial state must be tiled beam_width times;
         # BeamSearchDecoder tiles start_tokens internally, so those use
         # the un-tiled batch size.
         tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
             encoder_state, multiplier=self.beam_search_size)
         start_tokens = tf.ones([
             self.batch_size,
         ], tf.int32) * self.SOS_token
         bs_decoder = BeamSearchDecoder(
             self.decoder.gru_cell,
             word_embedding,
             start_tokens=start_tokens,
             end_token=self.EOS_token,
             initial_state=tiled_encoder_final_state,
             beam_width=self.beam_search_size,
             output_layer=self.decoder.out_layer)
         # dynamic_decode returns (outputs, final_state, sequence_lengths);
         # only the outputs (e.g. predicted_ids) are kept.
         self.bs_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(
             decoder=bs_decoder, maximum_iterations=self.max_length)