Example #1
        def symbols_to_logits_fn(ids, i, cache):
            decoder_input = ids[:, -1:]
            decoder_input = self.decoder_embedding_layer(
                decoder_input, not ModeKeys.is_predict_one(self.mode))
            # At step 0 in predict-one mode, zero the embedding so the first
            # decoder input acts as the start-of-sequence token.
            if ModeKeys.is_predict_one(self.mode):
                decoder_input = decoder_input * (
                    1 - tf.to_float(tf.equal(i, 0)))

            # decoder_input += timing_signal[i:i + 1]
            slice_pos_encoding = tf.slice(timing_signal, [i, 0],
                                          [1, self.params.hidden_size],
                                          name='slice_pos_encoding')
            decoder_input += slice_pos_encoding

            if decoder_self_attention_bias is None:
                self_attention_bias = None
            else:
                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.decoder_softmax_layer.linear(decoder_outputs)
            # logits = tf.squeeze(logits, axis=[1])
            logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
            return logits, cache
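
Note: `ModeKeys.is_predict_one` gates most of the branches in these examples, but the class itself is not included in the snippets. The sketch below is a hypothetical reconstruction based only on the mode names that do appear (TRAIN, PREDICT_ONE_ENCODER, PREDICT_ONE_DECODER); the constant values are assumptions.

class ModeKeys(object):
    TRAIN = "train"
    EVAL = "eval"
    PREDICT = "predict"
    PREDICT_ONE_ENCODER = "predict_one_encoder"  # export an encoder-only graph for one sentence
    PREDICT_ONE_DECODER = "predict_one_decoder"  # export a single-step decoder graph

    @staticmethod
    def is_predict_one(mode):
        # True only for the two single-sentence export modes.
        return mode in (ModeKeys.PREDICT_ONE_ENCODER, ModeKeys.PREDICT_ONE_DECODER)
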
Example #2
    def encode(self, inputs, attention_bias):
        with tf.name_scope("encode"):
            embedded_inputs = self.encoder_embedding_layer(
                inputs, not ModeKeys.is_predict_one(self.mode))
            if ModeKeys.is_predict_one(self.mode):
                inputs_padding = None
            else:
                inputs_padding = model_utils.get_padding(inputs)

            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(embedded_inputs)[1]
                if ModeKeys.is_predict_one(self.mode):
                    pos_encoding = model_utils.get_position_encoding(
                        self.params.max_length, self.params.hidden_size)
                    pos_encoding = tf.slice(pos_encoding, [0, 0],
                                            [length, self.params.hidden_size],
                                            name='slice_pos_encoding')
                else:
                    pos_encoding = model_utils.get_position_encoding(
                        length, self.params.hidden_size)

                encoder_inputs = embedded_inputs + pos_encoding

            if self.is_train:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs, 1 - self.params.layer_postprocess_dropout)

            return self.encoder_stack(encoder_inputs, attention_bias,
                                      inputs_padding)
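
The examples call `model_utils.get_position_encoding` but never show it. A minimal sketch, assuming the standard sinusoidal encoding used by reference Transformer implementations (the repository's version may differ in timescale defaults):

import math
import tensorflow as tf

def get_position_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
    """Sinusoidal position encoding of shape [length, hidden_size] (sketch)."""
    position = tf.to_float(tf.range(length))
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.to_float(num_timescales) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    # First half of the channels get sin, second half cos.
    return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
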
Example #3
    def argmax_predict(self, encoder_outputs, encoder_decoder_attention_bias):
        if ModeKeys.is_predict_one(self.mode):
            batch_size = 1
        else:
            batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]
        max_decode_length = input_length + self.params.extra_decode_length

        symbols_to_logits_fn = self._get_symbols_to_logits_fn(
            max_decode_length)

        cache = {
            "layer_%d" % layer: {
                "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
                "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
            }
            for layer in range(self.params.num_hidden_layers)
        }

        cache["encoder_outputs"] = encoder_outputs
        if not ModeKeys.is_predict_one(self.mode):
            cache[
                "encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

        if self.params.beam_size > 1:
            pass
        else:

            def inner_loop(i, finished, next_id, decoded_ids, cache):
                logits, cache = symbols_to_logits_fn(
                    next_id, i, cache)  # [batch, vocab_size]
                next_id = tf.argmax(logits, -1, output_type=tf.int32)
                finished |= tf.equal(next_id, EOS_ID)
                next_id = tf.reshape(next_id, shape=[-1, 1])
                decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
                return i + 1, finished, next_id, decoded_ids, cache

            def is_not_finished(i, finished, _1, _2, _3):
                return (i < max_decode_length) & tf.logical_not(
                    tf.reduce_all(finished))

            decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
            finished = tf.fill([batch_size], False)
            next_id = tf.zeros([batch_size, 1], dtype=tf.int32)

            _, _, _, decoded_ids, _ = tf.while_loop(
                cond=is_not_finished,
                body=inner_loop,
                loop_vars=[
                    tf.constant(0), finished, next_id, decoded_ids, cache
                ],
                shape_invariants=[
                    tf.TensorShape([]),
                    tf.TensorShape([None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    nest.map_structure(get_state_shape_invariants, cache),
                ])
            return decoded_ids
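
`get_state_shape_invariants` (passed to `nest.map_structure` above) is not shown. A minimal sketch, assuming its purpose is to mark every dimension that grows during decoding as unknown so the k/v caches can change shape across `tf.while_loop` iterations:

import tensorflow as tf

def get_state_shape_invariants(tensor):
    """Shape invariant that leaves all but the first and last dimensions unknown (sketch)."""
    shape = tensor.shape.as_list()
    for i in range(1, len(shape) - 1):
        shape[i] = None
    return tf.TensorShape(shape)
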
Example #4
        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]  # [batch, 1]

            # decoder_input = ids[:, :]     # [batch, 1]
            # print("decoder_input:", decoder_input.shape)

            # Preprocess decoder input by getting embeddings and adding timing signal.
            # (decoder-side embedding layer, replacing the shared embedding_softmax_layer)
            decoder_input = self.decoder_embedding_layer(
                decoder_input, not ModeKeys.is_predict_one(
                    self.mode))  # [batch, 1, hidden_size]
            # Zero the embedding at step 0 (start-of-sequence) in predict-one mode.
            if ModeKeys.is_predict_one(self.mode):
                decoder_input = decoder_input * (
                    1 - tf.to_float(tf.equal(i, 0)))

            # add position embedding
            # decoder_input += timing_signal[i:i + 1]
            slice_pos_encoding = tf.slice(
                timing_signal, [i, 0], [1, self.params.hidden_size],
                name='slice_pos_encoding')  # [1, hidden_size]
            decoder_input += slice_pos_encoding

            if decoder_self_attention_bias is None:
                self_attention_bias = None
            else:
                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]
                # self_attention_bias = decoder_self_attention_bias[:, :, :i+1, :i+1] # [1, 1, 1, time_step]
            # print("attention bias:", self_attention_bias.shape)
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.decoder_softmax_layer.linear(decoder_outputs)
            # logits = tf.squeeze(logits, axis=[1])
            logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
            return logits, cache
Example #5
    def _get_symbols_to_logits_fn(self, max_decode_length):
        if ModeKeys.is_predict_one(self.mode):
            timing_signal = model_utils.get_position_encoding(
                self.params.max_length, self.params.hidden_size)
            timing_signal = tf.slice(
                timing_signal, [0, 0],
                [max_decode_length + 1, self.params.hidden_size],
                name='slice_timing_signal')
        else:
            timing_signal = model_utils.get_position_encoding(
                max_decode_length + 1, self.params.hidden_size
            )  # [max_decode_length + 1, hidden_size]

        if ModeKeys.is_predict_one(self.mode):
            decoder_self_attention_bias = None
        else:
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                max_decode_length
            )  # [1, 1, max_decode_length, max_decode_length]

        def symbols_to_logits_fn(ids, i, cache):

            decoder_input = ids[:, -1:]  # [batch, 1]

            decoder_input = self.decoder_embedding_layer(
                decoder_input, not ModeKeys.is_predict_one(
                    self.mode))  # [batch, 1, hidden_size]
            if ModeKeys.is_predict_one(self.mode):
                decoder_input = decoder_input * (
                    1 - tf.to_float(tf.equal(i, 0)))

            slice_pos_encoding = tf.slice(
                timing_signal, [i, 0], [1, self.params.hidden_size],
                name='slice_pos_encoding')  # [1, hidden_size]
            decoder_input += slice_pos_encoding

            if decoder_self_attention_bias is None:
                self_attention_bias = None
            else:
                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.decoder_softmax_layer.linear(decoder_outputs)
            logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
            return logits, cache

        return symbols_to_logits_fn
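
`decoder_self_attention_bias`, sliced per step inside `symbols_to_logits_fn`, is assumed to be the usual causal mask expressed as an additive bias: 0 on visible positions, a large negative value on future ones. A sketch in the style of the reference implementation:

import tensorflow as tf

_NEG_INF = -1e9  # assumed value of the masking constant

def get_decoder_self_attention_bias(length):
    """Causal attention bias of shape [1, 1, length, length] (sketch)."""
    with tf.name_scope("decoder_self_attention_bias"):
        valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
        valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
        return _NEG_INF * (1.0 - valid_locs)
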
Example #6
    def build_generator(self, inputs):
        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        if ModeKeys.is_predict_one(self.mode):
            self.attention_bias = None
        else:
            self.attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        self.encoder_outputs = self.encode(
            inputs, self.attention_bias)  # [batch, src_len, hidden_size]

        # get encdec_attention k/v just for predict_one_encoder
        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      self.encoder_outputs,
                                                      None, None, None)

        # Generate output sequence if targets is None, or return logits if target
        # sequence is known.
        if self.is_train:
            tf.logging.info("!!!!!! using rl predict in traning !!!!!!")
            return self.rl_predict(self.encoder_outputs, self.attention_bias)
        else:
            tf.logging.info(
                "!!!!!!! using argmax_predict in inference !!!!!!!!")
            return self.argmax_predict(self.encoder_outputs,
                                       self.attention_bias)
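
`model_utils.get_padding_bias` supplies the `[batch, 1, 1, src_len]` tensor noted in the comment. A sketch, assuming PAD has id 0 and the same large-negative masking convention as above:

import tensorflow as tf

def get_padding_bias(x, pad_id=0):
    """Encoder attention bias: -1e9 at padded positions, 0 elsewhere, shape [batch, 1, 1, length] (sketch)."""
    with tf.name_scope("attention_bias"):
        padding = tf.to_float(tf.equal(x, pad_id))
        attention_bias = padding * -1e9
        return tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1)
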
Example #7
 def __init__(self, params, is_train, mode):
     super(DecoderStack, self).__init__()
     self.mode = mode
     self.predict_one = ModeKeys.is_predict_one(self.mode)
     self.layers = []
     for _ in range(params.num_hidden_layers):
         self_attention_layer = attention_layer.SelfAttention(
             params.hidden_size, params.num_heads, params.attention_dropout,
             is_train, self.predict_one)
         if self.mode == ModeKeys.PREDICT_ONE_DECODER:
             enc_dec_attention_layer = attention_layer.EncDecPredictOneAttention(
                 params.hidden_size, params.num_heads,
                 params.attention_dropout, is_train, self.predict_one)
         else:
             enc_dec_attention_layer = attention_layer.Attention(
                 params.hidden_size, params.num_heads,
                 params.attention_dropout, is_train, self.predict_one)
         feed_forward_network = ffn_layer.FeedFowardNetwork(
             params.hidden_size, params.filter_size, params.relu_dropout,
             is_train, self.predict_one)
         # The decoder has three sub-layers: self-attention, enc-dec attention,
         # and a feed-forward network, each wrapped with layer_norm and dropout.
         self.layers.append([
             PrePostProcessingWrapper(self_attention_layer, params,
                                      is_train),
             PrePostProcessingWrapper(enc_dec_attention_layer, params,
                                      is_train),
             PrePostProcessingWrapper(feed_forward_network, params,
                                      is_train)
         ])
     # Create the final layer normalization layer (once, outside the loop).
     self.output_normalization = LayerNormalization(params.hidden_size)
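
As the comment above says, every sub-layer is wrapped with layer normalization, dropout and a residual connection. `PrePostProcessingWrapper` is not included in the snippets; a sketch of the usual pre-norm wrapper, reusing the `LayerNormalization` class whose tail appears in Example #21:

import tensorflow as tf

class PrePostProcessingWrapper(object):
    """Applies layer-norm before the wrapped layer, then dropout and a residual add (sketch)."""

    def __init__(self, layer, params, is_train):
        self.layer = layer
        self.postprocess_dropout = params.layer_postprocess_dropout
        self.is_train = is_train
        self.layer_norm = LayerNormalization(params.hidden_size)

    def __call__(self, x, *args, **kwargs):
        y = self.layer(self.layer_norm(x), *args, **kwargs)
        if self.is_train:
            y = tf.nn.dropout(y, 1 - self.postprocess_dropout)
        return x + y
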
Example #8
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence."""
        with tf.name_scope("decode"):
            decoder_inputs = self.decoder_embedding_layer(
                targets, not ModeKeys.is_predict_one(self.mode))
            with tf.name_scope("shift_targets"):
                decoder_inputs = tf.pad(
                    decoder_inputs,
                    [[0, 0], [1, 0], [0, 0]
                     ])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            logits = self.decoder_softmax_layer.linear(outputs)
            return logits
Example #9
    def build_generator(self, inputs):
        if ModeKeys.is_predict_one(self.mode):
            self.attention_bias = None
        else:
            self.attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]
        self.encoder_outputs = self.encode(
            inputs, self.attention_bias)  # [batch, src_len, hidden_size]
        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      self.encoder_outputs,
                                                      None, None, None)

        if self.is_train:
            # if self.mode == tf.estimator.ModeKeys.TRAIN:
            tf.logging.info("!!!!!! using rl predict in traning !!!!!!")
            decoded_ids, decoded_logits, log_probs = self.rl_predict(
                self.encoder_outputs, self.attention_bias)
            return decoded_ids, decoded_logits, log_probs
        else:
            tf.logging.info(
                "!!!!!!! using argmax_predict in prediction/evaluation !!!!!!!!"
            )
            decoded_ids, decoded_logits = self.argmax_predict(
                self.encoder_outputs, self.attention_bias)
            return decoded_ids, decoded_logits, None  # no log_probs in the argmax branch
Example #10
    def build_pretrain(self, inputs, targets):
        # initializer = tf.variance_scaling_initializer(
        #     self.params.initializer_gain, mode="fan_avg", distribution="uniform")
        #
        # with tf.variable_scope("Transformer", initializer=initializer, reuse=tf.AUTO_REUSE):
        if ModeKeys.is_predict_one(self.mode):
            attention_bias = None
        else:
            attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]

        encoder_outputs = self.encode(
            inputs, attention_bias)  # [batch, src_len, hidden_size]

        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      encoder_outputs, None,
                                                      None, None)

        if targets is None:
            prediction, _ = self.argmax_predict(encoder_outputs,
                                                attention_bias)
            return prediction
        else:
            logits = self.decode(
                targets, encoder_outputs,
                attention_bias)  # [batch, tgt_len, vocab_size]
            return logits
Example #11
    def build_padding_rollout_generator(self, real_inputs, gen_samples, max_len, given_num):
        with tf.variable_scope(self.name_scope, initializer=self.initializer, reuse=tf.AUTO_REUSE):
            if ModeKeys.is_predict_one(self.mode):
                self.attention_bias = None
            else:
                self.attention_bias = model_utils.get_padding_bias(real_inputs)
            self.encoder_outputs = self.encode(real_inputs, self.attention_bias)

            def condition(given_num, _):
                return given_num < max_len

            def inner_loop(given_num, given_y):
                logits = self.decode(given_y, self.encoder_outputs, self.attention_bias)
                next_logits = logits[:, given_num, :]  # [batch, decoder_vocab_size]
                next_probs = tf.nn.softmax(next_logits)
                log_probs = tf.log(next_probs)
                next_sample = tf.multinomial(log_probs, num_samples=1)
                next_sample = tf.cast(next_sample, dtype=tf.int32)
                given_y = tf.concat([given_y[:, :given_num], next_sample], axis=1)
                given_y = tf.pad(given_y, [[0, 0], [0, max_len - given_num - 1]])
                return given_num + 1, given_y

            given_y = gen_samples[:, :given_num]
            init_given_y = tf.pad(given_y, [[0, 0], [0, max_len - given_num]])
            init_given_num = given_num

            given_num, roll_sample = tf.while_loop(
                cond=condition,
                body=inner_loop,
                loop_vars=[init_given_num, init_given_y],
                shape_invariants=[init_given_num.get_shape(),
                                  tf.TensorShape([None, None])]
            )
            return roll_sample
Example #12
    def decode(self, targets, encoder_outputs, attention_bias):
        with tf.name_scope("decode"):
            decoder_inputs = self.decoder_embedding_layer(
                targets, not ModeKeys.is_predict_one(self.mode))
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            # Project decoder outputs to vocabulary logits with the decoder-side
            # softmax layer (previously: logits = self.embedding_softmax_layer.linear(outputs)).
            logits = self.decoder_softmax_layer.linear(outputs)
            return logits
Example #13
    def encode(self, inputs, attention_bias):
        """Generate continuous representation for inputs.

        Args:
          inputs: int tensor with shape [batch_size, input_length].
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float tensor with shape [batch_size, input_length, hidden_size]
        """
        with tf.name_scope("encode"):
            # Prepare inputs to the layer stack by adding positional encodings and
            # applying dropout.
            embedded_inputs = self.encoder_embedding_layer(
                inputs, not ModeKeys.is_predict_one(self.mode))
            if ModeKeys.is_predict_one(self.mode):
                inputs_padding = None
            else:
                inputs_padding = model_utils.get_padding(inputs)

            # add_pos_encoding
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(embedded_inputs)[1]
                if ModeKeys.is_predict_one(self.mode):
                    pos_encoding = model_utils.get_position_encoding(
                        self.params.max_length, self.params.hidden_size)
                    pos_encoding = tf.slice(pos_encoding, [0, 0],
                                            [length, self.params.hidden_size],
                                            name='slice_pos_encoding')
                else:
                    pos_encoding = model_utils.get_position_encoding(
                        length, self.params.hidden_size)

                encoder_inputs = embedded_inputs + pos_encoding

            if self.is_train:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs, 1 - self.params.layer_postprocess_dropout)

            return self.encoder_stack(encoder_inputs, attention_bias,
                                      inputs_padding)
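
`model_utils.get_padding`, used here to build `inputs_padding`, is assumed to be the float counterpart of the bias above: 1.0 at padded token positions and 0.0 elsewhere. A one-line sketch:

import tensorflow as tf

def get_padding(x, pad_id=0):
    """Float mask with 1.0 at padded positions, same shape as x (sketch)."""
    with tf.name_scope("padding"):
        return tf.to_float(tf.equal(x, pad_id))
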
Example #14
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            # Embed targets with the decoder-side embedding layer
            # (previously: decoder_inputs = self.embedding_softmax_layer(targets)).

            decoder_inputs = self.decoder_embedding_layer(
                targets, not ModeKeys.is_predict_one(self.mode))
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(
                    decoder_inputs,
                    [[0, 0], [1, 0], [0, 0]
                     ])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Run values
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            # Project decoder outputs to vocabulary logits with the decoder-side
            # softmax layer (previously: logits = self.embedding_softmax_layer.linear(outputs)).
            logits = self.decoder_softmax_layer.linear(outputs)
            return logits
Example #15
    def inference(self, inputs, targets=None, reuse=None):
        with tf.variable_scope(self.name_scope,
                               initializer=self.initializer,
                               reuse=reuse):

            if ModeKeys.is_predict_one(self.mode):
                attention_bias = None
            else:
                attention_bias = model_utils.get_padding_bias(inputs)
            encoder_outputs = self.encode(inputs, attention_bias)
            if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
                fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
                fake_decoder_outputs = self.decoder_stack(
                    fake_decoder_inputs, encoder_outputs, None, None, None)
            if targets is None:
                return self.predict(encoder_outputs, attention_bias)
            else:
                logits = self.decode(targets, encoder_outputs, attention_bias)
                return logits
Example #16
    def __init__(self, params, is_train, mode):
        super(EncoderStack, self).__init__()
        self.mode = mode
        self.predict_one = ModeKeys.is_predict_one(self.mode)
        self.layers = []
        for _ in range(params.num_hidden_layers):
            # Create sublayers for each layer.
            self_attention_layer = attention_layer.SelfAttention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                is_train, self.predict_one)
            feed_forward_network = ffn_layer.FeedFowardNetwork(
                params.hidden_size, params.filter_size, params.relu_dropout,
                is_train, self.predict_one)

            self.layers.append([
                PrePostProcessingWrapper(self_attention_layer, params,
                                         is_train),
                PrePostProcessingWrapper(feed_forward_network, params,
                                         is_train)
            ])
        # Create final layer normalization layer.
        self.output_normalization = LayerNormalization(params.hidden_size)
Example #17
    def build_pretrain(self, inputs, targets):
        # initializer = tf.variance_scaling_initializer(
        #     self.params.initializer_gain, mode="fan_avg", distribution="uniform")
        #
        # with tf.variable_scope("Transformer", initializer=initializer, reuse=tf.AUTO_REUSE):

        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        if ModeKeys.is_predict_one(self.mode):
            attention_bias = None
        else:
            attention_bias = model_utils.get_padding_bias(
                inputs)  # [batch, 1, 1, src_len]

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        encoder_outputs = self.encode(
            inputs, attention_bias)  # [batch, src_len, hidden_size]

        # get encdec_attention k/v just for predict_one_encoder
        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(fake_decoder_inputs,
                                                      encoder_outputs, None,
                                                      None, None)

        # Generate output sequence if targets is None, or return logits if target
        # sequence is known.
        if targets is None:
            tf.logging.info(
                "!!!!!!!!!!!prediction using argmax prediction!!!!!!!!!!!!!")
            prediction, _ = self.argmax_predict(encoder_outputs,
                                                attention_bias)
            return prediction
        else:
            logits = self.decode(
                targets, encoder_outputs,
                attention_bias)  # [batch, tgt_len, vocab_size]
            return logits
Example #18
    def rl_predict(self, encoder_outputs, encoder_decoder_attention_bias):
        if ModeKeys.is_predict_one(self.mode):
            batch_size = 1
        else:
            batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]
        max_decode_length = input_length + self.params.extra_decode_length

        symbols_to_logits_fn = self._get_symbols_to_logits_fn(
            max_decode_length)

        # Create initial set of IDs that will be passed into symbols_to_logits_fn.
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)

        # Create cache storing decoder attention values for each layer.
        cache = {
            "layer_%d" % layer: {
                "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
                "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
            }
            for layer in range(self.params.num_hidden_layers)
        }

        # Add encoder output and attention bias to the cache.
        cache["encoder_outputs"] = encoder_outputs
        if not ModeKeys.is_predict_one(self.mode):
            cache[
                "encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

        if self.params.beam_size > 1:
            pass
        else:

            def inner_loop(i, finished, next_id, decoded_ids, log_probs,
                           decoded_logits, cache):
                # print("time step:", i)
                """One step of greedy decoding."""

                logits, cache = symbols_to_logits_fn(next_id, i, cache)
                categorical = tf.contrib.distributions.Categorical(
                    logits=logits)
                next_id = categorical.sample()
                log_prob = categorical.log_prob(next_id)  # [batch,]
                finished |= tf.equal(next_id, EOS_ID)
                finished = tf.reshape(finished, (-1, ))
                next_id = tf.reshape(next_id, shape=[-1, 1])
                log_prob = tf.reshape(log_prob, shape=[-1, 1])
                decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
                log_probs = tf.concat([log_probs, log_prob],
                                      axis=1)  # [batch, len]
                logits = tf.expand_dims(logits, axis=1)
                decoded_logits = tf.concat([decoded_logits, logits], axis=1)
                return i + 1, finished, next_id, decoded_ids, log_probs, decoded_logits, cache

            def is_not_finished(i, finished, _1, _2, _3, _4, _5):
                return (i < max_decode_length) & tf.logical_not(
                    tf.reduce_all(finished))

            decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
            log_probs = tf.zeros([batch_size, 0], dtype=tf.float32)
            decoded_logits = tf.zeros(
                [batch_size, 0, self.params.target_vocab_size],
                dtype=tf.float32)
            finished = tf.fill([batch_size], False)
            next_id = tf.zeros([batch_size, 1], dtype=tf.int32)

            _, _, _, decoded_ids, log_probs, decoded_logits, cache = tf.while_loop(
                cond=is_not_finished,
                body=inner_loop,
                loop_vars=[
                    tf.constant(0), finished, next_id, decoded_ids, log_probs,
                    decoded_logits, cache
                ],
                shape_invariants=[
                    tf.TensorShape([]),
                    tf.TensorShape([None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None, None]),
                    nest.map_structure(get_state_shape_invariants, cache),
                ])

            # return {"outputs": decoded_ids, "scores": tf.ones([batch_size, 1])}
            return decoded_ids, decoded_logits, log_probs
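
The sampling step above uses `tf.contrib.distributions.Categorical` (TF 1.x). A small standalone check of the shapes it produces, with made-up logits:

import tensorflow as tf

logits = tf.constant([[2.0, 0.5, -1.0],
                      [0.1, 0.1, 3.0]])            # [batch=2, vocab=3]
categorical = tf.contrib.distributions.Categorical(logits=logits)
next_id = categorical.sample()                     # [batch], one sampled id per row
log_prob = categorical.log_prob(next_id)           # [batch], log-likelihood of each sample

with tf.Session() as sess:
    ids, lp = sess.run([next_id, log_prob])
    print(ids.shape, lp.shape)                     # (2,) (2,)
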
Example #19
    def rl_predict_new(self, encoder_outputs, encoder_decoder_attention_bias):
        if ModeKeys.is_predict_one(self.mode):
            batch_size = 1
        else:
            batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]
        max_decode_length = input_length + self.params.extra_decode_length
        symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)
        cache = {
            "layer_%d" % layer: {
                "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
                "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
            } for layer in range(self.params.num_hidden_layers)}

        # Add encoder output and attention bias to the cache.
        cache["encoder_outputs"] = encoder_outputs
        if not ModeKeys.is_predict_one(self.mode):
            cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

        if self.params.beam_size > 1:
            pass
        else:
            def inner_loop(i, finished, next_id, decoded_ids, log_probs, cache):
                prev_id = next_id
                logits, cache = symbols_to_logits_fn(next_id, i, cache)
                categorical = tf.contrib.distributions.Categorical(logits=logits)
                next_id = categorical.sample()
                log_prob = categorical.log_prob(next_id)  # [batch,]
                finished |= tf.equal(next_id, EOS_ID)
                finished = tf.reshape(finished, (-1,))
                next_id = tf.reshape(next_id, shape=[-1, 1])
                mask = tf.cast(tf.math.not_equal(prev_id, EOS_ID), dtype=tf.int32)
                next_id = next_id * mask

                def pad_fn():
                    mask_pad = tf.cast(tf.math.not_equal(prev_id, PAD_ID), dtype=tf.int32)
                    return next_id * mask_pad

                next_id = tf.cond(tf.less(i, 1), lambda: next_id, pad_fn)
                log_prob = tf.reshape(log_prob, shape=[-1, 1])
                decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
                log_probs = tf.concat([log_probs, log_prob], axis=1)  # [batch, len]
                return i + 1, finished, next_id, decoded_ids, log_probs, cache

            def is_not_finished(i, finished, _1, _2, _3, _4):
                return (i < max_decode_length) & tf.logical_not(tf.reduce_all(finished))

            decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
            log_probs = tf.zeros([batch_size, 0], dtype=tf.float32)
            finished = tf.fill([batch_size], False)
            next_id = tf.zeros([batch_size, 1], dtype=tf.int32)

            _, _, _, decoded_ids, log_probs, cache = tf.while_loop(
                cond=is_not_finished,
                body=inner_loop,
                loop_vars=[tf.constant(0), finished, next_id, decoded_ids, log_probs, cache],
                shape_invariants=[
                    tf.TensorShape([]),
                    tf.TensorShape([None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    nest.map_structure(get_state_shape_invariants, cache),
                ])
            return decoded_ids, log_probs
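
The two masks in `inner_loop` force every token sampled after EOS to become PAD and keep it PAD from then on (the PAD mask is skipped at step 0 via `tf.cond`). A toy illustration, assuming the conventional ids PAD_ID = 0 and EOS_ID = 1; the real constants are defined elsewhere in the repository:

import tensorflow as tf

PAD_ID, EOS_ID = 0, 1                              # assumed values
prev_id = tf.constant([[5], [EOS_ID], [PAD_ID]], dtype=tf.int32)
sampled = tf.constant([[7], [9], [4]], dtype=tf.int32)

mask_eos = tf.cast(tf.math.not_equal(prev_id, EOS_ID), tf.int32)   # 0 right after EOS
mask_pad = tf.cast(tf.math.not_equal(prev_id, PAD_ID), tf.int32)   # 0 once padding has started
next_id = sampled * mask_eos * mask_pad

with tf.Session() as sess:
    print(sess.run(next_id))                       # [[7] [0] [0]]
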
Example #20
    def predict(self, encoder_outputs, encoder_decoder_attention_bias):
        """Return predicted sequence."""
        if ModeKeys.is_predict_one(self.mode):
            batch_size = 1
        else:
            batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]
        max_decode_length = input_length + self.params.extra_decode_length

        symbols_to_logits_fn = self._get_symbols_to_logits_fn(
            max_decode_length)

        # Create initial set of IDs that will be passed into symbols_to_logits_fn.
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)

        # Create cache storing decoder attention values for each layer.
        cache = {
            "layer_%d" % layer: {
                "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
                "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
            }
            for layer in range(self.params.num_hidden_layers)
        }

        # Add encoder output and attention bias to the cache.
        cache["encoder_outputs"] = encoder_outputs
        if not ModeKeys.is_predict_one(self.mode):
            cache[
                "encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

        if self.params.beam_size > 1:
            print("!!!!!!!!!!! right here, beam_size = %i!!!!!!!!!!!!" %
                  self.params.beam_size)
            # Use beam search to find the top beam_size sequences and scores.
            decoded_ids, scores = beam_search.sequence_beam_search(
                symbols_to_logits_fn=symbols_to_logits_fn,
                initial_ids=initial_ids,
                initial_cache=cache,
                vocab_size=self.params.target_vocab_size,
                beam_size=self.params.beam_size,
                alpha=self.params.alpha,
                max_decode_length=max_decode_length,
                eos_id=EOS_ID)

            # Get the top sequence for each batch element
            top_decoded_ids = decoded_ids[:, 0, 1:]
            top_scores = scores[:, 0]

            return {"outputs": top_decoded_ids, "scores": top_scores}

        else:

            def inner_loop(i, finished, next_id, decoded_ids, cache):
                """One step of greedy decoding."""
                logits, cache = symbols_to_logits_fn(next_id, i, cache)
                next_id = tf.argmax(logits, -1, output_type=tf.int32)
                finished |= tf.equal(next_id, EOS_ID)
                # next_id = tf.expand_dims(next_id, axis=1)
                next_id = tf.reshape(next_id, shape=[-1, 1])
                decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
                return i + 1, finished, next_id, decoded_ids, cache

            def is_not_finished(i, finished, *_):
                return (i < max_decode_length) & tf.logical_not(
                    tf.reduce_all(finished))

            decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
            finished = tf.fill([batch_size], False)
            next_id = tf.zeros([batch_size, 1], dtype=tf.int32)
            _, _, _, decoded_ids, _ = tf.while_loop(
                is_not_finished,
                inner_loop,
                [tf.constant(0), finished, next_id, decoded_ids, cache],
                shape_invariants=[
                    tf.TensorShape([]),
                    tf.TensorShape([None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    nest.map_structure(get_state_shape_invariants, cache),
                ])

            return {"outputs": decoded_ids, "scores": tf.ones([batch_size, 1])}
Example #21
        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias


if __name__ == "__main__":
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # set before TensorFlow initializes the GPU
    tf.enable_eager_execution()
    params = model_params.TransformerBaseParams()
    x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32)

    Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size,
                                                     params.hidden_size,
                                                     "source_embedding")
    embedded_inputs = Enc_Embedding(
        x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN))
    print(embedded_inputs.shape)
    attention_bias = model_utils.get_padding_bias(x_inputs)
    print(attention_bias.shape)
    encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN)
    enc_out = encoder_stack(embedded_inputs, attention_bias, None)
    print(enc_out.shape)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        10)
    self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1]
    print(self_attention_bias)
    attention_bias = model_utils.get_padding_bias(x_inputs)
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([2, 0, params.hidden_size]),
            "v": tf.zeros([2, 0, params.hidden_size]),
Example #22
    def argmax_predict(self, encoder_outputs, encoder_decoder_attention_bias):
        """Return predicted sequence."""
        if ModeKeys.is_predict_one(self.mode):
            batch_size = 1
        else:
            batch_size = tf.shape(encoder_outputs)[0]
        input_length = tf.shape(encoder_outputs)[1]
        max_decode_length = input_length + self.params.extra_decode_length

        symbols_to_logits_fn = self._get_symbols_to_logits_fn(
            max_decode_length)

        # Create initial set of IDs that will be passed into symbols_to_logits_fn.
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)

        # Create cache storing decoder attention values for each layer.
        cache = {
            "layer_%d" % layer: {
                "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
                "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
            }
            for layer in range(self.params.num_hidden_layers)
        }

        # Add encoder output and attention bias to the cache.
        cache["encoder_outputs"] = encoder_outputs
        if not ModeKeys.is_predict_one(self.mode):
            cache[
                "encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

        if self.params.beam_size > 1:
            pass
        else:

            def inner_loop(i, finished, next_id, decoded_ids, cache):
                print("time step:", i)
                """One step of greedy decoding."""
                logits, cache = symbols_to_logits_fn(next_id, i, cache)
                # logits, cache = symbols_to_logits_fn(decoded_ids, i, cache)
                next_id = tf.argmax(logits, -1, output_type=tf.int32)
                finished |= tf.equal(next_id, EOS_ID)
                # next_id = tf.expand_dims(next_id, axis=1)
                next_id = tf.reshape(next_id, shape=[-1, 1])
                decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
                return i + 1, finished, next_id, decoded_ids, cache

            def is_not_finished(i, finished, _1, _2, _3):
                return (i < max_decode_length) & tf.logical_not(
                    tf.reduce_all(finished))

            decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
            finished = tf.fill([batch_size], False)
            next_id = tf.zeros([batch_size, 1], dtype=tf.int32)

            _, _, _, decoded_ids, _ = tf.while_loop(
                cond=is_not_finished,
                body=inner_loop,
                loop_vars=[
                    tf.constant(0), finished, next_id, decoded_ids, cache
                ],
                shape_invariants=[
                    tf.TensorShape([]),
                    tf.TensorShape([None]),
                    tf.TensorShape([None, None]),
                    tf.TensorShape([None, None]),
                    nest.map_structure(get_state_shape_invariants, cache),
                ])

            return decoded_ids, _  # the trailing value is the final while_loop cache