Example #1
 def _prepare_source():
     """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
     # Embed
     source_embeddings = self._embed(source_ids)
     # Obtain length and depth of the input tensors
     _, time_steps, depth = tf_utils.get_shape_list(source_embeddings)
     # Transform input mask into attention mask
     inverse_mask = tf.cast(tf.equal(source_mask, 0.0),
                            dtype=FLOAT_DTYPE)
     attn_mask = inverse_mask * -1e9
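     # Padded positions (mask == 0) receive a large negative bias so the
     # attention soft-max assigns them near-zero weight.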
     # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
     attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
     # Differentiate between self-attention and cross-attention masks for further, optional modifications
     self_attn_mask = attn_mask
     cross_attn_mask = attn_mask
     # Add positional encodings
     positional_signal = get_positional_signal(time_steps, depth,
                                               FLOAT_DTYPE)
     source_embeddings += positional_signal
     # Apply dropout
     if self.config.transformer_dropout_embeddings > 0:
         source_embeddings = tf.layers.dropout(
             source_embeddings,
             rate=self.config.transformer_dropout_embeddings,
             training=self.training)
     return source_embeddings, self_attn_mask, cross_attn_mask
Example #2
    def generate_decoding_function(self, encoder_output):

        with tf.compat.v1.name_scope(self._scope):
            # Generate a positional signal for the longest possible output.
            positional_signal = get_positional_signal(
                self._config.translation_maxlen, self._config.embedding_size,
                FLOAT_DTYPE)

        decoder = self._model.dec

        def _decoding_function(step_target_ids, current_time_step, memories):
            """Single-step decoding function.

            Args:
                step_target_ids: Tensor with shape (batch_size)
                current_time_step: scalar Tensor.
                memories: dictionary (see top-level class description)

                Returns:
                    A pair (step_logits, memories): logits over the target
                    vocabulary for the next time-step, and the updated decoder
                    memories.
            """
            with tf.compat.v1.name_scope(self._scope):
                # TODO Is this necessary?
                vocab_ids = tf.reshape(step_target_ids, [-1, 1])
                # Look up embeddings for target IDs.
                target_embeddings = decoder._embed(vocab_ids)
                # Add positional signal.
                signal_slice = positional_signal[:, current_time_step -
                                                 1:current_time_step, :]
                target_embeddings += signal_slice
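                # Only the positional encoding for the current time-step is
                # added, since decoding proceeds one token at a time.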
                # Optionally, apply dropout to embeddings.
                if self.config.transformer_dropout_embeddings > 0:
                    target_embeddings = tf.compat.v1.layers.dropout(
                        target_embeddings,
                        rate=self.config.transformer_dropout_embeddings,
                        training=decoder.training)
                # Propagate values through the decoder stack.
                # NOTE: No self-attention mask is applied at decoding, as
                #       future information is unavailable.
                layer_output = target_embeddings
                for layer_id in range(1,
                                      self.config.transformer_dec_depth + 1):
                    layer = decoder.decoder_stack[layer_id]
                    mem_key = 'layer_{:d}'.format(layer_id)
                    layer_output, memories[mem_key] = \
                        layer['self_attn'].forward(
                            layer_output, None, None, memories[mem_key])
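                    # memories[mem_key] caches keys/values from earlier steps,
                    # so self-attention only processes the newest target
                    # position here.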
                    layer_output, _ = layer['cross_attn'].forward(
                        layer_output, encoder_output.enc_output,
                        encoder_output.cross_attn_mask)
                    layer_output = layer['ffn'].forward(layer_output)
                # Return prediction at the final time-step to be consistent
                # with the inference pipeline.
                dec_output = layer_output[:, -1, :]
                # Project decoder stack outputs and apply the soft-max
                # non-linearity.
                step_logits = \
                    decoder.softmax_projection_layer.project(dec_output)
                return step_logits, memories

        return _decoding_function
Example #3
 def _prepare_source():
     """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
     # Embed
     pre_source_embeddings = self._embed(source_ids)
     with tf.variable_scope(self.name):
         source_embeddings = self.emb_ffn.forward(pre_source_embeddings)
     glove_embeddings = self.embedding_layer.get_glove_embed(source_pids)
     source_embeddings += glove_embeddings
     # Obtain length and depth of the input tensors
     _, time_steps, depth = get_shape_list(source_embeddings)
     # Transform input mask into attention mask
     # Recover the 2-D source_mask (keep only the first slice of the last dimension)
     shape_mask = get_shape_list(source_mask)
     source_mask1 = tf.slice(source_mask, [0, 0, 0], [shape_mask[0], shape_mask[1], 1])
     source_mask2 = tf.reshape(source_mask1, [shape_mask[0], shape_mask[1]])
     inverse_mask = tf.cast(tf.equal(source_mask2, 0.0), dtype=self.float_dtype)
     attn_mask = inverse_mask * -1e9
     # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
     attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
     # Differentiate between self-attention and cross-attention masks for further, optional modifications
     self_attn_mask = attn_mask
     cross_attn_mask = attn_mask
     # Add positional encodings
     positional_signal = get_positional_signal(time_steps, depth, self.float_dtype)
     source_embeddings += positional_signal
     # Apply dropout
     if self.config.transformer_dropout_embeddings > 0:
         source_embeddings = tf.layers.dropout(source_embeddings,
                                               rate=self.config.transformer_dropout_embeddings, training=self.training)
     return source_embeddings, self_attn_mask, cross_attn_mask
Example #4
 def _embed(self, index_sequence):
     """ Embeds source-side indices to obtain the corresponding dense tensor representations. """
     # Important change
     #index_sequence: (batch_size, seq_len, u_len)
     u_emb = self.embedding_layer.embed(index_sequence)  #(batch_size, seq_len, u_len, embedding_size)
     shape = get_shape_list(u_emb)
     # Add positional encoding; applies specifically to md5: [1, u_len, embedding_size]
     if self.config.utf8_type == "md5":
         md5_positional_signal = get_positional_signal(shape[2], shape[3], self.float_dtype)
         u_emb += md5_positional_signal
     # Trim to 2048
     input_size = self.config.pre_source_embedding_size  # defaults to 2048
     cc = input_size - shape[2]*shape[3]
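     # cc: padding (in the flattened layout) needed to reach the fixed
     # pre-embedding width; negative if the flattened embedding is already wider.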
     if self.config.pre_source_embed_cross:  # seems to perform worse, and the test-time BLEU score is abnormal
         embsize = tf.to_int32((input_size/shape[2]))
         accsize = input_size % shape[2]
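         # Pad/truncate each of the u_len sub-embeddings to embsize, flatten,
         # then pad with the remainder (accsize) to reach exactly input_size.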
         fix_merge_emb = tf.pad(u_emb, [[0, 0], [0, 0], [0, 0], [0, tf.reduce_max([embsize-shape[3], 0])]], constant_values=1.0)
         fix_merge_emb = tf.slice(fix_merge_emb, [0, 0, 0, 0], [-1, -1, -1, embsize])
         fix_merge_emb = tf.reshape(fix_merge_emb, [shape[0], shape[1], shape[2]*embsize])
         fix_merge_emb = tf.pad(fix_merge_emb, [[0, 0], [0, 0], [0, accsize]], constant_values=1.0)
     else:
         merge_emb = tf.reshape(u_emb, [shape[0], shape[1], shape[2]*shape[3]])  #(batch_size, seq_len, u_len*embedding_size)
         fix_merge_emb = tf.pad(merge_emb, [[0, 0], [0, 0], [0, tf.reduce_max([cc, 0])]], constant_values=0)
         fix_merge_emb = tf.slice(fix_merge_emb, [0, 0, 0], [-1, -1, input_size])
     
     return fix_merge_emb
Example #5
        def _prepare_source():
            """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
            DICT_SIZE, ENG_DICT_FILE, OUTPUT_TRANSLATE_FILE, _, _, DEBIASED_EMBEDDING, _ = get_debias_files_from_config(
                self.consts_config_str)
            if self.USE_DEBIASED:
                print("using debiased embeddings")
                self.embedding_layer.embedding_table = self.embedding_matrix
            else:
                print("using non debiased embeddings")
            source_embeddings = self._embed(source_ids)
            if self.COLLECT_EMBEDDING_TABLE:
                ## print the embedding table
                # ########################################### PRINT #########################################################
                printops = []
                printops.append(
                    tf.compat.v1.Print(
                        [], [tf.shape(self.embedding_layer.embedding_table)],
                        "embedding_table shape ",
                        summarize=10000))
                for i in list(range(DICT_SIZE)):
                    printops.append(
                        tf.compat.v1.Print(
                            [], [self.embedding_layer.embedding_table[i, :]],
                            "enc_inputs for word " + str(i),
                            summarize=10000))
                    printops.append(
                        tf.compat.v1.Print(
                            [], [],
                            "**************************************",
                            summarize=10000))
                    tf.io.write_file(
                        "output_translate.txt",
                        str(self.embedding_layer.embedding_table[i, :]))
                with tf.control_dependencies(printops):
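                    # The no-op multiplication forces the tf.Print ops above
                    # to execute via the control dependency.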
                    source_embeddings = source_embeddings * 1
                # ###########################################################################################################

            # Embed
            ### comment: first embedding without positional signal
            # Obtain length and depth of the input tensors
            _, time_steps, depth = tf_utils.get_shape_list(source_embeddings)
            # Transform input mask into attention mask
            inverse_mask = tf.cast(tf.equal(source_mask, 0.0),
                                   dtype=FLOAT_DTYPE)
            attn_mask = inverse_mask * -1e9
            # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
            attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
            # Differentiate between self-attention and cross-attention masks for further, optional modifications
            self_attn_mask = attn_mask
            cross_attn_mask = attn_mask
            # Add positional encodings
            positional_signal = get_positional_signal(time_steps, depth,
                                                      FLOAT_DTYPE)
            source_embeddings += positional_signal  ### comment: first embedding with positional signal

            # Apply dropout
            if self.dropout_embedding is not None:
                source_embeddings = self.dropout_embedding(
                    source_embeddings, training=self.training)
            return source_embeddings, self_attn_mask, cross_attn_mask
Example #6
    def decode_at_train(self, target_ids, enc_output, cross_attn_mask):
        """ Returns the probability distribution over target-side tokens conditioned on the output of the encoder;
         performs decoding in parallel at training time. """
        def _decode_all(target_embeddings):
            """ Decodes the encoder-generated representations into target-side logits in parallel. """
            # Apply input dropout
            dec_input = \
                tf.layers.dropout(target_embeddings, rate=self.config.transformer_dropout_embeddings, training=self.training)
            # Propagate inputs through the encoder stack
            dec_output = dec_input
            for layer_id in range(1, self.config.transformer_dec_depth + 1):
                dec_output, _ = self.decoder_stack[layer_id][
                    'self_attn'].forward(dec_output, None, self_attn_mask)
                dec_output, _ = \
                    self.decoder_stack[layer_id]['cross_attn'].forward(dec_output, enc_output, cross_attn_mask)
                dec_output = self.decoder_stack[layer_id]['ffn'].forward(
                    dec_output)
            return dec_output

        def _prepare_targets():
            """ Pre-processes target token ids before they're passed on as input to the decoder
            for parallel decoding. """
            # Embed target_ids
            target_embeddings = self._embed(target_ids)
            target_embeddings += positional_signal
            if self.config.transformer_dropout_embeddings > 0:
                target_embeddings = tf.layers.dropout(
                    target_embeddings,
                    rate=self.config.transformer_dropout_embeddings,
                    training=self.training)
            return target_embeddings

        def _decoding_function():
            """ Generates logits for target-side tokens. """
            # Embed the model's predictions up to the current time-step; add positional information, mask
            target_embeddings = _prepare_targets()
            # Pass encoder context and decoder embeddings through the decoder
            dec_output = _decode_all(target_embeddings)
            # Project decoder stack outputs and apply the soft-max non-linearity
            full_logits = self.softmax_projection_layer.project(dec_output)
            return full_logits

        with tf.variable_scope(self.name):
            # Transpose encoder information in hybrid models
            if self.from_rnn:
                enc_output = tf.transpose(enc_output, [1, 0, 2])
                cross_attn_mask = tf.transpose(cross_attn_mask, [3, 1, 2, 0])

            self_attn_mask = get_right_context_mask(tf.shape(target_ids)[-1])
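            # The right-context (causal) mask stops each position from attending
            # to future target positions during parallel training-time decoding.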
            positional_signal = get_positional_signal(
                tf.shape(target_ids)[-1], self.config.embedding_size,
                FLOAT_DTYPE)
            logits = _decoding_function()
        return logits
Example #7
 def _pre_embed(self, index_sequence):
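     """ Embeds a unit-level index sequence and flattens it into a fixed-width
     pre-embedding of size config.pre_source_embedding_size. """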
     u_emb = self.embedding_layer.embed(index_sequence) #(batch_size, u_len, embedding_size)
     shape = get_shape_list(u_emb)
     if self.config.utf8_type == "md5":
         md5_positional_signal = get_positional_signal(shape[1], shape[2], self.float_dtype)
         u_emb += md5_positional_signal
     input_size = self.config.pre_source_embedding_size
     cc = input_size - shape[1]*shape[2]
     merge_emb = tf.reshape(u_emb, [shape[0], shape[1]*shape[2]])
     #merge_emb: (batch_size, u_len*embedding_size)
     fix_merge_emb = tf.pad(merge_emb, [[0, 0], [0, tf.reduce_max([cc, 0])]], constant_values=1.0)
     fix_merge_emb = tf.slice(fix_merge_emb, [0, 0], [-1, input_size])
     return fix_merge_emb
Example #8
    def decode_at_train(self, target_ids, enc_output, cross_attn_mask):
        """ Returns the probability distribution over target-side tokens conditioned on the output of the encoder;
         performs decoding in parallel at training time. """

        def _decode_all(target_embeddings):
            """ Decodes the encoder-generated representations into target-side logits in parallel. """
            # Apply input dropout
            dec_input = \
                tf.layers.dropout(target_embeddings, rate=self.config.transformer_dropout_embeddings, training=self.training)
            # Propagate inputs through the encoder stack
            dec_output = dec_input
            for layer_id in range(1, self.config.transformer_dec_depth + 1):
                dec_output, _ = self.decoder_stack[layer_id]['self_attn'].forward(dec_output, None, self_attn_mask)
                dec_output, _ = \
                    self.decoder_stack[layer_id]['cross_attn'].forward(dec_output, enc_output, cross_attn_mask)
                dec_output = self.decoder_stack[layer_id]['ffn'].forward(dec_output)
            return dec_output

        def _prepare_targets():
            """ Pre-processes target token ids before they're passed on as input to the decoder
            for parallel decoding. """
            # Embed target_ids
            target_embeddings = self._embed(target_ids)
            target_embeddings += positional_signal
            if self.config.transformer_dropout_embeddings > 0:
                target_embeddings = tf.layers.dropout(target_embeddings,
                                                      rate=self.config.transformer_dropout_embeddings, training=self.training)
            return target_embeddings

        def _decoding_function():
            """ Generates logits for target-side tokens. """
            # Embed the model's predictions up to the current time-step; add positional information, mask
            target_embeddings = _prepare_targets()
            # Pass encoder context and decoder embeddings through the decoder
            dec_output = _decode_all(target_embeddings)
            # Project decoder stack outputs and apply the soft-max non-linearity
            full_logits = self.softmax_projection_layer.project(dec_output)
            return full_logits

        with tf.variable_scope(self.name):
            # Transpose encoder information in hybrid models
            if self.from_rnn:
                enc_output = tf.transpose(enc_output, [1, 0, 2])
                cross_attn_mask = tf.transpose(cross_attn_mask, [3, 1, 2, 0])

            self_attn_mask = get_right_context_mask(tf.shape(target_ids)[-1])
            positional_signal = get_positional_signal(tf.shape(target_ids)[-1],
                                                      self.config.embedding_size,
                                                      self.float_dtype)
            logits = _decoding_function()
        return logits
Example #9
 def _prepare_source():
     """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
     # Embed
     source_embeddings = self._embed(source_ids)
     # Obtain length and depth of the input tensors
     _, time_steps, depth = get_shape_list(source_embeddings)
     # Transform input mask into attention mask
     inverse_mask = tf.cast(tf.equal(source_mask, 0.0), dtype=self.float_dtype)
     attn_mask = inverse_mask * -1e9
     # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
     attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
     # Differentiate between self-attention and cross-attention masks for further, optional modifications
     self_attn_mask = attn_mask
     cross_attn_mask = attn_mask
     # Add positional encodings
     positional_signal = get_positional_signal(time_steps, depth, self.float_dtype)
     source_embeddings += positional_signal
     # Apply dropout
     if self.config.transformer_dropout_embeddings > 0:
         source_embeddings = tf.layers.dropout(source_embeddings,
                                               rate=self.config.transformer_dropout_embeddings, training=self.training)
     return source_embeddings, self_attn_mask, cross_attn_mask
Example #10
def decode_at_test(model, decoder, enc_output, cross_attn_mask, batch_size,
                   beam_size, do_sample, normalization_alpha):
    """ Returns the probability distribution over target-side tokens conditioned on the output of the encoder;
     performs decoding via auto-regression at test time. """
    def _decode_step(target_embeddings, memories):
        """ Decode the encoder-generated representations into target-side logits with auto-regression. """
        # Propagate inputs through the encoder stack
        dec_output = target_embeddings
        # NOTE: No self-attention mask is applied at decoding, as future information is unavailable
        for layer_id in range(1, decoder.config.transformer_dec_depth + 1):
            dec_output, memories['layer_{:d}'.format(layer_id)] = \
                decoder.decoder_stack[layer_id]['self_attn'].forward(
                    dec_output, None, None, memories['layer_{:d}'.format(layer_id)])
            dec_output, _ = \
                decoder.decoder_stack[layer_id]['cross_attn'].forward(dec_output, enc_output, cross_attn_mask)
            dec_output = decoder.decoder_stack[layer_id]['ffn'].forward(
                dec_output)
        # Return prediction at the final time-step to be consistent with the inference pipeline
        dec_output = dec_output[:, -1, :]
        return dec_output, memories

    def _pre_process_targets(step_target_ids, current_time_step):
        """ Pre-processes target token ids before they're passed on as input to the decoder
        for auto-regressive decoding. """
        # Embed target_ids
        target_embeddings = decoder._embed(step_target_ids)
        signal_slice = positional_signal[:, current_time_step -
                                         1:current_time_step, :]
        target_embeddings += signal_slice
        if decoder.config.transformer_dropout_embeddings > 0:
            target_embeddings = tf.layers.dropout(
                target_embeddings,
                rate=decoder.config.transformer_dropout_embeddings,
                training=decoder.training)
        return target_embeddings

    def _decoding_function(step_target_ids, current_time_step, memories):
        """ Generates logits for the target-side token predicted for the next-time step with auto-regression. """
        # Embed the model's predictions up to the current time-step; add positional information, mask
        target_embeddings = _pre_process_targets(step_target_ids,
                                                 current_time_step)
        # Pass encoder context and decoder embeddings through the decoder
        dec_output, memories = _decode_step(target_embeddings, memories)
        # Project decoder stack outputs and apply the soft-max non-linearity
        step_logits = decoder.softmax_projection_layer.project(dec_output)
        return step_logits, memories

    with tf.variable_scope(decoder.name):
        # Transpose encoder information in hybrid models
        if decoder.from_rnn:
            enc_output = tf.transpose(enc_output, [1, 0, 2])
            cross_attn_mask = tf.transpose(cross_attn_mask, [3, 1, 2, 0])

        positional_signal = get_positional_signal(
            decoder.config.translation_maxlen, decoder.config.embedding_size,
            decoder.float_dtype)
        if beam_size > 0:
            # Initialize target IDs with <GO>
            initial_ids = tf.cast(tf.fill([batch_size], 1),
                                  dtype=decoder.int_dtype)
            initial_memories = decoder._get_initial_memories(
                batch_size, beam_size=beam_size)
            output_sequences, scores = _beam_search(
                _decoding_function, initial_ids, initial_memories,
                decoder.int_dtype, decoder.float_dtype,
                decoder.config.translation_maxlen, batch_size, beam_size,
                decoder.embedding_layer.get_vocab_size(), 0,
                normalization_alpha)

        else:
            # Initialize target IDs with <GO>
            initial_ids = tf.cast(tf.fill([batch_size, 1], 1),
                                  dtype=decoder.int_dtype)
            initial_memories = decoder._get_initial_memories(batch_size,
                                                             beam_size=1)
            output_sequences, scores = greedy_search(
                model,
                _decoding_function,
                initial_ids,
                initial_memories,
                decoder.int_dtype,
                decoder.float_dtype,
                decoder.config.translation_maxlen,
                batch_size,
                0,
                do_sample,
                time_major=False)
    return output_sequences, scores
Example #11
def decode_greedy(models, do_sample=False, beam_size=0,
                  normalization_alpha=None):
    """Decodes a source sequence using beam search or sampling.

    Args:
        models: a list of Transformer objects.
        do_sample: randomly sample instead of argmax for greedy search
        beam_size: integer specifying the beam width.
        normalization_alpha: length normalization hyperparameter.

    Returns:
        A tuple (ids, scores), where ids is a Tensor with shape (batch_size, k,
        max_seq_len) containing k translations for each input sentence in
        model.inputs.x and scores is a Tensor with shape (batch_size, k)
    """

    # Get some parameter values. For ensembling, some settings are required to
    # be consistent across all models but others are not.  In the former case,
    # we assume that consistency has already been checked.  For the parameters
    # that are allowed to vary across models, the first model's settings take
    # precedence.
    batch_size, _ = get_shape_list(models[0].source_ids)
    model_name = models[0].name
    decoder_name = models[0].dec.name
    from_rnn = models[0].dec.from_rnn
    config = models[0].dec.config
    float_dtype = models[0].dec.float_dtype
    int_dtype = models[0].dec.int_dtype
    vocab_size = models[0].dec.embedding_layer.get_vocab_size()

    # Generate a positional signal for the longest possible output.
    with tf.name_scope('{:s}_decode'.format(model_name)):
        with tf.variable_scope(decoder_name):
            positional_signal = get_positional_signal(
                config.translation_maxlen,
                config.embedding_size,
                float_dtype)
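            # The signal covers the longest possible output; each decoding
            # function slices out the entry for the current time-step.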

    # Generate a decoding function for each model.
    decoding_functions = []
    for model in models:
        assert model.name == model_name

        # Encode source sequences.
        with tf.name_scope('{:s}_encode'.format(model.name)):
            enc_output, cross_attn_mask = model.enc.encode(model.source_ids,
                                                           model.source_mask)

        # Generate a model-specific decoding function.
        with tf.name_scope('{:s}_decode'.format(model.name)):
            func = generate_decoding_function(enc_output, cross_attn_mask,
                                              model.dec, positional_signal)
            decoding_functions.append(func)

    # Decode into target sequences
    with tf.name_scope('{:s}_decode'.format(model_name)):
        with tf.variable_scope(decoder_name):

            if beam_size > 0:
                # Initialize target IDs with <GO>
                initial_ids = tf.cast(tf.fill([batch_size], 1), dtype=int_dtype)
                initial_memories = [
                    model.dec._get_initial_memories(batch_size,
                                                    beam_size=beam_size)
                    for model in models]
                output_sequences, scores = _beam_search(
                    decoding_functions,
                    initial_ids,
                    initial_memories,
                    int_dtype,
                    float_dtype,
                    config.translation_maxlen,
                    batch_size,
                    beam_size,
                    vocab_size,
                    0,
                    normalization_alpha)

            else:
                # Initialize target IDs with <GO>
                initial_ids = tf.cast(tf.fill([batch_size, 1], 1),
                                      dtype=int_dtype)
                initial_memories = [
                    model.dec._get_initial_memories(batch_size, beam_size=1)
                    for model in models]
                output_sequences, scores = greedy_search(
                    models[0],
                    decoding_functions[0],
                    initial_ids,
                    initial_memories[0],
                    int_dtype,
                    float_dtype,
                    config.translation_maxlen,
                    batch_size,
                    0,
                    do_sample,
                    time_major=False)

    return output_sequences, scores