    def encode(self,
               inputs,
               sequence_length=None,
               mode=tf.estimator.ModeKeys.TRAIN):
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs,
                                           sequence_length=sequence_length)

        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)
        mask = transformer.build_sequence_mask(
            sequence_length,
            num_heads=self.num_heads,
            maximum_length=tf.shape(inputs)[1],
            dtype=inputs.dtype)

        state = ()

        for l in range(self.num_layers):
            with tf.variable_scope("layer_{}".format(l)):
                with tf.variable_scope("multi_head"):
                    inputs_norm = transformer.norm(inputs)
                    context = transformer.multi_head_attention(
                        self.num_heads,
                        inputs_norm,
                        inputs_norm,
                        mode,
                        num_units=self.num_units,
                        mask=mask,
                        dropout=self.attention_dropout)
                    context = transformer.drop_and_add(inputs,
                                                       context,
                                                       mode,
                                                       dropout=self.dropout)

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        transformer.norm(context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(
                        context, transformed, mode, dropout=self.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

        outputs = transformer.norm(inputs)
        return (outputs, state, sequence_length)
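Example #1 uses a pre-norm layout: each sublayer normalizes its input first, and drop_and_add applies dropout to the sublayer output before the residual connection. The helpers below are a minimal sketch of what these two calls are assumed to do; the library's actual implementations may differ (for instance, drop_and_add may skip the residual when the input and output dimensions do not match).

import tensorflow as tf

def norm(inputs):
    # Layer normalization over the last (feature) dimension.
    return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)

def drop_and_add(inputs, outputs, mode, dropout=0.1):
    # Dropout on the sublayer output, then the residual connection.
    outputs = tf.layers.dropout(
        outputs,
        rate=dropout,
        training=mode == tf.estimator.ModeKeys.TRAIN)
    return inputs + outputs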
Example #2
    def encode(self,
               inputs,
               sequence_length=None,
               mode=tf.estimator.ModeKeys.TRAIN):
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs,
                                           sequence_length=sequence_length)

        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        outputs = []
        state = ()

        for l in range(self.num_layers):
            with tf.variable_scope("layer_{}".format(l)):
                with tf.variable_scope("multi_head"):
                    context = transformer.multi_head_attention(
                        self.num_heads,
                        inputs,
                        inputs,
                        inputs,
                        mode,
                        values_length=sequence_length,
                        dropout=self.attention_dropout)
                    context = transformer.add_and_norm(inputs,
                                                       context,
                                                       mode,
                                                       dropout=self.dropout)

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        context, self.ffn_inner_dim)
                    transformed = transformer.add_and_norm(
                        context, transformed, mode, dropout=self.dropout)

                inputs = transformed
                state += (tf.reduce_mean(inputs, axis=1), )

                if self.keep_layers_output:
                    outputs.append(inputs)

        return (inputs if not outputs else outputs, state, sequence_length)
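Example #2 follows the post-norm layout of the original Transformer: the sublayer runs on the raw input and add_and_norm applies dropout, adds the residual, and then layer-normalizes the sum. A minimal sketch of the assumed behavior, reusing the norm helper above:

def add_and_norm(inputs, outputs, mode, dropout=0.1):
    # Dropout, residual add, then layer normalization (post-norm).
    outputs = tf.layers.dropout(
        outputs,
        rate=dropout,
        training=mode == tf.estimator.ModeKeys.TRAIN)
    return norm(inputs + outputs)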
Example #3
    def _self_attention_stack(self,
                              inputs,
                              sequence_length=None,
                              mode=tf.estimator.ModeKeys.TRAIN,
                              cache=None,
                              memory=None,
                              memory_sequence_length=None):
        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        decoder_mask = None
        memory_mask = None

        if sequence_length is not None:
            decoder_mask = transformer.build_future_mask(
                sequence_length, num_heads=self.num_heads, dtype=inputs.dtype)
        if memory_sequence_length is not None:
            memory_mask = transformer.build_sequence_mask(
                memory_sequence_length,
                num_heads=self.num_heads,
                dtype=memory.dtype)

        for l in range(self.num_layers):
            layer_name = "layer_{}".format(l)
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                with tf.variable_scope("masked_multi_head"):
                    inputs_norm = transformer.norm(inputs)
                    encoded = transformer.multi_head_attention(
                        self.num_heads,
                        inputs_norm,
                        inputs_norm,
                        mode,
                        num_units=self.num_units,
                        mask=decoder_mask,
                        cache=layer_cache,
                        dropout=self.attention_dropout)
                    encoded = transformer.drop_and_add(inputs,
                                                       encoded,
                                                       mode,
                                                       dropout=self.dropout)

                if memory is not None:
                    with tf.variable_scope("multi_head"):
                        context = transformer.multi_head_attention(
                            self.num_heads,
                            transformer.norm(encoded),
                            memory,
                            mode,
                            mask=memory_mask,
                            dropout=self.attention_dropout)
                        context = transformer.drop_and_add(
                            encoded, context, mode, dropout=self.dropout)
                else:
                    # Without encoder memory, the feed-forward sublayer
                    # consumes the self-attention output directly.
                    context = encoded

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        transformer.norm(context),
                        self.ffn_inner_dim,
                        mode,
                        dropout=self.relu_dropout)
                    transformed = transformer.drop_and_add(
                        context, transformed, mode, dropout=self.dropout)

                inputs = transformed

        outputs = transformer.norm(inputs)
        return outputs
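The decoder stack in Example #3 masks future positions so that position i can only attend to positions up to i. Below is a self-contained sketch of such a causal mask, broadcast over attention heads; the actual build_future_mask may differ in its shape conventions.

def future_mask_sketch(sequence_length, num_heads, dtype=tf.float32):
    max_time = tf.reduce_max(sequence_length)
    # Lower-triangular [max_time, max_time]: query i sees keys 0..i.
    causal = tf.matrix_band_part(tf.ones([max_time, max_time], dtype=dtype), -1, 0)
    # Zero out padded key positions for each batch entry.
    keys = tf.sequence_mask(sequence_length, maxlen=max_time, dtype=dtype)
    mask = causal * tf.expand_dims(keys, 1)      # [batch, queries, keys]
    mask = tf.expand_dims(mask, 1)               # [batch, 1, queries, keys]
    return tf.tile(mask, [1, num_heads, 1, 1])   # [batch, heads, queries, keys]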
Example #4
    def _self_attention_stack(self,
                              inputs,
                              sequence_length,
                              mode=tf.estimator.ModeKeys.TRAIN,
                              memory=None,
                              memory_sequence_length=None):
        if self.position_encoder is not None:
            inputs = self.position_encoder(inputs,
                                           sequence_length=sequence_length)

        inputs = tf.layers.dropout(
            inputs,
            rate=self.dropout,
            training=mode == tf.estimator.ModeKeys.TRAIN)

        for l in range(self.num_layers):
            with tf.variable_scope("layer_{}".format(l)):
                with tf.variable_scope("masked_multi_head"):
                    encoded = transformer.multi_head_attention(
                        self.num_heads,
                        inputs,
                        inputs,
                        inputs,
                        mode,
                        values_length=sequence_length,
                        mask_future=True,
                        dropout=self.attention_dropout)
                    encoded = transformer.add_and_norm(inputs,
                                                       encoded,
                                                       mode,
                                                       dropout=self.dropout)

                with tf.variable_scope("multi_head"):
                    if memory is None:
                        values = encoded
                    elif tf.contrib.framework.nest.is_sequence(memory):
                        if l >= len(memory):
                            raise ValueError(
                                "If the encoder memory is a sequence, it must "
                                "contain one memory per decoder layer")
                        values = memory[l]
                    else:
                        values = memory
                    keys = values

                    context = transformer.multi_head_attention(
                        self.num_heads,
                        encoded,
                        keys,
                        values,
                        mode,
                        values_length=memory_sequence_length,
                        dropout=self.attention_dropout)
                    context = transformer.add_and_norm(encoded,
                                                       context,
                                                       mode,
                                                       dropout=self.dropout)

                with tf.variable_scope("ffn"):
                    transformed = transformer.feed_forward(
                        context, self.ffn_inner_dim)
                    transformed = transformer.add_and_norm(
                        context, transformed, mode, dropout=self.dropout)

                inputs = transformed

        outputs = inputs
        return outputs
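Example #2 can collect one output tensor per encoder layer (keep_layers_output), and Example #4 accepts either a single memory or a sequence with one memory per decoder layer. The function below is a hypothetical wiring sketch showing how the two would fit together; the encoder and decoder objects and their embedding inputs are assumptions, not defined in the examples.

def connect_layerwise(encoder, decoder, src_emb, src_len, tgt_emb, tgt_len,
                      mode=tf.estimator.ModeKeys.TRAIN):
    # With keep_layers_output enabled, Example #2 returns a list with one
    # tensor per encoder layer instead of a single output tensor.
    memories, _, _ = encoder.encode(src_emb, sequence_length=src_len, mode=mode)
    # Example #4 indexes a sequence memory by the decoder layer number,
    # so the list above is consumed one entry per decoder layer.
    return decoder._self_attention_stack(
        tgt_emb,
        sequence_length=tgt_len,
        mode=mode,
        memory=memories,
        memory_sequence_length=src_len)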