Example #1
    def _add_multi_head_attention_layer(self, keys, queries, values, key_seq,
                                        value_seq):
        # Multi-Head Attention
        x = layers.SelfAttentionLayer(input_size=self.nb_heads *
                                      self.hidden_size,
                                      hidden_size=self.hidden_size,
                                      key_seq=key_seq,
                                      value_seq=value_seq,
                                      nb_heads=self.nb_heads,
                                      causality=False,
                                      dtype=self.dtype)(keys=keys,
                                                        queries=queries,
                                                        values=values)
        # Dropout
        x = self._dropout(x)
        # Add & Norm
        h = layers.LayerNorm(hidden_size=self.nb_heads * self.hidden_size,
                             dtype=self.dtype)(x=x + queries)
        # 2-layer Feed Forward
        x = layers.FeedForwardLayer(hidden_size=self.nb_heads *
                                    self.hidden_size,
                                    activation=tf.nn.relu,
                                    dtype=self.dtype)(x=h)
        x = layers.FeedForwardLayer(hidden_size=self.nb_heads *
                                    self.hidden_size,
                                    activation=lambda x: x,
                                    dtype=self.dtype)(x=x)
        # Dropout
        x = self._dropout(x)
        # Add & Norm
        x = layers.LayerNorm(hidden_size=self.nb_heads * self.hidden_size,
                             dtype=self.dtype)(x=x + h)
        return x
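
A minimal sketch of how this block might be wired into an encoder layer; the names x and src_seq (the activations being attended over and the sequence used for length masking) and the call site are assumptions, not part of the example above.

# Hypothetical call site: plain self-attention, so keys, queries and values
# are all the current activations x (x and src_seq are assumed names).
x = self._add_multi_head_attention_layer(
    keys=x, queries=x, values=x,
    key_seq=src_seq, value_seq=src_seq)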
Example #2
    def __init__(self, config, batch_size, dropout_embedding, dropout_hidden, hidden_to_logits_W=None):
        self.config = config

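        # Linear projections of the previous target embedding, the decoder state
        # and the attended context into a shared hidden space of size
        # target_embedding_size.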
        with tf.variable_scope("prev_emb_to_hidden"):
            self.prev_emb_to_hidden = layers.FeedForwardLayer(
                                in_size=config.target_embedding_size,
                                out_size=config.target_embedding_size,
                                batch_size=batch_size,
                                non_linearity=lambda y: y,
                                use_layer_norm=config.use_layer_norm,
                                dropout_input=dropout_embedding)
        with tf.variable_scope("state_to_hidden"):
            self.state_to_hidden = layers.FeedForwardLayer(
                                    in_size=config.state_size,
                                    out_size=config.target_embedding_size,
                                    batch_size=batch_size,
                                    non_linearity=lambda y: y,
                                    use_layer_norm=config.use_layer_norm,
                                    dropout_input=dropout_hidden)
        with tf.variable_scope("attended_context_to_hidden"):
            self.att_ctx_to_hidden = layers.FeedForwardLayer(
                                    in_size=2*config.state_size,
                                    out_size=config.target_embedding_size,
                                    batch_size=batch_size,
                                    non_linearity=lambda y: y,
                                    use_layer_norm=config.use_layer_norm,
                                    dropout_input=dropout_hidden)

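        # Optional PReLU activation for the output hidden layer.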
        if config.output_hidden_activation == 'prelu':
            with tf.variable_scope("hidden_prelu"):
                self.hidden_prelu = PReLU(in_size=config.target_embedding_size)

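        # Projection from the hidden layer to vocabulary logits; the weight matrix
        # can be supplied externally (e.g. tied to the target embeddings).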
        with tf.variable_scope("hidden_to_logits"):
            self.hidden_to_logits = layers.FeedForwardLayer(
                            in_size=config.target_embedding_size,
                            out_size=config.target_vocab_size,
                            batch_size=batch_size,
                            non_linearity=lambda y: y,
                            W=hidden_to_logits_W,
                            dropout_input=dropout_embedding)

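        # Mixture of softmaxes: mixture-weight logits plus one hidden projection
        # per mixture component.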
        if config.softmax_mixture_size > 1:
            with tf.variable_scope("hidden_to_pi_logits"):
                self.hidden_to_pi_logits = layers.FeedForwardLayer(
                    in_size=config.target_embedding_size,
                    out_size=config.softmax_mixture_size,
                    batch_size=batch_size,
                    non_linearity=lambda y: y,
                    dropout_input=dropout_embedding)
            self.hidden_to_mos_hidden = []
            for k in range(config.softmax_mixture_size):
                with tf.variable_scope("hidden_to_mos_hidden_{}".format(k)):
                    layer = layers.FeedForwardLayer(
                        in_size=config.target_embedding_size,
                        out_size=config.target_embedding_size,
                        batch_size=batch_size,
                        use_layer_norm=config.use_layer_norm,
                        dropout_input=dropout_embedding)
                    self.hidden_to_mos_hidden.append(layer)
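
A rough sketch of how these projections could be combined into next-word logits, where prev_emb, state and att_ctx stand for the previous target embedding, the decoder state and the attended context; only FeedForwardLayer.forward appears in these examples, and the summation and tanh are assumptions about the combination step, which is not shown in the excerpt.

# Sketch only: the sum and the tanh are assumptions, not taken from the excerpt.
hidden = (self.prev_emb_to_hidden.forward(prev_emb)
          + self.state_to_hidden.forward(state)
          + self.att_ctx_to_hidden.forward(att_ctx))
hidden = tf.tanh(hidden)
logits = self.hidden_to_logits.forward(hidden)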
Example #3
    def __init__(self,
                 config,
                 context,
                 x_embs,
                 x_mask,
                 dropout_target,
                 dropout_embedding,
                 dropout_hidden,
                 encoder_embedding_layer=None):

        self.dropout_target = dropout_target
        batch_size = tf.shape(x_mask)[1]

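        # The initial decoder state is predicted from the masked mean of the
        # encoder context.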
        with tf.variable_scope("initial_state_constructor"):
            context_sum = tf.reduce_sum(context *
                                        tf.expand_dims(x_mask, axis=2),
                                        axis=0)

            context_mean = context_sum / tf.expand_dims(
                tf.reduce_sum(x_mask, axis=0), axis=1)
            self.init_state_layer = layers.FeedForwardLayer(
                in_size=config.state_size * 2,
                out_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                dropout_input=dropout_hidden)
            self.init_state = self.init_state_layer.forward(context_mean)
            self.x_embs = x_embs

            self.translation_maxlen = config.translation_maxlen
            self.embedding_size = config.target_embedding_size
            self.state_size = config.state_size
            self.target_vocab_size = config.target_vocab_size

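        # Reuse the encoder's embedding layer if one is supplied (tied embeddings);
        # otherwise create a separate target-side embedding layer.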
        with tf.variable_scope("embedding"):
            if encoder_embedding_layer is None:
                self.y_emb_layer = layers.EmbeddingLayer(
                    vocabulary_sizes=[config.target_vocab_size],
                    dim_per_factor=[config.target_embedding_size])
            else:
                self.y_emb_layer = encoder_embedding_layer

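        # Base conditional GRU: a GRU step over the target embedding, an attention
        # step over the encoder context, and a deep-transition GRU step over the
        # attended context.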
        with tf.variable_scope("base"):
            with tf.variable_scope("gru0"):
                if config.theano_compat:
                    bias_type = layers.LegacyBiasType.THEANO_A
                else:
                    bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_FALSE
                self.grustep1 = layers.GRUStep(
                    input_size=config.target_embedding_size,
                    state_size=config.state_size,
                    batch_size=batch_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    legacy_bias_type=bias_type,
                    dropout_input=dropout_embedding,
                    dropout_state=dropout_hidden)
            with tf.variable_scope("attention"):
                self.attstep = layers.AttentionStep(
                    context=context,
                    context_state_size=2 * config.state_size,
                    context_mask=x_mask,
                    state_size=config.state_size,
                    hidden_size=2 * config.state_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    dropout_context=dropout_hidden,
                    dropout_state=dropout_hidden)
            if config.theano_compat:
                bias_type = layers.LegacyBiasType.THEANO_B
            else:
                bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
            self.grustep2 = layers.DeepTransitionGRUStep(
                input_size=2 * config.state_size,
                state_size=config.state_size,
                batch_size=batch_size,
                use_layer_norm=config.rnn_layer_normalization,
                legacy_bias_type=bias_type,
                dropout_input=dropout_hidden,
                dropout_state=dropout_hidden,
                transition_depth=config.rnn_dec_base_transition_depth - 1,
                var_scope_fn=lambda i: "gru{0}".format(i + 1))

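        # Optional stack of higher GRU layers with residual connections
        # (used when rnn_dec_depth > 1).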
        with tf.variable_scope("high"):
            if config.rnn_dec_depth == 1:
                self.high_gru_stack = None
            else:
                if config.theano_compat:
                    bias_type = layers.LegacyBiasType.THEANO_A
                else:
                    bias_type = layers.LegacyBiasType.NEMATUS_COMPAT_TRUE
                self.high_gru_stack = layers.GRUStack(
                    input_size=config.state_size,
                    state_size=config.state_size,
                    batch_size=batch_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    legacy_bias_type=bias_type,
                    dropout_input=dropout_hidden,
                    dropout_state=dropout_hidden,
                    stack_depth=config.rnn_dec_depth - 1,
                    transition_depth=config.rnn_dec_high_transition_depth,
                    context_state_size=(2 * config.state_size
                                        if config.rnn_dec_deep_context else 0),
                    residual_connections=True,
                    first_residual_output=0)

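        # Optional lexical translation model.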
        if config.rnn_lexical_model:
            with tf.variable_scope("lexical"):
                self.lexical_layer = layers.LexicalModel(
                    in_size=config.embedding_size,
                    out_size=config.embedding_size,
                    batch_size=batch_size,
                    use_layer_norm=config.rnn_layer_normalization,
                    dropout_embedding=dropout_embedding,
                    dropout_hidden=dropout_hidden)
        else:
            self.lexical_layer = None

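        # Next-word predictor; optionally ties its output weight matrix to the
        # (transposed) target embeddings.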
        with tf.variable_scope("next_word_predictor"):
            W = None
            if config.tie_decoder_embeddings:
                W = self.y_emb_layer.get_embeddings(factor=0)
                W = tf.transpose(W)
            self.predictor = Predictor(config,
                                       batch_size,
                                       dropout_embedding,
                                       dropout_hidden,
                                       hidden_to_logits_W=W)
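
A hypothetical single decoding step stringing the components built above together; the forward and get_logits method names and their argument order are assumptions inferred from the constructor arguments, not code shown in the example.

# Hypothetical decoding step; method names and argument order are assumptions.
state = self.grustep1.forward(prev_state, y_emb)   # GRU step over the target embedding
att_ctx = self.attstep.forward(state)              # attention over the encoder context
state = self.grustep2.forward(state, att_ctx)      # deep-transition GRU step
logits = self.predictor.get_logits(y_emb, state, att_ctx)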