Example #1
    def cross_attention(self, a_repre, b_repre, scope, reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # cross-attention: b_repre attends over a_repre
            encx = multihead_attention(queries=b_repre,
                                       keys=a_repre,
                                       values=a_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)
            # feed forward
            encx = ff(encx, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

            # cross-attention: a_repre attends over b_repre
            ency = multihead_attention(queries=a_repre,
                                       keys=b_repre,
                                       values=b_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)

            encx, ency = self._infer(encx, ency)

            # feed forward
            ency = ff(ency, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

        return encx, ency
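All of the examples on this page delegate the attention math to a multihead_attention helper. A minimal NumPy sketch of the scaled dot-product core such a helper is assumed to wrap (single head, no key masking, no dropout, no residual connection or layer norm):

import numpy as np

def scaled_dot_product_attention(q, k, v):
    # softmax(Q K^T / sqrt(d_k)) V, the core of multihead_attention
    d_k = q.shape[-1]
    scores = q @ k.transpose(0, 2, 1) / np.sqrt(d_k)       # (N, T_q, T_k)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)         # row-wise softmax
    return weights @ v                                      # (N, T_q, d_v)

# toy cross-attention call mirroring Example #1: queries=b_repre, keys=values=a_repre
a_repre = np.random.randn(2, 5, 8)
b_repre = np.random.randn(2, 7, 8)
encx = scaled_dot_product_attention(b_repre, a_repre, a_repre)
print(encx.shape)   # (2, 7, 8)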
Example #2
    def inter_blocks(self,
                     a_repre,
                     b_repre,
                     scope,
                     training=True,
                     reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # cross-attention: b_repre attends over a_repre
            encx = multihead_attention(queries=b_repre,
                                       keys=a_repre,
                                       values=a_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=training,
                                       causality=False)
            # feed forward
            encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

            # cross-attention: a_repre attends over b_repre
            ency = multihead_attention(queries=a_repre,
                                       keys=b_repre,
                                       values=b_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=training,
                                       causality=False)
            # feed forward
            ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

        return encx, ency
Example #3
    def decode(self, decoder_inputs, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        dec: decoder outputs. (N, T2, d_model)
        outputs: list of intermediate outputs, one per variable scope
        scopes: list of the corresponding variable scope names
        '''
        scopes = []
        outputs = []
        with tf.variable_scope("decoder_embedding_lookup",
                               reuse=tf.AUTO_REUSE):
            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(dec)
            # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("decoder_num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec,
                                          keys=dec,
                                          values=dec,
                                          key_masks=tgt_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
                scopes.append(tf.get_variable_scope().name)
                outputs.append(dec)

        return dec, outputs, scopes
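Example #3 also returns the variable-scope names it created alongside the per-block outputs. A hedged sketch of one way such names could be consumed, e.g. building a tf.train.Saver restricted to the decoder's variables (the surrounding graph and the use case are assumptions, not part of the original repo):

import tensorflow as tf

def saver_for_scopes(scopes):
    # collect every global variable that lives under the returned scope names
    var_list = []
    for scope in scopes:
        var_list.extend(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope))
    return tf.train.Saver(var_list=var_list)

# usage sketch (assuming the graph was built with model.decode(...)):
# _, _, decoder_scopes = model.decode(decoder_inputs, memory, src_masks)
# decoder_saver = saver_for_scopes(decoder_scopes)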
Example #4
    def decode(self, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(
            self.embeddings
        )  # (d_model, vocab_size); the embeddings can be reused here because input and output embeddings are shared (paper Section 3.4)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits,
                                      axis=-1))  # (N, T2); note: tf.argmax, not the deprecated tf.arg_max

        return logits, y_hat, y, sents2
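The final projection in Example #4 reuses the transposed input embedding table instead of a separate output layer (weight tying, Section 3.4 of "Attention Is All You Need"). A small NumPy sketch showing that the tf.einsum('ntd,dk->ntk', ...) above is just a batched matrix product against that transposed table:

import numpy as np

N, T2, d_model, vocab_size = 2, 4, 8, 16
dec = np.random.randn(N, T2, d_model)             # decoder output
embeddings = np.random.randn(vocab_size, d_model)

weights = embeddings.T                            # (d_model, vocab_size)
logits = np.einsum('ntd,dk->ntk', dec, weights)   # (N, T2, vocab_size)

assert np.allclose(logits, dec @ weights)         # same as a plain matmul
y_hat = logits.argmax(axis=-1)                    # (N, T2)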
Example #5
    def decode(self, ys, memory, training=True):
        with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # Embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)
            dec *= self.hp.num_units**0.5  # scale
            dec += position_encoding(dec, self.hp.maxlen)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope('num_blocks_{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,  # masked self-attention: causality is True here
                        scope='self_attention')
                    # attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,  # encoder-decoder attention: causality is False here
                        scope='vanilla_attention')
                    # Feed-forward
                    dec = feed_forward(
                        dec, num_units=[self.hp.d_ff, self.hp.num_units])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (hidden_units, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N,T2,vocab_size)
        # set values corresponding to unk = 0
        logits_first = tf.expand_dims(logits[:, :, 0], 2)
        zeros = tf.zeros_like(logits_first)
        logits = tf.concat([logits_first, zeros, logits[:, :, 2:]], axis=2)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
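Example #5 additionally rebuilds the logits so that vocabulary index 1 (assumed here to be <unk>, following the comment in the code) is replaced by zeros before the argmax. A NumPy sketch of the same slice-and-concat trick:

import numpy as np

logits = np.random.randn(2, 3, 5)                 # (N, T2, vocab_size)

logits_first = logits[:, :, 0:1]                  # keep column 0
zeros = np.zeros_like(logits_first)               # column 1 (<unk>) becomes 0
logits = np.concatenate([logits_first, zeros, logits[:, :, 2:]], axis=2)

assert logits.shape == (2, 3, 5) and np.all(logits[:, :, 1] == 0)
y_hat = logits.argmax(axis=-1)                    # (N, T2)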
Example #6
    def encode(self, xs, training=True):
        '''
        :return: encoder outputs (N,T1,hidden_units)
        '''
        with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # Embedding
            enc = tf.nn.embedding_lookup(self.embeddings, x)
            enc *= self.hp.num_units**0.5  # scale

            enc += position_encoding(enc, self.hp.maxlen)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope('num_blocks_{}'.format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = feed_forward(
                        enc, num_units=[self.hp.d_ff, self.hp.num_units])

        memory = enc
        return memory, sents1
Example #7
    def _encode(self, enc, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # embedding
            enc *= self.arg.d_model**0.5  # scale

            enc += positional_encoding(enc, self.arg.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.arg.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.arg.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.arg.num_heads,
                        dropout_rate=self.arg.dropout_rate,
                        training=training,
                        causality=False)
        memory = enc
        return memory
Example #8
    def time_encode(self, encoder_inputs):
        '''
        Returns
        memory: encoder outputs. (BATCH, SEQ_LEN, HIDDEN_SIZE)
        '''
        with tf.variable_scope("time_encoder", reuse=tf.AUTO_REUSE):

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings, encoder_inputs)
            enc *= hp.HIDDEN_SIZE**0.5

            enc += positional_encoding(enc, hp.MAX_LEN)
            enc = tf.nn.dropout(enc, self.dropout)

            # Blocks
            for i in range(hp.NUM_BLOCKS):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              num_heads=hp.NUM_HEADS,
                                              dropout=self.dropout,
                                              causality=True)
                    # feed forward
                    enc = ff(enc, num_units=[hp.FF_SIZE, hp.HIDDEN_SIZE])

        output = tf.reshape(enc, (-1, hp.MAX_LEN, hp.HIDDEN_SIZE))
        logits = tf.layers.dense(output, len(self.token2idx))
        return logits
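Example #8 runs its self-attention with causality=True, so each position can only attend to itself and earlier positions. A hedged sketch of the lower-triangular masking such a flag typically applies to the attention logits inside multihead_attention (the exact masking value and tensor layout of the real helper are assumptions):

import tensorflow as tf

def apply_causal_mask(scores):
    # scores: (h*N, T_q, T_k) raw attention logits
    diag = tf.ones_like(scores[0, :, :])                    # (T_q, T_k)
    tril = tf.linalg.band_part(diag, -1, 0)                 # lower triangle
    tril = tf.tile(tf.expand_dims(tril, 0),
                   [tf.shape(scores)[0], 1, 1])             # (h*N, T_q, T_k)
    paddings = tf.ones_like(tril) * (-2 ** 32 + 1.0)        # ~ -inf before softmax
    return tf.where(tf.equal(tril, 0), paddings, scores)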
Example #9
def transformer_encode(enc, config, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE):

        # embedding
        enc *= config.d_model**0.5  # scale

        enc += positional_encoding(enc, config.max_sent_num)
        enc = tf.layers.dropout(enc, config.drop_rate, training=training)

        ## Blocks
        for i in range(config.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=config.num_heads,
                                          dropout_rate=config.drop_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[config.d_ff, config.d_model])
    memory = enc
    return memory
Example #10
    def encode(self, xs, training=True):
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings, x) # (N, T1, d_model)
            enc *= self.hp.d_model**0.5 # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1
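The encoders above add positional_encoding(...) to the scaled embeddings (the call signatures differ slightly across repos). A NumPy sketch of the standard sinusoidal table from the Transformer paper that such a helper is assumed to produce; the real helpers may also handle masking and broadcasting over the batch:

import numpy as np

def sinusoid_position_table(maxlen, d_model):
    # PE[pos, 2i]   = sin(pos / 10000^(2i/d_model))
    # PE[pos, 2i+1] = cos(pos / 10000^(2i/d_model))
    pos = np.arange(maxlen)[:, None].astype(np.float64)     # (maxlen, 1)
    i = np.arange(d_model)[None, :]                         # (1, d_model)
    angles = pos / np.power(10000.0, (2 * (i // 2)) / d_model)
    table = np.zeros((maxlen, d_model))
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return table                                            # (maxlen, d_model)

print(sinusoid_position_table(50, 512).shape)   # (50, 512)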
Example #11
    def dense_blocks(self, a_repre, b_repre, scope, reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # self-attention
            _encx = multihead_attention(queries=a_repre,
                                        keys=a_repre,
                                        values=a_repre,
                                        num_heads=self.hp.num_heads,
                                        dropout_rate=self.hp.dropout_rate,
                                        training=self.is_training,
                                        causality=False)

            # self-attention
            _ency = multihead_attention(queries=b_repre,
                                        keys=b_repre,
                                        values=b_repre,
                                        num_heads=self.hp.num_heads,
                                        dropout_rate=self.hp.dropout_rate,
                                        training=self.is_training,
                                        causality=False)

            # inter-attention
            ency = multihead_attention(queries=_encx,
                                       keys=_ency,
                                       values=_ency,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)

            # inter-attention
            encx = multihead_attention(queries=_ency,
                                       keys=_encx,
                                       values=_encx,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)

            encx, ency = self._infer(encx, ency)

            #encx, ency, ae_loss = self._dense_infer(encx, ency, x_layer, y_layer, layer_num)

            return encx, ency
Example #12
    def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="Model_SAKmeans")

        with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks
            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)

            num_heads = num_interest
            self.user_eb = getKVector(sess, self.seq, num_heads)
            self.dim = embedding_dim
            item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

            # item_list_emb = [-1, seq_len, embedding_dim]
            # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
            atten = tf.matmul(self.user_eb, tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
            atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

            # pick the user interest vector most similar to the target item
            readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                                tf.argmax(atten, axis=1, output_type=tf.int32) + tf.range(
                                    tf.shape(item_list_emb)[0]) * num_heads)

            self.build_sampled_softmax_loss(self.item_eb, readout)
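Example #12 selects, per batch element, the interest vector whose attention score against the target item is highest; it does this by flattening user_eb to (batch*num_heads, dim) and offsetting each row's argmax by row*num_heads. A NumPy sketch of that index arithmetic:

import numpy as np

batch, num_heads, dim = 3, 4, 8
user_eb = np.random.randn(batch, num_heads, dim)   # interest vectors
atten = np.random.rand(batch, num_heads)           # scores against the target item

flat = user_eb.reshape(-1, dim)                    # (batch*num_heads, dim)
idx = atten.argmax(axis=1) + np.arange(batch) * num_heads
readout = flat[idx]                                # (batch, dim), one vector per example

# equivalent without flattening
assert np.allclose(readout, user_eb[np.arange(batch), atten.argmax(axis=1)])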
Example #13
    def inter_blocks(self, a_repre, b_repre, x_layer, y_layer, layer_num, scope, reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # encx, ency = inter_multihead_attention(queries=b_repre,
            #                            keys=a_repre,
            #                            values=a_repre,
            #                            num_heads=self.hp.num_heads,
            #                            dropout_rate=self.hp.dropout_rate,
            #                            training=self.is_training,
            #                            causality=False)

            # cross-attention: b_repre attends over a_repre
            encx = multihead_attention(queries=b_repre,
                                       keys=a_repre,
                                       values=a_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)
            # feed forward
            encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

            # cross-attention: a_repre attends over b_repre
            ency = multihead_attention(queries=a_repre,
                                       keys=b_repre,
                                       values=b_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)
            # feed forward
            ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

            #encx, ency, ae_loss = self._dense_infer(encx, ency, x_layer, y_layer, layer_num)

            #encx, ency = self._infer(encx, ency)

        return encx, ency
Example #14
    def encode(self, x, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)
        outputs: list of intermediate outputs, one per variable scope
        scopes: list of the corresponding variable scope names
        '''
        scopes = []
        outputs = []
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            self.token2idx, self.idx2token = load_vocab(self.hp.vocab)
            self.embeddings = get_token_embeddings(self.hp.vocab_size,
                                                   self.hp.d_model,
                                                   zero_pad=True)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(self.embeddings)
        with tf.variable_scope("encoder_embedding_lookup",
                               reuse=tf.AUTO_REUSE):
            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(enc)
            ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("encoder_num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                scopes.append(tf.get_variable_scope().name)
                outputs.append(enc)
        memory = enc
        return memory, src_masks, outputs, scopes
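Example #14 builds its token table with get_token_embeddings(..., zero_pad=True) and later masks keys with tf.math.equal(x, 0), i.e. padding id 0. A hedged sketch of what such a zero-padded embedding helper commonly does (the name and signature are taken from the call above; the body is an assumption):

import tensorflow as tf

def get_token_embeddings_sketch(vocab_size, d_model, zero_pad=True):
    # trainable token embedding table; with zero_pad, the row for id 0
    # (padding) is pinned to the zero vector, matching the equal(x, 0) masks
    with tf.variable_scope("shared_weight_matrix", reuse=tf.AUTO_REUSE):
        embeddings = tf.get_variable("weight_mat",
                                     dtype=tf.float32,
                                     shape=(vocab_size, d_model))
        if zero_pad:
            embeddings = tf.concat((tf.zeros(shape=[1, d_model]),
                                    embeddings[1:, :]), axis=0)
    return embeddings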
Example #15
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding1
            enc1 = tf.nn.embedding_lookup(self.embeddings1,
                                          x)  # (N, T1, d_model)
            enc1 *= self.hp.d_model**0.5  # scale
            enc1 += positional_encoding(enc1, self.hp.maxlen1)
            enc1 = tf.layers.dropout(enc1,
                                     self.hp.dropout_rate,
                                     training=training)

            # embedding2
            enc2 = tf.nn.embedding_lookup(self.embeddings2,
                                          x)  # (N, T1, d_model)
            enc2 *= self.hp.d_model**0.5  # scale
            enc2 += positional_encoding(enc2, self.hp.maxlen1)
            enc2 = tf.layers.dropout(enc2,
                                     self.hp.dropout_rate,
                                     training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc1, enc2 = multihead_attention(
                        queries=(enc1, enc2),
                        keys=enc1,
                        values=enc2,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc1 = ff(enc1, num_units=[self.hp.d_ff, self.hp.d_model])
                    enc2 = ff(enc2, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = (enc1, enc2)
        return memory, sents1, src_masks
Example #16
    def encode(self, xs, training=True):
        '''
        xs: training data
        Returns
        memory: encoder outputs. (N, T1, d_model)
                                N: batch size;
                                T1: sentence length
                                d_model: 512, the word embedding dimension
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # xs: tuple of
            #               x: int32 tensor. (N, T1)
            #               x_seqlens: int32 tensor. (N,)  sentence lengths
            #               sents1: str tensor. (N,)
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional encoding
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            # Encoder blocks
            # num_blocks (e.g. 6) is the number of stacked sub-modules, each a multihead_attention + feed_forward pair
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #17
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            if self.hp.fac_embed:
                enc = tf.nn.embedding_lookup(self.embeddings1,
                                             x)  # (N, T1, d_embed)
                enc = tf.matmul(enc, self.embeddings2)  # (N, T1, d_model)
            else:
                enc = tf.nn.embedding_lookup(self.embeddings,
                                             x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                if self.hp.share_weights:
                    vs_name = "blocks_shared"
                else:
                    vs_name = "num_blocks_{}".format(i)
                with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
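Example #17 has two optional parameter-saving switches: fac_embed factorizes the embedding into a (V, d_embed) lookup followed by a (d_embed, d_model) projection, and share_weights reuses one block's variables across all layers (the scheme popularized by ALBERT). A NumPy sketch of the factorized lookup and the parameter count it saves (the sizes below are illustrative assumptions):

import numpy as np

V, d_model, d_embed = 32000, 512, 128
emb1 = np.random.randn(V, d_embed)        # plays the role of self.embeddings1
emb2 = np.random.randn(d_embed, d_model)  # plays the role of self.embeddings2

x = np.array([[3, 7, 0]])                 # (N, T1) token ids
enc = emb1[x] @ emb2                      # (N, T1, d_model), as in the fac_embed branch

print(V * d_model)                        # 16,384,000 embedding parameters without factorization
print(V * d_embed + d_embed * d_model)    # 4,161,536 with factorization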
Example #18
    def encode(self, xs, training=True):  # implements the encoder
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        # N: batch_size
        # T1: sentence length
        # d_model: word embedding dimension
        '''
        # What this does:
        # (1) input word embeddings + positional_encoding
        # (2) the encoder chains num_blocks blocks, each consisting of multihead attention followed by the fully connected layer ff
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional vectors to the initial word embeddings
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #19
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1) batch_size, time_steps
            # mask over the source sequence
            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            # enc += positional_encoding(enc, self.hp.maxlen1)
            # the positional encoding can only be added if its dimension matches the word embedding dimension
            # for topic-to-essay generation, positional vectors seem unnecessary here:
            # the encoder part does not need them, but the decoder part does

            # dropout is applied to the word embeddings as well
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention because queries, keys and values are all 'enc'
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # causality decides whether future tokens are masked; with False, future tokens remain visible
                    # feed forward + residual connection
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #20
 def aggregation(self, a_repre, b_repre):
     dim = a_repre.shape.as_list()[-1]
     with tf.variable_scope("aggregation", reuse=tf.AUTO_REUSE):
         # Blocks
         for i in range(self.hp.num_agg_blocks):
             with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                 # Self-attention (queries, keys and values are all a_repre)
                 a_repre = multihead_attention(queries=a_repre,
                                               keys=a_repre,
                                               values=a_repre,
                                               num_heads=self.hp.num_heads,
                                               dropout_rate=self.hp.dropout_rate,
                                               training=self.is_training,
                                               causality=False,
                                               scope="vanilla_attention")
                 ### Feed Forward
                 a_repre = ff(a_repre, num_units=[self.hp.d_ff, dim])
     return a_repre
Example #21
    def build_blocks(self, inputs, masks, reuse=None):
        self.blk = inputs
        for i in range(self.num_blocks):
            with tf.variable_scope("blocks_{}".format(i), reuse=reuse):
                ## Multihead Attention ( self-attention)
                self.blk = multihead_attention(queries=self.blk,
                                               keys=self.blk,
                                               qkv_masks=masks,
                                               num_units=self.hidden_units,
                                               num_heads=self.num_heads,
                                               dropout_rate=self.dropout,
                                               # is_training=is_training,
                                               causality=False,
                                               scope="self_attention",
                                               reuse=reuse)
                self.blk = feedforward(self.blk, num_units=[4*self.hidden_units, self.hidden_units], reuse=reuse)

        return self.blk
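The ff / feedforward calls throughout these examples take num_units=[d_ff, d_model] (4*hidden_units above). A hedged sketch of the usual position-wise feed-forward sub-layer they are assumed to implement: two dense layers with a ReLU in between, a residual connection, and layer normalization (the real helpers may differ in details such as where dropout is applied):

import tensorflow as tf

def ln_sketch(inputs, epsilon=1e-8, scope="ln"):
    # layer normalization over the last dimension
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        params_shape = inputs.get_shape()[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape,
                               initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape,
                                initializer=tf.ones_initializer())
        return gamma * (inputs - mean) / ((variance + epsilon) ** 0.5) + beta

def feed_forward_sketch(inputs, num_units, scope="positionwise_ffn"):
    # inputs: (N, T, d_model); num_units: [d_ff, d_model]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        outputs = tf.layers.dense(outputs, num_units[1])
        outputs += inputs          # residual; assumes num_units[1] == d_model
        return ln_sketch(outputs)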
Example #22
    def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="Model_SASRec")

        with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks

            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)
            self.sum_pooling = tf.reduce_sum(self.seq, 1)
            fc1 = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
            self.user_eb = tf.layers.dense(fc3, hidden_size, activation=tf.nn.relu)
            self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
Example #23
 def encoder_blocks(self, a_repre, reuse=tf.AUTO_REUSE):
     for i in range(self.hp.num_transformer):
         with tf.variable_scope("num_trans_blocks_{}".format(i),
                                reuse=reuse):
             # self-attention
             a_repre = multihead_attention(
                 queries=a_repre,
                 keys=a_repre,
                 values=a_repre,
                 num_heads=self.hp.num_heads,
                 dropout_rate=self.hp.dropout_rate,
                 training=self.is_training,
                 causality=False)
             # feed forward
             #a_repre = ff(a_repre, num_units=[self.hp.d_ff, self.hp.d_model])
             a_repre = ff(
                 a_repre,
                 num_units=[self.hp.d_ff,
                            a_repre.shape.as_list()[-1]])
     return a_repre
Example #24
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, video_path = xs

            # src_masks

            # embedding
            enc = tf.layers.dense(x, self.d_model)
            #src_masks = tf.math.equal(mask, 0) # (N, T1)
            src_masks = tf.sequence_mask(seqlens)

            #enc = tf.nn.embedding_lookup(self.embeddings, x) # (N, T1, d_model)
            #enc *= self.hp.d_model**0.5 # scale

            enc /= self.hp.d_model**0.5

            enc += positional_encoding(enc, self.hp.n_video)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                    )
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, src_masks
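Example #24 projects continuous video features with a dense layer instead of an embedding lookup, and derives the key mask from the true lengths via tf.sequence_mask. Note that this mask is True on valid positions, the opposite polarity of the tf.math.equal(x, 0) masks in the other examples, so the multihead_attention implementation must agree on the convention. A tiny sketch of the mask itself:

import tensorflow as tf

seqlens = tf.constant([3, 1, 4])
src_masks = tf.sequence_mask(seqlens, maxlen=5)   # (N, T1), True on valid steps

with tf.Session() as sess:
    print(sess.run(src_masks))
# [[ True  True  True False False]
#  [ True False False False False]
#  [ True  True  True  True False]]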
Example #25
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs
            # x = tf.Print(x, [x], message='x =', summarize=10)
            # print_sent = tf.Print(sents1, [sents1], message='sents1 =', summarize=3)
            # with tf.control_dependencies([print_sent]):
            # embedding
            # xs_pri = tf.print('xs =', tf.shape(x), summarize=3)
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            # enc_pri = tf.print('enc =', tf.shape(enc), enc, summarize=3)
            ## Blocks
            # with tf.control_dependencies([xs_pri, enc_pri]):

            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1
Example #26
    def encode(self, encx, src_masks):

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # all_layer = []
            ## Blocks
            for i in range(self.hp.num_blocks_encoder):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    encx = multihead_attention(
                        queries=encx,
                        keys=encx,
                        values=encx,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=self.is_training,
                        causality=False)
                    # feed forward
                    encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

                    # all_layer.append(encx)

        return encx
Example #27
    def _encode(self, x, seq_num, training=True, name=None):
        """
        Returns
        memory: encoder outputs. (N, T1, d_model)
        """
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # embedding
            x = tf.identity(x, "input_x")
            enc = tf.nn.embedding_lookup(self._embeddings[seq_num],
                                         x)  # (N, T1, d_model)
            enc *= self._context.d_model**0.5  # scale

            enc += positional_encoding(enc, self._context.maxlens[seq_num])
            enc = tf.layers.dropout(enc,
                                    self._context.dropout_rate,
                                    training=training)

            # # Blocks
            for i in range(self._context.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self._context.num_heads,
                        dropout_rate=self._context.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(
                        enc,
                        num_units=[self._context.d_ff, self._context.d_model])
        memory = tf.identity(enc, name=name)
        return memory
Example #28
    def decode(self, ys, x_paraphrased_dict, memory, training=True):
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys
            x_paraphrased_dict, paraphrased_lens, paraphrased_sents = x_paraphrased_dict
            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            batch_size = tf.shape(decoder_inputs)[0]  # N
            seqlens = tf.shape(decoder_inputs)[1]  # T2
            paraphrased_lens = tf.shape(x_paraphrased_dict)[1]  # W2

            x_paraphrased_o, x_paraphrased_p = x_paraphrased_dict[:,:,0], x_paraphrased_dict[:,:,1]

            x_paraphrased_o_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_o)  # N, W2, d_model
            if self.hp.paraphrase_type == 0:
                x_paraphrased_p_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_p)
            else:
                x_paraphrased_p_embedding = paraphrased_positional_encoding(x_paraphrased_p, self.hp.maxlen2, self.hp.d_model)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=True,
                                              scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False,
                                              scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

            # add paraphrased dictionary attention
            h = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(dec, axis=2)

            o_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_o_embedding, axis=1)
            W_a_o = tf.get_variable("original_word_parameter_w", [2*self.hp.d_model], initializer=tf.initializers.random_normal(
          stddev=0.01, seed=None))
            V_a_o = tf.get_variable("original_word_parameter_v", [2*self.hp.d_model], initializer=tf.initializers.random_normal(
          stddev=0.01, seed=None))
            h_o_concat = tf.concat([h, o_embeding], -1) # N, T2, W2, 2*d_model
            score_tem_o = tf.tanh(W_a_o * h_o_concat) # N, T2, W2, 2*d_model
            score_o = tf.reduce_sum(V_a_o * score_tem_o, axis=-1) # N, T2, W2
            a = tf.nn.softmax(score_o) # N, T2, W2
            c_o = tf.matmul(a, x_paraphrased_o_embedding) # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

            p_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_p_embedding, axis=1)
            W_a_p = tf.get_variable("paraphrased_word_parameter_w", [2*self.hp.d_model], initializer=tf.initializers.random_normal(
          stddev=0.01, seed=None))
            V_a_p = tf.get_variable("paraphrased_word_parameter_v", [2*self.hp.d_model], initializer=tf.initializers.random_normal(
          stddev=0.01, seed=None))
            h_p_concat = tf.concat([h, p_embeding], -1) # N, T2, W2, 2*d_model
            score_tem_p = tf.tanh(W_a_p * h_p_concat) # N, T2, W2, 2*d_model
            score_p = tf.reduce_sum(V_a_p * score_tem_p, axis=-1) # N, T2, W2
            a = tf.nn.softmax(score_p) # N, T2, W2
            c_p = tf.matmul(a, x_paraphrased_p_embedding) # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

            c_t = tf.concat([c_o, c_p], axis=-1) # N, T2, d_model --> N, T2, 2*d_model
            out_dec = tf.layers.dense(tf.concat([dec, c_t], axis=-1), self.hp.d_model, activation=tf.tanh, use_bias=False, kernel_initializer=tf.initializers.random_normal(
          stddev=0.01, seed=None))

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', out_dec, weights) # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
    def build_model(self):
        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(self.de2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="enc_embed")
            sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
            key_masks = tf.expand_dims(sign, -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")

            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(self.en2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="dec_embed")

            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]),
                                   0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")
            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(self.en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
    def decode(self, xs, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        self.memory = memory
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, sents2 = ys
            x, _, = xs

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            attn_dists = []
            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec, _ = multihead_attention(queries=dec,
                                                 keys=dec,
                                                 values=dec,
                                                 num_heads=self.hp.num_heads,
                                                 dropout_rate=self.hp.dropout_rate,
                                                 training=training,
                                                 causality=True,
                                                 scope="self_attention")

                    # Vanilla attention
                    dec, attn_dist = multihead_attention(queries=dec,
                                                          keys=self.memory,
                                                          values=self.memory,
                                                          num_heads=self.hp.num_heads,
                                                          dropout_rate=self.hp.dropout_rate,
                                                          training=training,
                                                          causality=False,
                                                          scope="vanilla_attention")
                    attn_dists.append(attn_dist)
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size)

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            gens = tf.layers.dense(logits, 1, activation=tf.sigmoid, trainable=training, use_bias=False)

        logits = tf.nn.softmax(logits)

        # final distribution
        logits = self._calc_final_dist(x, gens, logits, attn_dists[-1])

        return logits, y, sents2
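The last decoder mixes the (softmaxed) vocabulary distribution with the encoder attention distribution through _calc_final_dist, which has the shape of a pointer-generator / copy mechanism. A heavily hedged NumPy sketch of one common form of that mixing, final = p_gen * vocab_dist + (1 - p_gen) * copy_dist, where the copy distribution scatters attention mass onto the source token ids (the real helper's exact indexing and any extended-vocabulary handling may differ):

import numpy as np

def final_dist_sketch(x, p_gen, vocab_dist, attn_dist):
    # x: (N, T1) source ids; p_gen: (N, T2, 1) generation probability;
    # vocab_dist: (N, T2, V); attn_dist: (N, T2, T1). Returns (N, T2, V).
    N, T2, _ = vocab_dist.shape
    copy_dist = np.zeros_like(vocab_dist)
    for n in range(N):
        for t in range(T2):
            # scatter-add the attention weights onto the source token ids
            np.add.at(copy_dist[n, t], x[n], attn_dist[n, t])
    return p_gen * vocab_dist + (1.0 - p_gen) * copy_dist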