Example #1
    def decode(self, ys, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            if training:
                dec += positional_encoding(dec, self.hp.maxlen2)
            else:
                dec += positional_encoding(dec, 1000)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              key_masks=tgt_masks,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=True,
                                              scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              key_masks=src_masks,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False,
                                              scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
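Note (added, not part of the project code above): every snippet on this page calls a positional_encoding helper without showing its body. As a rough sketch of the usual sinusoidal formulation from "Attention Is All You Need", assuming the common broadcast-add over (N, T, d_model) embeddings seen above (names here are illustrative only):

import numpy as np

def sinusoidal_position_table(maxlen, d_model):
    # Sketch: build a (1, maxlen, d_model) table of sin/cos position encodings.
    pos = np.arange(maxlen)[:, None]                        # (maxlen, 1)
    i = np.arange(d_model)[None, :]                         # (1, d_model)
    angles = pos / np.power(10000.0, (2 * (i // 2)) / d_model)
    table = np.zeros((maxlen, d_model), dtype=np.float32)
    table[:, 0::2] = np.sin(angles[:, 0::2])                # even dimensions
    table[:, 1::2] = np.cos(angles[:, 1::2])                # odd dimensions
    return table[None, ...]                                 # broadcastable over the batch axis

# usage sketch: dec = dec * d_model ** 0.5 + sinusoidal_position_table(T2, d_model)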
Example #2
    def representation(self, xs, ys, training=True):
        with tf.variable_scope("representation", reuse=tf.AUTO_REUSE):
            x = xs
            y = ys

            # print(x)
            # print(y)

            # embedding
            encx = tf.nn.embedding_lookup(self.embeddings,
                                          x)  # (N, T1, d_model)
            encx *= self.hp.d_model**0.5  # scale

            encx += positional_encoding(encx, self.hp.maxlen)
            encx = tf.layers.dropout(encx,
                                     self.hp.dropout_rate,
                                     training=training)

            ency = tf.nn.embedding_lookup(self.embeddings,
                                          y)  # (N, T1, d_model)
            ency *= self.hp.d_model**0.5  # scale

            ency += positional_encoding(ency, self.hp.maxlen)
            ency = tf.layers.dropout(ency,
                                     self.hp.dropout_rate,
                                     training=training)

            #add ln
            encx = ln(encx)
            ency = ln(ency)

            ## Blocks
            x_layer = []
            y_layer = []
            for i in range(self.hp.num_extract_blocks +
                           self.hp.num_inter_blocks):
                if i < self.hp.num_extract_blocks:
                    encx = self.base_blocks(encx,
                                            encx,
                                            training=training,
                                            scope="num_blocks_{}".format(i))
                    ency = self.base_blocks(ency,
                                            ency,
                                            training=training,
                                            scope="num_blocks_{}".format(i))
                    #encx, ency = localInference(encx, ency)
                    x_layer.append(encx)
                    y_layer.append(ency)
                else:
                    encx, ency = self.inter_blocks(
                        encx,
                        ency,
                        training=training,
                        scope="num_blocks_{}".format(i))
                    #encx, ency = localInference(encx, ency)
                    x_layer.append(encx)
                    y_layer.append(ency)
        return x_layer, y_layer
Example #3
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding1
            enc1 = tf.nn.embedding_lookup(self.embeddings1,
                                          x)  # (N, T1, d_model)
            enc1 *= self.hp.d_model**0.5  # scale
            enc1 += positional_encoding(enc1, self.hp.maxlen1)
            enc1 = tf.layers.dropout(enc1,
                                     self.hp.dropout_rate,
                                     training=training)

            # embedding2
            enc2 = tf.nn.embedding_lookup(self.embeddings2,
                                          x)  # (N, T1, d_model)
            enc2 *= self.hp.d_model**0.5  # scale
            enc2 += positional_encoding(enc2, self.hp.maxlen1)
            enc2 = tf.layers.dropout(enc2,
                                     self.hp.dropout_rate,
                                     training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc1, enc2 = multihead_attention(
                        queries=(enc1, enc2),
                        keys=enc1,
                        values=enc2,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc1 = ff(enc1, num_units=[self.hp.d_ff, self.hp.d_model])
                    enc2 = ff(enc2, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = (enc1, enc2)
        return memory, sents1, src_masks
Example #4
    def _encode(self, enc, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # embedding
            enc *= self.arg.d_model**0.5  # scale

            enc += positional_encoding(enc, self.arg.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.arg.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.arg.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.arg.num_heads,
                        dropout_rate=self.arg.dropout_rate,
                        training=training,
                        causality=False)
        memory = enc
        return memory
Example #5
    def time_encode(self, encoder_inputs):
        '''
        Returns
        memory: encoder outputs. (BATCH, SEQ_LEN, HIDDEN_SIZE)
        '''
        with tf.variable_scope("time_encoder", reuse=tf.AUTO_REUSE):

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings, encoder_inputs)
            enc *= hp.HIDDEN_SIZE**0.5

            enc += positional_encoding(enc, hp.MAX_LEN)
            enc = tf.nn.dropout(enc, self.dropout)

            # Blocks
            for i in range(hp.NUM_BLOCKS):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              num_heads=hp.NUM_HEADS,
                                              dropout=self.dropout,
                                              causality=True)
                    # feed forward
                    enc = ff(enc, num_units=[hp.FF_SIZE, hp.HIDDEN_SIZE])

        output = tf.reshape(enc, (-1, hp.MAX_LEN, hp.HIDDEN_SIZE))
        logits = tf.layers.dense(output, len(self.token2idx))
        return logits
Example #6
File: models.py Project: yysys/Multi_test
    def build_embedding_layer(self, inputs, reuse=None):
        self.emb_char = embedding(inputs,
                                  vocab_size=self.vocab_size,
                                  num_units=self.hidden_units,
                                  scale=True,
                                  scope="emb_char",
                                  reuse=reuse)
        self.emb_char_pos = self.emb_char
        if self.emb_pos_type == 'sin':
            self.emb_char_pos += positional_encoding(inputs,
                                                     num_units=self.hidden_units,
                                                     zero_pad=False,
                                                     scale=False,
                                                     scope="emb_pos",
                                                     reuse=reuse)
        else:
            self.emb_char_pos += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(inputs)[1]), 0), [tf.shape(inputs)[0], 1]),
                                           vocab_size=self.maxlen,
                                           num_units=self.hidden_units,
                                           zero_pad=False,
                                           scale=False,
                                           scope="emb_pos",
                                           reuse=reuse)

        self.emb = tf.layers.dropout(self.emb_char_pos, rate=self.dropout,)

        return self.emb
Example #7
    def decode(self, decoder_inputs, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        scopes = []
        outputs = []
        with tf.variable_scope("decoder_embedding_lookup",
                               reuse=tf.AUTO_REUSE):
            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(dec)
            # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("decoder_num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec,
                                          keys=dec,
                                          values=dec,
                                          key_masks=tgt_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
                scopes.append(tf.get_variable_scope().name)
                outputs.append(dec)

        return dec, outputs, scopes
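Note (added): Example #7 returns the variable-scope name of each block alongside its output. A hedged caller-side sketch (TF 1.x; the model handle and its use are assumptions), for instance to group trainable variables per block:

# hypothetical usage of the returned scope names (assumption, not project code)
dec, outputs, scopes = model.decode(decoder_inputs, memory, src_masks, training=True)
block_vars = {scope: tf.trainable_variables(scope=scope) for scope in scopes}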
Example #8
def transformer_encode(enc, config, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE):

        # embedding
        enc *= config.d_model**0.5  # scale

        enc += positional_encoding(enc, config.max_sent_num)
        enc = tf.layers.dropout(enc, config.drop_rate, training=training)

        ## Blocks
        for i in range(config.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=config.num_heads,
                                          dropout_rate=config.drop_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[config.d_ff, config.d_model])
    memory = enc
    return memory
Example #9
    def encode(self, xs, training=True):
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings, x) # (N, T1, d_model)
            enc *= self.hp.d_model**0.5 # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1
Example #10
File: model.py Project: boluochuile/MIKRec
    def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="Model_SAKmeans")

        with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks
            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)

            num_heads = num_interest
            self.user_eb = getKVector(sess, self.seq, num_heads)
            self.dim = embedding_dim
            item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

            # item_list_emb = [-1, seq_len, embedding_dim]
            # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
            atten = tf.matmul(self.user_eb, tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
            atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

            # pick the user-interest vector most similar to the target item
            readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                                tf.argmax(atten, axis=1, output_type=tf.int32) + tf.range(
                                    tf.shape(item_list_emb)[0]) * num_heads)

            self.build_sampled_softmax_loss(self.item_eb, readout)
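Note (added): the readout above selects, per batch row, the interest vector with the highest attention score by flattening (batch, num_heads, dim) to (batch * num_heads, dim) and gathering at argmax(atten) + row * num_heads. A small NumPy sketch of that index arithmetic (illustrative values only):

import numpy as np

batch, num_heads, dim = 2, 3, 4
user_eb = np.arange(batch * num_heads * dim, dtype=np.float32).reshape(batch, num_heads, dim)
atten = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.3, 0.2]])                        # (batch, num_heads) attention scores

flat = user_eb.reshape(-1, dim)                            # (batch * num_heads, dim)
idx = atten.argmax(axis=1) + np.arange(batch) * num_heads  # offset argmax into the flat layout
readout = flat[idx]                                        # (batch, dim): best interest per row

assert np.allclose(readout[0], user_eb[0, 1])              # row 0 picked interest 1
assert np.allclose(readout[1], user_eb[1, 0])              # row 1 picked interest 0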
Example #11
    def encode(self, x, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        scopes = []
        outputs = []
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            self.token2idx, self.idx2token = load_vocab(self.hp.vocab)
            self.embeddings = get_token_embeddings(self.hp.vocab_size,
                                                   self.hp.d_model,
                                                   zero_pad=True)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(self.embeddings)
        with tf.variable_scope("encoder_embedding_lookup",
                               reuse=tf.AUTO_REUSE):
            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(enc)
            ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("encoder_num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                scopes.append(tf.get_variable_scope().name)
                outputs.append(enc)
        memory = enc
        return memory, src_masks, outputs, scopes
Example #12
    def encode(self, xs, training=True):
        '''
        xs: training data
        Returns
        memory: encoder outputs. (N, T1, d_model)
                                N: batch size;
                                T1: sentence length
                                d_model: 512, word-embedding dimension
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # xs: tuple of
            #               x: int32 tensor. (N, T1)
            #               x_seqlens: int32 tensor. (N,)  sentence lengths
            #               sents1: str tensor. (N,)
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional-encoding vectors
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            # Encoder blocks
            # num_blocks = 6: number of sub-blocks in the encoder; each sub-block is multihead_attention + feed_forward
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #13
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            if self.hp.fac_embed:
                enc = tf.nn.embedding_lookup(self.embeddings1,
                                             x)  # (N, T1, d_embed)
                enc = tf.matmul(enc, self.embeddings2)  # (N, T1, d_model)
            else:
                enc = tf.nn.embedding_lookup(self.embeddings,
                                             x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                if self.hp.share_weights:
                    vs_name = "blocks_shared"
                else:
                    vs_name = "num_blocks_{}".format(i)
                with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #14
    def encode(self, xs, training=True):  # implements the encoder
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        # N: batch_size
        # T1: sentence length
        # d_model: word-embedding dimension
        '''
        # What this does:
        # (1) input word embeddings + positional_encoding
        # (2) the encoder stacks num_blocks (6) blocks; each block is multihead attention followed by the feed-forward layer ff
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)  # add positional vectors to the initial word embeddings
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #15
    def pre_encoder(self, x):
        with tf.variable_scope("pre_encoder", reuse=tf.AUTO_REUSE):
            #x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=self.is_training)

            return enc, src_masks
Example #16
File: model.py Project: boluochuile/MIKRec
    def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="Model_SASRec")

        with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks

            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)
            self.sum_pooling = tf.reduce_sum(self.seq, 1)
            fc1 = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
            self.user_eb = tf.layers.dense(fc3, hidden_size, activation=tf.nn.relu)
            self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
Example #17
File: model.py Project: KangSooHan/GAI
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, video_path = xs

            # src_masks

            # embedding
            enc = tf.layers.dense(x, self.d_model)
            #src_masks = tf.math.equal(mask, 0) # (N, T1)
            src_masks = tf.sequence_mask(seqlens)

            #enc = tf.nn.embedding_lookup(self.embeddings, x) # (N, T1, d_model)
            #enc *= self.hp.d_model**0.5 # scale

            enc /= self.hp.d_model**0.5

            enc += positional_encoding(enc, self.hp.n_video)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                    )
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, src_masks
Example #18
File: model.py Project: QAQ-v/transformer
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs
            # x = tf.Print(x, [x], message='x =', summarize=10)
            # print_sent = tf.Print(sents1, [sents1], message='sents1 =', summarize=3)
            # with tf.control_dependencies([print_sent]):
            # embedding
            # xs_pri = tf.print('xs =', tf.shape(x), summarize=3)
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            # enc_pri = tf.print('enc =', tf.shape(enc), enc, summarize=3)
            ## Blocks
            # with tf.control_dependencies([xs_pri, enc_pri]):

            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1
Example #19
    def __init__(self,
                 num_layers,
                 d_model,
                 num_heads,
                 d_ff,
                 input_vocab_size,
                 maximum_position_encoding,
                 rate=0.1):
        super(Encoder, self).__init__()

        self.num_layers = num_layers
        self.d_model = d_model

        self.pos_enc = positional_encoding(maximum_position_encoding, d_model)

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.dropout = tf.keras.layers.Dropout(rate)

        self.encoder_layers = [
            EncoderLayer(d_model, d_ff, num_heads, rate)
            for x in range(num_layers)
        ]
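Note (added): Example #19 shows only the constructor. A hedged sketch of the call method that typically accompanies this TF2/Keras-style Encoder (EncoderLayer's (x, training, mask) interface is an assumption here, not shown in the example):

import tensorflow as tf

# Hypothetical companion method for the constructor above (sketch, not project code).
def call(self, x, training, mask):
    seq_len = tf.shape(x)[1]

    x = self.embedding(x)                                  # (batch, seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))   # scale, as in the snippets above
    x += self.pos_enc[:, :seq_len, :]                      # add positional encoding
    x = self.dropout(x, training=training)

    for layer in self.encoder_layers:
        x = layer(x, training, mask)

    return x                                               # (batch, seq_len, d_model)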
Example #20
    def _encode(self, x, seq_num, training=True, name=None):
        """
        Returns
        memory: encoder outputs. (N, T1, d_model)
        """
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # embedding
            x = tf.identity(x, "input_x")
            enc = tf.nn.embedding_lookup(self._embeddings[seq_num],
                                         x)  # (N, T1, d_model)
            enc *= self._context.d_model**0.5  # scale

            enc += positional_encoding(enc, self._context.maxlens[seq_num])
            enc = tf.layers.dropout(enc,
                                    self._context.dropout_rate,
                                    training=training)

            # # Blocks
            for i in range(self._context.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self._context.num_heads,
                        dropout_rate=self._context.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(
                        enc,
                        num_units=[self._context.d_ff, self._context.d_model])
        memory = tf.identity(enc, name=name)
        return memory
Example #21
    def get_output(self,
                   input,
                   training,
                   return_spectrogram=False,
                   reuse=True):
        '''
        Creates symbolic computation graph of the U-Net for a given input batch
        :param input: Input batch of mixtures, 3D tensor [batch_size, num_samples, 1], mono raw audio
        :param reuse: Whether to create new parameter variables or reuse existing ones
        :param return_spectrogram: Whether to output the spectrogram estimate or convert it to raw audio and return that
        :return: U-Net output. If return_spectrogram: accompaniment and voice magnitudes as a length-two list of 4D tensors. Otherwise: two 3D tensors containing the raw audio estimates
        '''
        # Setup STFT computation
        window = functools.partial(window_ops.hann_window, periodic=True)
        inv_window = tf.contrib.signal.inverse_stft_window_fn(
            self.hop, forward_window_fn=window)
        with tf.variable_scope("separator", reuse=reuse):
            enc_outputs = list()

            # Compute spectrogram
            assert (input.get_shape().as_list()[2] == 1
                    )  # Model works ONLY on mono
            stfts = tf.contrib.signal.stft(tf.squeeze(input, 2),
                                           frame_length=self.frame_len,
                                           frame_step=self.hop,
                                           fft_length=self.frame_len,
                                           window_fn=window)
            mix_mag = tf.abs(stfts)
            mix_angle = tf.angle(stfts)

            # Input for network
            mix_mag_norm = tf.log1p(tf.expand_dims(mix_mag, 3))
            mix_mag_norm = mix_mag_norm[:, :, :-1, :]  # Cut off last frequency bin to make number of frequency bins divisible by 2

            mags = dict()
            for name in self.source_names:
                current_layer = mix_mag_norm
                current_layer = tf.layers.conv2d(current_layer,
                                                 128, [3, 3],
                                                 strides=[2, 2],
                                                 activation=None,
                                                 padding='same')
                current_layer = tf.contrib.layers.batch_norm(
                    current_layer,
                    activation_fn=LeakyReLU,
                    is_training=training)

                # Position Embedding
                current_shape = current_layer.get_shape().as_list()
                maxlen = current_shape[1] * current_shape[2]
                pos_inputs = tf.reshape(current_layer,
                                        [current_shape[0], maxlen, -1])
                pos_layer = positional_encoding(pos_inputs,
                                                maxlen,
                                                masking=False)
                pos_layer = tf.reshape(pos_layer, current_shape)
                current_layer += pos_layer

                # Down-convolution: Repeat pool-conv
                for i in range(self.num_layers):
                    assert (current_layer.get_shape().as_list()[1] % 2 == 0 and
                            current_layer.get_shape().as_list()[2] % 2 == 0)
                    # block
                    current_layer = tf_multihead_attention(
                        queries=current_layer,
                        keys=current_layer,
                        values=current_layer,
                        num_heads=8,
                        dropout_rate=0.1,
                        training=training,
                        causality=False)
                    current_layer = cnn(current_layer, training=training)

                # Compute mask
                mask = tf.layers.conv2d_transpose(current_layer,
                                                  1, [3, 3],
                                                  strides=[2, 2],
                                                  activation=tf.nn.sigmoid,
                                                  padding="same")
                mask = tf.pad(
                    mask, [(0, 0), (0, 0), (0, 1), (0, 0)],
                    mode="CONSTANT",
                    constant_values=0.5
                )  # Pad last frequency bin of mask that is missing since we removed it in the input
                mask = tf.squeeze(mask, 3)

                # Compute source magnitudes
                source_mag = tf.multiply(mix_mag, mask)
                mags[name] = source_mag

            if return_spectrogram:
                return mags
            else:
                audio_out = dict()
                # Reconstruct audio
                for source_name in mags.keys():
                    stft = tf.multiply(tf.complex(mags[source_name], 0.0),
                                       tf.exp(tf.complex(0.0, mix_angle)))
                    audio = tf.contrib.signal.inverse_stft(
                        stft,
                        self.frame_len,
                        self.hop,
                        self.frame_len,
                        window_fn=inv_window)

                    # Reshape to [batch_size, samples, 1]
                    audio = tf.expand_dims(audio, 2)

                    audio_out[source_name] = audio

                return audio_out
Example #22
    def decode(self, ys, x_paraphrased_dict, memory, training=True):
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys
            x_paraphrased_dict, paraphrased_lens, paraphrased_sents = x_paraphrased_dict
            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            batch_size = tf.shape(decoder_inputs)[0]  # N
            seqlens = tf.shape(decoder_inputs)[1]  # T2
            paraphrased_lens = tf.shape(x_paraphrased_dict)[1]  # W2

            x_paraphrased_o, x_paraphrased_p = x_paraphrased_dict[:,:,0], x_paraphrased_dict[:,:,1]

            x_paraphrased_o_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_o)  # N, W2, d_model
            if self.hp.paraphrase_type == 0:
                x_paraphrased_p_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_p)
            else:
                x_paraphrased_p_embedding = paraphrased_positional_encoding(x_paraphrased_p, self.hp.maxlen2, self.hp.d_model)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=True,
                                              scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False,
                                              scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

            # add paraphrased dictionary attention
            h = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(dec, axis=2)

            o_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_o_embedding, axis=1)
            W_a_o = tf.get_variable("original_word_parameter_w", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            V_a_o = tf.get_variable("original_word_parameter_v", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            h_o_concat = tf.concat([h, o_embeding], -1) # N, T2, W2, 2*d_model
            score_tem_o = tf.tanh(W_a_o * h_o_concat) # N, T2, W2, 2*d_model
            score_o = tf.reduce_sum(V_a_o * score_tem_o, axis=-1) # N, T2, W2
            a = tf.nn.softmax(score_o) # N, T2, W2
            c_o = tf.matmul(a, x_paraphrased_o_embedding) # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

            p_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_p_embedding, axis=1)
            W_a_p = tf.get_variable("paraphrased_word_parameter_w", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            V_a_p = tf.get_variable("paraphrased_word_parameter_v", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            h_p_concat = tf.concat([h, p_embeding], -1) # N, T2, W2, 2*d_model
            score_tem_p = tf.tanh(W_a_p * h_p_concat) # N, T2, W2, 2*d_model
            score_p = tf.reduce_sum(V_a_p * score_tem_p, axis=-1) # N, T2, W2
            a = tf.nn.softmax(score_p) # N, T2, W2
            c_p = tf.matmul(a, x_paraphrased_p_embedding) # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

            c_t = tf.concat([c_o, c_p], axis=-1) # N, T2, d_model --> N, T2, 2*d_model
            out_dec = tf.layers.dense(tf.concat([dec, c_t], axis=-1), self.hp.d_model, activation=tf.tanh, use_bias=False,
                                      kernel_initializer=tf.initializers.random_normal(stddev=0.01, seed=None))

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', out_dec, weights) # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
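Note (added): the dictionary attention in Example #22 uses an additive score, score = sum(V_a * tanh(W_a * [h; e])), broadcast over an (N, T2, W2, 2*d_model) tensor, followed by a softmax over the W2 dictionary axis. A shape-only NumPy sketch (random values, names illustrative, not project code):

import numpy as np

N, T2, W2, d = 2, 5, 7, 8
dec = np.random.randn(N, T2, d)                            # decoder states
o_emb = np.random.randn(N, W2, d)                          # embeddings of the W2 dictionary words
W_a = np.random.randn(2 * d)
V_a = np.random.randn(2 * d)

h = np.broadcast_to(dec[:, :, None, :], (N, T2, W2, d))    # tile decoder states over W2
e = np.broadcast_to(o_emb[:, None, :, :], (N, T2, W2, d))  # tile dictionary words over T2
score = (V_a * np.tanh(np.concatenate([h, e], -1) * W_a)).sum(-1)   # (N, T2, W2)
a = np.exp(score) / np.exp(score).sum(-1, keepdims=True)   # softmax over the dictionary axis
c = a @ o_emb                                              # (N, T2, W2) @ (N, W2, d) -> (N, T2, d)
assert c.shape == (N, T2, d)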
Example #23
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                # x: (32, 10)  y: (32, 10): one batch contains 32 sentences, each of length 10
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            """
            定义decoder部分的input
            
             假设真实翻译后的输出为 i am a student </S>
             
             decoder部分的input应为: <S> i am a student
            """
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # 2 stands for <S>, the decoder's initial input

            # vocabularies
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # keep the padding embedding at zero
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ##Drop out
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    vocab_size=hp.maxlen,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # label smoothing: replace the 0s in the one-hot targets with a small value and the 1s with a value slightly below 1
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
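Note (added): label_smoothing is used above but not shown on this page. A minimal sketch of the usual formulation (assumption), matching the comment about softening the one-hot targets:

def label_smoothing(inputs, epsilon=0.1):
    # Sketch: soften one-hot targets, 0 -> epsilon/V and 1 -> (1 - epsilon) + epsilon/V.
    V = inputs.get_shape().as_list()[-1]   # number of classes (last dimension)
    return (1.0 - epsilon) * inputs + epsilon / V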
    def build_model(self):
        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(self.de2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="enc_embed")
            sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
            key_masks = tf.expand_dims(sign, -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")

            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(self.en2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="dec_embed")

            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]),
                                   0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")
            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(self.en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
    def decode(self, xs, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        self.memory = memory
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, sents2 = ys
            x, _ = xs

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            attn_dists = []
            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec, _ = multihead_attention(queries=dec,
                                                 keys=dec,
                                                 values=dec,
                                                 num_heads=self.hp.num_heads,
                                                 dropout_rate=self.hp.dropout_rate,
                                                 training=training,
                                                 causality=True,
                                                 scope="self_attention")

                    # Vanilla attention
                    dec, attn_dist = multihead_attention(queries=dec,
                                                          keys=self.memory,
                                                          values=self.memory,
                                                          num_heads=self.hp.num_heads,
                                                          dropout_rate=self.hp.dropout_rate,
                                                          training=training,
                                                          causality=False,
                                                          scope="vanilla_attention")
                    attn_dists.append(attn_dist)
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size)

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            gens = tf.layers.dense(logits, 1, activation=tf.sigmoid, trainable=training, use_bias=False)

        logits = tf.nn.softmax(logits)  # vocabulary distribution for the generator

        # final distribution
        logits = self._calc_final_dist(x, gens, logits, attn_dists[-1])

        return logits, y, sents2
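
The decode() above delegates the final mixing of the generation and copy distributions to self._calc_final_dist, which is not included in the snippet. A minimal sketch of the typical pointer-generator mixing step is given below; the function name, the tf.one_hot-based copy projection, and the argument layout are assumptions, not the original implementation.

import tensorflow as tf

def calc_final_dist_sketch(x, gens, vocab_dists, attn_dists, vocab_size):
    '''Hypothetical pointer-generator mixing step (sketch only).
    x: (N, T1) int32 source token ids
    gens: (N, T2, 1) generation gate in [0, 1]
    vocab_dists: (N, T2, V) softmax distribution over the target vocabulary
    attn_dists: (N, T2, T1) attention weights over source positions
    '''
    # project the copy probability mass onto the vocabulary ids of the source tokens
    copy_dists = tf.einsum('ntk,nkv->ntv',
                           attn_dists, tf.one_hot(x, vocab_size))  # (N, T2, V)
    # convex combination of generating from the vocabulary and copying from the source
    return gens * vocab_dists + (1.0 - gens) * copy_dists
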
예제 #26
0
    def run_GaussionTransformer(self):
        embeddingScope = "embeddingBlock"
        encodingBlock = "encodingBlock"
        interactionBlock = "interactionBlock"
        comparisonBlock = "comparisonBlock"

        self.positionEncoding1 = modules.positional_encoding(
            inputs=self.inputX_word,
            num_units=param.Hyperparams.postion_dimension
        )  #(N, L, postion_dimension)
        self.positionEncoding2 = modules.positional_encoding(
            inputs=self.inputY_word,
            num_units=param.Hyperparams.postion_dimension)
        self.shift = tf.Variable(
            tf.abs(tf.random_normal([1], stddev=0, seed=0, dtype=tf.float64)) +
            0.001,
            trainable=True,
            name='shift',
            dtype=tf.float64)
        self.bias = tf.Variable(
            -tf.abs(tf.random_normal([1], stddev=0, seed=0, dtype=tf.float64)),
            trainable=True,
            name='bias',
            dtype=tf.float64)

        with tf.variable_scope(embeddingScope, reuse=False):
            self.embedding_1 = modules.embedding_block(self.inputX_word,
                                                       self.inputX_char,
                                                       self.positionEncoding1,
                                                       scope="embedding_1")

            self.embedding_2 = modules.embedding_block(self.inputY_word,
                                                       self.inputY_char,
                                                       self.positionEncoding2,
                                                       scope="embedding_2")
            self.embedding_1 = tf.check_numerics(self.embedding_1,
                                                 "nan happend!!!!")
            self.embedding_2 = tf.check_numerics(self.embedding_2,
                                                 "nan happend!!!!")

        with tf.variable_scope(encodingBlock, reuse=False):
            self.encoding_1 = self.embedding_1
            self.encoding_2 = self.embedding_2
            for i in range(param.Hyperparams.encoder_num_blocks):
                with tf.variable_scope("multihead-atttention_{0}".format(i),
                                       reuse=False):  # add a scope here via {}.format
                    self.encoding_1 = modules.multihead_attention(
                        self.encoding_1,
                        self.shift,
                        self.bias,
                        num_heads=param.Hyperparams.num_heads,
                        dropout_rate=self.dropout_rate,
                        is_training=self.is_training)
                    self.encoding_1 = tf.check_numerics(
                        self.encoding_1,
                        "encoding nan happend!!!! multihead-atttention_{0}".
                        format(i))

                with tf.variable_scope("multihead-atttention_{0}".format(i),
                                       reuse=True):  # add a scope here via {}.format
                    self.encoding_2 = modules.multihead_attention(
                        self.encoding_2,
                        self.shift,
                        self.bias,
                        num_heads=param.Hyperparams.num_heads,
                        dropout_rate=self.dropout_rate,
                        is_training=self.is_training)

            self.encoding_1 += self.positionEncoding1
            self.encoding_2 += self.positionEncoding2

        with tf.variable_scope(interactionBlock, reuse=None):
            self.interaction_1 = self.encoding_1
            self.interaction_2 = self.encoding_2

            for i in range(param.Hyperparams.inter_num_blocks):
                with tf.variable_scope("interaction_{0}".format(i),
                                       reuse=False):
                    self.interaction_1 = modules.InteractionBlock(
                        queries=self.interaction_1,
                        keys=self.interaction_2,
                        shift=self.shift,
                        bias=self.bias,
                        num_heads=param.Hyperparams.num_heads,
                        dropout_rate=self.dropout_rate,
                        is_training=self.is_training)
                    self.interaction_1 = tf.check_numerics(
                        self.interaction_1, "nan happend!!!!")
                with tf.variable_scope("interaction_{0}".format(i),
                                       reuse=True):
                    self.interaction_2 = modules.InteractionBlock(
                        queries=self.interaction_2,
                        keys=self.interaction_1,
                        shift=self.shift,
                        bias=self.bias,
                        num_heads=param.Hyperparams.num_heads,
                        dropout_rate=self.dropout_rate,
                        is_training=self.is_training)

        self.encoding_1 = tf.check_numerics(self.encoding_1,
                                            "encoding_1 is nan")
        self.encoding_2 = tf.check_numerics(self.encoding_2,
                                            "encoding_2 is nan")
        self.interaction_1 = tf.check_numerics(self.interaction_1,
                                               "interaction_ 1 is nan")
        self.interaction_2 = tf.check_numerics(self.interaction_2,
                                               "interaction_2 is nan")
        with tf.variable_scope(comparisonBlock, reuse=None):
            self.logit = modules.ComparisonBlock(
                input1_Encoding=self.encoding_1,
                input1_Interaction=self.interaction_1,
                input2_Encoding=self.encoding_2,
                input2_Interaction=self.interaction_2)
            self.pred_y = tf.argmax(tf.nn.softmax(self.logit), 1)
            if self.is_training:
                with tf.name_scope("optimize"):
                    # Loss function: cross entropy
                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                        logits=self.logit,
                        labels=self.y)  # softmax over the logits, then cross entropy; one value per example
                    self.loss = tf.reduce_mean(
                        cross_entropy)  # average the per-example cross entropies to get the loss
                    # Optimizer
                    self.optim = tf.train.AdamOptimizer(
                        learning_rate=param.Hyperparams.lr).minimize(self.loss)

                with tf.name_scope("accuracy"):
                    # Accuracy
                    correct_pred = tf.equal(
                        tf.argmax(self.y, 1), self.pred_y
                    )  # self.y is one-hot encoded, so tf.argmax(self.y, 1) returns the index of the 1
                    self.acc = tf.reduce_mean(tf.cast(correct_pred,
                                                      tf.float32))
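
Here the learned scalars shift and bias are threaded into every modules.multihead_attention call, but that module itself is not shown. One common way such scalars enter Gaussian-weighted self-attention is as a distance-dependent prior added to the raw attention logits before the softmax; the sketch below only illustrates that idea, and gaussian_prior_sketch together with its argument shapes is an assumption rather than the original module.

import tensorflow as tf

def gaussian_prior_sketch(scores, shift, bias):
    '''Add a distance-based prior to raw attention scores (illustrative sketch).
    scores: (h*N, T_q, T_k) unnormalized attention logits
    shift, bias: learned scalar variables, as created in run_GaussionTransformer above
    '''
    shift = tf.cast(shift, scores.dtype)
    bias = tf.cast(bias, scores.dtype)
    t_q, t_k = tf.shape(scores)[1], tf.shape(scores)[2]
    pos_q = tf.cast(tf.range(t_q), scores.dtype)[:, None]   # (T_q, 1)
    pos_k = tf.cast(tf.range(t_k), scores.dtype)[None, :]   # (1, T_k)
    dist_sq = tf.square(pos_q - pos_k)                       # squared distance |i - j|^2
    prior = bias - shift * dist_sq                           # distant positions get a larger penalty
    return scores + prior[None, :, :]
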
예제 #27
0
File: model.py  Project: QAQ-v/transformer
    def decode(self, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys
            # decoder_inputs = tf.Print(decoder_inputs, [decoder_inputs],
            # message='decoder_inputs =', summarize=10)
            # embedding
            # ys_pri = tf.print('y =', tf.shape(y), summarize=3)
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)
            # dec = tf.Print(dec, [dec], message='dec =', summarize=10)
            # dec_pri = tf.print('dec =', tf.shape(dec), dec, summarize=3)
            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # dec = tf.Print(dec, [dec], message='dec_finally =', summarize=10)
        # Final linear projection (embedding weights are shared)
        # with tf.control_dependencies([ys_pri, dec_pri]):
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
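
At inference time a decode() like the one above is usually wrapped in a greedy loop: start from a sequence holding only the start token, run the decoder, append the newest prediction, and repeat. The sketch below builds such a loop at graph-construction time, relying on the reuse=tf.AUTO_REUSE scopes inside decode(); the function name, the start_id value, and the (decoder_inputs, y, seqlens, sents2) packing of ys are assumptions about the surrounding code.

import tensorflow as tf

def greedy_decode_sketch(model, ys, memory, start_id=2, max_steps=50):
    '''Build a greedy autoregressive inference graph on top of model.decode() (sketch).
    max_steps should not exceed hp.maxlen2, since decode() uses positional encodings of that length.
    '''
    _, y, seqlens, sents2 = ys
    # start every sentence with a single <s> token
    decoder_inputs = tf.ones((tf.shape(memory)[0], 1), tf.int32) * start_id
    for _ in range(max_steps):
        cur_ys = (decoder_inputs, y, seqlens, sents2)
        logits, y_hat, y, sents2 = model.decode(cur_ys, memory, training=False)
        # append the newest prediction and feed the longer prefix back in
        decoder_inputs = tf.concat((decoder_inputs, y_hat[:, -1:]), axis=1)
    return y_hat
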
예제 #28
0
    def encode_decode(self, xs, ys, training=True):
        x, seqlens = xs
        decoder_inputs, y, seqlens = ys
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc += positional_encoding(enc, self.hp.maxlen1, self.hp)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            dec = tf.reduce_sum(tf.nn.embedding_lookup(self.embeddings,
                                                       decoder_inputs),
                                axis=2)  # (N, T1, d_model)
            # test_dec = dec
            dec = dec * self.hp.d_model**0.5  # scale
            # the subgraph structure also needs its own positional encoding, so that it lines up with the predicted output structure
            dec += positional_encoding(dec, self.hp.maxlen2, self.hp)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # attention over the encoder states (note: queries come from the decoder representation)
                    enc = multihead_attention(
                        queries=dec,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
            with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        # whether to apply the causal mask
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        if self.hp.type == 'attribute':
            enc = tf.reduce_sum(enc, axis=1)
            dec = tf.reduce_sum(dec, axis=1)
            logits = tf.layers.dense(inputs=tf.concat([enc, dec], axis=-1),
                                     units=1,
                                     activation=tf.nn.relu)
        else:
            logits = tf.einsum('ntd,nkd->ntk', dec, enc)  # (N, T2, T2)
            logits = (logits +
                      tf.transpose(logits, [0, 2, 1])) / 2  # force the final logits to be a symmetric matrix
        return logits, y, decoder_inputs
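
The else branch above scores every pair of decoder and encoder positions with an einsum and then symmetrizes the result, so the score between positions i and j no longer depends on their order. The toy check below only demonstrates that property; it is illustrative and not part of the original model.

import numpy as np
import tensorflow as tf

# toy decoder/encoder outputs with matching lengths: (N, T, d)
dec = tf.constant(np.random.randn(1, 4, 8), dtype=tf.float32)
enc = tf.constant(np.random.randn(1, 4, 8), dtype=tf.float32)

scores = tf.einsum('ntd,nkd->ntk', dec, enc)             # pairwise dot products, (N, T, T)
scores = (scores + tf.transpose(scores, [0, 2, 1])) / 2  # same symmetrization as above

with tf.Session() as sess:
    m = sess.run(scores)[0]
    assert np.allclose(m, m.T)  # score(i, j) == score(j, i)
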
예제 #29
0
    def decode(self, ys, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding

            if self.hp.fac_embed:
                dec = tf.nn.embedding_lookup(
                    self.embeddings1, decoder_inputs)  # (N, T2, d_embed)
                dec = tf.matmul(dec, self.embeddings2)  # (N, T2, d_model)
            else:
                dec = tf.nn.embedding_lookup(
                    self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                if self.hp.share_weights:
                    vs_name = "blocks_shared"
                else:
                    vs_name = "num_blocks_{}".format(i)
                with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        key_masks=tgt_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights may be shared, depending on the configuration)
        if self.hp.fac_embed:
            if self.hp.io_tie:  # embedding_normalization: 0: none. 1: cosine similarity on embeddings1 and embeddings2. 2: squared l2-norm on embeddings2 and cosine similarity on embeddings1. 3: squared l2-norm on embeddings2 and distance on embeddings1.
                if self.hp.embedding_normalization == 1:  # TODO: the embedding_normalization == 2 case still needs to be handled here
                    output_embeddings1 = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_embed],
                                            dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings1[1::],
                                                      axis=-1)), 0))
                    logits = tf.einsum('ntd,dk->ntk', dec,
                                       tf.nn.l2_normalize(
                                           tf.transpose(self.embeddings2),
                                           axis=0))  #maybe use lstsq?
                    logits = tf.einsum('ntd,dk->ntk', logits,
                                       output_embeddings1)
                elif self.hp.embedding_normalization >= 2:
                    weights2 = self.embeddings2[1:, :]
                    weights2 = divide_norm_square_and_transpose(weights2)
                    weights2 = tf.concat(
                        (tf.zeros(shape=[self.hp.d_embed, 1],
                                  dtype=tf.float32), weights2), -1)
                    if self.hp.embedding_normalization == 2:
                        weights1 = tf.transpose(
                            tf.concat(
                                (tf.zeros(shape=[1, self.hp.d_embed],
                                          dtype=tf.float32),
                                 tf.nn.l2_normalize(self.embeddings1[1::],
                                                    axis=-1)), 0))
                    else:
                        weights1 = tf.transpose(self.embeddings1)
                    logits = tf.einsum('ntd,dk->ntk', dec, weights2)
                    logits = tf.einsum('ntd,dk->ntk', logits, weights1)
                    if self.hp.embedding_normalization == 3:
                        ebias = get_half_squarenorm(self.embeddings1)
                        logits = tf.subtract(logits, ebias)
                else:
                    logits = tf.einsum('ntd,dk->ntk', dec,
                                       tf.transpose(self.embeddings2))
                    logits = tf.einsum('ntd,dk->ntk', logits,
                                       tf.transpose(self.embeddings1))
            else:
                with tf.variable_scope("output_embedding",
                                       reuse=tf.AUTO_REUSE):
                    logits = tf.layers.dense(dec, self.vocab_size)

        else:
            if self.hp.io_tie:
                if self.hp.embedding_normalization == 0 or self.hp.embedding_normalization == 3:
                    weights = tf.transpose(
                        self.embeddings)  # (d_model, vocab_size)
                elif self.hp.embedding_normalization == 1:
                    weights = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_model],
                                            dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings[1:, :],
                                                      axis=-1)), 0))
                elif self.hp.embedding_normalization == 2:
                    weights = self.embeddings[1:, :]
                    weights = divide_norm_square_and_transpose(weights)
                    weights = tf.concat((tf.zeros(shape=[self.hp.d_model, 1],
                                                  dtype=tf.float32), weights),
                                        -1)
                logits = tf.einsum('ntd,dk->ntk', dec,
                                   weights)  # (N, T2, vocab_size)
                if self.hp.embedding_normalization == 2:
                    #bias=tf.ones(shape=[logits.shape[-1]],dtype=tf.float32)
                    pass
                    #with tf.variable_scope("gauss",reuse=tf.AUTO_REUSE):
                    #bias=tf.constant(1.0)
                    #logits=tf.subtract(logits,bias)
                    #logits=tf.square(logits)
                    #logits=tf.negative(logits)
                    #logits=gaussian_activation(logits)
                #logits=tf.exp(logits)
                if self.hp.embedding_normalization == 3:
                    ebias = get_half_squarenorm(self.embeddings)
                    logits = tf.subtract(logits, ebias)
            else:
                with tf.variable_scope("output_embedding",
                                       reuse=tf.AUTO_REUSE):
                    logits = tf.layers.dense(dec, self.vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
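
Example #29 relies on two helpers that are not part of the snippet, divide_norm_square_and_transpose and get_half_squarenorm. Judging only by their names and by the shapes in which their outputs are used above, they could plausibly look like the sketch below; both bodies are assumptions, not the original code.

import tensorflow as tf

def divide_norm_square_and_transpose(emb):
    '''(V, d) -> (d, V): divide each embedding row by its squared L2 norm, then transpose (assumed).'''
    norm_sq = tf.reduce_sum(tf.square(emb), axis=-1, keepdims=True)  # (V, 1)
    return tf.transpose(emb / norm_sq)                               # (d, V)

def get_half_squarenorm(emb):
    '''(V, d) -> (V,): half of the squared L2 norm of each embedding row (assumed).'''
    return 0.5 * tf.reduce_sum(tf.square(emb), axis=-1)
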
예제 #30
0
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            # id = 2 stands for <S>, the decoder's initial input. This step shifts the normal y, e.g. y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
            # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]]; this shifted sequence is the first thing fed into the decoder's self-attention.
            # During training decoder_inputs looks as above; at inference the true y is unknown, so y is fed as an all-zero tensor of shape [batch_size, max_length].
            # After the shift it becomes something like [["<s>", 0, 0, 0]]; each round the first prediction is taken and fed back, then the first two, and so on.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # the row for id 0 is the padding embedding; True zeroes out that row (random init would not give zeros)
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks: stack hp.num_blocks encoder blocks (6)
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                # Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                # Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection: a classification over the target vocabulary, so the number of classes is the vocabulary size
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing: turn the 0s in the one-hot targets into a small value and the 1s into a value slightly below 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
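
label_smoothing is used in the loss above but not shown. An implementation consistent with the comment (turn the 0s of the one-hot targets into a small value and the 1s into something slightly below 1) is sketched here; the epsilon=0.1 default is an assumption.

import tensorflow as tf

def label_smoothing(inputs, epsilon=0.1):
    '''Soften one-hot targets: 0 becomes epsilon / V, 1 becomes (1 - epsilon) + epsilon / V (sketch).
    inputs: (N, T, V) one-hot target tensor.
    '''
    V = inputs.get_shape().as_list()[-1]  # number of classes (vocabulary size)
    return ((1 - epsilon) * inputs) + (epsilon / V)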