Example #1
def global_attention(inputs,
                     batch,
                     attention_size,
                     vocabulary_size,
                     ATT_W,
                     load_LR_model=False):
    """
    Global attention layer that reduces RNN/Bi-RNN outputs with an attention vector.

    Args:
        inputs: The attention inputs.
                Matches the outputs of the RNN/Bi-RNN layer (not the final state).
                In the case of an RNN, this must be the RNN outputs `Tensor`.
                In the case of a bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing
                the forward and the backward RNN outputs `Tensor`.
        batch: The RNN inputs (a batch of token ids from the dataset), used to look up per-token
                attention scores.
        attention_size: Linear size of the attention weights.
        vocabulary_size: The vocabulary size of the train + test dataset.
        ATT_W: The global attention weights.
                In the case of a jointly trained logistic regression (LR) model, pass its parameters here to
                initialize the global attention weights.
        load_LR_model: If True, the global attention weights are initialized from the LR model weights.

    Returns:
        The attention output `Tensor`.
            In the case of an RNN, this will be a `Tensor` shaped
                `[batch_size, cell.output_size]`.
            In the case of a bidirectional RNN, this will be a `Tensor` shaped
                `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
        betas: Global attention scores, shaped `[vocabulary_size]`.
    """

    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)
    hidden_size = inputs.shape[2].value
    inputs = tf.layers.batch_normalization(inputs)

    w_omega = tf.Variable(
        tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))

    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    if load_LR_model:
        betas = tf.squeeze(ATT_W)
        u = tf.nn.embedding_lookup(tf.abs(betas), batch)
    else:
        betas = tf.Variable(tf.random_normal([vocabulary_size], stddev=0.1))
        u = tf.nn.embedding_lookup(betas, batch)

    alphas = tf.nn.softmax(u, name='alphas')

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
    output = feedforward(output,
                         num_units=[hidden_size, 2 * hidden_size, hidden_size])
    output = tf.layers.batch_normalization(output)

    return output, betas
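
A minimal usage sketch for the layer above (not part of the original example), assuming TensorFlow 1.x and that global_attention and its feedforward helper are importable; the placeholder names, cell type, and sizes below are illustrative assumptions only.

import tensorflow as tf

vocab_size, emb_dim, rnn_units, att_size = 10000, 64, 128, 64            # toy sizes
token_ids = tf.placeholder(tf.int32, [None, None], name="token_ids")      # the `batch` argument (token ids)
emb_table = tf.get_variable("emb_table", [vocab_size, emb_dim])
x = tf.nn.embedding_lookup(emb_table, token_ids)

cell_fw = tf.nn.rnn_cell.GRUCell(rnn_units)
cell_bw = tf.nn.rnn_cell.GRUCell(rnn_units)
outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, x, dtype=tf.float32)

# `outputs` is a (forward, backward) tuple, which matches the tuple branch of global_attention.
att_output, betas = global_attention(inputs=outputs,
                                     batch=token_ids,
                                     attention_size=att_size,
                                     vocabulary_size=vocab_size,
                                     ATT_W=None,              # only consulted when load_LR_model=True
                                     load_LR_model=False)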
Example #2
    def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="Model_SAKmeans")

        with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks
            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)

            num_heads = num_interest
            self.user_eb = getKVector(sess, self.seq, num_heads)
            self.dim = embedding_dim
            item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

            # item_list_emb = [-1, seq_len, embedding_dim]
            # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
            atten = tf.matmul(self.user_eb, tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
            atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

            # Pick the user interest vector most similar to the target item
            readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                                tf.argmax(atten, axis=1, output_type=tf.int32) + tf.range(
                                    tf.shape(item_list_emb)[0]) * num_heads)

            self.build_sampled_softmax_loss(self.item_eb, readout)
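
A standalone sketch of the interest-selection ("readout") trick used above, with toy shapes (the sizes are illustrative assumptions, and the codebase helpers such as get_shape are not needed here). Flattening (batch, num_heads, dim) to (batch*num_heads, dim) and gathering at argmax(atten) + row_offset picks, for each example, the single interest vector with the highest attention score.

import tensorflow as tf

batch_size, num_heads, dim = 2, 4, 3                                    # toy sizes
user_eb = tf.random_normal([batch_size, num_heads, dim])                # per-head user interest vectors
atten = tf.nn.softmax(tf.random_normal([batch_size, num_heads]))        # score of each interest vs. the target item

flat = tf.reshape(user_eb, [-1, dim])                                   # (batch*num_heads, dim)
best = tf.argmax(atten, axis=1, output_type=tf.int32)                   # (batch,) index of the winning head
readout = tf.gather(flat, best + tf.range(batch_size) * num_heads)      # (batch, dim): one interest per example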
Example #3
    def build_blocks(self, inputs, masks, reuse=None):
        self.blk = inputs
        for i in range(self.num_blocks):
            with tf.variable_scope("blocks_{}".format(i), reuse=reuse):
                ## Multihead Attention ( self-attention)
                self.blk = multihead_attention(queries=self.blk,
                                               keys=self.blk,
                                               qkv_masks=masks,
                                               num_units=self.hidden_units,
                                               num_heads=self.num_heads,
                                               dropout_rate=self.dropout,
                                               # is_training=is_training,
                                               causality=False,
                                               scope="self_attention",
                                               reuse=reuse)
                self.blk = feedforward(self.blk, num_units=[4*self.hidden_units, self.hidden_units], reuse=reuse)

        return self.blk
Example #4
    def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="Model_SASRec")

        with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks

            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)
            self.sum_pooling = tf.reduce_sum(self.seq, 1)
            fc1 = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
            fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
            self.user_eb = tf.layers.dense(fc3, hidden_size, activation=tf.nn.relu)
            self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
Example #5
    def build_model(self):
        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(self.de2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="enc_embed")
            sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
            key_masks = tf.expand_dims(sign, -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe")
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="enc_pe")

            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(self.en2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="dec_embed")

            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]),
                                   0), [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen,
                                      num_units=hp.emb_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope="dec_pe")
            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(self.en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
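
A standalone sketch of the key_masks construction used in the encoder and decoder above (toy values, illustrative only). With a zero-padded embedding table (the PAD id 0 maps to an all-zero row), the row-wise absolute sum is zero exactly at padded positions, so tf.sign(...) yields a 0/1 mask that, expanded to (batch, time, 1), zeroes out the positional encodings at PAD steps.

import numpy as np
import tensorflow as tf

ids = tf.constant([[5, 3, 0, 0]])                                         # one sequence, PAD id = 0
table = tf.constant(np.vstack([np.zeros((1, 4)),                          # all-zero embedding row for PAD
                               np.random.rand(9, 4)]), tf.float32)
emb = tf.nn.embedding_lookup(table, ids)                                   # (1, 4, 4)
key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(emb), axis=-1)), -1)  # (1, 4, 1)

with tf.Session() as sess:
    print(sess.run(tf.squeeze(key_masks)))                                 # -> [1. 1. 0. 0.]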
Example #6
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

            # define decoder inputs
            # id 2 stands for <S> and is the decoder's initial input. This step shifts the target y right, e.g.
            # y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
            # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]], which is what the decoder's
            # self-attention sees first.
            # During training decoder_inputs is built as above; at inference time the true y is unknown, so y is an
            # all-zero tensor of shape [batch_size, max_length]. After this step it becomes e.g. [["<s>", 0, 0, 0]];
            # the first prediction is taken, fed back in, then the first two, and so on.
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # row id 0 is the padding embedding; True forces that row to zeros
                                    # (a random initialization would not necessarily give zeros)
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks: stack num_blocks (6) identical encoder blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                # Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                # Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection: a classification whose number of classes equals the vocabulary size
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing: turn the 0s in the one-hot targets into a small value and the 1s into a value close to 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
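
The label_smoothing call above is only described by its comment; below is a minimal sketch of what such a helper typically does, assuming the common uniform-smoothing form (the actual helper in this codebase may differ).

def label_smoothing_sketch(one_hot_labels, epsilon=0.1):
    # Move epsilon of the probability mass from the true class to all classes uniformly:
    # 1 -> 1 - epsilon + epsilon / V and 0 -> epsilon / V, where V is the number of classes.
    num_classes = one_hot_labels.get_shape().as_list()[-1]
    return (1.0 - epsilon) * one_hot_labels + epsilon / num_classes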
Example #7
    def __init__(self, is_training=True):
        self.graph = tf.Graph()

        with self.graph.as_default():
            if is_training:
                self.x, self.y, self.num_batch = get_batch_data()
            else:
                # x: (32, 10), y: (32, 10): a batch of 32 sentences, each of length 10 (hp.maxlen)
                self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
                self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            """
            Define the decoder input.

            If the ground-truth translation is:  i am a student </S>

            then the decoder input should be:    <S> i am a student
            """
            self.decoder_inputs = tf.concat(
                (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
                -1)  # id 2 stands for <S>, the decoder's initial input

            # Vocabularies
            de2idx, idx2de = load_de_vocab()
            en2idx, idx2en = load_en_vocab()

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.x,
                    vocab_size=len(de2idx),
                    num_units=hp.hidden_units,
                    zero_pad=True,  # keep the padding row at zero
                    scale=True,
                    scope="enc_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.enc += positional_encoding(self.x,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope='enc_pe')

                else:
                    self.enc += embedding(tf.tile(
                        tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                        [tf.shape(self.x)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="enc_pe")

                ## Dropout
                self.enc = tf.layers.dropout(
                    self.enc,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### MultiHead Attention
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            with tf.variable_scope("decoder"):
                # Embedding
                self.dec = embedding(self.decoder_inputs,
                                     vocab_size=len(en2idx),
                                     num_units=hp.hidden_units,
                                     scale=True,
                                     scope="dec_embed")

                ## Positional Encoding
                if hp.sinusoid:
                    self.dec += positional_encoding(self.decoder_inputs,
                                                    num_units=hp.hidden_units,
                                                    zero_pad=False,
                                                    scale=False,
                                                    scope="dec_pe")
                else:
                    self.dec += embedding(tf.tile(
                        tf.expand_dims(
                            tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                        [tf.shape(self.decoder_inputs)[0], 1]),
                                          vocab_size=hp.maxlen,
                                          num_units=hp.hidden_units,
                                          zero_pad=False,
                                          scale=False,
                                          scope="dec_pe")

                # Dropout
                self.dec = tf.layers.dropout(
                    self.dec,
                    rate=hp.dropout_rate,
                    training=tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(hp.num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ## Multihead Attention ( self-attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.dec,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=True,
                            scope="self_attention")

                        ## Multihead Attention ( vanilla attention)
                        self.dec = multihead_attention(
                            queries=self.dec,
                            keys=self.enc,
                            num_units=hp.hidden_units,
                            num_heads=hp.num_heads,
                            dropout_rate=hp.dropout_rate,
                            is_training=is_training,
                            causality=False,
                            scope="vanilla_attention")

                        ## Feed Forward
                        self.dec = feedforward(
                            self.dec,
                            num_units=[4 * hp.hidden_units, hp.hidden_units])

            # Final linear projection
            self.logits = tf.layers.dense(self.dec, len(en2idx))
            self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
            self.istarget = tf.to_float(tf.not_equal(self.y, 0))
            self.acc = tf.reduce_sum(
                tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                (tf.reduce_sum(self.istarget)))

            if is_training:
                # Loss
                # Label smoothing: turn the 0s in the one-hot targets into a small value and the 1s into a value close to 1.
                self.y_smoothed = label_smoothing(
                    tf.one_hot(self.y, depth=len(en2idx)))
                self.loss = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.y_smoothed)
                self.mean_loss = tf.reduce_sum(
                    self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

                self.global_step = tf.Variable(0,
                                               name='global_step',
                                               trainable=False)
                self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                        beta1=0.9,
                                                        beta2=0.98,
                                                        epsilon=1e-8)
                self.train_op = self.optimizer.minimize(
                    self.mean_loss, global_step=self.global_step)

                tf.summary.scalar('mean_loss', self.mean_loss)
                self.merged = tf.summary.merge_all()
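
A quick numeric check of the istarget masking used for the accuracy and loss normalization above (toy ids, TensorFlow 1.x): positions where y == 0 (PAD) contribute neither to the numerator nor to the denominator.

import tensorflow as tf

y = tf.constant([[4, 7, 0, 0]])                             # 0 is the PAD id
preds = tf.constant([[4, 2, 0, 0]])
istarget = tf.to_float(tf.not_equal(y, 0))                  # [[1., 1., 0., 0.]]
acc = tf.reduce_sum(tf.to_float(tf.equal(preds, y)) * istarget) / tf.reduce_sum(istarget)

with tf.Session() as sess:
    print(sess.run(acc))                                    # 0.5: only the two non-PAD positions are counted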
Example #8
    def build_model(self):
        # define decoder inputs
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

        # Encoder
        with tf.variable_scope("encoder"):
            ## Embedding
            self.enc = embedding(self.x,
                                 vocab_size=len(self.de2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="enc_embed")
            sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
            key_masks = tf.expand_dims(sign, -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="enc_pe")
            else:
                cells = self.rnn_cell()
                encoder_output, _encoder_state = tf.nn.dynamic_rnn(
                    cells,
                    self.enc,
                    sequence_length=self.x_len,
                    dtype=tf.float32)
                self.enc = tf.concat([self.enc, encoder_output], axis=-1)
                self.enc = tf.layers.dense(self.enc,
                                           hp.emb_dim,
                                           activation="relu")

            self.enc *= key_masks

            ## Dropout
            self.enc = tf.layers.dropout(self.enc,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    pos_emb = tf.get_variable(
                        'enc_pos_emb',
                        dtype=tf.float32,
                        shape=[self.enc.shape[1]],
                        initializer=tf.contrib.layers.xavier_initializer())
                    ### Multihead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        pos_emb=pos_emb,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False)

                    ### Feed Forward
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Decoder
        with tf.variable_scope("decoder"):
            ## Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(self.en2idx),
                                 num_units=hp.emb_dim,
                                 scale=True,
                                 scope="dec_embed")

            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.emb_dim,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                cells = self.rnn_cell()
                decoder_output, _decoder_state = tf.nn.dynamic_rnn(
                    cells,
                    self.dec,
                    sequence_length=self.y_len,
                    dtype=tf.float32)
                self.dec = tf.concat([self.dec, decoder_output], axis=-1)
                self.dec = tf.layers.dense(self.dec,
                                           hp.emb_dim,
                                           activation="relu")

            self.dec *= key_masks

            ## Dropout
            self.dec = tf.layers.dropout(self.dec,
                                         rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(
                                             self.is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    dec_dec_pos_emb = tf.get_variable(
                        'dec_de_pos_emb',
                        dtype=tf.float32,
                        shape=[self.dec.shape[1]],
                        initializer=tf.contrib.layers.xavier_initializer())
                    dec_enc_pos_emb = tf.get_variable(
                        'dec_enc_pos_emb',
                        dtype=tf.float32,
                        shape=[self.enc.shape[1]],
                        initializer=tf.contrib.layers.xavier_initializer())
                    ## Multihead Attention ( self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        pos_emb=dec_dec_pos_emb,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention ( vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        pos_emb=dec_enc_pos_emb,
                        num_units=hp.emb_dim,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=self.is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(self.en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
Example #9
    def build_graph(self):

        # Define input
        with tf.name_scope("input_ph"):
            self.X_ind = tf.placeholder(dtype=tf.int32, 
                                        shape=[None, self.field_size], 
                                        name="X_index")
            self.label = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name="label")
            self.is_training = tf.placeholder(dtype=tf.bool, 
                                              shape=(), 
                                              name="is_training")

        # lookup and process embedding
        with tf.name_scope("embedding"):
            self.emb = embedding(inputs=self.X_ind,
                                 vocab_size=self.feat_size,
                                 num_units=self.embedding_dim,
                                 scale=self.scale_embedding,
                                 scope="embedding_process")

        # self.emb: raw embedding, features: used for later
        features = self.emb

        with tf.name_scope("Multilayer_attn"):
            with tf.variable_scope("attention_head") as scope:
                features, _ = multihead_attention(
                    queries=features,
                    keys=features,
                    num_units=self.attention_size*self.num_head,
                    num_heads=self.num_head,
                    dropout_rate=self.dropout_rate,
                    is_training=self.is_training,
                    scope="multihead_attention"
                )

                features = feedforward(
                    inputs=features,
                    num_units=[4 * self.embedding_dim,
                               self.embedding_dim],
                    scope="feed_forward"
                )  # [N, T, dim]

        # multi-head feature to agg 1st order feature
        with tf.name_scope("Agg_first_order") as scope:
            ctx_order_1 = tf.get_variable(
                name="context_order_1",
                shape=(self.attention_size),
                dtype=tf.float32)

            agg_feat_1, self.attn_1 = agg_attention(
                query=ctx_order_1,
                keys=features,
                values=features,
                attention_size=self.attention_size,
                regularize_scale=self.regularization_weight
                )  # [N, dim]

        # build second order cross
        with tf.name_scope("Second_order") as scope:
            feat_2 = tf.multiply(
                features,
                tf.expand_dims(agg_feat_1, axis=1)
                )  # [N, T, dim]

            feat_2 += features  # Add the residual, [N, T, dim]

            ctx_order_2 = tf.get_variable(
                name="context_order_2",
                shape=(self.attention_size),
                dtype=tf.float32
                )

            agg_feat_2, self.attn_2 = agg_attention(
                query=ctx_order_2,
                keys=feat_2,
                values=feat_2,
                attention_size=self.attention_size,
                regularize_scale=self.regularization_weight
                )

        # build third order cross
        with tf.name_scope("Third_order") as scope:
            feat_3 = tf.multiply(
                features,
                tf.expand_dims(agg_feat_2, axis=1)
                )  # [N, T, dim]

            feat_3 += feat_2  # Add the residual, [N, T, dim]

            ctx_order_3 = tf.get_variable(
                name="context_order_3",
                shape=(self.attention_size),
                dtype=tf.float32
                )

            agg_feat_3, self.attn_3 = agg_attention(
                query=ctx_order_3,
                keys=feat_3,
                values=feat_3,
                attention_size=self.attention_size,
                regularize_scale=self.regularization_weight
                )

        with tf.name_scope("Merged_features"):

            # concatenate [enc, second_cross, third_cross]
            # TODO: can + multihead_features
            all_features = tf.stack([
                agg_feat_1,
                agg_feat_2,
                agg_feat_3,
                ],
                axis=1, name="concat_feature")  # (N, k, C)

        # map C to pool_filter_size dimension
        mapped_all_feature = tf.layers.conv1d(
            inputs=all_features,
            filters=self.pool_filter_size,
            kernel_size=1,
            use_bias=True,
            name="Mapped_all_feature"
        )  # (N, k, pf_size)
        
        # apply context vector
        feature_weights = tf.nn.softmax(
            tf.squeeze(
                tf.layers.dense(
                    mapped_all_feature,
                    units=1,
                    activation=None,
                    use_bias=False
                ),  # (N, k, 1),
                [2]
            ), # (N, k)
        )  # (N, k)

        self.attn_k = feature_weights
        
        # weighted sum
        weighted_sum_feat = tf.reduce_sum(
            tf.multiply(
                all_features,
                tf.expand_dims(feature_weights, axis=2),
            ),  # (N, k, C)
            axis=[1],
            name="Attn_weighted_sum_feature"
        )  # (N, C)
        
        # last non-linear
        hidden_logits = tf.layers.dense(
            weighted_sum_feat,
            units=self.embedding_dim // 2,
            activation=tf.nn.relu,
            use_bias=False,
            name="HiddenLogits"
        )  # (N, C/2)

        # the last dense for logits
        logits = tf.squeeze(
            tf.layers.dense(
                hidden_logits,
                units=1,
                activation=None,
                use_bias=False,
                name="Logits"
            ),  # (N, 1)
            axis=[1]
        )  # (N,)

        # sigmoid logits
        self.sigmoid_logits = tf.nn.sigmoid(logits)

        # regularization term
        self.regularization_loss = tf.losses.get_regularization_loss()

        self.logloss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.expand_dims(self.label, -1),
                logits=tf.expand_dims(logits, -1),
                name="SumLogLoss"))

        self.mean_logloss = tf.divide(
            self.logloss,
            tf.to_float(self.batch_size),
            name="MeanLogLoss"
            )

        # overall loss
        self.overall_loss = tf.add(
            self.mean_logloss,
            self.regularization_loss,
            name="OverallLoss"
        )
        
        tf.summary.scalar("Mean_LogLoss", self.mean_logloss)
        tf.summary.scalar("Reg_Loss", self.regularization_loss)
        tf.summary.scalar("Overall_Loss", self.overall_loss)

        self.train_op = self.optimizer.minimize(self.overall_loss, 
                                                global_step=self.global_step)
        self.merged = tf.summary.merge_all()
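
The three "order" blocks above follow a single recursive pattern; here is a compact sketch of that pattern, assuming tensorflow and the same agg_attention helper are in scope (the function name, loop bound, and variable names are illustrative, not part of the original code).

def higher_order_crosses(features, num_orders, attention_size, reg_scale):
    # features: (N, T, dim). Returns the aggregated vector and attention weights of each order.
    feats, aggs, attns = features, [], []
    for k in range(1, num_orders + 1):
        ctx = tf.get_variable("context_order_%d" % k, shape=(attention_size,), dtype=tf.float32)
        agg, attn = agg_attention(query=ctx, keys=feats, values=feats,
                                  attention_size=attention_size, regularize_scale=reg_scale)
        aggs.append(agg)
        attns.append(attn)
        # Next order: element-wise cross of the raw features with the aggregated vector, plus a residual.
        feats = tf.multiply(features, tf.expand_dims(agg, axis=1)) + feats
    return aggs, attns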
Example #10
    def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest, dropout_rate=0.2,
                 seq_len=256, num_blocks=2):
        super(Model_MSARec, self).__init__(n_mid, embedding_dim, hidden_size,
                                                   batch_size, seq_len, flag="MSARec")

        with tf.variable_scope("MSARec", reuse=tf.AUTO_REUSE) as scope:

            # Positional Encoding
            t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            self.mid_his_batch_embedded += t

            # Dropout
            self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                         rate=dropout_rate,
                                         training=tf.convert_to_tensor(True))
            self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

            # Build blocks
            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):

                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=hidden_size,
                                                   num_heads=num_interest,
                                                   dropout_rate=dropout_rate,
                                                   is_training=True,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                           dropout_rate=dropout_rate, is_training=True)
                    self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))
            # (b, seq_len, dim)
            self.seq = normalize(self.seq)

            self.dim = embedding_dim

            item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])
            # t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
            # item_list_add_pos = item_list_emb + t

            num_heads = num_interest
            fc1 = tf.layers.dense(item_list_emb, hidden_size * 4, activation=tf.nn.relu)
            fc2 = tf.layers.dense(fc1, num_heads, activation=tf.nn.tanh)
            # (b, num_heads, seq_len)
            fc2 = tf.transpose(fc2, [0, 2, 1])
            interest_emb = tf.layers.dense(fc2, embedding_dim, activation=tf.nn.relu)

            # with tf.variable_scope("multi_interest", reuse=tf.AUTO_REUSE) as scope:
            #     # item_list_add_pos: (b, seq_len, embedding_dim)
            #     # item_hidden: (b, sql_len, hidden_size * 4)
            #     # item_hidden = tf.layers.dense(item_list_add_pos, hidden_size * 4, activation=tf.nn.tanh)
            #     item_hidden = tf.layers.dense(item_list_emb, hidden_size * 4, activation=tf.nn.tanh)
            #     # item_att_w: (b, sql_len, num_heads)
            #     item_att_w = tf.layers.dense(item_hidden, num_heads, activation=tf.nn.tanh)
            #     # item_att_w: (b, num_heads, sql_len)
            #     item_att_w = tf.transpose(item_att_w, [0, 2, 1])
            #
            #     # atten_mask: (b, num_heads, sql_len)
            #     atten_mask = tf.tile(tf.expand_dims(self.mask, axis=1), [1, num_heads, 1])
            #     paddings = tf.ones_like(atten_mask) * (-2 ** 32 + 1)
            #
            #     # assign an extremely small (very negative) value to the padded positions
            #     item_att_w = tf.where(tf.equal(atten_mask, 0), paddings, item_att_w)
            #     item_att_w = tf.nn.softmax(item_att_w)
            #
            #     # item_att_w [batch, num_heads, seq_len]
            #     # item_list_emb [batch, seq_len, embedding_dim]
            #     # interest_emb (batch, num_heads, embedding_dim)
            #     interest_emb = tf.matmul(item_att_w, item_list_emb)

            self.user_eb = interest_emb

            # item_list_emb = [-1, seq_len, embedding_dim]
            # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
            atten = tf.matmul(self.user_eb, tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
            atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

            # Pick the user interest vector most similar to the target item
            readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                                tf.argmax(atten, axis=1, output_type=tf.int32) + tf.range(
                                    tf.shape(item_list_emb)[0]) * num_heads)

            self.build_sampled_softmax_loss(self.item_eb, readout)
Example #11
    def __init__(self):
        self.graph = tf.Graph()
        self.tensor_info = {}

        self.build_inputs()

        with self.graph.as_default():
            self.saver = tf.train.Saver(max_to_keep=1)

            # DIEN
            with tf.name_scope('rnn_1'):
                rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE),
                                             inputs=self.item_his_eb,
                                             sequence_length=self.seq_len_ph,
                                             dtype=tf.float32,
                                             scope="gru1")
            with tf.name_scope('Attention_layer_1'):
                att_outputs, alphas = din_fcn_attention(self.item_eb,
                                                        rnn_outputs,
                                                        ATTENTION_SIZE,
                                                        self.mask_ph,
                                                        softmax_stag=1,
                                                        stag='1_1',
                                                        mode='LIST',
                                                        return_alphas=True)
            with tf.name_scope('rnn_2'):
                rnn_outputs2, final_state2 = dynamic_rnn(
                    VecAttGRUCell(HIDDEN_SIZE),
                    inputs=rnn_outputs,
                    att_scores=tf.expand_dims(alphas, -1),
                    sequence_length=self.seq_len_ph,
                    dtype=tf.float32,
                    scope="gru2")

            # DSIN
            #with tf.name_scope("Self_Attention_layer"):

            hidden_units = 512
            num_blocks = 6
            num_heads = 8
            dropout_rate = 0.1

            with tf.variable_scope("encoder"):
                # Embedding
                self.enc = embedding(
                    self.recent_behavior_ph,
                    vocab_size=USER_API_SUM,  #   len(de2idx), 200
                    num_units=hidden_units,  #128
                    zero_pad=True,  # keep the padding row at zero
                    scale=True,
                    scope="enc_embed")
                #self.enc = self.user_api_all_eb
                #FLAGS.batch_size,USER_API_LEN
                batch = tf.shape(self.recent_behavior_ph)  # dynamic [batch_size, USER_API_LEN]
                self.enc += tf.cast(
                    positional_encoding(N=tf.shape(self.recent_behavior_ph)[0],
                                        T=USER_API_LEN,
                                        num_units=hidden_units,
                                        zero_pad=False,
                                        scale=False,
                                        scope='enc_pe'), tf.float32)

                ## Dropout
                #self.enc = tf.layers.dropout(self.enc,rate = dropout_rate,
                #                             training = tf.convert_to_tensor(is_training))

                ## Blocks
                for i in range(num_blocks):
                    with tf.variable_scope("num_blocks_{}".format(i)):
                        ### Multihead attention: shape [128, 10, 512] is unchanged
                        self.enc = multihead_attention(
                            queries=self.enc,
                            keys=self.enc,
                            num_units=hidden_units,
                            num_heads=num_heads,
                            dropout_rate=dropout_rate,
                            #is_training = is_training,
                            causality=False)
                        self.enc = feedforward(
                            self.enc,
                            num_units=[4 * hidden_units, hidden_units])

            # Final linear projection
            #self.logits = tf.layers.dense(self.dec,USER_API_LEN*3))
            # print(self.enc.get_shape().as_list())
            # print(tf.shape(self.enc))
            self.user_api_eb_sum = tf.reduce_sum(self.enc, -2)

            inp = tf.concat([
                self.item_eb, self.item_his_eb_sum, self.item_eb *
                self.item_his_eb_sum, final_state2, self.mobile_embedded,
                self.province_embedded, self.city_embedded,
                self.grade_embedded, self.chinese_embedded, self.math_embedded,
                self.english_embedded, self.purchase_embedded,
                self.activity_embedded, self.freshness_embedded,
                self.hour_embedded, self.ad_img_eb_sum, self.user_api_eb_sum
            ], -1)

        self.build_fcn_net(
            inp,
            use_dice=True,
        )