def global_attention(inputs, batch, attention_size, vocabulary_size, ATT_W, load_LR_model=False):
    """Global attention layer that reduces RNN/Bi-RNN outputs with an attention vector.

    Args:
        inputs: The attention inputs. Matches the outputs of the RNN/Bi-RNN layer (not the final state).
            For an RNN, this must be the RNN outputs `Tensor`.
            For a bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing
            the forward and backward RNN output `Tensor`s.
        batch: The RNN inputs, i.e. a batch of the dataset.
        attention_size: Linear size of the attention weights.
        vocabulary_size: Vocabulary size of the train + test dataset.
        ATT_W: The global attention weights. For a jointly trained logistic regression model,
            pass its parameters to initialize the global attention weights.
        load_LR_model: If True, the global attention weights are initialized from the LR model weights.

    Returns:
        The attention output `Tensor`.
            For an RNN, this is shaped `[batch_size, cell.output_size]`.
            For a bidirectional RNN, this is shaped `[batch_size, cell_fw.output_size + cell_bw.output_size]`.
        betas: Global attention scores, `[vocabulary_size, 1]`.
    """
    if isinstance(inputs, tuple):
        # For a Bi-RNN, concatenate the forward and backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    hidden_size = inputs.shape[2].value
    inputs = tf.layers.batch_normalization(inputs)

    w_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
    b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)

    if load_LR_model:
        betas = tf.squeeze(ATT_W)
        u = tf.nn.embedding_lookup(tf.abs(betas), batch)
    else:
        betas = tf.Variable(tf.random_normal([vocabulary_size], stddev=0.1))
        u = tf.nn.embedding_lookup(betas, batch)
    alphas = tf.nn.softmax(u, name='alphas')

    # The (Bi-)RNN output is reduced with the attention vector; the result has shape (B, D).
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
    output = feedforward(output, num_units=[hidden_size, 2 * hidden_size, hidden_size])
    output = tf.layers.batch_normalization(output)
    return output, betas
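# --- Usage sketch (not part of the original code) ---
# A minimal example of feeding Bi-RNN outputs into global_attention. The placeholder
# name, vocabulary/embedding sizes, and GRU widths below are assumptions chosen for
# illustration; the `feedforward` helper used inside global_attention must be available.
token_ids = tf.placeholder(tf.int32, [None, 50], name="token_ids")                 # (B, T)
emb_table = tf.get_variable("emb_table", [30000, 64])
emb = tf.nn.embedding_lookup(emb_table, token_ids)                                 # (B, T, 64)

cell_fw = tf.nn.rnn_cell.GRUCell(128)
cell_bw = tf.nn.rnn_cell.GRUCell(128)
bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, emb, dtype=tf.float32)

# bi_outputs is the (outputs_fw, outputs_bw) tuple that global_attention expects.
sent_repr, betas = global_attention(bi_outputs, token_ids,
                                    attention_size=64,
                                    vocabulary_size=30000,
                                    ATT_W=None,
                                    load_LR_model=False)
# sent_repr: (B, 256) attention-pooled representation; betas: per-token global scores.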
def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                         batch_size, seq_len, flag="Model_SAKmeans")

    with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq),
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Feed forward
                self.seq = feedforward(normalize(self.seq),
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim)
        self.seq = normalize(self.seq)

        num_heads = num_interest
        self.user_eb = getKVector(sess, self.seq, num_heads)
        self.dim = embedding_dim
        item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

        # item_list_emb: [-1, seq_len, embedding_dim]
        # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
        atten = tf.matmul(self.user_eb,
                          tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
        atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

        # Select the user-interest vector most similar to the target item.
        readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                            tf.argmax(atten, axis=1, output_type=tf.int32) +
                            tf.range(tf.shape(item_list_emb)[0]) * num_heads)

        self.build_sampled_softmax_loss(self.item_eb, readout)
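# --- Indexing sketch (not part of the original code) ---
# The readout above flattens user_eb to (batch * num_heads, dim) and offsets each
# example's argmax head index by its row block, picking one interest vector per example
# (the same trick appears in Model_MSARec below). A tiny NumPy illustration with
# made-up sizes and an assumed argmax result:
import numpy as np

batch, num_heads, dim = 2, 4, 3
user_eb = np.arange(batch * num_heads * dim, dtype=np.float32).reshape(batch, num_heads, dim)
best_head = np.array([2, 0])  # per-example argmax of the attention scores (assumed)

flat = user_eb.reshape(-1, dim)                           # (batch * num_heads, dim)
readout = flat[best_head + np.arange(batch) * num_heads]  # (batch, dim)

assert np.allclose(readout[0], user_eb[0, 2])
assert np.allclose(readout[1], user_eb[1, 0])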
def build_blocks(self, inputs, masks, reuse=None):
    self.blk = inputs
    for i in range(self.num_blocks):
        with tf.variable_scope("blocks_{}".format(i), reuse=reuse):
            ## Multihead Attention (self-attention)
            self.blk = multihead_attention(queries=self.blk,
                                           keys=self.blk,
                                           qkv_masks=masks,
                                           num_units=self.hidden_units,
                                           num_heads=self.num_heads,
                                           dropout_rate=self.dropout,
                                           # is_training=is_training,
                                           causality=False,
                                           scope="self_attention",
                                           reuse=reuse)
            self.blk = feedforward(self.blk,
                                   num_units=[4 * self.hidden_units, self.hidden_units],
                                   reuse=reuse)
    return self.blk
def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                       batch_size, seq_len, flag="Model_SASRec")

    with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq),
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Feed forward
                self.seq = feedforward(normalize(self.seq),
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim)
        self.seq = normalize(self.seq)

        self.sum_pooling = tf.reduce_sum(self.seq, 1)
        fc1 = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
        fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
        fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
        self.user_eb = tf.layers.dense(fc3, hidden_size, activation=tf.nn.relu)

        self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
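# --- Reference sketch (not part of the original code) ---
# build_sampled_softmax_loss is defined in the shared base class and is not shown here.
# Below is a minimal sketch of what such a helper typically looks like with
# tf.nn.sampled_softmax_loss; the member names (mid_embeddings_var, mid_embeddings_bias,
# mid_batch_ph, lr), neg_num, and the optimizer choice are assumptions, not the
# repository's actual implementation.
def build_sampled_softmax_loss_sketch(self, item_eb, user_eb, neg_num=10):
    # item_eb is unused here: sampled softmax looks up the positive item by id
    # (self.mid_batch_ph) inside the full item embedding table.
    self.loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=self.mid_embeddings_var,        # (n_mid, dim) item embedding table
            biases=self.mid_embeddings_bias,        # (n_mid,)
            labels=tf.reshape(self.mid_batch_ph, [-1, 1]),
            inputs=user_eb,                         # (batch, dim) user representation
            num_sampled=neg_num * self.batch_size,  # number of sampled negatives
            num_classes=self.n_mid))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)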
def build_model(self):
    # define decoder inputs
    self.decoder_inputs = tf.concat(
        (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x,
                             vocab_size=len(self.de2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="enc_embed")
        sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
        key_masks = tf.expand_dims(sign, -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.enc += positional_encoding(self.x,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="enc_pe")
        else:
            self.enc += embedding(tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                [tf.shape(self.x)[0], 1]),
                vocab_size=hp.maxlen,
                num_units=hp.emb_dim,
                zero_pad=False,
                scale=False,
                scope="enc_pe")
        self.enc *= key_masks

        ## Dropout
        self.enc = tf.layers.dropout(self.enc,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                ### Multihead Attention
                self.enc = multihead_attention(
                    queries=self.enc,
                    keys=self.enc,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=False)

                ### Feed Forward
                self.enc = feedforward(
                    self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decoder_inputs,
                             vocab_size=len(self.en2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="dec_embed")
        key_masks = tf.expand_dims(
            tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.dec += positional_encoding(self.decoder_inputs,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="dec_pe")
        else:
            self.dec += embedding(tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                [tf.shape(self.decoder_inputs)[0], 1]),
                vocab_size=hp.maxlen,
                num_units=hp.emb_dim,
                zero_pad=False,
                scale=False,
                scope="dec_pe")
        self.dec *= key_masks

        ## Dropout
        self.dec = tf.layers.dropout(self.dec,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                ## Multihead Attention (self-attention)
                self.dec = multihead_attention(
                    queries=self.dec,
                    keys=self.dec,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=True,
                    scope="self_attention")

                ## Multihead Attention (vanilla attention)
                self.dec = multihead_attention(
                    queries=self.dec,
                    keys=self.enc,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=False,
                    scope="vanilla_attention")

                ## Feed Forward
                self.dec = feedforward(
                    self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Final linear projection
    self.logits = tf.layers.dense(self.dec, len(self.en2idx))
    self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # define decoder inputs
        # id 2 stands for <S>, the initial decoder input. This step shifts y right by one
        # position, e.g. y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]]
        # becomes [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]]; this is the
        # first input to the decoder self-attention.
        # During training, decoder_inputs looks as above. At inference time the true y is unknown,
        # so y is fed as an all-zero tensor of shape [batch_size, max_length]; after the shift it
        # becomes [["<s>", 0, 0, 0], ...]. Decoding then loops: take the first prediction, feed it
        # back in, take the first two predictions, and so on.
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.x,
                vocab_size=len(de2idx),
                num_units=hp.hidden_units,
                zero_pad=True,  # row 0 is the padding embedding; True forces that row to zeros
                                # (the random initialization may not be zero)
                scale=True,
                scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope='enc_pe')
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks: stack hp.num_blocks (6) encoder blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(en2idx),
                                 num_units=hp.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                    [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            # Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection: a classification over the target vocabulary
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
            (tf.reduce_sum(self.istarget)))

        if is_training:
            # Loss
            # Label smoothing: the 0s in the one-hot targets become a small value and the 1s
            # become a value slightly below 1.
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
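# --- Reference sketch (not part of the original code) ---
# label_smoothing is defined elsewhere in the repository and is not shown here. Below is
# a minimal sketch of the standard formulation the comment above describes (epsilon=0.1
# is an assumed default): every 1 in the one-hot targets becomes 1 - epsilon + epsilon/V
# and every 0 becomes epsilon/V, so each row still sums to 1.
def label_smoothing_sketch(inputs, epsilon=0.1):
    V = inputs.get_shape().as_list()[-1]  # vocabulary size (size of the last dimension)
    return ((1 - epsilon) * inputs) + (epsilon / V)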
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            # x: (32, 10), y: (32, 10) -- a batch of 32 sentences, each of length 10
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        """
        Define the decoder inputs.
        If the reference translation is  "i am a student </S>",
        the decoder input should be      "<S> i am a student".
        """
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]),
            -1)  # 2 stands for <S>, the initial decoder input

        # Vocabularies
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.x,
                vocab_size=len(de2idx),
                num_units=hp.hidden_units,
                zero_pad=True,  # keep the padding row at zero
                scale=True,
                scope="enc_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope='enc_pe')
            else:
                self.enc += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                    [tf.shape(self.x)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="enc_pe")

            ## Dropout
            self.enc = tf.layers.dropout(
                self.enc,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs,
                                 vocab_size=len(en2idx),
                                 num_units=hp.hidden_units,
                                 scale=True,
                                 scope="dec_embed")

            ## Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs,
                                                num_units=hp.hidden_units,
                                                zero_pad=False,
                                                scale=False,
                                                scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(
                    tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                    [tf.shape(self.decoder_inputs)[0], 1]),
                    vocab_size=hp.maxlen,
                    num_units=hp.hidden_units,
                    zero_pad=False,
                    scale=False,
                    scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(
                self.dec,
                rate=hp.dropout_rate,
                training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ## Multihead Attention (self-attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.dec,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=True,
                        scope="self_attention")

                    ## Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(
                        queries=self.dec,
                        keys=self.enc,
                        num_units=hp.hidden_units,
                        num_heads=hp.num_heads,
                        dropout_rate=hp.dropout_rate,
                        is_training=is_training,
                        causality=False,
                        scope="vanilla_attention")

                    ## Feed Forward
                    self.dec = feedforward(
                        self.dec,
                        num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(
            tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
            (tf.reduce_sum(self.istarget)))

        if is_training:
            # Loss
            # Label smoothing: the 0s in the one-hot targets become a small value and the 1s
            # become a value slightly below 1.
            self.y_smoothed = label_smoothing(
                tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(
                self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9,
                                                    beta2=0.98,
                                                    epsilon=1e-8)
            self.train_op = self.optimizer.minimize(
                self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
def build_model(self):
    # define decoder inputs
    self.decoder_inputs = tf.concat(
        (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x,
                             vocab_size=len(self.de2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="enc_embed")
        sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
        key_masks = tf.expand_dims(sign, -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.enc += positional_encoding(self.x,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="enc_pe")
        else:
            cells = self.rnn_cell()
            encoder_output, _encoder_state = tf.nn.dynamic_rnn(
                cells, self.enc, sequence_length=self.x_len, dtype=tf.float32)
            self.enc = tf.concat([self.enc, encoder_output], axis=-1)
            self.enc = tf.layers.dense(self.enc, hp.emb_dim, activation=tf.nn.relu)
        self.enc *= key_masks

        ## Dropout
        self.enc = tf.layers.dropout(self.enc,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                pos_emb = tf.get_variable(
                    'enc_pos_emb',
                    dtype=tf.float32,
                    shape=[self.enc.shape[1]],
                    initializer=tf.contrib.layers.xavier_initializer())

                ### Multihead Attention
                self.enc = multihead_attention(
                    queries=self.enc,
                    keys=self.enc,
                    pos_emb=pos_emb,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=False)

                ### Feed Forward
                self.enc = feedforward(
                    self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decoder_inputs,
                             vocab_size=len(self.en2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="dec_embed")
        key_masks = tf.expand_dims(
            tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.dec += positional_encoding(self.decoder_inputs,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="dec_pe")
        else:
            cells = self.rnn_cell()
            decoder_output, _decoder_state = tf.nn.dynamic_rnn(
                cells, self.dec, sequence_length=self.y_len, dtype=tf.float32)
            self.dec = tf.concat([self.dec, decoder_output], axis=-1)
            self.dec = tf.layers.dense(self.dec, hp.emb_dim, activation=tf.nn.relu)
        self.dec *= key_masks

        ## Dropout
        self.dec = tf.layers.dropout(self.dec,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                dec_dec_pos_emb = tf.get_variable(
                    'dec_de_pos_emb',
                    dtype=tf.float32,
                    shape=[self.dec.shape[1]],
                    initializer=tf.contrib.layers.xavier_initializer())
                dec_enc_pos_emb = tf.get_variable(
                    'dec_enc_pos_emb',
                    dtype=tf.float32,
                    shape=[self.enc.shape[1]],
                    initializer=tf.contrib.layers.xavier_initializer())

                ## Multihead Attention (self-attention)
                self.dec = multihead_attention(
                    queries=self.dec,
                    keys=self.dec,
                    pos_emb=dec_dec_pos_emb,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=True,
                    scope="self_attention")

                ## Multihead Attention (vanilla attention)
                self.dec = multihead_attention(
                    queries=self.dec,
                    keys=self.enc,
                    pos_emb=dec_enc_pos_emb,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=False,
                    scope="vanilla_attention")

                ## Feed Forward
                self.dec = feedforward(
                    self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Final linear projection
    self.logits = tf.layers.dense(self.dec, len(self.en2idx))
    self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
def build_graph(self):
    # Define inputs
    with tf.name_scope("input_ph"):
        self.X_ind = tf.placeholder(dtype=tf.int32,
                                    shape=[None, self.field_size],
                                    name="X_index")
        self.label = tf.placeholder(dtype=tf.float32, shape=[None], name="label")
        self.is_training = tf.placeholder(dtype=tf.bool, shape=(), name="is_training")

    # look up and process the embedding
    with tf.name_scope("embedding"):
        self.emb = embedding(inputs=self.X_ind,
                             vocab_size=self.feat_size,
                             num_units=self.embedding_dim,
                             scale=self.scale_embedding,
                             scope="embedding_process")

    # self.emb: raw embedding; features: working tensor used below
    features = self.emb

    with tf.name_scope("Multilayer_attn"):
        with tf.variable_scope("attention_head") as scope:
            features, _ = multihead_attention(
                queries=features,
                keys=features,
                num_units=self.attention_size * self.num_head,
                num_heads=self.num_head,
                dropout_rate=self.dropout_rate,
                is_training=self.is_training,
                scope="multihead_attention")
            features = feedforward(
                inputs=features,
                num_units=[4 * self.embedding_dim, self.embedding_dim],
                scope="feed_forward")  # [N, T, dim]

    # aggregate the multi-head features into the first-order feature
    with tf.name_scope("Agg_first_order") as scope:
        ctx_order_1 = tf.get_variable(name="context_order_1",
                                      shape=(self.attention_size,),
                                      dtype=tf.float32)
        agg_feat_1, self.attn_1 = agg_attention(
            query=ctx_order_1,
            keys=features,
            values=features,
            attention_size=self.attention_size,
            regularize_scale=self.regularization_weight)  # [N, dim]

    # build the second-order cross
    with tf.name_scope("Second_order") as scope:
        feat_2 = tf.multiply(features, tf.expand_dims(agg_feat_1, axis=1))  # [N, T, dim]
        feat_2 += features  # add the residual, [N, T, dim]
        ctx_order_2 = tf.get_variable(name="context_order_2",
                                      shape=(self.attention_size,),
                                      dtype=tf.float32)
        agg_feat_2, self.attn_2 = agg_attention(
            query=ctx_order_2,
            keys=feat_2,
            values=feat_2,
            attention_size=self.attention_size,
            regularize_scale=self.regularization_weight)

    # build the third-order cross
    with tf.name_scope("Third_order") as scope:
        feat_3 = tf.multiply(features, tf.expand_dims(agg_feat_2, axis=1))  # [N, T, dim]
        feat_3 += feat_2  # add the residual, [N, T, dim]
        ctx_order_3 = tf.get_variable(name="context_order_3",
                                      shape=(self.attention_size,),
                                      dtype=tf.float32)
        agg_feat_3, self.attn_3 = agg_attention(
            query=ctx_order_3,
            keys=feat_3,
            values=feat_3,
            attention_size=self.attention_size,
            regularize_scale=self.regularization_weight)

    with tf.name_scope("Merged_features"):
        # concatenate [enc, second_cross, third_cross]
        # TODO: could also include the multihead features
        all_features = tf.stack([agg_feat_1, agg_feat_2, agg_feat_3],
                                axis=1, name="concat_feature")  # (N, k, C)

        # map C to the pool_filter_size dimension
        mapped_all_feature = tf.layers.conv1d(inputs=all_features,
                                              filters=self.pool_filter_size,
                                              kernel_size=1,
                                              use_bias=True,
                                              name="Mapped_all_feature")  # (N, k, pf_size)

        # apply the context vector
        feature_weights = tf.nn.softmax(
            tf.squeeze(
                tf.layers.dense(mapped_all_feature,
                                units=1,
                                activation=None,
                                use_bias=False),  # (N, k, 1)
                [2]),  # (N, k)
        )  # (N, k)
        self.attn_k = feature_weights

        # weighted sum
        weighted_sum_feat = tf.reduce_sum(
            tf.multiply(all_features,
                        tf.expand_dims(feature_weights, axis=2)),  # (N, k, C)
            axis=[1],
            name="Attn_weighted_sum_feature")  # (N, C)

        # last non-linearity
        hidden_logits = tf.layers.dense(weighted_sum_feat,
                                        units=self.embedding_dim // 2,
                                        activation=tf.nn.relu,
                                        use_bias=False,
                                        name="HiddenLogits")  # (N, C/2)

        # final dense layer for the logits
        logits = tf.squeeze(
            tf.layers.dense(hidden_logits,
                            units=1,
                            activation=None,
                            use_bias=False,
                            name="Logits"),  # (N, 1)
            axis=[1])  # (N,)

    # sigmoid of the logits
    self.sigmoid_logits = tf.nn.sigmoid(logits)

    # regularization term
    self.regularization_loss = tf.losses.get_regularization_loss()

    self.logloss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.expand_dims(self.label, -1),
            logits=tf.expand_dims(logits, -1),
            name="SumLogLoss"))
    self.mean_logloss = tf.divide(self.logloss,
                                  tf.to_float(self.batch_size),
                                  name="MeanLogLoss")

    # overall loss
    self.overall_loss = tf.add(self.mean_logloss,
                               self.regularization_loss,
                               name="OverallLoss")

    tf.summary.scalar("Mean_LogLoss", self.mean_logloss)
    tf.summary.scalar("Reg_Loss", self.regularization_loss)
    tf.summary.scalar("Overall_Loss", self.overall_loss)

    self.train_op = self.optimizer.minimize(self.overall_loss,
                                            global_step=self.global_step)
    self.merged = tf.summary.merge_all()
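# --- Training-step sketch (not part of the original code) ---
# A minimal example of driving the graph built above for one step. The session setup
# and the X_batch / y_batch arrays (shaped (batch_size, field_size) and (batch_size,))
# are assumptions; `model` is an instance of the class after build_graph() has run.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_val, summary = sess.run(
        [model.train_op, model.overall_loss, model.merged],
        feed_dict={model.X_ind: X_batch,
                   model.label: y_batch,
                   model.is_training: True})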
def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_MSARec, self).__init__(n_mid, embedding_dim, hidden_size,
                                       batch_size, seq_len, flag="MSARec")

    with tf.variable_scope("MSARec", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq),
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Feed forward
                self.seq = feedforward(normalize(self.seq),
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim)
        self.seq = normalize(self.seq)

        self.dim = embedding_dim
        item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])
        # t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        # item_list_add_pos = item_list_emb + t

        num_heads = num_interest
        fc1 = tf.layers.dense(item_list_emb, hidden_size * 4, activation=tf.nn.relu)
        fc2 = tf.layers.dense(fc1, num_heads, activation=tf.nn.tanh)
        # (b, num_heads, seq_len)
        fc2 = tf.transpose(fc2, [0, 2, 1])
        interest_emb = tf.layers.dense(fc2, embedding_dim, activation=tf.nn.relu)

        # with tf.variable_scope("multi_interest", reuse=tf.AUTO_REUSE) as scope:
        #     # item_list_add_pos: (b, seq_len, embedding_dim)
        #     # item_hidden: (b, seq_len, hidden_size * 4)
        #     # item_hidden = tf.layers.dense(item_list_add_pos, hidden_size * 4, activation=tf.nn.tanh)
        #     item_hidden = tf.layers.dense(item_list_emb, hidden_size * 4, activation=tf.nn.tanh)
        #     # item_att_w: (b, seq_len, num_heads)
        #     item_att_w = tf.layers.dense(item_hidden, num_heads, activation=tf.nn.tanh)
        #     # item_att_w: (b, num_heads, seq_len)
        #     item_att_w = tf.transpose(item_att_w, [0, 2, 1])
        #
        #     # atten_mask: (b, num_heads, seq_len)
        #     atten_mask = tf.tile(tf.expand_dims(self.mask, axis=1), [1, num_heads, 1])
        #     paddings = tf.ones_like(atten_mask) * (-2 ** 32 + 1)
        #
        #     # assign a very small value to the padded positions
        #     item_att_w = tf.where(tf.equal(atten_mask, 0), paddings, item_att_w)
        #     item_att_w = tf.nn.softmax(item_att_w)
        #
        #     # item_att_w:    (batch, num_heads, seq_len)
        #     # item_list_emb: (batch, seq_len, embedding_dim)
        #     # interest_emb:  (batch, num_heads, embedding_dim)
        #     interest_emb = tf.matmul(item_att_w, item_list_emb)

        self.user_eb = interest_emb

        # item_list_emb: [-1, seq_len, embedding_dim]
        # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
        atten = tf.matmul(self.user_eb,
                          tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
        atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

        # Select the user-interest vector most similar to the target item.
        readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                            tf.argmax(atten, axis=1, output_type=tf.int32) +
                            tf.range(tf.shape(item_list_emb)[0]) * num_heads)

        self.build_sampled_softmax_loss(self.item_eb, readout)
def __init__(self):
    self.graph = tf.Graph()
    self.tensor_info = {}
    self.build_inputs()

    with self.graph.as_default():
        self.saver = tf.train.Saver(max_to_keep=1)

        # DIEN
        with tf.name_scope('rnn_1'):
            rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE),
                                         inputs=self.item_his_eb,
                                         sequence_length=self.seq_len_ph,
                                         dtype=tf.float32,
                                         scope="gru1")

        with tf.name_scope('Attention_layer_1'):
            att_outputs, alphas = din_fcn_attention(self.item_eb,
                                                    rnn_outputs,
                                                    ATTENTION_SIZE,
                                                    self.mask_ph,
                                                    softmax_stag=1,
                                                    stag='1_1',
                                                    mode='LIST',
                                                    return_alphas=True)

        with tf.name_scope('rnn_2'):
            rnn_outputs2, final_state2 = dynamic_rnn(
                VecAttGRUCell(HIDDEN_SIZE),
                inputs=rnn_outputs,
                att_scores=tf.expand_dims(alphas, -1),
                sequence_length=self.seq_len_ph,
                dtype=tf.float32,
                scope="gru2")

        # DSIN
        # with tf.name_scope("Self_Attention_layer"):
        hidden_units = 512
        num_blocks = 6
        num_heads = 8
        dropout_rate = 0.1

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(
                self.recent_behavior_ph,
                vocab_size=USER_API_SUM,  # len(de2idx), 200
                num_units=hidden_units,   # 128
                zero_pad=True,            # keep the padding row at zero
                scale=True,
                scope="enc_embed")
            # self.enc = self.user_api_all_eb
            # FLAGS.batch_size, USER_API_LEN
            batch = self.recent_behavior_ph.get_shape().as_list()
            batch = tf.shape(self.recent_behavior_ph)

            self.enc += tf.cast(
                positional_encoding(N=tf.shape(self.recent_behavior_ph)[0],
                                    T=USER_API_LEN,
                                    num_units=hidden_units,
                                    zero_pad=False,
                                    scale=False,
                                    scope='enc_pe'),
                tf.float32)

            ## Dropout
            # self.enc = tf.layers.dropout(self.enc, rate=dropout_rate,
            #                              training=tf.convert_to_tensor(is_training))

            ## Blocks
            for i in range(num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    ### MultiHead Attention; the [128, 10, 512] shape is unchanged
                    self.enc = multihead_attention(
                        queries=self.enc,
                        keys=self.enc,
                        num_units=hidden_units,
                        num_heads=num_heads,
                        dropout_rate=dropout_rate,
                        # is_training=is_training,
                        causality=False)
                    self.enc = feedforward(
                        self.enc, num_units=[4 * hidden_units, hidden_units])

            # Final linear projection
            # self.logits = tf.layers.dense(self.dec, USER_API_LEN * 3)
            # print(self.enc.get_shape().as_list())
            # print(tf.shape(self.enc))
            self.user_api_eb_sum = tf.reduce_sum(self.enc, -2)

        inp = tf.concat([
            self.item_eb, self.item_his_eb_sum,
            self.item_eb * self.item_his_eb_sum, final_state2,
            self.mobile_embedded, self.province_embedded, self.city_embedded,
            self.grade_embedded, self.chinese_embedded, self.math_embedded,
            self.english_embedded, self.purchase_embedded, self.activity_embedded,
            self.freshness_embedded, self.hour_embedded, self.ad_img_eb_sum,
            self.user_api_eb_sum
        ], -1)

        self.build_fcn_net(inp, use_dice=True)