def cross_attention(self, a_repre, b_repre, scope, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # cross-attention: b attends to a
        encx = multihead_attention(queries=b_repre,
                                   keys=a_repre,
                                   values=a_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training,
                                   causality=False)
        # feed forward
        encx = ff(encx, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

        # cross-attention: a attends to b
        ency = multihead_attention(queries=a_repre,
                                   keys=b_repre,
                                   values=b_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training,
                                   causality=False)

        encx, ency = self._infer(encx, ency)

        # feed forward
        ency = ff(ency, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

    return encx, ency
def inter_blocks(self, a_repre, b_repre, scope, training=True, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # cross-attention: b attends to a
        encx = multihead_attention(queries=b_repre,
                                   keys=a_repre,
                                   values=a_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=training,
                                   causality=False)
        # feed forward
        encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

        # cross-attention: a attends to b
        ency = multihead_attention(queries=a_repre,
                                   keys=b_repre,
                                   values=b_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=training,
                                   causality=False)
        # feed forward
        ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

    return encx, ency
def decode(self, decoder_inputs, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)

    Returns
    dec: decoder outputs. (N, T2, d_model)
    outputs: list of intermediate outputs, one per variable scope
    scopes: list of the corresponding variable-scope names
    '''
    scopes = []
    outputs = []
    with tf.variable_scope("decoder_embedding_lookup", reuse=tf.AUTO_REUSE):
        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(dec)

    # Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("decoder_num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            # Masked self-attention (note that causality is True at this time)
            dec = multihead_attention(queries=dec,
                                      keys=dec,
                                      values=dec,
                                      key_masks=tgt_masks,
                                      num_heads=self.hp.num_heads,
                                      dropout_rate=self.hp.dropout_rate,
                                      training=training,
                                      causality=True,
                                      scope="self_attention")

            # Vanilla attention
            dec = multihead_attention(queries=dec,
                                      keys=memory,
                                      values=memory,
                                      key_masks=src_masks,
                                      num_heads=self.hp.num_heads,
                                      dropout_rate=self.hp.dropout_rate,
                                      training=training,
                                      causality=False,
                                      scope="vanilla_attention")

            # Feed Forward
            dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
            scopes.append(tf.get_variable_scope().name)
            outputs.append(dec)

    return dec, outputs, scopes
def decode(self, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True at this time)
                dec = multihead_attention(
                    queries=dec,
                    keys=dec,
                    values=dec,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=True,
                    scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(
                    queries=dec,
                    keys=memory,
                    values=memory,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False,
                    scope="vanilla_attention")

                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        # The embedding matrix can be reused here because, per Section 3.4 of the paper,
        # the input and output embeddings are shared.
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T2); note tf.argmax, not the deprecated tf.arg_max

    return logits, y_hat, y, sents2
def decode(self, ys, memory, training=True):
    with tf.variable_scope('decoder', reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # Embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)
        dec *= self.hp.num_units**0.5  # scale
        dec += position_encoding(dec, self.hp.maxlen)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope('num_blocks_{}'.format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (causality is True here)
                dec = multihead_attention(
                    queries=dec,
                    keys=dec,
                    values=dec,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=True,
                    scope='self_attention')

                # Encoder-decoder attention (causality is False here)
                dec = multihead_attention(
                    queries=dec,
                    keys=memory,
                    values=memory,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False,
                    scope='vanilla_attention')

                # Feed-Forward
                dec = feed_forward(dec, num_units=[self.hp.d_ff, self.hp.num_units])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (hidden_units, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)

        # zero out the logits at index 1 (the <unk> token)
        logits_first = tf.expand_dims(logits[:, :, 0], 2)
        zeros = tf.zeros_like(logits_first)
        logits = tf.concat([logits_first, zeros, logits[:, :, 2:]], axis=2)

        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
def encode(self, xs, training=True):
    '''
    :return: encoder outputs (N, T1, hidden_units)
    '''
    with tf.variable_scope('encoder', reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # Embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)
        enc *= self.hp.num_units**0.5  # scale
        enc += position_encoding(enc, self.hp.maxlen)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope('num_blocks_{}'.format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)

                # feed forward
                enc = feed_forward(enc, num_units=[self.hp.d_ff, self.hp.num_units])

    memory = enc
    return memory, sents1
def _encode(self, enc, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # embedding
        enc *= self.arg.d_model**0.5  # scale

        enc += positional_encoding(enc, self.arg.maxlen1)
        enc = tf.layers.dropout(enc, self.arg.dropout_rate, training=training)

        ## Blocks
        for i in range(self.arg.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    num_heads=self.arg.num_heads,
                    dropout_rate=self.arg.dropout_rate,
                    training=training,
                    causality=False)

    memory = enc
    return memory
def time_encode(self, encoder_inputs):
    '''
    Returns
    memory: encoder outputs. (BATCH, SEQ_LEN, HIDDEN_SIZE)
    '''
    with tf.variable_scope("time_encoder", reuse=tf.AUTO_REUSE):
        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, encoder_inputs)
        enc *= hp.HIDDEN_SIZE**0.5

        enc += positional_encoding(enc, hp.MAX_LEN)
        enc = tf.nn.dropout(enc, self.dropout)

        # Blocks
        for i in range(hp.NUM_BLOCKS):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=hp.NUM_HEADS,
                                          dropout=self.dropout,
                                          causality=True)
                # feed forward
                enc = ff(enc, num_units=[hp.FF_SIZE, hp.HIDDEN_SIZE])

        output = tf.reshape(enc, (-1, hp.MAX_LEN, hp.HIDDEN_SIZE))
        logits = tf.layers.dense(output, len(self.token2idx))

    return logits
def transformer_encode(enc, config, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE):
        # embedding
        enc *= config.d_model**0.5  # scale

        enc += positional_encoding(enc, config.max_sent_num)
        enc = tf.layers.dropout(enc, config.drop_rate, training=training)

        ## Blocks
        for i in range(config.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=config.num_heads,
                                          dropout_rate=config.drop_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[config.d_ff, config.d_model])

    memory = enc
    return memory
def encode(self, xs, training=True):
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, sents1
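Every encoder and decoder above relies on a positional-encoding helper that is typically called as `positional_encoding(inputs, maxlen)` but is not shown in this section (the Model_SASRec-style snippets further below call a variant with a different signature). The following is a minimal sketch of a sinusoidal implementation compatible with that call pattern, assuming the embedding dimension is statically known; the helper actually used by each repository may differ, for example in how padding positions are handled, so treat this as an illustration rather than the shared implementation.

import numpy as np
import tensorflow as tf

def positional_encoding(inputs, maxlen, scope="positional_encoding"):
    '''Sinusoidal positional encoding, broadcast over the batch.
    inputs: (N, T, E) with a statically known E; maxlen >= T. Returns (N, T, E), float32.'''
    E = inputs.get_shape().as_list()[-1]              # static embedding dimension
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]   # dynamic batch size and sequence length
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # position indices, tiled over the batch: (N, T)
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # angle table: (maxlen, E)
        position_enc = np.array([
            [pos / np.power(10000, (i - i % 2) / E) for i in range(E)]
            for pos in range(maxlen)])
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # sine on even dimensions
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # cosine on odd dimensions
        position_enc = tf.convert_to_tensor(position_enc, tf.float32)

        # look up the encoding for each position: (N, T, E)
        outputs = tf.nn.embedding_lookup(position_enc, position_ind)
    return tf.to_float(outputs)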
def dense_blocks(self, a_repre, b_repre, scope, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # self-attention
        _encx = multihead_attention(queries=a_repre,
                                    keys=a_repre,
                                    values=a_repre,
                                    num_heads=self.hp.num_heads,
                                    dropout_rate=self.hp.dropout_rate,
                                    training=self.is_training,
                                    causality=False)
        # self-attention
        _ency = multihead_attention(queries=b_repre,
                                    keys=b_repre,
                                    values=b_repre,
                                    num_heads=self.hp.num_heads,
                                    dropout_rate=self.hp.dropout_rate,
                                    training=self.is_training,
                                    causality=False)

        # inter-attention
        ency = multihead_attention(queries=_encx,
                                   keys=_ency,
                                   values=_ency,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training,
                                   causality=False)
        # inter-attention
        encx = multihead_attention(queries=_ency,
                                   keys=_encx,
                                   values=_encx,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training,
                                   causality=False)

        encx, ency = self._infer(encx, ency)
        # encx, ency, ae_loss = self._dense_infer(encx, ency, x_layer, y_layer, layer_num)

    return encx, ency
def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                         batch_size, seq_len, flag="Model_SAKmeans")

    with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq),
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Feed forward
                self.seq = feedforward(normalize(self.seq),
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))  # (b, seq_len, dim)

        self.seq = normalize(self.seq)

        num_heads = num_interest
        self.user_eb = getKVector(sess, self.seq, num_heads)
        self.dim = embedding_dim
        item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])
        # item_list_emb = [-1, seq_len, embedding_dim]

        # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
        atten = tf.matmul(self.user_eb,
                          tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
        atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

        # pick the user interest vector that is most similar to the target item
        readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                            tf.argmax(atten, axis=1, output_type=tf.int32) +
                            tf.range(tf.shape(item_list_emb)[0]) * num_heads)

        self.build_sampled_softmax_loss(self.item_eb, readout)
def inter_blocks(self, a_repre, b_repre, x_layer, y_layer, layer_num, scope, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # encx, ency = inter_multihead_attention(queries=b_repre,
        #                                        keys=a_repre,
        #                                        values=a_repre,
        #                                        num_heads=self.hp.num_heads,
        #                                        dropout_rate=self.hp.dropout_rate,
        #                                        training=self.is_training,
        #                                        causality=False)

        # cross-attention: b attends to a
        encx = multihead_attention(queries=b_repre,
                                   keys=a_repre,
                                   values=a_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training,
                                   causality=False)
        # feed forward
        encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

        # cross-attention: a attends to b
        ency = multihead_attention(queries=a_repre,
                                   keys=b_repre,
                                   values=b_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training,
                                   causality=False)
        # feed forward
        ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

        # encx, ency, ae_loss = self._dense_infer(encx, ency, x_layer, y_layer, layer_num)
        # encx, ency = self._infer(encx, ency)

    return encx, ency
def encode(self, x, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    scopes = []
    outputs = []
    with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
        self.token2idx, self.idx2token = load_vocab(self.hp.vocab)
        self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(self.embeddings)

    with tf.variable_scope("encoder_embedding_lookup", reuse=tf.AUTO_REUSE):
        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(enc)

    ## Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("encoder_num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            # self-attention
            enc = multihead_attention(queries=enc,
                                      keys=enc,
                                      values=enc,
                                      key_masks=src_masks,
                                      num_heads=self.hp.num_heads,
                                      dropout_rate=self.hp.dropout_rate,
                                      training=training,
                                      causality=False)
            # feed forward
            enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
            scopes.append(tf.get_variable_scope().name)
            outputs.append(enc)

    memory = enc
    return memory, src_masks, outputs, scopes
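This `encode` returns `(memory, src_masks, outputs, scopes)` and pairs naturally with the `decode(self, decoder_inputs, memory, src_masks, training)` variant near the top of this section. A hedged sketch of how the two might be wired together, with the shared-embedding output projection used by the other decode variants, is shown below; the attribute names `self.x` and `self.decoder_inputs` are illustrative assumptions, not names taken from the original model.

# Hypothetical wiring of the mask-aware encode/decode pair (illustrative placeholder names).
memory, src_masks, enc_outputs, enc_scopes = self.encode(self.x, training=True)
dec, dec_outputs, dec_scopes = self.decode(self.decoder_inputs, memory, src_masks, training=True)

# Shared-embedding output projection, as in the other decode variants in this section.
weights = tf.transpose(self.embeddings)            # (d_model, vocab_size)
logits = tf.einsum('ntd,dk->ntk', dec, weights)    # (N, T2, vocab_size)
y_hat = tf.to_int32(tf.argmax(logits, axis=-1))    # (N, T2)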
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding1
        enc1 = tf.nn.embedding_lookup(self.embeddings1, x)  # (N, T1, d_model)
        enc1 *= self.hp.d_model**0.5  # scale
        enc1 += positional_encoding(enc1, self.hp.maxlen1)
        enc1 = tf.layers.dropout(enc1, self.hp.dropout_rate, training=training)

        # embedding2
        enc2 = tf.nn.embedding_lookup(self.embeddings2, x)  # (N, T1, d_model)
        enc2 *= self.hp.d_model**0.5  # scale
        enc2 += positional_encoding(enc2, self.hp.maxlen1)
        enc2 = tf.layers.dropout(enc2, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc1, enc2 = multihead_attention(
                    queries=(enc1, enc2),
                    keys=enc1,
                    values=enc2,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)
                # feed forward
                enc1 = ff(enc1, num_units=[self.hp.d_ff, self.hp.d_model])
                enc2 = ff(enc2, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = (enc1, enc2)
    return memory, sents1, src_masks
def encode(self, xs, training=True):
    '''
    xs: training data

    Returns
    memory: encoder outputs. (N, T1, d_model)
        N: batch size; T1: sentence length
        d_model: 512, the word-embedding dimension
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # xs: tuple of
        #   x: int32 tensor. (N, T1)
        #   x_seqlens: int32 tensor. (N,) sentence lengths
        #   sents1: str tensor. (N,)
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional-encoding vectors
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Encoder blocks
        # num_blocks = 6 sub-blocks, each consisting of multihead_attention + feed_forward
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, sents1, src_masks
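The feed-forward sub-layer in these blocks is delegated to a helper called as `ff(inputs, num_units=[d_ff, d_model])`. Below is a minimal sketch of what such a helper typically contains: two position-wise dense layers followed by a residual connection and layer normalization. The `ln` layer-normalization function here is a stand-in written for this sketch under the conventional sub-layer structure, not a reproduction of any particular repository's code.

def ln(inputs, epsilon=1e-8, scope="ln"):
    '''Layer normalization over the last dimension (stand-in helper for this sketch).'''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        params_shape = inputs.get_shape()[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** 0.5)
        return gamma * normalized + beta

def ff(inputs, num_units, scope="positionwise_feedforward"):
    '''Position-wise feed-forward sub-layer: dense(d_ff, relu) -> dense(d_model),
    then a residual connection and layer normalization. Assumes num_units[1] equals
    the last dimension of inputs so the residual addition is valid.'''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)  # inner layer
        outputs = tf.layers.dense(outputs, num_units[1])                        # outer layer
        outputs += inputs   # residual connection
        outputs = ln(outputs)
    return outputs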
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        if self.hp.fac_embed:
            enc = tf.nn.embedding_lookup(self.embeddings1, x)  # (N, T1, d_embed)
            enc = tf.matmul(enc, self.embeddings2)  # (N, T1, d_model)
        else:
            enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            if self.hp.share_weights:
                vs_name = "blocks_shared"
            else:
                vs_name = "num_blocks_{}".format(i)
            with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, sents1, src_masks
def encode(self, xs, training=True):  # the encoder model
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
        N: batch_size
        T1: sentence length
        d_model: word-embedding dimension
    '''
    # What this does:
    # (1) input word embeddings + positional_encoding
    # (2) the encoder stacks num_blocks (6) blocks, each a multihead attention followed by the fully connected layer ff
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional encoding to the input embeddings
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, sents1, src_masks
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks: mask for the source. (N, T1) = (batch_size, time_steps)
        src_masks = tf.math.equal(x, 0)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        # enc += positional_encoding(enc, self.hp.maxlen1)
        # The positional encoding can only be added because its dimension matches the word embedding.
        # For topic-to-essay generation the encoder does not seem to need positional information,
        # so positional encoding is used only in the decoder.

        # dropout is applied to the embeddings as well
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention, because queries, keys, and values are all 'enc'
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)  # causality decides whether future tokens are masked; False leaves them visible
                # feed forward + residual connection
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, sents1, src_masks
def aggregation(self, a_repre, b_repre):
    dim = a_repre.shape.as_list()[-1]
    with tf.variable_scope("aggregation", reuse=tf.AUTO_REUSE):
        # Blocks
        for i in range(self.hp.num_agg_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Vanilla attention
                a_repre = multihead_attention(queries=a_repre,
                                              keys=a_repre,
                                              values=a_repre,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=self.is_training,
                                              causality=False,
                                              scope="vanilla_attention")
                # Feed Forward
                a_repre = ff(a_repre, num_units=[self.hp.d_ff, dim])

    return a_repre
def build_blocks(self, inputs, masks, reuse=None):
    self.blk = inputs
    for i in range(self.num_blocks):
        with tf.variable_scope("blocks_{}".format(i), reuse=reuse):
            ## Multihead Attention (self-attention)
            self.blk = multihead_attention(queries=self.blk,
                                           keys=self.blk,
                                           qkv_masks=masks,
                                           num_units=self.hidden_units,
                                           num_heads=self.num_heads,
                                           dropout_rate=self.dropout,
                                           # is_training=is_training,
                                           causality=False,
                                           scope="self_attention",
                                           reuse=reuse)
            self.blk = feedforward(self.blk,
                                   num_units=[4 * self.hidden_units, self.hidden_units],
                                   reuse=reuse)
    return self.blk
def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                       batch_size, seq_len, flag="Model_SASRec")

    with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded,
                                     rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq),
                                               keys=self.seq,
                                               num_units=hidden_size,
                                               num_heads=num_interest,
                                               dropout_rate=dropout_rate,
                                               is_training=True,
                                               causality=True,
                                               scope="self_attention")

                # Feed forward
                self.seq = feedforward(normalize(self.seq),
                                       num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate,
                                       is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))  # (b, seq_len, dim)

        self.seq = normalize(self.seq)

        self.sum_pooling = tf.reduce_sum(self.seq, 1)
        fc1 = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
        fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
        fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
        self.user_eb = tf.layers.dense(fc3, hidden_size, activation=tf.nn.relu)

        self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
def encoder_blocks(self, a_repre, reuse=tf.AUTO_REUSE):
    for i in range(self.hp.num_transformer):
        with tf.variable_scope("num_trans_blocks_{}".format(i), reuse=reuse):
            # self-attention
            a_repre = multihead_attention(
                queries=a_repre,
                keys=a_repre,
                values=a_repre,
                num_heads=self.hp.num_heads,
                dropout_rate=self.hp.dropout_rate,
                training=self.is_training,
                causality=False)

            # feed forward
            # a_repre = ff(a_repre, num_units=[self.hp.d_ff, self.hp.d_model])
            a_repre = ff(a_repre, num_units=[self.hp.d_ff, a_repre.shape.as_list()[-1]])
    return a_repre
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, video_path = xs

        # src_masks
        # embedding
        enc = tf.layers.dense(x, self.d_model)
        # src_masks = tf.math.equal(mask, 0)  # (N, T1)
        src_masks = tf.sequence_mask(seqlens)

        # enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        # enc *= self.hp.d_model**0.5  # scale
        enc /= self.hp.d_model**0.5

        enc += positional_encoding(enc, self.hp.n_video)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False,
                )
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, src_masks
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs
        # x = tf.Print(x, [x], message='x =', summarize=10)
        # print_sent = tf.Print(sents1, [sents1], message='sents1 =', summarize=3)
        # with tf.control_dependencies([print_sent]):

        # embedding
        # xs_pri = tf.print('xs =', tf.shape(x), summarize=3)
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)
        # enc_pri = tf.print('enc =', tf.shape(enc), enc, summarize=3)

        ## Blocks
        # with tf.control_dependencies([xs_pri, enc_pri]):
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = enc
    return memory, sents1
def encode(self, encx, src_masks):
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # all_layer = []

        ## Blocks
        for i in range(self.hp.num_blocks_encoder):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                encx = multihead_attention(
                    queries=encx,
                    keys=encx,
                    values=encx,
                    key_masks=src_masks,
                    num_heads=self.hp.num_heads,
                    dropout_rate=self.hp.dropout_rate,
                    training=self.is_training,
                    causality=False)
                # feed forward
                encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])
                # all_layer.append(encx)

    return encx
def _encode(self, x, seq_num, training=True, name=None):
    """
    Returns
    memory: encoder outputs. (N, T1, d_model)
    """
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # embedding
        x = tf.identity(x, "input_x")
        enc = tf.nn.embedding_lookup(self._embeddings[seq_num], x)  # (N, T1, d_model)
        enc *= self._context.d_model**0.5  # scale

        enc += positional_encoding(enc, self._context.maxlens[seq_num])
        enc = tf.layers.dropout(enc, self._context.dropout_rate, training=training)

        # Blocks
        for i in range(self._context.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc,
                    keys=enc,
                    values=enc,
                    num_heads=self._context.num_heads,
                    dropout_rate=self._context.dropout_rate,
                    training=training,
                    causality=False)
                # feed forward
                enc = ff(enc, num_units=[self._context.d_ff, self._context.d_model])

    memory = tf.identity(enc, name=name)
    return memory
def decode(self, ys, x_paraphrased_dict, memory, training=True):
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys
        x_paraphrased_dict, paraphrased_lens, paraphrased_sents = x_paraphrased_dict

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        batch_size = tf.shape(decoder_inputs)[0]
        seqlens = tf.shape(decoder_inputs)[1]
        paraphrased_lens = tf.shape(x_paraphrased_dict)[1]  # x_paraphrased_dict: (N, W2, 2)

        x_paraphrased_o, x_paraphrased_p = x_paraphrased_dict[:, :, 0], x_paraphrased_dict[:, :, 1]
        x_paraphrased_o_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_o)  # (N, W2, d_model)
        if self.hp.paraphrase_type == 0:
            x_paraphrased_p_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_p)
        else:
            x_paraphrased_p_embedding = paraphrased_positional_encoding(x_paraphrased_p, self.hp.maxlen2, self.hp.d_model)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True at this time)
                dec = multihead_attention(queries=dec,
                                          keys=dec,
                                          values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False,
                                          scope="vanilla_attention")
                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # add paraphrased dictionary attention
        h = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(dec, axis=2)
        o_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_o_embedding, axis=1)

        W_a_o = tf.get_variable("original_word_parameter_w", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        V_a_o = tf.get_variable("original_word_parameter_v", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        h_o_concat = tf.concat([h, o_embeding], -1)  # (N, T2, W2, 2*d_model)
        score_tem_o = tf.tanh(W_a_o * h_o_concat)  # (N, T2, W2, 2*d_model)
        score_o = tf.reduce_sum(V_a_o * score_tem_o, axis=-1)  # (N, T2, W2)
        a = tf.nn.softmax(score_o)  # (N, T2, W2)
        c_o = tf.matmul(a, x_paraphrased_o_embedding)  # (N, T2, W2) * (N, W2, d_model) --> (N, T2, d_model)

        p_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_p_embedding, axis=1)
        W_a_p = tf.get_variable("paraphrased_word_parameter_w", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        V_a_p = tf.get_variable("paraphrased_word_parameter_v", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        h_p_concat = tf.concat([h, p_embeding], -1)  # (N, T2, W2, 2*d_model)
        score_tem_p = tf.tanh(W_a_p * h_p_concat)  # (N, T2, W2, 2*d_model)
        score_p = tf.reduce_sum(V_a_p * score_tem_p, axis=-1)  # (N, T2, W2)
        a = tf.nn.softmax(score_p)  # (N, T2, W2)
        c_p = tf.matmul(a, x_paraphrased_p_embedding)  # (N, T2, W2) * (N, W2, d_model) --> (N, T2, d_model)

        c_t = tf.concat([c_o, c_p], axis=-1)  # (N, T2, d_model) --> (N, T2, 2*d_model)
        out_dec = tf.layers.dense(tf.concat([dec, c_t], axis=-1), self.hp.d_model,
                                  activation=tf.tanh, use_bias=False,
                                  kernel_initializer=tf.initializers.random_normal(stddev=0.01, seed=None))

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', out_dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
def build_model(self):
    # define decoder inputs
    self.decoder_inputs = tf.concat(
        (tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2: <S>

    # Encoder
    with tf.variable_scope("encoder"):
        ## Embedding
        self.enc = embedding(self.x,
                             vocab_size=len(self.de2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="enc_embed")
        sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
        key_masks = tf.expand_dims(sign, -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.enc += positional_encoding(self.x,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="enc_pe")
        else:
            self.enc += embedding(tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                [tf.shape(self.x)[0], 1]),
                vocab_size=hp.maxlen,
                num_units=hp.emb_dim,
                zero_pad=False,
                scale=False,
                scope="enc_pe")
        self.enc *= key_masks

        ## Dropout
        self.enc = tf.layers.dropout(self.enc,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                ### Multihead Attention (self-attention)
                self.enc = multihead_attention(
                    queries=self.enc,
                    keys=self.enc,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=False)

                ### Feed Forward
                self.enc = feedforward(self.enc,
                                       num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Decoder
    with tf.variable_scope("decoder"):
        ## Embedding
        self.dec = embedding(self.decoder_inputs,
                             vocab_size=len(self.en2idx),
                             num_units=hp.emb_dim,
                             scale=True,
                             scope="dec_embed")
        key_masks = tf.expand_dims(
            tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

        ## Positional Encoding
        if hp.sinusoid:
            self.dec += positional_encoding(self.decoder_inputs,
                                            num_units=hp.emb_dim,
                                            zero_pad=False,
                                            scale=False,
                                            scope="dec_pe")
        else:
            self.dec += embedding(tf.tile(
                tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                [tf.shape(self.decoder_inputs)[0], 1]),
                vocab_size=hp.maxlen,  # positional indices range over [0, maxlen), mirroring the encoder branch
                num_units=hp.emb_dim,
                zero_pad=False,
                scale=False,
                scope="dec_pe")
        self.dec *= key_masks

        ## Dropout
        self.dec = tf.layers.dropout(self.dec,
                                     rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        ## Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                ## Multihead Attention (self-attention)
                self.dec = multihead_attention(
                    queries=self.dec,
                    keys=self.dec,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=True,
                    scope="self_attention")

                ## Multihead Attention (vanilla attention)
                self.dec = multihead_attention(
                    queries=self.dec,
                    keys=self.enc,
                    num_units=hp.emb_dim,
                    num_heads=hp.num_heads,
                    dropout_rate=hp.dropout_rate,
                    is_training=self.is_training,
                    causality=False,
                    scope="vanilla_attention")

                ## Feed Forward
                self.dec = feedforward(self.dec,
                                       num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Final linear projection
    self.logits = tf.layers.dense(self.dec, len(self.en2idx))
    self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
def decode(self, xs, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    self.memory = memory
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, sents2 = ys
        x, _ = xs

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        attn_dists = []
        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True at this time)
                dec, _ = multihead_attention(queries=dec,
                                             keys=dec,
                                             values=dec,
                                             num_heads=self.hp.num_heads,
                                             dropout_rate=self.hp.dropout_rate,
                                             training=training,
                                             causality=True,
                                             scope="self_attention")

                # Vanilla attention
                dec, attn_dist = multihead_attention(queries=dec,
                                                     keys=self.memory,
                                                     values=self.memory,
                                                     num_heads=self.hp.num_heads,
                                                     dropout_rate=self.hp.dropout_rate,
                                                     training=training,
                                                     causality=False,
                                                     scope="vanilla_attention")
                attn_dists.append(attn_dist)

                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    # Final linear projection (embedding weights are shared)
    weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        gens = tf.layers.dense(logits, 1, activation=tf.sigmoid, trainable=training, use_bias=False)

        logits = tf.nn.softmax(logits)

        # final distribution
        logits = self._calc_final_dist(x, gens, logits, attn_dists[-1])

    return logits, y, sents2