def inter_blocks(self, a_repre, b_repre, scope, training=True, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # cross-attention: b_repre attends to a_repre
        encx = multihead_attention(queries=b_repre, keys=a_repre, values=a_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=training, causality=False)
        # feed forward
        encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

        # cross-attention: a_repre attends to b_repre
        ency = multihead_attention(queries=a_repre, keys=b_repre, values=b_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=training, causality=False)
        # feed forward
        ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])
    return encx, ency
def cross_attention(self, a_repre, b_repre, scope, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # cross-attention: b_repre attends to a_repre
        encx = multihead_attention(queries=b_repre, keys=a_repre, values=a_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training, causality=False)
        # feed forward
        encx = ff(encx, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

        # cross-attention: a_repre attends to b_repre
        ency = multihead_attention(queries=a_repre, keys=b_repre, values=b_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training, causality=False)
        encx, ency = self._infer(encx, ency)
        # feed forward
        ency = ff(ency, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])
    return encx, ency
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding1
        enc1 = tf.nn.embedding_lookup(self.embeddings1, x)  # (N, T1, d_model)
        enc1 *= self.hp.d_model**0.5  # scale
        enc1 += positional_encoding(enc1, self.hp.maxlen1)
        enc1 = tf.layers.dropout(enc1, self.hp.dropout_rate, training=training)

        # embedding2
        enc2 = tf.nn.embedding_lookup(self.embeddings2, x)  # (N, T1, d_model)
        enc2 *= self.hp.d_model**0.5  # scale
        enc2 += positional_encoding(enc2, self.hp.maxlen1)
        enc2 = tf.layers.dropout(enc2, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc1, enc2 = multihead_attention(queries=(enc1, enc2), keys=enc1, values=enc2,
                                                 key_masks=src_masks,
                                                 num_heads=self.hp.num_heads,
                                                 dropout_rate=self.hp.dropout_rate,
                                                 training=training, causality=False)
                # feed forward
                enc1 = ff(enc1, num_units=[self.hp.d_ff, self.hp.d_model])
                enc2 = ff(enc2, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = (enc1, enc2)
    return memory, sents1, src_masks
def decode(self, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec, keys=memory, values=memory,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        # The embedding matrix can be reused directly here because, per Section 3.4
        # of the paper, the input and output embeddings are shared.
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))  # (N, T2); note tf.argmax vs. the deprecated tf.arg_max

    return logits, y_hat, y, sents2
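# Standalone sketch (not part of any model above): a numpy check that the tied-weight
# output projection used in decode -- tf.einsum('ntd,dk->ntk', dec, tf.transpose(embeddings)) --
# is just a batched matmul against the transposed embedding matrix. All sizes are illustrative.
import numpy as np

N, T2, d_model, vocab_size = 2, 3, 4, 5
dec = np.random.randn(N, T2, d_model).astype(np.float32)
embeddings = np.random.randn(vocab_size, d_model).astype(np.float32)

weights = embeddings.T                                  # (d_model, vocab_size)
logits_einsum = np.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
logits_matmul = dec @ weights                           # same result via broadcasting matmul
assert np.allclose(logits_einsum, logits_matmul, atol=1e-5)

y_hat = logits_einsum.argmax(axis=-1)                   # (N, T2) predicted token ids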
def time_encode(self, encoder_inputs):
    '''
    Returns
    memory: encoder outputs. (BATCH, SEQ_LEN, HIDDEN_SIZE)
    '''
    with tf.variable_scope("time_encoder", reuse=tf.AUTO_REUSE):
        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, encoder_inputs)
        enc *= hp.HIDDEN_SIZE**0.5
        enc += positional_encoding(enc, hp.MAX_LEN)
        enc = tf.nn.dropout(enc, self.dropout)

        # Blocks
        for i in range(hp.NUM_BLOCKS):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          num_heads=hp.NUM_HEADS,
                                          dropout=self.dropout,
                                          causality=True)
                # feed forward
                enc = ff(enc, num_units=[hp.FF_SIZE, hp.HIDDEN_SIZE])

        output = tf.reshape(enc, (-1, hp.MAX_LEN, hp.HIDDEN_SIZE))
        logits = tf.layers.dense(output, len(self.token2idx))
    return logits
def transformer_encode(enc, config, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE):
        # embedding
        enc *= config.d_model**0.5  # scale

        enc += positional_encoding(enc, config.max_sent_num)
        enc = tf.layers.dropout(enc, config.drop_rate, training=training)

        ## Blocks
        for i in range(config.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          num_heads=config.num_heads,
                                          dropout_rate=config.drop_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[config.d_ff, config.d_model])
    memory = enc
    return memory
def decode(self, decoder_inputs, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    scopes = []
    outputs = []
    with tf.variable_scope("decoder_embedding_lookup", reuse=tf.AUTO_REUSE):
        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        scopes.append(tf.get_variable_scope().name)
        outputs.append(dec)

    # Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("decoder_num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            # Masked self-attention (Note that causality is True at this time)
            dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                      key_masks=tgt_masks,
                                      num_heads=self.hp.num_heads,
                                      dropout_rate=self.hp.dropout_rate,
                                      training=training, causality=True,
                                      scope="self_attention")

            # Vanilla attention
            dec = multihead_attention(queries=dec, keys=memory, values=memory,
                                      key_masks=src_masks,
                                      num_heads=self.hp.num_heads,
                                      dropout_rate=self.hp.dropout_rate,
                                      training=training, causality=False,
                                      scope="vanilla_attention")
            ### Feed Forward
            dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

            scopes.append(tf.get_variable_scope().name)
            outputs.append(dec)

    return dec, outputs, scopes
def encode(self, xs, training=True):
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1
def inter_blocks(self, a_repre, b_repre, x_layer, y_layer, layer_num, scope, reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # encx, ency = inter_multihead_attention(queries=b_repre,
        #                                        keys=a_repre,
        #                                        values=a_repre,
        #                                        num_heads=self.hp.num_heads,
        #                                        dropout_rate=self.hp.dropout_rate,
        #                                        training=self.is_training,
        #                                        causality=False)

        # cross-attention: b_repre attends to a_repre
        encx = multihead_attention(queries=b_repre, keys=a_repre, values=a_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training, causality=False)
        # feed forward
        encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

        # cross-attention: a_repre attends to b_repre
        ency = multihead_attention(queries=a_repre, keys=b_repre, values=b_repre,
                                   num_heads=self.hp.num_heads,
                                   dropout_rate=self.hp.dropout_rate,
                                   training=self.is_training, causality=False)
        # feed forward
        ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

        # encx, ency, ae_loss = self._dense_infer(encx, ency, x_layer, y_layer, layer_num)
        # encx, ency = self._infer(encx, ency)
    return encx, ency
def encode(self, x, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    scopes = []
    outputs = []
    with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
        self.token2idx, self.idx2token = load_vocab(self.hp.vocab)
        self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)

        scopes.append(tf.get_variable_scope().name)
        outputs.append(self.embeddings)

    with tf.variable_scope("encoder_embedding_lookup", reuse=tf.AUTO_REUSE):
        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        scopes.append(tf.get_variable_scope().name)
        outputs.append(enc)

    ## Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("encoder_num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            # self-attention
            enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                      key_masks=src_masks,
                                      num_heads=self.hp.num_heads,
                                      dropout_rate=self.hp.dropout_rate,
                                      training=training, causality=False)
            # feed forward
            enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

            scopes.append(tf.get_variable_scope().name)
            outputs.append(enc)

    memory = enc
    return memory, src_masks, outputs, scopes
def __init__(self, att_unit, value_attr, num_heads, model_structure, d_ff, d_model, drop_rate):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(att_unit, value_attr, num_heads, model_structure, causality=False)
    self.ffn = ff(num_units=[d_ff, d_model])

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(drop_rate)
    self.dropout2 = tf.keras.layers.Dropout(drop_rate)
def encode(self, xs, training=True):
    '''
    xs: training data
    Returns
    memory: encoder outputs. (N, T1, d_model)
        N: batch size; T1: sentence length
        d_model: 512, the embedding dimension
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # xs: tuple of
        #     x: int32 tensor. (N, T1)
        #     x_seqlens: int32 tensor. (N,) sentence lengths
        #     sents1: str tensor. (N,)
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional encoding
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Blocks: encoder sub-modules.
        # num_blocks=6 is the number of blocks in the encoder;
        # each block is multihead_attention followed by feed_forward.
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
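# Standalone sketch: positional_encoding is defined elsewhere in these repos; assuming it
# follows the standard sinusoidal formulation from the paper, a minimal numpy version
# (for reference only, not the repo's helper) looks like this.
import numpy as np

def sinusoid_position_encoding(maxlen, d_model):
    pos = np.arange(maxlen)[:, None]                           # (maxlen, 1)
    i = np.arange(d_model)[None, :]                            # (1, d_model)
    angle = pos / np.power(10000.0, (2 * (i // 2)) / d_model)  # shared rate per dimension pair
    pe = np.zeros((maxlen, d_model), dtype=np.float32)
    pe[:, 0::2] = np.sin(angle[:, 0::2])                       # even dims: sine
    pe[:, 1::2] = np.cos(angle[:, 1::2])                       # odd dims: cosine
    return pe                                                  # (maxlen, d_model), broadcast over the batch

pe = sinusoid_position_encoding(maxlen=128, d_model=512)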
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        if self.hp.fac_embed:
            enc = tf.nn.embedding_lookup(self.embeddings1, x)  # (N, T1, d_embed)
            enc = tf.matmul(enc, self.embeddings2)  # (N, T1, d_model)
        else:
            enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            if self.hp.share_weights:
                vs_name = "blocks_shared"
            else:
                vs_name = "num_blocks_{}".format(i)
            with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
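# Standalone sketch: the hp.fac_embed branch above factorizes the embedding into a
# (vocab_size, d_embed) lookup table plus a (d_embed, d_model) projection, ALBERT-style.
# A quick parameter count with illustrative sizes (not taken from the repo's hyperparameters):
vocab_size, d_model, d_embed = 32000, 512, 128
full_table = vocab_size * d_model                      # 16,384,000 parameters
factorized = vocab_size * d_embed + d_embed * d_model  # 4,161,536 parameters
print(full_table, factorized)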
def encode(self, xs, training=True):
    # build the encoder
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
        N: batch size
        T1: sentence length
        d_model: embedding dimension
    '''
    # What this does:
    # (1) input embeddings + positional_encoding
    # (2) the encoder stacks 6 blocks, each consisting of multihead attention
    #     followed by the fully connected layer ff
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)  # add positional encodings to the input embeddings
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks: mask for the source, (N, T1) = (batch_size, time_steps)
        src_masks = tf.math.equal(x, 0)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale
        # enc += positional_encoding(enc, self.hp.maxlen1)
        # The positional encoding can only be added because it has the same dimension as the
        # word embeddings. For topic-to-essay this encoder arguably does not need positional
        # information; the encoder can skip it, but the decoder still needs it.

        # dropout on the embeddings as well
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention, because queries, keys and values are all 'enc'
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # causality decides whether future positions are masked;
                # False means the attention can see future tokens.

                # feed forward + residual connection
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
def aggregation(self, a_repre, b_repre):
    dim = a_repre.shape.as_list()[-1]
    with tf.variable_scope("aggregation", reuse=tf.AUTO_REUSE):
        # Blocks
        for i in range(self.hp.num_agg_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Vanilla attention
                a_repre = multihead_attention(queries=a_repre, keys=a_repre, values=a_repre,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=self.is_training, causality=False,
                                              scope="vanilla_attention")
                ### Feed Forward
                a_repre = ff(a_repre, num_units=[self.hp.d_ff, dim])
    return a_repre
def encoder_blocks(self, a_repre, reuse=tf.AUTO_REUSE):
    for i in range(self.hp.num_transformer):
        with tf.variable_scope("num_trans_blocks_{}".format(i), reuse=reuse):
            # self-attention
            a_repre = multihead_attention(queries=a_repre, keys=a_repre, values=a_repre,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=self.is_training, causality=False)
            # feed forward
            # a_repre = ff(a_repre, num_units=[self.hp.d_ff, self.hp.d_model])
            a_repre = ff(a_repre, num_units=[self.hp.d_ff, a_repre.shape.as_list()[-1]])
    return a_repre
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, video_path = xs

        # src_masks / embedding
        enc = tf.layers.dense(x, self.d_model)
        # src_masks = tf.math.equal(mask, 0)  # (N, T1)
        src_masks = tf.sequence_mask(seqlens)

        # enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        # enc *= self.hp.d_model**0.5  # scale
        enc /= self.hp.d_model**0.5
        enc += positional_encoding(enc, self.hp.n_video)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, src_masks
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs
        # x = tf.Print(x, [x], message='x =', summarize=10)
        # print_sent = tf.Print(sents1, [sents1], message='sents1 =', summarize=3)
        # with tf.control_dependencies([print_sent]):

        # embedding
        # xs_pri = tf.print('xs =', tf.shape(x), summarize=3)
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)
        # enc_pri = tf.print('enc =', tf.shape(enc), enc, summarize=3)

        ## Blocks
        # with tf.control_dependencies([xs_pri, enc_pri]):
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1
def _encode(self, x, seq_num, training=True, name=None):
    """
    Returns
    memory: encoder outputs. (N, T1, d_model)
    """
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # embedding
        x = tf.identity(x, "input_x")
        enc = tf.nn.embedding_lookup(self._embeddings[seq_num], x)  # (N, T1, d_model)
        enc *= self._context.d_model**0.5  # scale

        enc += positional_encoding(enc, self._context.maxlens[seq_num])
        enc = tf.layers.dropout(enc, self._context.dropout_rate, training=training)

        # Blocks
        for i in range(self._context.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc, keys=enc, values=enc,
                                          num_heads=self._context.num_heads,
                                          dropout_rate=self._context.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self._context.d_ff, self._context.d_model])
    memory = tf.identity(enc, name=name)
    return memory
def encode(self, encx, src_masks):
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # all_layer = []

        ## Blocks
        for i in range(self.hp.num_blocks_encoder):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                encx = multihead_attention(queries=encx, keys=encx, values=encx,
                                           key_masks=src_masks,
                                           num_heads=self.hp.num_heads,
                                           dropout_rate=self.hp.dropout_rate,
                                           training=self.is_training, causality=False)
                # feed forward
                encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])
                # all_layer.append(encx)
    return encx
def _unilateral_net(self, x, name, training=True):
    """
    :param x: (N, num_entities)
    :param name:
    :param training:
    :return:
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        # [batch_size, seq_length, embedding_size], [vocab_size, embedding_size]
        embedding = get_subword_embedding(self.embeddings, x)  # (N, num_entities, ff_model)
        embedding = tf.reduce_mean(embedding, axis=1, name="embedding")  # (N, ff_model)
        embedding = tf.layers.dropout(embedding, self.context.dropout_rate, training=training)
        embedding = ff(embedding, [self.context.d_ff, self.context.d_ff])
        final_embedding = tf.sigmoid(tf.layers.dense(embedding, self.context.d_model))
        final_embedding = tf.identity(final_embedding, name=name + "_embedding")  # (N, num_entities, d_model)
    return final_embedding
def decode(self, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys
        # decoder_inputs = tf.Print(decoder_inputs, [decoder_inputs],
        #                           message='decoder_inputs =', summarize=10)

        # embedding
        # ys_pri = tf.print('y =', tf.shape(y), summarize=3)
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)
        # dec = tf.Print(dec, [dec], message='dec =', summarize=10)
        # dec_pri = tf.print('dec =', tf.shape(dec), dec, summarize=3)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec, keys=memory, values=memory,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
        # dec = tf.Print(dec, [dec], message='dec_finally =', summarize=10)

        # Final linear projection (embedding weights are shared)
        # with tf.control_dependencies([ys_pri, dec_pri]):
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
def decode(self, ys, x_paraphrased_dict, memory, training=True):
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys
        x_paraphrased_dict, paraphrased_lens, paraphrased_sents = x_paraphrased_dict

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        batch_size = tf.shape(decoder_inputs)[0]
        seqlens = tf.shape(decoder_inputs)[1]
        paraphrased_lens = tf.shape(x_paraphrased_dict)[1]

        # x_paraphrased_dict: (N, W2, 2) pairs of (original word id, paraphrased word id)
        x_paraphrased_o, x_paraphrased_p = x_paraphrased_dict[:, :, 0], x_paraphrased_dict[:, :, 1]
        x_paraphrased_o_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_o)  # N, W2, d_model
        if self.hp.paraphrase_type == 0:
            x_paraphrased_p_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_p)
        else:
            x_paraphrased_p_embedding = paraphrased_positional_encoding(x_paraphrased_p, self.hp.maxlen2, self.hp.d_model)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec, keys=memory, values=memory,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # add paraphrased dictionary attention
        h = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(dec, axis=2)
        o_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_o_embedding, axis=1)

        W_a_o = tf.get_variable("original_word_parameter_w", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        V_a_o = tf.get_variable("original_word_parameter_v", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        h_o_concat = tf.concat([h, o_embeding], -1)  # N, T2, W2, 2*d_model
        score_tem_o = tf.tanh(W_a_o * h_o_concat)  # N, T2, W2, 2*d_model
        score_o = tf.reduce_sum(V_a_o * score_tem_o, axis=-1)  # N, T2, W2
        a = tf.nn.softmax(score_o)  # N, T2, W2
        c_o = tf.matmul(a, x_paraphrased_o_embedding)  # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

        p_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_p_embedding, axis=1)
        W_a_p = tf.get_variable("paraphrased_word_parameter_w", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        V_a_p = tf.get_variable("paraphrased_word_parameter_v", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        h_p_concat = tf.concat([h, p_embeding], -1)  # N, T2, W2, 2*d_model
        score_tem_p = tf.tanh(W_a_p * h_p_concat)  # N, T2, W2, 2*d_model
        score_p = tf.reduce_sum(V_a_p * score_tem_p, axis=-1)  # N, T2, W2
        a = tf.nn.softmax(score_p)  # N, T2, W2
        c_p = tf.matmul(a, x_paraphrased_p_embedding)  # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

        c_t = tf.concat([c_o, c_p], axis=-1)  # N, T2, d_model --> N, T2, 2*d_model
        out_dec = tf.layers.dense(tf.concat([dec, c_t], axis=-1), self.hp.d_model,
                                  activation=tf.tanh, use_bias=False,
                                  kernel_initializer=tf.initializers.random_normal(stddev=0.01, seed=None))

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', out_dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
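# Standalone sketch: shape walk-through (numpy) of the dictionary attention above --
# broadcast the decoder states against every dictionary entry, score with an additive
# tanh form, softmax over the dictionary axis, then take a weighted sum of the entry
# embeddings. All sizes below are illustrative assumptions.
import numpy as np

N, T2, W2, d = 2, 3, 4, 8
dec = np.random.randn(N, T2, d)
entries = np.random.randn(N, W2, d)                                  # dictionary-entry embeddings

h = np.broadcast_to(dec[:, :, None, :], (N, T2, W2, d))              # (N, T2, W2, d)
e = np.broadcast_to(entries[:, None, :, :], (N, T2, W2, d))          # (N, T2, W2, d)
W_a = np.random.randn(2 * d)
V_a = np.random.randn(2 * d)

score = (V_a * np.tanh(W_a * np.concatenate([h, e], -1))).sum(-1)    # (N, T2, W2)
a = np.exp(score) / np.exp(score).sum(-1, keepdims=True)             # softmax over the W2 entries
c = np.matmul(a, entries)                                            # (N, T2, d) context vector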
def decode(self, xs, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    self.memory = memory
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, sents2 = ys
        x, _, = xs

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        attn_dists = []
        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec, _ = multihead_attention(queries=dec, keys=dec, values=dec,
                                             num_heads=self.hp.num_heads,
                                             dropout_rate=self.hp.dropout_rate,
                                             training=training, causality=True,
                                             scope="self_attention")

                # Vanilla attention
                dec, attn_dist = multihead_attention(queries=dec, keys=self.memory, values=self.memory,
                                                     num_heads=self.hp.num_heads,
                                                     dropout_rate=self.hp.dropout_rate,
                                                     training=training, causality=False,
                                                     scope="vanilla_attention")
                attn_dists.append(attn_dist)
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    # Final linear projection (embedding weights are shared)
    weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        gens = tf.layers.dense(logits, 1, activation=tf.sigmoid, trainable=training, use_bias=False)

    logits = tf.nn.softmax(logits)

    # final distribution
    logits = self._calc_final_dist(x, gens, logits, attn_dists[-1])

    return logits, y, sents2
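# Standalone sketch: _calc_final_dist is defined elsewhere; in pointer-generator style
# decoders the final distribution typically mixes the vocabulary distribution with the
# copy (attention) distribution scattered onto the source token ids. The numpy sketch
# below is a hypothetical illustration of that mixing, not the repo's implementation.
import numpy as np

def final_dist_sketch(src_ids, p_gen, vocab_dist, attn_dist):
    # src_ids: (N, T1) int, p_gen: (N, T2, 1), vocab_dist: (N, T2, V), attn_dist: (N, T2, T1)
    mixed = p_gen * vocab_dist                                # generation part
    copy = (1.0 - p_gen) * attn_dist                          # copy part, over source positions
    N, T2, _ = vocab_dist.shape
    for n in range(N):
        for t in range(T2):
            np.add.at(mixed[n, t], src_ids[n], copy[n, t])    # scatter-add copy mass onto source ids
    return mixed                                              # (N, T2, V); rows sum to 1 if inputs are normalized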
def encode(self, xs, training=True, use_turn_embedding=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        self.x, self.turn_ids, sents1 = xs  # self.x shape: (batch_size, max_len1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, self.x)  # (N, T1, d_model)
        enc *= self.hp.d_model**0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1 + self.hp.maxlen2)
        batch_size = tf.shape(enc)[0]

        # TODO: add turn encoding; decide how turn_ids are passed in (carried inside xs)
        if use_turn_embedding:
            if self.turn_ids is None:
                raise ValueError("`turn_ids` must be specified if "
                                 "`use_turn_embedding` is True.")
            turn_cnt = tf.to_int32(tf.reduce_max(self.turn_ids))
            turn_ids_table = tf.get_variable(
                name="turn_embedding",
                dtype=tf.float32,
                shape=(20, self.hp.d_model),  # width, i.e. the embedding size
                initializer=tf.contrib.layers.xavier_initializer())
            flat_turn_ids = tf.reshape(self.turn_ids, [-1])  # (batch_size*seq_len)
            one_hot_ids = tf.one_hot(flat_turn_ids, depth=20)  # (batch_size*seq_len, turn_cnt)
            turn_embedding = tf.matmul(one_hot_ids, turn_ids_table)  # (batch_size*seq_len, embed_size)
            turn_embedding = tf.reshape(
                turn_embedding,
                [batch_size, self.hp.maxlen1 + self.hp.maxlen2, self.hp.d_model])
            enc += turn_embedding
        # TODO end

        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc, _ = multihead_attention(queries=enc, keys=enc, values=enc,
                                             num_heads=self.hp.num_heads,
                                             dropout_rate=self.hp.dropout_rate,
                                             training=training, causality=False)
                # feed forward
                enc_h = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                enc_u = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                # enc = enc_h/2 + enc_u/2
                # print(enc)
                # TODO: change to concatenation followed by another ff
                enc = tf.layers.dense(tf.concat([enc_h, enc_u], axis=-1),
                                      units=self.hp.d_model,
                                      activation=tf.sigmoid,
                                      trainable=training,
                                      use_bias=False)
    self.enc_output = enc
    self.enc_output_h = enc_h
    self.enc_output_u = enc_u
    return self.enc_output_h, self.enc_output_u, sents1
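# Standalone sketch: the turn-embedding lookup above goes through tf.one_hot + tf.matmul;
# a quick numpy check (illustrative sizes) that this equals plain row indexing into the table.
import numpy as np

num_turns, d_model = 20, 8
table = np.random.randn(num_turns, d_model).astype(np.float32)
flat_turn_ids = np.array([0, 3, 3, 7])

one_hot_ids = np.eye(num_turns, dtype=np.float32)[flat_turn_ids]  # (4, num_turns)
via_matmul = one_hot_ids @ table                                  # (4, d_model)
via_indexing = table[flat_turn_ids]                               # same rows
assert np.allclose(via_matmul, via_indexing)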
def decode(self, ys, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        if self.hp.fac_embed:
            dec = tf.nn.embedding_lookup(self.embeddings1, decoder_inputs)  # (N, T2, d_embed)
            dec = tf.matmul(dec, self.embeddings2)  # (N, T2, d_model)
        else:
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            if self.hp.share_weights:
                vs_name = "blocks_shared"
            else:
                vs_name = "num_blocks_{}".format(i)
            with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                          key_masks=tgt_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec, keys=memory, values=memory,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared in some situations)
        if self.hp.fac_embed:
            if self.hp.io_tie:
                # embedding_normalization -- 0: no normalization. 1: cal cosine-sim on 1 and 2.
                # 2: cal l2-norm-square on 2 and cosine-sim on 1. 3: cal l2-norm-square on 2 and dist on 1.
                if self.hp.embedding_normalization == 1:  # need to add the en == 2 situation
                    output_embeddings1 = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_embed], dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings1[1::], axis=-1)), 0))
                    logits = tf.einsum('ntd,dk->ntk', dec,
                                       tf.nn.l2_normalize(tf.transpose(self.embeddings2), axis=0))  # maybe use lstsq?
                    logits = tf.einsum('ntd,dk->ntk', logits, output_embeddings1)
                elif self.hp.embedding_normalization >= 2:
                    weights2 = self.embeddings2[1:, :]
                    weights2 = divide_norm_square_and_transpose(weights2)
                    weights2 = tf.concat((tf.zeros(shape=[self.hp.d_embed, 1], dtype=tf.float32), weights2), -1)
                    if self.hp.embedding_normalization == 2:
                        weights1 = tf.transpose(
                            tf.concat((tf.zeros(shape=[1, self.hp.d_embed], dtype=tf.float32),
                                       tf.nn.l2_normalize(self.embeddings1[1::], axis=-1)), 0))
                    else:
                        weights1 = tf.transpose(self.embeddings1)
                    logits = tf.einsum('ntd,dk->ntk', dec, weights2)
                    logits = tf.einsum('ntd,dk->ntk', logits, weights1)
                    if self.hp.embedding_normalization == 3:
                        ebias = get_half_squarenorm(self.embeddings1)
                        logits = tf.subtract(logits, ebias)
                else:
                    logits = tf.einsum('ntd,dk->ntk', dec, tf.transpose(self.embeddings2))
                    logits = tf.einsum('ntd,dk->ntk', logits, tf.transpose(self.embeddings1))
            else:
                with tf.variable_scope("output_embedding", reuse=tf.AUTO_REUSE):
                    logits = tf.layers.dense(dec, self.vocab_size)
        else:
            if self.hp.io_tie:
                if self.hp.embedding_normalization == 0 or self.hp.embedding_normalization == 3:
                    weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
                elif self.hp.embedding_normalization == 1:
                    weights = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_model], dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings[1:, :], axis=-1)), 0))
                elif self.hp.embedding_normalization == 2:
                    weights = self.embeddings[1:, :]
                    weights = divide_norm_square_and_transpose(weights)
                    weights = tf.concat((tf.zeros(shape=[self.hp.d_model, 1], dtype=tf.float32), weights), -1)
                logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
                if self.hp.embedding_normalization == 2:
                    # bias = tf.ones(shape=[logits.shape[-1]], dtype=tf.float32)
                    pass
                    # with tf.variable_scope("gauss", reuse=tf.AUTO_REUSE):
                    #     bias = tf.constant(1.0)
                    #     logits = tf.subtract(logits, bias)
                    #     logits = tf.square(logits)
                    #     logits = tf.negative(logits)
                    #     logits = gaussian_activation(logits)
                    #     logits = tf.exp(logits)
                if self.hp.embedding_normalization == 3:
                    ebias = get_half_squarenorm(self.embeddings)
                    logits = tf.subtract(logits, ebias)
            else:
                with tf.variable_scope("output_embedding", reuse=tf.AUTO_REUSE):
                    logits = tf.layers.dense(dec, self.vocab_size)

        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
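# Standalone sketch: for the non-factorized io_tie branch with embedding_normalization == 1
# above, l2-normalizing the (non-pad) embedding rows makes each logit the decoder state's
# dot product with a unit-norm embedding, i.e. cosine similarity scaled by the state's norm.
# Numpy check with illustrative sizes (the zero padding row is ignored here):
import numpy as np

V, d = 6, 4
emb = np.random.randn(V, d)
dec = np.random.randn(2, 3, d)

emb_n = emb / np.linalg.norm(emb, axis=-1, keepdims=True)
logits = np.einsum('ntd,dk->ntk', dec, emb_n.T)                      # (2, 3, V)
cos = np.einsum('ntd,kd->ntk', dec, emb) / (
    np.linalg.norm(dec, axis=-1)[..., None] * np.linalg.norm(emb, axis=-1)[None, None, :])
assert np.allclose(logits / np.linalg.norm(dec, axis=-1)[..., None], cos)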
def encode_decode(self, xs, ys, training=True):
    x, seqlens = xs
    decoder_inputs, y, seqlens = ys

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc += positional_encoding(enc, self.hp.maxlen1, self.hp)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        dec = tf.reduce_sum(tf.nn.embedding_lookup(self.embeddings, decoder_inputs),
                            reduction_indices=2)  # (N, T1, d_model)
        # test_dec = dec
        dec = dec * self.hp.d_model**0.5  # scale
        # The subgraph structure also needs its own positional encoding, since it has to
        # line up with the predicted output structure.
        dec += positional_encoding(dec, self.hp.maxlen2, self.hp)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

    ## Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # cross-attention: decoder states attend to the encoder states
                enc = multihead_attention(queries=dec, keys=enc, values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,  # whether to mask future positions
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec, keys=enc, values=enc,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    if self.hp.type == 'attribute':
        enc = tf.reduce_sum(enc, reduction_indices=1)
        dec = tf.reduce_sum(dec, reduction_indices=1)
        logits = tf.layers.dense(inputs=tf.concat([enc, dec], axis=-1), units=1,
                                 activation=tf.nn.relu)
    else:
        logits = tf.einsum('ntd,nkd->ntk', dec, enc)  # (N, T2, T2)
        # force the final result to be a symmetric matrix
        logits = (logits + tf.transpose(logits, [0, 2, 1])) / 2

    return logits, y, decoder_inputs
def decode(self, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model**0.5  # scale

        input_shape = modeling_bert.get_shape_list(dec, expected_rank=3)
        seq_length = input_shape[1]
        width = input_shape[2]
        position_embeddings = tf.slice(self.full_position_embeddings, [0, 0], [seq_length, -1])
        num_dims = len(dec.shape.as_list())
        position_broadcast_shape = []
        for _ in range(num_dims - 2):
            position_broadcast_shape.append(1)
        position_broadcast_shape.extend([seq_length, width])
        position_embeddings = tf.reshape(position_embeddings, position_broadcast_shape)
        dec += position_embeddings

        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec, keys=dec, values=dec,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec, keys=memory, values=memory,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training, causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
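# Standalone sketch: the decoder above adds learned (BERT-style) position embeddings by
# slicing the first seq_length rows of full_position_embeddings and broadcasting them over
# the batch. Numpy equivalent with illustrative sizes:
import numpy as np

max_position, d_model, N, T2 = 512, 8, 2, 5
full_position_embeddings = np.random.randn(max_position, d_model)
dec = np.random.randn(N, T2, d_model)

position_embeddings = full_position_embeddings[:T2]   # (T2, d_model), first seq_length rows
dec = dec + position_embeddings[None, :, :]           # broadcast to (N, T2, d_model)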
def decode(self, ys, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)

    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding1
        dec1 = tf.nn.embedding_lookup(self.embeddings1, decoder_inputs)  # (N, T2, d_model)
        dec1 *= self.hp.d_model**0.5  # scale
        dec1 += positional_encoding(dec1, self.hp.maxlen2)
        dec1 = tf.layers.dropout(dec1, self.hp.dropout_rate, training=training)

        # embedding2
        dec2 = tf.nn.embedding_lookup(self.embeddings2, decoder_inputs)  # (N, T2, d_model)
        dec2 *= self.hp.d_model**0.5  # scale
        dec2 += positional_encoding(dec2, self.hp.maxlen2)
        dec2 = tf.layers.dropout(dec2, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec1, dec2 = multihead_attention(queries=(dec1, dec2), keys=dec1, values=dec2,
                                                 key_masks=tgt_masks,
                                                 num_heads=self.hp.num_heads,
                                                 dropout_rate=self.hp.dropout_rate,
                                                 training=training, causality=True,
                                                 scope="self_attention")

                # Vanilla attention
                dec1, dec2 = multihead_attention(queries=(dec1, dec2),
                                                 keys=memory[0], values=memory[1],
                                                 key_masks=src_masks,
                                                 num_heads=self.hp.num_heads,
                                                 dropout_rate=self.hp.dropout_rate,
                                                 training=training, causality=False,
                                                 memory=True,
                                                 scope="vanilla_attention")
                ### Feed Forward
                dec1 = ff(dec1, num_units=[self.hp.d_ff, self.hp.d_model])
                dec2 = ff(dec2, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(tf.concat([self.embeddings1, self.embeddings2], axis=-1))  # (2 * d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', tf.concat([dec1, dec2], axis=-1), weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2