Example #1
    def inter_blocks(self,
                     a_repre,
                     b_repre,
                     scope,
                     training=True,
                     reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # cross-attention: queries from b_repre, keys/values from a_repre
            encx = multihead_attention(queries=b_repre,
                                       keys=a_repre,
                                       values=a_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=training,
                                       causality=False)
            # feed forward
            encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

            # cross-attention: queries from a_repre, keys/values from b_repre
            ency = multihead_attention(queries=a_repre,
                                       keys=b_repre,
                                       values=b_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=training,
                                       causality=False)
            # feed forward
            ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

        return encx, ency
Example #2
    def cross_attention(self, a_repre, b_repre, scope, reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # cross-attention: queries from b_repre, keys/values from a_repre
            encx = multihead_attention(queries=b_repre,
                                       keys=a_repre,
                                       values=a_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)
            # feed forward
            encx = ff(encx, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

            # cross-attention: queries from a_repre, keys/values from b_repre
            ency = multihead_attention(queries=a_repre,
                                       keys=b_repre,
                                       values=b_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)

            encx, ency = self._infer(encx, ency)

            # feed forward
            ency = ff(ency, num_units=[self.hp.d_ff, encx.shape.as_list()[-1]])

        return encx, ency
Example #3
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding1
            enc1 = tf.nn.embedding_lookup(self.embeddings1,
                                          x)  # (N, T1, d_model)
            enc1 *= self.hp.d_model**0.5  # scale
            enc1 += positional_encoding(enc1, self.hp.maxlen1)
            enc1 = tf.layers.dropout(enc1,
                                     self.hp.dropout_rate,
                                     training=training)

            # embedding2
            enc2 = tf.nn.embedding_lookup(self.embeddings2,
                                          x)  # (N, T1, d_model)
            enc2 *= self.hp.d_model**0.5  # scale
            enc2 += positional_encoding(enc2, self.hp.maxlen1)
            enc2 = tf.layers.dropout(enc2,
                                     self.hp.dropout_rate,
                                     training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc1, enc2 = multihead_attention(
                        queries=(enc1, enc2),
                        keys=enc1,
                        values=enc2,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc1 = ff(enc1, num_units=[self.hp.d_ff, self.hp.d_model])
                    enc2 = ff(enc2, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = (enc1, enc2)
        return memory, sents1, src_masks
Example #4
    def decode(self, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(
            self.embeddings
        )  # (d_model, vocab_size); the embeddings can be reused here because, per Section 3.4 of the paper, the input and output share the embedding matrix
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits,
                                      axis=-1))  # (N, T2); note tf.argmax vs. the deprecated tf.arg_max

        return logits, y_hat, y, sents2
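A quick aside on the final projection in this example: since the output weights are tied to the embedding table (Section 3.4 of the Transformer paper), tf.einsum('ntd,dk->ntk', dec, weights) is nothing more than a batched matrix product of the decoder output with the transposed embeddings. A minimal standalone sketch with toy shapes (not code from any of these projects), which also shows the equivalent tf.tensordot form:

import tensorflow as tf

# Toy shapes for illustration only; real values come from each project's hyper-parameters.
N, T2, d_model, vocab_size = 2, 5, 8, 11
dec = tf.random.normal([N, T2, d_model])              # pretend decoder output
embeddings = tf.random.normal([vocab_size, d_model])  # pretend tied embedding table

weights = tf.transpose(embeddings)                    # (d_model, vocab_size)
logits_einsum = tf.einsum('ntd,dk->ntk', dec, weights)
logits_tensordot = tf.tensordot(dec, weights, axes=[[2], [0]])  # same (N, T2, vocab_size) result
y_hat = tf.cast(tf.argmax(logits_einsum, axis=-1), tf.int32)    # (N, T2)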
Example #5
    def time_encode(self, encoder_inputs):
        '''
        Returns
        memory: encoder outputs. (BATCH, SEQ_LEN, HIDDEN_SIZE)
        '''
        with tf.variable_scope("time_encoder", reuse=tf.AUTO_REUSE):

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings, encoder_inputs)
            enc *= hp.HIDDEN_SIZE**0.5

            enc += positional_encoding(enc, hp.MAX_LEN)
            enc = tf.nn.dropout(enc, self.dropout)

            # Blocks
            for i in range(hp.NUM_BLOCKS):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              num_heads=hp.NUM_HEADS,
                                              dropout=self.dropout,
                                              causality=True)
                    # feed forward
                    enc = ff(enc, num_units=[hp.FF_SIZE, hp.HIDDEN_SIZE])

        output = tf.reshape(enc, (-1, hp.MAX_LEN, hp.HIDDEN_SIZE))
        logits = tf.layers.dense(output, len(self.token2idx))
        return logits
Example #6
def transformer_encode(enc, config, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE):

        # embedding
        enc *= config.d_model**0.5  # scale

        enc += positional_encoding(enc, config.max_sent_num)
        enc = tf.layers.dropout(enc, config.drop_rate, training=training)

        ## Blocks
        for i in range(config.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          num_heads=config.num_heads,
                                          dropout_rate=config.drop_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[config.d_ff, config.d_model])
    memory = enc
    return memory
Example #7
    def decode(self, decoder_inputs, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        dec: decoder outputs. (N, T2, d_model)
        outputs: intermediate outputs collected after each scope
        scopes: the corresponding variable scope names
        '''
        scopes = []
        outputs = []
        with tf.variable_scope("decoder_embedding_lookup",
                               reuse=tf.AUTO_REUSE):
            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(dec)
            # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("decoder_num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # Masked self-attention (Note that causality is True at this time)
                dec = multihead_attention(queries=dec,
                                          keys=dec,
                                          values=dec,
                                          key_masks=tgt_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=True,
                                          scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(queries=dec,
                                          keys=memory,
                                          values=memory,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False,
                                          scope="vanilla_attention")
                ### Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
                scopes.append(tf.get_variable_scope().name)
                outputs.append(dec)

        return dec, outputs, scopes
Example #8
    def encode(self, xs, training=True):
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings, x) # (N, T1, d_model)
            enc *= self.hp.d_model**0.5 # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(queries=enc,
                                              keys=enc,
                                              values=enc,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1
Example #9
    def inter_blocks(self, a_repre, b_repre, x_layer, y_layer, layer_num, scope, reuse=tf.AUTO_REUSE):
        with tf.variable_scope(scope, reuse=reuse):
            # encx, ency = inter_multihead_attention(queries=b_repre,
            #                            keys=a_repre,
            #                            values=a_repre,
            #                            num_heads=self.hp.num_heads,
            #                            dropout_rate=self.hp.dropout_rate,
            #                            training=self.is_training,
            #                            causality=False)

            # cross-attention: queries from b_repre, keys/values from a_repre
            encx = multihead_attention(queries=b_repre,
                                       keys=a_repre,
                                       values=a_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)
            # feed forward
            encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

            # cross-attention: queries from a_repre, keys/values from b_repre
            ency = multihead_attention(queries=a_repre,
                                       keys=b_repre,
                                       values=b_repre,
                                       num_heads=self.hp.num_heads,
                                       dropout_rate=self.hp.dropout_rate,
                                       training=self.is_training,
                                       causality=False)
            # feed forward
            ency = ff(ency, num_units=[self.hp.d_ff, self.hp.d_model])

            #encx, ency, ae_loss = self._dense_infer(encx, ency, x_layer, y_layer, layer_num)

            #encx, ency = self._infer(encx, ency)

        return encx, ency
Example #10
    def encode(self, x, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        scopes = []
        outputs = []
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            self.token2idx, self.idx2token = load_vocab(self.hp.vocab)
            self.embeddings = get_token_embeddings(self.hp.vocab_size,
                                                   self.hp.d_model,
                                                   zero_pad=True)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(self.embeddings)
        with tf.variable_scope("encoder_embedding_lookup",
                               reuse=tf.AUTO_REUSE):
            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            scopes.append(tf.get_variable_scope().name)
            outputs.append(enc)
            ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("encoder_num_blocks_{}".format(i),
                                   reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(queries=enc,
                                          keys=enc,
                                          values=enc,
                                          key_masks=src_masks,
                                          num_heads=self.hp.num_heads,
                                          dropout_rate=self.hp.dropout_rate,
                                          training=training,
                                          causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                scopes.append(tf.get_variable_scope().name)
                outputs.append(enc)
        memory = enc
        return memory, src_masks, outputs, scopes
Example #11
    def __init__(self, att_unit, value_attr, num_heads, model_structure, d_ff,
                 d_model, drop_rate):
        super(EncoderLayer, self).__init__()
        self.mha = MultiHeadAttention(att_unit,
                                      value_attr,
                                      num_heads,
                                      model_structure,
                                      causality=False)
        self.ffn = ff(num_units=[d_ff, d_model])

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)
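Example #11 shows only the constructor of this Keras-style EncoderLayer; its call method is not included. The sketch below is an assumption about how such a layer is usually wired (post-norm residual connections around the attention and feed-forward sub-layers), using Keras built-ins as stand-ins for the project's custom MultiHeadAttention and ff helpers, whose signatures are not shown:

import tensorflow as tf

class SketchEncoderLayer(tf.keras.layers.Layer):
    # Illustrative stand-in only, not the project's EncoderLayer.
    def __init__(self, d_model, num_heads, d_ff, drop_rate):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                                      key_dim=d_model // num_heads)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(d_ff, activation='relu'),
            tf.keras.layers.Dense(d_model),
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(drop_rate)
        self.dropout2 = tf.keras.layers.Dropout(drop_rate)

    def call(self, x, training=False):
        attn_out = self.mha(query=x, value=x, key=x)           # self-attention sub-layer
        attn_out = self.dropout1(attn_out, training=training)
        out1 = self.layernorm1(x + attn_out)                   # residual + layer norm
        ffn_out = self.ffn(out1)                               # position-wise feed forward
        ffn_out = self.dropout2(ffn_out, training=training)
        return self.layernorm2(out1 + ffn_out)                 # residual + layer norm

# e.g. SketchEncoderLayer(d_model=64, num_heads=4, d_ff=256, drop_rate=0.1)(tf.random.normal([2, 5, 64]))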
Example #12
    def encode(self, xs, training=True):
        '''
        xs: training data
        Returns
        memory: encoder outputs. (N, T1, d_model)
                                N: batch size;
                                T1: sentence length
                                d_model: 512, the word-embedding dimension
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # xs: tuple of
            #               x: int32 tensor. (N, T1)
            #               x_seqlens: int32 tensor. (N,)  sentence lengths
            #               sents1: str tensor. (N,)
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional-encoding vectors
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            # Encoder blocks
            # num_blocks=6: number of sub-blocks in the encoder; each sub-block is multihead_attention + feed_forward
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
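The comments in this example note that the positional encoding has the same d_model dimension as the word embeddings so the two can be added. The positional_encoding helper itself is not reproduced in these snippets; the sketch below is the standard sinusoidal formulation from Section 3.5 of "Attention Is All You Need", which is presumably what the helper computes (an assumption, not this repository's code):

import numpy as np

def sinusoid_position_encoding(maxlen, d_model):
    # Fixed sinusoidal positional encoding, shape (maxlen, d_model).
    pos = np.arange(maxlen)[:, None]                        # (maxlen, 1)
    i = np.arange(d_model)[None, :]                         # (1, d_model)
    angles = pos / np.power(10000.0, (2 * (i // 2)) / d_model)
    pe = np.zeros((maxlen, d_model), dtype=np.float32)
    pe[:, 0::2] = np.sin(angles[:, 0::2])                   # even dimensions
    pe[:, 1::2] = np.cos(angles[:, 1::2])                   # odd dimensions
    return pe

# Usage sketch: enc += sinusoid_position_encoding(maxlen1, d_model)[None, :T1, :]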
Example #13
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            if self.hp.fac_embed:
                enc = tf.nn.embedding_lookup(self.embeddings1,
                                             x)  # (N, T1, d_embed)
                enc = tf.matmul(enc, self.embeddings2)  # (N, T1, d_model)
            else:
                enc = tf.nn.embedding_lookup(self.embeddings,
                                             x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                if self.hp.share_weights:
                    vs_name = "blocks_shared"
                else:
                    vs_name = "num_blocks_{}".format(i)
                with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
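The fac_embed branch above factorizes the token embedding into a (vocab_size, d_embed) lookup followed by a (d_embed, d_model) projection, in the spirit of ALBERT's factorized embedding parameterization. A rough parameter count with assumed toy sizes (not this project's hyper-parameters) shows why that helps once d_embed is much smaller than d_model:

# Assumed toy sizes, for illustration only.
vocab_size, d_embed, d_model = 32000, 128, 512

factorized_params = vocab_size * d_embed + d_embed * d_model  # 4,161,536
full_params = vocab_size * d_model                            # 16,384,000
print(factorized_params, full_params)                         # roughly a 4x reduction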
Example #14
    def encode(self, xs, training=True):  # implements the encoder
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        # N: batch_size
        # T1: sentence length
        # d_model: word-embedding dimension
        '''
        # What this method does:
        # (1) input word embeddings + positional_encoding
        # (2) the encoder stacks num_blocks=6 blocks, each consisting of multihead_attention followed by the feed-forward layer ff
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional vectors to the initial word embeddings
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
Example #15
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs

            # src_masks
            src_masks = tf.math.equal(x, 0)  # (N, T1) = (batch_size, time_steps)
            # mask over the source sequence
            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            # enc += positional_encoding(enc, self.hp.maxlen1)
            # The positional encoding can only be added because its dimension matches the word-embedding dimension.
            # For topic-to-essay generation, positional vectors seem unnecessary here:
            # the encoder does not need them, but the decoder does.

            # dropout is also applied to the word embeddings
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention, because queries, keys and values are all 'enc'
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # causality decides whether future tokens are masked; with False, attention can see future tokens
                    # feed forward + residual connection
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1, src_masks
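The last comment above points out that causality controls whether future tokens are masked. None of these snippets show the inside of multihead_attention, so the following is only a sketch of how such a causal (look-ahead) mask is commonly built and applied to the attention logits before the softmax, with toy shapes:

import tensorflow as tf

def causal_mask(length):
    # Lower-triangular matrix: 1.0 where a query may attend (key index <= query index), 0.0 otherwise.
    return tf.linalg.band_part(tf.ones([length, length]), -1, 0)

scores = tf.random.normal([2, 4, 4])                  # toy attention logits, (batch, T_q, T_k)
mask = causal_mask(4)                                 # (T_q, T_k), broadcasts over the batch axis
masked_scores = scores + (1.0 - mask) * -1e9          # push masked positions toward -inf
attn_weights = tf.nn.softmax(masked_scores, axis=-1)  # ~0 probability on future positions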
Example #16
 def aggregation(self, a_repre, b_repre):
     dim = a_repre.shape.as_list()[-1]
     with tf.variable_scope("aggregation", reuse=tf.AUTO_REUSE):
         # Blocks
         for i in range(self.hp.num_agg_blocks):
             with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                 # Vanilla attention
                 a_repre = multihead_attention(queries=a_repre,
                                               keys=a_repre,
                                               values=a_repre,
                                               num_heads=self.hp.num_heads,
                                               dropout_rate=self.hp.dropout_rate,
                                               training=self.is_training,
                                               causality=False,
                                               scope="vanilla_attention")
                 ### Feed Forward
                 a_repre = ff(a_repre, num_units=[self.hp.d_ff, dim])
     return a_repre
Example #17
 def encoder_blocks(self, a_repre, reuse=tf.AUTO_REUSE):
     for i in range(self.hp.num_transformer):
         with tf.variable_scope("num_trans_blocks_{}".format(i),
                                reuse=reuse):
             # self-attention
             a_repre = multihead_attention(
                 queries=a_repre,
                 keys=a_repre,
                 values=a_repre,
                 num_heads=self.hp.num_heads,
                 dropout_rate=self.hp.dropout_rate,
                 training=self.is_training,
                 causality=False)
             # feed forward
             #a_repre = ff(a_repre, num_units=[self.hp.d_ff, self.hp.d_model])
             a_repre = ff(
                 a_repre,
                 num_units=[self.hp.d_ff,
                            a_repre.shape.as_list()[-1]])
     return a_repre
Example #18
File: model.py  Project: KangSooHan/GAI
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, video_path = xs

            # src_masks

            # embedding
            enc = tf.layers.dense(x, self.d_model)
            #src_masks = tf.math.equal(mask, 0) # (N, T1)
            src_masks = tf.sequence_mask(seqlens)

            #enc = tf.nn.embedding_lookup(self.embeddings, x) # (N, T1, d_model)
            #enc *= self.hp.d_model**0.5 # scale

            enc /= self.hp.d_model**0.5

            enc += positional_encoding(enc, self.hp.n_video)

            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                    )
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, src_masks
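Note that this example derives src_masks from tf.sequence_mask(seqlens), while most of the other examples use tf.math.equal(x, 0). The two conventions have opposite polarity, so whichever multihead_attention variant consumes the mask has to agree on which one it expects. A toy comparison (not code from any of these projects):

import tensorflow as tf

x = tf.constant([[5, 7, 0, 0]])                 # one sentence with two PAD (id 0) tokens
seqlens = tf.constant([2])

pad_mask = tf.math.equal(x, 0)                  # [[False, False,  True,  True]] -> True on padding
valid_mask = tf.sequence_mask(seqlens, 4)       # [[ True,  True, False, False]] -> True on real tokens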
Example #19
    def encode(self, xs, training=True):
        '''
        Returns
        memory: encoder outputs. (N, T1, d_model)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x, seqlens, sents1 = xs
            # x = tf.Print(x, [x], message='x =', summarize=10)
            # print_sent = tf.Print(sents1, [sents1], message='sents1 =', summarize=3)
            # with tf.control_dependencies([print_sent]):
            # embedding
            # xs_pri = tf.print('xs =', tf.shape(x), summarize=3)
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            # enc_pri = tf.print('enc =', tf.shape(enc), enc, summarize=3)
            ## Blocks
            # with tf.control_dependencies([xs_pri, enc_pri]):

            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, sents1
Example #20
    def _encode(self, x, seq_num, training=True, name=None):
        """
        Returns
        memory: encoder outputs. (N, T1, d_model)
        """
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # embedding
            x = tf.identity(x, "input_x")
            enc = tf.nn.embedding_lookup(self._embeddings[seq_num],
                                         x)  # (N, T1, d_model)
            enc *= self._context.d_model**0.5  # scale

            enc += positional_encoding(enc, self._context.maxlens[seq_num])
            enc = tf.layers.dropout(enc,
                                    self._context.dropout_rate,
                                    training=training)

            # # Blocks
            for i in range(self._context.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self._context.num_heads,
                        dropout_rate=self._context.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(
                        enc,
                        num_units=[self._context.d_ff, self._context.d_model])
        memory = tf.identity(enc, name=name)
        return memory
Example #21
    def encode(self, encx, src_masks):

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # all_layer = []
            ## Blocks
            for i in range(self.hp.num_blocks_encoder):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    encx = multihead_attention(
                        queries=encx,
                        keys=encx,
                        values=encx,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=self.is_training,
                        causality=False)
                    # feed forward
                    encx = ff(encx, num_units=[self.hp.d_ff, self.hp.d_model])

                    # all_layer.append(encx)

        return encx
Example #22
    def _unilateral_net(self, x, name, training=True):
        """
        :param x: (N, num_entities)
        :param name:
        :param training:
        :return:
        """
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # [batch_size, seq_length, embedding_size], [vocab_size, embedding_size]
            embedding = get_subword_embedding(self.embeddings,
                                              x)  # (N, num_entities, ff_model)
            embedding = tf.reduce_mean(embedding, axis=1,
                                       name="embedding")  # (N, ff_model)

            embedding = tf.layers.dropout(embedding,
                                          self.context.dropout_rate,
                                          training=training)
            embedding = ff(embedding, [self.context.d_ff, self.context.d_ff])
            final_embedding = tf.sigmoid(
                tf.layers.dense(embedding, self.context.d_model))
            final_embedding = tf.identity(
                final_embedding,
                name=name + "_embedding")  # (N, num_entities, d_model)
        return final_embedding
Example #23
    def decode(self, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys
            # decoder_inputs = tf.Print(decoder_inputs, [decoder_inputs],
            # message='decoder_inputs =', summarize=10)
            # embedding
            # ys_pri = tf.print('y =', tf.shape(y), summarize=3)
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)
            # dec = tf.Print(dec, [dec], message='dec =', summarize=10)
            # dec_pri = tf.print('dec =', tf.shape(dec), dec, summarize=3)
            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # dec = tf.Print(dec, [dec], message='dec_finally =', summarize=10)
        # Final linear projection (embedding weights are shared)
        # with tf.control_dependencies([ys_pri, dec_pri]):
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
Example #24
    def decode(self, ys, x_paraphrased_dict, memory, training=True):
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys
            x_paraphrased_dict, paraphrased_lens, paraphrased_sents = x_paraphrased_dict
            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            batch_size = tf.shape(decoder_inputs)[0]  # N
            seqlens = tf.shape(decoder_inputs)[1]  # T2
            paraphrased_lens = tf.shape(x_paraphrased_dict)[1]  # W2

            x_paraphrased_o, x_paraphrased_p = x_paraphrased_dict[:,:,0], x_paraphrased_dict[:,:,1]

            x_paraphrased_o_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_o)  # N, W2, d_model
            if self.hp.paraphrase_type == 0:
                x_paraphrased_p_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_p)
            else:
                x_paraphrased_p_embedding = paraphrased_positional_encoding(x_paraphrased_p, self.hp.maxlen2, self.hp.d_model)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(queries=dec,
                                              keys=dec,
                                              values=dec,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=True,
                                              scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(queries=dec,
                                              keys=memory,
                                              values=memory,
                                              num_heads=self.hp.num_heads,
                                              dropout_rate=self.hp.dropout_rate,
                                              training=training,
                                              causality=False,
                                              scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

            # add paraphrased dictionary attention
            h = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(dec, axis=2)

            o_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_o_embedding, axis=1)
            W_a_o = tf.get_variable("original_word_parameter_w", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            V_a_o = tf.get_variable("original_word_parameter_v", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            h_o_concat = tf.concat([h, o_embeding], -1) # N, T2, W2, 2*d_model
            score_tem_o = tf.tanh(W_a_o * h_o_concat) # N, T2, W2, 2*d_model
            score_o = tf.reduce_sum(V_a_o * score_tem_o, axis=-1) # N, T2, W2
            a = tf.nn.softmax(score_o) # N, T2, W2
            c_o = tf.matmul(a, x_paraphrased_o_embedding) # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

            p_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * tf.expand_dims(x_paraphrased_p_embedding, axis=1)
            W_a_p = tf.get_variable("paraphrased_word_parameter_w", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            V_a_p = tf.get_variable("paraphrased_word_parameter_v", [2*self.hp.d_model],
                                    initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
            h_p_concat = tf.concat([h, p_embeding], -1) # N, T2, W2, 2*d_model
            score_tem_p = tf.tanh(W_a_p * h_p_concat) # N, T2, W2, 2*d_model
            score_p = tf.reduce_sum(V_a_p * score_tem_p, axis=-1) # N, T2, W2
            a = tf.nn.softmax(score_p) # N, T2, W2
            c_p = tf.matmul(a, x_paraphrased_p_embedding) # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

            c_t = tf.concat([c_o, c_p], axis=-1) # N, T2, d_model --> N, T2, 2*d_model
            out_dec = tf.layers.dense(tf.concat([dec, c_t], axis=-1), self.hp.d_model, activation=tf.tanh, use_bias=False,
                                      kernel_initializer=tf.initializers.random_normal(stddev=0.01, seed=None))

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', out_dec, weights) # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
Example #25
    def decode(self, xs, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: final output distribution. (N, T2, V). float32.
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        self.memory = memory
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, sents2 = ys
            x, _ = xs

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

            attn_dists = []
            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec, _ = multihead_attention(queries=dec,
                                                 keys=dec,
                                                 values=dec,
                                                 num_heads=self.hp.num_heads,
                                                 dropout_rate=self.hp.dropout_rate,
                                                 training=training,
                                                 causality=True,
                                                 scope="self_attention")

                    # Vanilla attention
                    dec, attn_dist = multihead_attention(queries=dec,
                                                          keys=self.memory,
                                                          values=self.memory,
                                                          num_heads=self.hp.num_heads,
                                                          dropout_rate=self.hp.dropout_rate,
                                                          training=training,
                                                          causality=False,
                                                          scope="vanilla_attention")
                    attn_dists.append(attn_dist)
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings) # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights) # (N, T2, vocab_size)

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            gens = tf.layers.dense(logits, 1, activation=tf.sigmoid, trainable=training, use_bias=False)

        logits = tf.nn.softmax(logits)

        # final distribution
        logits = self._calc_final_dist(x, gens, logits, attn_dists[-1])

        return logits, y, sents2
Example #26
    def encode(self, xs, training=True, use_turn_embedding=True):
        '''
        Returns
        enc_output_h, enc_output_u: encoder outputs. (N, T1, d_model)
        sents1: (N,). string.
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):

            self.x, self.turn_ids, sents1 = xs
            # self.x shape:(batch_size,max_len1)
            # embedding
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         self.x)  # (N, T1, d_model)
            enc *= self.hp.d_model**0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1 + self.hp.maxlen2)
            batch_size = tf.shape(enc)[0]
            # TODO: add turn encoding; define how turn_ids are passed in (they are carried inside xs)

            if use_turn_embedding:
                if self.turn_ids is None:
                    raise ValueError("`turn_ids` must be specified if"
                                     "`use_turn_embedding` is True.")
                turn_cnt = tf.to_int32(tf.reduce_max(self.turn_ids))
                turn_ids_table = tf.get_variable(
                    name="turn_embedding",
                    dtype=tf.float32,
                    shape=(20, self.hp.d_model),  # width, i.e. the embedding size
                    initializer=tf.contrib.layers.xavier_initializer())

                flat_turn_ids = tf.reshape(self.turn_ids,
                                           [-1])  # (batch_size*seq_len)
                one_hot_ids = tf.one_hot(
                    flat_turn_ids, depth=20)  # (batch_size*seq_len,turn_cnt)
                turn_embedding = tf.matmul(
                    one_hot_ids,
                    turn_ids_table)  # (batch_size*seq_len,embed_size)
                turn_embedding = tf.reshape(turn_embedding, [
                    batch_size, self.hp.maxlen1 + self.hp.maxlen2,
                    self.hp.d_model
                ])
                enc += turn_embedding
            # TODO end
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
            ## Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc, _ = multihead_attention(
                        queries=enc,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc_h = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                    enc_u = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
                    # enc = enc_h/2 + enc_u/2
                    # print(enc)
                    # TODO: change this to concatenation followed by another ff
                    enc = tf.layers.dense(tf.concat([enc_h, enc_u], axis=-1),
                                          units=self.hp.d_model,
                                          activation=tf.sigmoid,
                                          trainable=training,
                                          use_bias=False)
        self.enc_output = enc
        self.enc_output_h = enc_h
        self.enc_output_u = enc_u
        return self.enc_output_h, self.enc_output_u, sents1
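A side note on the turn-embedding block above: the one_hot + matmul pattern is numerically equivalent to a plain tf.nn.embedding_lookup on the same table (the matmul form is sometimes preferred for TPU friendliness). A toy check, not code from this project:

import tensorflow as tf

turn_table = tf.random.normal([20, 8])                        # (num_turn_buckets, d_model)
flat_turn_ids = tf.constant([0, 3, 3, 7])

via_matmul = tf.matmul(tf.one_hot(flat_turn_ids, depth=20), turn_table)
via_lookup = tf.nn.embedding_lookup(turn_table, flat_turn_ids)
# Both results are (4, 8) and contain the same rows of the table.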
Example #27
    def decode(self, ys, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding

            if self.hp.fac_embed:
                dec = tf.nn.embedding_lookup(
                    self.embeddings1, decoder_inputs)  # (N, T2, d_embed)
                dec = tf.matmul(dec, self.embeddings2)  # (N, T2, d_model)
            else:
                dec = tf.nn.embedding_lookup(
                    self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                if self.hp.share_weights:
                    vs_name = "blocks_shared"
                else:
                    vs_name = "num_blocks_{}".format(i)
                with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        key_masks=tgt_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared in some situation)
        if self.hp.fac_embed:
            if self.hp.io_tie:  # 0: no normalization. 1: compute cosine-sim on embeddings 1 and 2. 2: compute l2-norm-square on 2 and cosine-sim on 1. 3: compute l2-norm-square on 2 and dist on 1.
                if self.hp.embedding_normalization == 1:  # the embedding_normalization == 2 case still needs to be added here
                    output_embeddings1 = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_embed],
                                            dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings1[1::],
                                                      axis=-1)), 0))
                    logits = tf.einsum('ntd,dk->ntk', dec,
                                       tf.nn.l2_normalize(
                                           tf.transpose(self.embeddings2),
                                           axis=0))  #maybe use lstsq?
                    logits = tf.einsum('ntd,dk->ntk', logits,
                                       output_embeddings1)
                elif self.hp.embedding_normalization >= 2:
                    weights2 = self.embeddings2[1:, :]
                    weights2 = divide_norm_square_and_transpose(weights2)
                    weights2 = tf.concat(
                        (tf.zeros(shape=[self.hp.d_embed, 1],
                                  dtype=tf.float32), weights2), -1)
                    if self.hp.embedding_normalization == 2:
                        weights1 = tf.transpose(
                            tf.concat(
                                (tf.zeros(shape=[1, self.hp.d_embed],
                                          dtype=tf.float32),
                                 tf.nn.l2_normalize(self.embeddings1[1::],
                                                    axis=-1)), 0))
                    else:
                        weights1 = tf.transpose(self.embeddings1)
                    logits = tf.einsum('ntd,dk->ntk', dec, weights2)
                    logits = tf.einsum('ntd,dk->ntk', logits, weights1)
                    if self.hp.embedding_normalization == 3:
                        ebias = get_half_squarenorm(self.embeddings1)
                        logits = tf.subtract(logits, ebias)
                else:
                    logits = tf.einsum('ntd,dk->ntk', dec,
                                       tf.transpose(self.embeddings2))
                    logits = tf.einsum('ntd,dk->ntk', logits,
                                       tf.transpose(self.embeddings1))
            else:
                with tf.variable_scope("output_embedding",
                                       reuse=tf.AUTO_REUSE):
                    logits = tf.layers.dense(dec, self.vocab_size)

        else:
            if self.hp.io_tie:
                if self.hp.embedding_normalization == 0 or self.hp.embedding_normalization == 3:
                    weights = tf.transpose(
                        self.embeddings)  # (d_model, vocab_size)
                elif self.hp.embedding_normalization == 1:
                    weights = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_model],
                                            dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings[1:, :],
                                                      axis=-1)), 0))
                elif self.hp.embedding_normalization == 2:
                    weights = self.embeddings[1:, :]
                    weights = divide_norm_square_and_transpose(weights)
                    weights = tf.concat((tf.zeros(shape=[self.hp.d_model, 1],
                                                  dtype=tf.float32), weights),
                                        -1)
                logits = tf.einsum('ntd,dk->ntk', dec,
                                   weights)  # (N, T2, vocab_size)
                if self.hp.embedding_normalization == 2:
                    #bias=tf.ones(shape=[logits.shape[-1]],dtype=tf.float32)
                    pass
                    #with tf.variable_scope("gauss",reuse=tf.AUTO_REUSE):
                    #bias=tf.constant(1.0)
                    #logits=tf.subtract(logits,bias)
                    #logits=tf.square(logits)
                    #logits=tf.negative(logits)
                    #logits=gaussian_activation(logits)
                #logits=tf.exp(logits)
                if self.hp.embedding_normalization == 3:
                    ebias = get_half_squarenorm(self.embeddings)
                    logits = tf.subtract(logits, ebias)
            else:
                with tf.variable_scope("output_embedding",
                                       reuse=tf.AUTO_REUSE):
                    logits = tf.layers.dense(dec, self.vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
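
As a side note on the tied output projection above, here is a minimal NumPy sketch (my own illustration, not part of the snippet) of how the embedding matrix is reused as the output weights under the normalization options; the padding row at index 0 is zeroed as in the TF code, and all shapes and the helper name tied_logits are made up for the example.

import numpy as np

def tied_logits(dec, embeddings, normalization=1):
    """dec: (N, T, d), embeddings: (V, d). Returns logits of shape (N, T, V)."""
    if normalization == 1:
        # L2-normalize every embedding row except the padding row 0,
        # mirroring tf.nn.l2_normalize(embeddings[1:, :], axis=-1) above.
        norms = np.linalg.norm(embeddings[1:], axis=-1, keepdims=True)
        weights = np.concatenate(
            [np.zeros((1, embeddings.shape[1])), embeddings[1:] / norms], axis=0)
    elif normalization == 2:
        # Divide each non-padding row by its squared norm instead.
        sq = np.sum(embeddings[1:] ** 2, axis=-1, keepdims=True)
        weights = np.concatenate(
            [np.zeros((1, embeddings.shape[1])), embeddings[1:] / sq], axis=0)
    else:
        # Plain weight tying: use the raw embedding table.
        weights = embeddings
    return np.einsum('ntd,kd->ntk', dec, weights)

dec = np.random.randn(2, 5, 8)
emb = np.random.randn(100, 8)
print(tied_logits(dec, emb).shape)  # (2, 5, 100), i.e. (N, T2, vocab_size)
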
Example #28
0
    def encode_decode(self, xs, ys, training=True):
        x, seqlens = xs
        decoder_inputs, y, seqlens = ys
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            enc = tf.nn.embedding_lookup(self.embeddings,
                                         x)  # (N, T1, d_model)
            enc += positional_encoding(enc, self.hp.maxlen1, self.hp)
            enc = tf.layers.dropout(enc,
                                    self.hp.dropout_rate,
                                    training=training)
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            dec = tf.reduce_sum(tf.nn.embedding_lookup(self.embeddings,
                                                       decoder_inputs),
                                axis=2)  # (N, T2, d_model)
            # test_dec = dec
            dec = dec * self.hp.d_model**0.5  # scale
            # The subgraph structure also needs its own positional encoding,
            # since it must line up with the predicted output structure.
            dec += positional_encoding(dec, self.hp.maxlen2, self.hp)
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

        ## Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=dec,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False)
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
            with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        # whether to apply the look-ahead (causal) mask
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=enc,
                        values=enc,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        if self.hp.type == 'attribute':
            enc = tf.reduce_sum(enc, axis=1)
            dec = tf.reduce_sum(dec, axis=1)
            # tf.concat takes a list of tensors plus an axis
            logits = tf.layers.dense(inputs=tf.concat([enc, dec], axis=-1),
                                     units=1,
                                     activation=tf.nn.relu)
        else:
            logits = tf.einsum('ntd,nkd->ntk', dec, enc)  # (N, T2, T2)
            # force the final logits matrix to be symmetric
            logits = (logits + tf.transpose(logits, [0, 2, 1])) / 2
        return logits, y, decoder_inputs
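
A quick NumPy check (illustrative only, with made-up shapes) of the symmetrization step at the end of encode_decode: averaging the logits with their per-example transpose guarantees a symmetric (T2, T2) score matrix.

import numpy as np

logits = np.random.randn(4, 6, 6)                      # (N, T2, T2)
sym = (logits + np.transpose(logits, (0, 2, 1))) / 2   # symmetrize per example
assert np.allclose(sym, np.transpose(sym, (0, 2, 1)))  # sym[n] == sym[n].T
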
Example #29
0
    def decode(self, ys, memory, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # embedding
            dec = tf.nn.embedding_lookup(self.embeddings,
                                         decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model**0.5  # scale

            input_shape = modeling_bert.get_shape_list(dec, expected_rank=3)
            seq_length = input_shape[1]
            width = input_shape[2]

            position_embeddings = tf.slice(self.full_position_embeddings,
                                           [0, 0], [seq_length, -1])
            num_dims = len(dec.shape.as_list())

            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)

            dec += position_embeddings
            dec = tf.layers.dropout(dec,
                                    self.hp.dropout_rate,
                                    training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec = multihead_attention(
                        queries=dec,
                        keys=dec,
                        values=dec,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec = multihead_attention(
                        queries=dec,
                        keys=memory,
                        values=memory,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
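
For reference, a small NumPy sketch (assumed shapes, not the snippet's code) of the BERT-style position-embedding handling above: the learned table is sliced to the current sequence length and reshaped to (1, T2, d_model) so it broadcasts over the batch when added to dec.

import numpy as np

max_len, d_model, batch, seq_len = 128, 16, 3, 10
full_position_embeddings = np.random.randn(max_len, d_model)  # learned table
dec = np.random.randn(batch, seq_len, d_model)

position_embeddings = full_position_embeddings[:seq_len, :]        # like tf.slice
position_embeddings = position_embeddings.reshape(1, seq_len, d_model)
dec = dec + position_embeddings                                    # broadcasts over N
print(dec.shape)  # (3, 10, 16)
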
Example #30
0
    def decode(self, ys, memory, src_masks, training=True):
        '''
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
        logits: (N, T2, V). float32.
        y_hat: (N, T2). int32
        y: (N, T2). int32
        sents2: (N,). string.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs, y, seqlens, sents2 = ys

            # tgt_masks
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding1
            dec1 = tf.nn.embedding_lookup(self.embeddings1,
                                          decoder_inputs)  # (N, T2, d_model)
            dec1 *= self.hp.d_model**0.5  # scale
            dec1 += positional_encoding(dec1, self.hp.maxlen2)
            dec1 = tf.layers.dropout(dec1,
                                     self.hp.dropout_rate,
                                     training=training)

            # embedding2
            dec2 = tf.nn.embedding_lookup(self.embeddings2,
                                          decoder_inputs)  # (N, T2, d_model)
            dec2 *= self.hp.d_model**0.5  # scale
            dec2 += positional_encoding(dec2, self.hp.maxlen2)
            dec2 = tf.layers.dropout(dec2,
                                     self.hp.dropout_rate,
                                     training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i),
                                       reuse=tf.AUTO_REUSE):
                    # Masked self-attention (Note that causality is True at this time)
                    dec1, dec2 = multihead_attention(
                        queries=(dec1, dec2),
                        keys=dec1,
                        values=dec2,
                        key_masks=tgt_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention")

                    # Vanilla attention
                    dec1, dec2 = multihead_attention(
                        queries=(dec1, dec2),
                        keys=memory[0],
                        values=memory[1],
                        key_masks=src_masks,
                        num_heads=self.hp.num_heads,
                        dropout_rate=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        memory=True,
                        scope="vanilla_attention")
                    ### Feed Forward
                    dec1 = ff(dec1, num_units=[self.hp.d_ff, self.hp.d_model])
                    dec2 = ff(dec2, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(
            tf.concat([self.embeddings1, self.embeddings2],
                      axis=-1))  # (2 * d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', tf.concat([dec1, dec2], axis=-1),
                           weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits, y_hat, y, sents2
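
Finally, a NumPy sketch (my own, with hypothetical sizes) of the shared-weight projection that closes this example: the two decoder streams are concatenated on the feature axis and scored against the transposed concatenation of the two embedding tables.

import numpy as np

N, T2, d_model, V = 2, 7, 8, 50
dec1 = np.random.randn(N, T2, d_model)
dec2 = np.random.randn(N, T2, d_model)
emb1 = np.random.randn(V, d_model)
emb2 = np.random.randn(V, d_model)

weights = np.concatenate([emb1, emb2], axis=-1).T        # (2*d_model, V)
dec_cat = np.concatenate([dec1, dec2], axis=-1)          # (N, T2, 2*d_model)
logits = np.einsum('ntd,dk->ntk', dec_cat, weights)      # (N, T2, V)
y_hat = logits.argmax(axis=-1)                           # (N, T2)
print(logits.shape, y_hat.shape)
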