Example #1
 def call(self, inputs, mask=None):
     # Input projection
     inputs = self.dense(inputs)
     inputs = tf.split(inputs, self.heads, axis=-1)
     inputs = K.stack(inputs, axis=-2)
     qw, kw = inputs[..., :self.head_size], inputs[..., self.head_size:]
     # RoPE positional encoding
     if self.RoPE:
         pos = SinusoidalPositionEmbedding(self.head_size, 'zero')(inputs)
         cos_pos = K.repeat_elements(pos[..., None, 1::2], 2, -1)
         sin_pos = K.repeat_elements(pos[..., None, ::2], 2, -1)
         qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 4)
         qw2 = K.reshape(qw2, K.shape(qw))
         qw = qw * cos_pos + qw2 * sin_pos
         kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 4)
         kw2 = K.reshape(kw2, K.shape(kw))
         kw = kw * cos_pos + kw2 * sin_pos
     # Compute pairwise inner products
     logits = tf.einsum('bmhd,bnhd->bhmn', qw, kw)
     # Mask out padding
     logits = sequence_masking(logits, mask, '-inf', 2)
     logits = sequence_masking(logits, mask, '-inf', 3)
     # Mask out the lower triangle
     mask = tf.linalg.band_part(K.ones_like(logits), 0, -1)
     logits = logits - (1 - mask) * 1e12
     # Scale and return
     return logits / self.head_size**0.5
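The stack-and-reshape trick above is the interleaved form of RoPE: each adjacent pair (x0, x1), (x2, x3), ... is rotated by its positional angle. Below is a minimal NumPy sketch of the same rotation (my own illustration, not library code); rope_rotate and its argument names are hypothetical.

 import numpy as np

 def rope_rotate(x, theta):
     # x: (..., d) with d even; theta: (..., d // 2), one rotation angle per pair
     cos_pos = np.repeat(np.cos(theta), 2, axis=-1)
     sin_pos = np.repeat(np.sin(theta), 2, axis=-1)
     # Build (-x1, x0, -x3, x2, ...) by stacking and reshaping, as the layer above does
     x2 = np.stack([-x[..., 1::2], x[..., ::2]], axis=-1).reshape(x.shape)
     return x * cos_pos + x2 * sin_pos

 x = np.array([1.0, 0.0, 0.0, 1.0])
 print(rope_rotate(x, np.array([np.pi / 2, np.pi / 2])))  # ~[0, 1, -1, 0]: both pairs rotated by 90 degrees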
Example #2
 def call(self, inputs, mask=None):
     # Input projection
     inputs = self.dense_1(inputs)
     qw, kw = inputs[..., ::2], inputs[..., 1::2]
     # RoPE positional encoding
     if self.RoPE:
         pos = SinusoidalPositionEmbedding(self.head_size, 'zero')(inputs)
         cos_pos = K.repeat_elements(pos[..., 1::2], 2, -1)
         sin_pos = K.repeat_elements(pos[..., ::2], 2, -1)
         qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 3)
         qw2 = K.reshape(qw2, K.shape(qw))
         qw = qw * cos_pos + qw2 * sin_pos
         kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 3)
         kw2 = K.reshape(kw2, K.shape(kw))
         kw = kw * cos_pos + kw2 * sin_pos
     # Compute pairwise inner products
     logits = tf.einsum('bmd,bnd->bmn', qw, kw) / self.head_size**0.5
     bias = tf.einsum('bnh->bhn', self.dense_2(inputs)) / 2
     logits = logits[:, None] + bias[:, ::2, None] + bias[:, 1::2, :, None]
     # Mask out padding
     logits = sequence_masking(logits, mask, '-inf', 2)
     logits = sequence_masking(logits, mask, '-inf', 3)
     # Mask out the lower triangle
     if self.tril_mask:
         mask = tf.linalg.band_part(K.ones_like(logits), 0, -1)
         logits = logits - (1 - mask) * K.infinity()
     # Return the final result
     return logits
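The final broadcast above combines a pairwise inner-product term shared by all heads with per-head start and end biases taken from alternating rows of the transposed dense_2 output. A shape-only NumPy sketch (my own illustration) of how the broadcast assembles a (batch, heads, seq_len, seq_len) score tensor:

 import numpy as np

 batch, seq_len, heads = 2, 5, 4
 pairwise = np.zeros((batch, seq_len, seq_len))   # the qw/kw inner-product term, shared by all heads
 bias = np.zeros((batch, heads * 2, seq_len))     # dense_2 output after the 'bnh->bhn' transpose

 logits = pairwise[:, None] + bias[:, ::2, None] + bias[:, 1::2, :, None]
 print(logits.shape)  # (2, 4, 5, 5)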
Example #3
 def call(self, inputs, mask=None, a_mask=None, p_bias=None):
     """Multi-head attention.
     q_mask: mask for the input query sequence;
             mainly used to zero out the padding part of the output.
     v_mask: mask for the input value sequence;
             mainly used to keep attention from attending to padding.
     a_mask: mask for the attention matrix;
             different attention masks serve different applications.
     p_bias: positional bias inside attention;
             usually specifies the type of relative position encoding.
     """
     q, k, v = inputs[:3]
     q_mask, v_mask, n = None, None, 3
     if mask is not None:
         if mask[0] is not None:
             q_mask = K.cast(mask[0], K.floatx())
         if mask[2] is not None:
             v_mask = K.cast(mask[2], K.floatx())
     if a_mask:
         a_mask = inputs[n]
         n += 1
     # Linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Handle position encodings
     if p_bias == 'typical_relative':
         pos_embeddings = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     elif p_bias == 't5_relative':
         pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(pos_embeddings, 0)
     # Attention (continued)
     if self.attention_scale:
         a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Assemble the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads))
     o = self.o_dense(o)
     # Return the result
     o = sequence_masking(o, q_mask, 0)
     return o
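A quick NumPy check (my own sketch) that the einsum 'bjhd,bkhd->bhjk' used above is the usual per-head dot-product score Q·K^T:

 import numpy as np

 b, j, k, h, d = 2, 3, 4, 5, 6
 qw = np.random.randn(b, j, h, d)
 kw = np.random.randn(b, k, h, d)

 a = np.einsum('bjhd,bkhd->bhjk', qw, kw)
 ref = np.transpose(qw, (0, 2, 1, 3)) @ np.transpose(kw, (0, 2, 3, 1))  # (b, h, j, k)
 print(np.allclose(a, ref))  # True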
Example #4
 def call(self, inputs, mask=None, **kwargs):
     """Multi-head attention.
     q_mask: mask for the input query sequence;
             mainly used to zero out the padding part of the output.
     v_mask: mask for the input value sequence;
             mainly used to keep attention from attending to padding.
     """
     q, k, v = inputs[:3]
     q_mask, v_mask = None, None
     if mask is not None:
         if mask[0] is not None:
             q_mask = K.cast(mask[0], K.floatx())
         if mask[2] is not None:
             v_mask = K.cast(mask[2], K.floatx())
     # Linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     qkv_inputs = [qw, kw, vw] + inputs[3:]
     qv_masks = [q_mask, v_mask]
     o = self.pay_attention_to(qkv_inputs, qv_masks, **kwargs)
     # Assemble the output
     o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads))
     o = self.o_dense(o)
     # Return the result
     o = sequence_masking(o, q_mask, 0)
     return o
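The point of this refactor is that the projections, reshapes and output logic stay in call() while the attention itself lives in pay_attention_to(), so a subclass only overrides that one method. Below is a hypothetical sketch of the pattern; the class name MultiHeadAttention, the single-tensor return convention, and the softer scale are my own assumptions based on this example, not library code.

 class TemperatureAttention(MultiHeadAttention):  # hypothetical subclass of the class shown above
     def pay_attention_to(self, inputs, mask=None, **kwargs):
         qw, kw, vw = inputs[:3]
         q_mask, v_mask = mask
         # e.g. a softer temperature than the usual 1/sqrt(key_size)
         a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) / (2.0 * self.key_size**0.5)
         a = sequence_masking(a, v_mask, '-inf', -1)
         a = K.softmax(a)
         return tf.einsum('bhjk,bkhd->bjhd', a, vw)  # (batch_size, seq_len, heads, head_size)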
Example #5
 def target_score(self, y_true, y_pred, mask=None):
     """Compute the (not yet normalized) relative score of the target path.
     Key point: per-label scores plus label transition scores.
     """
     y_true = sequence_masking(y_true, mask, 0)
     point_score = tf.einsum('bni,bni->b', y_true, y_pred)  # per-label (emission) score
     trans_score = tf.einsum('bni,ij,bnj->b', y_true[:, :-1], self.trans,
                             y_true[:, 1:])  # label transition score
     return point_score + trans_score
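A NumPy sketch with toy numbers (my own illustration) of the two terms: the first einsum sums the score of the chosen label at each position, the second sums the transition scores between consecutive one-hot labels.

 import numpy as np

 y_true = np.array([[[1, 0], [0, 1], [0, 1]]], dtype=float)   # label path: 0 -> 1 -> 1
 y_pred = np.array([[[0.9, 0.1], [0.2, 0.8], [0.3, 0.7]]])    # per-position label scores
 trans = np.array([[0.5, 0.1], [0.2, 0.6]])                   # transition score matrix

 point_score = np.einsum('bni,bni->b', y_true, y_pred)                           # 0.9 + 0.8 + 0.7
 trans_score = np.einsum('bni,ij,bnj->b', y_true[:, :-1], trans, y_true[:, 1:])  # trans[0,1] + trans[1,1]
 print(point_score, trans_score)  # [2.4] [0.7]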
Example #6
 def call(self, inputs, mask=None, a_mask=None, p_bias=None):
     """Multi-head attention.
     q_mask: mask for the input query sequence;
             mainly used to zero out the padding part of the output.
     v_mask: mask for the input value sequence;
             mainly used to keep attention from attending to padding.
     a_mask: mask for the attention matrix;
             different attention masks serve different applications.
     p_bias: positional bias inside attention;
             usually specifies the type of relative position encoding.
     """
     q, k, v = inputs[:3]
     q_mask, v_mask, n = None, None, 3
     if mask is not None:
         if mask[0] is not None:
             q_mask = keras.backend.cast(mask[0], keras.backend.floatx())
         if mask[2] is not None:
             v_mask = keras.backend.cast(mask[2], keras.backend.floatx())
     if a_mask:
         a_mask = inputs[n]
         n += 1
     # Linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = keras.backend.reshape(qw, (-1, keras.backend.shape(q)[1], self.heads, self.key_size))
     kw = keras.backend.reshape(kw, (-1, keras.backend.shape(k)[1], self.heads, self.key_size))
     vw = keras.backend.reshape(vw, (-1, keras.backend.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     a = a / self.key_size**0.5
     a = B.sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         a = a - (1 - a_mask) * 1e12
     a = keras.backend.softmax(a)
     # Assemble the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     o = keras.backend.reshape(o, (-1, keras.backend.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # Return the result
     o = B.sequence_masking(o, q_mask, 0)
     return o
Example #7
 def pay_attention_to(self, inputs, mask=None, **kwargs):
     """Standard multiplicative (dot-product) multi-head attention.
     a_bias: bias for the attention matrix;
             different attention biases serve different applications.
     p_bias: positional bias inside attention;
             usually specifies the type of relative position encoding.
     Note: pay_attention_to is factored out as a separate method so that
           subclasses can conveniently define other forms of attention;
           it must return o with shape (batch_size, seq_len, heads, head_size).
     """
     (qw, kw, vw), n = inputs[:3], 3
     q_mask, v_mask = mask
     a_bias, p_bias = kwargs.get('a_bias'), kwargs.get('p_bias')
     if a_bias:
         a_bias = inputs[n]
         n += 1
     if p_bias == 'rotary':
         cos_pos = K.repeat_elements(inputs[n][..., None, 1::2], 2, -1)
         sin_pos = K.repeat_elements(inputs[n][..., None, ::2], 2, -1)
         qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 4)
         qw2 = K.reshape(qw2, K.shape(qw))
         qw = qw * cos_pos + qw2 * sin_pos
         kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 4)
         kw2 = K.reshape(kw2, K.shape(kw))
         kw = kw * cos_pos + kw2 * sin_pos
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Handle position encodings
     if p_bias == 'typical_relative':
         position_bias = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, position_bias)
     elif p_bias == 't5_relative':
         position_bias = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(position_bias, 0)
     # Attention (continued)
     if self.attention_scale:
         a = a / self.key_size**0.5
     if a_bias is not None:
         a = a + a_bias
     a = sequence_masking(a, v_mask, '-inf', -1)
     A = K.softmax(a)
     if self.attention_dropout:
         A = Dropout(self.attention_dropout)(A)
     # Assemble the output
     # For relative position encoding, also add the attention-weighted position embeddings
     o = tf.einsum('bhjk,bkhd->bjhd', A, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', A, position_bias)
     return o, a
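Since a_bias is simply added to the raw scores, a causal (history-only) mask can be supplied as a bias that is 0 where attention is allowed and a large negative number elsewhere. A TensorFlow sketch of such a bias (my own helper, not part of the library):

 import tensorflow as tf

 def causal_attention_bias(seq_len, big_neg=-1e12):
     # 0 where a query may attend (keys at or before it), big_neg on the strict upper triangle
     ones = tf.ones((seq_len, seq_len))
     lower = tf.linalg.band_part(ones, -1, 0)   # lower triangle, diagonal included
     return (1.0 - lower) * big_neg             # broadcasts over (batch, heads, q_len, k_len)

 print(causal_attention_bias(3))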
Example #8
 def call(self, inputs, mask=None):
     x = inputs
     # Pooling
     if self.pool_size > 1:
         if mask is not None:
             mask = K.cast(mask, K.floatx())
         x_in_len = K.shape(x)[1]
         x = sequence_masking(x, mask, 0)
         x = divisible_temporal_padding(x, self.pool_size)
         x = pool1d(x, self.pool_size, self.pool_size, pool_mode='avg')
     # Apply the FFN
     x = self.dense_1(x)
     x = self.dense_2(x)
     # Restore the original length
     if self.pool_size > 1:
         x = K.repeat_elements(x, self.pool_size, 1)[:, :x_in_len]
     # Return the result
     return x
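The length round trip above pads the sequence to a multiple of pool_size, average-pools with stride pool_size, and finally repeats and trims back to the original length. A NumPy sketch of that round trip (my own stand-in for divisible_temporal_padding and pool1d):

 import numpy as np

 def pool_and_restore(x, pool_size):
     b, n, d = x.shape
     pad = (pool_size - n % pool_size) % pool_size
     x = np.pad(x, ((0, 0), (0, pad), (0, 0)))                # pad to a multiple of pool_size
     pooled = x.reshape(b, -1, pool_size, d).mean(axis=2)     # average pool with stride pool_size
     restored = np.repeat(pooled, pool_size, axis=1)[:, :n]   # repeat and trim to the input length
     return pooled, restored

 pooled, restored = pool_and_restore(np.random.randn(2, 7, 4), pool_size=2)
 print(pooled.shape, restored.shape)  # (2, 4, 4) (2, 7, 4)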
Example #9
 def call(self, inputs, mask=None):
     x = inputs
     # Pooling
     if self.pool_size > 1:
         if mask is not None:
             if not hasattr(self, 'mask_layer'):
                 self.mask_layer = search_layer(x, mask)
             mask = self.mask_layer.output_mask
         x_in_len = K.shape(x)[1]
         x = sequence_masking(x, mask, 0)
         x = divisible_temporal_padding(x, self.pool_size)
         x = pool1d(x, self.pool_size, self.pool_size, pool_mode='avg')
     # Apply the FFN
     x = self.dense_1(x)
     x = self.dense_2(x)
     # Restore the original length
     if self.pool_size > 1:
         x = K.repeat_elements(x, self.pool_size, 1)[:, :x_in_len]
     # Return the result
     return x
Example #10
 def pay_attention_to(self, inputs, mask=None, **kwargs):
     """Standard multiplicative (dot-product) multi-head attention.
     a_mask: mask for the attention matrix;
             different attention masks serve different applications.
     p_bias: positional bias inside attention;
             usually specifies the type of relative position encoding.
     Note: pay_attention_to is factored out as a separate method so that
           subclasses can conveniently define other forms of attention;
           it must return o with shape (batch_size, seq_len, heads, head_size).
     """
     (qw, kw, vw), n = inputs[:3], 3
     q_mask, v_mask = mask
     a_mask, p_bias = kwargs.get('a_mask'), kwargs.get('p_bias')
     if a_mask:
         a_mask = inputs[n]
         n += 1
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Handle position encodings
     if p_bias == 'typical_relative':
         pos_embeddings = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     elif p_bias == 't5_relative':
         pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(pos_embeddings, 0)
     # Attention (continued)
     if self.attention_scale:
         a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Assemble the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     return o
Example #11
    def call(self, inputs, mask=None):
        if mask is not None:
            mask = K.cast(mask, K.floatx())

        return sequence_masking(inputs, mask, 1, 1)
Example #12
 def call(self, inputs, mask=None):
     return sequence_masking(inputs, mask, '-inf', 1)
Example #13
 def call(self, inputs, mask=None):
     axis = 1 if self.data_format == 'channels_last' else 2
     inputs = sequence_masking(inputs, mask, '-inf', axis)
     return K.max(inputs, axis=axis)
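Examples #11, #12 and #13 are thin wrappers around sequence_masking(x, mask, value, axis). The sketch below shows what a helper with that signature might do, written in NumPy; it is my own approximation of the behaviour implied by the calls above, not the library's exact implementation.

 import numpy as np

 def sequence_masking_sketch(x, mask, value=0.0, axis=1):
     # x: array whose `axis` dimension is seq_len; mask: (batch, seq_len), 1 = keep, 0 = pad
     if mask is None:
         return x
     if value == '-inf':
         value = -1e12   # masked positions drop out of a subsequent max or softmax
     mask = mask.astype(float)
     if axis < 0:
         axis = np.ndim(x) + axis
     for _ in range(axis - 1):          # line the mask up with the sequence axis of x
         mask = np.expand_dims(mask, 1)
     while np.ndim(mask) < np.ndim(x):  # broadcast over the remaining axes
         mask = np.expand_dims(mask, -1)
     return x * mask + value * (1.0 - mask)

 x = np.random.randn(2, 4, 3)
 mask = np.array([[1, 1, 0, 0], [1, 1, 1, 0]])
 print(sequence_masking_sketch(x, mask, '-inf', 1).max(axis=1).shape)  # (2, 3), padding excluded from the max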
Example #14
 def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
     """Multi-head attention.
     q_mask: mask for the input query sequence;
             mainly used to zero out the padding part of the output.
     v_mask: mask for the input value sequence;
             mainly used to keep attention from attending to padding.
     a_mask: mask for the attention matrix;
             different attention masks serve different applications.
     """
     # Handle masks
     inputs = inputs[:]
     for i, mask in enumerate([q_mask, v_mask, a_mask]):
         if not mask:
             inputs.insert(3 + i, None)
     q, k, v, q_mask, v_mask = inputs[:5]
     if len(inputs) == 5:
         a_mask = 'history_only'
     elif len(inputs) == 6:
         a_mask = inputs[-1]
     else:
         raise ValueError('wrong inputs for MultiHeadAttention.')
     # Linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Relative position encoding
     if self.max_relative_position is not None:
         q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
         q_idxs = K.expand_dims(q_idxs, 1)
         v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
         v_idxs = K.expand_dims(v_idxs, 0)
         pos_ids = v_idxs - q_idxs
         pos_ids = K.clip(pos_ids, -self.max_relative_position,
                          self.max_relative_position)
         pos_ids = pos_ids + self.max_relative_position
         pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     # Attention (continued)
     a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Assemble the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if self.max_relative_position is not None:
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     o = sequence_masking(o, q_mask, 0)
     return o
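A NumPy sketch (my own illustration) of the clipped relative position ids built above, for max_relative_position = 2 and a length-5 sequence; each id then selects a row of relative_embeddings:

 import numpy as np

 max_rel = 2
 q_idxs = np.arange(5)[:, None]
 v_idxs = np.arange(5)[None, :]
 pos_ids = np.clip(v_idxs - q_idxs, -max_rel, max_rel) + max_rel
 print(pos_ids)
 # [[2 3 4 4 4]
 #  [1 2 3 4 4]
 #  [0 1 2 3 4]
 #  [0 0 1 2 3]
 #  [0 0 0 1 2]]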
Example #15
 def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
     """Multi-head attention.
     q_mask: mask for the input query sequence;
             mainly used to zero out the padding part of the output.
     v_mask: mask for the input value sequence;
             mainly used to keep attention from attending to padding.
     a_mask: mask for the attention matrix;
             different attention masks serve different applications.
     """
     q, k, v = inputs[:3]
     if a_mask:
         if len(inputs) == 3:
             a_mask = 'history_only'
         else:
             a_mask = inputs[3]
     if q_mask is not None:
         if not hasattr(self, 'q_mask_layer'):
             self.q_mask_layer = search_layer(q, q_mask)
         q_mask = self.q_mask_layer.output_mask
     if v_mask is not None:
         if not hasattr(self, 'v_mask_layer'):
             self.v_mask_layer = search_layer(v, v_mask)
         v_mask = self.v_mask_layer.output_mask
     # Pooling
     if self.pool_size > 1:
         is_self_attention = (q is k is v)
         q_in_len = K.shape(q)[1]
         q = sequence_masking(q, q_mask, 0)
         q = divisible_temporal_padding(q, self.pool_size)
         q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
         if is_self_attention:
             k = v = q
         else:
             k = sequence_masking(k, v_mask, 0)
             k = divisible_temporal_padding(k, self.pool_size)
             k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
             v = sequence_masking(v, v_mask, 0)
             v = divisible_temporal_padding(v, self.pool_size)
             v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
         if v_mask is not None:
             v_mask = v_mask[:, ::self.pool_size]
         if a_mask is not None and not is_string(a_mask):
             a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
     # Linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Relative position encoding
     if self.max_relative_position is not None:
         q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
         q_idxs = K.expand_dims(q_idxs, 1)
         v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
         v_idxs = K.expand_dims(v_idxs, 0)
         pos_ids = v_idxs - q_idxs
         pos_ids = K.clip(pos_ids, -self.max_relative_position,
                          self.max_relative_position)
         pos_ids = pos_ids + self.max_relative_position
         pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     # Attention (continued)
     a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Assemble the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if self.max_relative_position is not None:
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # Restore the original length
     if self.pool_size > 1:
         o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
     # Return the result
     o = sequence_masking(o, q_mask, 0)
     return o
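After pooling q, k and v with stride pool_size, each pooled position stands in for pool_size original positions, which is why v_mask and a_mask are strided by pool_size above. A small NumPy sketch (my own illustration):

 import numpy as np

 pool_size = 2
 v_mask = np.array([[1, 1, 1, 1, 1, 0, 0, 0]])       # valid length 5 out of 8
 a_mask = np.ones((1, 1, 8, 8))

 print(v_mask[:, ::pool_size])                        # [[1 1 1 0]]: one flag per pooled position
 print(a_mask[..., ::pool_size, ::pool_size].shape)   # (1, 1, 4, 4)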