def call(self, inputs, mask=None):
    # Input transformation
    inputs = self.dense(inputs)
    inputs = tf.split(inputs, self.heads, axis=-1)
    inputs = K.stack(inputs, axis=-2)
    qw, kw = inputs[..., :self.head_size], inputs[..., self.head_size:]
    # RoPE encoding
    if self.RoPE:
        pos = SinusoidalPositionEmbedding(self.head_size, 'zero')(inputs)
        cos_pos = K.repeat_elements(pos[..., None, 1::2], 2, -1)
        sin_pos = K.repeat_elements(pos[..., None, ::2], 2, -1)
        qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 4)
        qw2 = K.reshape(qw2, K.shape(qw))
        qw = qw * cos_pos + qw2 * sin_pos
        kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 4)
        kw2 = K.reshape(kw2, K.shape(kw))
        kw = kw * cos_pos + kw2 * sin_pos
    # Pairwise inner products
    logits = tf.einsum('bmhd,bnhd->bhmn', qw, kw)
    # Mask out padding
    logits = sequence_masking(logits, mask, '-inf', 2)
    logits = sequence_masking(logits, mask, '-inf', 3)
    # Mask out the lower triangle (only spans with start <= end are valid)
    mask = tf.linalg.band_part(K.ones_like(logits), 0, -1)
    logits = logits - (1 - mask) * 1e12
    # Scale and return
    return logits / self.head_size**0.5
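
# A minimal standalone check (not part of the original layer; pure numpy,
# head/batch axes omitted) that the stack/reshape trick above really applies
# the RoPE rotation: each even/odd pair (x0, x1) at angle t should map to
# (x0*cos(t) - x1*sin(t), x1*cos(t) + x0*sin(t)).
import numpy as np

def rope_rotate(x, pos):
    # x: (seq_len, dim); pos: (seq_len, dim) with sin at even and cos at odd
    # indices, as produced by SinusoidalPositionEmbedding above.
    cos_pos = np.repeat(pos[..., 1::2], 2, axis=-1)
    sin_pos = np.repeat(pos[..., ::2], 2, axis=-1)
    x2 = np.stack([-x[..., 1::2], x[..., ::2]], axis=-1).reshape(x.shape)
    return x * cos_pos + x2 * sin_pos

seq_len, dim = 4, 8
x = np.random.randn(seq_len, dim)
theta = 1.0 / 10000 ** (np.arange(0, dim, 2) / dim)
angles = np.arange(seq_len)[:, None] * theta[None, :]  # (seq_len, dim/2)
pos = np.stack([np.sin(angles), np.cos(angles)], -1).reshape(seq_len, dim)
y = rope_rotate(x, pos)
assert np.allclose(y[..., ::2], x[..., ::2] * np.cos(angles) - x[..., 1::2] * np.sin(angles))
assert np.allclose(y[..., 1::2], x[..., 1::2] * np.cos(angles) + x[..., ::2] * np.sin(angles))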
def call(self, inputs, mask=None):
    # Input transformation
    inputs = self.dense_1(inputs)
    qw, kw = inputs[..., ::2], inputs[..., 1::2]
    # RoPE encoding
    if self.RoPE:
        pos = SinusoidalPositionEmbedding(self.head_size, 'zero')(inputs)
        cos_pos = K.repeat_elements(pos[..., 1::2], 2, -1)
        sin_pos = K.repeat_elements(pos[..., ::2], 2, -1)
        qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 3)
        qw2 = K.reshape(qw2, K.shape(qw))
        qw = qw * cos_pos + qw2 * sin_pos
        kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 3)
        kw2 = K.reshape(kw2, K.shape(kw))
        kw = kw * cos_pos + kw2 * sin_pos
    # Pairwise inner products
    logits = tf.einsum('bmd,bnd->bmn', qw, kw) / self.head_size**0.5
    bias = tf.einsum('bnh->bhn', self.dense_2(inputs)) / 2
    logits = logits[:, None] + bias[:, ::2, None] + bias[:, 1::2, :, None]
    # Mask out padding
    logits = sequence_masking(logits, mask, '-inf', 2)
    logits = sequence_masking(logits, mask, '-inf', 3)
    # Mask out the lower triangle (only spans with start <= end are valid)
    if self.tril_mask:
        mask = tf.linalg.band_part(K.ones_like(logits), 0, -1)
        logits = logits - (1 - mask) * K.infinity()
    # Return the final result
    return logits
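
# Broadcast-shape sketch (standalone numpy, illustrative values only) for the
# bias combination above: with seq_len n and h heads, logits[:, None] is
# (b, 1, n, n), bias[:, ::2, None] is (b, h, 1, n) and bias[:, 1::2, :, None]
# is (b, h, n, 1), so the sum broadcasts to per-head span scores (b, h, n, n).
import numpy as np
b, n, h = 2, 5, 3
logits = np.zeros((b, n, n))
bias = np.random.randn(b, 2 * h, n)
out = logits[:, None] + bias[:, ::2, None] + bias[:, 1::2, :, None]
assert out.shape == (b, h, n, n)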
def call(self, inputs, mask=None, a_mask=None, p_bias=None): """实现多头注意力 q_mask: 对输入的query序列的mask。 主要是将输出结果的padding部分置0。 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息。 a_mask: 对attention矩阵的mask。 不同的attention mask对应不同的应用。 p_bias: 在attention里的位置偏置。 一般用来指定相对位置编码的种类。 """ q, k, v = inputs[:3] q_mask, v_mask, n = None, None, 3 if mask is not None: if mask[0] is not None: q_mask = K.cast(mask[0], K.floatx()) if mask[2] is not None: v_mask = K.cast(mask[2], K.floatx()) if a_mask: a_mask = inputs[n] n += 1 # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 处理位置编码 if p_bias == 'typical_relative': pos_embeddings = inputs[n] a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings) elif p_bias == 't5_relative': pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1)) a = a + K.expand_dims(pos_embeddings, 0) # Attention(续) if self.attention_scale: a = a / self.key_size**0.5 a = sequence_masking(a, v_mask, 1, -1) if a_mask is not None: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) if p_bias == 'typical_relative': o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings) o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads)) o = self.o_dense(o) # 返回结果 o = sequence_masking(o, q_mask, 0) return o
def call(self, inputs, mask=None, **kwargs): """实现多头注意力 q_mask: 对输入的query序列的mask。 主要是将输出结果的padding部分置0。 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息。 """ q, k, v = inputs[:3] q_mask, v_mask = None, None if mask is not None: if mask[0] is not None: q_mask = K.cast(mask[0], K.floatx()) if mask[2] is not None: v_mask = K.cast(mask[2], K.floatx()) # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Attention qkv_inputs = [qw, kw, vw] + inputs[3:] qv_masks = [q_mask, v_mask] o = self.pay_attention_to(qkv_inputs, qv_masks, **kwargs) # 完成输出 o = K.reshape(o, (-1, K.shape(o)[1], self.head_size * self.heads)) o = self.o_dense(o) # 返回结果 o = sequence_masking(o, q_mask, 0) return o
def target_score(self, y_true, y_pred, mask=None): """计算目标路径的相对概率(还没有归一化) 要点:逐标签得分,加上转移概率得分。 """ y_true = sequence_masking(y_true, mask, 0) point_score = tf.einsum('bni,bni->b', y_true, y_pred) # 逐标签得分 trans_score = tf.einsum('bni,ij,bnj->b', y_true[:, :-1], self.trans, y_true[:, 1:]) # 标签转移得分 return point_score + trans_score
def call(self, inputs, mask=None, a_mask=None, p_bias=None): """实现多头注意力机制 q_mask: 对输入的query序列进行mask。 主要是将输出结果的padding部分置0 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息 a_mask: 对attention矩阵的mask 不同的attention mask对应不同的应用 p_bias: 在attention里的位置偏置 一般用来指定相对位置编码的种类 """ q, k, v = inputs[:3] q_mask, v_mask, n = None, None, 3 if mask is not None: if mask[0] is not None: q_mask = keras.backend.cast(mask[0], keras.backend.floatx()) if mask[2] is not None: v_mask = keras.backend.cast(mask[2], keras.backend.floatx()) if a_mask: a_mask = inputs[n] n += 1 # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = keras.backend.reshape(qw, (-1, keras.backend.shape(q)[1], self.heads, self.key_size)) kw = keras.backend.reshape(kw, (-1, keras.backend.shape(k)[1], self.heads, self.key_size)) vw = keras.backend.reshape(vw, (-1, keras.backend.shape(v)[1], self.heads, self.head_size)) # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) a = a / self.key_size**0.5 a = B.sequence_masking(a, v_mask, 1, -1) if a_mask is not None: a = a - (1 - a_mask) * 1e12 a = keras.backend.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) o = keras.backend.reshape(o, (-1, keras.backend.shape(o)[1], self.out_dim)) o = self.o_dense(o) # 返回结果 o = B.sequence_masking(o, q_mask, 0) return o
def pay_attention_to(self, inputs, mask=None, **kwargs): """实现标准的乘性多头注意力 a_bias: 对attention矩阵的bias。 不同的attention bias对应不同的应用。 p_bias: 在attention里的位置偏置。 一般用来指定相对位置编码的种类。 说明: 这里单独分离出pay_attention_to函数,是为了方便 继承此类来定义不同形式的atttention;此处要求 返回o.shape=(batch_size, seq_len, heads, head_size)。 """ (qw, kw, vw), n = inputs[:3], 3 q_mask, v_mask = mask a_bias, p_bias = kwargs.get('a_bias'), kwargs.get('p_bias') if a_bias: a_bias = inputs[n] n += 1 if p_bias == 'rotary': cos_pos = K.repeat_elements(inputs[n][..., None, 1::2], 2, -1) sin_pos = K.repeat_elements(inputs[n][..., None, ::2], 2, -1) qw2 = K.stack([-qw[..., 1::2], qw[..., ::2]], 4) qw2 = K.reshape(qw2, K.shape(qw)) qw = qw * cos_pos + qw2 * sin_pos kw2 = K.stack([-kw[..., 1::2], kw[..., ::2]], 4) kw2 = K.reshape(kw2, K.shape(kw)) kw = kw * cos_pos + kw2 * sin_pos # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 处理位置编码 if p_bias == 'typical_relative': position_bias = inputs[n] a = a + tf.einsum('bjhd,jkd->bhjk', qw, position_bias) elif p_bias == 't5_relative': position_bias = K.permute_dimensions(inputs[n], (2, 0, 1)) a = a + K.expand_dims(position_bias, 0) # Attention(续) if self.attention_scale: a = a / self.key_size**0.5 if a_bias is not None: a = a + a_bias a = sequence_masking(a, v_mask, '-inf', -1) A = K.softmax(a) if self.attention_dropout: A = Dropout(self.attention_dropout)(A) # 完成输出 # 如果是相对位置编码,还要加上attention 矩阵乘法 pos_embeddings o = tf.einsum('bhjk,bkhd->bjhd', A, vw) if p_bias == 'typical_relative': o = o + tf.einsum('bhjk,jkd->bjhd', A, position_bias) return o, a
def call(self, inputs, mask=None):
    x = inputs
    # Pooling
    if self.pool_size > 1:
        if mask is not None:
            mask = K.cast(mask, K.floatx())
        x_in_len = K.shape(x)[1]
        x = sequence_masking(x, mask, 0)
        x = divisible_temporal_padding(x, self.pool_size)
        x = pool1d(x, self.pool_size, self.pool_size, pool_mode='avg')
    # Apply the FFN
    x = self.dense_1(x)
    x = self.dense_2(x)
    # Restore the original length
    if self.pool_size > 1:
        x = K.repeat_elements(x, self.pool_size, 1)[:, :x_in_len]
    # Return the result
    return x
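
# Length round-trip sketch (standalone numpy analogue, not the Keras ops):
# the sequence is right-padded to a multiple of pool_size, average-pooled
# with stride pool_size, and after the FFN repeat_elements plus the slice
# [:, :x_in_len] restore the original length.
import numpy as np
p, n = 3, 7
x = np.arange(n, dtype=float)[None, :, None]     # (1, 7, 1)
pad = (p - n % p) % p                            # pad 7 -> 9
xp = np.pad(x, ((0, 0), (0, pad), (0, 0)))
pooled = xp.reshape(1, -1, p, 1).mean(axis=2)    # (1, 3, 1), avg pool
restored = np.repeat(pooled, p, axis=1)[:, :n]   # back to (1, 7, 1)
assert restored.shape == x.shape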
def call(self, inputs, mask=None):
    x = inputs
    # Pooling
    if self.pool_size > 1:
        if mask is not None:
            if not hasattr(self, 'mask_layer'):
                self.mask_layer = search_layer(x, mask)
            mask = self.mask_layer.output_mask
        x_in_len = K.shape(x)[1]
        x = sequence_masking(x, mask, 0)
        x = divisible_temporal_padding(x, self.pool_size)
        x = pool1d(x, self.pool_size, self.pool_size, pool_mode='avg')
    # Apply the FFN
    x = self.dense_1(x)
    x = self.dense_2(x)
    # Restore the original length
    if self.pool_size > 1:
        x = K.repeat_elements(x, self.pool_size, 1)[:, :x_in_len]
    # Return the result
    return x
def pay_attention_to(self, inputs, mask=None, **kwargs): """实现标准的乘性多头注意力 a_mask: 对attention矩阵的mask。 不同的attention mask对应不同的应用。 p_bias: 在attention里的位置偏置。 一般用来指定相对位置编码的种类。 说明: 这里单独分离出pay_attention_to函数,是为了方便 继承此类来定义不同形式的atttention;此处要求 返回o.shape=(batch_size, seq_len, heads, head_size)。 """ (qw, kw, vw), n = inputs[:3], 3 q_mask, v_mask = mask a_mask, p_bias = kwargs.get('a_mask'), kwargs.get('p_bias') if a_mask: a_mask = inputs[n] n += 1 # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 处理位置编码 if p_bias == 'typical_relative': pos_embeddings = inputs[n] a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings) elif p_bias == 't5_relative': pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1)) a = a + K.expand_dims(pos_embeddings, 0) # Attention(续) if self.attention_scale: a = a / self.key_size**0.5 a = sequence_masking(a, v_mask, 1, -1) if a_mask is not None: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) if p_bias == 'typical_relative': o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings) return o
def call(self, inputs, mask=None):
    if mask is not None:
        mask = K.cast(mask, K.floatx())
    return sequence_masking(inputs, mask, 1, 1)
def call(self, inputs, mask=None):
    return sequence_masking(inputs, mask, '-inf', 1)
def call(self, inputs, mask=None):
    axis = 1 if self.data_format == 'channels_last' else 2
    inputs = sequence_masking(inputs, mask, '-inf', axis)
    return K.max(inputs, axis=axis)
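
# For reference, a minimal sketch of what sequence_masking is assumed to do
# in these layers (the real helper lives in bert4keras.backend and may differ
# in details): replace masked-out positions of x along a non-negative `axis`
# with `value`, where '-inf' stands for a large negative number so that a
# following softmax or max ignores padding.
import numpy as np

def sequence_masking_sketch(x, mask, value=0, axis=1):
    if mask is None:
        return x
    if value == '-inf':
        value = -1e12
    mask = np.asarray(mask, dtype=x.dtype)  # (batch, seq_len), 1 = keep
    for _ in range(axis - 1):               # align seq_len with `axis`
        mask = np.expand_dims(mask, 1)
    while mask.ndim < x.ndim:               # broadcast over trailing dims
        mask = np.expand_dims(mask, -1)
    return x * mask + value * (1 - mask)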
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False): """实现多头注意力 q_mask: 对输入的query序列的mask。 主要是将输出结果的padding部分置0。 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息。 a_mask: 对attention矩阵的mask。 不同的attention mask对应不同的应用。 """ # 处理mask inputs = inputs[:] for i, mask in enumerate([q_mask, v_mask, a_mask]): if not mask: inputs.insert(3 + i, None) q, k, v, q_mask, v_mask = inputs[:5] if len(inputs) == 5: a_mask = 'history_only' elif len(inputs) == 6: a_mask = inputs[-1] else: raise ValueError('wrong inputs for MultiHeadAttention.') # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 相对位置编码 if self.max_relative_position is not None: q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs pos_ids = K.clip(pos_ids, -self.max_relative_position, self.max_relative_position) pos_ids = pos_ids + self.max_relative_position pos_embeddings = K.gather(self.relative_embeddings, pos_ids) a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings) # Attention(续) a = a / self.key_size**0.5 a = sequence_masking(a, v_mask, 1, -1) if a_mask is not None: if is_string(a_mask): ones = K.ones_like(a[:1, :1]) a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12 a = a - a_mask else: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) if self.max_relative_position is not None: o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings) o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim)) o = self.o_dense(o) o = sequence_masking(o, q_mask, 0) return o
def call(self, inputs, q_mask=None, v_mask=None, a_mask=None): """实现多头注意力 q_mask: 对输入的query序列的mask。 主要是将输出结果的padding部分置0。 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息。 a_mask: 对attention矩阵的mask。 不同的attention mask对应不同的应用。 """ q, k, v = inputs[:3] if a_mask: if len(inputs) == 3: a_mask = 'history_only' else: a_mask = inputs[3] if q_mask is not None: if not hasattr(self, 'q_mask_layer'): self.q_mask_layer = search_layer(q, q_mask) q_mask = self.q_mask_layer.output_mask if v_mask is not None: if not hasattr(self, 'v_mask_layer'): self.v_mask_layer = search_layer(v, v_mask) v_mask = self.v_mask_layer.output_mask # Pooling if self.pool_size > 1: is_self_attention = (q is k is v) q_in_len = K.shape(q)[1] q = sequence_masking(q, q_mask, 0) q = divisible_temporal_padding(q, self.pool_size) q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg') if is_self_attention: k = v = q else: k = sequence_masking(k, v_mask, 0) k = divisible_temporal_padding(k, self.pool_size) k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg') v = sequence_masking(v, v_mask, 0) v = divisible_temporal_padding(v, self.pool_size) v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg') if v_mask is not None: v_mask = v_mask[:, ::self.pool_size] if a_mask is not None and not is_string(a_mask): a_mask = a_mask[..., ::self.pool_size, ::self.pool_size] # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 相对位置编码 if self.max_relative_position is not None: q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs pos_ids = K.clip(pos_ids, -self.max_relative_position, self.max_relative_position) pos_ids = pos_ids + self.max_relative_position pos_embeddings = K.gather(self.relative_embeddings, pos_ids) a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings) # Attention(续) a = a / self.key_size**0.5 a = sequence_masking(a, v_mask, 1, -1) if a_mask is not None: if is_string(a_mask): ones = K.ones_like(a[:1, :1]) a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12 a = a - a_mask else: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) if self.max_relative_position is not None: o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings) o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim)) o = self.o_dense(o) # 恢复长度 if self.pool_size > 1: o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len] # 返回结果 o = sequence_masking(o, q_mask, 0) return o