Exemplo n.º 1
0
 def basic_accuracy(self, y_true, y_pred, go_backwards=False):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # 导出mask并转换数据类型
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # y_true需要重新明确一下shape和dtype
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # 反转相关
     if self.hidden_dim is None:
         if go_backwards:  # 是否反转序列
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             trans = K.transpose(self.trans)
         else:
             trans = self.trans
         histoty = K.gather(trans, y_true)
     else:
         if go_backwards:  # 是否反转序列
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             r_trans, l_trans = self.l_trans, self.r_trans
         else:
             l_trans, r_trans = self.l_trans, self.r_trans
         histoty = K.gather(l_trans, y_true)
         histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans)
     # 计算逐标签accuracy
     histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1)
     y_pred = (y_pred + histoty) / 2
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     return K.sum(isequal * mask) / K.sum(mask)
Exemplo n.º 2
0
 def call(self, inputs, mask=None, a_mask=None, p_bias=None):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     p_bias: 在attention里的位置偏置。
             一般用来指定相对位置编码的种类。
     """
     q, k, v = inputs[:3]
     q_mask, v_mask, n = None, None, 3
     if mask is not None:
         if mask[0] is not None:
             q_mask = K.cast(mask[0], K.floatx())
         if mask[2] is not None:
             v_mask = K.cast(mask[2], K.floatx())
     if a_mask:
         a_mask = inputs[n]
         n += 1
     # 线性变换
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # 形状变换
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # 处理位置编码
     if p_bias == 'typical_relative':
         pos_embeddings = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     elif p_bias == 't5_relative':
         pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(pos_embeddings, 0)
     # Attention(续)
     if self.attention_scale:
         a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # 完成输出
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # 返回结果
     o = sequence_masking(o, q_mask, 0)
     return o
Exemplo n.º 3
0
 def sparse_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # 导出mask并转换数据类型
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # y_true需要重新明确一下shape和dtype
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # 逐标签取最大来粗略评测训练效果
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     return K.sum(isequal * mask) / K.sum(mask)
Exemplo n.º 4
0
        def get_updates(self, loss, params):
            # 更新判据
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # 获取梯度
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_%s' % i) for i, p in enumerate(params)
            ]

            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(NewOptimizer, self).get_updates(loss, params)
            K.update = old_update

            # 累积梯度
            with tf.control_dependencies(updates):
                accum_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for g, ag in zip(grads, self.accum_grads)
                ]

            return accum_updates
Exemplo n.º 5
0
 def basic_loss(self, y_true, y_pred, go_backwards=False):
     """y_true需要是整数形式(非one hot)
     """
     # 导出mask并转换数据类型
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # y_true需要重新明确一下shape和dtype
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # 反转相关
     if self.hidden_dim is None:
         if go_backwards:  # 是否反转序列
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             trans = K.transpose(self.trans)
         else:
             trans = self.trans
         histoty = K.gather(trans, y_true)
     else:
         if go_backwards:  # 是否反转序列
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             r_trans, l_trans = self.l_trans, self.r_trans
         else:
             l_trans, r_trans = self.l_trans, self.r_trans
         histoty = K.gather(l_trans, y_true)
         histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans)
     # 计算loss
     histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1)
     y_pred = (y_pred + histoty) / 2
     loss = K.sparse_categorical_crossentropy(y_true,
                                              y_pred,
                                              from_logits=True)
     return K.sum(loss * mask) / K.sum(mask)
Exemplo n.º 6
0
 def compute_position_ids(self, inputs):
     """T5的相对位置分桶(直接翻译自官方T5源码)
     """
     q, v = inputs
     # 计算位置差
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # 后处理操作
     num_buckets, max_distance = self.input_dim, self.max_distance
     ret = 0
     n = -pos_ids
     if self.bidirectional:
         num_buckets //= 2
         ret += K.cast(K.less(n, 0), 'int32') * num_buckets
         n = K.abs(n)
     else:
         n = K.maximum(n, 0)
     # now n is in the range [0, inf)
     max_exact = num_buckets // 2
     is_small = K.less(n, max_exact)
     val_if_large = max_exact + K.cast(
         K.log(K.cast(n, K.floatx()) / max_exact) /
         np.log(max_distance / max_exact) * (num_buckets - max_exact),
         'int32',
     )
     val_if_large = K.minimum(val_if_large, num_buckets - 1)
     ret += K.switch(is_small, n, val_if_large)
     return ret
Exemplo n.º 7
0
 def new_update(x, new_x):
     if x is var and self._do_lazy_optimization(x):
         if indices is None:
             r = K.any(K.not_equal(grad, 0.0),
                       axis=-1,
                       keepdims=True)
             new_x = x + (new_x - x) * K.cast(r, K.floatx())
             return old_update(x, new_x)
         else:
             return self._resource_scatter_add(
                 x, indices, K.gather(new_x - x, indices))
     return old_update(x, new_x)
Exemplo n.º 8
0
 def learning_rate(self):
     if self._learning_rate is None:
         iterations = K.cast(self.iterations + 1, K.floatx())
         learning_rate = K.minimum(1.0 / K.sqrt(iterations), 0.01)
         if self.multiply_by_parameter_scale:
             return learning_rate
         else:
             return learning_rate * 0.05
     else:
         if not hasattr(self, '__learning_rate'):
             with K.name_scope(self.__class__.__name__):
                 self.__learning_rate = K.variable(self._learning_rate,
                                                   name='learning_rate')
         return self.__learning_rate
Exemplo n.º 9
0
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     # 导出mask并转换数据类型
     mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
     mask = K.cast(mask, K.floatx())
     # 计算目标分数
     y_true, y_pred = y_true * mask, y_pred * mask
     target_score = self.target_score(y_true, y_pred)
     # 递归计算log Z
     init_states = [y_pred[:, 0]]
     y_pred = K.concatenate([y_pred, mask], axis=2)
     input_length = K.int_shape(y_pred[:, 1:])[1]
     log_norm, _, _ = K.rnn(self.log_norm_step,
                            y_pred[:, 1:],
                            init_states,
                            input_length=input_length)  # 最后一步的log Z向量
     log_norm = tf.reduce_logsumexp(log_norm, 1)  # logsumexp得标量
     # 计算损失 -log p
     return log_norm - target_score
Exemplo n.º 10
0
 def new_update(x, new_x):
     if is_one_of(x, params) and self._do_lazy_optimization(x):
         g = self.grads[x]
         r = K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True)
         new_x = x + (new_x - x) * K.cast(r, K.floatx())
     return old_update(x, new_x)
Exemplo n.º 11
0
 def beta2(self):
     if self._beta2 is None:
         iterations = K.cast(self.iterations + 1, K.floatx())
         return 1.0 - K.pow(iterations, -0.8)
     else:
         return self._beta2
Exemplo n.º 12
0
    def call(self, inputs, mask=None):
        if mask is not None:
            mask = K.cast(mask, K.floatx())

        return sequence_masking(inputs, mask, 1, 1)