def _resource_apply_op(self, grad, var, indices=None): # 更新判据 cond = K.equal(self.iterations % self.grad_accum_steps, 0) # 获取梯度 ag = self.get_slot(var, 'ag') old_update = K.update def new_update(x, new_x): new_x = K.switch(cond, new_x, x) return old_update(x, new_x) K.update = new_update ag_t = ag / self.grad_accum_steps op = super(new_optimizer, self)._resource_apply_op(ag_t, var) K.update = old_update # 累积梯度 with tf.control_dependencies([op]): ag_t = K.switch(cond, K.zeros_like(ag), ag) with tf.control_dependencies([K.update(ag, ag_t)]): if indices is None: ag_t = K.update(ag, ag + grad) else: ag_t = self._resource_scatter_add(ag, indices, grad) return ag_t
def _resource_apply_op(self, grad, var, indices=None): # 准备变量 var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') beta_1_t = self._get_hyper('beta_1', var_dtype) beta_2_t = self._get_hyper('beta_2', var_dtype) epsilon_t = K.cast(self.epsilon, var_dtype) local_step = K.cast(self.iterations + 1, var_dtype) beta_1_t_power = K.pow(beta_1_t, local_step) beta_2_t_power = K.pow(beta_2_t, local_step) # 更新公式 if indices is None: m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad) v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2) else: mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)] with tf.control_dependencies(mv_ops): m_t = self._resource_scatter_add(m, indices, (1 - beta_1_t) * grad) v_t = self._resource_scatter_add(v, indices, (1 - beta_2_t) * grad**2) # 返回算子 with tf.control_dependencies([m_t, v_t]): if self.bias_correction: m_t = m_t / (1. - beta_1_t_power) v_t = v_t / (1. - beta_2_t_power) var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) return K.update(var, var_t)
def sparse_loss(self, y_true, y_pred): """y_true需要是整数形式(非one hot) """ # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 转为one hot y_true = K.one_hot(y_true, K.shape(self.trans)[0]) return self.dense_loss(y_true, y_pred)
def build(self, input_shape): output_dim = input_shape[-1] if not isinstance(output_dim, int): output_dim = output_dim.value if self.hidden_dim is None: self.trans = self.add_weight(name='trans', shape=(output_dim, output_dim), initializer='glorot_uniform', trainable=True) if self.lr_multiplier != 1: K.set_value(self.trans, K.eval(self.trans) / self.lr_multiplier) self.trans = self.lr_multiplier * self.trans else: self.l_trans = self.add_weight(name='l_trans', shape=(output_dim, self.hidden_dim), initializer='glorot_uniform', trainable=True) self.r_trans = self.add_weight(name='r_trans', shape=(output_dim, self.hidden_dim), initializer='glorot_uniform', trainable=True) if self.lr_multiplier != 1: K.set_value(self.l_trans, K.eval(self.l_trans) / self.lr_multiplier) self.l_trans = self.lr_multiplier * self.l_trans K.set_value(self.r_trans, K.eval(self.r_trans) / self.lr_multiplier) self.r_trans = self.lr_multiplier * self.r_trans
def call(self, inputs): input_shape = K.shape(inputs) batch_size, seq_len = input_shape[0], input_shape[1] pos_embeddings = self.embeddings[:seq_len] pos_embeddings = K.expand_dims(pos_embeddings, 0) if self.merge_mode == 'add': return inputs + pos_embeddings else: pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1]) return K.concatenate([inputs, pos_embeddings])
def new_update(x, new_x): if is_one_of(x, params) and self._do_layer_adaptation(x): dx = new_x - x lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10) x_norm = tf.norm(x) g_norm = tf.norm(dx / lr_t) ratio = K.switch( x_norm > 0., K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.), 1.) new_x = x + dx * ratio return old_update(x, new_x)
def compile(self): # 交叉熵作为loss,并mask掉输入部分的预测 y_true = self.model.input[0][:, 1:] # 目标tokens y_mask = self.model.input[1][:, 1:] # 目标mask y_mask = K.cast(y_mask, K.floatx()) # 转为浮点型 y_pred = self.model.output[:, :-1] # 预测tokens,预测与目标错开一位 cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred) cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask) self.model.add_loss(cross_entropy) opt = extend_with_gradient_accumulation(Adam)(learning_rate=0.000015, grad_accum_steps=2) self.model.compile(optimizer=opt)
def new_update(x, new_x): if x is var and self._do_lazy_optimization(x): if indices is None: r = K.any(K.not_equal(grad, 0.), axis=-1, keepdims=True) new_x = x + (new_x - x) * K.cast(r, K.floatx()) return old_update(x, new_x) else: return self._resource_scatter_add( x, indices, K.gather(new_x - x, indices)) return old_update(x, new_x)
def log_norm_step(self, inputs, states): """递归计算归一化因子 要点:1、递归计算;2、用logsumexp避免溢出。 """ inputs, mask = inputs[:, :-1], inputs[:, -1:] states = K.expand_dims(states[0], 2) # (batch_size, output_dim, 1) trans = K.expand_dims(self.trans, 0) # (1, output_dim, output_dim) outputs = tf.reduce_logsumexp(states + trans, 1) # (batch_size, output_dim) outputs = outputs + inputs outputs = mask * outputs + (1 - mask) * states[:, :, 0] return outputs, [outputs]
def new_update(x, new_x): if x is var and self._do_layer_adaptation(x): dx = new_x - x lr_t = self._decayed_lr(x.dtype.base_dtype) lr_t = K.clip(lr_t, K.epsilon(), 1e10) x_norm = tf.norm(x) g_norm = tf.norm(dx / lr_t) ratio = K.switch( x_norm > 0., K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.), 1.) new_x = x + dx * ratio return old_update(x, new_x)
def masked_cross_entropy(y_true, y_pred): """交叉熵作为loss,并mask掉padding部分的预测 """ y_true = K.reshape(y_true, [K.shape(y_true)[0], -1]) y_mask = K.cast(K.not_equal(y_true, 0), K.floatx()) cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred) cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask) return cross_entropy
def __init__(self, center=True, scale=True, conditional=False, hidden_units=None, hidden_activation='linear', hidden_initializer='glorot_uniform', **kwargs): super(LayerNormalization, self).__init__(**kwargs) self.center = center self.scale = scale self.conditional = conditional self.hidden_units = hidden_units self.hidden_activation = activations.get(hidden_activation) self.hidden_initializer = initializers.get(hidden_initializer) self.epsilon = K.epsilon() * K.epsilon()
def _resource_apply_op(self, grad, var, indices=None): op = super(new_optimizer, self)._resource_apply_op(grad, var, indices) k, alpha = self.steps_per_slow_update, self.slow_step_size cond = K.equal(self.iterations % k, 0) slow_var = self.get_slot(var, 'slow_var') slow_var_t = slow_var + alpha * (var - slow_var) with tf.control_dependencies([op]): slow_update = K.update(slow_var, K.switch(cond, slow_var_t, slow_var)) with tf.control_dependencies([slow_update]): copy_update = K.update(var, K.switch(cond, slow_var, var)) return copy_update
def call(self, inputs): """如果是条件Layer Norm,则默认以list为输入,第二个是condition """ if self.conditional: inputs, cond = inputs if self.hidden_units is not None: cond = self.hidden_dense(cond) for _ in range(K.ndim(inputs) - K.ndim(cond)): cond = K.expand_dims(cond, 1) if self.center: beta = self.beta_dense(cond) + self.beta if self.scale: gamma = self.gamma_dense(cond) + self.gamma else: if self.center: beta = self.beta if self.scale: gamma = self.gamma outputs = inputs if self.center: mean = K.mean(outputs, axis=-1, keepdims=True) outputs = outputs - mean if self.scale: variance = K.mean(K.square(outputs), axis=-1, keepdims=True) std = K.sqrt(variance + self.epsilon) outputs = outputs / std outputs = outputs * gamma if self.center: outputs = outputs + beta return outputs
def get_updates(self, loss, params): # 更新判据 cond = K.equal(self.iterations % self.grad_accum_steps, 0) cond = K.cast(cond, K.floatx()) # 获取梯度 grads = self.get_gradients(loss, params) self.accum_grads = [ K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i) for i, p in enumerate(params) ] old_update = K.update def new_update(x, new_x): new_x = cond * new_x + (1 - cond) * x return old_update(x, new_x) K.update = new_update updates = super(new_optimizer, self).get_updates(loss, params) K.update = old_update # 累积梯度 with tf.control_dependencies(updates): accum_updates = [ K.update(ag, g + (1 - cond) * ag) for g, ag in zip(grads, self.accum_grads) ] return accum_updates
def call(self, inputs): if not hasattr(self, 'kernel'): embedding_layer = search_layer(inputs, self.embedding_name) if embedding_layer is None: raise Exception('Embedding layer not found') self.kernel = K.transpose(embedding_layer.embeddings) self.units = K.int_shape(self.kernel)[1] # <<< if not hasattr(self, 'bias') and self.use_bias: # <<< self.bias = self.add_weight(name='bias', shape=(self.units, ), initializer='zeros') outputs = K.dot(inputs, self.kernel) if self.use_bias: outputs = K.bias_add(outputs, self.bias) outputs = self.activation(outputs) return outputs
def train_function(inputs): # 重新定义训练函数 grads = embedding_gradients(inputs)[0] # Embedding梯度 delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8) # 计算扰动 K.set_value(embeddings, K.eval(embeddings) + delta) # 注入扰动 outputs = old_train_function(inputs) # 梯度下降 K.set_value(embeddings, K.eval(embeddings) - delta) # 删除扰动 return outputs
def adversarial_training(model, embedding_name, epsilon=1): """给模型添加对抗训练 其中model是需要添加对抗训练的keras模型,embedding_name 则是model里边Embedding层的名字。要在模型compile之后使用。 """ if model.train_function is None: # 如果还没有训练函数 model._make_train_function() # 手动make old_train_function = model.train_function # 备份旧的训练函数 # 查找Embedding层 for output in model.outputs: embedding_layer = search_layer(output, embedding_name) if embedding_layer is not None: break if embedding_layer is None: raise Exception('Embedding layer not found') # 求Embedding梯度 embeddings = embedding_layer.embeddings # Embedding矩阵 gradients = K.gradients(model.total_loss, [embeddings]) # Embedding梯度 gradients = K.zeros_like(embeddings) + gradients[0] # 转为dense tensor # 封装为函数 inputs = (model._feed_inputs + model._feed_targets + model._feed_sample_weights) # 所有输入层 embedding_gradients = K.function( inputs=inputs, outputs=[gradients], name='embedding_gradients', ) # 封装为函数 def train_function(inputs): # 重新定义训练函数 grads = embedding_gradients(inputs)[0] # Embedding梯度 delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8) # 计算扰动 K.set_value(embeddings, K.eval(embeddings) + delta) # 注入扰动 outputs = old_train_function(inputs) # 梯度下降 K.set_value(embeddings, K.eval(embeddings) - delta) # 删除扰动 return outputs model.train_function = train_function # 覆盖原训练函数
def compute_position_ids(self, inputs): q, v = inputs # 计算位置差 q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs # 后处理操作 max_position = (self.input_dim - 1) // 2 pos_ids = K.clip(pos_ids, -max_position, max_position) pos_ids = pos_ids + max_position return pos_ids
def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6, bias_correction=True, name='Adam', **kwargs): kwargs['name'] = name super(Adam, self).__init__(**kwargs) self._set_hyper('learning_rate', learning_rate) self._set_hyper('beta_1', beta_1) self._set_hyper('beta_2', beta_2) self.epsilon = epsilon or K.epislon() self.bias_correction = bias_correction
def set_model(self, model): """绑定模型,并初始化参数 """ super(ExponentialMovingAverage, self).set_model(model) self.ema_weights = [K.zeros(K.shape(w)) for w in model.weights] self.old_weights = K.batch_get_value(model.weights) K.batch_set_value(zip(self.ema_weights, self.old_weights)) self.updates = [] for w1, w2 in zip(self.ema_weights, model.weights): op = K.moving_average_update(w1, w2, self.momentum) self.updates.append(op)
def get_updates(self, loss, params): updates = super(new_optimizer, self).get_updates(loss, params) k, alpha = self.steps_per_slow_update, self.slow_step_size cond = K.equal(self.iterations % k, 0) slow_vars = [ K.zeros(K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i) for i, p in enumerate(params) ] with tf.control_dependencies(updates): slow_updates = [ K.update(q, K.switch(cond, q + alpha * (p - q), q)) for p, q in zip(params, slow_vars) ] with tf.control_dependencies(slow_updates): copy_updates = [ K.update(p, K.switch(cond, q, p)) for p, q in zip(params, slow_vars) ] return copy_updates
def dense_loss(self, y_true, y_pred): """y_true需要是one hot形式 """ # 导出mask并转换数据类型 if self.input_mask is None: mask = None else: mask = K.cast(self.input_mask, K.floatx()) # 计算目标分数 target_score = self.target_score(y_true, y_pred, mask) # 递归计算log Z init_states = [y_pred[:, 0]] if mask is None: mask = K.ones_like(y_pred[:, :, :1]) else: mask = K.expand_dims(mask, 2) y_pred = K.concatenate([y_pred, mask]) log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states) # 最后一步的log Z向量 log_norm = tf.reduce_logsumexp(log_norm, 1) # logsumexp得标量 # 计算损失 -log p return log_norm - target_score
def call(self, inputs, mask=None, a_mask=None, p_bias=None): """实现多头注意力 q_mask: 对输入的query序列的mask。 主要是将输出结果的padding部分置0。 v_mask: 对输入的value序列的mask。 主要是防止attention读取到padding信息。 a_mask: 对attention矩阵的mask。 不同的attention mask对应不同的应用。 p_bias: 在attention里的位置偏置。 一般用来指定相对位置编码的种类。 """ q, k, v = inputs[:3] q_mask, v_mask, n = None, None, 3 if mask is not None: if mask[0] is not None: q_mask = K.cast(mask[0], K.floatx()) if mask[2] is not None: v_mask = K.cast(mask[2], K.floatx()) if a_mask: if len(inputs) == 3: a_mask = 'history_only' else: a_mask = inputs[n] n += 1 # 线性变换 qw = self.q_dense(q) kw = self.k_dense(k) vw = self.v_dense(v) # 形状变换 qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size)) kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size)) vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size)) # Attention a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) # 处理位置编码 if p_bias == 'typical_relative': pos_embeddings = inputs[n] a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings) elif p_bias == 't5_relative': pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1)) a = a + K.expand_dims(pos_embeddings, 0) # Attention(续) if p_bias != 't5_relative': # T5不用缩放 a = a / self.key_size**0.5 a = sequence_masking(a, v_mask, 1, -1) if a_mask is not None: if is_string(a_mask): ones = K.ones_like(a[:1, :1]) a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12 a = a - a_mask else: a = a - (1 - a_mask) * 1e12 a = K.softmax(a) # 完成输出 o = tf.einsum('bhjk,bkhd->bjhd', a, vw) if p_bias == 'typical_relative': o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings) o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim)) o = self.o_dense(o) # 返回结果 o = sequence_masking(o, q_mask, 0) return o
def dense_accuracy(self, y_true, y_pred): """训练过程中显示逐帧准确率的函数,排除了mask的影响 此处y_true需要是one hot形式 """ y_true = K.argmax(y_true, 2) return self.sparse_accuracy(y_true, y_pred)
def basic_accuracy(self, y_true, y_pred, go_backwards=False): """训练过程中显示逐帧准确率的函数,排除了mask的影响 此处y_true需要是整数形式(非one hot) """ # 导出mask并转换数据类型 if self.input_mask is None: mask = None else: mask = K.cast(self.input_mask, K.floatx()) # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 反转相关 if self.hidden_dim is None: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) trans = K.transpose(self.trans) else: trans = self.trans histoty = K.gather(trans, y_true) else: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) r_trans, l_trans = self.l_trans, self.r_trans else: l_trans, r_trans = self.l_trans, self.r_trans histoty = K.gather(l_trans, y_true) histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans) # 计算逐标签accuracy histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1) y_pred = (y_pred + histoty) / 2 y_pred = K.cast(K.argmax(y_pred, 2), 'int32') isequal = K.cast(K.equal(y_true, y_pred), K.floatx()) if mask is None: return K.mean(isequal) else: return K.sum(isequal * mask) / K.sum(mask)
def dense_loss(self, y_true, y_pred): """y_true需要是one hot形式 """ y_true = K.argmax(y_true, 2) return self.sparse_loss(y_true, y_pred)
def basic_loss(self, y_true, y_pred, go_backwards=False): """y_true需要是整数形式(非one hot) """ # 导出mask并转换数据类型 if self.input_mask is None: mask = None else: mask = K.cast(self.input_mask, K.floatx()) # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 反转相关 if self.hidden_dim is None: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) trans = K.transpose(self.trans) else: trans = self.trans histoty = K.gather(trans, y_true) else: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) r_trans, l_trans = self.l_trans, self.r_trans else: l_trans, r_trans = self.l_trans, self.r_trans histoty = K.gather(l_trans, y_true) histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans) # 计算loss histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1) y_pred = (y_pred + histoty) / 2 loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) if mask is None: return K.mean(loss) else: return K.sum(loss * mask) / K.sum(mask)
def reverse_sequence(self, inputs, mask=None): if mask is None: return [x[:, ::-1] for x in inputs] else: length = K.cast(K.sum(mask, 1), 'int32') return [tf.reverse_sequence(x, length, seq_axis=1) for x in inputs]
def sparse_accuracy(self, y_true, y_pred): """训练过程中显示逐帧准确率的函数,排除了mask的影响 此处y_true需要是整数形式(非one hot) """ # 导出mask并转换数据类型 if self.input_mask is None: mask = None else: mask = K.cast(self.input_mask, K.floatx()) # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 逐标签取最大来粗略评测训练效果 y_pred = K.cast(K.argmax(y_pred, 2), 'int32') isequal = K.cast(K.equal(y_true, y_pred), K.floatx()) if mask is None: return K.mean(isequal) else: return K.sum(isequal * mask) / K.sum(mask)