def _resource_apply(self, grad, var, indices=None): # 准备变量 var_dtype = var.dtype.base_dtype lr_t = self._decayed_lr(var_dtype) m = self.get_slot(var, 'm') v = self.get_slot(var, 'v') beta_1_t = self._get_hyper('beta_1', var_dtype) beta_2_t = self._get_hyper('beta_2', var_dtype) epsilon_t = K.cast(self.epsilon, var_dtype) local_step = K.cast(self.iterations + 1, var_dtype) beta_1_t_power = K.pow(beta_1_t, local_step) beta_2_t_power = K.pow(beta_2_t, local_step) # 更新公式 if indices is None: m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad) v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2) else: mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)] with tf.control_dependencies(mv_ops): m_t = self._resource_scatter_add(m, indices, (1 - beta_1_t) * grad) v_t = self._resource_scatter_add(v, indices, (1 - beta_2_t) * grad**2) # 返回算子 with tf.control_dependencies([m_t, v_t]): if self.bias_correction: m_t = m_t / (1.0 - beta_1_t_power) v_t = v_t / (1.0 - beta_2_t_power) var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) return K.update(var, var_t)
def _resource_apply(self, grad, var, indices=None): # 更新判据 cond = K.equal(self.iterations % self.grad_accum_steps, 0) # 获取梯度 ag = self.get_slot(var, 'ag') old_update = K.update def new_update(x, new_x): new_x = K.switch(cond, new_x, x) return old_update(x, new_x) K.update = new_update ag_t = ag / self.grad_accum_steps op = super(NewOptimizer, self)._resource_apply(ag_t, var) K.update = old_update # 累积梯度 with tf.control_dependencies([op]): ag_t = K.switch(cond, K.zeros_like(ag), ag) with tf.control_dependencies([K.update(ag, ag_t)]): if indices is None: ag_t = K.update(ag, ag + grad) else: ag_t = self._resource_scatter_add(ag, indices, grad) return ag_t
def build(self, input_shape): super(ConditionalRandomField, self).build(input_shape) output_dim = input_shape[-1] self._trans = self.add_weight(name='trans', shape=(output_dim, output_dim), initializer='glorot_uniform', trainable=True) if self.lr_multiplier != 1: K.set_value(self._trans, K.eval(self._trans) / self.lr_multiplier)
def sparse_loss(self, y_true, y_pred): """y_true需要是整数形式(非one hot) """ # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 转为one hot y_true = K.one_hot(y_true, K.shape(self.trans)[0]) return self.dense_loss(y_true, y_pred)
def compute_mask(self, inputs, mask=None): if self.conditional: masks = [K.expand_dims(m, 0) for m in mask if m is not None] if len(masks) == 0: return None else: return K.all(K.concatenate(masks, axis=0), axis=0) else: return mask
def get_gradients(self, loss, params): grads = [] for g in super(NewOptimizer, self).get_gradients(loss, params): if isinstance(g, tf.IndexedSlices): g = tf.convert_to_tensor(g) if K.ndim(g) > 1: g = g - K.mean(g, axis=range(1, K.ndim(g)), keepdims=True) grads.append(g) return grads
def call(self, inputs, mode='embedding'): """新增mode参数,可以为embedding或dense。如果为embedding, 则等价于普通Embedding层;如果为dense,则等价于无bias的Dense层。 """ self._current_mode = mode if mode == 'embedding': return super(Embedding, self).call(inputs) else: kernel = K.transpose(self.embeddings) return K.dot(inputs, kernel)
def build(self, input_shape): super(MaximumEntropyMarkovModel, self).build(input_shape) output_dim = input_shape[-1] if self.hidden_dim is None: self._trans = self.add_weight(name='trans', shape=(output_dim, output_dim), initializer='glorot_uniform', trainable=True) if self.lr_multiplier != 1: K.set_value(self._trans, K.eval(self._trans) / self.lr_multiplier) else: self._l_trans = self.add_weight(name='l_trans', shape=(output_dim, self.hidden_dim), initializer='glorot_uniform', trainable=True) self._r_trans = self.add_weight(name='r_trans', shape=(output_dim, self.hidden_dim), initializer='glorot_uniform', trainable=True) if self.lr_multiplier != 1: K.set_value(self._l_trans, K.eval(self._l_trans) / self.lr_multiplier) K.set_value(self._r_trans, K.eval(self._r_trans) / self.lr_multiplier)
def compute_mask(self, inputs, mask=None): """为了适配T5,保证第一个token不被mask """ if self._current_mode == 'embedding': mask = super(Embedding, self).compute_mask(inputs, mask) if mask is not None: mask1 = K.ones_like(mask[:, :1], dtype='bool') mask2 = mask[:, 1:] return K.concatenate([mask1, mask2], 1) else: return mask
def log_norm_step(self, inputs, states): """递归计算归一化因子 要点:1、递归计算;2、用logsumexp避免溢出。 """ inputs, mask = inputs[:, :-1], inputs[:, -1:] states = K.expand_dims(states[0], 2) # (batch_size, output_dim, 1) trans = K.expand_dims(self.trans, 0) # (1, output_dim, output_dim) outputs = tf.reduce_logsumexp(states + trans, 1) # (batch_size, output_dim) outputs = outputs + inputs outputs = mask * outputs + (1 - mask) * states[:, :, 0] return outputs, [outputs]
def new_update(x, new_x): if x is var and self._do_lazy_optimization(x): if indices is None: r = K.any(K.not_equal(grad, 0.0), axis=-1, keepdims=True) new_x = x + (new_x - x) * K.cast(r, K.floatx()) return old_update(x, new_x) else: return self._resource_scatter_add( x, indices, K.gather(new_x - x, indices)) return old_update(x, new_x)
def apply_ema_weights(self, bias_correction=True): """备份原模型权重,然后将平均权重应用到模型上去。 """ self.old_weights = K.batch_get_value(self.model_weights) ema_weights = K.batch_get_value(self.ema_weights) if bias_correction: iterations = K.eval(self.iterations) scale = 1.0 - np.power(self.ema_momentum, iterations) ema_weights = [weight / scale for weight in ema_weights] K.batch_set_value(zip(self.model_weights, ema_weights))
def new_update(x, new_x): if is_one_of(x, params) and self._do_layer_adaptation(x): dx = new_x - x lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10) x_norm = tf.norm(x) g_norm = tf.norm(dx / lr_t) ratio = K.switch( x_norm > 0.0, K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0), 1.0) new_x = x + dx * ratio return old_update(x, new_x)
def new_update(x, new_x): if x is var and self._do_layer_adaptation(x): dx = new_x - x lr_t = self._decayed_lr(x.dtype.base_dtype) lr_t = K.clip(lr_t, K.epsilon(), 1e10) x_norm = tf.norm(x) g_norm = tf.norm(dx / lr_t) ratio = K.switch( x_norm > 0.0, K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0), 1.0) new_x = x + dx * ratio return old_update(x, new_x)
def get_updates(self, loss, params): updates = super(NewOptimizer, self).get_updates(loss, params) self.model_weights = params self.ema_weights = [K.zeros(K.shape(w)) for w in params] self.old_weights = K.batch_get_value(params) ema_updates, ema_momentum = [], self.ema_momentum with tf.control_dependencies(updates): for w1, w2 in zip(self.ema_weights, params): new_w = ema_momentum * w1 + (1 - ema_momentum) * w2 ema_updates.append(K.update(w1, new_w)) return ema_updates
def _resource_apply(self, grad, var, indices=None): op = super(NewOptimizer, self)._resource_apply(grad, var, indices) k, alpha = self.steps_per_slow_update, self.slow_step_size cond = K.equal(self.iterations % k, 0) slow_var = self.get_slot(var, 'slow_var') slow_var_t = slow_var + alpha * (var - slow_var) with tf.control_dependencies([op]): slow_update = K.update(slow_var, K.switch(cond, slow_var_t, slow_var)) with tf.control_dependencies([slow_update]): copy_update = K.update(var, K.switch(cond, slow_var, var)) return copy_update
def _resource_apply_dense(self, grad, var): op = super(NewOptimizer, self)._resource_apply_dense(grad, var) ema = self.get_slot(var, 'ema') ema_momentum = self.ema_momentum with tf.control_dependencies([op]): return K.update( ema, ema * ema_momentum + var * (1.0 - ema_momentum))
def call(self, inputs): """如果是条件Layer Norm,则默认以list为输入,第二个是condition """ if self.conditional: inputs, cond = inputs if self.hidden_units is not None: cond = self.hidden_dense(cond) for _ in range(K.ndim(inputs) - K.ndim(cond)): cond = K.expand_dims(cond, 1) if self.center: beta = self.beta_dense(cond) + self.beta if self.scale: gamma = self.gamma_dense(cond) + self.gamma else: if self.center: beta = self.beta if self.scale: gamma = self.gamma outputs = inputs if self.center: mean = K.mean(outputs, axis=-1, keepdims=True) outputs = outputs - mean if self.scale: variance = K.mean(K.square(outputs), axis=-1, keepdims=True) std = K.sqrt(variance + self.epsilon) outputs = outputs / std outputs = outputs * gamma if self.center: outputs = outputs + beta return outputs
def get_updates(self, loss, params): # 更新判据 cond = K.equal(self.iterations % self.grad_accum_steps, 0) cond = K.cast(cond, K.floatx()) # 获取梯度 grads = self.get_gradients(loss, params) self.accum_grads = [ K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i) for i, p in enumerate(params) ] old_update = K.update def new_update(x, new_x): new_x = cond * new_x + (1 - cond) * x return old_update(x, new_x) K.update = new_update updates = super(NewOptimizer, self).get_updates(loss, params) K.update = old_update # 累积梯度 with tf.control_dependencies(updates): accum_updates = [ K.update(ag, g + (1 - cond) * ag) for g, ag in zip(grads, self.accum_grads) ] return accum_updates
def _create_slots(self, var_list): for var in var_list: if self.beta1 > 0.0: self.add_slot(var, 'm') shape = K.int_shape(var) factored_shape = self.factored_shape(shape) if factored_shape is None: self.add_slot(var, 'v') else: shape1, axis1, shape2, axis2 = factored_shape value1, value2 = np.zeros(shape1), np.zeros(shape2) self.add_slot(var, 'vr', value1) self.add_slot(var, 'vc', value2)
def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-6, bias_correction=True, **kwargs): kwargs['name'] = kwargs.get('name') or 'Adam' super(Adam, self).__init__(**kwargs) self._set_hyper('learning_rate', learning_rate) self._set_hyper('beta_1', beta_1) self._set_hyper('beta_2', beta_2) self.epsilon = epsilon or K.epislon() self.bias_correction = bias_correction
def compute_position_ids(self, inputs): q, v = inputs # 计算位置差 q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs # 后处理操作 max_position = (self.input_dim - 1) // 2 pos_ids = K.clip(pos_ids, -max_position, max_position) pos_ids = pos_ids + max_position return pos_ids
def learning_rate(self): if self._learning_rate is None: iterations = K.cast(self.iterations + 1, K.floatx()) learning_rate = K.minimum(1.0 / K.sqrt(iterations), 0.01) if self.multiply_by_parameter_scale: return learning_rate else: return learning_rate * 0.05 else: if not hasattr(self, '__learning_rate'): with K.name_scope(self.__class__.__name__): self.__learning_rate = K.variable(self._learning_rate, name='learning_rate') return self.__learning_rate
def get_updates(self, loss, params): updates = super(NewOptimizer, self).get_updates(loss, params) k, alpha = self.steps_per_slow_update, self.slow_step_size cond = K.equal(self.iterations % k, 0) slow_vars = [ K.zeros(K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i) for i, p in enumerate(params) ] with tf.control_dependencies(updates): slow_updates = [ K.update(q, K.switch(cond, q + alpha * (p - q), q)) for p, q in zip(params, slow_vars) ] with tf.control_dependencies(slow_updates): copy_updates = [ K.update(p, K.switch(cond, q, p)) for p, q in zip(params, slow_vars) ] return copy_updates
def dense_loss(self, y_true, y_pred): """y_true需要是one hot形式 """ # 导出mask并转换数据类型 mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True) mask = K.cast(mask, K.floatx()) # 计算目标分数 y_true, y_pred = y_true * mask, y_pred * mask target_score = self.target_score(y_true, y_pred) # 递归计算log Z init_states = [y_pred[:, 0]] y_pred = K.concatenate([y_pred, mask], axis=2) input_length = K.int_shape(y_pred[:, 1:])[1] log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states, input_length=input_length) # 最后一步的log Z向量 log_norm = tf.reduce_logsumexp(log_norm, 1) # logsumexp得标量 # 计算损失 -log p return log_norm - target_score
def call(self, inputs): """如果custom_position_ids,那么第二个输入为自定义的位置id """ if self.custom_position_ids: inputs, position_ids = inputs if K.dtype(position_ids) != 'int32': position_ids = K.cast(position_ids, 'int32') pos_embeddings = K.gather(self.embeddings, position_ids) else: input_shape = K.shape(inputs) batch_size, seq_len = input_shape[0], input_shape[1] pos_embeddings = self.embeddings[:seq_len] pos_embeddings = K.expand_dims(pos_embeddings, 0) if self.merge_mode != 'add': pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1]) if self.merge_mode == 'add': return inputs + pos_embeddings else: return K.concatenate([inputs, pos_embeddings])
def reset_old_weights(self): """恢复模型到旧权重。 """ K.batch_set_value(zip(self.model_weights, self.old_weights))
def apply_ema_weights(self): """备份原模型权重,然后将平均权重应用到模型上去。 """ self.old_weights = K.batch_get_value(self.model_weights) ema_weights = K.batch_get_value(self.ema_weights) K.batch_set_value(zip(self.model_weights, ema_weights))
def new_update(x, new_x): if is_one_of(x, params) and self._do_lazy_optimization(x): g = self.grads[x] r = K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True) new_x = x + (new_x - x) * K.cast(r, K.floatx()) return old_update(x, new_x)
def new_update(x, new_x): new_x = K.switch(cond, new_x, x) return old_update(x, new_x)