Example #1
    def _resource_apply(self, grad, var, indices=None):
        # Prepare variables
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = K.cast(self.epsilon, var_dtype)
        local_step = K.cast(self.iterations + 1, var_dtype)
        beta_1_t_power = K.pow(beta_1_t, local_step)
        beta_2_t_power = K.pow(beta_2_t, local_step)

        # Update formulas
        if indices is None:
            m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
            v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2)
        else:
            mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)]
            with tf.control_dependencies(mv_ops):
                m_t = self._resource_scatter_add(m, indices,
                                                 (1 - beta_1_t) * grad)
                v_t = self._resource_scatter_add(v, indices,
                                                 (1 - beta_2_t) * grad**2)

        # Return the update op
        with tf.control_dependencies([m_t, v_t]):
            if self.bias_correction:
                m_t = m_t / (1.0 - beta_1_t_power)
                v_t = v_t / (1.0 - beta_2_t_power)
            var_t = var - lr_t * m_t / (K.sqrt(v_t) + epsilon_t)  # epsilon cast to var dtype
            return K.update(var, var_t)
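For reference, the update implemented above is the standard bias-corrected Adam step. A minimal NumPy sketch of the same arithmetic (all names here are illustrative, not part of the class above):

import numpy as np

def adam_step(var, grad, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, eps=1e-6):
    # Exponential moving averages of the gradient and its square
    m = beta_1 * m + (1 - beta_1) * grad
    v = beta_2 * v + (1 - beta_2) * grad**2
    # Bias correction for the zero-initialized moments (the power terms above)
    m_hat = m / (1 - beta_1**t)
    v_hat = v / (1 - beta_2**t)
    # Parameter update
    var = var - lr * m_hat / (np.sqrt(v_hat) + eps)
    return var, m, v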
Example #2
        def _resource_apply(self, grad, var, indices=None):
            # Update condition
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Fetch the accumulated-gradient slot
            ag = self.get_slot(var, 'ag')

            old_update = K.update

            def new_update(x, new_x):
                new_x = K.switch(cond, new_x, x)
                return old_update(x, new_x)

            K.update = new_update
            ag_t = ag / self.grad_accum_steps
            op = super(NewOptimizer, self)._resource_apply(ag_t, var)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies([op]):
                ag_t = K.switch(cond, K.zeros_like(ag), ag)
                with tf.control_dependencies([K.update(ag, ag_t)]):
                    if indices is None:
                        ag_t = K.update(ag, ag + grad)
                    else:
                        ag_t = self._resource_scatter_add(ag, indices, grad)

            return ag_t
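The K.update monkey-patch above gates every write the parent optimizer makes, so parameters only move once every grad_accum_steps iterations; in between, only the accumulator slot changes. A plain NumPy sketch of that schedule (function and variable names are illustrative):

import numpy as np

def sgd_with_accumulation(var, grads, accum_steps, lr=0.1):
    ag = np.zeros_like(var)                    # the 'ag' slot
    for step, grad in enumerate(grads, start=1):
        ag = ag + grad                         # accumulate every step
        if step % accum_steps == 0:            # the 'cond' above
            var = var - lr * ag / accum_steps  # apply the averaged gradient
            ag = np.zeros_like(ag)             # reset the accumulator
    return var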
Example #3
File: layers.py Project: ForestLee/nn_qa
 def build(self, input_shape):
     super(ConditionalRandomField, self).build(input_shape)
     output_dim = input_shape[-1]
     self._trans = self.add_weight(name='trans',
                                   shape=(output_dim, output_dim),
                                   initializer='glorot_uniform',
                                   trainable=True)
     if self.lr_multiplier != 1:
         K.set_value(self._trans, K.eval(self._trans) / self.lr_multiplier)
Example #4
File: layers.py Project: ForestLee/nn_qa
 def sparse_loss(self, y_true, y_pred):
     """y_true需要是整数形式(非one hot)
     """
     # Re-establish the shape and dtype of y_true
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Convert to one-hot
     y_true = K.one_hot(y_true, K.shape(self.trans)[0])
     return self.dense_loss(y_true, y_pred)
Example #5
File: layers.py Project: ForestLee/nn_qa
 def compute_mask(self, inputs, mask=None):
     if self.conditional:
         masks = [K.expand_dims(m, 0) for m in mask if m is not None]
         if len(masks) == 0:
             return None
         else:
             return K.all(K.concatenate(masks, axis=0), axis=0)
     else:
         return mask
Example #6
        def get_gradients(self, loss, params):
            grads = []
            for g in super(NewOptimizer, self).get_gradients(loss, params):
                if isinstance(g, tf.IndexedSlices):
                    g = tf.convert_to_tensor(g)
                if K.ndim(g) > 1:
                    g = g - K.mean(g, axis=list(range(1, K.ndim(g))), keepdims=True)
                grads.append(g)

            return grads
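This is the gradient-centralization trick: every gradient of rank > 1 has its mean over all non-first axes subtracted. A quick NumPy check of the same operation:

import numpy as np

g = np.arange(12, dtype=np.float32).reshape(3, 4)  # e.g. a kernel gradient
centered = g - g.mean(axis=tuple(range(1, g.ndim)), keepdims=True)
print(centered.mean(axis=1))  # each row now averages to 0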
Example #7
File: layers.py Project: ForestLee/nn_qa
 def call(self, inputs, mode='embedding'):
     """新增mode参数,可以为embedding或dense。如果为embedding,
     则等价于普通Embedding层;如果为dense,则等价于无bias的Dense层。
     """
     self._current_mode = mode
     if mode == 'embedding':
         return super(Embedding, self).call(inputs)
     else:
         kernel = K.transpose(self.embeddings)
         return K.dot(inputs, kernel)
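The 'dense' mode is what enables weight tying: the same matrix serves as the input embedding and as the transposed, bias-free output projection. The tied-weight idea in NumPy, with illustrative sizes:

import numpy as np

E = np.random.randn(100, 16)   # shared embedding matrix (vocab, hidden)
ids = np.array([3, 7, 42])
h = E[ids]                     # 'embedding' mode: lookup -> (3, 16)
logits = h @ E.T               # 'dense' mode: bias-free projection -> (3, 100)
print(logits.shape)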
Example #8
File: layers.py Project: ForestLee/nn_qa
    def build(self, input_shape):
        super(MaximumEntropyMarkovModel, self).build(input_shape)
        output_dim = input_shape[-1]

        if self.hidden_dim is None:
            self._trans = self.add_weight(name='trans',
                                          shape=(output_dim, output_dim),
                                          initializer='glorot_uniform',
                                          trainable=True)
            if self.lr_multiplier != 1:
                K.set_value(self._trans,
                            K.eval(self._trans) / self.lr_multiplier)
        else:
            self._l_trans = self.add_weight(name='l_trans',
                                            shape=(output_dim,
                                                   self.hidden_dim),
                                            initializer='glorot_uniform',
                                            trainable=True)
            self._r_trans = self.add_weight(name='r_trans',
                                            shape=(output_dim,
                                                   self.hidden_dim),
                                            initializer='glorot_uniform',
                                            trainable=True)

            if self.lr_multiplier != 1:
                K.set_value(self._l_trans,
                            K.eval(self._l_trans) / self.lr_multiplier)
                K.set_value(self._r_trans,
                            K.eval(self._r_trans) / self.lr_multiplier)
Example #9
File: layers.py Project: ForestLee/nn_qa
 def compute_mask(self, inputs, mask=None):
     """为了适配T5,保证第一个token不被mask
     """
     if self._current_mode == 'embedding':
         mask = super(Embedding, self).compute_mask(inputs, mask)
         if mask is not None:
             mask1 = K.ones_like(mask[:, :1], dtype='bool')
             mask2 = mask[:, 1:]
             return K.concatenate([mask1, mask2], 1)
     else:
         return mask
Example #10
File: layers.py Project: ForestLee/nn_qa
 def log_norm_step(self, inputs, states):
     """递归计算归一化因子
     要点:1、递归计算;2、用logsumexp避免溢出。
     """
     inputs, mask = inputs[:, :-1], inputs[:, -1:]
     states = K.expand_dims(states[0], 2)  # (batch_size, output_dim, 1)
     trans = K.expand_dims(self.trans, 0)  # (1, output_dim, output_dim)
     outputs = tf.reduce_logsumexp(states + trans,
                                   1)  # (batch_size, output_dim)
     outputs = outputs + inputs
     outputs = mask * outputs + (1 - mask) * states[:, :, 0]
     return outputs, [outputs]
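Each call is one step of the CRF forward algorithm in log space. The same step in NumPy, using scipy's logsumexp (shapes follow the comments above; values are illustrative):

import numpy as np
from scipy.special import logsumexp

batch, k = 2, 4                        # k = output_dim (number of tags)
states = np.random.randn(batch, k)     # log alpha from the previous step
trans = np.random.randn(k, k)          # transition score matrix
emit = np.random.randn(batch, k)       # emission scores at this step

# new_states[b, j] = logsumexp_i(states[b, i] + trans[i, j]) + emit[b, j]
new_states = logsumexp(states[:, :, None] + trans[None], axis=1) + emit
print(new_states.shape)                # (2, 4)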
Example #11
 def new_update(x, new_x):
     if x is var and self._do_lazy_optimization(x):
         if indices is None:
             r = K.any(K.not_equal(grad, 0.0),
                       axis=-1,
                       keepdims=True)
             new_x = x + (new_x - x) * K.cast(r, K.floatx())
             return old_update(x, new_x)
         else:
             return self._resource_scatter_add(
                 x, indices, K.gather(new_x - x, indices))
     return old_update(x, new_x)
Example #12
        def apply_ema_weights(self, bias_correction=True):
            """备份原模型权重,然后将平均权重应用到模型上去。
            """
            self.old_weights = K.batch_get_value(self.model_weights)
            ema_weights = K.batch_get_value(self.ema_weights)

            if bias_correction:
                iterations = K.eval(self.iterations)
                scale = 1.0 - np.power(self.ema_momentum, iterations)
                ema_weights = [weight / scale for weight in ema_weights]

            K.batch_set_value(zip(self.model_weights, ema_weights))
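Together with reset_old_weights (Example #27), this is typically wrapped around evaluation: swap in the averaged weights, evaluate, then restore the live weights. A hedged usage sketch, assuming optimizer and model objects with the methods shown in these examples:

optimizer.apply_ema_weights(bias_correction=True)  # back up and load EMA weights
val_metrics = model.evaluate(x_val, y_val)         # evaluate with averaged weights
optimizer.reset_old_weights()                      # restore the training weights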
Example #13
 def new_update(x, new_x):
     if is_one_of(x, params) and self._do_layer_adaptation(x):
         dx = new_x - x
         lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10)
         x_norm = tf.norm(x)
         g_norm = tf.norm(dx / lr_t)
         ratio = K.switch(
             x_norm > 0.0,
             K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0),
             1.0)
         new_x = x + dx * ratio
     return old_update(x, new_x)
Example #14
 def new_update(x, new_x):
     if x is var and self._do_layer_adaptation(x):
         dx = new_x - x
         lr_t = self._decayed_lr(x.dtype.base_dtype)
         lr_t = K.clip(lr_t, K.epsilon(), 1e10)
         x_norm = tf.norm(x)
         g_norm = tf.norm(dx / lr_t)
         ratio = K.switch(
             x_norm > 0.0,
             K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.0),
             1.0)
         new_x = x + dx * ratio
     return old_update(x, new_x)
Example #15
        def get_updates(self, loss, params):
            updates = super(NewOptimizer, self).get_updates(loss, params)
            self.model_weights = params
            self.ema_weights = [K.zeros(K.shape(w)) for w in params]
            self.old_weights = K.batch_get_value(params)

            ema_updates, ema_momentum = [], self.ema_momentum
            with tf.control_dependencies(updates):
                for w1, w2 in zip(self.ema_weights, params):
                    new_w = ema_momentum * w1 + (1 - ema_momentum) * w2
                    ema_updates.append(K.update(w1, new_w))

            return ema_updates
Example #16
        def _resource_apply(self, grad, var, indices=None):
            op = super(NewOptimizer, self)._resource_apply(grad, var, indices)

            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_var = self.get_slot(var, 'slow_var')
            slow_var_t = slow_var + alpha * (var - slow_var)

            with tf.control_dependencies([op]):
                slow_update = K.update(slow_var,
                                       K.switch(cond, slow_var_t, slow_var))
                with tf.control_dependencies([slow_update]):
                    copy_update = K.update(var, K.switch(cond, slow_var, var))

            return copy_update
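This is the Lookahead pattern: every k steps the slow copy moves a fraction alpha toward the fast weights, and the fast weights are then reset to the slow copy. A minimal NumPy sketch (names are illustrative):

import numpy as np

def lookahead_step(fast, slow, step, k=5, alpha=0.5):
    if step % k == 0:
        slow = slow + alpha * (fast - slow)  # slow_var_t above
        fast = slow.copy()                   # the copy_update above
    return fast, slow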
Example #17
 def _resource_apply_dense(self, grad, var):
     op = super(NewOptimizer, self)._resource_apply_dense(grad, var)
     ema = self.get_slot(var, 'ema')
     ema_momentum = self.ema_momentum
     with tf.control_dependencies([op]):
         return K.update(
             ema, ema * ema_momentum + var * (1.0 - ema_momentum))
Example #18
File: layers.py Project: ForestLee/nn_qa
    def call(self, inputs):
        """如果是条件Layer Norm,则默认以list为输入,第二个是condition
        """
        if self.conditional:
            inputs, cond = inputs
            if self.hidden_units is not None:
                cond = self.hidden_dense(cond)
            for _ in range(K.ndim(inputs) - K.ndim(cond)):
                cond = K.expand_dims(cond, 1)
            if self.center:
                beta = self.beta_dense(cond) + self.beta
            if self.scale:
                gamma = self.gamma_dense(cond) + self.gamma
        else:
            if self.center:
                beta = self.beta
            if self.scale:
                gamma = self.gamma

        outputs = inputs
        if self.center:
            mean = K.mean(outputs, axis=-1, keepdims=True)
            outputs = outputs - mean
        if self.scale:
            variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
            std = K.sqrt(variance + self.epsilon)
            outputs = outputs / std
            outputs = outputs * gamma
        if self.center:
            outputs = outputs + beta

        return outputs
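Conditional Layer Normalization lets an external vector modulate the gain and bias of an otherwise standard layer norm. The core arithmetic in NumPy, with the two dense projections replaced by fixed matrices for illustration (they are commonly zero-initialized so training starts from a plain layer norm):

import numpy as np

x = np.random.randn(2, 8)       # inputs (batch, dim)
cond = np.random.randn(2, 4)    # condition (batch, cond_dim)
Wb = np.zeros((4, 8))           # kernel of beta_dense
Wg = np.zeros((4, 8))           # kernel of gamma_dense
beta = cond @ Wb + 0.0          # beta_dense(cond) + self.beta
gamma = cond @ Wg + 1.0         # gamma_dense(cond) + self.gamma

mean = x.mean(-1, keepdims=True)
std = np.sqrt(((x - mean) ** 2).mean(-1, keepdims=True) + 1e-12)
out = (x - mean) / std * gamma + beta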
Example #19
        def get_updates(self, loss, params):
            # Update condition
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Get gradients
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_%s' % i) for i, p in enumerate(params)
            ]

            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(NewOptimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies(updates):
                accum_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for g, ag in zip(grads, self.accum_grads)
                ]

            return accum_updates
Example #20
 def _create_slots(self, var_list):
     for var in var_list:
         if self.beta1 > 0.0:
             self.add_slot(var, 'm')
         shape = K.int_shape(var)
         factored_shape = self.factored_shape(shape)
         if factored_shape is None:
             self.add_slot(var, 'v')
         else:
             shape1, axis1, shape2, axis2 = factored_shape
             value1, value2 = np.zeros(shape1), np.zeros(shape2)
             self.add_slot(var, 'vr', value1)
             self.add_slot(var, 'vc', value2)
Example #21
 def __init__(self,
              learning_rate=0.001,
              beta_1=0.9,
              beta_2=0.999,
              epsilon=1e-6,
              bias_correction=True,
              **kwargs):
     kwargs['name'] = kwargs.get('name') or 'Adam'
     super(Adam, self).__init__(**kwargs)
     self._set_hyper('learning_rate', learning_rate)
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self.epsilon = epsilon or K.epsilon()
     self.bias_correction = bias_correction
Example #22
File: layers.py Project: ForestLee/nn_qa
 def compute_position_ids(self, inputs):
     q, v = inputs
     # Compute relative position offsets
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # Post-processing
     max_position = (self.input_dim - 1) // 2
     pos_ids = K.clip(pos_ids, -max_position, max_position)
     pos_ids = pos_ids + max_position
     return pos_ids
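The result is a (query_len, value_len) grid of relative offsets, clipped to ±max_position and shifted to be non-negative so they can index an embedding table. A quick NumPy check with illustrative sizes:

import numpy as np

q_len, v_len, input_dim = 3, 5, 5      # input_dim = 5 -> max_position = 2
max_position = (input_dim - 1) // 2
pos_ids = np.arange(v_len)[None, :] - np.arange(q_len)[:, None]
pos_ids = np.clip(pos_ids, -max_position, max_position) + max_position
print(pos_ids)
# [[2 3 4 4 4]
#  [1 2 3 4 4]
#  [0 1 2 3 4]]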
Example #23
 @property
 def learning_rate(self):
     if self._learning_rate is None:
         iterations = K.cast(self.iterations + 1, K.floatx())
         learning_rate = K.minimum(1.0 / K.sqrt(iterations), 0.01)
         if self.multiply_by_parameter_scale:
             return learning_rate
         else:
             return learning_rate * 0.05
     else:
         # Cache under an unmangled attribute name: a '__'-prefixed name
         # would be name-mangled by Python, defeating the hasattr check.
         if not hasattr(self, '_learning_rate_cache'):
             with K.name_scope(self.__class__.__name__):
                 self._learning_rate_cache = K.variable(self._learning_rate,
                                                        name='learning_rate')
         return self._learning_rate_cache
Example #24
        def get_updates(self, loss, params):
            updates = super(NewOptimizer, self).get_updates(loss, params)

            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_vars = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='slow_var_%s' % i) for i, p in enumerate(params)
            ]

            with tf.control_dependencies(updates):
                slow_updates = [
                    K.update(q, K.switch(cond, q + alpha * (p - q), q))
                    for p, q in zip(params, slow_vars)
                ]
                with tf.control_dependencies(slow_updates):
                    copy_updates = [
                        K.update(p, K.switch(cond, q, p))
                        for p, q in zip(params, slow_vars)
                    ]

            return copy_updates
Example #25
File: layers.py Project: ForestLee/nn_qa
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     # 导出mask并转换数据类型
     mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
     mask = K.cast(mask, K.floatx())
     # Compute the target score
     y_true, y_pred = y_true * mask, y_pred * mask
     target_score = self.target_score(y_true, y_pred)
     # Recursively compute log Z
     init_states = [y_pred[:, 0]]
     y_pred = K.concatenate([y_pred, mask], axis=2)
     input_length = K.int_shape(y_pred[:, 1:])[1]
     log_norm, _, _ = K.rnn(self.log_norm_step,
                            y_pred[:, 1:],
                            init_states,
                            input_length=input_length)  # log Z vector at the last step
     log_norm = tf.reduce_logsumexp(log_norm, 1)  # logsumexp down to a scalar
     # Loss: -log p = log Z - target score
     return log_norm - target_score
Example #26
File: layers.py Project: ForestLee/nn_qa
    def call(self, inputs):
        """如果custom_position_ids,那么第二个输入为自定义的位置id
        """
        if self.custom_position_ids:
            inputs, position_ids = inputs
            if K.dtype(position_ids) != 'int32':
                position_ids = K.cast(position_ids, 'int32')
            pos_embeddings = K.gather(self.embeddings, position_ids)
        else:
            input_shape = K.shape(inputs)
            batch_size, seq_len = input_shape[0], input_shape[1]
            pos_embeddings = self.embeddings[:seq_len]
            pos_embeddings = K.expand_dims(pos_embeddings, 0)
            if self.merge_mode != 'add':
                pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])

        if self.merge_mode == 'add':
            return inputs + pos_embeddings
        else:
            return K.concatenate([inputs, pos_embeddings])
Example #27
 def reset_old_weights(self):
     """恢复模型到旧权重。
     """
     K.batch_set_value(zip(self.model_weights, self.old_weights))
Example #28
 def apply_ema_weights(self):
     """备份原模型权重,然后将平均权重应用到模型上去。
     """
     self.old_weights = K.batch_get_value(self.model_weights)
     ema_weights = K.batch_get_value(self.ema_weights)
     K.batch_set_value(zip(self.model_weights, ema_weights))
Example #29
 def new_update(x, new_x):
     if is_one_of(x, params) and self._do_lazy_optimization(x):
         g = self.grads[x]
         r = K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True)
         new_x = x + (new_x - x) * K.cast(r, K.floatx())
     return old_update(x, new_x)
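The row mask r marks rows that actually received a nonzero gradient; all other rows keep their old values instead of being dragged along by stale momentum (the "lazy" part of lazy optimization). The masking in NumPy:

import numpy as np

x = np.ones((3, 2))                                # current values
new_x = np.zeros((3, 2))                           # what the optimizer proposes
g = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])
r = np.any(g != 0.0, axis=-1, keepdims=True)       # rows with real gradient
print(x + (new_x - x) * r.astype(float))           # row 0 unchanged, rows 1-2 updated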
Example #30
 def new_update(x, new_x):
     new_x = K.switch(cond, new_x, x)
     return old_update(x, new_x)