def extend_with_piecewise_linear_lr(base_optimizer, name=None):
    """Return a new optimizer class with a piecewise-linear learning-rate schedule added."""
    class new_optimizer(base_optimizer):
        """Optimizer with a piecewise-linear learning-rate schedule.

        `lr_schedule` is a dict such as {1000: 1, 2000: 0.1}, meaning the learning
        rate increases linearly from 0 to 100% during steps 0~1000, then decays
        linearly to 10% during steps 1000~2000, and stays at 10% afterwards.
        """
        def __init__(self, lr_schedule, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.lr_schedule = {int(i): j for i, j in lr_schedule.items()}

        def _decayed_lr(self, var_dtype):
            # Scale the base learning rate by the schedule multiplier at the current step.
            lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)
            lr_t = super(new_optimizer, self)._decayed_lr(var_dtype)
            return lr_t * K.cast(lr_multiplier, var_dtype)

        def get_config(self):
            config = {'lr_schedule': self.lr_schedule}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
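# Usage sketch (illustrative, not part of the module): the alias name 'AdamLR',
# the schedule values and the learning rate are assumptions; `Adam` stands for a
# base optimizer compatible with this module (one exposing `_decayed_lr` and
# `iterations`).
#
#     AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')
#     optimizer = AdamLR(lr_schedule={1000: 1, 2000: 0.1}, learning_rate=1e-3)
#     # warm up linearly for 1000 steps, then decay to 10% of the peak by step 2000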
def encode(self,
           first_text,
           second_text=None,
           max_length=None,
           first_length=None,
           second_length=None):
    """Return the token ids and segment ids of the input text(s).

    If first_length is given, the first sentence is forcibly padded to that length;
    likewise, if second_length is given, the second sentence is forcibly padded to
    that length.
    """
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        second_tokens = self.tokenize(second_text, add_cls=False)
    else:
        second_tokens = second_text

    if max_length is not None:
        self.truncate_sequence(max_length, first_tokens, second_tokens, -2)

    first_token_ids = self.tokens_to_ids(first_tokens)
    if first_length is not None:
        first_token_ids = first_token_ids[:first_length]
        first_token_ids.extend(
            [self._token_pad_id] * (first_length - len(first_token_ids))
        )
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        second_token_ids = self.tokens_to_ids(second_tokens)
        if second_length is not None:
            second_token_ids = second_token_ids[:second_length]
            second_token_ids.extend(
                [self._token_pad_id] * (second_length - len(second_token_ids))
            )
        second_segment_ids = [1] * len(second_token_ids)

        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
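# Usage sketch (illustrative): 'vocab.txt' and the input sentences are placeholders.
#
#     tokenizer = Tokenizer('vocab.txt', do_lower_case=True)
#     token_ids, segment_ids = tokenizer.encode('first sentence')          # single sentence
#     token_ids, segment_ids = tokenizer.encode('first sentence',          # sentence pair,
#                                               'second sentence',         # truncated to
#                                               max_length=32)             # 32 tokens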
def extend_with_layer_adaptation(base_optimizer, name=None):
    """Return a new optimizer class with layer-wise adaptive learning rates added."""
    class new_optimizer(base_optimizer):
        """Optimizer with layer-wise adaptive learning rates.

        The effective learning rate of each parameter is rescaled by the norm
        of that layer's weights (LAMB): https://arxiv.org/abs/1904.00962
        """
        def __init__(self, exclude_from_layer_adaptation=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation or []

        def _resource_apply_op(self, grad, var, indices=None):
            # Temporarily monkey-patch K.update so the parent class's update
            # step is intercepted and rescaled by the trust ratio.
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_layer_adaptation(x):
                    dx = new_x - x
                    lr_t = self._decayed_lr(x.dtype.base_dtype)
                    lr_t = K.clip(lr_t, K.epsilon(), 1e10)
                    x_norm = tf.norm(x)
                    g_norm = tf.norm(dx / lr_t)
                    ratio = K.switch(
                        x_norm > 0.,
                        K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.), 1.
                    )
                    new_x = x + dx * ratio
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_layer_adaptation(self, w):
            return (not string_matching(w.name, self.exclude_from_layer_adaptation))

        def get_config(self):
            config = {
                'exclude_from_layer_adaptation': self.exclude_from_layer_adaptation
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
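# Usage sketch (illustrative): the alias 'AdamLAMB' and the exclusion list are
# assumptions; `Adam` stands for a base optimizer in this module that implements
# `_resource_apply_op`. Biases and layer-norm weights are commonly excluded from
# layer adaptation.
#
#     AdamLAMB = extend_with_layer_adaptation(Adam, name='AdamLAMB')
#     optimizer = AdamLAMB(exclude_from_layer_adaptation=['Norm', 'bias'],
#                          learning_rate=1e-3)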
def extend_with_lookahead(base_optimizer, name=None):
    """Return a new optimizer class with Lookahead added."""
    class new_optimizer(base_optimizer):
        """Optimizer with Lookahead: https://arxiv.org/abs/1907.08610

        steps_per_slow_update: k in the paper;
        slow_step_size: alpha in the paper.
        """
        def __init__(self,
                     steps_per_slow_update=5,
                     slow_step_size=0.5,
                     *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.steps_per_slow_update = steps_per_slow_update
            self.slow_step_size = slow_step_size

        def _create_slots(self, var_list):
            super(new_optimizer, self)._create_slots(var_list)
            for var in var_list:
                self.add_slot(var, 'slow_var')

        def _resource_apply_op(self, grad, var, indices=None):
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)

            # Every k steps, move the slow weights towards the fast weights,
            # then copy the slow weights back into the fast weights.
            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_var = self.get_slot(var, 'slow_var')
            slow_var_t = slow_var + alpha * (var - slow_var)
            with tf.control_dependencies([op]):
                slow_update = K.update(slow_var,
                                       K.switch(cond, slow_var_t, slow_var))
                with tf.control_dependencies([slow_update]):
                    copy_update = K.update(var, K.switch(cond, slow_var, var))

            return copy_update

        def get_config(self):
            config = {
                'steps_per_slow_update': self.steps_per_slow_update,
                'slow_step_size': self.slow_step_size
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
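# Usage sketch (illustrative): the alias and learning rate are assumptions;
# k=5 and alpha=0.5 are the defaults suggested in the Lookahead paper.
#
#     AdamLA = extend_with_lookahead(Adam, name='AdamLA')
#     optimizer = AdamLA(steps_per_slow_update=5, slow_step_size=0.5,
#                        learning_rate=1e-3)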
def extend_with_gradient_accumulation(base_optimizer, name=None):
    """Return a new optimizer class with gradient accumulation added."""
    class new_optimizer(base_optimizer):
        """Optimizer with gradient accumulation."""
        def __init__(self, grad_accum_steps, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.grad_accum_steps = grad_accum_steps

        def _create_slots(self, var_list):
            super(new_optimizer, self)._create_slots(var_list)
            for var in var_list:
                self.add_slot(var, 'ag')

        def _resource_apply_op(self, grad, var, indices=None):
            # Whether this step performs a real parameter update
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Accumulated-gradient slot
            ag = self.get_slot(var, 'ag')

            old_update = K.update

            def new_update(x, new_x):
                new_x = K.switch(cond, new_x, x)
                return old_update(x, new_x)

            K.update = new_update
            ag_t = ag / self.grad_accum_steps
            op = super(new_optimizer, self)._resource_apply_op(ag_t, var)
            K.update = old_update

            # Accumulate the incoming gradient (resetting the slot on update steps)
            with tf.control_dependencies([op]):
                ag_t = K.switch(cond, K.zeros_like(ag), ag)
                with tf.control_dependencies([K.update(ag, ag_t)]):
                    if indices is None:
                        ag_t = K.update(ag, ag + grad)
                    else:
                        ag_t = self._resource_scatter_add(ag, indices, grad)

            return ag_t

        def get_config(self):
            config = {'grad_accum_steps': self.grad_accum_steps}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
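# Usage sketch (illustrative): the alias and hyper-parameters are assumptions.
# With grad_accum_steps=4, gradients are averaged over 4 mini-batches before each
# real parameter update, emulating a 4x larger batch size.
#
#     AdamGA = extend_with_gradient_accumulation(Adam, name='AdamGA')
#     optimizer = AdamGA(grad_accum_steps=4, learning_rate=1e-5)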
def extend_with_lazy_optimization(base_optimizer, name=None):
    """Return a new optimizer class with lazy updates added."""
    class new_optimizer(base_optimizer):
        """Optimizer with lazy updates.

        Selected weights (typically embeddings) are only updated at steps
        where their gradient is non-zero.
        """
        def __init__(self, include_in_lazy_optimization=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.include_in_lazy_optimization = include_in_lazy_optimization or []

        def _resource_apply_op(self, grad, var, indices=None):
            # Temporarily monkey-patch K.update so the parent class's update
            # is masked to the rows whose gradient is non-zero.
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_lazy_optimization(x):
                    if indices is None:
                        r = K.any(K.not_equal(grad, 0.), axis=-1, keepdims=True)
                        new_x = x + (new_x - x) * K.cast(r, K.floatx())
                        return old_update(x, new_x)
                    else:
                        return self._resource_scatter_add(
                            x, indices, K.gather(new_x - x, indices)
                        )
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_lazy_optimization(self, w):
            return string_matching(w.name, self.include_in_lazy_optimization)

        def get_config(self):
            config = {
                'include_in_lazy_optimization': self.include_in_lazy_optimization
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
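# Usage sketch (illustrative): the alias and the name pattern 'Embedding' are
# assumptions; only weights whose names match the given substrings (typically the
# embedding matrices) are updated lazily.
#
#     AdamLazy = extend_with_lazy_optimization(Adam, name='AdamLazy')
#     optimizer = AdamLazy(include_in_lazy_optimization=['Embedding'],
#                          learning_rate=1e-5)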
def extend_with_weight_decay(base_optimizer, name=None):
    """Return a new optimizer class with weight decay added."""
    class new_optimizer(base_optimizer):
        """Optimizer with weight decay."""
        def __init__(self,
                     weight_decay_rate,
                     exclude_from_weight_decay=None,
                     *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.weight_decay_rate = weight_decay_rate
            self.exclude_from_weight_decay = exclude_from_weight_decay or []

        def _resource_apply_op(self, grad, var, indices=None):
            # Temporarily monkey-patch K.update so the decay term is injected
            # into the parent class's update step.
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_weight_decay(x):
                    lr_t = self._decayed_lr(x.dtype.base_dtype)
                    new_x = new_x - lr_t * self.weight_decay_rate * x
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_weight_decay(self, w):
            return (not string_matching(w.name, self.exclude_from_weight_decay))

        def get_config(self):
            config = {
                'weight_decay_rate': self.weight_decay_rate,
                'exclude_from_weight_decay': self.exclude_from_weight_decay
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
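# Usage sketch (illustrative): the alias 'AdamW' and the hyper-parameters are
# assumptions; the exclusion list follows the common BERT convention of not
# decaying biases and layer-norm parameters.
#
#     AdamW = extend_with_weight_decay(Adam, name='AdamW')
#     optimizer = AdamW(weight_decay_rate=0.01,
#                       exclude_from_weight_decay=['Norm', 'bias'],
#                       learning_rate=1e-5)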
def __init__(self, token_dict, do_lower_case=False):
    """Initialize the tokenizer."""
    super(Tokenizer, self).__init__(do_lower_case)
    if is_string(token_dict):
        token_dict = load_vocab(token_dict)

    self._token_dict = token_dict
    self._token_dict_inv = {v: k for k, v in token_dict.items()}
    # Cache the ids of the special tokens that exist in the vocabulary.
    for token in ['pad', 'cls', 'sep', 'unk', 'mask']:
        try:
            _token_id = token_dict[getattr(self, '_token_%s' % token)]
            setattr(self, '_token_%s_id' % token, _token_id)
        except KeyError:
            pass
    self._vocab_size = len(token_dict)
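# Usage sketch (illustrative): `token_dict` may be a path to a BERT-style vocab
# file or an already-loaded {token: id} dict; 'vocab.txt' and the ids below are
# placeholders.
#
#     tokenizer = Tokenizer('vocab.txt', do_lower_case=True)
#     tokenizer = Tokenizer({'[PAD]': 0, '[UNK]': 1, '[CLS]': 2,
#                            '[SEP]': 3, '[MASK]': 4})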