def encode(
    self, first_text, second_text=None, maxlen=None, pattern='S*E*E'
):
    """Return the token ids and segment ids for the given text(s).
    """
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        if pattern == 'S*E*E':
            idx = int(bool(self._token_start))
            second_tokens = self.tokenize(second_text)[idx:]
        elif pattern == 'S*ES*E':
            second_tokens = self.tokenize(second_text)
    else:
        second_tokens = second_text

    if maxlen is not None:
        self.truncate_sequence(maxlen, first_tokens, second_tokens, -2)

    first_token_ids = self.tokens_to_ids(first_tokens)
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        second_token_ids = self.tokens_to_ids(second_tokens)
        second_segment_ids = [1] * len(second_token_ids)
        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
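# Hedged usage sketch for the encoder above; it assumes a Tokenizer instance
# built from a BERT-style vocab file (path and texts are illustrative).
# 'S*E*E' packs the pair as [CLS] A [SEP] B [SEP], while 'S*ES*E' keeps the
# second sentence's own start token: [CLS] A [SEP] [CLS] B [SEP].
tokenizer = Tokenizer('vocab.txt', do_lower_case=True)
token_ids, segment_ids = tokenizer.encode(
    u'first sentence', u'second sentence', maxlen=32, pattern='S*E*E'
)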
def gen_caption(image, topk=2, caption_maxlen=64):
    """Beam search decoding: keep only the topk best candidates at each step;
    with topk=1 this reduces to greedy search.
    """
    if is_string(image):
        image = read_image(image)
    image = np.array([image for _ in range(topk)])
    image = preprocess_input(image)
    target_ids = [[tokenizer._token_cls_id] for _ in range(topk)]  # candidate ids
    target_scores = [0] * topk  # candidate scores
    for i in range(caption_maxlen):  # cap the output at caption_maxlen tokens
        _target_ids = target_ids
        _segment_ids = [[0] * len(t) for t in target_ids]
        # Drop [PAD], [UNK], [CLS] (ids 0, 1, 2) from the prediction
        _probas = model.predict([_target_ids, _segment_ids, image])[:, -1, 3:]
        _log_probas = np.log(_probas + 1e-6)  # log scores are easier to accumulate
        _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # topk per candidate
        _candidate_ids, _candidate_scores = [], []
        for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
            # On the first step all topk inputs are identical, so only the
            # first candidate needs to be expanded.
            if i == 0 and j > 0:
                continue
            for k in _topk_arg[j]:
                _candidate_ids.append(ids + [k + 3])
                _candidate_scores.append(sco + _log_probas[j][k])
        _topk_arg = np.argsort(_candidate_scores)[-topk:]  # select the new topk
        target_ids = [_candidate_ids[k] for k in _topk_arg]
        target_scores = [_candidate_scores[k] for k in _topk_arg]
        best_one = np.argmax(target_scores)
        if target_ids[best_one][-1] == 3:  # id 3 marks the end token
            return tokenizer.decode(target_ids[best_one])
    # No end token within caption_maxlen steps: return the best candidate as-is
    return tokenizer.decode(target_ids[np.argmax(target_scores)])
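# Toy numpy illustration (hedged, values made up) of one beam-search expansion
# step as implemented above: two beams over a 4-token vocabulary, keeping the
# best 2 of the 2*4 continuations.
import numpy as np
log_probas = np.log(np.array([[0.1, 0.2, 0.3, 0.4],
                              [0.25, 0.25, 0.4, 0.1]]))
beams, scores = [[1], [2]], [-0.5, -0.7]
cand_ids, cand_scores = [], []
for j, (ids, sco) in enumerate(zip(beams, scores)):
    for k in log_probas[j].argsort()[-2:]:        # topk=2 per beam
        cand_ids.append(ids + [int(k)])
        cand_scores.append(sco + log_probas[j][k])
best = np.argsort(cand_scores)[-2:]               # keep the 2 best overall
beams = [cand_ids[i] for i in best]
scores = [cand_scores[i] for i in best]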
def generate(self, text, image, topk=1):
    if is_string(image):
        image = read_image(os.path.join('../auto_annot', image))
    # maxlen is the global sequence-length limit; reserve self.maxlen
    # positions for the generated output.
    max_c_len = maxlen - self.maxlen
    token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
    output_ids = self.beam_search([token_ids, segment_ids, image], topk)
    return tokenizer.decode(output_ids)
def extend_with_gradient_accumulation(base_optimizer, name=None):
    """Return a new optimizer class with gradient accumulation added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with gradient accumulation.
        """
        def __init__(self, grad_accum_steps, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.grad_accum_steps = grad_accum_steps
            self._first_get_gradients = True

        def get_gradients(self, loss, params):
            if self._first_get_gradients:
                self._first_get_gradients = False
                return super(new_optimizer, self).get_gradients(loss, params)
            else:
                return [ag / self.grad_accum_steps for ag in self.accum_grads]

        @K.symbolic
        def get_updates(self, loss, params):
            # Update condition: apply a step every grad_accum_steps iterations
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Gradients
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(K.int_shape(p), dtype=K.dtype(p), name='accum_grad_%s' % i)
                for i, p in enumerate(params)
            ]

            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies(updates):
                accum_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for g, ag in zip(grads, self.accum_grads)
                ]

            return accum_updates

        def get_config(self):
            config = {'grad_accum_steps': self.grad_accum_steps}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
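# Hedged usage sketch: wrap Adam so weights change only every 8 steps, which
# simulates an 8x larger batch; 'AdamAcc' is an illustrative name and 'keras'
# is the module already imported alongside these factories.
AdamAcc = extend_with_gradient_accumulation(keras.optimizers.Adam, name='AdamAcc')
optimizer = AdamAcc(grad_accum_steps=8)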
def extend_with_piecewise_linear_lr_v2(base_optimizer, name=None):
    """Return a new optimizer class with a piecewise linear learning rate.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with a piecewise linear learning rate.

        The schedule is a dict such as {1000: 1, 2000: 0.1}, meaning the
        learning rate grows linearly from zero to 100% over steps 0~1000,
        decays linearly to 10% over steps 1000~2000, and stays at 10%
        after step 2000.
        """
        def __init__(self, lr_schedule, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.lr_schedule = {int(i): j for i, j in lr_schedule.items()}

        def _decayed_lr(self, var_dtype):
            lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)
            lr_t = super(new_optimizer, self)._decayed_lr(var_dtype)
            return lr_t * K.cast(lr_multiplier, var_dtype)

        def get_config(self):
            config = {'lr_schedule': self.lr_schedule}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
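# Hedged usage sketch: a warmup-then-decay schedule. The v2 variant hooks
# _decayed_lr, so the base class must be a tf.keras (OptimizerV2) optimizer;
# the step counts and rates below are illustrative.
AdamLR = extend_with_piecewise_linear_lr_v2(keras.optimizers.Adam, name='AdamLR')
optimizer = AdamLR(lr_schedule={1000: 1.0, 10000: 0.1}, learning_rate=2e-5)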
def __init__(self, token_dict, do_lower_case=False, pre_tokenize=None, **kwargs):
    """pre_tokenize is an externally supplied word-segmentation function used
    to pre-tokenize the text. If pre_tokenize is given, pre_tokenize(text)
    runs first and the original tokenize function is then applied on top of
    its output.
    """
    super(Tokenizer, self).__init__(**kwargs)
    if is_string(token_dict):
        token_dict = load_vocab(token_dict)

    self._do_lower_case = do_lower_case
    self._pre_tokenize = pre_tokenize
    self._token_dict = token_dict
    self._token_dict_inv = {v: k for k, v in token_dict.items()}
    self._vocab_size = len(token_dict)

    for token in ['pad', 'unk', 'mask', 'start', 'end']:
        try:
            _token_id = token_dict[getattr(self, '_token_%s' % token)]
            setattr(self, '_token_%s_id' % token, _token_id)
        except (AttributeError, KeyError):
            pass
def encode(
    self,
    first_text,
    second_text=None,
    max_length=None,
    first_length=None,
    second_length=None
):
    """Return the token ids and segment ids for the given text(s).

    If first_length is given, the first sentence is force-padded to that
    length; likewise, second_length force-pads the second sentence.
    """
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        idx = int(bool(self._token_start))
        second_tokens = self.tokenize(second_text)[idx:]
    else:
        second_tokens = second_text

    if max_length is not None:
        self.truncate_sequence(max_length, first_tokens, second_tokens, -2)

    first_token_ids = self.tokens_to_ids(first_tokens)
    if first_length is not None:
        first_token_ids = first_token_ids[:first_length]
        first_token_ids.extend(
            [self._token_pad_id] * (first_length - len(first_token_ids))
        )
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        second_token_ids = self.tokens_to_ids(second_tokens)
        if second_length is not None:
            second_token_ids = second_token_ids[:second_length]
            second_token_ids.extend(
                [self._token_pad_id] * (second_length - len(second_token_ids))
            )
        second_segment_ids = [1] * len(second_token_ids)
        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
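# Hedged usage sketch, assuming a Tokenizer instance `tokenizer` as in the
# earlier sketch: fixed-length segments allow batching pair inputs without a
# separate padding pass (texts and lengths illustrative).
token_ids, segment_ids = tokenizer.encode(
    u'a question', u'a passage', first_length=16, second_length=48
)
assert len(token_ids) == len(segment_ids) == 16 + 48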
def extend_with_layer_adaptation(base_optimizer, name=None):
    """Return a new optimizer class with layer-wise adaptive learning rates.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with layer-wise adaptive learning rates: the norm of each
        layer's parameters is used to correct that parameter's learning rate.
        https://arxiv.org/abs/1904.00962
        """
        def __init__(self, exclude_from_layer_adaptation=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation or []
            if not hasattr(self, 'learning_rate'):
                self.learning_rate = self.lr

        @K.symbolic
        def get_updates(self, loss, params):
            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params) and self._do_layer_adaptation(x):
                    dx = new_x - x
                    lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10)
                    x_norm = tf.norm(x)
                    g_norm = tf.norm(dx / lr_t)
                    ratio = K.switch(
                        x_norm > 0.,
                        K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.),
                        1.
                    )
                    new_x = x + dx * ratio
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def _do_layer_adaptation(self, w):
            return (not string_matching(w.name, self.exclude_from_layer_adaptation))

        def get_config(self):
            config = {
                'exclude_from_layer_adaptation': self.exclude_from_layer_adaptation
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
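# Hedged usage sketch: layer adaptation on plain Adam; excluding LayerNorm and
# bias parameters follows the convention of the LAMB paper. The name patterns
# are illustrative and must match the actual weight names in the model.
AdamLA = extend_with_layer_adaptation(keras.optimizers.Adam, name='AdamLA')
optimizer = AdamLA(exclude_from_layer_adaptation=['Norm', 'bias'])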
def encode(
    self,
    first_text,
    second_text=None,
    maxlen=None,
    pattern='S*E*E',
    truncate_from='right'
):
    """Return the token ids and segment ids for the given text(s).
    """
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        second_tokens = self.tokenize(second_text)
    else:
        second_tokens = second_text

    if maxlen is not None:
        if truncate_from == 'right':
            index = -int(self._token_end is not None) - 1
        elif truncate_from == 'left':
            index = int(self._token_start is not None)
        else:
            index = truncate_from
        # Under 'S*E*E' the second text loses its start token below,
        # so allow one extra position before truncating.
        if second_text is not None and pattern == 'S*E*E':
            maxlen += 1
        truncate_sequences(maxlen, index, first_tokens, second_tokens)

    first_token_ids = self.tokens_to_ids(first_tokens)
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        if pattern == 'S*E*E':
            idx = int(bool(self._token_start))
            second_tokens = second_tokens[idx:]
        second_token_ids = self.tokens_to_ids(second_tokens)
        second_segment_ids = [1] * len(second_token_ids)
        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
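# Hedged usage sketch: truncate_from='left' drops tokens at the head instead
# of the tail, which suits inputs whose informative part is near the end.
token_ids, segment_ids = tokenizer.encode(
    u'a very long passage', u'a question', maxlen=48, truncate_from='left'
)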
def extend_with_lookahead(base_optimizer, name=None):
    """Return a new optimizer class with lookahead added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with lookahead.
        https://arxiv.org/abs/1907.08610

        steps_per_slow_update: k in the paper;
        slow_step_size: alpha in the paper.
        """
        def __init__(self,
                     steps_per_slow_update=5,
                     slow_step_size=0.5,
                     *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.steps_per_slow_update = steps_per_slow_update
            self.slow_step_size = slow_step_size

        @K.symbolic
        def get_updates(self, loss, params):
            updates = super(new_optimizer, self).get_updates(loss, params)
            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_vars = [
                K.zeros(K.int_shape(p), dtype=K.dtype(p), name='slow_var_%s' % i)
                for i, p in enumerate(params)
            ]

            with tf.control_dependencies(updates):
                slow_updates = [
                    K.update(q, K.switch(cond, q + alpha * (p - q), q))
                    for p, q in zip(params, slow_vars)
                ]
                with tf.control_dependencies(slow_updates):
                    copy_updates = [
                        K.update(p, K.switch(cond, q, p))
                        for p, q in zip(params, slow_vars)
                    ]

            return copy_updates

        def get_config(self):
            config = {
                'steps_per_slow_update': self.steps_per_slow_update,
                'slow_step_size': self.slow_step_size
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
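# Hedged usage sketch: lookahead with the paper's suggested defaults k=5,
# alpha=0.5 (https://arxiv.org/abs/1907.08610); names are illustrative.
AdamLookahead = extend_with_lookahead(keras.optimizers.Adam, name='AdamLookahead')
optimizer = AdamLookahead(steps_per_slow_update=5, slow_step_size=0.5)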
def extend_with_layer_adaptation_v2(base_optimizer, name=None):
    """Return a new optimizer class with layer-wise adaptive learning rates.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with layer-wise adaptive learning rates: the norm of each
        layer's parameters is used to correct that parameter's learning rate.
        https://arxiv.org/abs/1904.00962
        """
        def __init__(self, exclude_from_layer_adaptation=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation or []

        def _resource_apply_op(self, grad, var, indices=None):
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_layer_adaptation(x):
                    dx = new_x - x
                    lr_t = self._decayed_lr(x.dtype.base_dtype)
                    lr_t = K.clip(lr_t, K.epsilon(), 1e10)
                    x_norm = tf.norm(x)
                    g_norm = tf.norm(dx / lr_t)
                    ratio = K.switch(
                        x_norm > 0.,
                        K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.),
                        1.
                    )
                    new_x = x + dx * ratio
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_layer_adaptation(self, w):
            return (not string_matching(w.name, self.exclude_from_layer_adaptation))

        def get_config(self):
            config = {
                'exclude_from_layer_adaptation': self.exclude_from_layer_adaptation
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
def new_extend_with(BaseOptimizer, name=None):
    NewOptimizer = base_extend_with(BaseOptimizer)
    if is_string(name):
        NewOptimizer.__name__ = name
    name = NewOptimizer.__name__
    keras.utils.get_custom_objects()[name] = NewOptimizer
    return NewOptimizer
def extend_with_lazy_optimization(base_optimizer, name=None):
    """Return a new optimizer class with lazy updates added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with lazy updates, so that some weights (especially
        embeddings) are only updated where their gradient is non-zero.
        """
        def __init__(self, include_in_lazy_optimization=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.include_in_lazy_optimization = include_in_lazy_optimization or []
            self._first_get_gradients = True

        def get_gradients(self, loss, params):
            if self._first_get_gradients:
                self._first_get_gradients = False
                return super(new_optimizer, self).get_gradients(loss, params)
            else:
                return [self.grads[p] for p in params]

        @K.symbolic
        def get_updates(self, loss, params):
            self.grads = dict(zip(params, self.get_gradients(loss, params)))

            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params) and self._do_lazy_optimization(x):
                    g = self.grads[x]
                    r = K.any(K.not_equal(g, 0.), axis=-1, keepdims=True)
                    new_x = x + (new_x - x) * K.cast(r, K.floatx())
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def _do_lazy_optimization(self, w):
            return string_matching(w.name, self.include_in_lazy_optimization)

        def get_config(self):
            config = {
                'include_in_lazy_optimization': self.include_in_lazy_optimization
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
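# Hedged usage sketch: make only embedding weights lazy, which is where sparse
# gradients make dense updates wasteful. The name pattern is illustrative and
# must match the actual weight names in the model.
AdamLazy = extend_with_lazy_optimization(keras.optimizers.Adam, name='AdamLazy')
optimizer = AdamLazy(include_in_lazy_optimization=['Embedding'])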
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
    """Multi-head attention.

    q_mask: mask for the query sequence; mainly zeros out the padding
        part of the output.
    v_mask: mask for the value sequence; mainly keeps attention from
        reading padding information.
    a_mask: mask for the attention matrix; different attention masks
        correspond to different applications.
    """
    q, k, v = inputs[:3]
    # Resolve masks
    idx = 3
    if q_mask:
        q_mask = inputs[idx]
        idx += 1
    else:
        q_mask = None
    if v_mask:
        v_mask = inputs[idx]
        idx += 1
    else:
        v_mask = None
    if a_mask:
        if len(inputs) > idx:
            a_mask = inputs[idx]
        else:
            a_mask = 'history_only'
    else:
        a_mask = None
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask) and a_mask == 'history_only':
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Output
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    o = sequence_masking(o, q_mask, 0)
    return o
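# Shape sanity check for the two einsums above (hedged, pure numpy, sizes
# illustrative): batch=2, q_len=5, k_len=7, heads=8, key_size=head_size=16.
import numpy as np
qw = np.zeros((2, 5, 8, 16))
kw = np.zeros((2, 7, 8, 16))
vw = np.zeros((2, 7, 8, 16))
a = np.einsum('bjhd,bkhd->bhjk', qw, kw)   # (2, 8, 5, 7): per-head score matrix
o = np.einsum('bhjk,bkhd->bjhd', a, vw)    # (2, 5, 8, 16): per-head weighted values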
def extend_with_gradient_accumulation_v2(base_optimizer, name=None):
    """Return a new optimizer class with gradient accumulation added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with gradient accumulation.
        """
        def __init__(self, grad_accum_steps, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.grad_accum_steps = grad_accum_steps

        def _create_slots(self, var_list):
            super(new_optimizer, self)._create_slots(var_list)
            for var in var_list:
                self.add_slot(var, 'ag')

        def _resource_apply_op(self, grad, var, indices=None):
            # Update condition: apply a step every grad_accum_steps iterations
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Accumulated gradient
            ag = self.get_slot(var, 'ag')

            old_update = K.update

            def new_update(x, new_x):
                new_x = K.switch(cond, new_x, x)
                return old_update(x, new_x)

            K.update = new_update
            ag_t = ag / self.grad_accum_steps
            op = super(new_optimizer, self)._resource_apply_op(ag_t, var)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies([op]):
                ag_t = K.switch(cond, K.zeros_like(ag), ag)
                with tf.control_dependencies([K.update(ag, ag_t)]):
                    if indices is None:
                        ag_t = K.update(ag, ag + grad)
                    else:
                        ag_t = self._resource_scatter_add(ag, indices, grad)

            return ag_t

        def get_config(self):
            config = {'grad_accum_steps': self.grad_accum_steps}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
def extend_with_lookahead_v2(base_optimizer, name=None):
    """Return a new optimizer class with lookahead added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with lookahead.
        https://arxiv.org/abs/1907.08610

        steps_per_slow_update: k in the paper;
        slow_step_size: alpha in the paper.
        """
        def __init__(self,
                     steps_per_slow_update=5,
                     slow_step_size=0.5,
                     *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.steps_per_slow_update = steps_per_slow_update
            self.slow_step_size = slow_step_size

        def _create_slots(self, var_list):
            super(new_optimizer, self)._create_slots(var_list)
            for var in var_list:
                self.add_slot(var, 'slow_var')

        def _resource_apply_op(self, grad, var, indices=None):
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_var = self.get_slot(var, 'slow_var')
            slow_var_t = slow_var + alpha * (var - slow_var)

            with tf.control_dependencies([op]):
                slow_update = K.update(
                    slow_var, K.switch(cond, slow_var_t, slow_var)
                )
                with tf.control_dependencies([slow_update]):
                    copy_update = K.update(var, K.switch(cond, slow_var, var))

            return copy_update

        def get_config(self):
            config = {
                'steps_per_slow_update': self.steps_per_slow_update,
                'slow_step_size': self.slow_step_size
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
def encode(self, first_text, second_text=None, maxlen=None, max_length=None):
    """Return the token ids and segment ids for the given text(s).
    """
    # Backward compatibility
    if maxlen is None and max_length is not None:
        print(
            'From tokenizers.py: The argument max_length is deprecated. Please use maxlen instead.'
        )
        maxlen = maxlen or max_length

    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text

    if second_text is None:
        second_tokens = None
    elif is_string(second_text):
        idx = int(bool(self._token_start))
        second_tokens = self.tokenize(second_text)[idx:]
    else:
        second_tokens = second_text

    if maxlen is not None:
        self.truncate_sequence(maxlen, first_tokens, second_tokens, -2)

    first_token_ids = self.tokens_to_ids(first_tokens)
    first_segment_ids = [0] * len(first_token_ids)

    if second_text is not None:
        second_token_ids = self.tokens_to_ids(second_tokens)
        second_segment_ids = [1] * len(second_token_ids)
        first_token_ids.extend(second_token_ids)
        first_segment_ids.extend(second_segment_ids)

    return first_token_ids, first_segment_ids
def extend_with_lazy_optimization_v2(base_optimizer, name=None):
    """Return a new optimizer class with lazy updates added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with lazy updates, so that some weights (especially
        embeddings) are only updated where their gradient is non-zero.
        """
        def __init__(self, include_in_lazy_optimization=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.include_in_lazy_optimization = include_in_lazy_optimization or []

        def _resource_apply_op(self, grad, var, indices=None):
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_lazy_optimization(x):
                    if indices is None:
                        r = K.any(K.not_equal(grad, 0.), axis=-1, keepdims=True)
                        new_x = x + (new_x - x) * K.cast(r, K.floatx())
                        return old_update(x, new_x)
                    else:
                        return self._resource_scatter_add(
                            x, indices, K.gather(new_x - x, indices)
                        )
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_lazy_optimization(self, w):
            return string_matching(w.name, self.include_in_lazy_optimization)

        def get_config(self):
            config = {
                'include_in_lazy_optimization': self.include_in_lazy_optimization
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
def extend_with_weight_decay(base_optimizer, name=None):
    """Return a new optimizer class with weight decay added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with weight decay.
        """
        def __init__(self,
                     weight_decay_rate,
                     exclude_from_weight_decay=None,
                     *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.weight_decay_rate = weight_decay_rate
            self.exclude_from_weight_decay = exclude_from_weight_decay or []
            if not hasattr(self, 'learning_rate'):
                self.learning_rate = self.lr

        @K.symbolic
        def get_updates(self, loss, params):
            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params) and self._do_weight_decay(x):
                    new_x = new_x - self.learning_rate * self.weight_decay_rate * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def _do_weight_decay(self, w):
            return (not string_matching(w.name, self.exclude_from_weight_decay))

        def get_config(self):
            config = {
                'weight_decay_rate': self.weight_decay_rate,
                'exclude_from_weight_decay': self.exclude_from_weight_decay
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
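# Hedged usage sketch: decoupled weight decay on Adam (AdamW-style), skipping
# LayerNorm and bias parameters as is conventional for BERT fine-tuning.
# Stacking extend_with_layer_adaptation on top of this class would give a
# LAMB-style optimizer.
AdamW = extend_with_weight_decay(keras.optimizers.Adam, name='AdamW')
optimizer = AdamW(weight_decay_rate=0.01, exclude_from_weight_decay=['Norm', 'bias'])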
def extend_with_weight_decay_v2(base_optimizer, name=None):
    """Return a new optimizer class with weight decay added.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with weight decay.
        """
        def __init__(self,
                     weight_decay_rate,
                     exclude_from_weight_decay=None,
                     *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.weight_decay_rate = weight_decay_rate
            self.exclude_from_weight_decay = exclude_from_weight_decay or []

        def _resource_apply_op(self, grad, var, indices=None):
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_weight_decay(x):
                    lr_t = self._decayed_lr(x.dtype.base_dtype)
                    new_x = new_x - lr_t * self.weight_decay_rate * x
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer, self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_weight_decay(self, w):
            return (not string_matching(w.name, self.exclude_from_weight_decay))

        def get_config(self):
            config = {
                'weight_decay_rate': self.weight_decay_rate,
                'exclude_from_weight_decay': self.exclude_from_weight_decay
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
def __init__(self, token_dict, do_lower_case=False, **kwargs):
    super(Tokenizer, self).__init__(**kwargs)
    if is_string(token_dict):
        token_dict = load_vocab(token_dict)

    self._do_lower_case = do_lower_case
    self._token_dict = token_dict
    self._token_dict_inv = {v: k for k, v in token_dict.items()}
    self._vocab_size = len(token_dict)

    for token in ['pad', 'unk', 'mask', 'start', 'end']:
        try:
            _token_id = token_dict[getattr(self, '_token_%s' % token)]
            setattr(self, '_token_%s_id' % token, _token_id)
        except (AttributeError, KeyError):
            pass
def __init__(self, token_dict, do_lower_case=False):
    """Initialization.
    """
    super(Tokenizer, self).__init__(do_lower_case)
    if is_string(token_dict):
        token_dict = load_vocab(token_dict)

    self._token_dict = token_dict
    self._token_dict_inv = {v: k for k, v in token_dict.items()}
    for token in ['pad', 'cls', 'sep', 'unk', 'mask']:
        try:
            _token_id = token_dict[getattr(self, '_token_%s' % token)]
            setattr(self, '_token_%s_id' % token, _token_id)
        except (AttributeError, KeyError):
            pass
    self._vocab_size = len(token_dict)
def extend_with_piecewise_linear_lr(base_optimizer, name=None):
    """Return a new optimizer class with a piecewise linear learning rate.
    """
    class new_optimizer(base_optimizer):
        """Optimizer with a piecewise linear learning rate.

        The schedule is a dict such as {1000: 1, 2000: 0.1}, meaning the
        learning rate grows linearly from zero to 100% over steps 0~1000,
        decays linearly to 10% over steps 1000~2000, and stays at 10%
        after step 2000.
        """
        def __init__(self, lr_schedule, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.lr_schedule = {int(i): j for i, j in lr_schedule.items()}

        @K.symbolic
        def get_updates(self, loss, params):
            lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)

            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params):
                    new_x = x + (new_x - x) * lr_multiplier
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def get_config(self):
            config = {'lr_schedule': self.lr_schedule}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
    """Multi-head attention.

    q_mask: mask for the query sequence; mainly zeros out the padding
        part of the output.
    v_mask: mask for the value sequence; mainly keeps attention from
        reading padding information.
    a_mask: mask for the attention matrix; different attention masks
        correspond to different applications.
    """
    # Resolve masks
    inputs = inputs[:]
    for i, mask in enumerate([q_mask, v_mask, a_mask]):
        if not mask:
            inputs.insert(3 + i, None)
    q, k, v, q_mask, v_mask = inputs[:5]
    if len(inputs) == 5:
        a_mask = 'history_only'
    elif len(inputs) == 6:
        a_mask = inputs[-1]
    else:
        raise ValueError('wrong inputs for MultiHeadAttention.')
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encoding
    if self.max_relative_position is not None:
        q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
        q_idxs = K.expand_dims(q_idxs, 1)
        v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
        v_idxs = K.expand_dims(v_idxs, 0)
        pos_ids = v_idxs - q_idxs
        pos_ids = K.clip(pos_ids, -self.max_relative_position,
                         self.max_relative_position)
        pos_ids = pos_ids + self.max_relative_position
        pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    # Attention (continued)
    a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Output
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if self.max_relative_position is not None:
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    o = sequence_masking(o, q_mask, 0)
    return o
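# Hedged numpy sketch of the clipped relative-position ids computed above, for
# q_len=v_len=4 and max_relative_position=2 (illustrative sizes):
import numpy as np
q_idxs = np.arange(4)[:, None]
v_idxs = np.arange(4)[None, :]
pos_ids = np.clip(v_idxs - q_idxs, -2, 2) + 2   # entries in [0, 4], shape (4, 4)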
def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
    """Multi-head attention.

    q_mask: mask for the query sequence; mainly zeros out the padding
        part of the output.
    v_mask: mask for the value sequence; mainly keeps attention from
        reading padding information.
    a_mask: mask for the attention matrix; different attention masks
        correspond to different applications.
    """
    q, k, v = inputs[:3]
    if a_mask:
        if len(inputs) == 3:
            a_mask = 'history_only'
        else:
            a_mask = inputs[3]
    if q_mask is not None:
        if not hasattr(self, 'q_mask_layer'):
            self.q_mask_layer = search_layer(q, q_mask)
        q_mask = self.q_mask_layer.output_mask
    if v_mask is not None:
        if not hasattr(self, 'v_mask_layer'):
            self.v_mask_layer = search_layer(v, v_mask)
        v_mask = self.v_mask_layer.output_mask
    # Pooling
    if self.pool_size > 1:
        is_self_attention = (q is k is v)
        q_in_len = K.shape(q)[1]
        q = sequence_masking(q, q_mask, 0)
        q = divisible_temporal_padding(q, self.pool_size)
        q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
        if is_self_attention:
            k = v = q
        else:
            k = sequence_masking(k, v_mask, 0)
            k = divisible_temporal_padding(k, self.pool_size)
            k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
            v = sequence_masking(v, v_mask, 0)
            v = divisible_temporal_padding(v, self.pool_size)
            v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
        if v_mask is not None:
            v_mask = v_mask[:, ::self.pool_size]
        if a_mask is not None and not is_string(a_mask):
            a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encoding
    if self.max_relative_position is not None:
        q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
        q_idxs = K.expand_dims(q_idxs, 1)
        v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
        v_idxs = K.expand_dims(v_idxs, 0)
        pos_ids = v_idxs - q_idxs
        pos_ids = K.clip(pos_ids, -self.max_relative_position,
                         self.max_relative_position)
        pos_ids = pos_ids + self.max_relative_position
        pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    # Attention (continued)
    a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Output
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if self.max_relative_position is not None:
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    # Restore the original length
    if self.pool_size > 1:
        o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
    # Return the result
    o = sequence_masking(o, q_mask, 0)
    return o
def generate(self, image, topk=2):
    if is_string(image):
        image = read_image(image)
    image = preprocess_input(image)
    output_ids = self.decode([image], topk)
    return tokenizer.decode(output_ids)
def encode_multi(self,
                 first_text: str,
                 second_texts: List = None,
                 limit_lengths: List = None,
                 length_max: int = None,
                 first_length: int = None,
                 second_length: int = None,
                 is_multi: bool = True):
    """BERT encoding for multi-text input: segment_ids mark whether there are
    two segments or more ([0, 1] alternating), and segments after the first
    may have their [CLS]/[SEP] tokens truncated.
    Args:
        first_text: Any, first input sentence for single-, pair- or multi-task,
            e.g. "macadam英文什么意思"
        second_texts: List, further input sentences,
            e.g. ["macadam?", "啥macadam?", "macadam什么意思"]
        limit_lengths: List, length limit for each of the second inputs,
            e.g. [36, 36, 128]
        length_max: int, max length of the whole sequence, e.g. 512
        first_length: int, max length of first_text, e.g. 128
        second_length: int, max total length of the second inputs, e.g. 128
        is_multi: bool, whether segments after the first alternate their segment id
    Returns:
        inputs for a BERT-like model
    """
    # Split into characters
    if is_string(first_text):
        first_tokens = self.tokenize(first_text)
    else:
        first_tokens = first_text
    # Tokens to ids; truncate if too long, pad if too short
    first_token_ids = self.tokens_to_ids(first_tokens)
    if first_length:
        first_token_ids = first_token_ids[:first_length]
        first_token_ids.extend(
            [self._token_pad_id] * (first_length - len(first_token_ids))
        )
    # Segment ids, e.g. [0,0,0,0,0,1,1,1,1], distinguish the sentences
    # (based on the padded ids so the two lists stay the same length)
    first_segment_ids = [0] * len(first_token_ids)
    # Sentences after the first one
    second_token_ids = []
    second_segment_ids = []
    # Guard against second_texts being empty
    if second_texts:
        len_texts = len(second_texts)
        for i in range(len_texts):
            text = second_texts[i]
            if not text:
                tokens = None
            elif is_string(text):
                idx = int(bool(self._token_start))
                tokens = self.tokenize(text)[idx:]
            else:
                tokens = text
            if tokens:
                # Tokens to ids
                token_ids = self.tokens_to_ids(tokens)
                if limit_lengths and limit_lengths[i]:
                    token_ids = token_ids[:limit_lengths[i]]
                    token_ids.extend(
                        [self._token_pad_id] * (limit_lengths[i] - len(token_ids))
                    )
                # Segment ids: the first sentence is 0; later sentences either
                # alternate (is_multi) or are all marked 1
                if is_multi:
                    id_sent = 1 if i % 2 == 0 else 0
                else:
                    id_sent = 1
                segment_ids = [id_sent] * len(token_ids)
                second_token_ids.extend(token_ids)
                second_segment_ids.extend(segment_ids)
    # Limit the total length of all sentences after the first
    if second_length:
        second_token_ids = second_token_ids[:second_length]
        second_segment_ids = second_segment_ids[:second_length]
    # Concatenate all sentences
    first_token_ids.extend(second_token_ids)
    first_segment_ids.extend(second_segment_ids)
    # Limit the overall length
    if length_max:
        first_token_ids = first_token_ids[:length_max]
        first_segment_ids = first_segment_ids[:length_max]
    return first_token_ids, first_segment_ids
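# Hedged usage sketch of encode_multi (values illustrative; the method is
# assumed to live on the same Tokenizer instance as in the earlier sketches):
token_ids, segment_ids = tokenizer.encode_multi(
    'macadam英文什么意思',
    second_texts=['macadam?', '啥macadam?', 'macadam什么意思'],
    limit_lengths=[8, 8, 16],
    length_max=64,
)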
def generate(self, image, topk=1):
    if is_string(image):
        image = read_image(image)
    image = preprocess_input(image)
    output_ids = self.beam_search([image], topk)  # beam-search decoding
    return tokenizer.decode(output_ids)
def call(self, inputs, mask=None, a_mask=None, p_bias=None):
    """Multi-head attention.

    q_mask: mask for the query sequence; mainly zeros out the padding
        part of the output.
    v_mask: mask for the value sequence; mainly keeps attention from
        reading padding information.
    a_mask: mask for the attention matrix; different attention masks
        correspond to different applications.
    p_bias: positional bias inside attention, usually used to select
        the kind of relative position encoding.
    """
    q, k, v = inputs[:3]
    q_mask, v_mask, n = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    if a_mask:
        if len(inputs) == 3:
            a_mask = 'history_only'
        else:
            a_mask = inputs[n]
            n += 1
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Positional bias
    if p_bias == 'typical_relative':
        pos_embeddings = inputs[n]
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    elif p_bias == 't5_relative':
        pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
        a = a + K.expand_dims(pos_embeddings, 0)
    # Attention (continued)
    if p_bias != 't5_relative':  # T5 does not scale the logits
        a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Output
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if p_bias == 'typical_relative':
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    # Return the result
    o = sequence_masking(o, q_mask, 0)
    return o