Example #1
    def encode(
        self, first_text, second_text=None, maxlen=None, pattern='S*E*E'
    ):
        """输出文本对应token id和segment id
        """
        if is_string(first_text):
            first_tokens = self.tokenize(first_text)
        else:
            first_tokens = first_text

        if second_text is None:
            second_tokens = None
        elif is_string(second_text):
            if pattern == 'S*E*E':
                idx = int(bool(self._token_start))
                second_tokens = self.tokenize(second_text)[idx:]
            elif pattern == 'S*ES*E':
                second_tokens = self.tokenize(second_text)
        else:
            second_tokens = second_text

        if maxlen is not None:
            self.truncate_sequence(maxlen, first_tokens, second_tokens, -2)

        first_token_ids = self.tokens_to_ids(first_tokens)
        first_segment_ids = [0] * len(first_token_ids)

        if second_text is not None:
            second_token_ids = self.tokens_to_ids(second_tokens)
            second_segment_ids = [1] * len(second_token_ids)
            first_token_ids.extend(second_token_ids)
            first_segment_ids.extend(second_segment_ids)

        return first_token_ids, first_segment_ids
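A minimal usage sketch for the method above (the Tokenizer construction and the 'vocab.txt' path are assumptions, not part of this example; the start/end markers are assumed to be BERT-style [CLS]/[SEP]):

tokenizer = Tokenizer('vocab.txt', do_lower_case=True)  # hypothetical BERT-style vocab file
# The default pattern 'S*E*E' drops the second text's leading start token,
# yielding [CLS] A [SEP] B [SEP]; 'S*ES*E' keeps it: [CLS] A [SEP] [CLS] B [SEP].
token_ids, segment_ids = tokenizer.encode(u'语言模型', u'自然语言处理', maxlen=32)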
Example #2
def gen_caption(image, topk=2, caption_maxlen=64):
    """beam search解码
    每次只保留topk个最优候选结果;如果topk=1,那么就是贪心搜索
    """
    if is_string(image):
        image = read_image(image)
    image = np.array([image for _ in range(topk)])
    image = preprocess_input(image)
    target_ids = [[tokenizer._token_cls_id] for _ in range(topk)]  # candidate token ids
    target_scores = [0] * topk  # candidate scores
    for i in range(caption_maxlen):  # cap the output at caption_maxlen tokens
        _target_ids = target_ids
        _segment_ids = [[0] * len(t) for t in target_ids]
        _probas = model.predict([_target_ids, _segment_ids,
                                 image])[:, -1, 3:]  # skip [PAD], [UNK], [CLS] outright
        _log_probas = np.log(_probas + 1e-6)  # take logs so scores are additive
        _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # topk per candidate
        _candidate_ids, _candidate_scores = [], []
        for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
            # When predicting the first token, the topk inputs are in fact
            # identical, so only the first one needs to be examined.
            if i == 0 and j > 0:
                continue
            for k in _topk_arg[j]:
                _candidate_ids.append(ids + [k + 3])
                _candidate_scores.append(sco + _log_probas[j][k])
        _topk_arg = np.argsort(_candidate_scores)[-topk:]  # pick the new topk among the candidates
        target_ids = [_candidate_ids[k] for k in _topk_arg]
        target_scores = [_candidate_scores[k] for k in _topk_arg]
        best_one = np.argmax(target_scores)
        if target_ids[best_one][-1] == 3:
            return tokenizer.decode(target_ids[best_one])
    # If no end token appears within caption_maxlen steps, return the best candidate anyway
    return tokenizer.decode(target_ids[np.argmax(target_scores)])
Example #3
 def generate(self, text, image, topk=1):
   if is_string(image):
     image = read_image(os.path.join('../auto_annot', image))
   max_c_len = maxlen - self.maxlen
   token_ids, segment_ids = tokenizer.encode(text, maxlen=max_c_len)
   output_ids = self.beam_search([token_ids, segment_ids, image], topk)
   return tokenizer.decode(output_ids)
Example #4
def extend_with_gradient_accumulation(base_optimizer, name=None):
    """返回新的优化器类,加入梯度累积
    """
    class new_optimizer(base_optimizer):
        """带有梯度累积的优化器
        """
        def __init__(self, grad_accum_steps, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.grad_accum_steps = grad_accum_steps
            self._first_get_gradients = True

        def get_gradients(self, loss, params):
            if self._first_get_gradients:
                self._first_get_gradients = False
                return super(new_optimizer, self).get_gradients(loss, params)
            else:
                return [ag / self.grad_accum_steps for ag in self.accum_grads]

        @K.symbolic
        def get_updates(self, loss, params):
            # Update condition
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Fetch gradients
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_%s' % i) for i, p in enumerate(params)
            ]

            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies(updates):
                accum_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for g, ag in zip(grads, self.accum_grads)
                ]

            return accum_updates

        def get_config(self):
            config = {'grad_accum_steps': self.grad_accum_steps}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
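A hedged usage sketch for this factory (it targets the classic Keras Optimizer API; the base optimizer, import path, and hyperparameter values are placeholders):

from keras.optimizers import Adam  # assumption: classic Keras, not tf.keras OptimizerV2

AdamAcc = extend_with_gradient_accumulation(Adam, name='AdamAcc')
# Weights only move every grad_accum_steps iterations, so the effective batch
# size is the physical batch size times grad_accum_steps.
optimizer = AdamAcc(grad_accum_steps=4, lr=1e-5)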
Example #5
def extend_with_piecewise_linear_lr_v2(base_optimizer, name=None):
    """返回新的优化器类,加入分段线性学习率
    """
    class new_optimizer(base_optimizer):
        """带有分段线性学习率的优化器
        其中schedule是形如{1000: 1, 2000: 0.1}的字典,
        表示0~1000步内学习率线性地从零增加到100%,然后
        1000~2000步内线性地降到10%,2000步以后保持10%
        """
        def __init__(self, lr_schedule, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.lr_schedule = {int(i): j for i, j in lr_schedule.items()}

        def _decayed_lr(self, var_dtype):
            lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)
            lr_t = super(new_optimizer, self)._decayed_lr(var_dtype)
            return lr_t * K.cast(lr_multiplier, var_dtype)

        def get_config(self):
            config = {'lr_schedule': self.lr_schedule}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
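A usage sketch under the assumption that base_optimizer is a tf.keras OptimizerV2 subclass (which is what _decayed_lr implies); the schedule values follow the docstring above:

from tensorflow.keras.optimizers import Adam  # assumption: tf.keras OptimizerV2 backend

AdamLR = extend_with_piecewise_linear_lr_v2(Adam, name='AdamLR')
# Warm up linearly to 100% over the first 1000 steps, then decay to 10% by step 2000.
optimizer = AdamLR(lr_schedule={1000: 1, 2000: 0.1}, learning_rate=2e-5)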
Example #6
    def __init__(self,
                 token_dict,
                 do_lower_case=False,
                 pre_tokenize=None,
                 **kwargs):
        """这里的pre_tokenize是外部传入的分词函数,用作对文本进行预分词。如果传入
        pre_tokenize,则先执行pre_tokenize(text),然后在它的基础上执行原本的
        tokenize函数。
        """
        super(Tokenizer, self).__init__(**kwargs)
        if is_string(token_dict):
            token_dict = load_vocab(token_dict)

        self._do_lower_case = do_lower_case
        self._pre_tokenize = pre_tokenize
        self._token_dict = token_dict
        self._token_dict_inv = {v: k for k, v in token_dict.items()}
        self._vocab_size = len(token_dict)

        for token in ['pad', 'unk', 'mask', 'start', 'end']:
            try:
                _token_id = token_dict[getattr(self, '_token_%s' % token)]
                setattr(self, '_token_%s_id' % token, _token_id)
            except:
                pass
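A sketch of the pre_tokenize hook described in the docstring above; jieba and the 'vocab.txt' path are illustrative assumptions, and any callable that splits a string into words would do:

import jieba  # assumption: any external word segmenter works here

tokenizer = Tokenizer(
    'vocab.txt',                                  # hypothetical vocab file
    do_lower_case=True,
    pre_tokenize=lambda s: jieba.lcut(s, HMM=False),
)
token_ids, segment_ids = tokenizer.encode(u'自然语言处理')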
Example #7
    def encode(
        self,
        first_text,
        second_text=None,
        max_length=None,
        first_length=None,
        second_length=None
    ):
        """输出文本对应token id和segment id
        如果传入first_length,则强行padding第一个句子到指定长度;
        同理,如果传入second_length,则强行padding第二个句子到指定长度。
        """
        if is_string(first_text):
            first_tokens = self.tokenize(first_text)
        else:
            first_tokens = first_text

        if second_text is None:
            second_tokens = None
        elif is_string(second_text):
            idx = int(bool(self._token_start))
            second_tokens = self.tokenize(second_text)[idx:]
        else:
            second_tokens = second_text

        if max_length is not None:
            self.truncate_sequence(max_length, first_tokens, second_tokens, -2)

        first_token_ids = self.tokens_to_ids(first_tokens)
        if first_length is not None:
            first_token_ids = first_token_ids[:first_length]
            first_token_ids.extend([self._token_pad_id] *
                                   (first_length - len(first_token_ids)))
        first_segment_ids = [0] * len(first_token_ids)

        if second_text is not None:
            second_token_ids = self.tokens_to_ids(second_tokens)
            if second_length is not None:
                second_token_ids = second_token_ids[:second_length]
                second_token_ids.extend([self._token_pad_id] *
                                        (second_length - len(second_token_ids)))
            second_segment_ids = [1] * len(second_token_ids)

            first_token_ids.extend(second_token_ids)
            first_segment_ids.extend(second_segment_ids)

        return first_token_ids, first_segment_ids
Example #8
def extend_with_layer_adaptation(base_optimizer, name=None):
    """返回新的优化器类,加入层自适应学习率
    """
    class new_optimizer(base_optimizer):
        """带有层自适应学习率的优化器
        用每一层参数的模长来校正当前参数的学习率
        https://arxiv.org/abs/1904.00962
        """
        def __init__(self,
                     exclude_from_layer_adaptation=None,
                     *args,
                     **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation or []
            if not hasattr(self, 'learning_rate'):
                self.learning_rate = self.lr

        @K.symbolic
        def get_updates(self, loss, params):
            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params) and self._do_layer_adaptation(x):
                    dx = new_x - x
                    lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10)
                    x_norm = tf.norm(x)
                    g_norm = tf.norm(dx / lr_t)
                    ratio = K.switch(
                        x_norm > 0.,
                        K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.),
                        1.)
                    new_x = x + dx * ratio
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def _do_layer_adaptation(self, w):
            return (not string_matching(w.name,
                                        self.exclude_from_layer_adaptation))

        def get_config(self):
            config = {
                'exclude_from_layer_adaptation':
                self.exclude_from_layer_adaptation
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
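A hedged sketch of building a LAMB-style optimizer from this factory (classic Keras Optimizer API assumed; the excluded substrings are matched against weight names, so they depend on the model):

from keras.optimizers import Adam  # assumption: classic Keras optimizers

AdamLA = extend_with_layer_adaptation(Adam, name='AdamLA')
optimizer = AdamLA(exclude_from_layer_adaptation=['Norm', 'bias'], lr=1e-3)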
Example #9
    def encode(self,
               first_text,
               second_text=None,
               maxlen=None,
               pattern='S*E*E',
               truncate_from='right'):
        """输出文本对应token id和segment id
        """
        if is_string(first_text):
            first_tokens = self.tokenize(first_text)
        else:
            first_tokens = first_text

        if second_text is None:
            second_tokens = None
        elif is_string(second_text):
            second_tokens = self.tokenize(second_text)
        else:
            second_tokens = second_text

        if maxlen is not None:
            if truncate_from == 'right':
                index = -int(self._token_end is not None) - 1
            elif truncate_from == 'left':
                index = int(self._token_start is not None)
            else:
                index = truncate_from
            if second_text is not None and pattern == 'S*E*E':
                maxlen += 1
            truncate_sequences(maxlen, index, first_tokens, second_tokens)

        first_token_ids = self.tokens_to_ids(first_tokens)
        first_segment_ids = [0] * len(first_token_ids)

        if second_text is not None:
            if pattern == 'S*E*E':
                idx = int(bool(self._token_start))
                second_tokens = second_tokens[idx:]
            second_token_ids = self.tokens_to_ids(second_tokens)
            second_segment_ids = [1] * len(second_token_ids)
            first_token_ids.extend(second_token_ids)
            first_segment_ids.extend(second_segment_ids)

        return first_token_ids, first_segment_ids
Example #10
def extend_with_lookahead(base_optimizer, name=None):
    """返回新的优化器类,加入look ahead
    """
    class new_optimizer(base_optimizer):
        """带有look ahead的优化器
        https://arxiv.org/abs/1907.08610
        steps_per_slow_update: 即论文中的k;
        slow_step_size: 即论文中的alpha。
        """
        def __init__(self,
                     steps_per_slow_update=5,
                     slow_step_size=0.5,
                     *args,
                     **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.steps_per_slow_update = steps_per_slow_update
            self.slow_step_size = slow_step_size

        @K.symbolic
        def get_updates(self, loss, params):
            updates = super(new_optimizer, self).get_updates(loss, params)

            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_vars = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='slow_var_%s' % i) for i, p in enumerate(params)
            ]

            with tf.control_dependencies(updates):
                slow_updates = [
                    K.update(q, K.switch(cond, q + alpha * (p - q), q))
                    for p, q in zip(params, slow_vars)
                ]
                with tf.control_dependencies(slow_updates):
                    copy_updates = [
                        K.update(p, K.switch(cond, q, p))
                        for p, q in zip(params, slow_vars)
                    ]

            return copy_updates

        def get_config(self):
            config = {
                'steps_per_slow_update': self.steps_per_slow_update,
                'slow_step_size': self.slow_step_size
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
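A usage sketch with the defaults from the code above (k=5, alpha=0.5); the base optimizer choice and learning rate are assumptions:

from keras.optimizers import Adam  # assumption: classic Keras Optimizer API

AdamLookahead = extend_with_lookahead(Adam, name='AdamLookahead')
optimizer = AdamLookahead(steps_per_slow_update=5, slow_step_size=0.5, lr=1e-3)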
Example #11
def extend_with_layer_adaptation_v2(base_optimizer, name=None):
    """返回新的优化器类,加入层自适应学习率
    """
    class new_optimizer(base_optimizer):
        """带有层自适应学习率的优化器
        用每一层参数的模长来校正当前参数的学习率
        https://arxiv.org/abs/1904.00962
        """
        def __init__(self,
                     exclude_from_layer_adaptation=None,
                     *args,
                     **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation or []

        def _resource_apply_op(self, grad, var, indices=None):
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_layer_adaptation(x):
                    dx = new_x - x
                    lr_t = self._decayed_lr(x.dtype.base_dtype)
                    lr_t = K.clip(lr_t, K.epsilon(), 1e10)
                    x_norm = tf.norm(x)
                    g_norm = tf.norm(dx / lr_t)
                    ratio = K.switch(
                        x_norm > 0.,
                        K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.),
                        1.)
                    new_x = x + dx * ratio
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer,
                       self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_layer_adaptation(self, w):
            return (not string_matching(w.name,
                                        self.exclude_from_layer_adaptation))

        def get_config(self):
            config = {
                'exclude_from_layer_adaptation':
                self.exclude_from_layer_adaptation
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
Example #12
    def new_extend_with(BaseOptimizer, name=None):
        NewOptimizer = base_extend_with(BaseOptimizer)

        if is_string(name):
            NewOptimizer.__name__ = name

        name = NewOptimizer.__name__
        keras.utils.get_custom_objects()[name] = NewOptimizer

        return NewOptimizer
Example #13
def extend_with_lazy_optimization(base_optimizer, name=None):
    """返回新的优化器类,加入懒惰更新
    """
    class new_optimizer(base_optimizer):
        """带有懒惰更新的优化器
        使得部分权重(尤其是embedding)只有在梯度不等于0时
        才发生更新。
        """
        def __init__(self, include_in_lazy_optimization=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.include_in_lazy_optimization = include_in_lazy_optimization or []
            self._first_get_gradients = True

        def get_gradients(self, loss, params):
            if self._first_get_gradients:
                self._first_get_gradients = False
                return super(new_optimizer, self).get_gradients(loss, params)
            else:
                return [self.grads[p] for p in params]

        @K.symbolic
        def get_updates(self, loss, params):
            self.grads = dict(zip(params, self.get_gradients(loss, params)))

            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params) and self._do_lazy_optimization(x):
                    g = self.grads[x]
                    r = K.any(K.not_equal(g, 0.), axis=-1, keepdims=True)
                    new_x = x + (new_x - x) * K.cast(r, K.floatx())
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def _do_lazy_optimization(self, w):
            return string_matching(w.name, self.include_in_lazy_optimization)

        def get_config(self):
            config = {
                'include_in_lazy_optimization':
                self.include_in_lazy_optimization
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
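A hedged usage sketch; 'Embedding' is a plausible substring for embedding weight names, but the exact names depend on the model being trained:

from keras.optimizers import Adam  # assumption: classic Keras Optimizer API

AdamLazy = extend_with_lazy_optimization(Adam, name='AdamLazy')
optimizer = AdamLazy(include_in_lazy_optimization=['Embedding'], lr=1e-3)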
Example #14
 def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     """
     q, k, v = inputs[:3]
     # Handle masks
     idx = 3
     if q_mask:
         q_mask = inputs[idx]
         idx += 1
     else:
         q_mask = None
     if v_mask:
         v_mask = inputs[idx]
         idx += 1
     else:
         v_mask = None
     if a_mask:
         if len(inputs) > idx:
             a_mask = inputs[idx]
         else:
             a_mask = 'history_only'
     else:
         a_mask = None
     # Linear transforms
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw) / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask) and a_mask == 'history_only':
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Produce the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     o = sequence_masking(o, q_mask, 0)
     return o
Example #15
def extend_with_gradient_accumulation_v2(base_optimizer, name=None):
    """返回新的优化器类,加入梯度累积
    """
    class new_optimizer(base_optimizer):
        """带有梯度累积的优化器
        """
        def __init__(self, grad_accum_steps, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.grad_accum_steps = grad_accum_steps

        def _create_slots(self, var_list):
            super(new_optimizer, self)._create_slots(var_list)
            for var in var_list:
                self.add_slot(var, 'ag')

        def _resource_apply_op(self, grad, var, indices=None):
            # Update condition
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Fetch the accumulated gradient
            ag = self.get_slot(var, 'ag')

            old_update = K.update

            def new_update(x, new_x):
                new_x = K.switch(cond, new_x, x)
                return old_update(x, new_x)

            K.update = new_update
            ag_t = ag / self.grad_accum_steps
            op = super(new_optimizer, self)._resource_apply_op(ag_t, var)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies([op]):
                ag_t = K.switch(cond, K.zeros_like(ag), ag)
                with tf.control_dependencies([K.update(ag, ag_t)]):
                    if indices is None:
                        ag_t = K.update(ag, ag + grad)
                    else:
                        ag_t = self._resource_scatter_add(ag, indices, grad)

            return ag_t

        def get_config(self):
            config = {'grad_accum_steps': self.grad_accum_steps}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
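A usage sketch for this OptimizerV2 variant (the tf.keras import and the hyperparameter values are assumptions):

from tensorflow.keras.optimizers import Adam  # assumption: tf.keras OptimizerV2 backend

AdamGA = extend_with_gradient_accumulation_v2(Adam, name='AdamGA')
# Parameters are only applied every grad_accum_steps iterations.
optimizer = AdamGA(grad_accum_steps=8, learning_rate=2e-5)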
Example #16
def extend_with_lookahead_v2(base_optimizer, name=None):
    """返回新的优化器类,加入look ahead
    """
    class new_optimizer(base_optimizer):
        """带有look ahead的优化器
        https://arxiv.org/abs/1907.08610
        steps_per_slow_update: 即论文中的k;
        slow_step_size: 即论文中的alpha。
        """
        def __init__(self,
                     steps_per_slow_update=5,
                     slow_step_size=0.5,
                     *args,
                     **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.steps_per_slow_update = steps_per_slow_update
            self.slow_step_size = slow_step_size

        def _create_slots(self, var_list):
            super(new_optimizer, self)._create_slots(var_list)
            for var in var_list:
                self.add_slot(var, 'slow_var')

        def _resource_apply_op(self, grad, var, indices=None):
            op = super(new_optimizer,
                       self)._resource_apply_op(grad, var, indices)

            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_var = self.get_slot(var, 'slow_var')
            slow_var_t = slow_var + alpha * (var - slow_var)

            with tf.control_dependencies([op]):
                slow_update = K.update(slow_var,
                                       K.switch(cond, slow_var_t, slow_var))
                with tf.control_dependencies([slow_update]):
                    copy_update = K.update(var, K.switch(cond, slow_var, var))

            return copy_update

        def get_config(self):
            config = {
                'steps_per_slow_update': self.steps_per_slow_update,
                'slow_step_size': self.slow_step_size
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
Example #17
    def encode(self,
               first_text,
               second_text=None,
               maxlen=None,
               max_length=None):
        """输出文本对应token id和segment id
        """
        # 向后兼容
        if maxlen is None and max_length is not None:
            print(
                'From tokenizers.py: The argument max_length is deprecated. Please use maxlen instead.'
            )
        maxlen = maxlen or max_length

        if is_string(first_text):
            first_tokens = self.tokenize(first_text)
        else:
            first_tokens = first_text

        if second_text is None:
            second_tokens = None
        elif is_string(second_text):
            idx = int(bool(self._token_start))
            second_tokens = self.tokenize(second_text)[idx:]
        else:
            second_tokens = second_text

        if maxlen is not None:
            self.truncate_sequence(maxlen, first_tokens, second_tokens, -2)

        first_token_ids = self.tokens_to_ids(first_tokens)
        first_segment_ids = [0] * len(first_token_ids)

        if second_text is not None:
            second_token_ids = self.tokens_to_ids(second_tokens)
            second_segment_ids = [1] * len(second_token_ids)
            first_token_ids.extend(second_token_ids)
            first_segment_ids.extend(second_segment_ids)

        return first_token_ids, first_segment_ids
Example #18
def extend_with_lazy_optimization_v2(base_optimizer, name=None):
    """返回新的优化器类,加入懒惰更新
    """
    class new_optimizer(base_optimizer):
        """带有懒惰更新的优化器
        使得部分权重(尤其是embedding)只有在梯度不等于0时
        才发生更新。
        """
        def __init__(self, include_in_lazy_optimization=None, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.include_in_lazy_optimization = include_in_lazy_optimization or []
            self._first_get_gradients = True

        def _resource_apply_op(self, grad, var, indices=None):
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_lazy_optimization(x):
                    if indices is None:
                        r = K.any(K.not_equal(grad, 0.),
                                  axis=-1,
                                  keepdims=True)
                        new_x = x + (new_x - x) * K.cast(r, K.floatx())
                        return old_update(x, new_x)
                    else:
                        return self._resource_scatter_add(
                            x, indices, K.gather(new_x - x, indices))
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer,
                       self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_lazy_optimization(self, w):
            return string_matching(w.name, self.include_in_lazy_optimization)

        def get_config(self):
            config = {
                'include_in_lazy_optimization':
                self.include_in_lazy_optimization
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
Example #19
def extend_with_weight_decay(base_optimizer, name=None):
    """返回新的优化器类,加入权重衰减
    """
    class new_optimizer(base_optimizer):
        """带有权重衰减的优化器
        """
        def __init__(self,
                     weight_decay_rate,
                     exclude_from_weight_decay=None,
                     *args,
                     **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.weight_decay_rate = weight_decay_rate
            self.exclude_from_weight_decay = exclude_from_weight_decay or []
            if not hasattr(self, 'learning_rate'):
                self.learning_rate = self.lr

        @K.symbolic
        def get_updates(self, loss, params):
            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params) and self._do_weight_decay(x):
                    new_x = new_x - self.learning_rate * self.weight_decay_rate * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def _do_weight_decay(self, w):
            return (not string_matching(w.name,
                                        self.exclude_from_weight_decay))

        def get_config(self):
            config = {
                'weight_decay_rate': self.weight_decay_rate,
                'exclude_from_weight_decay': self.exclude_from_weight_decay
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
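These factories compose. A hedged sketch of an AdamW-style optimizer with decoupled weight decay and linear warmup, chaining this factory with the piecewise linear one from Example #23 in this listing (all hyperparameter values and name substrings are placeholders):

from keras.optimizers import Adam  # assumption: classic Keras Optimizer API

AdamW = extend_with_weight_decay(Adam, name='AdamW')
AdamWLR = extend_with_piecewise_linear_lr(AdamW, name='AdamWLR')
optimizer = AdamWLR(
    lr_schedule={1000: 1},                       # linear warmup over the first 1000 steps
    weight_decay_rate=0.01,
    exclude_from_weight_decay=['Norm', 'bias'],  # substrings matched against weight names
    lr=1e-4,
)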
Example #20
def extend_with_weight_decay_v2(base_optimizer, name=None):
    """返回新的优化器类,加入权重衰减
    """
    class new_optimizer(base_optimizer):
        """带有权重衰减的优化器
        """
        def __init__(self,
                     weight_decay_rate,
                     exclude_from_weight_decay=None,
                     *args,
                     **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.weight_decay_rate = weight_decay_rate
            self.exclude_from_weight_decay = exclude_from_weight_decay or []

        def _resource_apply_op(self, grad, var, indices=None):
            old_update = K.update

            def new_update(x, new_x):
                if x is var and self._do_weight_decay(x):
                    lr_t = self._decayed_lr(x.dtype.base_dtype)
                    new_x = new_x - lr_t * self.weight_decay_rate * x
                return old_update(x, new_x)

            K.update = new_update
            op = super(new_optimizer,
                       self)._resource_apply_op(grad, var, indices)
            K.update = old_update

            return op

        def _do_weight_decay(self, w):
            return (not string_matching(w.name,
                                        self.exclude_from_weight_decay))

        def get_config(self):
            config = {
                'weight_decay_rate': self.weight_decay_rate,
                'exclude_from_weight_decay': self.exclude_from_weight_decay
            }
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
Example #21
    def __init__(self, token_dict, do_lower_case=False, **kwargs):
        super(Tokenizer, self).__init__(**kwargs)
        if is_string(token_dict):
            token_dict = load_vocab(token_dict)

        self._do_lower_case = do_lower_case
        self._token_dict = token_dict
        self._token_dict_inv = {v: k for k, v in token_dict.items()}
        self._vocab_size = len(token_dict)

        for token in ['pad', 'unk', 'mask', 'start', 'end']:
            try:
                _token_id = token_dict[getattr(self, '_token_%s' % token)]
                setattr(self, '_token_%s_id' % token, _token_id)
            except:
                pass
Example #22
    def __init__(self, token_dict, do_lower_case=False):
        """初始化
        """
        super(Tokenizer, self).__init__(do_lower_case)
        if is_string(token_dict):
            token_dict = load_vocab(token_dict)

        self._token_dict = token_dict
        self._token_dict_inv = {v: k for k, v in token_dict.items()}
        for token in ['pad', 'cls', 'sep', 'unk', 'mask']:
            try:
                _token_id = token_dict[getattr(self, '_token_%s' % token)]
                setattr(self, '_token_%s_id' % token, _token_id)
            except:
                pass
        self._vocab_size = len(token_dict)
Example #23
def extend_with_piecewise_linear_lr(base_optimizer, name=None):
    """返回新的优化器类,加入分段线性学习率
    """
    class new_optimizer(base_optimizer):
        """带有分段线性学习率的优化器
        其中schedule是形如{1000: 1, 2000: 0.1}的字典,
        表示0~1000步内学习率线性地从零增加到100%,然后
        1000~2000步内线性地降到10%,2000步以后保持10%
        """
        def __init__(self, lr_schedule, *args, **kwargs):
            super(new_optimizer, self).__init__(*args, **kwargs)
            self.lr_schedule = {int(i): j for i, j in lr_schedule.items()}

        @K.symbolic
        def get_updates(self, loss, params):
            lr_multiplier = piecewise_linear(self.iterations, self.lr_schedule)

            old_update = K.update

            def new_update(x, new_x):
                if is_one_of(x, params):
                    new_x = x + (new_x - x) * lr_multiplier
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            return updates

        def get_config(self):
            config = {'lr_schedule': self.lr_schedule}
            base_config = super(new_optimizer, self).get_config()
            return dict(list(base_config.items()) + list(config.items()))

    if is_string(name):
        new_optimizer.__name__ = name
        keras.utils.get_custom_objects()[name] = new_optimizer

    return new_optimizer
Example #24
 def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     """
     # Handle masks
     inputs = inputs[:]
     for i, mask in enumerate([q_mask, v_mask, a_mask]):
         if not mask:
             inputs.insert(3 + i, None)
     q, k, v, q_mask, v_mask = inputs[:5]
     if len(inputs) == 5:
         a_mask = 'history_only'
     elif len(inputs) == 6:
         a_mask = inputs[-1]
     else:
         raise ValueError('wrong inputs for MultiHeadAttention.')
     # Linear transforms
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Relative position encoding
     if self.max_relative_position is not None:
         q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
         q_idxs = K.expand_dims(q_idxs, 1)
         v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
         v_idxs = K.expand_dims(v_idxs, 0)
         pos_ids = v_idxs - q_idxs
         pos_ids = K.clip(pos_ids, -self.max_relative_position,
                          self.max_relative_position)
         pos_ids = pos_ids + self.max_relative_position
         pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     # Attention (continued)
     a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Produce the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if self.max_relative_position is not None:
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     o = sequence_masking(o, q_mask, 0)
     return o
Example #25
 def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     """
     q, k, v = inputs[:3]
     if a_mask:
         if len(inputs) == 3:
             a_mask = 'history_only'
         else:
             a_mask = inputs[3]
     if q_mask is not None:
         if not hasattr(self, 'q_mask_layer'):
             self.q_mask_layer = search_layer(q, q_mask)
         q_mask = self.q_mask_layer.output_mask
     if v_mask is not None:
         if not hasattr(self, 'v_mask_layer'):
             self.v_mask_layer = search_layer(v, v_mask)
         v_mask = self.v_mask_layer.output_mask
     # Pooling
     if self.pool_size > 1:
         is_self_attention = (q is k is v)
         q_in_len = K.shape(q)[1]
         q = sequence_masking(q, q_mask, 0)
         q = divisible_temporal_padding(q, self.pool_size)
         q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
         if is_self_attention:
             k = v = q
         else:
             k = sequence_masking(k, v_mask, 0)
             k = divisible_temporal_padding(k, self.pool_size)
             k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
             v = sequence_masking(v, v_mask, 0)
             v = divisible_temporal_padding(v, self.pool_size)
             v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
         if v_mask is not None:
             v_mask = v_mask[:, ::self.pool_size]
         if a_mask is not None and not is_string(a_mask):
             a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
     # Linear transforms
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Relative position encoding
     if self.max_relative_position is not None:
         q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
         q_idxs = K.expand_dims(q_idxs, 1)
         v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
         v_idxs = K.expand_dims(v_idxs, 0)
         pos_ids = v_idxs - q_idxs
         pos_ids = K.clip(pos_ids, -self.max_relative_position,
                          self.max_relative_position)
         pos_ids = pos_ids + self.max_relative_position
         pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     # Attention (continued)
     a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Produce the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if self.max_relative_position is not None:
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # Restore the original sequence length
     if self.pool_size > 1:
         o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
     # Return the result
     o = sequence_masking(o, q_mask, 0)
     return o
Example #26
 def generate(self, image, topk=2):
     if is_string(image):
         image = read_image(image)
     image = preprocess_input(image)
     output_ids = self.decode([image], topk)
     return tokenizer.decode(output_ids)
Example #27
    def encode_multi(self,
                     first_text: str,
                     second_texts: List = None,
                     limit_lengths: List = None,
                     length_max: int = None,
                     first_length: int = None,
                     second_length: int = None,
                     is_multi: bool = True):
        """
        bert encode of multi-text-input, 处理多输入(segment_ids标注两个还是多个[0,1]), 同时第二句子可能会截断[CLS]/[SEP]等信息
        Args:
            first_text: Any, first input of sentence when in single-task, pair-task or multi-task, eg. "macadam英文什么意思"
            second_texts: List, second inputs of sentence, eg. ["macadam?", "啥macadam?", "macadam什么意思"] 
            limit_lengths: List, limit lengths of each in texts(second inputs), eg.[36, 36,128]
            length_max: int, max length of the whole sequence, eg. 512
            first_length: int, max length of first_text, eg. 128
            second_length: int, max length of the whole sequence of texts(second inputs), eg. 128
            is_multi: bool, either sign sentence in texts with multi or not
        Returns:
            input of bert-like model
        """

        # split into tokens (character-level for Chinese text)
        if is_string(first_text):
            first_tokens = self.tokenize(first_text)
        else:
            first_tokens = first_text
        # tokens to ids; truncate when too long, pad when too short
        first_token_ids = self.tokens_to_ids(first_tokens)
        if first_length:
            first_token_ids = first_token_ids[:first_length]
            first_token_ids.extend([self._token_pad_id] *
                                   (first_length - len(first_token_ids)))
        # segment ids, e.g. [0,0,0,0,0,1,1,1,1], to tell the sentences apart
        first_segment_ids = [0] * len(first_token_ids)
        # second segments: the sentences after the first one
        second_token_ids = []
        second_segment_ids = []
        # only if there are second (or further) sentences, to avoid the empty case
        if second_texts:
            len_texts = len(second_texts)
            for i in range(len_texts):
                text = second_texts[i]
                if not text:
                    tokens = None
                elif is_string(text):
                    idx = int(bool(self._token_start))
                    tokens = self.tokenize(text)[idx:]
                else:
                    tokens = text
                if tokens:
                    # tokens to ids
                    token_ids = self.tokens_to_ids(tokens)
                    if limit_lengths and limit_lengths[i]:
                        token_ids = token_ids[:limit_lengths[i]]
                        token_ids.extend([self._token_pad_id] *
                                         (limit_lengths[i] - len(token_ids)))
                    # segment ids: the first sentence is 0; later sentences either alternate or are all marked 1
                    if is_multi:
                        id_sent = 1 if i % 2 == 0 else 0
                    else:
                        id_sent = 1
                    segment_ids = [id_sent] * len(token_ids)
                    second_token_ids.extend(token_ids)
                    second_segment_ids.extend(segment_ids)
            # cap the combined length of all sentences after the first
            if second_length:
                second_token_ids = second_token_ids[:second_length]
                second_segment_ids = second_segment_ids[:second_length]
            # concatenate all sentences
            first_token_ids.extend(second_token_ids)
            first_segment_ids.extend(second_segment_ids)
        # limit the total length of the whole sequence
        if length_max:
            first_token_ids = first_token_ids[:length_max]
            first_segment_ids = first_segment_ids[:length_max]

        return first_token_ids, first_segment_ids
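A usage sketch with the example values from the docstring above (the tokenizer instance itself is assumed to have been built elsewhere from a BERT-style vocab):

token_ids, segment_ids = tokenizer.encode_multi(
    first_text='macadam英文什么意思',
    second_texts=['macadam?', '啥macadam?', 'macadam什么意思'],
    limit_lengths=[36, 36, 128],
    length_max=512,
)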
Example #28
 def generate(self, image, topk=1):
     if is_string(image):
         image = read_image(image)
     image = preprocess_input(image)
     output_ids = self.beam_search([image], topk)  # beam-search decoding
     return tokenizer.decode(output_ids)
Example #29
 def call(self, inputs, mask=None, a_mask=None, p_bias=None):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     p_bias: 在attention里的位置偏置。
             一般用来指定相对位置编码的种类。
     """
     q, k, v = inputs[:3]
     q_mask, v_mask, n = None, None, 3
     if mask is not None:
         if mask[0] is not None:
             q_mask = K.cast(mask[0], K.floatx())
         if mask[2] is not None:
             v_mask = K.cast(mask[2], K.floatx())
     if a_mask:
         if len(inputs) == 3:
             a_mask = 'history_only'
         else:
             a_mask = inputs[n]
             n += 1
     # Linear transforms
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Handle position encodings
     if p_bias == 'typical_relative':
         pos_embeddings = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     elif p_bias == 't5_relative':
         pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(pos_embeddings, 0)
     # Attention (continued)
     if p_bias != 't5_relative':  # T5 does not use scaling
         a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Produce the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # Return the result
     o = sequence_masking(o, q_mask, 0)
     return o