Example #1
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
            for (i, p) in enumerate(params)
        ]
        vs = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
            for (i, p) in enumerate(params)
        ]

        if self.amsgrad:
            vhats = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='vhat_' + str(i)) for (i, p) in enumerate(params)
            ]
        else:
            vhats = [
                K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
            ]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            # Second moment of the centred gradient (g - m_t), an
            # AdaBelief-style variant of plain Adam's K.square(g)
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
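The bias-corrected step size lr_t shrinks toward the base learning rate as t grows. A minimal sketch of that schedule in plain Python, assuming typical values beta_1=0.9, beta_2=0.999, lr=1e-3 (these values are illustrative, not from the snippet):

    lr, beta_1, beta_2 = 1e-3, 0.9, 0.999
    for t in (1, 10, 100, 1000):
        # lr_t = lr * sqrt(1 - beta_2^t) / (1 - beta_1^t), as in the snippet
        lr_t = lr * ((1. - beta_2 ** t) ** 0.5) / (1. - beta_1 ** t)
        print(t, lr_t)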
Example #2
        def get_updates(self, loss, params):
            # Should this step actually apply the update?
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Fetch the gradients
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(shape=K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_{}'.format(i))
                for i, p in enumerate(params)
            ]

            # Temporarily monkey-patch K.update so the parent optimizer's
            # updates only take effect on accumulation boundaries
            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(NewOptimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate the gradients; on boundary steps (cond == 1)
            # the accumulator restarts from the current gradient
            with K.control_dependencies(updates):
                acc_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for ag, g in zip(self.accum_grads, grads)
                ]

            return acc_updates
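The cond flag makes every K.update a no-op except on accumulation boundaries. A tiny stand-alone sketch of the gating arithmetic (plain Python; the names here are illustrative only):

    def gated(x, new_x, cond):
        # cond == 1: take the new value; cond == 0: keep the old one
        return cond * new_x + (1 - cond) * x

    print(gated(1.0, 5.0, 0))  # 1.0 -> parameter frozen while accumulating
    print(gated(1.0, 5.0, 1))  # 5.0 -> update applied on the boundary step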
Example #3
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     # Derive the mask and cast its dtype
     mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
     mask = K.cast(mask, K.floatx())
     # Compute the target path score
     y_true, y_pred = y_true * mask, y_pred * mask
     target_score = self.path_score(y_pred, y_true)
     # Recursively compute log Z
     init_states = [y_pred[:, 0]]
     y_pred = K.concatenate([y_pred, mask], axis=2)
     input_length = K.int_shape(y_pred[:, 1:])[1]
     log_norm, _, _ = K.rnn(self.log_norm_step,
                            y_pred[:, 1:],
                            init_states,
                            input_length=input_length)  # log Z vector from the final step
     log_norm = K.logsumexp(log_norm, 1)  # logsumexp reduces it to a scalar
     # Compute the loss: -log p
     return log_norm - target_score
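The returned value is the negative log-likelihood of a linear-chain CRF: loss = log Z - score(gold path). A toy check of that identity with NumPy (the path scores below are hypothetical, not from the snippet):

    import numpy as np

    scores = np.array([2.0, 1.0, 0.5])      # scores of all possible paths
    target = scores[0]                       # score of the gold path
    log_z = np.logaddexp.reduce(scores)      # log of the partition function
    print(log_z - target)                    # -log p(gold path)
    print(-np.log(np.exp(target) / np.exp(scores).sum()))  # same value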
Example #4
    def call(self, x, mask=None):
        x0 = x
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, 2)
        # Masking is intentionally not applied to x0 here; the identity
        # Lambda strips the Keras mask so conv1d does not receive it
        x0 = Lambda(lambda x_: x_, output_shape=lambda s: s)(x0)
        x = self.conv1d(x0)
        x, g = x[:, :, :self.o_dim], x[:, :, self.o_dim:]
        if self.dropout_rate is not None:
            g = K.in_train_phase(K.dropout(g, self.dropout_rate), g)
        g = K.sigmoid(g)
        # Fall back to an all-ones mask when none was given
        mask = mask if mask is not None else K.ones_like(x)

        if self.skip_connection:
            if K.int_shape(x0)[-1] != self.o_dim:
                x0 = self.conv1d_1x1(x0)
            return (x0 * (1 - g) + x * g) * mask
        return x * g * mask
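With skip_connection enabled the layer computes a gated residual, x0*(1-g) + x*g, so the gate interpolates between the input and the convolution branch. A NumPy sketch of that interpolation (the values are illustrative):

    import numpy as np

    x0 = np.array([1.0, 1.0])   # residual (input) branch
    x = np.array([3.0, 3.0])    # convolution branch
    for g in (0.0, 0.5, 1.0):
        print(g, x0 * (1 - g) + x * g)  # g=0 keeps x0, g=1 keeps x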
Example #5
train_generator = data_generator(data=train_data, batch_size=batch_size)
valid_generator = data_generator(data=valid_data, batch_size=batch_size)
train_transfer_generator = data_generator(data=train_data,
                                          batch_size=batch_size,
                                          transfer=True,
                                          data_augmentation=True)

# Load the pre-trained model (3 layers)
teacher = build_transformer_model(config_path=config_path,
                                  checkpoint_path=checkpoint_path,
                                  return_keras_model=False,
                                  num_hidden_layers=num_hidden_layers,
                                  model='bert')

# Classifier model
x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x[:, 0])(x_in)
x = Dense(units=num_classes, activation='softmax')(x)
classifier = Model(x_in, x)

teacher_model = Model(teacher.inputs, classifier(teacher.output))
teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(2e-5),  # use a sufficiently small learning rate
    metrics=['sparse_categorical_accuracy'],
)

teacher_model.summary()
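A hypothetical training call for the compiled teacher, assuming the data_generator above is a bert4keras-style generator exposing forfit() and a step count, and that an epochs value is defined elsewhere (assumed names, not shown in the snippet):

    teacher_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
    )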


Example #6
# Load the pre-trained model (12 layers)
predecessor = build_transformer_model(config_path=config_path,
                                      checkpoint_path=checkpoint_path,
                                      return_keras_model=False,
                                      prefix='Predecessor-')

# Load the pre-trained model (3 layers)
successor = build_transformer_model(config_path=config_path,
                                    checkpoint_path=checkpoint_path,
                                    return_keras_model=False,
                                    num_hidden_layers=3,
                                    prefix='Successor-')

# Classifier model
x_in = Input(shape=K.int_shape(predecessor.output)[1:])
x = Dense(num_labels)(x_in)
CRF = ConditionalRandomField(lr_multiplier=2)
x = CRF(x)
classifier = Model(x_in, x)

opt = Adam(learning_rate=lr)

predecessor_model = Model(predecessor.inputs, classifier(predecessor.outputs))
predecessor_model.compile(
    loss=predecessor_model.layers[-1].layers[-1].sparse_loss,
    optimizer=opt,
    metrics=[CRF.sparse_accuracy])

predecessor_model.summary()
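The sparse_loss passed to compile() is reached through the nested classifier model; an equivalent, more explicit lookup using only names defined above:

    crf_layer = classifier.layers[-1]   # the ConditionalRandomField instance
    assert crf_layer is CRF
    loss_fn = crf_layer.sparse_loss     # the loss passed to compile() above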
Example #7
    def call(self, inputs, mask=None):
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, 2)
            # Push masked positions toward -inf so softmax ignores them
            inputs = inputs - (1.0 - mask) * 1e12
        return K.softmax(inputs, 1)


# build model
model = build_transformer_model(
    config_path,
    checkpoint_path,
)

inputs = [
    Input(shape=K.int_shape(model.inputs[0])[1:]),
    Input(shape=K.int_shape(model.inputs[1])[1:])
]
output = model(inputs)
output = SinCosPositionEmbedding(K.int_shape(output)[-1])(output)

output = Dropout(0.5)(output)
output = Dense(384, activation='tanh')(output)

att = AttentionPooling1D(name='attention_pooling_1')(output)

output = ConcatSeq2Vec()([output, att])

output = DGCNN(dilation_rate=1, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=2, dropout_rate=0.1)(output)
output = DGCNN(dilation_rate=5, dropout_rate=0.1)(output)
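Stacking DGCNN blocks with dilation rates 1, 2 and 5 widens the receptive field quickly. Assuming a kernel size of 3 per block (the kernel size is not shown in this snippet), the covered span works out as:

    k = 3
    rf = 1
    for d in (1, 2, 5):
        rf += (k - 1) * d   # each dilated conv adds (k-1)*d positions
    print(rf)               # 17 tokens of context after the three blocks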
Example #8
    def compute_output_shape(self, input_shape):
        if self._mode == 'embedding':
            return super(Embedding, self).compute_output_shape(input_shape)

        return input_shape[:2] + (K.int_shape(self.embeddings)[0], )

        print(
            'valid:  f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n'
            % (f1, precision, recall, self.best_val_f1)
        )
        f1, precision, recall = evaluate(self.model, test_data)
        print(
            'test:  f1: %.5f, precision: %.5f, recall: %.5f\n' %
            (f1, precision, recall)
        )


teacher = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False
)

x_in = Input(shape=K.int_shape(teacher.output)[1:])
x = Lambda(lambda x: x)(x_in)  # identity op; drops any attached Keras mask
# Token-wise softmax over the label set
x = Dense(num_labels, activation='softmax')(x)
teacher_classifier = Model(x_in, x)

teacher_model = Model(teacher.input, teacher_classifier(teacher.output))
teacher_model.summary()

teacher_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate),
    metrics=['sparse_categorical_accuracy']
)

student = build_transformer_model(
    config_path,
    checkpoint_path,
    return_keras_model=False  # assumed arguments, mirroring the teacher above
)
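A minimal sketch of the distillation objective this teacher/student pairing suggests, assuming the student is trained against the teacher's softmax outputs (the loss wiring is an assumption, not shown in the snippet):

    import tensorflow as tf

    kld = tf.keras.losses.KLDivergence()

    def soft_label_loss(teacher_probs, student_probs):
        # pull the student's distribution toward the teacher's
        return kld(teacher_probs, student_probs)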
Example #10
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            self.model.save_weights(self.model_name)
        print(
            'valid:  f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n'
            % (f1, precision, recall, self.best_val_f1))
        f1, precision, recall = evaluate(self.model, test_data)
        print('test:  f1: %.5f, precision: %.5f, recall: %.5f\n' %
              (f1, precision, recall))


bert = build_transformer_model(config_path,
                               checkpoint_path,
                               return_keras_model=False)

x_in = Input(shape=K.int_shape(bert.output)[1:])
x = Lambda(lambda x: x)(x_in)  # identity op; drops any attached Keras mask
# Token-wise softmax over the label set
x = Dense(num_labels, activation='softmax')(x)
bert_classifier = Model(x_in, x)

teacher_model = Model(bert.input, bert_classifier(bert.output))
teacher_model.summary()

teacher_model.compile(loss='sparse_categorical_crossentropy',
                      optimizer=Adam(learning_rate),
                      metrics=['sparse_categorical_accuracy'])

if __name__ == '__main__':
    teacher_model_name = './best_teacher_model.weights'
    teacher_evaluator = Evaluator(teacher_model_name)
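A hypothetical continuation that attaches the evaluator to training, assuming Evaluator is a keras.callbacks.Callback subclass and that train_generator and epochs are defined elsewhere in the script (assumed names):

    teacher_model.fit(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[teacher_evaluator],
    )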