Example #1
def make_model(keep_words, label2id, train=False):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)
    class_num = len(label2id)
    # output = Attention(512, name='attention_1')(model.output)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(class_num, activation='softmax')(output)
    model = Model(model.input, output)

    # if train:
    #     # fine-tune only the top few layers of ALBERT;
    #     model.trainable = True
    #     set_trainable = False
    #     for layer in model.layers:
    #         if layer.name == 'Encoder-1-FeedForward-Norm': # 'attention_1':
    #             set_trainable = True
    #         if set_trainable:
    #             layer.trainable = True
    #         else:
    #             layer.trainable = False

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])

    # save a plot of the model graph
    plot_model(model, 'classify-albert.png')

    model.summary()
    return model
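
make_model above takes keep_words (the original vocabulary row indices to keep) and label2id, neither of which is built in this snippet. Below is a minimal sketch of how they are typically constructed in bert4keras demos of this vintage; dict_path, train_data and the special-token list are assumptions, not part of the original code:

from bert4keras.utils import SimpleTokenizer, load_vocab

# Hypothetical: prune the vocabulary to the characters that actually occur in the corpus.
_token_dict = load_vocab(dict_path)  # full vocabulary: token -> id (dict_path assumed)
chars = {c for text, label in train_data for c in text}  # train_data assumed as (text, label) pairs

token_dict, keep_words = {}, []
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
for c in sorted(chars):
    if c in _token_dict and c not in token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

tokenizer = SimpleTokenizer(token_dict)  # tokenizer over the pruned vocabulary
label2id = {label: i for i, label in enumerate(sorted({label for _, label in train_data}))}
model = make_model(keep_words, label2id)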
Example #2
def predict(fold=0):
    # from accum_optimizer import AccumOptimizer

    model = load_pretrained_model(
        config_path,
        checkpoint_path,
        seq2seq=False,
        keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    )
    x_in = keras.Input(shape=(None, ), name='Token')
    s_in = keras.Input(shape=(None, ), name='Segment')
    output = model([x_in, s_in])
    output = keras.layers.core.Lambda(lambda x: x[:, 0, :])(output)
    output = keras.layers.Dense(2, activation='sigmoid')(output)
    model = keras.Model([x_in, s_in], output)

    opt = Adam(5e-6)
    model.compile(opt, loss=[focal_loss(alpha=0.85)], metrics=['accuracy'])

    if fold == 0:
        model.summary()

    save_dir = join(MODEL_PATH, 'bert_wwm_aug_focal/out_{}'.format(fold))
    # save_dir = join(MODEL_PATH, 'bert_res/out_{}'.format(fold))
    save_path = join(save_dir, 'trained.ckpt')

    model.load_weights(save_path)
    logger.info('predict...')
    results = model.predict([trains_x, trains_s], batch_size=batch_size)
    print('result shape: {}'.format(results.shape))
    assert len(results) == dev.shape[0]

    keras.backend.clear_session()
    return results[:, 1]
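
The snippet above relies on a focal_loss factory that is not defined here. A minimal Keras-backend sketch of a standard binary focal loss (Lin et al., 2017) with an alpha class weight; the gamma default and the exact formulation in the original project are assumptions:

from keras import backend as K

def focal_loss(alpha=0.85, gamma=2.0):
    """Hypothetical stand-in for the project's focal_loss; formulation assumed."""
    def loss(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
        pt = y_true * y_pred + (1.0 - y_true) * (1.0 - y_pred)  # probability of the true class
        at = y_true * alpha + (1.0 - y_true) * (1.0 - alpha)    # class weighting
        return -K.mean(at * K.pow(1.0 - pt, gamma) * K.log(pt))
    return loss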
def predict(text):
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "rb") as f:
        tokenizer = pickle.load(f)

    with open(os.path.join(model_save_path, 'keep_words.pkl'), "rb") as f:
        keep_words = pickle.load(f)

    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)

    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    model.summary()
    model.load_weights(os.path.join(model_save_path,
                                    'checkpoint-02-0.15-0.939.hdf5'),
                       by_name=True,
                       skip_mismatch=True,
                       reshape=True)

    text = text[:maxlen]
    x1, x2 = tokenizer.encode(first=text)

    X1 = seq_padding([x1])
    X2 = seq_padding([x2])
    ret = model.predict([X1, X2])
    return ret
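
seq_padding is used above (and in several later snippets) but never defined. In the bert4keras demos this code follows, it zero-pads every sequence in a batch to the batch's maximum length; a minimal sketch of that behaviour:

import numpy as np

def seq_padding(X, padding=0):
    """Pad each sequence in X with `padding` up to the longest sequence's length."""
    ML = max(len(x) for x in X)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])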
    def _get_model(self):
        model = load_pretrained_model(
            self.albert_config_path,
            self.albert_checkpoint_path,
            keep_words=self.keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
            albert=True)
        output = Lambda(lambda x: x[:, 0])(model.output)
        output = Dense(1, activation='sigmoid')(output)
        model = Model(model.input, output)
        return model
Example #5
def make_model(keep_words):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)

    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(inputs=model.input, outputs=output)

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    model.summary()
    return model
def train(train_data, valid_data, tokenizer, keep_words):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)

    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)

    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    model.summary()

    train_D = data_generator(train_data, tokenizer=tokenizer)
    valid_D = data_generator(valid_data, tokenizer=tokenizer)

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)

    model_checkpoint = ModelCheckpoint(filepath=os.path.join(
        model_save_path,
        'checkpoint-{epoch:02d}-{val_loss:.2f}-{val_accuracy:.3f}.hdf5'),
                                       save_best_only=True,
                                       save_weights_only=False)

    tb = TensorBoard(
        log_dir=log_dir,  # log directory
        histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
        batch_size=32,  # how much data to use when computing histograms
        write_graph=True,  # whether to store the model graph
        write_grads=False,  # whether to visualize gradient histograms
        write_images=False,  # whether to visualize model weights as images
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None)

    model.fit_generator(train_D.__iter__(),
                        steps_per_epoch=len(train_D),
                        epochs=5,
                        validation_data=valid_D.__iter__(),
                        validation_steps=len(valid_D),
                        callbacks=[early_stopping, model_checkpoint, tb])
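
The train function above expects a data_generator class that is not shown. A hypothetical sketch consistent with the truncated generator bodies further down this page; batch_size, maxlen and the (text, label) record layout are assumptions:

import numpy as np

class data_generator:
    def __init__(self, data, tokenizer, batch_size=32, maxlen=256):
        self.data = data
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.steps = int(np.ceil(len(data) / batch_size))

    def __len__(self):
        return self.steps  # so len(train_D) gives steps_per_epoch

    def __iter__(self):
        while True:  # loop forever; fit_generator stops after steps_per_epoch
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                text, y = self.data[i]
                x1, x2 = self.tokenizer.encode(first=text[:self.maxlen])
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    yield [seq_padding(X1), seq_padding(X2)], seq_padding(Y)
                    X1, X2, Y = [], [], []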
Example #7
    def __init__(self,
                 initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        self.initial_model = initial_model
        token_dict = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(token_dict)
        self.model_path = model_path
        if initial_model:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)
        else:
            self.load(model_path)

        for l in self.albert_model.layers:
            l.trainable = True
def make_model(keep_words, label2id):
    model = load_pretrained_model(
        config_path,
        checkpoint_path,
        keep_words=keep_words,
        albert=True
    )
    class_num = len(label2id)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(class_num, activation='softmax')(output)
    model = Model(model.input, output)

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy']
    )
    model.summary()
    return model
def build_model(keep_words, ner_units=None, rel_units=None):
    # construct model

    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)

    output = Lambda(lambda x: x[:, 0])(model.output)

    # dense = Dense(200, activation='relu')(output)
    # dense = BatchNormalization()(dense)
    ner_out = CRF(ner_units, sparse_target=True, name='ner_out')(model.output)
    # dense = Lambda(lambda x: x, output_shape=lambda s: s)(dense)
    # attention_out = Attention(200, name='attention_1')(dense)
    # lambda_out = Lambda(lambda x: x[:, 0])(dense)
    # lambda_out = BatchNormalization()(lambda_out)
    rel_out = Dense(rel_units, activation='softmax', name='rel_out')(output)

    model = Model(model.input, outputs=[ner_out, rel_out])
    # model.compile(optimizer='rmsprop', loss='binary_crossentropy', loss_weights=[1., 0.2])
    # the learning rate must not be too low (e.g. 5e-7); if it is, training terminates early before the model reaches its optimum;
    model.compile(optimizer=Adam(lr=5e-6),
                  loss={
                      'ner_out': crf_loss,
                      'rel_out': 'categorical_crossentropy'
                  },
                  metrics={
                      'ner_out': crf_viterbi_accuracy,
                      'rel_out': 'accuracy'
                  },
                  loss_weights={
                      'ner_out': 0.5,
                      'rel_out': 0.5
                  })
    model.summary()

    # save a plot of the model graph
    plot_model(model, 'ner_classify_albert_tiny.png')

    return model
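
CRF, crf_loss and crf_viterbi_accuracy are used above (and in Example #11 below) but never imported in these snippets. In Keras projects of this era they usually come from keras_contrib; that origin is an assumption here:

# Assumed imports for the CRF layer, loss and metrics used above.
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy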
Example #10
def creat_model(config, keep_words):
    """构建模型"""
    model = load_pretrained_model(
        config.bert_config,
        config.bert_checkpoint,
        #config.pretrained_model_path,
        seq2seq=True,
        keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    )

    # cross entropy as the loss, with predictions over the input segment masked out
    # target tokens
    y_in = model.input[0][:, 1:]
    y_mask = model.input[1][:, 1:]
    # predicted tokens, offset from the targets by one position
    y = model.output[:, :-1]
    cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
    cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

    model.add_loss(cross_entropy)
    model.compile(optimizer=Adam(1e-5))

    return model
Example #11
    def build_model(self):

        model = load_pretrained_model(config_path,
                                      checkpoint_path,
                                      keep_words=self.keep_words,
                                      albert=True)

        # output = Lambda(lambda x: x[:, 0])(model.output)
        output = CRF(len(self.label2id), sparse_target=True)(model.output)
        model = Model(model.input, output)

        model.compile(
            loss=crf_loss,
            optimizer=Adam(1e-5),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
            metrics=[crf_accuracy])

        # save a plot of the model graph
        plot_model(model, 'ner-albert.png')

        model.summary()
        # print('model.input_shape: {}, model.output_shape: {}'.format(model.input_shape, model.output_shape))

        return model
Example #12
import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')

for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

print(tf.__version__)

base_path = 'D:/AI/Data/chinese_L-12_H-768_A-12/'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
Example #13
import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

base_path = 'D:/AI/Data/albert_large_zh/'
config_path = base_path + 'albert_config_large.json'
checkpoint_path = base_path + 'albert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path,
                              albert=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术" (positions 3 and 4; position 0 is [CLS])
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
Example #14
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

model = load_pretrained_model(config_path,
                              checkpoint_path,
                              keep_words=keep_words,
                              albert=True)

output = Lambda(lambda x: x[:, 0])(model.output)
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
    metrics=['accuracy'])
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)
Example #15
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    albert=True)

output = Lambda(lambda x: x[:, 0])(model.output)
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
    metrics=['accuracy'])
model.summary()

train_D = data_generator(train_data)
import os
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

albert_model_path = '/home/gswyhq/github_projects/albert_zh/albert_large_zh'
# albert_model_path = '/notebooks/albert_zh/albert_large_zh'
# https://storage.googleapis.com/albert_zh/albert_large_zh.zip

config_path = os.path.join(albert_model_path, 'albert_config_large.json')
checkpoint_path = os.path.join(albert_model_path, 'albert_model.ckpt')
dict_path = os.path.join(albert_model_path, 'vocab.txt')

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path,
                              with_mlm=True)  # build the model and load the weights

# token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
token_ids, segment_ids = tokenizer.encode(u'中国的首都是北京')

print('token_ids: {}, segment_ids: {}'.format(token_ids, segment_ids))

# mask out two characters (positions 4 and 5, i.e. "首都"; position 0 is [CLS])
# token_ids[3] = token_ids[4] = token_dict['[MASK]']
token_ids[4] = token_ids[5] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
# print(tokenizer.decode(probas[3:5].argmax(axis=1))) # the result is exactly "技术"
print(tokenizer.decode(probas.argmax(axis=1)))
Example #17
def data_generator():
    while True:
        X, S = [], []
        for a, b in read_text():
            x, s = tokenizer.encode(a, b)
            X.append(x)
            S.append(s)
            if len(X) == batch_size:
                X = padding(X)
                S = padding(S)
                yield [X, S], None
                X, S = [], []


model = load_pretrained_model(config_path,
                              checkpoint_path,
                              seq2seq=True,
                              keep_words=keep_words)

model.summary()

# cross entropy as the loss, with predictions over the input segment masked out
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, offset from the targets by one position
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
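
Once compiled, a seq2seq model like the one above is typically driven by an autoregressive decoder. A hypothetical greedy decoder in the style of the original bert4keras seq2seq demo; tokenizer, token_dict and maxlen are assumptions taken from the surrounding snippets, and the real project may use beam search instead:

import numpy as np

def gen_sent(text, maxlen=32):
    """Greedily decode a continuation for `text` with the seq2seq model above (sketch only)."""
    token_ids, segment_ids = tokenizer.encode(text)
    target_ids = []
    for _ in range(maxlen):
        x = np.array([token_ids + target_ids])
        s = np.array([segment_ids + [1] * len(target_ids)])
        probas = model.predict([x, s])[0, -1]  # distribution over the next token
        next_id = int(probas.argmax())
        if next_id == token_dict['[SEP]']:  # stop at the end-of-sequence marker
            break
        target_ids.append(next_id)
    return tokenizer.decode(target_ids)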

Example #18
def train(fold=0, only_predict=False, need_val=True):
    # from accum_optimizer import AccumOptimizer

    if fold in []:  # folds listed here skip training and go straight to prediction
        only_predict = True
    model = load_pretrained_model(
        config_path,
        checkpoint_path,
        seq2seq=False,
        keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    )
    x_in = keras.Input(shape=(None, ), name='Token')
    s_in = keras.Input(shape=(None, ), name='Segment')
    output = model([x_in, s_in])
    output = keras.layers.core.Lambda(lambda x: x[:, 0, :])(output)
    output = keras.layers.Dense(2, activation='sigmoid')(output)
    model = keras.Model([x_in, s_in], output)

    opt = Adam(5e-6)
    model.compile(opt, loss=[focal_loss(alpha=0.85)], metrics=['accuracy'])

    if fold == 0:
        model.summary()

    save_dir = join(MODEL_PATH, 'bert_res/out_{}'.format(fold))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not only_predict and init_epoch == 0:
        for l in os.listdir(save_dir):  # empty the directory
            os.remove(join(save_dir, l))
    save_path = join(save_dir, 'trained.ckpt')

    checkpoint_callback = keras.callbacks.ModelCheckpoint(
        save_path,
        monitor='val_loss',
        verbose=0,
        save_best_only=False,
        save_weights_only=True,
        mode='min',
        period=1)
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=save_dir,
                                                       histogram_freq=0,
                                                       write_graph=False,
                                                       write_grads=False,
                                                       update_freq=320)
    # weight_decay_callback = keras.callbacks.LearningRateScheduler(
    #     schedule=lambda epoch, lr: lr * (epochs - epoch) / epochs if epoch > 0 else 1e-6
    # )

    if only_predict:
        model.load_weights(save_path)
    else:
        if init_epoch > 0:
            logger.info('Continue train. Load weight...')
            model.load_weights(save_path)
        model.fit_generator(
            data_generator(fold, True),
            validation_data=data_generator(fold, False),
            validation_steps=100,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            verbose=2,
            # workers=3,
            class_weight=None,
            initial_epoch=init_epoch,
            callbacks=[
                checkpoint_callback,
                tensorboard_callback,
                # weight_decay_callback,
                LogRecord()
            ])
    if need_val:
        logger.info('evaluate...')
        if only_predict:  # run this once, to re-compute eva_len
            next(data_generator(fold, False))
        eva_result = model.evaluate_generator(data_generator(fold, False),
                                              steps=int(eva_len / batch_size))
    else:
        eva_result = []
    logger.info('predict...')
    results = model.predict([trains_x, trains_s], batch_size=batch_size)
    print('result shape: {}'.format(results.shape))
    assert len(results) == test.shape[0]

    keras.backend.clear_session()
    return results[:, 1], eva_result