Exemplos de SimpleTokenizer.encode em Python, exemplos de bert4keras.utils.SimpleTokenizer.encode em Python

Exemplo n.º 1

0

Exibir arquivo

import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

# Demo script: load a pretrained Chinese BERT checkpoint and print the model
# output for one encoded sentence.
gpus = tf.config.experimental.list_physical_devices('GPU')

for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

# Limit the first GPU to a virtual device with memory_limit=1024. Guarded so
# the script still runs on machines without a GPU instead of failing with
# IndexError on gpus[0].
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])

print(tf.__version__)

# Every backslash is escaped explicitly. The previous literal relied on
# invalid escape sequences ("\A", "\D", "\c"), which newer Python versions
# warn about; the resulting path string is unchanged.
base_path = 'D:\\AI\\Data\\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# Encoding test: encode one sentence and run it through the model as a batch
# of size one.
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

Exemplo n.º 2

0

Exibir arquivo

Arquivo: albert.py Projeto: enningxie/Semantic-Matching-with-Bert

class Albert(object):
    """Sentence-pair classifier built on a pretrained ALBERT checkpoint.

    The constructor prepares the vocabulary and tokenizer, then either trains
    a new model (``mode='train'``) or restores a saved one
    (``mode='inference'``).
    """

    def __init__(self,
                 mode='inference',
                 mode_='part',
                 model_name=None,
                 dataset_name=None):
        """Configure paths, build the tokenizer and train or restore the model.

        Args:
            mode: 'train' builds and fits a model; 'inference' loads the model
                stored at ``restore_model_path``. Any other value only
                prepares the tokenizer.
            mode_: vocabulary mode forwarded to ``albert_process_data``
                ('part' or 'full').
            model_name: name formatted into ``restore_model_path``.
            dataset_name: suffix formatted into the train/dev/test CSV paths.
        """
        self.maxlen = 32
        self.albert_config_path = '/Data/public/Bert/albert_tiny_489k/albert_config_tiny.json'
        self.albert_checkpoint_path = '/Data/public/Bert/albert_tiny_489k/albert_model.ckpt'
        self.albert_dict_path = '/Data/public/Bert/albert_tiny_489k/vocab.txt'
        self.train_data_path = 'data/train_{}.csv'.format(dataset_name)
        self.dev_data_path = 'data/dev_{}.csv'.format(dataset_name)
        self.test_data_path = 'data/test_{}.csv'.format(dataset_name)
        # albert_tiny_250k.h5 works quite well
        # self.restore_model_path = 'saved_models/test_albert_tiny_{}.h5'.format(model_name)
        self.restore_model_path = '/Data/models/{}'.format(model_name)

        # albert
        self.albert_process_data(mode_)
        if mode == 'train':
            self.model = self._get_model()
            self.train()
        elif mode == 'inference':
            self._init_model()

    # TODO: keep_words should be removed for industrial use
    def albert_process_data(self, mode='part'):
        """Build ``self.token_dict``, ``self.keep_words`` and ``self.tokenizer``.

        Args:
            mode: 'part' keeps the special tokens plus the characters that
                occur at least 4 times across the train/dev/test CSVs and are
                present in the ALBERT vocabulary; 'full' keeps the whole
                ALBERT vocabulary.

        Raises:
            ValueError: if ``mode`` is neither 'part' nor 'full'. Without this
                check an unknown mode only surfaced later as an
                AttributeError on ``self.token_dict``.
        """
        if mode not in ('part', 'full'):
            raise ValueError(
                "mode must be 'part' or 'full', got {!r}".format(mode))

        _token_dict = load_vocab(self.albert_dict_path)  # read the vocabulary
        if mode == 'part':
            # Keep only the characters that appear in the data sets.
            # NOTE(review): these reads assume header-less CSVs with columns
            # (seq1, seq2, label), while _prepare_data reads the same files
            # with a header row and 'sentence1'/'sentence2' columns - confirm
            # which layout the files really have.
            train_df = pd.read_csv(self.train_data_path,
                                   names=['seq1', 'seq2', 'label'])
            valid_df = pd.read_csv(self.dev_data_path,
                                   names=['seq1', 'seq2', 'label'])
            test_df = pd.read_csv(self.test_data_path,
                                  names=['seq1', 'seq2', 'label'])
            # total data
            tmp_df = pd.concat([train_df, valid_df, test_df])
            chars = defaultdict(int)
            for _, tmp_row in tmp_df.iterrows():
                for tmp_char in tmp_row.seq1:
                    chars[tmp_char] += 1
                for tmp_char in tmp_row.seq2:
                    chars[tmp_char] += 1
            # Drop low-frequency characters (fewer than 4 occurrences).
            chars = {i: j for i, j in chars.items() if j >= 4}
            # keep_words lists the original vocabulary ids that are kept.
            self.token_dict, self.keep_words = {}, []
            # Always keep the special tokens.
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            # Keep only frequent characters that exist in the vocabulary.
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
        else:  # mode == 'full'
            self.token_dict = _token_dict
            self.keep_words = list(_token_dict.values())
        self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer

    # data pre-processing operation
    def _data_preprocessing(self, sentence1, sentence2):
        """Encode sentence pairs and pad them into two arrays.

        Each sentence is cut to ``self.maxlen`` characters before encoding.

        Args:
            sentence1: iterable of first sentences.
            sentence2: iterable of second sentences, paired by position.

        Returns:
            Tuple ``(X1, X2)`` holding the two padded outputs of
            ``tokenizer.encode`` for every pair.
        """
        X1, X2 = [], []
        for tmp_sent1, tmp_sent2 in zip(sentence1, sentence2):
            x1, x2 = self.tokenizer.encode(first=tmp_sent1[:self.maxlen],
                                           second=tmp_sent2[:self.maxlen])
            X1.append(x1)
            X2.append(x2)
        X1 = self._seq_padding(X1)
        X2 = self._seq_padding(X2)
        # X1 = pad_sequences(X1, maxlen=67, padding='post', truncating='post')
        # X2 = pad_sequences(X2, maxlen=67, padding='post', truncating='post')
        return X1, X2

    def _seq_padding(self, X, padding=0):
        """Right-pad every sequence in ``X`` to the length of the longest one.

        Args:
            X: non-empty list of sequences.
            padding: value appended to the shorter sequences.

        Returns:
            numpy array with one row per input sequence.
        """
        longest = max(len(x) for x in X)
        padded_sent = np.array([
            np.concatenate([x, [padding] * (longest - len(x))])
            if len(x) < longest else x
            for x in X
        ])
        return padded_sent

    # prepare data for training
    def _prepare_data(self, data_path):
        """Read a CSV and return its padded inputs and labels.

        Args:
            data_path: CSV file with 'sentence1', 'sentence2' and 'label'
                columns.

        Returns:
            Tuple ``(X1_pad, X2_pad, label)``.
        """
        data = pd.read_csv(data_path)
        sent_1 = data['sentence1'].values
        sent_2 = data['sentence2'].values
        label = data['label'].values
        X1_pad, X2_pad = self._data_preprocessing(sent_1, sent_2)
        # X1 = np.vstack((X1_pad, X2_pad))
        # X2 = np.vstack((X2_pad, X1_pad))
        # y_train = np.hstack((label, label))
        return X1_pad, X2_pad, label

    # albert for Semantic matching, model architecture
    def _get_model(self):
        """Build the classifier: pretrained ALBERT plus a single sigmoid unit."""
        model = load_pretrained_model(
            self.albert_config_path,
            self.albert_checkpoint_path,
            keep_words=self.keep_words,  # keep only the ids in keep_words to shrink the vocabulary
            albert=True)
        # Take position 0 of the sequence output (presumably the [CLS]
        # vector - confirm) and feed it to the sigmoid unit.
        output = Lambda(lambda x: x[:, 0])(model.output)
        output = Dense(1, activation='sigmoid')(output)
        model = Model(model.input, output)
        return model

    # model training operation
    def train(self):
        """Fit ``self.model`` on the train CSV, validating on the dev CSV.

        The model is checkpointed to ``restore_model_path`` and training
        stops early, both driven by ``val_accuracy``.
        """
        # train_data
        train_x1, train_x2, train_label = self._prepare_data(
            self.train_data_path)
        # dev_data
        dev_x1, dev_x2, dev_label = self._prepare_data(self.dev_data_path)
        checkpoint = ModelCheckpoint(self.restore_model_path,
                                     monitor='val_accuracy',
                                     verbose=0,
                                     save_best_only=True,
                                     save_weights_only=False)
        early_stop = EarlyStopping(monitor='val_accuracy',
                                   patience=3,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        self.model.summary()
        self.model.fit(x=[train_x1, train_x2],
                       y=train_label,
                       batch_size=64,
                       epochs=10,
                       verbose=1,
                       callbacks=[checkpoint, early_stop],
                       validation_data=([dev_x1, dev_x2], dev_label))

    # model predict operation
    def predict(self, sentence1, sentence2):
        """Return the model output for the given sentence pairs.

        Args:
            sentence1: iterable of first sentences.
            sentence2: iterable of second sentences, paired by position.
        """
        X1, X2 = self._data_preprocessing(sentence1, sentence2)
        y_pred = self.model.predict([X1, X2], batch_size=1024)
        return y_pred

    def test(self):
        """Evaluate ``self.model`` on the test CSV and print loss and accuracy."""
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        # test_data: evaluate on the held-out test split. This previously
        # read dev_data_path, so the reported "test" numbers were really
        # measured on the dev set.
        test_x1, test_x2, test_label = self._prepare_data(self.test_data_path)
        test_loss, test_acc = self.model.evaluate(x=[test_x1, test_x2],
                                                  y=test_label)
        print('test loss: {}'.format(test_loss))
        print('test acc: {}'.format(test_acc))

    def _init_model(self):
        """Load the saved model and run one prediction as a smoke test."""
        self.model = load_model(self.restore_model_path)
        sentence1 = '干嘛呢'
        sentence2 = '你是机器人'
        print('model albert loaded succeed. ({})'.format(
            self.predict([sentence1], [sentence2]).item()))

Exemplo n.º 3

0

Exibir arquivo

class SemanticModel():
    """ALBERT-based text-similarity model with a single sigmoid output.

    In training mode a reduced vocabulary is built from the data and
    persisted next to the model; otherwise the persisted tokenizer and
    keep-list are loaded back.
    """

    def __init__(self, batch_size=32, train=False):
        """Prepare the tokenizer / keep_words and build the Keras model.

        Args:
            batch_size: number of samples per batch yielded by
                ``gnerator_data``.
            train: when True, rebuild the vocabulary from the train and dev
                data and pickle it under ``model_save_path``; when False,
                load the pickled tokenizer and keep_words instead.
        """
        self.batch_size = batch_size
        if train:
            chars = set()
            train_datas = read_datas(TRAIN_DATA_FILE)
            dev_datas = read_datas(DEV_DATA_FILE)
            # NOTE(review): the test data is read but never used for the
            # vocabulary below - confirm whether that is intentional.
            test_datas = read_datas(TEST_DATA_FILE)
            for text1, text2, label in itertools.chain(train_datas, dev_datas):
                chars.update(set(text1))
                chars.update(set(text2))

            _token_dict = load_vocab(dict_path)  # read the vocabulary
            self.token_dict, self.keep_words = {}, []

            # Always keep the special tokens.
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])

            # Keep only the characters seen in the data that exist in the
            # pretrained vocabulary.
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])

            self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer

            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "wb") as f:
                pickle.dump(self.tokenizer, f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "wb") as f:
                pickle.dump(self.keep_words, f)

        else:
            # NOTE(review): pickle.load can execute arbitrary code; only load
            # files from a trusted model_save_path.
            with open(os.path.join(model_save_path, 'tokenizer.pkl'),
                      "rb") as f:
                self.tokenizer = pickle.load(f)

            with open(os.path.join(model_save_path, 'keep_words.pkl'),
                      "rb") as f:
                self.keep_words = pickle.load(f)

        self.model = self.make_model()

    def make_model(self):
        """Build and compile the classifier on top of the pretrained ALBERT."""
        model = load_pretrained_model(config_path,
                                      checkpoint_path,
                                      keep_words=self.keep_words,
                                      albert=True)

        # Take position 0 of the sequence output (presumably the [CLS]
        # vector - confirm).
        output = Lambda(lambda x: x[:, 0])(model.output)
        # print(output.shape)
        output = Dense(1,
                       activation='sigmoid')(output)  # tanh, sigmoid, softmax
        model = Model(inputs=model.input, outputs=output)

        model.compile(
            loss=
            'binary_crossentropy',  # categorical_crossentropy binary_crossentropy
            optimizer=Adam(2e-6),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
            metrics=['accuracy'])
        model.summary()
        return model

    def gnerator_data(self, file_name):
        """Yield ``([X1, X2], Y)`` batches from ``file_name`` forever.

        Each text is cut to ``INPUT_LENGTH`` characters, NFKD-normalised,
        stripped and lower-cased before encoding. A partially filled batch at
        the end of one pass over the file is carried into the next pass.

        Args:
            file_name: data file passed to ``read_datas``.
        """
        X1, X2, Y = [], [], []
        while True:
            for text1, text2, label in read_datas(file_name):

                text1 = text1[:INPUT_LENGTH]
                text2 = text2[:INPUT_LENGTH]
                text1 = unicodedata.normalize('NFKD', text1).strip().lower()
                text2 = unicodedata.normalize('NFKD', text2).strip().lower()
                x1, x2 = self.tokenizer.encode(first=text1, second=text2)
                y = int(label)

                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                # Y.append(to_categorical(y))
                if len(X1) == self.batch_size:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    # print(X1.shape, X2.shape, Y.shape)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []

    def train(self):
        """Fit the model from the train/dev generators with early stopping,
        checkpointing and TensorBoard logging, then print the history."""
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)

        model_checkpoint = ModelCheckpoint(filepath=os.path.join(
            model_save_path,
            'similarity-{epoch:02d}-{val_loss:.2f}-{val_acc:.3f}.hdf5'),
                                           save_best_only=True,
                                           save_weights_only=False)

        tb = TensorBoard(
            log_dir=log_dir,  # log directory
            histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
            batch_size=32,  # amount of data used to compute the histograms
            write_graph=True,  # whether to store the network graph
            write_grads=False,  # whether to visualise gradient histograms
            write_images=False,  # whether to visualise the parameters
            embeddings_freq=0,
            embeddings_layer_names=None,
            embeddings_metadata=None)

        hist = self.model.fit_generator(
            self.gnerator_data(TRAIN_DATA_FILE),
            steps_per_epoch=1000,
            epochs=100,
            validation_data=self.gnerator_data(DEV_DATA_FILE),
            validation_steps=100,
            callbacks=[early_stopping, model_checkpoint, tb])
        print(hist.history.items())

    def predict(self,
                text1,
                text2,
                weitht_file='similarity-01-0.55-0.741.hdf5'):
        """Return the model output for one text pair.

        The weights file is (re)loaded from ``model_save_path`` on every call.

        Args:
            text1: first text.
            text2: second text.
            weitht_file: weights file name inside ``model_save_path``.
        """
        self.model.load_weights(os.path.join(model_save_path, weitht_file),
                                by_name=True,
                                skip_mismatch=True,
                                reshape=True)

        text1 = text1[:INPUT_LENGTH]
        text2 = text2[:INPUT_LENGTH]
        text1 = unicodedata.normalize('NFKD', text1).strip().lower()
        text2 = unicodedata.normalize('NFKD', text2).strip().lower()
        x1, x2 = self.tokenizer.encode(first=text1, second=text2)

        X1 = seq_padding([x1])
        X2 = seq_padding([x2])
        ret = self.model.predict([X1, X2])
        return ret

    def batch_predict(self, question, database):
        """Return the model output for ``question`` paired with every entry.

        Args:
            question: the query text.
            database: iterable of candidate texts.

        Returns:
            The output of ``self.model.predict`` with one row per candidate.
        """
        # The question does not change between candidates, so truncate and
        # normalise it once here rather than on every loop iteration.
        text1 = question[:INPUT_LENGTH]
        text1 = unicodedata.normalize('NFKD', text1).strip().lower()

        X1, X2 = [], []
        for text2 in database:
            text2 = text2[:INPUT_LENGTH]
            text2 = unicodedata.normalize('NFKD', text2).strip().lower()
            x1, x2 = self.tokenizer.encode(first=text1, second=text2)
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        ret = self.model.predict([X1, X2])

        return ret

Exemplo n.º 4

0

Exibir arquivo

Arquivo: basic_masked_language_model.py Projeto: gswyhq/bert4keras

# Demo script: predict masked tokens with a pretrained ALBERT MLM head.
# `os` is imported here because the paths below are built with os.path.join;
# without it the script fails with NameError.
import os

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

albert_model_path = '/home/gswyhq/github_projects/albert_zh/albert_large_zh'
# albert_model_path = '/notebooks/albert_zh/albert_large_zh'
# https://storage.googleapis.com/albert_zh/albert_large_zh.zip

config_path = os.path.join(albert_model_path, 'albert_config_large.json')
checkpoint_path = os.path.join(albert_model_path, 'albert_model.ckpt')
dict_path = os.path.join(albert_model_path, 'vocab.txt')

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path,
                              with_mlm=True)  # build the model and load the weights

# token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
token_ids, segment_ids = tokenizer.encode(u'中国的首都是北京')

print('token_ids: {}, segment_ids: {}'.format(token_ids, segment_ids))

# Mask token positions 4 and 5 (presumably "首都", assuming encode() puts
# [CLS] at position 0 - confirm).
# token_ids[3] = token_ids[4] = token_dict['[MASK]']
token_ids[4] = token_ids[5] = token_dict['[MASK]']

# Use the MLM model to predict the masked part; the argmax token of every
# position is decoded and printed, not only the masked ones.
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
# print(tokenizer.decode(probas[3:5].argmax(axis=1)))
print(tokenizer.decode(probas.argmax(axis=1)))

Exemplo n.º 5

0

Exibir arquivo

class AlbertClassify:
    """Binary classifier built on ALBERT, trainable from a pretrained
    checkpoint or restorable from a previously saved Keras model."""

    def __init__(self,
                 initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        """Create the tokenizer and obtain the underlying model.

        Args:
            initial_model: when True, start from the pretrained ALBERT
                checkpoint; when False, restore the saved model found at
                ``model_path``.
            model_path: where the trained model is saved to / loaded from.
        """
        self.initial_model = initial_model
        token_dict = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(token_dict)
        self.model_path = model_path
        if initial_model:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)
        else:
            self.load(model_path)

        # Make every layer trainable.
        for l in self.albert_model.layers:
            l.trainable = True

    def train(self, train_data, valid_data):
        """Build (or reuse) the classifier and fit it with the data generators.

        Args:
            train_data: training samples handed to ``DataGenerator``.
            valid_data: validation samples handed to ``DataGenerator``.
        """
        train_D = DataGenerator(train_data, self.tokenizer,
                                CONFIG['batch_size'], CONFIG['max_len'])
        valid_D = DataGenerator(valid_data, self.tokenizer,
                                CONFIG['batch_size'], CONFIG['max_len'])

        # The previous code first built a throw-away Lambda/Dense head on
        # albert_model.output and then unconditionally replaced self.model
        # below. That dead head has been removed: it was never used, and on a
        # restored model it stacked a Dense layer on an already reduced
        # output.

        save = ModelCheckpoint(os.path.join(self.model_path),
                               monitor='val_acc',
                               verbose=1,
                               save_best_only=True,
                               mode='auto')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       min_delta=0,
                                       patience=3,
                                       verbose=1,
                                       mode='auto')
        callbacks = [save, early_stopping]

        if self.initial_model:
            # Fresh start: wrap the pretrained encoder with a sigmoid head on
            # position 0 of its output.
            x1_in = Input(shape=(None, ))
            x2_in = Input(shape=(None, ))

            x_in = self.albert_model([x1_in, x2_in])
            x_in = Lambda(lambda x: x[:, 0])(x_in)
            p = Dense(1, activation='sigmoid')(x_in)
            self.model = Model([x1_in, x2_in], p)
        else:
            # A restored model is used as the classifier as-is.
            self.model = self.albert_model

        self.model.compile(
            loss='binary_crossentropy',
            # optimizer=RAdam(1e-5),  # use a sufficiently small learning rate
            optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {
                1000: 1e-5,
                2000: 6e-5
            }),
            metrics=[
                'accuracy', process.get_precision, process.get_recall,
                process.get_f1
            ])
        self.model.summary()

        self.model.fit_generator(
            train_D.__iter__(),
            steps_per_epoch=len(train_D),
            epochs=CONFIG['epochs'],
            validation_data=valid_D.__iter__(),
            validation_steps=len(valid_D),
            callbacks=callbacks,
            use_multiprocessing=CONFIG['use_multiprocessing'],
        )

    def predict(self, test_data):
        """
        Predict.

        :param test_data: iterable of texts; each one is cut to
            CONFIG['max_len'] characters before encoding.
        :return: the output of ``self.model.predict`` for the padded batch.
        """
        X1 = []
        X2 = []
        for s in test_data:
            x1, x2 = self.tokenizer.encode(first=s[:CONFIG['max_len']])
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        predict_results = self.model.predict([X1, X2])
        return predict_results

    def load(self, model_path):
        """
        Load a previously saved model.

        On success the restored model is also exposed as ``self.model`` so
        that ``predict`` works without calling ``train`` first. Loading stays
        best-effort: a failure is reported and the instance is returned
        unchanged.

        :param model_path: path of the saved model.
        :return: self
        """
        try:
            self.albert_model = load_model(str(model_path),
                                           custom_objects=get_custom_objects(),
                                           compile=False)
        except Exception as ex:
            # Include the cause so a failed load can be diagnosed.
            print('load error: {}'.format(ex))
        else:
            self.model = self.albert_model
        return self

Exemplo n.º 6

0

Exibir arquivo

import tensorflow as tf

from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

# Demo script: mask two tokens of a sentence and let ALBERT's MLM head
# predict them.
gpus = tf.config.experimental.list_physical_devices('GPU')
# Limit the first GPU to a virtual device with memory_limit=2048. Guarded so
# the script still runs on machines without a GPU instead of failing with
# IndexError on gpus[0].
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

# Every backslash is escaped explicitly. The previous literal relied on the
# invalid escape sequences "\A" and "\D"; the resulting path is unchanged.
base_path = 'D:\\AI\\Data\\albert_large_zh\\'
config_path = base_path + 'albert_config_large.json'
checkpoint_path = base_path + 'albert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
# with_mlm=True is needed so the model outputs per-token vocabulary
# predictions; without it the argmax decoded below is not taken over the
# vocabulary.
model = load_pretrained_model(config_path, checkpoint_path,
                              with_mlm=True,
                              albert=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# Mask "技术" (token positions 3 and 4).
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# Use the MLM model to predict the masked part.
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # expected to print "技术"