checkpoint_path = '/root/kg/bert/CPM_LM_2.6B_TF/model.ckpt'
spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model'


def pre_tokenize(text):
    """分词前处理函数
    """
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]
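
# Illustration (not in the original snippet): jieba emits whitespace as its own tokens,
# so a string like u'今天 天气\n不错' would come back roughly as
# ['今天', u'\u2582', '天气', u'\u2583', '不错']; the exact cut depends on jieba's dictionary.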


tokenizer = SpTokenizer(spm_path,
                        token_start=None,
                        token_end=None,
                        pre_tokenize=pre_tokenize,
                        token_translate={u'\u2583': '<cls>'})  # build the tokenizer

model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='gpt2')  # build the model and load the weights


class TextExpansion(AutoRegressiveDecoder):
    """基于随机采样的文本续写
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return model.predict(token_ids)[:, -1]
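
    # Sketch, not in the original snippet: a generate() method in the style of the
    # bert4keras CPM demo. Assumptions: nucleus sampling via random_sample(), and
    # end_id=3 being the SentencePiece end-of-text id of this vocabulary.
    def generate(self, text, n=1, topp=0.95):
        # Encode the prompt, sample n continuations, decode prompt + continuation.
        token_ids, _ = tokenizer.encode(text)
        results = self.random_sample([token_ids], n, topp=topp)
        return [tokenizer.decode(token_ids + [int(i) for i in ids]) for ids in results]


# Example usage (assumed ids/lengths):
# text_expansion = TextExpansion(start_id=None, end_id=3, maxlen=64)
# print(text_expansion.generate(u'中国的首都是北京。', n=1)[0])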
Example #2


spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model'


def pre_tokenize(text):
    """分词前处理函数
    """
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]


tokenizer = SpTokenizer(
    spm_path,
    token_start=None,
    token_end=None,
    pre_tokenize=pre_tokenize,
    # '\u2583' is the newline placeholder; replace it with the special '<cls>' token here.
    token_translate={u'\u2583': '<cls>'}
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2'
)  # build the model and load the weights


class TextExpansion(AutoRegressiveDecoder):
    """基于随机采样的文本续写
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return model.predict(token_ids)[:, -1]


Example #3


def load_data(filename):
    """Load the dataset: each line is a tab-separated (title, content) pair.
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            title, content = l.strip().split('\t')
            D.append((title, content))
    return D


# Load the datasets
train_data = load_data('/root/csl/train.tsv')
valid_data = load_data('/root/csl/val.tsv')
test_data = load_data('/root/csl/test.tsv')

# Load the tokenizer
tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>')
keep_tokens = json.load(open(keep_tokens_path))


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_c_token_ids, batch_t_token_ids = [], []
        for is_end, (title, content) in self.sample(random):
            c_token_ids, _ = tokenizer.encode(content, maxlen=max_c_len)
            t_token_ids, _ = tokenizer.encode(title, maxlen=max_t_len)
            batch_c_token_ids.append(c_token_ids)
            batch_t_token_ids.append([0] + t_token_ids)
            if len(batch_c_token_ids) == self.batch_size or is_end:
                batch_c_token_ids = sequence_padding(batch_c_token_ids)
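                # Sketch of the usual continuation (the source snippet is truncated here):
                # pad the title batch as well, yield the pair, then reset the buffers.
                # The exact yield format depends on how the model's inputs are wired
                # (assumption below: [content_ids, title_ids] with no separate labels).
                batch_t_token_ids = sequence_padding(batch_t_token_ids)
                yield [batch_c_token_ids, batch_t_token_ids], None
                batch_c_token_ids, batch_t_token_ids = [], []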
Example #4
"""
@File   : load_albert.py
@Author : Pengy
@Date   : 2020/9/28
@Description : Load ALBERT via bert4keras, run a sample prediction, and extend it with extra layers
"""
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import SpTokenizer
from keras.layers import LSTM, Dense
from keras.models import Model
import numpy as np

config_path = '../Models/albert_base_v2/albert_base/albert_config.json'
checkpoint_path = '../Models/albert_base_v2/albert_base/model.ckpt-best'
vocab_path = '../Models/albert_base_v2/albert_base/30k-clean.vocab'
spm_path = '../Models/albert_base_v2/albert_base/30k-clean.model'

tokenizer = SpTokenizer(spm_path)
model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='albert')
model.summary()

token_ids, segment_ids = tokenizer.encode('language model')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

output = LSTM(64)(model.output)
output = Dense(32)(output)
my_model = Model(model.input, output)
my_model.summary()
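
# Not part of the original file: the extended model takes the same two inputs as the
# ALBERT backbone, so the sentence encoded above can be fed straight through it.
print(my_model.predict([np.array([token_ids]), np.array([segment_ids])]))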
Example #5
checkpoint_path = os.path.join(bert_path, 'model.ckpt-best')
dict_path = os.path.join(bert_path, '30k-clean.vocab')
spm_path = os.path.join(bert_path, '30k-clean.model')

# load data
def load_data(filename):
    D = []
    with open(filename, encoding='gb2312') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


# Create a tokenizer
tokenizer = SpTokenizer(spm_path)
#tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """data generator
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen) 
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
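                # Sketch of the usual continuation (the source snippet is truncated here):
                # pad segments and labels, yield the batch, then reset the buffers.
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# Typical consumption (assumes a compiled `model` and a `batch_size` defined elsewhere):
# train_generator = data_generator(train_data, batch_size)
# model.fit(train_generator.forfit(), steps_per_epoch=len(train_generator), epochs=epochs)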
Example #6
epochs = 100000
summary_rate = 0.25
t_maxlen = maxlen // 4
s_maxlen = maxlen - t_maxlen

# T5 configuration
config_path = '/root/kg/bert/mt5/mt5_base/mt5_base_config.json'
checkpoint_path = '/root/kg/bert/mt5/mt5_base/model.ckpt-1000000'
spm_path = '/root/kg/bert/mt5/sentencepiece.model'

# PEGASUS
dict_path_1 = '/root/kg/bert/chinese_pegasus_L-12_H-768_A-12/vocab.txt'
dict_path_2 = '/root/kg/bert/chinese_t5_pegasus_base/vocab.txt'

# Build the vocabulary
sp_tokenizer = SpTokenizer(spm_path, token_start=None, token_end=None)
token_dict = load_vocab(dict_path_1)
keep_tokens, new_token_dict, n = [], {}, 0
for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if n < 106:
        new_token_dict[t] = n
        n += 1
        continue
    if t.startswith('##'):
        i = sp_tokenizer.token_to_id(t[2:])
        if i == 2:
            i = sp_tokenizer.token_to_id(u'\u2581' + t)
    else:
        i = sp_tokenizer.token_to_id(u'\u2581' + t)
        if i == 2:
            i = sp_tokenizer.token_to_id(t)
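    # Plausible continuation (the source snippet is truncated here): record the mapped
    # sentencepiece id and register the token in the rebuilt vocabulary.
    keep_tokens.append(i)
    new_token_dict[t] = n
    n += 1

# A word-level Tokenizer could then be built over new_token_dict, e.g.
# tokenizer = Tokenizer(new_token_dict, do_lower_case=True)  # (assumption)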
Example #7
    @classmethod
    def setUpClass(cls) -> None:
        model_path = '../models/mt5_base/sentencepiece_cn.model'
        cls.raw_tokenizer = SpTokenizer(model_path, token_start=None, token_end='</s>')
        cls.my_tokenizer = SentencePieceTokenizer(model_path, token_start=None, token_end='</s>')
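
    # A minimal test sketch, not in the original snippet. It assumes the project's
    # SentencePieceTokenizer mirrors SpTokenizer's encode() interface, i.e. it returns
    # a (token_ids, segment_ids) pair.
    def test_encode_matches_reference(self):
        text = u'今天天气不错'
        ref_ids, _ = self.raw_tokenizer.encode(text)
        my_ids, _ = self.my_tokenizer.encode(text)
        self.assertEqual(ref_ids, my_ids)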