Example #1
    def __init__(self):
        self.maxlen = 128
        self.config_path = 'chinese_simbert_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = 'chinese_simbert_L-12_H-768_A-12/bert_model.ckpt'
        self.dict_path = 'chinese_simbert_L-12_H-768_A-12/vocab.txt'

        # Load and simplify the vocabulary, then build the tokenizer
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )

        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)

        # Build the model and load the checkpoint
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            keep_tokens=self.keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
            return_keras_model=False,
        )

        self.encoder = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[0])
        self.seq2seq = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[1])
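A minimal usage sketch (not part of the original example), assuming the class above is named SimBertEncoder (the real class name is not shown) and that numpy is imported; it shows how the pooled output can be turned into a normalized sentence vector:

import numpy as np

sim = SimBertEncoder()  # hypothetical name
token_ids, segment_ids = sim.tokenizer.encode('今天天气不错')
vec = sim.encoder.predict([np.array([token_ids]), np.array([segment_ids])])[0]
vec /= np.linalg.norm(vec)  # L2-normalize before computing cosine similarity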
Example #2
def build_model():
    config_path = GUWEN_CONFIG_PATH if use_guwenbert else ROBERTA_CONFIG_PATH
    checkpoint_path = GUWEN_CHECKPOINT_PATH if use_guwenbert else ROBERTA_CHECKPOINT_PATH
    dict_path = GUWEN_DICT_PATH if use_guwenbert else ROBERTA_DICT_PATH

    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='unilm',
        # keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
    )

    # Load the trained weights
    model.load_weights(BEST_MODEL_PATH)

    autotitle = AutoTitle(start_id=None,
                          end_id=tokenizer._token_end_id,
                          maxlen=50)

    text = '却话巴山夜雨时'
    token_ids, segment_ids = tokenizer.encode(text)
    inputs = np.array([token_ids, segment_ids])
    inputs = [np.array([i]) for i in inputs]
    print(autotitle.predict(inputs, np.empty((1, 0), dtype=int), states=None))
    print(autotitle.generate("却话巴山夜雨时"))
    return autotitle
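The AutoTitle class used above is not defined in this snippet. A hedged sketch of how such a decoder is typically written on top of bert4keras's AutoRegressiveDecoder follows; the decorator name varies with the bert4keras version, and the sketch assumes `model` and `tokenizer` are accessible at module level:

from bert4keras.snippets import AutoRegressiveDecoder

class AutoTitle(AutoRegressiveDecoder):
    """Seq2seq decoder sketch; `model` and `tokenizer` are assumed to be the objects built above."""
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        return model.predict([token_ids, segment_ids])[:, -1]

    def generate(self, text, topk=1):
        token_ids, segment_ids = tokenizer.encode(text)
        output_ids = self.beam_search([token_ids, segment_ids], topk)  # beam search decoding
        return tokenizer.decode(output_ids)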
Example #3
    def buildmodel(self):
        self.token_dict, self.keep_tokens = load_vocab(
            dict_path=self.dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        )
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)

        if self.pretrain_type == 'albert':
            model = build_transformer_model(
                config_path,
                checkpoint_path,
                model='albert',
                with_mlm=True,
                keep_tokens=self.keep_tokens,
            )
        elif self.pretrain_type == 'bert':
            model = build_transformer_model(
                config_path,
                checkpoint_path,
                model='bert',
                with_mlm=True,
                keep_tokens=self.keep_tokens,
            )
        output = Lambda(lambda x: x[:, 1:self.max_a_len + 1])(model.output)
        #print(output.shape)
        self.model = Model(model.input, output)
        self.model.compile(loss=self.masked_cross_entropy,
                           optimizer=Adam(self.lr))
        self.model.summary()
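The masked_cross_entropy loss referenced above is not included in the snippet. A plausible sketch (an assumption, not the author's code), where padding positions (token id 0) in the answer are excluded from the average and K is keras.backend:

    def masked_cross_entropy(self, y_true, y_pred):
        # hypothetical sketch: ignore [PAD] (id 0) positions when averaging the loss
        y_true = K.cast(y_true, 'int32')
        mask = K.cast(K.not_equal(y_true, 0), K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred)
        return K.sum(loss * mask) / (K.sum(mask) + K.epsilon())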
Example #4
def loadDic(dicPath):
    token_dict, keep_tokens = load_vocab(
        dict_path=dicPath,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    return token_dict, keep_tokens, tokenizer
Example #5
    def __iter__(self):
        token_dict, keep_tokens = load_vocab(
            dict_path=Config.COCAB_PATH,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[SEP]', '[MASK]'],
        )
        tokenizer = Tokenizer(token_dict, do_lower_case=True)

        model_path = Config.seq2seq_model_path
        if not os.path.exists(model_path):
            raise FileNotFoundError('seq2seq model not found: %s' % model_path)
        set_session(sess)
        model = load_model(model_path)

        self.autotitle = AutoTitle(model=model,
                                   tokenizer=tokenizer,
                                   start_id=None,
                                   end_id=tokenizer._token_end_id,
                                   maxlen=128)
Example #6
def get_keep_tokens():
    counts = json.load(open('counts.json'))
    del counts['[CLS]']
    del counts['[SEP]']
    token_dict = load_vocab(BaseConfig.dict_path)
    freqs = [
        counts.get(i, 0)
        for i, j in sorted(token_dict.items(), key=lambda s: s[1])
    ]
    keep_tokens = list(np.argsort(freqs)[::-1])
    return keep_tokens
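A hedged sketch of how the returned list is typically consumed: truncate it to a target vocabulary size and pass it to build_transformer_model so that only those embedding rows are kept. The 20000 cutoff and the BaseConfig.config_path / BaseConfig.checkpoint_path attributes are assumptions:

keep_tokens = get_keep_tokens()[:20000]  # assumed cutoff: keep the 20000 most frequent tokens
model = build_transformer_model(
    BaseConfig.config_path,      # assumed attribute, by analogy with BaseConfig.dict_path
    BaseConfig.checkpoint_path,  # assumed attribute
    keep_tokens=keep_tokens,     # reorder/trim the token embeddings to this list
)
# Note: the tokenizer must also be rebuilt with a matching, renumbered vocabulary.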
Example #7
def initTokenizer(dicPath='../data/dic.txt',
                  diclenth=1000,
                  handle=EnglishDicHandle):
    token_dict, keep_tokens = load_vocab(
        dict_path=dicPath,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    token_dict = handle(token_dict, diclenth)
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    return tokenizer, token_dict
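EnglishDicHandle is not shown in this snippet; judging from the call above it takes a token dict and a target size and returns a (renumbered) dict. A purely hypothetical stand-in might look like:

def EnglishDicHandle(token_dict, diclenth):
    # hypothetical sketch: keep the first `diclenth` entries by original id and renumber them
    kept = sorted(token_dict.items(), key=lambda kv: kv[1])[:diclenth]
    return {token: i for i, (token, _) in enumerate(kept)}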
Example #8
    def create_tokenizer(self):
        keep_tokens = []
        if self.simplified_tokenizer:
            token_dict, keep_tokens = load_vocab(
                dict_path=self.pre_trained_model_dict_path,
                simplified=True,
                startswith=["[PAD]", "[UNK]", "[CLS]", "[SEP]"],
            )
            tokenizer = Tokenizer(token_dict, do_lower_case=True)
        else:
            tokenizer = Tokenizer(self.pre_trained_model_dict_path,
                                  do_lower_case=True)
        return tokenizer, keep_tokens
Example #9
    def build_keras4bert(self):
        import bert4keras
        from bert4keras.models import build_transformer_model
        from bert4keras.tokenizers import Tokenizer,load_vocab
        import os
        self.embedding_type = 'bert'
        config_path = os.path.join(self.corpus_path, 'bert_config.json')
        checkpoint_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
        dict_path = os.path.join(self.corpus_path, 'vocab.txt')
        self.model = bert4keras.models.build_transformer_model(config_path=config_path,
                                                               checkpoint_path=checkpoint_path)

        # Load and simplify the vocabulary, then build the tokenizer
        self.token_dict, keep_tokens = load_vocab(
            dict_path=dict_path,
            simplified=True,
            startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
        )
        self.vocab_size = len(self.token_dict)
        self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
Example #10
def create_tokenizer(sentences: typing.List[str]) -> typing.Tuple[Tokenizer, typing.List]:
    """
    根据新的数据集,精简词表,重新创建tokenizer

    Args:
        sentences: 评论数据句子的列表

    Returns:
        tokenizer,keep_tokens
    """
    # Load the downloaded (full) vocabulary
    _token_dict = load_vocab(settings.DICT_PATH)
    _tokenizer = Tokenizer(_token_dict, do_lower_case=True)

    # Count token frequencies
    counter = Counter()
    for sentence in sentences:
        _tokens = _tokenizer.tokenize(sentence)
        # Remove the [CLS] and [SEP] tokens when counting frequencies
        counter.update(_tokens[1:-1])
    # Filter out low-frequency tokens
    tokens_and_counts = [(token, count) for token, count in counter.items() if count >= settings.MIN_WORD_FREQUENCY]
    # Sort by frequency in descending order
    sorted_tokens_and_counts = sorted(tokens_and_counts, key=lambda x: -x[1])
    # Drop the counts and keep only the tokens
    most_tokens = [token for token, count in sorted_tokens_and_counts]
    # Build the new vocabulary
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + most_tokens
    keep_tokens = []
    token_dict = {}
    for token in tokens:
        if token in _token_dict:
            token_dict[token] = len(token_dict)
            keep_tokens.append(_token_dict[token])
    # Build the tokenizer from the new vocabulary
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    return tokenizer, keep_tokens
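A brief usage sketch under assumed names: the review sentences are toy examples, and settings.CONFIG_PATH / settings.CHECKPOINT_PATH are assumed to exist alongside settings.DICT_PATH:

tokenizer, keep_tokens = create_tokenizer(['货很快就到了,质量不错', '包装太差,不想再买了'])
model = build_transformer_model(
    settings.CONFIG_PATH,      # assumed
    settings.CHECKPOINT_PATH,  # assumed
    keep_tokens=keep_tokens,   # keep only the embedding rows for the trimmed vocabulary
)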
Example #11
parse.add_argument('-t', '--TEST_DATA_PATH', default=os.path.join(sys.path[0], 'test.txt'), help='path to the test data')
parse.add_argument('-c', '--BERT_CONFIG', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'chinese_L-12_H-768_A-12', 'bert_config.json'), help='path to the BERT config file')
parse.add_argument('-m', '--BERT_MODEL', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'chinese_L-12_H-768_A-12', 'bert_model.ckpt'), help='path to the BERT checkpoint')
parse.add_argument('-v', '--BERT_VOCAB', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'chinese_L-12_H-768_A-12', 'vocab.txt'), help='path to the BERT vocabulary')
parse.add_argument('-ck', '--MODEL_PATH', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'checkpoints', 'model.h5'), help='path where the trained model is saved')
parse.add_argument('-l', '--BERT_LAYER', default='Transformer-11-FeedForward-Norm', help='BERT layer to modify')
parse.add_argument('-b', '--BATCH_SIZE', default=32, type=int, help='batch size')
parse.add_argument('-e', '--EPOCHS', default=2, type=int, help='number of epochs')
parse.add_argument('-M', '--MAX_LEN', default=68, type=int, help='maximum sentence length')
args = parse.parse_args()

_labels = ['TIME','LOC','PER','ORG']
_labels_num = len(_labels)*2 + 1

# Load the vocabulary and build the tokenizer
token_dict = load_vocab(dict_path=args.BERT_VOCAB)
tokenizer = Tokenizer(token_dict=token_dict)
token_head = tokenizer._token_start_id
token_end = tokenizer._token_end_id

def id_label_dict():
    """
    Mapping dictionaries between labels and integer ids
    :return: label2id, id2label
    """
    id2label = dict(enumerate(_labels))
    label2id = {}
    for k,v in id2label.items():
        label2id[v] = k
    return label2id,id2label
Example #12
# Model configuration
maxlen = 128
batch_size = 32
num_classes = 2
epochs = 20

# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


def load_data(filenames):
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                if len(text) <= maxlen - 2:
                    D.append((text, int(label)))
                else:
                    tmp = ''
Example #13
from bert4keras.tokenizers import Tokenizer, load_vocab
import json
import numpy as np
dict_path = "vocab.txt"

tokenizer = Tokenizer(load_vocab(dict_path))
maskID = tokenizer.token_to_id(tokenizer._token_mask)



def write_Json(content,fileName):
    with open(fileName,"w") as f:
        json.dump(content,f,indent=2)


def read_json(fileName):
    with open(fileName, "r") as fp:
        return json.load(fp)


def cal_mask(inputs,corrupts,labels):
    assert inputs.shape == corrupts.shape and corrupts.shape == labels.shape
    masked = (labels == 1)
    correct = (inputs == corrupts)
    masked = masked.astype(float)
    correct = correct.astype(float)
    mask = masked * correct
    return mask
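For example, with toy arrays (shapes are illustrative), the returned mask marks positions that were masked for prediction and whose corrupted token still equals the original:

inputs   = np.array([[5, 6, 7, 8]])
corrupts = np.array([[5, 9, 7, 8]])   # position 1 was corrupted
labels   = np.array([[0, 1, 1, 0]])   # positions 1 and 2 were masked
print(cal_mask(inputs, corrupts, labels))  # [[0. 0. 1. 0.]]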
Example #14
                    continue
                data.append((wrong1, right1))
            except Exception as err:
                print(line)
    return data


all_data = load_data(corpus_path)
random.shuffle(all_data)

valid_data = all_data[:len(all_data) // 8]
train_data = all_data[len(all_data) // 8:]

# Load and simplify the vocabulary
token_dict, keep_words = load_vocab(
    dict_path=vocab_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])

tokenizer = Tokenizer(token_dict, do_lower_case=True)


class MyDataGenerator(DataGenerator):
    def __iter__(self, random=True):
        """
        Single sample format: [cls] wrong tokens [sep][mask][mask]..[sep]
        :param random:
        :return:
        """
        batch_tokens_ids, batch_segment_ids, batch_right_token_ids = [], [], []
        for is_end, D in self.sample(random):
            wrong, right = D
Example #15
File: dataset.py  Project: bettyYsj/nlp
    # The second half must not contain any disallowed word
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # The length must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Vocabulary and tokenizer from the pre-trained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# Filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items() if count >= min_word_frequency]
# Sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# Drop the counts and keep only the token list
tokens = [token for token, count in tokens]

# Build the new token->id mapping and the new vocabulary
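The snippet is cut off at this point; the step described by the comment typically mirrors Example #10, roughly as follows (the variable names below are assumptions):

token_id_dict, keep_words = {}, []
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + tokens:
    if token in _token_dict and token not in token_id_dict:
        token_id_dict[token] = len(token_id_dict)
        keep_words.append(_token_dict[token])
tokenizer = Tokenizer(token_id_dict, do_lower_case=True)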
Example #16
from transformers import *
pretrained_weights = 'bert-base-chinese'
pre_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)      # tokenizer.vocab_size = 21128

#  parameters
maxlen = 64
batch_size = 128
epochs = 99999
ser = 'dango'

#  setting vocabulary
config_path = '/home/'+ser+'/STC3/code/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/'+ser+'/STC3/code/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/home/'+ser+'/STC3/code/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'

token_dict, keep_tokens = load_vocab(dict_path=dict_path, simplified=True,
    startswith=['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[PAD]', '[UNK]', '[CLS]', '[SEP]'],)
#  [unused1]: Other, [unused2]: Like, [unused3]: Sadness, [unused4]: Disgust, [unused5]: Anger, [unused6]: Happiness
tokenizer = Tokenizer(token_dict, do_lower_case=True)

#  reading data
questions, answers, answer_ids = [], [], []
f = open('/home/'+ser+'/STC3/data/questions.txt','r',encoding='gbk')
lines = f.readlines()
for line in lines:
    line = line.strip()
    questions.append(line)
f.close()
f = open('/home/'+ser+'/STC3/data/answers.txt','r',encoding='gbk')
lines = f.readlines()
for line in lines:
    line = line.strip()
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path',
                        type=str,
                        required=True,
                        help='path to the BERT config file')
    parser.add_argument('--checkpoint_path',
                        type=str,
                        required=True,
                        help='path to the BERT weights')
    parser.add_argument('--dict_path', type=str, required=True, help='path to the vocabulary')
    parser.add_argument('--train_data_path',
                        type=str,
                        required=True,
                        help='path to the training data')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size',
                        default=8,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1e-5,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--topk1',
                        default=25,
                        type=int,
                        required=False,
                        help='top-k for randomly sampling the first sentence')
    parser.add_argument('--topk2',
                        default=2,
                        type=int,
                        required=False,
                        help='beam size when generating the second sentence')
    parser.add_argument('--max_seq_len',
                        default=256,
                        type=int,
                        required=False,
                        help='maximum sequence length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    maxlen = args.max_seq_len
    config_path = args.config_path
    checkpoint_path = args.checkpoint_path
    dict_path = args.dict_path
    batch_size = args.batch_size
    epochs = args.epochs
    topk1 = args.topk1
    topk2 = args.topk2
    num_classes = 2
    lr = args.lr

    train_data = args.train_data_path

    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    train_df = pd.read_csv(train_data, sep='\t', header=None)
    train_df.columns = ['s1', 's2', 'label']

    class data_generator(DataGenerator):
        """数据生成器
        """
        def __iter__(self, r=False):
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for i in idxs:
                line = self.data.loc[i]
                if (random.random() < 0.5):
                    s1 = line['s1'].replace('***', '*')
                    s2 = line['s2'].replace('***', '*')
                else:
                    s2 = line['s1'].replace('***', '*')
                    s1 = line['s2'].replace('***', '*')
                token_ids, segment_ids = tokenizer.encode(s1,
                                                          s2,
                                                          max_length=maxlen)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append([line['label']])
                if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids,
                           batch_labels], None
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []

    class CrossEntropy(Loss):
        """交叉熵作为loss,并mask掉padding部分
        """
        def compute_loss(self, inputs, mask=None):
            y_true, y_pred = inputs
            if mask[1] is None:
                y_mask = 1.0
            else:
                y_mask = K.cast(mask[1], K.floatx())[:, 1:]
            y_true = y_true[:, 1:]  # target token ids
            y_pred = y_pred[:, :-1]  # predicted sequence, shifted by one position
            loss = K.sparse_categorical_crossentropy(y_true, y_pred)
            loss = K.sum(loss * y_mask) / K.sum(y_mask)
            return loss

    c_in = Input(shape=(1, ))
    c = Embedding(num_classes, maxlen)(c_in)
    c = Reshape((maxlen, ))(c)

    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='lm',
        keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
        layer_norm_cond=c,
        additional_input_layers=c_in,
    )
    output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
    model = Model(model.inputs, output)
    model.compile(optimizer=Adam(lr))
    model.summary()

    def random_generate(c=0, n=2, s1_topk=5):
        """随机采样生成句子对
        每次从最高概率的topk个token中随机采样一个
        """
        label_ids = [[c] for _ in range(n)]
        target_ids = [[2] for _ in range(n)]
        sep_index = [0 for _ in range(n)]
        R = []
        for i in range(64):
            segment_ids = []
            for t, index in zip(target_ids, sep_index):
                if index > 0:
                    segment_ids.append([0] * index + [1] * (len(t) - index))
                else:
                    segment_ids.append([0] * len(t))
            # The slice below simply ignores [PAD], [UNK], [CLS]
            _probas = model.predict([target_ids, segment_ids,
                                     label_ids])[:, -1, 3:]
            for j, p in enumerate(_probas):
                p_arg_topk = p.argsort()[::-1][:s1_topk]
                #if 0 in p_arg_topk:
                #    target_ids[j].append(3)
                #else:
                p_topk = p[p_arg_topk]
                p = p_topk / sum(p_topk)
                idx = np.random.choice(len(p), p=p)
                target_ids[j].append(p_arg_topk[idx] + 3)

                if p_arg_topk[idx] + 3 == 3 and sep_index[j] == 0:
                    sep_index[j] = i
        for tokens in target_ids:
            tokens.append(3)
            cls_index = tokens.index(3)
            R.append(tokenizer.decode(tokens[:cls_index]))
            #sentences.sort(key = lambda i:len(i),reverse=True)
        return R

    def gen_sent(s, label, topk=2):
        """beam search解码
        每次只保留topk个最优候选结果;如果topk=1,那么就是贪心搜索
        """
        label_ids = [[label] for _ in range(topk)]
        token_ids, segment_ids = tokenizer.encode(s)
        target_ids = [[] for _ in range(topk)]  # candidate answer ids
        target_scores = [0] * topk  # candidate answer scores
        for i in range(64):  # force the output to be at most 64 tokens
            _target_ids = [token_ids + t for t in target_ids]
            _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
            _probas = model.predict([_target_ids, _segment_ids,
                                     label_ids])[:, -1, 3:]  # simply ignore [PAD], [UNK], [CLS]
            _log_probas = np.log(_probas + 1e-6)  # take logs to make accumulation easier
            _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # pick topk for each candidate
            _candidate_ids, _candidate_scores = [], []
            for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
                # When predicting the first token, all topk inputs are identical,
                # so only the first one needs to be considered.
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # pick the new topk from the candidates
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == 3:
                return tokenizer.decode(target_ids[best_one])
        # if no end token is found within the length limit, return directly
        return tokenizer.decode(target_ids[np.argmax(target_scores)])

    def gen_sen_pair(label, n, s1_topk, s2_topk):
        s1_pair = random_generate(label, n, s1_topk)
        output = []
        for line in s1_pair:
            s2 = gen_sent(line, label, s2_topk)
            output.append([line, s2])
        return output

    class Evaluate(keras.callbacks.Callback):
        def __init__(self):
            self.lowest = 1e10

        def on_epoch_end(self, epoch, logs=None):
            # save the best weights
            if logs['loss'] <= self.lowest:
                self.lowest = logs['loss']
                model.save_weights('./best_model.weights')
            print("正样本:")
            print(gen_sen_pair(1, 2, topk1, topk2))
            print("负样本:")
            print(gen_sen_pair(0, 2, topk1, topk2))

    train_generator = data_generator(train_df, batch_size)
    evaluator = Evaluate()
    model.fit_generator(train_generator.forfit(),
                        steps_per_epoch=len(train_generator),
                        epochs=epochs,
                        callbacks=[evaluator])