# -*- coding: utf-8 -*-
# @Date    : 2020/7/15
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : extract_feature.py
from toolkit4nlp.models import build_transformer_model
from toolkit4nlp.tokenizers import Tokenizer
import numpy as np

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)
model = build_transformer_model(config, checkpoint_path=ckpt)

token_ids, segment_ids = tokenizer.encode(u'我爱你中国')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
'''
Expected output (token-level features, truncated):
[[[-0.00827767  0.52711666 -0.2616654  ...  0.7717162   0.6682844  -0.3481327 ]
  [ 0.3665638   0.35970846  0.0772187  ... -0.5211092  -0.46724823  0.07845997]
  [ 0.6985213  -0.04391993 -1.3160559  ...  1.061864    0.8293197   0.07258661]
  ...
  [ 0.25169933  0.3048255  -1.2513847  ...  0.5438095   0.46753633 -0.61883307]
  [ 0.07904327 -0.08373377 -0.3963912  ...  0.29524678  0.74877214  ...]]]
'''
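
# --- Illustrative follow-up (not part of the original demo) ---
# The prediction above returns token-level features of shape (1, seq_len, 768).
# A common way to reduce them to a single sentence vector is to take the [CLS]
# position or to mean-pool over tokens; the pooling choice here is an assumption
# for illustration, reusing `model`, `tokenizer`, `token_ids`, `segment_ids` from above.
features = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
cls_vector = features[0]               # vector at the [CLS] position
mean_vector = features.mean(axis=0)    # mean pooling over all token positions
print(cls_vector.shape, mean_vector.shape)  # both (768,) for BERT-base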
        for i, l in enumerate(f):
            l = json.loads(l)
            text, label, label_des = l['sentence'], l['label'], l['label_desc']
            label = int(label) - 100 if int(label) < 105 else int(label) - 101
            D.append((text, int(label), label_des))
    return D


# Load the datasets
train_data = load_data(
    '/home/mingming.xu/datasets/NLP/CLUE/tnews_public/train.json')
valid_data = load_data(
    '/home/mingming.xu/datasets/NLP/CLUE/tnews_public/dev.json')

# tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

all_data = train_data + valid_data
pretrain_data = [d[0] for d in all_data]
# whole word mask
pretrain_data = [jieba.lcut(d) for d in pretrain_data]


def random_masking(lines):
    """Randomly mask the input; supports multiple lines."""
    if type(lines[0]) != list:
        lines = [lines]

    sources, targets = [tokenizer._token_start_id], [0]
""" GPT闲聊demo,参考:https://github.com/thu-coai/CDial-GPT """ import numpy as np from toolkit4nlp.models import build_transformer_model from toolkit4nlp.tokenizers import Tokenizer from toolkit4nlp.utils import AutoRegressiveDecoder config_path = 'D:/pretrain/GPT_LCCC-base-tf/gpt_config.json' checkpoint_path = 'D:/pretrain/GPT_LCCC-base-tf/gpt_model.ckpt' dict_path = 'D:/pretrain/GPT_LCCC-base-tf/vocab.txt' tokenizer = Tokenizer(dict_path, do_lower_case=True) speakers = [ tokenizer.token_to_id('[speaker1]'), tokenizer.token_to_id('[speaker2]') ] model = build_transformer_model( config_path=config_path, checkpoint_path=checkpoint_path, model='gpt', ) model.summary() class ChatBot(AutoRegressiveDecoder): """ 随机采样生成对话 """
            D.append((text, int(label), label_des))
    return D


# Load the datasets
train_data = load_data(
    '/home/mingming.xu/datasets/NLP/CLUE/iflytek/train.json')
valid_data = load_data('/home/mingming.xu/datasets/NLP/CLUE/iflytek/dev.json')

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    vocab_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """During transfer there is no extra labeled data, so data augmentation is
    used to simulate it: split each sentence into pieces, repeat and shuffle
    them, then join them back into a new sentence.
    """

    def __init__(self, data_augmentation=False, transfer=False, **kwargs):
        super(data_generator, self).__init__(**kwargs)
        self.data_augmentation = data_augmentation
        self.transfer = transfer

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label, label_des) in self.get_sample():
            if self.data_augmentation:
test_result_path = 'test_result.txt'
test_score_path = 'test_score.txt'

bert_config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
bert_dict = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'
bert_checkpoint = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'

maxlen = 256
bert_layers = 12
lr_multi = 2  # CRF loss amplification factor
num_labes = 4
lr = 1e-5
batch_size = 16  # gradients are accumulated for 2 steps on top of this, i.e. effective batch_size = 32
epochs = 5

tokenizer = Tokenizer(bert_dict, do_lower_case=True)


def load_data(data_path):
    items = []
    with open(data_path) as f:
        for line in f:
            chunk = re.split(' +', line.strip())
            items.append(chunk)
    return items


data = load_data(train_path)
row_nums = list(range(len(data)))
np.random.shuffle(row_nums)
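
# --- Illustrative sketch, not shown in the original excerpt ---
# One typical use of the shuffled `row_nums` is to split the loaded rows into
# train/validation subsets; the 80/20 ratio below is an assumption for
# illustration, not necessarily the author's setting.
split_point = int(len(data) * 0.8)
train_rows = [data[i] for i in row_nums[:split_point]]
valid_rows = [data[i] for i in row_nums[split_point:]]
print('train/valid sizes:', len(train_rows), len(valid_rows))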
import os  # needed for the path handling below (may already be imported earlier in the full script)

from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.utils import ViterbiDecoder, pad_sequences, DataGenerator
from toolkit4nlp.optimizers import Adam
from toolkit4nlp.layers import *
from toolkit4nlp.backend import K, sequence_masking

vocab_dict = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'
config_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'

data_dir = '/home/mingming.xu/datasets/NLP/ner/china-people-daily-ner-corpus/'
train_path = os.path.join(data_dir, 'example.train')
test_path = os.path.join(data_dir, 'example.test')
val_path = os.path.join(data_dir, 'example.dev')

tokenizer = Tokenizer(vocab_dict, do_lower_case=True)

maxlen = 256
lr = 1e-5
epochs = 5
batch_size = 16


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in f.split('\n\n'):
            if not l:
                continue
            d, last_flag = [], ''
        return x, y

    return TrainingDataset.load_tfrecord(record_names, batch_size, parse_func)


if __name__ == "__main__":
    from toolkit4nlp.tokenizers import Tokenizer
    import re
    import json
    import glob
    import jieba_fast as jieba
    from tqdm import tqdm

    jieba.initialize()

    vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'
    tokenizer = Tokenizer(vocab, do_lower_case=True)
    seq_length = 512

    def word_seg(text):
        return jieba.lcut(text)

    def generate_corp():
        file_names = glob.glob('/home/mingming.xu/datasets/NLP/qa/dureader_robust-data/pretraining/*')
        count, sentences = 0, []
        for fname in file_names:
            with open(fname) as fin:
                for p in json.load(fin)['data'][0]['paragraphs']:
                    para = [qa['question'] for qa in p['qas']]
# -*- coding: utf-8 -*-
# @Date    : 2020/7/16
# @Author  : mingming.xu
# @Email   : [email protected]
# @File    : mask_language_model.py
import numpy as np

from toolkit4nlp.tokenizers import Tokenizer
from toolkit4nlp.models import build_transformer_model

config = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_config.json'
ckpt = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/bert_model.ckpt'
vocab = '/home/mingming.xu/pretrain/NLP/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(vocab, do_lower_case=True)
model = build_transformer_model(config, checkpoint_path=ckpt, with_mlm=True)

# tokens, segs = tokenizer.encode('北京网聘技术有限公司')
tokens, segs = tokenizer.encode('科学技术是第一生产力')
# mask token positions 3 and 4 ("技" and "术"; position 0 is [CLS])
tokens[3] = tokens[4] = tokenizer._token_dict['[MASK]']

prob = model.predict([np.array([tokens]), np.array([segs])])[0]
print(tokenizer.decode(np.argmax(prob[3:5], axis=1)))
'''
The expected output is: 技术
'''
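
# --- Illustrative extension (not in the original script) ---
# Beyond the argmax, the MLM head gives a full distribution over the vocabulary
# for each masked position; inspecting the top-k candidates is a quick sanity
# check. Reuses `prob` and `tokenizer` from above; k=5 is an arbitrary choice.
top_k = 5
for pos in (3, 4):
    top_ids = np.argsort(prob[pos])[::-1][:top_k]
    candidates = [tokenizer.decode([int(i)]) for i in top_ids]
    print('position %d, top-%d candidates: %s' % (pos, top_k, candidates))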
train_data = load_data('train')
test_data = load_data('test')

num_classes = 32
maxlen = 128
batch_size = 8

# NEZHA-base (whole word masking) pretrained weights
config_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/bert_config.json'
checkpoint_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/model.ckpt'
dict_path = '/home/mingming.xu/pretrain/NLP/nezha_base_wwm/vocab.txt'

# tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# pattern
pattern = '直接回答问题:'
mask_idx = [1]
id2label = {
    0: '间',
    1: '直'
}
label2id = {v: k for k, v in id2label.items()}
labels = list(id2label.values())


def random_masking(token_ids):
    """Randomly mask the input token ids."""
    rands = np.random.random(len(token_ids))