def evaluate_report(df_data):
    model = tf.keras.models.load_model('{}-model.h5'.format(model_name))
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer once, not per sample
    true_y_list = df_data["new_label"].tolist()
    pred_y_list = []
    for text in df_data["text"].tolist():
        token_ids, segment_ids = tokenizer.encode(first_text=text, maxlen=maxlen)
        token_list = sequence_padding([token_ids])
        segment_list = sequence_padding([segment_ids])
        label = model.predict([np.array(token_list), np.array(segment_list)]).argmax(axis=1)
        pred_y_list.append(label[0])
    with open("label.json", "r", encoding="utf-8") as f:
        labels = json.loads(f.read())
    target_name_list = list(labels.values())
    report = classification_report(true_y_list, pred_y_list,
                                   target_names=target_name_list, digits=4, output_dict=True)
    print(report)
    df = pd.DataFrame(report).transpose()
    df.to_csv("{}-report.csv".format(model_type), encoding='utf_8_sig', index=True)
class MaskedLM(object):
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(BERT_CONFIG_PATH, BERT_CHECKPOINT_PATH, with_mlm=True)

    def tokenizer_text(self, text):
        # e.g. ['[CLS]', '我', '喜', '欢', '吃', '程', '度', '的', '火', '锅', '[SEP]']
        self.tokens = self.tokenizer.tokenize(text)
        # e.g. [101, 2769, 1599, 3614, 1391, 4923, 2428, 4638, 4125, 7222, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_top_candidates(self, error_index):
        for i in error_index:
            # replace the id of each suspected-error token with the [MASK] id
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']
        # e.g. positions 5 and 6 replaced by the [MASK] id 103:
        # [101, 2769, 1599, 3614, 1391, 103, 103, 4638, 4125, 7222, 102]
        # predict the probability distribution of every token, probs.shape = [len(token_ids), vocab_size]
        probs = self.model.predict(
            [np.array([self.token_ids]), np.array([self.segment_ids])])[0]
        for i in range(len(error_index)):
            # position of the masked token
            error_id = error_index[i]
            # take the ids of the topK most probable tokens; argsort is ascending, so negate to sort descending
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, find_prob = self.tokenizer.decode(
                top_k_probs), probs[error_id][top_k_probs]
            print(dict(zip(candidates, find_prob)))
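# A minimal usage sketch for MaskedLM above (the 0-based error positions and the
# sample sentence are illustrative assumptions, not part of the original code):
# mask the suspected positions, then print the top-K replacement candidates.
mlm = MaskedLM(topK=5)
mlm.tokenizer_text('我喜欢吃程度的火锅')
# positions 5 and 6 ("程", "度") in the token_ids sequence are treated as errors
mlm.find_top_candidates([5, 6])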
class ExtractBertFeatures:
    def __init__(self, model_name='uncased_L-4_H-256_A-4'):
        bert_model_dir = os.path.join(CUR_PASTH, 'data', model_name)
        self.load_model(model_dir=bert_model_dir)

    def load_model(self, model_dir):
        config_name = os.path.join(model_dir, 'bert_config.json')
        checkpoint_name = os.path.join(model_dir, 'bert_model.ckpt')
        vocab_name = os.path.join(model_dir, 'vocab.txt')
        self.tokenizer = Tokenizer(vocab_name, do_lower_case=True)  # build the tokenizer
        self.model = build_transformer_model(config_name, checkpoint_name)  # build the model and load the weights

    def predict(self, x, second_text=None, max_length=None, first_length=None,
                second_length=None, use_multiprocessing=False):
        token_ids, segment_ids = self.tokenizer.encode(x, second_text, max_length,
                                                       first_length, second_length)
        features = self.model.predict(
            [np.array([token_ids]), np.array([segment_ids])])
        return features
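# A minimal usage sketch for ExtractBertFeatures (the input sentence is an
# illustrative assumption): the returned array has shape
# (1, sequence_length, hidden_size), one vector per token including [CLS]/[SEP].
extractor = ExtractBertFeatures()
features = extractor.predict('jack plays all day')
print(features.shape)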
def build_model():
    config_path = GUWEN_CONFIG_PATH if use_guwenbert else ROBERTA_CONFIG_PATH
    checkpoint_path = GUWEN_CHECKPOINT_PATH if use_guwenbert else ROBERTA_CHECKPOINT_PATH
    dict_path = GUWEN_DICT_PATH if use_guwenbert else ROBERTA_DICT_PATH
    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='unilm',
        # keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, shrinking the original vocabulary
    )
    # load the trained weights
    model.load_weights(BEST_MODEL_PATH)
    autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=50)
    text = '却话巴山夜雨时'
    token_ids, segment_ids = tokenizer.encode(text)
    inputs = [np.array([token_ids]), np.array([segment_ids])]
    print(autotitle.predict(inputs, np.empty((1, 0), dtype=int), states=None))
    print(autotitle.generate("却话巴山夜雨时"))
    return autotitle
def pre_model(text):
    tokenizer = Tokenizer(dict_path, do_lower_case=True)
    token_ids, segment_ids = tokenizer.encode(first_text=text, maxlen=maxlen)
    token_list = sequence_padding([token_ids])
    segment_list = sequence_padding([segment_ids])
    label = model.predict([np.array(token_list), np.array(segment_list)]).argmax(axis=1)
    return int(label[0])
class MaskedLM():
    def __init__(self, topK):
        self.topK = topK
        self.tokenizer = Tokenizer(Config.BERT_VOCAB_PATH, do_lower_case=True)
        self.model = build_transformer_model(Config.BERT_CONFIG_PATH, Config.BERT_CHECKPOINT_PATH, with_mlm=True)
        self.token_ids, self.segment_ids = self.tokenizer.encode(' ')

    def tokenizer_text(self, text):
        self.token_ids, self.segment_ids = self.tokenizer.encode(text)

    def find_topn_candidates(self, error_index):
        for i in error_index:
            self.token_ids[i] = self.tokenizer._token_dict['[MASK]']  # replace each token to be corrected with [MASK]
        probs = self.model.predict([np.array([self.token_ids]), np.array([self.segment_ids])])[0]
        for i in range(len(error_index)):
            error_id = error_index[i]
            top_k_probs = np.argsort(-probs[error_id])[:self.topK]
            candidates, fin_prob = self.tokenizer.decode(top_k_probs), probs[error_id][top_k_probs]
            print(dict(zip(candidates, fin_prob)))
def vec4(tex):
    tokenizer = Tokenizer(dict_path, do_lower_case=True)
    token_ids, segment_ids = tokenizer.encode(tex)
    print("Token ID: " + str(token_ids))
    print("Segment ID: " + str(segment_ids))
    print('\n ===== predicting =====\n')
    vec1 = model.predict([np.array([token_ids]), np.array([segment_ids])])
    print(vec1.shape)
    return vec1
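# A minimal sketch (assumed usage, not from the original code): compare two
# sentences by the cosine similarity of the [CLS] vectors returned by vec4.
v1 = vec4('今天天气不错')[0][0]   # [CLS] vector of sentence 1
v2 = vec4('今天天气很好')[0][0]   # [CLS] vector of sentence 2
cos_sim = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(cos_sim)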
class Simparams:
    def __init__(self):
        self.max_seq_length = 128
        self.corpus_text = 'data/corpus3.json'
        self.config_path = 'data/chinese_simbert_L-12_H-768_A-12/bert_config.json'
        self.checkpoint_path = 'data/chinese_simbert_L-12_H-768_A-12/bert_model.ckpt'
        self.vocab_file = 'data/chinese_simbert_L-12_H-768_A-12/vocab.txt'
        self.tokenizer = Tokenizer(self.vocab_file, do_lower_case=True)  # build the tokenizer
        # load the model
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )
        self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
        # load the corpus database
        with open(self.corpus_text, 'r', encoding='utf-8') as load_f:
            self.classes = json.load(load_f)
        self.corpus = eval(self.classes)  # the JSON file stores the dict as a string
        self.list_vec = []
        self.list_corpus = []
        for c, v in self.corpus.items():
            self.list_vec.append(v)
            self.list_corpus.append(c)
        # append the newly added corpus
        df = pd.read_excel('data/新增数据.xlsx')
        for vn in range(len(df)):
            self.list_corpus.append(df['新增语料'][vn])
            self.list_vec.append(self.vec(df['新增语料'][vn]))
        self.list_vec = np.concatenate(self.list_vec).reshape(-1, 768)

    def vec(self, query):
        token_ids, segment_ids = self.tokenizer.encode(query, max_length=self.max_seq_length)
        vec = self.encoder.predict([[token_ids], [segment_ids]])[0]
        # normalize to a unit vector
        vec /= (vec ** 2).sum() ** 0.5
        return vec
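# A minimal retrieval sketch (assumed usage, not part of the original class):
# assuming the precomputed vectors in corpus3.json are unit-normalized like the
# ones produced by vec(), the dot product is cosine similarity, so the best
# match for a query is the row of list_vec with the largest score.
params = Simparams()
query_vec = params.vec('查询语句')
scores = params.list_vec.dot(query_vec)
best = int(np.argmax(scores))
print(params.list_corpus[best], scores[best])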
def test_load_and_save(self):
    current_folder = os.path.abspath(
        os.path.dirname(os.path.realpath(__file__)))
    bert_path = os.path.join(current_folder, 'assets', 'bert_sample_model')
    config_path = os.path.join(bert_path, 'bert_config.json')
    checkpoint_path = os.path.join(bert_path, 'bert_model.ckpt')
    dict_path = os.path.join(bert_path, 'vocab.txt')
    bert_model = build_transformer_model(config_path=config_path,
                                         checkpoint_path=checkpoint_path,
                                         model='bert',
                                         application='encoder',
                                         return_keras_model=True)
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
    # encoding test
    token_ids, segment_ids = tokenizer.encode(u'jack play all day')
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    print('\n ===== predicting =====\n')
    print(bert_model.predict([token_ids, segment_ids]))
    # serialize the model
    _ = bert_model.to_json()
# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights

# encode the text word by word, recording the token span of each word
text = u'计算机的鼠标有什么比较特殊的用途呢'
words = jieba.lcut(text)
spans = []
token_ids = [tokenizer._token_start_id]
for w in words:
    w_ids = tokenizer.encode(w)[0][1:-1]
    token_ids.extend(w_ids)
    spans.append((len(token_ids) - len(w_ids), len(token_ids)))
token_ids.append(tokenizer._token_end_id)
length = len(spans)


def dist(x, y):
    """Distance function (Euclidean distance by default).

    Inner product or cosine distance can be used instead; the results are similar.
    """
    return np.sqrt(((x - y) ** 2).sum())


batch_token_ids = np.array([token_ids] * (length * (length + 1) // 2))
# ========== data preparation: ==========
labels = sorted(list(set(df.label)))
assert len(labels) == num_classes, 'wrong num of classes!'
label2idx = {name: i for name, i in zip(labels, range(num_classes))}
#%%
print('start tokenizing...')
t = time.time()
X_token = []
X_seg = []
y = []
i = 0
for content, label in zip(list(df.content), list(df.label)):
    i += 1
    if i % 1000 == 0:
        print(i)
    token_ids, seg_ids = tokenizer.encode(content, maxlen=maxlen)
    X_token.append(token_ids)
    X_seg.append(seg_ids)
    y.append(label2idx[label])
# the sequences obtained above may have different lengths, so pad them:
X_token = sequence_padding(X_token)
X_seg = sequence_padding(X_seg)
y = np.array(y)
print('tokenizing time cost:', time.time() - t, 's.')
#%%
# ========== model training: ==========
old_list = []
ls_list = []
lcm_list = []
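# A minimal classifier sketch that could be trained on the padded X_token /
# X_seg / y arrays above (assumptions: config_path, checkpoint_path,
# num_classes and the bert4keras/keras imports are available as in the other
# snippets in this collection).
bert = build_transformer_model(config_path, checkpoint_path, return_keras_model=False)
cls_output = Lambda(lambda x: x[:, 0])(bert.model.output)       # take the [CLS] vector
probs = Dense(num_classes, activation='softmax')(cls_output)    # classification head
clf = keras.models.Model(bert.model.input, probs)
clf.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(2e-5), metrics=['accuracy'])
clf.fit([X_token, X_seg], y, batch_size=32, epochs=3, validation_split=0.1)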
class CmtClassifier: def __init__(self, model_type, model_para_paths, label_filepath, origin): # 属性赋值 self.model_type = model_type self.model_para_paths = model_para_paths self.label_filepath = label_filepath # 加载编号-标签字典 with open(label_filepath, "r") as fin: reader = csv.reader(fin) self.label_dict = {int(row[0]): row[1] for row in reader} # 创建分词器 self.tokenizer = Tokenizer(model_para_paths[2], do_lower_case=True) if origin: # 表示构建一个还未经过微调的模型 # 模型的上游 bert = build_transformer_model( config_path=model_para_paths[0], checkpoint_path=model_para_paths[1], model=model_type, return_keras_model=False, ) # 取[CLS]这个token的输出向量作为下游任务的输入 output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) # 模型的下游 output = Dense( units=len(self.label_dict), activation='softmax', kernel_initializer=bert.initializer )(output) # 连接模型的输入与输出 self.model = keras.models.Model(bert.model.input, output) else: # 表示模型已经过微调 self.model = build_transformer_model( config_path=model_para_paths[0], checkpoint_path=model_para_paths[1], model=model_type, return_keras_model=False, ) self.model.summary() # 显示模型结构 def fit(self, train_filepath, valid_filepath, temp_save_path, maxlen=128, learning_rate=1e-4, epochs=5, batch_size=32): train_data = load_data(train_filepath) train_generator = CmtDataGenerator(train_data, batch_size, self.tokenizer) callbacks = None if valid_filepath != "" and valid_filepath is not None \ and temp_save_path != "" and temp_save_path is not None: valid_data = load_data(valid_filepath) valid_generator = CmtDataGenerator(valid_data, batch_size, self.tokenizer) evaluator = Evaluator(self.model, valid_generator, temp_save_path) callbacks = [evaluator] AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') self.model.compile( loss='sparse_categorical_crossentropy', optimizer=AdamLR(learning_rate=learning_rate, lr_schedule={ 1000: 1, 2000: 0.1 }), metrics=['accuracy'], ) self.model.fit( train_generator.forfit(), steps_per_epoch=len(train_generator), epochs=epochs, callbacks=callbacks ) if callbacks is not None: self.model.load_weights(temp_save_path) def clean_data(self, input): at_pattern = re.compile("//@.*?:") url_pattern = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+") # 去掉Email和URL input = re.sub(at_pattern, "", input) input = re.sub(url_pattern, "", input) # 去掉首尾空格 return input.strip() def predict(self, input): tids, sids = self.encode(self.clean_data(input), 128) y_pred = self.model.predict([tids, sids]).argmax(axis=1) return y_pred, self.label_dict[y_pred] def encode(self, input, maxlen=None): return self.tokenizer.encode(input, maxlen) def load_weights(self, param_path): self.model.load_weights(param_path) def save_model(self, savepath): self.model.save(savepath)
class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder"""

    def __init__(self, model_path, max_len=32, seed=1):
        setup_seed(seed)
        self.config_path = os.path.join(model_path, "bert_config.json")
        self.checkpoint_path = os.path.join(model_path, "bert_model.ckpt")
        self.dict_path = os.path.join(model_path, "vocab.txt")
        self.max_len = max_len
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )
        self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
        self.seq2seq = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[1])
        super().__init__(start_id=None, end_id=self.tokenizer._token_end_id, maxlen=self.max_len)

    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
        return self.seq2seq.predict([token_ids, segment_ids])[:, -1]

    def generate(self, text, n=1, topk=5):
        token_ids, segment_ids = self.tokenizer.encode(text, max_length=self.max_len)
        output_ids = self.random_sample([token_ids, segment_ids], n, topk)
        return [self.tokenizer.decode(ids) for ids in output_ids]

    def gen_synonyms(self, text, n=100, k=20, threshold=0.75):
        """Generate n paraphrases of the sentence and return the k most similar ones.

        Method: generate with seq2seq, then score and sort the candidates with the encoder.
        """
        r = self.generate(text, n)
        r = [i for i in set(r) if i != text]
        r = [text] + r
        X, S = [], []
        for t in r:
            x, s = self.tokenizer.encode(t)
            X.append(x)
            S.append(s)
        X = sequence_padding(X)
        S = sequence_padding(S)
        Z = self.encoder.predict([X, S])
        Z /= (Z ** 2).sum(axis=1, keepdims=True) ** 0.5
        scores = np.dot(Z[1:], Z[0])
        argsort = scores.argsort()
        scores = scores.tolist()
        # print(scores.shape)
        # return [(r[i + 1], scores[i]) for i in argsort[::-1][:k] if scores[i] > threshold]
        return [(r[i + 1], scores[i]) for i in argsort[::-1][:k]]
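# A minimal usage sketch for SynonymsGenerator (the SimBERT model directory and
# the query sentence are illustrative assumptions): generate paraphrases and
# keep the 5 most similar ones.
gen = SynonymsGenerator('data/chinese_simbert_L-12_H-768_A-12')
for sent, score in gen.gen_synonyms('怎么开通网上银行', n=50, k=5):
    print(score, sent)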
# build and load the model
bert = build_transformer_model(
    config_path,
    checkpoint_path,
    with_pool='linear',
    application='unilm',
    return_keras_model=False,
)
encoder = keras.models.Model(bert.model.inputs, bert.model.outputs[0])
seq2seq = keras.models.Model(bert.model.inputs, bert.model.outputs[1])

ques = ['姚明的女儿', '姚明父亲']
X, S = [], []
for que in ques:
    x, s = tokenizer.encode(que)
    X.append(x)
    S.append(s)
X = sequence_padding(X)
S = sequence_padding(S)
with graph.as_default():
    Z = encoder.predict([X, S])


class SynonymsGenerator(AutoRegressiveDecoder):
    """seq2seq decoder"""

    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
    token_dict, keep_tokens, compound_tokens = json.load(
        open(seq2seq_config_json))
else:
    # load and simplify the vocabulary
    token_dict, keep_tokens = load_vocab(
        dict_path=nezha_dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    pure_tokenizer = Tokenizer(token_dict.copy(), do_lower_case=True)
    user_dict = []
    for w in load_user_dict(user_dict_path) + load_user_dict(user_dict_path_2):
        if w not in token_dict:
            token_dict[w] = len(token_dict)
            user_dict.append(w)
    compound_tokens = [pure_tokenizer.encode(w)[0][1:-1] for w in user_dict]
    json.dump([token_dict, keep_tokens, compound_tokens],
              open(seq2seq_config_json, 'w'))

tokenizer = Tokenizer(token_dict,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))


def generate_copy_labels(source, target):
    """Build the labels for the copy mechanism."""
    mapping = longest_common_subsequence(source, target)[1]
    source_labels = [0] * len(source)
    target_labels = [0] * len(target)
    i0, j0 = -2, -2
# layer_name = 'Transformer-9-FeedForward-Norm'  # name of the layer to tap
# intermediate_layer_model = Model(inputs=model.input,
#                                  outputs=model.get_layer(layer_name).output)  # new model exposing that layer
for layers in model.layers:
    print(layers.name)

maxlen = 70
# read and preprocess the data
f1 = 'D:/cluster/data/train.json'
res = load_data(f1)
output = []
print('start extracting')
# obtain the sentence vectors according to the chosen extraction method
for r in res:
    token_ids, segment_ids = tokenizer.encode(r, max_length=maxlen)
    if vector_name == 'cls':
        cls_vector = model.predict(
            [np.array([token_ids]), np.array([segment_ids])])[0][0]
        output.append(cls_vector)
    elif vector_name == 'mean':
        new = []
        vector = model.predict(
            [np.array([token_ids]), np.array([segment_ids])])[0]
        for i in range(768):
            temp = 0
            for j in range(len(vector)):
                temp += vector[j][i]
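# Note (an aside, not in the original snippet): the manual 'mean' loop above
# averages each of the 768 dimensions over all tokens; with NumPy the same
# result is simply
#     mean_vector = vector.mean(axis=0)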
#! -*- coding: utf-8 -*-
# basic usage test: MLM

from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask the two tokens of "技术"
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is indeed "技术"
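# A small generalization sketch (not part of the original test): mask an
# arbitrary token span [start, end) and return the MLM's most probable fill-in,
# reusing the tokenizer and model built above.
def fill_mask(text, start, end):
    token_ids, segment_ids = tokenizer.encode(text)
    for i in range(start, end):
        token_ids[i] = tokenizer._token_dict['[MASK]']
    probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
    return tokenizer.decode(probas[start:end].argmax(axis=1))

print(fill_mask(u'科学技术是第一生产力', 3, 5))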
class ReextractBertTrainHandler(): def __init__(self, params, Train=False): self.bert_config_path = model_root_path + "chinese_L-12_H-768_A-12/bert_config.json" self.bert_checkpoint_path = model_root_path + "chinese_L-12_H-768_A-12/bert_model.ckpt" self.bert_vocab_path = model_root_path + "chinese_L-12_H-768_A-12/vocab.txt" self.tokenizer = Tokenizer(self.bert_vocab_path, do_lower_case=True) self.model_path = model_root_path + "best_model.weights" self.params_path = model_root_path + 'params.json' gpu_id = params.get("gpu_id", None) self._set_gpu_id(gpu_id) # 设置训练的GPU_ID self.memory_fraction = params.get('memory_fraction') if Train: self.train_data_file_path = params.get('train_data_path') self.valid_data_file_path = params.get('valid_data_path') self.maxlen = params.get('maxlen', 128) self.batch_size = params.get('batch_size', 32) self.epoch = params.get('epoch') self.data_process() else: load_params = json.load(open(self.params_path, encoding='utf-8')) self.maxlen = load_params.get('maxlen') self.num_classes = load_params.get('num_classes') self.p2s_dict = load_params.get('p2s_dict') self.i2p_dict = load_params.get('i2p_dict') self.p2o_dict = load_params.get('p2o_dict') self.build_model() if not Train: self.load_model() def _set_gpu_id(self, gpu_id): if gpu_id: os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) def data_process(self): self.train_data, self.valid_data, self.p2s_dict, self.p2o_dict, self.i2p_dict, self.p2i_dict = data_process( self.train_data_file_path, self.valid_data_file_path, self.maxlen, self.params_path) self.num_classes = len(self.i2p_dict) self.train_generator = Data_Generator(self.train_data, self.batch_size, self.tokenizer, self.p2i_dict, self.maxlen) def extrac_subject(self, inputs): """根据subject_ids从output中取出subject的向量表征 """ output, subject_ids = inputs subject_ids = K.cast(subject_ids, 'int32') start = batch_gather(output, subject_ids[:, :1]) end = batch_gather(output, subject_ids[:, 1:]) subject = K.concatenate([start, end], 2) return subject[:, 0] def build_model(self): import tensorflow as tf from keras.backend.tensorflow_backend import set_session config = tf.ConfigProto() config.gpu_options.allocator_type = 'BFC' # A "Best-fit with coalescing" algorithm, simplified from a version of dlmalloc. 
if self.memory_fraction: config.gpu_options.per_process_gpu_memory_fraction = self.memory_fraction config.gpu_options.allow_growth = False else: config.gpu_options.allow_growth = True set_session(tf.Session(config=config)) # 补充输入 subject_labels = Input(shape=(None, 2), name='Subject-Labels') subject_ids = Input(shape=(2,), name='Subject-Ids') object_labels = Input(shape=(None, self.num_classes, 2), name='Object-Labels') # 加载预训练模型 bert = build_transformer_model( config_path=self.bert_config_path, checkpoint_path=self.bert_checkpoint_path, return_keras_model=False, ) # 预测subject output = Dense(units=2, activation='sigmoid', kernel_initializer=bert.initializer)(bert.model.output) subject_preds = Lambda(lambda x: x ** 2)(output) self.subject_model = Model(bert.model.inputs, subject_preds) # 传入subject,预测object # 通过Conditional Layer Normalization将subject融入到object的预测中 output = bert.model.layers[-2].get_output_at(-1) subject = Lambda(self.extrac_subject)([output, subject_ids]) output = LayerNormalization(conditional=True)([output, subject]) output = Dense(units=self.num_classes * 2, activation='sigmoid', kernel_initializer=bert.initializer)(output) output = Lambda(lambda x: x ** 4)(output) object_preds = Reshape((-1, self.num_classes, 2))(output) self.object_model = Model(bert.model.inputs + [subject_ids], object_preds) # 训练模型 self.train_model = Model(bert.model.inputs + [subject_labels, subject_ids, object_labels], [subject_preds, object_preds]) mask = bert.model.get_layer('Embedding-Token').output_mask mask = K.cast(mask, K.floatx()) subject_loss = K.binary_crossentropy(subject_labels, subject_preds) subject_loss = K.mean(subject_loss, 2) subject_loss = K.sum(subject_loss * mask) / K.sum(mask) object_loss = K.binary_crossentropy(object_labels, object_preds) object_loss = K.sum(K.mean(object_loss, 3), 2) object_loss = K.sum(object_loss * mask) / K.sum(mask) self.train_model.add_loss(subject_loss + object_loss) AdamEMA = extend_with_exponential_moving_average(Adam, name='AdamEMA') self.optimizer = AdamEMA(lr=1e-4) self.train_model.compile(optimizer=self.optimizer) def load_model(self): self.train_model.load_weights(self.model_path) def predict(self, text): """ 抽取输入text所包含的三元组 text:str(<离开>是由张宇谱曲,演唱) """ tokens = self.tokenizer.tokenize(text, max_length=self.maxlen) token_ids, segment_ids = self.tokenizer.encode(text, max_length=self.maxlen) # 抽取subject subject_preds = self.subject_model.predict([[token_ids], [segment_ids]]) start = np.where(subject_preds[0, :, 0] > 0.6)[0] end = np.where(subject_preds[0, :, 1] > 0.5)[0] subjects = [] for i in start: j = end[end >= i] if len(j) > 0: j = j[0] subjects.append((i, j)) if subjects: spoes = [] token_ids = np.repeat([token_ids], len(subjects), 0) segment_ids = np.repeat([segment_ids], len(subjects), 0) subjects = np.array(subjects) # 传入subject,抽取object和predicate object_preds = self.object_model.predict([token_ids, segment_ids, subjects]) for subject, object_pred in zip(subjects, object_preds): start = np.where(object_pred[:, :, 0] > 0.6) end = np.where(object_pred[:, :, 1] > 0.5) for _start, predicate1 in zip(*start): for _end, predicate2 in zip(*end): if _start <= _end and predicate1 == predicate2: spoes.append((subject, predicate1, (_start, _end))) break return [ ( [self.tokenizer.decode(token_ids[0, s[0]:s[1] + 1], tokens[s[0]:s[1] + 1]), self.p2s_dict[self.i2p_dict[p]]], self.i2p_dict[p], [self.tokenizer.decode(token_ids[0, o[0]:o[1] + 1], tokens[o[0]:o[1] + 1]), self.p2o_dict[self.i2p_dict[p]]], (s[0], s[1] + 1), (o[0], o[1] + 1) ) for s, p, o in 
spoes ] else: return [] def train(self): evaluator = Evaluator(self.train_model, self.model_path, self.tokenizer, self.predict, self.optimizer, self.valid_data) self.train_model.fit_generator(self.train_generator.forfit(), steps_per_epoch=len(self.train_generator), epochs=self.epoch, callbacks=[evaluator])
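# A minimal usage sketch for ReextractBertTrainHandler (the gpu_id value and the
# example sentence, taken from the predict docstring, are illustrative
# assumptions): load a trained model and extract (subject, predicate, object)
# triples from a sentence.
handler = ReextractBertTrainHandler({'gpu_id': 0}, Train=False)
for spo in handler.predict('《离开》是由张宇谱曲,演唱'):
    print(spo)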
class XlnetEmbedding(BaseEmbedding): def __init__(self, hyper_parameters): self.layer_indexes = hyper_parameters['embedding'].get('layer_indexes', [24]) self.xlnet_embed = hyper_parameters['embedding'].get('xlnet_embed', {}) self.batch_size = hyper_parameters['model'].get('batch_size', 2) super().__init__(hyper_parameters) def build_config(self, path_config: str=None): # reader config of bert self.configs = {} if path_config is not None: self.configs.update(json.load(open(path_config))) def build(self): from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI self.embedding_type = 'xlnet' self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt') self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json') self.spiece_model = os.path.join(self.corpus_path, 'spiece.model') self.attention_type = self.xlnet_embed.get('attention_type', 'bi') # or 'uni' self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI self.memory_len = self.xlnet_embed.get('memory_len', 0) self.target_len = self.xlnet_embed.get('target_len', 5) print('load xlnet model start!') # 模型加载 model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path, attention_type=self.attention_type, in_train_phase=self.trainable, config_path=self.config_path, memory_len=self.memory_len, target_len=self.target_len, batch_size=self.batch_size, mask_index=0) # set_custom_objects() self.build_config(self.config_path) # 字典加载 self.tokenizer = Tokenizer(self.spiece_model) # # debug时候查看layers # self.model_layers = model.layers # len_layers = self.model_layers.__len__() # print(len_layers) num_hidden_layers = self.configs.get("n_layer", 12) layer_real = [i for i in range(num_hidden_layers)] + [-i for i in range(num_hidden_layers)] # 简要判别一下 self.layer_indexes = [i if i in layer_real else -2 for i in self.layer_indexes] output_layer = "FeedForward-Normal-{0}" layer_dict = [model.get_layer(output_layer.format(i + 1)).get_output_at(node_index=0) for i in range(num_hidden_layers)] # 输出它本身 if len(self.layer_indexes) == 0: encoder_layer = model.output # 分类如果只有一层,取得不正确的话就取倒数第二层 elif len(self.layer_indexes) == 1: if self.layer_indexes[0] in layer_real: encoder_layer = layer_dict[self.layer_indexes[0]] else: encoder_layer = layer_dict[-1] # 否则遍历需要取的层,把所有层的weight取出来并加起来shape:768*层数 else: # layer_indexes must be [0, 1, 2,3,......24] all_layers = [layer_dict[lay] if lay in layer_real else layer_dict[-1] # 如果给出不正确,就默认输出倒数第一层 for lay in self.layer_indexes] print(self.layer_indexes) print(all_layers) all_layers_select = [] for all_layers_one in all_layers: all_layers_select.append(all_layers_one) encoder_layer = Add()(all_layers_select) print(encoder_layer.shape) # def xlnet_concat(x): # x_concat = K.concatenate(x, axis=1) # return x_concat # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers) self.output = NonMaskingLayer()(encoder_layer) self.input = model.inputs self.model = Model(self.input, self.output) print("load KerasXlnetEmbedding end") model.summary(132) self.embedding_size = self.model.output_shape[-1] self.vocab_size = len(self.tokenizer.sp) def sentence2idx(self, text, second_text=None): # text = extract_chinese(str(text).upper()) text = str(text).upper() tokens = self.tokenizer.encode(text) tokens = tokens + [0] * (self.target_len - len(tokens)) \ if len(tokens) < self.target_len \ else tokens[0:self.target_len] token_input = np.expand_dims(np.array(tokens), 
axis=0) segment_input = np.zeros_like(token_input) memory_length_input = np.zeros((1, 1)) # np.array([[self.memory_len]]) # np.zeros((1, 1)) masks = [1] * len(tokens) + ([0] * (self.target_len - len(tokens)) if len(tokens) < self.target_len else []) mask_input = np.expand_dims(np.array(masks), axis=0) if self.trainable: return [token_input, segment_input, memory_length_input, mask_input] else: return [token_input, segment_input, memory_length_input]
class BertEmbedding(BaseEmbedding): def __init__(self, hyper_parameters): self.layer_indexes = hyper_parameters['embedding'].get('layer_indexes', [12]) super().__init__(hyper_parameters) def build(self): import keras_bert self.embedding_type = 'bert' config_path = os.path.join(self.corpus_path, 'bert_config.json') check_point_path = os.path.join(self.corpus_path, 'bert_model.ckpt') dict_path = os.path.join(self.corpus_path, 'vocab.txt') print('load bert model start!') model = keras_bert.load_trained_model_from_checkpoint(config_path, check_point_path, seq_len=self.len_max, trainable=self.trainable) print('load bert model end!') # bert model all layers layer_dict = [6] layer_0 = 7 for i in range(12): layer_0 = layer_0 + 8 layer_dict.append(layer_0) print(layer_dict) # 输出它本身 if len(self.layer_indexes) == 0: encoder_layer = model.output # 分类如果只有一层,就只取最后那一层的weight;取得不正确,就默认取最后一层 elif len(self.layer_indexes) == 1: if self.layer_indexes[0] in [i + 1 for i in range(13)]: encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0] - 1]).output else: encoder_layer = model.get_layer(index=layer_dict[-1]).output # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 else: # layer_indexes must be [1,2,3,......12] # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes] all_layers = [model.get_layer(index=layer_dict[lay - 1]).output if lay in [i + 1 for i in range(13)] else model.get_layer(index=layer_dict[-1]).output # 如果给出不正确,就默认输出最后一层 for lay in self.layer_indexes] all_layers_select = [] for all_layers_one in all_layers: all_layers_select.append(all_layers_one) encoder_layer = Add()(all_layers_select) self.output = NonMaskingLayer()(encoder_layer) self.input = model.inputs self.model = Model(self.input, self.output) self.embedding_size = self.model.output_shape[-1] # word2idx = {} # with open(dict_path, 'r', encoding='utf-8') as f: # words = f.read().splitlines() # for idx, word in enumerate(words): # word2idx[word] = idx # for key, value in self.ot_dict.items(): # word2idx[key] = value # # self.token2idx = word2idx # reader tokenizer self.token_dict = {} with codecs.open(dict_path, 'r', 'utf8') as reader: for line in reader: token = line.strip() self.token_dict[token] = len(self.token_dict) self.vocab_size = len(self.token_dict) self.tokenizer = keras_bert.Tokenizer(self.token_dict) def build_keras4bert(self): import bert4keras from bert4keras.models import build_transformer_model from bert4keras.tokenizers import Tokenizer,load_vocab import os self.embedding_type = 'bert' config_path = os.path.join(self.corpus_path, 'bert_config.json') checkpoint_path = os.path.join(self.corpus_path, 'bert_model.ckpt') dict_path = os.path.join(self.corpus_path, 'vocab.txt') self.model = bert4keras.models.build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path) # 加载并精简词表,建立分词器 self.token_dict, keep_tokens = load_vocab( dict_path=dict_path, simplified=True, startwith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'], ) self.vocab_size = len(self.token_dict) self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True) def sentence2idx(self, text, second_text=None): text = extract_chinese(str(text).upper()) text = str(text).upper() input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max) return [input_id, input_type_id]
class Bert4KearsBase(BaseModel): def __init__(self, config): # config_path,checkpoint_path,dict_path ''' config = {"config_path":,"checkpoint_path":,"save_dir":,"dict_path":} ''' super().__init__(config) init_dir(self.save_dir) self.tokenizer = Tokenizer(self.config['dict_path'], do_lower_case=True) self.graph = tf.get_default_graph() self.model_name = None self.best_weights_path = None self.model_path = None def optimizer(self): AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR') _optimizer = AdamLR(lr=1e-5, lr_schedule={1000: 1, 2000: 0.1}) return _optimizer def _init_model(self): # 加载预训练模型 bert = build_transformer_model( config_path=self.config['config_path'], checkpoint_path=self.config['checkpoint_path'], model=self.model_name, return_keras_model=False, ) output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output) output = Dense(units=self.num_labels, activation='softmax', kernel_initializer=bert.initializer)(output) model = keras.models.Model(bert.model.input, output) return model def _load_data(self, path): df = load_df(path) D = [] for text, label in zip(df['text'], df['label']): D.append((str(text), int(label))) return D def process_data(self, train_path, dev_path, test_path): train_data = self._load_data(train_path) dev_data = self._load_data(dev_path) test_data = self._load_data(test_path) train_generator = data_generator(train_data, self.tokenizer, self.max_len, self.batch_size) dev_generator = data_generator(dev_data, self.tokenizer, self.max_len, self.batch_size) test_generator = data_generator(test_data, self.tokenizer, self.max_len, self.batch_size) return train_generator, dev_generator, test_generator def train(self, train_path, dev_path, test_path): self.set_seed(self.seed) # 为了可复现 train_generator, dev_generator, test_generator = self.process_data( train_path, dev_path, test_path) # load model with self.graph.as_default(): self.model = self._init_model() _optimizer = self.optimizer() self.model.compile( loss='sparse_categorical_crossentropy', optimizer=_optimizer, metrics=['accuracy'], ) # start train early_stopping_monitor = EarlyStopping(patience=self.patience, verbose=1) checkpoint = ModelCheckpoint(self.best_weights_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto', period=1) callbacks = [early_stopping_monitor, checkpoint] self.model.fit_generator(train_generator.forfit(), steps_per_epoch=len(train_generator), validation_data=dev_generator.forfit(), validation_steps=len(dev_generator), epochs=self.epochs, callbacks=callbacks) self.model.load_weights(self.best_weights_path) self.model.save(self.model_path) model_report = self.evaluate(test_path) return model_report def load_model(self, model_path): self.model = keras.models.load_model(model_path, custom_objects=custom_objects) def demo(self, text): text_list = [text] pred_list = self.demo_text_list(text_list) pred = pred_list[0] return pred def demo_text_list(self, text_list): batch_token_ids, batch_segment_ids = [], [] for text in text_list: token_ids, segment_ids = self.tokenizer.encode( text, max_length=self.max_len) batch_token_ids.append(token_ids) batch_segment_ids.append(segment_ids) batch_token_ids = sequence_padding(batch_token_ids) batch_segment_ids = sequence_padding(batch_segment_ids) with self.graph.as_default(): preds = self.model.predict([batch_token_ids, batch_segment_ids]) if self.num_labels == 2: pred_list = preds[:, 1] else: pred_list = np.argmax(preds, axis=1).flatten() return pred_list def release(self): # K.clear_session() del self.model del self.graph 
del self.tokenizer
from bert4keras.models import build_transformer_model
from roformer import RoFormerModel, RoFormerTokenizer

jieba.initialize()

config_path = 'E:/BaiduNetdiskDownload/chinese_roformer_L-12_H-768_A-12/bert_config.json'
checkpoint_path = 'E:/BaiduNetdiskDownload/chinese_roformer_L-12_H-768_A-12/bert_model.ckpt'
dict_path = 'E:/BaiduNetdiskDownload/chinese_roformer_L-12_H-768_A-12/vocab.txt'
# converted_ckpt_path = "pretrained_models/chinese_roformer_base"
converted_ckpt_path = "junnyu/roformer_chinese_base"  # https://huggingface.co/junnyu/roformer_chinese_base

tokenizer = Tokenizer(dict_path,
                      do_lower_case=True,
                      pre_tokenize=lambda s: jieba.cut(s, HMM=False))
text = "这里基本保留了唐宋遗留下来的坊巷格局和大量明清古建筑,其中各级文保单位29处,被誉为“里坊制度的活化石”“明清建筑博物馆”!"

# bert4keras
inputs = tokenizer.encode(text)
tf_inputs = [
    tf.convert_to_tensor(inputs[0])[None],
    tf.convert_to_tensor(inputs[1])[None]
]
model = build_transformer_model(config_path=config_path,
                                checkpoint_path=checkpoint_path,
                                model='roformer')
bert4keras_outputs = torch.tensor(model(tf_inputs).numpy())

# pytorch
roformer_tokenizer = RoFormerTokenizer.from_pretrained(converted_ckpt_path)
pt_model = RoFormerModel.from_pretrained(converted_ckpt_path, add_pooling_layer=False)
pt_inputs = roformer_tokenizer(text, return_tensors="pt")
with torch.no_grad():
    pt_outputs = pt_model(**pt_inputs).last_hidden_state
class XlnetEmbedding(BaseEmbedding): def __init__(self, hyper_parameters): self.layer_indexes = hyper_parameters['embedding'].get( 'layer_indexes', [24]) self.xlnet_embed = hyper_parameters['embedding'].get('xlnet_embed', {}) self.batch_size = hyper_parameters['model'].get('batch_size', 2) super().__init__(hyper_parameters) def build(self): from keras_xlnet import load_trained_model_from_checkpoint, set_custom_objects from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI self.embedding_type = 'xlnet' self.checkpoint_path = os.path.join(self.corpus_path, 'xlnet_model.ckpt') self.config_path = os.path.join(self.corpus_path, 'xlnet_config.json') self.spiece_model = os.path.join(self.corpus_path, 'spiece.model') self.attention_type = self.xlnet_embed.get('attention_type', 'bi') # or 'uni' self.attention_type = ATTENTION_TYPE_BI if self.attention_type == 'bi' else ATTENTION_TYPE_UNI self.memory_len = self.xlnet_embed.get('memory_len', 0) self.target_len = self.xlnet_embed.get('target_len', 5) print('load xlnet model start!') # 模型加载 model = load_trained_model_from_checkpoint( checkpoint_path=self.checkpoint_path, attention_type=self.attention_type, in_train_phase=self.trainable, config_path=self.config_path, memory_len=self.memory_len, target_len=self.target_len, batch_size=self.batch_size, mask_index=0) # set_custom_objects() # 字典加载 self.tokenizer = Tokenizer(self.spiece_model) # debug时候查看layers self.model_layers = model.layers len_layers = self.model_layers.__len__() print(len_layers) layer_real = [i for i in range(25)] + [-i for i in range(25)] # 简要判别一下 self.layer_indexes = [ i if i in layer_real else -2 for i in self.layer_indexes ] len_couche = int((len_layers - 6) / 10) # 一共246个layer # 每层10个layer(MultiHeadAttention,Dropout,Add,LayerNormalization),第一是9个layer的输入和embedding层 # 一共24层 layer_dict = [] layer_0 = 7 for i in range(len_couche): layer_0 = layer_0 + 10 layer_dict.append(layer_0) layer_dict.append(247) # 测试 get_output_at # def get_number(index): # try: # model_node = model.get_output_at(node_index=index) # gg = 0 # except: # print('node index wrong!') # print(index) # list_index = [i for i in range(25)] + [-i for i in range(25)] # for li in list_index: # get_number(li) # 输出它本身 if len(self.layer_indexes) == 0: encoder_layer = model.output # 分类如果只有一层,取得不正确的话就取倒数第二层 elif len(self.layer_indexes) == 1: if self.layer_indexes[0] in layer_real: encoder_layer = model.get_layer( index=layer_dict[self.layer_indexes[0]]).get_output_at( node_index=0) else: encoder_layer = model.get_layer( index=layer_dict[-1]).get_output_at(node_index=0) # 否则遍历需要取的层,把所有层的weight取出来并加起来shape:768*层数 else: # layer_indexes must be [0, 1, 2,3,......24] all_layers = [ model.get_layer(index=layer_dict[lay]).get_output_at( node_index=0) if lay in layer_real else model.get_layer( index=layer_dict[-1]).get_output_at( node_index=0) # 如果给出不正确,就默认输出倒数第一层 for lay in self.layer_indexes ] print(self.layer_indexes) print(all_layers) all_layers_select = [] for all_layers_one in all_layers: all_layers_select.append(all_layers_one) encoder_layer = Add()(all_layers_select) print(encoder_layer.shape) # def xlnet_concat(x): # x_concat = K.concatenate(x, axis=1) # return x_concat # encoder_layer = Lambda(xlnet_concat, name='xlnet_concat')(all_layers) self.output = NonMaskingLayer()(encoder_layer) self.input = model.inputs self.model = Model(self.input, self.output) print("load KerasXlnetEmbedding end") model.summary(132) self.embedding_size = self.model.output_shape[-1] self.vocab_size = len(self.tokenizer.sp) def 
sentence2idx(self, text, second_text=None): # text = extract_chinese(str(text).upper()) text = str(text).upper() tokens = self.tokenizer.encode(text) tokens = tokens + [0] * (self.target_len - len(tokens)) \ if len(tokens) < self.target_len \ else tokens[0:self.target_len] token_input = np.expand_dims(np.array(tokens), axis=0) segment_input = np.zeros_like(token_input) memory_length_input = np.zeros( (1, 1)) # np.array([[self.memory_len]]) # np.zeros((1, 1)) masks = [1] * len(tokens) + ([0] * (self.target_len - len(tokens)) if len(tokens) < self.target_len else []) mask_input = np.expand_dims(np.array(masks), axis=0) if self.trainable: return [ token_input, segment_input, memory_length_input, mask_input ] else: return [token_input, segment_input, memory_length_input]
class MRCTrainer(): def __init__(self, train_param, model_save_path): self.lr = train_param['learning_rate'] self.max_p_len = train_param['max_p_len'] self.max_q_len = train_param['max_q_len'] self.max_a_len = train_param['max_a_len'] self.epochs = train_param['epochs'] self.pretrain_type = train_param['pretrain_type'] self.batch_size = train_param['batch_size'] self.config_path = train_param['config_path'] self.checkpoint_path = train_param['checkpoint_path'] self.dict_path = train_param['dict_path'] self.model_config = train_param self.model_config['model_save_path'] = model_save_path self.model_save_path = model_save_path self.buildmodel() def masked_cross_entropy(self, y_true, y_pred): y_true = K.reshape(y_true, [K.shape(y_true)[0], -1]) y_mask = K.cast(K.not_equal(y_true, 0), K.floatx()) cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred) cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask) return cross_entropy def buildmodel(self): self.token_dict, self.keep_tokens = load_vocab( dict_path=self.dict_path, simplified=True, startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], ) self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True) if self.pretrain_type == 'albert': model = build_transformer_model( config_path, checkpoint_path, model='albert', with_mlm=True, keep_tokens=self.keep_tokens, ) elif self.pretrain_type == 'bert': model = build_transformer_model( config_path, checkpoint_path, model='bert', with_mlm=True, keep_tokens=self.keep_tokens, ) output = Lambda(lambda x: x[:, 1:self.max_a_len + 1])(model.output) #print(output.shape) self.model = Model(model.input, output) self.model.compile(loss=self.masked_cross_entropy, optimizer=Adam(self.lr)) self.model.summary() def fit(self, train_data): params_file = os.path.join(self.model_save_path, 'config.json') with open(params_file, 'w', encoding='utf-8') as json_file: json.dump(self.model_config, json_file, indent=4, ensure_ascii=False) evaluator = Evaluator(self.model, self.model_save_path) train_generator = data_generator(train_data, self.tokenizer, self.batch_size, self.max_a_len, self.max_q_len, self.max_p_len) self.model.fit_generator(train_generator.forfit(), steps_per_epoch=len(train_generator), epochs=epochs, callbacks=[evaluator]) def get_ngram_set(self, x, n): """生成ngram合集,返回结果格式是: {(n-1)-gram: set([n-gram的第n个字集合])} """ result = {} for i in range(len(x) - n + 1): k = tuple(x[i:i + n]) if k[:-1] not in result: result[k[:-1]] = set() result[k[:-1]].add(k[-1]) return result def gen_answer(self, question, passage): token_ids, segment_ids = [], [] passage = re.sub(u' |、|;|,', ',', passage) p_token_ids, _ = self.tokenizer.encode(passage, max_length=self.max_p_len + 1) q_token_ids, _ = self.tokenizer.encode(question, max_length=self.max_q_len + 1) token_ids = [self.tokenizer._token_start_id] token_ids += [self.tokenizer._token_mask_id] * max_a_len token_ids += [self.tokenizer._token_end_id] token_ids += q_token_ids[1:] + p_token_ids[1:] segment_ids = [0] * len(token_ids[-1]) token_ids = sequence_padding(token_ids) segment_ids = sequence_padding(segment_ids) probas = self.model.predict([token_ids, segment_ids]) results = {} a, score = tuple(), 0. 
for i in range(max_a_len): idxs = list(self.get_ngram_set(token_ids, i + 1)[a]) print("idxs", idxs) if self.tokenizer._token_end_id not in idxs: idxs.append(self.tokenizer._token_end_id) pi = np.zeros_like(probas[i]) pi[idxs] = probas[i, idxs] a = a + (pi.argmax(), ) score += pi.max() if a[-1] == self.tokenizer._token_end_id: break score = score / (i + 1) a = self.tokenizer.decode(a) if a: results[a] = results.get(a, []) + [score] results = { k: (np.array(v)**2).sum() / (sum(v) + 1) for k, v in results.items() } return results def evalue(self): result = [] return result
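# A small illustration (not in the original) of what get_ngram_set returns:
# for x = [5, 6, 7, 5, 6, 8] and n = 2 the mapping is
#     {(5,): {6}, (6,): {7, 8}, (7,): {5}}
# i.e. which token ids may follow a given (n-1)-gram; gen_answer uses it to
# constrain decoding so that the generated answer only contains spans that
# actually occur in the passage.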
# 测试代码可用性: 提取特征 from bert4keras.backend import keras from bert4keras.models import build_transformer_model from bert4keras.tokenizers import Tokenizer import numpy as np config_path = '../models/bert/chinese_L-12_H-768_A-12/bert_config.json' checkpoint_path = '../models//bert/chinese_L-12_H-768_A-12/bert_model.ckpt' dict_path = '../models//bert/chinese_L-12_H-768_A-12/vocab.txt' tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 model = build_transformer_model(config_path, checkpoint_path) # 建立模型,加载权重 # 编码测试 token_ids, segment_ids = tokenizer.encode(u'阅读理解') print('\n ===== predicting =====\n') print(model.predict([np.array([token_ids]), np.array([segment_ids])])) """ 输出: [[[-0.63251007 0.2030236 0.07936534 ... 0.49122632 -0.20493352 0.2575253 ] [-0.7588351 0.09651865 1.0718756 ... -0.6109694 0.04312154 0.03881441] [ 0.5477043 -0.792117 0.44435206 ... 0.42449304 0.41105673 0.08222899] [-0.2924238 0.6052722 0.49968526 ... 0.8604137 -0.6533166 0.5369075 ] [-0.7473459 0.49431565 0.7185162 ... 0.3848612 -0.74090636 0.39056838]
)  # build the model and load the weights

sentences = []
init_sent = u'科学技术是第一生产力。'  # a seed sentence, or None
minlen, maxlen = 8, 32
steps = 10000
converged_steps = 1000
vocab_size = tokenizer._vocab_size

if init_sent is None:
    length = np.random.randint(minlen, maxlen + 1)
    tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]']
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
else:
    token_ids, segment_ids = tokenizer.encode(init_sent)
    length = len(token_ids) - 2

for _ in tqdm(range(steps), desc='Sampling'):
    # Gibbs sampling step: randomly mask one token, then resample it with the MLM.
    i = np.random.choice(length) + 1
    token_ids[i] = tokenizer._token_mask_id
    probas = model.predict(to_array([token_ids], [segment_ids]))[0, i]
    token = np.random.choice(vocab_size, p=probas)
    token_ids[i] = token
    sentences.append(tokenizer.decode(token_ids))

print(u'A few of the sampled sentences:')
for _ in range(10):
    print(np.random.choice(sentences[converged_steps:]))
import numpy as np from bert4keras.backend import keras from bert4keras.models import build_transformer_model from bert4keras.tokenizers import Tokenizer from bert4keras.snippets import to_array config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json' checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt' dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt' tokenizer = Tokenizer(dict_path, do_lower_case=True) # 建立分词器 model = build_transformer_model(config_path, checkpoint_path) # 建立模型,加载权重 # 编码测试 token_ids, segment_ids = tokenizer.encode(u'语言模型') token_ids, segment_ids = to_array([token_ids], [segment_ids]) print('\n ===== predicting =====\n') print(model.predict([token_ids, segment_ids])) """ 输出: [[[-0.63251007 0.2030236 0.07936534 ... 0.49122632 -0.20493352 0.2575253 ] [-0.7588351 0.09651865 1.0718756 ... -0.6109694 0.04312154 0.03881441] [ 0.5477043 -0.792117 0.44435206 ... 0.42449304 0.41105673 0.08222899] [-0.2924238 0.6052722 0.49968526 ... 0.8604137 -0.6533166 0.5369075 ] [-0.7473459 0.49431565 0.7185162 ... 0.3848612 -0.74090636
class simBERT(object):
    '''A BERT-based semantic computation engine'''

    def __init__(self, config=bert_config_path, checkpoint=bert_checkpoint_path, dicts=bert_dict_path):
        from bert4keras.backend import keras
        from bert4keras.tokenizers import Tokenizer
        from bert4keras.snippets import sequence_padding
        from bert4keras.models import build_transformer_model
        self.config_path = config
        self.checkpoint_path = checkpoint
        self.dict_path = dicts
        self.tokenizer = Tokenizer(self.dict_path, do_lower_case=True)
        self.sequence_padding = sequence_padding
        self.bert = build_transformer_model(
            self.config_path,
            self.checkpoint_path,
            with_pool='linear',
            application='unilm',
            return_keras_model=False,
        )
        self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
        # self.seq2seq = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[1])

    def sent2vec(self, sent):
        # -------------------------------------------------
        # description: turn a sentence (or a list of sentences) into vectors
        # param sent: str or list of str, the input sentence(s)
        # return: L2-normalized sentence vectors
        # -------------------------------------------------
        if isinstance(sent, list):
            X, S = [], []
            for s in sent:
                x, s = self.tokenizer.encode(s)
                X.append(x)
                S.append(s)
            X = self.sequence_padding(X)
            S = self.sequence_padding(S)
            Z = self.encoder.predict([X, S])
        else:
            x, s = self.tokenizer.encode(sent)
            X = self.sequence_padding([x])
            S = self.sequence_padding([s])
            Z = self.encoder.predict([X, S], verbose=1)
        # normalize the vectors so that the usual distances can be computed on them
        return normalize(Z)

    def keywords(self, token=None, text='', topn=1, with_sim=True):
        # -------------------------------------------------
        # description: keyword matching; if `token` is given, return the word in it
        #              most similar to the sentence, otherwise return keywords of the sentence
        # param token: list, candidate words (may be None)
        # param text: str, the input text
        # param topn: int, defaults to 1, at most the length of `token`
        # param with_sim: bool, if True return the similarity along with each word
        # return:
        # -------------------------------------------------
        if token is not None:
            r = token + [text]
            # r = token + [c for c in cut(text) if len(c) > 1]
        else:
            token = [c for c in cut(text) if len(c) > 1]
            r = token + token
        X, S = [], []
        for t in r:
            x, s = self.tokenizer.encode(t)
            X.append(x)
            S.append(s)
        X = self.sequence_padding(X)
        S = self.sequence_padding(S)
        Z = normalize(self.encoder.predict([X, S]))
        score = np.dot(Z[len(token):], Z[:len(token)].T)
        # print(score.shape)
        if with_sim:
            return [(token[i], score[0][i]) for i in topK(score, topn)[1]]
        return np.array(token)[topK(score, topn)[1]]

    def sentence_similarity(self, sent_1, sent_2):
        # -------------------------------------------------
        # description: sentence similarity
        # param sent_1: str, input sentence
        # param sent_2: str, input sentence
        # return:
        # -------------------------------------------------
        sent_vec_1 = self.sent2vec(sent_1)
        sent_vec_2 = self.sent2vec(sent_2)
        similarity = np.dot(sent_vec_1, sent_vec_2.T)
        return similarity[0][0]
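# A minimal usage sketch for simBERT (the sentences and the candidate word list
# are illustrative assumptions): score two sentences and pick the candidate
# word most related to a sentence.
engine = simBERT()
print(engine.sentence_similarity('怎么开通网上银行', '网上银行如何开通'))
print(engine.keywords(token=['银行', '天气'], text='怎么开通网上银行', topn=1))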
        D.append((text1, text2, int(label)))
    return D


# load the datasets
train_data = load_data('datasets/lcqmc/lcqmc.train.data')
valid_data = load_data('datasets/lcqmc/lcqmc.valid.data')
test_data = load_data('datasets/lcqmc/lcqmc.test.data')

# evaluate the similarity quality
data = valid_data
a_token_ids, b_token_ids, labels = [], [], []
texts = []

for d in data:
    token_ids = tokenizer.encode(d[0], max_length=maxlen)[0]
    a_token_ids.append(token_ids)
    token_ids = tokenizer.encode(d[1], max_length=maxlen)[0]
    b_token_ids.append(token_ids)
    labels.append(d[2])
    texts.extend(d[:2])

a_token_ids = sequence_padding(a_token_ids)
b_token_ids = sequence_padding(b_token_ids)
a_vecs = encoder.predict([a_token_ids, np.zeros_like(a_token_ids)], verbose=True)
b_vecs = encoder.predict([b_token_ids, np.zeros_like(b_token_ids)], verbose=True)
labels = np.array(labels)

a_vecs = a_vecs / (a_vecs**2).sum(axis=1, keepdims=True)**0.5
output = Dense(units=2, activation='softmax', kernel_initializer=bert.initializer)(output)
model = keras.models.Model(bert.model.input, output)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(config['learning_rate']),
              metrics=['sparse_categorical_accuracy'])
model.load_weights('Disaster_Rumor_Detection_best_model_1_0.weights')

# Tokenizer
tokenizer = Tokenizer(config['dict_path'], do_lower_case=True)

# Make predictions
table = []
for idx, row in df_test.iterrows():
    token_ids, seg_ids = tokenizer.encode(row['text'], maxlen=config['max_len'])
    result = model.predict([[token_ids], [seg_ids]]).argmax(axis=1)
    table.append([row['id'], result[0]])
    print('Data id {} prediction done!'.format(row['id']))
    print('And result is {}'.format(result[0]))
    print('-' * 60)

final_result = pd.DataFrame(table, columns=['id', 'target'])

if __name__ == '__main__':
    print(final_result.head())
    final_result.to_csv('mysubmission.csv', index=False)