def test_vector(self):
    text = '语言模型'
    tokens1, segs1 = self.sjl_tokenizer.encode(text)
    tokens2, segs2 = self.my_tokenizer.transform(text)
    self.assertEqual(tokens1, tokens2)
    self.assertEqual(segs1, segs2)

    tokens1, segs1 = to_array([tokens1], [segs1])
    tokens2, segs2 = to_array([tokens2], [segs2])

    from bert4keras.models import build_transformer_model
    print(build_transformer_model.__module__)
    model = build_transformer_model(config_path, checkpoint_path)
    res1 = model.predict([tokens1, segs1])
    del model
    gc.collect()

    from garnet.models.build import build_transformer_model
    print(build_transformer_model.__module__)
    model = build_transformer_model(config_path, checkpoint_path)
    res2 = model.predict([tokens2, segs2])
    del model
    gc.collect()

    shape = res1.shape
    self.assertEqual(np.sum(res1), np.sum(res2))
    for k in range(shape[0]):
        for i in range(shape[1]):
            for j in range(shape[2]):
                self.assertAlmostEqual(res1[k, i, j], res2[k, i, j])
def get_similarity_bert(strx, stry, bm, tokenizer):
    token_ids, segment_ids = tokenizer.encode(strx)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    a = bm.predict([token_ids, segment_ids])

    token_ids, segment_ids = tokenizer.encode(stry)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    b = bm.predict([token_ids, segment_ids])

    return cos_sim(a[0][0], b[0][0])
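A minimal usage sketch for `get_similarity_bert`, under the assumption that `bm` is a bert4keras BERT encoder; the checkpoint paths, the example sentences, and the `cos_sim` helper below are illustrative stand-ins rather than this repository's own definitions.

# Usage sketch (assumed setup; paths and cos_sim are placeholders).
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array


def cos_sim(a, b):
    # Plain cosine similarity between two vectors; the original helper may differ.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


tokenizer = Tokenizer('vocab.txt', do_lower_case=True)                # hypothetical vocab path
bm = build_transformer_model('bert_config.json', 'bert_model.ckpt')   # hypothetical checkpoint paths

print(get_similarity_bert(u'今天天气很好', u'今天天气不错', bm, tokenizer))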
def generate(self, context):
    token_ids, segment_ids = tokenizer.encode(context)
    context_len = len(token_ids)
    segment_id = segment_ids[-1] + 1
    sentence = ''
    words = []
    gen_tokens = []
    for i in range(self.maxlen):
        token_ids.append(tokenizer._token_dict['[MASK]'])
        segment_ids.append(segment_id)
        tokens, segments = to_array([token_ids], [segment_ids])
        probas = biden_model.predict([tokens, segments])[0]
        # token = probas[context_len + i].argmax()
        ids = np.argsort(probas[context_len + i])[::-1]
        for token in ids:
            if token not in gen_tokens:
                gen_tokens.append(token)
                break
        words.append(tokenizer.decode([token]))
        token_ids[context_len + i] = token
        if token in self.end_id:
            sentence = ' '.join(words)
            return sentence
    sentence = ' '.join(words)
    sentence += '.'
    return sentence
def recognize(self, text):
    tokens = tokenizer.tokenize(text)
    # while len(tokens) > 512:
    #     tokens.pop(-2)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    nodes = model.predict([token_ids, segment_ids])[0]
    labels = self.decode(nodes)
    entities, starting = [], False
    for i, label in enumerate(labels):
        if label > 0:
            if label % 2 == 1:
                starting = True
                entities.append([[i], id2label[(label - 1) // 2]])
            else:
                if starting:
                    entities[-1][0].append(i)
                # else:
                #     starting = False
        else:
            starting = False
    ner_answer = []
    for w, l in entities:
        ner_answer.append([mapping[w[0]][0], mapping[w[-1]][-1] + 1, l])
    return ner_answer
def recognize(self, text, tokenizer, models, loader):
    tokens = tokenizer.tokenize(text)  # split the text into tokens
    mapping = tokenizer.rematch(text, tokens)  # map tokens back to character offsets in the original text
    token_ids = tokenizer.tokens_to_ids(tokens)  # map tokens to ids via the vocabulary
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    nodes = 0
    if isinstance(models, list):
        for model in models:
            nodes += model.predict([token_ids, segment_ids])[0]  # shape: [len(text), 27]
        nodes /= len(models)
    else:
        nodes = models.predict([token_ids, segment_ids])[0]  # shape: [len(text), 27]
    labels = self.decode(nodes)
    entities, starting = [], False
    for i, label in enumerate(labels):
        if label > 0:
            if label % 2 == 1:  # an odd label id marks the start of an entity
                starting = True
                entities.append([[i], loader.id2label[(label - 1) // 2]])  # the start label determines the entity category
            elif starting:
                entities[-1][0].append(i)  # append indices of tokens inside the current entity; the second element of each entry is the category name
            else:
                starting = False
        else:
            starting = False
    return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
            for w, l in entities]
def recognize(self, text):
    """Recognize named entities in `text`."""
    tokens = tokenizer.tokenize(text)
    while len(tokens) > 512:
        tokens.pop(-2)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    nodes = model.predict([token_ids, segment_ids])[0]
    labels = self.decode(nodes)
    entities, starting = [], False
    for i, label in enumerate(labels):
        if label > 0:
            if label % 2 == 1:
                starting = True
                entities.append([[i], id2label[(label - 1) // 2]])
            elif starting:
                entities[-1][0].append(i)
            else:
                starting = False
        else:
            starting = False
    return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
            for w, l in entities]
def extract_spoes(text, threshold=0):
    """Extract the (subject, predicate, object) triples contained in `text`."""
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    outputs = model.predict([token_ids, segment_ids])
    outputs = [o[0] for o in outputs]
    # Extract subjects and objects
    subjects, objects = set(), set()
    outputs[0][:, [0, -1]] -= np.inf
    outputs[0][:, :, [0, -1]] -= np.inf
    for l, h, t in zip(*np.where(outputs[0] > threshold)):
        if l == 0:
            subjects.add((h, t))
        else:
            objects.add((h, t))
    # Identify the corresponding predicate
    spoes = set()
    for sh, st in subjects:
        for oh, ot in objects:
            p1s = np.where(outputs[1][:, sh, oh] > threshold)[0]
            p2s = np.where(outputs[2][:, st, ot] > threshold)[0]
            ps = set(p1s) & set(p2s)
            for p in ps:
                spoes.add((
                    text[mapping[sh][0]:mapping[st][-1] + 1],
                    id2predicate[p],
                    text[mapping[oh][0]:mapping[ot][-1] + 1],
                ))
    return list(spoes)
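Assuming the surrounding script defines the usual globals (tokenizer, maxlen, model, id2predicate), a call might look like the short sketch below; the sample sentence is purely illustrative and the output naturally depends on the trained model.

# Illustrative call; relies on the script's globals (tokenizer, maxlen, model, id2predicate).
# Each returned item is a (subject_text, predicate_name, object_text) triple of raw text spans.
spo_list = extract_spoes(u'《三体》是刘慈欣创作的科幻小说', threshold=0)
print(spo_list)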
def recognize(self, text):
    # Build the model inputs
    tokens = tokenizer.tokenize(text)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    # segment_ids distinguish the sentences in the input; here the whole
    # input is a single sentence, so they are all 0
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    nodes = model.predict([token_ids, segment_ids])[0]
    labels = self.decode(nodes)
    entities, starting = [], False
    for i, label in enumerate(labels):
        if label > 0:
            if label % 2 == 1:
                starting = True
                entities.append([[i], id2label[(label - 1) // 2]])
            elif starting:
                entities[-1][0].append(i)
            else:
                starting = False
        else:
            starting = False
    return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
            for w, l in entities]
def extract_features(self, text: str):
    """Encoding test: print the extracted features for `text`."""
    token_ids, segment_ids = self.tokenizer.encode(u'{}'.format(text))
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    print("\n === features === \n")
    print(self.predict([token_ids, segment_ids]))
def vec2(tex):
    # Encoding test
    token_ids, segment_ids = tokenizer.encode(tex)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    tmp = model.predict([token_ids, segment_ids])[:, 0, :]
    # print(tmp)
    return tmp
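A quick sketch of how `vec2` might be called, assuming the module-level `tokenizer` and `model` are the usual bert4keras objects set up elsewhere in the script (as in the full example further below).

# Illustrative call; relies on the module-level tokenizer and model.
v = vec2(u'语言模型')   # sentence vector taken from the [CLS] position
print(v.shape)          # (1, 768) for a BERT-base checkpoint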
def vec2(tex):
    # Encoding test
    token_ids, segment_ids = tokenizer.encode(tex)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    global tmp
    # tmp = model.predict([token_ids, segment_ids])[:, 0, :]  # index 0 takes the [CLS] vector
    # NOTE: the encoded ids above are left unused; this variant delegates to the global `tmp` callable.
    tmp3 = tmp(tex)
    # print(tmp)
    return tmp3
def recognize(self, text, threshold=0):
    tokens = tokenizer.tokenize(text, maxlen=512)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    scores = model.predict([token_ids, segment_ids])[0]
    scores[:, [0, -1]] -= np.inf
    scores[:, :, [0, -1]] -= np.inf
    entities = []
    for l, start, end in zip(*np.where(scores > threshold)):
        entities.append((mapping[start][0], mapping[end][-1], categories[l]))
    return entities
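The decoding step above scans a (categories, seq_len, seq_len) score tensor; the toy sketch below, with made-up numbers, shows how `np.where(scores > threshold)` yields (label, start, end) triples.

# Toy illustration of the span decoding above (numbers are made up).
import numpy as np

scores = np.full((2, 5, 5), -np.inf)   # (categories, seq_len, seq_len)
scores[0, 1, 2] = 3.7                  # category 0, span covering tokens 1..2
scores[1, 3, 3] = 1.2                  # category 1, single-token span at 3
for l, start, end in zip(*np.where(scores > 0)):
    print(l, start, end)               # prints: 0 1 2  and  1 3 3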
def evaluate(data, Seq_ner, Tag_ner, model):
    """Evaluation function."""
    token_list, seq_list, tag_list = data
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for token, seq, tag in tqdm(zip(token_list, seq_list, tag_list)):
        token_ids = to_array([token])
        P = model.predict([token_ids])
        S, T = list(Seq_ner.decode(P[0][0])), list(Tag_ner.decode(P[1][0]))
        X += 1 if (S == seq) else 0
        Y += 1 if (T == tag) else 0
        Z += 1
    seq_acc, tag_acc = X / Z, Y / Z
    return seq_acc, tag_acc
def test_masked_predict(self):
    text = "科学技术是第一生产力"
    tokens, segs = self.tokenizer.transform(text)
    print(tokens)
    tokens[3] = tokens[4] = self.tokenizer.token2id(self.tokenizer.token_mask)
    print(tokens)
    tokens, segs = to_array([tokens], [segs])
    probs = self.model.predict([tokens, segs])[1][0]
    pred_ids = probs.argmax(axis=1)
    print(pred_ids)
    text = self.tokenizer.reverse_transform(list(pred_ids))
    print(text)
    self.assertEqual(text[3:5], '技术')
def predict_to_file(in_file, out_file):
    """Write predictions to a file.

    The result file can be submitted to https://www.cluebenchmarks.com for evaluation.
    """
    fw = open(out_file, 'w')
    with open(in_file) as fr:
        for l in tqdm(fr):
            l = json.loads(l)
            text = l['sentence']
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            token_ids, segment_ids = to_array([token_ids], [segment_ids])
            label = model.predict([token_ids, segment_ids])[0].argmax()
            l = json.dumps({'id': str(l['id']), 'label': str(label)})
            fw.write(l + '\n')
    fw.close()
def __iter__(self, random=False):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for is_end, (text, label) in self.sample(random):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        token_ids, segment_ids = to_array([token_ids, segment_ids])
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
        if len(batch_token_ids) == self.batch_size or is_end:
            batch_token_ids = sequence_padding(batch_token_ids)
            batch_segment_ids = sequence_padding(batch_segment_ids)
            batch_labels = sequence_padding(batch_labels)
            yield [batch_token_ids, batch_segment_ids], batch_labels
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
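If this `__iter__` belongs to a bert4keras `DataGenerator` subclass, which the `self.sample(random)` call suggests, it would typically be consumed through `forfit()` during training; the class and variable names in the sketch below are assumptions, so it is shown as comments only.

# Assumed context: this __iter__ overrides bert4keras.snippets.DataGenerator.
# A typical way to consume such a generator during training (names are illustrative):
#
#     train_generator = MyDataGenerator(train_data, batch_size=32)
#     model.fit(train_generator.forfit(), steps_per_epoch=len(train_generator), epochs=5)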
def tokenize(self, text):
    tokens = tokenizer.tokenize(text)
    while len(tokens) > 512:
        tokens.pop(-2)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    nodes = model.predict([token_ids, segment_ids])[0]
    labels = self.decode(nodes)
    words = []
    for i, label in enumerate(labels[1:-1]):
        if label < 2 or len(words) == 0:
            words.append([i + 1])
        else:
            words[-1].append(i + 1)
    return [text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1] for w in words]
def extract_spoes(text):
    """Extract the (subject, predicate, object) triples contained in `text`."""
    tokens = tokenizer.tokenize(text, maxlen=maxlen)
    mapping = tokenizer.rematch(text, tokens)
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    # Extract subjects
    subject_preds = subject_model.predict([token_ids, segment_ids])
    subject_preds[:, [0, -1]] *= 0
    start = np.where(subject_preds[0, :, 0] > 0.6)[0]
    end = np.where(subject_preds[0, :, 1] > 0.5)[0]
    subjects = []
    for i in start:
        j = end[end >= i]
        if len(j) > 0:
            j = j[0]
            subjects.append((i, j))
    if subjects:
        spoes = []
        token_ids = np.repeat(token_ids, len(subjects), 0)
        segment_ids = np.repeat(segment_ids, len(subjects), 0)
        subjects = np.array(subjects)
        # Feed in the subjects and extract objects and predicates
        object_preds = object_model.predict([token_ids, segment_ids, subjects])
        object_preds[:, [0, -1]] *= 0
        for subject, object_pred in zip(subjects, object_preds):
            start = np.where(object_pred[:, :, 0] > 0.6)
            end = np.where(object_pred[:, :, 1] > 0.5)
            for _start, predicate1 in zip(*start):
                for _end, predicate2 in zip(*end):
                    if _start <= _end and predicate1 == predicate2:
                        spoes.append((
                            (mapping[subject[0]][0], mapping[subject[1]][-1]),
                            predicate1,
                            (mapping[_start][0], mapping[_end][-1]),
                        ))
                        break
        return [
            (text[s[0]:s[1] + 1], id2predicate[p], text[o[0]:o[1] + 1])
            for s, p, o in spoes
        ]
    else:
        return []
def recognize(self, text):
    tokens = tokenizer.tokenize(text, maxlen=512)
    mapping = tokenizer.rematch(text, tokens)
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    nodes = model.predict([token_ids, segment_ids])[0]
    labels = self.decode(nodes)
    entities, starting = [], False
    for i, label in enumerate(labels):
        if label > 0:
            if label % 2 == 1:
                starting = True
                entities.append([[i], categories[(label - 1) // 2]])
            elif starting:
                entities[-1][0].append(i)
            else:
                starting = False
        else:
            starting = False
    return [(mapping[w[0]][0], mapping[w[-1]][-1], l) for w, l in entities]
def test_load_and_save(self):
    current_folder = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))
    bert_path = os.path.join(current_folder, 'assets', 'bert_sample_model')
    config_path = os.path.join(bert_path, 'bert_config.json')
    checkpoint_path = os.path.join(bert_path, 'bert_model.ckpt')
    dict_path = os.path.join(bert_path, 'vocab.txt')

    bert_model = build_transformer_model(config_path=config_path,
                                         checkpoint_path=checkpoint_path,
                                         model='bert',
                                         application='encoder',
                                         return_keras_model=True)
    tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer

    # Encoding test
    token_ids, segment_ids = tokenizer.encode(u'jack play all day')
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    print('\n ===== predicting =====\n')
    print(bert_model.predict([token_ids, segment_ids]))

    # Serialize model
    _ = bert_model.to_json()
)  # build the model and load the weights

sentences = []
init_sent = u'科学技术是第一生产力。'  # a given seed sentence, or None
minlen, maxlen = 8, 32
steps = 10000
converged_steps = 1000
vocab_size = tokenizer._vocab_size

if init_sent is None:
    length = np.random.randint(minlen, maxlen + 1)
    tokens = ['[CLS]'] + ['[MASK]'] * length + ['[SEP]']
    token_ids = tokenizer.tokens_to_ids(tokens)
    segment_ids = [0] * len(token_ids)
else:
    token_ids, segment_ids = tokenizer.encode(init_sent)
    length = len(token_ids) - 2

for _ in tqdm(range(steps), desc='Sampling'):
    # Gibbs sampling step: randomly mask one token, then re-sample it with the MLM model.
    i = np.random.choice(length) + 1
    token_ids[i] = tokenizer._token_mask_id
    probas = model.predict(to_array([token_ids], [segment_ids]))[0, i]
    token = np.random.choice(vocab_size, p=probas)
    token_ids[i] = token
    sentences.append(tokenizer.decode(token_ids))

print(u'Some of the random samples:')
for _ in range(10):
    print(np.random.choice(sentences[converged_steps:]))
def bert_feature_extract(txt):
    # Encoding test
    token_ids, segment_ids = tokenizer.encode(txt)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    # print()
    return model.predict([token_ids, segment_ids])[0][0]
def toids(s):
    token_ids, segment_ids = tokenizer.encode(s)
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    return [token_ids, segment_ids]
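A one-line usage sketch, assuming the module-level `tokenizer` and `model` defined elsewhere in that script; the sentence is illustrative.

# Illustrative call; relies on the module-level tokenizer and model used elsewhere in these snippets.
features = model.predict(toids(u'语言模型'))
print(features.shape)   # (1, seq_len, hidden_size)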
import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.snippets import to_array

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_transformer_model(config_path, checkpoint_path)  # build the model and load the weights

# Encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
token_ids, segment_ids = to_array([token_ids], [segment_ids])

print('\n ===== predicting =====\n')
print(model.predict([token_ids, segment_ids]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166   0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636  0.39056838]
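Since every snippet in this collection funnels its inputs through `to_array`, a minimal standalone check of what it does may help: it converts each positional argument to a numpy array, so wrapping the id lists in an extra list produces batch-shaped (1, seq_len) inputs. The ids below are toy values, not entries from a real vocabulary.

# Standalone sketch of to_array's effect on the inputs (toy ids).
from bert4keras.snippets import to_array

token_ids = [101, 2, 3, 4, 102]   # toy ids
segment_ids = [0, 0, 0, 0, 0]
t, s = to_array([token_ids], [segment_ids])
print(t.shape, s.shape)           # (1, 5) (1, 5) -- ready for model.predict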
def sentiment(text):
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)  # word encoding
    token_ids, segment_ids = to_array([token_ids], [segment_ids])
    y_pred = model.predict([token_ids, segment_ids]).argmax(axis=1)
    return y_pred
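A brief usage sketch, assuming the script's globals (tokenizer, model, maxlen) are in scope; the review text is illustrative and the returned class index depends on the label map used during training.

# Illustrative call; relies on the script's globals (tokenizer, model, maxlen).
print(sentiment(u'这家餐厅的菜很好吃'))   # an array holding the predicted class index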