import pickle
import time

import tensorflow as tf

from similarity import BertSim
# GetRelationPaths(entity) queries the KB for all relation paths of an entity;
# it is a project-local helper assumed to be imported from this repo's KB utilities.


class TupleExtractor(object):

    def __init__(self):
        # Load the cached entity -> relation-paths dict, if present.
        try:
            self.entity2relations_dic = pickle.load(
                open('../data/entity2relation_dic.pkl', 'rb'))
        except FileNotFoundError:
            self.entity2relations_dic = {}
        # Load the cached sentence-pair -> similarity dict, if present.
        try:
            self.sentencepair2sim = pickle.load(
                open('../data/sentencepair2sim_dic.pkl', 'rb'))
        except FileNotFoundError:
            self.sentencepair2sim = {}
        self.simmer = BertSim()
        self.simmer.set_mode(tf.estimator.ModeKeys.PREDICT)
        print('tuples extractor loaded')

    def extract_tuples(self, candidate_entitys, question):
        '''For each candidate entity, enumerate its KB relation paths and score
        each (question, verbalized path) pair with the BERT matcher.'''
        candidate_tuples = {}
        for entity in candidate_entitys:
            # Fetch all relation paths of this entity.
            starttime = time.time()
            relations = GetRelationPaths(entity)
            mention = candidate_entitys[entity][0]
            for r in relations:
                this_tuple = tuple([entity] + r)  # build the candidate tuple
                predicates = [relation[1:-1] for relation in r]  # list of relation names
                human_question = '的'.join([mention] + predicates)
                score = [entity] + candidate_entitys[entity][0:1]  # initialize features
                try:
                    sim2 = self.sentencepair2sim[question + human_question]
                except KeyError:
                    sim2 = self.simmer.predict(question, human_question)[0][1]
                    self.sentencepair2sim[question + human_question] = sim2
                score.append(sim2)
                candidate_tuples[this_tuple] = score
            print('==== Querying candidate relations and computing features took %.2fs ===='
                  % (time.time() - starttime))
        return candidate_tuples

    def GetCandidateAns(self, corpus):
        '''Starting from the mentions, collect all candidate entities and query
        the KB for candidate answers. Each candidate answer is a tuple
        (entity, relation1, relation2), which makes comparison with the gold
        answer straightforward.
        '''
        true_num = 0
        hop2_num = 0
        hop2_true_num = 0
        all_tuples_num = 0
        for i in range(len(corpus)):
            dic = corpus[i]
            question = dic['question']
            gold_tuple = dic['gold_tuple']
            gold_entitys = dic['gold_entitys']
            candidate_entitys = dic['candidate_entity_filter']
            candidate_tuples = self.extract_tuples(candidate_entitys, question)
            print(i)
            print(question)
            all_tuples_num += len(candidate_tuples)
            dic['candidate_tuples'] = candidate_tuples
            # Check whether the gold tuple is covered by the candidate tuples.
            if_true = 0
            for thistuple in candidate_tuples:
                if len(gold_tuple) == len(set(gold_tuple).intersection(set(thistuple))):
                    if_true = 1
                    break
            if if_true == 1:
                true_num += 1
                if len(gold_tuple) <= 3 and len(gold_entitys) == 1:
                    hop2_true_num += 1
            if len(gold_tuple) <= 3 and len(gold_entitys) == 1:
                hop2_num += 1
        print('Fraction of all questions whose candidates cover the gold query path: %.3f'
              % (true_num / len(corpus)))
        print('Fraction of single-entity questions whose candidates cover the gold query path: %.3f'
              % (hop2_true_num / hop2_num))
        print('Average number of candidate answers per question: %.3f'
              % (all_tuples_num / len(corpus)))
        pickle.dump(self.entity2relations_dic,
                    open('../data/entity2relation_dic.pkl', 'wb'))
        pickle.dump(self.sentencepair2sim,
                    open('../data/sentencepair2sim_dic.pkl', 'wb'))
        return corpus
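# Usage sketch — hedged: the corpus pickle paths below are hypothetical; only
# the dict keys ('question', 'gold_tuple', 'gold_entitys',
# 'candidate_entity_filter') come from what GetCandidateAns reads above.
if __name__ == '__main__':
    extractor = TupleExtractor()
    corpus = pickle.load(
        open('../data/candidate_entitys_filter_train.pkl', 'rb'))  # hypothetical path
    corpus = extractor.GetCandidateAns(corpus)
    pickle.dump(corpus,
                open('../data/candidate_tuples_train.pkl', 'wb'))  # hypothetical path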
import pickle
import time

import tensorflow as tf

from similarity import BertSim
# GetRelationPaths(entity) is the same project-local KB helper used above.


class TupleExtractor(object):

    def __init__(self):
        # Load caches, if present.
        try:
            self.entity2relations_dic = pickle.load(
                open('../data/entity2relation_dic.pkl', 'rb'))
        except FileNotFoundError:
            self.entity2relations_dic = {}
        # Load the fine-tuned TensorFlow text-matching model.
        self.simmer = BertSim()
        self.simmer.set_mode(tf.estimator.ModeKeys.PREDICT)
        print('BERT similarity model loaded')
        # Load the simple-vs-complex question classifier.
        # self.question_classify_model = get_model()
        print('question classification model loaded')
        print('tuples extractor loaded')

    def extract_tuples(self, candidate_entitys, question):
        '''Batch-score every (question, verbalized relation path) pair with BERT.'''
        candidate_tuples = {}
        entity_list = candidate_entitys.keys()  # ordered entity list
        inputs = []  # BERT inputs for all candidate paths
        for entity in entity_list:
            # Fetch all relation paths of this entity.
            relations = GetRelationPaths(entity)
            mention = candidate_entitys[entity][0]
            for r in relations:
                predicates = [relation[1:-1] for relation in r]  # list of relation names
                human_question = '的'.join([mention] + predicates)
                inputs.append((question, human_question))
        # Feed all paths to BERT to obtain scores.
        print('==== {} candidate paths in total ===='.format(len(inputs)))
        bert_scores = []
        batch_size = 128
        if len(inputs) % batch_size == 0:
            num_batches = len(inputs) // batch_size
        else:
            num_batches = len(inputs) // batch_size + 1
        starttime = time.time()
        for i in range(num_batches):
            begin = i * batch_size
            end = min(len(inputs), (i + 1) * batch_size)
            self.simmer.input_queue.put(inputs[begin:end])
            prediction = self.simmer.output_queue.get()
            bert_scores.extend([p[1] for p in prediction])
        print('==== Computing features for all paths took %.2fs ===='
              % (time.time() - starttime))
        index = 0
        for entity in entity_list:
            # Fetch the relation paths again, in the same order as above.
            relations = GetRelationPaths(entity)
            mention = candidate_entitys[entity][0]
            for r in relations:
                this_tuple = tuple([entity] + r)  # build the candidate tuple
                score = [entity] + candidate_entitys[entity]  # initialize features
                sim2 = bert_scores[index]
                index += 1
                score.append(sim2)
                candidate_tuples[this_tuple] = score
            print('==== Collected all candidate paths and features for entity %s ====' % (entity))
        return candidate_tuples

    def GetCandidateAns(self, corpus):
        '''Starting from the mentions, collect all candidate entities and query
        the KB for candidate answers. Each candidate answer is a tuple
        (entity, relation1, relation2), which makes comparison with the gold
        answer straightforward.
        '''
        true_num = 0
        hop2_num = 0
        hop2_true_num = 0
        all_tuples_num = 0
        for i in range(len(corpus)):
            dic = corpus[i]
            question = dic['question']
            gold_tuple = dic['gold_tuple']
            gold_entitys = dic['gold_entitys']
            candidate_entitys = dic['candidate_entity_filter']
            print(i)
            print(question)
            candidate_tuples = self.extract_tuples(candidate_entitys, question)
            all_tuples_num += len(candidate_tuples)
            dic['candidate_tuples'] = candidate_tuples
            # Check whether the gold tuple is covered by the candidate tuples.
            if_true = 0
            for thistuple in candidate_tuples:
                if len(gold_tuple) == len(set(gold_tuple) & set(thistuple)):
                    if_true = 1
                    break
            if if_true == 1:
                true_num += 1
                if len(gold_tuple) <= 3 and len(gold_entitys) == 1:
                    hop2_true_num += 1
            if len(gold_tuple) <= 3 and len(gold_entitys) == 1:
                hop2_num += 1
        print('Fraction of all questions whose candidates cover the gold query path: %.3f'
              % (true_num / len(corpus)))
        print('Fraction of single-entity questions whose candidates cover the gold query path: %.3f'
              % (hop2_true_num / hop2_num))
        print('Average number of candidate answers per question: %.3f'
              % (all_tuples_num / len(corpus)))
        pickle.dump(self.entity2relations_dic,
                    open('../data/entity2relation_dic.pkl', 'wb'))
        return corpus
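# The queue interface above amortizes estimator overhead by scoring 128 pairs
# per call instead of one pair per predict() call (compare the per-pair version
# earlier). The batch bookkeeping can be written more compactly with ceiling
# division — a behavior-preserving sketch using only the queue methods shown above:
def batch_scores(simmer, inputs, batch_size=128):
    scores = []
    num_batches = (len(inputs) + batch_size - 1) // batch_size  # ceil(len/size)
    for b in range(num_batches):
        chunk = inputs[b * batch_size:(b + 1) * batch_size]
        simmer.input_queue.put(chunk)            # enqueue one batch of pairs
        prediction = simmer.output_queue.get()   # blocking read of batch predictions
        scores.extend(p[1] for p in prediction)  # keep the "match" probability
    return scores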
import sys

import numpy as np
import pandas as pd
import urllib.request
import urllib.parse
import tensorflow as tf

from db import load_data_kudu
from global_config import Logger

sys.path.append('/home/mqq/zwshi/bert/')
from similarity import BertSim
# On module imports, see https://blog.csdn.net/xiongchengluo1129/article/details/80453599

loginfo = Logger("recommend_articles.log", "info")
file = "./NERdata/q_t_a_testing_predict.txt"

bs = BertSim()
bs.set_mode(tf.estimator.ModeKeys.PREDICT)


def dataset_test():
    '''
    Query the KB with the entity + attribute from the training QA pairs,
    to test the upper bound on QA accuracy.
    :return:
    '''
    with open(file) as f:
        total = 0
        recall = 0
        correct = 0
        for line in f:
            question, entity, attribute, answer, ner = line.split("\t")
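# A hedged helper for scoring one (question, attribute) pair: only the shape of
# bs.predict(...)[0][1] is taken from the other modules in this repo; the helper
# itself is an assumption, not part of the original file.
def score_pair(question, attribute):
    return bs.predict(question, attribute)[0][1]  # probability the pair matches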
import time

import torch
from transformers import BertTokenizer
# BertSim, BERT_ID, predict and get_relation_paths are project-local: the
# similarity model, its base checkpoint id, the scoring helper, and the KB
# path lookup, respectively.


class TupleExtractor(object):

    def __init__(self):
        # Load the fine-tuned text-matching model.
        self.simmer = BertSim()
        self.tokenizer = BertTokenizer.from_pretrained(BERT_ID)
        self.device = torch.device('cuda:0')
        self.simmer.load_state_dict(torch.load('../data/model/similarity.pt'))
        self.simmer.to(self.device)
        print('BERT similarity model loaded')
        print('tuple extractor loaded')

    def extract_tuples(self, candidate_entitys, question, entity2relations):
        '''Score every (question, verbalized relation path) pair with the
        PyTorch BERT matcher.'''
        candidate_tuples = {}
        entity_list = candidate_entitys.keys()  # ordered entity list
        count, st = 0, time.time()
        for entity in entity_list:
            mention = candidate_entitys[entity][0]
            relations = entity2relations[entity]
            for r in relations:
                # list of relation names
                predicates = [relation[1:-1] for relation in r]
                human_question = '的'.join([mention] + predicates)
                logits = predict(self.simmer, self.tokenizer, self.device,
                                 question, human_question)
                sim = logits[0][1].item()
                this_tuple = tuple([entity] + r)  # e, [r | r1, r2]
                # [entity, mention, feats]
                feature = [entity] + candidate_entitys[entity] + [sim]
                candidate_tuples[this_tuple] = feature
                count += 1
        print('==== {} candidate paths in total ===='.format(count))
        print('==== Computing features for all paths took %.2fs ====' % (time.time() - st))
        return candidate_tuples

    def get_candidate_ans(self, corpus):
        '''Starting from the mentions, collect all candidate entities and query
        the KB for candidate answers. Each candidate answer is a tuple
        (entity, relation1, relation2), which makes comparison with the gold
        answer straightforward.
        '''
        true_num = 0
        hop2_num = 0
        hop2_true_num = 0
        all_tuples_num = 0
        relation_list, st = [], time.time()
        for i, item in enumerate(corpus):
            print(i)
            candidate_entity = item['candidate_entity_filter']
            entity_relation = dict()
            for e in candidate_entity:
                ret = get_relation_paths(e)
                entity_relation[e] = ret
                print('entity %s: found %d candidate paths' % (e, len(ret)))
            relation_list.append(entity_relation)
            print()
        print('query time: %.2fs' % (time.time() - st))
        for i in range(len(corpus)):
            dic = corpus[i]
            question = dic['question']
            gold_entities = dic['gold_entities']
            gold_relations = dic['gold_relations']
            gold_tuple = tuple(gold_entities + gold_relations)
            candidate_entitys = dic['candidate_entity_filter']
            relations = relation_list[i]
            print(i)
            print(question)
            candidate_tuples = self.extract_tuples(candidate_entitys, question,
                                                   relations)
            all_tuples_num += len(candidate_tuples)
            dic['candidate_tuples'] = candidate_tuples
            corpus[i] = dic
            # Check whether the gold tuple is covered by the candidate tuples.
            if_true = 0
            for thistuple in candidate_tuples:
                if len(gold_tuple) == len(set(gold_tuple) & set(thistuple)):
                    if_true = 1
                    break
            if if_true == 1:
                true_num += 1
                if len(gold_tuple) <= 3 and len(gold_entities) == 1:
                    hop2_true_num += 1
            if len(gold_tuple) <= 3 and len(gold_entities) == 1:
                hop2_num += 1
        print('Fraction of all questions whose candidates cover the gold query path: %.3f'
              % (true_num / len(corpus)))
        print('Fraction of single-entity questions whose candidates cover the gold query path: %.3f'
              % (hop2_true_num / hop2_num))
        print('Average number of candidate answers per question: %.3f'
              % (all_tuples_num / len(corpus)))
        return corpus
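# `predict` is a project-local helper not shown here. A minimal sketch of what
# it presumably does, assuming BertSim is a sequence-pair classifier returning
# raw (1, 2) logits: the tokenizer call is standard transformers API, the model
# forward signature is an assumption.
def predict(model, tokenizer, device, seq_a, seq_b):
    model.eval()
    enc = tokenizer(seq_a, seq_b, return_tensors='pt', truncation=True,
                    max_length=128)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc)  # assumed to return raw (1, 2) logits
    return torch.softmax(logits, dim=-1)  # caller reads [0][1] as the similarity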
from similarity import BertSim
import tensorflow as tf

bs = BertSim()
bs.set_mode(tf.estimator.ModeKeys.TRAIN)
bs.train()
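# After training, the same class serves inference — a hedged sketch mirroring
# the pattern used by the modules above (bs.predict(...)[0][1] is the match
# probability); the example strings are illustrative only.
bs.set_mode(tf.estimator.ModeKeys.PREDICT)
score = bs.predict('姚明的身高是多少', '姚明的身高')[0][1]
print('match probability: %.3f' % score)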
import pickle
from typing import Dict

import tensorflow as tf

from similarity import BertSim
# Candidate (which provides _load_dict) and Corpus are project-local classes
# defined elsewhere in this repo.


class AnswerCandidate(Candidate):

    def __init__(self,
                 entity2relations_dict='data/entity2relations_dict.pkl',
                 seqPair2similarity_dict='data/seqPair2similarity_dict.pkl'):
        self._entity2relations = self._load_dict(entity2relations_dict)
        self._seqPair2similarity = self._load_dict(seqPair2similarity_dict)
        self._similarity_dict_path = seqPair2similarity_dict
        self._relation_paths_dict_path = entity2relations_dict
        self._model = BertSim()
        self._model.mode = tf.estimator.ModeKeys.PREDICT

    def _similarity_of(self, faked, seq):
        # Memoize BERT similarity scores, keyed on the concatenated pair.
        k = faked + seq
        if k not in self._seqPair2similarity:
            self._seqPair2similarity[k] = self._model.predict(faked, seq)
        return self._seqPair2similarity[k]

    def _relation_paths_of(self, entity):
        if entity not in self._entity2relations:
            return []
        return self._entity2relations[entity]

    def _candidates_of(self, entity2feats, question):
        answer2feats = {}
        for entity, feats in entity2feats.items():
            relation_paths = self._relation_paths_of(entity)
            if not relation_paths:
                continue
            mention = feats[0]
            for relations in relation_paths:
                answer = (entity, *relations)
                predicates = [spo[1:-1] for spo in relations]
                hypothesis = '的'.join([mention] + predicates)
                answer2feats[answer] = [
                    entity, mention,
                    self._similarity_of(hypothesis, question)
                ]
        return answer2feats

    def candidates_of(self, subject2feats: Dict[str, list], question: str):
        return self._candidates_of(subject2feats, question)

    def add_candidates_to_corpus(self, corpus: Corpus):
        num_answers = 0.0
        num_2hop = 0.0
        num_cover = {'all': 0.0, '2hop': 0.0}
        for i, sample in enumerate(corpus):
            question = sample['question']
            gold_answer = sample['gold_tuple']
            gold_entities = sample['gold_entitys']
            subject_linked = sample['subject_linked']
            candidate_answers = self._candidates_of(subject_linked, question)
            num_answers += len(candidate_answers)
            sample['candidate_answer'] = candidate_answers
            ever_cover = False
            for answer in candidate_answers:
                if set(answer).issuperset(gold_answer):
                    ever_cover = True
                    print('* Question: ({}){}\n*\tAnswer: {}'.format(
                        i, question, answer))
                    break
            if ever_cover:
                num_cover['all'] += 1
                if len(gold_answer) <= 3 and len(gold_entities) == 1:
                    num_cover['2hop'] += 1
            if len(gold_answer) <= 3 and len(gold_entities) == 1:
                num_2hop += 1
            # if i > 500 and i % 500 == 0:
            #     print(">>> Caching query dict... <<< ")
            #     self.cache_similarity_query()
            #     self.cache_relation_paths()
        print("* For {}".format(corpus.name))
        print('* Cover ratio in all questions: {:.2f}'.format(
            num_cover['all'] / len(corpus)))
        print('* Cover ratio in single-entity questions: {:.2f}'.format(
            num_cover['2hop'] / num_2hop))
        print('* Averaged candidates per question: {:.2f}'.format(
            num_answers / len(corpus)))
        return corpus

    def cache_similarity_query(self):
        with open(self._similarity_dict_path, 'wb') as f:
            pickle.dump(self._seqPair2similarity, f)

    def cache_relation_paths(self):
        with open(self._relation_paths_dict_path, 'wb') as f:
            pickle.dump(self._entity2relations, f)
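# Illustrative driver — a hedged sketch: only the method names come from the
# class above; how a Corpus instance is loaded is project-specific and assumed.
candidate = AnswerCandidate()
corpus = candidate.add_candidates_to_corpus(corpus)  # corpus: a prepared Corpus
candidate.cache_similarity_query()  # persist memoized BERT similarity scores
candidate.cache_relation_paths()    # persist memoized KB relation paths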