def create_pandas_kb_alias_data():
    """Build the subject/alias -> subject_id lookup table and save it as CSV.

    Rows are collected from two sources:

    1. The knowledge-base file (one JSON object per line): every subject is
       added, followed by each alias whose normalised text has not been
       registered yet.
    2. The training file (one JSON object per line): every mention whose
       normalised text equals neither the ``subject`` nor any ``alias`` of
       the entity that ``kb_dict`` holds for its ``kb_id``, and whose text
       has not been registered yet.

    Normalisation is ``com_utils.cht_to_chs`` applied to the lower-cased text
    (kb subjects and aliases are additionally stripped).  The table is
    written to ``fileConfig.dir_kb_info + fileConfig.file_kb_pandas_alias_data``
    with the columns ``subject_id`` and ``subject``.
    """
    kb_path = fileConfig.dir_data + fileConfig.file_kb_data
    train_path = fileConfig.dir_data + fileConfig.file_train_data

    subject_id_list = []
    subject_list = []
    # Normalised surface forms that already have a row in the table.
    seen_subjects = set()

    # Both inputs are opened up front so a missing file fails before any work
    # is done; the context manager guarantees the handles are closed.
    with open(kb_path, 'r', encoding='utf-8') as kb_file, \
            open(train_path, 'r', encoding='utf-8') as train_file:
        kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)

        # from kb file
        for line in tqdm(kb_file, desc='deal kb_file'):
            jstr = ujson.loads(line)
            subject_id = jstr['subject_id']
            subject = com_utils.cht_to_chs(jstr['subject'].strip().lower())
            # The subject itself is always added, even if its text was seen.
            subject_id_list.append(subject_id)
            subject_list.append(subject)
            alias = jstr['alias']
            seen_subjects.add(subject)
            for alia in alias:
                alia_str = com_utils.cht_to_chs(alia.strip().lower())
                if alia_str in seen_subjects:
                    continue
                seen_subjects.add(alia_str)
                subject_id_list.append(subject_id)
                subject_list.append(alia_str)

        # from train file
        for line in tqdm(train_file, desc='deal train file'):
            jstr = ujson.loads(line)
            for mention in jstr['mention_data']:
                mention_text = com_utils.cht_to_chs(mention['mention'].lower())
                kb_id = mention['kb_id']
                kb_entity = kb_dict.get(kb_id)
                is_match = False
                if kb_entity is not None:
                    kb_subject = kb_entity['subject']
                    kb_alias = kb_entity['alias']
                    # The mention is already covered when it equals the
                    # subject or one of the aliases stored in kb_dict.
                    is_match = kb_subject == mention_text or any(
                        alia == mention_text for alia in kb_alias)
                # NOTE(review): a mention whose kb_id is missing from kb_dict
                # keeps is_match == False and is therefore registered as well
                # - confirm this is intended.
                if is_match or mention_text in seen_subjects:
                    continue
                seen_subjects.add(mention_text)
                subject_id_list.append(kb_id)
                subject_list.append(mention_text)

    pandas_dict = {'subject_id': subject_id_list, 'subject': subject_list}
    df = pd.DataFrame.from_dict(pandas_dict)
    df.to_csv(fileConfig.dir_kb_info + fileConfig.file_kb_pandas_alias_data)
    print("success create pandas kb alia data file")
def create_fasttext_sup_train_data(index, train_data_file, kb_dict_file, kb_alia_file, stopword_file, out_file,
                                   mode=fasttextConfig.create_data_word):
    """Write fastText supervised training pairs for one data split.

    For every mention in the training file a positive example is written for
    the entity its ``kb_id`` resolves to in the kb dict, followed by up to
    three negative examples built from other entities that share the
    mention's surface form in the alias table.

    Args:
        index: Label of the split; only used in progress/log messages.
        train_data_file: Path of the JSON-lines training file.  Each record
            provides ``text``, ``text_id`` and ``mention_data``.
        kb_dict_file: Path of the pickled kb dict (``kb_id`` -> entity with a
            ``text`` field).
        kb_alia_file: Path of the alias CSV with ``subject`` and
            ``subject_id`` columns.
        stopword_file: Path handed to ``data_utils.get_stopword_list``.
        out_file: Path of the training file to write (overwritten).
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("create {} sup train data".format(index))
    kb_alias_df = pd.read_csv(kb_alia_file)
    stopwords = data_utils.get_stopword_list(stopword_file)
    with open(train_data_file, 'r', encoding='utf-8') as train_file:
        train_datas = train_file.readlines()
    kb_dict = com_utils.pickle_load(kb_dict_file)
    text_ids = {}
    # Cap for negatives per mention and for how often one text_id is used.
    max_extend_countd = 3

    # The context manager closes the output even if a record is malformed.
    with open(out_file, 'w', encoding='utf-8') as train_out_file:
        for line in tqdm(train_datas, desc='deal {} train file'.format(index)):
            jstr = ujson.loads(line)
            text = jstr['text']
            text_id = jstr['text_id']
            # Skip records whose text_id counter has reached the cap.
            if text_ids.get(text_id) == max_extend_countd:
                continue
            for mention in jstr['mention_data']:
                mention_id = mention['kb_id']
                mention_text = mention['mention']
                neighbor_text = com_utils.get_neighbor_sentence(text, mention_text)

                # true values
                kb_entity = kb_dict.get(mention_id)
                if kb_entity is not None:
                    out_str = com_utils.get_entity_mention_pair_text(
                        kb_entity['text'], neighbor_text, stopwords, cut_client,
                        fasttextConfig.label_true, mode)
                    train_out_file.write(out_str)

                # false values
                # NOTE(review): the lookup text is not lower-cased here -
                # confirm this matches how the alias table was normalised.
                alias_df = kb_alias_df[kb_alias_df['subject'] == com_utils.cht_to_chs(mention_text)]
                alia_ids = []
                for _, item in alias_df.iterrows():
                    a_id = str(item['subject_id'])
                    if a_id != mention_id:
                        alia_ids.append(a_id)
                        if len(alia_ids) == max_extend_countd:
                            break
                for alia_id in alia_ids:
                    alia_entity = kb_dict.get(alia_id)
                    if alia_entity is not None:
                        out_str = com_utils.get_entity_mention_pair_text(
                            alia_entity['text'], neighbor_text, stopwords, cut_client,
                            fasttextConfig.label_false, mode)
                        train_out_file.write(out_str)

            # add text (presumably bumps the per-text_id counter - see
            # com_utils.dict_add)
            text_ids = com_utils.dict_add(text_ids, text_id)
def get_kb_text(kb_str, cut_client, stopwords):
    """Flatten one knowledge-base record into a single space-separated string.

    The result starts with the record's subject, followed for every entry of
    ``kb_str['data']`` by its predicate and the segmented tokens of its
    object.

    Args:
        kb_str: Record providing ``subject`` (str) and ``data`` (iterable of
            dicts with ``predicate`` and ``object``).
        cut_client: Object whose ``cut_text(text)`` yields the tokens of
            ``text``.
        stopwords: Object with a dict-like ``get``; a token is dropped when
            ``get(token)`` is not ``None``.

    Returns:
        The flattened text without a trailing separator space.
    """
    kb_datas = kb_str['data']
    parts = [kb_str['subject'], ' ']
    for kb_data in kb_datas:
        parts.append(kb_data['predicate'])
        parts.append(' ')
        for text in cut_client.cut_text(kb_data['object']):
            if stopwords.get(text) is None and text != ' ':
                parts.append(com_utils.cht_to_chs(text.strip('\n')))
                # Purely numeric tokens are not followed by a separator.
                if not text.isdigit():
                    parts.append(' ')
    result = ''.join(parts)
    # Only drop the trailing separator.  Slicing unconditionally would cut the
    # last digit whenever the text ends with a numeric token.
    return result[:-1] if result.endswith(' ') else result
def create_pandas_kb_data():
    """Convert the JSON-lines knowledge-base file into a CSV table.

    Every record contributes one row with the columns ``subject_id``,
    ``subject`` (lower-cased and passed through ``com_utils.cht_to_chs``),
    ``type`` and ``data``.  The table is written to
    ``fileConfig.dir_kb_info + fileConfig.file_kb_pandas_csv``.
    """
    subject_id_list = []
    subject_list = []
    type_list = []
    data_list = []
    # Context manager so the kb file handle is always released.
    with open(fileConfig.dir_data + fileConfig.file_kb_data, 'r', encoding='utf-8') as kb_file:
        for line in tqdm(kb_file, desc='deal kb file'):
            jstr = ujson.loads(line)
            subject_id_list.append(jstr['subject_id'])
            subject_list.append(com_utils.cht_to_chs(jstr['subject'].lower()))
            type_list.append(jstr['type'])
            data_list.append(jstr['data'])
    pandas_dict = {
        'subject_id': subject_id_list,
        'subject': subject_list,
        'type': type_list,
        'data': data_list,
    }
    df = pd.DataFrame.from_dict(pandas_dict)
    df.to_csv(fileConfig.dir_kb_info + fileConfig.file_kb_pandas_csv)
    print("success create pandas kb file")
def get_all_text(subject, datas):
    """Return the subject and all predicate/object pairs as one string.

    The subject is passed through ``com_utils.cht_to_chs``; the predicates
    and objects of ``datas`` are appended unchanged, in order.  All pieces
    are separated by a single space and there is no trailing space.
    """
    pieces = [com_utils.cht_to_chs(subject)]
    for entry in datas:
        pieces.extend((entry['predicate'], entry['object']))
    return ' '.join(pieces)
def create_dev_mention_cands_data(index, mention_file, pd_file, alia_kb_df, out_file):
    """Attach knowledge-base candidates to every mention and pickle the result.

    For each mention the normalised text (lower-cased, ``cht_to_chs``,
    ``complete_brankets``) is looked up twice:

    1. against the ``subject`` column of the kb table, and
    2. against the ``subject`` column of the alias table, whose matching
       ``subject_id`` values are then resolved in the kb table.

    Each kb row found becomes one candidate dict (``cand_id``,
    ``cand_subject``, ``cand_text``, ``cand_type``); a kb id is added at most
    once per mention.  The candidates are stored under ``mention['cands']``.

    Args:
        index: Label of the split; only used in progress/log messages.
        mention_file: Path of the pickled list of records with
            ``mention_data``.
        pd_file: Path of the kb CSV (``subject_id``, ``subject``, ``type``,
            ``data`` columns are read).
        alia_kb_df: Path of the alias CSV (``subject``, ``subject_id``).
        out_file: Path the enriched records are pickled to.
    """
    print("start create {} mention cands".format(index))
    dev_mention_data = com_utils.pickle_load(mention_file)
    print("{} data length is {}".format(index, len(dev_mention_data)))
    pd_df = pandas.read_csv(pd_file)
    # fillna returns a new frame; without re-binding the call has no effect.
    alias_kb_frame = pandas.read_csv(alia_kb_df).fillna('')

    for dev_data in tqdm(dev_mention_data, desc='find {} cands'.format(index)):
        for mention in dev_data['mention_data']:
            mention_text = mention['mention']
            if mention_text is None:
                continue
            mention_text_proc = com_utils.cht_to_chs(mention_text.lower())
            mention_text_proc = com_utils.complete_brankets(mention_text_proc)

            cands = []
            cand_ids = set()

            # match subjects of the kb table directly
            subject_df = data_utils.pandas_query(pd_df, 'subject', mention_text_proc)
            for _, item in subject_df.iterrows():
                s_id = str(item['subject_id'])
                if s_id in cand_ids:
                    continue
                cand_ids.add(s_id)
                cands.append(_kb_row_to_cand(item))

            # match alias: collect the distinct ids in first-seen order
            alias_df = data_utils.pandas_query(alias_kb_frame, 'subject', mention_text_proc)
            alias_subject_ids = list(dict.fromkeys(
                str(item['subject_id']) for _, item in alias_df.iterrows()))

            # resolve the alias ids in the kb table
            for alia_id in alias_subject_ids:
                kb_rows = pd_df[pd_df['subject_id'] == int(alia_id)]
                for _, item in kb_rows.iterrows():
                    b_id = str(item['subject_id'])
                    if b_id in cand_ids:
                        continue
                    cand_ids.add(b_id)
                    cands.append(_kb_row_to_cand(item))

            mention['cands'] = cands

    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create {} dev data with mention and cands!".format(index))


def _kb_row_to_cand(item):
    """Turn one row of the kb table into a candidate dict."""
    return {
        'cand_id': str(item['subject_id']),
        'cand_subject': item['subject'],
        # 'data' and 'type' are stored in the CSV as Python literals.
        'cand_text': data_utils.get_all_text(item['subject'], ast.literal_eval(item['data'])),
        'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type'])),
    }
def eval_sup(mode=fasttextConfig.create_data_word):
    """Disambiguate the eval candidates and write one JSON result per line.

    For every mention with candidates:

    1. the supervised fastText model keeps the candidates it labels
       ``fasttextConfig.label_true`` (all candidates are kept when none is);
    2. ``gensim_get_sim`` scores each remaining candidate text against the
       whole input text; scores below
       ``fasttextConfig.min_entity_similarity_threshold`` are dropped and the
       highest-scoring candidate becomes the mention's ``kb_id``.

    Afterwards a mention is removed when its offset differs from, but lies
    inside the span of, another mention; duplicates and mentions listed in
    ``comConfig.punctuation`` are removed too.  The remaining mentions are
    sorted with ``get_mention_offset`` and written to
    ``fileConfig.dir_result + fileConfig.file_result_eval_data``.

    Args:
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("start use the fasttext/supervised model to predict eval data")
    if not os.path.exists(fileConfig.dir_result):
        os.mkdir(fileConfig.dir_result)
    unsup_model = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)

    dev_path = fileConfig.dir_ner + fileConfig.file_ner_eval_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_eval_data
    # Context managers make sure the result file is flushed and closed.
    with open(dev_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        # entity disambiguation
        for line in tqdm(dev_file, 'entity diambiguation'):
            if len(line.strip('\n')) == 0:
                continue
            jstr = ujson.loads(line)
            dev_entity = {}
            text = com_utils.cht_to_chs(jstr['text'].lower())
            dev_entity['text_id'] = jstr['text_id']
            dev_entity['text'] = jstr['text']
            mentions = []
            for mention in jstr['mention_data']:
                mention_text = mention['mention']
                if mention_text is None:
                    continue
                cands = mention['cands']
                if len(cands) == 0:
                    continue

                # use supervised model to choose mention
                # The neighbour sentence only depends on text and mention,
                # so it is computed once instead of once per candidate.
                neighbor_text = com_utils.get_neighbor_sentence(
                    text, com_utils.cht_to_chs(mention_text.lower()))
                supervise_cands = []
                for cand in cands:
                    cand_entity = kb_dict.get(cand['cand_id'])
                    if cand_entity is None:
                        continue
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()),
                        neighbor_text, stopwords, cut_client, mode=mode)
                    result = sup_model.predict(out_str.strip('\n'))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
                # Fall back to every candidate when none was accepted.
                if len(supervise_cands) == 0:
                    supervise_cands = cands

                # unsupervised model chooses the item
                score_list = []
                for cand in supervise_cands:
                    score = gensim_get_sim(
                        unsup_model, text,
                        com_utils.cht_to_chs(cand['cand_text'].lower()), stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)

                # find the best cand
                if len(score_list) > 0:
                    max_cand = score_list[0]
                    mentions.append({
                        'kb_id': max_cand['cand_id'],
                        'mention': mention['mention'],
                        'offset': mention['offset']
                    })

            # optim mentions: collect mentions that start inside another one
            delete_mentions = []
            mentions.sort(key=get_mention_len)
            for optim_mention in mentions:
                mention_offset = int(optim_mention['offset'])
                mention_len = len(optim_mention['mention'])
                for sub_mention in mentions:
                    sub_offset = int(sub_mention['offset'])
                    if mention_offset != sub_offset and sub_offset in range(
                            mention_offset, mention_offset + mention_len):
                        if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                            delete_mentions.append(sub_mention)
            if len(delete_mentions) > 0:
                mentions = [
                    optim_mention for optim_mention in mentions
                    if not data_utils.is_mention_already_in_list(delete_mentions, optim_mention)
                ]

            # drop duplicates and punctuation-only mentions
            change_mentions = []
            for optim_mention in mentions:
                if not data_utils.is_mention_already_in_list(
                        change_mentions, optim_mention
                ) and optim_mention['mention'] not in comConfig.punctuation:
                    change_mentions.append(optim_mention)
            mentions = change_mentions
            mentions.sort(key=get_mention_offset)

            dev_entity['mention_data'] = mentions
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')
    print("success create supervised eval result")
def test_sup(mode=fasttextConfig.create_data_word):
    """Disambiguate the test candidates, write the results and report P/R/F1.

    For every mention with candidates:

    1. the supervised fastText model keeps the candidates it labels
       ``fasttextConfig.label_true`` (all candidates are kept when none is);
    2. ``gensim_get_sim`` scores each remaining candidate text against the
       whole input text; scores below
       ``fasttextConfig.min_entity_similarity_threshold`` are dropped and the
       highest-scoring candidate becomes the mention's ``kb_id``.

    Afterwards a mention is removed when it starts inside the span of another
    mention at a different offset, or when it shares its offset with a
    longer mention; duplicates and mentions listed in
    ``comConfig.punctuation`` are removed too.  Results are written as JSON
    lines to ``fileConfig.dir_result + fileConfig.file_result_fasttext_test``.

    Precision is correct / generated mentions, recall is correct / gold
    mentions whose ``kb_id`` is not ``'NIL'``.  A metric whose denominator is
    zero is reported as 0.0.

    Args:
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("start use the fasttext model/supervise model to predict test data")
    if not os.path.exists(fileConfig.dir_result):
        os.mkdir(fileConfig.dir_result)
    # Only the gensim vectors are used for the similarity score.
    unsup_model_gensim = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)

    # f1 params
    gen_mention_count = 0
    original_mention_count = 0
    correct_mention_count = 0

    dev_path = fileConfig.dir_ner + fileConfig.file_ner_test_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_fasttext_test
    # Context managers make sure the result file is flushed and closed.
    with open(dev_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        # entity disambiguation
        for line in tqdm(dev_file, 'entity diambiguation'):
            jstr = ujson.loads(line)
            dev_entity = {}
            text = com_utils.cht_to_chs(jstr['text'].lower())
            dev_entity['text_id'] = jstr['text_id']
            dev_entity['text'] = jstr['text']
            mention_data = jstr['mention_data']
            original_mention_data = jstr['mention_data_original']
            mentions = []
            for mention in mention_data:
                mention_text = mention['mention']
                if mention_text is None:
                    continue
                cands = mention['cands']
                if len(cands) == 0:
                    continue

                # use supervised model to choose mention
                # The neighbour sentence only depends on text and mention,
                # so it is computed once instead of once per candidate.
                neighbor_text = com_utils.get_neighbor_sentence(
                    text, com_utils.cht_to_chs(mention_text.lower()))
                supervise_cands = []
                for cand in cands:
                    cand_entity = kb_dict.get(cand['cand_id'])
                    if cand_entity is None:
                        continue
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()),
                        neighbor_text, stopwords, cut_client, mode=mode)
                    result = sup_model.predict(out_str.replace('\n', ' '))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
                # Fall back to every candidate when none was accepted.
                if len(supervise_cands) == 0:
                    supervise_cands = cands

                # unsupervised model chooses the item
                score_list = []
                for cand in supervise_cands:
                    score = gensim_get_sim(
                        unsup_model_gensim, text,
                        com_utils.cht_to_chs(cand['cand_text'].lower()), stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)

                # find the best cand
                if len(score_list) > 0:
                    max_cand = score_list[0]
                    mentions.append({
                        'kb_id': max_cand['cand_id'],
                        'mention': mention['mention'],
                        'offset': mention['offset']
                    })

            # optim mentions: collect nested and same-offset shorter mentions
            delete_mentions = []
            mentions.sort(key=get_mention_len)
            for outer_mention in mentions:
                mention_offset = int(outer_mention['offset'])
                mention_len = len(outer_mention['mention'])
                for sub_mention in mentions:
                    sub_offset = int(sub_mention['offset'])
                    starts_inside = mention_offset != sub_offset and sub_offset in range(
                        mention_offset, mention_offset + mention_len)
                    shorter_at_same_offset = (
                        mention_offset == sub_offset
                        and mention_len > len(sub_mention['mention']))
                    if starts_inside or shorter_at_same_offset:
                        if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                            delete_mentions.append(sub_mention)
            if len(delete_mentions) > 0:
                mentions = [
                    kept for kept in mentions
                    if not data_utils.is_mention_already_in_list(delete_mentions, kept)
                ]

            # drop duplicates and punctuation-only mentions
            change_mentions = []
            for kept in mentions:
                if not data_utils.is_mention_already_in_list(
                        change_mentions, kept
                ) and kept['mention'] not in comConfig.punctuation:
                    change_mentions.append(kept)
            mentions = change_mentions
            mentions.sort(key=get_mention_offset)

            # calc f1
            for kept in mentions:
                if is_find_correct_entity(kept['kb_id'], original_mention_data):
                    correct_mention_count += 1
            gen_mention_count += len(mentions)
            for orginal_mention in original_mention_data:
                if orginal_mention['kb_id'] != 'NIL':
                    original_mention_count += 1

            # out result
            dev_entity['mention_data'] = mentions
            dev_entity['mention_data_original'] = original_mention_data
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')

    # Guard every division: an empty input or a run without any predicted
    # or correct mention must not raise ZeroDivisionError.
    precision = correct_mention_count / gen_mention_count if gen_mention_count else 0.0
    recall = correct_mention_count / original_mention_count if original_mention_count else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    print("success create test result, p:{:.4f} r:{:.4f} f1:{:.4f}".format(
        precision, recall, f1))
def create_fasttext_unsup_train_data():
    """Build the corpus used to train the unsupervised fastText model.

    One sentence is produced per record of three JSON-lines inputs:

    * kb file: the text returned by ``data_utils.get_kb_text``;
    * train file: the text is segmented piecewise between the annotated
      mentions, and each mention is inserted unsegmented at its offset;
    * dev file: the lower-cased text, segmented as a whole.

    The sentences are written to
    ``fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data``,
    separated by newlines and without a newline after the last one.
    """
    print("start create unsup fasttext data...")
    if not os.path.exists(fileConfig.dir_fasttext):
        os.mkdir(fileConfig.dir_fasttext)

    kb_path = fileConfig.dir_data + fileConfig.file_kb_data
    train_path = fileConfig.dir_data + fileConfig.file_train_data
    dev_path = fileConfig.dir_data + fileConfig.file_dev_data
    out_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data

    # All files are opened up front so a missing input fails immediately;
    # the context manager guarantees they are closed and the output flushed.
    with open(kb_path, 'r', encoding='utf-8') as kb_datas, \
            open(train_path, 'r', encoding='utf-8') as train_datas, \
            open(dev_path, 'r', encoding='utf-8') as dev_datas, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        stopword_list = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
        print("prepare train data")
        train_sentence = []

        # kb data
        for line in tqdm(kb_datas, desc='deal kb data'):
            jstr = ujson.loads(line)
            train_sentence.append(data_utils.get_kb_text(jstr, cut_client, stopword_list))

        # train data
        for line in tqdm(train_datas, desc='deal train data'):
            jstr = ujson.loads(line)
            text = jstr['text']
            text_len = len(text)
            pieces = []
            # Index of the first character of text not yet consumed.
            str_point = 0
            for mention in jstr['mention_data']:
                mention_offset = int(mention['offset'])
                mention_text = mention['mention']
                # Segment the text between the previous mention and this one.
                sub_text = text[str_point:mention_offset]
                pieces.append(_unsup_join_tokens(cut_client.cut_text(sub_text)))
                if len(sub_text) > 0 and not sub_text.isdigit():
                    pieces.append(' ')
                str_point = mention_offset
                # The mention itself is kept as a single unsegmented token.
                pieces.append(mention_text)
                mention_text_len = len(mention_text)
                if mention_text_len > 0 and not mention_text.isdigit():
                    pieces.append(' ')
                str_point += mention_text_len
            # Segment whatever follows the last mention.
            if str_point < text_len:
                sub_text = text[str_point:text_len]
                pieces.append(_unsup_join_tokens(cut_client.cut_text(sub_text)))
            train_sentence.append(''.join(pieces))

        # dev data
        for line in tqdm(dev_datas, desc='deal dev data'):
            jstr = ujson.loads(line)
            text_list = cut_client.cut_text(jstr['text'].lower())
            train_sentence.append(_unsup_join_tokens(text_list))

        line_len = len(train_sentence)
        print("save train data, data len:{}".format(line_len))
        # Newline between sentences only, none after the last one.
        for i, sentence in enumerate(train_sentence):
            if i:
                out_file.write('\n')
            out_file.write(sentence)
    print("success save fasttext train file")


def _unsup_join_tokens(tokens):
    """Concatenate segmented tokens for the unsupervised corpus.

    Single-space tokens are dropped.  Every kept token is passed through
    ``com_utils.cht_to_chs`` and followed by one space unless the token is
    purely numeric.
    """
    pieces = []
    for token in tokens:
        if token == ' ':
            continue
        pieces.append(com_utils.cht_to_chs(token))
        if not token.isdigit():
            pieces.append(' ')
    return ''.join(pieces)