def create_kb_dict():
    if not os.path.exists(fileConfig.dir_kb_info):
        os.mkdir(fileConfig.dir_kb_info)
    kb_datas = open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8').readlines()
    kb_dict = {}
    for kb_data in tqdm(kb_datas, desc='init kb dict'):
        kb_data = ujson.loads(kb_data)
        subject_id = kb_data['subject_id']
        if subject_id in kb_dict:
            raise Exception('key : {} exist'.format(subject_id))
        # text = data_utils.get_text(kb_data['data'], kb_data['subject'])
        # collect the subject plus all of its aliases, de-duplicated
        all_alias = {}
        subject = kb_data['subject']
        alias = kb_data['alias']
        all_alias = com_utils.dict_add(all_alias, subject)
        for alia in alias:
            if all_alias.get(alia) is not None:
                continue
            all_alias = com_utils.dict_add(all_alias, alia)
        # concatenate the entity's attribute values into a single text field
        text = data_utils.get_all_text(kb_data['subject'], kb_data['data'])
        kb_dict[subject_id] = {'type': kb_data['type'], 'subject': subject, 'alias': list(all_alias),
                               'text': text}
    com_utils.pickle_save(kb_dict, fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    print("create kb dict success")
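# Illustrative only: a minimal sketch of the kb data line format create_kb_dict() expects
# and the entry it produces, based on the fields read above ('subject_id', 'subject',
# 'alias', 'data', 'type'). The concrete values are made up.
#
#   input line : {"subject_id": "10001", "subject": "红楼梦", "alias": ["石头记"],
#                 "type": ["Work"], "data": [{"predicate": "摘要", "object": "..."}]}
#   output     : kb_dict["10001"] == {"type": ["Work"], "subject": "红楼梦",
#                                     "alias": ["红楼梦", "石头记"],
#                                     "text": data_utils.get_all_text("红楼梦", [...])}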
def deal_ner_predict_data(predict_list, data_list, out_file):
    # init label map
    id2label = {i: label for i, label in enumerate(nerConfig.labels)}
    # flatten the batched predictions; -1 marks padded positions and is dropped
    label_list = []
    for labels in predict_list:
        for label in labels:
            item = label[label != -1]
            label_list.append(item)
    datas = []
    for data, label in zip(data_list, label_list):
        text_list = data.text_a
        if len(text_list) != len(label):
            print('text len:{} labels len:{}'.format(len(text_list), len(label)))
        assert len(text_list) == len(label)
        # map label ids back to tag strings
        labels = [id2label.get(item.item()) for item in label]
        datas.append({'text': text_list, 'tag': labels, 'mention_data_original': data.mention_data})
    com_utils.pickle_save(datas, out_file)
def create_nel_train_data():
    if not os.path.exists(fileConfig.dir_nel):
        os.mkdir(fileConfig.dir_nel)
    train_data = open(fileConfig.dir_data + fileConfig.file_train_data, 'r')
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    pd_df = pd.read_csv(fileConfig.dir_kb_info + fileConfig.file_kb_pandas_csv)
    data_list = []
    for line in tqdm(train_data, desc='create entity link train data'):
        jstr = ujson.loads(line)
        text_id = jstr['text_id']
        text = jstr['text']
        mention_datas = jstr['mention_data']
        for mention_data in mention_datas:
            kb_id = mention_data['kb_id']
            mention = mention_data['mention']
            start = mention_data['offset']
            end = int(start) + len(mention) - 1
            kb_entity = kb_dict.get(kb_id)
            if kb_entity is None:
                # mentions without a KB entry (e.g. NIL) are skipped
                continue
            entity_cands, entity_ids, entity_text = data_utils.get_entity_cands(kb_entity, kb_id, pd_df)
            data_list.append({'text_id': text_id, 'mention_text': text, 'mention': mention,
                              'mention_position': [start, end], 'entity_cands': entity_cands,
                              'entity_text': entity_text, 'entity_ids': entity_ids})
    com_utils.pickle_save(data_list, fileConfig.dir_nel + fileConfig.file_nel_entity_link_train_data)
    print("success create nel entity link train data")
def create_jieba_dict():
    data_file = open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8')
    com_utils.check_dir(fileConfig.dir_jieba)
    com_utils.check_dir(fileConfig.dir_kb_info)
    out_file = open(fileConfig.dir_jieba + fileConfig.file_jieba_dict, 'w', encoding='utf-8')
    # count every subject and alias string in the KB
    words = {}
    for line in tqdm(data_file, desc='read file'):
        jstr = ujson.loads(line)
        subject = jstr['subject'].strip()
        words = com_utils.dict_add(words, subject)
        alias = jstr['alias']
        for item in alias:
            words = com_utils.dict_add(words, item.strip())
    # save jieba kb
    com_utils.pickle_save(words, fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    # sort words by frequency
    words = Counter(words).most_common()
    # save file, flushing the buffer every 100 words
    save_str = ''
    count = 0
    for word in tqdm(words):
        save_str += word[0] + '\n'
        count += 1
        if count % 100 == 0:
            out_file.write(save_str)
            save_str = ''
    if len(save_str) > 0:
        print("write remaining str")
        out_file.write(save_str)
    print("success build jieba dict")
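# Illustrative only: the dictionary file written above holds one word per line, which is
# a format jieba accepts as a user dictionary. A minimal sketch of loading it (assuming
# plain jieba is what cut_client wraps, which is not confirmed here):
#
#   import jieba
#   jieba.load_userdict(fileConfig.dir_jieba + fileConfig.file_jieba_dict)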
def create_ner_data(train_file_path=None, out_file_path=None):
    if not os.path.exists(fileConfig.dir_ner):
        os.mkdir(fileConfig.dir_ner)
    train_file = open(train_file_path, mode='r', encoding='utf-8')
    data_list = []
    for i, line in tqdm(enumerate(train_file), desc='create ner data'):
        jstr = ujson.loads(line)
        text_id = jstr['text_id']
        text_list = list(jstr['text'])
        mentions = jstr['mention_data']
        text_len = len(text_list)
        tag_list = [nerConfig.O_seg] * text_len
        for mention in mentions:
            kb_id = mention['kb_id']
            # NIL mentions are not tagged
            if kb_id == 'NIL':
                continue
            mention_len = len(mention['mention'])
            offset = int(mention['offset'])
            # tag = nerConfig.NIL_seg if mention['kb_id'] == nerConfig.NIL_seg else nerConfig.KB_seg
            tag = nerConfig.KB_seg
            # tag = com_utils.get_kb_type(kb_dict[kb_id]['type'])
            # tag B
            tag_list[offset] = nerConfig.B_seg + tag
            if mention_len == 1:
                continue
            # tag I
            for j in range(offset + 1, offset + mention_len - 1):
                tag_list[j] = nerConfig.I_seg + tag
            # tag E
            tag_list[offset + mention_len - 1] = nerConfig.E_seg + tag
        data_list.append({'id': text_id, 'text': text_list, 'tag': tag_list, 'mention_data': mentions})
    com_utils.pickle_save(data_list, out_file_path)
    print("success create ner data")
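# Illustrative only: how the B/I/E tagging above plays out on a made-up example, assuming
# nerConfig.B_seg/I_seg/E_seg/O_seg and KB_seg render as 'B_', 'I_', 'E_', 'O' and 'KB'
# (the exact strings are defined in nerConfig, not here).
#
#   text    : "三国演义的作者"   mention "三国演义", offset 0, kb_id != 'NIL'
#   tag_list: ['B_KB', 'I_KB', 'I_KB', 'E_KB', 'O', 'O', 'O']
#
# A single-character mention only receives the B tag (the `mention_len == 1` branch).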
def split_eval_mention(num):
    dev_mention_data = com_utils.pickle_load(fileConfig.dir_ner + fileConfig.file_ner_eval_mention_data)
    data_len = len(dev_mention_data)
    block_size = data_len / num
    for i in range(1, num + 1):
        data_iter = dev_mention_data[int((i - 1) * block_size):int(i * block_size)]
        com_utils.pickle_save(data_iter,
                              fileConfig.dir_ner_split + fileConfig.file_ner_eval_mention_split.format(i))
    print("success split test mention to:{} files".format(num))
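# Illustrative only: a typical call, splitting the eval mention file into 4 shards so
# candidate generation can run in parallel (the shard count is arbitrary here):
#
#   split_eval_mention(4)
#   # -> fileConfig.dir_ner_split + fileConfig.file_ner_eval_mention_split.format(1..4)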
def create_eval_ner_data():
    if not os.path.exists(fileConfig.dir_ner):
        os.mkdir(fileConfig.dir_ner)
    eval_file = open(fileConfig.dir_data + fileConfig.file_eval_data, mode='r', encoding='utf-8')
    data_list = []
    for i, line in tqdm(enumerate(eval_file), desc='create eval ner data'):
        jstr = ujson.loads(line)
        text_id = jstr['text_id']
        text_list = list(jstr['text'])
        text_len = len(text_list)
        # the eval set has no gold mentions, so every position starts as 'O'
        tag_list = [nerConfig.O_seg] * text_len
        data_list.append({'id': text_id, 'text': text_list, 'tag': tag_list})
    com_utils.pickle_save(data_list, fileConfig.dir_ner + fileConfig.file_ner_eval_data)
    print("success create ner eval data")
def train():
    datas = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    vectorizer = TfidfVectorizer()
    train_sentence = []
    print("prepare train data")
    # one space-joined, word-segmented document per KB entity
    for key, data in tqdm(datas.items(), desc='init train data'):
        train_sentence.append(' '.join(cut_client.cut_text(data['text'])))
    print("start train tfidf model")
    X = vectorizer.fit_transform(train_sentence)
    print("save model and keyword")
    tfidf_save_data = [X, vectorizer]
    if not os.path.exists(fileConfig.dir_tfidf):
        os.mkdir(fileConfig.dir_tfidf)
    com_utils.pickle_save(tfidf_save_data, fileConfig.dir_tfidf + fileConfig.file_tfidf_save_data)
    print("success train and save tfidf file")
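# Illustrative only: a minimal sketch of how the pickled [X, vectorizer] pair saved by
# train() could be used to rank KB texts against a query. The helper name rank_kb_by_tfidf
# and its top_n parameter are hypothetical (not part of the original pipeline); it assumes
# cut_client.cut_text segments the query the same way as at training time.
def rank_kb_by_tfidf(query_text, top_n=5):
    from sklearn.metrics.pairwise import cosine_similarity
    X, vectorizer = com_utils.pickle_load(fileConfig.dir_tfidf + fileConfig.file_tfidf_save_data)
    # project the segmented query into the fitted tf-idf space
    query_vec = vectorizer.transform([' '.join(cut_client.cut_text(query_text))])
    # cosine similarity against every KB row; row order follows datas.items() in train()
    scores = cosine_similarity(query_vec, X)[0]
    return scores.argsort()[::-1][:top_n]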
def split_train_test_data():
    print('start split train data...')
    data_list = open(fileConfig.dir_data + fileConfig.file_train_data, 'r', encoding='utf-8').readlines()
    data_len = len(data_list)
    # a ratio-based split (comConfig.train_ratio) was used previously;
    # now a fixed 10000-line test set is held out
    test_size = 10000
    random.seed(comConfig.random_seed)
    random.shuffle(data_list)
    train_data = data_list[:data_len - test_size]
    dev_data = data_list[data_len - test_size:data_len]
    com_utils.pickle_save(train_data, fileConfig.dir_data + fileConfig.file_train_pkl)
    com_utils.pickle_save(dev_data, fileConfig.dir_data + fileConfig.file_test_pkl)
    print("success split data set")
def gen_simi_subject_list(file_path):
    print('start gen similar subject...')
    file_datas = com_utils.pickle_load(file_path)
    gensim_model = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    for item in tqdm(file_datas, 'gen simi subject'):
        mention_data = item['mention_data']
        for mention in mention_data:
            mention_text = mention['mention']
            try:
                # top-5 nearest neighbours of the mention in the embedding space
                mention['gen_subjects'] = get_simi_subject_list(
                    gensim_model.most_similar(positive=[mention_text], topn=5))
            except BaseException:
                # out-of-vocabulary mentions get no generated subjects
                mention['gen_subjects'] = []
    com_utils.pickle_save(file_datas, file_path)
    print('success gen similar subject...')
def split_train_data(train_file_path=None, out_train_file=None, out_dev_file=None, is_split=True):
    data_list = com_utils.pickle_load(train_file_path)
    if not is_split:
        # reuse the pre-built extended dev set instead of splitting one off
        dev_list = com_utils.pickle_load(fileConfig.dir_ner + fileConfig.file_extend_ner_dev_data)
    data_len = len(data_list)
    test_size = 10000
    random.seed(comConfig.random_seed)
    random.shuffle(data_list)
    if is_split:
        # hold out the last 10000 shuffled items as the dev set
        train_data = data_list[:data_len - test_size]
        dev_data = data_list[data_len - test_size:data_len]
        com_utils.pickle_save(train_data, out_train_file)
        com_utils.pickle_save(dev_data, out_dev_file)
    else:
        # keep every item for training and save the extended dev set as-is
        train_data = data_list
        com_utils.pickle_save(train_data, out_train_file)
        com_utils.pickle_save(dev_list, out_dev_file)
    print("success split data set")
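# Illustrative only: the two intended ways to call split_train_data(). The fileConfig
# names below (file_ner_data, file_ner_train_data, file_ner_dev_data) are placeholders,
# not necessarily the real config fields.
#
#   # random split: hold out 10000 items as the dev set
#   split_train_data(fileConfig.dir_ner + fileConfig.file_ner_data,
#                    fileConfig.dir_ner + fileConfig.file_ner_train_data,
#                    fileConfig.dir_ner + fileConfig.file_ner_dev_data, is_split=True)
#
#   # no split: train on everything and reuse the pre-built extended dev set
#   split_train_data(fileConfig.dir_ner + fileConfig.file_ner_data,
#                    fileConfig.dir_ner + fileConfig.file_ner_train_data,
#                    fileConfig.dir_ner + fileConfig.file_ner_dev_data, is_split=False)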
def create_dev_mention_cands_data(index, mention_file, pd_file, alia_kb_df, out_file):
    print("start create {} mention cands".format(index))
    dev_mention_data = com_utils.pickle_load(mention_file)
    print("{} data length is {}".format(index, len(dev_mention_data)))
    pd_df = pandas.read_csv(pd_file)
    alia_kb_df = pandas.read_csv(alia_kb_df)
    alia_kb_df = alia_kb_df.fillna('')
    for dev_data in tqdm(dev_mention_data, desc='find {} cands'.format(index)):
        mention_data = dev_data['mention_data']
        for mention in mention_data:
            mention_text = mention['mention']
            if mention_text is None:
                continue
            cands = []
            cand_ids = {}
            # normalise the mention: lower-case, traditional->simplified, complete brackets
            mention_text_proc = com_utils.cht_to_chs(mention_text.lower())
            mention_text_proc = com_utils.complete_brankets(mention_text_proc)
            mention_text_proc_extend = mention_text_proc[0:len(mention_text_proc) - 1]
            # match the KB subject column directly
            subject_df = data_utils.pandas_query(pd_df, 'subject', mention_text_proc)
            for _, item in subject_df.iterrows():
                s_id = str(item['subject_id'])
                if cand_ids.get(s_id) is not None:
                    continue
                cand_ids[s_id] = 1
                subject = item['subject']
                text = data_utils.get_all_text(item['subject'], ast.literal_eval(item['data']))
                cands.append({'cand_id': s_id, 'cand_subject': subject, 'cand_text': text,
                              'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))})
            # match more (disabled): also query with the mention text minus its last character
            # subject_df = data_utils.pandas_query(pd_df, 'subject', mention_text_proc_extend)
            # for _, item in subject_df.iterrows():
            #     s_id = str(item['subject_id'])
            #     if cand_ids.get(s_id) is not None:
            #         continue
            #     cand_ids[s_id] = 1
            #     subject = item['subject']
            #     text = data_utils.get_all_text(item['subject'], ast.literal_eval(item['data']))
            #     cands.append({'cand_id': s_id, 'cand_subject': subject, 'cand_text': text,
            #                   'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))})
            # match the alias table, then map alias hits back to KB subjects
            alias_subject_ids = []
            alias_df = data_utils.pandas_query(alia_kb_df, 'subject', mention_text_proc)
            for _, item in alias_df.iterrows():
                a_id = str(item['subject_id'])
                if a_id in alias_subject_ids:
                    continue
                alias_subject_ids.append(a_id)
            # match more (disabled): alias query with the shortened mention text
            # alias_df = data_utils.pandas_query(alia_kb_df, 'subject', mention_text_proc_extend)
            # for _, item in alias_df.iterrows():
            #     a_id = str(item['subject_id'])
            #     if a_id in alias_subject_ids:
            #         continue
            #     alias_subject_ids.append(a_id)
            for alia_id in alias_subject_ids:
                alias_df = pd_df[pd_df['subject_id'] == int(alia_id)]
                for _, item in alias_df.iterrows():
                    b_id = str(item['subject_id'])
                    if cand_ids.get(b_id) is not None:
                        continue
                    cand_ids[b_id] = 1
                    subject = item['subject']
                    text = data_utils.get_all_text(item['subject'], ast.literal_eval(item['data']))
                    cands.append({'cand_id': b_id, 'cand_subject': subject, 'cand_text': text,
                                  'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))})
            # match gen subject (disabled): candidates from word2vec-generated similar subjects
            # gen_subject_ids = []
            # for gen_subject in mention['gen_subjects']:
            #     gen_text = com_utils.cht_to_chs(gen_subject.lower())
            #     alias_df = alia_kb_df[alia_kb_df['subject'] == gen_text]
            #     for _, item in alias_df.iterrows():
            #         a_id = str(item['subject_id'])
            #         if a_id in gen_subject_ids:
            #             continue
            #         gen_subject_ids.append(a_id)
            # for alia_id in gen_subject_ids:
            #     alias_df = pd_df[pd_df['subject_id'] == int(alia_id)]
            #     for _, item in alias_df.iterrows():
            #         b_id = str(item['subject_id'])
            #         if cand_ids.get(b_id) is not None:
            #             continue
            #         cand_ids[b_id] = 1
            #         subject = item['subject']
            #         text = data_utils.get_all_text(item['subject'], ast.literal_eval(item['data']))
            #         cands.append({'cand_id': b_id, 'cand_subject': subject, 'cand_text': text,
            #                       'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))})
            mention['cands'] = cands
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create {} dev data with mention and cands!".format(index))
def create_dev_mention_data(mode, ner_datas, out_file):
    ner_datas = com_utils.pickle_load(ner_datas)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    dev_mention_data = []
    for data in tqdm(ner_datas, 'find entity'):
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # 1) extract mentions from the predicted B/I/E tags
        for i, tag in enumerate(tag_list):
            if tag.find(nerConfig.B_seg) > -1 or (tag.find(nerConfig.I_seg) > -1 and not is_find):
                # a mention starts here (an I tag with no open mention is treated as a start)
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length = 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                # an E tag with no open mention closes a mention immediately
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'mention': mention, 'offset': str(start_index), 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                # the open mention ends here; its type is the most common type seen
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'mention': mention, 'offset': str(start_index), 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # 2) recover additional mentions with jieba around tagged characters
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            if tag.find(nerConfig.B_seg) > -1 or tag.find(nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
                jieba_offset = i
                jieba_char = text[i]
                jieba_text = get_jieba_mention(jieba_entities, jieba_char, jieba_offset)
                if jieba_text is None:
                    continue
                elif jieba_text == '_' or jieba_text == '-':
                    continue
                elif data_utils.is_punctuation(jieba_text):
                    continue
                elif len(jieba_text) == 1:
                    continue
                elif stopwords.get(jieba_text) is not None:
                    continue
                # elif gen_more_words.get(jieba_text) is not None:
                #     continue
                jieba_offset = jieba_offset - jieba_text.find(jieba_char)
                if len(jieba_text) <= comConfig.max_jieba_cut_len and (jieba_dict.get(jieba_text) is not None):
                    type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                    if not is_already_find_mention(mentions, jieba_text, jieba_offset):
                        mentions.append({'mention': jieba_text, 'offset': str(jieba_offset), 'type': type_str})
        # 3) find mentions inside brackets
        bracket_mentions = data_utils.get_mention_inner_brackets(text, tag_list)
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions (disabled): duplicate each mention at every other occurrence in the text
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append({'mention': mention['mention'], 'offset': str(find_offset),
        #                                      'type': mention['type']})
        #     except BaseException:
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # 4) prune overlapping / duplicated mentions, keeping the longer ones
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                if mention_offset != int(sub_mention['offset']) and \
                        int(sub_mention['offset']) in range(mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
                if mention_offset == int(sub_mention['offset']) and \
                        len(mention['mention']) > len(sub_mention['mention']):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(delete_mentions, mention):
                    change_mentions.append(mention)
            mentions = change_mentions
        # drop duplicates and pure punctuation mentions
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(change_mentions, mention) \
                    and mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        # 5) sort by offset and normalise the mention text with the jieba segments
        mentions.sort(key=get_offset)
        mentions_optim = []
        for mention in mentions:
            mentions_optim.append({'mention': get_optim_mention_text(jieba_entities, mention['mention']),
                                   'offset': mention['offset'], 'type': mention['type']})
        if mode == 1:
            dev_mention_data.append({'text_id': str(text_id), 'text': text, 'mention_data': mentions_optim})
        elif mode == 2:
            dev_mention_data.append({'text_id': str(text_id), 'text': text, 'mention_data': mentions_optim,
                                     'mention_data_original': data['mention_data_original']})
        elif mode == 3:
            dev_mention_data.append({'text_id': str(text_id), 'text': text, 'mention_data': mentions_optim})
        text_id += 1
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create dev data with mentions, mode:{}".format(mode))
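# Illustrative only: a made-up example of the overlap pruning in step 4 above. Given the
# candidate mentions (already sorted shortest-first)
#
#   [{'mention': '小龙', 'offset': '2', 'type': 'KB'},
#    {'mention': '李小龙', 'offset': '1', 'type': 'KB'}]
#
# the shorter mention starts inside the span of the longer one (offset 2 falls in
# range(1, 4)), so it is collected into delete_mentions and removed; only the longer
# mention '李小龙' survives and is later re-sorted by offset.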