import os
from time import time
from bisect import bisect_left
from itertools import izip

# Project-local classes and helpers (TfIdf, WikiInfo, WikiLink, MedLink,
# MeshMatch, MeshRecord, MeshTree, ChebiTerm, ioutils, mentiondetection,
# __get_extra_wiki_description, find_sentence_ends, tokenized_text_match,
# get_original_sentences, get_tfidf_of_sentences) are assumed to be
# importable from elsewhere in the repository.


def gen_extra_description_for_mesh():
    res_dir = 'e:/el/tmpres/demo/del-data/'
    mesh_record_file = res_dir + 'records_info_with_wiki.txt'
    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    word_idf_file = 'e:/el/tmpres/demo/word_idf.txt'
    dst_file = 'e:/el/tmpres/demo/extra_description_for_mesh.txt'

    tfidf = TfIdf(word_idf_file)
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    mesh_records = MeshRecord.load_mesh_records(mesh_record_file)

    # For each MeSH record mapped to a Wikipedia page, extract the parts of
    # the wiki text that add information beyond the MeSH description (a
    # sketch of the assumed helper contract follows this function).
    fout = open(dst_file, 'wb')
    for i, (mesh_id, record) in enumerate(mesh_records.iteritems()):
        if record.wid < 0:  # no linked Wikipedia page
            continue
        info = wiki_info.get_info(record.wid)
        if info and info[1]:
            mesh_text = record.mesh_desc.decode('utf-8')
            wiki_text = info[1].decode('utf-8')
            extra_desc = __get_extra_wiki_description(mesh_text, wiki_text, tfidf)
            fout.write('%s\n%s\n' % (mesh_id, extra_desc.encode('utf-8')))
    fout.close()
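# __get_extra_wiki_description is defined elsewhere in the project. As a
# minimal sketch of the contract gen_extra_description_for_mesh assumes
# (unicode in, unicode out, keep only wiki text that adds information),
# something like the following would fit; the 0.95 threshold and the naive
# sentence split are illustrative assumptions, not the project's code.
def _get_extra_wiki_description_sketch(mesh_text, wiki_text, tfidf, max_sim=0.95):
    mesh_vec = tfidf.get_tfidf_from_text(mesh_text)
    kept = list()
    for sentence in wiki_text.split('. '):  # naive sentence split (assumption)
        sent_vec = tfidf.get_tfidf_from_text(sentence)
        # Keep the wiki sentence only if it is not already covered by the
        # MeSH description.
        if TfIdf.sim(sent_vec, mesh_vec) <= max_sim:
            kept.append(sentence)
    return u'. '.join(kept)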
def main():
    start_time = time()

    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    wiki_candidates_file = 'e:/el/tmpres/wiki/dict/name_candidates.pkl'
    word_idf_file = 'e:/el/tmpres/demo/word_idf.txt'

    tfidf = TfIdf(word_idf_file)
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)

    input_file = 'input/00000001.txt'
    fin = open(input_file, 'rb')
    doc_text = fin.read().decode('utf-8')
    fin.close()

    # The NER result for input/X.txt is expected at output/X.txt.ner.
    file_name = os.path.basename(input_file)
    ner_result_file = os.path.join('output', file_name + '.ner')
    merged_mention_list = mentiondetection.clean_ner_result(ner_result_file)
    merged_mention_list.sort(key=lambda x: x.span[0])

    wiki_link.link_all(doc_text, merged_mention_list)
    # Report mentions linked to Wikipedia but not to MeSH or ChEBI.
    for mention in merged_mention_list:
        if (not mention.mesh_id) and mention.chebi_id < 0 < mention.wid:
            cur_name = doc_text[mention.span[0]:mention.span[1] + 1].lower()
            print cur_name, mention.wid, wiki_info.get_info(mention.wid)[0]

    print time() - start_time
def test():
    start_time = time()

    text = 'last opportunities Texas senator Cruz'
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)

    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    wiki_link = WikiLink('e:/el/tmpres/wiki/dict/name_candidates.pkl',
                         wiki_info, tfidf)

    # Disambiguate the surface form 'cruz' against the context's tf-idf vector.
    context_tfidf = tfidf.get_tfidf_from_text(text)
    print wiki_link.link_with_context('cruz', context_tfidf)

    print time() - start_time
def init_model():
    res_dir = 'e:/data/el/tmpres/demo/del-data/'
    extra_wiki_desc_file = res_dir + 'wiki_extra_sentences.txt'
    extra_parents_file = res_dir + 'extra_parents.txt'
    mesh_record_file = res_dir + 'records_info_with_wiki.txt'
    mesh_dict_file = res_dir + 'med_dict_ascii_with_ids_edited.txt'
    exclude_words_file = res_dir + 'exclude_words.txt'
    tree_number_file = res_dir + 'id_tn.txt'
    obo_file = res_dir + 'chebi.obo'

    word_idf_file = 'e:/data/el/tmpres/demo/word_idf.txt'
    wiki_candidates_file = 'e:/data/el/tmpres/wiki/dict/name_candidates.pkl'
    wiki_info_file = 'e:/data/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/data/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/data/el/tmpres/demo/wiki-all/text.txt'
    mesh_extra_description_file = 'e:/data/el/tmpres/demo/extra_description_for_mesh.txt'

    chebi_terms = ChebiTerm.load_obo_file(obo_file)
    print '%d chebi terms' % len(chebi_terms)

    mesh_match = MeshMatch(mesh_dict_file, exclude_words_file)
    mesh_records = MeshRecord.load_mesh_records(mesh_record_file)
    mesh_tree = MeshTree(tree_number_file, mesh_records)
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    tfidf = TfIdf(word_idf_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    extra_wiki_desc = ioutils.load_wiki_extra_descriptions(
        mesh_extra_description_file)
    # Alternative loader kept for reference:
    # extra_wiki_desc = ioutils.load_wiki_extra_sentences(extra_wiki_desc_file)

    med_link = MedLink(extra_parents_file, mesh_match, mesh_records, mesh_tree,
                       chebi_terms, wiki_info, extra_wiki_desc, wiki_link)
    return med_link
def init_model():
    res_dir = 'e:/data/el/tmpres/'
    # res_dir = '/home/dhl/data/el/tmpres/'
    del_res_dir = os.path.join(res_dir, 'demo/del-data/')

    extra_wiki_desc_file = os.path.join(del_res_dir, 'wiki_extra_sentences.txt')
    extra_parents_file = os.path.join(del_res_dir, 'extra_parents.txt')
    mesh_record_file = os.path.join(del_res_dir, 'records_info_with_wiki.txt')
    mesh_dict_file = os.path.join(del_res_dir, 'med_dict_ascii_with_ids_edited.txt')
    exclude_words_file = os.path.join(del_res_dir, 'exclude_words.txt')
    tree_number_file = os.path.join(del_res_dir, 'id_tn.txt')
    obo_file = os.path.join(del_res_dir, 'chebi.obo')

    word_idf_file = os.path.join(res_dir, 'demo/word_idf.txt')
    wiki_candidates_file = os.path.join(res_dir, 'wiki/dict/name_candidates.pkl')
    wiki_info_file = os.path.join(res_dir, 'demo/wiki-all/wiki-info.pkl')
    links_file = os.path.join(res_dir, 'demo/wiki-all/links.txt')
    description_file = os.path.join(res_dir, 'demo/wiki-all/text.txt')
    mesh_extra_description_file = os.path.join(
        res_dir, 'demo/extra_description_for_mesh.txt')

    chebi_terms = ChebiTerm.load_obo_file(obo_file)
    mesh_match = MeshMatch(mesh_dict_file, exclude_words_file)
    mesh_records = MeshRecord.load_mesh_records(mesh_record_file)
    mesh_tree = MeshTree(tree_number_file, mesh_records)
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    tfidf = TfIdf(word_idf_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    extra_wiki_desc = ioutils.load_wiki_extra_descriptions(
        mesh_extra_description_file)
    # Alternative loader kept for reference:
    # extra_wiki_desc = ioutils.load_wiki_extra_sentences(extra_wiki_desc_file)

    tmp_med_link = MedLink(extra_parents_file, mesh_match, mesh_records,
                           mesh_tree, chebi_terms, wiki_info, extra_wiki_desc,
                           wiki_link)
    return tmp_med_link
def __init_mellink():
    word_idf_file = 'e:/data/el/tmpres/demo/word_idf.txt'
    wiki_candidates_file = 'e:/data/el/tmpres/wiki/dict/name_candidates.pkl'
    wiki_info_file = 'e:/data/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/data/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/data/el/tmpres/demo/wiki-all/text.txt'

    # Build a MedLink with only the Wikipedia components initialized.
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    tfidf = TfIdf(word_idf_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    return MedLink(wiki_info=wiki_info, wiki_link=wiki_link)
def link_with_context(self, sname, context_tfidf):
    if not self.tfidf:
        return -1, []

    # Binary search for the surface name in the sorted name list. Guard
    # against bisect_left returning len(name_list) when sname is absent.
    pos = bisect_left(self.name_list, sname)
    if pos == len(self.name_list) or self.name_list[pos] != sname:
        return -1, []

    # Candidates for name_list[pos] occupy candidates[beg_idx:end_idx].
    beg_idx = self.beg_indices[pos]
    if pos == len(self.beg_indices) - 1:
        end_idx = len(self.candidates)
    else:
        end_idx = self.beg_indices[pos + 1]

    if end_idx == beg_idx + 1:
        # Single candidate: accept it unless it is a disambiguation page.
        tmpwid = self.candidates[beg_idx]
        wiki_info = self.wiki_info.get_info(tmpwid)
        if wiki_info and wiki_info[1]:
            if 'may refer to' in wiki_info[1] or 'may stand for' in wiki_info[1]:
                return -1, []
        return tmpwid, [tmpwid]

    # Score each candidate by the tf-idf similarity between its description
    # and the mention's context. A popularity prior (self.cnts) was tried
    # but is disabled; ranking uses the similarity alone.
    cur_candidates = list()
    for i in xrange(beg_idx, end_idx):
        cur_wid = self.candidates[i]
        wiki_info = self.wiki_info.get_info(cur_wid)
        if wiki_info and wiki_info[1]:
            if 'may refer to' in wiki_info[1] or 'may stand for' in wiki_info[1]:
                continue
            candidate_tfidf = self.tfidf.get_tfidf_from_text(
                wiki_info[1].decode('utf-8'))
            score = TfIdf.sim(candidate_tfidf, context_tfidf)
            cur_candidates.append((cur_wid, score))

    if not cur_candidates:
        # Every candidate was a disambiguation page or lacked a description.
        return -1, []
    cur_candidates.sort(key=lambda x: -x[1])
    cur_candidates = [x[0] for x in cur_candidates]
    return cur_candidates[0], cur_candidates
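# link_with_context assumes TfIdf.get_tfidf_from_text yields a sparse
# word -> weight mapping and that TfIdf.sim returns the cosine similarity of
# two such mappings. A minimal sketch of that assumed similarity, for
# illustration only (not the project's actual TfIdf implementation):
def _cosine_sim_sketch(vec_a, vec_b):
    # Dot product over the sparse intersection of the two vectors.
    dot = sum(w * vec_b.get(t, 0.0) for t, w in vec_a.iteritems())
    norm_a = sum(w * w for w in vec_a.itervalues()) ** 0.5
    norm_b = sum(w * w for w in vec_b.itervalues()) ** 0.5
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)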
def gen_extra_sentences():
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)

    mesh_id_wid_file = 'e:/el/tmpres/demo/merge/mesh_id_wid.txt'
    merged_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions.txt'
    merged_tokenized_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions_tokenized.txt'
    extra_sentence_file = 'e:/el/tmpres/demo/merge/wiki_extra_sentences.txt'

    # Load the MeSH id -> Wikipedia id mapping.
    mesh_ids = list()
    wids = list()
    fin = open(mesh_id_wid_file, 'rb')
    for line in fin:
        vals = line.strip().split('\t')
        mesh_ids.append(vals[0])
        wids.append(int(vals[1]))
    fin.close()

    # The description files interleave one MeSH line with one wiki line per
    # entry, so each iteration consumes the MeSH line through izip and the
    # wiki line through next().
    fin_desc = open(merged_desc_file, 'rb')
    fin_token_desc = open(merged_tokenized_desc_file, 'rb')
    fout = open(extra_sentence_file, 'wb')
    for idx, (mesh_id, mesh_desc, mesh_token_desc) in enumerate(
            izip(mesh_ids, fin_desc, fin_token_desc)):
        mesh_desc_words = mesh_token_desc.strip().split(' ')
        mesh_sentence_ends = find_sentence_ends(mesh_desc_words)

        wiki_desc = fin_desc.next().strip()
        wiki_token_desc = fin_token_desc.next().strip()
        wiki_desc_words = wiki_token_desc.split(' ')
        wiki_sentence_ends = find_sentence_ends(wiki_desc_words)

        # Keep wiki sentences not already covered by the MeSH text, then map
        # them back to their original (untokenized) form for output.
        extra_sentence_indices = __get_sentences_to_add(
            mesh_desc_words, mesh_sentence_ends, wiki_desc_words,
            wiki_sentence_ends, tfidf)
        wiki_words_to_pos_list = tokenized_text_match(wiki_desc, wiki_desc_words)
        original_sentences = get_original_sentences(
            wiki_desc, wiki_words_to_pos_list, wiki_sentence_ends)

        fout.write('%s\t%d\n' % (mesh_id, len(extra_sentence_indices)))
        for j in extra_sentence_indices:
            fout.write('%s\n' % original_sentences[j])
    fin_desc.close()
    fin_token_desc.close()
    fout.close()
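# gen_extra_sentences depends on find_sentence_ends, tokenized_text_match and
# get_original_sentences defined elsewhere in the project. For illustration
# of the assumed format only: find_sentence_ends presumably returns the index
# of each sentence-final token. A naive period-based sketch (the real helper
# is likely more careful about abbreviations and other edge cases):
def _find_sentence_ends_sketch(words):
    return [i for i, w in enumerate(words) if w.endswith('.')]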
def __get_sentences_to_add(prev_text_words, prev_sentence_ends, new_text_words,
                           new_sentence_ends, tfidf):
    prev_tfidf_vecs = get_tfidf_of_sentences(prev_text_words,
                                             prev_sentence_ends, tfidf)
    new_tfidf_vecs = get_tfidf_of_sentences(new_text_words,
                                            new_sentence_ends, tfidf)

    # Keep each new sentence unless it is nearly identical (cosine similarity
    # above 0.95) to some sentence already present in the previous text.
    wanted_sentence_indices = list()
    for nidx, new_tfidf_vec in enumerate(new_tfidf_vecs):
        to_add = True
        for prev_tfidf_vec in prev_tfidf_vecs:
            if TfIdf.sim(new_tfidf_vec, prev_tfidf_vec) > 0.95:
                to_add = False
                break
        if to_add:
            wanted_sentence_indices.append(nidx)
    return wanted_sentence_indices
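# __get_sentences_to_add relies on get_tfidf_of_sentences from elsewhere in
# the project. A minimal sketch of the assumed behavior, treating each entry
# of sentence_ends as the inclusive index of a sentence's last token; this is
# an assumption about the helper's contract, not its actual code:
def _get_tfidf_of_sentences_sketch(text_words, sentence_ends, tfidf):
    vecs = list()
    beg = 0
    for end in sentence_ends:
        # Rebuild the sentence from its tokens and vectorize it.
        sentence = ' '.join(text_words[beg:end + 1])
        vecs.append(tfidf.get_tfidf_from_text(sentence))
        beg = end + 1
    return vecs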