def add_oov_processed():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_single_synonyms_tsv.tsv")
    # Map each AskAPatient code to its CUI; rows whose fourth column is too
    # short carry no usable CUI and are skipped.
    code_cuis_dict = {
        line[0]: line[2]
        for line in code_cuis if len(line[3]) > 2
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")
    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")
    codes_synonyms_tsv = {}
    codes_st_tsv = []
    for code in ask:
        code_st_tsv = [code, code_labels[code]]
        if code in code_cuis_dict:
            cui = code_cuis_dict[code]
            synonym = list(set(cui_synonyms[cui]))
            code_st_tsv += [
                cui, " [SEP] ".join(synonym)[:100], cui_st[cui]
            ]
        else:
            # Out-of-vocabulary code: fall back to its surface label.
            synonym = code_labels[code]
        codes_synonyms_tsv[code] = synonym
        codes_st_tsv.append(code_st_tsv)
    read.save_in_json("data/AskAPatient/code_dict_complete",
                      codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)
def process_ontology():
    ontology = read.read_from_tsv("data/ontology.tsv")
    concept_mentions = {}
    for synonym, concept in ontology:
        read.add_dict(concept_mentions, concept, synonym)
    concepts = list(concept_mentions.keys())
    synonyms = []
    concept_mention_idx = {}
    idx = 0
    for concept in concepts:
        concept_synonyms = list(set(concept_mentions[concept]))
        synonyms += concept_synonyms
        end = idx + len(concept_synonyms)
        # Record the half-open span [idx, end) of this concept's synonyms
        # within the flat synonym list.
        concept_mention_idx[concept] = (idx, end)
        idx = end
    synonyms = [[item] for item in synonyms]
    read.save_in_tsv("data/ontology/ontology_synonyms.tsv", synonyms)
    read.save_in_json("data/ontology/ontology_concept", concepts)
    read.save_in_json("data/ontology/ontology_concept_synonyms_idx",
                      concept_mention_idx)
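# read.add_dict is not shown in these snippets. A minimal sketch of the
# behavior process_ontology() appears to rely on (append a synonym to the
# list stored under a concept, creating the list on first use); this is an
# assumption inferred from the call site, not the original implementation:
def add_dict(dictionary, key, value):
    dictionary.setdefault(key, []).append(value)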
def main(mention_embeddings_path, synonym_embedding_path,
         concept_synonym_idx_path, top_k, concept_pre_path,
         concept_score_pre_path):
    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_embedding_path)
    concept_synonym_idx = read.read_from_json(concept_synonym_idx_path)
    # Invert the concept -> (start, end) spans into a synonym-row -> concept map.
    id2concept = {
        int(i): cui
        for cui, item in concept_synonym_idx.items()
        for i in range(int(item[0]), int(item[1]))
    }
    similarity_matrix = cosine_similarity(query, documents)
    similarity_matrix = similarity_matrix.astype(np.float16)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    # argsort is ascending, so reverse each row and keep the top_k columns.
    idx = idx[:, ::-1][:, :top_k]
    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[id2concept[item] for item in row] for row in idx]
    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
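# Hypothetical usage of main() above; the embedding .npy files would come
# from a separate encoding step, and these paths are illustrative only:
#   main("mention_embeddings.npy", "synonym_embeddings.npy",
#        "data/ontology/ontology_concept_synonyms_idx", "10",
#        "predictions/concept_pre", "predictions/concept_score_pre.npy")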
def cui_labels():
    code_cuis = {}
    cui_infos = read.read_from_json("data/SMM4H/synonyms")
    print(len(cui_infos))
    for line in cui_infos:
        code = line[10]
        cui = line[0]
        # Keep only MedDRA preferred terms (term type "PT").
        if line[12] == "PT":
            if code in code_cuis:
                code_cuis[code] += [cui]
            else:
                code_cuis[code] = [cui]
    print(len(code_cuis))
    # Manual patches for codes whose CUIs are missing from the UMLS subset.
    # Note the trailing space in "C0011334 ", corrected downstream in
    # add_oov_smm4h().
    extra = {
        "10012259": ["C0011253"],
        "10024130": ["C0023222"],
        "10000497": ["C0702166"],
        "1002243744151": ["C0917801"],
        "10040991": ["C0851578"],
        "10007541": ["C0018799"],
        "10027433": ["C0851358"],
        "10014698": ["C0014130"],
        "10044027": ["C0011334 "],
        "10013663": ["C1510472"],
        "MEDDRA PT": ["MEDDRA PT"]
    }
    code_cuis.update(extra)
    for code, cuis in code_cuis.items():
        if len(cuis) > 1:
            print(code)
    read.save_in_json("data/SMM4H/code_cuis", code_cuis)
    read.save_in_json("data/SMM4H/label", list(code_cuis.keys()))
def textfile2list_twa_st():
    # file_path_st and twa_cuis are module-level globals defined elsewhere.
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        # Column 0 is the CUI; column 2 holds the semantic-type value.
        if line[0] in twa_cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/TwADR-L/cui_st_dict", txt_list)
def textfile2list_smm4h_st():
    file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    # A set gives O(1) membership tests over the first CUI of each code.
    cuis = {cuis[0] for _, cuis in code_cuis.items()}
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/SMM4H/cui_st_dict", txt_list)
def evaluate(feature1, nbio_entity_list, entity_pair_list, model, y1, x1):
    import numpy as np
    import read_files as read
    p_lab, p_acc, p_vals = svm_predict(y1, x1, model, options="-b 1")
    p_vals_1 = np.asarray(p_vals)[:, 0]
    nbio_entity_list = read.lists2list(nbio_entity_list)
    entity_pair_list = read.lists2list(entity_pair_list)
    pair_wise = dict()
    key_list = list()
    for nbio_entity in nbio_entity_list:
        sid = nbio_entity[0]
        eid = nbio_entity[1]
        key_list.append(str(sid) + "_" + str(eid))
    index = 0
    for entity_pair in entity_pair_list:
        sid = entity_pair[0][0]
        eid = entity_pair[0][1]
        key = str(sid) + "_" + str(eid)
        if key not in pair_wise:  # dict.has_key() is Python 2 only
            label = [int(entity_pair[2])]
            pair_wise[key] = [[list(entity_pair[1]) + label],
                              [p_vals_1[index]]]
        else:
            bio_list, pro_list = pair_wise[key]
            label = [int(entity_pair[2])]
            bio_list.append(list(entity_pair[1]) + label)
            pro_list.append(p_vals_1[index])
        index += 1
    correct = 0
    index = 0
    answers = list()
    for key in key_list:
        bio_list, pro_list = pair_wise[key]
        # Pick the candidate with the highest predicted probability.
        i = np.argmax(pro_list)
        if bio_list[i][7] == 1:
            correct += 1
            answers.append([nbio_entity_list[index], bio_list[i]])
        else:
            answers.append([nbio_entity_list[index], bio_list[i],
                            get_true(bio_list)])
        index += 1
    read.save_in_json('dongfang/evaluation_' + feature1, answers)
    precision = float(correct) / float(index)
    # The original Python 2 statement printed the builtin `all`; the total
    # count was presumably intended.
    print(precision, correct, index)
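# get_true() is not defined in these snippets. A minimal sketch consistent
# with how evaluate() uses it (recover the gold candidates, i.e. rows whose
# label field at index 7 is 1); an assumption, not the original code:
def get_true(bio_list):
    return [candidate for candidate in bio_list if candidate[7] == 1]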
def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        # Column 0 is the CUI; column 14 is the term string.
        if line[0] in twa_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]
    read.save_in_json("data/TwADR-L/cui_dict", txt_list)
def textfile2list_smm4h():
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    # A set gives O(1) membership tests over the first CUI of each code.
    cuis = {cuis[0] for _, cuis in code_cuis.items()}
    file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    data = read.readfrom_txt(file_path_synonym)
    txt_list = []
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list.append(line)
    read.save_in_json("data/SMM4H/synonyms_all", txt_list)
def patient_admission_duration():
    patient_ids = []
    admission_id = []
    patient_mental_not_suicide_all = read.read_from_tsv(
        os.path.join(output_folder, "mental_patient_all_notes.tsv"))
    row_patient_admission_time = {}
    for row in patient_mental_not_suicide_all:
        # Key: "<row_id>_<patient_id>_<admission_id>"; value: note date.
        row_patient_admission = row[0] + "_" + row[1] + "_" + row[2]
        add_key_dict(row_patient_admission_time, row_patient_admission, row[3])
    print("patients (mental_not_suicide) time information is collected......")
    patient_mental_not_suicide_admission = read.read_from_tsv(
        os.path.join(output_folder, "mental_patient_admission_notes.tsv"))
    mental_not_suicide_patient_admission_time = {}
    for row in patient_mental_not_suicide_admission:
        if row[1] not in patient_ids:
            patient_ids.append(row[1])
        if row[2] not in admission_id:
            admission_id.append(row[2])
        add_key_dict(mental_not_suicide_patient_admission_time, row[1], row[3])
    print("patients_admission (mental_not_suicide) timelines are collected......")
    mental_not_suicide_patient_all_time_formatted = {}
    for key, value in row_patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]),
                            int(time_items[2]))
        mental_not_suicide_patient_all_time_formatted[key] = datetime_new
    mental_not_suicide_patient_admission_period = get_duration(
        mental_not_suicide_patient_admission_time)
    print("patients_admission (mental_not_suicide) timelines are calculated......")
    patient_admission_before = []
    patient_admission_meanwhile = []
    for patient_id in patient_ids:
        mental_not_suicide_admission_time = (
            mental_not_suicide_patient_admission_period[patient_id])
        mental_not_suicide_patient_all_time = get_patient_for_admission(
            mental_not_suicide_patient_all_time_formatted, patient_id)
        for key, admission_time in mental_not_suicide_patient_all_time.items():
            row_id, _, _ = key.split("_")
            if admission_time < mental_not_suicide_admission_time[0]:
                patient_admission_before.append(row_id)
            elif admission_time > mental_not_suicide_admission_time[1]:
                # Notes after the admission window are deliberately ignored.
                pass
            else:
                patient_admission_meanwhile.append(row_id)
    read.save_in_json(
        os.path.join(cache_folder,
                     "mental_not_suicide_patient_admission_before"),
        patient_admission_before)
    read.save_in_json(
        os.path.join(cache_folder,
                     "mental_not_suicide_patient_admission_meanwhile"),
        patient_admission_meanwhile)
    return patient_admission_before, patient_admission_meanwhile
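# add_key_dict(), get_duration(), and get_patient_for_admission() are defined
# elsewhere. The sketches below are assumptions inferred from the call sites:
# add_key_dict accumulates values in a list per key; get_duration maps each
# patient to the (earliest, latest) date across their note timestamps; and
# get_patient_for_admission selects one patient's per-note dates from keys of
# the form "<row_id>_<patient_id>_<admission_id>".
from datetime import date  # already imported at module level in the original

def add_key_dict(dictionary, key, value):
    dictionary.setdefault(key, []).append(value)

def get_duration(patient_times):
    durations = {}
    for patient_id, time_strings in patient_times.items():
        dates = [date(*map(int, t.split("-"))) for t in time_strings]
        durations[patient_id] = (min(dates), max(dates))
    return durations

def get_patient_for_admission(row_times, patient_id):
    return {key: value for key, value in row_times.items()
            if key.split("_")[1] == patient_id}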
def code_synonyms():
    txt_list = read.read_from_json("data/SMM4H/synonyms_all")
    # Keys here are CUIs (column 0 of MRCONSO.RRF); values are term strings
    # (column 14).
    cui_synonyms = {}
    cui_synonyms_deduped = {}
    for line in txt_list:
        if line[0] not in cui_synonyms:
            cui_synonyms[line[0]] = [line[14]]
        else:
            cui_synonyms[line[0]] += [line[14]]
    for cui, synonyms in cui_synonyms.items():
        cui_synonyms_deduped[cui] = list(set(synonyms))
    read.save_in_json("data/SMM4H/cui_synonyms", cui_synonyms_deduped)
def textfile2list_twa():
    data = read.readfrom_txt(file_path)
    cuis_twa = []
    txt_list = []
    for line in data.splitlines():
        if "SNO" in line:
            print(line)  # debug: echo SNOMED rows as they stream past
        line = line.split('|')
        if line[0] in twa:
            cuis_twa.append(line[0])
            txt_list.append(line)
    read.save_in_json("data/TwADR-L/synonyms", txt_list)
    read.save_in_json("data/TwADR-L/cuis", list(set(cuis_twa)))
def textfile2list_smm4h():
    file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    smm4h = read.read_from_json("data/SMM4H/labels_ori")  # loaded but unused below
    data = read.readfrom_txt(file_path)
    cuis_smm4h = []
    txt_list = []
    for line in data.splitlines():
        line = line.split('|')
        # Column 11 is the source vocabulary; keep MedDRA ("MDR") rows only.
        if "MDR" in line[11]:
            cuis_smm4h.append(line[0])
            txt_list.append(line)
    read.save_in_json("data/SMM4H/synonyms", txt_list)
    read.save_in_json("data/SMM4H/cuis", list(set(cuis_smm4h)))
def textfile2list_ask():
    data = read.readfrom_txt(file_path)
    cui_code_ask = {}
    codes = []
    for line in data.splitlines():
        line = line.split('|')
        # Column 13 holds the source-vocabulary code; column 0 the CUI.
        if line[13] in ask:
            codes.append(line[13])
            if line[0] not in cui_code_ask:
                cui_code_ask[line[0]] = [line[13]]
            else:
                cui_code_ask[line[0]] += [line[13]]
    read.save_in_json("data/AskAPatient/cui_codes", cui_code_ask)
    read.save_in_json("data/AskAPatient/codes", list(set(codes)))
def main(mention_embeddings_path, synonym_embedding_path, concepts_path,
         top_k, concept_pre_path, concept_score_pre_path):
    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_embedding_path)
    concepts = read.read_from_json(concepts_path)
    similarity_matrix = cosine_similarity(query, documents)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    # argsort is ascending; reverse and truncate to the top_k most similar.
    idx = idx[:, ::-1][:, :top_k]
    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[concepts[int(item)] for item in row] for row in idx]
    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
def add_oov():
    cui_st = read.read_from_json("data/TwADR-L/cui_st_dict")
    cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict")
    # Some CUIs are out of vocabulary: map each one to a known CUI via
    # twa_cuis_dict and copy that CUI's semantic type and synonyms.
    cui_extra = [cui for cui in twa if cui not in twa_cuis]
    for cui in cui_extra:
        cui_st[cui] = cui_st[twa_cuis_dict[cui]]
        cui_synonyms[cui] = cui_synonyms[twa_cuis_dict[cui]]
    read.save_in_json("data/TwADR-L/cui_dict_complete", cui_synonyms)
    read.save_in_json("data/TwADR-L/cui_st_dict_complete", cui_st)

# add_oov()
# Sanity check: every TwADR-L CUI should now have synonyms.
# cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict_complete")
# print(len(cui_synonyms))
# for cui in twa:
#     if cui not in cui_synonyms:
#         print(cui)
def add_oov_smm4h():
    cui_st = read.read_from_json("data/SMM4H/cui_st_dict")
    cui_synonyms = read.read_from_json("data/SMM4H/cui_synonyms")
    # Placeholder entries for the sentinel "MEDDRA PT" label.
    cui_st["MEDDRA PT"] = ["df1"]
    cui_synonyms["MEDDRA PT"] = ["Extracted ADR"]
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    codes = read.read_from_json("data/SMM4H/label")
    code_st = {}
    code_synonyms = {}
    for code in codes:
        cui = code_cuis[code][0]
        # Strip the stray trailing space introduced in cui_labels().
        if cui == 'C0011334 ':
            cui = 'C0011334'
        code_st[code] = cui_st[cui]
        code_synonyms[code] = cui_synonyms[cui]
    read.save_in_json("data/SMM4H/code_dict_complete", code_synonyms)
    read.save_in_json("data/SMM4H/code_st_dict_complete", code_st)
def analyze_patient_admission_date():
    patient_ids = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"))
    patient_admission_time = read.read_from_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"))
    patient_admission_time_formatted = {}
    for key, value in patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]),
                            int(time_items[2]))
        patient_admission_time_formatted[key] = datetime_new
    suicide_patient_admission_time = read.read_from_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicidalnotes_patient_admission_time"))
    suicide_patient_admission_time = get_duration(
        suicide_patient_admission_time)
    patient_admission_before = {}
    patient_admission_meanwhile = {}
    patient_admission_after = {}
    for patient_id in patient_ids:
        suicide_admission_time = suicide_patient_admission_time[patient_id]
        patient_admission_time = get_patient_for_admission(
            patient_admission_time_formatted, patient_id)
        for key, admission_time in patient_admission_time.items():
            row_id, _, _ = key.split("_")
            # Bucket each note by its position relative to the suicide
            # admission window (earliest, latest).
            if admission_time < suicide_admission_time[0]:
                add_key_dict(patient_admission_before, patient_id, row_id)
            elif admission_time > suicide_admission_time[1]:
                add_key_dict(patient_admission_after, patient_id, row_id)
            else:
                add_key_dict(patient_admission_meanwhile, patient_id, row_id)
    print(len(patient_admission_before))
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_before_patient_admission"),
        patient_admission_before)
    print(len(patient_admission_after))
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_after_patient_admission"),
        patient_admission_after)
    print(len(patient_admission_meanwhile))
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_meanwhile_patient_admission"),
        patient_admission_meanwhile)
def process_umls():
    cui_all_snomed = {}
    cui_all_rxnorm = {}
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")
    for line in data.splitlines():
        line_split = line.split('|')
        # MRCONSO.RRF columns: 0 = CUI, 11 = source vocabulary,
        # 12 = term type, 14 = term string, 16 = suppress flag.
        if "SNOMEDCT" in line_split[11] and line_split[16] == "N":
            cui_all_snomed = add_dict(cui_all_snomed, line_split[0],
                                      line_split[14])
        if "RXNORM" in line_split[11] and line_split[16] == "N":
            cui_all_rxnorm = add_dict(cui_all_rxnorm, line_split[0],
                                      line_split[14])
    read.save_in_json("/extra/dongfangxu9/umls/processed/snomed_dict",
                      cui_all_snomed)
    read.save_in_json("/extra/dongfangxu9/umls/processed/rxnorm_dict",
                      cui_all_rxnorm)
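# This add_dict differs from read.add_dict sketched earlier in that its
# return value is reassigned at the call site; a minimal sketch under that
# assumption:
def add_dict(dictionary, key, value):
    dictionary.setdefault(key, []).append(value)
    return dictionary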
def dev_evaluator():
    ontology = read.read_from_tsv("data/ontology/ontology_synonyms.tsv")
    cui_mention_idx = read.read_from_json(
        "data/ontology/ontology_concept_synonyms_idx")
    corpus = {"doc_" + str(id): item[0] for id, item in enumerate(ontology)}
    read.save_in_json("data/evaluator_path/corpus", corpus)
    doc_id2mesh_all = {}
    mesh2doc_id_all = {}
    for key, item in cui_mention_idx.items():
        doc_id2mesh = {"doc_" + str(id): key
                       for id in range(item[0], item[1])}
        doc_id2mesh_all.update(doc_id2mesh)
        mesh2doc_id = {
            key: ["doc_" + str(id) for id in range(item[0], item[1])]
        }
        mesh2doc_id_all.update(mesh2doc_id)
    dev_input = read.read_from_tsv("data/input_raw/dev.tsv")
    mentions = [item[0] for item in dev_input]  # (unused)
    # Skip CUI-less mentions: they have no relevant documents to retrieve.
    query = {
        "q_" + str(id): item[0]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    relevant_docs = {
        "q_" + str(id): mesh2doc_id_all[item[1]]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    read.save_in_json("data/evaluator_path/dev_queries", query)
    read.save_in_json("data/evaluator_path/dev_relevant_docs", relevant_docs)
    for qid, item in query.items():
        text = [
            ontology[int(doc_id.split("_")[1])][0]
            for doc_id in relevant_docs[qid]
        ]
        print(item, text)
def read_all_notes_for_patients_admission_time():
    patient_id = []
    admission_id = []
    # [1:] skips the header row of each TSV.
    patient_notes_all = read.read_from_tsv(
        os.path.join(output_folder, "note_events_all.tsv"))[1:]
    row_patient_admission_time = {}
    for row in patient_notes_all:
        row_patient_admission = row[0] + "_" + row[1] + "_" + row[2]
        add_key_dict(row_patient_admission_time, row_patient_admission, row[3])
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"),
        row_patient_admission_time)
    patient_notes_suicide = read.read_from_tsv(
        os.path.join(output_folder, "note_events_suicidal.tsv"))[1:]
    suicide_patient_admission_time = {}
    for row in patient_notes_suicide:
        if row[1] not in patient_id:
            patient_id.append(row[1])
        if row[2] not in admission_id:
            admission_id.append(row[2])
        add_key_dict(suicide_patient_admission_time, row[1], row[3])
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicidalnotes_patient_admission_time"),
        suicide_patient_admission_time)
    read.save_in_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"),
        patient_id)
    print(len(admission_id))
    print(len(patient_id))
def read_training():
    # Four training splits plus the evaluation file, all in the same format.
    training_parts = [
        read.textfile2list(
            "data/SMM4H/subtask3/task_3_normalization_training%d.txt" % i)
        for i in range(1, 5)
    ]
    data_train = []
    train_labels = []
    for part in training_parts:
        data, labels = get_label(part)
        data_train += data
        train_labels += labels
    test_data = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_evaluation.txt")
    data_test, labels_test = get_label(test_data)
    labels_all = list(set(train_labels + labels_test))
    read.save_in_json("data/SMM4H/train_ori", data_train)
    read.save_in_json("data/SMM4H/test", data_test)
    read.save_in_json("data/SMM4H/labels_ori", labels_all)
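# get_label() is defined elsewhere, and the exact column layout of the task 3
# files is not shown here. A hypothetical sketch, assuming tab-separated
# lines with the normalization label (MedDRA code) in the last column:
def get_label(lines):
    data, labels = [], []
    for line in lines:
        fields = line.split("\t")
        data.append(fields)        # full record, kept for training
        labels.append(fields[-1])  # MedDRA code
    return data, labels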
def add_ooc_st():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_st_tsv_processed.tsv")
    # Column 0: code; column 4: semantic type.
    code_cuis_dict = {line[0]: line[4] for line in code_cuis}
    read.save_in_json("data/AskAPatient/code_st_dict_complete",
                      code_cuis_dict)