def add_oov_processed():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_single_synonyms_tsv.tsv")
    code_cuis_dict = {
        line[0]: line[2]
        for line in code_cuis if len(line[3]) > 2
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")
    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")
    codes_synonyms_tsv = {}
    codes_st_tsv = []
    for code in ask:
        code_st_tsv = [code, code_labels[code]]
        if code in ask:
            if code in code_cuis_dict:
                cui = code_cuis_dict[code]
                synonym = list(set(cui_synonyms[cui]))
                code_st_tsv += [
                    cui, " [SEP] ".join(synonym)[:100], cui_st[cui]
                ]
            else:
                synonym = code_labels[code]
            codes_synonyms_tsv[code] = synonym
            codes_st_tsv.append(code_st_tsv)
    read.save_in_json("data/AskAPatient/code_dict_complete",
                      codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)
def add_oov(): code_cuis = read.read_from_tsv("data/AskAPatient/code_cuis.tsv") code_cuis_dict = { line[0]: line[1:] for line in code_cuis if len(line[:-1]) > 0 } cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict") cui_st = read.read_from_json("data/AskAPatient/cui_st_dict") code_labels = read.read_from_json( "data/AskAPatient/label_texts_dict_AskAPatient") codes_synonyms_tsv = [] codes_st_tsv = [] for code in ask: code_synonyms_tsv = [code, code_labels[code]] code_st_tsv = [code, code_labels[code]] if code in ask: if code in code_cuis_dict: cuis = code_cuis_dict[code] for cui in cuis: code_synonyms_tsv += [ cui, " [SEP] ".join(cui_synonyms[cui])[:100] ] code_st_tsv += [ cui, " [SEP] ".join(cui_synonyms[cui])[:100], cui_st[cui] ] codes_synonyms_tsv.append(code_synonyms_tsv) codes_st_tsv.append(code_st_tsv) read.save_in_tsv("data/AskAPatient/codes_synonyms_tsv.tsv", codes_synonyms_tsv) read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)
def analyze_patient_admission_date():
    patient_ids = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"))
    patient_admission_time = read.read_from_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"))
    patient_admission_time_formatted = {}
    for key, value in patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]),
                            int(time_items[2]))
        patient_admission_time_formatted[key] = datetime_new
    suicide_patient_admission_time = read.read_from_json(
        os.path.join(
            cache_folder,
            "suicide_patient_id/suicidalnotes_patient_admission_time"))
    suicide_patient_admission_time = get_duration(
        suicide_patient_admission_time)
    patient_admission_before = {}
    patient_admission_meanwhile = {}
    patient_admission_after = {}
    for patient_id in patient_ids:
        suicide_admission_time = suicide_patient_admission_time[patient_id]
        patient_admission_time = get_patient_for_admission(
            patient_admission_time_formatted, patient_id)
        for key, admission_time in patient_admission_time.items():
            row_id, _, _ = key.split("_")
            if admission_time < suicide_admission_time[0]:
                add_key_dict(patient_admission_before, patient_id, row_id)
            elif admission_time > suicide_admission_time[1]:
                add_key_dict(patient_admission_after, patient_id, row_id)
            else:
                add_key_dict(patient_admission_meanwhile, patient_id, row_id)
    print(len(patient_admission_before))
    # print(patient_admission_before)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_before_patient_admission"),
        patient_admission_before)
    print(len(patient_admission_after))
    # print(patient_admission_after)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_after_patient_admission"),
        patient_admission_after)
    print(len(patient_admission_meanwhile))
    # print(patient_admission_meanwhile)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_meanwhile_patient_admission"),
        patient_admission_meanwhile)
def main(syn_path, cui_path, cui_idx_path, file_name):
    # Average the synonym embeddings of each CUI into a single concept embedding,
    # keeping the output rows in the same order as `cuis`.
    embeddings = np.load(syn_path)
    cuis = read.read_from_json(cui_path)
    cui_idx = read.read_from_json(cui_idx_path)
    avg = []
    for cui in cuis:
        s, e = cui_idx[cui]
        embedding_syn = embeddings[s:e]
        avg.append(np.mean(embedding_syn, axis=0))
    avg = np.asarray(avg)
    read.create_folder(file_name)
    np.save(file_name, avg)
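# Illustrative invocation (the paths below are hypothetical): cui_idx is assumed to
# map each CUI to a [start, end) row span of the synonym-embedding matrix, e.g.
# {"C0004096": [0, 7]}, so the averaged concept embeddings line up with `cuis`.
# main("data/ontology/synonym_embeddings.npy",
#      "data/ontology/cuis",
#      "data/ontology/ontology_concept_synonyms_idx",
#      "data/ontology/concept_avg_embeddings")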
def analyze():
    rxnorm_term = read.read_from_tsv("data/umls/all_rxnorm_suppress.tsv")
    snomed_term = read.read_from_tsv("data/umls/all_snowmed_suppress.tsv")
    rxnorm_term = list(set([item[0] for item in rxnorm_term]))
    snomed_term = list(set([item[0] for item in snomed_term]))
    print(len(rxnorm_term))
    print(len(snomed_term))
    rxnorm_term1 = read.read_from_json("data/umls/rxnorm_dict")
    snomed_term1 = read.read_from_json("data/umls/snomed_dict")
    print(len(rxnorm_term1))
    print(len(snomed_term1))
def cui_labels():
    code_cuis = {}
    cui_infos = read.read_from_json("data/SMM4H/synonyms")
    print(len(cui_infos))
    for line in cui_infos:
        code = line[10]
        cui = line[0]
        if line[12] == "PT":
            if code in code_cuis:
                code_cuis[code] += [cui]
            else:
                code_cuis[code] = [cui]
    print(len(code_cuis))
    extra = {
        "10012259": ["C0011253"],
        "10024130": ["C0023222"],
        "10000497": ["C0702166"],
        "1002243744151": ["C0917801"],
        "10040991": ["C0851578"],
        "10007541": ["C0018799"],
        "10027433": ["C0851358"],
        "10014698": ["C0014130"],
        "10044027": ["C0011334 "],
        "10013663": ["C1510472"],
        "MEDDRA PT": ["MEDDRA PT"]
    }
    code_cuis.update(extra)
    for code, cuis in code_cuis.items():
        if len(cuis) > 1:
            print(code)
    read.save_in_json("data/SMM4H/code_cuis", code_cuis)
    read.save_in_json("data/SMM4H/label", list(code_cuis.keys()))
def main(mention_embeddings_path, synonym_ebmedding_path,
         concept_synonym_idx_path, top_k, concept_pre_path,
         concept_score_pre_path):
    # Rank ontology synonym embeddings by cosine similarity to each mention
    # embedding and map the top-k synonym rows back to their concept (CUI) ids.
    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_ebmedding_path)
    concept_synonym_idx = read.read_from_json(concept_synonym_idx_path)
    id2concept = {
        int(i): cui
        for cui, item in concept_synonym_idx.items()
        for i in range(int(item[0]), int(item[1]))
    }
    similarity_matrix = cosine_similarity(query, documents)
    similarity_matrix = similarity_matrix.astype(np.float16)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    idx = idx[:, ::-1][:, :top_k]
    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[id2concept[item] for item in row] for row in idx]
    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
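# A minimal, self-contained sketch (toy random data, hypothetical sizes) of the
# retrieval step used in main() above: rank all synonym vectors by cosine
# similarity to each mention vector, then keep the indices and scores of the k
# best. The helper name and its values are illustrative only.
def _topk_retrieval_demo(k=3):
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    mentions = np.random.rand(2, 8)   # 2 mention embeddings, dimension 8
    synonyms = np.random.rand(10, 8)  # 10 synonym embeddings, dimension 8
    sim = cosine_similarity(mentions, synonyms)
    top_idx = np.argsort(sim)[:, ::-1][:, :k]           # best-first column indices
    top_scores = np.take_along_axis(sim, top_idx, axis=1)
    return top_idx, top_scores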
def get_snomed_rxnorm_umls():
    rxnorm_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/rxnorm_dict")
    snomed_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/snomed_dict")
    cui_all = list(set(list(rxnorm_term.keys()) + list(snomed_term.keys())))
    cui_all_synonyms = {}
    # print(len(cui_all))
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")
    for line in data.splitlines():
        line_split = line.split('|')
        if line_split[0] in cui_all:
            cui_all_synonyms = add_dict(cui_all_synonyms, line_split[0],
                                        line_split)
def textfile2list_smm4h_st():
    file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = [cuis[0] for _, cuis in code_cuis.items()]
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/SMM4H/cui_st_dict", txt_list)
def add_oov():
    cui_st = read.read_from_json("data/TwADR-L/cui_st_dict")
    cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict")
    ##### some cuis are out of vocabulary #########
    cui_extra = [cui for cui in twa if cui not in twa_cuis]
    for cui in cui_extra:
        cui_st[cui] = cui_st[twa_cuis_dict[cui]]
        cui_synonyms[cui] = cui_synonyms[twa_cuis_dict[cui]]
    read.save_in_json("data/TwADR-L/cui_dict_complete", cui_synonyms)
    read.save_in_json("data/TwADR-L/cui_st_dict_complete", cui_st)


# add_oov()

# cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict_complete")
# print(len(cui_synonyms))
# for cui in twa:
#     if cui not in cui_synonyms:
#         print(cui)
# print(len(cui_synonyms))
def add_oov_smm4h():
    cui_st = read.read_from_json("data/SMM4H/cui_st_dict")
    cui_synonyms = read.read_from_json("data/SMM4H/cui_synonyms")
    cui_st["MEDDRA PT"] = ["df1"]
    cui_synonyms["MEDDRA PT"] = ["Extracted ADR"]
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    codes = read.read_from_json("data/SMM4H/label")
    code_st = {}
    code_synonyms = {}
    for code in codes:
        cui = code_cuis[code][0]
        if cui == 'C0011334 ':
            # this entry carries a stray trailing space in code_cuis; normalize it
            cui = 'C0011334'
        code_st[code] = cui_st[cui]
        code_synonyms[code] = cui_synonyms[cui]
    read.save_in_json("data/SMM4H/code_dict_complete", code_synonyms)
    read.save_in_json("data/SMM4H/code_st_dict_complete", code_st)
def textfile2list_smm4h():
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = [cuis[0] for _, cuis in code_cuis.items()]
    file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    data = read.readfrom_txt(file_path_synonym)
    txt_list = []
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list.append(line)
    read.save_in_json("data/SMM4H/synonyms_all", txt_list)
def code_synonyms():
    txt_list = read.read_from_json("data/SMM4H/synonyms_all")
    code_synonyms = {}
    code_synonyms_new = {}
    for line in txt_list:
        if line[0] not in code_synonyms:
            code_synonyms[line[0]] = [line[14]]
        else:
            code_synonyms[line[0]] += [line[14]]
    for code, synonyms in code_synonyms.items():
        code_synonyms_new[code] = list(set(synonyms))
    read.save_in_json("data/SMM4H/cui_synonyms", code_synonyms_new)
def suicide_meanwhile_notes(file_name):
    target_description = [
        "Nursing/other", "Nursing", "Physician", "Discharge summary",
        "Social Work", "General", "Nutrition", "Rehab Services",
        "Case Management", "Consult"
    ]
    suicide_meanwhile = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/" + file_name))
    title = read.read_from_tsv(
        os.path.join(cache_folder, "suicide_patient_notes_all.tsv"))[:1]
    patient_notes_all = read.read_from_tsv(
        os.path.join(cache_folder, "suicide_patient_notes_all.tsv"))[1:]
    suicide_meanwhile_notes = title
    none_admission_id = []
    admission_id = []
    notes_all = []
    notes_all_subset = []
    # admission_id_new = read.read_from_json(
    #     os.path.join(cache_folder, "suicide_patient_id/admission_id"))
    for row in patient_notes_all:
        if row[1] in suicide_meanwhile:
            if row[0] in suicide_meanwhile[row[1]]:
                if row[2] == "":
                    # print(row)
                    none_admission_id.append(row[0])
                elif row[2] not in admission_id:
                    admission_id.append(row[2])
                else:
                    pass
                if row[6] in target_description:
                    notes_all_subset.append(row[0])
                    suicide_meanwhile_notes.append(row)
                notes_all.append(row[0])
    print("patients: ", len(suicide_meanwhile))
    print("admission with id: ", len(admission_id))
    print("admission without id: ", len(none_admission_id))
    print("all notes: ", len(notes_all))
    # for admission_id_1 in admission_id:
    #     if admission_id_1 not in admission_id_new:
    #         print(admission_id_1)
    read.save_in_tsv(
        os.path.join(output_folder,
                     "suicide_patient_id/" + file_name + ".tsv"),
        suicide_meanwhile_notes)
def textfile2list_smm4h():
    file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    smm4h = read.read_from_json("data/SMM4H/labels_ori")
    data = read.readfrom_txt(file_path)
    cuis_smm4h = []
    txt_list = []
    for line in data.splitlines():
        # if "MDR" in line:
        #     print(line)
        line = line.split('|')
        if "MDR" in line[11]:
            cuis_smm4h.append(line[0])
            txt_list.append(line)
    read.save_in_json("data/SMM4H/synonyms", txt_list)
    read.save_in_json("data/SMM4H/cuis", list(set(cuis_smm4h)))
def main(mention_embeddings_path, synonym_ebmedding_path, concepts_path,
         top_k, concept_pre_path, concept_score_pre_path):
    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_ebmedding_path)
    concepts = read.read_from_json(concepts_path)
    similarity_matrix = cosine_similarity(query, documents)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    idx = idx[:, ::-1][:, :top_k]
    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[concepts[int(item)] for item in row] for row in idx]
    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
def dev_evaluator():
    ontology = read.read_from_tsv("data/ontology/ontology_synonyms.tsv")
    cui_mention_idx = read.read_from_json(
        "data/ontology/ontology_concept_synonyms_idx")
    corpus = {"doc_" + str(id): item[0] for id, item in enumerate(ontology)}
    read.save_in_json("data/evaluator_path/corpus", corpus)
    doc_id2mesh_all = {}
    mesh2doc_id_all = {}
    for key, item in cui_mention_idx.items():
        doc_id2mesh = {
            "doc_" + str(id): key
            for id in range(item[0], item[1])
        }
        doc_id2mesh_all.update(doc_id2mesh)
        mesh2doc_id = {
            key: ["doc_" + str(id) for id in range(item[0], item[1])]
        }
        mesh2doc_id_all.update(mesh2doc_id)
    dev_input = read.read_from_tsv("data/input_raw/dev.tsv")
    mentions = [item[0] for item in dev_input]
    query = {
        "q_" + str(id): item[0]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    relevant_docs = {
        "q_" + str(id): mesh2doc_id_all[item[1]]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    read.save_in_json("data/evaluator_path/dev_queries", query)
    read.save_in_json("data/evaluator_path/dev_relevant_docs", relevant_docs)
    for qid, item in query.items():
        text = [
            ontology[int(doc_id.split("_")[1])][0]
            for doc_id in relevant_docs[qid]
        ]
        print(item, text)
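# For reference, a hypothetical shape of the evaluator files saved above (the
# string values are made up; only the structure follows from the code):
# corpus            = {"doc_0": "headache", "doc_1": "cephalalgia", ...}
# dev_queries       = {"q_0": "my head is killing me", ...}
# dev_relevant_docs = {"q_0": ["doc_0", "doc_1"], ...}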
import read_files as read

file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"

ask = read.read_from_json("data/AskAPatient/label")
twa = read.read_from_json("data/TwADR-L/label")
twa_cuis_all = read.read_from_tsv("data/TwADR-L/cui_cuis - cui_cuis.tsv")
twa_cuis = [item[1] for item in twa_cuis_all]
twa_cuis_dict = {item[0]: item[1] for item in twa_cuis_all}


def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]
    read.save_in_json("data/TwADR-L/cui_dict", txt_list)


# textfile2list_twa()


def textfile2list_twa_st():
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
import read_files as read

file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"

ask = read.read_from_json("data/AskAPatient/label")
twa = read.read_from_json("data/TwADR-L/label")


def textfile2list_twa():
    data = read.readfrom_txt(file_path)
    cuis_twa = []
    txt_list = []
    for line in data.splitlines():
        if "SNO" in line:
            print(line)
        line = line.split('|')
        if line[0] in twa:
            cuis_twa.append(line[0])
            txt_list.append(line)
    read.save_in_json("data/TwADR-L/synonyms", txt_list)
    read.save_in_json("data/TwADR-L/cuis", list(set(cuis_twa)))


# textfile2list_twa()
# print(len(twa))
# cuis = list(set(read.read_from_json("data/TwADR-L/cuis")))
# print(len(cuis))
# print([cui for cui in twa if cui not in cuis])


def textfile2list_ask():
def model_training(
    train_data_path,
    evaluator_path,
    model_name,
    output_path,
    train_batch_size,
    num_epochs,
    samples_per_label,
):
    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    output_path = output_path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    os.makedirs(output_path, exist_ok=True)

    # You can specify any huggingface/transformers pre-trained model here,
    # for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/'

    ### Create a torch DataLoader that passes training batch instances to our model
    logging.info("Loading training dataset")
    train_set = read_dataset(train_data_path)

    # Load the pretrained transformer
    word_embedding_model = models.Transformer(model_name)
    # tokenizer_args={"additional_special_tokens": ['<e>', '</e>']})
    # word_embedding_model.auto_model.resize_token_embeddings(
    #     len(word_embedding_model.tokenizer))

    # Apply mean pooling to get one fixed-size sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    # pooling_mode_mean_mark_tokens=True)
    # dense_model = models.Dense(
    #     in_features=pooling_model.get_sentence_embedding_dimension(),
    #     out_features=2048, activation_function=nn.Tanh())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.max_seq_length = 16

    logging.info("Read concept normalization training dataset")
    #### try different sample sizes ####
    train_data_sampler = SentenceLabelDataset(
        examples=train_set, samples_per_label=samples_per_label)
    ##### Try whether to shuffle; by default, it isn't reshuffled every epoch
    train_dataloader = DataLoader(train_data_sampler,
                                  batch_size=train_batch_size,
                                  drop_last=True)

    ### Triplet losses ####################
    ### There are 4 triplet loss variants:
    ### - BatchHardTripletLoss
    ### - BatchHardSoftMarginTripletLoss
    ### - BatchSemiHardTripletLoss
    ### - BatchAllTripletLoss
    #######################################
    # train_loss = losses.BatchAllTripletLoss(model=model)
    # train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
    train_loss = losses.BatchHardSoftMarginTripletLoss(model)
    # train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

    # evaluator = []
    logging.info("Read concept normalization val dataset")
    ir_queries = read.read_from_json(
        os.path.join(evaluator_path, "dev_queries"))
    ir_corpus = read.read_from_json(os.path.join(evaluator_path, "corpus"))
    ir_relevant_docs = read.read_from_json(
        os.path.join(evaluator_path, "dev_relevant_docs"))

    ir_evaluator_n2c2_dev = evaluation.InformationRetrievalEvaluator(
        ir_queries,
        ir_corpus,
        ir_relevant_docs,
        corpus_chunk_size=300000,
        name="evaluation_results",
        map_at_k=[1, 3, 5, 10],
        batch_size=1024,
        show_progress_bar=True)
    # evaluator.append(ir_evaluator_n2c2_dev)

    # Create a SequentialEvaluator. It runs all evaluators in sequential order.
    # We optimize the model with respect to the score from the last evaluator (scores[-1]).
    # seq_evaluator = evaluation.SequentialEvaluator(
    #     evaluator, main_score_function=lambda scores: scores[1])

    logging.info("Performance before fine-tuning:")
    ir_evaluator_n2c2_dev(model)

    # warmup_steps = int(
    #     len(train_dataset) * num_epochs / train_batch_size * 0.1
    # )  # 10% of train data
    warmup_steps = 0

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        # evaluator=None,
        evaluator=ir_evaluator_n2c2_dev,
        output_path_ignore_not_empty=True,
        optimizer_params={
            'lr': 1e-4,
            'eps': 1e-6,
            'correct_bias': False
        },
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )
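# Illustrative invocation (all values below are hypothetical; the real paths and
# hyperparameters come from the calling script or command line):
# model_training(
#     train_data_path="data/input_raw/train.tsv",
#     evaluator_path="data/evaluator_path",
#     model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
#     output_path="output/triplet_",
#     train_batch_size=256,
#     num_epochs=3,
#     samples_per_label=2,
# )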
# features = "dev/all_withoutonto" # feature1 = "all" # features = "dev/all" #data = "test/inputdata" sentences, entity_list, nbio_entity_list, entity_pair_list = get_data(data) connectors = [ 'at', 'in', 'on', 'of', 'has', 'have', 'with', 'without', 'contains', 'contain' ] part_of_knowledge = read.read_from_csv_ontology(part_of) texts, vocab = get_sentence(sentences) #print len(texts) #read.save_in_json("dev/vocab",vocab) vocab = read.read_from_json("dev/vocab") new_vocab = [item for item, freq in vocab.items() if freq >= 9] threshold = 4 #build_features(features,entity_list,nbio_entity_list,entity_pair_list,part_of_knowledge,texts,new_vocab,connectors,threshold) # from svmutil import * y1, x1 = svm_read_problem('data/' + features + '.txt') # model = svm_train(y1, x1,'-t 2 -b 1 -w1 7 -w-1 1 -h 0') # -w1 8 -w-1 1') -v 5 # svm_save_model('data/thomas/relation_extraction_'+feature1+'.model', model) model = svm_load_model('data/dev/relation_extraction_' + feature1 + '.model') import evaluate as test test.evaluate(feature1, nbio_entity_list, entity_pair_list, model, y1, x1)
import read_files as read

file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"

ask = read.read_from_json("data/AskAPatient/label")
cui_codes = read.read_from_json("data/AskAPatient/cui_codes")
ask_cuis = [cui for cui, _ in cui_codes.items()]
print(len(cui_codes))


def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in ask_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]
    read.save_in_json("data/AskAPatient/cui_dict", txt_list)


# textfile2list_twa()


def textfile2list_twa_st():
    data = read.readfrom_txt(file_path_st)
    txt_list = {}