Example #1
def add_oov_processed():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_single_synonyms_tsv.tsv")
    code_cuis_dict = {
        line[0]: line[2]
        for line in code_cuis if len(line[3]) > 2
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")

    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")

    codes_synonyms_tsv = {}
    codes_st_tsv = []

    # `ask` is the module-level list of AskAPatient codes.
    for code in ask:
        code_st_tsv = [code, code_labels[code]]
        if code in code_cuis_dict:
            cui = code_cuis_dict[code]
            synonym = list(set(cui_synonyms[cui]))
            code_st_tsv += [
                cui, " [SEP] ".join(synonym)[:100], cui_st[cui]
            ]
        else:
            synonym = code_labels[code]

        codes_synonyms_tsv[code] = synonym
        codes_st_tsv.append(code_st_tsv)

    read.save_in_json("data/AskAPatient/code_dict_complete",
                      codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)
Example #2
def process_ontology():
    ontology = read.read_from_tsv("data/ontology.tsv")
    concept_mentions = {}

    for idx, [synonym, concept] in enumerate(ontology):
        read.add_dict(concept_mentions, concept, synonym)

    concepts = list(concept_mentions.keys())
    synonyms = []
    concept_mention_idx = {}
    idx = 0
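    # Lay out each concept's synonyms contiguously and record the (start, end)
    # slice so synonym embeddings can later be mapped back to their concept.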
    for concept in concepts:
        concept_synonyms = list(set(concept_mentions[concept]))
        synonyms += concept_synonyms
        end = idx + len(concept_synonyms)
        concept_mention_idx[concept] = (idx, end)
        idx = end

    synonyms = [[item] for item in synonyms]

    read.save_in_tsv("data/ontology/ontology_synonyms.tsv", synonyms)
    read.save_in_json("data/ontology/ontology_concept", concepts)
    read.save_in_json("data/ontology/ontology_concept_synonyms_idx",
                      concept_mention_idx)
Example #3
def main(mention_embeddings_path, synonym_embedding_path,
         concept_synonym_idx_path, top_k, concept_pre_path,
         concept_score_pre_path):

    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_embedding_path)

    concept_synonym_idx = read.read_from_json(concept_synonym_idx_path)
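    # Expand each concept's (start, end) synonym-index range into a per-row
    # lookup from embedding-matrix row to concept.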
    id2concept = {
        int(i): cui
        for cui, item in concept_synonym_idx.items()
        for i in range(int(item[0]), int(item[1]))
    }

    similarity_matrix = cosine_similarity(query, documents)
    similarity_matrix = similarity_matrix.astype(np.float16)
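    # argsort sorts ascending, so reverse each row and keep the top_k
    # most-similar columns; float16/int32 casts keep the matrices compact.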
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    idx = idx[:, ::-1][:, :top_k]

    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[id2concept[item] for item in row] for row in idx]

    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
Example #4
def cui_labels():
    code_cuis = {}
    cui_infos = read.read_from_json("data/SMM4H/synonyms")
    print(len(cui_infos))
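    # Lines follow the MRCONSO layout: field 0 = CUI, field 10 = source code,
    # field 12 = term type; keep only preferred terms ("PT").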
    for line in cui_infos:
        code = line[10]
        cui = line[0]
        if line[12] == "PT":
            if code in code_cuis:
                code_cuis[code] += [cui]
            else:
                code_cuis[code] = [cui]
    print(len(code_cuis))

    extra = {
        "10012259": ["C0011253"],
        "10024130": ["C0023222"],
        "10000497": ["C0702166"],
        "1002243744151": ["C0917801"],
        "10040991": ["C0851578"],
        "10007541": ["C0018799"],
        "10027433": ["C0851358"],
        "10014698": ["C0014130"],
        "10044027": ["C0011334 "],
        "10013663": ["C1510472"],
        "MEDDRA PT": ["MEDDRA PT"]
    }

    code_cuis.update(extra)
    for code, cuis in code_cuis.items():
        if len(cuis) > 1:
            print(code)
    read.save_in_json("data/SMM4H/code_cuis", code_cuis)
    read.save_in_json("data/SMM4H/label", list(code_cuis.keys()))
Example #5
def textfile2list_twa_st():
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
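    # MRSTY.RRF is pipe-delimited: field 0 = CUI, field 2 = semantic-type tree
    # number. `file_path_st` and `twa_cuis` are defined at module level.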
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/TwADR-L/cui_st_dict", txt_list)
Example #6
def textfile2list_smm4h_st():
    file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = {c[0] for c in code_cuis.values()}
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/SMM4H/cui_st_dict", txt_list)
Example #7
def evaluate(feature1, nbio_entity_list, entity_pair_list, model, y1, x1):

    import numpy as np
    import read_files as read
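    # svm_predict comes from libsvm's svmutil; "-b 1" requests probability
    # estimates alongside the predicted labels.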
    p_lab, p_acc, p_vals = svm_predict(y1, x1, model, options="-b 1")
    p_vals_1 = np.asarray(p_vals)[:, 0]
    nbio_entity_list = read.lists2list(nbio_entity_list)
    entity_pair_list = read.lists2list(entity_pair_list)

    pair_wise = dict()
    key_list = list()

    for nbio_entity in nbio_entity_list:
        sid = nbio_entity[0]
        eid = nbio_entity[1]
        key_list.append(str(sid) + "_" + str(eid))


    index = 0
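    # Group candidate pairs by their (sid, eid) key so each entity's candidates
    # can be compared by predicted probability.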
    for entity_pair in entity_pair_list:
        sid = entity_pair[0][0]
        eid = entity_pair[0][1]
        key = str(sid) + "_" + str(eid)

        if key not in pair_wise:
            label = [int(entity_pair[2])]
            pair_wise[key] = [[list(entity_pair[1]) + label], [p_vals_1[index]]]
        else:
            bio_list, pro_list = pair_wise[key]
            label = [int(entity_pair[2])]
            bio_list.append(list(entity_pair[1]) + label)
            pro_list.append(p_vals_1[index])
            pair_wise[key] = [bio_list, pro_list]
        index += 1

    correct = 0
    index = 0
    answers = list()
    for key in key_list:
        bio_list, pro_list = pair_wise[key]
        i = np.argmax(pro_list)
        if bio_list[i][7] == 1:  # index 7 holds the appended gold label
            correct += 1
            answers.append([nbio_entity_list[index], bio_list[i]])
        else:
            answers.append([nbio_entity_list[index], bio_list[i], get_true(bio_list)])
        index += 1

    read.save_in_json('dongfang/evaluation_' + feature1, answers)

    precision = float(correct) / float(index)
    print(precision, correct, index)
Example #8
def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
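    # MRCONSO.RRF is pipe-delimited: field 0 = CUI, field 14 = term string.
    # `file_path_synonym` and `twa_cuis` are defined at module level.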
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]

    read.save_in_json("data/TwADR-L/cui_dict", txt_list)
Example #9
def textfile2list_smm4h():
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = {c[0] for c in code_cuis.values()}
    file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    data = read.readfrom_txt(file_path_synonym)
    txt_list = []
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list.append(line)

    read.save_in_json("data/SMM4H/synonyms_all", txt_list)
Example #10
def patient_admission_duration():
    patient_ids = []
    admission_id = []
    patient_mental_not_suicide_all = read.read_from_tsv(
        os.path.join(output_folder, "mental_patient_all_notes.tsv"))
    row_patient_admission_time = {}

    for row in patient_mental_not_suicide_all:
        row_patient_admission = row[0] + "_" + row[1] + "_" + row[2]
        add_key_dict(row_patient_admission_time, row_patient_admission, row[3])

    print("patients (mental_not_suicide) time information are collected......")

    patient_mental_not_suicide_admission = read.read_from_tsv(
        os.path.join(output_folder, "mental_patient_admission_notes.tsv"))

    mental_not_suicide_patient_admission_time = {}
    for row in patient_mental_not_suicide_admission:
        if row[1] not in patient_ids:
            patient_ids.append(row[1])
        if row[2] not in admission_id:
            admission_id.append(row[2])

        add_key_dict(mental_not_suicide_patient_admission_time, row[1], row[3])

    print("patients_admission (mental_not_suicide) timelines are collected......")
    mental_not_suicide_patient_all_time_formatted = {}
    for key, value in row_patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]), int(time_items[2]))
        mental_not_suicide_patient_all_time_formatted[key] = datetime_new

    mental_not_suicide_patient_admission_period = get_duration(mental_not_suicide_patient_admission_time)
    print("patients_admission (mental_not_suicide) timelines are calculated......")

    patient_admission_before = []
    patient_admission_meanwhile = []

    for patient_id in patient_ids:
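        # Compare each of the patient's notes against the admission window:
        # notes before or within it are kept; later notes are skipped.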

        mental_not_suicide_admission_time = mental_not_suicide_patient_admission_period[patient_id]
        mental_not_suicide_patient_all_time = get_patient_for_admission(mental_not_suicide_patient_all_time_formatted, patient_id)

        for key, admission_time in mental_not_suicide_patient_all_time.items():
            row_id, _, _ = key.split("_")
            if admission_time < mental_not_suicide_admission_time[0]:
                patient_admission_before.append(row_id)
            elif admission_time > mental_not_suicide_admission_time[1]:
                pass  # notes after the admission window are ignored
            else:
                patient_admission_meanwhile.append(row_id)

    read.save_in_json(
        os.path.join(cache_folder, "mental_not_suicide_patient_admission_before"),
        patient_admission_before)
    read.save_in_json(
        os.path.join(cache_folder, "mental_not_suicide_patient_admission_meanwhile"),
        patient_admission_meanwhile)
    return patient_admission_before, patient_admission_meanwhile
Example #11
def code_synonyms():
    txt_list = read.read_from_json("data/SMM4H/synonyms_all")
    code_synonyms = {}
    code_synonyms_new = {}
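    # Lines follow the MRCONSO layout: field 0 = CUI, field 14 = term string;
    # collect every string per CUI, then deduplicate.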
    for line in txt_list:
        if line[0] not in code_synonyms:
            code_synonyms[line[0]] = [line[14]]
        else:
            code_synonyms[line[0]] += [line[14]]

    for code, synonyms in code_synonyms.items():
        code_synonyms_new[code] = list(set(synonyms))
    read.save_in_json("data/SMM4H/cui_synonyms", code_synonyms_new)
Example #12
def textfile2list_twa():
    data = read.readfrom_txt(file_path)
    cuis_twa = []
    txt_list = []
    for line in data.splitlines():
        if "SNO" in line:
            print(line)
        line = line.split('|')
        if line[0] in twa:
            cuis_twa.append(line[0])
            txt_list.append(line)

    read.save_in_json("data/TwADR-L/synonyms", txt_list)
    read.save_in_json("data/TwADR-L/cuis", list(set(cuis_twa)))
Example #13
def textfile2list_smm4h():
    file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    smm4h = read.read_from_json("data/SMM4H/labels_ori")

    data = read.readfrom_txt(file_path)
    cuis_smm4h = []
    txt_list = []
    for line in data.splitlines():
        # if "MDR" in line:
        #     print(line)
        line = line.split('|')
        if "MDR" in line[11]:
            cuis_smm4h.append(line[0])
            txt_list.append(line)

    read.save_in_json("data/SMM4H/synonyms", txt_list)
    read.save_in_json("data/SMM4H/cuis", list(set(cuis_smm4h)))
Example #14
def textfile2list_ask():
    data = read.readfrom_txt(file_path)
    cui_code_ask = {}
    codes = []
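    # MRCONSO field 13 holds the source-vocabulary code; field 0 is the CUI.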
    for line in data.splitlines():
        line = line.split('|')
        if line[13] in ask:
            codes.append(line[13])
            if line[0] not in cui_code_ask:
                cui_code_ask[line[0]] = [line[13]]
            else:
                cui_code_ask[line[0]] += [line[13]]
                # txt_list.append(line)

    # read.save_in_json("data/AskAPatient/synonyms",txt_list)
    read.save_in_json("data/AskAPatient/cui_codes", cui_code_ask)
    read.save_in_json("data/AskAPatient/codes", list(set(codes)))
Example #15
def main(mention_embeddings_path, synonym_embedding_path, concepts_path, top_k,
         concept_pre_path, concept_score_pre_path):

    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_embedding_path)
    concepts = read.read_from_json(concepts_path)

    similarity_matrix = cosine_similarity(query, documents)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    idx = idx[:, ::-1][:, :top_k]
    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[concepts[int(item)] for item in row] for row in idx]

    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
Example #16
def add_oov():
    cui_st = read.read_from_json("data/TwADR-L/cui_st_dict")
    cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict")
    ##### some CUIs are out of the vocabulary #########
    cui_extra = [cui for cui in twa if cui not in twa_cuis]
    for cui in cui_extra:
        cui_st[cui] = cui_st[twa_cuis_dict[cui]]
        cui_synonyms[cui] = cui_synonyms[twa_cuis_dict[cui]]
    read.save_in_json("data/TwADR-L/cui_dict_complete", cui_synonyms)
    read.save_in_json("data/TwADR-L/cui_st_dict_complete", cui_st)


# add_oov()
# cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict_complete")
# print(len(cui_synonyms))
# for cui in twa:
#     if cui not in cui_synonyms:
#         print(cui)

# print(len(cui_synonyms))
Example #17
def add_oov_smm4h():
    cui_st = read.read_from_json("data/SMM4H/cui_st_dict")
    cui_synonyms = read.read_from_json("data/SMM4H/cui_synonyms")

    cui_st["MEDDRA PT"] = ["df1"]
    cui_synonyms["MEDDRA PT"] = ["Extracted ADR"]

    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    codes = read.read_from_json("data/SMM4H/label")

    code_st = {}
    code_synonyms = {}

    for code in codes:
        cui = code_cuis[code][0]
        if cui == 'C0011334 ':  # strip the trailing space carried over from code_cuis
            cui = 'C0011334'
        code_st[code] = cui_st[cui]
        code_synonyms[code] = cui_synonyms[cui]
    read.save_in_json("data/SMM4H/code_dict_complete", code_synonyms)
    read.save_in_json("data/SMM4H/code_st_dict_complete", code_st)
Example #18
def analyze_patient_admission_date():
    patient_ids = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"))

    patient_admission_time = read.read_from_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"))
    patient_admission_time_formatted = {}
    for key, value in patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]),
                            int(time_items[2]))
        patient_admission_time_formatted[key] = datetime_new

    suicide_patient_admission_time = read.read_from_json(
        os.path.join(
            cache_folder,
            "suicide_patient_id/suicidalnotes_patient_admission_time"))
    suicide_patient_admission_time = get_duration(
        suicide_patient_admission_time)

    patient_admission_before = {}
    patient_admission_meanwhile = {}
    patient_admission_after = {}
    for patient_id in patient_ids:
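        # Classify each note as before, during, or after the patient's
        # suicide-related admission window.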

        suicide_admission_time = suicide_patient_admission_time[patient_id]
        patient_admission_time = get_patient_for_admission(
            patient_admission_time_formatted, patient_id)

        for key, admission_time in patient_admission_time.items():
            row_id, _, _ = key.split("_")
            if admission_time < suicide_admission_time[0]:
                add_key_dict(patient_admission_before, patient_id, row_id)
            elif admission_time > suicide_admission_time[1]:
                add_key_dict(patient_admission_after, patient_id, row_id)
            else:
                add_key_dict(patient_admission_meanwhile, patient_id, row_id)

    print(len(patient_admission_before))
    #print(patient_admission_before)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_before_patient_admission"),
        patient_admission_before)
    print(len(patient_admission_after))
    # print(patient_admission_after)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_after_patient_admission"),
        patient_admission_after)
    print(len(patient_admission_meanwhile))
    #print(patient_admission_meanwhile)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_meanwhile_patient_admission"),
        patient_admission_meanwhile)
Example #19
def process_umls():

    cui_all_snomed = {}
    cui_all_rxnorm = {}
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")

    for line in data.splitlines():
        line_split = line.split('|')
        # MRCONSO fields: 0 = CUI, 11 = source vocabulary, 12 = term type,
        # 14 = term name, 16 = suppress flag
        if "SNOMEDCT" in line_split[11]:
            if line_split[16] == "N":
                cui_all_snomed = add_dict(cui_all_snomed, line_split[0],
                                          line_split[14])

        if "RXNORM" in line_split[11]:
            if line_split[16] == "N":
                cui_all_rxnorm = add_dict(cui_all_rxnorm, line_split[0],
                                          line_split[14])

    read.save_in_json("/extra/dongfangxu9/umls/processed/snomed_dict",
                      cui_all_snomed)
    read.save_in_json("/extra/dongfangxu9/umls/processed/rxnorm_dict",
                      cui_all_rxnorm)
Example #20
def dev_evaluator():

    ontology = read.read_from_tsv("data/ontology/ontology_synonyms.tsv")

    cui_mention_idx = read.read_from_json(
        "data/ontology/ontology_concept_synonyms_idx")

    corpus = {"doc_" + str(id): item[0] for id, item in enumerate(ontology)}

    read.save_in_json("data/evaluator_path/corpus", corpus)

    doc_id2mesh_all = {}
    mesh2doc_id_all = {}
    for key, item in cui_mention_idx.items():
        doc_id2mesh = {"doc_" + str(id): key for id in range(item[0], item[1])}
        doc_id2mesh_all.update(doc_id2mesh)
        mesh2doc_id = {
            key: ["doc_" + str(id) for id in range(item[0], item[1])]
        }
        mesh2doc_id_all.update(mesh2doc_id)

    dev_input = read.read_from_tsv("data/input_raw/dev.tsv")
    mentions = [item[0] for item in dev_input]

    query = {
        "q_" + str(id): item[0]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }

    relevant_docs = {
        "q_" + str(id): mesh2doc_id_all[item[1]]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    read.save_in_json("data/evaluator_path/dev_queries", query)
    read.save_in_json("data/evaluator_path/dev_relevant_docs", relevant_docs)

    for qid, item in query.items():

        text = [
            ontology[int(doc_id.split("_")[1])][0]
            for doc_id in relevant_docs[qid]
        ]
        print(item, text)
Example #21
def read_all_notes_for_patients_admission_time():
    patient_id = []
    admission_id = []
    patient_notes_all = read.read_from_tsv(
        os.path.join(output_folder, "note_events_all.tsv"))[1:]
    row_patient_admission_time = {}

    for row in patient_notes_all:
        row_patient_admission = row[0] + "_" + row[1] + "_" + row[2]
        add_key_dict(row_patient_admission_time, row_patient_admission, row[3])

    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"),
        row_patient_admission_time)

    patient_notes_suicide = read.read_from_tsv(
        os.path.join(output_folder, "note_events_suicidal.tsv"))[1:]

    suicide_patient_admission_time = {}
    for row in patient_notes_suicide:
        if row[1] not in patient_id:
            patient_id.append(row[1])
        if row[2] not in admission_id:
            admission_id.append(row[2])

        add_key_dict(suicide_patient_admission_time, row[1], row[3])

    read.save_in_json(
        os.path.join(
            cache_folder,
            "suicide_patient_id/suicidalnotes_patient_admission_time"),
        suicide_patient_admission_time)
    read.save_in_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"),
        patient_id)
    print(len(admission_id))
    print(len(patient_id))
Example #22
def read_training():
    training1 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training1.txt")
    training2 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training2.txt")
    training3 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training3.txt")
    training4 = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_training4.txt")
    data1, labels1 = get_label(training1)
    data2, labels2 = get_label(training2)
    data3, labels3 = get_label(training3)
    data4, labels4 = get_label(training4)

    test_data = read.textfile2list(
        "data/SMM4H/subtask3/task_3_normalization_evaluation.txt")
    data_test, labels_test = get_label(test_data)
    data_train = data1 + data2 + data3 + data4
    labels_all = list(set(labels1 + labels2 + labels3 + labels4 + labels_test))

    read.save_in_json("data/SMM4H/train_ori", data_train)
    read.save_in_json("data/SMM4H/test", data_test)
    read.save_in_json("data/SMM4H/labels_ori", labels_all)
Example #23
def add_ooc_st():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_st_tsv_processed.tsv")
    code_cuis_dict = {line[0]: line[4] for line in code_cuis}
    read.save_in_json("data/AskAPatient/code_st_dict_complete", code_cuis_dict)