Exemplo n.º 1
0
Arquivo: mcsm.py Projeto: pj0616/CODER
def mcsm(embedding_list, embedding_type_list, type_list=t_list, k=40, lang_range=['ENG'], check_intersection=False):
    """Run the ``mcsm_*`` evaluation that matches each embedding's type.

    Args:
        embedding_list: Embeddings to evaluate, one entry per model.
        embedding_type_list: Type name for the embedding at the same index;
            compared case-insensitively against "cui", "word" and "bert".
        type_list: Semantic types a CUI must have to be evaluated.
        k: Forwarded unchanged to the ``mcsm_*`` functions.
        lang_range: Forwarded to ``UMLS`` as ``lang_range``.
        check_intersection: When true, only CUIs listed in
            ``intersection.txt`` are considered; the file is created from
            ``get_intersection`` if it does not exist yet.

    Returns:
        A list of results in embedding order. "cui" and "word" embeddings
        contribute one result each; "bert" embeddings contribute two
        (summary method "MEAN", then "CLS"). Other types contribute nothing.
    """
    cache_file = "intersection.txt"
    if check_intersection:
        if os.path.exists(cache_file):
            # Reuse the CUIs cached by an earlier run.
            with open(cache_file, "r", encoding="utf-8") as f:
                cached_rows = f.readlines()
            intersection_cui = [row.strip() for row in cached_rows]
        else:
            intersection_cui = get_intersection(
                embedding_list, embedding_type_list)
            with open(cache_file, "w", encoding="utf-8") as f:
                for cui in intersection_cui:
                    f.write(cui.strip() + "\n")

    umls = UMLS("../../umls", source_range='SNOMEDCT_US',
                lang_range=lang_range)

    # Keep only the CUIs whose semantic type is one of the requested types.
    cui_list = []
    if check_intersection:
        for cui in intersection_cui:
            if cui in umls.cui2sty and umls.cui2sty[cui] in type_list:
                cui_list.append(cui)
    else:
        for cui, sty in umls.cui2sty.items():
            if sty in type_list:
                cui_list.append(cui)

    results = []
    for position, embedding in enumerate(embedding_list):
        kind = embedding_type_list[position].lower()
        if kind == "cui":
            results.append(mcsm_cui(embedding, umls, cui_list, type_list, k))
        elif kind == "word":
            results.append(mcsm_word(embedding, umls, cui_list, type_list, k))
        elif kind == "bert":
            # BERT-style models are scored once per pooling strategy.
            for summary in ("MEAN", "CLS"):
                results.append(mcsm_bert(embedding, umls, cui_list,
                                         type_list, k, summary_method=summary))
    return results
Exemplo n.º 2
0
 def __init__(self,
              umls_folder,
              model_name_or_path,
              lang,
              json_save_path=None,
              max_lui_per_cui=8,
              max_length=32):
     """Load UMLS data and a tokenizer, then call ``calculate_class_count()``.

     Args:
         umls_folder: Passed to ``UMLS`` as its first argument.
         model_name_or_path: Passed to ``AutoTokenizer.from_pretrained``.
         lang: Forwarded to ``UMLS`` as ``lang_range``.
         json_save_path: Stored on the instance; not used in this method.
         max_lui_per_cui: Stored on the instance; not used in this method.
         max_length: Stored on the instance; presumably the tokenized
             sequence length -- TODO confirm against the code that reads it.
     """
     self.umls = UMLS(umls_folder, lang_range=lang)
     # The size exposed as ``self.len`` is the size of ``self.umls.rel``.
     self.len = len(self.umls.rel)
     self.max_lui_per_cui = max_lui_per_cui
     self.max_length = max_length
     self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
     self.json_save_path = json_save_path
     # Runs last, after every attribute above has been assigned.
     self.calculate_class_count()
 def __init__(self,
              umls_folder,
              model_name_or_path,
              lang,
              json_save_path=None,
              max_lui_per_cui=8,
              max_length=32,
              negative_sampling=True,
              debug=False):
     """Load UMLS data and a tokenizer, then call ``calculate_class_count()``.

     Args:
         umls_folder: Passed to ``UMLS`` as its first argument.
         model_name_or_path: Passed to ``AutoTokenizer.from_pretrained``.
         lang: Forwarded to ``UMLS`` as ``lang_range``.
         json_save_path: Stored on the instance; not used in this method.
         max_lui_per_cui: Stored on the instance; not used in this method.
         max_length: Stored on the instance; presumably the tokenized
             sequence length -- TODO confirm against the code that reads it.
         negative_sampling: Stored on the instance; not used in this method.
         debug: Stored on the instance and forwarded to ``UMLS`` as ``debug``.
     """
     self.debug = debug
     self.umls = UMLS(umls_folder, lang_range=lang, debug=self.debug)
     # The size exposed as ``self.len`` is the size of ``self.umls.rel``.
     self.len = len(self.umls.rel)
     self.max_lui_per_cui = max_lui_per_cui
     self.max_length = max_length
     self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
     self.json_save_path = json_save_path
     self.calculate_class_count()
     # NOTE(review): ``negative_sampling`` is assigned only after
     # ``calculate_class_count()`` has run, so that method cannot read the
     # value set here -- confirm this ordering is intentional.
     self.negative_sampling = negative_sampling
Exemplo n.º 4
0
def mrm_ccs(embedding_list,
            embedding_type_list,
            k=40,
            check_intersection=False):
    """Run the ``mrm_ccs_*`` evaluation for every embedding.

    Each embedding is evaluated against two ICD-9 groupings: the pairs
    returned by ``get_icd9_pairs`` and the coarse pairs returned by
    ``get_coarse_icd9_pairs``.

    Args:
        embedding_list: Embeddings to evaluate, one entry per model.
        embedding_type_list: Type name for the embedding at the same index;
            compared case-insensitively against "cui", "word" and "bert".
        k: Forwarded unchanged to the ``mrm_ccs_*`` functions.
        check_intersection: When true, only CUIs listed in
            ``intersection.txt`` are considered; the file is created from
            ``get_intersection`` if it does not exist yet.

    Returns:
        A list of results in embedding order. "cui" and "word" embeddings
        contribute two results each (pairs, then coarse pairs); "bert"
        embeddings contribute four ("MEAN" pairs, "MEAN" coarse pairs,
        "CLS" pairs, "CLS" coarse pairs). Other types contribute nothing.
    """
    cui_to_icd9, icd9_to_cui = get_icd9_cui_mappings()

    if check_intersection:
        if not os.path.exists("intersection.txt"):
            intersection_cui = get_intersection(embedding_list,
                                                embedding_type_list)
            with open("intersection.txt", "w", encoding="utf-8") as f:
                for cui in intersection_cui:
                    f.write(cui.strip() + "\n")
        else:
            with open("intersection.txt", "r", encoding="utf-8") as f:
                intersection_cui = [line.strip() for line in f]

    umls = UMLS("../../umls", only_load_dict=True)

    if check_intersection:
        # Test membership on the mapping itself; building a fresh key list
        # for every candidate CUI made this filter quadratic.
        cui_list = [cui for cui in intersection_cui if cui in cui_to_icd9]
    else:
        cui_list = list(cui_to_icd9.keys())

    icd9_list = [cui_to_icd9[cui] for cui in cui_list]
    icd9_set = set(icd9_list)
    icd9_pair = get_icd9_pairs(icd9_set)
    icd9_coarse_pair = get_coarse_icd9_pairs(icd9_set)
    icd9_to_description = get_icd9_to_description()

    # 1 when the code appears in the corresponding pair collection, else 0.
    pair_center_label = [1 if icd9 in icd9_pair else 0 for icd9 in icd9_list]
    coarse_pair_center_label = [
        1 if icd9 in icd9_coarse_pair else 0 for icd9 in icd9_list
    ]

    # Pick a text description per CUI, trying in order: the ICD-9
    # description table, the ICD-9 tree node, then any UMLS string.
    # Every CUI in cui_list is a key of cui_to_icd9 (see the filter above),
    # so its ICD-9 code can be taken straight from icd9_list.
    # NOTE(review): `tree` is not defined inside this function; presumably a
    # module-level object -- confirm it is in scope where this is defined.
    description = []
    for cui, icd9 in zip(cui_list, icd9_list):
        if icd9 in icd9_to_description:
            description.append(icd9_to_description[icd9])
            continue
        node = tree.find(icd9)
        if node:
            description.append(node.description)
        elif cui in umls.cui2str:
            description.append(list(umls.cui2str[cui])[0])
        else:
            description.append("")
            print(f"Can not find description for {cui}")

    # (center labels, pair collection) for the fine and the coarse grouping.
    groupings = (
        (pair_center_label, icd9_pair),
        (coarse_pair_center_label, icd9_coarse_pair),
    )

    opt = []
    for index, embedding in enumerate(embedding_list):
        print("*************************")
        kind = embedding_type_list[index].lower()
        if kind == "cui":
            for center_label, pairs in groupings:
                opt.append(
                    mrm_ccs_cui(embedding, icd9_list, cui_list, center_label,
                                pairs, k))
        elif kind == "word":
            for center_label, pairs in groupings:
                opt.append(
                    mrm_ccs_word(embedding, icd9_list, description,
                                 center_label, pairs, k))
        elif kind == "bert":
            for summary_method in ("MEAN", "CLS"):
                for center_label, pairs in groupings:
                    opt.append(
                        mrm_ccs_bert(embedding,
                                     icd9_list,
                                     description,
                                     center_label,
                                     pairs,
                                     k,
                                     summary_method=summary_method))
    return opt
Exemplo n.º 5
0
    ]
    train_input_1 = [
        cui2id.get(cui, use_embedding_count - 1) for cui in cui_train_1
    ]
    train_y = [rel2id[rel] for rel in rel_train]
    test_input_0 = [
        cui2id.get(cui, use_embedding_count - 1) for cui in cui_test_0
    ]
    test_input_1 = [
        cui2id.get(cui, use_embedding_count - 1) for cui in cui_test_1
    ]
    test_y = [rel2id[rel] for rel in rel_test]

# Map every CUI to a display string. Only needed for non-CUI embeddings.
if embedding_type != 'cui':
    umls = UMLS("../../umls", only_load_dict=True)
    cui2str = {}
    for cui in cui_set:
        standard_term = umls.search(code=cui, max_number=1)
        # Fall back to the CUI itself when the search returns None.
        cui2str[cui] = cui if standard_term is None else standard_term[0]

# Deal word type embedding
if embedding_type == 'word':

    # tokenize
    from nltk.tokenize import word_tokenize
    cui2tokenize = {}
Exemplo n.º 6
0
def mrm_ndfrt(embedding_list, embedding_type_list, concept_filename, k=40, check_intersection=True):
    """Run the ``mrm_ndfrt_*`` evaluation for every embedding.

    Args:
        embedding_list: Embeddings to evaluate, one entry per model. For
            "bert" entries the value is also used as a directory path when
            looking for ``run/1000000/rel embedding``.
        embedding_type_list: Type name for the embedding at the same index;
            compared case-insensitively against "cui", "word" and "bert".
        concept_filename: Passed to ``get_drug_diseases_to_check``. If it
            contains "treat" the relation-specific method is "may_treat",
            otherwise "may_prevent".
        k: Forwarded unchanged to the ``mrm_ndfrt_*`` functions.
        check_intersection: When true, only CUIs listed in
            ``intersection.txt`` are considered; the file is created from
            ``get_intersection`` if it does not exist yet.

    Returns:
        A list of results in embedding order. "cui" and "word" embeddings
        contribute two results each ("origin", then "all"). "bert"
        embeddings contribute "origin", then the relation-specific method
        if the ``rel embedding`` path exists, then "all", all with
        ``summary_method="CLS"``. Other types contribute nothing.
    """
    if check_intersection:
        if not os.path.exists("intersection.txt"):
            intersection_cui = get_intersection(
                embedding_list, embedding_type_list)
            with open("intersection.txt", "w", encoding="utf-8") as f:
                for cui in intersection_cui:
                    f.write(cui.strip() + "\n")
        else:
            with open("intersection.txt", "r", encoding="utf-8") as f:
                intersection_cui = [line.strip() for line in f]

    query_to_targets, cui_list = get_drug_diseases_to_check(concept_filename)
    umls = UMLS("../../umls", only_load_dict=True)

    if check_intersection:
        # Build the lookup set once; scanning the cached list for every
        # candidate CUI was quadratic.
        allowed_cui = set(intersection_cui)
        cui_list = [cui for cui in cui_list if cui in allowed_cui]

    opt = []
    for index, embedding in enumerate(embedding_list):
        kind = embedding_type_list[index].lower()
        if kind == "cui":
            opt.append(mrm_ndfrt_cui(embedding, umls, cui_list, query_to_targets, k, "origin"))
            opt.append(mrm_ndfrt_cui(embedding, umls, cui_list, query_to_targets, k, "all"))
        elif kind == "word":
            opt.append(mrm_ndfrt_word(embedding, umls, cui_list, query_to_targets, k, "origin"))
            opt.append(mrm_ndfrt_word(embedding, umls, cui_list, query_to_targets, k, "all"))
        elif kind == "bert":
            opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                      query_to_targets, k, "origin", summary_method="CLS"))
            # The relation-specific run only applies to models that ship a
            # relation embedding next to the checkpoint.
            beta_path = os.path.join(embedding, "run", "1000000", "rel embedding")
            if os.path.exists(beta_path):
                if 'treat' in concept_filename:
                    method = "may_treat"
                else:
                    method = "may_prevent"
                opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                          query_to_targets, k, method, summary_method="CLS"))
            opt.append(mrm_ndfrt_bert(embedding, umls, cui_list,
                                      query_to_targets, k, "all", summary_method="CLS"))

    return opt
from ndfrt_analysis import get_drug_diseases_to_check
import sys
sys.path.append("../../pretrain")
from load_umls import UMLS

# Load both drug/disease mappings. Only the may-treat mapping is read in
# the statements below; the may-prevent one is loaded but not used here.
query_to_targets = get_drug_diseases_to_check("may_treat_cui.txt")
query_to_targets_1 = get_drug_diseases_to_check("may_prevent_cui.txt")

# Every CUI that occurs in the may-treat mapping, as a query or a target.
cui_set = set()
for query, targets in query_to_targets.items():
    cui_set.add(query)
    cui_set.update(targets)
print(len(cui_set))
umls = UMLS("../../umls", source_range='SNOMEDCT_US')

# How many of those CUIs have a semantic type, and which types occur.
typed_cuis = [cui for cui in cui_set if cui in umls.cui2sty]
sty_set = {umls.cui2sty[cui] for cui in typed_cuis}
count = len(typed_cuis)
print(count)
print(len(sty_set))
print(sty_set)

# How many CUIs in all of the loaded UMLS data share one of those types.
count = sum(1 for cui in umls.cui2sty if umls.cui2sty[cui] in sty_set)
print(count)