Example #1
def add_oov_processed():
    code_cuis = read.read_from_tsv(
        "data/AskAPatient/codes_single_synonyms_tsv.tsv")
    code_cuis_dict = {
        line[0]: line[2]
        for line in code_cuis if len(line[3]) > 2
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")

    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")

    codes_synonyms_tsv = {}
    codes_st_tsv = []

    for code in ask:
        code_st_tsv = [code, code_labels[code]]
        if code in code_cuis_dict:
            cui = code_cuis_dict[code]
            synonym = list(set(cui_synonyms[cui]))
            code_st_tsv += [
                cui, " [SEP] ".join(synonym)[:100], cui_st[cui]
            ]
        else:
            # codes without a mapped CUI fall back to their own label text
            synonym = code_labels[code]

        codes_synonyms_tsv[code] = synonym
        codes_st_tsv.append(code_st_tsv)

    read.save_in_json("data/AskAPatient/code_dict_complete",
                      codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)
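Every example here calls a small I/O helper module, imported elsewhere as `import read_files as read`, whose implementation is not shown. The sketch below is a hypothetical reconstruction of those helpers, inferred only from how they are called in these examples (in particular, the assumption that JSON paths are passed without their ".json" extension); it is not the project's actual read_files module.

# read_files.py (assumed) -- minimal helpers matching the call sites in these examples
import csv
import json
import os


def create_folder(file_path):
    # make sure the parent directory of file_path exists
    os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)


def read_from_json(file_path):
    # assumption: callers pass the path without the ".json" extension
    with open(file_path + ".json", "r", encoding="utf-8") as f:
        return json.load(f)


def save_in_json(file_path, data):
    create_folder(file_path)
    with open(file_path + ".json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def read_from_tsv(file_path):
    # returns a list of rows, each row a list of column strings
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        return list(csv.reader(f, delimiter="\t"))


def save_in_tsv(file_path, rows):
    create_folder(file_path)
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, delimiter="\t").writerows(rows)


def readfrom_txt(file_path):
    # returns the whole file as one string; callers iterate with .splitlines()
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()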
Example #2
def add_oov():
    code_cuis = read.read_from_tsv("data/AskAPatient/code_cuis.tsv")
    code_cuis_dict = {
        line[0]: line[1:]
        for line in code_cuis if len(line[:-1]) > 0
    }
    cui_synonyms = read.read_from_json("data/AskAPatient/cui_dict")
    cui_st = read.read_from_json("data/AskAPatient/cui_st_dict")

    code_labels = read.read_from_json(
        "data/AskAPatient/label_texts_dict_AskAPatient")

    codes_synonyms_tsv = []
    codes_st_tsv = []

    for code in ask:
        code_synonyms_tsv = [code, code_labels[code]]
        code_st_tsv = [code, code_labels[code]]
        if code in code_cuis_dict:
            cuis = code_cuis_dict[code]
            for cui in cuis:
                code_synonyms_tsv += [
                    cui, " [SEP] ".join(cui_synonyms[cui])[:100]
                ]
                code_st_tsv += [
                    cui, " [SEP] ".join(cui_synonyms[cui])[:100],
                    cui_st[cui]
                ]
        codes_synonyms_tsv.append(code_synonyms_tsv)
        codes_st_tsv.append(code_st_tsv)

    read.save_in_tsv("data/AskAPatient/codes_synonyms_tsv.tsv",
                     codes_synonyms_tsv)
    read.save_in_tsv("data/AskAPatient/codes_st_tsv.tsv", codes_st_tsv)
Example #3
def analyze_patient_admission_date():
    patient_ids = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/patient_id"))

    patient_admission_time = read.read_from_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/allnotes_row_patient_admission_time"))
    patient_admission_time_formatted = {}
    for key, value in patient_admission_time.items():
        time_items = value[0].split("-")
        datetime_new = date(int(time_items[0]), int(time_items[1]),
                            int(time_items[2]))
        patient_admission_time_formatted[key] = datetime_new

    suicide_patient_admission_time = read.read_from_json(
        os.path.join(
            cache_folder,
            "suicide_patient_id/suicidalnotes_patient_admission_time"))
    suicide_patient_admission_time = get_duration(
        suicide_patient_admission_time)

    patient_admission_before = {}
    patient_admission_meanwhile = {}
    patient_admission_after = {}
    for patient_id in patient_ids:

        suicide_admission_time = suicide_patient_admission_time[patient_id]
        patient_admissions = get_patient_for_admission(
            patient_admission_time_formatted, patient_id)

        for key, admission_time in patient_admissions.items():
            row_id, _, _ = key.split("_")
            if admission_time < suicide_admission_time[0]:
                add_key_dict(patient_admission_before, patient_id, row_id)
            elif admission_time > suicide_admission_time[1]:
                add_key_dict(patient_admission_after, patient_id, row_id)
            else:
                add_key_dict(patient_admission_meanwhile, patient_id, row_id)

    print(len(patient_admission_before))
    #print(patient_admission_before)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_before_patient_admission"),
        patient_admission_before)
    print(len(patient_admission_after))
    # print(patient_admission_after)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_after_patient_admission"),
        patient_admission_after)
    print(len(patient_admission_meanwhile))
    #print(patient_admission_meanwhile)
    read.save_in_json(
        os.path.join(cache_folder,
                     "suicide_patient_id/suicide_meanwhile_patient_admission"),
        patient_admission_meanwhile)
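analyze_patient_admission_date relies on three helpers that are not shown: get_duration, get_patient_for_admission, and add_key_dict. The sketch below is a rough guess at their behavior, inferred only from the call sites above; the "<row_id>_<patient_id>_..." key layout and the date format are assumptions, not taken from the original code.

from datetime import date


def get_duration(patient_admission_times):
    # assumed: collapse each patient's admission dates into a (first, last) pair
    durations = {}
    for patient_id, values in patient_admission_times.items():
        dates = []
        for value in values:
            time_items = value.split("-")
            dates.append(
                date(int(time_items[0]), int(time_items[1]), int(time_items[2])))
        durations[patient_id] = (min(dates), max(dates))
    return durations


def get_patient_for_admission(admission_time_formatted, patient_id):
    # assumed key layout: "<row_id>_<patient_id>_<...>"; keep this patient's rows
    return {
        key: admission_time
        for key, admission_time in admission_time_formatted.items()
        if key.split("_")[1] == patient_id
    }


def add_key_dict(target, patient_id, row_id):
    # append row_id to the list of note row ids stored under patient_id
    target.setdefault(patient_id, []).append(row_id)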
Example #4
def main(syn_path, cui_path, cui_idx_path, file_name):
    embeddings = np.load(syn_path)
    cuis = read.read_from_json(cui_path)
    cui_idx = read.read_from_json(cui_idx_path)
    avg = []
    for cui in cuis:
        s, e = cui_idx[cui]
        embedding_syn = embeddings[s:e]
        avg.append(np.mean(embedding_syn, axis=0))
    avg = np.asarray(avg)

    read.create_folder(file_name)
    np.save(file_name, avg)
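A call to the averaging function above might look like the following; all four paths are placeholders for illustration, not files from the repository.

# hypothetical invocation: average precomputed synonym embeddings per CUI
main(
    syn_path="data/ontology/synonym_embeddings.npy",   # (num_synonyms, dim) array
    cui_path="data/ontology/cuis",                      # JSON list of CUIs
    cui_idx_path="data/ontology/cui_idx",               # JSON {cui: [start, end]}
    file_name="data/ontology/cui_avg_embeddings.npy",   # output .npy path
)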
def analyze():
    rxnorm_term = read.read_from_tsv("data/umls/all_rxnorm_suppress.tsv")
    snomed_term = read.read_from_tsv("data/umls/all_snowmed_suppress.tsv")
    rxnorm_term = list(set([item[0] for item in rxnorm_term]))
    snomed_term = list(set([item[0] for item in snomed_term]))

    print(len(rxnorm_term))
    print(len(snomed_term))

    rxnorm_term1 = read.read_from_json("data/umls/rxnorm_dict")
    snomed_term1 = read.read_from_json("data/umls/snomed_dict")

    print(len(rxnorm_term1))
    print(len(snomed_term1))
Example #6
def cui_labels():
    code_cuis = {}
    cui_infos = read.read_from_json("data/SMM4H/synonyms")
    print(len(cui_infos))
    for line in cui_infos:
        code = line[10]
        cui = line[0]
        if line[12] == "PT":
            if code in code_cuis:
                code_cuis[code] += [cui]
            else:
                code_cuis[code] = [cui]
    print(len(code_cuis))

    extra = {
        "10012259": ["C0011253"],
        "10024130": ["C0023222"],
        "10000497": ["C0702166"],
        "1002243744151": ["C0917801"],
        "10040991": ["C0851578"],
        "10007541": ["C0018799"],
        "10027433": ["C0851358"],
        "10014698": ["C0014130"],
        "10044027": ["C0011334 "],
        "10013663": ["C1510472"],
        "MEDDRA PT": ["MEDDRA PT"]
    }

    code_cuis.update(extra)
    for code, cuis in code_cuis.items():
        if len(cuis) > 1:
            print(code)
    read.save_in_json("data/SMM4H/code_cuis", code_cuis)
    read.save_in_json("data/SMM4H/label", list(code_cuis.keys()))
def main(mention_embeddings_path, synonym_ebmedding_path,
         concept_synonym_idx_path, top_k, concept_pre_path,
         concept_score_pre_path):

    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_ebmedding_path)

    concept_synonym_idx = read.read_from_json(concept_synonym_idx_path)
    id2concept = {
        int(i): cui
        for cui, item in concept_synonym_idx.items()
        for i in range(int(item[0]), int(item[1]))
    }

    similarity_matrix = cosine_similarity(query, documents)
    similarity_matrix = similarity_matrix.astype(np.float16)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    idx = idx[:, ::-1][:, :top_k]

    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[id2concept[item] for item in row] for row in idx]

    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
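The ranking step above sorts each row of the mention-to-synonym similarity matrix in ascending order, reverses it, and keeps the first top_k columns; id2concept then maps each retained synonym row index back to its CUI via the [start, end) spans in concept_synonym_idx. The toy snippet below, on made-up data rather than the project's embeddings, illustrates the same indexing pattern.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# toy data: 2 "mention" vectors and 4 "synonym" vectors (values are made up)
query = np.array([[1.0, 0.0], [0.0, 1.0]])
documents = np.array([[1.0, 0.1], [0.9, 0.2], [0.1, 1.0], [0.5, 0.5]])

similarity_matrix = cosine_similarity(query, documents)  # shape (2, 4)
top_k = 2
# argsort is ascending, so reverse each row and keep the first top_k columns
idx = np.argsort(similarity_matrix)[:, ::-1][:, :top_k]
scores = np.take_along_axis(similarity_matrix, idx, axis=1)
print(idx)     # per-mention indices of the top_k most similar synonym rows
print(scores)  # the corresponding cosine similarities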
def get_snomed_rxnorm_umls():

    rxnorm_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/rxnorm_dict")
    snomed_term = read.read_from_json(
        "/extra/dongfangxu9/umls/processed/snomed_dict")
    # use a set: membership is tested against every line of MRCONSO.RRF below
    cui_all = set(rxnorm_term) | set(snomed_term)

    cui_all_synonyms = {}
    # print(len(cui_all))
    data = read.readfrom_txt(
        "/extra/dongfangxu9/umls/umls_2017_subset/2017AB/META/MRCONSO.RRF")
    for line in data.splitlines():
        line_split = line.split('|')
        if line_split[0] in cui_all:
            cui_all_synonyms = add_dict(cui_all_synonyms, line_split[0],
                                        line_split)
Example #9
def textfile2list_smm4h_st():
    file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = [cui_list[0] for cui_list in code_cuis.values()]
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list[line[0]] = line[2]
    read.save_in_json("data/SMM4H/cui_st_dict", txt_list)
Example #10
def add_oov():
    cui_st = read.read_from_json("data/TwADR-L/cui_st_dict")
    cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict")
    ##### some CUIs are out of vocabulary #########
    cui_extra = [cui for cui in twa if cui not in twa_cuis]
    for cui in cui_extra:
        cui_st[cui] = cui_st[twa_cuis_dict[cui]]
        cui_synonyms[cui] = cui_synonyms[twa_cuis_dict[cui]]
    read.save_in_json("data/TwADR-L/cui_dict_complete", cui_synonyms)
    read.save_in_json("data/TwADR-L/cui_st_dict_complete", cui_st)


# add_oov()
# cui_synonyms = read.read_from_json("data/TwADR-L/cui_dict_complete")
# print(len(cui_synonyms))
# for cui in twa:
#     if cui not in cui_synonyms:
#         print(cui)

# print(len(cui_synonyms))
Example #11
def add_oov_smm4h():
    cui_st = read.read_from_json("data/SMM4H/cui_st_dict")
    cui_synonyms = read.read_from_json("data/SMM4H/cui_synonyms")

    cui_st["MEDDRA PT"] = ["df1"]
    cui_synonyms["MEDDRA PT"] = ["Extracted ADR"]

    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    codes = read.read_from_json("data/SMM4H/label")

    code_st = {}
    code_synonyms = {}

    for code in codes:
        cui = code_cuis[code][0]
        # this CUI is stored with a stray trailing space in code_cuis; normalize it
        if cui == 'C0011334 ':
            cui = 'C0011334'
        code_st[code] = cui_st[cui]
        code_synonyms[code] = cui_synonyms[cui]
    read.save_in_json("data/SMM4H/code_dict_complete", code_synonyms)
    read.save_in_json("data/SMM4H/code_st_dict_complete", code_st)
Example #12
def textfile2list_smm4h():
    code_cuis = read.read_from_json("data/SMM4H/code_cuis")
    cuis = [cui_list[0] for cui_list in code_cuis.values()]
    file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    data = read.readfrom_txt(file_path_synonym)
    txt_list = []
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in cuis:
            txt_list.append(line)

    read.save_in_json("data/SMM4H/synonyms_all", txt_list)
Example #13
def code_synonyms():
    txt_list = read.read_from_json("data/SMM4H/synonyms_all")
    code_synonyms = {}
    code_synonyms_new = {}
    for line in txt_list:
        if line[0] not in code_synonyms:
            code_synonyms[line[0]] = [line[14]]
        else:
            code_synonyms[line[0]] += [line[14]]

    for code, synonyms in code_synonyms.items():
        code_synonyms_new[code] = list(set(synonyms))
    read.save_in_json("data/SMM4H/cui_synonyms", code_synonyms_new)
Example #14
def suicide_meanwhile_notes(file_name):
    target_description = [
        "Nursing/other", "Nursing", "Physician", "Discharge summary",
        "Social Work", "General", "Nutrition", "Rehab Services",
        "Case Management", "Consult"
    ]

    suicide_meanwhile = read.read_from_json(
        os.path.join(cache_folder, "suicide_patient_id/" + file_name))
    # read the notes table once; the first row is the header, the rest are notes
    notes_rows = read.read_from_tsv(
        os.path.join(cache_folder, "suicide_patient_notes_all.tsv"))
    title = notes_rows[:1]
    patient_notes_all = notes_rows[1:]
    suicide_meanwhile_notes = title
    none_admission_id = []
    admission_id = []
    notes_all = []
    notes_all_subset = []
    # admission_id_new = read.read_from_json(os.path.join(cache_folder,"suicide_patient_id/admission_id"))

    for row in patient_notes_all:
        if row[1] in suicide_meanwhile:
            if row[0] in suicide_meanwhile[row[1]]:
                if row[2] == "":
                    # print(row)
                    none_admission_id.append(row[0])
                elif row[2] not in admission_id:
                    admission_id.append(row[2])
                if row[6] in target_description:
                    notes_all_subset.append(row[0])
                suicide_meanwhile_notes.append(row)
                notes_all.append(row[0])

    print("patients: ", len(suicide_meanwhile))
    print("admission with id: ", len(admission_id))
    print("admission without id: ", len(none_admission_id))
    print("all notes: ", len(notes_all))

    # for admission_id_1 in admission_id:
    #     if admission_id_1 not in admission_id_new:
    #         print(admission_id_1)

    read.save_in_tsv(
        os.path.join(output_folder,
                     "suicide_patient_id/" + file_name + ".tsv"),
        suicide_meanwhile_notes)
Example #15
def textfile2list_smm4h():
    file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
    smm4h = read.read_from_json("data/SMM4H/labels_ori")

    data = read.readfrom_txt(file_path)
    cuis_smm4h = []
    txt_list = []
    for line in data.splitlines():
        # if "MDR" in line:
        #     print(line)
        line = line.split('|')
        if "MDR" in line[11]:
            cuis_smm4h.append(line[0])
            txt_list.append(line)

    read.save_in_json("data/SMM4H/synonyms", txt_list)
    read.save_in_json("data/SMM4H/cuis", list(set(cuis_smm4h)))
Example #16
def main(mention_embeddings_path, synonym_ebmedding_path, concepts_path, top_k,
         concept_pre_path, concept_score_pre_path):

    query = np.load(mention_embeddings_path)
    documents = np.load(synonym_ebmedding_path)
    concepts = read.read_from_json(concepts_path)

    similarity_matrix = cosine_similarity(query, documents)
    idx = np.argsort(similarity_matrix)
    idx = idx.astype(np.int32)
    top_k = int(top_k)
    idx = idx[:, ::-1][:, :top_k]
    concept_score_pre = [
        row[idx[i]] for i, row in enumerate(similarity_matrix)
    ]
    concept_pre = [[concepts[int(item)] for item in row] for row in idx]

    read.save_in_json(concept_pre_path, concept_pre)
    np.save(concept_score_pre_path, concept_score_pre)
Example #17
def dev_evaluator():

    ontology = read.read_from_tsv("data/ontology/ontology_synonyms.tsv")

    cui_mention_idx = read.read_from_json(
        "data/ontology/ontology_concept_synonyms_idx")

    corpus = {"doc_" + str(id): item[0] for id, item in enumerate(ontology)}

    read.save_in_json("data/evaluator_path/corpus", corpus)

    doc_id2mesh_all = {}
    mesh2doc_id_all = {}
    for key, item in cui_mention_idx.items():
        doc_id2mesh = {"doc_" + str(id): key for id in range(item[0], item[1])}
        doc_id2mesh_all.update(doc_id2mesh)
        mesh2doc_id = {
            key: ["doc_" + str(id) for id in range(item[0], item[1])]
        }
        mesh2doc_id_all.update(mesh2doc_id)

    dev_input = read.read_from_tsv("data/input_raw/dev.tsv")
    mentions = [item[0] for item in dev_input]

    query = {
        "q_" + str(id): item[0]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }

    relevant_docs = {
        "q_" + str(id): mesh2doc_id_all[item[1]]
        for id, item in enumerate(dev_input) if item[1] != "CUI-less"
    }
    read.save_in_json("data/evaluator_path/dev_queries", query)
    read.save_in_json("data/evaluator_path/dev_relevant_docs", relevant_docs)

    for qid, item in query.items():

        text = [
            ontology[int(doc_id.split("_")[1])][0]
            for doc_id in relevant_docs[qid]
        ]
        print(item, text)
Example #18
import read_files as read

file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
ask = read.read_from_json("data/AskAPatient/label")
twa = read.read_from_json("data/TwADR-L/label")
twa_cuis_all = read.read_from_tsv("data/TwADR-L/cui_cuis - cui_cuis.tsv")
twa_cuis = [item[1] for item in twa_cuis_all]
twa_cuis_dict = {item[0]: item[1] for item in twa_cuis_all}


def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in twa_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]

    read.save_in_json("data/TwADR-L/cui_dict", txt_list)


# textfile2list_twa()


def textfile2list_twa_st():
    data = read.readfrom_txt(file_path_st)
    txt_list = {}
Example #19
import read_files as read

file_path = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
ask = read.read_from_json("data/AskAPatient/label")
twa = read.read_from_json("data/TwADR-L/label")


def textfile2list_twa():
    data = read.readfrom_txt(file_path)
    cuis_twa = []
    txt_list = []
    for line in data.splitlines():
        if "SNO" in line:
            print(line)
        line = line.split('|')
        if line[0] in twa:
            cuis_twa.append(line[0])
            txt_list.append(line)

    read.save_in_json("data/TwADR-L/synonyms", txt_list)
    read.save_in_json("data/TwADR-L/cuis", list(set(cuis_twa)))


# textfile2list_twa()
# print(len(twa))
# cuis = list(set(read.read_from_json("data/TwADR-L/cuis")))
# print(len(cuis))
# print([cui for cui in twa if cui not in cuis])


def textfile2list_ask():
Example #20
def model_training(
    train_data_path,
    evaluator_path,
    model_name,
    output_path,
    train_batch_size,
    num_epochs,
    samples_per_label,
):

    logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO,
        handlers=[LoggingHandler()],
    )

    output_path = (output_path + datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))

    os.makedirs(output_path, exist_ok=True)

    # You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    # model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/'

    ### Create a torch.DataLoader that passes training batch instances to our model

    logging.info("Loading training dataset")
    train_set = read_dataset(train_data_path)

    # Load pretrained model
    word_embedding_model = models.Transformer(model_name)
    # tokenizer_args={"additional_special_tokens": ['<e>', '</e>']})

    # word_embedding_model.auto_model.resize_token_embeddings(
    #     len(word_embedding_model.tokenizer))

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    # pooling_mode_mean_mark_tokens=True)

    # dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=2048, activation_function=nn.Tanh())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.max_seq_length = 16

    logging.info("Read concept normalization training dataset")

    #### try different sample size ####

    train_data_sampler = SentenceLabelDataset(
        examples=train_set, samples_per_label=samples_per_label)

    ##### Try whether to shuffle #####  By default, the data is not reshuffled every epoch

    train_dataloader = DataLoader(train_data_sampler,
                                  batch_size=train_batch_size,
                                  drop_last=True)

    ### Triplet losses ####################
    ### There are 4 triplet loss variants:
    ### - BatchHardTripletLoss
    ### - BatchHardSoftMarginTripletLoss
    ### - BatchSemiHardTripletLoss
    ### - BatchAllTripletLoss
    #######################################

    # train_loss = losses.BatchAllTripletLoss(model=model)
    #train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
    train_loss = losses.BatchHardSoftMarginTripletLoss(model)
    #train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

    # evaluator = []

    logging.info("Read concept normalization val dataset")

    ir_queries = read.read_from_json(
        os.path.join(evaluator_path, "dev_queries"))
    ir_corpus = read.read_from_json(os.path.join(evaluator_path, "corpus"))
    ir_relevant_docs = read.read_from_json(
        os.path.join(evaluator_path, "dev_relevant_docs"))
    ir_evaluator_n2c2_dev = evaluation.InformationRetrievalEvaluator(
        ir_queries,
        ir_corpus,
        ir_relevant_docs,
        corpus_chunk_size=300000,
        name="evaluation_results",
        map_at_k=[1, 3, 5, 10],
        batch_size=1024,
        show_progress_bar=True)

    # evaluator.append(ir_evaluator_n2c2_dev)
    # Create a SequentialEvaluator. This SequentialEvaluator runs all three evaluators in a sequential order.
    # We optimize the model with respect to the score from the last evaluator (scores[-1])
    # seq_evaluator = evaluation.SequentialEvaluator(evaluator, main_score_function=lambda scores: scores[1])

    logging.info("Performance before fine-tuning:")
    ir_evaluator_n2c2_dev(model)

    # warmup_steps = int(
    #     len(train_dataset) * num_epochs / train_batch_size * 0.1
    # )  # 10% of train data
    warmup_steps = 0

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        # evaluator = None,
        evaluator=ir_evaluator_n2c2_dev,
        output_path_ignore_not_empty=True,
        optimizer_params={
            'lr': 1e-4,
            'eps': 1e-6,
            'correct_bias': False
        },
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        output_path=output_path,
    )
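A call to model_training could look like the sketch below. The model name comes from the commented-out line above, and evaluator_path matches the folder written by dev_evaluator earlier; the remaining paths and hyperparameter values are placeholders, not the settings used in the original experiments.

# hypothetical invocation of model_training; paths and hyperparameters are illustrative only
model_training(
    train_data_path="data/input_raw/train.tsv",
    evaluator_path="data/evaluator_path",
    model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    output_path="output/concept_norm_triplet_",
    train_batch_size=64,
    num_epochs=3,
    samples_per_label=2,
)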
# features = "dev/all_withoutonto"

# feature1 = "all"
# features = "dev/all"

#data = "test/inputdata"
sentences, entity_list, nbio_entity_list, entity_pair_list = get_data(data)
connectors = [
    'at', 'in', 'on', 'of', 'has', 'have', 'with', 'without', 'contains',
    'contain'
]
part_of_knowledge = read.read_from_csv_ontology(part_of)
texts, vocab = get_sentence(sentences)
#print len(texts)
#read.save_in_json("dev/vocab",vocab)
vocab = read.read_from_json("dev/vocab")
new_vocab = [item for item, freq in vocab.items() if freq >= 9]

threshold = 4
#build_features(features,entity_list,nbio_entity_list,entity_pair_list,part_of_knowledge,texts,new_vocab,connectors,threshold)

#
from svmutil import *
y1, x1 = svm_read_problem('data/' + features + '.txt')

# model = svm_train(y1, x1,'-t 2 -b 1 -w1 7 -w-1 1 -h 0')  # -w1 8 -w-1 1')  -v 5
# svm_save_model('data/thomas/relation_extraction_'+feature1+'.model', model)

model = svm_load_model('data/dev/relation_extraction_' + feature1 + '.model')
import evaluate as test
test.evaluate(feature1, nbio_entity_list, entity_pair_list, model, y1, x1)
Example #22
import read_files as read

file_path_synonym = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRCONSO.RRF"
file_path_st = "/home/dongfang/umls_2017_AB_subset_test/2017AB/META/MRSTY.RRF"
ask = read.read_from_json("data/AskAPatient/label")

cui_codes = read.read_from_json("data/AskAPatient/cui_codes")
ask_cuis = list(cui_codes.keys())
print(len(cui_codes))


def textfile2list_twa():
    data = read.readfrom_txt(file_path_synonym)
    txt_list = {}
    for line in data.splitlines():
        line = line.split('|')
        if line[0] in ask_cuis:
            if line[0] not in txt_list:
                txt_list[line[0]] = [line[14]]
            else:
                txt_list[line[0]] += [line[14]]

    read.save_in_json("data/AskAPatient/cui_dict", txt_list)


# textfile2list_twa()


def textfile2list_twa_st():
    data = read.readfrom_txt(file_path_st)
    txt_list = {}