Example #1
import merpy


def process_list(data_path, list_name):
    """Build and process a MER lexicon from a plain-text term list.

    :param data_path: directory containing the term list file
    :param list_name: name of the .txt file (one term per line, or a single
        comma-separated line); also used as the lexicon name
    """
    with open(data_path + list_name + '.txt', 'r', encoding='utf-8') as new_list:
        elements_list = new_list.read().split('\n')

    # a single-line file holds its terms as comma-separated values instead
    if len(elements_list) == 1:
        elements_list = ''.join(elements_list).split(',')

    merpy.create_lexicon(elements_list, list_name)
    merpy.process_lexicon(list_name)
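
# Usage sketch (hypothetical path and file): build a lexicon named 'diseases'
# from './data/diseases.txt' and tag a sample sentence with it.
process_list('./data/', 'diseases')
print(merpy.get_entities("a sentence mentioning one of the listed terms", 'diseases'))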
Example #2
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 15:57:57 2020

@author: André
"""

import merpy
import pandas as pd

decs_data = pd.read_csv('../mesinesp_data/DeCS_data.tsv', sep='\t')

# map every Spanish term and each of its synonyms to its DeCS code
conv_dict = {}
for index, row in decs_data.iterrows():
    l_terms = str(row['Synonyms']).split('|')
    if row['Term_Spanish'] not in l_terms:
        l_terms.append(row['Term_Spanish'])

    for term in l_terms:
        conv_dict[term] = str(row['#DeCS_code'])

merpy.create_lexicon(conv_dict.keys(), "decslex")
merpy.create_mappings(conv_dict, "decslex")
merpy.show_lexicons()
merpy.process_lexicon("decslex")

#DEBUG
merpy.get_entities("lo nervio abducens es una aurelia aurita", "decslex")
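
# Note: merpy.get_entities returns a list of matches, each of the form
# [start_offset, end_offset, matched_text], with the mapped code appended as a
# fourth element when a mapping exists (see the len(entity) == 4 checks in the
# examples below).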
Example #3

import os

import merpy


def annotate_documents(task, subset, name_to_id, output_dir):
    """Recognise entities (NER) and link them to the respective CIE-O-3 code (Normalisation), if available, using merpy."""

    lexicon_name = "cieo3"
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    dataset_dir = ""

    if subset == "train":
        dataset_dir = "./data/datasets/train-set-to-publish/cantemist-norm/"

    elif subset == "dev1":
        dataset_dir = "./data/datasets/dev-set1-to-publish/cantemist-norm/"

    elif subset == "dev2":
        dataset_dir = "./data/datasets/dev-set2-to-publish/cantemist-norm/"

    elif subset == "test":
        dataset_dir = "./data/datasets/test-background-set-to-publish/"

    doc_count = 0
    doc_annot_ratio = 0.0
    doc_w_annotations_count = 0
    total_entity_count = 0
    linked_mentions = 0
    # the dataset dirs pair each .txt with an .ann file, hence the division by 2
    total_doc_count = int(len(os.listdir(dataset_dir)) / 2)

    for doc in os.listdir(dataset_dir):

        if doc.endswith(".txt"):

            doc_count += 1
            output_string = ""
            print("Annotating " + str(doc_count) + " of " +
                  str(total_doc_count) + " documents")

            with open(dataset_dir + doc, 'r') as input_file:
                text = input_file.read()
                doc_entity_count = 0

                entities = merpy.get_entities(text, lexicon_name)

                for entity in entities:

                    if entity != ['']:
                        total_entity_count += 1
                        doc_entity_count += 1

                        if len(entity) == 4:  # linked mention with a CIE-O-3 code
                            linked_mentions += 1
                            output_string += ("T" + str(doc_entity_count) +
                                              "\tMORFOLOGIA_NEOPLASIA " +
                                              entity[0] + " " + entity[1] +
                                              "\t" + entity[2] + "\n")

                            if task == "norm":
                                output_string += ("#" + str(doc_entity_count) +
                                                  "\tAnnotatorNotes T" +
                                                  str(doc_entity_count) +
                                                  "\t" + entity[3] + "\n")

                        elif len(entity) == 3:  # mention without a CIE-O-3 code
                            output_string += ("T" + str(doc_entity_count) +
                                              "\tMORFOLOGIA_NEOPLASIA " +
                                              entity[0] + " " + entity[1] +
                                              "\t" + entity[2] + "\n")

                            if task == "norm":
                                output_string += ("#" + str(doc_entity_count) +
                                                  "\tAnnotatorNotes T" +
                                                  str(doc_entity_count) +
                                                  "\tNA\n")

                if doc_entity_count > 0:
                    doc_w_annotations_count += 1

                else:  # no entities recognised in this document
                    output_string = "NA\tNA NA NA\tNA\n"

                    if task == "norm":
                        output_string += ("#" + str(doc_entity_count) +
                                          "\tAnnotatorNotes T" +
                                          str(doc_entity_count) + "\tNA\n")

            # output the annotations file
            output_filename = output_dir + doc[:-4] + ".ann"

            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        doc_annot_ratio = doc_w_annotations_count / total_doc_count
        mentions_ratio = total_entity_count / doc_w_annotations_count
        doc_linked_ratio = linked_mentions / doc_w_annotations_count
        linked_ratio = linked_mentions / total_entity_count

    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "TOTAL DOCUMENTS: " + str(total_doc_count) + "\n"
    output_str += "DOCS WITH ANNOTATIONS: " + str(
        doc_w_annotations_count) + "\n"
    output_str += "RATIO OF DOCS WITH ANNOTATIONS: " + str(
        doc_annot_ratio) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(total_entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(
        doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "./mer_annotations/" + task + "/" + task + "_" + subset + "_stats"

    with open(file_name, "w") as output:
        output.write(output_str)
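
# Usage sketch (hypothetical): name_to_id maps CIE-O-3 names to codes; a
# helper such as the load_cieo3 used in the build_relations_dict example
# further below could provide it.
# ontology_graph, name_to_id, synonym_to_id = load_cieo3()
# annotate_documents("norm", "train", name_to_id, "./mer_annotations/norm/")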
Example #4
import merpy

with open('../bioasq_data/mesh_terms_synonyms.txt',
          encoding='utf-8') as finput_terms:
    l_terms_syn = finput_terms.readlines()

dict_terms_synonyms = {}
for line in l_terms_syn:
    aux = line.split('\t')
    dict_terms_synonyms[aux[0]] = aux[1].replace('\n', '')

# map each term and its synonyms to a MeSH ID; dict_terms (term -> MeSH ID)
# is assumed to have been loaded earlier in the original script
conv_dict = {}
for key, values in dict_terms_synonyms.items():
    l_synonyms = values.split(',')
    if key not in l_synonyms:
        l_synonyms.append(key)

    for synonym in l_synonyms:
        conv_dict[synonym.strip()] = dict_terms.get(key)

merpy.create_lexicon(conv_dict.keys(), "meshlex")
merpy.create_mappings(conv_dict, "meshlex")
merpy.show_lexicons()
merpy.process_lexicon("meshlex")

#DEBUG
print(merpy.get_entities("I like abdominal injuries", "meshlex"))
print(merpy.get_entities("I like Calcimycin", "meshlex"))
print(
    merpy.get_entities(
        "I like Calcimycin it is a good aurelia aurita and Temefos is awesome! abate lowercase",
        "meshlex"))
Example #5
import merpy


def process_lexicons_4_mer():

    print("download latest obo files")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/doid.owl",
                           "do",
                           ltype="owl")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/go.owl",
                           "go",
                           ltype="owl")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/hp.owl",
                           "hpo",
                           ltype="owl")
    merpy.download_lexicon(
        "ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl",
        "chebi",
        ltype="owl")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/ncbitaxon.owl",
                           "taxon",
                           ltype="owl")
    merpy.download_lexicon(
        "https://raw.githubusercontent.com/CIDO-ontology/cido/master/src/ontology/cido.owl",
        "cido",
        ltype="owl")

    print("process lexicons")
    merpy.process_lexicon("do", ltype="owl")
    merpy.process_lexicon("go", ltype="owl")
    merpy.process_lexicon("hpo", ltype="owl")
    merpy.process_lexicon("chebi", ltype="owl")
    merpy.process_lexicon("taxon", ltype="owl")
    merpy.process_lexicon("cido", ltype="owl")

    # Delete obsolete entities
    merpy.delete_obsolete("do")
    merpy.delete_obsolete("go")
    merpy.delete_obsolete("hpo")
    merpy.delete_obsolete("chebi")
    merpy.delete_obsolete("taxon")
    merpy.delete_obsolete("cido")

    merpy.delete_entity("protein", "chebi")
    merpy.delete_entity("protein", "cido")
    merpy.delete_entity("protein", "hpo")
    merpy.delete_entity("polypeptide chain", "chebi")
    merpy.delete_entity("data", "taxon")
    merpy.delete_entity("one", "chebi")
    merpy.delete_entity_by_uri("http://purl.obolibrary.org/obo/PATO_0000070",
                               "hpo")

    # Create and process English vocabularies
    #lexicon_name = "medic"
    #medic_name_to_id = load_ctd_vocabularies("CTD_diseases.tsv")
    #merpy.create_lexicon(medic_name_to_id.keys(), lexicon_name)
    #merpy.create_mappings(medic_name_to_id, lexicon_name)
    #merpy.process_lexicon(lexicon_name)

    #lexicon_name = "ctdChemicals"
    #chemicals_name_to_id = load_ctd_vocabularies("CTD_chemicals.tsv")
    #merpy.create_lexicon(chemicals_name_to_id.keys(), lexicon_name)
    #merpy.create_mappings(chemicals_name_to_id, lexicon_name)
    #merpy.process_lexicon(lexicon_name)

    #lexicon_name = "ctdAnatomy"
    #anatomy_name_to_id = load_ctd_vocabularies("CTD_anatomy.tsv")
    #merpy.create_lexicon(anatomy_name_to_id.keys(), lexicon_name)
    #merpy.create_mappings(anatomy_name_to_id, lexicon_name)
    #merpy.process_lexicon(lexicon_name)

    # Create and process English DeCS
    lexicon_name = "decsEN"
    name_to_id_en = load_decs_xml("en")
    merpy.create_lexicon(name_to_id_en.keys(), lexicon_name)
    merpy.create_mappings(name_to_id_en, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create and process Spanish DeCS
    lexicon_name = "decsSPA"
    name_to_id_spa = load_decs_xml("spa")
    merpy.create_lexicon(name_to_id_spa.keys(), lexicon_name)
    merpy.create_mappings(name_to_id_spa, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create and process Portuguese DeCS
    lexicon_name = "decsPT"
    name_to_id_pt = load_decs_xml("pt")
    merpy.create_lexicon(name_to_id_pt.keys(), lexicon_name)
    merpy.create_mappings(name_to_id_pt, lexicon_name)
    merpy.process_lexicon(lexicon_name)
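
# Usage sketch: once processed, any of the lexicons above can be queried
# directly (results depend on the downloaded ontology versions):
# process_lexicons_4_mer()
# print(merpy.get_entities("the patient shows signs of respiratory disease", "do"))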
Example #6

# Assumed from the original script: os, json and merpy imports, the Sentence and
# split_single sentence-segmentation helpers (e.g. flair.data.Sentence and
# segtok.segmenter.split_single), and the load_cieo3, load_spanish_icd10cm and
# load_es_decs vocabulary loaders.
def build_relations_dict():
    """Iterate over all sentences in the train and dev sets, recognise CIE-O-3, ICD10-CM and DeCS entities,
        and establish a relation between entities occurring in the same sentence.

    Ensures:
        ES ICD10-CM <-> CIE-O-3 relations stored in file './tmp/relations_cieo3_icd10cm.json' and
        ES DeCS <-> CIE-O-3 relations stored in file './tmp/relations_cieo3_esdecs.json'
    """

    # Create the CIE-O-3 lexicon
    lexicon_name = "cieo3"
    ontology_graph, name_to_id, synonym_to_id = load_cieo3()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create the ICD10-CM lexicon
    lexicon_name = "icd10cmes"
    ontology_graph, name_to_id = load_spanish_icd10cm()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create the Spanish DeCS lexicon
    lexicon_name = "es_decs"
    ontology_graph, name_to_id, synonym_to_id = load_es_decs()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    filenames_1 = [
        "./data/datasets/train-set-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/train-set-to-publish/cantemist-norm/")
    ]
    filenames_2 = [
        "./data/datasets/dev-set1-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/dev-set1-to-publish/cantemist-norm/")
    ]
    filenames_3 = [
        "./data/datasets/dev-set2-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/dev-set2-to-publish/cantemist-norm/")
    ]
    filenames_4 = [
        "./data/datasets/test-background-set-to-publish/" + input_file
        for input_file in os.listdir(
            "./data/datasets/test-background-set-to-publish/")
    ]

    filenames = filenames_1 + filenames_2 + filenames_3  # + filenames_4

    relations_1, relations_2 = dict(), dict()
    doc_count = 0

    for doc in filenames:

        if doc.endswith(".txt"):
            # if doc == "cc_onco1016.txt":  # debug: restrict to a single document
            doc_count += 1
            print("DOC_COUNT:", doc_count)
            with open(doc, 'r') as doc_file:
                text = doc_file.read()

            sentences = [Sentence(sent) for sent in split_single(text)]

            for sentence in sentences:
                sent_text = sentence.to_original_text()
                cieo3_entities = merpy.get_entities(sent_text, "cieo3")
                icd10cm_entities = merpy.get_entities(sent_text, "icd10cmes")
                es_decs_entities = merpy.get_entities(sent_text, "es_decs")

                if icd10cm_entities != [['']] and cieo3_entities != [['']]:
                    # only linked mentions carry a code as their fourth element
                    icd10cm_codes = [
                        entity[3] for entity in icd10cm_entities
                        if len(entity) == 4
                    ]
                    cieo3_codes = [
                        entity[3] for entity in cieo3_entities
                        if len(entity) == 4
                    ]

                    for code in cieo3_codes:

                        if code in relations_1:
                            relations_1[code].extend(icd10cm_codes)
                        else:
                            relations_1[code] = icd10cm_codes

                if es_decs_entities != [['']] and cieo3_entities != [['']]:
                    es_decs_codes = [
                        entity[3] for entity in es_decs_entities
                        if len(entity) == 4
                    ]
                    cieo3_codes = [
                        entity[3] for entity in cieo3_entities
                        if len(entity) == 4
                    ]

                    for code in cieo3_codes:

                        if code in relations_2:
                            relations_2[code].extend(es_decs_codes)
                        else:
                            relations_2[code] = es_decs_codes

    # Output the relations into JSON files
    with open("./tmp/relations_cieo3_icd10cm.json", 'w') as d_file:
        d_file.write(json.dumps(relations_1))

    with open("./tmp/relations_cieo3_esdecs.json", 'w') as b_file:
        b_file.write(json.dumps(relations_2))
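
# Usage sketch: after running build_relations_dict(), the stored relations can
# be reloaded to inspect which ICD10-CM codes co-occur with each CIE-O-3 code:
# with open("./tmp/relations_cieo3_icd10cm.json") as relations_file:
#     relations = json.load(relations_file)  # CIE-O-3 code -> ICD10-CM codes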
Example #7
import os

import merpy


def annotate_documents(language, name_to_id):
    """
    Recognise entities (Named Entity Recognition) and link them to the respective ICD10-CM code (Named Entity Linking), if available.

    Requires:
        language: str, "pt", "en" or "es", for Portuguese, English or Spanish, respectively

    Ensures:
        for each abstract in the 'scielo_abstracts' dir, creates an annotation file in the 'mer_annotations' dir and an overall statistics file about the annotation process
    """

    lexicon_name = "icd10cm_" + language
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    abstracts_dir = "./scielo_abstracts/"
    doc_w_ann_count = 0
    entity_count = 0
    linked_mentions = 0

    for abstract in os.listdir(abstracts_dir):

        if abstract[-2:] == language:
            output_string = ""

            with open(abstracts_dir + abstract, 'r') as input_file:
                text = input_file.read()
                document_ent_count = 0

                entities = merpy.get_entities(text, lexicon_name)

                for entity in entities:

                    if entity != ['']:
                        entity_count += 1
                        document_ent_count += 1

                        if len(entity) == 4:  # linked mention with an ICD10-CM code
                            linked_mentions += 1
                            output_string += ("T" + str(document_ent_count) +
                                              "\t" + entity[0] + " " +
                                              entity[1] + "\t" + entity[2] +
                                              "\t" + entity[3] + "\n")

                        elif len(entity) == 3:  # mention without an ICD10-CM code
                            output_string += ("T" + str(document_ent_count) +
                                              "\t" + entity[0] + " " +
                                              entity[1] + "\t" + entity[2] +
                                              "\n")

                if document_ent_count > 0:
                    doc_w_ann_count += 1

            output_filename = "./mer_annotations/" + language + "/" + abstract + ".ann"

            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        mentions_ratio = entity_count / doc_w_ann_count
        doc_linked_ratio = linked_mentions / doc_w_ann_count
        linked_ratio = linked_mentions / entity_count

    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "DOCUMENTS WITH ANNOTATIONS: " + str(doc_w_ann_count) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(
        doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "mer_annotation_stats_" + language

    with open(file_name, "w") as output:
        output.write(output_str)
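
# Usage sketch (hypothetical): name_to_id maps ICD10-CM names in the chosen
# language to their codes, e.g. built from an ICD10-CM release file:
# annotate_documents("es", name_to_id)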
Example #8
missing_texts = 0
total_entities = 0
total_sents = 0
total_docs = 0
# can be parallelized
for pmid in document_entities:
    if pmid == "":
        continue
    if pmid not in pmid_to_abst:
        print("missing this abstract:", pmid)
        #import pdb; pdb.set_trace()
        missing_texts += 1
        continue
    total_docs += 1
    # nlp: spaCy pipeline assumed to be loaded earlier in the original script
    doc = nlp(pmid_to_abst[pmid])
    # build and process a per-document lexicon from this abstract's entities
    merpy.create_lexicon(document_entities[pmid], "biomarker" + pmid)
    merpy.process_lexicon("biomarker" + pmid)
    doc_entities = merpy.get_entities(pmid_to_abst[pmid], "biomarker" + pmid)
    entity_spans = []
    for e in doc_entities:
        try:
            int(e[0]), int(e[1])
        except ValueError:
            print("ERROR", e)
            continue
        # char_span returns None when offsets do not align with token boundaries
        entity_spans.append(doc.char_span(int(e[0]), int(e[1]), label="GPE"))
    entity_spans = [e for e in entity_spans if e is not None]
    try:
        doc.ents = entity_spans[:]
    except ValueError:
        # overlapping spans cannot be assigned to doc.ents
        import pdb
        pdb.set_trace()
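    # Sketch of the per-document tally that plausibly follows (the snippet is
    # truncated here); assumes sentence boundaries are set by the nlp pipeline:
    # total_sents += len(list(doc.sents))
    # total_entities += len(entity_spans)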
Example #9
                pmid_to_abst[values[0]] = values[1].strip().lower()
            else:
                pmid_to_abst[values[0]] += ". " + values[1].strip().lower()

missing_pmids = []
all_entities = set()
with open("publications_compounds.txt") as compounds_file:
    next(compounds_file)  # skip the header line
    for line in compounds_file:
        values = line.strip().split("\t")
        pmid = values[1]
        all_entities.add(values[3].lower())
        if pmid not in pmid_to_abst:
            missing_pmids.append(pmid)

merpy.create_lexicon(all_entities, "biomarkers")
merpy.process_lexicon("biomarkers")

# recover missing pmids via PubMed queries:
titles, abstracts = query_pubmed.get_titles_abstracts(missing_pmids)
for t in titles:
    pmid_to_abst[t[0]] = t[1].lower()
for a in abstracts:
    if a[0] not in pmid_to_abst:
        pmid_to_abst[a[0]] = a[1].lower()
    else:
        pmid_to_abst[a[0]] += ". " + a[1].lower()

missing_texts = 0
total_entities = 0
total_sents = 0