import merpy


def process_list(data_path, list_name):
    """Create and process a MER lexicon from a text file of elements.

    :param data_path: path to the directory containing the lexicon file
    :param list_name: name of the lexicon file, without the '.txt' extension
    """
    with open(data_path + list_name + '.txt', 'r', encoding='utf-8') as new_list:
        elements_list = new_list.read().split('\n')

    # Single-line files hold their elements as one comma-separated string
    if len(elements_list) == 1:
        elements_list = ''.join(elements_list)
        elements_list = elements_list.split(',')

    merpy.create_lexicon(elements_list, list_name)
    merpy.process_lexicon(list_name)
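# A minimal usage sketch (the directory and file name below are hypothetical;
# this example expects a file './data/diseases.txt' with one element per line,
# or a single comma-separated line):
#
#   process_list('./data/', 'diseases')
#   print(merpy.get_entities('texto de ejemplo con alguna enfermedad', 'diseases'))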
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 15:57:57 2020

@author: André
"""
import merpy
import pandas as pd

decs_data = pd.read_csv('../mesinesp_data/DeCS_data.tsv', sep='\t')

# Map each Spanish term and every one of its synonyms to its DeCS code
conv_dict = {}
for index, row in decs_data.iterrows():
    l_terms = str(row['Synonyms']).split('|')
    if row['Term_Spanish'] not in l_terms:
        l_terms.append(row['Term_Spanish'])
    for i in l_terms:
        conv_dict[i] = str(row['#DeCS_code'])

merpy.create_lexicon(conv_dict.keys(), "decslex")
merpy.create_mappings(conv_dict, "decslex")
merpy.show_lexicons()
merpy.process_lexicon("decslex")

#DEBUG
print(merpy.get_entities("lo nervio abducens es una aurelia aurita", "decslex"))
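# For reference, merpy.get_entities returns one list per match, which is how
# the annotation functions below parse it: [start_offset, end_offset,
# matched_text] for a plain match, plus a fourth element with the mapped code
# when create_mappings was used; offsets come back as strings, and no matches
# yield [['']]. A hypothetical output for the DEBUG sentence above (the code
# value is illustrative, not from a real run):
#
#   [['3', '18', 'nervio abducens', '29789']]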
import os

import merpy


def annotate_documents(task, subset, name_to_id, output_dir):
    """Recognise entities (NER) with MERPY and link them to the respective
    CIE-O-3 code (normalisation), if available."""

    lexicon_name = "cieo3"
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    dataset_dir = ""
    if subset == "train":
        dataset_dir = "./data/datasets/train-set-to-publish/cantemist-norm/"
    elif subset == "dev1":
        dataset_dir = "./data/datasets/dev-set1-to-publish/cantemist-norm/"
    elif subset == "dev2":
        dataset_dir = "./data/datasets/dev-set2-to-publish/cantemist-norm/"
    elif subset == "test":
        dataset_dir = "./data/datasets/test-background-set-to-publish/"

    doc_count = 0
    doc_annot_ratio = 0
    doc_w_annotations_count = 0
    total_entity_count = 0
    linked_mentions = 0

    # Each annotated document has a .txt and an .ann file, hence the division
    total_doc_count = int(len(os.listdir(dataset_dir)) / 2)

    for doc in os.listdir(dataset_dir):
        if doc[-3:] == "txt":
            doc_count += 1
            output_string = ""
            print("Annotating " + str(doc_count) + " of "
                  + str(total_doc_count) + " documents")

            with open(dataset_dir + doc, 'r') as input_file:
                text = input_file.read()

            doc_entity_count = 0
            entities = merpy.get_entities(text, lexicon_name)

            for entity in entities:
                if entity != ['']:
                    total_entity_count += 1
                    doc_entity_count += 1

                    if len(entity) == 4:  # linked mentions with a CIE-O-3 code
                        linked_mentions += 1
                        output_string += "T" + str(doc_entity_count) \
                            + "\tMORFOLOGIA_NEOPLASIA " + entity[0] + " " \
                            + entity[1] + "\t" + entity[2] + "\n"

                        if task == "norm":
                            output_string += "#" + str(doc_entity_count) \
                                + "\tAnnotatorNotes\tT" + str(doc_entity_count) \
                                + "\t" + entity[3] + "\n"

                    elif len(entity) == 3:  # mentions without a CIE-O-3 code
                        output_string += "T" + str(doc_entity_count) \
                            + "\tMORFOLOGIA_NEOPLASIA " + entity[0] + " " \
                            + entity[1] + "\t" + entity[2] + "\n"

                        if task == "norm":
                            output_string += "#" + str(doc_entity_count) \
                                + "\tAnnotatorNotes T" + str(doc_entity_count) \
                                + "\tNA\n"

            if doc_entity_count > 0:
                doc_w_annotations_count += 1
            elif doc_entity_count == 0:
                output_string = "NA\tNA NA NA\tNA\n"

                if task == "norm":
                    output_string += "#" + str(doc_entity_count) \
                        + "\tAnnotatorNotes T" + str(doc_entity_count) + "\tNA\n"

            # Output annotations file
            output_filename = output_dir + doc[:-4] + ".ann"

            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        doc_annot_ratio = float(doc_w_annotations_count / total_doc_count)
        mentions_ratio = float(total_entity_count / doc_w_annotations_count)
        doc_linked_ratio = float(linked_mentions / doc_w_annotations_count)
        linked_ratio = float(linked_mentions / total_entity_count)
    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "TOTAL DOCUMENTS: " + str(total_doc_count) + "\n"
    output_str += "DOCS WITH ANNOTATIONS: " + str(doc_w_annotations_count) + "\n"
    output_str += "RATIO OF DOCS WITH ANNOTATIONS: " + str(doc_annot_ratio) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(total_entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "./mer_annotations/" + task + "/" + task + "_" + subset + "_stats"

    with open(file_name, "w") as output:
        output.write(output_str)
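# Usage sketch (assumption: load_cieo3, used in build_relations_dict below,
# returns the CIE-O-3 name-to-code dict; the output directory is hypothetical
# and must exist beforehand):
#
#   ontology_graph, name_to_id, synonym_to_id = load_cieo3()
#   annotate_documents("norm", "dev1", name_to_id, "./mer_annotations/norm/")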
with open('../bioasq_data/mesh_terms_synonyms.txt', encoding='utf-8') as finput_terms:
    l_terms_syn = finput_terms.readlines()

# Each line holds a MeSH term and its comma-separated synonyms, tab-separated
dict_terms_synonyms = {}
for i in l_terms_syn:
    aux = i.split('\t')
    dict_terms_synonyms[aux[0]] = aux[1].replace('\n', '')

# Map each term and every one of its synonyms to the term's MeSH ID;
# dict_terms (term -> MeSH ID) is built earlier in the original script
conv_dict = {}
for key, values in dict_terms_synonyms.items():
    l_synonyms = values.split(',')
    if key not in l_synonyms:
        l_synonyms.append(key)
    for i in l_synonyms:
        conv_dict[i.strip()] = dict_terms.get(key)

merpy.create_lexicon(conv_dict.keys(), "meshlex")
merpy.create_mappings(conv_dict, "meshlex")
merpy.show_lexicons()
merpy.process_lexicon("meshlex")

#DEBUG
print(merpy.get_entities("I like abdominal injuries", "meshlex"))
print(merpy.get_entities("I like Calcimycin", "meshlex"))
print(
    merpy.get_entities(
        "I like Calcimycin it is a good aurelia aurita and Temefos is awesome! abate lowercase",
        "meshlex"))
def process_lexicons_4_mer():
    print("download latest obo files")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/doid.owl", "do",
                           ltype="owl")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/go.owl", "go",
                           ltype="owl")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/hp.owl", "hpo",
                           ltype="owl")
    merpy.download_lexicon(
        "ftp://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl",
        "chebi",
        ltype="owl")
    merpy.download_lexicon("http://purl.obolibrary.org/obo/ncbitaxon.owl",
                           "taxon",
                           ltype="owl")
    merpy.download_lexicon(
        "https://raw.githubusercontent.com/CIDO-ontology/cido/master/src/ontology/cido.owl",
        "cido",
        "owl",
    )

    print("process lexicons")
    merpy.process_lexicon("do", ltype="owl")
    merpy.process_lexicon("go", ltype="owl")
    merpy.process_lexicon("hpo", ltype="owl")
    merpy.process_lexicon("chebi", ltype="owl")
    merpy.process_lexicon("taxon", ltype="owl")
    merpy.process_lexicon("cido", "owl")

    # Delete obsolete entities
    merpy.delete_obsolete("do")
    merpy.delete_obsolete("go")
    merpy.delete_obsolete("hpo")
    merpy.delete_obsolete("chebi")
    merpy.delete_obsolete("taxon")
    merpy.delete_obsolete("cido")

    # Delete overly generic terms that would produce spurious matches
    merpy.delete_entity("protein", "chebi")
    merpy.delete_entity("protein", "cido")
    merpy.delete_entity("protein", "hpo")
    merpy.delete_entity("polypeptide chain", "chebi")
    merpy.delete_entity("data", "taxon")
    merpy.delete_entity("one", "chebi")
    merpy.delete_entity_by_uri("http://purl.obolibrary.org/obo/PATO_0000070",
                               "hpo")

    # Create and process English CTD vocabularies
    #lexicon_name = "medic"
    #medic_name_to_id = load_ctd_vocabularies("CTD_diseases.tsv")
    #merpy.create_lexicon(medic_name_to_id.keys(), lexicon_name)
    #merpy.create_mappings(medic_name_to_id, lexicon_name)
    #merpy.process_lexicon(lexicon_name)

    #lexicon_name = "ctdChemicals"
    #chemicals_name_to_id = load_ctd_vocabularies("CTD_chemicals.tsv")
    #merpy.create_lexicon(chemicals_name_to_id.keys(), lexicon_name)
    #merpy.create_mappings(chemicals_name_to_id, lexicon_name)
    #merpy.process_lexicon(lexicon_name)

    #lexicon_name = "ctdAnatomy"
    #anatomy_name_to_id = load_ctd_vocabularies("CTD_anatomy.tsv")
    #merpy.create_lexicon(anatomy_name_to_id.keys(), lexicon_name)
    #merpy.create_mappings(anatomy_name_to_id, lexicon_name)
    #merpy.process_lexicon(lexicon_name)

    # Create and process English DeCS
    lexicon_name = "decsEN"
    name_to_id_en = load_decs_xml("en")
    merpy.create_lexicon(name_to_id_en.keys(), lexicon_name)
    merpy.create_mappings(name_to_id_en, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create and process Spanish DeCS
    lexicon_name = "decsSPA"
    name_to_id_spa = load_decs_xml("spa")
    merpy.create_lexicon(name_to_id_spa.keys(), lexicon_name)
    merpy.create_mappings(name_to_id_spa, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create and process Portuguese DeCS
    lexicon_name = "decsPT"
    name_to_id_pt = load_decs_xml("pt")
    merpy.create_lexicon(name_to_id_pt.keys(), lexicon_name)
    merpy.create_mappings(name_to_id_pt, lexicon_name)
    merpy.process_lexicon(lexicon_name)
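# Usage sketch: once the lexicons have been downloaded and processed, they can
# be queried directly (the sentence below is illustrative only):
#
#   process_lexicons_4_mer()
#   print(merpy.get_entities("malignant hyperthermia is a disease", "do"))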
import json
import os

import merpy
# Assumption: in the original script, Sentence comes from flair.data (it
# provides to_original_text()) and split_single from segtok.segmenter
from flair.data import Sentence
from segtok.segmenter import split_single


def build_relations_dict():
    """Iterate over all sentences in the train and dev sets, recognise
    CIE-O-3, ICD10-CM and DeCS entities, and establish a relation between
    two entities that appear in the same sentence.

    Ensures: dict stored in './tmp/relations_cieo3_icd10cm.json' with
    Spanish ICD10-CM <-> CIE-O-3 relations, and in
    './tmp/relations_cieo3_esdecs.json' with Spanish DeCS <-> CIE-O-3
    relations
    """

    # Create CIE-O-3 lexicon
    lexicon_name = "cieo3"
    ontology_graph, name_to_id, synonym_to_id = load_cieo3()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create ICD10-CM lexicon
    lexicon_name = "icd10cmes"
    ontology_graph, name_to_id = load_spanish_icd10cm()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create DeCS lexicon
    lexicon_name = "es_decs"
    ontology_graph, name_to_id, synonym_to_id = load_es_decs()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    filenames_1 = [
        "./data/datasets/train-set-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/train-set-to-publish/cantemist-norm/")
    ]
    filenames_2 = [
        "./data/datasets/dev-set1-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/dev-set1-to-publish/cantemist-norm/")
    ]
    filenames_3 = [
        "./data/datasets/dev-set2-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir(
            "./data/datasets/dev-set2-to-publish/cantemist-norm/")
    ]
    filenames_4 = [
        "./data/datasets/test-background-set-to-publish/" + input_file
        for input_file in os.listdir(
            "./data/datasets/test-background-set-to-publish/")
    ]

    filenames = filenames_1 + filenames_2 + filenames_3  # + filenames_4

    relations_1, relations_2 = dict(), dict()
    doc_count = 0

    for doc in filenames:
        if doc[-3:] == "txt":
            doc_count += 1
            print("DOC_COUNT:", doc_count)

            with open(doc, 'r') as doc_file:
                text = doc_file.read()

            sentences = [Sentence(sent) for sent in split_single(text)]

            for sentence in sentences:
                sent_text = sentence.to_original_text()
                cieo3_entities = merpy.get_entities(sent_text, "cieo3")
                icd10cm_entities = merpy.get_entities(sent_text, "icd10cmes")
                es_decs_entities = merpy.get_entities(sent_text, "es_decs")

                if icd10cm_entities != [['']] and cieo3_entities != [['']]:
                    icd10cm_codes = [entity[3] for entity in icd10cm_entities]
                    cieo3_codes = [entity[3] for entity in cieo3_entities]

                    for code in cieo3_codes:
                        if code in relations_1:
                            current_values = relations_1[code]
                            current_values.extend(icd10cm_codes)
                            relations_1[code] = current_values
                        else:
                            relations_1[code] = icd10cm_codes

                if es_decs_entities != [['']] and cieo3_entities != [['']]:
                    es_decs_codes = [entity[3] for entity in es_decs_entities]
                    cieo3_codes = [entity[3] for entity in cieo3_entities]

                    for code in cieo3_codes:
                        if code in relations_2:
                            # Keep the codes already accumulated for this
                            # CIE-O-3 code instead of overwriting them
                            current_values = relations_2[code]
                            current_values.extend(es_decs_codes)
                            relations_2[code] = current_values
                        else:
                            relations_2[code] = es_decs_codes

    # Output the relations into json files
    with open("./tmp/relations_cieo3_icd10cm.json", 'w') as d_file:
        d_file.write(json.dumps(relations_1))

    with open("./tmp/relations_cieo3_esdecs.json", 'w') as b_file:
        b_file.write(json.dumps(relations_2))
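# The resulting JSON maps each CIE-O-3 code to every ICD10-CM (or DeCS) code
# found in the same sentences; a hypothetical excerpt for illustration:
#
#   {"8000/3": ["C80", "C76.8"], "9861/3": ["C92.0"]}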
import os

import merpy


def annotate_documents(language, name_to_id):
    """Recognise entities (Named Entity Recognition) and link them to the
    respective ICD10-CM code (Named Entity Linking), if available.

    Requires: language: str, "pt", "en" or "es" for Portuguese, English or
        Spanish, respectively
    Ensures: for each abstract in the 'scielo_abstracts' dir, an annotation
        file in the 'mer_annotations' dir and an overall statistics file
        about the annotation process
    """

    lexicon_name = "icd10cm_" + language
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    abstracts_dir = "./scielo_abstracts/"
    doc_w_ann_count = 0
    entity_count = 0
    linked_mentions = 0

    for abstract in os.listdir(abstracts_dir):
        if abstract[-2:] == language:
            output_string = ""

            with open(abstracts_dir + abstract, 'r') as input_file:
                text = input_file.read()

            document_ent_count = 0
            entities = merpy.get_entities(text, lexicon_name)

            for entity in entities:
                if entity != ['']:
                    entity_count += 1
                    document_ent_count += 1

                    if len(entity) == 4:  # linked mentions with an ICD code
                        linked_mentions += 1
                        output_string += "T" + str(document_ent_count) + "\t" \
                            + entity[0] + " " + entity[1] + "\t" + entity[2] \
                            + "\t" + entity[3] + "\n"
                    elif len(entity) == 3:  # mentions without an ICD code
                        output_string += "T" + str(document_ent_count) + "\t" \
                            + entity[0] + " " + entity[1] + "\t" + entity[2] \
                            + "\n"

            if document_ent_count > 0:
                doc_w_ann_count += 1

            output_filename = "./mer_annotations/" + language + "/" + abstract + ".ann"

            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        mentions_ratio = float(entity_count / doc_w_ann_count)
        doc_linked_ratio = float(linked_mentions / doc_w_ann_count)
        linked_ratio = float(linked_mentions / entity_count)
    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "DOCUMENTS WITH ANNOTATIONS: " + str(doc_w_ann_count) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "mer_annotation_stats_" + language

    with open(file_name, "w") as output:
        output.write(output_str)
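# Usage sketch, reusing the Spanish ICD10-CM loader from build_relations_dict
# above (the loader itself is defined elsewhere in the original code):
#
#   ontology_graph, name_to_id = load_spanish_icd10cm()
#   annotate_documents("es", name_to_id)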
# document_entities, pmid_to_abst and the spaCy model nlp are built earlier in
# the original script
missing_texts = 0
total_entities = 0
total_sents = 0
total_docs = 0

# can be parallelized
for pmid in document_entities:
    if pmid == "":
        continue
    if pmid not in pmid_to_abst:
        print("missing this abstract:", pmid)
        #import pdb; pdb.set_trace()
        missing_texts += 1
        continue
    total_docs += 1
    doc = nlp(pmid_to_abst[pmid])

    # Build and match a per-document lexicon from this abstract's entities
    merpy.create_lexicon(document_entities[pmid], "biomarker" + pmid)
    merpy.process_lexicon("biomarker" + pmid)
    doc_entities = merpy.get_entities(pmid_to_abst[pmid], "biomarker" + pmid)

    entity_spans = []
    for e in doc_entities:
        try:
            int(e[0]), int(e[1])
        except ValueError:
            print("ERROR", e)
            continue
        entity_spans.append(doc.char_span(int(e[0]), int(e[1]), label="GPE"))

    # char_span returns None when offsets do not align with token boundaries
    entity_spans = [e for e in entity_spans if e is not None]
    try:
        doc.ents = entity_spans[:]
    except Exception:  # setting doc.ents fails on overlapping spans
        import pdb
        pdb.set_trace()
# values comes from the loop over the abstracts file earlier in the original
# script; the reconstructed condition mirrors the title/abstract merge below
if values[0] not in pmid_to_abst:
    pmid_to_abst[values[0]] = values[1].strip().lower()
else:
    pmid_to_abst[values[0]] += ". " + values[1].strip().lower()

missing_pmids = []
all_entities = set()
with open("publications_compounds.txt") as compounds_file:
    next(compounds_file)  # skip header
    for line in compounds_file:
        values = line.strip().split("\t")
        pmid = values[1]
        all_entities.add(values[3].lower())
        if pmid not in pmid_to_abst:
            missing_pmids.append(pmid)

merpy.create_lexicon(all_entities, "biomarkers")
merpy.process_lexicon("biomarkers")

# recover missing pmids:
titles, abstracts = query_pubmed.get_titles_abstracts(missing_pmids)
for t in titles:
    pmid_to_abst[t[0]] = t[1].lower()
for a in abstracts:
    if a[0] not in pmid_to_abst:
        pmid_to_abst[a[0]] = a[1].lower()
    else:
        pmid_to_abst[a[0]] += ". " + a[1].lower()

missing_texts = 0
total_entities = 0
total_sents = 0