import re

import merpy


def process_multiple_docs_lexicons_sp(docs, lexicons):
    """Iterate through the list of documents and annotate each one with every lexicon."""
    # doc_dict is only needed by the commented-out multiprocessing call below
    doc_dict = {i: d for i, d in enumerate(docs)}
    # create one independent empty list for each doc ([[]] * len(docs) would alias the same list)
    output_entities = [[] for _ in docs]
    for idoc, doc in enumerate(docs):
        if sum(map(str.isalnum, doc)) < 5:  # must have at least 5 alnum
            print("no words", doc)
            continue
        # `repl` is the re.sub replacement (string or function) defined elsewhere in the script
        doc = re.sub(r"[^A-Za-z0-9 ]{2,}", repl, doc)
        doc_results = []  # reset per document so results from earlier documents do not leak in
        for l in lexicons:
            doc_results += merpy.get_entities(doc, l)
        for e in doc_results:
            # doc_entities = merpy.get_entities_mp(doc_dict, lex, n_cores=10)
            # print(lex, entities)
            # for e in l_entities:
            if len(e) > 2:
                entity = [int(e[0]), int(e[1]), e[2]]
                if len(e) > 3:  # URI
                    entity.append(e[3])
                if entity not in output_entities[idoc]:
                    output_entities[idoc].append(entity)
    for i in range(len(output_entities)):
        output_entities[i] = sorted(output_entities[i])
    return output_entities
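# Usage sketch (illustrative, not part of the original script): it assumes the
# "hp" and "doid" lexicons were already processed with merpy.process_lexicon,
# binds `repl` to a single space purely as an example, and uses placeholder
# abstracts.
repl = " "
example_docs = [
    "Patient presenting with microcephaly and seizures.",
    "Follow-up for chronic obstructive pulmonary disease.",
]
example_annotations = process_multiple_docs_lexicons_sp(example_docs, ["hp", "doid"])
for doc_index, doc_entities in enumerate(example_annotations):
    print(doc_index, doc_entities)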
def merpy_function(article_abstract):
    """
    Run MER. The terms are initialized at the beginning of the script with the line:
    merpy.generate_lexicon("hp")

    :param article_abstract: variable containing the abstract text
    :return: a nested list with each identified term, its position and the term's URL
    """
    document = article_abstract
    entities = merpy.get_entities(document, "hp")
    return entities
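# Usage sketch (illustrative): the "hp" lexicon has to be available before
# merpy_function is called; downloading and processing it as below follows the
# pattern used in the other snippets, and the abstract is a placeholder.
import merpy

merpy.download_lexicons()
merpy.process_lexicon("hp")
example_abstract = "The patient presented with microcephaly and recurrent seizures."
print(merpy_function(example_abstract))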
def annotations(corpus_path, data_path):
    """
    Annotate every file in corpus_path with the HP, DOID, RadLex and custom lexicons.

    :param corpus_path: directory containing the text files to annotate
    :param data_path: directory containing the custom term lists used by process_list
    """
    merpy.download_lexicons()
    merpy.process_lexicon("hp")
    merpy.process_lexicon("doid")
    merpy.process_lexicon("radlex")
    process_list(data_path, 'chebi')
    process_list(data_path, 'medical_devices')
    process_list(data_path, 'temporal_list')
    process_list(data_path, 'population_vocabulary')

    for f in os.listdir(corpus_path):
        file_to_annotate = open(corpus_path + f, 'r', encoding='utf-8')
        file_content = file_to_annotate.read()
        file_to_annotate.close()

        # keep only HP annotations whose identifier actually contains 'HP'
        entities_hp = merpy.get_entities(file_content, "hp")
        clean_hp_list = [hp for hp in entities_hp if 'HP' in hp[-1]]

        entities_doid = merpy.get_entities(file_content, "doid")
        entities_radlex = merpy.get_entities(file_content, "radlex")
        entities_devices = merpy.get_entities(file_content, "medical_devices")
        entities_chebi = merpy.get_entities(file_content, "chebi")
        entities_temporal = merpy.get_entities(file_content, "temporal")
        entities_population = merpy.get_entities(file_content, "population")

        entities = clean_hp_list + entities_doid + entities_devices + entities_chebi \
            + entities_radlex + entities_temporal + entities_population
        entities_clean = [x for x in entities if x != ['']]
        entities_sorted = sorted(entities_clean, key=lambda position: int(position[0]))

        print('\n' + f + '\n')
        for entity in entities_sorted:
            print(entity)
    return
def map_to_ontology(entity_text, ontology):
    """Run merpy on an entity string to get its ontology URI."""
    if entity_text in normalization_dic[ontology]:
        return normalization_dic[ontology][entity_text]
    else:
        matches = merpy.get_entities(entity_text, ontology)
        if len(matches) > 1:
            print(entity_text, ontology, matches)
        if len(matches) == 0 or len(matches[-1]) < 4:
            print("no matches", entity_text, ontology, matches)
            normalization_dic[ontology][entity_text] = entity_text
            return entity_text
        else:
            # get the last match (or biggest TODO)
            best_match = matches[-1][3]
            normalization_dic[ontology][entity_text] = best_match
            return best_match
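# Usage sketch (illustrative): map_to_ontology relies on a module-level
# normalization_dic cache with one sub-dictionary per ontology, and on a
# lexicon that was already processed with merpy.process_lexicon.
normalization_dic = {"hp": {}}
uri = map_to_ontology("microcephaly", "hp")
print(uri)                      # the URI when MER finds a match, otherwise the input text
print(normalization_dic["hp"])  # results are cached for later look-ups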
def call_mer_2nd_run(l_mer_terms, l_text, lexicon):
    for i in range(len(l_mer_terms)):
        if l_mer_terms[i][0] == ' ':
            l_aux = merpy.get_entities(l_text[i], lexicon)
            print('>>>>>', l_aux)  # DEBUG
            if len(l_aux) > 0:
                l_aux_2 = []
                if lexicon == 'decs_parlex':
                    # use a separate index so the outer loop variable i is not shadowed
                    for j in range(len(l_aux)):
                        # Spanish term
                        try:
                            if l_aux[j][2] not in l_aux_2:
                                l_aux_2.append(l_aux[j][2])
                                try:
                                    # black magic to convert from string representation to list
                                    l_aux[j][3] = ast.literal_eval(l_aux[j][3])
                                    # DEBUG - decs_parlex
                                    # print(l_mer_data[0])     # DeCS code
                                    # print(l_mer_data[1])     # Parent Info Tuple
                                    # print(l_mer_data[1][0])  # Parent DeCS code (if any)
                                    # print(l_mer_data[1][1])  # Parent Name (if any)
                                    # Spanish parent
                                    if l_aux[j][3][1][1] != '-' and l_aux[j][3][1][1] not in l_aux_2:
                                        l_aux_2.append(l_aux[j][3][1][1])
                                except IndexError:
                                    l_aux_2.append(' ')
                        except IndexError:
                            l_aux_2.append(' ')
                l_mer_terms[i][0] = l_aux_2
    return l_mer_terms
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 23 15:57:57 2020

@author: André
"""
import merpy
import pandas as pd

decs_data = pd.read_csv('../mesinesp_data/DeCS_data.tsv', sep='\t')

# map every Spanish term and synonym to its DeCS code
conv_dict = {}
for index, row in decs_data.iterrows():
    l_terms = str(row['Synonyms']).split('|')
    if row['Term_Spanish'] not in l_terms:
        l_terms.append(row['Term_Spanish'])
    for i in l_terms:
        conv_dict[i] = str(row['#DeCS_code'])

merpy.create_lexicon(conv_dict.keys(), "decslex")
merpy.create_mappings(conv_dict, "decslex")
merpy.show_lexicons()
merpy.process_lexicon("decslex")

# DEBUG
print(merpy.get_entities("lo nervio abducens es una aurelia aurita", "decslex"))
import merpy
import ssmpy
import urllib.request

# Download the Human Disease Ontology OWL file
doid_link = 'http://purl.obolibrary.org/obo/doid.owl'
with urllib.request.urlopen(doid_link) as response, open('doid.owl', 'wb') as out_file:
    data = response.read()
    out_file.write(data)

ssmpy.create_semantic_base("doid.owl", "doid.db", "http://purl.obolibrary.org/obo/",
                           "http://www.w3.org/2000/01/rdf-schema#subClassOf", '')
ssmpy.semantic_base("doid.db")

merpy.download_lexicon(doid_link, "doid", "owl")
merpy.process_lexicon("doid", "owl")

document = "zoophilia zoophobia zaspopathy"
entities = merpy.get_entities(document, "doid")
print(entities)
print(merpy.get_similarities(entities, 'doid.db'))
# dict_terms (term -> MeSH identifier) is assumed to be defined earlier in the script
with open('../bioasq_data/mesh_terms_synonyms.txt', encoding='utf-8') as finput_terms:
    l_terms_syn = finput_terms.readlines()

dict_terms_synonyms = {}
for i in l_terms_syn:
    aux = i.split('\t')
    dict_terms_synonyms[aux[0]] = aux[1].replace('\n', '')

# map every term and synonym to the identifier of the main term
conv_dict = {}
for key, values in dict_terms_synonyms.items():
    l_synonyms = values.split(',')
    if key not in l_synonyms:
        l_synonyms.append(key)
    for i in l_synonyms:
        conv_dict[i.strip()] = dict_terms.get(key)

merpy.create_lexicon(conv_dict.keys(), "meshlex")
merpy.create_mappings(conv_dict, "meshlex")
merpy.show_lexicons()
merpy.process_lexicon("meshlex")

# DEBUG
print(merpy.get_entities("I like abdominal injuries", "meshlex"))
print(merpy.get_entities("I like Calcimycin", "meshlex"))
print(merpy.get_entities(
    "I like Calcimycin it is a good aurelia aurita and Temefos is awesome! abate lowercase",
    "meshlex"))
def build_relations_dict():
    """Iterates over all sentences in the train and dev sets, recognises
    CIE-O-3, ICD10-CM and DeCS entities and establishes a relation between two
    entities that co-occur in a given sentence.

    Ensures: dict stored in file './tmp/relations_cieo3_icd10cm.json' with
    ES ICD10-CM <-> CIEO3 relations and in file './tmp/relations_cieo3_esdecs.json'
    with ES DeCS <-> CIEO3 relations
    """

    # Create CIE-O-3 lexicon
    lexicon_name = "cieo3"
    ontology_graph, name_to_id, synonym_to_id = load_cieo3()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create ICD10-CM lexicon
    lexicon_name = "icd10cmes"
    ontology_graph, name_to_id = load_spanish_icd10cm()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    # Create DeCS lexicon
    lexicon_name = "es_decs"
    ontology_graph, name_to_id, synonym_to_id = load_es_decs()
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    filenames_1 = [
        "./data/datasets/train-set-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir("./data/datasets/train-set-to-publish/cantemist-norm/")
    ]
    filenames_2 = [
        "./data/datasets/dev-set1-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir("./data/datasets/dev-set1-to-publish/cantemist-norm/")
    ]
    filenames_3 = [
        "./data/datasets/dev-set2-to-publish/cantemist-norm/" + input_file
        for input_file in os.listdir("./data/datasets/dev-set2-to-publish/cantemist-norm/")
    ]
    filenames_4 = [
        "./data/datasets/test-background-set-to-publish/" + input_file
        for input_file in os.listdir("./data/datasets/test-background-set-to-publish/")
    ]
    filenames = filenames_1 + filenames_2 + filenames_3  # + filenames_4

    relations_1, relations_2 = dict(), dict()
    doc_count = int()

    for doc in filenames:
        if doc[-3:] == "txt":
            #if doc == "cc_onco1016.txt":
            doc_count += 1
            print("DOC_COUNT:", doc_count)

            with open(doc, 'r') as doc_file:
                text = doc_file.read()

            sentences = [Sentence(sent) for sent in split_single(text)]
            for sentence in sentences:
                sent_text = sentence.to_original_text()
                cieo3_entities = merpy.get_entities(sent_text, "cieo3")
                icd10cm_entities = merpy.get_entities(sent_text, "icd10cmes")
                es_decs_entities = merpy.get_entities(sent_text, "es_decs")

                if icd10cm_entities != [['']] and cieo3_entities != [['']]:
                    icd10cm_codes = [entity[3] for entity in icd10cm_entities]
                    cieo3_codes = [entity[3] for entity in cieo3_entities]
                    for code in cieo3_codes:
                        if code in relations_1:
                            current_values = relations_1[code]
                            current_values.extend(icd10cm_codes)
                            relations_1[code] = current_values
                        else:
                            relations_1[code] = icd10cm_codes

                if es_decs_entities != [['']] and cieo3_entities != [['']]:
                    es_decs_codes = [entity[3] for entity in es_decs_entities]
                    cieo3_codes = [entity[3] for entity in cieo3_entities]
                    for code in cieo3_codes:
                        if code in relations_2:
                            current_values = relations_2[code]
                            current_values.extend(es_decs_codes)
                            # keep the extended list (the original overwrote it with es_decs_codes)
                            relations_2[code] = current_values
                        else:
                            relations_2[code] = es_decs_codes

    # Output the relations into json files
    d = json.dumps(relations_1)
    d_file = open("./tmp/relations_cieo3_icd10cm.json", 'w')
    d_file.write(d)
    d_file.close()

    b = json.dumps(relations_2)
    b_file = open("./tmp/relations_cieo3_esdecs.json", 'w')
    b_file.write(b)
    b_file.close()
def annotate_documents(task, subset, name_to_id, output_dir):
    """Recognise entities (NER) and link them to the respective CIE-O-3 code
    (Normalisation), if available, using MERPY"""

    lexicon_name = "cieo3"
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    dataset_dir = str()
    if subset == "train":
        dataset_dir = "./data/datasets/train-set-to-publish/cantemist-norm/"
    elif subset == "dev1":
        dataset_dir = "./data/datasets/dev-set1-to-publish/cantemist-norm/"
    elif subset == "dev2":
        dataset_dir = "./data/datasets/dev-set2-to-publish/cantemist-norm/"
    elif subset == "test":
        dataset_dir = "./data/datasets/test-background-set-to-publish/"

    doc_count = int()
    doc_annot_ratio = int()
    doc_w_annotations_count = int()
    total_entity_count = int()
    linked_mentions = int()
    total_doc_count = int(len(os.listdir(dataset_dir)) / 2)

    for doc in os.listdir(dataset_dir):
        if doc[-3:] == "txt":
            doc_count += 1
            output_string = str()
            print("Annotating " + str(doc_count) + " of " + str(total_doc_count) + " documents")

            with open(dataset_dir + doc, 'r') as input_file:
                text = input_file.read()

            doc_entity_count = int()
            entities = merpy.get_entities(text, lexicon_name)
            for entity in entities:
                if entity != ['']:
                    total_entity_count += 1
                    doc_entity_count += 1
                    if len(entity) == 4:  # linked mentions with CIE-O-3 code
                        linked_mentions += 1
                        output_string += "T" + str(doc_entity_count) + "\tMORFOLOGIA_NEOPLASIA " \
                            + entity[0] + " " + entity[1] + "\t" + entity[2] + "\n"
                        if task == "norm":
                            output_string += "#" + str(doc_entity_count) + "\tAnnotatorNotes\tT" \
                                + str(doc_entity_count) + "\t" + entity[3] + "\n"
                    elif len(entity) == 3:  # mentions without CIE-O-3 code
                        output_string += "T" + str(doc_entity_count) + "\tMORFOLOGIA_NEOPLASIA " \
                            + entity[0] + " " + entity[1] + "\t" + entity[2] + "\n"
                        if task == "norm":
                            output_string += "#" + str(doc_entity_count) + "\tAnnotatorNotes T" \
                                + str(doc_entity_count) + "\tNA\n"

            if doc_entity_count > 0:
                doc_w_annotations_count += 1
            elif doc_entity_count == 0:
                output_string = "NA\tNA NA NA\tNA\n"
                if task == "norm":
                    output_string += "#" + str(doc_entity_count) + "\tAnnotatorNotes T" \
                        + str(doc_entity_count) + "\tNA\n"

            # output annotations file
            output_filename = output_dir + doc[:-4] + ".ann"
            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        doc_annot_ratio = float(doc_w_annotations_count / total_doc_count)
        mentions_ratio = float(total_entity_count / doc_w_annotations_count)
        doc_linked_ratio = float(linked_mentions / doc_w_annotations_count)
        linked_ratio = float(linked_mentions / total_entity_count)
    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "TOTAL DOCUMENTS: " + str(total_doc_count) + "\n"
    output_str += "DOCS WITH ANNOTATIONS: " + str(doc_w_annotations_count) + "\n"
    output_str += "RATIO OF DOCS WITH ANNOTATIONS: " + str(doc_annot_ratio) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(total_entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "./mer_annotations/" + task + "/" + task + "_" + subset + "_stats"
    with open(file_name, "w") as output:
        output.write(output_str)
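# Usage sketch (illustrative): name_to_id would normally come from a loader such
# as load_cieo3() used elsewhere in this project; the one-entry dictionary below
# only shows the expected {term: code} shape, and the output path is a placeholder.
example_name_to_id = {"carcinoma ductal infiltrante": "8500/3"}
annotate_documents("norm", "train", example_name_to_id, "./mer_annotations/norm/")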
def annotate_documents(language, name_to_id):
    """
    Recognise entities (Named Entity Recognition) and link them to the respective
    ICD 10 CM code (Named Entity Linking), if available

    Requires:
        language: str, "pt", "en", "es" for Portuguese, English or Spanish, respectively
    Ensures:
        for each abstract in 'scielo_abstracts' dir creates an annotation file in
        'mer_annotations' dir and an overall statistics file about the annotation process
    """

    lexicon_name = "icd10cm_" + language
    merpy.create_lexicon(name_to_id.keys(), lexicon_name)
    merpy.create_mappings(name_to_id, lexicon_name)
    merpy.process_lexicon(lexicon_name)

    abstracts_dir = "./scielo_abstracts/"
    doc_w_ann_count = int()
    entity_count = int()
    linked_mentions = int()

    for abstract in os.listdir(abstracts_dir):
        if abstract[-2:] == language:
            output_string = str()
            with open(abstracts_dir + abstract, 'r') as input_file:
                text = input_file.read()

            document_ent_count = int()
            entities = merpy.get_entities(text, lexicon_name)
            for entity in entities:
                if entity != ['']:
                    entity_count += 1
                    document_ent_count += 1
                    if len(entity) == 4:  # linked mentions with ICD code
                        linked_mentions += 1
                        output_string += "T" + str(document_ent_count) + "\t" + entity[0] \
                            + " " + entity[1] + "\t" + entity[2] + "\t" + entity[3] + "\n"
                    elif len(entity) == 3:  # mentions without ICD code
                        output_string += "T" + str(document_ent_count) + "\t" + entity[0] \
                            + " " + entity[1] + "\t" + entity[2] + "\n"

            if document_ent_count > 0:
                doc_w_ann_count += 1

            output_filename = "./mer_annotations/" + language + "/" + abstract + ".ann"
            with open(output_filename, 'w') as output_file:
                output_file.write(output_string)

    try:
        mentions_ratio = float(entity_count / doc_w_ann_count)
        doc_linked_ratio = float(linked_mentions / doc_w_ann_count)
        linked_ratio = float(linked_mentions / entity_count)
    except ZeroDivisionError:
        mentions_ratio = 0.0
        doc_linked_ratio = 0.0
        linked_ratio = 0.0

    output_str = "DOCUMENTS WITH ANNOTATIONS: " + str(doc_w_ann_count) + "\n"
    output_str += "TOTAL ENTITY MENTIONS: " + str(entity_count) + "\n"
    output_str += "ENTITY MENTIONS PER DOCUMENT: " + str(mentions_ratio) + "\n"
    output_str += "LINKED ENTITY MENTIONS: " + str(linked_mentions) + "\n"
    output_str += "LINKED ENTITY MENTIONS PER DOCUMENT: " + str(doc_linked_ratio) + "\n"
    output_str += "RATIO OF LINKED ENTITY MENTIONS: " + str(linked_ratio)

    file_name = "mer_annotation_stats_" + language
    with open(file_name, "w") as output:
        output.write(output_str)
# conv_dict, decs_dict (DeCS code -> term) and dict_child_par (child code -> parent code)
# are assumed to be defined earlier in the script
for index, row in decs_data.iterrows():
    l_terms = str(row['Synonyms']).split('|')
    if row['Term_Spanish'] not in l_terms:
        l_terms.append(row['Term_Spanish'])
    try:
        parent = dict_child_par.get(str(row['#DeCS_code']))
        if parent is None:
            parent = '-'
            parent_info = '-'
        else:
            parent_info = decs_dict.get(parent)
            if parent_info is None:
                parent_info = '-'
    except KeyError:
        parent = '-'
        parent_info = '-'
    parent_tup = (parent, parent_info)
    for i in l_terms:
        conv_dict[i] = str([row['#DeCS_code'], parent_tup])

merpy.create_lexicon(conv_dict.keys(), "decsparlex")
merpy.create_mappings(conv_dict, "decsparlex")
merpy.show_lexicons()
merpy.process_lexicon("decsparlex")

# DEBUG
print(merpy.get_entities("lo nervio abducens es un gran temefós", "decsparlex"))
import os
import sys
from glob import glob

import merpy
import pandas as pd


def get_entity_dict(ent):
    ent_dict = {
        "offsets": "LIVB {} {}".format(ent[0], ent[1]),
        "text": ent[2].replace("\n", " "),
    }
    return ent_dict


if len(sys.argv) >= 3:
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

result = []
for document in glob(os.path.join(input_dir, "*.txt")):
    with open(document, 'r') as f:
        data = f.read()
    entities = merpy.get_entities(data, "ncbi")
    entities = [get_entity_dict(ent) for ent in entities if len(ent) == 3]
    df = pd.DataFrame(entities)
    df = df.dropna()
    df = df.loc[df.astype(str).drop_duplicates().index]
    df = df.reset_index(drop=True)
    df = df.rename("T{}".format)
    ann_filename = os.path.basename(document).split(".")[0] + ".ann"
    df.to_csv(os.path.join(output_dir, ann_filename), sep="\t", header=False)
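# Preparation sketch (assumption): the script above expects an already-processed
# "ncbi" lexicon. One way to build a minimal one, following the create_lexicon
# pattern used in the other snippets (no mappings, since only 3-element entities
# are kept above); the species list is a placeholder.
import merpy

example_species = ["Homo sapiens", "Escherichia coli", "Mus musculus"]
merpy.create_lexicon(example_species, "ncbi")
merpy.process_lexicon("ncbi")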
import json
import sys

import merpy
from tqdm import tqdm

# Notice that this script requires:
#
# - that MER's source code is changed, to allow the use of the <skos:prefLabel>
#   and <skos:altLabel> properties (used by OCHV)
# - that a lexicon (whose name is given as the third argument) has been
#   processed by MER
#
# The Dockerfile in this repository takes care of that.

in_filename, lexicon_name, out_filename = sys.argv[1:]

with open(in_filename) as f:
    data = json.load(f)

with open(out_filename, 'w') as f:
    for key, text in tqdm(data.items()):
        annotations = merpy.get_entities(text, lexicon_name)
        json.dump({key: annotations}, f)
        f.write('\n')
        f.flush()
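# Reading sketch (illustrative): the output file written above holds one JSON
# object per line, each mapping a document key to its MER annotations; the
# filename below is a placeholder for the out_filename argument.
import json

with open("annotations.jsonl") as f:
    for line in f:
        record = json.loads(line)
        for key, annotations in record.items():
            print(key, len(annotations))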
# total_entities, missing_texts, nlp (a spaCy pipeline), document_entities and
# pmid_to_abst are assumed to be defined earlier in the script
total_sents = 0
total_docs = 0

# can be parallelized
for pmid in document_entities:
    if pmid == "":
        continue
    if pmid not in pmid_to_abst:
        print("missing this abstract:", pmid)
        #import pdb; pdb.set_trace()
        missing_texts += 1
        continue
    total_docs += 1
    doc = nlp(pmid_to_abst[pmid])

    # build a per-document lexicon from this abstract's biomarker entities
    merpy.create_lexicon(document_entities[pmid], "biomarker" + pmid)
    merpy.process_lexicon("biomarker" + pmid)
    doc_entities = merpy.get_entities(pmid_to_abst[pmid], "biomarker" + pmid)

    entity_spans = []
    for e in doc_entities:
        try:
            int(e[0]), int(e[1])
        except ValueError:
            print("ERROR", e)
            continue
        entity_spans.append(doc.char_span(int(e[0]), int(e[1]), label="GPE"))
    entity_spans = [e for e in entity_spans if e is not None]

    try:
        doc.ents = entity_spans[:]
    except:
        import pdb
        pdb.set_trace()
    total_entities += len(entity_spans)