def unified_medical_language_entity_linker(model, document):
    """Print the UMLS concepts linked to each unique named entity of a document.

    Named entities are linked to the Unified Medical Language System (UMLS,
    https://www.nlm.nih.gov/research/umls/).

    Parameters:
        model (module): A pretrained biomedical model from ScispaCy
            (https://allenai.github.io/scispacy/); its ``load()`` is called
            to obtain the pipeline.
        document (str): Document to be processed.

    Returns:
        None. For every candidate concept of every unique entity the entity
        name, the concept id with its score, and the knowledge-base entry are
        printed.
    """
    nlp = model.load()
    linker = UmlsEntityLinker(k=10, max_entities_per_mention=2)  # parameters are tunable
    nlp.add_pipe(linker)
    doc = nlp(document)

    # De-duplicate on the entity text while keeping first-seen order. The
    # previous implementation stringified an OrderedDict and re-ran the
    # pipeline on that repr, which fed tokens such as "OrderedDict" and "None"
    # back into entity recognition.
    seen_texts = set()
    for entity in doc.ents:
        entity_text = str(entity)
        if entity_text in seen_texts:
            continue
        seen_texts.add(entity_text)

        for umls_ent in entity._.umls_ents:
            print("Entity Name:", entity)
            concept_id, score = umls_ent
            print("Concept_Id = {} Score = {}".format(concept_id, score))
            print(linker.umls.cui_to_entity[concept_id])
def init_nlp():
    """Build the ``en_core_sci_lg`` pipeline with COVID vectors, abbreviations and UMLS linking.

    Returns:
        tuple: ``(spacy_nlp, linker)`` - the configured pipeline and the
        ``UmlsEntityLinker`` instance that was added to it.
    """
    pipeline = spacy.load('en_core_sci_lg')

    # One shared vector, derived from a textual description of the virus, is
    # assigned to every COVID-related term below.
    covid_description = (
        "Positive-sense single‐stranded ribonucleic acid virus, subgenus "
        "sarbecovirus of the genus Betacoronavirus. "
        "Also known as severe acute respiratory syndrome coronavirus 2, "
        "also known by 2019 novel coronavirus. It is "
        "contagious in humans and is the cause of the ongoing pandemic of "
        "coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious "
        "disease."
    )
    covid_vector = pipeline(covid_description).vector
    for term in ("COVID-19", "2019-nCoV", "SARS-CoV-2"):
        pipeline.vocab.set_vector(term, covid_vector)

    pipeline.max_length = 2000000

    # NOTE: language detection is currently switched off; without it
    # non-English text is parsed as if it were English.
    # pipeline.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # The abbreviation pipe only needs to be added once.
    pipeline.add_pipe(AbbreviationDetector(pipeline))

    # The linker looks up named entities/concepts in the UMLS graph and
    # normalizes the data.
    umls_linker = UmlsEntityLinker(resolve_abbreviations=True)
    pipeline.add_pipe(umls_linker)

    return (pipeline, umls_linker)
def process_data(pid, doc_list):
    """Annotate each document with the UMLS candidates of its entities.

    Args:
        pid: Identifier echoed in the progress messages.
        doc_list: Iterable of dicts; each must carry a ``'text'`` entry.

    Returns:
        list: The same dicts, each extended in place with a ``'result'``
        mapping ``(start_char, end_char) -> ent._.umls_ents``.
    """
    pipeline = spacy.load("en_core_sci_sm")
    pipeline.add_pipe(AbbreviationDetector(pipeline))  # abbreviation detection
    umls_linker = UmlsEntityLinker(resolve_abbreviations=True)
    pipeline.add_pipe(umls_linker)  # entity linking

    processed = []
    for index, record in enumerate(doc_list):
        parsed = pipeline(record['text'])
        record['result'] = {
            (ent.start_char, ent.end_char): ent._.umls_ents
            for ent in parsed.ents
        }
        processed.append(record)

        # Progress report on every tenth document.
        if index % 10 == 0:
            stamp = time.strftime("%d_%m_%Y") + '_' + time.strftime("%H:%M:%S")
            print('Completed [{}] {}, {}'.format(pid, index, stamp))
    return processed
def __init__(self):
    """Load the ``en_ner_bc5cdr_md`` tagger and attach abbreviation and UMLS pipes."""
    tagger = en_ner_bc5cdr_md.load()
    self.tagger = tagger

    # Abbreviation detection is added ahead of the linker, which is
    # configured to resolve abbreviations.
    abbreviation_pipe = AbbreviationDetector(tagger)
    self.abbreviation_pipe = abbreviation_pipe
    tagger.add_pipe(abbreviation_pipe)

    # Keep a single candidate concept per mention.
    umls_linker = UmlsEntityLinker(
        resolve_abbreviations=True,
        max_entities_per_mention=1,
    )
    self.linker = umls_linker
    tagger.add_pipe(umls_linker)

    print('NER Module Ready')
def setUp(self):
    """Create a spaCy pipeline and a UMLS linker backed by the test fixtures."""
    super().setUp()
    self.nlp = spacy.load("en_core_web_sm")

    fixture_kb = UmlsKnowledgeBase(
        "tests/fixtures/umls_test_fixture.json",
        "tests/fixtures/test_umls_tree.tsv",
    )
    # The index artefacts are written to a throw-away directory.
    with tempfile.TemporaryDirectory() as dir_name:
        index_parts = create_tfidf_ann_index(dir_name, fixture_kb)
        concept_aliases, vectorizer, ann_index = index_parts
        generator = CandidateGenerator(
            ann_index, vectorizer, concept_aliases, fixture_kb
        )
        self.linker = UmlsEntityLinker(generator, filter_for_definitions=False)
def __init__(self, args):
    """Create ``self.nlp`` from ``en_core_sci_sm`` with abbreviation and UMLS pipes.

    Args:
        args: Accepted but not used by this constructor.
    """
    # Dependencies are imported at call time, inside the constructor.
    import scispacy
    import spacy
    from scispacy.abbreviation import AbbreviationDetector
    from scispacy.umls_linking import UmlsEntityLinker

    pipeline = spacy.load("en_core_sci_sm")
    self.nlp = pipeline

    # Abbreviation detection module
    pipeline.add_pipe(AbbreviationDetector(pipeline))

    # Entity linking module
    umls_linker = UmlsEntityLinker(resolve_abbreviations=True)
    pipeline.add_pipe(umls_linker)
def scispacy_el(sent: str):
    """Run the module-level ``nlp`` pipeline, extended with a UMLS linker, on ``sent``.

    NOTE: still marked as needing tests ("Test this code!" in the original).
    A new linker is added to the shared ``nlp`` object on every call.

    Args:
        sent: Text to process.

    Returns:
        The named entities found in ``sent`` (``doc.ents``).
    """
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)
    doc = nlp(sent)
    # The leftover ``pdb.set_trace()`` breakpoint was removed; the entities
    # are returned instead so callers can inspect them.
    return doc.ents
def __post_init__(self):
    """Load ``self.language_model`` and attach language, abbreviation and UMLS pipes."""
    pipeline = spacy.load(self.language_model)
    self.nlp = pipeline

    # Language detection runs as the last component at the time it is added.
    pipeline.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # The abbreviation pipe only needs to be added once.
    pipeline.add_pipe(AbbreviationDetector(pipeline))

    # The linker looks up named entities/concepts in the UMLS graph and
    # normalizes the data.
    umls_linker = UmlsEntityLinker(resolve_abbreviations=True)
    self.linker = umls_linker
    pipeline.add_pipe(umls_linker)
def loadModel(model):
    """
    Load a Named Entity Recognition model and attach the UMLS linker.

    Args:
        model: model package exposing ``load()``;
            options: en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md

    Returns:
        nlp: loaded model
        linker: loaded add-on
    """
    pipeline = model.load()

    # Extend the pipeline with the entity linker.
    umls_linker = UmlsEntityLinker(resolve_abbreviations=True)
    pipeline.add_pipe(umls_linker)

    logging.info("Model and add-ons successfully loaded.")
    return pipeline, umls_linker
def init_umls_nlp_linker():
    """Build a UMLS linker from local index files and add it to the module-level ``nlp``.

    All artefacts are addressed relative to ``base_dir``, which is currently
    the empty string.

    Returns:
        The ``UmlsEntityLinker`` that was added to ``nlp``.
    """
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'

    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))

    # Use a context manager so the alias file is closed once parsed.
    with open(cached_path(base_dir + 'concept_aliases.json')) as aliases_file:
        ann_concept = json.load(aliases_file)

    umlsknowlegebase = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umlsknowlegebase)

    # Keep a single candidate concept per mention.
    linker = UmlsEntityLinker(candidate_generator=cg, max_entities_per_mention=1)
    nlp.add_pipe(linker)
    return linker
def __init__(self, umls_version=None):
    """Set up UMLS data either from a fresh scispaCy linker or from a stored version.

    Args:
        umls_version: When ``None`` a new ``en_core_sci_sm`` pipeline with a
            ``UmlsEntityLinker`` is created and its ``kb.cui_to_entity``
            mapping is used; otherwise ``self.load(umls_version)`` is called.
    """
    if umls_version is not None:
        self.umls_data = None
        self.load(umls_version)
    else:
        # NOTE: restoring a cached pipeline from the "nlp_model" directory
        # (and saving it with ``to_disk``) is currently disabled, so the
        # model is always built from scratch.
        print("creating nlp model")
        self.linker = UmlsEntityLinker(resolve_abbreviations=True)
        self.nlp = spacy.load("en_core_sci_sm")
        self.nlp.add_pipe(self.linker)
        self.umls_data = self.linker.kb.cui_to_entity
    self.umls_version = umls_version
def __init__(self, biospacy, rules, dysplasia_mappings, dict_path, aff_path):
    """
    Load models and rules

    Params:
        biospacy (str): full spaCy pipeline for biomedical data
        rules (str): hand-crafted rules file path
        dysplasia_mappings (str): dysplasia mappings file path
        dict_path: first argument handed to ``hunspell.HunSpell``
        aff_path: second argument handed to ``hunspell.HunSpell``

    Returns: None
    """
    pipeline = spacy.load(biospacy)
    self.nlp = pipeline

    # Build the individual components first ...
    abbreviations = AbbreviationDetector(pipeline)  # abbreviation detector
    negation = Negex(pipeline)  # negation detector
    self.hun = hunspell.HunSpell(dict_path, aff_path)  # spell checker
    # tunable params - umls entity linker
    self.linker = UmlsEntityLinker(
        k=10,
        max_entities_per_mention=2,
        resolve_abbreviations=True,
    )

    # ... then register them with the spaCy processing pipeline.
    pipeline.add_pipe(abbreviations, name="abbrv_detector")
    pipeline.add_pipe(self.linker, after="abbrv_detector")
    pipeline.add_pipe(negation, last=True)
    pipeline.add_pipe(
        self.expand_entity_mentions, name='expand_entities', after='ner'
    )

    # hand-crafted rules
    self.rules = utils.read_rules(rules)
    # hand-crafted rules restricted to a specific use-case
    # (updated w/ self.set_rules() func)
    self.use_case_rules = dict()
    # candidate mentions from restricted rules
    self.use_case_candidates = list()

    # dysplasia mappings
    self.dysplasia = utils.read_dysplasia_mappings(dysplasia_mappings)
    # dysplasia mappings restricted to a specific use-case
    self.use_case_dysplasia = dict()
def add_pipe(self, pipe):
    """Add Spacy pipes

    Args:
        pipe (str): pipe name, matched case-insensitively against
            'abbreviation', 'entitylinker', 'segmenter', 'tokenizer' and
            'textrank'. Any other name installs nothing.
    """
    print('Loading Spacy pipe: {}'.format(pipe))

    def install_abbreviation():
        # Abbreviation extraction
        self.nlp.add_pipe(AbbreviationDetector(self.nlp))

    def install_entity_linker():
        # Entity linker
        self.nlp.add_pipe(UmlsEntityLinker(resolve_abbreviations=True))

    def install_segmenter():
        # Rule Segmenter
        self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)

    def install_tokenizer():
        # Tokenizer
        self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)

    def install_textrank():
        # Textrank
        ranker = pytextrank.TextRank()
        self.nlp.add_pipe(ranker.PipelineComponent, name='textrank', last=True)

    installers = {
        'abbreviation': install_abbreviation,
        'entitylinker': install_entity_linker,
        'segmenter': install_segmenter,
        'tokenizer': install_tokenizer,
        'textrank': install_textrank,
    }
    install = installers.get(pipe.lower())
    if install is not None:
        install()
    print('Pipe loaded.')
def initialize_nlp(virus_lex_path: str, scispacy_model_name: str = "en_core_sci_lg"):
    """
    Initialize scispacy nlp object and add virus terms to the vocabulary.

    :param virus_lex_path: path to virus lexicon (header-less CSV; the first
        column holds the terms)
    :param scispacy_model_name: name of scispacy model to use for w2v vectors
    :return: Scispacy nlp object
    """
    # Load the scispacy model without the parser. ``disable`` takes a list of
    # component names; the bare string used before only worked by accident.
    nlp = spacy.load(scispacy_model_name, disable=['parser'])

    # Enable abbreviation detection and umls entity detection. The
    # abbreviation detector has to run BEFORE the linker: a linker created
    # with resolve_abbreviations=True reads the long forms the detector sets,
    # and those do not exist yet if the linker comes first in the pipeline.
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)

    # Create a new vector to assign to the virus terms
    new_vector = nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus """
        """sarbecovirus of the genus Betacoronavirus. """
        """Also known as severe acute respiratory syndrome coronavirus 2, """
        """also known by 2019 novel coronavirus. It is """
        """contagious in humans and is the cause of the ongoing pandemic of """
        """coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious """
        """disease.""").vector

    # Add virus terms to the model vocabulary and assign to them the new
    # vector created above
    virus_words = pd.read_csv(virus_lex_path, header=None)
    for virus_word in virus_words[0]:
        nlp.vocab.set_vector(virus_word, new_vector)
    return nlp
def _remove_organs(organ2label, organ2alias, organs):
    """Delete every organ in ``organs`` from both dictionaries (KeyError if absent)."""
    for organ in organs:
        del organ2label[organ]
        del organ2alias[organ]


def _merge_organs(organ2label, organ2alias, organs, dest_organ):
    """Replace the ``organs`` entries by one ``dest_organ`` entry holding all their labels."""
    labels = []
    for organ in organs:
        labels += organ2label[organ]
        del organ2label[organ]
        del organ2alias[organ]
    organ2label[dest_organ] = labels
    organ2alias[dest_organ] = list(set([dest_organ] + organs))


def _rename_organs(organ2label, organ2alias, old_names, new_names):
    """Move each ``old_names[i]`` entry to ``new_names[i]``, swapping the name in its alias list."""
    for old_name, new_name in zip(old_names, new_names):
        organ2label[new_name] = organ2label[old_name]
        organ2alias[new_name] = organ2alias[old_name]
        if old_name in organ2alias[new_name]:
            organ2alias[new_name].remove(old_name)
        organ2alias[new_name].append(new_name)
        del organ2label[old_name]
        del organ2alias[old_name]


def _clean_aliases(aliases):
    """Strip bracketed parts and 'nos'/'structure' noise; drop aliases containing '>' or digits."""
    aliases = [
        re.sub(r"[\(\[][^)\]]+[\)\]]", r"", alias).strip() for alias in aliases
    ]
    aliases = [re.sub(r"(, )*nos$", r"", alias).strip() for alias in aliases]
    aliases = [re.sub(r"structure$", r"", alias).strip() for alias in aliases]
    aliases = [re.sub(r"structure of", r"", alias).strip() for alias in aliases]
    aliases = [alias for alias in aliases if ">" not in alias]
    aliases = [alias for alias in aliases if not re.search(r"\d+", alias)]
    return list(set(aliases))


def _dump_json(organs_dir_path, file_name, payload):
    """Write ``payload`` as JSON to ``file_name`` inside ``organs_dir_path``."""
    with open(os.path.join(organs_dir_path, file_name), "w") as outfile:
        json.dump(payload, outfile)


def create_organ_dicts(sio_atlas_path, organs_dir_path):
    """Build the organ dictionaries from an atlas directory and write them as JSON.

    Reads ``classes.txt`` and the ``labels`` directory below ``sio_atlas_path``,
    prunes, merges and renames organs, expands the organ names with UMLS alias
    terms, and writes ``organ2ind``, ``ind2organ``, ``organ2label``,
    ``organ2alias``, ``organ2center``, ``organ2voxels`` and ``organ2summary``
    JSON files into ``organs_dir_path`` (created if missing).

    Args:
        sio_atlas_path: Directory containing ``classes.txt`` and ``labels``.
        organs_dir_path: Output directory for the JSON files.

    Raises:
        KeyError: If an organ named in one of the hard-coded lists is not
            present in ``classes.txt``.
    """
    voxelman_images_path = os.path.join(sio_atlas_path, "labels")
    organ_list_path = os.path.join(sio_atlas_path, "classes.txt")
    # Read through a context manager so the handle is closed again.
    with open(organ_list_path) as organ_list_file:
        organ_list = organ_list_file.read().strip().split("\n")

    # Extract list of labels: every entry is a quoted organ name followed by
    # its whitespace-separated integer labels.
    organ2label = {}
    for entry in organ_list:
        name, labels = entry.split('" ')
        organ2label[name[1:]] = [int(label) for label in labels.split()]

    # Keep track of mergers
    organ2alias = {organ: [organ] for organ in organ2label}

    # Removal of bones, limb tissues and location unspecific tissues
    organs_to_remove = (
        "bones of the left hand, bones of the right hand, "
        "cervical vertebra C5, cervical vertebra C6, cervical vertebra C7, "
        "coccyx, grey matter, "
        "intervertebral disc C6/C7, intervertebral disc C7/T1, "
        "intervertebral disc L1/L2, intervertebral disc L2/L3, "
        "intervertebral disc L3/L4, intervertebral disc L4/L5, "
        "intervertebral disc L5/S1, intervertebral disc S1/S2, "
        "intervertebral disc T1/T2, intervertebral disc T2/T3, "
        "intervertebral disc T3/T4, intervertebral disc T4/T5, "
        "intervertebral disc T5/T6, intervertebral disc T6/T7, "
        "intervertebral disc T7/T8, intervertebral disc T8/T9, "
        "intervertebral disc T9/T10, intervertebral disc T10/T11, "
        "intervertebral disc T11/T12, intervertebral disc T12/L1, "
        "left rib 1, left rib 2, left rib 3, left rib 4, left rib 5, "
        "left rib 6, left rib 7, left rib 8, left rib 9, left rib 10, "
        "left rib 11, left rib 12, "
        "left ulna, left scapula, left radius, left humerus, left hip bone, "
        "left femur, left clavicle, "
        "muscles of the left arm, muscles of the right arm, "
        "lumbar vertebra L1, lumbar vertebra L2, lumbar vertebra L3, "
        "lumbar vertebra L4, lumbar vertebra L5, "
        "marker 1, marker 2, marker 3, "
        "right rib 1, right rib 2, right rib 3, right rib 4, right rib 5, "
        "right rib 6, right rib 7, right rib 8, right rib 9, right rib 10, "
        "right rib 11, right rib 12, "
        "right ulna, right scapula, right radius, right humerus, "
        "right hip bone, right femur, right clavicle, "
        "skin of the left arm, skin of the right arm, "
        "thoracic vertebra T1, thoracic vertebra T2, thoracic vertebra T3, "
        "thoracic vertebra T4, thoracic vertebra T5, thoracic vertebra T6, "
        "thoracic vertebra T7, thoracic vertebra T8, thoracic vertebra T9, "
        "thoracic vertebra T10, thoracic vertebra T11, thoracic vertebra T12, "
        "unclassified bones, unclassified cartilage, unclassified muscles, "
        "unclassified skin, unclassified tissue, "
        "unclassified tissue of the left arm, "
        "unclassified tissue of the right arm, "
        "unclassified veins, white matter, sternum, sacrum, "
        "left costal cartilage 1, left costal cartilage 2, "
        "left costal cartilage 3, left costal cartilage 4, "
        "left costal cartilage 5, left costal cartilage 6-9, "
        "right costal cartilage 1, right costal cartilage 2, "
        "right costal cartilage 3, right costal cartilage 4, "
        "right costal cartilage 5, right costal cartilage 6-9, "
        "right clavicular cartilage, left clavicular cartilage"
    )
    _remove_organs(organ2label, organ2alias, organs_to_remove.split(", "))

    # Removal of bilateral organs on the right side
    organs_to_remove_right = (
        "right atrium, right external oblique, right iliacus, "
        "right internal oblique, right jugular vein, right kidney, "
        "right lung, right obturator internus, right psoas, "
        "right rectus abdominis, right renal medulla, right renal vein, "
        "right subclavian vein, right transversus abdominis, right ventricle"
    )
    _remove_organs(organ2label, organ2alias, organs_to_remove_right.split(", "))

    # Removal of thorax muscles, scrotum visceral fat
    organs_to_remove_muscles = (
        "scrotum, visceral fat, left psoas, left iliacus, "
        "left external oblique, left rectus abdominis, "
        "left internal oblique, left transversus abdominis, "
        "left obturator internus, ischiocavernosus, pelvic diaphragm, "
        "rectus sheath"
    )
    _remove_organs(organ2label, organ2alias, organs_to_remove_muscles.split(", "))

    # Removal of blood vessels
    organs_to_remove_blood_vessels = (
        "superior vena cava, superior mesenteric vein, splenic vein, "
        "pulmonary veins, pulmonary trunk, pulmonary arteries, portal vein, "
        "left subclavian vein, left jugular vein, inferior vena cava, "
        "inferior mesenteric vein, hepatic veins, descending aorta, "
        "brachiocephalic vein, azygos vein, arch of aorta, abdominal aorta, "
        "left renal vein, ascending aorta"
    )
    _remove_organs(
        organ2label, organ2alias, organs_to_remove_blood_vessels.split(", ")
    )

    # Removal of small organs with less than 1000 voxels
    _remove_organs(organ2label, organ2alias, "cystic duct".split(", "))

    # Mergers of stomach segments into "stomach"
    organs_to_merge_stomach = (
        "fundus of stomach, greater curvature, lesser curvature, "
        "body of stomach, cardia, stomach"
    )
    _merge_organs(
        organ2label, organ2alias, organs_to_merge_stomach.split(", "), "stomach"
    )

    # Mergers of colon segments into "colon"
    organs_to_merge_colon = (
        "ascending colon, descending colon, transverse colon, sigmoid colon, "
        "left colic flexure, right colic flexure"
    )
    _merge_organs(
        organ2label, organ2alias, organs_to_merge_colon.split(", "), "colon"
    )

    # Mergers of penis segments into "penis"
    organs_to_merge_penis = "penis, corpus cavernosum penis, corpus spongiosum penis"
    _merge_organs(
        organ2label, organ2alias, organs_to_merge_penis.split(", "), "penis"
    )

    # Mergers of trachea and trachea lumen into "trachea"
    _merge_organs(
        organ2label, organ2alias, "trachea, trachea lumen".split(", "), "trachea"
    )

    # Mergers of left kidney and left renal medulla into "left kidney"
    _merge_organs(
        organ2label,
        organ2alias,
        "left renal medulla, left kidney".split(", "),
        "left kidney",
    )

    # Renaming of paired organs to just the name of the organ
    _rename_organs(
        organ2label,
        organ2alias,
        "left ventricle, left atrium, left kidney, left lung".split(", "),
        "ventricle, atrium, kidney, lung".split(", "),
    )

    # Renaming duodenum (retroperitoneal part) to duodenum
    _rename_organs(
        organ2label,
        organ2alias,
        "duodenum (retroperitoneal part)".split(", "),
        "duodenum".split(", "),
    )

    # Adding jejunum and ileum aliases to small intestine.
    # Perhaps later we can check if sentences with jejunum is above ileum
    # (as it should be)
    organ2alias["small intestine"].extend("jejunum, ileum".split(", "))
    # Adding heart atria alias to atrium
    organ2alias["atrium"].extend("heart atria".split(", "))
    # Adding heart ventricles alias to ventricle
    organ2alias["ventricle"].extend("heart ventricles".split(", "))
    # Adding cecum alias to caecum
    organ2alias["caecum"].extend("cecum".split(", "))
    # Adding ampulla of vater alias to ampulla. This step used to be
    # duplicated; the list is overwritten under "Random fixes" below and
    # de-duplicated later, so a single pass yields the same result.
    organ2alias["ampulla"].extend("ampulla of vater".split(", "))
    # Adding seminal vesicles alias to seminal gland
    organ2alias["seminal gland"].extend("seminal vesicles".split(", "))
    # Adding the "colon, <segment>" spellings and colic flexure names to colon
    organ2alias["colon"].extend(
        [
            "colon, ascending",
            "colon, descending",
            "colon, transverse",
            "colon, sigmoid",
            "hepatic flexure",
            "splenic flexure",
            "colic flexure",
        ]
    )

    # Random fixes
    organ2alias["kidney"] = ["renal medulla", "kidney"]
    organ2alias["colon"].remove("right colic flexure")
    organ2alias["colon"].remove("left colic flexure")
    organ2alias["bronchi"].append("bronchus")
    organ2alias["ampulla"] = ["ampulla", "ampulla of vater"]

    # Generate alias terms
    print("Generating Alias Terms...")
    nlp = spacy.load("en_core_sci_sm")
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)
    all_organ_words = [
        item for sublist in organ2alias.values() for item in sublist
    ]
    organ_name_aliases = retrieve_alias_terms(all_organ_words, nlp, linker)
    # Only values of existing keys are replaced, so iterating while assigning
    # is safe here.
    for organ, aliases in organ2alias.items():
        new_aliases = []
        for alias in aliases:
            new_aliases.extend(organ_name_aliases[alias])
        organ2alias[organ] = list(set(aliases + new_aliases))
    for organ, aliases in organ2alias.items():
        organ2alias[organ] = _clean_aliases(aliases)

    # Generate voxels
    print("Generating Dictionaries...")
    if not os.path.exists(organs_dir_path):
        os.makedirs(organs_dir_path)

    organ2ind = dict(zip(organ2alias.keys(), range(len(organ2alias))))
    ind2organ = dict(zip(range(len(organ2alias)), organ2alias.keys()))
    _dump_json(organs_dir_path, "organ2ind.json", organ2ind)
    _dump_json(organs_dir_path, "ind2organ.json", ind2organ)
    _dump_json(organs_dir_path, "organ2label.json", organ2label)
    _dump_json(organs_dir_path, "organ2alias.json", organ2alias)

    organ2voxels = generate_organ2voxels(voxelman_images_path, organ2label)
    organ2center = {}
    for organ, labels in organ2label.items():
        organ2center[organ] = get_center_of_mass(labels, voxelman_images_path)
        in_organ = point_within_organ(
            organ2center[organ], labels, voxelman_images_path
        )
        if in_organ:
            print("Center of mass is inside organ")
        else:
            print("Center of mass is not inside organ, that is an error")
    organ2summary = create_organ2summary(organ2voxels, 1000)

    _dump_json(organs_dir_path, "organ2center.json", organ2center)
    _dump_json(organs_dir_path, "organ2voxels.json", organ2voxels)
    _dump_json(organs_dir_path, "organ2summary.json", organ2summary)
def detect(self, text, detect_relations=False, resolve_abbreviations=False, link_with_umls=False, verbose=False):
    """Detect named entities, and optionally relations, abbreviations and UMLS links.

    The spaCy model named by ``self.__model`` is loaded on every call.

    Args:
        text: Text to analyse.
        detect_relations: Also collect relation candidates (the last matcher
            hit on the document plus verb chunks of every sentence).
        resolve_abbreviations: Add an ``AbbreviationDetector`` and collect the
            detected abbreviations (they are only reported, not substituted).
        link_with_umls: Add a ``UmlsEntityLinker`` and link the detected
            entities/relations to UMLS concepts.
        verbose: Print progress and intermediate results.

    Returns:
        tuple: ``(ner, relations, linked)`` - a set of entity texts, a set of
        relation strings, and a dict mapping entity text to a
        ``'sameas\\tumls:<cui>\\t<canonical name>\\t'`` string.
    """
    if verbose:
        print('-- Will detect named entities using scispaCy.')
        if detect_relations:
            print('-- Will detect relations.')
        if resolve_abbreviations:
            print('-- Will detect abbreviations.')
        if link_with_umls:
            print('-- Will search for UMLS matches.')

    nlp = spacy.load(self.__model)
    if link_with_umls:
        umls_linker = UmlsEntityLinker(k=10, max_entities_per_mention=1)
        nlp.add_pipe(umls_linker)
    if resolve_abbreviations:
        abbrev_detector = AbbreviationDetector(nlp)
        nlp.add_pipe(abbrev_detector)
    doc = nlp(text)

    # Named Entities Detected:
    ner = set([X.text for X in doc.ents])
    if verbose:
        print('Named Entities detected: {}'.format(ner))

    relations = set()
    if detect_relations:
        matcher = Matcher(nlp.vocab)
        pattern = [{'DEP':'ROOT'},
                   {'DEP':'prep','OP':"?"},
                   {'DEP':'agent','OP':"?"},
                   {'POS':'ADJ','OP':"?"}]
        matcher.add("matching_1", None, pattern)

        # The matcher runs on the whole document, so its result does not
        # depend on the sentence: evaluate it once. An empty result is now
        # skipped instead of raising IndexError on ``matches[-1]``.
        matches = matcher(doc)
        sentences = sent_tokenize(text)
        if matches and sentences:
            _, start, end = matches[-1]
            relations.add(doc[start:end].text)

        chunkGram = r"""Chunk: {<RB.?>?<VB.?><RB.?>?}"""
        chunkParser = RegexpParser(chunkGram)
        for sentence in sentences:
            pos_tags = pos_tag(word_tokenize(sentence))
            chunked = chunkParser.parse(pos_tags)
            for sub_tree in chunked.subtrees():
                if sub_tree.label() == 'Chunk':
                    # Leaves are (token, tag) pairs; join the tokens.
                    relations.add(' '.join(leaf[0] for leaf in sub_tree.leaves()))
    if verbose:
        print('Relations detected: {}'.format(relations))

    abbrev_refs = {}
    if resolve_abbreviations:
        for abbrv in doc._.abbreviations:
            reference = abbrv._.long_form
            if verbose:
                print('- {} : {}'.format(abbrv, reference))
            abbrev_refs[abbrv] = reference
        if verbose:
            print('Abbreviations detected: {}'.format(abbrev_refs))
        # TODO implement resolution (i.e. replace detected abbreviations)

    linked = {}
    if link_with_umls:
        if verbose:
            print('Serching for UMLS matches...')
        # Evaluate on both entities and relations.
        # NOTE(review): the set is stringified (its repr) and re-parsed, so
        # the linker sees braces and quotes as part of the text.
        entities = str(ner.union(relations))
        entities = nlp(entities).ents
        for entity in entities:
            for umls_ent in entity._.umls_ents:
                Concept_Id, Score = umls_ent
                # Looked up unconditionally: the canonical name is needed
                # below regardless of ``verbose``.
                umls_entity = umls_linker.umls.cui_to_entity[Concept_Id]
                if verbose:
                    print("Name:" ,entity)
                    print('Concept_Id = {} Score = {}'.format(Concept_Id, Score))
                    print(umls_entity)
                if entity.text not in linked:
                    linked[entity.text] = 'sameas\tumls:{}\t{}\t'.format(Concept_Id, umls_entity.canonical_name)
                # greater scores are shown first, so no need to add smaller scores.
                break
        if verbose:
            print('UMLS matches: {}'.format(linked))
    return ner, relations, linked
from app import app
from flask import render_template, request
from spacy import displacy
from scispacy.umls_linking import UmlsEntityLinker
from app.spacy.spacy_models import MODELS
import time

# Module-level setup: the shared 'en_core_sci_sm' pipeline from MODELS gets a
# UMLS linker (abbreviation resolution off) when this module is imported.
print("Loading UMLS Tagger...")
nlp_umls = MODELS['en_core_sci_sm']
linker = UmlsEntityLinker(resolve_abbreviations=False)
nlp_umls.add_pipe(linker)
print("UMLS Tagger loaded!")


@app.route('/med_tagger')
def med_tagger():
    """Render the 'med_tagger.html' template for the /med_tagger route."""
    return render_template('med_tagger.html')


def link_to_UMLS(text: str):
    # NOTE(review): only the beginning of this function is visible here; the
    # remainder (including whatever it returns) lies outside this excerpt.
    doc = nlp_umls(text)
    entities = [entity for entity in doc.ents]
    entities_final = []
    if len(entities):
        # First UMLS candidate of every entity that has at least one.
        umls_entries = [
            entity._.umls_ents[0] for entity in entities
            if len(entity._.umls_ents)
        ]
tmp["concept_id"] = results.concept_id tmp["confidence"] = str(round(prob, 2)) tmp["canonical_name"] = results.canonical_name tmp["tui"] = results.types ents.append(tmp) if ents != []: o["linked_to"] = ents #import pprint as pp #pp.pprint(data) with open(os.path.join(os.getcwd(), fin_path), "w") as fout: fout.write(json.dumps(data)) # load pre-trained model nlp = spacy.load("en_core_sci_sm") # for details see https://github.com/allenai/scispacy #abbreviation_pipe = AbbreviationDetector(nlp) #nlp.add_pipe(abbreviation_pipe) linker = UmlsEntityLinker(resolve_abbreviations=True, threshold=0.8) nlp.add_pipe(linker) # run pipeline in folder data_dir = "./output/" files = os.listdir(os.path.join(os.getcwd(), data_dir)) for f in tqdm(files): aggregate_json(nlp, os.path.join(data_dir, f))
def load_linker():
    """Create and return a ``UmlsEntityLinker`` with abbreviation resolution enabled."""
    return UmlsEntityLinker(resolve_abbreviations=True)
from scispacy.umls_linking import UmlsEntityLinker
import json
import urllib

# NOTE(review): `urllib.request` is used below but only `urllib` is imported
# in the visible code, and `spacy` is used without a visible import - confirm
# both are brought into scope elsewhere.
# NOTE(review): the URL carries a `token` query parameter; check that it is
# acceptable to keep it in source.
url = "https://raw.githubusercontent.com/allenai/scifact-annotate/master/app/claims/inputs/mock.jsonl?token=AHC7B3FM4TX44DFPTB4NNUK6HS42I"
response = urllib.request.urlopen(url)
string = response.read().decode('utf-8')

# One JSON document per line; only the first ten lines are processed.
stringsplit = string.split('\n')
stringsplit = stringsplit[:10]
data = [json.loads(c) for c in stringsplit]

# Pipeline with a UMLS linker that keeps one candidate per mention.
nlp = spacy.load("en_core_sci_sm")
linker = UmlsEntityLinker(resolve_abbreviations=True, max_entities_per_mention=1)
nlp.add_pipe(linker)

# Results are collected per 'citing_id'.
outputdict = {}
for i in range(len(data)):
    outputdict[data[i]['citing_id']] = []
    doc = nlp(data[i]['paragraph_text_orig']['text'])
    #print('\n')
    for e in doc.ents:
        #print("Name: ", e)
        for umls_ent in e._.umls_ents:
            # The knowledge-base entry is stringified and picked apart by
            # text position: first line, split on ', ', second field with
            # its first six characters dropped.
            info = str(linker.umls.cui_to_entity[umls_ent[0]])
            lines = info.split('\n')
            cuiandname = lines[0].split(', ')
            fname = cuiandname[1][6:]
import scispacy
import spacy
from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker
from scispacy.umls_semantic_type_tree import SemanticTypeNode
from typing import List, Set, Dict, Tuple, Optional

# load large model.
nlp = spacy.load("en_core_sci_lg")

# add abbreviation pipe to the model (ahead of the linker, which is set up to
# resolve abbreviations).
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

# add UMLS linker to the model.
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp.add_pipe(linker)

# Sample passage containing two parenthesised abbreviations (MDSC, HCC).
text = """ Myeloid derived suppressor cells (MDSC) are immature myeloid cells with immunosuppressive activity. They accumulate in tumor-bearing mice and humans with different types of cancer, including hepatocellular carcinoma (HCC). """
doc = nlp(text)

# process sentences.
# print('sents:')
# print(list(doc.sents))
import numpy as np
import scispacy
import spacy
from scispacy.umls_linking import UmlsEntityLinker
import json
import random
import requests

# Pipeline with a default-configured UMLS linker, plus a stand-alone
# tokenizer created from the pipeline's defaults.
nlp = spacy.load("en_core_sci_lg")
linker = UmlsEntityLinker()
nlp.add_pipe(linker)
tokenizer = nlp.Defaults.create_tokenizer(nlp)

# Import json data
with open(
        '../emrQG/relations.json'
) as json_file:  # You can modify this line to change your input directory
    data = json.load(json_file)


# Process text to add whitespace between word/letter tokens and punctuation tokens.
def process_text(text):
    """Tokenize ``text`` with the module-level tokenizer and re-join the tokens with single spaces."""
    temp = []
    token = tokenizer(text)  # Use Spacy to do tokenization
    temp.extend(i.text for i in token)
    # return " ".join(Final)
    return " ".join(temp)