Example #1
    def process_data(pid, doc_list):
        nlp = spacy.load("en_core_sci_sm")
        nlp.add_pipe(
            AbbreviationDetector(nlp))  # Add abbreviation detection module
        linker = UmlsEntityLinker(resolve_abbreviations=True)
        nlp.add_pipe(linker)  # Add Entity linking module

        data = []
        for i, doc in enumerate(doc_list):
            sci_res = nlp(doc['text'])
            results = {}

            for ent in sci_res.ents:
                start, end = ent.start_char, ent.end_char
                results[(start, end)] = ent._.umls_ents

            doc['result'] = results
            data.append(doc)

            if i % 10 == 0:
                print('Completed [{}] {}, {}'.format(
                    pid, i,
                    time.strftime("%d_%m_%Y") + '_' +
                    time.strftime("%H:%M:%S")))

        return data
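
A possible driver for process_data above, sketched under the assumption that the function is called once per worker (the pid argument is only used for progress logging); the round-robin chunking scheme here is hypothetical:

import multiprocessing

def run_parallel(docs, n_workers=4):
    # Split the documents into one chunk per worker.
    chunks = [docs[i::n_workers] for i in range(n_workers)]
    with multiprocessing.Pool(n_workers) as pool:
        # starmap passes (pid, chunk) pairs to process_data.
        results = pool.starmap(process_data, enumerate(chunks))
    # Flatten the per-worker lists back into one list.
    return [doc for chunk in results for doc in chunk]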
Example #2
from collections import OrderedDict


def unified_medical_language_entity_linker(model, document):
    """
    This function links named entities to the Unified Medical Language System UMLS (https://www.nlm.nih.gov/research/umls/)

    Parameters:
         model(module): A pretrained biomedical model from ScispaCy(https://allenai.github.io/scispacy/)
         document(str): Document to be processed

    Returns: Attributes of Named entities accessible in the Unified Medical Language System database
     """
    nlp = model.load()
    linker = UmlsEntityLinker(
        k=10, max_entities_per_mention=2)  # parameters are tunable
    nlp.add_pipe(linker)
    doc = nlp(document)
    entity = doc.ents
    entity = [str(item) for item in entity]  # convert each entity to a string
    entity = str(OrderedDict.fromkeys(entity))  # keep unique entities only
    entity = nlp(entity).ents  # convert unique entities back to a '.ents' object
    for unique_ent in entity:
        for umls_ent in unique_ent._.umls_ents:
            print("Entity Name:", unique_ent)
            concept_id, score = umls_ent
            print("Concept_Id = {} Score = {}".format(concept_id, score))
            print(linker.umls.cui_to_entity[concept_id])
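
A hedged call sketch for the function above, assuming the en_core_sci_sm model package is installed (the module itself is passed, matching the nlp = model.load() call):

import en_core_sci_sm  # assumed installed via the ScispaCy model URL

unified_medical_language_entity_linker(
    en_core_sci_sm,
    "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")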
Example #3
def init_nlp():
    spacy_nlp = spacy.load('en_core_sci_lg')
    new_vector = spacy_nlp(
        """Positive-sense single-stranded ribonucleic acid virus, subgenus
        sarbecovirus of the genus Betacoronavirus.
        Also known as severe acute respiratory syndrome coronavirus 2,
        also known by 2019 novel coronavirus. It is
        contagious in humans and is the cause of the ongoing pandemic of
        coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious
        disease.""").vector
    vector_data = {
        "COVID-19": new_vector,
        "2019-nCoV": new_vector,
        "SARS-CoV-2": new_vector
    }
    for word, vector in vector_data.items():
        spacy_nlp.vocab.set_vector(word, vector)

    spacy_nlp.max_length = 2000000

    # We also need to detect language, or else we'll be parsing non-english text
    # as if it were English.
    #spacy_nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
    abbreviation_pipe = AbbreviationDetector(spacy_nlp)
    spacy_nlp.add_pipe(abbreviation_pipe)

    # Our linker will look up named entities/concepts in the UMLS graph and normalize
    # the data for us.
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    spacy_nlp.add_pipe(linker)

    return (spacy_nlp, linker)
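
A minimal sketch of consuming the returned pair; it assumes the UMLS resources are downloaded by ScispaCy on first use, and reuses the `._.umls_ents` / `cui_to_entity` access pattern seen in the other examples here:

nlp, linker = init_nlp()
doc = nlp("SARS-CoV-2 is the cause of COVID-19.")
for ent in doc.ents:
    for cui, score in ent._.umls_ents:
        # cui_to_entity maps a CUI string to its UMLS record.
        print(ent.text, cui, score, linker.umls.cui_to_entity[cui].canonical_name)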
Example #4
    def __init__(self):
        self.tagger = en_ner_bc5cdr_md.load()
        self.abbreviation_pipe = AbbreviationDetector(self.tagger)
        self.tagger.add_pipe(self.abbreviation_pipe)
        self.linker = UmlsEntityLinker(resolve_abbreviations=True,
                                       max_entities_per_mention=1)
        self.tagger.add_pipe(self.linker)
        print('NER Module Ready')
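
The snippet above is a constructor excerpt from a larger class; a hypothetical way to use it (the class name is assumed):

# ner_module = NerModule()  # hypothetical class name
# doc = ner_module.tagger("Aspirin-induced asthma was ruled out.")
# print(doc.ents)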
Example #5
    def setUp(self):
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")

        umls_fixture = UmlsKnowledgeBase("tests/fixtures/umls_test_fixture.json", "tests/fixtures/test_umls_tree.tsv")
        with tempfile.TemporaryDirectory() as dir_name:
            umls_concept_aliases, tfidf_vectorizer, ann_index = create_tfidf_ann_index(dir_name, umls_fixture)
        candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer, umls_concept_aliases, umls_fixture)

        self.linker = UmlsEntityLinker(candidate_generator, filter_for_definitions=False)
Example #6
    def __init__(self, args):
        import scispacy, spacy
        from scispacy.abbreviation import AbbreviationDetector
        from scispacy.umls_linking import UmlsEntityLinker

        self.nlp = spacy.load("en_core_sci_sm")
        self.nlp.add_pipe(AbbreviationDetector(
            self.nlp))  # Add abbreviation detection module
        linker = UmlsEntityLinker(
            resolve_abbreviations=True)  # Add Entity linking module
        self.nlp.add_pipe(linker)
Example #7
def scispacy_el(sent: str):
    """
    Test this code!
    """
    # `nlp` is assumed to be a module-level ScispaCy pipeline,
    # e.g. nlp = spacy.load("en_core_sci_sm").
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)

    doc = nlp(sent)

    entities = doc.ents

    import pdb
    pdb.set_trace()  # drop into the debugger to inspect `entities`
Example #8
    def __post_init__(self):
        self.nlp = spacy.load(self.language_model)
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)

        # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)

        # Our linker will look up named entities/concepts in the UMLS graph and normalize the data
        # for us.
        self.linker = UmlsEntityLinker(resolve_abbreviations=True)
        self.nlp.add_pipe(self.linker)
Example #9
def loadModel(model):
    """
    Loading Named Entity Recognition model.
    Args:
        model: model module to load; options: en_core_sci_sm, en_core_sci_lg, en_ner_bc5cdr_md

    Returns:
        nlp: loaded model
        linker: loaded add-on
    """
    # Load the model
    nlp = model.load()

    # Add the UMLS entity linker to the pipeline
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)

    logging.info("Model and add-ons successfully loaded.")
    return nlp, linker
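
Example call, assuming one of the documented model packages is installed:

import en_core_sci_sm

nlp, linker = loadModel(en_core_sci_sm)
doc = nlp("Myeloid derived suppressor cells (MDSC) are immature myeloid cells.")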
Example #10
def init_umls_nlp_linker():
    base_dir = ''
    tfidf_path = base_dir + 'tfidf_vectors_sparse.npz'
    ann_path = base_dir + 'nmslib_index.bin'
    ann_index = load_approximate_nearest_neighbours_index(
        tfidf_vectors_path=tfidf_path, ann_index_path=ann_path)
    vec = joblib.load(cached_path(base_dir + 'tfidf_vectorizer.joblib'))
    with open(cached_path(base_dir + 'concept_aliases.json')) as fp:
        ann_concept = json.load(fp)
    umls_knowledge_base = UmlsKnowledgeBase(
        file_path=base_dir + 'umls_2017_aa_cat0129.json',
        types_file_path=base_dir + 'umls_semantic_type_tree.tsv')
    cg = CandidateGenerator(ann_index=ann_index,
                            tfidf_vectorizer=vec,
                            ann_concept_aliases_list=ann_concept,
                            umls=umls_knowledge_base)
    linker = UmlsEntityLinker(candidate_generator=cg,
                              max_entities_per_mention=1)
    # `nlp` is assumed to be a module-level spaCy pipeline defined elsewhere.
    nlp.add_pipe(linker)
    return linker
Example #11
    def __init__(self, umls_version=None):
        
        if umls_version is None:
#             if os.path.exists("nlp_model"):
#                 print("loading nlp model from path")
#                 from spacy.language import Language
#                 Language.factories['EntityLinker'] = lambda nlp, **cfg: UmlsEntityLinker( **cfg)
#                 self.nlp = spacy.load('nlp_model')
                
#             else:
            print("creating nlp model")
            self.linker = UmlsEntityLinker(resolve_abbreviations=True)
            self.nlp = spacy.load("en_core_sci_sm")
            self.nlp.add_pipe(self.linker)
            self.umls_data = self.linker.kb.cui_to_entity
#             self.nlp.to_disk('nlp_model')
        else:
            self.umls_data = None
            self.load(umls_version)
            
        self.umls_version = umls_version
Example #12
    def __init__(self, biospacy, rules, dysplasia_mappings, dict_path,
                 aff_path):
        """
		Load models and rules

		Params:
			biospacy (str): full spaCy pipeline for biomedical data
			rules (str): hand-crafted rules file path
			dysplasia_mappings (str): dysplasia mappings file path

		Returns: None
		"""

        self.nlp = spacy.load(biospacy)

        abbreviation_pipe = AbbreviationDetector(
            self.nlp)  # add abbreviation detector to spaCy pipeline
        negex = Negex(self.nlp)  # add negation detector to spaCy pipeline
        self.hun = hunspell.HunSpell(dict_path, aff_path)  # add spell checker
        self.linker = UmlsEntityLinker(
            k=10, max_entities_per_mention=2, resolve_abbreviations=True
        )  # tunable params - add umls entity linker to spaCy pipeline
        self.nlp.add_pipe(abbreviation_pipe, name="abbrv_detector")
        self.nlp.add_pipe(self.linker, after="abbrv_detector")
        self.nlp.add_pipe(negex, last=True)
        self.nlp.add_pipe(
            self.expand_entity_mentions, name='expand_entities', after='ner'
        )  # add expand_entity_mentions to spaCy processing pipeline

        # load hand-crafted rules
        self.rules = utils.read_rules(rules)
        # set parameter to store the hand-crafted rules restricted to a specific use-case (updated w/ self.set_rules() func)
        self.use_case_rules = dict()
        # set parameter to store candidate mentions from restricted rules
        self.use_case_candidates = list()

        # load dysplasia mappings
        self.dysplasia = utils.read_dysplasia_mappings(dysplasia_mappings)
        # set parameter to store dysplasia mappings restricted to a specific use-case
        self.use_case_dysplasia = dict()
Example #13
    def add_pipe(self, pipe):
        """Add Spacy pipes

        Args:
            pipe (str): pipe name
        """
        print('Loading Spacy pipe: {}'.format(pipe))
        pipe = pipe.lower()
        if pipe == 'abbreviation':  # Abbreviation extraction
            abbreviation_pipe = AbbreviationDetector(self.nlp)
            self.nlp.add_pipe(abbreviation_pipe)
        elif pipe == 'entitylinker':  # Entity linker
            linker = UmlsEntityLinker(resolve_abbreviations=True)
            self.nlp.add_pipe(linker)
        elif pipe == 'segmenter':  # Rule Segmenter
            self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
        elif pipe == 'tokenizer':  # Tokenizer
            self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)
        elif pipe == 'textrank':  # Textrank
            tr = pytextrank.TextRank()
            self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
        print('Pipe loaded.')
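
Hypothetical usage of the dispatcher above, assuming the surrounding class loads self.nlp in its constructor:

# pipeline = SpacyPipeline()  # hypothetical owner class
# pipeline.add_pipe('abbreviation')
# pipeline.add_pipe('entitylinker')
# pipeline.add_pipe('textrank')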
Example #14
def initialize_nlp(virus_lex_path: str,
                   scispacy_model_name: str = "en_core_sci_lg"):
    """
    Initialize scispacy nlp object and virus terms to the vocabulary.

    :param virus_lex_path: path to virus lexicon
    :param scispacy_model_name: name of scispacy model to use for w2v vectors
    :return: Scispacy nlp object
    """
    # Load the ScispaCy model, skipping the parser to save memory
    # (loading via the model module, e.g. en_core_sci_lg.load(), also works):
    nlp = spacy.load(scispacy_model_name, disable=['parser'])
    # Enable umls entity detection and abbreviation detection
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

    # Create a new vector to assign to the virus terms
    new_vector = nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus """
        """sarbecovirus of the genus Betacoronavirus. """
        """Also known as severe acute respiratory syndrome coronavirus 2, """
        """also known by 2019 novel coronavirus. It is """
        """contagious in humans and is the cause of the ongoing pandemic of """
        """coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious """
        """disease.""").vector

    # Add virus terms to the model vocabulary and assign to them the new vector created above
    # vocab = Vocab()
    virus_words = pd.read_csv(virus_lex_path, header=None)
    for virus_word in virus_words[0]:
        nlp.vocab.set_vector(virus_word, new_vector)

    return nlp
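
A sketch of calling initialize_nlp; the lexicon path is hypothetical and is expected to point at a header-less, single-column CSV of virus terms:

nlp = initialize_nlp("virus_lexicon.csv")  # hypothetical path
doc = nlp("2019-nCoV was later renamed SARS-CoV-2.")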
Example #15
import scispacy
import spacy

from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker
from scispacy.umls_semantic_type_tree import SemanticTypeNode

from typing import List, Set, Dict, Tuple, Optional

# load large model.
nlp = spacy.load("en_core_sci_lg")
# add abbreviation pipe to the model.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)
# add UMLS linker to the model.
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp.add_pipe(linker)

text = """
Myeloid derived suppressor cells (MDSC) are immature
myeloid cells with immunosuppressive activity.
They accumulate in tumor-bearing mice and humans
with different types of cancer, including hepatocellular
carcinoma (HCC).
"""
doc = nlp(text)

# process sentences.
# print('sents:')
# print(list(doc.sents))
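
One way this example typically continues (mirroring the ScispaCy README): list the detected abbreviations, then the (CUI, score) candidates the linker attached to each entity.

# Abbreviations found by the AbbreviationDetector.
for abrv in doc._.abbreviations:
    print(f"{abrv} -> {abrv._.long_form}")

# UMLS candidates per entity.
for ent in doc.ents:
    for cui, score in ent._.umls_ents:
        print(ent.text, cui, score)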
Example #16
"""
Mention finding + linking:  25% accuracy
"""
import scispacy, spacy
from scispacy.umls_linking import UmlsEntityLinker
from clinical_data.concept_linking import load_n2c2_2019
nlp = spacy.load("en_core_sci_sm")
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlp.add_pipe(linker)

correct = 0
total = 0

for file in load_n2c2_2019(partition='test'):
    text = str(file['note'])

    correct_mentions_text = [' '.join(span.text for span in mention['mention']) for mention in file['mentions'] ]
    batch_candidates = linker.candidate_generator(correct_mentions_text, 30)
    #doc = nlp(text)

    correct_labels = [(mention['mention'][0].start_char,
                         mention['mention'][-1].end_char,
                         mention['concept']) for mention in file['mentions']]
    predicted_labels = []

    for mention, candidates in zip(correct_labels, batch_candidates):
        predicted = []
        for cand in candidates:
            score = max(cand.similarities)
            if score > linker.threshold:
                predicted.append((cand.concept_id, score))
        # Sketch of the truncated tail: count a hit when the top-ranked
        # candidate matches the gold concept for this mention.
        predicted.sort(key=lambda pair: pair[1], reverse=True)
        predicted_labels.append(predicted)
        total += 1
        correct += bool(predicted and predicted[0][0] == mention[2])

print('Accuracy: {:.1%}'.format(correct / max(total, 1)))
Example #17
def load_linker():
    linker = UmlsEntityLinker(resolve_abbreviations=True)

    return linker
Example #18
from app import app

from flask import render_template, request
from spacy import displacy
from scispacy.umls_linking import UmlsEntityLinker

from app.spacy.spacy_models import MODELS

import time

print("Loading UMLS Tagger...")
nlp_umls = MODELS['en_core_sci_sm']
linker = UmlsEntityLinker(resolve_abbreviations=False)
nlp_umls.add_pipe(linker)
print("UMLS Tagger loaded!")


@app.route('/med_tagger')
def med_tagger():
    return render_template('med_tagger.html')


def link_to_UMLS(text: str):
    doc = nlp_umls(text)
    entities = list(doc.ents)
    entities_final = []
    if len(entities):
        umls_entries = [
            entity._.umls_ents[0] for entity in entities
            if len(entity._.umls_ents)
        ]
Example #19
                            tmp["concept_id"] = results.concept_id
                            tmp["confidence"] = str(round(prob, 2))
                            tmp["canonical_name"] = results.canonical_name
                            tmp["tui"] = results.types
                            ents.append(tmp)
                    if ents:
                        o["linked_to"] = ents

    #import pprint as pp
    #pp.pprint(data)

    with open(os.path.join(os.getcwd(), fin_path), "w") as fout:
        fout.write(json.dumps(data))


# load pre-trained model
nlp = spacy.load("en_core_sci_sm")

# for details see https://github.com/allenai/scispacy
#abbreviation_pipe = AbbreviationDetector(nlp)
#nlp.add_pipe(abbreviation_pipe)

linker = UmlsEntityLinker(resolve_abbreviations=True, threshold=0.8)
nlp.add_pipe(linker)

# run pipeline in folder
data_dir = "./output/"
files = os.listdir(os.path.join(os.getcwd(), data_dir))
for f in tqdm(files):
    aggregate_json(nlp, os.path.join(data_dir, f))
Example #20
import json
import urllib.request

import spacy
from scispacy.umls_linking import UmlsEntityLinker

url = "https://raw.githubusercontent.com/allenai/scifact-annotate/master/app/claims/inputs/mock.jsonl?token=AHC7B3FM4TX44DFPTB4NNUK6HS42I"
response = urllib.request.urlopen(url)

string = response.read().decode('utf-8')
stringsplit = string.split('\n')
stringsplit = stringsplit[:10]

data = [json.loads(c) for c in stringsplit]

nlp = spacy.load("en_core_sci_sm")

linker = UmlsEntityLinker(resolve_abbreviations=True,
                          max_entities_per_mention=1)
nlp.add_pipe(linker)

outputdict = {}

for i in range(len(data)):
    outputdict[data[i]['citing_id']] = []
    doc = nlp(data[i]['paragraph_text_orig']['text'])
    #print('\n')
    for e in doc.ents:
        #print("Name: ", e)
        for umls_ent in e._.umls_ents:
            info = str(linker.umls.cui_to_entity[umls_ent[0]])
            lines = info.split('\n')
            cuiandname = lines[0].split(', ')
            fname = cuiandname[1][6:]
Example #21
    def detect(self, text, detect_relations=False, resolve_abbreviations=False, link_with_umls=False, verbose=False):
        if verbose:
            print('-- Will detect named entities using scispaCy.')
            if detect_relations:
                print('-- Will detect relations.')
            if resolve_abbreviations:
                print('-- Will detect abbreviations.')
            if link_with_umls:
                print('-- Will search for UMLS matches.')

        nlp = spacy.load(self.__model)
        if link_with_umls:
            umls_linker = UmlsEntityLinker(k=10, max_entities_per_mention=1)
            nlp.add_pipe(umls_linker)
        if resolve_abbreviations:
            abbrev_detector = AbbreviationDetector(nlp)
            nlp.add_pipe(abbrev_detector)

        doc = nlp(text)

        # Named Entities Detected:
        ner = set([X.text for X in doc.ents])
        if verbose:
            print('Named Entities detected: {}'.format(ner))

        relations = set()
        if detect_relations:
            matcher = Matcher(nlp.vocab)
            pattern = [{'DEP':'ROOT'},
                       {'DEP':'prep','OP':"?"},
                       {'DEP':'agent','OP':"?"},
                       {'POS':'ADJ','OP':"?"}]
            matcher.add("matching_1", None, pattern)

            for sentence in sent_tokenize(text):
                # Match within the current sentence rather than the whole doc,
                # and guard against sentences with no match.
                sent_doc = nlp(sentence)
                matches = matcher(sent_doc)
                if matches:
                    k = len(matches) - 1
                    span = sent_doc[matches[k][1]:matches[k][2]]
                    relations.add(span.text)

                tokens = word_tokenize(sentence)
                pos_tags = pos_tag(tokens)

                chunkGram = r"""Chunk: {<RB.?>?<VB.?><RB.?>?}"""
                chunkParser = RegexpParser(chunkGram)
                chunked = chunkParser.parse(pos_tags)

                for sub_tree in chunked.subtrees():
                    if sub_tree.label() == 'Chunk':
                        candidate = ''
                        for leaf in sub_tree.leaves():
                            if candidate == '':
                                candidate = leaf[0]
                            else:
                                candidate = '{} {}'.format(candidate, leaf[0])

                        relations.add(candidate)

            if verbose:
                print('Relations detected: {}'.format(relations))

        abbrev_refs = {}
        if resolve_abbreviations:
            for abbrv in doc._.abbreviations:
                reference = abbrv._.long_form

                if verbose:
                    print('- {} : {}'.format(abbrv, reference))

                abbrev_refs[abbrv] = reference

            if verbose:
                print('Abbreviations detected: {}'.format(abbrev_refs))

            #TODO implement resolution (i.e. replace detected abbreviations)

        linked = {}
        if link_with_umls:
            if verbose:
                print('Searching for UMLS matches...')

            entities = str(ner.union(relations))  # Evaluate on both entities and relations
            entities = nlp(entities).ents

            for entity in entities:
                for umls_ent in entity._.umls_ents:
                    Concept_Id, Score = umls_ent
                    # Look the concept up unconditionally: it is needed below
                    # even when verbose is off.
                    umls_entity = umls_linker.umls.cui_to_entity[Concept_Id]

                    if verbose:
                        print("Name:", entity)
                        print('Concept_Id = {} Score = {}'.format(Concept_Id, Score))
                        print(umls_entity)

                    if entity.text not in linked:  # greater scores are shown first, so no need to add smaller scores.
                        linked[entity.text] = 'sameas\tumls:{}\t{}\t'.format(Concept_Id, umls_entity.canonical_name)
                        break

            if verbose:
                print('UMLS matches: {}'.format(linked))

        return ner, relations, linked
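
A hedged usage sketch; the owner class and its constructor are not shown in this snippet, so the names below are hypothetical:

# detector = ScispacyDetector('en_core_sci_sm')  # hypothetical class
# ner, relations, linked = detector.detect(
#     text, detect_relations=True, resolve_abbreviations=True,
#     link_with_umls=True, verbose=True)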
Example #22
def create_organ_dicts(sio_atlas_path, organs_dir_path):

    voxelman_images_path = os.path.join(sio_atlas_path, "labels")
    organ_list_path = os.path.join(sio_atlas_path, "classes.txt")

    organ_list = open(organ_list_path).read().strip().split("\n")

    """Extract list of labels"""
    organ2label = {}
    for entry in organ_list:
        name, labels = entry.split('" ')
        labels = labels.split()
        organ2label[name[1:]] = [int(label) for label in labels]

    """Keep track of mergers"""
    organ2alias = {}
    for organ in organ2label.keys():
        organ2alias[organ] = [organ]

    """Removal of bones, limb tissues and location unspecific tissues"""
    organs_to_remove = "bones of the left hand, bones of the right hand, cervical vertebra C5, cervical vertebra C6, cervical vertebra C7, coccyx, grey matter, intervertebral disc C6/C7, intervertebral disc C7/T1, intervertebral disc L1/L2, intervertebral disc L2/L3, intervertebral disc L3/L4, intervertebral disc L4/L5, intervertebral disc L5/S1, intervertebral disc S1/S2, intervertebral disc T1/T2, intervertebral disc T2/T3, intervertebral disc T3/T4, intervertebral disc T4/T5, intervertebral disc T5/T6, intervertebral disc T6/T7, intervertebral disc T7/T8, intervertebral disc T8/T9, intervertebral disc T9/T10, intervertebral disc T10/T11, intervertebral disc T11/T12, intervertebral disc T12/L1, left rib 1, left rib 2, left rib 3, left rib 4, left rib 5, left rib 6, left rib 7, left rib 8, left rib 9, left rib 10, left rib 11, left rib 12, left ulna, left scapula, left radius, left humerus, left hip bone, left femur, left clavicle, muscles of the left arm, muscles of the right arm, lumbar vertebra L1, lumbar vertebra L2, lumbar vertebra L3, lumbar vertebra L4, lumbar vertebra L5, marker 1, marker 2, marker 3, right rib 1, right rib 2, right rib 3, right rib 4, right rib 5, right rib 6, right rib 7, right rib 8, right rib 9, right rib 10, right rib 11, right rib 12, right ulna, right scapula, right radius, right humerus, right hip bone, right femur, right clavicle, skin of the left arm, skin of the right arm, thoracic vertebra T1, thoracic vertebra T2, thoracic vertebra T3, thoracic vertebra T4, thoracic vertebra T5, thoracic vertebra T6, thoracic vertebra T7, thoracic vertebra T8, thoracic vertebra T9, thoracic vertebra T10, thoracic vertebra T11, thoracic vertebra T12, unclassified bones, unclassified cartilage, unclassified muscles, unclassified skin, unclassified tissue, unclassified tissue of the left arm, unclassified tissue of the right arm, unclassified veins, white matter, sternum, sacrum, left costal cartilage 1, left costal cartilage 2, left costal cartilage 3, left costal cartilage 4, left costal cartilage 5, left costal cartilage 6-9, right costal cartilage 1, right costal cartilage 2, right costal cartilage 3, right costal cartilage 4, right costal cartilage 5, right costal cartilage 6-9, right clavicular cartilage, left clavicular cartilage"  # noqa: E501
    organs_to_remove = organs_to_remove.split(", ")
    for item in organs_to_remove:
        del organ2label[item]
        del organ2alias[item]

    """Removal of bilateral organs on the right side"""
    organs_to_remove_right = "right atrium, right external oblique, right iliacus, right internal oblique, right jugular vein, right kidney, right lung, right obturator internus, right psoas, right rectus abdominis, right renal medulla, right renal vein, right subclavian vein, right transversus abdominis, right ventricle"  # noqa: E501
    organs_to_remove_right = organs_to_remove_right.split(", ")
    for item in organs_to_remove_right:
        del organ2label[item]
        del organ2alias[item]

    """Removal of thorax muscles, scrotum visceral fat"""
    organs_to_remove_muscles = "scrotum, visceral fat, left psoas, left iliacus, left external oblique, left rectus abdominis, left internal oblique, left transversus abdominis, left obturator internus, ischiocavernosus, pelvic diaphragm, rectus sheath"  # noqa: E501
    organs_to_remove_muscles = organs_to_remove_muscles.split(", ")
    for item in organs_to_remove_muscles:
        del organ2label[item]
        del organ2alias[item]

    """Removal of blood vessels"""
    organs_to_remove_blood_vessels = "superior vena cava, superior mesenteric vein, splenic vein, pulmonary veins, pulmonary trunk, pulmonary arteries, portal vein, left subclavian vein, left jugular vein, inferior vena cava, inferior mesenteric vein, hepatic veins, descending aorta, brachiocephalic vein, azygos vein, arch of aorta, abdominal aorta, left renal vein, ascending aorta"  # noqa: E501
    organs_to_remove_blood_vessels = organs_to_remove_blood_vessels.split(", ")
    for item in organs_to_remove_blood_vessels:
        del organ2label[item]
        del organ2alias[item]

    """Removal of small organs with less than 1000 voxels"""
    organs_to_remove_small = "cystic duct"
    organs_to_remove_small = organs_to_remove_small.split(", ")
    for item in organs_to_remove_small:
        del organ2label[item]
        del organ2alias[item]
        
    """Mergers of stomach segments into "stomach"""
    organs_to_merge_stomach = "fundus of stomach, greater curvature, lesser curvature, body of stomach, cardia, stomach"
    organs_to_merge_stomach = organs_to_merge_stomach.split(", ")
    dest_organ = "stomach"
    labels = []
    names = []
    for item in organs_to_merge_stomach:
        labels += organ2label[item]
        names.append(item)
        del organ2label[item]
        del organ2alias[item]
    organ2label[dest_organ] = labels
    organ2alias[dest_organ] = list(set([dest_organ] + names))

    """Mergers of colon segments into "colon"""
    organs_to_merge_colon = "ascending colon, descending colon, transverse colon, sigmoid colon, left colic flexure, right colic flexure"  # noqa: E501
    organs_to_merge_colon = organs_to_merge_colon.split(", ")
    dest_organ = "colon"
    labels = []
    names = []
    for item in organs_to_merge_colon:
        labels += organ2label[item]
        names.append(item)
        del organ2label[item]
        del organ2alias[item]
    organ2label[dest_organ] = labels
    organ2alias[dest_organ] = list(set([dest_organ] + names))

    """Mergers of penis segments into "penis"""
    organs_to_merge_penis = "penis, corpus cavernosum penis, corpus spongiosum penis"
    organs_to_merge_penis = organs_to_merge_penis.split(", ")
    dest_organ = "penis"
    labels = []
    names = []
    for item in organs_to_merge_penis:
        labels += organ2label[item]
        names.append(item)
        del organ2label[item]
        del organ2alias[item]
    organ2label[dest_organ] = labels
    organ2alias[dest_organ] = list(set([dest_organ] + names))

    """Mergers of trachea and trachea lumen into "trachea"""
    organs_to_merge_trachea = "trachea, trachea lumen"
    organs_to_merge_trachea = organs_to_merge_trachea.split(", ")
    dest_organ = "trachea"
    labels = []
    names = []
    for item in organs_to_merge_trachea:
        labels += organ2label[item]
        names.append(item)
        del organ2label[item]
        del organ2alias[item]
    organ2label[dest_organ] = labels
    organ2alias[dest_organ] = list(set([dest_organ] + names))

    """Mergers of left kidney and left renal medulla into "left kidney"""
    organs_to_merge_kidney = "left renal medulla, left kidney"
    organs_to_merge_kidney = organs_to_merge_kidney.split(", ")
    dest_organ = "left kidney"
    labels = []
    names = []
    for item in organs_to_merge_kidney:
        labels += organ2label[item]
        names.append(item)
        del organ2label[item]
        del organ2alias[item]
    organ2label[dest_organ] = labels
    organ2alias[dest_organ] = list(set([dest_organ] + names))

    """Renaming of paired organs to just the name of the organ"""
    organs_to_rename_left = "left ventricle, left atrium, left kidney, left lung"
    organs_to_rename_left = organs_to_rename_left.split(", ")
    target_names = "ventricle, atrium, kidney, lung"
    target_names = target_names.split(", ")

    for organ_to_rename, target_name in zip(organs_to_rename_left, target_names):
        organ2label[target_name] = organ2label[organ_to_rename]
        organ2alias[target_name] = organ2alias[organ_to_rename]
        if organ_to_rename in organ2alias[target_name]:
            organ2alias[target_name].remove(organ_to_rename)
            organ2alias[target_name].append(target_name)
        del organ2label[organ_to_rename]
        del organ2alias[organ_to_rename]

    """Renaming duodenum (retroperitoneal part) to duodenum"""
    organs_to_rename_duodenum = "duodenum (retroperitoneal part)"
    organs_to_rename_duodenum = organs_to_rename_duodenum.split(", ")
    target_names = "duodenum"
    target_names = target_names.split(", ")

    for organ_to_rename, target_name in zip(organs_to_rename_duodenum, target_names):
        organ2label[target_name] = organ2label[organ_to_rename]
        organ2alias[target_name] = organ2alias[organ_to_rename]
        if organ_to_rename in organ2alias[target_name]:
            organ2alias[target_name].remove(organ_to_rename)
            organ2alias[target_name].append(target_name)
        del organ2label[organ_to_rename]
        del organ2alias[organ_to_rename]

    """
    Adding jejunum and ileum aliases to small intestine
    Perhaps later we can check if sentences with jejunum is above ileum
    (as it should be)
    """
    target_organ = "small intestine"
    aliases = "jejunum, ileum"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding heart atria alias to atrium
    """
    target_organ = "atrium"
    aliases = "heart atria"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding heart ventricles alias to ventricle
    """
    target_organ = "ventricle"
    aliases = "heart ventricles"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding cecum alias to caecum
    """
    target_organ = "caecum"
    aliases = "cecum"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding ampulla of vater alias to ampulla
    """
    target_organ = "ampulla"
    aliases = "ampulla of vater"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding ampulla of vater alias to ampulla
    """
    target_organ = "ampulla"
    aliases = "ampulla of vater"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding seminal vesicles alias to seminal gland
    """
    target_organ = "seminal gland"
    aliases = "seminal vesicles"
    aliases = aliases.split(", ")
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """
    Adding colon, ascending, colon, descending, colon, transverse, colon, sigmoid alias, and colic flexure names to colon  # noqa: E501
    """
    target_organ = "colon"
    aliases = [
        "colon, ascending",
        "colon, descending",
        "colon, transverse",
        "colon, sigmoid",
        "hepatic flexure",
        "splenic flexure",
        "colic flexure",
    ]
    for alias in aliases:
        organ2alias[target_organ].append(alias)

    """Random fixes"""
    organ2alias["kidney"] = ["renal medulla", "kidney"]
    organ2alias["colon"].remove("right colic flexure")
    organ2alias["colon"].remove("left colic flexure")
    organ2alias["bronchi"].append("bronchus")
    organ2alias["ampulla"] = ["ampulla", "ampulla of vater"]

    """Generate alias terms"""
    print("Generating Alias Terms...")
    nlp = spacy.load("en_core_sci_sm")
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)

    all_organ_words = list(organ2alias.values())
    all_organ_words = [item for sublist in all_organ_words for item in sublist]
    organ_name_aliases = retrieve_alias_terms(all_organ_words, nlp, linker)
    for organ, aliases in organ2alias.items():
        new_aliases = []
        for alias in aliases:
            new_aliases.extend(organ_name_aliases[alias])
        organ2alias[organ] = list(set(aliases + new_aliases))

    for organ, aliases in organ2alias.items():
        aliases = [
            re.sub(r"[\(\[][^)\]]+[\)\]]", r"", alias).strip() for alias in aliases
        ]
        aliases = [re.sub(r"(, )*nos$", r"", alias).strip() for alias in aliases]
        aliases = [re.sub(r"structure$", r"", alias).strip() for alias in aliases]
        aliases = [re.sub(r"structure of", r"", alias).strip() for alias in aliases]
        aliases = [alias for alias in aliases if ">" not in alias]
        aliases = [alias for alias in aliases if not re.search(r"\d+", alias)]

        aliases = list(set(aliases))
        organ2alias[organ] = aliases

    """Generate voxels"""
    print("Generating Dictionaries...")

    if not os.path.exists(organs_dir_path):
        os.makedirs(organs_dir_path)

    organ2ind = dict(zip(organ2alias.keys(), range(len(organ2alias))))
    ind2organ = dict(zip(range(len(organ2alias)), organ2alias.keys()))

    with open(os.path.join(organs_dir_path, "organ2ind.json"), "w") as outfile:
        json.dump(organ2ind, outfile)
    with open(os.path.join(organs_dir_path, "ind2organ.json"), "w") as outfile:
        json.dump(ind2organ, outfile)
    with open(os.path.join(organs_dir_path, "organ2label.json"), "w") as outfile:
        json.dump(organ2label, outfile)
    with open(os.path.join(organs_dir_path, "organ2alias.json"), "w") as outfile:
        json.dump(organ2alias, outfile)

    organ2voxels = generate_organ2voxels(voxelman_images_path, organ2label)
    organ2center = {}
    for organ, labels in organ2label.items():
        organ2center[organ] = get_center_of_mass(labels, voxelman_images_path)
        in_organ = point_within_organ(organ2center[organ], labels, voxelman_images_path)
        if in_organ:
            print("Center of mass is inside organ")
        else:
            print("Center of mass is not inside organ, that is an error")
    organ2summary = create_organ2summary(organ2voxels, 1000)

    with open(os.path.join(organs_dir_path, "organ2center.json"), "w") as outfile:
        json.dump(organ2center, outfile)
    with open(os.path.join(organs_dir_path, "organ2voxels.json"), "w") as outfile:
        json.dump(organ2voxels, outfile)
    with open(os.path.join(organs_dir_path, "organ2summary.json"), "w") as outfile:
        json.dump(organ2summary, outfile)
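
A hedged invocation sketch; both paths are placeholders for the local SIO atlas and output directories:

# create_organ_dicts("data/sio_atlas", "data/organs")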
Example #23
import numpy as np
import scispacy
import spacy
from scispacy.umls_linking import UmlsEntityLinker
import json
import random
import requests

nlp = spacy.load("en_core_sci_lg")
linker = UmlsEntityLinker()
nlp.add_pipe(linker)
tokenizer = nlp.Defaults.create_tokenizer(nlp)

#Import json data
with open(
        '../emrQG/relations.json'
) as json_file:  #You can modify this line to change your input directory
    data = json.load(json_file)


# Process text to add whitespace between word/letter tokens and punctuation tokens.
def process_text(text):

    temp = []

    token = tokenizer(text)  # Use spaCy to do tokenization
    temp.extend(i.text for i in token)

    return " ".join(temp)