Example #1
    def __init__(
        self,
        nlp: Language,
        name: str = "taxonomic_abbreviation_detector",
    ) -> None:
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        AbbreviationDetector.__init__(self, nlp, name)
        self.abb_name_pattern = re.compile(r"[A-Z]{1}\.")
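A minimal usage sketch for the subclass above (only its __init__ is shown here): register it on a pipeline and read the extensions it sets. This assumes the scispaCy 0.2.x / spaCy 2.x API used throughout these examples, and a hypothetical class name TaxonomicAbbreviationDetector.

import spacy

nlp = spacy.load("en_core_sci_sm")
# Component instances are added directly in the spaCy 2.x-era API.
nlp.add_pipe(TaxonomicAbbreviationDetector(nlp))  # hypothetical class name for the snippet above

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited disease. SBMA is rare.")
for abrv in doc._.abbreviations:
    print(abrv.text, "->", abrv._.long_form)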
Example #2
def init_nlp():
    spacy_nlp = spacy.load('en_core_sci_lg')
    new_vector = spacy_nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus 
                       sarbecovirus of the genus Betacoronavirus. 
                       Also known as severe acute respiratory syndrome coronavirus 2, 
                       also known by 2019 novel coronavirus. It is 
                       contagious in humans and is the cause of the ongoing pandemic of 
                       coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious 
                       disease.""").vector
    vector_data = {
        "COVID-19": new_vector,
        "2019-nCoV": new_vector,
        "SARS-CoV-2": new_vector
    }
    for word, vector in vector_data.items():
        spacy_nlp.vocab.set_vector(word, vector)

    spacy_nlp.max_length = 2000000

    # We also need to detect language, or else we'll be parsing non-english text
    # as if it were English.
    #spacy_nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
    abbreviation_pipe = AbbreviationDetector(spacy_nlp)
    spacy_nlp.add_pipe(abbreviation_pipe)

    # Our linker will look up named entities/concepts in the UMLS graph and normalize
    # the data for us.
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    spacy_nlp.add_pipe(linker)

    return (spacy_nlp, linker)
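A usage sketch for init_nlp() under the same 0.2.x-era API: abbreviations are exposed on doc._.abbreviations, and the linker attaches (CUI, score) candidates to each entity that can be resolved against the UMLS table.

spacy_nlp, linker = init_nlp()
doc = spacy_nlp("Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) causes COVID-19.")

for abrv in doc._.abbreviations:
    print(abrv.text, "->", abrv._.long_form)

for ent in doc.ents:
    for cui, score in ent._.umls_ents:
        # cui_to_entity maps a UMLS concept id to its canonical record
        # (exposed as linker.umls here, linker.kb in later scispaCy versions).
        print(ent.text, cui, round(score, 3), linker.umls.cui_to_entity[cui].canonical_name)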
Example #3
def load_model():

    nlp = spacy.load("en_core_sci_lg")
    # Add abbreviation detector
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    return nlp
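Most snippets on this page target scispaCy 0.2.x/0.3.x with spaCy 2.x, where component instances are passed to nlp.add_pipe. Under scispaCy 0.4+ / spaCy 3.x the equivalent setup registers components by name instead; a rough sketch (check the versions you have installed):

import spacy
from scispacy.abbreviation import AbbreviationDetector  # noqa: F401  (import registers the factory)
from scispacy.linking import EntityLinker  # noqa: F401

nlp = spacy.load("en_core_sci_lg")
nlp.add_pipe("abbreviation_detector")
nlp.add_pipe("scispacy_linker",
             config={"resolve_abbreviations": True, "linker_name": "umls"})

doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")
print(doc._.abbreviations)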
Example #4
    def process_data(pid, doc_list):
        nlp = spacy.load("en_core_sci_sm")
        nlp.add_pipe(
            AbbreviationDetector(nlp))  # Add abbreviation detection module
        linker = UmlsEntityLinker(resolve_abbreviations=True)
        nlp.add_pipe(linker)  # Add Entity linking module

        data = []
        for i, doc in enumerate(doc_list):
            sci_res = nlp(doc['text'])
            res_list = {}

            for ent in sci_res.ents:
                start, end = ent.start_char, ent.end_char
                res_list[(start, end)] = ent._.umls_ents

            doc['result'] = res_list
            data.append(doc)

            if i % 10 == 0:
                print('Completed [{}] {}, {}'.format(
                    pid, i,
                    time.strftime("%d_%m_%Y") + '_' +
                    time.strftime("%H:%M:%S")))

        return data
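A sketch of how the (start, end) -> umls_ents mapping built by process_data might be consumed downstream; the document dict shape is assumed from the loop above.

docs = [{"text": "Ibuprofen is a nonsteroidal anti-inflammatory drug (NSAID)."}]
for doc in process_data(pid=0, doc_list=docs):
    for (start, end), umls_ents in doc["result"].items():
        for cui, score in umls_ents:
            print(doc["text"][start:end], cui, round(score, 3))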
Example #5
def load_sci_pipe(model="en_core_sci_md"):
    nlp = spacy.load(model)
    abbreviation_pipe = AbbreviationDetector(nlp)

    nlp.add_pipe(abbreviation_pipe)
    nlp.add_pipe(merge_entities)
    return nlp
Example #6
def load_model(name):

    nlp = spacy.load(name)
    # Add abbreviation detector
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    return nlp
Example #7
 def __init__(self):
     nlp = spacy.load('en_core_sci_lg')
     abbreviation_pipe = AbbreviationDetector(nlp)
     nlp.add_pipe(abbreviation_pipe)
     self.linker = EntityLinker(resolve_abbreviations=True, name="umls")
     nlp.add_pipe(self.linker)
     self.nlp = nlp
Example #8
def add_pipes_mutative(nlps, linker):
    """add pipeline components to every nlp pipeline """
    for nlp in nlps:  #mutative
        abbreviation_pipe = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_pipe)
        nlp.add_pipe(merge_entities)
        nlp.add_pipe(linker)
    return nlps
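A call sketch, assuming a few loaded scispaCy pipelines and a shared UmlsEntityLinker instance (sharing one linker avoids loading the UMLS index once per pipeline):

nlps = [spacy.load("en_core_sci_sm"), spacy.load("en_ner_bc5cdr_md")]
linker = UmlsEntityLinker(resolve_abbreviations=True)
nlps = add_pipes_mutative(nlps, linker)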
Example #9
 def umls_entlink(self):
     """
     Add UMLS entity linker and abbreviation detector to spaCy pipeline_ie
     """
     abbreviation_pipe = AbbreviationDetector(self.nlp)
     self.nlp.add_pipe(abbreviation_pipe)
     linker = EntityLinker(resolve_abbreviations=True, name="umls")
     self.nlp.add_pipe(linker)
Example #10
 def __init__(self):
     self.tagger = en_ner_bc5cdr_md.load()
     self.abbreviation_pipe = AbbreviationDetector(self.tagger)
     self.tagger.add_pipe(self.abbreviation_pipe)
     self.linker = UmlsEntityLinker(resolve_abbreviations=True,
                                    max_entities_per_mention=1)
     self.tagger.add_pipe(self.linker)
     print('NER Module Ready')
Example #11
    def __init__(self, args=None, detect_entities=False):
        if args is None:
            self.args = load_pickle("args.pkl")
        else:
            self.args = args
        self.cuda = torch.cuda.is_available()
        self.detect_entities = detect_entities

        if self.detect_entities:
            self.nlp = spacy.load("en_core_sci_md")
            abbreviation_pipe = AbbreviationDetector(self.nlp)
            self.nlp.add_pipe(abbreviation_pipe)
            self.ner = spacy.load("en_ner_bc5cdr_md")
            self.nlp_norm = spacy.load("en_core_web_sm")
        else:
            self.nlp = None
        self.entities_of_interest = ["DISEASE", "CHEMICAL"]
        logger.info("Loading tokenizer and model...")
        from .train_funcs import load_state

        if args.model_no == 0:
            from ..model.BERT.modeling_bert import BertModel as Model
            model = 'bert-base-uncased'
            lower_case = True
            model_name = 'BERT'
        elif args.model_no == 1:
            from ..model.ALBERT.modeling_albert import AlbertModel as Model
            model = 'albert-base-v2'
            lower_case = False
            model_name = 'ALBERT'
        elif args.model_no == 2:
            from ..model.BIOBERT.modeling_biobert import BiobertModel as Model
            model = 'biobert'
            lower_case = False
            model_name = 'BIOBERT'

        self.net = Model.from_pretrained(model, force_download=False, \
                                         task='classification', n_classes_=args.num_classes)
        self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
        self.net.resize_token_embeddings(len(self.tokenizer))
        if self.cuda:
            self.net.cuda()
        start_epoch, best_pred, amp_checkpoint = load_state(self.net,
                                                            None,
                                                            None,
                                                            self.args,
                                                            load_best=False)
        logger.info("Done!")

        # self.d_id_s = self.tokenizer.convert_tokens_to_ids('[D]')
        # self.d_id_e = self.tokenizer.convert_tokens_to_ids('[/D]')
        # self.c_id_s = self.tokenizer.convert_tokens_to_ids('[C]')
        # self.d_id_s = self.tokenizer.convert_tokens_to_ids('[/C]')
        self.D_id = self.tokenizer.convert_tokens_to_ids('DISEASE')
        self.C_id = self.tokenizer.convert_tokens_to_ids('CHEMICAL')
        self.pad_id = self.tokenizer.pad_token_id
        self.rm = load_pickle("relations.pkl")
Example #12
    def __init__(self, mkquery=mkquery_ngrams, es=None):
        self.case = True
        self.all_fields = True
        self.es = es or Elasticsearch()
        self.log = logging.getLogger(__name__)
        self.mkquery = mkquery

        self.nlp = spacy.load("en_core_web_sm")
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)
        self.nlp.disable_pipes("tagger", "ner", "parser")
Example #13
    def __init__(self, args):
        import scispacy, spacy
        from scispacy.abbreviation import AbbreviationDetector
        from scispacy.umls_linking import UmlsEntityLinker

        self.nlp = spacy.load("en_core_sci_sm")
        self.nlp.add_pipe(AbbreviationDetector(
            self.nlp))  # Add abbreviation detection module
        linker = UmlsEntityLinker(
            resolve_abbreviations=True)  # Add Entity linking module
        self.nlp.add_pipe(linker)
Example #14
    def __post_init__(self):
        self.nlp = spacy.load(self.language_model)
        self.nlp.add_pipe(LanguageDetector(),
                          name='language_detector',
                          last=True)

        # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)

        # Our linker will look up named entities/concepts in the UMLS graph and normalize the data
        # for us.
        self.linker = UmlsEntityLinker(resolve_abbreviations=True)
        self.nlp.add_pipe(self.linker)
Example #15
    def test_linker_resolves_abbreviations(self):

        detector = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(detector)
        text = "1-Methyl-4-phenylpyridinium (MPP+) is an abbreviation which doesn't exist in the baby index."
        doc = self.nlp(text)
        # Set abbreviated text (MPP+) to be the only entity, which is also not in the toy umls index.
        doc.ents = (doc[2:3], )
        doc = self.linker(doc)

        id_with_score = doc.ents[0]._.kb_ents[0]
        assert id_with_score == ("C0000098", 1.0)
        umls_entity = self.linker.kb.cui_to_entity[id_with_score[0]]
        assert umls_entity.concept_id == "C0000098"
Example #16
def show_medical_abbreviation(model, document):
    """
    This function detects and resolves medical abbreviations in word entities

    Parameters:
         model(module): A pretrained biomedical model from ScispaCy(https://allenai.github.io/scispacy/)
         document(str): Document to be processed

    Returns: List of unique abbreviations and their resolution
     """
    nlp = model.load()
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    doc = nlp(document)
    abbreviated = list(
        set([f"{abrv}  {abrv._.long_form}" for abrv in doc._.abbreviations
             ]))  # list is set to ensure only unique values are returned
    return abbreviated
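A minimal call sketch for show_medical_abbreviation, assuming the en_core_sci_sm model package is installed (any module exposing load() works):

import en_core_sci_sm

text = ("Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron "
        "disease caused by expansion of a polyglutamine tract.")
print(show_medical_abbreviation(en_core_sci_sm, text))
# e.g. ['SBMA  Spinal and bulbar muscular atrophy']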
Example #17
def get_abbr_ratio(text, known_abbreviatures=None):
    """Returns the percentage of dissambiguated abbreviations of the text.
    INPUT: Textual data, [Abbreviatures already identified]
    OUTPUT: Percentage of identified abbreviatures"""
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_sci_md')
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    doc = nlp(text)
    # There has to be a more elegant way to do this
    tokens = list(set([t.text for t in doc if t.text not in string.punctuation]))
    abbrs = [d.text for d in doc._.abbreviations]
    if len(abbrs) == 0:
        return 1
    if known_abbreviatures:
        for ka in known_abbreviatures:
            if ka in abbrs: abbrs.remove(ka)
    return float(len(list(set(abbrs) & set(tokens)))) / float(len(abbrs))
Example #18
    def __init__(self, biospacy, rules, dysplasia_mappings, dict_path,
                 aff_path):
        """
		Load models and rules

		Params:
			biospacy (str): full spaCy pipeline for biomedical data
			rules (str): hand-crafted rules file path
			dysplasia_mappings (str): dysplasia mappings file path

		Returns: None
		"""

        self.nlp = spacy.load(biospacy)

        abbreviation_pipe = AbbreviationDetector(
            self.nlp)  # add abbreviation detector to spaCy pipeline
        negex = Negex(self.nlp)  # add negation detector to spaCy pipeline
        self.hun = hunspell.HunSpell(dict_path, aff_path)  # add spell checker
        self.linker = UmlsEntityLinker(
            k=10, max_entities_per_mention=2, resolve_abbreviations=True
        )  # tunable params - add umls entity linker to spaCy pipeline
        self.nlp.add_pipe(abbreviation_pipe, name="abbrv_detector")
        self.nlp.add_pipe(self.linker, after="abbrv_detector")
        self.nlp.add_pipe(negex, last=True)
        self.nlp.add_pipe(
            self.expand_entity_mentions, name='expand_entities', after='ner'
        )  # add expand_entity_mentions to spaCy processing pipeline

        # load hand-crafted rules
        self.rules = utils.read_rules(rules)
        # set parameter to store the hand-crafted rules restricted to a specific use-case (updated w/ self.set_rules() func)
        self.use_case_rules = dict()
        # set parameter to store candidate mentions from restricted rules
        self.use_case_candidates = list()

        # load dysplasia mappings
        self.dysplasia = utils.read_dysplasia_mappings(dysplasia_mappings)
        # set parameter to store dysplasia mappings restricted to a specific use-case
        self.use_case_dysplasia = dict()
Example #19
def get_abbreviation_df(nlp,
                        data,
                        fields,
                        skip_zero=False,
                        skip_duplicate=True):
    ### finds abrv, its meaning and cosine similarity between abrv and meaning in data (DataFrame) fields (list)
    ### nlp - embedding dictionary (e.g. en_core_sci_lg)
    ### skip_zero - skip abbreviations without embeddings
    ### skip_duplicate - skip duplicate abbreviations
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    n_dim = nlp("").vector.shape[0]
    abrv_set = set()

    rez = pd.DataFrame(columns=["abrv", "meaning", "similarity"])

    for field in fields:
        for s in data[field]:
            if not pd.isna(s):
                doc = nlp(s)

                for abrv in doc._.abbreviations:
                    abrv_str = str(abrv)
                    if not (skip_zero and np.allclose(
                            nlp(abrv_str).vector, np.zeros(n_dim))):
                        if not (skip_duplicate and abrv_str in abrv_set):
                            abrv_set.add(abrv_str)

                            meaning = str(abrv._.long_form)
                            sim = semantic_similarity(nlp, abrv_str, meaning)
                            rez = rez.append(
                                {
                                    "abrv": abrv_str,
                                    "meaning": meaning,
                                    "similarity": sim
                                },
                                ignore_index=True)

    return rez
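A call sketch for get_abbreviation_df; it assumes pandas/numpy are imported as above and that a semantic_similarity(nlp, a, b) helper is defined elsewhere in the original code.

import pandas as pd
import spacy

nlp = spacy.load("en_core_sci_lg")
data = pd.DataFrame({"abstract": ["Magnetic resonance imaging (MRI) was unremarkable."]})
print(get_abbreviation_df(nlp, data, fields=["abstract"]))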
Example #20
    def add_pipe(self, pipe):
        """Add Spacy pipes

        Args:
            pipe (str): pipe name
        """
        print('Loading Spacy pipe: {}'.format(pipe))
        pipe = pipe.lower()
        if pipe == 'abbreviation':  # Abbreviation extraction
            abbreviation_pipe = AbbreviationDetector(self.nlp)
            self.nlp.add_pipe(abbreviation_pipe)
        elif pipe == 'entitylinker':  # Entity linker
            linker = UmlsEntityLinker(resolve_abbreviations=True)
            self.nlp.add_pipe(linker)
        elif pipe == 'segmenter':  # Rule Segmenter
            self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
        elif pipe == 'tokenizer':  # Tokenizer
            self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)
        elif pipe == 'textrank':  # Textrank
            tr = pytextrank.TextRank()
            self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
        print('Pipe loaded.')
Example #21
def initialize_nlp(virus_lex_path: str,
                   scispacy_model_name: str = "en_core_sci_lg"):
    """
    Initialize scispacy nlp object and virus terms to the vocabulary.

    :param virus_lex_path: path to virus lexicon
    :param scispacy_model_name: name of scispacy model to use for w2v vectors
    :return: Scispacy nlp object
    """
    # Load the scispacy large model
    # nlp = en_core_sci_lg.load(disable='parser')
    # I believe this should work; it may not be recommended for memory reasons in a virtual env like Travis, though.
    nlp = spacy.load(scispacy_model_name, disable=["parser"])
    # Enable umls entity detection and abbreviation detection
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

    # Create a new vector to assign to the virus terms
    new_vector = nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus """
        """sarbecovirus of the genus Betacoronavirus. """
        """Also known as severe acute respiratory syndrome coronavirus 2, """
        """also known by 2019 novel coronavirus. It is """
        """contagious in humans and is the cause of the ongoing pandemic of """
        """coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious """
        """disease.""").vector

    # Add virus terms to the model vocabulary and assign to them the new vector created above
    # vocab = Vocab()
    virus_words = pd.read_csv(virus_lex_path, header=None)
    for virus_word in virus_words[0]:
        nlp.vocab.set_vector(virus_word, new_vector)

    return nlp
Example #22
    def parse(self, tex_path: str, tex: str) -> Iterator[Abbreviation]:
        check_for_reserved_characters(tex)
        plaintext, plaintext_to_tex_offset_map = plaintext_and_offset(tex_path, tex)

        # This is the most basic model and had no real performance difference on our inputs,
        # other options include NER models and models with pretrained word vectors.
        nlp = spacy.load("en_core_sci_sm")
        abbreviation_pipe = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_pipe)

        # These dictionaries hold abbreviated forms, their expansions, and the location of the expansions.
        # All of them use the abbreviated form as keys.
        abb_short_forms = {}
        abb_expansions = {}
        expanded_locations = {}
        doc = nlp(plaintext)

        # This extracts the abbreviations from the scispacy model.
        for abrv in doc._.abbreviations:
            count = 0
            for s in NON_ACRONYM_CHARACTERS:
                count += str(abrv).count(s)
            # count makes sure that we don't accidentally include symbols or variables.
            if count == 0:
                abb_short_forms[str(abrv)] = [[plaintext_to_tex_offset_map[m.start()], plaintext_to_tex_offset_map[m.start() + len(str(abrv))]] for m in re.finditer(str(abrv), plaintext)]
                abb_expansions[str(abrv)] = str(abrv._.long_form)
                x = plaintext.find(str(abrv._.long_form))
                expanded_locations[str(abrv)] = [plaintext_to_tex_offset_map[x], plaintext_to_tex_offset_map[x + len(str(abrv._.long_form))]]

        # If you want to use another abbreviation detection method in addition to scispacy
        # you may implement it here and add its results to the three dictionaries.

        count = 0
        full_count = 1
        # Yields abbreviated forms and their expansions.
        for abb in abb_short_forms:
            exp_start, exp_end = expanded_locations[abb]
            expanded = abb_expansions[abb]
            tex_sub = tex[exp_start:exp_end]
            context_tex = tex[exp_start - DEFAULT_CONTEXT_SIZE : exp_end + DEFAULT_CONTEXT_SIZE]

            # Yields the expanded form as an Abbreviation type
            yield Abbreviation(
                text=abb,
                start=exp_start,
                end=exp_end,
                expansion=expanded,
                id_=count,
                tex_path=tex_path,
                tex=tex_sub,
                context_tex=context_tex,
                str_id="f" + str(full_count) + "-0"
            )
            count += 1
            short_count = 0

            # Yields the abbreviated forms as Abbreviation types.
            for location in abb_short_forms[abb]:
                short_count += 1
                start, end = location
                tex_sub = tex[start:end]
                context_tex = tex[start - DEFAULT_CONTEXT_SIZE : end + DEFAULT_CONTEXT_SIZE]
                yield Abbreviation(
                    text=abb,
                    start=start,
                    end=end,
                    expansion=expanded,
                    id_=count,
                    tex_path=tex_path,
                    tex=tex_sub,
                    context_tex=context_tex,
                    str_id="s" + str(full_count) + "-" + str(short_count)
                )
                count += 1

            full_count += 1
Example #23
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz

import scispacy
from scispacy.abbreviation import AbbreviationDetector
import spacy
import en_core_sci_lg
from spacy.matcher import Matcher
from pprint import pprint
from my_sentence_splitting import get_sents
import json
from tqdm import tqdm
import re

nlp = spacy.load("en_core_sci_lg")

abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

matcher = Matcher(nlp.vocab)

age_words     = [
    "boy", "girl", "man", "woman", 'men', 'women', 'girls', 'boys', 'baby', 'babies', 'infant', 'mother', 'father',
    'male', 'female', 'males', 'females', 'adult', 'adults', 'children', 'child', 'newborn', 'neonates', 'fathers',
    'toddlers', 'neonate', 'toddler', 'adolescent', 'adolescents', 'elderly', 'young', 'newborns', 'mothers',
    'persons', 'person'
]

age_pattern_1 = [  # a simple, over-matching pattern: one or more age-word nouns, optionally followed by 'patient'/'patients'
    {"OP": "+", "LOWER": {"IN": age_words}, "POS": {"IN": ["NOUN"]}},
    {"OP": "?", "LOWER": {"IN": ['patient', 'patients']}}
]
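A sketch of registering and applying age_pattern_1 with the spaCy 2.x Matcher API used in this snippet (matcher.add takes an optional callback as its second argument in that version):

matcher.add("age_pattern_1", None, age_pattern_1)

doc = nlp("A 34-year-old woman patient presented with fever.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)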
Example #24
    def detect(self, text, detect_relations=False, resolve_abbreviations=False, link_with_umls=False, verbose=False):
        if verbose:
            print('-- Will detect named entities using scispaCy.')
            if detect_relations:
                print('-- Will detect relations.')
            if resolve_abbreviations:
                print('-- Will detect abbreviations.')
            if link_with_umls:
                print('-- Will search for UMLS matches.')

        nlp = spacy.load(self.__model)
        if link_with_umls:
            umls_linker = UmlsEntityLinker(k=10, max_entities_per_mention=1)
            nlp.add_pipe(umls_linker)
        if resolve_abbreviations:
            abbrev_detector = AbbreviationDetector(nlp)
            nlp.add_pipe(abbrev_detector)

        doc = nlp(text)

        # Named Entities Detected:
        ner = set([X.text for X in doc.ents])
        if verbose:
            print('Named Entities detected: {}'.format(ner))

        relations = set()
        if detect_relations:
            matcher = Matcher(nlp.vocab)
            pattern = [{'DEP':'ROOT'},
                       {'DEP':'prep','OP':"?"},
                       {'DEP':'agent','OP':"?"},
                       {'POS':'ADJ','OP':"?"}]
            matcher.add("matching_1", None, pattern)

            for sentence in sent_tokenize(text):
                matches = matcher(doc)
                k = len(matches) - 1
                span = doc[matches[k][1]:matches[k][2]]

                relations.add(span.text)

                tokens = word_tokenize(sentence)
                pos_tags = pos_tag(tokens)

                chunkGram = r"""Chunk: {<RB.?>?<VB.?><RB.?>?}"""
                chunkParser = RegexpParser(chunkGram)
                chunked = chunkParser.parse(pos_tags)

                for sub_tree in chunked.subtrees():
                    if sub_tree.label() == 'Chunk':
                        candidate = ''
                        for leaf in sub_tree.leaves():
                            if candidate == '':
                                candidate = leaf[0]
                            else:
                                candidate = '{} {}'.format(candidate, leaf[0])

                        relations.add(candidate)

            if verbose:
                print('Relations detected: {}'.format(relations))

        abbrev_refs = {}
        if resolve_abbreviations:
            for abbrv in doc._.abbreviations:
                reference = abbrv._.long_form

                if verbose:
                    print('- {} : {}'.format(abbrv, reference))

                abbrev_refs[abbrv] = reference

            if verbose:
                print('Abbreviations detected: {}'.format(abbrev_refs))

            #TODO implement resolution (i.e. replace detected abbreviations)

        linked = {}
        if link_with_umls:
            if verbose:
                print('Searching for UMLS matches...')

            entities = str(ner.union(relations)) # Evaluate on both entities and relations
            entities = nlp(entities).ents

            for entity in entities:
                for umls_ent in entity._.umls_ents:
                    Concept_Id, Score = umls_ent
                    # Look up the canonical UMLS record outside the verbose block so it
                    # is also available when building the 'linked' entry below.
                    umls_entity = umls_linker.umls.cui_to_entity[Concept_Id]

                    if verbose:
                        print("Name:", entity)
                        print('Concept_Id = {} Score = {}'.format(Concept_Id, Score))
                        print(umls_entity)

                    if entity.text not in linked: # greater scores are shown first, so no need to add smaller scores.
                        linked[entity.text] = 'sameas\tumls:{}\t{}\t'.format(Concept_Id, umls_entity.canonical_name)
                        break

            if verbose:
                print('UMLS matches: {}'.format(linked))

        return ner, relations, linked
Example #25
 def setUp(self):
     super().setUp()
     self.nlp = spacy.load("en_core_web_sm")
     self.detector = AbbreviationDetector(self.nlp)
     self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \
Example #26
class TestAbbreviationDetector(unittest.TestCase):

    def setUp(self):
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")
        self.detector = AbbreviationDetector(self.nlp)
        self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \
                inherited motor neuron disease caused by the expansion \
                of a polyglutamine tract within the androgen receptor (AR). \
                SBMA can be caused by this easily."

    def test_find_abbreviation(self):
        # Basic case
        doc = self.nlp("abbreviation (abbrn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # Hyphenation and numbers within abbreviation
        doc = self.nlp("abbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # No match
        doc = self.nlp("abbreviation (aeb-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form is None

        # First letter must match start of word.
        doc = self.nlp("aaaabbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

        # Matching is greedy for first letter (are is not included).
        doc = self.nlp("more words are considered aaaabbreviation (ab-b9rn)")
        long = doc[0:5]
        short = doc[6:7]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

    def test_filter_matches(self):
        doc = self.nlp(self.text)
        matches = self.detector.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] -1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)

        assert len(filtered) == 2
        long, short  = filtered[0]
        assert long.string == "Spinal and bulbar muscular atrophy "
        assert short.string == "SBMA"
        long, short = filtered[1]
        assert long.string == "within the androgen receptor "
        assert short.string == "AR"

    def test_abbreviation_detection(self):
        # Attribute should be registered.
        doc = self.nlp(self.text)
        assert doc._.abbreviations == []
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 3

        correct  = set()
        span = doc[33:34]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[6:7]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[29:30]
        span._.long_form = doc[26:28]
        correct.add(span)
        correct_long = {x._.long_form for x in correct}

        assert set(doc2._.abbreviations) == correct
        assert {x._.long_form for x in doc2._.abbreviations} == correct_long

    def test_find(self):
        doc = self.nlp(self.text)
        long, shorts = self.detector.find(doc[6:7], doc)
        assert long.string == "Spinal and bulbar muscular atrophy "
        assert len(shorts) == 2
        assert {x.string for x in shorts} == {"SBMA", "SBMA "}

        long, shorts = self.detector.find(doc[7:13], doc)
        assert shorts == set()

    def test_issue_158(self):
        text = "The PVO observations showed that the total transterminator flux "\
               "was 23% of that at solar maximum and that the largest reductions in the "\
               "number of ions transported antisunward occurred at the highest altitudes "\
               "(Spenner et al., 1995)."
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0
Example #27
class TestAbbreviationDetector(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")
        self.detector = AbbreviationDetector(self.nlp)
        self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \
                inherited motor neuron disease caused by the expansion \
                of a polyglutamine tract within the androgen receptor (AR). \
                SBMA can be caused by this easily."

    def test_find_abbreviation(self):
        # Basic case
        doc = self.nlp("abbreviation (abbrn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # Hyphenation and numbers within abbreviation
        doc = self.nlp("abbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # No match
        doc = self.nlp("abbreviation (aeb-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form is None

        # First letter must match start of word.
        doc = self.nlp("aaaabbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

        # Matching is greedy for first letter (are is not included).
        doc = self.nlp("more words are considered aaaabbreviation (ab-b9rn)")
        long = doc[0:5]
        short = doc[6:7]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

    def test_filter_matches(self):
        doc = self.nlp(self.text)
        matches = self.detector.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)

        assert len(filtered) == 2
        long, short = filtered[0]
        assert long.text_with_ws == "Spinal and bulbar muscular atrophy "
        assert short.text == "SBMA"
        long, short = filtered[1]
        assert long.text_with_ws == "within the androgen receptor "
        assert short.text == "AR"

    def test_abbreviation_detection(self):
        # Attribute should be registered.
        doc = self.nlp(self.text)
        assert doc._.abbreviations == []
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 3

        correct = set()
        span = doc[33:34]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[6:7]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[29:30]
        span._.long_form = doc[26:28]
        correct.add(span)
        correct_long = {x._.long_form for x in correct}

        assert set(doc2._.abbreviations) == correct
        assert {x._.long_form for x in doc2._.abbreviations} == correct_long

    def test_find(self):
        doc = self.nlp(self.text)
        long, shorts = self.detector.find(doc[6:7], doc)
        assert long.text_with_ws == "Spinal and bulbar muscular atrophy "
        assert len(shorts) == 2
        assert {x.text_with_ws for x in shorts} == {"SBMA", "SBMA "}

        long, shorts = self.detector.find(doc[7:13], doc)
        assert shorts == set()

    def test_issue_158(self):
        text = (
            "The PVO observations showed that the total transterminator flux "
            "was 23% of that at solar maximum and that the largest reductions in the "
            "number of ions transported antisunward occurred at the highest altitudes "
            "(Spenner et al., 1995)."
        )
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

    def test_issue_192(self):
        # test for <short> (<long>) pattern
        text = "blah SBMA (Spinal and bulbar muscular atrophy)"
        doc = self.nlp(text)
        doc2 = self.detector(doc)

        assert len(doc2._.abbreviations) == 1
        assert doc2._.abbreviations[0] == doc[1:2]
        assert doc2._.abbreviations[0]._.long_form == doc[3:8]

    def test_issue_161(self):
        # test some troublesome cases in the abbreviation detector
        text = "H2)]+(14)s.t. (1), (4).Similarly"
        print(f"Text: {text}")
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = ".(21)In (21), λ"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "map expX (·) : R"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "0,(3)with the following data: (3-i) (q̄"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "Φg(h),ThΦg(v) ) , (h, v)"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "dimension;(S-iii) The optimal control problem obtained in (S-ii) is con-verted"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "z), πut (z)) )"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "repositories he/she already worked with or from previous collaborators. Nevertheless, 88% of the first action of users to a repository (repository discovery) is"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

    def test_empty_span(self):
        text = "(19, 9, 4) Hadamard Designs and Their Residual Designs"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

    def test_space_issue(self):
        text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture."
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 1
        assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT"

    def test_multiple_spaces(self):
        text = "by      designing A     Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture."
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 1
        assert doc2._.abbreviations[0]._.long_form.text == "A     Lite BERT"

    @pytest.mark.xfail
    def test_difficult_cases(self):
        # Don't see an obvious way of solving these. They require something more semantic to distinguish
        text = "is equivalent to (iv) of Theorem"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "or to fork.Users work more on their repositories (owners) than on"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0
Example #28
    def __init__(self, prediction_type: str) -> None:

        # Initialize modules for featurization.
        # To use a smaller model, swap out the parameter with "en_core_sci_sm"
        # The prediction_type are the '+' separated keys of the joint model heads.
        # They are the names of the datasets on which the joint model was trained.
        # Example : prediction_type = "DocDef2+AI2020+W00"
        logging.debug("Loading Spacy models (this may take some time).")
        self.nlp = spacy.load("en_core_sci_md")
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)

        # Create a detector for verb phrases.
        verb_pattern = [
            {
                "POS": "VERB",
                "OP": "?"
            },
            {
                "POS": "ADV",
                "OP": "*"
            },
            {
                "POS": "AUX",
                "OP": "*"
            },
            {
                "POS": "VERB",
                "OP": "+"
            },
        ]
        self.verb_matcher = Matcher(self.nlp.vocab)
        self.verb_matcher.add("Verb phrase", None, verb_pattern)

        # Initialize modules for transformer-based inference model based on the prediction_type
        self.model_paths = {
            "W00": {
                "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
                "file": "termdef.zip",
                "type": "term-def",
            },
            "AI2020": {
                "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
                "file": "abbrexp.zip",
                "type": "abbr-exp",
            },
            "DocDef2": {
                "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
                "file": "symnick.zip",
                "type": "sym-nick",
            },
            "DocDef2+AI2020+W00": {
                "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
                "file": "joint_symnick_abbrexp_termdef.zip",
                "type": "joint",
            },
        }
        self.prediction_type = prediction_type

        cache_directory = f"./cache/{self.prediction_type}_model"
        # Make a directory for storing model files (./cache/<prediction_type>_model)
        if not os.path.exists(cache_directory):
            os.makedirs(cache_directory)
            logging.debug("Created cache directory for models at %s",
                          cache_directory)

            # Download the best model files into the cache directory
            MODEL_URL = (self.model_paths[self.prediction_type]["baseURL"] +
                         self.model_paths[self.prediction_type]["file"])
            logging.debug(
                "Downloading model from %s. Warning: this will take a long time.",
                MODEL_URL,
            )
            cache_file = self.model_paths[self.prediction_type]["file"]
            urllib.request.urlretrieve(
                MODEL_URL,
                os.path.join("{}/{}".format(cache_directory, cache_file)),
            )

            with zipfile.ZipFile("{}/{}".format(cache_directory, cache_file),
                                 "r") as zip_ref:
                zip_ref.extractall(cache_directory)
            logging.debug("Downloaded and unpacked model data in directory %s",
                          cache_file)

        else:
            logging.debug(  # pylint: disable=logging-not-lazy
                "Cache directory for models already exists at %s. " +
                "Skipping creation of directory and download of data.",
                cache_directory,
            )

        parser = HfArgumentParser(
            (ModelArguments, DataTrainingArguments, TrainingArguments))
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
            [
                "--model_name_or_path",
                "roberta-large",
                "--task",
                f"{self.prediction_type}",
                "--data_dir",
                cache_directory,
                "--output_dir",
                os.path.join(cache_directory, "roberta-large"),
                "--do_eval",
                "--overwrite_cache",
                "--use_crf",
                "--use_heuristic",
                "--use_pos",
                "--use_np",
                "--use_vp",
                "--use_entity",
                "--use_acronym",
                "--per_device_eval_batch_size",
                "16",
                "--max_seq_len",
                "80",
            ])

        # Set seed for model.
        set_torch_seed(training_args.seed, training_args.no_cuda)

        # Log basic debugging information about model and arguments.
        logging.info(  # pylint: disable=logging-not-lazy
            "Arguments for NLP model. Process rank: %s, device: %s, " +
            "n_gpu: %s, distributed training: %s, 16-bits training: %s. Training / evaluation "
            + "parameters: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            bool(training_args.local_rank != -1),
            training_args.fp16,
            training_args,
        )

        # Set model type from arguments.
        model_args.model_type = model_args.model_name_or_path.split(
            "-")[0].split("_")[0]

        # Load model configuration.
        if model_args.config_name:
            config = AutoConfig.from_pretrained(model_args.config_name,
                                                cache_dir=model_args.cache_dir)
        elif model_args.model_name_or_path:
            config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                                cache_dir=model_args.cache_dir)
        else:
            config = CONFIG_MAPPING[model_args.model_type]()
            logging.warning(
                "You are instantiating a new config instance from scratch.")

        # Load tokenizer.
        if model_args.tokenizer_name:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir)
        elif model_args.model_name_or_path:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.model_name_or_path, cache_dir=model_args.cache_dir)
        else:
            raise ValueError(
                "You are instantiating a new tokenizer from scratch. " +
                "This is not supported, but you can do it from another script, "
                + "save it, and load it from here, using --tokenizer_name.")

        # Rename output directory to reflect model parameters.
        training_args.output_dir = "{}{}{}{}{}{}".format(
            training_args.output_dir,
            "_pos={}".format(training_args.use_pos)
            if training_args.use_pos else "",
            "_np={}".format(training_args.use_np)
            if training_args.use_np else "",
            "_vp={}".format(training_args.use_vp)
            if training_args.use_vp else "",
            "_entity={}".format(training_args.use_entity)
            if training_args.use_entity else "",
            "_acronym={}".format(training_args.use_acronym)
            if training_args.use_acronym else "",
        )
        logging.info(
            "The output directory for the model has been set to %s",
            training_args.output_dir,
        )

        data_args.ignore_index = training_args.ignore_index
        data_args.output_dir = training_args.output_dir

        # Load the model.
        model_class = MODEL_CLASSES[model_args.model_type]
        if (os.path.exists(training_args.output_dir)
                and not training_args.overwrite_output_dir):
            model = model_class.from_pretrained(
                training_args.output_dir,
                args=training_args,
                intent_label_dict=get_joint_labels(data_args, "intent_label"),
                slot_label_dict=get_joint_labels(data_args, "slot_label"),
                pos_label_lst=get_joint_labels(data_args, "pos_label"),
                # This is because currently there are 3 different models - one for each task
                tasks=self.prediction_type.split('+'),
            )
            logging.info("Model loaded from %s", training_args.output_dir)
        else:
            logging.error(  # pylint: disable=logging-not-lazy
                "Could not load model from %s. A pre-trained model could " +
                "not be found in the directory. This can occur if the download of the model was "
                + "terminated. Try deleting %s and running this script again.",
                training_args.output_dir,
                cache_directory,
            )
            raise ValueError(
                f"Could not load model from {training_args.output_dir}")

            # model.resize_token_embeddings(len(tokenizer))

        self.data_args = data_args
        self.model_args = model_args

        self.tokenizer = tokenizer
        self.model = model
        self.trainer = Trainer(
            [
                training_args,
                self.model_args,
                self.data_args,
            ],
            self.model,
        )
Example #29
    def determine_gene_associations(self,verbed=True,twosents=False):
        """Determine sentences with specified gene association
        using natural language processing

        Args:
            verbed (bool): A verb is required between the regex association
              match and the gene; this eliminates sentences that do not make
              a claim about the gene.
            twosents (bool): Look up possible co-occurrence in a sliding
              window of two sentences instead of sentence by sentence.
              TODO not implemented yet
        """
        import spacy
        from scispacy.abbreviation import AbbreviationDetector
        if twosents: raise NotImplementedError
        try:
            nlp = spacy.load('en') #en_ner_craft_md
            # Detect abbreviations
            abbreviation_pipe = AbbreviationDetector(nlp)
            nlp.add_pipe(abbreviation_pipe)
            nlp.add_pipe(nlp.create_pipe('sentencizer'))
            # Prevent splitting intra-word hyphens
            suffixes = nlp.Defaults.suffixes + (r'''\w+-\w+''',)
            suffix_regex = spacy.util.compile_suffix_regex(suffixes)
            nlp.tokenizer.suffix_search = suffix_regex.search
        except OSError:
            raise Exception(
                '''spacy language module not installed.
                Run: python -m spacy download en
                ''' # pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_ner_craft_md-0.2.0.tar.gz
            )
        self.gene_association = {}
        self.gene_association_sents = {}
        pos_of_interest = ('VERB', 'NOUN', 'ADP', 'PUNCT', 'GENE')
        for association in self.associations:
            abstract = nlp(association['content'])
            sentences = list(abstract.sents)
            for sent in sentences:
                assoc_match = self.assoc.search(sent.text)
                if assoc_match:
                    sent_startposition = sent[0].idx
                    before_assoc_match = True
                    inbetween_feature_vectors = {}
                    for token in sent:
                        # Only look up the gene symbol if the token is not likely to be a general English word
                        gene_symbol = (
                            None if (token.text.isalpha() and (token.is_sent_start or token.text.islower()))
                            else self.get_gene_symbol(token.text)
                        )
                        if gene_symbol:
                        #    import pdb; pdb.set_trace()
                            association_key = (
                                association['pmid'],association['date'],token.text,(assoc_match.start(),assoc_match.end())
                            )
                        else:
                            association_key = None
                        # First check if still before match
                        if (assoc_match.start() < token.idx - sent_startposition) and before_assoc_match:
                            before_assoc_match = False
                            #Store before_assoc_match featurevectors
                            for iv in inbetween_feature_vectors:
                                if not iv in self.gene_association: self.gene_association[iv] = {}
                                prev_association_key = inbetween_feature_vectors[iv].pop('association_key')
                                if prev_association_key not in self.gene_association[iv]:
                                    self.gene_association[iv][prev_association_key] = []
                                self.gene_association[iv][prev_association_key].append(
                                    inbetween_feature_vectors[iv]
                                )
                            inbetween_feature_vector = {p:0 for p in pos_of_interest}
                            inbetween_feature_vector['sent'] = hash(sent)
                        if before_assoc_match:
                            if gene_symbol:
                                # For previous genes update GENE count (TODO retroactive for genes coming after)
                                for iv in inbetween_feature_vectors:
                                    inbetween_feature_vectors[iv]['GENE']+=1
                                # Initialise feature vector for each gene symbol
                                for gs in gene_symbol:
                                    inbetween_feature_vectors[gs] = {p:0 for p in pos_of_interest}
                                    inbetween_feature_vectors[gs]['sent'] = hash(sent)
                                    inbetween_feature_vectors[gs]['association_key'] = association_key
                                    self.gene_association_sents[hash(sent)] = sent
                            elif token.pos_ in pos_of_interest:
                                for iv in inbetween_feature_vectors:
                                    inbetween_feature_vectors[iv][token.pos_]+=1
                        else:
                            if gene_symbol:
                                for gs in gene_symbol:
                                    if not gs in self.gene_association: self.gene_association[gs] = {}
                                    if association_key not in self.gene_association[gs]:
                                        self.gene_association[gs][association_key] = []
                                    self.gene_association[gs][association_key].append(
                                        inbetween_feature_vector.copy()
                                    )
                                self.gene_association_sents[hash(sent)] = sent
                                inbetween_feature_vector['GENE']+=1
                            elif token.pos_ in pos_of_interest:
                                inbetween_feature_vector[token.pos_]+=1
Example #30
def main(medmentions_path: str,
         umls_path: str,
         model_path: str,
         ks: str,
         thresholds,
         use_gold_mentions: bool = False,
         train: bool = False,
         spacy_model: str = "",
         generate_linker_data: bool = False,
         use_soft_matching: bool = False,
         substitute_abbreviations: bool = False):

    umls_concept_list = load_umls_kb(umls_path)
    umls_concept_dict_by_id = {c['concept_id']: c for c in umls_concept_list}

    # We need to keep around a map from text to possible canonical ids that they map to.
    text_to_concept_id: Dict[str, Set[str]] = defaultdict(set)

    for concept in umls_concept_list:
        for alias in set(concept["aliases"]).union({concept["canonical_name"]
                                                    }):
            text_to_concept_id[alias].add(concept["concept_id"])

    if train:
        create_tfidf_ann_index(model_path, text_to_concept_id)
    ann_concept_aliases_list, tfidf_vectorizer, ann_index = load_tfidf_ann_index(
        model_path)
    candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer,
                                             ann_concept_aliases_list,
                                             text_to_concept_id, False)

    linking_classifier = load_linking_classifier(model_path)
    linker = Linker(umls_concept_dict_by_id, linking_classifier)

    print('Reading MedMentions...')
    train_examples, dev_examples, test_examples = data_util.read_full_med_mentions(
        medmentions_path, spacy_format=False)

    k_list = [int(k) for k in ks.split(',')]
    if thresholds is None:
        thresholds = [1.0]
    else:
        thresholds = [float(x) for x in thresholds.split(",")]

    if len(thresholds) > 1 or len(k_list) > 1:
        assert not generate_linker_data, \
            'generating linker training data should be for a single threshold and k'

    nlp = spacy.load(spacy_model)
    if substitute_abbreviations:
        abbreviation_detector = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_detector, last=True)

    if generate_linker_data:
        examples_list = [train_examples, dev_examples, test_examples]
        filenames = [
            f'{model_path}/train.jsonl', f'{model_path}/dev.jsonl',
            f'{model_path}/test.jsonl'
        ]
        for examples, filename in zip(examples_list, filenames):
            supervised_data = eval_candidate_generation_and_linking(
                examples, umls_concept_dict_by_id, candidate_generator, k_list,
                thresholds, use_gold_mentions, nlp, generate_linker_data,
                linker, use_soft_matching, substitute_abbreviations)
            with open(filename, 'w') as f:
                for d in supervised_data:
                    f.write(f'{json.dumps(d)}\n')
    else:
        print('Results on the DEV set')
        eval_candidate_generation_and_linking(
            dev_examples, umls_concept_dict_by_id, candidate_generator, k_list,
            thresholds, use_gold_mentions, nlp, generate_linker_data, linker,
            use_soft_matching, substitute_abbreviations)