def __init__(
    self,
    nlp: Language,
    name: str = "taxonomic_abbreviation_detector",
) -> None:
    Doc.set_extension("abbreviations", default=[], force=True)
    Span.set_extension("long_form", default=None, force=True)
    AbbreviationDetector.__init__(self, nlp, name)
    self.abb_name_pattern = re.compile(r"[A-Z]\.")
def init_nlp():
    spacy_nlp = spacy.load('en_core_sci_lg')
    new_vector = spacy_nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus sarbecovirus
        of the genus Betacoronavirus. Also known as severe acute respiratory syndrome
        coronavirus 2, also known by 2019 novel coronavirus. It is contagious in humans
        and is the cause of the ongoing pandemic of coronavirus disease. Coronavirus
        disease 2019 is a zoonotic infectious disease.""").vector

    vector_data = {
        "COVID-19": new_vector,
        "2019-nCoV": new_vector,
        "SARS-CoV-2": new_vector,
    }
    for word, vector in vector_data.items():
        spacy_nlp.vocab.set_vector(word, vector)

    spacy_nlp.max_length = 2000000

    # We also need to detect language, or else we'll be parsing non-english text
    # as if it were English.
    # spacy_nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
    abbreviation_pipe = AbbreviationDetector(spacy_nlp)
    spacy_nlp.add_pipe(abbreviation_pipe)

    # Our linker will look up named entities/concepts in the UMLS graph and normalize
    # the data for us.
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    spacy_nlp.add_pipe(linker)

    return (spacy_nlp, linker)
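# --- Illustrative usage of init_nlp() (not part of the original snippet) ---
# A minimal sketch, assuming the pre-0.3 scispacy API used throughout these snippets,
# where UmlsEntityLinker attaches (CUI, score) pairs to ent._.umls_ents and exposes
# linker.umls.cui_to_entity; the sample sentence is made up.
nlp, linker = init_nlp()
doc = nlp("Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) causes COVID-19.")

for abrv in doc._.abbreviations:
    print(abrv.text, "->", abrv._.long_form.text)

for ent in doc.ents:
    for cui, score in ent._.umls_ents[:1]:  # highest-scoring candidate first
        print(ent.text, cui, score, linker.umls.cui_to_entity[cui].canonical_name)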
def load_model():
    nlp = spacy.load("en_core_sci_lg")
    # Add abbreviation detector
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    return nlp
def process_data(pid, doc_list):
    nlp = spacy.load("en_core_sci_sm")
    nlp.add_pipe(AbbreviationDetector(nlp))  # Add abbreviation detection module
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)  # Add entity linking module

    data = []
    for i, doc in enumerate(doc_list):
        sci_res = nlp(doc['text'])
        res_list = {}
        for ent in sci_res.ents:
            start, end = ent.start_char, ent.end_char
            res_list[(start, end)] = ent._.umls_ents
        doc['result'] = res_list
        data.append(doc)
        if i % 10 == 0:
            print('Completed [{}] {}, {}'.format(
                pid, i,
                time.strftime("%d_%m_%Y") + '_' + time.strftime("%H:%M:%S")))
    return data
def load_sci_pipe(model="en_core_sci_md"):
    nlp = spacy.load(model)
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    nlp.add_pipe(merge_entities)
    return nlp
def load_model(name):
    nlp = spacy.load(name)
    # Add abbreviation detector
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    return nlp
def __init__(self):
    nlp = spacy.load('en_core_sci_lg')
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    self.linker = EntityLinker(resolve_abbreviations=True, name="umls")
    nlp.add_pipe(self.linker)
    self.nlp = nlp
def add_pipes_mutative(nlps, linker):
    """add pipeline components to every nlp pipeline"""
    for nlp in nlps:  # mutative
        abbreviation_pipe = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_pipe)
        nlp.add_pipe(merge_entities)
        nlp.add_pipe(linker)
    return nlps
def umls_entlink(self):
    """ Add UMLS entity linker and abbreviation detector to spaCy pipeline_ie """
    abbreviation_pipe = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(abbreviation_pipe)
    linker = EntityLinker(resolve_abbreviations=True, name="umls")
    self.nlp.add_pipe(linker)
def __init__(self):
    self.tagger = en_ner_bc5cdr_md.load()
    self.abbreviation_pipe = AbbreviationDetector(self.tagger)
    self.tagger.add_pipe(self.abbreviation_pipe)
    self.linker = UmlsEntityLinker(resolve_abbreviations=True,
                                   max_entities_per_mention=1)
    self.tagger.add_pipe(self.linker)
    print('NER Module Ready')
def __init__(self, args=None, detect_entities=False):
    if args is None:
        self.args = load_pickle("args.pkl")
    else:
        self.args = args
    self.cuda = torch.cuda.is_available()
    self.detect_entities = detect_entities

    if self.detect_entities:
        self.nlp = spacy.load("en_core_sci_md")
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)
        self.ner = spacy.load("en_ner_bc5cdr_md")
        self.nlp_norm = spacy.load("en_core_web_sm")
    else:
        self.nlp = None
    self.entities_of_interest = ["DISEASE", "CHEMICAL"]

    logger.info("Loading tokenizer and model...")
    from .train_funcs import load_state

    if self.args.model_no == 0:
        from ..model.BERT.modeling_bert import BertModel as Model
        model = 'bert-base-uncased'
        lower_case = True
        model_name = 'BERT'
    elif self.args.model_no == 1:
        from ..model.ALBERT.modeling_albert import AlbertModel as Model
        model = 'albert-base-v2'
        lower_case = False
        model_name = 'ALBERT'
    elif self.args.model_no == 2:
        from ..model.BIOBERT.modeling_biobert import BiobertModel as Model
        model = 'biobert'
        lower_case = False
        model_name = 'BIOBERT'

    self.net = Model.from_pretrained(model, force_download=False,
                                     task='classification',
                                     n_classes_=self.args.num_classes)
    self.tokenizer = load_pickle("%s_tokenizer.pkl" % model_name)
    self.net.resize_token_embeddings(len(self.tokenizer))
    if self.cuda:
        self.net.cuda()
    start_epoch, best_pred, amp_checkpoint = load_state(self.net, None, None,
                                                        self.args, load_best=False)
    logger.info("Done!")

    # self.d_id_s = self.tokenizer.convert_tokens_to_ids('[D]')
    # self.d_id_e = self.tokenizer.convert_tokens_to_ids('[/D]')
    # self.c_id_s = self.tokenizer.convert_tokens_to_ids('[C]')
    # self.d_id_s = self.tokenizer.convert_tokens_to_ids('[/C]')
    self.D_id = self.tokenizer.convert_tokens_to_ids('DISEASE')
    self.C_id = self.tokenizer.convert_tokens_to_ids('CHEMICAL')
    self.pad_id = self.tokenizer.pad_token_id
    self.rm = load_pickle("relations.pkl")
def __init__(self, mkquery=mkquery_ngrams, es=None):
    self.case = True
    self.all_fields = True
    self.es = es or Elasticsearch()
    self.log = logging.getLogger(__name__)
    self.mkquery = mkquery

    self.nlp = spacy.load("en_core_web_sm")
    abbreviation_pipe = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(abbreviation_pipe)
    self.nlp.disable_pipes("tagger", "ner", "parser")
def __init__(self, args):
    import scispacy, spacy
    from scispacy.abbreviation import AbbreviationDetector
    from scispacy.umls_linking import UmlsEntityLinker

    self.nlp = spacy.load("en_core_sci_sm")
    self.nlp.add_pipe(AbbreviationDetector(self.nlp))  # Add abbreviation detection module
    linker = UmlsEntityLinker(resolve_abbreviations=True)  # Add entity linking module
    self.nlp.add_pipe(linker)
def __post_init__(self):
    self.nlp = spacy.load(self.language_model)
    self.nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    # Add the abbreviation pipe to the spacy pipeline. Only need to run this once.
    abbreviation_pipe = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(abbreviation_pipe)
    # Our linker will look up named entities/concepts in the UMLS graph and normalize
    # the data for us.
    self.linker = UmlsEntityLinker(resolve_abbreviations=True)
    self.nlp.add_pipe(self.linker)
def test_linker_resolves_abbreviations(self):
    detector = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(detector)
    text = "1-Methyl-4-phenylpyridinium (MPP+) is an abbreviation which doesn't exist in the baby index."
    doc = self.nlp(text)
    # Set the abbreviated text (MPP+) to be the only entity, which is also not in the toy umls index.
    doc.ents = (doc[2:3],)
    doc = self.linker(doc)
    id_with_score = doc.ents[0]._.kb_ents[0]
    assert id_with_score == ("C0000098", 1.0)
    umls_entity = self.linker.kb.cui_to_entity[id_with_score[0]]
    assert umls_entity.concept_id == "C0000098"
def show_medical_abbreviation(model, document):
    """
    This function detects and resolves medical abbreviations in word entities

    Parameters:
        model (module): A pretrained biomedical model from ScispaCy (https://allenai.github.io/scispacy/)
        document (str): Document to be processed

    Returns:
        List of unique abbreviations and their resolution
    """
    nlp = model.load()
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    doc = nlp(document)
    abbreviated = list(
        set([f"{abrv} {abrv._.long_form}" for abrv in doc._.abbreviations])
    )  # list is set to ensure only unique values are returned
    return abbreviated
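# --- Illustrative call to show_medical_abbreviation (not part of the original snippet) ---
# A minimal sketch, assuming the en_core_sci_sm scispacy model package is installed and
# importable as a module; the document text and the printed result are examples only.
import en_core_sci_sm

sample_doc = ("Spinal and bulbar muscular atrophy (SBMA) is an inherited "
              "motor neuron disease.")
print(show_medical_abbreviation(en_core_sci_sm, sample_doc))
# e.g. ['SBMA Spinal and bulbar muscular atrophy']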
def get_abbr_ratio(text, known_abbreviatures=None):
    """Returns the percentage of disambiguated abbreviations in the text.
       INPUT: Textual data, [abbreviations already identified]
       OUTPUT: Percentage of identified abbreviations"""
    spacy.prefer_gpu()
    nlp = spacy.load('en_core_sci_md')
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

    doc = nlp(text)
    # There has to be a more elegant way to do this
    tokens = list(set([t.text for t in doc if t.text not in string.punctuation]))
    abbrs = [d.text for d in doc._.abbreviations]
    if len(abbrs) == 0:
        return 1
    if known_abbreviatures:
        for ka in known_abbreviatures:
            if ka in abbrs:
                abbrs.remove(ka)
    return float(len(list(set(abbrs) & set(tokens)))) / float(len(abbrs))
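# --- Illustrative call to get_abbr_ratio (not part of the original snippet) ---
# Assumes spacy, string and AbbreviationDetector are imported as in the function above;
# the sample text is made up and the exact ratio depends on the loaded model.
sample_text = ("Heart rate variability (HRV) and blood pressure (BP) were recorded; "
               "HRV and BP were then analysed.")
print(get_abbr_ratio(sample_text, known_abbreviatures=["BP"]))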
def __init__(self, biospacy, rules, dysplasia_mappings, dict_path, aff_path):
    """
    Load models and rules

    Params:
        biospacy (str): full spaCy pipeline for biomedical data
        rules (str): hand-crafted rules file path
        dysplasia_mappings (str): dysplasia mappings file path
        dict_path (str): hunspell dictionary file path
        aff_path (str): hunspell affix file path

    Returns: None
    """
    self.nlp = spacy.load(biospacy)

    abbreviation_pipe = AbbreviationDetector(self.nlp)  # add abbreviation detector to spaCy pipeline
    negex = Negex(self.nlp)  # add negation detector to spaCy pipeline
    self.hun = hunspell.HunSpell(dict_path, aff_path)  # add spell checker
    self.linker = UmlsEntityLinker(
        k=10, max_entities_per_mention=2, resolve_abbreviations=True
    )  # tunable params - add umls entity linker to spaCy pipeline

    self.nlp.add_pipe(abbreviation_pipe, name="abbrv_detector")
    self.nlp.add_pipe(self.linker, after="abbrv_detector")
    self.nlp.add_pipe(negex, last=True)
    self.nlp.add_pipe(
        self.expand_entity_mentions, name='expand_entities', after='ner'
    )  # add expand_entity_mentions to spaCy processing pipeline

    # load hand-crafted rules
    self.rules = utils.read_rules(rules)
    # parameter to store the hand-crafted rules restricted to a specific use-case (updated w/ self.set_rules() func)
    self.use_case_rules = dict()
    # parameter to store candidate mentions from restricted rules
    self.use_case_candidates = list()

    # load dysplasia mappings
    self.dysplasia = utils.read_dysplasia_mappings(dysplasia_mappings)
    # parameter to store dysplasia mappings restricted to a specific use-case
    self.use_case_dysplasia = dict()
def get_abbreviation_df(nlp, data, fields, skip_zero=False, skip_duplicate=True):
    ### finds abrv, its meaning and cosine similarity between abrv and meaning
    ### in data (DataFrame) fields (list)
    ### nlp - embedding dictionary (e.g. en_core_sci_lg)
    ### skip_zero - skip abbreviations without embeddings
    ### skip_duplicate - skip duplicate abbreviations
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    n_dim = nlp("").vector.shape[0]

    abrv_set = set()
    rez = pd.DataFrame(columns=["abrv", "meaning", "similarity"])
    for field in fields:
        for s in data[field]:
            if not pd.isna(s):
                doc = nlp(s)
                for abrv in doc._.abbreviations:
                    abrv_str = str(abrv)
                    if not (skip_zero and np.allclose(nlp(abrv_str).vector, np.zeros(n_dim))):
                        if not (skip_duplicate and abrv_str in abrv_set):
                            abrv_set.add(abrv_str)
                            meaning = str(abrv._.long_form)
                            sim = semantic_similarity(nlp, abrv_str, meaning)
                            rez = rez.append(
                                {"abrv": abrv_str, "meaning": meaning, "similarity": sim},
                                ignore_index=True)
    return rez
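# --- Hypothetical driver for get_abbreviation_df (not part of the original snippet) ---
# A sketch only: it assumes the pre-0.3 scispacy API used above, an older pandas
# (the function relies on DataFrame.append), and that semantic_similarity(nlp, a, b)
# is defined elsewhere in the project; column names and text are invented.
import pandas as pd
import spacy

nlp = spacy.load("en_core_sci_lg")
df = pd.DataFrame({
    "title": ["Androgen receptor (AR) signalling"],
    "abstract": ["Spinal and bulbar muscular atrophy (SBMA) is an inherited disease."],
})
abrv_df = get_abbreviation_df(nlp, df, fields=["title", "abstract"])
print(abrv_df)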
def add_pipe(self, pipe):
    """Add Spacy pipes

    Args:
        pipe (str): pipe name
    """
    print('Loading Spacy pipe: {}'.format(pipe))
    pipe = pipe.lower()
    if pipe == 'abbreviation':
        # Abbreviation extraction
        abbreviation_pipe = AbbreviationDetector(self.nlp)
        self.nlp.add_pipe(abbreviation_pipe)
    elif pipe == 'entitylinker':
        # Entity linker
        linker = UmlsEntityLinker(resolve_abbreviations=True)
        self.nlp.add_pipe(linker)
    elif pipe == 'segmenter':
        # Rule segmenter
        self.nlp.add_pipe(combined_rule_sentence_segmenter, first=True)
    elif pipe == 'tokenizer':
        # Tokenizer
        self.nlp.tokenizer = combined_rule_tokenizer(self.nlp)
    elif pipe == 'textrank':
        # Textrank
        tr = pytextrank.TextRank()
        self.nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
    print('Pipe loaded.')
def initialize_nlp(virus_lex_path: str, scispacy_model_name: str = "en_core_sci_lg"):
    """
    Initialize scispacy nlp object and add virus terms to the vocabulary.

    :param virus_lex_path: path to virus lexicon
    :param scispacy_model_name: name of scispacy model to use for w2v vectors
    :return: Scispacy nlp object
    """
    # Load the scispacy large model
    # nlp = en_core_sci_lg.load(disable='parser')
    # I believe this should work, I wonder if it's not recommended for memory reasons
    # though in a v env like Travis...
    nlp = spacy.load(scispacy_model_name, disable='parser')
    # Enable umls entity detection and abbreviation detection
    linker = UmlsEntityLinker(resolve_abbreviations=True)
    nlp.add_pipe(linker)
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)
    # Create a new vector to assign to the virus terms
    new_vector = nlp(
        """Positive-sense single‐stranded ribonucleic acid virus, subgenus """
        """sarbecovirus of the genus Betacoronavirus. """
        """Also known as severe acute respiratory syndrome coronavirus 2, """
        """also known by 2019 novel coronavirus. It is """
        """contagious in humans and is the cause of the ongoing pandemic of """
        """coronavirus disease. Coronavirus disease 2019 is a zoonotic infectious """
        """disease.""").vector
    # Add virus terms to the model vocabulary and assign to them the new vector created above
    # vocab = Vocab()
    virus_words = pd.read_csv(virus_lex_path, header=None)
    for virus_word in virus_words[0]:
        nlp.vocab.set_vector(virus_word, new_vector)
    return nlp
def parse(self, tex_path: str, tex: str) -> Iterator[Abbreviation]:
    check_for_reserved_characters(tex)
    plaintext, plaintext_to_tex_offset_map = plaintext_and_offset(tex_path, tex)

    # This is the most basic model and had no real performance difference on our inputs,
    # other options include NER models and models with pretrained word vectors.
    nlp = spacy.load("en_core_sci_sm")
    abbreviation_pipe = AbbreviationDetector(nlp)
    nlp.add_pipe(abbreviation_pipe)

    # These dictionaries hold abbreviated forms, their expansions, and the location of
    # the expansions. All of them use the abbreviated form as keys.
    abb_short_forms = {}
    abb_expansions = {}
    expanded_locations = {}
    doc = nlp(plaintext)

    # This extracts the abbreviations from the scispacy model.
    for abrv in doc._.abbreviations:
        count = 0
        for s in NON_ACRONYM_CHARACTERS:
            count += str(abrv).count(s)
        # count makes sure that we don't accidentally include symbols or variables.
        if count == 0:
            abb_short_forms[str(abrv)] = [
                [
                    plaintext_to_tex_offset_map[m.start()],
                    plaintext_to_tex_offset_map[m.start() + len(str(abrv))],
                ]
                for m in re.finditer(str(abrv), plaintext)
            ]
            abb_expansions[str(abrv)] = str(abrv._.long_form)
            x = plaintext.find(str(abrv._.long_form))
            expanded_locations[str(abrv)] = [
                plaintext_to_tex_offset_map[x],
                plaintext_to_tex_offset_map[x + len(str(abrv._.long_form))],
            ]

    # If you want to use another abbreviation detection method in addition to scispacy
    # you may implement it here and add its results to the three dictionaries.

    count = 0
    full_count = 1
    # Yields abbreviated forms and their expansions.
    for abb in abb_short_forms:
        exp_start, exp_end = expanded_locations[abb]
        expanded = abb_expansions[abb]
        tex_sub = tex[exp_start:exp_end]
        context_tex = tex[exp_start - DEFAULT_CONTEXT_SIZE : exp_end + DEFAULT_CONTEXT_SIZE]
        # Yields the expanded form as an Abbreviation type.
        yield Abbreviation(
            text=abb,
            start=exp_start,
            end=exp_end,
            expansion=expanded,
            id_=count,
            tex_path=tex_path,
            tex=tex_sub,
            context_tex=context_tex,
            str_id="f" + str(full_count) + "-0",
        )
        count += 1
        short_count = 0
        # Yields the abbreviated forms as Abbreviation types.
        for location in abb_short_forms[abb]:
            short_count += 1
            start, end = location
            tex_sub = tex[start:end]
            context_tex = tex[start - DEFAULT_CONTEXT_SIZE : end + DEFAULT_CONTEXT_SIZE]
            yield Abbreviation(
                text=abb,
                start=start,
                end=end,
                expansion=expanded,
                id_=count,
                tex_path=tex_path,
                tex=tex_sub,
                context_tex=context_tex,
                str_id="s" + str(full_count) + "-" + str(short_count),
            )
            count += 1
        full_count += 1
# pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_ner_bc5cdr_md-0.2.4.tar.gz
import scispacy
from scispacy.abbreviation import AbbreviationDetector
import spacy
import en_core_sci_lg
from spacy.matcher import Matcher
from pprint import pprint
from my_sentence_splitting import get_sents
import json
from tqdm import tqdm
import re

nlp = spacy.load("en_core_sci_lg")
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)
matcher = Matcher(nlp.vocab)

age_words = [
    "boy", "girl", "man", "woman", 'men', 'women', 'girls', 'boys', 'baby',
    'babies', 'infant', 'mother', 'father', 'male', 'female', 'males',
    'females', 'adult', 'adults', 'children', 'child', 'newborn', 'neonates',
    'fathers', 'toddlers', 'neonate', 'toddler', 'adolescent', 'adolescents',
    'elderly', 'young', 'newborns', 'mothers', 'persons', 'person'
]

age_pattern_1 = [  # this is simple. It will match everything
    {"OP": "+", "LOWER": {"IN": age_words}, "POS": {"IN": ["NOUN"]}},
    {"OP": "?", "LOWER": {"IN": ['patient', 'patients']}},
]
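# --- Possible continuation: run the matcher (not part of the original snippet) ---
# A sketch only: registers age_pattern_1 with the spaCy v2-style Matcher.add signature
# (matching the API used throughout these snippets); the sentence is an invented example.
matcher.add("AGE_MENTION", None, age_pattern_1)

doc = nlp("Two elderly women and one young boy were admitted as patients.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)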
def detect(self, text, detect_relations=False, resolve_abbreviations=False,
           link_with_umls=False, verbose=False):
    if verbose:
        print('-- Will detect named entities using scispaCy.')
        if detect_relations:
            print('-- Will detect relations.')
        if resolve_abbreviations:
            print('-- Will detect abbreviations.')
        if link_with_umls:
            print('-- Will search for UMLS matches.')

    nlp = spacy.load(self.__model)
    if link_with_umls:
        umls_linker = UmlsEntityLinker(k=10, max_entities_per_mention=1)
        nlp.add_pipe(umls_linker)
    if resolve_abbreviations:
        abbrev_detector = AbbreviationDetector(nlp)
        nlp.add_pipe(abbrev_detector)

    doc = nlp(text)

    # Named entities detected:
    ner = set([X.text for X in doc.ents])
    if verbose:
        print('Named Entities detected: {}'.format(ner))

    relations = set()
    if detect_relations:
        matcher = Matcher(nlp.vocab)
        pattern = [{'DEP': 'ROOT'},
                   {'DEP': 'prep', 'OP': "?"},
                   {'DEP': 'agent', 'OP': "?"},
                   {'POS': 'ADJ', 'OP': "?"}]
        matcher.add("matching_1", None, pattern)
        for sentence in sent_tokenize(text):
            matches = matcher(doc)
            k = len(matches) - 1
            span = doc[matches[k][1]:matches[k][2]]
            relations.add(span.text)
            tokens = word_tokenize(sentence)
            pos_tags = pos_tag(tokens)
            chunkGram = r"""Chunk: {<RB.?>?<VB.?><RB.?>?}"""
            chunkParser = RegexpParser(chunkGram)
            chunked = chunkParser.parse(pos_tags)
            for sub_tree in chunked.subtrees():
                if sub_tree.label() == 'Chunk':
                    candidate = ''
                    for leaf in sub_tree.leaves():
                        if candidate == '':
                            candidate = leaf[0]
                        else:
                            candidate = '{} {}'.format(candidate, leaf[0])
                    relations.add(candidate)
        if verbose:
            print('Relations detected: {}'.format(relations))

    abbrev_refs = {}
    if resolve_abbreviations:
        for abbrv in doc._.abbreviations:
            reference = abbrv._.long_form
            if verbose:
                print('- {} : {}'.format(abbrv, reference))
            abbrev_refs[abbrv] = reference
        if verbose:
            print('Abbreviations detected: {}'.format(abbrev_refs))
        # TODO implement resolution (i.e. replace detected abbreviations)

    linked = {}
    if link_with_umls:
        if verbose:
            print('Searching for UMLS matches...')
        entities = str(ner.union(relations))  # Evaluate on both entities and relations
        entities = nlp(entities).ents
        for entity in entities:
            for umls_ent in entity._.umls_ents:
                Concept_Id, Score = umls_ent
                umls_entity = umls_linker.umls.cui_to_entity[Concept_Id]
                if verbose:
                    print("Name:", entity)
                    print('Concept_Id = {} Score = {}'.format(Concept_Id, Score))
                    print(umls_entity)
                if entity.text not in linked:
                    # greater scores are shown first, so no need to add smaller scores.
                    linked[entity.text] = 'sameas\tumls:{}\t{}\t'.format(
                        Concept_Id, umls_entity.canonical_name)
                    break
        if verbose:
            print('UMLS matches: {}'.format(linked))

    return ner, relations, linked
def setUp(self):
    super().setUp()
    self.nlp = spacy.load("en_core_web_sm")
    self.detector = AbbreviationDetector(self.nlp)
    self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \
inherited motor neuron disease caused by the expansion \
of a polyglutamine tract within the androgen receptor (AR). \
SBMA can be caused by this easily."
class TestAbbreviationDetector(unittest.TestCase):

    def setUp(self):
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")
        self.detector = AbbreviationDetector(self.nlp)
        self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \
inherited motor neuron disease caused by the expansion \
of a polyglutamine tract within the androgen receptor (AR). \
SBMA can be caused by this easily."

    def test_find_abbreviation(self):
        # Basic case
        doc = self.nlp("abbreviation (abbrn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # Hyphenation and numbers within the abbreviation
        doc = self.nlp("abbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # No match
        doc = self.nlp("abbreviation (aeb-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form is None

        # First letter must match start of word.
        doc = self.nlp("aaaabbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

        # Matching is greedy for the first letter ("are" is not included).
        doc = self.nlp("more words are considered aaaabbreviation (ab-b9rn)")
        long = doc[0:5]
        short = doc[6:7]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

    def test_filter_matches(self):
        doc = self.nlp(self.text)
        matches = self.detector.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        assert len(filtered) == 2

        long, short = filtered[0]
        assert long.string == "Spinal and bulbar muscular atrophy "
        assert short.string == "SBMA"
        long, short = filtered[1]
        assert long.string == "within the androgen receptor "
        assert short.string == "AR"

    def test_abbreviation_detection(self):
        # Attribute should be registered.
        doc = self.nlp(self.text)
        assert doc._.abbreviations == []
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 3

        correct = set()
        span = doc[33:34]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[6:7]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[29:30]
        span._.long_form = doc[26:28]
        correct.add(span)
        correct_long = {x._.long_form for x in correct}

        assert set(doc2._.abbreviations) == correct
        assert {x._.long_form for x in doc2._.abbreviations} == correct_long

    def test_find(self):
        doc = self.nlp(self.text)
        long, shorts = self.detector.find(doc[6:7], doc)
        assert long.string == "Spinal and bulbar muscular atrophy "
        assert len(shorts) == 2
        assert {x.string for x in shorts} == {"SBMA", "SBMA "}

        long, shorts = self.detector.find(doc[7:13], doc)
        assert shorts == set()

    def test_issue_158(self):
        text = "The PVO observations showed that the total transterminator flux "\
               "was 23% of that at solar maximum and that the largest reductions in the "\
               "number of ions transported antisunward occurred at the highest altitudes "\
               "(Spenner et al., 1995)."
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0
class TestAbbreviationDetector(unittest.TestCase):

    def setUp(self):
        super().setUp()
        self.nlp = spacy.load("en_core_web_sm")
        self.detector = AbbreviationDetector(self.nlp)
        self.text = "Spinal and bulbar muscular atrophy (SBMA) is an \
inherited motor neuron disease caused by the expansion \
of a polyglutamine tract within the androgen receptor (AR). \
SBMA can be caused by this easily."

    def test_find_abbreviation(self):
        # Basic case
        doc = self.nlp("abbreviation (abbrn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # Hyphenation and numbers within the abbreviation
        doc = self.nlp("abbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "abbreviation"

        # No match
        doc = self.nlp("abbreviation (aeb-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form is None

        # First letter must match start of word.
        doc = self.nlp("aaaabbreviation (ab-b9rn)")
        long = doc[0:1]
        short = doc[2:3]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

        # Matching is greedy for the first letter ("are" is not included).
        doc = self.nlp("more words are considered aaaabbreviation (ab-b9rn)")
        long = doc[0:5]
        short = doc[6:7]
        _, long_form = find_abbreviation(long, short)
        assert long_form.text == "aaaabbreviation"

    def test_filter_matches(self):
        doc = self.nlp(self.text)
        matches = self.detector.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        assert len(filtered) == 2

        long, short = filtered[0]
        assert long.text_with_ws == "Spinal and bulbar muscular atrophy "
        assert short.text == "SBMA"
        long, short = filtered[1]
        assert long.text_with_ws == "within the androgen receptor "
        assert short.text == "AR"

    def test_abbreviation_detection(self):
        # Attribute should be registered.
        doc = self.nlp(self.text)
        assert doc._.abbreviations == []
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 3

        correct = set()
        span = doc[33:34]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[6:7]
        span._.long_form = doc[0:5]
        correct.add(span)
        span = doc[29:30]
        span._.long_form = doc[26:28]
        correct.add(span)
        correct_long = {x._.long_form for x in correct}

        assert set(doc2._.abbreviations) == correct
        assert {x._.long_form for x in doc2._.abbreviations} == correct_long

    def test_find(self):
        doc = self.nlp(self.text)
        long, shorts = self.detector.find(doc[6:7], doc)
        assert long.text_with_ws == "Spinal and bulbar muscular atrophy "
        assert len(shorts) == 2
        assert {x.text_with_ws for x in shorts} == {"SBMA", "SBMA "}

        long, shorts = self.detector.find(doc[7:13], doc)
        assert shorts == set()

    def test_issue_158(self):
        text = (
            "The PVO observations showed that the total transterminator flux "
            "was 23% of that at solar maximum and that the largest reductions in the "
            "number of ions transported antisunward occurred at the highest altitudes "
            "(Spenner et al., 1995)."
        )
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

    def test_issue_192(self):
        # test for the <short> (<long>) pattern
        text = "blah SBMA (Spinal and bulbar muscular atrophy)"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 1
        assert doc2._.abbreviations[0] == doc[1:2]
        assert doc2._.abbreviations[0]._.long_form == doc[3:8]

    def test_issue_161(self):
        # test some troublesome cases in the abbreviation detector
        text = "H2)]+(14)s.t. (1), (4).Similarly"
        print(f"Text: {text}")
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = ".(21)In (21), λ"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "map expX (·) : R"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "0,(3)with the following data: (3-i) (q̄"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "Φg(h),ThΦg(v) ) , (h, v)"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "dimension;(S-iii) The optimal control problem obtained in (S-ii) is con-verted"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "z), πut (z)) )"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "repositories he/she already worked with or from previous collaborators. Nevertheless, 88% of the first action of users to a repository (repository discovery) is"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

    def test_empty_span(self):
        text = "(19, 9, 4) Hadamard Designs and Their Residual Designs"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

    def test_space_issue(self):
        text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture."
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 1
        assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT"

    def test_multiple_spaces(self):
        text = "by designing A Lite BERT (ALBERT) architecture that has significantly fewer parameters than a traditional BERT architecture."
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 1
        assert doc2._.abbreviations[0]._.long_form.text == "A Lite BERT"

    @pytest.mark.xfail
    def test_difficult_cases(self):
        # Don't see an obvious way of solving these. They require something more
        # semantic to distinguish.
        text = "is equivalent to (iv) of Theorem"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0

        text = "or to fork.Users work more on their repositories (owners) than on"
        doc = self.nlp(text)
        doc2 = self.detector(doc)
        assert len(doc2._.abbreviations) == 0
def __init__(self, prediction_type: str) -> None:
    # Initialize modules for featurization.
    # To use a smaller model, swap out the parameter with "en_core_sci_sm".
    # The prediction_type is the '+'-separated keys of the joint model heads.
    # They are the names of the datasets on which the joint model was trained.
    # Example: prediction_type = "DocDef2+AI2020+W00"
    logging.debug("Loading Spacy models (this may take some time).")
    self.nlp = spacy.load("en_core_sci_md")
    abbreviation_pipe = AbbreviationDetector(self.nlp)
    self.nlp.add_pipe(abbreviation_pipe)

    # Create a detector for verb phrases.
    verb_pattern = [
        {"POS": "VERB", "OP": "?"},
        {"POS": "ADV", "OP": "*"},
        {"POS": "AUX", "OP": "*"},
        {"POS": "VERB", "OP": "+"},
    ]
    self.verb_matcher = Matcher(self.nlp.vocab)
    self.verb_matcher.add("Verb phrase", None, verb_pattern)

    # Initialize modules for the transformer-based inference model based on the prediction_type.
    self.model_paths = {
        "W00": {
            "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
            "file": "termdef.zip",
            "type": "term-def",
        },
        "AI2020": {
            "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
            "file": "abbrexp.zip",
            "type": "abbr-exp",
        },
        "DocDef2": {
            "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
            "file": "symnick.zip",
            "type": "sym-nick",
        },
        "DocDef2+AI2020+W00": {
            "baseURL": "https://scholarphi.s3-us-west-1.amazonaws.com/",
            "file": "joint_symnick_abbrexp_termdef.zip",
            "type": "joint",
        },
    }
    self.prediction_type = prediction_type
    cache_directory = f"./cache/{self.prediction_type}_model"

    # Make a directory storing model files (./data/)
    if not os.path.exists(cache_directory):
        os.makedirs(cache_directory)
        logging.debug("Created cache directory for models at %s", cache_directory)

        # Download the best model files in ./data/
        MODEL_URL = (self.model_paths[self.prediction_type]["baseURL"]
                     + self.model_paths[self.prediction_type]["file"])
        logging.debug(
            "Downloading model from %s. Warning: this will take a long time.",
            MODEL_URL,
        )
        cache_file = self.model_paths[self.prediction_type]["file"]
        urllib.request.urlretrieve(
            MODEL_URL,
            os.path.join("{}/{}".format(cache_directory, cache_file)),
        )
        with zipfile.ZipFile("{}/{}".format(cache_directory, cache_file), "r") as zip_ref:
            zip_ref.extractall(cache_directory)
        logging.debug("Downloaded and unpacked model data in directory %s", cache_file)
    else:
        logging.debug(  # pylint: disable=logging-not-lazy
            "Cache directory for models already exists at %s. "
            + "Skipping creation of directory and download of data.",
            cache_directory,
        )

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses([
        "--model_name_or_path", "roberta-large",
        "--task", f"{self.prediction_type}",
        "--data_dir", cache_directory,
        "--output_dir", os.path.join(cache_directory, "roberta-large"),
        "--do_eval",
        "--overwrite_cache",
        "--use_crf",
        "--use_heuristic",
        "--use_pos",
        "--use_np",
        "--use_vp",
        "--use_entity",
        "--use_acronym",
        "--per_device_eval_batch_size", "16",
        "--max_seq_len", "80",
    ])

    # Set seed for model.
    set_torch_seed(training_args.seed, training_args.no_cuda)

    # Log basic debugging information about model and arguments.
    logging.info(  # pylint: disable=logging-not-lazy
        "Arguments for NLP model. Process rank: %s, device: %s, "
        + "n_gpu: %s, distributed training: %s, 16-bits training: %s. "
        + "Training / evaluation parameters: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
        training_args,
    )

    # Set model type from arguments.
    model_args.model_type = model_args.model_name_or_path.split("-")[0].split("_")[0]

    # Load model configuration.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logging.warning("You are instantiating a new config instance from scratch.")

    # Load tokenizer.
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. "
            + "This is not supported, but you can do it from another script, "
            + "save it, and load it from here, using --tokenizer_name.")

    # Rename output directory to reflect model parameters.
    training_args.output_dir = "{}{}{}{}{}{}".format(
        training_args.output_dir,
        "_pos={}".format(training_args.use_pos) if training_args.use_pos else "",
        "_np={}".format(training_args.use_np) if training_args.use_np else "",
        "_vp={}".format(training_args.use_vp) if training_args.use_vp else "",
        "_entity={}".format(training_args.use_entity) if training_args.use_entity else "",
        "_acronym={}".format(training_args.use_acronym) if training_args.use_acronym else "",
    )
    logging.info(
        "The output directory for the model has been set to %s",
        training_args.output_dir,
    )

    data_args.ignore_index = training_args.ignore_index
    data_args.output_dir = training_args.output_dir

    # Load the model.
    model_class = MODEL_CLASSES[model_args.model_type]
    if (os.path.exists(training_args.output_dir)
            and not training_args.overwrite_output_dir):
        model = model_class.from_pretrained(
            training_args.output_dir,
            args=training_args,
            intent_label_dict=get_joint_labels(data_args, "intent_label"),
            slot_label_dict=get_joint_labels(data_args, "slot_label"),
            pos_label_lst=get_joint_labels(data_args, "pos_label"),
            # This is because currently there are 3 different models - one for each task.
            tasks=self.prediction_type.split('+'),
        )
        logging.info("Model loaded from %s", training_args.output_dir)
    else:
        logging.error(  # pylint: disable=logging-not-lazy
            "Could not load model from %s. A pre-trained model could "
            + "not be found in the directory. This can occur if the download of the model was "
            + "terminated. Try deleting %s and running this script again.",
            training_args.output_dir,
            cache_directory,
        )
        raise ValueError(
            f"Could not load model from {training_args.output_dir}")

    # model.resize_token_embeddings(len(tokenizer))
    self.data_args = data_args
    self.model_args = model_args
    self.tokenizer = tokenizer
    self.model = model
    self.trainer = Trainer(
        [
            training_args,
            self.model_args,
            self.data_args,
        ],
        self.model,
    )
def determine_gene_associations(self, verbed=True, twosents=False):
    """Determine sentences with specified gene association using natural language processing

    Args:
        verbed (bool): A verb is required in the middle of the regex assoc and the gene;
            this eliminates sentences that do not make a claim on the gene.
        twosents (bool): Look up possible co-occurrence in a sliding window of two
            sentences instead of sentence by sentence. TODO not implemented yet
    """
    import spacy
    from scispacy.abbreviation import AbbreviationDetector

    if twosents:
        raise NotImplementedError
    try:
        nlp = spacy.load('en')  # en_ner_craft_md
        # Detect abbreviations
        abbreviation_pipe = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_pipe)
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        # Prevent splitting intra-word hyphens
        suffixes = nlp.Defaults.suffixes + (r'''\w+-\w+''',)
        suffix_regex = spacy.util.compile_suffix_regex(suffixes)
        nlp.tokenizer.suffix_search = suffix_regex.search
    except OSError:
        raise Exception(
            '''spacy language module not installed.
            Run: python -m spacy download en
            '''
            # pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_ner_craft_md-0.2.0.tar.gz
        )

    self.gene_association = {}
    self.gene_association_sents = {}
    pos_of_interest = ('VERB', 'NOUN', 'ADP', 'PUNCT', 'GENE')
    for association in self.associations:
        abstract = nlp(association['content'])
        sentences = list(abstract.sents)
        for sent in sentences:
            assoc_match = self.assoc.search(sent.text)
            if assoc_match:
                sent_startposition = sent[0].idx
                before_assoc_match = True
                inbetween_feature_vectors = {}
                for token in sent:
                    # Only look up the gene symbol if the token is not likely to be a general English word
                    gene_symbol = (
                        None if (token.text.isalpha() and
                                 (token.is_sent_start or token.text.islower()))
                        else self.get_gene_symbol(token.text)
                    )
                    if gene_symbol:
                        # import pdb; pdb.set_trace()
                        association_key = (
                            association['pmid'], association['date'], token.text,
                            (assoc_match.start(), assoc_match.end())
                        )
                    else:
                        association_key = None

                    # First check if still before match
                    if (assoc_match.start() < token.idx - sent_startposition) and before_assoc_match:
                        before_assoc_match = False
                        # Store before_assoc_match feature vectors
                        for iv in inbetween_feature_vectors:
                            if iv not in self.gene_association:
                                self.gene_association[iv] = {}
                            prev_association_key = inbetween_feature_vectors[iv].pop('association_key')
                            if prev_association_key not in self.gene_association[iv]:
                                self.gene_association[iv][prev_association_key] = []
                            self.gene_association[iv][prev_association_key].append(
                                inbetween_feature_vectors[iv]
                            )
                        inbetween_feature_vector = {p: 0 for p in pos_of_interest}
                        inbetween_feature_vector['sent'] = hash(sent)

                    if before_assoc_match:
                        if gene_symbol:
                            # For previous genes update GENE count (TODO retroactive for genes coming after)
                            for iv in inbetween_feature_vectors:
                                inbetween_feature_vectors[iv]['GENE'] += 1
                            # Initialise feature vector for each gene symbol
                            for gs in gene_symbol:
                                inbetween_feature_vectors[gs] = {p: 0 for p in pos_of_interest}
                                inbetween_feature_vectors[gs]['sent'] = hash(sent)
                                inbetween_feature_vectors[gs]['association_key'] = association_key
                            self.gene_association_sents[hash(sent)] = sent
                        elif token.pos_ in pos_of_interest:
                            for iv in inbetween_feature_vectors:
                                inbetween_feature_vectors[iv][token.pos_] += 1
                    else:
                        if gene_symbol:
                            for gs in gene_symbol:
                                if gs not in self.gene_association:
                                    self.gene_association[gs] = {}
                                if association_key not in self.gene_association[gs]:
                                    self.gene_association[gs][association_key] = []
                                self.gene_association[gs][association_key].append(
                                    inbetween_feature_vector.copy()
                                )
                            self.gene_association_sents[hash(sent)] = sent
                            inbetween_feature_vector['GENE'] += 1
                        elif token.pos_ in pos_of_interest:
                            inbetween_feature_vector[token.pos_] += 1
def main(medmentions_path: str,
         umls_path: str,
         model_path: str,
         ks: str,
         thresholds,
         use_gold_mentions: bool = False,
         train: bool = False,
         spacy_model: str = "",
         generate_linker_data: bool = False,
         use_soft_matching: bool = False,
         substitute_abbreviations: bool = False):

    umls_concept_list = load_umls_kb(umls_path)
    umls_concept_dict_by_id = {c['concept_id']: c for c in umls_concept_list}

    # We need to keep around a map from text to possible canonical ids that they map to.
    text_to_concept_id: Dict[str, Set[str]] = defaultdict(set)
    for concept in umls_concept_list:
        for alias in set(concept["aliases"]).union({concept["canonical_name"]}):
            text_to_concept_id[alias].add(concept["concept_id"])

    if train:
        create_tfidf_ann_index(model_path, text_to_concept_id)
    ann_concept_aliases_list, tfidf_vectorizer, ann_index = load_tfidf_ann_index(model_path)
    candidate_generator = CandidateGenerator(ann_index, tfidf_vectorizer,
                                             ann_concept_aliases_list,
                                             text_to_concept_id, False)
    linking_classifier = load_linking_classifier(model_path)
    linker = Linker(umls_concept_dict_by_id, linking_classifier)

    print('Reading MedMentions...')
    train_examples, dev_examples, test_examples = data_util.read_full_med_mentions(
        medmentions_path, spacy_format=False)

    k_list = [int(k) for k in ks.split(',')]
    if thresholds is None:
        thresholds = [1.0]
    else:
        thresholds = [float(x) for x in thresholds.split(",")]

    if len(thresholds) > 1 or len(k_list) > 1:
        assert not generate_linker_data, \
            'generating linker training data should be for a single threshold and k'

    nlp = spacy.load(spacy_model)
    if substitute_abbreviations:
        abbreviation_detector = AbbreviationDetector(nlp)
        nlp.add_pipe(abbreviation_detector, last=True)

    if generate_linker_data:
        examples_list = [train_examples, dev_examples, test_examples]
        filenames = [f'{model_path}/train.jsonl',
                     f'{model_path}/dev.jsonl',
                     f'{model_path}/test.jsonl']
        for examples, filename in zip(examples_list, filenames):
            supervised_data = eval_candidate_generation_and_linking(
                examples, umls_concept_dict_by_id, candidate_generator, k_list,
                thresholds, use_gold_mentions, nlp, generate_linker_data, linker,
                use_soft_matching, substitute_abbreviations)
            with open(filename, 'w') as f:
                for d in supervised_data:
                    f.write(f'{json.dumps(d)}\n')
    else:
        print('Results on the DEV set')
        eval_candidate_generation_and_linking(
            dev_examples, umls_concept_dict_by_id, candidate_generator, k_list,
            thresholds, use_gold_mentions, nlp, generate_linker_data, linker,
            use_soft_matching, substitute_abbreviations)