def lemmatize_texts(lemmatizer):
    entries = Entry.objects.filter(lemmatized='')

    if lemmatizer == 'stanford':
        texts = [(entry.text, entry.id) for entry in entries]
        snlp = stanfordnlp.Pipeline(lang='ru')
        nlp = StanfordNLPLanguage(snlp)
        for doc in tqdm.tqdm(
                nlp.pipe(texts,
                         batch_size=100,
                         as_tuples=True,
                         disable=["tagger", "parser", "pos", "depparse"])):
            id = doc[1]
            lemmatized = ' '.join([token.lemma_ for token in doc[0]])
            entry = Entry.objects.get(id=id)
            entry.lemmatized = lemmatized
            entry.save()

    if lemmatizer == 'mystem':
        m = Mystem()
        for entry in tqdm.tqdm(entries):
            lemmas = m.lemmatize(entry.text)
            lemmatized = ''.join(lemmas)
            entry = Entry.objects.get(id=entry.id)
            entry.lemmatized = lemmatized
            entry.save()
def __clean_text(self, df):
    config = {
        'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
        'lang': 'ru',  # Language code for the language to build the Pipeline in
        'tokenize_model_path': 'C:\\1Vadim\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_tokenizer.pt',
        'pos_model_path': 'C:\\1Vadim\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_tagger.pt',
        'pos_pretrain_path': 'C:\\1Vadim\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus.pretrain.pt',
        'lemma_model_path': 'C:\\1Vadim\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_lemmatizer.pt',
        'depparse_model_path': 'C:\\1Vadim\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_parser.pt',
        'depparse_pretrain_path': 'C:\\1Vadim\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus.pretrain.pt'
    }
    snlp = stanfordnlp.Pipeline(**config)
    nlp = StanfordNLPLanguage(snlp)

    text_list = df["Text"].values

    lower_text_list = []
    for text in text_list:
        text_lower = text.lower()
        lower_text_list.append(text_lower)

    clean_text_list = []
    for text in lower_text_list:
        text = nlp(text)
        token = [token.lemma_ for token in text if not (token.is_punct or token.is_stop)]
        clean_text_list.append(token)

    return clean_text_list
def tag_relations(text, terms, bags, nlp=None):
    """ Modified version of tag relations that handles the special case of
    making predictions on new data without known relation labels.
    """

    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)

    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if (type(text) != spacy.tokens.doc.Doc
            and type(text) != spacy.tokens.span.Span):
        text = nlp(text)

    results = tag_terms(text, terms, nlp)
    tokenized_text = results["tokenized_text"]
    tagged_text = results["tags"]
    found_terms_info = results["found_terms"]

    found_terms = list(found_terms_info.keys())
    for i in range(len(found_terms) - 1):
        for j in range(i + 1, len(found_terms)):
            term_pair = (found_terms[i], found_terms[j])
            bags = add_relation(term_pair, found_terms_info, tokenized_text, bags)
            term_pair_reverse = (found_terms[j], found_terms[i])
            bags = add_relation(term_pair_reverse, found_terms_info, tokenized_text, bags)

    return bags
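A minimal usage sketch for tag_relations, mirroring the per-sentence loop in main() further below. It assumes an English stanfordnlp model has been downloaded and that tag_terms/add_relation are importable from this module.

# Hedged usage sketch; the text and term strings are illustrative only.
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)
terms = [nlp(t) for t in ["cell", "cell wall"]]   # pre-parse terms once to avoid repeated work
bags = {"no-relation": []}
doc = nlp("A biologist will tell you that a cell contains a cell wall.")
for sent in doc.sents:
    bags = tag_relations(sent, terms, bags, nlp)  # accumulates term-pair relation bags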
def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
    if model_or_lang is None:
        model_or_lang = 'en' if use_stanfordnlp else 'en_core_web_sm'

    nlp = None
    if use_stanfordnlp:
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized)
        nlp = StanfordNLPLanguage(snlp)
    else:
        # Initialize model, with custom pipe
        # taking into account 'is_tokenized', 'disable_sbd', and 'include_headers'
        nlp = spacy.load(model_or_lang)
        if is_tokenized:
            nlp.tokenizer = nlp.tokenizer.tokens_from_list
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')

    conllformatter = ConllFormatter(nlp)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
def geocode_entries():
    snlp = stanfordnlp.Pipeline(lang='ru')
    nlp = StanfordNLPLanguage(snlp)
    entries = Entry.objects.all()
    for entry in tqdm.tqdm(entries):
        doc = nlp(entry.text)
        words = [
            token.text for token in doc
            if token.is_punct is False and token.is_stop is False
        ]
        for word in words:
            geolocator = Nominatim(user_agent="prozhito_db")
            location = geolocator.geocode(word)
            if location:
                print(location)
def main(config, input_text, terms, out_dir, model_version):
    logger = config.get_logger('test')

    # set up spacy nlp engine
    warnings.filterwarnings('ignore')
    sys.stdout = open(os.devnull, "w")
    snlp = stanfordnlp.Pipeline(lang="en")
    nlp = StanfordNLPLanguage(snlp)
    sys.stdout = sys.__stdout__

    # read in text and terms
    with open(input_text, "r") as f:
        lines = f.readlines()

    if terms.endswith(".txt"):
        with open(terms, "r") as f:
            terms = f.readlines()
    elif terms.endswith(".json"):
        with open(terms, "r") as f:
            terms = list(json.load(f).keys())

    # build input term pair bags
    terms = [nlp(term, disable=["ner", "parser"]) for term in terms]
    bags = {"no-relation": []}
    print("Preprocessing Data")
    for line in tqdm(lines):
        if len(line.strip()) == 0:
            continue
        doc = nlp(line, disable=["ner", "parser"])
        for sent in doc.sents:
            bags = tag_relations(sent, terms, bags, nlp)

    # write out to tmp file for loading which we delete later
    tmp_input_file = "./relations_tmp.json"
    with open(tmp_input_file, "w") as f:
        json.dump(bags, f)

    print("Predicting Relations")
    predictions = relation_model_predict(config, logger)
    predictions = postprocess_relation_predictions(predictions)

    os.remove(tmp_input_file)

    input_filename = input_text.split("/")[-1][:-4]
    filename = f"{out_dir}/{input_filename}_{model_version}_predicted_relations.json"
    with open(filename, "w") as f:
        json.dump(predictions, f, indent=4)
class RuleBasedPreprocessor(PreprocessorBase):
    """ For rule based conversion, the entire conversion should happen in the preprocessor """

    def __init__(self) -> None:
        snlp = stanfordnlp.Pipeline(lang='en')  # stanfordnlp python pipeline
        self.nlp = StanfordNLPLanguage(snlp)  # spacy wrapper for snlp
        conllformatter = ConllFormatter(self.nlp)
        self.nlp.add_pipe(conllformatter, last=True)
        self.detokenizer = MosesDetokenizer()
        self.vanila_preprocessor = PreprocessorBase()

    def __call__(self, q: str, o: str) -> Tuple[str, Dict]:
        if '_' in q:  # FITB: handle it and return early
            h, meta = self.vanila_preprocessor(q, o)
            return h, meta

        if o in q:  # most likely a preprocessed FITB question
            meta = {'question': q, 'option': o}
            return q, meta

        # the old code throws UserWarnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            q_doc = self.nlp(q)
            o_doc = self.nlp(o)

        try:
            q_conll_dict = parse(q_doc._.conll_str)[0].tokens
            o_conll_dict = parse(o_doc._.conll_str)[0].tokens
        except IndexError:
            logger.error(f"Index error on parse for {q}")
            h = q + ' ' + o
            meta: Dict[str, Any] = {
                'question': q,
                'option': o,
                'conversion_issues': [str(ConversionIssue.UNKNOWN)]
            }
            return h, meta

        rule_q = Question(deepcopy(q_conll_dict))  # type:ignore
        rule_o = AnswerSpan(deepcopy(o_conll_dict))  # type:ignore
        conversion_issues = []
        meta = {'question': q, 'option': o}

        if not rule_q.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_QUESTION)
        if not rule_o.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_OPTION)

        # if a conversion issue is encountered, just concatenate q + o
        if conversion_issues:
            h = q + ' ' + o
        else:
            rule_q.insert_answer_default(rule_o)
            h = self.detokenizer.detokenize(rule_q.format_declr(), return_str=True)

        meta['conversion_issues'] = [str(issue) for issue in conversion_issues]
        if meta['conversion_issues']:
            logger.debug(f"Issues {conversion_issues} encountered for {q} + {o}")

        return h, meta
def spacy_stanfordnlp_en_with_formatter():
    snlp = stanfordnlp.Pipeline(lang='en')
    nlp = StanfordNLPLanguage(snlp)
    conllformatter = ConllFormatter(nlp)
    nlp.add_pipe(conllformatter, last=True)
    return nlp
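A brief usage sketch for the factory above, assuming the English stanfordnlp model has already been downloaded; the ConllFormatter pipe exposes the parse as a CoNLL-U string on the Doc, accessed elsewhere in this file via doc._.conll_str.

# Hedged usage sketch; the example sentence is illustrative only.
nlp = spacy_stanfordnlp_en_with_formatter()
doc = nlp("Barack Obama was born in Hawaii.")
print(doc._.conll_str)  # CoNLL-U formatted output added by the ConllFormatter pipe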
import spacy
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

from extraction.extractor import PhraseExtractor, PhraseHighlighter

if __name__ == '__main__':
    # Examples from SIFRank
    text_1 = "NuVox shows staying power with new cash, new market Who says you can't raise cash in today's telecom market? NuVox Communications positions itself for the long run with $78.5 million in funding and a new credit facility"
    text_2 = "This paper deals with two questions: Does social capital determine innovation in manufacturing firms? If it is the case, to what extent? To deal with these questions, we review the literature on innovation in order to see how social capital came to be added to the other forms of capital as an explanatory variable of innovation. In doing so, we have been led to follow the dominating view of the literature on social capital and innovation which claims that social capital cannot be captured through a single indicator, but that it actually takes many different forms that must be accounted for. Therefore, to the traditional explanatory variables of innovation, we have added five forms of structural social capital (business network assets, information network assets, research network assets, participation assets, and relational assets) and one form of cognitive social capital (reciprocal trust). In a context where empirical investigations regarding the relations between social capital and innovation are still scanty, this paper makes contributions to the advancement of knowledge in providing new evidence regarding the impact and the extent of social capital on innovation at the two decisionmaking stages considered in this study"

    # stanfordnlp.download('en')
    nlp = spacy.load('en_core_web_sm')
    corenlp = StanfordNLPLanguage(stanfordnlp.Pipeline(lang="en"))

    spacy_native = PhraseExtractor(nlp, np_method='NOUN_CHUNKS')
    spacy_grammar = PhraseExtractor(nlp, grammar='GRAMMAR1', np_method='NOUN_CHUNKS', np_tags='NLTK', stopwords='NLTK')
    corenlp_grammar = PhraseExtractor(corenlp, grammar='GRAMMAR1', np_method='NOUN_CHUNKS', np_tags='NLTK', stopwords='NLTK')

    # SHOW RESULTS
    # grammar method (corenlp tags)
    print(PhraseHighlighter.to_html(text_1, corenlp_grammar.run(text_1)))
    print(PhraseHighlighter.to_html(text_2, corenlp_grammar.run(text_2)))
def __init__(self):
    self.name = 'StanfordNLP'
    self.snlp = stanfordnlp.Pipeline(
        lang='fi', models_dir='data/stanfordnlp_resources')
    self.nlp = StanfordNLPLanguage(self.snlp)
def init_nlp(config):
    if config.get('name') == NLPs.SPACY:
        return spacy.load(config.get('model_name'))
    elif config.get('name') == NLPs.CORENLP:
        return StanfordNLPLanguage(
            stanfordnlp.Pipeline(lang=config.get('model_name')))
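A hedged example of the config dict that init_nlp expects; the NLPs.SPACY / NLPs.CORENLP constants are assumed to be defined elsewhere in this project, and the model names are illustrative.

# Hedged usage sketch; config keys match the .get() calls above.
spacy_nlp = init_nlp({'name': NLPs.SPACY, 'model_name': 'en_core_web_sm'})
corenlp_nlp = init_nlp({'name': NLPs.CORENLP, 'model_name': 'en'})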
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.

    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
           installed, e.g. spacy-stanza. Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). For stanza and
           stanfordnlp, this will also cause sentence segmentation *only* to be done by splitting on new lines.
           See the documentation: https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           See the documentation: https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
    :param parser_opts: will be passed to the core pipeline. For spacy and udpipe, it will be passed to their
           `.load()` initialisations, for stanfordnlp and stanza `pipeline_opts` is passed to their `.Pipeline()`
           initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        nlp = spacy.load(model_or_lang, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized, **parser_opts)
        nlp = StanfordNLPLanguage(snlp)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang=model_or_lang, tokenize_pretokenized=is_tokenized, **parser_opts)
        nlp = StanzaLanguage(snlp)
    elif parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **parser_opts)
    else:
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    conllformatter = ConllFormatter(nlp, **kwargs)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
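A short usage sketch for init_parser, assuming the relevant language models are downloaded; the ConllFormatter pipe added at the end exposes the parse via the doc._.conll_str extension used elsewhere in this file.

# Hedged usage sketch; swap "stanfordnlp" for "spacy", "stanza", or "udpipe" as needed.
nlp = init_parser("stanfordnlp", "en")
doc = nlp("She gave the book to her sister.")
print(doc._.conll_str)  # CoNLL-U output added by the ConllFormatter pipe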
# Imports assumed from the notebook's first cell (not shown in this excerpt);
# MosesDetokenizer is assumed to come from sacremoses.
from pathlib import Path
import warnings

import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage
from spacy_conll import ConllFormatter
from sacremoses import MosesDetokenizer

import tqdm
import multiprocessing as mp
from typing import *
import json

# In[2]:

# stanfordnlp.download('en')

# Config
input_path = Path('train_has_following.json')
output_path = Path('.data/RACE/train_has_following_reconverted.json')

# In[3]:

snlp = stanfordnlp.Pipeline(lang='en')
nlp = StanfordNLPLanguage(snlp)
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)
detokenizer = MosesDetokenizer()

# In[4]:

# load data
with open(input_path) as f:
    samples = json.load(f)

# In[5]:

warnings.filterwarnings("ignore")
# Imports assumed from the top of the original script (not shown in this excerpt)
from collections import Counter
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

adjectives = []  # assumed initialisation: the list is appended to below but not defined in this excerpt
verbs = []
sentiment = 0

raw = open('suomi24vuodet2/suomi24kommentit2017.txt').read()
sentences = sent_tokenize(raw)
stop_words = set(stopwords.words('finnish'))

# Top 20 most common named entities in the comments
words = [
    "asia", "Suomi", "Turkki", "Helsinki", "Thaimaa", "Kanaria", "Kreikka",
    "Australia", "USA", "Thaimaa", "Alanyassa", "Italia", "Bulgaria", "Intia",
    "Gambia", "Teneriffa", "Turku", "Tunisia", "Tampere", "Usa"
]

# initialize spacy with the stanford-nlp pipeline model for the Finnish language
snlp = stanfordnlp.Pipeline(lang="fi", processors="tokenize,mwt,lemma,pos")
nlp = StanfordNLPLanguage(snlp)

# Find adjectives and verbs from the comments:
for sentence in sentences:
    for word in words:
        if word in sentence:
            doc = nlp(sentence)
            for token in doc:
                if (token.text not in stop_words and token.pos_ == "ADJ"):
                    print(token.text, token.pos_)
                    adjectives.append((token.text))
                if (token.text not in stop_words and token.pos_ == "VERB"):
                    print(token.text, token.pos_)
                    verbs.append((token.text))

a = Counter(adjectives)
def RuBERT_ents():
    deleted_entries = []
    entries = Entry.objects.filter(~Q(RuBERT=True))  # Load all entries where RuBERT is not true

    # Split the process into blocks of 1000 to avoid RuntimeError: CUDA out of memory
    snlp = stanfordnlp.Pipeline(lang='ru', use_gpu=False)
    ner_model = build_model(configs.ner.ner_rus_bert, download=True)  # This will download the model if not present

    for entry in tqdm.tqdm(entries):
        try:
            if entry.text is not None and len(entry.text) > 0:
                # Error in entry
                """{'_state': <django.db.models.base.ModelState at 0x7fcc7e6ef5f8>, 'id': 226316, 'text': ' ', 'lemmatized': ' \n', 'date_start': datetime.date(1943, 3, 23), 'date_end': None, 'author_id': 978, 'diary': 988, 'sentiment': None, 'RuBERT': False}"""
                # Throws stanfordnlp assertion error, assert input_str is not None and len(input_str) > 0, conll.py line 20
                # Deleted the entry and all runs well, come back to this if reoccurring
                nlp = StanfordNLPLanguage(snlp)
                doc = nlp(entry.text)
                block_size = 200
                token_blocks = [
                    doc[i * block_size:(i + 1) * block_size]
                    for i in range((len(doc) + block_size - 1) // block_size)
                ]
                for block in token_blocks:
                    # Limit to first 510 subtokens to avoid 'RuntimeError: input sequence after bert tokenization shouldn't exceed 512 tokens.'
                    sent_text = " ".join([token.lemma_ for token in block])
                    try:
                        result = ner_model([sent_text])
                        for i in range(len(result[0][0])):
                            token = result[0][0][i]
                            ent = result[1][0][i]
                            if 'B-' in ent:  # single token ent
                                ent_type = ent.split('-')[1]
                                span = find_span(result, i)
                                ent_text = ' '.join([token for token in result[0][0][span[0]:span[1]]])
                                print('found', ent_type, ent_text, 'in span', span)

                                if ent_type == 'LOC':
                                    try:
                                        geolocator = Nominatim(user_agent="prozhito_db")
                                        location = geolocator.geocode(ent_text)
                                        if location:
                                            place = Place.objects.get_or_create(
                                                name=location[0],
                                                geom=Point(location.longitude, location.latitude))
                                            entry.places.add(place[0])
                                            entry.save()
                                    except Exception as e:
                                        print(e)
                                        place = Place.objects.get_or_create(name=ent_text, )
                                        entry.places.add(place[0])
                                        entry.save()

                                if ent_type == 'ORG':
                                    Keyword.objects.update_or_create(name=ent_text, )

                                if ent_type == 'PER':
                                    extractor = NamesExtractor()
                                    matches = extractor(sent_text)
                                    if not len(matches) == 0:
                                        for match in matches:
                                            if match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.last} ')
                                            if match.fact.first and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.last} ')
                                            if match.fact.first and match.fact.middle:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    patronymic=match.fact.middle,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.last} ')
                                            if match.fact.first and match.fact.middle and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    patronymic=match.fact.middle,
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.middle} {match.fact.last} ')
                                    else:
                                        names = ent_text.split(' ')
                                        # if len(names) == 1:
                                        #     person = Person.objects.update_or_create(family_name=names[0], from_natasha=True)
                                        #     entry.people.add(person[0])
                                        #     entry.save()
                                        #     print(f'[*] added person {names[0]} ')
                                        # if len(names) == 2:
                                        #     person = Person.objects.update_or_create(first_name=names[0], family_name=names[1], from_natasha=True)
                                        #     entry.people.add(person[0])
                                        #     entry.save()
                                        #     print(f'[*] added person {names[0]} {names[1]} ')
                                        punct = ['.', ',', '-', ';', ':']
                                        if len(names) == 3:
                                            if not [token in punct for token in names]:
                                                person = Person.objects.update_or_create(
                                                    first_name=names[0],
                                                    patronymic=names[1],
                                                    family_name=names[2],
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {names[0]} {names[1]} {names[2]} ')
                    except Exception as e:
                        print(e)

            entry.RuBERT = True
            entry.save()

        except AssertionError:
            print(f"Stanfordnlp assertion error, deleting entry {entry.id}")
            deleted_entries.append(entry)
            entry.delete()

    [print(entry.id, entry.text) for entry in deleted_entries]
def tag_terms(text, terms, nlp=None):
    """ Identifies and tags any terms in a given input text.

    Searches through the input text and finds all terms (single words and phrases)
    that are present in the list of provided terms. Returns a list of found terms
    with indices and POS tagging as well as a BIOES tagged version of the sentence
    denoting where the terms are in the sentences.

    Additionally classifies terms as either entities or events and annotates the
    presence of the terms in the original sentence with these labels.

    Uses spacy functionality to tokenize and lemmatize for matching text (it is
    recommended to preprocess with Spacy before inputting to prevent repeated work
    if calling multiple times).

    Gives precedence to longer terms first so that terms that are part of a larger
    term phrase are ignored (i.e. match 'cell wall', not 'cell' within the phrase
    cell wall).

    Parameters
    ----------
    text: str | spacy.tokens.doc.Doc
        Input text that will be/are preprocessed using spacy and searched for terms
    terms: list of str | list of spacy.tokens.doc.Doc
        List of input terms that will be/are preprocessed using spacy.
    nlp:
        Spacy nlp pipeline object that will tokenize, POS tag, lemmatize, etc.

    Returns
    -------
    dict with four entries:
        tokenized_text: tokenized text as list of tokens
        tags: list of BIOES tags for the tokenized text
        annotated_text: original text with <entity> and <event> tags put around found terms
        found_terms: list of found terms each with list of indices where matches were found,
            basic part of speech information, and entity/event tag

    Examples
    --------
    >>> tag_terms('A biologist will tell you that a cell contains a cell wall.',
                  ['cell', 'cell wall', 'biologist'])
    {'tokenized_text': ['A', 'biologist', 'will', 'tell', 'you', 'that', 'a', 'cell',
                        'contains', 'a', 'cell', 'wall', '.'],
     'tags': ['O', 'S', 'O', 'O', 'O', 'O', 'O', 'S', 'O', 'O', 'B', 'E', 'O'],
     'annotated_text': 'A <entity>biologist</entity> will tell you that a <entity>cell</entity> contains a <entity>cell wall</entity>.',
     'found_terms': {
         'cell wall': {'text': ['cell wall'], 'indices': [(10, 12)], 'pos': ['NN NN'], 'type': ['Entity']},
         'biologist': {'text': ['biologist'], 'indices': [(1, 2)], 'pos': ['NN'], 'type': ['Entity']},
         'cell': {'text': ['cell'], 'indices': [(7, 8)], 'pos': ['NN'], 'type': ['Entity']}}}
    """
    from spacy.lang.en.stop_words import STOP_WORDS
    spacy.tokens.token.Token.set_extension('workaround', default='', force=True)

    HEURISTIC_TOKENS = ["-", "plant", "substance", "atom"]

    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)

    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if (type(text) != spacy.tokens.doc.Doc
            and type(text) != spacy.tokens.span.Span):
        text = nlp(text)

    # set up a custom representation of the text where we can add term type annotations
    for token in text:
        token._.workaround = token.text_with_ws

    lemmatized_text = [token.lemma_ for token in text]
    tokenized_text = [token.text for token in text]
    tags = ['O'] * len(text)
    found_terms = defaultdict(lambda: {
        "text": [],
        "indices": [],
        "pos": [],
        "type": []
    })

    # iterate through terms from longest to shortest
    terms = sorted(terms, key=len)[::-1]
    for spacy_term in terms:
        term_length = len(spacy_term)
        lemma_term_list = [token.lemma_ for token in spacy_term]
        text_term_list = [token.text for token in spacy_term]
        term_lemma = " ".join(lemma_term_list)

        # skip short acronyms that can cause problems
        if len(term_lemma) <= 2:
            continue

        # additional check to check for simple plural of uncommon biology terms
        match_uncommon_plural = lemma_term_list.copy()
        match_uncommon_plural[-1] = match_uncommon_plural[-1] + "s"

        # additional check using heuristics on lemmatized version
        match_heuristic = []
        if lemma_term_list[0] not in HEURISTIC_TOKENS:
            for token in lemma_term_list:
                if token not in HEURISTIC_TOKENS:
                    match_heuristic += token.split("-")
            heuristic_length = len(match_heuristic)
        else:
            heuristic_term = lemma_term_list
            heuristic_length = len(lemma_term_list)

        for ix in range(len(text) - term_length):
            heuristic_match = (lemmatized_text[ix:ix + heuristic_length] == match_heuristic)
            plural_match = (lemmatized_text[ix:ix + term_length] == match_uncommon_plural)
            lemma_match = (lemmatized_text[ix:ix + term_length] == lemma_term_list)
            text_match = (tokenized_text[ix:ix + term_length] == text_term_list)
            lower_match = ([t.lower() for t in tokenized_text[ix:ix + term_length]]
                           == [t.lower() for t in text_term_list])

            # Only match on text if lemmatized version is a stop word (i.e. lower casing acronym)
            if term_lemma in STOP_WORDS:
                valid_match = text_match
            else:
                valid_match = heuristic_match or plural_match or text_match or lemma_match or lower_match

            if valid_match:
                if heuristic_match and not lemma_match:
                    match_length = heuristic_length
                else:
                    match_length = term_length

                term_text = " ".join([t.text for t in text[ix:ix + match_length]])
                term_tag = " ".join([t.tag_ for t in text[ix:ix + match_length]])

                # only tag term if not part of larger term
                if tags[ix:ix + match_length] == ["O"] * match_length:
                    # classify term type
                    term_type = determine_term_type(spacy_term)

                    # collect term information
                    found_terms[term_lemma]["text"].append(term_text)
                    found_terms[term_lemma]["indices"].append((ix, ix + match_length))
                    found_terms[term_lemma]["pos"].append(term_tag)
                    found_terms[term_lemma]["type"].append(term_type)

                    # update sentence tags
                    tags = tag_bioes(tags, ix, match_length)

                    # annotate token representations with term type
                    text[ix]._.workaround = f"<{term_type}>" + text[ix]._.workaround
                    end_ix = ix + match_length - 1
                    if text[end_ix]._.workaround.endswith(" "):
                        text[end_ix]._.workaround = text[end_ix]._.workaround[:-1] + f"</{term_type}> "
                    else:
                        text[end_ix]._.workaround += f"</{term_type}>"

    # reconstruct fully annotated input text
    annotated_text = ""
    for token in text:
        annotated_text += token._.workaround

    return {
        "tokenized_text": tokenized_text,
        "tags": tags,
        "annotated_text": annotated_text,
        "found_terms": dict(found_terms)
    }
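A hedged usage sketch of tag_terms, mirroring the docstring example above; it assumes an English stanfordnlp model is available, and passes a preloaded nlp object so the pipeline is not rebuilt on every call.

# Hedged usage sketch; the sentence and term list come from the docstring example.
snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)
result = tag_terms("A biologist will tell you that a cell contains a cell wall.",
                   ["cell", "cell wall", "biologist"], nlp)
print(result["tags"])            # BIOES tags aligned with result["tokenized_text"]
print(result["annotated_text"])  # original text with <entity>/<event> markup around found terms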