Example #1
def __init__(self) -> None:
    snlp = stanfordnlp.Pipeline(lang='en')  # stanfordnlp python pipeline
    self.nlp = StanfordNLPLanguage(snlp)  # spaCy wrapper for snlp
    conllformatter = ConllFormatter(self.nlp)
    self.nlp.add_pipe(conllformatter, last=True)
    self.detokenizer = MosesDetokenizer()
    self.vanila_preprocessor = PreprocessorBase()
Example #2
def lemmatize_texts(lemmatizer):
    entries = Entry.objects.filter(lemmatized='')

    if lemmatizer == 'stanford':
        texts = [(entry.text, entry.id) for entry in entries]
        snlp = stanfordnlp.Pipeline(lang='ru')
        nlp = StanfordNLPLanguage(snlp)
        for doc, entry_id in tqdm.tqdm(
                nlp.pipe(texts,
                         batch_size=100,
                         as_tuples=True,
                         disable=["tagger", "parser", "pos", "depparse"])):
            lemmatized = ' '.join([token.lemma_ for token in doc])
            entry = Entry.objects.get(id=entry_id)
            entry.lemmatized = lemmatized
            entry.save()
    elif lemmatizer == 'mystem':
        m = Mystem()
        for entry in tqdm.tqdm(entries):
            lemmas = m.lemmatize(entry.text)
            entry.lemmatized = ''.join(lemmas)
            entry.save()
Example #3
    def __clean_text(self, df):
        config = {
            'processors': 'tokenize,pos,lemma,depparse',  # Comma-separated list of processors to use
            'lang': 'ru',  # Language code for the language to build the Pipeline in
            'tokenize_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_tokenizer.pt',
            'pos_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_tagger.pt',
            'pos_pretrain_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus.pretrain.pt',
            'lemma_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_lemmatizer.pt',
            'depparse_model_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus_parser.pt',
            'depparse_pretrain_path': 'C:\\1Vadim\\IT\\Repositories\\KURSACH\\Project\\stanfordnlp\\stanfordnlp_resources\\ru_syntagrus_models\\ru_syntagrus.pretrain.pt'
        }

        snlp = stanfordnlp.Pipeline(**config)
        nlp = StanfordNLPLanguage(snlp)

        text_list = df["Text"].values
        lower_text_list = [text.lower() for text in text_list]
        clean_text_list = []
        for text in lower_text_list:
            doc = nlp(text)
            lemmas = [token.lemma_ for token in doc if not (token.is_punct or token.is_stop)]
            clean_text_list.append(lemmas)

        return clean_text_list
Example #4
File: util.py  Project: mattboggess/tokn
def tag_relations(text, terms, bags, nlp=None):
    """ Modified version of tag relations that handles the special case of making predictions
        on new data without known relation labels.
    """

    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)

    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if (type(text) != spacy.tokens.doc.Doc
            and type(text) != spacy.tokens.span.Span):
        text = nlp(text)

    results = tag_terms(text, terms, nlp)
    tokenized_text = results["tokenized_text"]
    tagged_text = results["tags"]
    found_terms_info = results["found_terms"]

    found_terms = list(found_terms_info.keys())
    for i in range(len(found_terms) - 1):
        for j in range(i + 1, len(found_terms)):
            term_pair = (found_terms[i], found_terms[j])
            bags = add_relation(term_pair, found_terms_info, tokenized_text,
                                bags)
            term_pair_reverse = (found_terms[j], found_terms[i])
            bags = add_relation(term_pair_reverse, found_terms_info,
                                tokenized_text, bags)

    return bags
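Usage sketch (not from the original project): the term strings and sentence below are illustrative, and the pipeline is built the same way as the default branch above so it can be reused across sentences.

import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)

terms = [nlp(t) for t in ["cell", "cell wall"]]  # illustrative term list, preprocessed once
bags = {"no-relation": []}                       # same seed structure used by the caller in Example #7
doc = nlp("A biologist will tell you that a cell contains a cell wall.")
for sent in doc.sents:
    bags = tag_relations(sent, terms, bags, nlp)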
Example #5
def _init_nlp(model_or_lang, is_tokenized, disable_sbd, use_stanfordnlp):
    if model_or_lang is None:
        model_or_lang = 'en' if use_stanfordnlp else 'en_core_web_sm'

    nlp = None
    if use_stanfordnlp:
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang,
                                    tokenize_pretokenized=is_tokenized)
        nlp = StanfordNLPLanguage(snlp)
    else:
        # Initialize the spaCy model with a custom pipe,
        # taking into account 'is_tokenized' and 'disable_sbd'
        nlp = spacy.load(model_or_lang)
        if is_tokenized:
            nlp.tokenizer = nlp.tokenizer.tokens_from_list
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name='prevent-sbd', before='parser')

    conllformatter = ConllFormatter(nlp)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
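A minimal usage sketch (illustrative, not part of the original module); the `conll_str` extension follows its use in Example #8, and `en_core_web_sm` must be installed for the spaCy branch.

# Sketch: run the helper with the plain spaCy backend and print the CoNLL output.
nlp = _init_nlp('en_core_web_sm', is_tokenized=False, disable_sbd=False, use_stanfordnlp=False)
doc = nlp('This is a test sentence.')
print(doc._.conll_str)  # Doc extension registered by ConllFormatter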
Example #6
def geocode_entries():
    snlp = stanfordnlp.Pipeline(lang='ru')
    nlp = StanfordNLPLanguage(snlp)
    entries = Entry.objects.all()
    for entry in tqdm.tqdm(entries):
        doc = nlp(entry.text)
        words = [
            token.text for token in doc
            if not token.is_punct and not token.is_stop
        ]
        geolocator = Nominatim(user_agent="prozhito_db")
        for word in words:
            location = geolocator.geocode(word)
            if location:
                print(location)
Example #7
def main(config, input_text, terms, out_dir, model_version):
    logger = config.get_logger('test')
    
    # set up spacy nlp engine
    warnings.filterwarnings('ignore')
    sys.stdout = open(os.devnull, "w")
    snlp = stanfordnlp.Pipeline(lang="en")
    nlp = StanfordNLPLanguage(snlp)
    sys.stdout = sys.__stdout__
    
    # read in text and terms 
    with open(input_text, "r") as f:
        lines = f.readlines()
    if terms.endswith(".txt"):
        with open(terms, "r") as f:
            terms = f.readlines()
    elif terms.endswith(".json"):
        with open(terms, "r") as f:
            terms = list(json.load(f).keys())
        
        
    # build input term pair bags
    terms = [nlp(term, disable=["ner", "parser"]) for term in terms]
    bags = {"no-relation": []}
    print("Preprocessing Data")
    for line in tqdm(lines):
        if len(line.strip()) == 0:
            continue
        doc = nlp(line, disable=["ner", "parser"])
        for sent in doc.sents:
            bags = tag_relations(sent, terms, bags, nlp)
    
    # write out to tmp file for loading which we delete later
    tmp_input_file = "./relations_tmp.json"
    with open(tmp_input_file, "w") as f:
        json.dump(bags, f)
    
    print("Predicting Relations")
    predictions = relation_model_predict(config, logger)
    predictions = postprocess_relation_predictions(predictions)
    
    os.remove(tmp_input_file)
                
    input_filename = input_text.split("/")[-1][:-4]
    filename = f"{out_dir}/{input_filename}_{model_version}_predicted_relations.json"
    with open(filename, "w") as f:
        json.dump(predictions, f, indent=4)
Example #8
class RuleBasedPreprocessor(PreprocessorBase):
    """ For rule based conversion,
    entire conversion should happen in the preprocessor
    """
    def __init__(self) -> None:
        snlp = stanfordnlp.Pipeline(lang='en')  # stanfordnlp python pipeline
        self.nlp = StanfordNLPLanguage(snlp)  # spaCy wrapper for snlp
        conllformatter = ConllFormatter(self.nlp)
        self.nlp.add_pipe(conllformatter, last=True)
        self.detokenizer = MosesDetokenizer()
        self.vanila_preprocessor = PreprocessorBase()

    def __call__(self, q: str, o: str) -> Tuple[str, Dict]:
        if '_' in q:  # FITB. Do it and return early
            h, meta = self.vanila_preprocessor(q, o)

            return h, meta

        if o in q:
            # most likely a preprocessed FITB question
            meta = {'question': q, 'option': o}

            return q, meta

        # the old code throws UserWarnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            q_doc = self.nlp(q)
            o_doc = self.nlp(o)
        try:
            q_conll_dict = parse(q_doc._.conll_str)[0].tokens
            o_conll_dict = parse(o_doc._.conll_str)[0].tokens
        except IndexError:
            logger.error(f"Index error on parse for {q}")
            h = q + ' ' + o
            meta: Dict[str, Any] = {
                'question': q,
                'option': o,
                'conversion_issues': [str(ConversionIssue.UNKNOWN)]
            }

            return h, meta

        rule_q = Question(deepcopy(q_conll_dict))  # type:ignore
        rule_o = AnswerSpan(deepcopy(o_conll_dict))  # type:ignore
        conversion_issues = []
        meta = {'question': q, 'option': o}

        if not rule_q.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_QUESTION)

        if not rule_o.isvalid:
            conversion_issues.append(ConversionIssue.INVALID_OPTION)
        # if conversion issue is encountered just concat q + o

        if conversion_issues:
            h = q + ' ' + o
        else:
            rule_q.insert_answer_default(rule_o)
            h = self.detokenizer.detokenize(rule_q.format_declr(),
                                            return_str=True)
        meta['conversion_issues'] = [str(issue) for issue in conversion_issues]

        if meta['conversion_issues']:
            logger.debug(
                f"Issues {conversion_issues} encountered for {q} + {o}")

        return h, meta
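A brief usage sketch, assuming the class is constructed as above; the question and option strings are illustrative.

preprocessor = RuleBasedPreprocessor()
h, meta = preprocessor("Where does photosynthesis take place?", "in the chloroplast")
print(h)                           # declarative hypothesis, or question + option if conversion failed
print(meta['conversion_issues'])   # empty list when the rule-based conversion succeeded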
Example #9
def spacy_stanfordnlp_en_with_formatter():
    snlp = stanfordnlp.Pipeline(lang='en')
    nlp = StanfordNLPLanguage(snlp)
    conllformatter = ConllFormatter(nlp)
    nlp.add_pipe(conllformatter, last=True)
    return nlp
Example #10
import spacy
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

from extraction.extractor import PhraseExtractor, PhraseHighlighter

if __name__ == '__main__':
    # Examples from SIFRank
    text_1 = "NuVox shows staying power with new cash, new market Who says you can't raise cash in today's telecom market? NuVox Communications positions itself for the long run with $78.5 million in funding and a new credit facility"
    text_2 = "This paper deals with two questions: Does social capital determine innovation in manufacturing firms? If it is the case, to what extent? To deal with these questions, we review the literature on innovation in order to see how social capital came to be added to the other forms of capital as an explanatory variable of innovation. In doing so, we have been led to follow the dominating view of the literature on social capital and innovation which claims that social capital cannot be captured through a single indicator, but that it actually takes many different forms that must be accounted for. Therefore, to the traditional explanatory variables of innovation, we have added five forms of structural social capital (business network assets, information network assets, research network assets, participation assets, and relational assets) and one form of cognitive social capital (reciprocal trust). In a context where empirical investigations regarding the relations between social capital and innovation are still scanty, this paper makes contributions to the advancement of knowledge in providing new evidence regarding the impact and the extent of social capital on innovation at the two decisionmaking stages considered in this study"

    # stanfordnlp.download('en')
    nlp = spacy.load('en_core_web_sm')
    corenlp = StanfordNLPLanguage(stanfordnlp.Pipeline(lang="en"))

    spacy_native = PhraseExtractor(nlp, np_method='NOUN_CHUNKS')
    spacy_grammar = PhraseExtractor(nlp,
                                    grammar='GRAMMAR1',
                                    np_method='NOUN_CHUNKS',
                                    np_tags='NLTK',
                                    stopwords='NLTK')
    corenlp_grammar = PhraseExtractor(corenlp,
                                      grammar='GRAMMAR1',
                                      np_method='NOUN_CHUNKS',
                                      np_tags='NLTK',
                                      stopwords='NLTK')

    # SHOW RESULTS
    # grammar method (corenlp tags)
    print(PhraseHighlighter.to_html(text_1, corenlp_grammar.run(text_1)))
    print(PhraseHighlighter.to_html(text_2, corenlp_grammar.run(text_2)))
Example #11
def __init__(self):
    self.name = 'StanfordNLP'
    self.snlp = stanfordnlp.Pipeline(
        lang='fi', models_dir='data/stanfordnlp_resources')
    self.nlp = StanfordNLPLanguage(self.snlp)
Example #12
def init_nlp(config):
    if config.get('name') == NLPs.SPACY:
        return spacy.load(config.get('model_name'))
    elif config.get('name') == NLPs.CORENLP:
        return StanfordNLPLanguage(
            stanfordnlp.Pipeline(lang=config.get('model_name')))
Example #13
def init_parser(
    parser: str = "spacy",
    model_or_lang: str = "en",
    *,
    is_tokenized: bool = False,
    disable_sbd: bool = False,
    parser_opts: Optional[Dict] = None,
    **kwargs,
) -> Language:
    """Initialise a spacy-wrapped parser given a language or model and some options.
    :param parser: which parser to use. Parsers other than 'spacy' need to be installed separately. Valid options are
           'spacy', 'stanfordnlp', 'stanza', 'udpipe'. Note that the spacy-* wrappers of those libraries need to be
           installed, e.g. spacy-stanza. Defaults to 'spacy'
    :param model_or_lang: language model to use (must be installed). Defaults to an English model
    :param is_tokenized: indicates whether your text has already been tokenized (space-separated). For stanza and
           stanfordnlp, this will also cause sentence segmentation *only* to be done by splitting on new lines.
           See the documentation: https://stanfordnlp.github.io/stanfordnlp/tokenize.html
           See the documentation: https://stanfordnlp.github.io/stanza/tokenize.html
    :param disable_sbd: disables spaCy automatic sentence boundary detection (only works for spaCy)
    :param parser_opts: will be passed to the core pipeline. For spacy and udpipe, it will be passed to their
           `.load()` initialisations; for stanfordnlp and stanza, `parser_opts` is passed to their `.Pipeline()`
           initialisations
    :param kwargs: options to be passed to the ConllFormatter initialisation
    :return: an initialised Language object; the parser
    """
    parser_opts = {} if parser_opts is None else parser_opts

    if parser == "spacy":
        nlp = spacy.load(model_or_lang, **parser_opts)
        if is_tokenized:
            nlp.tokenizer = SpacyPretokenizedTokenizer(nlp.vocab)
        if disable_sbd:
            nlp.add_pipe(_prevent_sbd, name="prevent-sbd", before="parser")
    elif parser == "stanfordnlp":
        from spacy_stanfordnlp import StanfordNLPLanguage
        import stanfordnlp

        snlp = stanfordnlp.Pipeline(lang=model_or_lang,
                                    tokenize_pretokenized=is_tokenized,
                                    **parser_opts)
        nlp = StanfordNLPLanguage(snlp)
    elif parser == "stanza":
        import stanza
        from spacy_stanza import StanzaLanguage

        snlp = stanza.Pipeline(lang=model_or_lang,
                               tokenize_pretokenized=is_tokenized,
                               **parser_opts)
        nlp = StanzaLanguage(snlp)
    elif parser == "udpipe":
        import spacy_udpipe

        nlp = spacy_udpipe.load(model_or_lang, **parser_opts)
    else:
        raise ValueError(
            "Unexpected value for 'parser'. Options are: 'spacy', 'stanfordnlp', 'stanza', 'udpipe'"
        )

    conllformatter = ConllFormatter(nlp, **kwargs)
    nlp.add_pipe(conllformatter, last=True)

    return nlp
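A short usage sketch following the docstring above (illustrative, not part of the library's own examples); the `conll_str` extension name matches its use in Example #8.

# Parse with the stanfordnlp backend and print the CoNLL-U representation.
nlp = init_parser("stanfordnlp", "en", is_tokenized=False)
doc = nlp("The quick brown fox jumps over the lazy dog.")
print(doc._.conll_str)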
Example #14
import json
import multiprocessing as mp
import warnings
from pathlib import Path
from typing import *

import tqdm
import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage
from spacy_conll import ConllFormatter
from sacremoses import MosesDetokenizer  # assumed import path for MosesDetokenizer

# In[2]:

# stanfordnlp.download('en')
# Config
input_path = Path('train_has_following.json')
output_path = Path('.data/RACE/train_has_following_reconverted.json')

# In[3]:

snlp = stanfordnlp.Pipeline(lang='en')
nlp = StanfordNLPLanguage(snlp)
conllformatter = ConllFormatter(nlp)
nlp.add_pipe(conllformatter, last=True)
detokenizer = MosesDetokenizer()

# In[4]:

# load data
with open(input_path) as f:
    samples = json.load(f)

# In[5]:

warnings.filterwarnings("ignore")

Example #15
from collections import Counter

import stanfordnlp
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from spacy_stanfordnlp import StanfordNLPLanguage

adjectives = []
verbs = []
sentiment = 0
raw = open('suomi24vuodet2/suomi24kommentit2017.txt').read()
sentences = sent_tokenize(raw)
stop_words = set(stopwords.words('finnish'))

# Top 20 most common named entities in the comments
words = [
    "asia", "Suomi", "Turkki", "Helsinki", "Thaimaa", "Kanaria", "Kreikka",
    "Australia", "USA", "Thaimaa", "Alanyassa", "Italia", "Bulgaria", "Intia",
    "Gambia", "Teneriffa", "Turku", "Tunisia", "Tampere", "Usa"
]

# initialize spaCy with the StanfordNLP pipeline model for Finnish
snlp = stanfordnlp.Pipeline(lang="fi", processors="tokenize,mwt,lemma,pos")
nlp = StanfordNLPLanguage(snlp)

# Find adjectives and verbs in the comments:

for sentence in sentences:
    for word in words:
        if word in sentence:
            doc = nlp(sentence)
            for token in doc:
                if (token.text not in stop_words and token.pos_ == "ADJ"):
                    print(token.text, token.pos_)
                    adjectives.append((token.text))
                if (token.text not in stop_words and token.pos_ == "VERB"):
                    print(token.text, token.pos_)
                    verbs.append((token.text))
a = Counter(adjectives)
Example #16
def RuBERT_ents():
    deleted_entries = []
    entries = Entry.objects.filter(
        ~Q(RuBERT=True))  #Load all entries where RuBERT is not true
    # Process each entry's text in token blocks to avoid RuntimeError: CUDA out of memory
    snlp = stanfordnlp.Pipeline(lang='ru', use_gpu=False)
    ner_model = build_model(
        configs.ner.ner_rus_bert,
        download=True)  # This will download the model if not present
    for entry in tqdm.tqdm(entries):
        try:
            if entry.text is not None and len(entry.text) > 0:
                # Error in entry
                """{'_state': <django.db.models.base.ModelState at 0x7fcc7e6ef5f8>,
                 'id': 226316,
                 'text': '          ',
                 'lemmatized': '          \n',
                 'date_start': datetime.date(1943, 3, 23),
                 'date_end': None,
                 'author_id': 978,
                 'diary': 988,
                 'sentiment': None,
                 'RuBERT': False}"""
                # Throws a stanfordnlp assertion error: assert input_str is not None and len(input_str) > 0, conll.py line 20
                # Deleted the entry and all runs well; come back to this if recurring

                nlp = StanfordNLPLanguage(snlp)
                doc = nlp(entry.text)
                block_size = 200
                token_blocks = [
                    doc[i * block_size:(i + 1) * block_size]
                    for i in range((len(doc) + block_size - 1) // block_size)
                ]
                for block in token_blocks:
                    sent_text = " ".join(
                        [token.lemma_ for token in block]
                    )  #Limit to first 510 subtokens to avoid 'RuntimeError: input sequence after bert tokenization shouldn't exceed 512 tokens.''
                    try:
                        result = ner_model([sent_text])
                        for i in range(len(result[0][0])):
                            token = result[0][0][i]
                            ent = result[1][0][i]

                            if 'B-' in ent:  # token begins an entity span
                                ent_type = ent.split('-')[1]
                                span = find_span(result, i)
                                ent_text = ' '.join([
                                    token
                                    for token in result[0][0][span[0]:span[1]]
                                ])
                                print('found', ent_type, ent_text, 'in span',
                                      span)
                                if ent_type == 'LOC':
                                    try:
                                        geolocator = Nominatim(
                                            user_agent="prozhito_db")
                                        location = geolocator.geocode(ent_text)
                                        if location:
                                            place = Place.objects.get_or_create(
                                                name=location[0],
                                                geom=Point(
                                                    location.longitude,
                                                    location.latitude))
                                            entry.places.add(place[0])
                                            entry.save()
                                    except Exception as e:
                                        print(e)
                                        place = Place.objects.get_or_create(
                                            name=ent_text, )
                                        entry.places.add(place[0])
                                        entry.save()

                                if ent_type == 'ORG':
                                    Keyword.objects.update_or_create(
                                        name=ent_text, )

                                if ent_type == 'PER':
                                    extractor = NamesExtractor()
                                    matches = extractor(sent_text)
                                    if not len(matches) == 0:
                                        for match in matches:
                                            if match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.last} ')

                                            if match.fact.first and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.last} ')

                                            if match.fact.first and match.fact.middle:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    patronymic=match.fact.middle,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.middle} ')

                                            if match.fact.first and match.fact.middle and match.fact.last:
                                                person = Person.objects.get_or_create(
                                                    first_name=match.fact.first,
                                                    patronymic=match.fact.middle,
                                                    family_name=match.fact.last,
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {match.fact.first} {match.fact.middle} {match.fact.last} ')

                                    else:
                                        names = ent_text.split(' ')
                                        #if len(names) == 1:
                                        #    person = Person.objects.update_or_create(family_name=names[0], from_natasha=True)
                                        #    entry.people.add(person[0])
                                        #    entry.save()
                                        #    print(f'[*] added person {names[0]} ')

                                        #if len(names) == 2:
                                        #    person = Person.objects.update_or_create(first_name=names[0], family_name=names[1], from_natasha=True)
                                        #    entry.people.add(person[0])
                                        #    entry.save()
                                        #    print(f'[*] added person {names[0]} {names[1]} ')
                                        punct = ['.', ',', '-', ';', ':']
                                        if len(names) == 3:
                                            # only add the person if none of the name parts is punctuation
                                            if not any(token in punct for token in names):
                                                person = Person.objects.update_or_create(
                                                    first_name=names[0],
                                                    patronymic=names[1],
                                                    family_name=names[2],
                                                    from_natasha=True)
                                                entry.people.add(person[0])
                                                entry.save()
                                                print(f'[*] added person {names[0]} {names[1]} {names[2]} ')

                    except Exception as e:
                        print(e)

                entry.RuBERT = True
                entry.save()
        except AssertionError:
            print(f"Stanfordnlp assertion error, deleting entry {entry.id}")
            deleted_entries.append(entry)
            entry.delete()

    [print(entry.id, entry.text) for entry in deleted_entries]
Example #17
File: util.py  Project: mattboggess/tokn
def tag_terms(text, terms, nlp=None):
    """ Identifies and tags any terms in a given input text.

    Searches through the input text and finds all terms (single words and phrases) that are present
    in the list of provided terms. Returns a list of found terms with indices and POS tagging as 
    well as a BIOES tagged version of the sentence denoting where the terms are in the sentences. 
    
    Additionally classifies terms as either entities or events and annotates the presence of the
    terms in the original sentence with these labels.
  
    Uses spaCy functionality to tokenize and lemmatize the text for matching (it is recommended to
    preprocess with spaCy before calling to prevent repeated work if calling multiple times).

    Gives precedence to longer terms first so that terms that are part of a larger term phrase
    are ignored (i.e. match 'cell wall', not 'cell' within the phrase cell wall). 

    Parameters
    ----------
    text: str | spacy.tokens.doc.Doc
        Input text that will be/are preprocessed using spacy and searched for terms
    terms: list of str | list of spacy.tokens.doc.Doc
        List of input terms that will be/are preprocessed using spacy. 
    nlp: 
        Spacy nlp pipeline object that will tokenize, POS tag, lemmatize, etc. 

    Returns
    -------
    dict with four entries: 
        tokenized_text: tokenized text as list of tokens
        tags: list of BIOES tags for the tokenized text
        annotated_text: original text with <entity> and <event> tags put around found terms 
        found_terms: list of found terms each with list of indices where matches were found,
        basic part of speech information, and entity/event tag

    Examples
    --------
    >>> tag_terms('A biologist will tell you that a cell contains a cell wall.',
                  ['cell', 'cell wall', 'biologist'])
    
    {'tokenized_text': ['A', 'biologist', 'will', 'tell', 'you', 'that', 'a', 'cell', 'contains', 
                        'a', 'cell', 'wall', '.'], 
     'tags': ['O', 'S', 'O', 'O', 'O', 'O', 'O', 'S', 'O', 'O', 'B', 'E', 'O'], 
     'annotated_text': 'A <entity>biologist</entity> will tell you that a <entity>cell</entity> 
                        contains a <entity>cell wall</entity>.', 
     'found_terms': {
         'cell wall': {'text': ['cell wall'], 'indices': [(10, 12)], 'pos': ['NN NN'], 
                       'type': ['Entity']}, 
         'biologist': {'text': ['biologist'], 'indices': [(1, 2)], 'pos': ['NN'], 
                       'type': ['Entity']}, 
         'cell': {'text': ['cell'], 'indices': [(7, 8)], 'pos': ['NN'], 'type': ['Entity']}}}
    """
    from spacy.lang.en.stop_words import STOP_WORDS
    spacy.tokens.token.Token.set_extension('workaround',
                                           default='',
                                           force=True)

    HEURISTIC_TOKENS = ["-", "plant", "substance", "atom"]

    # default to Stanford NLP pipeline wrapped in Spacy
    if nlp is None:
        snlp = stanfordnlp.Pipeline(lang="en")
        nlp = StanfordNLPLanguage(snlp)

    # preprocess with spacy if needed
    if type(terms[0]) != spacy.tokens.doc.Doc:
        terms = [nlp(term) for term in terms]
    if (type(text) != spacy.tokens.doc.Doc
            and type(text) != spacy.tokens.span.Span):
        text = nlp(text)

    # set up a custom representation of the text where we can add term type annotations
    for token in text:
        token._.workaround = token.text_with_ws

    lemmatized_text = [token.lemma_ for token in text]
    tokenized_text = [token.text for token in text]
    tags = ['O'] * len(text)
    found_terms = defaultdict(lambda: {
        "text": [],
        "indices": [],
        "pos": [],
        "type": []
    })

    # iterate through terms from longest to shortest
    terms = sorted(terms, key=len)[::-1]
    for spacy_term in terms:
        term_length = len(spacy_term)
        lemma_term_list = [token.lemma_ for token in spacy_term]
        text_term_list = [token.text for token in spacy_term]
        term_lemma = " ".join(lemma_term_list)

        # skip short acronyms that can cause problems
        if len(term_lemma) <= 2:
            continue

        # additional check to check for simple plural of uncommon biology terms
        match_uncommon_plural = lemma_term_list.copy()
        match_uncommon_plural[-1] = match_uncommon_plural[-1] + "s"

        # additional check using heuristics on lemmatized version
        match_heuristic = []
        if lemma_term_list[0] not in HEURISTIC_TOKENS:
            for token in lemma_term_list:
                if token not in HEURISTIC_TOKENS:
                    match_heuristic += token.split("-")
            heuristic_length = len(match_heuristic)
        else:
            heuristic_term = lemma_term_list
            heuristic_length = len(lemma_term_list)

        for ix in range(len(text) - term_length + 1):  # allow a match ending at the final token

            heuristic_match = (
                lemmatized_text[ix:ix + heuristic_length] == match_heuristic)
            plural_match = (
                lemmatized_text[ix:ix + term_length] == match_uncommon_plural)
            lemma_match = (lemmatized_text[ix:ix +
                                           term_length] == lemma_term_list)
            text_match = (tokenized_text[ix:ix +
                                         term_length] == text_term_list)
            lower_match = ([
                t.lower() for t in tokenized_text[ix:ix + term_length]
            ] == [t.lower() for t in text_term_list])

            # Only match on text if lemmatized version is a stop word (i.e. lower casing acronym)
            if term_lemma in STOP_WORDS:
                valid_match = text_match
            else:
                valid_match = heuristic_match or plural_match or text_match or lemma_match or lower_match

            if valid_match:

                if heuristic_match and not lemma_match:
                    match_length = heuristic_length
                else:
                    match_length = term_length

                term_text = " ".join(
                    [t.text for t in text[ix:ix + match_length]])
                term_tag = " ".join(
                    [t.tag_ for t in text[ix:ix + match_length]])

                # only tag term if not part of larger term
                if tags[ix:ix + match_length] == ["O"] * match_length:

                    # classify term type
                    term_type = determine_term_type(spacy_term)

                    # collect term information
                    found_terms[term_lemma]["text"].append(term_text)
                    found_terms[term_lemma]["indices"].append(
                        (ix, ix + match_length))
                    found_terms[term_lemma]["pos"].append(term_tag)
                    found_terms[term_lemma]["type"].append(term_type)

                    # update sentence tags
                    tags = tag_bioes(tags, ix, match_length)

                    # annotate token representations with term type
                    text[ix]._.workaround = f"<{term_type}>" + text[
                        ix]._.workaround
                    end_ix = ix + match_length - 1
                    if text[end_ix]._.workaround.endswith(" "):
                        text[end_ix]._.workaround = text[
                            end_ix]._.workaround[:-1] + f"</{term_type}> "
                    else:
                        text[end_ix]._.workaround += f"</{term_type}>"

    # reconstruct fully annotated input text
    annotated_text = ""
    for token in text:
        annotated_text += token._.workaround

    return {
        "tokenized_text": tokenized_text,
        "tags": tags,
        "annotated_text": annotated_text,
        "found_terms": dict(found_terms)
    }
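A minimal calling sketch mirroring the docstring example (illustrative; it assumes the module's own helpers determine_term_type and tag_bioes are available alongside tag_terms). Passing a pre-built nlp avoids re-creating the StanfordNLP pipeline on every call.

import stanfordnlp
from spacy_stanfordnlp import StanfordNLPLanguage

snlp = stanfordnlp.Pipeline(lang="en")
nlp = StanfordNLPLanguage(snlp)   # build once and reuse
result = tag_terms("A biologist will tell you that a cell contains a cell wall.",
                   ["cell", "cell wall", "biologist"], nlp)
print(result["tags"])
print(result["found_terms"]["cell wall"]["indices"])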