def __init__(self, language: str = "en", mode: str = "lookup") -> None:
    """
    SpacyLemmatizer constructor.

    Builds a lemmatization callable compatible with both the SpaCy<3.0 and
    SpaCy>=3.0 lemmatizer APIs and stores it as ``self._lemmatize``.

    Parameters
    ----------
    language : str
        Language argument for the lemmatizer. For the list of supported
        languages, see https://spacy.io/usage/models#languages.
        Default: "en".
    mode : str
        The lemmatizer mode. By default, the following modes are available:
        "lookup" and "rule".
        Default: "lookup".

    Raises
    ------
    ValueError
        If the SpaCy lookups data required by the lemmatizer is missing
        (SpaCy>=3.0 only).
    """
    # "en" alone is not a model name; map it to the small English model.
    language = "en_core_web_sm" if language == "en" else language
    # Parser and NER are not needed for lemmatization; disabling them
    # speeds up model loading and per-call processing.
    nlp = load_spacy_model_or_raise(language, disable=["parser", "ner"])

    # The Lemmatizer class moved between SpaCy 2.x and 3.x; detect which
    # API is available by attempting the old import path first.
    try:
        # SpaCy<3.0
        from spacy.lemmatizer import Lemmatizer

        is_spacy_old = True
    except ImportError:
        # SpaCy>=3.0
        from spacy.pipeline import Lemmatizer
        from spacy.tokens import Doc

        is_spacy_old = False

    if is_spacy_old:
        lemmatizer = Lemmatizer(nlp.vocab.lookups)
        # The old API selects its strategy by which lookup tables exist,
        # so remove the tables belonging to the strategy NOT requested.
        if mode == "lookup":
            lemmatizer.lookups.remove_table("lemma_rules")
            lemmatizer.lookups.remove_table("lemma_index")
            lemmatizer.lookups.remove_table("lemma_exc")
        else:
            lemmatizer.lookups.remove_table("lemma_lookup")

        def lemmatize(tokenized):
            return [lemmatizer.lookup(token) for token in tokenized]

    else:
        lemmatizer = Lemmatizer(nlp.vocab, None, mode=mode)
        try:
            lemmatizer.initialize()
        except ValueError as err:
            # BUGFIX: the original message concatenated "...models" and
            # "for more..." without a space, yielding "modelsfor more".
            raise ValueError(
                "SpaCy lookups data is missing. "
                "Visit https://spacy.io/usage/models "
                "for more information on how to install it."
            ) from err

        # Input is already tokenized (a list of strings); bypass SpaCy's
        # tokenizer by wrapping the tokens in a Doc directly.
        def tokenizer(text: List[str]) -> Doc:
            return Doc(nlp.vocab, text)

        nlp.tokenizer = tokenizer

        def lemmatize(tokenized):
            return [token.lemma_ for token in lemmatizer(nlp(tokenized))]

    self._lemmatize = lemmatize