Example #1
    def __init__(self, language: str = "en", mode: str = "lookup") -> None:
        """
        SpacyLemmatizer constructor.

        Parameters
        ----------
        language : str
            Language of the lemmatizer; "en" is shorthand for
            the "en_core_web_sm" model.
            For the list of supported languages,
            see https://spacy.io/usage/models#languages.
            Default: "en".
        mode : str
            The lemmatizer mode. By default, the following modes are available:
            "lookup" and "rule". Default: "lookup".
        """

        language = "en_core_web_sm" if language == "en" else language
        nlp = load_spacy_model_or_raise(language, disable=["parser", "ner"])

        try:
            # spaCy < 3.0
            from spacy.lemmatizer import Lemmatizer

            is_spacy_old = True
        except ImportError:
            # spaCy >= 3.0
            from typing import List

            from spacy.pipeline import Lemmatizer
            from spacy.tokens import Doc

            is_spacy_old = False

        if is_spacy_old:
            lemmatizer = Lemmatizer(nlp.vocab.lookups)

            if mode == "lookup":
                lemmatizer.lookups.remove_table("lemma_rules")
                lemmatizer.lookups.remove_table("lemma_index")
                lemmatizer.lookups.remove_table("lemma_exc")
            else:
                lemmatizer.lookups.remove_table("lemma_lookup")

            def lemmatize(tokenized):
                # lookup() returns the token unchanged when it is
                # not found in the lemma table
                return [lemmatizer.lookup(token) for token in tokenized]

        else:
            lemmatizer = Lemmatizer(nlp.vocab, None, mode=mode)
            try:
                # The lookup/rule tables ship separately
                # as the spacy-lookups-data package
                lemmatizer.initialize()
            except ValueError as err:
                raise ValueError(
                    "SpaCy lookups data is missing. "
                    "Visit https://spacy.io/usage/models "
                    "for more information on how to install it."
                ) from err

            def tokenizer(text: List[str]) -> Doc:
                # Wrap pre-tokenized input in a Doc,
                # bypassing spaCy's own tokenization
                return Doc(nlp.vocab, text)

            nlp.tokenizer = tokenizer

            def lemmatize(tokenized):
                return [token.lemma_ for token in lemmatizer(nlp(tokenized))]

        self._lemmatize = lemmatize
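
# Usage sketch (illustrative, not part of the source): it assumes the
# enclosing SpacyLemmatizer class, an installed "en_core_web_sm" model,
# and the spacy-lookups-data package. Only the internal _lemmatize hook
# set above is shown here; the class's public API is not visible in this
# fragment.
#
#     lemmatizer = SpacyLemmatizer(language="en", mode="lookup")
#     lemmas = lemmatizer._lemmatize(["The", "bats", "were", "hanging"])
#     # lookup mode maps each token through spaCy's lemma table,
#     # e.g. "bats" -> "bat"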