Example #1
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ) -> "Transformer":
        """Load the pipe from disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (Transformer): The loaded object.

        DOCS: https://spacy.io/api/transformer#from_disk
        """

        def load_model(p):
            try:
                with open(p, "rb") as mfile:
                    self.model.from_bytes(mfile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None

        deserialize = {
            "vocab": self.vocab.from_disk,
            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
            "model": load_model,
        }
        util.from_disk(path, deserialize, exclude)
        return self
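
For reference, a round trip through this method looks like the following. The model name and target directory are illustrative assumptions; any pipeline with a transformer component (via spacy-transformers) would do.

import spacy

# Assumes spacy-transformers and the en_core_web_trf model are installed.
nlp = spacy.load("en_core_web_trf")
transformer = nlp.get_pipe("transformer")
transformer.to_disk("/tmp/transformer_pipe")    # serialize first...
transformer.from_disk("/tmp/transformer_pipe")  # ...then reload in place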
Example #2
File: subclasses.py  Project: MalteHB/DaCy
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ) -> "Transformer":
        """Load the pipe from disk.
        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (Transformer): The loaded object.
        DOCS: https://spacy.io/api/transformer#from_disk
        """

        def load_model(p):
            p = Path(p).absolute()
            tokenizer, transformer = huggingface_classification_from_pretrained(
                p, self.model.attrs["tokenizer_config"]
            )
            self.model.attrs["tokenizer"] = tokenizer
            self.model.attrs["set_transformer"](self.model, transformer)

        deserialize = {
            "vocab": self.vocab.from_disk,
            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
            "model": load_model,
        }
        util.from_disk(path, deserialize, exclude)
        return self
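
This DaCy subclass replaces the byte-level model loading with a Hugging Face reload. huggingface_classification_from_pretrained is DaCy's own helper, so the following is only a hedged approximation built on the public transformers API:

# Sketch under assumptions: AutoTokenizer and AutoModelForSequenceClassification
# are real transformers APIs, but this wiring is a guess, not DaCy's code.
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def classification_from_pretrained(path: Path, tokenizer_config: dict):
    tokenizer = AutoTokenizer.from_pretrained(str(path), **tokenizer_config)
    transformer = AutoModelForSequenceClassification.from_pretrained(str(path))
    return tokenizer, transformer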
Example #3
File: fi.py  Project: aajanki/spacy-fi
    def from_disk(
        self,
        path: Union[str, Path],
        *,
        exclude: Iterable[str] = SimpleFrozenList()
    ) -> "MorphologizerLemmatizer":
        # Only the lookup tables are serialized for this component.
        deserialize = {"lookups": lambda p: self.lookups.from_disk(p)}
        util.from_disk(path, deserialize, exclude)
        return self
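
All of these examples lean on the same callback convention: util.from_disk walks a dict of field names and applies each reader to path / name, skipping excluded fields. A simplified sketch of that contract (not spaCy's exact implementation):

from pathlib import Path
from typing import Callable, Dict, Iterable, Union

def from_disk(
    path: Union[str, Path],
    readers: Dict[str, Callable[[Path], None]],
    exclude: Iterable[str],
) -> Path:
    path = Path(path)
    for key, reader in readers.items():
        if key not in exclude:
            reader(path / key)  # each callback loads one serialization field
    return path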
Example #4
    def from_disk(self, path: Path, **kwargs):
        """Deserialize saved RemoteAnnLinker from disk.
        
        path (Path): directory to deserialize from
        
        RETURNS (RemoteAnnLinker): Initialized RemoteAnnLinker
        """
        path = ensure_path(path)
        cfg = {}
        deserializers = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
        from_disk(path, deserializers, {})
        self.cfg.update(cfg)
        self.base_url = cfg.get('base_url')
        self.headers = cfg.get('headers', {})

        return self
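
The "cfg" entry here is plain JSON handled by srsly, so the round trip is easy to reproduce in isolation. The directory and values below are made up for illustration:

from pathlib import Path
import srsly

path = Path("/tmp/remote_ann_linker")  # illustrative location
path.mkdir(parents=True, exist_ok=True)
cfg = {"base_url": "http://localhost:8080/link", "headers": {}}
srsly.write_json(path / "cfg", cfg)     # what a matching to_disk would write
print(srsly.read_json(path / "cfg"))    # what from_disk reads back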
Example #5
    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the pipe and its model from disk."""
        def load_model(p):
            p = Path(p).absolute()
            tokenizer, transformer = huggingface_from_pretrained(
                p, self.model.attrs["tokenizer_config"])
            self.model.attrs["tokenizer"] = tokenizer
            self.model.attrs["set_transformer"](self.model, transformer)

        deserialize = {
            "vocab": self.vocab.from_disk,
            "cfg": lambda p: self.cfg.update(_load_cfg(p)),
            "model": load_model,
        }
        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
        util.from_disk(path, deserialize, exclude)
        return self
Example #6
    def from_disk(
        self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
    ) -> "Transformer":
        """Load the pipe from disk.

        path (str / Path): Path to a directory.
        exclude (Iterable[str]): String names of serialization fields to exclude.
        RETURNS (Transformer): The loaded object.

        DOCS: https://spacy.io/api/transformer#from_disk
        """

        def load_model(p):
            try:
                with open(p, "rb") as mfile:
                    self.model.from_bytes(mfile.read())
            except AttributeError:
                raise ValueError(Errors.E149) from None
            except (IsADirectoryError, PermissionError):
                warn_msg = (
                    "Automatically converting a transformer component "
                    "from spacy-transformers v1.0 to v1.1+. If you see errors "
                    "or degraded performance, download a newer compatible "
                    "model or retrain your custom model with the current "
                    "spacy-transformers version. For more details and "
                    "available updates, run: python -m spacy validate"
                )
                warnings.warn(warn_msg)
                p = Path(p).absolute()
                hf_model = huggingface_from_pretrained(
                    p,
                    self.model._init_tokenizer_config,
                    self.model._init_transformer_config,
                )
                self.model.attrs["set_transformer"](self.model, hf_model)

        deserialize = {
            "vocab": self.vocab.from_disk,
            "cfg": lambda p: self.cfg.update(deserialize_config(p)),
            "model": load_model,
        }
        util.from_disk(path, deserialize, exclude)
        return self
Example #7
    def from_disk(self, path: Path, **kwargs):
        """Deserialize CandidateGenerator data from disk
        
        path (Path): Directory to deserialize data from
        
        RETURNS (CandidateGenerator): Initialized Candidate Generator
        """
        aliases_path = f"{path}/aliases.json"
        short_aliases_path = f"{path}/short_aliases.json"
        ann_index_path = f"{path}/ann_index.bin"
        tfidf_vectorizer_path = f"{path}/tfidf_vectorizer.joblib"
        tfidf_vectors_path = f"{path}/tfidf_vectors_sparse.npz"

        cfg = {}
        deserializers = {"cg_cfg": lambda p: cfg.update(srsly.read_json(p))}
        from_disk(path, deserializers, {})

        self.k = cfg.get("k", 5)
        self.m_parameter = cfg.get("m_parameter", 100)
        self.ef_search = cfg.get("ef_search", 200)
        self.ef_construction = cfg.get("ef_construction", 2000)
        self.n_threads = cfg.get("n_threads", 60)

        aliases = srsly.read_json(aliases_path)
        short_aliases = srsly.read_json(short_aliases_path)
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
        alias_tfidfs = scipy.sparse.load_npz(tfidf_vectors_path).astype(
            np.float32)
        ann_index = nmslib.init(
            method="hnsw",
            space="cosinesimil_sparse",
            data_type=nmslib.DataType.SPARSE_VECTOR,
        )
        ann_index.addDataPointBatch(alias_tfidfs)
        ann_index.loadIndex(str(ann_index_path))
        query_time_params = {"efSearch": self.ef_search}
        ann_index.setQueryTimeParams(query_time_params)

        self._initialize(aliases, short_aliases, ann_index, tfidf_vectorizer,
                         alias_tfidfs)

        return self
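
The nmslib calls above follow the library's standard HNSW workflow: init, add data, build or load an index, set query-time parameters, then query. A self-contained toy version (toy data and illustrative parameters, not the CandidateGenerator's real inputs):

import nmslib
import numpy as np
import scipy.sparse

# Toy sparse matrix standing in for alias_tfidfs.
data = scipy.sparse.random(100, 32, density=0.1, format="csr", dtype=np.float32)

index = nmslib.init(
    method="hnsw",
    space="cosinesimil_sparse",
    data_type=nmslib.DataType.SPARSE_VECTOR,
)
index.addDataPointBatch(data)
index.createIndex({"M": 100, "efConstruction": 2000})  # cf. m_parameter / ef_construction
index.setQueryTimeParams({"efSearch": 200})

# knnQueryBatch returns one (ids, distances) pair per query row.
ids, distances = index.knnQueryBatch(data[:1], k=5)[0]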
Example #8
    def from_disk(self, path, **_kwargs):
        # Restore only the "cfg" field; this wrapper keeps no other serialized state.
        path = util.ensure_path(path)
        serializers = OrderedDict(
            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
        )
        util.from_disk(path, serializers, [])