def from_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
) -> "Transformer":
    """Load the pipe from disk.

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (Transformer): The loaded object.

    DOCS: https://spacy.io/api/transformer#from_disk
    """

    def load_model(p):
        try:
            with open(p, "rb") as mfile:
                self.model.from_bytes(mfile.read())
        except AttributeError:
            raise ValueError(Errors.E149) from None

    deserialize = {
        "vocab": self.vocab.from_disk,
        "cfg": lambda p: self.cfg.update(deserialize_config(p)),
        "model": load_model,
    }
    util.from_disk(path, deserialize, exclude)
    return self

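# All of the from_disk variants in this collection delegate to
# spacy.util.from_disk, which walks a dict of {filename: reader} callbacks.
# A minimal sketch of that helper (assumed behavior paraphrased from spaCy's
# public util module; not the authoritative implementation):
def _from_disk_sketch(path, readers, exclude):
    from pathlib import Path

    path = Path(path)
    for key, reader in readers.items():
        # Skip any field the caller asked to exclude, e.g. exclude=["vocab"].
        if key not in exclude:
            reader(path / key)
    return path
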
def from_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
) -> "Transformer":
    """Load the pipe from disk.

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (Transformer): The loaded object.

    DOCS: https://spacy.io/api/transformer#from_disk
    """

    def load_model(p):
        p = Path(p).absolute()
        tokenizer, transformer = huggingface_classification_from_pretrained(
            p, self.model.attrs["tokenizer_config"]
        )
        self.model.attrs["tokenizer"] = tokenizer
        self.model.attrs["set_transformer"](self.model, transformer)

    deserialize = {
        "vocab": self.vocab.from_disk,
        "cfg": lambda p: self.cfg.update(deserialize_config(p)),
        "model": load_model,
    }
    util.from_disk(path, deserialize, exclude)
    return self

def from_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "MorphologizerLemmatizer":
    """Load the pipe from disk, restoring its lookups tables."""
    deserialize = {"lookups": lambda p: self.lookups.from_disk(p)}
    util.from_disk(path, deserialize, exclude)
    return self

def from_disk(self, path: Path, **kwargs):
    """Deserialize a saved RemoteAnnLinker from disk.

    path (Path): directory to deserialize from
    RETURNS (RemoteAnnLinker): Initialized RemoteAnnLinker
    """
    path = ensure_path(path)
    cfg = {}
    deserializers = {"cfg": lambda p: cfg.update(srsly.read_json(p))}
    from_disk(path, deserializers, {})
    self.cfg.update(cfg)
    self.base_url = cfg.get("base_url")
    self.headers = cfg.get("headers", {})
    return self

def from_disk(self, path, exclude=tuple(), **kwargs):
    """Load the pipe and its model from disk."""

    def load_model(p):
        p = Path(p).absolute()
        tokenizer, transformer = huggingface_from_pretrained(
            p, self.model.attrs["tokenizer_config"]
        )
        self.model.attrs["tokenizer"] = tokenizer
        self.model.attrs["set_transformer"](self.model, transformer)

    deserialize = {
        "vocab": self.vocab.from_disk,
        "cfg": lambda p: self.cfg.update(_load_cfg(p)),
        "model": load_model,
    }
    exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
    util.from_disk(path, deserialize, exclude)
    return self

def from_disk(
    self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
) -> "Transformer":
    """Load the pipe from disk.

    path (str / Path): Path to a directory.
    exclude (Iterable[str]): String names of serialization fields to exclude.
    RETURNS (Transformer): The loaded object.

    DOCS: https://spacy.io/api/transformer#from_disk
    """

    def load_model(p):
        try:
            with open(p, "rb") as mfile:
                self.model.from_bytes(mfile.read())
        except AttributeError:
            raise ValueError(Errors.E149) from None
        except (IsADirectoryError, PermissionError):
            warn_msg = (
                "Automatically converting a transformer component "
                "from spacy-transformers v1.0 to v1.1+. If you see errors "
                "or degraded performance, download a newer compatible "
                "model or retrain your custom model with the current "
                "spacy-transformers version. For more details and "
                "available updates, run: python -m spacy validate"
            )
            warnings.warn(warn_msg)
            p = Path(p).absolute()
            hf_model = huggingface_from_pretrained(
                p,
                self.model._init_tokenizer_config,
                self.model._init_transformer_config,
            )
            self.model.attrs["set_transformer"](self.model, hf_model)

    deserialize = {
        "vocab": self.vocab.from_disk,
        "cfg": lambda p: self.cfg.update(deserialize_config(p)),
        "model": load_model,
    }
    util.from_disk(path, deserialize, exclude)
    return self

def from_disk(self, path: Path, **kwargs):
    """Deserialize CandidateGenerator data from disk.

    path (Path): Directory to deserialize data from
    RETURNS (CandidateGenerator): Initialized CandidateGenerator
    """
    aliases_path = f"{path}/aliases.json"
    short_aliases_path = f"{path}/short_aliases.json"
    ann_index_path = f"{path}/ann_index.bin"
    tfidf_vectorizer_path = f"{path}/tfidf_vectorizer.joblib"
    tfidf_vectors_path = f"{path}/tfidf_vectors_sparse.npz"

    cfg = {}
    deserializers = {"cg_cfg": lambda p: cfg.update(srsly.read_json(p))}
    from_disk(path, deserializers, {})

    self.k = cfg.get("k", 5)
    self.m_parameter = cfg.get("m_parameter", 100)
    self.ef_search = cfg.get("ef_search", 200)
    self.ef_construction = cfg.get("ef_construction", 2000)
    self.n_threads = cfg.get("n_threads", 60)

    aliases = srsly.read_json(aliases_path)
    short_aliases = srsly.read_json(short_aliases_path)
    tfidf_vectorizer = joblib.load(tfidf_vectorizer_path)
    alias_tfidfs = scipy.sparse.load_npz(tfidf_vectors_path).astype(np.float32)
    ann_index = nmslib.init(
        method="hnsw",
        space="cosinesimil_sparse",
        data_type=nmslib.DataType.SPARSE_VECTOR,
    )
    ann_index.addDataPointBatch(alias_tfidfs)
    ann_index.loadIndex(str(ann_index_path))
    query_time_params = {"efSearch": self.ef_search}
    ann_index.setQueryTimeParams(query_time_params)

    self._initialize(
        aliases, short_aliases, ann_index, tfidf_vectorizer, alias_tfidfs
    )
    return self

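# For context, a hypothetical to_disk counterpart that would produce the files
# read above. This is a sketch under the assumptions that the same file names
# are reused and that the CandidateGenerator exposes aliases, short_aliases,
# tfidf_vectorizer, alias_tfidfs, and ann_index as attributes; it is not
# scispacy's actual serializer.
def to_disk_sketch(cg, path):
    import joblib
    import scipy.sparse
    import srsly

    cfg = {
        "k": cg.k,
        "m_parameter": cg.m_parameter,
        "ef_search": cg.ef_search,
        "ef_construction": cg.ef_construction,
        "n_threads": cg.n_threads,
    }
    srsly.write_json(f"{path}/cg_cfg", cfg)
    srsly.write_json(f"{path}/aliases.json", cg.aliases)
    srsly.write_json(f"{path}/short_aliases.json", cg.short_aliases)
    joblib.dump(cg.tfidf_vectorizer, f"{path}/tfidf_vectorizer.joblib")
    scipy.sparse.save_npz(f"{path}/tfidf_vectors_sparse.npz", cg.alias_tfidfs)
    # nmslib indices are written with saveIndex; loadIndex above restores one.
    cg.ann_index.saveIndex(f"{path}/ann_index.bin")
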
def from_disk(self, path, **_kwargs):
    """Load the component's config from disk."""
    path = util.ensure_path(path)
    deserializers = OrderedDict(
        [("cfg", lambda p: self._set_config(srsly.read_json(p)))]
    )
    util.from_disk(path, deserializers, [])
    return self

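# Usage sketch: in practice these hooks are rarely called directly. Saving and
# loading a pipeline triggers each component's to_disk/from_disk with its own
# subdirectory. Illustrative only; the blank "en" pipeline and the path are
# placeholders, not values from the snippets above.
import spacy

nlp = spacy.blank("en")
nlp.to_disk("/tmp/my_pipeline")        # writes one subdirectory per pipe
nlp2 = spacy.load("/tmp/my_pipeline")  # calls each pipe's from_disk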