def test_language_wordpiece_to_from_bytes(name):
    """Round-trip a sentencizer + word-piecer pipeline through bytes.

    A second pipeline whose word-piecer is constructed directly (not via
    ``from_pretrained``) is expected to raise ``ValueError`` when called, and
    to produce word pieces once it has been restored from the bytes of the
    pretrained pipeline.
    """
    nlp = PyTT_Language()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wordpiecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, pytt_name=name)
    nlp.add_pipe(wordpiecer)
    doc = nlp("hello world")
    assert doc._.pytt_word_pieces is not None

    nlp2 = PyTT_Language()
    # Create the sentencizer from nlp2 itself so the second pipeline does not
    # borrow its component factory from the first one.
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(PyTT_WordPiecer(nlp2.vocab))
    with pytest.raises(ValueError):
        # Only the exception matters here; the result is not needed.
        nlp2("hello world")

    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert new_doc._.pytt_word_pieces is not None
def __init__(self, model_name: str, model_path: Optional[str] = None) -> None:
    """
    Build the spaCy pipeline for the given model.

    Pass either a model name on its own (e.g. en_pytt_xlnetbasecased_lg), or
    a model name together with a local model path (e.g. xlnet-large-cased and
    /local/mlinde/xlnet-large-cased). See
    https://github.com/explosion/spacy-pytorch-transformers#loading-models-from-a-path
    for how to prepare a model.

    :param model_name: name handed to ``spacy.load`` (no path given) or to
        ``PyTT_Language`` / ``PyTT_WordPiecer.from_pretrained`` (path given);
        it must also be a key of ``NAME_TO_DIM``.
    :param model_path: optional path; when truthy, the token vector encoder
        is loaded from it.
    :raises ValueError: if ``model_name`` is not a key of ``NAME_TO_DIM``
        (checked after the pipeline has been built).
    """
    super().__init__()
    with SwitchDefaultTensor():
        if not model_path:
            self.nlp = spacy.load(model_name)
        else:
            pipeline = PyTT_Language(pytt_name=model_name, meta={"lang": "en"})
            sentencizer = pipeline.create_pipe("sentencizer")
            pipeline.add_pipe(sentencizer)
            word_piecer = PyTT_WordPiecer.from_pretrained(
                pipeline.vocab, model_name)
            pipeline.add_pipe(word_piecer)
            # The encoder weights come from the local path, not the name.
            encoder = PyTT_TokenVectorEncoder.from_pretrained(
                pipeline.vocab, model_path)
            pipeline.add_pipe(encoder)
            self.nlp = pipeline

    if model_name not in NAME_TO_DIM:
        known_names = list(NAME_TO_DIM.keys())
        raise ValueError(f"Model name is unknown, I know {known_names}")
    self.output_dim = NAME_TO_DIM[model_name]
def nlp(name):
    """Return a ``PyTT_Language`` pipeline for ``name``.

    The pipeline holds, in order, a sentencizer, a ``PyTT_WordPiecer`` and a
    ``PyTT_TokenVectorEncoder``; the latter two are created with
    ``from_pretrained`` using ``name``.
    """
    pipeline = PyTT_Language(pytt_name=name)

    sentencizer = pipeline.create_pipe("sentencizer")
    pipeline.add_pipe(sentencizer)

    word_piecer = PyTT_WordPiecer.from_pretrained(pipeline.vocab, pytt_name=name)
    pipeline.add_pipe(word_piecer)

    encoder = PyTT_TokenVectorEncoder.from_pretrained(pipeline.vocab, name=name)
    pipeline.add_pipe(encoder)

    return pipeline
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    """Round-trip the full pipeline (word-piecer + encoder) through bytes.

    A pipeline assembled from directly constructed components is expected to
    raise ``ValueError`` when called; after ``from_bytes`` with the bytes of
    ``nlp`` it must yield a valid tensor and word pieces.
    """
    original_doc = nlp("hello world")
    assert is_valid_tensor(original_doc.tensor)

    restored = PyTT_Language()
    restored.add_pipe(restored.create_pipe("sentencizer"))
    # NOTE(review): both components are built on nlp.vocab rather than the
    # new pipeline's own vocab -- confirm this is intended.
    for component_cls in (PyTT_WordPiecer, PyTT_TokenVectorEncoder):
        restored.add_pipe(component_cls(nlp.vocab))

    with pytest.raises(ValueError):
        restored("hello world")

    serialized = nlp.to_bytes()
    restored.from_bytes(serialized)

    restored_doc = restored("hello world")
    assert is_valid_tensor(restored_doc.tensor)
    assert restored_doc._.pytt_word_pieces is not None
def main(path, name="bert-base-uncased", lang="en"):
    """Assemble a PyTT pipeline for ``name`` and save it to ``path``.

    The pipeline consists of a sentencizer, a ``PyTT_WordPiecer`` and a
    ``PyTT_TokenVectorEncoder``, the latter two created with
    ``from_pretrained(nlp.vocab, name)``. Progress is reported through a
    ``Printer`` instance.

    :param path: location passed to ``nlp.to_disk``.
    :param name: model name passed to ``PyTT_Language`` and both
        ``from_pretrained`` calls.
    :param lang: language code stored in the pipeline meta under ``"lang"``.
    """
    msg = Printer()
    msg.info(f"Creating model for '{name}' ({lang})")
    # Plain string: there is nothing to interpolate in this message.
    with msg.loading("Setting up the pipeline..."):
        nlp = PyTT_Language(pytt_name=name, meta={"lang": lang})
        nlp.add_pipe(nlp.create_pipe("sentencizer"))
        nlp.add_pipe(PyTT_WordPiecer.from_pretrained(nlp.vocab, name))
        nlp.add_pipe(PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, name))
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved '{name}' ({lang})")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
def main(
    name="bert-base-uncased",
    n_texts=1000,
    lang="en",
    skip=False,
    retry=False,
    force=False,
):
    """Run the wordpiecer over a large dataset to surface misalignments.

    With both the retry and the force flag set (which is the default runtime
    configuration), this script should always pass.

    * retry: If alignment fails after cleaning and normalizing both sets of
      tokens, try again with a more aggressive strategy that strips out all
      characters that are not uppercase/lowercase letters.
    * force: If alignment still fails, run the word-piece tokenizer on the
      individual spaCy tokens, so that alignment is trivial. This should
      always work.
    """
    # NOTE(review): the signature defaults for retry/force are False, while
    # the docstring calls "both set" the default runtime configuration --
    # confirm against whatever invokes this function.
    alignment_cfg = {"retry_alignment": retry, "force_alignment": force}
    nlp = get_lang_class(lang)()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wordpiecer = PyTT_WordPiecer.from_pretrained(
        nlp.vocab, pytt_name=name, **alignment_cfg
    )
    msg.good(f"Loaded WordPiecer for model '{name}'")

    with msg.loading("Loading IMDB data..."):
        data, _ = thinc.extra.datasets.imdb(limit=n_texts)
        texts, _ = zip(*data)
    msg.good(f"Using {len(texts)} texts from IMDB data")
    msg.info("Processing texts...")

    n_sentences = 0
    for doc in tqdm.tqdm(nlp.pipe(texts), total=len(texts)):
        try:
            doc = wordpiecer(doc)
            n_sentences += len(list(doc.sents))
        except AssertionError as e:
            err_args = e.args
            if err_args and isinstance(err_args[0], tuple):
                # Misaligned error: the tuple carries the two token sequences.
                left, right = err_args[0]
                msg.fail("Misaligned tokens")
                print(diff_strings(left, right))
                if not skip:
                    sys.exit(1)
            elif err_args:
                exit_code = None if skip else 1
                msg.fail(f"Error: {err_args[0]}", exits=exit_code)
            elif skip:
                print(e)
            else:
                raise e
    msg.good(f"Processed {len(texts)} documents ({n_sentences} sentences)")
def test_language_to_from_disk(nlp, name):
    """Save ``nlp`` to disk, load it into a fresh pipeline, compare outputs.

    The reloaded pipeline must report the same pipe names and produce a
    tensor equal to the one from the original pipeline for the same text.
    """
    text = "hello world"
    expected_doc = nlp(text)
    assert is_valid_tensor(expected_doc.tensor)

    with make_tempdir() as tempdir:
        nlp.to_disk(tempdir)

        reloaded = PyTT_Language()
        reloaded.add_pipe(reloaded.create_pipe("sentencizer"))
        # Components are constructed directly and then filled by from_disk.
        word_piecer = PyTT_WordPiecer(reloaded.vocab, pytt_name=name)
        encoder = PyTT_TokenVectorEncoder(reloaded.vocab, pytt_name=name)
        for component in (word_piecer, encoder):
            reloaded.add_pipe(component)
        reloaded.from_disk(tempdir)

    assert reloaded.pipe_names == nlp.pipe_names
    actual_doc = reloaded(text)
    assert is_valid_tensor(actual_doc.tensor)
    assert_equal(expected_doc.tensor, actual_doc.tensor)
def __init__(self, model_name: str, model_path: Optional[str]) -> None:
    """
    Build the spaCy pipeline for the given model.

    When ``model_path`` is truthy, a ``PyTT_Language`` pipeline is assembled
    from a sentencizer, a ``PyTT_WordPiecer`` loaded by ``model_name`` and a
    ``PyTT_TokenVectorEncoder`` loaded from ``model_path``. Otherwise the
    pipeline is obtained with ``spacy.load(model_name)``.

    :param model_name: model name; must be a key of ``NAME_TO_DIM``.
    :param model_path: path the token vector encoder is loaded from, or
        ``None`` to load ``model_name`` with ``spacy.load``.
    :raises ValueError: if ``model_name`` is not a key of ``NAME_TO_DIM``.
    """
    super().__init__()
    with SwitchDefaultTensor():
        if model_path:
            self.nlp = PyTT_Language(pytt_name=model_name, meta={"lang": "en"})
            self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
            self.nlp.add_pipe(
                PyTT_WordPiecer.from_pretrained(self.nlp.vocab, model_name))
            self.nlp.add_pipe(
                PyTT_TokenVectorEncoder.from_pretrained(
                    self.nlp.vocab, model_path))
        else:
            self.nlp = spacy.load(model_name)
    # Idiomatic membership test (`not in`) instead of `not ... in`.
    if model_name not in NAME_TO_DIM:
        raise ValueError("Model name is unknown, I know "
                         + str(list(NAME_TO_DIM.keys())))
    self.output_dim = NAME_TO_DIM[model_name]
def wordpiecer(name):
    """Return a ``PyTT_WordPiecer`` created with ``from_pretrained`` for ``name``."""
    # NOTE(review): `vocab` is not a parameter of this function; it is
    # presumably defined at module scope -- confirm.
    piecer = PyTT_WordPiecer.from_pretrained(vocab, pytt_name=name)
    return piecer
from pathlib import Path

from spacy_pytorch_transformers import (
    PyTT_Language,
    PyTT_TokenVectorEncoder,
    PyTT_WordPiecer,
)

# Input (PyTorch weights) and output (spaCy model) directories, both located
# in the current user's home directory.
pytorch_path = str(Path.home() / "pytorch-rubert")
spacy_path = str(Path.home() / "spacy-rubert")

name = "ru_pytt_rubert_cased"

# Assemble the pipeline: sentencizer -> word-piecer -> token vector encoder.
nlp = PyTT_Language(pytt_name=name, meta={"lang": "ru"})
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
# Both transformer components are loaded from the local PyTorch directory.
word_piecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, pytorch_path)
nlp.add_pipe(word_piecer)
encoder = PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, pytorch_path)
nlp.add_pipe(encoder)

print(nlp.pipe_names)

# Persist the assembled pipeline.
nlp.to_disk(spacy_path)
def wp(name):
    """Return a ``PyTT_WordPiecer`` for ``name`` backed by a fresh ``Vocab``."""
    fresh_vocab = Vocab()
    return PyTT_WordPiecer.from_pretrained(fresh_vocab, pytt_name=name)