예제 #1
0
def test_language_wordpiece_to_from_bytes(name):
    """Round-trip a sentencizer + wordpiecer pipeline through ``to_bytes``.

    A first pipeline is built with a wordpiecer obtained from
    ``from_pretrained`` and must produce word pieces. A second pipeline,
    whose wordpiecer comes from the plain constructor, is expected to raise
    ``ValueError`` when called; once it has been restored with ``from_bytes``
    from the first pipeline's bytes it must produce word pieces as well.
    """
    nlp = PyTT_Language()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wordpiecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, pytt_name=name)
    nlp.add_pipe(wordpiecer)
    doc = nlp("hello world")
    assert doc._.pytt_word_pieces is not None

    nlp2 = PyTT_Language()
    # Each pipeline creates its own components, so nlp2 asks itself (not
    # nlp) for the sentencizer.
    nlp2.add_pipe(nlp2.create_pipe("sentencizer"))
    nlp2.add_pipe(PyTT_WordPiecer(nlp2.vocab))
    with pytest.raises(ValueError):
        # The call itself is what must fail; its result is never used.
        nlp2("hello world")

    nlp2.from_bytes(nlp.to_bytes())
    new_doc = nlp2("hello world")
    assert new_doc._.pytt_word_pieces is not None
예제 #2
0
    def __init__(self,
                 model_name: str,
                 model_path: Optional[str] = None) -> None:
        """
        Create the spaCy pipeline for this object and record its output size.

        Accepts either a model_name alone (e.g. en_pytt_xlnetbasecased_lg),
        which is passed to ``spacy.load``, or a model name together with a
        local model path (e.g. xlnet-large-cased and
        /local/mlinde/xlnet-large-cased), in which case the pipeline is
        assembled from a sentencizer, a word piecer and a token-vector encoder.
        see https://github.com/explosion/spacy-pytorch-transformers#loading-models-from-a-path for how to prepare a model
        :param model_name: key into ``NAME_TO_DIM``; also the name given to
            ``spacy.load`` or, when a path is supplied, to ``PyTT_Language``
            and the word piecer.
        :param model_path: optional location given to
            ``PyTT_TokenVectorEncoder.from_pretrained``.
        :raises ValueError: if ``model_name`` is not a key of ``NAME_TO_DIM``
            (checked after the pipeline has been created).
        """
        super().__init__()
        with SwitchDefaultTensor():
            if not model_path:
                # Name only: let spaCy resolve the model.
                self.nlp = spacy.load(model_name)
            else:
                # Name + path: build the pipeline component by component.
                self.nlp = PyTT_Language(pytt_name=model_name,
                                         meta={"lang": "en"})
                sentencizer = self.nlp.create_pipe("sentencizer")
                self.nlp.add_pipe(sentencizer)
                word_piecer = PyTT_WordPiecer.from_pretrained(
                    self.nlp.vocab, model_name)
                self.nlp.add_pipe(word_piecer)
                encoder = PyTT_TokenVectorEncoder.from_pretrained(
                    self.nlp.vocab, model_path)
                self.nlp.add_pipe(encoder)
        if model_name not in NAME_TO_DIM:
            known_names = str(list(NAME_TO_DIM.keys()))
            raise ValueError("Model name is unknown, I know " + known_names)
        self.output_dim = NAME_TO_DIM[model_name]
예제 #3
0
def nlp(name):
    """Assemble and return a ``PyTT_Language`` pipeline for ``name``.

    Components are added in this order: a sentencizer, a word piecer and a
    token-vector encoder; the latter two are created with ``from_pretrained``.
    """
    pipeline = PyTT_Language(pytt_name=name)

    sentencizer = pipeline.create_pipe("sentencizer")
    pipeline.add_pipe(sentencizer)

    word_piecer = PyTT_WordPiecer.from_pretrained(pipeline.vocab,
                                                  pytt_name=name)
    pipeline.add_pipe(word_piecer)

    encoder = PyTT_TokenVectorEncoder.from_pretrained(pipeline.vocab,
                                                      name=name)
    pipeline.add_pipe(encoder)
    return pipeline
예제 #4
0
def test_language_wordpiece_tok2vec_to_from_bytes(nlp, name):
    """Round-trip the full pipeline (word piecer + encoder) through bytes.

    A second pipeline built from plain constructors is expected to raise
    ``ValueError`` when called; after ``from_bytes`` with the bytes of ``nlp``
    it must yield a valid tensor and word pieces.
    """
    text = "hello world"
    reference_doc = nlp(text)
    assert is_valid_tensor(reference_doc.tensor)

    nlp2 = PyTT_Language()
    sentencizer = nlp2.create_pipe("sentencizer")
    nlp2.add_pipe(sentencizer)
    # The bare components are created on the vocab of the reference pipeline.
    bare_word_piecer = PyTT_WordPiecer(nlp.vocab)
    nlp2.add_pipe(bare_word_piecer)
    bare_encoder = PyTT_TokenVectorEncoder(nlp.vocab)
    nlp2.add_pipe(bare_encoder)

    with pytest.raises(ValueError):
        new_doc = nlp2(text)

    serialized = nlp.to_bytes()
    nlp2.from_bytes(serialized)
    new_doc = nlp2(text)
    assert is_valid_tensor(new_doc.tensor)
    assert new_doc._.pytt_word_pieces is not None
def main(path, name="bert-base-uncased", lang="en"):
    """Build a PyTT pipeline for ``name`` and write it to ``path``.

    The pipeline consists of a sentencizer, a word piecer and a token-vector
    encoder (the latter two created with ``from_pretrained``). Progress and
    the final pipeline names and location are reported through a ``Printer``.
    """
    msg = Printer()
    model_label = f"'{name}' ({lang})"
    msg.info(f"Creating model for {model_label}")
    with msg.loading("Setting up the pipeline..."):
        nlp = PyTT_Language(pytt_name=name, meta={"lang": lang})
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
        word_piecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(word_piecer)
        encoder = PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, name)
        nlp.add_pipe(encoder)
    msg.good("Initialized the model pipeline")
    nlp.to_disk(path)
    msg.good(f"Saved {model_label}")
    msg.text(f"Pipeline: {nlp.pipe_names}")
    msg.text(f"Location: {path}")
def main(
    name="bert-base-uncased",
    n_texts=1000,
    lang="en",
    skip=False,
    retry=False,
    force=False,
):
    """Run the wordpiecer over a large dataset to surface misalignments.

    When both the retry and the force flag are set, this script should
    always pass. (Both default to False in this signature.)

    * retry: If alignment fails after cleaning and normalizing both sets of
        tokens, try again with a more aggressive strategy that strips out all
        characters that are not uppercase/lowercase letters.
    * force: If alignment still fails, run the word-piece tokenizer on the
        individual spaCy tokens, so that alignment is trivial. This should
        always work.
    * skip: Report assertion failures and carry on with the next document
        instead of exiting or re-raising.
    """
    nlp = get_lang_class(lang)()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    wp = PyTT_WordPiecer.from_pretrained(
        nlp.vocab,
        pytt_name=name,
        retry_alignment=retry,
        force_alignment=force,
    )
    msg.good(f"Loaded WordPiecer for model '{name}'")

    with msg.loading("Loading IMDB data..."):
        data, _ = thinc.extra.datasets.imdb(limit=n_texts)
    texts, _ = zip(*data)
    n_docs = len(texts)
    msg.good(f"Using {n_docs} texts from IMDB data")
    msg.info("Processing texts...")

    sent_counts = 0
    for doc in tqdm.tqdm(nlp.pipe(texts), total=n_docs):
        try:
            doc = wp(doc)
            sent_counts += sum(1 for _ in doc.sents)
        except AssertionError as err:
            details = err.args
            if not details:
                # Bare assertion with no payload: re-raise unless skipping.
                if not skip:
                    raise err
                print(err)
            elif isinstance(details[0], tuple):  # Misaligned error
                left, right = details[0]
                msg.fail("Misaligned tokens")
                print(diff_strings(left, right))
                if not skip:
                    sys.exit(1)
            else:
                exit_code = None if skip else 1
                msg.fail(f"Error: {details[0]}", exits=exit_code)
    msg.good(f"Processed {n_docs} documents ({sent_counts} sentences)")
예제 #7
0
def test_language_to_from_disk(nlp, name):
    """Save the pipeline to a temporary directory and load it into a new one.

    The reloaded pipeline must expose the same pipe names and produce a
    tensor equal to the one from the original pipeline for the same text.
    """
    text = "hello world"
    expected_doc = nlp(text)
    assert is_valid_tensor(expected_doc.tensor)

    with make_tempdir() as tempdir:
        nlp.to_disk(tempdir)
        new_nlp = PyTT_Language()
        sentencizer = new_nlp.create_pipe("sentencizer")
        new_nlp.add_pipe(sentencizer)
        # Both components are constructed first, then added in this order.
        components = (
            PyTT_WordPiecer(new_nlp.vocab, pytt_name=name),
            PyTT_TokenVectorEncoder(new_nlp.vocab, pytt_name=name),
        )
        for component in components:
            new_nlp.add_pipe(component)
        new_nlp.from_disk(tempdir)

    assert new_nlp.pipe_names == nlp.pipe_names
    reloaded_doc = new_nlp(text)
    assert is_valid_tensor(reloaded_doc.tensor)
    assert_equal(expected_doc.tensor, reloaded_doc.tensor)
예제 #8
0
    def __init__(self,
                 model_name: str,
                 model_path: Optional[str] = None) -> None:
        """
        Build the underlying spaCy pipeline and record its output dimension.

        :param model_name: key into ``NAME_TO_DIM``. Passed to ``spacy.load``
            when no ``model_path`` is given; otherwise used as the
            ``pytt_name`` of the ``PyTT_Language`` and as the name for the
            word piecer.
        :param model_path: optional location given to
            ``PyTT_TokenVectorEncoder.from_pretrained``. When falsy, the
            pipeline is obtained with ``spacy.load(model_name)`` instead.
        :raises ValueError: if ``model_name`` is not a key of ``NAME_TO_DIM``
            (checked after the pipeline has been created).
        """
        super().__init__()
        with SwitchDefaultTensor():
            if model_path:
                # Assemble the pipeline by hand: sentencizer, word piecer
                # (by name) and token-vector encoder (from the given path).
                self.nlp = PyTT_Language(pytt_name=model_name,
                                         meta={"lang": "en"})
                self.nlp.add_pipe(self.nlp.create_pipe("sentencizer"))
                self.nlp.add_pipe(
                    PyTT_WordPiecer.from_pretrained(self.nlp.vocab,
                                                    model_name))
                self.nlp.add_pipe(
                    PyTT_TokenVectorEncoder.from_pretrained(
                        self.nlp.vocab, model_path))

            else:
                self.nlp = spacy.load(model_name)
        if model_name not in NAME_TO_DIM:
            raise ValueError("Model name is unknown, I know " +
                             str(list(NAME_TO_DIM.keys())))
        self.output_dim = NAME_TO_DIM[model_name]
def wordpiecer(name):
    """Return a ``PyTT_WordPiecer`` created with ``from_pretrained`` for ``name``.

    ``vocab`` is neither a parameter nor a local of this function, so it is
    resolved from the enclosing scope.
    """
    piecer = PyTT_WordPiecer.from_pretrained(vocab, pytt_name=name)
    return piecer
예제 #10
0
from spacy_pytorch_transformers import PyTT_Language, PyTT_WordPiecer, PyTT_TokenVectorEncoder
from pathlib import Path

# Source and destination directories, both directly under the current user's
# home directory: the model passed to ``from_pretrained`` and the location
# the assembled spaCy pipeline is written to.
pytorch_path = str(Path.home().joinpath("pytorch-rubert"))
spacy_path = str(Path.home().joinpath("spacy-rubert"))
name = "ru_pytt_rubert_cased"

# Assemble the pipeline: sentencizer, then word piecer, then encoder.
nlp = PyTT_Language(pytt_name=name, meta={"lang": "ru"})
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
word_piecer = PyTT_WordPiecer.from_pretrained(nlp.vocab, pytorch_path)
nlp.add_pipe(word_piecer)
encoder = PyTT_TokenVectorEncoder.from_pretrained(nlp.vocab, pytorch_path)
nlp.add_pipe(encoder)

print(nlp.pipe_names)
nlp.to_disk(spacy_path)
예제 #11
0
def wp(name):
    """Return a ``PyTT_WordPiecer`` created with ``from_pretrained`` for ``name``.

    A new ``Vocab`` instance is created for every call.
    """
    fresh_vocab = Vocab()
    return PyTT_WordPiecer.from_pretrained(fresh_vocab, pytt_name=name)