def test_serialize_transformer_data():
    """Round-trip ``TransformerData`` through msgpack and through ``Doc`` bytes.

    First, an empty ``TransformerData`` is dumped and reloaded with srsly's
    msgpack helpers and must come back as a ``TransformerData`` instance.
    Then a pipeline containing only a ``transformer`` component (configured
    with ``output_attentions`` enabled) processes a sentence; the resulting
    doc is serialized with ``to_bytes``, restored into a fresh ``Doc`` that
    shares the vocab, and every entry of ``model_output`` is compared.
    """
    # Part 1: msgpack round trip of an empty TransformerData.
    packed = srsly.msgpack_dumps({"x": TransformerData.empty()})
    restored = srsly.msgpack_loads(packed)
    assert isinstance(restored["x"], TransformerData)

    # Part 2: Doc.to_bytes / Doc.from_bytes round trip with real model output.
    transformer_config = {
        "model": {
            "name": "distilbert-base-uncased",
            "transformer_config": {"output_attentions": True},
        }
    }
    nlp = Language()
    nlp.add_pipe("transformer", config=transformer_config)
    nlp.initialize()
    original_doc = nlp("This is a test.")

    restored_doc = Doc(nlp.vocab)
    restored_doc.from_bytes(original_doc.to_bytes())
    assert_docs_equal(original_doc, restored_doc)

    # Compare each model output entry that the original doc carries.
    expected_output = original_doc._.trf_data.model_output
    actual_output = restored_doc._.trf_data.model_output
    for key in expected_output:
        assert_array_equal(expected_output[key], actual_output[key])
def check_serialization(
    nlp, text: str = "It is a serialization set. 今日はとてもいい天気だった!"
):
    """Save ``nlp`` to a temporary directory, reload it, and compare outputs.

    Args:
        nlp: Pipeline object exposing ``to_disk`` and callable on a string.
        text: Sample text run through both the original and the reloaded
            pipeline.

    Raises:
        AssertionError: Propagated from ``assert_docs_equal`` if the two
            resulting docs differ.
    """
    with tempfile.TemporaryDirectory() as workdir:
        model_dir = str(workdir)
        nlp.to_disk(model_dir)
        reloaded = spacy.load(model_dir)

        # Process with the original pipeline first, then the reloaded one.
        expected = nlp(text)
        actual = reloaded(text)
        assert_docs_equal(expected, actual)
def test_multiprocessing(simple_nlp, texts):
    """Check that ``pipe`` with two processes matches one-at-a-time processing.

    The comparison only runs when the current ops are ``NumpyOps``; for any
    other backend the test returns without asserting anything.
    """
    # NOTE(review): the original nesting was lost in formatting; this assumes
    # the entire remainder of the test is guarded by the NumpyOps check.
    if not isinstance(get_current_ops(), NumpyOps):
        return

    # Triple the inputs so that several batches are spread over the workers.
    batch = texts * 3
    expected_docs = [simple_nlp(text) for text in batch]
    piped_docs = simple_nlp.pipe(batch, n_process=2, batch_size=2)

    for piped, expected in zip(piped_docs, expected_docs):
        assert_docs_equal(piped, expected)
def test_serialization(nlp, tmpdir):
    """Save and reload the pipeline twice, comparing docs after each reload.

    Reference docs are produced once by the incoming ``nlp``. In each of two
    rounds the current pipeline is written to a numbered subdirectory of
    ``tmpdir``, loaded back (replacing ``nlp``), and its output on ``TEXTS``
    is compared against the reference docs.
    """
    expected_docs = [nlp(text) for text in TEXTS]

    # NOTE(review): the original nesting was lost in formatting; this assumes
    # the reload and comparison happen inside every round of the loop.
    for round_index in range(2):
        target = str(tmpdir + f"/{round_index}")
        nlp.to_disk(target)
        # The reloaded pipeline is the one saved again in the next round.
        nlp = spacy.load(target)

        reloaded_docs = [nlp(text) for text in TEXTS]
        for expected, actual in zip(expected_docs, reloaded_docs):
            assert_docs_equal(expected, actual)
def test_serialization(nlp, tmpdir):
    """Save the pipeline to ``tmpdir``, reload it, and compare docs on ``TEXTS``.

    The save, load and re-processing steps always run. The doc-by-doc
    comparison is skipped when the installed spaCy version is exactly
    ``2.2.4``.
    """
    model_dir = str(tmpdir)

    expected_docs = [nlp(text) for text in TEXTS]
    nlp.to_disk(model_dir)
    reloaded = spacy.load(model_dir)
    reloaded_docs = [reloaded(text) for text in TEXTS]

    if spacy.__version__ == "2.2.4":
        # This version of spaCy has a bug in `assert_docs_equal`.
        # See https://github.com/explosion/spaCy/issues/5144
        return

    for expected, actual in zip(expected_docs, reloaded_docs):
        assert_docs_equal(expected, actual)