def doc():
    """Build a fully annotated ten-token ``Doc`` for "Sarah's sister flew ...".

    The returned doc is constructed with tags, coarse POS, morphology,
    heads, dependency labels, lemmas and entity IOB tags, and has its
    ``cats`` set to two text categories.
    """
    # A new English() is created on every call so each doc gets a new vocab.
    pipeline = English()
    # fmt: off
    annotations = {
        "words": ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."],
        "tags": ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."],
        "pos": ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"],
        "morphs": ["NounType=prop|Number=sing", "Poss=yes", "Number=sing", "Tense=past|VerbForm=fin", "", "NounType=prop|Number=sing", "NounType=prop|Number=sing", "", "NounType=prop|Number=sing", "PunctType=peri"],
        # The head of "." is deliberately non-projective for testing.
        "heads": [2, 0, 3, 3, 3, 6, 4, 3, 7, 5],
        "deps": ["poss", "case", "nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"],
        "lemmas": ["Sarah", "'s", "sister", "fly", "to", "Silicon", "Valley", "via", "London", "."],
        # One IOB tag per token; every token outside an entity is "O".
        "ents": ["B-PERSON", "I-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"],
    }
    # fmt: on
    annotated = Doc(pipeline.vocab, **annotations)
    annotated.cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    return annotated
def doc(nlp):
    """Build a ten-token ``Doc`` on ``nlp.vocab`` with tags, POS and entities.

    nlp: Object whose ``vocab`` attribute is passed to ``Doc``.
    RETURNS: The constructed ``Doc`` with its ``cats`` assigned.
    """
    # fmt: off
    tokens = ["Sarah", "'s", "sister", "flew", "to", "Silicon", "Valley", "via", "London", "."]
    fine_tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
    coarse_tags = ["PROPN", "PART", "NOUN", "VERB", "ADP", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
    iob_tags = ["B-PERSON", "I-PERSON", "O", "O", "O", "B-LOC", "I-LOC", "O", "B-GPE", "O"]
    # fmt: on
    result = Doc(
        nlp.vocab,
        words=tokens,
        tags=fine_tags,
        pos=coarse_tags,
        ents=iob_tags,
    )
    result.cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    return result
def test_serialize_doc_roundtrip_bytes(en_vocab):
    """Check that bytes -> Doc -> bytes reproduces the original byte string."""
    original = Doc(en_vocab, words=["hello", "world"])
    original.cats = {"A": 0.5}
    payload = original.to_bytes()
    # Load the payload into an empty doc on the same vocab, then re-serialize.
    restored = Doc(en_vocab).from_bytes(payload)
    assert restored.to_bytes() == payload