def test_serialize_doc_roundtrip_disk(en_vocab):
    """Check that a Doc saved with ``to_disk`` and reloaded with ``from_disk``
    serializes to the same bytes as the Doc it came from.

    ``en_vocab`` is supplied by the test suite (fixture defined outside this
    block) and is shared by the original and the reloaded Doc.
    """
    original = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "doc"
        original.to_disk(target)
        # Load into a fresh, empty Doc built on the same vocab.
        restored = Doc(en_vocab).from_disk(target)
        expected_bytes = original.to_bytes()
        actual_bytes = restored.to_bytes()
        assert expected_bytes == actual_bytes
def _read_utf8_or_exit(path):
    """Return the text stored at *path*, decoded as UTF-8.

    Prints an error message and exits the process when the file does not
    exist or is not valid UTF-8. Any other I/O error propagates unchanged.
    """
    try:
        # Text mode ('r') decodes the file and read() returns str; binary
        # mode ('rb') would return bytes instead. The encoding is explicit so
        # the result does not depend on the platform's default locale.
        with open(path, "r", encoding="utf-8") as handle:
            text = handle.read()
    except FileNotFoundError:
        print("\nError! The Serialization file cannot be read!\n")
        # sys.exit() raises SystemExit (os._exit() would instead kill the
        # interpreter immediately, skipping all remaining cleanup).
        # NOTE(review): exit status 0 is kept for backward compatibility;
        # a non-zero status would signal the failure to calling scripts.
        sys.exit(0)
    except UnicodeDecodeError:
        print("\nError! The .txt file must be UTF-8 encoded format!\n")
        # Stop here: continuing would leave `text` unbound.
        sys.exit(0)
    print("\nInfo: The Serialization file can be read.\n")
    return text


def _print_entities(ents):
    """Print one tab-separated row per entity span in *ents*."""
    for ent in ents:
        # The IOB tag and entity type are read from the entity's own first
        # token. The entity's ordinal position inside `ents` is not a token
        # index, so it must not be used to index the Doc.
        first_token = ent[0]
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            first_token.ent_iob_, first_token.ent_type_))


def Serialization(text_path="/home/wangdi498/SpaCy/diary2.txt",
                  first_dump_path="/home/wangdi498/SpaCy/diary1.bin",
                  second_dump_path="/home/wangdi498/SpaCy/diary2.bin"):
    """Demonstrate saving and reloading a spaCy Doc, printing its entities.

    Steps:
      1. Read *text_path* and parse it with the module-level ``nlp`` object
         (defined outside this block; presumably a loaded spaCy pipeline).
      2. Save the Doc to *first_dump_path*, reload it into a fresh Vocab and
         print the reloaded text.
      3. Print the parsed entities, append an ``EVENT`` span over tokens
         0-2, save to *second_dump_path* and print the entities again.
      4. Reload *second_dump_path* and print the reloaded entities.

    Args:
        text_path: UTF-8 text file to parse.
        first_dump_path: Where the unmodified Doc is written.
        second_dump_path: Where the Doc with the extra entity is written.

    Raises:
        SystemExit: If *text_path* is missing or is not valid UTF-8.
        AssertionError: If the parsed or the reloaded Doc has no entities,
            or if the reloaded entities are exactly
            ``[('2018年9月27日', 'EVENT')]``.
    """
    # Imported locally so the function does not rely on these names being
    # imported at module level.
    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    print("\nThe outcomes of Serialization are:")
    text = _read_utf8_or_exit(text_path)

    doc = nlp(text)
    doc.to_disk(first_dump_path)
    restored = Doc(Vocab()).from_disk(first_dump_path)
    print("The texts are:\n{}".format(restored))

    print("\nThe 1st round of Save and Load is:")
    _print_entities(doc.ents)
    # Guard against a Doc without entities.
    assert len(doc.ents) != 0, "\nError! This document cannot be empty!"

    # NOTE(review): spaCy is expected to reject overlapping entity spans; if
    # tokens 0-2 already belong to an entity this assignment may raise.
    augment = [Span(doc, 0, 2, label=doc.vocab.strings[u'EVENT'])]
    doc.ents = list(doc.ents) + augment
    doc.to_disk(second_dump_path)

    print("\nThe 2nd round of Save and Load is:")
    _print_entities(doc.ents)

    paragraph = Doc(Vocab()).from_disk(second_dump_path)
    # Guard against a reloaded Doc without entities.
    assert len(paragraph.ents) != 0, "\nError! This document cannot be empty!"

    print("\nThe 3rd round of Save and Load is:")
    _print_entities(paragraph.ents)

    reloaded_pairs = [(ent.text, ent.label_) for ent in paragraph.ents]
    # The message names the last reloaded entity explicitly instead of
    # depending on a loop variable that outlived its loop.
    assert reloaded_pairs != [(u'2018年9月27日', u'EVENT')], (
        "\nHere! The entity '%s' has matched the specified one."
        % paragraph.ents[-1].text)
def test_tensorwrapper(tmp_path: Path):
    """Check that a Doc carrying a ``TensorWrapper`` in ``user_data`` can be
    written to disk and read back.

    ``tmp_path`` is a temporary directory supplied by the test runner.
    """
    doc = Doc(Vocab(), words=["This", "is", "a", "test"])
    doc.user_data["tensor"] = TensorWrapper(torch.zeros((2, 2)), 1)
    doc_file = tmp_path / "doc.bin"
    doc.to_disk(doc_file)
    # The file holds a single serialized Doc, so it is read back with
    # Doc.from_disk. spacy.load() is for pipeline packages/directories and
    # cannot load a Doc file.
    reloaded = Doc(Vocab()).from_disk(doc_file)
    assert reloaded.text == doc.text