def test_serialize_doc_roundtrip_disk(en_vocab):
    """A Doc saved with ``to_disk`` and reloaded with ``from_disk`` must
    serialize to the same bytes as the original."""
    source_doc = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "doc"
        source_doc.to_disk(target)
        # Reload into a new Doc built on the same vocab.
        reloaded = Doc(en_vocab).from_disk(target)
        expected_bytes = source_doc.to_bytes()
        actual_bytes = reloaded.to_bytes()
        assert expected_bytes == actual_bytes
示例#2
0
def test_serialize_doc_roundtrip_disk(en_vocab):
    """Write a two-word Doc to a temporary directory, read it back, and
    compare the byte serializations of both Docs."""
    words = ["hello", "world"]
    written = Doc(en_vocab, words=words)
    with make_tempdir() as scratch:
        doc_path = scratch / "doc"
        written.to_disk(doc_path)
        blank = Doc(en_vocab)
        # from_disk's return value is what gets compared below.
        read_back = blank.from_disk(doc_path)
        assert written.to_bytes() == read_back.to_bytes()
示例#3
0
def _print_entities(doc):
    """Print one tab-separated row per entity in ``doc.ents``.

    Each row holds the entity text, its start/end character offsets, its
    label, and the IOB tag and entity type of the entity's first token.
    """
    for ent in doc.ents:
        # ``ent.start`` is the token index of the entity's first token.
        # Using the entity's position inside ``doc.ents`` as a token index
        # would report the tags of an unrelated token.
        first_token = doc[ent.start]
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            first_token.ent_iob_, first_token.ent_type_))


def Serialization():
    """Save and reload a spaCy ``Doc`` several times, printing its entities.

    Steps:
      1. Read the diary text file and run it through the module-level
         ``nlp`` callable.
      2. Save the resulting Doc to ``diary1.bin``, load it back and print it.
      3. Re-parse the text, print its entities, append an ``EVENT`` span over
         the first two tokens, save to ``diary2.bin`` and print again.
      4. Load ``diary2.bin`` into a new Doc and print its entities.

    Raises:
        SystemExit: if the text file is missing (exit status 0, as before) or
            is not valid UTF-8 (exit status 1).
        AssertionError: if a parsed/loaded Doc has no entities, or if the
            reloaded entities are exactly ``[('2018年9月27日', 'EVENT')]``.
    """
    text_path = "/home/wangdi498/SpaCy/diary2.txt"
    first_dump = "/home/wangdi498/SpaCy/diary1.bin"
    second_dump = "/home/wangdi498/SpaCy/diary2.bin"

    print("\nThe outcomes of Serialization are:")
    try:
        # Text mode ('r') decodes the file and read() returns str; binary
        # mode ('rb') would return bytes instead. The encoding is explicit
        # because the error message below promises UTF-8.
        with open(text_path, 'r', encoding="utf-8") as handle:
            text = handle.read()
    except FileNotFoundError:
        print("\nError! The Serialization file cannot be read!\n")
        # Exit status 0 is kept from the original behaviour.
        sys.exit(0)
    except UnicodeDecodeError:
        print("\nError! The .txt file must be UTF-8 encoded format!\n")
        # Stop here: continuing would use ``text`` before it is assigned.
        sys.exit(1)
    else:
        print("\nInfo: The Serialization file can be read.\n")

    doc = nlp(text)
    doc.to_disk(first_dump)

    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    # NOTE(review): the Doc is reloaded into a fresh Vocab(); whether all
    # strings/labels resolve depends on spaCy's Doc serialization - confirm.
    doc = Doc(Vocab()).from_disk(first_dump)
    print("The texts are:\n{}".format(doc))

    from spacy.tokens import Span
    doc = nlp(text)
    print("\nThe 1st round of Save and Load is:")
    _print_entities(doc)
    # Guard against a Doc without entities.
    assert len(doc.ents) != 0, "\nError! This document cannot be empty!"
    # Add an EVENT entity spanning the first two tokens.
    # NOTE(review): presumably fails if it overlaps an existing entity.
    augment = [Span(doc, 0, 2, label=doc.vocab.strings[u'EVENT'])]
    doc.ents = list(doc.ents) + augment
    doc.to_disk(second_dump)
    print("\nThe 2nd round of Save and Load is:")
    _print_entities(doc)

    paragraph = Doc(Vocab()).from_disk(second_dump)
    # Guard against a Doc without entities.
    assert len(paragraph.ents) != 0, \
        "\nError! This document cannot be empty!"
    print("\nThe 3rd round of Save and Load is:")
    # Token attributes are read from ``paragraph`` itself, not from ``doc``.
    _print_entities(paragraph)
    assert [(ent.text, ent.label_) for ent in paragraph.ents] != [
        (u'2018年9月27日', u'EVENT')
    ], ("\nHere! The entity '%s' has matched the specified one."
        % paragraph.ents[-1].text)
示例#4
0
def test_tensorwrapper(tmp_path: Path):
    """A Doc carrying a ``TensorWrapper`` in ``user_data`` survives a disk
    round-trip.

    Args:
        tmp_path: temporary directory in which ``doc.bin`` is written.
    """
    words = ["This", "is", "a", "test"]
    doc = Doc(Vocab(), words=words)
    doc.user_data["tensor"] = TensorWrapper(torch.zeros((2, 2)), 1)
    doc_path = tmp_path / "doc.bin"
    doc.to_disk(doc_path)
    # ``spacy.load`` loads pipelines (a package name or a pipeline
    # directory), not serialized Docs; a Doc file is read with
    # ``Doc.from_disk``.
    reloaded = Doc(doc.vocab).from_disk(doc_path)
    assert [token.text for token in reloaded] == words
    assert "tensor" in reloaded.user_data