Example #1
def test_serialize_doc_roundtrip_disk(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as d:
        file_path = d / "doc"
        doc.to_disk(file_path)
        doc_d = Doc(en_vocab).from_disk(file_path)
        assert doc.to_bytes() == doc_d.to_bytes()
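
The snippets in this listing are shown without their import headers: Doc, Span and Vocab come from spacy.tokens / spacy.vocab, and helpers such as make_tempdir and apply_transition_sequence are spaCy test utilities. As a minimal self-contained sketch of the same disk round-trip, assuming only a blank English pipeline and the standard library's tempfile in place of the make_tempdir helper (the file name doc_file is arbitrary):

import tempfile
from pathlib import Path

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["hello", "world"])

with tempfile.TemporaryDirectory() as tmp_dir:
    # Doc.to_disk / Doc.from_disk write and read a single binary file.
    file_path = Path(tmp_dir) / "doc_file"
    doc.to_disk(file_path)
    restored = Doc(nlp.vocab).from_disk(file_path)

# The restored Doc should serialize to the same bytes as the original.
assert doc.to_bytes() == restored.to_bytes()
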
Example #2
def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data
Example #3
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    string = "This is a first sentence . And another one"
    doc = Doc(Vocab(), words=string.split())
    doc[6].sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.is_parsed
    assert not new_doc.is_tagged
    doc.is_parsed = True
    doc.is_tagged = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_parsed
    assert new_doc.is_tagged
Example #4
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc.is_tagged = True
    doc.is_parsed = True
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.is_parsed
Example #5
def test_serialize_empty_doc(en_vocab):
    doc = Doc(en_vocab)
    data = doc.to_bytes()
    doc2 = Doc(en_vocab)
    doc2.from_bytes(data)
    assert len(doc) == len(doc2)
    for token1, token2 in zip(doc, doc2):
        assert token1.text == token2.text
Example #6
def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
    doc = Doc(Vocab(), words=words)
    doc[6].is_sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.has_annotation("DEP")
    assert not new_doc.has_annotation("TAG")
    doc = Doc(
        Vocab(),
        words=words,
        tags=["TAG"] * len(words),
        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
        deps=["dep"] * len(words),
    )
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert new_doc.has_annotation("DEP")
    assert new_doc.has_annotation("TAG")
Example #7
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")
Example #8
def test_sbd_serialization_projective(EN):
    """
    test that before and after serialization, the sentence boundaries are the same.
    """

    example = EN.tokenizer.tokens_from_list(u"I bought a couch from IKEA. It was n't very comfortable .".split(' '))
    EN.tagger(example)
    apply_transition_sequence(EN, example, ['L-nsubj','S','L-det','R-dobj','D','R-prep','R-pobj','B-ROOT','L-nsubj','R-neg','D','S','L-advmod','R-acomp','D','R-punct'])

    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())

    assert example.to_bytes() == example_serialized.to_bytes()
    assert [s.text for s in example.sents] == [s.text for s in example_serialized.sents]
Example #9
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.is_nered
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.is_nered
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.is_nered
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_nered
Example #10
def test_serialize_after_adding_entity():
    # Re issue #514
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()

    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)

    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5, 6)]

    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]

    byte_string = doc.to_bytes()
Example #11
def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]
    # usually this is already True when starting from proper models instead of blank English
    doc_bytes = doc.to_bytes()
    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)
    actual = []
    for token in doc:
        actual.append(token.pos_)
    assert actual == pos
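
The example above serializes the vocab alongside the doc bytes so that strings added on one side are available on the other. For moving whole collections of docs between processes, spaCy also provides DocBin, which packs the docs and the strings they use into a single blob; a minimal sketch:

import spacy
from spacy.tokens import Doc, DocBin

nlp = spacy.blank("en")
docs = [
    Doc(nlp.vocab, words=["Apple", "is", "looking"]),
    Doc(nlp.vocab, words=["hello", "world"]),
]

# DocBin stores several docs together with the strings they reference.
doc_bin = DocBin(store_user_data=True)
for doc in docs:
    doc_bin.add(doc)
data = doc_bin.to_bytes()

# Restoring needs a vocab to attach the docs to.
restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))
assert [d.text for d in restored] == [d.text for d in docs]
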
Example #12
def test_serialize_doc_exclude(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.user_data["foo"] == "bar"
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(), exclude=["user_data"])
    assert not new_doc.user_data
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes(exclude=["user_data"]))
    assert not new_doc.user_data
    with pytest.raises(ValueError):
        doc.to_bytes(user_data=False)
    with pytest.raises(ValueError):
        Doc(en_vocab).from_bytes(doc.to_bytes(), tensor=False)
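
The two ValueError checks above show that the old boolean keyword arguments (user_data=False, tensor=False) are no longer accepted; serialization fields are excluded by name through the exclude list, which works symmetrically on to_bytes and from_bytes. A small sketch excluding different fields on each side (field names as documented for Doc serialization):

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"])
doc.user_data["foo"] = "bar"

# Drop the tensor when writing, and the user data when reading back.
slim_bytes = doc.to_bytes(exclude=["tensor"])
new_doc = Doc(vocab).from_bytes(slim_bytes, exclude=["user_data"])
assert not new_doc.user_data
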
Example #13
def test_sbd_serialization_projective(EN):
    """
    test that before and after serialization, the sentence boundaries are the same.
    """

    example = EN.tokenizer.tokens_from_list(
        u"I bought a couch from IKEA. It was n't very comfortable .".split(
            ' '))
    EN.tagger(example)
    apply_transition_sequence(EN, example, [
        'L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj', 'B-ROOT',
        'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', 'R-acomp', 'D', 'R-punct'
    ])

    example_serialized = Doc(EN.vocab).from_bytes(example.to_bytes())

    assert example.to_bytes() == example_serialized.to_bytes()
    assert [s.text for s in example.sents
            ] == [s.text for s in example_serialized.sents]
Example #14
def test_en_parser_sbd_serialization_projective(nlp):
    """Test that before and after serialization, the sentence boundaries are
     the same."""
    # NB: This was marked as causing segfault previously.
    # fmt: off
    text = "I bought a couch from IKEA It wasn't very comfortable."
    transition = ["L-nsubj", "S", "L-det", "R-dobj", "D", "R-prep", "R-pobj",
                  "B-ROOT", "L-nsubj", "R-neg", "D", "S", "L-advmod",
                  "R-acomp", "D", "R-punct"]
    # fmt: on

    doc = nlp.tokenizer(text)
    # This utility function needs to be updated I think?
    apply_transition_sequence(nlp.get_pipe("parser"), doc, transition)
    doc_serialized = Doc(nlp.vocab).from_bytes(doc.to_bytes())
    assert doc.is_parsed
    assert doc_serialized.is_parsed
    assert doc.to_bytes() == doc_serialized.to_bytes()
    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]
Example #15
def test_en_sbd_serialization_projective(combined_all_model_fixture):
    """Test that before and after serialization, the sentence boundaries are
    the same."""

    text = "I bought a couch from IKEA It wasn't very comfortable."
    transition = [
        'L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj', 'B-ROOT',
        'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', 'R-acomp', 'D', 'R-punct'
    ]

    doc = combined_all_model_fixture.tokenizer(text)
    apply_transition_sequence(combined_all_model_fixture.parser, doc, transition)
    doc_serialized = Doc(combined_all_model_fixture.vocab).from_bytes(doc.to_bytes())
    assert doc.is_parsed
    assert doc_serialized.is_parsed
    assert doc.to_bytes() == doc_serialized.to_bytes()
    assert [s.text for s in doc.sents] == [s.text for s in doc_serialized.sents]
Example #16
def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_,
            doc[2].ent_type_) == expected
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_,
            doc[2].ent_type_) == expected
    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_,
            doc2[2].ent_type_) == expected
Example #17
def test_serialize_doc_roundtrip_bytes(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world"])
    doc_b = doc.to_bytes()
    new_doc = Doc(en_vocab).from_bytes(doc_b)
    assert new_doc.to_bytes() == doc_b
Example #18
def test_serialize_doc_span_groups(en_vocab):
    doc = Doc(en_vocab, words=["hello", "world", "!"])
    doc.spans["content"] = [doc[0:2]]
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert len(new_doc.spans["content"]) == 1
Example #19
def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.has_annotation("DEP")