Exemplo n.º 1
0
def test_vocab_serialization(nlp):
    """Entity and alias strings must survive a disk round trip into a KB built on a new Vocab."""
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # Register the entities; only the hash returned for Q2 is checked later.
    source_kb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = source_kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    source_kb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # Register the aliases; only the hash returned for "adam" is checked later.
    source_kb.add_alias(
        alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1]
    )
    adam_hash = source_kb.add_alias(
        alias="adam", entities=["Q2"], probabilities=[0.9]
    )

    def check_adam_candidate(kb):
        # "adam" must resolve to exactly one candidate pointing at Q2,
        # with both the hash and the string form available.
        found = kb.get_alias_candidates("adam")
        assert len(found) == 1
        only = found[0]
        assert only.entity == q2_hash
        assert only.entity_ == "Q2"
        assert only.alias == adam_hash
        assert only.alias_ == "adam"

    check_adam_candidate(source_kb)

    with make_tempdir() as tmp:
        kb_path = tmp / "kb"
        source_kb.to_disk(kb_path)
        # The receiving KB starts from an empty Vocab, so any string it
        # reports has to come from the serialized data.
        restored_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        restored_kb.from_disk(kb_path)

        check_adam_candidate(restored_kb)
Exemplo n.º 2
0
def test_kb_serialize_2(nlp):
    """An entity vector stored via set_entities is returned unchanged after a disk round trip."""
    expected_vector = [5, 6, 7, 8]
    original = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    original.set_entities(["E1"], [1], [expected_vector])
    assert original.get_vector("E1") == expected_vector

    with make_tempdir() as tmp:
        kb_path = tmp / "kb"
        original.to_disk(kb_path)
        # Load into a second KB that shares the same vocab.
        reloaded = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        reloaded.from_disk(kb_path)
        assert reloaded.get_vector("E1") == expected_vector
Exemplo n.º 3
0
def test_kb_serialize_vocab(nlp):
    """A custom entity string added to the KB is present in the vocab of a KB loaded from disk."""
    custom_id = "MyFunnyID"

    # Precondition: the string is unknown before the entity is added.
    assert custom_id not in nlp.vocab.strings
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not source_kb.contains_entity(custom_id)

    source_kb.add_entity(custom_id, freq=342, entity_vector=[3])
    assert source_kb.contains_entity(custom_id)
    assert custom_id in source_kb.vocab.strings

    with make_tempdir() as tmp:
        kb_path = tmp / "kb"
        # Write the KB out, then read it into a KB built on an empty Vocab.
        source_kb.to_disk(kb_path)
        restored_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        restored_kb.from_disk(kb_path)
        assert custom_id in restored_kb.vocab.strings
Exemplo n.º 4
0
def test_save_and_load_knowledge_base():
    """Saving an empty KB and loading it into a new KB must not raise.

    Any exception raised while writing or reading is reported as a test
    failure carrying the exception's message.
    """
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as tmp:
        kb_path = tmp / "kb"
        try:
            # Write first, then read the same path back into a second KB.
            kb.to_disk(kb_path)
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.from_disk(kb_path)
        except Exception as err:
            pytest.fail(str(err))
Exemplo n.º 5
0
 def load(self, output_dir):
     """Load a KnowledgeBase from ``output_dir``, store it on ``self.kb`` and return it.

     Three entries are read from ``output_dir``: ``vocab`` (the serialized
     Vocab), ``kb`` (the serialized KnowledgeBase) and ``kb_info.txt``,
     whose first line holds the entity vector length as an integer.
     """
     vocab_path = os.path.join(output_dir, "vocab")
     kb_path = os.path.join(output_dir, "kb")
     kb_info_path = os.path.join(output_dir, "kb_info.txt")

     print("Loading vocab from", vocab_path)
     print("Loading KB from", kb_path)
     print("Loading KB info from", kb_info_path)

     # Only the first line of the info file is consumed.
     with open(kb_info_path, "r") as info_file:
         first_line = info_file.readline()
     entity_vector_length = int(first_line.strip())

     loaded_vocab = Vocab().from_disk(vocab_path)
     loaded_kb = KnowledgeBase(
         vocab=loaded_vocab,
         entity_vector_length=entity_vector_length,
     )
     loaded_kb.from_disk(kb_path)

     # Assign only after the load succeeded.
     self.kb = loaded_kb
     return self.kb
Exemplo n.º 6
0
def test_serialize_kb_disk(en_vocab):
    """Round-trip the dummy KB through disk and run the same checks on the loaded copy."""
    # Baseline: the freshly built KB passes the checks.
    original_kb = _get_dummy_kb(en_vocab)
    _check_kb(original_kb)

    # Write to a file inside a temp directory and load it into a new KB.
    with make_tempdir() as tmp:
        target_dir = ensure_path(tmp)
        if not target_dir.exists():
            target_dir.mkdir()
        kb_file = str(target_dir / "kb")
        original_kb.to_disk(kb_file)
        restored_kb = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        restored_kb.from_disk(kb_file)

    # The loaded KB must pass the same checks after the temp dir is gone.
    _check_kb(restored_kb)
Exemplo n.º 7
0
def test_issue6730(en_vocab):
    """Empty alias strings are rejected with ValueError; non-empty aliases still round-trip through disk."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # The empty alias must be refused and must not end up in the KB.
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    # Two valid aliases for the same entity.
    for alias_text, prior in (("x", 0.2), ("y", 0.1)):
        kb.add_alias(alias=alias_text, entities=["1"], probabilities=[prior])

    # Save and reload into the same KB object.
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)

    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
Exemplo n.º 8
0
def test_kb_set_entities(nlp):
    """A second set_entities call replaces the earlier entities, and the result survives disk IO."""
    initial_vector = [5, 6, 7, 8]
    e1_vector = [1, 1, 1, 0]
    e2_vector = [2, 2, 2, 3]

    def check_replaced_entities(kb):
        # Only E1 and E2 may be present, each with its own vector.
        assert set(kb.get_entity_strings()) == {"E1", "E2"}
        assert kb.get_vector("E1") == e1_vector
        assert kb.get_vector("E2") == e2_vector

    original = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    original.set_entities(["E0"], [1], [initial_vector])
    assert original.get_entity_strings() == ["E0"]

    # Overwrite E0 with two different entities.
    original.set_entities(["E1", "E2"], [1, 9], [e1_vector, e2_vector])
    check_replaced_entities(original)

    with make_tempdir() as tmp:
        kb_path = tmp / "kb"
        original.to_disk(kb_path)
        reloaded = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        reloaded.from_disk(kb_path)
        check_replaced_entities(reloaded)
Exemplo n.º 9
0
def test_issue4674():
    """Duplicate entity IDs in set_entities warn, yield a single entity, and survive disk IO."""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    first_vector = [0.9, 1.1, 1.01]
    second_vector = [1.8, 2.25, 2.01]

    # The same ID appears twice; a UserWarning is expected.
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[first_vector, second_vector],
        )
    assert kb.get_size_entities() == 1

    # Write to a file inside a temp directory and load it into a new KB.
    with make_tempdir() as tmp:
        target_dir = ensure_path(tmp)
        if not target_dir.exists():
            target_dir.mkdir()
        kb_file = str(target_dir / "kb")
        kb.to_disk(kb_file)
        kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
        kb2.from_disk(kb_file)

    assert kb2.get_size_entities() == 1
Exemplo n.º 10
0
def test_kb_serialize(nlp):
    """Exercise KB disk IO: plain and nested paths, overwriting, and reading a missing path."""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as tmp:
        existing_path = tmp / "kb"
        nested_path = tmp / "new" / "kb"
        missing_path = tmp / "unknown" / "kb"

        # Write then read, first at the top level, then under a new subfolder.
        for target in (existing_path, nested_path):
            mykb.to_disk(target)
            mykb.from_disk(target)

        # Writing over an already existing file is allowed.
        mykb.to_disk(existing_path)

        # Reading from a path that was never written raises ValueError.
        with pytest.raises(ValueError):
            mykb.from_disk(missing_path)
Exemplo n.º 11
0
 def create_kb(vocab):
     """Build a KnowledgeBase on ``vocab`` with 300-dimensional entity vectors and load it from ``kb_folder``."""
     # kb_folder is resolved from the enclosing scope.
     loaded_kb = KnowledgeBase(vocab=vocab, entity_vector_length=300)
     loaded_kb.from_disk(kb_folder)
     return loaded_kb
Exemplo n.º 12
0
 def create_kb(vocab):
     """Build a KnowledgeBase on ``vocab`` with entity vector length 1 and load it from ``kb_dir``."""
     # kb_dir is resolved from the enclosing scope.
     loaded_kb = KnowledgeBase(vocab, entity_vector_length=1)
     loaded_kb.from_disk(kb_dir)
     return loaded_kb