Example #1
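Shows that entity and alias hashes still resolve to the same strings after the KB is saved and loaded into a KnowledgeBase built on a fresh Vocab. (Imports added to these examples assume spaCy 3.x, where the KB lives in spacy.kb.)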
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.util import make_tempdir


def test_vocab_serialization(nlp):
    """Test that string information is retained across storage"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    q2_hash = mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas",
                   entities=["Q2", "Q3"],
                   probabilities=[0.4, 0.1])
    adam_hash = mykb.add_alias(alias="adam",
                               entities=["Q2"],
                               probabilities=[0.9])

    candidates = mykb.get_alias_candidates("adam")
    assert len(candidates) == 1
    assert candidates[0].entity == q2_hash
    assert candidates[0].entity_ == "Q2"
    assert candidates[0].alias == adam_hash
    assert candidates[0].alias_ == "adam"

    with make_tempdir() as d:
        mykb.to_disk(d / "kb")
        kb_new_vocab = KnowledgeBase(Vocab(), entity_vector_length=1)
        kb_new_vocab.from_disk(d / "kb")

        candidates = kb_new_vocab.get_alias_candidates("adam")
        assert len(candidates) == 1
        assert candidates[0].entity == q2_hash
        assert candidates[0].entity_ == "Q2"
        assert candidates[0].alias == adam_hash
        assert candidates[0].alias_ == "adam"
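The second half is the interesting part: the reloaded KB is built on an empty Vocab(), so the hash-based assertions only pass because the KB serializes its own string information rather than relying on the original vocab.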
Example #2
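Builds a KB against one pipeline's vocab, writes it to disk, and loads it into a second pipeline via an entity_linker component, checking that the KB's strings are merged into the new vocab. load_kb is spaCy's KB-from-file helper (importable from spacy.ml in spaCy 3.x); a sketch of it follows the example.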
from spacy.kb import KnowledgeBase
from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.util import make_tempdir


def test_kb_serialization():
    # Test that the KB can be used in a pipeline with a different vocab
    vector_length = 3
    with make_tempdir() as tmp_dir:
        kb_dir = tmp_dir / "kb"
        nlp1 = English()
        assert "Q2146908" not in nlp1.vocab.strings
        mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(alias="Russ Cochran",
                       entities=["Q2146908"],
                       probabilities=[0.8])
        assert "Q2146908" in nlp1.vocab.strings
        mykb.to_disk(kb_dir)

        nlp2 = English()
        assert "RandomWord" not in nlp2.vocab.strings
        nlp2.vocab.strings.add("RandomWord")
        assert "RandomWord" in nlp2.vocab.strings
        assert "Q2146908" not in nlp2.vocab.strings

        # Create the Entity Linker component with the KB from file, and check the final vocab
        entity_linker = nlp2.add_pipe("entity_linker", last=True)
        entity_linker.set_kb(load_kb(kb_dir))
        assert "Q2146908" in nlp2.vocab.strings
        assert "RandomWord" in nlp2.vocab.strings
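A minimal sketch of what a loader like load_kb has to do, shown for reference. This is an illustrative reimplementation under the assumptions above (the name load_kb_from_dir is ours), not the library source: it returns a closure so the KB is only deserialized once the entity_linker's vocab exists.

from pathlib import Path

from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab


def load_kb_from_dir(kb_path: Path):
    """Sketch of a KB loader: defer from_disk until the pipe's vocab is known."""
    def kb_from_file(vocab: Vocab) -> KnowledgeBase:
        # entity_vector_length here is a placeholder; from_disk restores
        # the real value from the serialized KB
        kb = KnowledgeBase(vocab, entity_vector_length=1)
        kb.from_disk(kb_path)
        return kb
    return kb_from_file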
Example #3
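Round-trips a single entity vector through to_disk/from_disk and verifies it comes back unchanged.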
from spacy.kb import KnowledgeBase
from spacy.util import make_tempdir


def test_kb_serialize_2(nlp):
    v = [5, 6, 7, 8]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E1"], [1], [v])
    assert kb1.get_vector("E1") == v
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert kb2.get_vector("E1") == v
Example #4
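A tutorial-style script (not a test) that builds a KB from a CSV of entities, adds one unambiguous alias per entity plus an ambiguous 'Emerson' alias, prints candidate lookups, and writes both the KB and the pipeline to disk.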
import spacy
from spacy.kb import KnowledgeBase


def create_kb(kb_dir='sample_kb', nlp_dir='sample_nlp'):
    nlp = spacy.load('en_core_web_lg')
    # demo sentence with an ambiguous 'Emerson' mention (not used further below)
    text = 'Tennis champion Emerson was expected to win Wimbledon.'
    doc = nlp(text)
    file_path = 'entities.csv'
    # load_entities returns {QID: name} and {QID: description} dicts
    # read from the CSV (a sketch of it follows this example)
    name_dict, desc_dict = load_entities(file_path)

    sample_qid, sample_desc = list(desc_dict.items())[0]
    sample_doc = nlp(sample_desc)
    entity_vector_length = len(sample_doc.vector)  # should be 300
    kb = KnowledgeBase(vocab=nlp.vocab,
                       entity_vector_length=entity_vector_length)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        # NB: entity_vector could be any encoding of the entity
        # freq is the entity's corpus frequency; it is not used in this
        # tutorial, so an arbitrary value is passed
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=42)

    # add provided alias
    for qid, name in name_dict.items():
        # probabilities is P(entity|alias) = 1.0
        # we assume that each alias only maps to one entity
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])

    # add additional alias with equal probability
    # this could be learned from data
    qids = list(name_dict.keys())
    probs = [0.3 for qid in qids]
    kb.add_alias(alias='Emerson', entities=qids, probabilities=probs)

    print(f'Entities in the KB: {kb.get_entity_strings()}')
    print(f'Aliases in the KB: {kb.get_alias_strings()}')
    print()
    # note: candidate lookup is a deterministic, exact string match on the
    # alias; get_alias_candidates returns every entity stored for that
    # alias, with no score threshold applied
    entities = [
        c.entity_ for c in kb.get_alias_candidates('Roy Stanley Emerson')
    ]
    print(f'Candidates for \'Roy Stanley Emerson\': {entities}')
    entities = [c.entity_ for c in kb.get_alias_candidates('Emerson')]
    print(f'Candidates for \'Emerson\': {entities}')
    entities = [c.entity_ for c in kb.get_alias_candidates('Todd')]
    print(f'Candidates for \'Todd\': {entities}')

    kb.to_disk(kb_dir)
    nlp.to_disk(nlp_dir)
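load_entities is not shown in the example above; it is the tutorial helper that reads entities.csv. A minimal sketch, assuming each CSV row holds (QID, name, description):

import csv


def load_entities(file_path='entities.csv'):
    """Sketch: build {QID: name} and {QID: description} dicts from the CSV."""
    name_dict = {}
    desc_dict = {}
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        # assumed row layout: QID, name, description
        for qid, name, desc in csv.reader(csvfile):
            name_dict[qid] = name
            desc_dict[qid] = desc
    return name_dict, desc_dict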
Example #5
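Checks that a custom entity string added to the KB survives serialization into a brand-new Vocab.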
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
from spacy.util import make_tempdir


def test_kb_serialize_vocab(nlp):
    """Test serialization of the KB and custom strings"""
    entity = "MyFunnyID"
    assert entity not in nlp.vocab.strings
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not mykb.contains_entity(entity)
    mykb.add_entity(entity, freq=342, entity_vector=[3])
    assert mykb.contains_entity(entity)
    assert entity in mykb.vocab.strings
    with make_tempdir() as d:
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb_new = KnowledgeBase(Vocab(), entity_vector_length=1)
        mykb_new.from_disk(d / "kb")
        assert entity in mykb_new.vocab.strings
Example #6
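Smoke test: an empty KB can be written to disk and read back without raising.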
import pytest

from spacy.kb import KnowledgeBase
from spacy.language import Language
from spacy.util import make_tempdir


def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:
            kb.to_disk(path)
        except Exception as e:
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.from_disk(path)
        except Exception as e:
            pytest.fail(str(e))
Example #7
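Regression test for spaCy issue #6730: empty alias strings are rejected with a ValueError, and disk round-tripping still works afterwards.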
import pytest

from spacy.util import make_tempdir


def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])

    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
Example #8
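Verifies that set_entities completely replaces the KB's previous contents and that the replacement survives a disk round-trip.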
from spacy.kb import KnowledgeBase
from spacy.util import make_tempdir


def test_kb_set_entities(nlp):
    """Test that set_entities entirely overwrites the previous set of entities"""
    v = [5, 6, 7, 8]
    v1 = [1, 1, 1, 0]
    v2 = [2, 2, 2, 3]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E0"], [1], [v])
    assert kb1.get_entity_strings() == ["E0"]
    kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
    assert set(kb1.get_entity_strings()) == {"E1", "E2"}
    assert kb1.get_vector("E1") == v1
    assert kb1.get_vector("E2") == v2
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert set(kb2.get_entity_strings()) == {"E1", "E2"}
        assert kb2.get_vector("E1") == v1
        assert kb2.get_vector("E2") == v2
Example #9
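Regression test for spaCy issue #4674: duplicate entity identifiers passed to set_entities trigger a UserWarning, are de-duplicated, and do not corrupt serialization.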
import pytest

from spacy.kb import KnowledgeBase
from spacy.lang.en import English
from spacy.util import ensure_path, make_tempdir


def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    with pytest.warns(UserWarning):
        kb.set_entities(
            entity_list=["Q1", "Q1"],
            freq_list=[32, 111],
            vector_list=[vector1, vector2],
        )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        # make_tempdir already yields an existing directory, so no mkdir needed
        file_path = ensure_path(d) / "kb"
        kb.to_disk(str(file_path))
        kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
        kb2.from_disk(str(file_path))
    assert kb2.get_size_entities() == 1
Example #10
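Exercises basic read/write behaviour: round-tripping, writing into a new subdirectory, overwriting an existing file, and the ValueError raised when reading a missing path.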
import pytest

from spacy.kb import KnowledgeBase
from spacy.util import make_tempdir


def test_kb_serialize(nlp):
    """Test serialization of the KB"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb.from_disk(d / "kb")
        # writing into a not-yet-existing subdirectory also works
        mykb.to_disk(d / "new" / "kb")
        mykb.from_disk(d / "new" / "kb")
        # allow overwriting an existing file
        mykb.to_disk(d / "kb")
        with pytest.raises(ValueError):
            # can not read from an unknown file
            mykb.from_disk(d / "unknown" / "kb")
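Note the asymmetry this test pins down: writing into a not-yet-existing subdirectory succeeds, while reading from a path that was never written raises a ValueError.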