Exemplo n.º 1
0
def test_kb_to_bytes():
    """Round-trip a KnowledgeBase through to_bytes/from_bytes and compare the two copies."""
    nlp = English()

    # Build the source KB: two entities and three aliases.
    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    source_kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    source_kb.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    source_kb.add_alias(
        alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8]
    )
    source_kb.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
    source_kb.add_alias(
        alias="Randomness",
        entities=["Q66", "Q2146908"],
        probabilities=[0.1, 0.2],
    )
    assert source_kb.contains_alias("Russ Cochran")
    serialized = source_kb.to_bytes()

    # A fresh KB knows nothing until the serialized bytes are loaded into it.
    restored_kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not restored_kb.contains_alias("Russ Cochran")
    restored_kb = restored_kb.from_bytes(serialized)

    # Entities: count, vector length, ids and the vectors themselves must match.
    assert source_kb.get_size_entities() == restored_kb.get_size_entities()
    assert source_kb.entity_vector_length == restored_kb.entity_vector_length
    assert source_kb.get_entity_strings() == restored_kb.get_entity_strings()
    for entity_id in ("Q2146908", "Q66"):
        assert source_kb.get_vector(entity_id) == restored_kb.get_vector(entity_id)

    # Aliases: presence, count, strings and number of candidates must match.
    assert restored_kb.contains_alias("Russ Cochran")
    assert source_kb.get_size_aliases() == restored_kb.get_size_aliases()
    assert source_kb.get_alias_strings() == restored_kb.get_alias_strings()
    for alias_name in ("Russ Cochran", "Randomness"):
        expected_count = len(source_kb.get_alias_candidates(alias_name))
        assert expected_count == len(restored_kb.get_alias_candidates(alias_name))
Exemplo n.º 2
0
def test_issue6730(en_vocab):
    """Check that the KB rejects an empty alias string while disk IO keeps working."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # Adding an empty alias must raise and must leave no trace in the KB.
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    # Two valid aliases pointing at the same entity.
    for alias_name, prior in (("x", 0.2), ("y", 0.1)):
        kb.add_alias(alias=alias_name, entities=["1"], probabilities=[prior])

    # Write the KB to a temporary directory and load it back into the same object.
    with make_tempdir() as scratch_dir:
        kb.to_disk(scratch_dir)
        kb.from_disk(scratch_dir)

    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
Exemplo n.º 3
0
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe.
        The pipeline is written to output_dir / new_model_name. If None,
        the index is built but nothing is written to disk.
    new_model_name (str): name stored in the pipeline meta and used as the
        sub-directory of output_dir
    cg_threshold (float): not used by this function
    n_iter (int): not used by this function
    verbose (bool): when False, the Printer's loading animation is hidden


    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}

    Note: the "probabilities" field of aliases.jsonl is not read here; every
    alias gets a uniform prior over those of its entities that exist in the KB.
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    if output_dir is not None:
        # Accept str as well as Path; exist_ok avoids the check-then-create race.
        output_dir = Path(output_dir) / new_model_name
        output_dir.mkdir(parents=True, exist_ok=True)

    kb_dir = Path(kb_dir)
    entities = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    aliases = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # set up the data
    entity_ids = [e["id"] for e in entities]
    descriptions = [e.get("description", "") for e in entities]
    # Every entity is registered with the same constant frequency.
    default_freq = 100

    msg.divider("Apply EntityEncoder")

    with msg.loading("Applying EntityEncoder to descriptions"):
        # Entity vectors are taken directly from the loaded model's doc
        # vectors; no separate encoder is trained in this function.
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # Duplicate ids in entities.jsonl are skipped: the first one wins.
        for entity_id, embedding in zip(entity_ids, embeddings):
            if not kb.contains_entity(entity_id):
                kb.add_entity(entity_id, default_freq, embedding)

        for a in aliases:
            # Only keep entities that actually made it into the KB.
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            if not ents:
                continue
            prior_prob = [1.0 / len(ents)] * len(ents)
            kb.add_alias(alias=a["alias"], entities=ents, probabilities=prior_prob)

        msg.good("Done adding entities and aliases to kb")

    msg.divider("Create ANN Index")

    cg = CandidateGenerator().fit(kb.get_alias_strings(), verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name

    if output_dir is None:
        # Previously this path crashed inside nlp.to_disk(None).
        msg.warn("No output_dir given, the model was not saved")
        return

    nlp.to_disk(output_dir)
    # NOTE(review): reloading right after saving looks like a round-trip
    # sanity check of the serialized pipeline - confirm it is still wanted.
    nlp.from_disk(output_dir)
Exemplo n.º 4
0
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe.
        The pipeline is written to output_dir / new_model_name. If None,
        the index is built but nothing is written to disk.
    new_model_name (str): name stored in the pipeline meta and used as the
        sub-directory of output_dir
    cg_threshold (float): not used by this function
    n_iter (int): not used by this function
    verbose (bool): when False, the Printer's loading animation is hidden


    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    An entity may also carry an optional "label" key; when present and truthy
    it is translated through kb_type_vs_index and stored in the KB's freq field.

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    if output_dir is not None:
        # Accept str as well as Path; exist_ok avoids the check-then-create race.
        output_dir = Path(output_dir) / new_model_name
        output_dir.mkdir(parents=True, exist_ok=True)

    # Materialize both files once. Counting one half of a tee() forced the
    # other half to buffer every record anyway, so a list costs no extra
    # peak memory and gives the totals for the progress bars directly.
    kb_dir = Path(kb_dir)
    entity_records = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    alias_records = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # Fallback vector for entities that have no description.
    empty_vector = nlp.make_doc('').vector

    for entity in tqdm(entity_records,
                       desc='Adding entities to KB',
                       total=len(entity_records)):
        entity_id = entity['id']
        # Duplicate ids are skipped: the first record wins.
        if kb.contains_entity(entity_id):
            continue
        if 'description' in entity:
            embedding = nlp.make_doc(entity['description']).vector
        else:
            embedding = empty_vector
        label = entity.get('label', 0)
        if label:
            # presumably kb_type_vs_index maps a type label to an integer
            # index; it is defined outside this function - verify.
            label = kb_type_vs_index[label]
        # TODO: Add a proper "label" field (repurposed freq field as the type label)
        kb.add_entity(entity=entity_id, freq=label, entity_vector=embedding)

    for alias in tqdm(alias_records,
                      desc="Setting kb entities and aliases",
                      total=len(alias_records)):
        # Only keep entities that actually made it into the KB.
        known_entities = [e for e in alias["entities"] if kb.contains_entity(e)]
        if not known_entities:
            continue
        num_entities = len(known_entities)
        # Use the supplied priors only when they line up one-to-one with the
        # surviving entities; a missing, empty or mismatched list falls back
        # to a uniform prior.
        given_probabilities = alias.get('probabilities') or []
        if len(given_probabilities) == num_entities:
            prior_probabilities = given_probabilities
        else:
            prior_probabilities = [1.0 / num_entities] * num_entities
        kb.add_alias(alias=alias["alias"],
                     entities=known_entities,
                     probabilities=prior_probabilities)

    msg.divider("Create ANN Index")
    alias_strings = kb.get_alias_strings()
    cg = CandidateGenerator().fit(alias_strings, verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name

    if output_dir is None:
        # Previously this path crashed inside nlp.to_disk(None).
        msg.warn("No output_dir given, the model was not saved")
        return

    nlp.to_disk(output_dir)
    # NOTE(review): reloading right after saving looks like a round-trip
    # sanity check of the serialized pipeline - confirm it is still wanted.
    nlp.from_disk(output_dir)