import pytest
from spacy.kb import KnowledgeBase
from spacy.lang.en import English


def test_kb_to_bytes():
    # Test that the KB's to_bytes method works correctly
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
    kb_1.add_alias(alias="Randomness", entities=["Q66", "Q2146908"], probabilities=[0.1, 0.2])
    assert kb_1.contains_alias("Russ Cochran")
    kb_bytes = kb_1.to_bytes()
    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not kb_2.contains_alias("Russ Cochran")
    kb_2 = kb_2.from_bytes(kb_bytes)
    # check that both KBs are exactly the same
    assert kb_1.get_size_entities() == kb_2.get_size_entities()
    assert kb_1.entity_vector_length == kb_2.entity_vector_length
    assert kb_1.get_entity_strings() == kb_2.get_entity_strings()
    assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908")
    assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66")
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
        kb_2.get_alias_candidates("Russ Cochran")
    )
    assert len(kb_1.get_alias_candidates("Randomness")) == len(
        kb_2.get_alias_candidates("Randomness")
    )
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    # an empty alias string must be rejected
    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])

    # round-trip through disk and check the aliases survive
    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
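Both tests above depend on pytest plumbing the excerpt doesn't show: the en_vocab fixture and the make_tempdir helper, which in spaCy's own test suite come from its shared test utilities. Below is a minimal local stand-in, assuming only a blank English pipeline and the standard library; this is a hypothetical conftest.py sketch, not spaCy's actual code.

# conftest.py -- hypothetical stand-ins for the fixtures the tests above assume;
# in spaCy's repo, en_vocab and make_tempdir live in the shared test utilities.
import tempfile
from contextlib import contextmanager
from pathlib import Path

import pytest
from spacy.lang.en import English


@pytest.fixture
def en_vocab():
    # Vocab of a blank English pipeline, which is what the tests expect
    return English().vocab


@contextmanager
def make_tempdir():
    # Yield a pathlib.Path to a temporary directory, removed on exit
    with tempfile.TemporaryDirectory() as d:
        yield Path(d)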
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe

    kb File Formats

    e.g. entities.jsonl
    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    if output_dir is not None:
        output_dir = Path(output_dir / new_model_name)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)

    entities = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    aliases = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for e in entities:
        entity_ids.append(e["id"])
        descriptions.append(e.get("description", ""))
        freqs.append(100)

    # msg.divider("Train EntityEncoder")
    # with msg.loading("Starting training EntityEncoder"):
    #     # training entity description encodings
    #     # this part can easily be replaced with a custom entity encoder
    #     encoder = EntityEncoder(nlp=nlp, input_dim=INPUT_DIM, desc_width=DESC_WIDTH, epochs=n_iter)
    #     encoder.train(description_list=descriptions, to_print=True)
    #     msg.good("Done Training")

    msg.divider("Apply EntityEncoder")
    with msg.loading("Applying EntityEncoder to descriptions"):
        # get the pretrained entity vectors
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # set the entities, can also be done by calling `kb.add_entity` for each entity
        for i in range(len(entity_ids)):
            entity = entity_ids[i]
            if not kb.contains_entity(entity):
                kb.add_entity(entity, freqs[i], embeddings[i])

        for a in aliases:
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            n_ents = len(ents)
            if n_ents > 0:
                prior_prob = [1.0 / n_ents] * n_ents
                kb.add_alias(alias=a["alias"], entities=ents, probabilities=prior_prob)

    msg.good("Done adding entities and aliases to kb")

    msg.divider("Create ANN Index")
    cg = CandidateGenerator().fit(kb.get_alias_strings(), verbose=True)
    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)
    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    nlp.from_disk(output_dir)
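The create_index above and the variant that follows both assume module-level imports and constants the excerpt doesn't show. Here is a minimal preamble sketch; the import path for CandidateGenerator and the values of INPUT_DIM, DESC_WIDTH, and kb_type_vs_index are assumptions inferred from how the functions use them, not confirmed source.

# Hypothetical module preamble for both create_index variants; import paths
# and constant values below are assumptions, not confirmed source.
from itertools import tee
from pathlib import Path

import spacy
import srsly
from spacy.kb import KnowledgeBase
from tqdm import tqdm
from wasabi import Printer

from spacy_ann.candidate_generator import CandidateGenerator  # assumed import path

INPUT_DIM = 300  # assumed: dimensionality of the model's pretrained vectors
DESC_WIDTH = 64  # assumed: encoded description width, used only by the
                 # commented-out EntityEncoder block

# assumed: mapping from an entity's type label to an integer index; the second
# variant repurposes the freq field to store this index
kb_type_vs_index = {"PERSON": 1, "ORG": 2, "GPE": 3}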
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe

    kb File Formats

    e.g. entities.jsonl
    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    if output_dir is not None:
        output_dir = Path(output_dir / new_model_name)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)

    # stream the jsonl files; tee off a second iterator only to count totals for tqdm
    entities, entities_copy = tee(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    total_entities = sum(1 for _ in entities_copy)
    aliases, aliases_copy = tee(srsly.read_jsonl(kb_dir / "aliases.jsonl"))
    total_aliases = sum(1 for _ in aliases_copy)

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)
    # zero vector used as a fallback for entities without a description
    empty_vector = nlp.make_doc("").vector

    for entity in tqdm(entities, desc="Adding entities to KB", total=total_entities):
        entity_id = entity["id"]
        if not kb.contains_entity(entity_id):
            embedding = (
                nlp.make_doc(entity["description"]).vector
                if "description" in entity
                else empty_vector
            )
            label = entity["label"] if "label" in entity else 0
            if label:
                label = kb_type_vs_index[label]
            kb.add_entity(
                entity=entity_id,
                freq=label,  # TODO: add a proper "label" field (the freq field is repurposed as the type label)
                entity_vector=embedding,
            )

    for alias in tqdm(aliases, desc="Setting kb entities and aliases", total=total_aliases):
        ents = [e for e in alias["entities"] if kb.contains_entity(e)]
        num_entities = len(ents)
        if num_entities > 0:
            # use the provided priors if they still line up with the surviving
            # entities, otherwise fall back to a uniform distribution
            prior_probabilities = (
                alias["probabilities"]
                if len(alias["probabilities"]) == num_entities
                else [1.0 / num_entities] * num_entities
            )
            kb.add_alias(alias=alias["alias"], entities=ents, probabilities=prior_probabilities)

    msg.divider("Create ANN Index")
    alias_strings = kb.get_alias_strings()
    cg = CandidateGenerator().fit(alias_strings, verbose=True)
    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)
    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    nlp.from_disk(output_dir)
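To close the loop, here is a usage sketch for create_index: it writes the two jsonl files in the format the docstring documents, then runs the indexing end to end. The model name, paths, and toy KB contents are examples only; any spaCy model with real word vectors will do, since the entity embeddings are just the averaged token vectors from nlp.make_doc.

# Hypothetical driver for create_index; paths, model name, and the toy KB
# contents are examples, not part of the original project.
from pathlib import Path

import srsly

kb_dir = Path("kb")
kb_dir.mkdir(exist_ok=True)
srsly.write_jsonl(kb_dir / "entities.jsonl", [
    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms..."},
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language."},
])
srsly.write_jsonl(kb_dir / "aliases.jsonl", [
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]},
])

create_index(
    model="en_core_web_md",     # needs real word vectors for useful embeddings
    kb_dir=kb_dir,
    output_dir=Path("models"),  # final pipeline lands in models/ann_linker
)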