def test_kb_serialize_vocab(nlp):
    """Check that a custom entity string is still known after a KB disk round trip."""
    entity_id = "MyFunnyID"
    # The ID must be new to the shared vocab, otherwise the later checks prove nothing.
    assert entity_id not in nlp.vocab.strings

    source_kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    assert not source_kb.contains_entity(entity_id)
    source_kb.add_entity(entity_id, freq=342, entity_vector=[3])
    assert source_kb.contains_entity(entity_id)
    assert entity_id in source_kb.vocab.strings

    with make_tempdir() as tmp_dir:
        kb_path = tmp_dir / "kb"
        # Plain write, then read back into a KB built on a fresh Vocab().
        source_kb.to_disk(kb_path)
        restored_kb = KnowledgeBase(Vocab(), entity_vector_length=1)
        restored_kb.from_disk(kb_path)
        assert entity_id in restored_kb.vocab.strings
print(len(desc_enc)) #Now we want to specify aliases or synonyms. We first add the full names. Here, we are 100% certain that they resolve to their corresponding QID, as there is no ambiguity. for qid, name in name_dict.items(): kb.add_alias(alias=name, entities=[qid], probabilities=[1]) # 100% prior probability P(entity|alias) aliases = {} words = [] with open('disease_alieases.tsv', 'r') as fr: for row in fr: row = row.strip().split('\t') qid = row[0] name = row[1] #print (row) if kb.contains_entity(qid): aliases[name] = qid kb.add_alias( alias=name, entities=[qid], probabilities=[1]) # 100% prior probability P(entity|alias) print("Checking KB ...") print(kb.contains_entity('MONDO:0000001')) annots = parse_annots('dailymed_disease3_L.jsonl') entity_labels = [] for text, res in annots.items(): t = res['text'] for span in res['spans']: s = span['start']
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
        (a plain string is accepted as well)
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe;
        the model is written to output_dir / new_model_name
        (a plain string is accepted as well)
    new_model_name (str): name stored in the model meta and used as the
        output sub-directory
    cg_threshold (float): not used by this function; kept for interface compatibility
    n_iter (int): not used by this function; kept for interface compatibility
    verbose (bool): when False, the Printer loading animation is hidden

    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}

    Note: the "probabilities" values in aliases.jsonl are not read here; every
    alias gets a uniform prior over those of its entities that are in the KB.
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    # Coerce to Path so callers may pass plain strings; `str / str` would raise.
    kb_dir = Path(kb_dir)
    if output_dir is not None:
        output_dir = Path(output_dir) / new_model_name
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        output_dir.mkdir(parents=True, exist_ok=True)

    entities = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    aliases = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # set up the data
    entity_ids = [e["id"] for e in entities]
    descriptions = [e.get("description", "") for e in entities]
    # Every entity is registered with the same fixed frequency.
    entity_freq = 100

    # No EntityEncoder is trained here: the entity vectors are the document
    # vectors that the loaded model produces for each description. A custom
    # encoder could be substituted at this point.
    msg.divider("Apply EntityEncoder")

    with msg.loading("Applying EntityEncoder to descriptions"):
        # get the pretrained entity vectors
        embeddings = [nlp.make_doc(desc).vector for desc in descriptions]
        msg.good("Finished, embeddings created")

    with msg.loading("Setting kb entities and aliases"):
        # Add each entity once; a repeated ID keeps its first vector.
        for entity_id, embedding in zip(entity_ids, embeddings):
            if not kb.contains_entity(entity_id):
                kb.add_entity(entity_id, entity_freq, embedding)

        for a in aliases:
            # Drop entity IDs that never made it into the KB.
            ents = [e for e in a["entities"] if kb.contains_entity(e)]
            n_ents = len(ents)
            if n_ents > 0:
                # Uniform prior over the remaining entities.
                prior_prob = [1.0 / n_ents] * n_ents
                kb.add_alias(alias=a["alias"], entities=ents, probabilities=prior_prob)

        msg.good("Done adding entities and aliases to kb")

    msg.divider("Create ANN Index")

    cg = CandidateGenerator().fit(kb.get_alias_strings(), verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    # Write the pipeline, then load it back from the same directory.
    nlp.to_disk(output_dir)
    nlp.from_disk(output_dir)
def create_index(
    model: str,
    kb_dir: Path,
    output_dir: Path,
    new_model_name: str = "ann_linker",
    cg_threshold: float = 0.8,
    n_iter: int = 5,
    verbose: bool = True,
):
    """Create an AnnLinker based on the Character N-Gram
    TF-IDF vectors for aliases in a KnowledgeBase

    model (str): spaCy language model directory or name to load
    kb_dir (Path): path to the directory with kb entities.jsonl and aliases.jsonl files
        (a plain string is accepted as well)
    output_dir (Path): path to output_dir for spaCy model with ann_linker pipe;
        the model is written to output_dir / new_model_name
        (a plain string is accepted as well)
    new_model_name (str): name stored in the model meta and used as the
        output sub-directory
    cg_threshold (float): not used by this function; kept for interface compatibility
    n_iter (int): not used by this function; kept for interface compatibility
    verbose (bool): when False, the Printer loading animation is hidden

    kb File Formats

    e.g. entities.jsonl

    {"id": "a1", "description": "Machine learning (ML) is the scientific study of algorithms and statistical models..."}
    {"id": "a2", "description": "ML (\"Meta Language\") is a general-purpose functional programming language. It has roots in Lisp, and has been characterized as \"Lisp with types\"."}

    An entity may also carry an optional "label" key; a truthy label is looked
    up in kb_type_vs_index and the result is stored in the KB's freq field.

    e.g. aliases.jsonl
    {"alias": "ML", "entities": ["a1", "a2"], "probabilities": [0.5, 0.5]}

    The listed probabilities are used only when their count matches the number
    of the alias's entities that are present in the KB; otherwise (including
    when the key is missing) a uniform prior is used.
    """
    msg = Printer(hide_animation=not verbose)

    msg.divider("Load Model")
    with msg.loading(f"Loading model {model}"):
        nlp = spacy.load(model)
        msg.good("Done.")

    # Coerce to Path so callers may pass plain strings; `str / str` would raise.
    kb_dir = Path(kb_dir)
    if output_dir is not None:
        output_dir = Path(output_dir) / new_model_name
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        output_dir.mkdir(parents=True, exist_ok=True)

    # Both files are needed in full (for the progress-bar totals and for the
    # iteration), so read each one into a list once. Counting through one
    # tee() branch would buffer every record anyway.
    entity_records = list(srsly.read_jsonl(kb_dir / "entities.jsonl"))
    alias_records = list(srsly.read_jsonl(kb_dir / "aliases.jsonl"))

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=INPUT_DIM)

    # Vector used for entities that have no description.
    empty_vector = nlp.make_doc('').vector

    for entity in tqdm(entity_records,
                       desc='Adding entities to KB',
                       total=len(entity_records)):
        entity_id = entity['id']
        if kb.contains_entity(entity_id):
            # A repeated ID keeps the values from its first occurrence.
            continue

        if 'description' in entity:
            embedding = nlp.make_doc(entity['description']).vector
        else:
            embedding = empty_vector

        # A missing or falsy label is stored as 0; anything else is mapped
        # through kb_type_vs_index.
        label = entity.get('label', 0)
        if label:
            label = kb_type_vs_index[label]

        kb.add_entity(
            entity=entity_id,
            freq=label,  # TODO: Add a proper "label" field (repurposed freq field as the type label)
            entity_vector=embedding)

    for alias in tqdm(alias_records,
                      desc="Setting kb entities and aliases",
                      total=len(alias_records)):
        # Drop entity IDs that never made it into the KB.
        known_entities = [e for e in alias["entities"] if kb.contains_entity(e)]
        num_entities = len(known_entities)
        if num_entities == 0:
            continue

        # Tolerate a missing or null "probabilities" field instead of raising.
        prior_probabilities = alias.get('probabilities') or []
        if len(prior_probabilities) != num_entities:
            # The counts do not line up with the surviving entities, so fall
            # back to a uniform prior.
            prior_probabilities = [1.0 / num_entities] * num_entities

        kb.add_alias(alias=alias["alias"],
                     entities=known_entities,
                     probabilities=prior_probabilities)

    msg.divider("Create ANN Index")

    alias_strings = kb.get_alias_strings()
    cg = CandidateGenerator().fit(alias_strings, verbose=True)

    ann_linker = nlp.create_pipe("ann_linker")
    ann_linker.set_kb(kb)
    ann_linker.set_cg(cg)

    nlp.add_pipe(ann_linker, last=True)

    nlp.meta["name"] = new_model_name
    # Write the pipeline, then load it back from the same directory.
    nlp.to_disk(output_dir)
    nlp.from_disk(output_dir)