def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    doc = nlp("douglas adam Adam shrubbery")
    douglas_ent = doc[0:1]
    adam_ent = doc[1:2]
    Adam_ent = doc[2:3]
    shrubbery_ent = doc[3:4]

    # adding entities
    mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the relevant candidates
    assert len(get_candidates(mykb, douglas_ent)) == 2
    assert len(get_candidates(mykb, adam_ent)) == 1
    assert len(get_candidates(mykb, Adam_ent)) == 0  # default case sensitive
    assert len(get_candidates(mykb, shrubbery_ent)) == 0

    # test the content of the candidates
    assert get_candidates(mykb, adam_ent)[0].entity_ == "Q2"
    assert get_candidates(mykb, adam_ent)[0].alias_ == "adam"
    assert_almost_equal(get_candidates(mykb, adam_ent)[0].entity_freq, 12)
    assert_almost_equal(get_candidates(mykb, adam_ent)[0].prior_prob, 0.9)

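# The `Adam_ent` assertion above relies on alias lookup being case sensitive by default.
# A minimal sketch of a case-insensitive lookup, assuming the aliases were added to the KB
# in lowercase and that the v3 `get_alias_candidates` string lookup is available
# (hypothetical helper, not part of the test above):
def get_candidates_casefold(kb, span):
    return kb.get_alias_candidates(span.text.lower())
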
def entity_linker_manual(dataset, source, nlp_dir, kb_loc, entity_loc):
    # Load the NLP and KB objects from file
    nlp = spacy.load(nlp_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=1)
    kb.load_bulk(kb_loc)
    model = EntityRecognizer(nlp)

    # Read the pre-defined CSV file into dictionaries mapping QIDs to the full names and descriptions
    id_dict = dict()
    with entity_loc.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter=",")
        for row in csvreader:
            id_dict[row[0]] = (row[1], row[2])

    # Initialize the Prodigy stream by running the NER model
    stream = TXT(source)
    stream = [set_hashes(eg) for eg in stream]
    stream = (eg for score, eg in model(stream))

    # For each NER mention, add the candidates from the KB to the annotation task
    stream = _add_options(stream, kb, id_dict)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "config": {"choice_auto_accept": True},
    }

def create_kb():
    """Step 1: create the Knowledge Base in spaCy and write it to file"""
    nlp = spacy.load("en_core_web_lg")
    name_dict, desc_dict = load_entities()

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)  # 342 is an arbitrary value here

    for qid, name in name_dict.items():
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])  # 100% prior probability P(entity|alias)

    qids = name_dict.keys()
    probs = [0.3 for qid in qids]
    kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)  # sum(probs) should not exceed 1!

    print(f"Entities in the KB: {kb.get_entity_strings()}")
    print(f"Aliases in the KB: {kb.get_alias_strings()}")
    print()

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    kb.dump(output_dir / "my_kb")
    nlp.to_disk(output_dir / "my_nlp")

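# A minimal follow-up sketch (not part of the original step): re-loading the dumped KB and
# pipeline for a later step. It assumes the same `output_dir` as above and uses the
# v2-style `load_bulk` API that the other snippets in this collection rely on.
def load_kb_sketch():
    nlp = spacy.load(output_dir / "my_nlp")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)
    kb.load_bulk(output_dir / "my_kb")
    print(f"Aliases in the reloaded KB: {kb.get_alias_strings()}")
    return nlp, kb
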
def __init__(self, kb_folder, vectors_loc, lang='sv', stz=True, vectors_name='fasttext'):
    self.nlp = create_model(vectors_loc=vectors_loc, lang=lang, stz=stz,
                            vectors_name=vectors_name, max_items=1000)
    self.kb = KnowledgeBase(vocab=self.nlp.vocab)
    print(kb_folder)
    self.kb.load_bulk(kb_folder)
    print()
    _print_kb(self.kb)

def create_kb(vocab): # create artificial KB mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q613241", freq=12, entity_vector=[6, -4, 3]) mykb.add_alias("Kirby", ["Q613241"], [0.9]) # Placeholder mykb.add_entity(entity="pink", freq=12, entity_vector=[7, 2, -5]) mykb.add_alias("pink", ["pink"], [0.9]) return mykb
def load(self, output_dir): kb_path = os.path.join(output_dir, "kb") vocab_path = os.path.join(output_dir, "vocab") print("Loading vocab from", vocab_path) print("Loading KB from", kb_path) vocab = Vocab().from_disk(vocab_path) kb = KnowledgeBase(vocab=vocab) kb.load_bulk(kb_path) self.kb = kb return self.kb
def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) mykb.add_entity(entity="Q3", freq=25, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.4])
def get_kb(self):
    """:returns: self.kb, reading it from file if not loaded"""
    if not self.kb:
        print("Loading vocabulary...")
        vocab = Vocab().from_disk(self.spacy_nlp_vocab_dir)

        print("Loading KB...")
        self.kb = KnowledgeBase(vocab=vocab)
        self.kb.load_bulk(self.spacy_kb_file)
        print("KB loaded!")

    return self.kb

def test_kb_serialization():
    # Test that the KB can be used in a pipeline with a different vocab
    vector_length = 3
    with make_tempdir() as tmp_dir:
        kb_dir = tmp_dir / "kb"
        nlp1 = English()
        assert "Q2146908" not in nlp1.vocab.strings
        mykb = KnowledgeBase(nlp1.vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
        assert "Q2146908" in nlp1.vocab.strings
        mykb.to_disk(kb_dir)

        nlp2 = English()
        assert "RandomWord" not in nlp2.vocab.strings
        nlp2.vocab.strings.add("RandomWord")
        assert "RandomWord" in nlp2.vocab.strings
        assert "Q2146908" not in nlp2.vocab.strings

        # Create the Entity Linker component with the KB from file, and check the final vocab
        entity_linker = nlp2.add_pipe("entity_linker", last=True)
        entity_linker.set_kb(load_kb(kb_dir))
        assert "Q2146908" in nlp2.vocab.strings
        assert "RandomWord" in nlp2.vocab.strings

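# `load_kb` is not defined in the snippet above. In spaCy v3 this job is done by a small
# loader that takes a Vocab and returns the KB read from disk (spaCy registers one as
# "spacy.KBFromFile.v1"). A minimal hand-rolled sketch, assuming the KB was written with
# `to_disk` and has an entity vector length of 3 as above:
def load_kb(kb_dir):
    def kb_from_file(vocab):
        kb = KnowledgeBase(vocab, entity_vector_length=3)
        kb.from_disk(kb_dir)
        return kb
    return kb_from_file
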
def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) mykb.add_entity(entity="Q3", freq=25, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): mykb.add_alias(alias="douglas", entities=["Q2", "Q342"], probabilities=[0.8, 0.2])
def test_kb_invalid_combination(nlp): """Test the invalid construction of a KB with non-matching entity and probability lists""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) mykb.add_entity(entity="Q3", freq=25, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.3, 0.4, 0.1])
def create_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=1)
    kb.add_entity(entity="Q2", freq=12, entity_vector=[2])
    kb.add_entity(entity="Q3", freq=5, entity_vector=[3])
    kb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
    return kb

def load(self, output_dir): kb_path = os.path.join(output_dir, "kb") vocab_path = os.path.join(output_dir, "vocab") kb_info_path = os.path.join(output_dir, "kb_info.txt") print("Loading vocab from", vocab_path) print("Loading KB from", kb_path) print("Loading KB info from", kb_info_path) with open(kb_info_path, "r") as file: # The first line is the entity_vector_length entity_vector_length = int(file.readline().strip()) vocab = Vocab().from_disk(vocab_path) kb = KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length) kb.from_disk(kb_path) self.kb = kb return self.kb
def create_kb(vocab): # create artificial KB mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( alias="No. 8", entities=["Q270853"], probabilities=[1.0], ) mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3]) mykb.add_alias( alias="Mahler", entities=["Q7304"], probabilities=[1.0], ) return mykb
def create_kb(vocab): mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) # adding entities mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) # adding aliases mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6]) return mykb
def test_serialize_kb_disk(en_vocab):
    # baseline assertions
    kb1 = _get_dummy_kb(en_vocab)
    _check_kb(kb1)

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb1.to_disk(str(file_path))
        kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3)
        kb2.from_disk(str(file_path))

    # final assertions
    _check_kb(kb2)

def entity_linker():
    nlp = Language()
    nlp.add_pipe(nlp.create_pipe("entity_linker"))
    entity_linker = nlp.get_pipe("entity_linker")

    # need to add model for two reasons:
    # 1. no model leads to error in serialization,
    # 2. the affected line is the one for model serialization
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    entity_linker.set_kb(kb)
    entity_linker.begin_training(pipeline=nlp.pipeline)

    return entity_linker

def test_kb_pickle():
    # Test that the KB can be pickled
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    assert not kb_1.contains_alias("Russ Cochran")
    kb_1.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    assert kb_1.contains_alias("Russ Cochran")
    data = pickle.dumps(kb_1)
    kb_2 = pickle.loads(data)
    assert kb_2.contains_alias("Russ Cochran")

def read_nlp_kb(model_dir, kb_file):
    nlp = spacy.load(model_dir)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    logger.info("kb entities: {}".format(kb.get_size_entities()))
    logger.info("kb aliases: {}".format(kb.get_size_aliases()))
    return nlp, kb

def create_kb(vocab):
    kb = KnowledgeBase(vocab, entity_vector_length=3)
    kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
    return kb

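# How factories like `create_kb(vocab)` above are typically wired up in spaCy v3: the
# entity_linker's `set_kb` expects a callable that takes a Vocab and returns a
# KnowledgeBase, so the function itself is passed in. A minimal sketch, assuming a blank
# English pipeline:
nlp = English()
entity_linker = nlp.add_pipe("entity_linker", last=True)
entity_linker.set_kb(create_kb)  # the callable is invoked with nlp.vocab to build the KB
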
def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.4])
def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.4])
def test_kb_invalid_combination(nlp): """Test the invalid construction of a KB with non-matching entity and probability lists""" mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1])
def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q342'], probabilities=[0.8, 0.2])
def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:
            kb.dump(path)
        except Exception as e:
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.load_bulk(path)
        except Exception as e:
            pytest.fail(str(e))

def from_disk(self, path: Path, **kwargs):
    """Deserialize saved AnnLinker from disk.

    path (Path): directory to deserialize from
    RETURNS (AnnLinker): Initialized AnnLinker
    """
    path = util.ensure_path(path)

    kb = KnowledgeBase(self.nlp.vocab, 300)
    kb.load_bulk(path / "kb")
    self.set_kb(kb)

    cg = CandidateGenerator().from_disk(path)
    self.set_cg(cg)

    cfg = srsly.read_json(path / "cfg")
    self.threshold = cfg.get("threshold", 0.7)
    self.no_description_threshold = cfg.get("no_description_threshold", 0.95)
    self.disambiguate = cfg.get("disambiguate", True)

    return self

def test_preserving_links_asdoc(nlp):
    """Test that Span.as_doc preserves the existing entity links"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
    mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])

    # adding aliases
    mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7])
    mykb.add_alias(alias="Denver", entities=["Q2"], probabilities=[0.6])

    # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

    ruler = EntityRuler(nlp)
    patterns = [
        {"label": "GPE", "pattern": "Boston"},
        {"label": "GPE", "pattern": "Denver"},
    ]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    el_pipe = nlp.create_pipe(name="entity_linker")
    el_pipe.set_kb(mykb)
    el_pipe.begin_training()
    el_pipe.incl_context = False
    el_pipe.incl_prior = True
    nlp.add_pipe(el_pipe, last=True)

    # test whether the entity links are preserved by the `as_doc()` function
    text = "She lives in Boston. He lives in Denver."
    doc = nlp(text)
    for ent in doc.ents:
        orig_text = ent.text
        orig_kb_id = ent.kb_id_
        sent_doc = ent.sent.as_doc()
        for s_ent in sent_doc.ents:
            if s_ent.text == orig_text:
                assert s_ent.kb_id_ == orig_kb_id

def train(self, entities, list_aliases):
    """
    Args:
        entities: a dict mapping each entity to its description and its corpus frequency
        list_aliases: a list of dicts, one per alias, e.g.::

            [{
                'alias': 'Farrar',
                'entities': ['Q1', 'Q2'],
                'probabilities': [0.4, 0.6]
            }]

            The probabilities are prior probabilities P(entity|alias) and must not sum
            to more than 1 for a given alias.
    """
    try:
        nlp = spacy.load(self.kb_model)
    except IOError:
        subprocess.run(["python", "-m", "spacy", "download", self.kb_model])
        # pkg_resources need to be reloaded to pick up the newly installed models
        import pkg_resources
        import imp

        imp.reload(pkg_resources)
        nlp = spacy.load(self.kb_model)

    print("Loaded model '%s'" % self.kb_model)

    # set up the data
    entity_ids = []
    embeddings = []
    freqs = []
    for key, value in entities.items():
        desc, freq = value
        entity_ids.append(key)
        embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    self.entity_vector_length = len(embeddings[0])  # This is needed in loading a kb

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=self.entity_vector_length)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    for alias in list_aliases:
        kb.add_alias(
            alias=alias["alias"],
            entities=alias["entities"],
            probabilities=alias["probabilities"],
        )
    self.kb = kb
    return self.kb

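# A hypothetical call to the `train` method above (the builder instance, QIDs, descriptions,
# and numbers are illustrative only): each entity maps to a (description, frequency) pair,
# and each alias dict lists candidate entities with their prior probabilities.
entities = {
    "Q1": ("description of entity Q1", 10),
    "Q2": ("description of entity Q2", 5),
}
list_aliases = [
    {"alias": "Farrar", "entities": ["Q1", "Q2"], "probabilities": [0.4, 0.6]},
]
kb = kb_builder.train(entities, list_aliases)
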
def test_kb_invalid_entity_vector(nlp):
    """Test the invalid construction of a KB with non-matching entity vector lengths"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # adding entities
    mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3])

    # this should fail because the kb's expected entity vector length is 3
    with pytest.raises(ValueError):
        mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2])

def create_kb(vocab): mykb = KnowledgeBase(vocab, entity_vector_length=vector_length) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_alias( alias="Russ Cochran", entities=["Q2146908", "Q7381115"], probabilities=[0.5, 0.5], ) return mykb
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab)

    # adding entities
    entity_0 = "Q1004791_Douglas"
    print("adding entity", entity_0)
    kb.add_entity(entity=entity_0, prob=0.5)

    entity_1 = "Q42_Douglas_Adams"
    print("adding entity", entity_1)
    kb.add_entity(entity=entity_1, prob=0.5)

    entity_2 = "Q5301561_Douglas_Haig"
    print("adding entity", entity_2)
    kb.add_entity(entity=entity_2, prob=0.5)

    # adding aliases
    print()
    alias_0 = "Douglas"
    print("adding alias", alias_0)
    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2])

    alias_1 = "Douglas Adams"
    print("adding alias", alias_1)
    kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb

def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab)

    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2')
    mykb.add_entity(entity=u'Q3', prob=0.5)

    # adding aliases
    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # test the size of the corresponding KB
    assert mykb.get_size_entities() == 3
    assert mykb.get_size_aliases() == 2

def test_candidate_generation(nlp): """Test correct candidate generation""" mykb = KnowledgeBase(nlp.vocab) # adding entities mykb.add_entity(entity=u'Q1', prob=0.9) mykb.add_entity(entity=u'Q2', prob=0.2) mykb.add_entity(entity=u'Q3', prob=0.5) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9]) # test the size of the relevant candidates assert(len(mykb.get_candidates(u'douglas')) == 2) assert(len(mykb.get_candidates(u'adam')) == 1) assert(len(mykb.get_candidates(u'shrubbery')) == 0)