def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(
        entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
    )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1

def train(self, entities, list_aliases):
    """
    Args:
        entities: a dict of each entity, its description and its corpus frequency
        list_aliases: a list of dicts for each entity, e.g.::

            [{
                'alias': 'Farrar',
                'entities': ['Q1', 'Q2'],
                'probabilities': [0.4, 0.6]
            }]

        probabilities are 'prior probabilities' and must not sum to more than 1
    """
    try:
        nlp = spacy.load(self.kb_model)
    except IOError:
        subprocess.run(["python", "-m", "spacy", "download", self.kb_model])
        # pkg_resources needs to be reloaded to pick up the newly installed models
        import pkg_resources
        import imp

        imp.reload(pkg_resources)
        nlp = spacy.load(self.kb_model)
    print("Loaded model '%s'" % self.kb_model)

    # set up the data
    entity_ids = []
    embeddings = []
    freqs = []
    for key, value in entities.items():
        desc, freq = value
        entity_ids.append(key)
        embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    self.entity_vector_length = len(embeddings[0])  # needed when loading the KB back in
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=self.entity_vector_length)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # add the aliases; the entities need to be defined in the KB beforehand
    for alias in list_aliases:
        kb.add_alias(
            alias=alias["alias"],
            entities=alias["entities"],
            probabilities=alias["probabilities"],
        )
    self.kb = kb
    return self.kb

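A minimal, hypothetical sketch of how this train() method might be called, with `entities` and `list_aliases` structured as the docstring describes; the `kb_builder` instance, QIDs, and descriptions below are illustrative placeholders, not part of the original code:

# Hypothetical usage of the train() method above (names and IDs are placeholders).
# `entities` maps each entity ID to a (description, corpus frequency) tuple,
# and each alias dict lists candidate entities with prior probabilities summing to at most 1.
entities = {
    "Q1": ("description of entity Q1", 32),
    "Q2": ("description of entity Q2", 111),
}
list_aliases = [
    {"alias": "Farrar", "entities": ["Q1", "Q2"], "probabilities": [0.4, 0.6]},
]
kb = kb_builder.train(entities, list_aliases)  # kb_builder: an instance of the enclosing class
print(kb.get_size_entities(), "entities in the KB")
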
def test_kb_serialize_2(nlp):
    v = [5, 6, 7, 8]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E1"], [1], [v])
    assert kb1.get_vector("E1") == v
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert kb2.get_vector("E1") == v

def test_kb_set_entities(nlp):
    """Test that set_entities entirely overwrites the previous set of entities"""
    v = [5, 6, 7, 8]
    v1 = [1, 1, 1, 0]
    v2 = [2, 2, 2, 3]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E0"], [1], [v])
    assert kb1.get_entity_strings() == ["E0"]
    kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
    assert set(kb1.get_entity_strings()) == {"E1", "E2"}
    assert kb1.get_vector("E1") == v1
    assert kb1.get_vector("E2") == v2
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert set(kb2.get_entity_strings()) == {"E1", "E2"}
        assert kb2.get_vector("E1") == v1
        assert kb2.get_vector("E2") == v2

def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create a blank Language class with the specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only store the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()
        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()

def train(self, entities, list_aliases):
    """
    Args:
        entities: a dict of each entity, its description and its corpus frequency
        list_aliases: a list of dicts for each entity, e.g.::

            [{
                'alias': 'Farrar',
                'entities': ['Q1', 'Q2'],
                'probabilities': [0.4, 0.6]
            }]

        probabilities are 'prior probabilities' and must not sum to more than 1
    """
    try:
        nlp = spacy.load(self.kb_model)
    except IOError:
        subprocess.run(["python", "-m", "spacy", "download", self.kb_model])
        # pkg_resources needs to be reloaded to pick up the newly installed models
        import pkg_resources
        import imp

        imp.reload(pkg_resources)
        nlp = spacy.load(self.kb_model)
    print("Loaded model '%s'" % self.kb_model)

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=self.desc_width)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in entities.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=self.input_dim,
        desc_width=self.desc_width,
        epochs=self.num_epochs,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # add the aliases; the entities need to be defined in the KB beforehand
    for alias in list_aliases:
        kb.add_alias(
            alias=alias["alias"],
            entities=alias["entities"],
            probabilities=alias["probabilities"],
        )
    self.kb = kb
    return self.kb

def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
    # For simplicity, we'll just use the original vector dimension here instead.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)
        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_input,
    entity_descr_path,
    count_input,
    prior_prob_input,
    entity_vector_length,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # read the mappings from file
    title_to_id = get_entity_to_id(entity_def_input)
    id_to_descr = get_id_to_description(entity_descr_path)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    logger.info("Get entity frequencies")
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
        title_to_id, id_to_descr, entity_frequencies, min_entity_freq
    )
    logger.info("Left with {} entities".format(len(description_list)))

    logger.info("Train entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Get entity embeddings:")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    logger.info("Adding aliases")
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    logger.info("KB size: {} entities, {} aliases".format(
        kb.get_size_entities(), kb.get_size_aliases()))

    logger.info("Done with kb")
    return kb

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH)

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    read_raw_data = True

    if read_raw_data:
        print()
        print(" * _read_wikidata_entities", datetime.datetime.now())
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(
            entity_def_output, entity_descr_output, title_to_id, id_to_descr
        )
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(" * _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    print(len(title_to_id.keys()), "original titles")
    print("kept", len(filtered_title_to_id.keys()), "with frequency", min_entity_freq)

    print()
    print(" * train entity encoder", datetime.datetime.now())
    print()
    encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(" * get entity embeddings", datetime.datetime.now())
    print()
    embeddings = encoder.apply_encoder(description_list)

    print()
    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    print()
    print(" * adding aliases", datetime.datetime.now())
    print()
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    print("done with kb", datetime.datetime.now())
    return kb

def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)
        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()

def settingup_knowledgebase(self, names, train_data_2):
    QID = names['QID'].values.tolist()
    Names = names['Names'].values.tolist()
    Frequency = names['Frequency'].values.tolist()

    descript = []
    for desc in names['Description']:
        descript.append(self.custom_ner_model(desc).vector)

    print("Setting up entities \n")
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab, entity_vector_length=96)
    kb.set_entities(entity_list=QID, freq_list=Frequency, vector_list=descript)

    print("Setting up Alias \n")
    print("\n")
    print("Spacy Pipeline \n")
    print(self.custom_ner_model.pipe_names)

    # kb_dump_file = str(input("Enter the KB Dump name: "))
    # kb_vocab_folder = str(input("Enter the KB Vocab name: "))
    folder.nel_kb_vocab()

    alias_prep = list(zip(Names, QID))
    folder.nel_kb_vocab()
    for i, j in alias_prep:
        names_alias = str(i)
        list_qid = []
        list_qid.append(j)
        prob = []
        prob.append(1.0)
        kb.add_alias(alias=names_alias, entities=list_qid, probabilities=prob)

    kb.dump("KB_Dump")
    kb.vocab.to_disk("KB_Vocab")
    print("\n")
    print("Knowledge base dump and vocab are stored on the local disk")

    train_data_dict_2 = train_data_2.to_dict('records')
    dataset_2 = []
    for data in train_data_dict_2:
        Text = data['Text']
        Name = data['Name']
        QID = data['QID']
        offset = (data["Start"], data["End"])
        links_dict = {QID: 1.0}
        dataset_2.append((Text, {"links": {offset: links_dict}}))

    self.custom_ner_model.vocab.from_disk("KB_Vocab")
    self.custom_ner_model.vocab.vectors.name = "spacy_pretrained_vectors"
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab)
    kb.load_bulk("KB_Dump")

    TRAIN_DOCS = []
    for text, annotation in dataset_2:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = self.custom_ner_model(text)
        TRAIN_DOCS.append((doc, annotation))

    print("\n")
    print("Training started for Named Entity Linking \n")
    entity_linker = self.custom_ner_model.create_pipe(
        "entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    self.custom_ner_model.add_pipe(entity_linker, last=True)

    other_pipes = [
        pipe for pipe in self.custom_ner_model.pipe_names if pipe != "entity_linker"
    ]
    with self.custom_ner_model.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = self.custom_ner_model.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train
            random.shuffle(TRAIN_DOCS)
            # increasing batch sizes
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                self.custom_ner_model.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
    print(itn, "Losses", losses)

    print("\n")
    print("Spacy Pipeline \n")
    print(self.custom_ner_model.pipe_names)

    ner_dump_name = str(input("Enter the Model name: "))
    self.custom_ner_model.to_disk(ner_dump_name)
    return self.custom_ner_model

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
    entity_vector_length,
    limit=None,
    read_raw_data=True,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        print("Loaded pre-trained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    if read_raw_data:
        print()
        print(now(), " * read wikidata entities:")
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wikidata_input, limit=limit
        )

        # write the title-ID and ID-description mappings to file
        _write_entity_files(
            entity_def_output, entity_descr_output, title_to_id, id_to_descr
        )
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(now(), " * get entity frequencies:")
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    print(len(title_to_id.keys()), "original titles")
    kept_nr = len(filtered_title_to_id.keys())
    print("kept", kept_nr, "entities with min. frequency", min_entity_freq)

    print()
    print(now(), " * train entity encoder:")
    print()
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(now(), " * get entity embeddings:")
    print()
    embeddings = encoder.apply_encoder(description_list)

    print(now(), " * adding", len(entity_list), "entities")
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    alias_cnt = _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )
    print()
    print(now(), " * adding", alias_cnt, "aliases")
    print()

    print()
    print("# of entities in kb:", kb.get_size_entities())
    print("# of aliases in kb:", kb.get_size_aliases())

    print(now(), "Done with kb")
    return kb