def create_kb():
    """Step 1: create the Knowledge Base in spaCy and write it to file."""
    nlp = spacy.load("en_core_web_lg")
    # load_entities() and output_dir are defined elsewhere in this script
    name_dict, desc_dict = load_entities()

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)

    for qid, desc in desc_dict.items():
        desc_doc = nlp(desc)
        desc_enc = desc_doc.vector
        kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)  # 342 is an arbitrary value here

    for qid, name in name_dict.items():
        kb.add_alias(alias=name, entities=[qid], probabilities=[1])  # 100% prior probability P(entity|alias)

    qids = name_dict.keys()
    probs = [0.3 for qid in qids]
    kb.add_alias(alias="Emerson", entities=qids, probabilities=probs)  # sum(probs) should be <= 1 !

    print(f"Entities in the KB: {kb.get_entity_strings()}")
    print(f"Aliases in the KB: {kb.get_alias_strings()}")
    print()

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    kb.dump(output_dir / "my_kb")
    nlp.to_disk(output_dir / "my_nlp")
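
# A minimal sketch (not part of the original script) of reading the artifacts written by
# create_kb() back in, following the same load_bulk() pattern used in the snippets below.
# The paths "my_kb"/"my_nlp" and the vector length 300 come from create_kb(); the function
# name load_kb_sketch is an illustrative assumption.
def load_kb_sketch(output_dir):
    nlp = spacy.load(output_dir / "my_nlp")
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=300)
    kb.load_bulk(output_dir / "my_kb")
    print(f"Entities in the reloaded KB: {kb.get_entity_strings()}")
    return nlp, kb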
def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
    assert kb.get_size_entities() == 1

    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))

        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))

    assert kb2.get_size_entities() == 1
def test_save_and_load_knowledge_base():
    nlp = Language()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        path = d / "kb"
        try:
            kb.dump(path)
        except Exception as e:
            pytest.fail(str(e))

        try:
            kb_loaded = KnowledgeBase(nlp.vocab, entity_vector_length=1)
            kb_loaded.load_bulk(path)
        except Exception as e:
            pytest.fail(str(e))
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create blank Language class with specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only storing the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # You can change the dimension of vectors in your KB by using an encoder that
    # changes the dimensionality. For simplicity, we'll just use the original vector
    # dimension here instead.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)
def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)

        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()
def settingup_knowledgebase(self, names, train_data_2):
    QID = names['QID'].values.tolist()
    Names = names['Names'].values.tolist()
    Frequency = names['Frequency'].values.tolist()

    # encode each entity description with the custom NER model's vectors
    descript = []
    for desc in names['Description']:
        descript.append(self.custom_ner_model(desc).vector)

    print("Setting up entities \n")
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab, entity_vector_length=96)
    kb.set_entities(entity_list=QID, freq_list=Frequency, vector_list=descript)

    print("Setting up Alias \n")
    print("\n")
    print("Spacy Pipeline \n")
    print(self.custom_ner_model.pipe_names)

    # kb_dump_file = str(input("Enter the KB Dump name: "))
    # kb_vocab_folder = str(input("Enter the KB Vocab name: "))
    folder.nel_kb_vocab()  # helper defined elsewhere; prepares the output folders

    # add one alias per entity with a prior probability of 1.0
    alias_prep = list(zip(Names, QID))
    for i, j in alias_prep:
        names_alias = str(i)
        list_qid = [j]
        prob = [1.0]
        kb.add_alias(alias=names_alias, entities=list_qid, probabilities=prob)

    kb.dump("KB_Dump")
    kb.vocab.to_disk("KB_Vocab")
    print("\n")
    print("Knowledge base dump and vocab are stored on the local disk")

    # build the training data in spaCy's entity-linking format
    train_data_dict_2 = train_data_2.to_dict('records')
    dataset_2 = []
    for data in train_data_dict_2:
        Text = data['Text']
        Name = data['Name']
        QID = data['QID']
        offset = (data["Start"], data["End"])
        links_dict = {QID: 1.0}
        dataset_2.append((Text, {"links": {offset: links_dict}}))

    # always reload the knowledge base with the same vocab it was created with
    self.custom_ner_model.vocab.from_disk("KB_Vocab")
    self.custom_ner_model.vocab.vectors.name = "spacy_pretrained_vectors"
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab)
    kb.load_bulk("KB_Dump")

    TRAIN_DOCS = []
    for text, annotation in dataset_2:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = self.custom_ner_model(text)
        TRAIN_DOCS.append((doc, annotation))

    print("\n")
    print("Training started for Named Entity Linking \n")
    entity_linker = self.custom_ner_model.create_pipe(
        "entity_linker", config={"incl_prior": False}
    )
    entity_linker.set_kb(kb)
    self.custom_ner_model.add_pipe(entity_linker, last=True)

    other_pipes = [
        pipe for pipe in self.custom_ner_model.pipe_names if pipe != "entity_linker"
    ]
    with self.custom_ner_model.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = self.custom_ner_model.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train
            random.shuffle(TRAIN_DOCS)
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))  # increasing batch sizes
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                self.custom_ner_model.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
        print(itn, "Losses", losses)  # final losses

    print("\n")
    print("Spacy Pipeline \n")
    print(self.custom_ner_model.pipe_names)

    ner_dump_name = str(input("Enter the Model name: "))
    self.custom_ner_model.to_disk(ner_dump_name)
    return self.custom_ner_model
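
# A hedged usage sketch (not part of the class above): once settingup_knowledgebase() has
# returned the trained pipeline, the entity_linker writes its prediction into ent.kb_id_.
# `trainer`, `nel_model` and the sample sentence are illustrative assumptions.
nel_model = trainer.settingup_knowledgebase(names, train_data_2)
doc = nel_model("Some sentence mentioning one of the linked entities.")
for ent in doc.ents:
    print(ent.text, ent.label_, ent.kb_id_)  # kb_id_ holds the predicted QID, or "NIL" if unlinked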
entity_labels = list(set(entity_labels))
entities = name_dict.values()
ent2id = {v: k for k, v in name_dict.items()}

print("Testing candidate generation")
print(entity_labels[0], cand_gen(entity_labels[0], entities, ent2id))

# adding (fuzzy matching) candidates into the KB
aliases = {}
words = []
for flabel in entity_labels:
    name = flabel
    qids, probs = cand_gen(flabel, entities, ent2id)
    if len(probs) == 1 and probs[0] == 1.0:
        continue
    kb.add_alias(alias=flabel, entities=qids, probabilities=probs)  # sum(probs) should be <= 1 !

print(
    f"Candidates for 'hyperlipidemia': {[c.entity_ for c in kb.get_candidates('hyperlipidemia')]}"
)

# change the directory and file names to whatever you like
output_dir = Path.cwd() / "output"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
kb.dump(output_dir / "my_kb")
nlp.to_disk(output_dir / "my_nlp")
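
# A small illustrative check, assuming the KB built above is still in memory: each Candidate
# returned by get_candidates() also exposes its prior probability and entity frequency, which
# can help when debugging the fuzzy-matching aliases added in the loop above.
for cand in kb.get_candidates("hyperlipidemia"):
    print(cand.entity_, cand.prior_prob, cand.entity_freq)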