def test_issue4674():
    """Test that setting entities with overlapping identifiers does not mess up IO"""
    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    vector1 = [0.9, 1.1, 1.01]
    vector2 = [1.8, 2.25, 2.01]
    kb.set_entities(
        entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
    )
    assert kb.get_size_entities() == 1
    # dumping to file & loading back in
    with make_tempdir() as d:
        dir_path = ensure_path(d)
        if not dir_path.exists():
            dir_path.mkdir()
        file_path = dir_path / "kb"
        kb.dump(str(file_path))
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
        kb2.load_bulk(str(file_path))
        assert kb2.get_size_entities() == 1

def train(self, entities, list_aliases):
    """
    Args:
        entities: a dict of each entity, its description and its corpus frequency
        list_aliases: a list of dicts for each entity, e.g.::

            [{
                'alias': 'Farrar',
                'entities': ['Q1', 'Q2'],
                'probabilities': [0.4, 0.6]
            }]

        probabilities are 'prior probabilities' and must not sum to more than 1
    """
    try:
        nlp = spacy.load(self.kb_model)
    except IOError:
        subprocess.run(["python", "-m", "spacy", "download", self.kb_model])
        # pkg_resources needs to be reloaded to pick up the newly installed models
        import pkg_resources
        import imp

        imp.reload(pkg_resources)
        nlp = spacy.load(self.kb_model)
    print("Loaded model '%s'" % self.kb_model)

    # set up the data
    entity_ids = []
    embeddings = []
    freqs = []
    for key, value in entities.items():
        desc, freq = value
        entity_ids.append(key)
        embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    self.entity_vector_length = len(embeddings[0])  # needed when loading the KB back in
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=self.entity_vector_length)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # add the aliases; the entities need to be defined in the KB beforehand
    for alias in list_aliases:
        kb.add_alias(
            alias=alias["alias"],
            entities=alias["entities"],
            probabilities=alias["probabilities"],
        )
    self.kb = kb
    return self.kb

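A minimal, hypothetical sketch of how this train() method might be called, with `entities` and `list_aliases` structured as the docstring describes; the `kb_builder` instance, QIDs, and descriptions below are illustrative placeholders, not part of the original code:

# Hypothetical usage of the train() method above (names and IDs are placeholders).
# `entities` maps each entity ID to a (description, corpus frequency) tuple,
# and each alias dict lists candidate entities with prior probabilities summing to at most 1.
entities = {
    "Q1": ("description of entity Q1", 32),
    "Q2": ("description of entity Q2", 111),
}
list_aliases = [
    {"alias": "Farrar", "entities": ["Q1", "Q2"], "probabilities": [0.4, 0.6]},
]
kb = kb_builder.train(entities, list_aliases)  # kb_builder: an instance of the enclosing class
print(kb.get_size_entities(), "entities in the KB")
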
def test_kb_serialize_2(nlp):
    v = [5, 6, 7, 8]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E1"], [1], [v])
    assert kb1.get_vector("E1") == v
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert kb2.get_vector("E1") == v

def test_kb_set_entities(nlp):
    """Test that set_entities entirely overwrites the previous set of entities"""
    v = [5, 6, 7, 8]
    v1 = [1, 1, 1, 0]
    v2 = [2, 2, 2, 3]
    kb1 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
    kb1.set_entities(["E0"], [1], [v])
    assert kb1.get_entity_strings() == ["E0"]
    kb1.set_entities(["E1", "E2"], [1, 9], [v1, v2])
    assert set(kb1.get_entity_strings()) == {"E1", "E2"}
    assert kb1.get_vector("E1") == v1
    assert kb1.get_vector("E2") == v2
    with make_tempdir() as d:
        kb1.to_disk(d / "kb")
        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=4)
        kb2.from_disk(d / "kb")
        assert set(kb2.get_entity_strings()) == {"E1", "E2"}
        assert kb2.get_vector("E1") == v1
        assert kb2.get_vector("E2") == v2

def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
    if model is None and vocab_path is None:
        raise ValueError("Either the `nlp` model or the `vocab` should be specified.")

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        vocab = Vocab().from_disk(vocab_path)
        # create a blank Language class with the specified vocab
        nlp = spacy.blank("en", vocab=vocab)
        print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        # only store the vocab if we weren't already reading it from file
        if not vocab_path:
            vocab_path = output_dir / "vocab"
            kb.vocab.to_disk(vocab_path)
            print("Saved vocab to", vocab_path)

        print()
        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()

def train(self, entities, list_aliases):
    """
    Args:
        entities: a dict of each entity, its description and its corpus frequency
        list_aliases: a list of dicts for each entity, e.g.::

            [{
                'alias': 'Farrar',
                'entities': ['Q1', 'Q2'],
                'probabilities': [0.4, 0.6]
            }]

        probabilities are 'prior probabilities' and must not sum to more than 1
    """
    try:
        nlp = spacy.load(self.kb_model)
    except IOError:
        subprocess.run(["python", "-m", "spacy", "download", self.kb_model])
        # pkg_resources needs to be reloaded to pick up the newly installed models
        import pkg_resources
        import imp

        imp.reload(pkg_resources)
        nlp = spacy.load(self.kb_model)
    print("Loaded model '%s'" % self.kb_model)

    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=self.desc_width)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in entities.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=self.input_dim,
        desc_width=self.desc_width,
        epochs=self.num_epochs,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # add the aliases; the entities need to be defined in the KB beforehand
    for alias in list_aliases:
        kb.add_alias(
            alias=alias["alias"],
            entities=alias["entities"],
            probabilities=alias["probabilities"],
        )
    self.kb = kb
    return self.kb

def main(model=None, output_dir=None):
    """Load the model and create the KB with pre-defined entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # You can change the dimension of vectors in your KB by using an encoder that changes the dimensionality.
    # For simplicity, we'll just use the original vector dimension here instead.
    vectors_dim = nlp.vocab.vectors.shape[1]
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=vectors_dim)

    # set up the data
    entity_ids = []
    descr_embeddings = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descr_embeddings.append(nlp(desc).vector)
        freqs.append(freq)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=descr_embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)
        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        print()
        _print_kb(kb2)

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_input,
    entity_descr_path,
    count_input,
    prior_prob_input,
    entity_vector_length,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # read the mappings from file
    title_to_id = get_entity_to_id(entity_def_input)
    id_to_descr = get_id_to_description(entity_descr_path)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    logger.info("Get entity frequencies")
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
        title_to_id, id_to_descr, entity_frequencies, min_entity_freq
    )
    logger.info("Left with {} entities".format(len(description_list)))

    logger.info("Train entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Get entity embeddings:")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    logger.info("Adding aliases")
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    logger.info("KB size: {} entities, {} aliases".format(
        kb.get_size_entities(), kb.get_size_aliases()))

    logger.info("Done with kb")
    return kb

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH)

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    read_raw_data = True

    if read_raw_data:
        print()
        print(" * _read_wikidata_entities", datetime.datetime.now())
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(
            entity_def_output, entity_descr_output, title_to_id, id_to_descr
        )
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(" * _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    print(len(title_to_id.keys()), "original titles")
    print("kept", len(filtered_title_to_id.keys()), "with frequency", min_entity_freq)

    print()
    print(" * train entity encoder", datetime.datetime.now())
    print()
    encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(" * get entity embeddings", datetime.datetime.now())
    print()
    embeddings = encoder.apply_encoder(description_list)

    print()
    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    print()
    print(" * adding aliases", datetime.datetime.now())
    print()
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    print("done with kb", datetime.datetime.now())
    return kb

def main(model=None, output_dir=None, n_iter=50):
    """Load the model, create the KB and pretrain the entity encodings.
    If an output_dir is provided, the KB will be stored there in a file 'kb'.
    The updated vocab will also be written to a directory in the output_dir."""
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # check the length of the nlp vectors
    if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    kb = KnowledgeBase(vocab=nlp.vocab)

    # set up the data
    entity_ids = []
    descriptions = []
    freqs = []
    for key, value in ENTITIES.items():
        desc, freq = value
        entity_ids.append(key)
        descriptions.append(desc)
        freqs.append(freq)

    # training entity description encodings
    # this part can easily be replaced with a custom entity encoder
    encoder = EntityEncoder(
        nlp=nlp,
        input_dim=INPUT_DIM,
        desc_width=DESC_WIDTH,
        epochs=n_iter,
    )
    encoder.train(description_list=descriptions, to_print=True)

    # get the pretrained entity vectors
    embeddings = encoder.apply_encoder(descriptions)

    # set the entities, can also be done by calling `kb.add_entity` for each entity
    kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings)

    # adding aliases, the entities need to be defined in the KB beforehand
    kb.add_alias(
        alias="Russ Cochran",
        entities=["Q2146908", "Q7381115"],
        probabilities=[0.24, 0.7],  # the sum of these probabilities should not exceed 1
    )

    # test the trained model
    print()
    _print_kb(kb)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        kb_path = str(output_dir / "kb")
        kb.dump(kb_path)
        print()
        print("Saved KB to", kb_path)

        vocab_path = output_dir / "vocab"
        kb.vocab.to_disk(vocab_path)
        print("Saved vocab to", vocab_path)
        print()

        # test the saved model
        # always reload a knowledge base with the same vocab instance!
        print("Loading vocab from", vocab_path)
        print("Loading KB from", kb_path)
        vocab2 = Vocab().from_disk(vocab_path)
        kb2 = KnowledgeBase(vocab=vocab2)
        kb2.load_bulk(kb_path)
        _print_kb(kb2)
        print()

def settingup_knowledgebase(self, names, train_data_2):
    QID = names['QID'].values.tolist()
    Names = names['Names'].values.tolist()
    Frequency = names['Frequency'].values.tolist()

    descript = []
    for desc in names['Description']:
        descript.append(self.custom_ner_model(desc).vector)

    print("Setting up entities \n")
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab, entity_vector_length=96)
    kb.set_entities(entity_list=QID, freq_list=Frequency, vector_list=descript)

    print("Setting up Alias \n")
    print("\n")
    print("Spacy Pipeline \n")
    print(self.custom_ner_model.pipe_names)

    # kb_dump_file = str(input("Enter the KB Dump name: "))
    # kb_vocab_folder = str(input("Enter the KB Vocab name: "))
    folder.nel_kb_vocab()

    alias_prep = list(zip(Names, QID))
    folder.nel_kb_vocab()
    for i, j in alias_prep:
        names_alias = str(i)
        list_qid = []
        list_qid.append(j)
        prob = []
        prob.append(1.0)
        kb.add_alias(alias=names_alias, entities=list_qid, probabilities=prob)

    kb.dump("KB_Dump")
    kb.vocab.to_disk("KB_Vocab")
    print("\n")
    print("Knowledge base dump and vocab are stored on the local disk")

    train_data_dict_2 = train_data_2.to_dict('records')
    dataset_2 = []
    for data in train_data_dict_2:
        Text = data['Text']
        Name = data['Name']
        QID = data['QID']
        offset = (data["Start"], data["End"])
        links_dict = {QID: 1.0}
        dataset_2.append((Text, {"links": {offset: links_dict}}))

    self.custom_ner_model.vocab.from_disk("KB_Vocab")
    self.custom_ner_model.vocab.vectors.name = "spacy_pretrained_vectors"
    kb = KnowledgeBase(vocab=self.custom_ner_model.vocab)
    kb.load_bulk("KB_Dump")

    TRAIN_DOCS = []
    for text, annotation in dataset_2:
        # to make this more efficient, you can use nlp.pipe() just once for all the texts
        doc = self.custom_ner_model(text)
        TRAIN_DOCS.append((doc, annotation))

    print("\n")
    print("Training started for Named Entity Linking \n")
    entity_linker = self.custom_ner_model.create_pipe(
        "entity_linker", config={"incl_prior": False})
    entity_linker.set_kb(kb)
    self.custom_ner_model.add_pipe(entity_linker, last=True)

    other_pipes = [
        pipe for pipe in self.custom_ner_model.pipe_names if pipe != "entity_linker"
    ]
    with self.custom_ner_model.disable_pipes(*other_pipes):  # train only the entity_linker
        optimizer = self.custom_ner_model.begin_training()
        for itn in range(500):  # 500 iterations takes about a minute to train
            random.shuffle(TRAIN_DOCS)
            # increasing batch sizes
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                self.custom_ner_model.update(
                    texts,
                    annotations,
                    drop=0.2,  # prevent overfitting
                    losses=losses,
                    sgd=optimizer,
                )
            if itn % 50 == 0:
                print(itn, "Losses", losses)  # print the training loss
    print(itn, "Losses", losses)

    print("\n")
    print("Spacy Pipeline \n")
    print(self.custom_ner_model.pipe_names)

    ner_dump_name = str(input("Enter the Model name: "))
    self.custom_ner_model.to_disk(ner_dump_name)
    return self.custom_ner_model

def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
    entity_vector_length,
    limit=None,
    read_raw_data=True,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        print("Loaded pre-trained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            "cf. https://spacy.io/usage/models#languages."
        )

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    if read_raw_data:
        print()
        print(now(), " * read wikidata entities:")
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wikidata_input, limit=limit
        )

        # write the title-ID and ID-description mappings to file
        _write_entity_files(
            entity_def_output, entity_descr_output, title_to_id, id_to_descr
        )
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(now(), " * get entity frequencies:")
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    print(len(title_to_id.keys()), "original titles")
    kept_nr = len(filtered_title_to_id.keys())
    print("kept", kept_nr, "entities with min. frequency", min_entity_freq)

    print()
    print(now(), " * train entity encoder:")
    print()
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(now(), " * get entity embeddings:")
    print()
    embeddings = encoder.apply_encoder(description_list)

    print(now(), " * adding", len(entity_list), "entities")
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    alias_cnt = _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )
    print()
    print(now(), " * adding", alias_cnt, "aliases")
    print()

    print()
    print("# of entities in kb:", kb.get_size_entities())
    print("# of aliases in kb:", kb.get_size_aliases())

    print(now(), "Done with kb")
    return kb