Example no. 1
def test_kb_to_bytes():
    # Test that the KB's to_bytes method works correctly
    nlp = English()
    kb_1 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    kb_1.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
    kb_1.add_entity(entity="Q66", freq=9, entity_vector=[1, 2, 3])
    kb_1.add_alias(alias="Russ Cochran",
                   entities=["Q2146908"],
                   probabilities=[0.8])
    kb_1.add_alias(alias="Boeing", entities=["Q66"], probabilities=[0.5])
    kb_1.add_alias(alias="Randomness",
                   entities=["Q66", "Q2146908"],
                   probabilities=[0.1, 0.2])
    assert kb_1.contains_alias("Russ Cochran")
    kb_bytes = kb_1.to_bytes()
    kb_2 = KnowledgeBase(nlp.vocab, entity_vector_length=3)
    assert not kb_2.contains_alias("Russ Cochran")
    kb_2 = kb_2.from_bytes(kb_bytes)
    # check that both KBs are exactly the same
    assert kb_1.get_size_entities() == kb_2.get_size_entities()
    assert kb_1.entity_vector_length == kb_2.entity_vector_length
    assert kb_1.get_entity_strings() == kb_2.get_entity_strings()
    assert kb_1.get_vector("Q2146908") == kb_2.get_vector("Q2146908")
    assert kb_1.get_vector("Q66") == kb_2.get_vector("Q66")
    assert kb_2.contains_alias("Russ Cochran")
    assert kb_1.get_size_aliases() == kb_2.get_size_aliases()
    assert kb_1.get_alias_strings() == kb_2.get_alias_strings()
    assert len(kb_1.get_alias_candidates("Russ Cochran")) == len(
        kb_2.get_alias_candidates("Russ Cochran"))
    assert len(kb_1.get_alias_candidates("Randomness")) == len(
        kb_2.get_alias_candidates("Randomness"))
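
A minimal follow-up sketch (not part of the original test), assuming spaCy v3's Candidate objects expose alias_, entity_ and prior_prob, to show what the candidate lists compared above actually contain:

# Hypothetical inspection of the deserialized KB from the test above;
# get_alias_candidates returns Candidate objects in spaCy v3.x.
for candidate in kb_2.get_alias_candidates("Randomness"):
    print(candidate.alias_, candidate.entity_, candidate.prior_prob)
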
Example no. 2
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # adding entities
    mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3])
    mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0])
    mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5])

    # adding aliases
    mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2])
    mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])

    # test the size of the corresponding KB
    assert mykb.get_size_entities() == 3
    assert mykb.get_size_aliases() == 2

    # test retrieval of the entity vectors
    assert mykb.get_vector("Q1") == [8, 4, 3]
    assert mykb.get_vector("Q2") == [2, 1, 0]
    assert mykb.get_vector("Q3") == [-1, -6, 5]

    # test retrieval of prior probabilities
    assert_almost_equal(mykb.get_prior_prob(entity="Q2", alias="douglas"), 0.8)
    assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglas"), 0.2)
    assert_almost_equal(mykb.get_prior_prob(entity="Q342", alias="douglas"), 0.0)
    assert_almost_equal(mykb.get_prior_prob(entity="Q3", alias="douglassssss"), 0.0)
Example no. 3
def read_nlp_kb(model_dir, kb_file):
    nlp = spacy.load(model_dir)
    kb = KnowledgeBase(vocab=nlp.vocab)
    kb.load_bulk(kb_file)
    logger.info("kb entities: {}".format(kb.get_size_entities()))
    logger.info("kb aliases: {}".format(kb.get_size_aliases()))
    return nlp, kb
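
load_bulk is the spaCy v2 loader used in this example. A rough spaCy v3 equivalent, assuming the KB was written with kb.to_disk(kb_file) and that its entity-vector length is known (64 here is only a placeholder), might look like this:

import spacy
from spacy.kb import KnowledgeBase

def read_nlp_kb_v3(model_dir, kb_file, entity_vector_length=64):
    # Hypothetical v3 variant: from_disk replaces load_bulk, and
    # entity_vector_length must match the KB that was serialized.
    nlp = spacy.load(model_dir)
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
    kb.from_disk(kb_file)
    return nlp, kb
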
Example no. 4
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab, entity_vector_length=1)

    # adding entities
    entity_0 = "Q1004791_Douglas"
    print("adding entity", entity_0)
    kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0])

    entity_1 = "Q42_Douglas_Adams"
    print("adding entity", entity_1)
    kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1])

    entity_2 = "Q5301561_Douglas_Haig"
    print("adding entity", entity_2)
    kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2])

    # adding aliases
    print()
    alias_0 = "Douglas"
    print("adding alias", alias_0)
    kb.add_alias(alias=alias_0,
                 entities=[entity_0, entity_1, entity_2],
                 probabilities=[0.6, 0.1, 0.2])

    alias_1 = "Douglas Adams"
    print("adding alias", alias_1)
    kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb
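
A factory with this signature (a vocab in, a KnowledgeBase out) matches what spaCy v3's EntityLinker expects from set_kb. The sketch below is an assumption about the v3 API rather than part of this example; it shows how create_kb could be wired into a pipeline:

import spacy

# Hypothetical wiring, assuming spaCy v3.x: the entity_linker pipe calls
# the factory with nlp.vocab to build its knowledge base before training.
nlp = spacy.load("en_core_web_sm")
entity_linker = nlp.add_pipe("entity_linker", last=True)
entity_linker.set_kb(create_kb)
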
Example no. 5
def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab)

    # adding entities
    entity_0 = "Q1004791_Douglas"
    print("adding entity", entity_0)
    kb.add_entity(entity=entity_0, prob=0.5)

    entity_1 = "Q42_Douglas_Adams"
    print("adding entity", entity_1)
    kb.add_entity(entity=entity_1, prob=0.5)

    entity_2 = "Q5301561_Douglas_Haig"
    print("adding entity", entity_2)
    kb.add_entity(entity=entity_2, prob=0.5)

    # adding aliases
    print()
    alias_0 = "Douglas"
    print("adding alias", alias_0)
    kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2])

    alias_1 = "Douglas Adams"
    print("adding alias", alias_1)
    kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9])

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb
Example no. 6
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab)

    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2')
    mykb.add_entity(entity=u'Q3', prob=0.5)

    # adding aliases
    mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # test the size of the corresponding KB
    assert(mykb.get_size_entities() == 3)
    assert(mykb.get_size_aliases() == 2)
Example no. 7
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab)

    # adding entities
    mykb.add_entity(entity=u'Q1', prob=0.9)
    mykb.add_entity(entity=u'Q2')
    mykb.add_entity(entity=u'Q3', prob=0.5)

    # adding aliases
    mykb.add_alias(alias=u'douglas',
                   entities=[u'Q2', u'Q3'],
                   probabilities=[0.8, 0.2])
    mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9])

    # test the size of the corresponding KB
    assert (mykb.get_size_entities() == 3)
    assert (mykb.get_size_aliases() == 2)
Example no. 8
def test_kb_valid_entities(nlp):
    """Test the valid construction of a KB with 3 entities and two aliases"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

    # adding entities
    mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1])
    mykb.add_entity(entity='Q2', prob=0.5, entity_vector=[2])
    mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3])

    # adding aliases
    mykb.add_alias(alias='douglas',
                   entities=['Q2', 'Q3'],
                   probabilities=[0.8, 0.2])
    mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9])

    # test the size of the corresponding KB
    assert (mykb.get_size_entities() == 3)
    assert (mykb.get_size_aliases() == 2)
Example no. 9
def test_issue6730(en_vocab):
    """Ensure that the KB does not accept empty strings, but otherwise IO works fine."""
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(en_vocab, entity_vector_length=3)
    kb.add_entity(entity="1", freq=148, entity_vector=[1, 2, 3])

    with pytest.raises(ValueError):
        kb.add_alias(alias="", entities=["1"], probabilities=[0.4])
    assert kb.contains_alias("") is False

    kb.add_alias(alias="x", entities=["1"], probabilities=[0.2])
    kb.add_alias(alias="y", entities=["1"], probabilities=[0.1])

    with make_tempdir() as tmp_dir:
        kb.to_disk(tmp_dir)
        kb.from_disk(tmp_dir)
    assert kb.get_size_aliases() == 2
    assert set(kb.get_alias_strings()) == {"x", "y"}
Example no. 10
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load("en_core_web_sm")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
# adding entities
kb.add_entity(entity="Q1004791", freq=6, entity_vector=[0, 3, 5])
kb.add_entity(entity="Q42", freq=342, entity_vector=[1, 9, -3])
kb.add_entity(entity="Q5301561", freq=12, entity_vector=[-2, 4, 2])
​
# adding aliases
kb.add_alias(alias="Douglas", entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.6, 0.1, 0.2])
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
​
print()
print("Number of entities in KB:",kb.get_size_entities()) # 3
print("Number of aliases in KB:", kb.get_size_aliases()) # 2



Example no. 11
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_input,
    entity_descr_path,
    count_input,
    prior_prob_input,
    entity_vector_length,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab,
                       entity_vector_length=entity_vector_length)

    # read the mappings from file
    title_to_id = get_entity_to_id(entity_def_input)
    id_to_descr = get_id_to_description(entity_descr_path)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        logger.info("Loaded pretrained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pretrained word vectors, "
            " cf. https://spacy.io/usage/models#languages.")

    logger.info("Get entity frequencies")
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    logger.info("Filtering entities with fewer than {} mentions".format(
        min_entity_freq))
    # filter the entities for the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
        title_to_id, id_to_descr, entity_frequencies, min_entity_freq)
    logger.info("Left with {} entities".format(len(description_list)))

    logger.info("Train entity encoder")
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    logger.info("Get entity embeddings:")
    embeddings = encoder.apply_encoder(description_list)

    logger.info("Adding {} entities".format(len(entity_list)))
    kb.set_entities(entity_list=entity_list,
                    freq_list=frequency_list,
                    vector_list=embeddings)

    logger.info("Adding aliases")
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    logger.info("KB size: {} entities, {} aliases".format(
        kb.get_size_entities(), kb.get_size_aliases()))

    logger.info("Done with kb")
    return kb
Example no. 12
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH)

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    read_raw_data = True

    if read_raw_data:
        print()
        print(" * _read_wikidata_entities", datetime.datetime.now())
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wikidata_input)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(entity_def_output, entity_descr_output,
                            title_to_id, id_to_descr)

    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(" * _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities for the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity

    print(len(title_to_id.keys()), "original titles")
    print("kept", len(filtered_title_to_id.keys()), " with frequency",
          min_entity_freq)

    print()
    print(" * train entity encoder", datetime.datetime.now())
    print()
    encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(" * get entity embeddings", datetime.datetime.now())
    print()
    embeddings = encoder.apply_encoder(description_list)

    print()
    print(" * adding", len(entity_list), "entities", datetime.datetime.now())
    kb.set_entities(entity_list=entity_list,
                    freq_list=frequency_list,
                    vector_list=embeddings)

    print()
    print(" * adding aliases", datetime.datetime.now())
    print()
    _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    print("done with kb", datetime.datetime.now())
    return kb
Example no. 13
def run_pipeline():
    # set the appropriate booleans to define which parts of the pipeline should be (re)run
    print("START", datetime.datetime.now())
    print()
    nlp_1 = spacy.load('en_core_web_lg')
    nlp_2 = None
    kb_2 = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    # create training dataset
    create_wp_training = False

    # train the EL pipe
    train_pipe = True
    measure_performance = True

    # test the EL pipe on a simple example
    to_test_pipeline = True

    # write the NLP object, read back in and test again
    to_write_nlp = True
    to_read_nlp = True
    test_from_file = False

    # STEP 1 : create prior probabilities from WP (run only once)
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP,
                                      prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP (run only once)
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB,
                               count_output=ENTITY_COUNTS,
                               to_print=False)
        print()

    # STEP 3 : create KB and write to file (run only once)
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        kb_1 = kb_creator.create_kb(nlp_1,
                                    max_entities_per_alias=MAX_CANDIDATES,
                                    min_entity_freq=MIN_ENTITY_FREQ,
                                    min_occ=MIN_PAIR_OCC,
                                    entity_def_output=ENTITY_DEFS,
                                    entity_descr_output=ENTITY_DESCR,
                                    count_input=ENTITY_COUNTS,
                                    prior_prob_input=PRIOR_PROB,
                                    wikidata_input=WIKIDATA_JSON)
        print("kb entities:", kb_1.get_size_entities())
        print("kb aliases:", kb_1.get_size_aliases())
        print()

        print("STEP 3b: write KB and NLP", datetime.datetime.now())
        kb_1.dump(KB_FILE)
        nlp_1.to_disk(NLP_1_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        nlp_2 = spacy.load(NLP_1_DIR)
        kb_2 = KnowledgeBase(vocab=nlp_2.vocab,
                             entity_vector_length=DESC_WIDTH)
        kb_2.load_bulk(KB_FILE)
        print("kb entities:", kb_2.get_size_entities())
        print("kb aliases:", kb_2.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            check_kb(kb_2)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP,
                                             entity_def_input=ENTITY_DEFS,
                                             training_output=TRAINING_DIR)

    # STEP 6: create and train the entity linking pipe
    if train_pipe:
        print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
        print(" -analysing", len(type_to_int), "different entity types")
        el_pipe = nlp_2.create_pipe(name='entity_linker',
                                    config={
                                        "context_width": CONTEXT_WIDTH,
                                        "pretrained_vectors":
                                        nlp_2.vocab.vectors.name,
                                        "type_to_int": type_to_int
                                    })
        el_pipe.set_kb(kb_2)
        nlp_2.add_pipe(el_pipe, last=True)

        other_pipes = [
            pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"
        ]
        with nlp_2.disable_pipes(*other_pipes):  # only train Entity Linking
            optimizer = nlp_2.begin_training()
            optimizer.learn_rate = LEARN_RATE
            optimizer.L2 = L2

        # define the size (nr of entities) of training and dev set
        train_limit = 5000
        dev_limit = 5000

        train_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, limit=train_limit)

        print("Training on", len(train_data), "articles")
        print()

        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit)

        print("Dev testing on", len(dev_data), "articles")
        print()

        if not train_data:
            print("Did not find any training data")
        else:
            for itn in range(EPOCHS):
                random.shuffle(train_data)
                losses = {}
                batches = minibatch(train_data,
                                    size=compounding(4.0, 128.0, 1.001))
                batchnr = 0

                with nlp_2.disable_pipes(*other_pipes):
                    for batch in batches:
                        try:
                            docs, golds = zip(*batch)
                            nlp_2.update(
                                docs,
                                golds,
                                sgd=optimizer,
                                drop=DROPOUT,
                                losses=losses,
                            )
                            batchnr += 1
                        except Exception as e:
                            print("Error updating batch:", e)

                if batchnr > 0:
                    el_pipe.cfg["context_weight"] = 1
                    el_pipe.cfg["prior_weight"] = 1
                    dev_acc_context, dev_acc_context_dict = _measure_accuracy(
                        dev_data, el_pipe)
                    losses['entity_linker'] = losses['entity_linker'] / batchnr
                    print("Epoch, train loss", itn,
                          round(losses['entity_linker'], 2), " / dev acc avg",
                          round(dev_acc_context, 3))

        # STEP 7: measure the performance of our trained pipe on an independent dev set
        if len(dev_data) and measure_performance:
            print()
            print("STEP 7: performance measurement of Entity Linking pipe",
                  datetime.datetime.now())
            print()

            counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(
                dev_data, kb_2)
            print("dev counts:", sorted(counts.items(), key=lambda x: x[0]))
            print("dev acc oracle:", round(acc_o, 3),
                  [(x, round(y, 3)) for x, y in acc_o_label.items()])
            print("dev acc random:", round(acc_r, 3),
                  [(x, round(y, 3)) for x, y in acc_r_label.items()])
            print("dev acc prior:", round(acc_p, 3),
                  [(x, round(y, 3)) for x, y in acc_p_label.items()])

            # using only context
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 0
            dev_acc_context, dev_acc_context_dict = _measure_accuracy(
                dev_data, el_pipe)
            print("dev acc context avg:", round(dev_acc_context, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()])

            # measuring combined accuracy (prior + context)
            el_pipe.cfg["context_weight"] = 1
            el_pipe.cfg["prior_weight"] = 1
            dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(
                dev_data, el_pipe, error_analysis=False)
            print("dev acc combo avg:", round(dev_acc_combo, 3),
                  [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

        # STEP 8: apply the EL pipe on a toy example
        if to_test_pipeline:
            print()
            print("STEP 8: applying Entity Linking to toy example",
                  datetime.datetime.now())
            print()
            run_el_toy_example(nlp=nlp_2)

        # STEP 9: write the NLP pipeline (including entity linker) to file
        if to_write_nlp:
            print()
            print("STEP 9: testing NLP IO", datetime.datetime.now())
            print()
            print("writing to", NLP_2_DIR)
            nlp_2.to_disk(NLP_2_DIR)
            print()

    # verify that the IO has gone correctly
    if to_read_nlp:
        print("reading from", NLP_2_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)

        print("running toy example with NLP 3")
        run_el_toy_example(nlp=nlp_3)

    # testing performance with an NLP model from file
    if test_from_file:
        nlp_2 = spacy.load(NLP_1_DIR)
        nlp_3 = spacy.load(NLP_2_DIR)
        el_pipe = nlp_3.get_pipe("entity_linker")

        dev_limit = 5000
        dev_data = training_set_creator.read_training(
            nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit)

        print("Dev testing from file on", len(dev_data), "articles")
        print()

        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(
            dev_data, el_pipe=el_pipe, error_analysis=False)
        print("dev acc combo avg:", round(dev_acc_combo, 3),
              [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])

    print()
    print("STOP", datetime.datetime.now())
Example no. 14
def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
    entity_vector_length,
    limit=None,
    read_raw_data=True,
):
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab,
                       entity_vector_length=entity_vector_length)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        print("Loaded pre-trained vectors of size %s" % input_dim)
    else:
        raise ValueError(
            "The `nlp` object should have access to pre-trained word vectors, "
            " cf. https://spacy.io/usage/models#languages.")

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    if read_raw_data:
        print()
        print(now(), " * read wikidata entities:")
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wikidata_input, limit=limit)

        # write the title-ID and ID-description mappings to file
        _write_entity_files(entity_def_output, entity_descr_output,
                            title_to_id, id_to_descr)

    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(now(), " *  get entity frequencies:")
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities for the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity

    print(len(title_to_id.keys()), "original titles")
    kept_nr = len(filtered_title_to_id.keys())
    print("kept", kept_nr, "entities with min. frequency", min_entity_freq)

    print()
    print(now(), " * train entity encoder:")
    print()
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(now(), " * get entity embeddings:")
    print()
    embeddings = encoder.apply_encoder(description_list)

    print(now(), " * adding", len(entity_list), "entities")
    kb.set_entities(entity_list=entity_list,
                    freq_list=frequency_list,
                    vector_list=embeddings)

    alias_cnt = _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )
    print()
    print(now(), " * adding", alias_cnt, "aliases")
    print()

    print()
    print("# of entities in kb:", kb.get_size_entities())
    print("# of aliases in kb:", kb.get_size_aliases())

    print(now(), "Done with kb")
    return kb