Example #1
def build_coocc_classifier_dataset(out_file, dataset_dir, entity_db_file,
                                   **kwargs):
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)

    ret = coocc_classifier.build_dataset(dataset, entity_db, **kwargs)
    joblib.dump(ret, out_file)
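
All the examples on this page share one shape: a plain function that loads its file arguments into the project's DB/loader objects and forwards **kwargs to a worker module. These functions live in each project's cli.py and appear to be exposed as subcommands; here is a minimal sketch of that wiring, assuming click (the decorators, command name, and registration below are illustrative assumptions, not taken from the source):

import click

@click.group()
def cli():
    pass

@cli.command(name='build_coocc_classifier_dataset')
@click.argument('out_file', type=click.Path())
@click.argument('dataset_dir', type=click.Path(exists=True))
@click.argument('entity_db_file', type=click.Path(exists=True))
def build_coocc_classifier_dataset_cmd(out_file, dataset_dir, entity_db_file):
    # Delegates to the function above; extra click options would supply
    # the **kwargs in the real CLI.
    build_coocc_classifier_dataset(out_file, dataset_dir, entity_db_file)

if __name__ == '__main__':
    cli()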
Example #2
def build_page_classifier_dataset(out_file, dataset_dir, page_db_file,
                                  entity_db_file, **kwargs):
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    page_db = PageDB(page_db_file, 'r')

    ret = page_classifier.build_dataset(dataset, page_db, entity_db, **kwargs)
    joblib.dump(ret, out_file)
Example #3
File: cli.py Project: xuewyang/ntee
def generate_corpus(dump_file, entity_db_file, out_file, abstract_db,
                    **kwargs):
    entity_db = EntityDB.load(entity_db_file)
    if abstract_db:
        abstract_db = AbstractDB(abstract_db, 'r')

    word2vec.generate_corpus(dump_file, entity_db, out_file, abstract_db,
                             **kwargs)
Example #4
def build_scorer_dataset(clf_cache_file, out_file, dataset_dir, entity_db_file,
                         **kwargs):
    clf_cache = joblib.load(clf_cache_file, mmap_mode='r')
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)

    ret = scorer.build_dataset(dataset, clf_cache, entity_db, **kwargs)

    joblib.dump(ret, out_file)
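
mmap_mode='r' makes joblib.load memory-map the NumPy arrays stored in the cache file rather than copying them into RAM, which is what keeps loading a large classifier-result cache cheap here. A self-contained illustration of that behavior (the path is a placeholder):

import numpy as np
import joblib

joblib.dump({'scores': np.zeros((100000, 128))}, '/tmp/clf_cache.joblib')
cache = joblib.load('/tmp/clf_cache.joblib', mmap_mode='r')

# Arrays come back as read-only numpy.memmap objects backed by the file,
# so several processes can share them without duplicating memory.
print(type(cache['scores']))  # <class 'numpy.memmap'>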
Example #5
def evaluate_text_classification(model_file, entity_db_file, min_link_prob,
                                 min_disambi_score, tagme_cache, **kwargs):
    entity_db = EntityDB.load(entity_db_file)
    if tagme_cache:
        tagme_cache = joblib.load(tagme_cache)

    entity_linker = TagmeEntityLinker(entity_db, min_link_prob,
                                      min_disambi_score, tagme_cache)

    text_classification.evaluate(model_file, entity_linker, **kwargs)
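
tagme_cache is optional: when a path is given it is unpickled with joblib and handed to TagmeEntityLinker, presumably so precomputed TAGME annotations are used instead of live queries; otherwise the falsy value is passed through unchanged. A hypothetical call, with placeholder paths and thresholds:

evaluate_text_classification('model.joblib', 'entity.db',
                             min_link_prob=0.1, min_disambi_score=0.3,
                             tagme_cache='tagme_cache.joblib')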
Example #6
File: cli.py Project: studio-ousia/ntee
def train_model(db_file, entity_db_file, vocab_file, word2vec, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.load(vocab_file)

    if word2vec:
        w2vec = ModelReader(word2vec)
    else:
        w2vec = None

    train.train(db, entity_db, vocab, w2vec, **kwargs)
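
The word2vec argument is an optional path: when present, a ModelReader over pretrained vectors is passed to training; when absent, None is passed and the embeddings presumably start from scratch. Hypothetical direct calls, with placeholder file names:

# Initialize from pretrained word2vec vectors:
train_model('abstracts.db', 'entity.db', 'vocab.bin',
            word2vec='enwiki_word2vec.bin')

# Train without pretrained vectors:
train_model('abstracts.db', 'entity.db', 'vocab.bin', word2vec=None)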
Example #7
def cache_classifier_results(page_db_file, out_file, dataset_dir,
                             entity_db_file, init, **kwargs):
    page_db = PageDB(page_db_file, 'r')
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    if init:
        initial_data = joblib.load(init)
    else:
        initial_data = {}

    scorer.cache_classifier_results(dataset, initial_data, out_file, page_db,
                                    entity_db, **kwargs)
Example #8
def train_model(description_db_file, entity_db_file, word_vocab_file,
                entity_vocab_file, target_entity_vocab_file, out_file,
                embedding, **kwargs):
    description_db = DescriptionDB(description_db_file)
    entity_db = EntityDB.load(entity_db_file)

    word_vocab = WordVocab.load(word_vocab_file)
    entity_vocab = EntityVocab.load(entity_vocab_file)
    target_entity_vocab = EntityVocab.load(target_entity_vocab_file)

    embeddings = [EmbeddingReader.load(f) for f in embedding]

    train.train(description_db, entity_db, word_vocab, entity_vocab,
                target_entity_vocab, out_file, embeddings, **kwargs)
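
Here embedding is evidently an iterable of paths (for example a CLI option accepting multiple values), each loaded into its own EmbeddingReader so that several pretrained embedding sets can be combined. A hypothetical call, with placeholder file names:

train_model('descriptions.db', 'entity.db', 'word_vocab.bin',
            'entity_vocab.bin', 'target_entity_vocab.bin',
            'model_out.joblib',
            embedding=['word_embedding.bin', 'entity_embedding.bin'])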
Example #9
def evaluate_entity_typing(model_file, entity_db_file, dataset_dir, embedding,
                           **kwargs):
    entity_db = EntityDB.load(entity_db_file)

    if embedding:
        model = EmbeddingReader.load(model_file)
        entity_embedding = model.entity_embedding
        entity_vocab = model.entity_vocab
    else:
        model = train.load_model(model_file)
        entity_embedding = model.target_entity_embedding
        entity_vocab = model.target_entity_vocab

    entity_typing.evaluate(entity_embedding, entity_vocab, entity_db,
                           dataset_dir, **kwargs)
Example #10
def build_entity_vocab(description_db_file, entity_db_file, out_file,
                       target_vocab, white_list, **kwargs):
    description_db = DescriptionDB(description_db_file)

    if target_vocab:
        target_vocab = EntityVocab.load(target_vocab)

    entity_db = EntityDB.load(entity_db_file)
    white_titles = []
    for f in white_list:
        white_titles += [l.rstrip().decode('utf-8') for l in f]

    entity_vocab = EntityVocab.build(description_db,
                                     entity_db,
                                     white_titles,
                                     start_index=1,
                                     target_vocab=target_vocab,
                                     **kwargs)
    entity_vocab.save(out_file)
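
Note the .decode('utf-8') on each white-list line: the handles in white_list yield bytes, so this is Python 2-era code or expects files opened in binary mode (e.g. via click.File('rb')). A hypothetical invocation consistent with that, with placeholder paths:

with open('whitelist_titles.txt', 'rb') as f:
    build_entity_vocab('descriptions.db', 'entity.db', 'entity_vocab.bin',
                       target_vocab=None, white_list=[f])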
Example #11
File: cli.py Project: studio-ousia/ntee
def build_entity_db(dump_file, out_file, **kwargs):
    db = EntityDB.build(dump_file, **kwargs)
    db.save(out_file)
Example #12
def generate_word2vec_corpus(dump_file, entity_db_file, out_file, **kwargs):
    entity_db = EntityDB.load(entity_db_file)
    word2vec.generate_corpus(dump_file, entity_db, out_file, **kwargs)
Example #13
def build_sentence_db(wiki_sentences_file, entity_db_file, out_file):
    entity_db = EntityDB.load(entity_db_file)
    SentenceDB.build(wiki_sentences_file, entity_db, out_file)
Example #14
def run(dataset_dir, entity_db, **kwargs):
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db)

    scorer.run(dataset=dataset, entity_db=entity_db, **kwargs)
Example #15
File: cli.py Project: studio-ousia/ntee
def build_vocab(db_file, entity_db_file, out_file, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.build(db, entity_db, **kwargs)
    vocab.save(out_file)
Example #16
def build_description_db(nif_context_file, nif_text_links_file, entity_db_file,
                         out_file, **kwargs):
    entity_db = EntityDB.load(entity_db_file)
    DescriptionDB.build(nif_context_file, nif_text_links_file, entity_db,
                        out_file, **kwargs)
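
The two NIF inputs look like DBpedia NIF dump files (page contexts and text links) from which the entity descriptions are extracted; that reading is an inference from the parameter names, not confirmed by the source. A hypothetical invocation with placeholder file names:

build_description_db('nif_context.ttl', 'nif_text_links.ttl',
                     'entity.db', 'descriptions.db')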
Example #17
def build_page_db(dataset_dir, entity_db_file, **kwargs):
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    PageDB.build(dataset, entity_db, **kwargs)