# CLI command bodies. The supporting classes and modules used below
# (DatasetLoader, EntityDB, PageDB, joblib, and friends) are assumed to be
# imported by the surrounding module.


def build_coocc_classifier_dataset(out_file, dataset_dir, entity_db_file, **kwargs):
    # Build the co-occurrence classifier dataset and dump it to disk.
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    ret = coocc_classifier.build_dataset(dataset, entity_db, **kwargs)
    joblib.dump(ret, out_file)

def build_page_classifier_dataset(out_file, dataset_dir, page_db_file, entity_db_file, **kwargs):
    # Build the page classifier dataset and dump it to disk.
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    page_db = PageDB(page_db_file, 'r')
    ret = page_classifier.build_dataset(dataset, page_db, entity_db, **kwargs)
    joblib.dump(ret, out_file)

def generate_corpus(dump_file, entity_db_file, out_file, abstract_db, **kwargs):
    # Generate a word2vec training corpus from a Wikipedia dump, optionally
    # drawing on the abstracts stored in an AbstractDB.
    entity_db = EntityDB.load(entity_db_file)
    if abstract_db:
        abstract_db = AbstractDB(abstract_db, 'r')
    word2vec.generate_corpus(dump_file, entity_db, out_file, abstract_db, **kwargs)

def build_scorer_dataset(clf_cache_file, out_file, dataset_dir, entity_db_file, **kwargs):
    # Build the scorer dataset from cached classifier results; the cache is
    # memory-mapped so it is not loaded into RAM in full.
    clf_cache = joblib.load(clf_cache_file, mmap_mode='r')
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    ret = scorer.build_dataset(dataset, clf_cache, entity_db, **kwargs)
    joblib.dump(ret, out_file)

def evaluate_text_classification(model_file, entity_db_file, min_link_prob,
                                 min_disambi_score, tagme_cache, **kwargs):
    # Evaluate a text classification model using the TAGME-style entity
    # linker, optionally primed with previously cached linking results.
    entity_db = EntityDB.load(entity_db_file)
    if tagme_cache:
        tagme_cache = joblib.load(tagme_cache)
    entity_linker = TagmeEntityLinker(entity_db, min_link_prob, min_disambi_score, tagme_cache)
    text_classification.evaluate(model_file, entity_linker, **kwargs)

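# Usage sketch (hypothetical paths and threshold values, not defaults from
# this repo): evaluate a saved model, reusing a previously cached linker
# output so the corpus is not re-linked.
#
#   evaluate_text_classification('model.pkl', 'entity.db',
#                                min_link_prob=0.02, min_disambi_score=0.02,
#                                tagme_cache='tagme_cache.pkl')
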
def train_model(db_file, entity_db_file, vocab_file, word2vec, **kwargs):
    # Train on the abstract DB; `word2vec` is an optional path to a
    # pretrained word2vec model.
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.load(vocab_file)
    if word2vec:
        w2vec = ModelReader(word2vec)
    else:
        w2vec = None
    train.train(db, entity_db, vocab, w2vec, **kwargs)

def cache_classifier_results(page_db_file, out_file, dataset_dir, entity_db_file, init, **kwargs):
    # Precompute and cache classifier results for the scorer, optionally
    # seeding the cache from a previously dumped file.
    page_db = PageDB(page_db_file, 'r')
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    if init:
        initial_data = joblib.load(init)
    else:
        initial_data = {}
    scorer.cache_classifier_results(dataset, initial_data, out_file, page_db,
                                    entity_db, **kwargs)

def train_model(description_db_file, entity_db_file, word_vocab_file,
                entity_vocab_file, target_entity_vocab_file, out_file,
                embedding, **kwargs):
    # Train on the description DB; `embedding` is a sequence of pretrained
    # embedding files to load.
    description_db = DescriptionDB(description_db_file)
    entity_db = EntityDB.load(entity_db_file)
    word_vocab = WordVocab.load(word_vocab_file)
    entity_vocab = EntityVocab.load(entity_vocab_file)
    target_entity_vocab = EntityVocab.load(target_entity_vocab_file)
    embeddings = [EmbeddingReader.load(f) for f in embedding]
    train.train(description_db, entity_db, word_vocab, entity_vocab,
                target_entity_vocab, out_file, embeddings, **kwargs)

def evaluate_entity_typing(model_file, entity_db_file, dataset_dir, embedding, **kwargs):
    # Evaluate entity typing using either a standalone embedding file or the
    # target-entity embedding of a trained model.
    entity_db = EntityDB.load(entity_db_file)
    if embedding:
        model = EmbeddingReader.load(model_file)
        entity_embedding = model.entity_embedding
        entity_vocab = model.entity_vocab
    else:
        model = train.load_model(model_file)
        entity_embedding = model.target_entity_embedding
        entity_vocab = model.target_entity_vocab
    entity_typing.evaluate(entity_embedding, entity_vocab, entity_db,
                           dataset_dir, **kwargs)

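# Usage sketch (hypothetical file names): evaluate entity typing directly
# from an embedding file; pass a falsy `embedding` to evaluate a trained
# model's target-entity embedding instead.
#
#   evaluate_entity_typing('embedding.pkl', 'entity.db', 'typing_dataset/',
#                          embedding=True)
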
def build_entity_vocab(description_db_file, entity_db_file, out_file,
                       target_vocab, white_list, **kwargs):
    # Build the entity vocabulary; `white_list` is a sequence of binary file
    # objects listing titles that must be included.
    description_db = DescriptionDB(description_db_file)
    if target_vocab:
        target_vocab = EntityVocab.load(target_vocab)
    entity_db = EntityDB.load(entity_db_file)
    white_titles = []
    for f in white_list:
        white_titles += [l.rstrip().decode('utf-8') for l in f]
    entity_vocab = EntityVocab.build(description_db, entity_db, white_titles,
                                     start_index=1, target_vocab=target_vocab,
                                     **kwargs)
    entity_vocab.save(out_file)

def build_description_db(nif_context_file, nif_text_links_file, entity_db_file, out_file, **kwargs):
    # Build the entity description DB from NIF context and text-links files.
    entity_db = EntityDB.load(entity_db_file)
    DescriptionDB.build(nif_context_file, nif_text_links_file, entity_db,
                        out_file, **kwargs)

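# Pipeline sketch (hypothetical file names): build the description DB from
# NIF context/text-links dumps, derive the entity vocabulary, then train on
# the descriptions; `white_list` expects open binary file handles, and
# `embedding` is left empty here.
#
#   build_description_db('nif_context.ttl', 'nif_text_links.ttl',
#                        'entity.db', 'description.db')
#   with open('white_list.txt', 'rb') as f:
#       build_entity_vocab('description.db', 'entity.db', 'entity_vocab.pkl',
#                          target_vocab=None, white_list=[f])
#   train_model('description.db', 'entity.db', 'word_vocab.pkl',
#               'entity_vocab.pkl', 'target_entity_vocab.pkl',
#               'model.pkl', embedding=[])
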
def generate_word2vec_corpus(dump_file, entity_db_file, out_file, **kwargs):
    entity_db = EntityDB.load(entity_db_file)
    word2vec.generate_corpus(dump_file, entity_db, out_file, **kwargs)

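# Usage sketch (hypothetical file names): emit a plain-text corpus from a
# Wikipedia dump with entity mentions resolved through the EntityDB,
# suitable as word2vec training input.
#
#   generate_word2vec_corpus('enwiki-pages-articles.xml.bz2', 'entity.db',
#                            'corpus.txt')
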
def build_sentence_db(wiki_sentences_file, entity_db_file, out_file):
    entity_db = EntityDB.load(entity_db_file)
    SentenceDB.build(wiki_sentences_file, entity_db, out_file)

def build_page_db(dataset_dir, entity_db_file, **kwargs):
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db_file)
    PageDB.build(dataset, entity_db, **kwargs)

def run(dataset_dir, entity_db, **kwargs):
    dataset = DatasetLoader(dataset_dir)
    entity_db = EntityDB.load(entity_db)
    scorer.run(dataset=dataset, entity_db=entity_db, **kwargs)

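# Pipeline sketch (hypothetical file names; the ordering is inferred from
# the data dependencies among the commands above, and the page DB output
# path is assumed to be passed through **kwargs): build the page DB, cache
# per-page classifier results, then build the scorer dataset and run.
#
#   build_page_db('dataset/', 'entity.db', out_file='page.db')
#   cache_classifier_results('page.db', 'clf_cache.pkl', 'dataset/',
#                            'entity.db', init=None)
#   build_scorer_dataset('clf_cache.pkl', 'scorer_dataset.pkl',
#                        'dataset/', 'entity.db')
#   run('dataset/', 'entity.db')
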
def build_vocab(db_file, entity_db_file, out_file, **kwargs):
    db = AbstractDB(db_file, 'r')
    entity_db = EntityDB.load(entity_db_file)
    vocab = Vocab.build(db, entity_db, **kwargs)
    vocab.save(out_file)

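# Pipeline sketch (hypothetical file names; assumes the two train_model
# definitions above live in separate command modules, so the abstract-based
# variant is the one called here): build the vocabulary from an AbstractDB,
# then train, optionally warm-starting from a pretrained word2vec model.
#
#   build_vocab('abstracts.db', 'entity.db', 'vocab.pkl')
#   train_model('abstracts.db', 'entity.db', 'vocab.pkl',
#               word2vec='w2v_model.bin')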