def load_data(text_path=None, mention_file=None, supplement=None,
              include_unresolved=False, lowercase=False, wiki_entity_file=None):
    """Load raw xlwiki documents from a text path."""
    assert text_path is not None, "xlwiki data requires raw path!"
    print("Loading", text_path)
    wiki_map = loadWikiVocab(wiki_entity_file)
    # Fall back to the default genre when supplement is missing or out of range.
    if supplement is None or supplement not in [0, 1, 2]:
        supplement = 2
    docs = []
    doc_iter = XlwikiDataLoader(text_path, genre=supplement,
                                lowercase=lowercase, wiki_map=wiki_map)
    for doc in doc_iter.documents():
        docs.append(doc)
    return docs
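# Hedged usage sketch (not part of the original module): how the xlwiki
# variant of load_data above might be called from its own loader module.
# The paths are hypothetical placeholders, and the choice of supplement=1
# as a genre value is an assumption for illustration only.
def _example_load_xlwiki():
    docs = load_data(text_path="/path/to/xlwiki/train",
                     supplement=1,
                     lowercase=True,
                     wiki_entity_file="/path/to/wiki_entity_vocab.txt")
    print("Loaded {} xlwiki documents".format(len(docs)))
    return docs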
def load_data(text_path=None, mention_file=None, supplement=None,
              include_unresolved=False, lowercase=False, wiki_entity_file=None):
    """Load raw wned documents from a text path and its mention file."""
    assert text_path is not None and mention_file is not None, \
        "wned data requires raw text path and mention file!"
    print("Loading {0}, {1}".format(text_path, mention_file))
    wiki_map = loadWikiVocab(wiki_entity_file)
    docs = []
    doc_iter = WnedDataLoader(text_path, mention_file,
                              include_unresolved=include_unresolved,
                              lowercase=lowercase, wiki_map=wiki_map)
    for doc in doc_iter.documents():
        docs.append(doc)
    return docs
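# Hedged usage sketch (not part of the original module): the wned variant of
# load_data takes both a raw text path and a mention file. The paths are
# hypothetical placeholders; accessing doc.mentions mirrors how the loaded
# documents are used in load_data_and_embeddings below.
def _example_load_wned():
    docs = load_data(text_path="/path/to/wned/RawText",
                     mention_file="/path/to/wned/mentions.tsv",
                     include_unresolved=False,
                     lowercase=True,
                     wiki_entity_file="/path/to/wiki_entity_vocab.txt")
    num_mentions = sum(len(doc.mentions) for doc in docs)
    print("Loaded {} documents with {} mentions".format(len(docs), num_mentions))
    return docs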
def load_data_and_embeddings(FLAGS, logger, candidate_manager):
    # Dataset layout notes:
    #   conll: dev, train and eval in a single file, split controlled by
    #          genre (FLAGS.test_genre)
    #   xlwiki: no dev/train/eval split; only a text path for training or
    #           eval, controlled by genre (FLAGS.genre)
    #   kbp15/16: may contain multiple training and eval [path, file] pairs
    #   kbp10: only one training and eval [path, file]; requires cross validation
    #   wned: only one eval [path, file]
    dataset_types = set()
    raw_training_data = None
    if not FLAGS.eval_only_mode:
        raw_training_data = []
        unwrapped_data_tuples = unwrapDataset(FLAGS.training_data)
        for data_tuple in unwrapped_data_tuples:
            dataset_types.add(data_tuple[0])
            raw_training_data.extend(
                extractRawData(data_tuple[0], data_tuple[2], data_tuple[3],
                               data_tuple[1], FLAGS))

    raw_eval_sets = []
    unwrapped_data_tuples = unwrapDataset(FLAGS.eval_data)
    for data_tuple in unwrapped_data_tuples:
        dataset_types.add(data_tuple[0])
        raw_eval_sets.append(
            extractRawData(data_tuple[0], data_tuple[2], data_tuple[3],
                           data_tuple[1], FLAGS))

    # Replace mention gold ids that point to redirect pages with the target entity id.
    redirect_vocab = None
    if FLAGS.wiki_redirect_vocab is not None:
        gold_id_set = set()
        if raw_training_data is not None:
            gold_id_set.update([
                m.gold_ent_id() for doc in raw_training_data
                for m in doc.mentions if m.gold_ent_id() is not None
            ])
        for eval_data in raw_eval_sets:
            gold_id_set.update([
                m.gold_ent_id() for doc in eval_data
                for m in doc.mentions if m.gold_ent_id() is not None
            ])
        redirect_vocab = loadRedirectVocab(FLAGS.wiki_redirect_vocab,
                                           id_vocab=gold_id_set)
        if raw_training_data is not None:
            for i, doc in enumerate(raw_training_data):
                for j, mention in enumerate(doc.mentions):
                    if mention.gold_ent_id() in redirect_vocab:
                        raw_training_data[i].mentions[j]._gold_ent_id = \
                            redirect_vocab[mention.gold_ent_id()]
        for eval_data in raw_eval_sets:
            for i, doc in enumerate(eval_data):
                for j, mention in enumerate(doc.mentions):
                    if mention.gold_ent_id() in redirect_vocab:
                        eval_data[i].mentions[j]._gold_ent_id = \
                            redirect_vocab[mention.gold_ent_id()]

    # Prepare the word and mention vocabulary.
    word_vocab, mention_vocab = BuildVocabulary(raw_training_data,
                                                raw_eval_sets,
                                                FLAGS.word_embedding_file,
                                                logger=logger)
    wiki2id_vocab, id2wiki_vocab = loadWikiVocab(FLAGS.wiki_entity_vocab)

    # Candidate file types.
    candidate_types = []
    files = re.split(r',', FLAGS.candidates_file)
    for f in files:
        tmp_items = re.split(r':', f)
        candidate_types.append(tmp_items[0])

    candidate_handler = candidate_manager(FLAGS.candidates_file,
                                          vocab=mention_vocab,
                                          lowercase=FLAGS.lowercase,
                                          id2label=id2wiki_vocab,
                                          label2id=wiki2id_vocab,
                                          support_fuzzy=FLAGS.support_fuzzy,
                                          redirect_vocab=redirect_vocab,
                                          topn=FLAGS.topn_candidate)
    candidate_handler.loadCandidates()
    if FLAGS.save_candidates_path is not None:
        fuzzy_str = 'fuzzy' if FLAGS.support_fuzzy else 'nofuzzy'
        candidate_handler.saveCandidatesToFile(
            os.path.join(
                FLAGS.save_candidates_path,
                '-'.join(dataset_types) + '-'.join(candidate_types) +
                '_candidate_' + fuzzy_str))
    logger.Log(
        "Unk mention types rate: {:2.6f}% ({}/{}), average candidates: {:2.2f} ({}/{}) from {}!"
        .format((len(mention_vocab) - len(candidate_handler._mention_dict)) *
                100 / float(len(mention_vocab)),
                len(mention_vocab) - len(candidate_handler._mention_dict),
                len(mention_vocab),
                candidate_handler._candidates_total /
                float(len(candidate_handler._mention_dict)),
                candidate_handler._candidates_total,
                len(candidate_handler._mention_dict),
                FLAGS.candidates_file))

    entity_vocab, sense_vocab = BuildEntityVocabulary(
        candidate_handler._entity_set,
        FLAGS.entity_embedding_file,
        FLAGS.sense_embedding_file,
        logger=logger)

    # Load pretrained embeddings.
    logger.Log("Loading vocabulary with " + str(len(word_vocab)) +
               " words from " + FLAGS.word_embedding_file)
    word_embeddings = LoadEmbeddingsFromBinary(word_vocab, FLAGS.embedding_dim,
                                               FLAGS.word_embedding_file)
    logger.Log("Loading vocabulary with " + str(len(entity_vocab)) +
               " entities from " + FLAGS.entity_embedding_file)
    entity_embeddings = LoadEmbeddingsFromBinary(entity_vocab,
                                                 FLAGS.embedding_dim,
                                                 FLAGS.entity_embedding_file)
    sense_embeddings = None
    mu_embeddings = None
    if sense_vocab is not None:
        sense_embeddings, mu_embeddings = LoadEmbeddingsFromBinary(
            sense_vocab,
            FLAGS.embedding_dim,
            FLAGS.sense_embedding_file,
            isSense=True)
        logger.Log("Loading vocabulary with " + str(len(sense_vocab)) +
                   " senses from " + FLAGS.sense_embedding_file)

    initial_embeddings = (word_embeddings, entity_embeddings,
                          sense_embeddings, mu_embeddings)
    vocabulary = (word_vocab, entity_vocab, sense_vocab, id2wiki_vocab)

    stop_words = loadStopWords(
        FLAGS.stop_word_file) if FLAGS.stop_word_file is not None else {}
    feature_manager = get_feature_manager(
        initial_embeddings,
        FLAGS.embedding_dim,
        lowercase=FLAGS.lowercase,
        str_sim=FLAGS.str_sim,
        prior=FLAGS.prior,
        hasAtt=FLAGS.att,
        local_context_window=FLAGS.local_context_window,
        global_context_window=FLAGS.global_context_window)

    # Trim the dataset, convert token sequences to integer sequences,
    # crop, and pad.
    # Construct the data iterators.
    logger.Log("Preprocessing data.")
    eval_sets = []
    for i, raw_eval_data in enumerate(raw_eval_sets):
        logger.Log("Processing {} raw eval data ...".format(i))
        AddCandidatesToDocs(raw_eval_sets[i],
                            candidate_handler,
                            topn=FLAGS.topn_candidate,
                            vocab=entity_vocab,
                            logger=logger,
                            include_unresolved=FLAGS.include_unresolved)
        eval_data = PreprocessDataset(
            raw_eval_sets[i],
            vocabulary,
            initial_embeddings,
            FLAGS.max_tokens,
            FLAGS.max_candidates_per_document,
            feature_manager,
            stop_words=stop_words,
            logger=logger,
            include_unresolved=FLAGS.include_unresolved,
            allow_cropping=FLAGS.allow_cropping)
        eval_sets.append(eval_data)

    training_data_iter = None
    training_data_length = 0
    if raw_training_data is not None:
        logger.Log("Processing raw training data ...")
        AddCandidatesToDocs(raw_training_data,
                            candidate_handler,
                            topn=FLAGS.topn_candidate,
                            vocab=entity_vocab,
                            logger=logger,
                            include_unresolved=FLAGS.include_unresolved)
        training_data = PreprocessDataset(
            raw_training_data,
            vocabulary,
            initial_embeddings,
            FLAGS.max_tokens,
            FLAGS.max_candidates_per_document,
            feature_manager,
            stop_words=stop_words,
            logger=logger,
            include_unresolved=FLAGS.include_unresolved,
            allow_cropping=FLAGS.allow_cropping)
        training_data_length = training_data.shape[0]
        training_data_iter = MakeTrainingIterator(training_data,
                                                  FLAGS.batch_size,
                                                  FLAGS.smart_batching)

    logger.Log("Processing raw eval data ...")
    eval_iterators = []
    for eval_data in eval_sets:
        eval_it = MakeEvalIterator(eval_data, FLAGS.batch_size)
        eval_iterators.append(eval_it)

    return vocabulary, initial_embeddings, training_data_iter, eval_iterators, \
        training_data_length, feature_manager.base_feature_dim
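# Hedged usage sketch (not part of the original module): one way the return
# values of load_data_and_embeddings might be consumed. FLAGS, logger, and the
# candidate manager class are assumed to come from the surrounding project.
# The unpacking of `vocabulary` follows the tuple built above; treating each
# entry of eval_iterators as a plain iterable of batches is an assumption and
# should be checked against MakeEvalIterator.
def _example_consume_loaded_data(FLAGS, logger, candidate_manager):
    (vocabulary, initial_embeddings, training_data_iter, eval_iterators,
     training_data_length, base_feature_dim) = load_data_and_embeddings(
         FLAGS, logger, candidate_manager)
    word_vocab, entity_vocab, sense_vocab, id2wiki_vocab = vocabulary
    logger.Log("Training examples: {}, base feature dim: {}".format(
        training_data_length, base_feature_dim))
    # Illustration only: walk the first eval set's batches (interface assumed).
    for batch in eval_iterators[0]:
        pass  # feed each batch to the model's evaluation step
    return vocabulary, initial_embeddings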