def flush_entity_universe(self):
    print("len(self.entities_universe) =", len(self.entities_universe))
    entities_folder = config.base_folder + "data/entities/extension_entities/"
    _, wiki_id_name_map = load_wiki_name_id_map()
    if not os.path.exists(entities_folder):
        os.makedirs(entities_folder)

    def dump_entities(entity_set, name):
        # dump the set both as a pickle and as a human-readable "<wiki_id>\t<wiki_name>" file
        with open(entities_folder + name + ".pickle", 'wb') as handle:
            pickle.dump(entity_set, handle)
        with open(entities_folder + name + ".txt", "w") as fout:
            for ent_id in entity_set:
                fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")

    dump_entities(self.entities_universe, "entities_universe")

    # now compute the extension set, i.e. the entities of this universe
    # for which we have not already trained embeddings
    extension_entity_set = set()
    wikiid2nnid = load_wikiid2nnid()
    for wikiid in self.entities_universe:
        if wikiid not in wikiid2nnid:
            extension_entity_set.add(wikiid)
    print("len(extension_entity_set) =", len(extension_entity_set))
    dump_entities(extension_entity_set, "extension_entities")
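# Usage sketch (assumption): reading back the files that flush_entity_universe writes.
# load_extension_entities is a hypothetical helper, not part of the project code.
# The pickle holds the raw set of wiki ids; the .txt file holds one
# "<wiki_id>\t<wiki_name_with_underscores>" pair per line.
import pickle

def load_extension_entities(entities_folder):
    with open(entities_folder + "extension_entities.pickle", 'rb') as handle:
        extension_entity_set = pickle.load(handle)
    id_to_name = {}
    with open(entities_folder + "extension_entities.txt") as fin:
        for line in fin:
            ent_id, name = line.rstrip("\n").split("\t")
            id_to_name[ent_id] = name
    return extension_entity_set, id_to_name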
def __init__(self, output_folder, predictions_folder, entity_extension=None):
    self.thr = None
    self.output_folder = output_folder
    self.predictions_folder = predictions_folder
    with open(output_folder + "word_char_maps.pickle", 'rb') as handle:
        _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)
    self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
    _, self.wiki_id_name_map = load_wiki_name_id_map()
    self.extra_info = ""
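# The reverse_dict helper used above comes from the project's util module; below is a
# minimal sketch (assumption, illustrative only) of what it is expected to do: invert a
# dict, optionally asserting that all values are unique so the inversion is lossless.
def reverse_dict_sketch(d, unique_values=False):
    reversed_d = {}
    for key, value in d.items():
        if unique_values:
            assert value not in reversed_d, "duplicate value encountered while reversing dict"
        reversed_d[value] = key
    return reversed_d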
def create_entity_universe(gmonly_files=None, allspans_files=None, printSamples=None):
    new_dataset_folder = config.base_folder + "data/hipe_new/"
    if gmonly_files is None:
        gmonly_files = []
    if allspans_files is None:
        allspans_files = ['HIPE-data-v1.0-train-de.txt', 'HIPE-data-v1.0-dev-de.txt',
                          'HIPE-data-v1.0-test-de.txt']
    print("gmonly_files: ", gmonly_files)
    print("allspans_files: ", allspans_files)

    def create_entity_universe_aux(generator, datasets):
        entities_universe = set()
        for dataset in datasets:
            print("Processing dataset: ", dataset)
            for sample in generator.process(filepath=new_dataset_folder + dataset):
                entities_universe.update(*sample.cand_entities)
                entities_universe.update(sample.ground_truth)
                if printSamples:
                    printSamples.print_sample(sample)
        print("Overall statistics: ")
        print("all_gm_misses: ", generator.all_gm_misses)
        print("all_gt_misses: ", generator.all_gt_misses)
        print("all_gm: ", generator.all_gm)
        print("recall % : ",
              (1 - (generator.all_gm_misses + generator.all_gt_misses) / generator.all_gm) * 100, " %")
        print("len(entities_universe):\t\t\t", colored(len(entities_universe), 'red'))
        return entities_universe

    gmonly_entities, allspans_entities = set(), set()
    samplesGenerator = SamplesGenerator()
    if gmonly_files:
        print("gmonly files statistics: ")
        samplesGenerator.set_gmonly_mode()
        gmonly_entities = create_entity_universe_aux(samplesGenerator, gmonly_files)
    if allspans_files:
        print("allspans files statistics: ")
        samplesGenerator.set_allspans_mode()
        allspans_entities = create_entity_universe_aux(samplesGenerator, allspans_files)

    all_entities = gmonly_entities | allspans_entities
    print("len(all_entities) = ", len(all_entities))
    # write the entities of our universe to a file together with their names
    with open(config.base_folder + "data/entities/entities_universe.txt", "w") as fout:
        wiki_name_map_path = config.base_folder + "data/basic_data/wiki_name_map.txt"
        _, wiki_id_name_map = util.load_wiki_name_id_map(filepath=wiki_name_map_path)
        for ent_id in all_entities:
            if ent_id in wiki_id_name_map:
                fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")
    return all_entities
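# Usage sketch (assumption: run with the HIPE files already placed under data/hipe_new/
# and the candidate-entity resources required by SamplesGenerator available). The file
# list below simply repeats the function's own defaults.
if __name__ == "__main__":
    all_entities = create_entity_universe(
        allspans_files=['HIPE-data-v1.0-train-de.txt',
                        'HIPE-data-v1.0-dev-de.txt',
                        'HIPE-data-v1.0-test-de.txt'])
    print("entity universe size:", len(all_entities))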
def print_p_e_m_dictionary_to_file(p_e_m, full_filepath):
    _, wiki_id_name_map = util.load_wiki_name_id_map()
    with open(full_filepath, "w") as fout:
        for mention, entities in p_e_m.items():
            out_acc = []
            # entities is a defaultdict(int), so items() returns e.g. ent2: 10, ent54: 20, ent3: 2
            sorted_ = sorted(entities.items(), key=operator.itemgetter(1), reverse=True)
            # a list of tuples [(ent54, 20), (ent2, 10), (ent3, 2)]
            total_freq = 0
            for ent_id, prob in sorted_:
                if len(out_acc) > 100:  # cap the number of candidate entities kept per mention
                    break
                total_freq += prob
                out_acc.append(','.join([ent_id, str(prob), wiki_id_name_map[ent_id].replace(' ', '_')]))
            fout.write(mention + "\t" + str(total_freq) + "\t" + "\t".join(out_acc) + "\n")
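# Minimal sketch for reading the file written above back into memory
# (parse_p_e_m_file is a hypothetical helper, not part of the project code).
# Each line has the format: mention \t total_freq \t ent_id,freq,name \t ent_id,freq,name ...
def parse_p_e_m_file(full_filepath):
    p_e_m = {}
    with open(full_filepath) as fin:
        for line in fin:
            parts = line.rstrip("\n").split("\t")
            mention, total_freq, cand_columns = parts[0], int(parts[1]), parts[2:]
            candidates = []
            for column in cand_columns:
                ent_id, freq, name = column.split(",", 2)  # the wiki name may itself contain commas
                candidates.append((ent_id, int(freq), name))
            p_e_m[mention] = (total_freq, candidates)
    return p_e_m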
def __init__(self, output_folder, predictions_folder, entity_extension=None,
             gm_bucketing_pempos=None, print_global_voters=False,
             print_global_pairwise_scores=False):
    self.thr = None
    self.output_folder = output_folder
    self.predictions_folder = predictions_folder
    with open(output_folder + "word_char_maps.pickle", 'rb') as handle:
        _, self.id2word, _, self.id2char, _, _ = pickle.load(handle)
    self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
    _, self.wiki_id_name_map = load_wiki_name_id_map()
    self.extra_info = ""
    self.gm_bucketing = GMBucketingResults(gm_bucketing_pempos) if gm_bucketing_pempos else None
    self.print_global_pairwise_scores = print_global_pairwise_scores
    self.print_global_voters = print_global_voters
def __init__(self, only_misses=True):
    _, self.wiki_id_name_map = util.load_wiki_name_id_map()
    self.only_misses = only_misses
def __init__(self, train_args, args):
    self.args = args

    # input pipeline
    self.streaming_samples = StreamingSamples()
    ds = tf.data.Dataset.from_generator(
        self.streaming_samples.gen,
        (tf.int64, tf.int64, tf.int64, tf.int64,   # words, words_len, chars, chars_len
         tf.int64, tf.int64, tf.int64,             # begin_span, end_span, span_len
         tf.int64, tf.float32, tf.int64),          # cand_entities, cand_entities_scores, cand_entities_len
        (tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([None, None]),
         tf.TensorShape([None]),
         tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([]),
         tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None])))
    next_element = ds.make_one_shot_iterator().get_next()
    # batch size = 1, so expand the dims now to match the training input, which has a batch dimension
    next_element = [tf.expand_dims(t, 0) for t in next_element]
    next_element = [None, *next_element[:-1], None, next_element[-1], None, None, None, None]

    # restore model
    print("loading Model:", train_args.output_folder)
    model = Model(train_args, next_element)
    model.build()
    checkpoint_path = model.restore_session("el" if args.el_mode else "ed")
    self.model = model

    if args.hardcoded_thr:
        self.thr = args.hardcoded_thr
        print("threshold used:", self.thr)
    else:
        # optimal threshold recovery from the log files: based on the selected checkpoint,
        # look up the threshold in the log file (otherwise it would have to be recomputed)
        self.thr = retrieve_optimal_threshold_from_logfile(train_args.output_folder, checkpoint_path, args.el_mode)
        print("optimal threshold selected = ", self.thr)

    if args.running_mode == "el_mode":
        args.el_mode = True
    elif args.running_mode == "ed_mode":
        args.el_mode = False

    # convert text to tensors for the NN
    with open(args.experiment_folder + "word_char_maps.pickle", 'rb') as handle:
        self.word2id, _, self.char2id, _, _, _ = pickle.load(handle)

    self.wikiid2nnid = load_wikiid2nnid(extension_name=args.entity_extension)
    self.nnid2wikiid = reverse_dict(self.wikiid2nnid, unique_values=True)
    _, self.wiki_id_name_map = load_wiki_name_id_map()

    with open(args.experiment_folder + "prepro_args.pickle", 'rb') as handle:
        self.prepro_args = pickle.load(handle)
    if args.lowercase_spans_pem:
        self.prepro_args.lowercase_p_e_m = True
        self.prepro_args.lowercase_spans = True
    print("prepro_args:", self.prepro_args)
    self.prepro_args.persons_coreference = args.persons_coreference
    self.prepro_args.persons_coreference_merge = args.persons_coreference_merge
    self.fetchFilteredCoreferencedCandEntities = FetchFilteredCoreferencedCandEntities(self.prepro_args)
    prepro_util.args = self.prepro_args

    self.special_tokenized_words = {"``", '"', "''"}
    self.special_words_assertion_errors = 0
    self.gm_idx_errors = 0
    if self.args.el_with_stanfordner_and_our_ed:
        from nltk.tag import StanfordNERTagger
        self.st = StanfordNERTagger(
            '../data/stanford_core_nlp/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../data/stanford_core_nlp/stanford-ner-2018-02-27/stanford-ner.jar',
            encoding='utf-8')
    self.from_myspans_to_given_spans_map_errors = 0
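# Sketch (assumption) of what StreamingSamples.gen is expected to yield so that it matches
# the output_types / output_shapes declared for tf.data.Dataset.from_generator above:
# one document at a time, as a 10-tuple of numpy arrays and scalars. The concrete values
# are placeholders; only the types and shapes follow the declaration.
import numpy as np

def example_gen():
    words = np.array([3, 17, 42], dtype=np.int64)               # word ids, shape [None]
    words_len = np.int64(3)                                      # scalar
    chars = np.zeros((3, 8), dtype=np.int64)                     # char ids, shape [None, None]
    chars_len = np.array([5, 8, 4], dtype=np.int64)              # chars per word, shape [None]
    begin_span = np.array([0], dtype=np.int64)                   # span start indices, shape [None]
    end_span = np.array([2], dtype=np.int64)                     # span end indices, shape [None]
    span_len = np.int64(1)                                       # scalar: number of spans
    cand_entities = np.zeros((1, 30), dtype=np.int64)            # candidate ids per span, shape [None, None]
    cand_entities_scores = np.zeros((1, 30), dtype=np.float32)   # candidate scores, shape [None, None]
    cand_entities_len = np.array([30], dtype=np.int64)           # candidates per span, shape [None]
    yield (words, words_len, chars, chars_len,
           begin_span, end_span, span_len,
           cand_entities, cand_entities_scores, cand_entities_len)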
def create_entity_universe(language, gmonly_files=None, allspans_files=None, printSamples=None):
    new_dataset_folder = config.base_folder + "data/new_datasets/" + language + "/"
    if gmonly_files is None:
        gmonly_files = []
    if allspans_files is None:
        # allspans_files = ['aida_train.txt', 'aida_dev.txt', 'aida_test.txt', 'ace2004.txt',
        #                   'aquaint.txt', 'clueweb.txt', 'msnbc.txt', 'wikipedia.txt']
        allspans_files = []
        for dataset in util.get_immediate_files(new_dataset_folder):
            if language in dataset:
                allspans_files.append(os.path.basename(os.path.normpath(dataset)))
    print("gmonly_files: ", gmonly_files)
    print("allspans_files: ", allspans_files)

    def create_entity_universe_aux(generator, datasets):
        entities_universe = set()
        for dataset in datasets:
            print("Processing dataset: ", dataset)
            for sample in generator.process(filepath=new_dataset_folder + dataset):
                entities_universe.update(*sample.cand_entities)
                entities_universe.update(sample.ground_truth)
                if printSamples:
                    printSamples.print_sample(sample)
        print("Overall statistics: ")
        print("all_gm_misses: ", generator.all_gm_misses)
        print("all_gt_misses: ", generator.all_gt_misses)
        print("all_gm: ", generator.all_gm)
        print("recall % : ",
              (1 - (generator.all_gm_misses + generator.all_gt_misses) / (generator.all_gm + 1.0)) * 100, " %")
        print("len(entities_universe):\t\t\t", colored(len(entities_universe), 'red'))
        return entities_universe

    gmonly_entities, allspans_entities = set(), set()
    samplesGenerator = SamplesGenerator()
    if gmonly_files:
        print("gmonly files statistics: ")
        samplesGenerator.set_gmonly_mode()
        gmonly_entities = create_entity_universe_aux(samplesGenerator, gmonly_files)
    if allspans_files:
        print("allspans files statistics: ")
        samplesGenerator.set_allspans_mode()
        allspans_entities = create_entity_universe_aux(samplesGenerator, allspans_files)

    all_entities = gmonly_entities | allspans_entities
    print("len(all_entities) = ", len(all_entities))
    # write the entities of our universe to a file together with their names
    with open(config.base_folder + "data/entities/" + language + "/entities_universe.txt", "w") as fout:
        _, wiki_id_name_map = util.load_wiki_name_id_map()
        for ent_id in all_entities:
            fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")
    return all_entities
def __init__(self, only_misses=True):
    wiki_name_map_path = config.base_folder + "data/basic_data/wiki_name_map.txt"
    _, self.wiki_id_name_map = util.load_wiki_name_id_map(filepath=wiki_name_map_path)
    self.only_misses = only_misses
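# Hedged sketch of what util.load_wiki_name_id_map is assumed to return: a pair of dicts,
# wiki name -> wiki id and wiki id -> wiki name, read from a tab-separated file. The exact
# "<name>\t<id>" per-line layout is an assumption here, and this re-implementation is
# illustrative only; the real helper lives in the project's util module.
def load_wiki_name_id_map_sketch(filepath):
    wiki_name_id_map, wiki_id_name_map = {}, {}
    with open(filepath) as fin:
        for line in fin:
            name, wiki_id = line.rstrip("\n").split("\t")[:2]
            wiki_name_id_map[name] = wiki_id
            wiki_id_name_map[wiki_id] = name
    return wiki_name_id_map, wiki_id_name_map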