def prev_mentions_format_to_new(tab_file, xml_file, output_file):
    # Parse mentions from the XML query file. The pattern groups are:
    # group 1 = mention id, group 2 = mention name, group 3 = doc id.
    xml_text = __read_text_file(xml_file)
    miter = re.finditer(xml_mention_pattern_str, xml_text)
    mentions_dict = dict()
    beg_pos_dict = dict()
    for m in miter:
        cur_doc_id = m.group(3)
        mention = Mention(name=m.group(2), docid=cur_doc_id, mention_id=m.group(1))
        # TODO: positions are synthesized by laying the mentions of a document
        # end to end in UTF-8 byte offsets, not taken from the source text.
        doc_beg = beg_pos_dict.get(cur_doc_id, 0)
        mention.beg_pos = doc_beg
        mention.end_pos = doc_beg + len(mention.name.encode('utf-8')) - 1
        beg_pos_dict[cur_doc_id] = mention.end_pos + 1
        mentions_dict[mention.mention_id] = mention

    # Fill in KB ids and entity types from the tab file (mention id, kbid, type).
    f = open(tab_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        if len(vals) < 3:
            continue
        m = mentions_dict.get(vals[0], None)
        if m:
            m.kbid = vals[1]
            m.entity_type = vals[2]
    f.close()

    Mention.save_as_edl_file(mentions_dict.values(), output_file)
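
# xml_mention_pattern_str is a module-level constant defined elsewhere. A
# hypothetical pattern with the same group layout (group 1 = mention id,
# group 2 = name, group 3 = doc id), assuming a TAC-KBP-style query XML; the
# exact format is an assumption, not taken from the project:
#   xml_mention_pattern_str = \
#       r'<query id="(.+?)">\s*<name>(.+?)</name>\s*<docid>(.+?)</docid>'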

def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    # Read per-document coreference chains from the binary linking info file.
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = int(np.fromfile(f, '>i4', 1)[0])
        is_nested = np.fromfile(f, 'b', num_mentions)  # read to advance the file; unused
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        # corefs[i] > 0 means mention i corefers with mention corefs[i] of this document.
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    # print qid_mentions['EDL14_ENG_0052'].kbid  # debugging; raises KeyError on other datasets
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        # Propagate the KB id of the coreferent mention to this NIL mention.
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid
    Mention.save_as_edl_file(mentions, dst_edl_file)
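
# __assgin_different_id_to_all_nils is defined elsewhere in this module. A
# minimal sketch of the assumed behavior (each NIL mention gets its own running
# cluster id, so no two NILs corefer unless explicitly merged); the name and id
# format below are illustrative only:
def _assign_different_id_to_all_nils_sketch(mentions):
    nil_cnt = 0
    for m in mentions:
        if m.kbid.startswith('NIL'):
            nil_cnt += 1
            m.kbid = 'NIL%05d' % nil_cnt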

def __save_link_result(edl_file, result_triv, qids, kbids_list, y_pred, max_scores, dst_file, use_nil_thres):
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        m.kbid = 'NODEF'
    qid_mentions = Mention.group_mentions_by_qid(mentions)

    # Mentions resolved by the trivial matcher keep that result.
    for qid, kbid in result_triv.iteritems():
        qid_mentions[qid].kbid = kbid
        # print qid, kbid

    # Assign the classifier's top candidate to the remaining mentions.
    for qid, kbids, y, max_score in izip(qids, kbids_list, y_pred, max_scores):
        if y >= len(kbids):
            # Predicted index is out of range; report and skip to avoid an IndexError.
            print y, len(kbids)
            continue
        if qid_mentions[qid].kbid == 'NODEF':
            if use_nil_thres and max_score < 0.5:
                qid_mentions[qid].kbid = 'NIL'
            else:
                qid_mentions[qid].kbid = kbids[y]
                # print qid, kbids[y]

    # Mentions still linked to a raw Freebase mid or marked NIL all go to one NIL cluster.
    for m in mentions:
        if m.kbid.startswith('m.') or m.kbid.startswith('NIL'):
            m.kbid = 'NIL0001'
    Mention.save_as_edl_file(mentions, dst_file)
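
# Mention.group_mentions_by_qid is provided by the Mention class elsewhere in
# the project. A minimal sketch of the assumed behavior (query id -> Mention),
# assuming qids coincide with the mention ids used above:
def _group_mentions_by_qid_sketch(mentions):
    return dict((m.mention_id, m) for m in mentions)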

def __nil_clustering(nom_dict_file, edl_file, dst_file):
    nom_names = load_nom_dict(nom_dict_file)
    all_mentions = Mention.load_edl_file(edl_file)
    # Only cluster NIL mentions whose names are not nominals.
    nil_mentions = [m for m in all_mentions
                    if m.kbid.startswith('NIL') and m.name.lower() not in nom_names]
    kbid_mentions = __group_mentions_by_kbid(nil_mentions)

    # Greedy agglomeration: merge each NIL cluster into the first compatible
    # existing cluster, otherwise keep it as a new cluster.
    new_kbids, new_mentions_kbids = list(), list()
    for kbid, mentions in kbid_mentions.iteritems():
        merged = False
        for nkbid, nmentions in izip(new_kbids, new_mentions_kbids):
            if __should_merge(mentions, nmentions):
                # for m in mentions:
                #     print '%s\t' % m.name,
                # print
                # for m in nmentions:
                #     print '%s\t' % m.name,
                # print '\n'
                for m in mentions:
                    m.kbid = nkbid
                    nmentions.append(m)
                merged = True
                break
        if not merged:
            new_kbids.append(kbid)
            new_mentions_kbids.append(mentions)
    Mention.save_as_edl_file(all_mentions, dst_file)
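
# __should_merge is defined elsewhere in this module. A hypothetical merge
# criterion sketch, assuming two NIL clusters should be merged when they share
# a lower-cased surface name; the actual criterion may differ:
def _should_merge_sketch(mentions0, mentions1):
    names0 = set(m.name.lower() for m in mentions0)
    names1 = set(m.name.lower() for m in mentions1)
    return bool(names0 & names1)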

def main():
    # dataset = 'LDC2015E75'
    dataset = 'LDC2015E103'
    # dataset = 'LDC2016E63'
    mentions_tag = '0'
    run_id = 4

    # datadir = '/home/dhl/data/EDL/'
    datadir = 'e:/data/edl'
    doc_list_file = os.path.join(datadir, dataset, 'data/eng-docs-list-win.txt')
    mid_type_file = os.path.join(datadir, 'res/freebase/mid-entity-type.txt')
    cur_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-%s.tab' % mentions_tag)
    miss_match_mentions_file = os.path.join(
        datadir, dataset, 'output/miss-match-mentions-%s.txt' % mentions_tag)
    new_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-pp-ft-%d.tab' % run_id)

    # __nil_clustering(nom_dict_file, edl_file, dst_file)

    mentions = Mention.load_edl_file(cur_edl_file)
    # __link_nom(doc_mentions_dict, max_nil_id)
    __nil_author_clustering(mentions)
    __fix_special_types(mentions)
    __fix_entity_types_by_mid(mid_type_file, mentions)
    # __fix_type_diff_of_same_kbid(mentions)
    __validate_mentions(doc_list_file, mentions, miss_match_mentions_file)
    __fix_pos_error(mentions)
    Mention.save_as_edl_file(mentions, new_edl_file, runid='WednesdayGo%d' % run_id)

def __remove_leading_the(mentions_file, dst_mentions_edl_file):
    # Strip a leading 'the ' from mention names and shift the begin offset to match.
    mentions = Mention.load_edl_file(mentions_file)
    for m in mentions:
        if m.name.startswith('the '):
            m.name = m.name[4:]
            m.beg_pos += 4
    Mention.save_as_edl_file(mentions, dst_mentions_edl_file)

def __extract_nom_mentions(nom_dict_file, doc_list_file, words_pos_file, dst_nom_mentions_file):
    noms = load_nom_dict(nom_dict_file)
    # Sort nominal names longest first so longer phrases are matched first.
    nom_name_list = [n for n in noms]
    nom_name_list.sort(key=lambda x: -len(x))
    nom_name_list = [n.split(' ') for n in nom_name_list]

    doc_path_dict = __load_doc_paths_as_dict(doc_list_file)

    mentions = list()
    f_wp = open(words_pos_file, 'r')
    for i, line in enumerate(f_wp):
        vals = line.rstrip().split('\t')
        docid = vals[0]
        if (i + 1) % 10 == 0:
            print i + 1, docid

        doc_path = doc_path_dict[docid]
        doc_text = read_text(doc_path).decode('utf-8')
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]

        num_sentences = int(vals[1])
        for j in xrange(num_sentences):
            # Token tuples are used as: tup[0] = word, tup[2] = POS tag,
            # tup[3] / tup[4] = begin / end character offsets.
            sentence = __next_sentence_in_words_pos_file(f_wp)
            words = [tup[0].lower() for tup in sentence]
            # print words
            hit_spans, hit_indices = find_phrases_in_words(nom_name_list, words, False)
            for hit_span, hit_idx in izip(hit_spans, hit_indices):
                beg_pos = sentence[hit_span[0]][3]
                end_pos = sentence[hit_span[1] - 1][4]
                tags = [tup[2] for tup in sentence[hit_span[0]:hit_span[1]]]
                # print tags
                # if 'NN' not in tags and 'NNP' not in tags:
                #     continue
                if 'NN' not in tags:
                    continue
                name = doc_text[beg_pos:end_pos + 1].replace('\n', ' ')
                # Skip hits that fall inside markup or URLs.
                if '<' in name or 'http:' in name or '>' in name:
                    continue
                m = Mention(name=name, beg_pos=beg_pos, end_pos=end_pos, docid=docid,
                            mention_type='NOM', entity_type='PER', kbid='NIL00000')
                mentions.append(m)
                # print sentence[hit_span[0]], sentence[hit_span[1]]
                # print nom_name_list[hit_idx], name
        # break
    f_wp.close()
    Mention.save_as_edl_file(mentions, dst_nom_mentions_file)
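
# find_phrases_in_words is imported from elsewhere in the project. Assumed
# contract, inferred from the hit_span[1] - 1 indexing above: it returns
# parallel lists (hit_spans, hit_indices), where each span is a (begin, end)
# token range with end exclusive and each index points into the phrase list.
# Illustrative call:
#   find_phrases_in_words([['prime', 'minister'], ['minister']],
#                         ['the', 'prime', 'minister', 'said'], False)
#   might return ([(1, 3)], [0]) if longer phrases take precedence.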

def __name_expansion(edl_mentions_file, doc_ner_file, tokenized_text_file, entity_candidates_dict_file, dst_file):
    mentions = Mention.load_edl_file(edl_mentions_file)
    __expand_name_with_ner_result(mentions, doc_ner_file)
    # __expand_location_names(mentions, tokenized_text_file, entity_candidates_dict_file)
    Mention.save_as_edl_file(mentions, dst_file)

def all_to_all(edl_file, dst_edl_file):
    # Baseline clustering: give every NIL mention its own cluster id.
    mentions = Mention.load_edl_file(edl_file)
    __assgin_different_id_to_all_nils(mentions)
    Mention.save_as_edl_file(mentions, dst_edl_file)