def load_mid_eid_file(filename): f = open(filename, 'rb') num = np.fromfile(f, '>i4', 1) print num mid_eid_dict = dict() for i in xrange(num): mid = ioutils.read_str_with_fixed_len(f, 8) eid = ioutils.read_str_with_fixed_len(f, 8) # print mid, eid mid_eid_dict[mid] = eid f.close() return mid_eid_dict
def load_mid_eid_file(file_name): print 'loading ', file_name fin = open(file_name, 'rb') num_mids = np.fromfile(fin, '>i4', 1) print num_mids, 'mids' id_len = 8 mid_eid_dict = dict() for i in xrange(num_mids): mid = ioutils.read_str_with_fixed_len(fin, id_len) eid = ioutils.read_str_with_fixed_len(fin, id_len) mid_eid_dict[mid] = eid fin.close() print 'done' return mid_eid_dict
def gen_legal_wid_list_file(): # mid_alias_cnt_file_name = 'd:/data/el/merged_fb_mid_alias_cnt.txt' filter_mid_file_name = 'e:/el/res/freebase/filter_mids_10_8.bin' mid_wid_file_name = 'd:/data/el/mid_to_wid_full_ord_wid.txt' dst_wid_file_name = 'e:/dc/el/legal_wid_list.bin' print 'loading filter mids ...' fin = open(filter_mid_file_name, 'rb') num_mids = np.fromfile(fin, '>i4', 1) filter_mid_set = set() for i in xrange(num_mids): mid = ioutils.read_str_with_fixed_len(fin, 8) filter_mid_set.add(mid) fin.close() print 'done.' print 'loading wids ...' fin = open(mid_wid_file_name, 'rb') wid_list = list() for line in fin: vals = line.strip().split('\t') mid = vals[0] wid = int(vals[1]) if mid not in filter_mid_set: wid_list.append(wid) fin.close() print 'done.' print len(wid_list), 'wids' fout = open(dst_wid_file_name, 'wb') np.asarray([len(wid_list)], np.int32).tofile(fout) np.asarray(wid_list, np.int32).tofile(fout) fout.close() print 'done.'
def __load_mention_info(fin, vec_dim):
    """Read one mention record from the open binary stream `fin`.

    Record layout: a byte-length-prefixed qid string, a big-endian int32
    candidate count, then per candidate an 8-byte mid, a big-endian
    float32 commonness value and a `vec_dim`-long big-endian float32
    vector.

    Returns (qid, candidates) where candidates is a list of
    (mid, commonness, vec) tuples; commonness stays a length-1 ndarray
    as np.fromfile returns it.
    """
    qid = ioutils.read_str_with_byte_len(fin)
    # print qid
    # if qid == '':
    #     print doc_id, j, num_mentions
    # gold_label = 'NIL'
    candidates = list()
    # Plain int count; avoids passing a length-1 ndarray to xrange.
    num_candidates = int(np.fromfile(fin, '>i4', 1)[0])
    for k in xrange(num_candidates):
        mid = ioutils.read_str_with_fixed_len(fin, 8)
        commonness = np.fromfile(fin, '>f4', 1)
        vec = np.fromfile(fin, '>f4', vec_dim)
        candidates.append((mid, commonness, vec))
    return qid, candidates
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    """Join gold entity labels onto a binary candidate-vector file.

    Streams `vec_train_file` (per-doc vector plus per-mention candidate
    records), maps each candidate mid to an eid, and writes to `dst_file`
    only the mentions whose gold entity appears among the non-NIL
    candidates: the gold candidate's index (counted over non-NIL
    candidates only), the commonness scores and the candidate vectors.
    Prints hit/miss statistics at the end.

    NOTE(review): a second `add_gold_label` defined later in this module
    shadows this one.
    """
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    # qid -> gold entity id (presumably; schema of load_gold_id_file not
    # visible here — confirm against its definition).
    label_dict = load_gold_id_file(gold_label_file)
    vec_dim = 100  # dimensionality of every vector in the stream
    nil_cnt = 0      # queries whose gold label starts with 'NIL'
    miss_cnt = 0     # queries whose gold entity is not among the kept candidates
    fh_cnt = 0       # queries where the first non-NIL candidate is the gold one
    nil_hit_cnt = 0  # NIL-gold queries where every candidate mapped to NIL
    # Debug dump of queries whose FIRST candidate has no eid mapping.
    tmp_fout = open('e:/data/emadr/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)  # big-endian int32 doc count
    print num_docs
    fout = open(dst_file, 'wb')
    # Output header: doc count and vector dimension.
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    candidates_list = list()  # (qid, [candidate eids]) per mention
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # if i < 5:
        #     print doc_vec
        # Re-emit the document vector in native-endian float32.
        doc_vec.astype(np.float32).tofile(fout)
        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            # gold_label = 'NIL'
            num_candidates = np.fromfile(fin, '>i4', 1)
            gold_label = label_dict[qid]
            cur_candidates = list()
            cur_candidates_tup = (qid, cur_candidates)
            candidates_list.append(cur_candidates_tup)
            hit_idx = -1  # position of gold among NON-NIL candidates; -1 = miss
            commonness = list()
            candidate_vecs = list()
            eids = list()
            all_nil = True     # becomes False once any candidate has an eid
            non_nil_cnt = -1   # index over non-NIL candidates seen so far
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                # 'NILL' marks a mid with no eid mapping (distinct from
                # the gold 'NIL*' labels).
                eid = mid_eid_dict.get(mid, 'NILL')
                if eid != 'NILL':
                    all_nil = False
                    non_nil_cnt += 1
                cur_candidates.append(eid)
                if k == 0 and eid == 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    # hit_idx = k
                    hit_idx = non_nil_cnt
                # Commonness and vector must be consumed from the stream
                # even for candidates that are dropped below.
                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                vec = np.fromfile(fin, '>f4', vec_dim)
                if eid != 'NILL':
                    commonness.append(cur_com)
                    candidate_vecs.append(vec.astype(np.float32))
                    eids.append(eid)
            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                # print commonness
                # print
                if hit_idx == 0:
                    fh_cnt += 1
            if gold_label.startswith('NIL'):
                nil_cnt += 1
                # NOTE(review): nesting reconstructed — nil_hit_cnt is
                # counted only for NIL-gold queries; confirm intent.
                if all_nil:
                    nil_hit_cnt += 1
        # print len(mention_infos)
        # Per-doc output: mention count, then per mention its candidate
        # count, the gold index, the commonness array and the vectors.
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()
    candidates_list.sort(key=lambda x: x[0])
    __print_candidates(candidates_list)
    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries, nil_hit_cnt
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries
    print float(num_queries - miss_cnt + nil_hit_cnt) / num_queries
    print 1 - float(miss_cnt - nil_cnt) / num_queries
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    """Join gold entity labels onto a binary candidate-vector file.

    Streams `vec_train_file` (per-doc vector plus per-mention candidate
    records), maps each candidate mid to an eid, and writes to `dst_file`
    only the mentions whose gold entity appears among the candidates:
    the gold candidate's index, all commonness scores and all candidate
    vectors. Prints hit/miss statistics at the end.

    NOTE(review): this duplicates — and shadows — the earlier
    `add_gold_label` in this module; the two differ (here hit_idx counts
    ALL candidates and the debug dump fires on `eid != 'NILL'`).
    """
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    # qid -> gold entity id (presumably; schema of load_gold_label_file
    # not visible here — confirm against its definition).
    label_dict = load_gold_label_file(gold_label_file)
    vec_dim = 100  # dimensionality of every vector in the stream
    nil_cnt = 0   # queries whose gold label starts with 'NIL'
    miss_cnt = 0  # queries whose gold entity is not among the candidates
    fh_cnt = 0    # queries where the first candidate is the gold one
    # Debug dump of queries whose first candidate DID resolve to an eid.
    tmp_fout = open('e:/dc/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)  # big-endian int32 doc count
    print num_docs
    fout = open(dst_file, 'wb')
    # Output header: doc count and vector dimension.
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # Re-emit the document vector in native-endian float32.
        doc_vec.astype(np.float32).tofile(fout)
        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            gold_label = label_dict[qid]
            # gold_label = 'NIL'
            if gold_label.startswith('NIL'):
                nil_cnt += 1
            num_candidates = np.fromfile(fin, '>i4', 1)
            hit_idx = -1  # position of the gold candidate; -1 = miss
            commonness = list()
            candidate_vecs = list()
            eids = list()
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                # 'NILL' marks a mid with no eid mapping (distinct from
                # the gold 'NIL*' labels).
                eid = mid_eid_dict.get(mid, 'NILL')
                if k == 0 and eid != 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    hit_idx = k
                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                commonness.append(cur_com)
                vec = np.fromfile(fin, '>f4', vec_dim)
                candidate_vecs.append(vec.astype(np.float32))
                eids.append(eid)
            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                if hit_idx == 0:
                    fh_cnt += 1
        # print len(mention_infos)
        # Per-doc output: mention count, then per mention its candidate
        # count, the gold index, the commonness array and the vectors.
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()
    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries