Exemplo n.º 1
0
def load_mid_eid_file(filename):
    f = open(filename, 'rb')
    num = np.fromfile(f, '>i4', 1)
    print num
    mid_eid_dict = dict()
    for i in xrange(num):
        mid = ioutils.read_str_with_fixed_len(f, 8)
        eid = ioutils.read_str_with_fixed_len(f, 8)
        # print mid, eid
        mid_eid_dict[mid] = eid
    f.close()
    return mid_eid_dict
Exemplo n.º 2
0
def load_mid_eid_file(file_name):
    print 'loading ', file_name
    fin = open(file_name, 'rb')
    num_mids = np.fromfile(fin, '>i4', 1)
    print num_mids, 'mids'
    id_len = 8
    mid_eid_dict = dict()
    for i in xrange(num_mids):
        mid = ioutils.read_str_with_fixed_len(fin, id_len)
        eid = ioutils.read_str_with_fixed_len(fin, id_len)
        mid_eid_dict[mid] = eid
    fin.close()
    print 'done'
    return mid_eid_dict
Exemplo n.º 3
0
def load_mid_eid_file(file_name):
    print 'loading ', file_name
    fin = open(file_name, 'rb')
    num_mids = np.fromfile(fin, '>i4', 1)
    print num_mids, 'mids'
    id_len = 8
    mid_eid_dict = dict()
    for i in xrange(num_mids):
        mid = ioutils.read_str_with_fixed_len(fin, id_len)
        eid = ioutils.read_str_with_fixed_len(fin, id_len)
        mid_eid_dict[mid] = eid
    fin.close()
    print 'done'
    return mid_eid_dict
Exemplo n.º 4
0
def gen_legal_wid_list_file():
    # mid_alias_cnt_file_name = 'd:/data/el/merged_fb_mid_alias_cnt.txt'
    filter_mid_file_name = 'e:/el/res/freebase/filter_mids_10_8.bin'
    mid_wid_file_name = 'd:/data/el/mid_to_wid_full_ord_wid.txt'
    dst_wid_file_name = 'e:/dc/el/legal_wid_list.bin'
    print 'loading filter mids ...'
    fin = open(filter_mid_file_name, 'rb')
    num_mids = np.fromfile(fin, '>i4', 1)
    filter_mid_set = set()
    for i in xrange(num_mids):
        mid = ioutils.read_str_with_fixed_len(fin, 8)
        filter_mid_set.add(mid)
    fin.close()
    print 'done.'

    print 'loading wids ...'
    fin = open(mid_wid_file_name, 'rb')
    wid_list = list()
    for line in fin:
        vals = line.strip().split('\t')
        mid = vals[0]
        wid = int(vals[1])
        if mid not in filter_mid_set:
            wid_list.append(wid)
    fin.close()
    print 'done.'

    print len(wid_list), 'wids'
    fout = open(dst_wid_file_name, 'wb')
    np.asarray([len(wid_list)], np.int32).tofile(fout)
    np.asarray(wid_list, np.int32).tofile(fout)
    fout.close()
    print 'done.'
Exemplo n.º 5
0
def gen_legal_wid_list_file():
    # mid_alias_cnt_file_name = 'd:/data/el/merged_fb_mid_alias_cnt.txt'
    filter_mid_file_name = 'e:/el/res/freebase/filter_mids_10_8.bin'
    mid_wid_file_name = 'd:/data/el/mid_to_wid_full_ord_wid.txt'
    dst_wid_file_name = 'e:/dc/el/legal_wid_list.bin'
    print 'loading filter mids ...'
    fin = open(filter_mid_file_name, 'rb')
    num_mids = np.fromfile(fin, '>i4', 1)
    filter_mid_set = set()
    for i in xrange(num_mids):
        mid = ioutils.read_str_with_fixed_len(fin, 8)
        filter_mid_set.add(mid)
    fin.close()
    print 'done.'

    print 'loading wids ...'
    fin = open(mid_wid_file_name, 'rb')
    wid_list = list()
    for line in fin:
        vals = line.strip().split('\t')
        mid = vals[0]
        wid = int(vals[1])
        if mid not in filter_mid_set:
            wid_list.append(wid)
    fin.close()
    print 'done.'

    print len(wid_list), 'wids'
    fout = open(dst_wid_file_name, 'wb')
    np.asarray([len(wid_list)], np.int32).tofile(fout)
    np.asarray(wid_list, np.int32).tofile(fout)
    fout.close()
    print 'done.'
Exemplo n.º 6
0
def __load_mention_info(fin, vec_dim):
    """Read one mention record from the open binary file ``fin``.

    The record is consumed sequentially: a query id (via
    ``ioutils.read_str_with_byte_len``), a big-endian int32 candidate count,
    and then for each candidate a mid (``read_str_with_fixed_len(fin, 8)``),
    one big-endian float32 commonness value and ``vec_dim`` big-endian
    float32 vector components.

    Args:
        fin: binary file object positioned at the start of a mention record.
        vec_dim: number of float32 components in each candidate vector.

    Returns:
        ``(qid, candidates)`` where ``candidates`` is a list of
        ``(mid, commonness, vec)`` tuples; ``commonness`` is the array
        returned by ``np.fromfile`` for a single value and ``vec`` is the
        array of vector components.
    """
    query_id = ioutils.read_str_with_byte_len(fin)
    candidate_count = np.fromfile(fin, '>i4', 1)
    # Tuple elements are evaluated left to right, which keeps the three reads
    # in the order the file stores them: mid, commonness, vector.
    entries = [
        (ioutils.read_str_with_fixed_len(fin, 8),
         np.fromfile(fin, '>f4', 1),
         np.fromfile(fin, '>f4', vec_dim))
        for _ in xrange(candidate_count)
    ]
    return query_id, entries
Exemplo n.º 7
0
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    label_dict = load_gold_id_file(gold_label_file)

    vec_dim = 100

    nil_cnt = 0
    miss_cnt = 0
    fh_cnt = 0
    nil_hit_cnt = 0
    tmp_fout = open('e:/data/emadr/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    candidates_list = list()
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # if i < 5:
        #     print doc_vec

        doc_vec.astype(np.float32).tofile(fout)

        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            # gold_label = 'NIL'

            num_candidates = np.fromfile(fin, '>i4', 1)

            gold_label = label_dict[qid]

            cur_candidates = list()
            cur_candidates_tup = (qid, cur_candidates)
            candidates_list.append(cur_candidates_tup)

            hit_idx = -1
            commonness = list()
            candidate_vecs = list()
            eids = list()
            all_nil = True
            non_nil_cnt = -1
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                eid = mid_eid_dict.get(mid, 'NILL')
                if eid != 'NILL':
                    all_nil = False
                    non_nil_cnt += 1

                cur_candidates.append(eid)

                if k == 0 and eid == 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    # hit_idx = k
                    hit_idx = non_nil_cnt

                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                vec = np.fromfile(fin, '>f4', vec_dim)

                if eid != 'NILL':
                    commonness.append(cur_com)
                    candidate_vecs.append(vec.astype(np.float32))
                eids.append(eid)

            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                # print commonness
                # print
                if hit_idx == 0:
                    fh_cnt += 1

            if gold_label.startswith('NIL'):
                nil_cnt += 1
                if all_nil:
                    nil_hit_cnt += 1

        # print len(mention_infos)
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()

    candidates_list.sort(key=lambda x: x[0])
    __print_candidates(candidates_list)

    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries, nil_hit_cnt
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries
    print float(num_queries - miss_cnt + nil_hit_cnt) / num_queries
    print 1 - float(miss_cnt - nil_cnt) / num_queries
Exemplo n.º 8
0
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    label_dict = load_gold_label_file(gold_label_file)

    vec_dim = 100

    nil_cnt = 0
    miss_cnt = 0
    fh_cnt = 0
    tmp_fout = open('e:/dc/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)

        doc_vec.astype(np.float32).tofile(fout)

        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            gold_label = label_dict[qid]
            # gold_label = 'NIL'
            if gold_label.startswith('NIL'):
                nil_cnt += 1

            num_candidates = np.fromfile(fin, '>i4', 1)
            hit_idx = -1
            commonness = list()
            candidate_vecs = list()
            eids = list()
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                eid = mid_eid_dict.get(mid, 'NILL')

                if k == 0 and eid != 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    hit_idx = k

                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                commonness.append(cur_com)
                vec = np.fromfile(fin, '>f4', vec_dim)
                candidate_vecs.append(vec.astype(np.float32))
                eids.append(eid)

            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                if hit_idx == 0:
                    fh_cnt += 1

        # print len(mention_infos)
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()

    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries