예제 #1
0
        srcdir + 'relation_train.hist-%d.txt' % hist_size,
        srcdir + 'relation_valid.hist-%d.txt' % hist_size,
        srcdir + 'relation_test.hist-%d.txt' % hist_size
    ]
    # Load the embedding table. It is used as a mapping below (len(), values(),
    # item assignment); presumably id -> vector -- confirm in read_embedding.
    embed_dict = read_embedding(filename=embedfile)
    print('read embedding finished ...')
    # The current entry count becomes the padding id, mapped to an all-zero
    # vector of the same width as the first stored embedding.
    _PAD_ = len(embed_dict)
    embed_size = len(list(embed_dict.values())[0])
    embed_dict[_PAD_] = np.zeros((embed_size, ), dtype=np.float32)
    # Start from a uniform random (_PAD_ + 1, embed_size) float32 matrix and
    # hand it to convert_embed_2_numpy together with the dict (presumably rows
    # are overwritten with the known vectors -- confirm in that helper).
    embed = np.float32(np.random.uniform(-0.2, 0.2, [_PAD_ + 1, embed_size]))
    embed = convert_embed_2_numpy(embed_dict, embed=embed)

    corpus, _ = read_data(corpusfile)
    print('read corpus finished....')
    # One histogram file is written per relation file, paired by position.
    for idx, relfile in enumerate(relfiles):
        histfile = histfiles[idx]
        rel = read_relation(relfile)
        # The context manager guarantees the output file is closed even when
        # an assertion, a corpus lookup, or cal_hist raises part-way through.
        with open(histfile, 'w') as fout:
            for label, d1, d2 in rel:
                assert d1 in corpus
                assert d2 in corpus
                # qnum is the number of entries stored for d1 in the corpus.
                qnum = len(corpus[d1])
                # Index the embedding matrix with each document's corpus entry.
                d1_embed = embed[corpus[d1]]
                d2_embed = embed[corpus[d2]]
                curr_hist = cal_hist(d1_embed, d2_embed, qnum, hist_size)
                curr_hist = curr_hist.tolist()
                # One space-separated line per (d1, d2) pair.
                fout.write(' '.join(map(str, curr_hist)) + '\n')
    print('generate histogram finished ...')
예제 #2
0
            srcdir + 'relation_train.hist-%d.txt' % hist_size,
            srcdir + 'relation_valid.hist-%d.txt' % hist_size,
            srcdir + 'relation_test.hist-%d.txt' % hist_size
            ]
    # Load the embedding table. It is used as a mapping below (len(), values(),
    # item assignment); presumably id -> vector -- confirm in read_embedding.
    embed_dict = read_embedding(filename=embedfile)
    print('read embedding finished ...')
    # The current entry count becomes the padding id, mapped to an all-zero
    # vector of the same width as the first stored embedding.
    _PAD_ = len(embed_dict)
    embed_size = len(list(embed_dict.values())[0])
    embed_dict[_PAD_] = np.zeros((embed_size, ), dtype=np.float32)
    # Start from a uniform random (_PAD_ + 1, embed_size) float32 matrix and
    # hand it to convert_embed_2_numpy together with the dict (presumably rows
    # are overwritten with the known vectors -- confirm in that helper).
    embed = np.float32(np.random.uniform(-0.2, 0.2, [_PAD_ + 1, embed_size]))
    embed = convert_embed_2_numpy(embed_dict, embed=embed)

    corpus, _ = read_data(corpusfile)
    print('read corpus finished....')
    # One histogram file is written per relation file, paired by position.
    for idx, relfile in enumerate(relfiles):
        histfile = histfiles[idx]
        rel = read_relation(relfile)
        # The context manager guarantees the output file is closed even when
        # an assertion, a corpus lookup, or cal_hist raises part-way through.
        with open(histfile, 'w') as fout:
            for label, d1, d2 in rel:
                assert d1 in corpus
                assert d2 in corpus
                # qnum is the number of entries stored for d1 in the corpus.
                qnum = len(corpus[d1])
                # Index the embedding matrix with each document's corpus entry.
                d1_embed = embed[corpus[d1]]
                d2_embed = embed[corpus[d2]]
                curr_hist = cal_hist(d1_embed, d2_embed, qnum, hist_size)
                curr_hist = curr_hist.tolist()
                # One space-separated line per (d1, d2) pair.
                fout.write(' '.join(map(str, curr_hist)) + '\n')
    print('generate histogram finished ...')