Пример #1
0
def close_words_of_entities():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_joint_oml_100.bin'
    entity_dict_file_name = 'e:/dc/20ng_bydate/entity_names.txt'
    entity_vecs_file_name = 'e:/dc/20ng_bydate/vecs/entity_vecs_joint_oml_100.bin'

    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)
    words = ioutils.load_words_dict_to_list(word_dict_file_name)
    entity_vecs = ioutils.load_vec_list_file(entity_vecs_file_name)
    entities = ioutils.load_entity_dict(entity_dict_file_name)
    print len(entity_vecs)
    print len(entities)

    def show_close_words(entity_idx):
        print 'entity: ', entities[entity_idx]
        entity_vec = entity_vecs[entity_idx]
        dist_list = list()
        for word_idx in xrange(len(word_vecs)):
            # dist = np.dot(entity_vec, word_vecs[word_idx])
            dist = scipy.spatial.distance.cosine(entity_vec, word_vecs[word_idx])
            dist_list.append((dist, word_idx))
        # dist_list.sort(key=lambda tup: tup[0])
        closest_words = heapq.nsmallest(100, dist_list, key=lambda tup: tup[0])
        for dist, idx in closest_words:
            print dist, words[idx], idx

    show_close_words(25304)
Пример #2
0
def close_words_of_entities():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_joint_oml_100.bin'
    entity_dict_file_name = 'e:/dc/20ng_bydate/entity_names.txt'
    entity_vecs_file_name = 'e:/dc/20ng_bydate/vecs/entity_vecs_joint_oml_100.bin'

    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)
    words = ioutils.load_words_dict_to_list(word_dict_file_name)
    entity_vecs = ioutils.load_vec_list_file(entity_vecs_file_name)
    entities = ioutils.load_entity_dict(entity_dict_file_name)
    print len(entity_vecs)
    print len(entities)

    def show_close_words(entity_idx):
        print 'entity: ', entities[entity_idx]
        entity_vec = entity_vecs[entity_idx]
        dist_list = list()
        for word_idx in xrange(len(word_vecs)):
            # dist = np.dot(entity_vec, word_vecs[word_idx])
            dist = scipy.spatial.distance.cosine(entity_vec,
                                                 word_vecs[word_idx])
            dist_list.append((dist, word_idx))
        # dist_list.sort(key=lambda tup: tup[0])
        closest_words = heapq.nsmallest(100, dist_list, key=lambda tup: tup[0])
        for dist, idx in closest_words:
            print dist, words[idx], idx

    show_close_words(25304)
Пример #3
0
def merge_vecs():
    year = 2010
    part = 'train'
    method = 3

    # vecs_file0 = 'e:/dc/el/vecs/tac_' + file_tag + '_entity_vecs.bin'
    # vecs_file1 = 'e:/dc/el/vecs/tac_' + file_tag + '_dw_vecs.bin'
    # dst_file = 'e:/dc/el/vecs/tac_' + file_tag + '_vecs.bin'

    vecs_file0 = 'e:/dc/el/vecs/%d/%s_%d_de_vecs.bin' % (year, part, method)
    vecs_file1 = 'e:/dc/el/vecs/%d/%s_%d_dw_vecs.bin' % (year, part, method)
    dst_file = 'e:/dc/el/vecs/%d/%s_%d_vecs.bin' % (year, part, method)

    vecs0 = ioutils.load_vec_list_file(vecs_file0)
    vecs1 = ioutils.load_vec_list_file(vecs_file1)
    if len(vecs0) != len(vecs1):
        print 'number of vectors not equal!'
        return

    fout = open(dst_file, 'wb')
    np.asarray([len(vecs0), len(vecs0[0]) + len(vecs1[0])],
               np.int32).tofile(fout)
    for i in xrange(len(vecs0)):
        vecs0[i].tofile(fout)
        vecs1[i].tofile(fout)
    fout.close()
Пример #4
0
def merge_vecs():
    year = 2010
    part = 'train'
    method = 3

    # vecs_file0 = 'e:/dc/el/vecs/tac_' + file_tag + '_entity_vecs.bin'
    # vecs_file1 = 'e:/dc/el/vecs/tac_' + file_tag + '_dw_vecs.bin'
    # dst_file = 'e:/dc/el/vecs/tac_' + file_tag + '_vecs.bin'

    vecs_file0 = 'e:/dc/el/vecs/%d/%s_%d_de_vecs.bin' % (year, part, method)
    vecs_file1 = 'e:/dc/el/vecs/%d/%s_%d_dw_vecs.bin' % (year, part, method)
    dst_file = 'e:/dc/el/vecs/%d/%s_%d_vecs.bin' % (year, part, method)

    vecs0 = ioutils.load_vec_list_file(vecs_file0)
    vecs1 = ioutils.load_vec_list_file(vecs_file1)
    if len(vecs0) != len(vecs1):
        print 'number of vectors not equal!'
        return

    fout = open(dst_file, 'wb')
    np.asarray([len(vecs0), len(vecs0[0]) + len(vecs1[0])], np.int32).tofile(fout)
    for i in xrange(len(vecs0)):
        vecs0[i].tofile(fout)
        vecs1[i].tofile(fout)
    fout.close()
Пример #5
0
def close_wiki_pages():
    dst_wid = 534366
    wid_title_file = 'e:/el/tmpres/wiki/enwiki-20150403-id-title-list.txt'
    wid_title_dict = __load_id_title_dict(wid_title_file)

    wiki_page_id_file = 'e:/dc/el/wiki/wiki_page_ids.bin'
    fin = open(wiki_page_id_file, 'rb')
    num_pages = np.fromfile(fin, np.int32, 1)
    page_ids = np.fromfile(fin, np.int32, num_pages)
    fin.close()

    pos = 0
    for i, wid in enumerate(page_ids):
        if wid == dst_wid:
            pos = i
            break

    wiki_vec_file = 'e:/dc/el/vecs/wiki_dedw_vecs_1.bin'
    # fin = open(wiki_vec_file, 'rb')
    # num_vecs, dim = np.fromfile(fin, np.int32, 2)
    # wiki_vecs = np.zeros((num_vecs, 50), np.float32)
    # for i in xrange(num_vecs):
    #     vec = np.fromfile(fin, np.float32, dim)
    #     wiki_vecs[i][:] = vec[50:]
    # fin.close()
    wiki_vecs = ioutils.load_vec_list_file(wiki_vec_file)

    top_list = close_vecs(wiki_vecs, wiki_vecs[pos])
    for idx, dist in top_list:
        print wid_title_dict[page_ids[idx]], dist
Пример #6
0
def close_wiki_pages_el_doc():
    wiki_page_id_file = 'e:/dc/el/wiki/wiki_page_ids.bin'
    fin = open(wiki_page_id_file, 'rb')
    num_pages = np.fromfile(fin, np.int32, 1)
    page_ids = np.fromfile(fin, np.int32, num_pages)
    fin.close()

    wiki_vec_file = 'e:/dc/el/vecs/wiki_dedw_vecs.bin'
    fin = open(wiki_vec_file, 'rb')
    num_vecs, dim = np.fromfile(fin, np.int32, 2)
    # wiki_vecs = np.zeros((num_vecs, 50), np.float32)
    wiki_vecs = np.zeros((num_vecs, 100), np.float32)
    for i in xrange(num_vecs):
        vec = np.fromfile(fin, np.float32, dim)
        # wiki_vecs[i][:] = vec[50:]
        wiki_vecs[i][:] = vec
    fin.close()

    # el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_train_doc_vecs_av.bin'
    # el_doc_vec_file = 'e:/dc/el/vecs/wiki_vecs_av.bin'
    el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_eval_vecs.bin'
    # el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_train_dw_vecs.bin'
    el_doc_vecs = ioutils.load_vec_list_file(el_doc_vec_file)
    top_list = close_vecs(wiki_vecs, el_doc_vecs[0])
    for idx, dist in top_list:
        print page_ids[idx], dist
Пример #7
0
def close_wiki_pages():
    dst_wid = 534366
    wid_title_file = 'e:/el/tmpres/wiki/enwiki-20150403-id-title-list.txt'
    wid_title_dict = __load_id_title_dict(wid_title_file)

    wiki_page_id_file = 'e:/dc/el/wiki/wiki_page_ids.bin'
    fin = open(wiki_page_id_file, 'rb')
    num_pages = np.fromfile(fin, np.int32, 1)
    page_ids = np.fromfile(fin, np.int32, num_pages)
    fin.close()

    pos = 0
    for i, wid in enumerate(page_ids):
        if wid == dst_wid:
            pos = i
            break

    wiki_vec_file = 'e:/dc/el/vecs/wiki_dedw_vecs_1.bin'
    # fin = open(wiki_vec_file, 'rb')
    # num_vecs, dim = np.fromfile(fin, np.int32, 2)
    # wiki_vecs = np.zeros((num_vecs, 50), np.float32)
    # for i in xrange(num_vecs):
    #     vec = np.fromfile(fin, np.float32, dim)
    #     wiki_vecs[i][:] = vec[50:]
    # fin.close()
    wiki_vecs = ioutils.load_vec_list_file(wiki_vec_file)

    top_list = close_vecs(wiki_vecs, wiki_vecs[pos])
    for idx, dist in top_list:
        print wid_title_dict[page_ids[idx]], dist
Пример #8
0
def __split_vecs(all_vecs_file_name, split_labels_file_name,
                 dst_train_vecs_file_name, dst_test_vecs_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name)
    split_labels = ioutils.load_labels_file(split_labels_file_name)

    train_vec_list = list()
    test_vec_list = list()
    for vec, split_label in zip(all_vec_list, split_labels):
        # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32)
        # print split_label
        if split_label == 1:
            test_vec_list.append(vec)
        else:
            train_vec_list.append(vec)

    print len(train_vec_list), 'training samples'
    print len(test_vec_list), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vecs_file_name)
    save_vecs(test_vec_list, dst_test_vecs_file_name)
Пример #9
0
def averaging_vecs(doc_word_file, word_vecs_file, dst_file):
    word_vecs = ioutils.load_vec_list_file(word_vecs_file)
    dim = len(word_vecs[0])

    fin = open(doc_word_file, 'rb')
    num_left, num_right = np.fromfile(fin, np.int32, 2)
    print num_left, num_right
    fout = open(dst_file, 'wb')
    np.asarray([num_left, dim], np.int32).tofile(fout)
    for i in xrange(num_left):
        num_vertices = np.fromfile(fin, np.int32, 1)

        indices = np.fromfile(fin, np.int32, num_vertices)
        weights = np.fromfile(fin, np.uint16, num_vertices)

        vec = np.zeros(dim, np.float32)
        sum_weights = np.sum(weights)
        for idx, w in zip(indices, weights):
            vec += word_vecs[idx] * w
        vec /= sum_weights
        vec.tofile(fout)

        if i % 10000 == 10000 - 1:
            print i + 1
        # break  # TODO
    fout.close()
    fin.close()
Пример #10
0
def cluster_nyt():
    num_clusters_list = [5, 10, 15, 20]
    method = 'RSM'

    result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower()

    labels_file_name = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/glove-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner-200.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-015.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw5-vecs-ner.bin'
    doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/rsm-vecs-20.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/drbm-vecs-30.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdm-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdbow-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/nvdm-nyt.bin'

    # doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/test-dedw-vecs.bin'
    # labels_file_name = 'e:/dc/20ng_bydate/test_labels.bin'

    perf_list = list()
    # for num_clusters in [5, 10, 15, 20]:
    vec_list = ioutils.load_vec_list_file(doc_vec_file_name)
    labels = ioutils.load_labels_file(labels_file_name)
    for num_clusters in num_clusters_list:
        print '%d clusters' % num_clusters
        # nmi_score, purity_score, ri_score = clustering(doc_vec_file_name, labels_file_name, num_clusters)
        nmi_score, purity_score, ri_score = cluster_and_eval(vec_list, labels, num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
        # break
    write_clustering_perf_to_csv(method, perf_list, result_file)
Пример #11
0
def split_vecs(all_vecs_file_name,
               split_labels_file_name,
               dst_train_vecs_file_name,
               dst_test_vecs_file_name,
               train_label=0,
               test_label=1):
    all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name)
    split_labels = ioutils.load_labels_file(split_labels_file_name)

    train_vec_list = list()
    test_vec_list = list()
    for vec, split_label in zip(all_vec_list, split_labels):
        # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32)
        # print split_label
        if split_label == test_label:
            test_vec_list.append(vec)
        elif split_label == train_label:
            train_vec_list.append(vec)

    print len(train_vec_list), 'training samples'
    print len(test_vec_list), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vecs_file_name)
    save_vecs(test_vec_list, dst_test_vecs_file_name)
Пример #12
0
def averaging_vecs(doc_word_file, word_vecs_file, dst_file):
    word_vecs = ioutils.load_vec_list_file(word_vecs_file)
    dim = len(word_vecs[0])

    fin = open(doc_word_file, 'rb')
    num_left, num_right = np.fromfile(fin, np.int32, 2)
    print num_left, num_right
    fout = open(dst_file, 'wb')
    np.asarray([num_left, dim], np.int32).tofile(fout)
    for i in xrange(num_left):
        num_vertices = np.fromfile(fin, np.int32, 1)

        indices = np.fromfile(fin, np.int32, num_vertices)
        weights = np.fromfile(fin, np.uint16, num_vertices)

        vec = np.zeros(dim, np.float32)
        sum_weights = np.sum(weights)
        for idx, w in zip(indices, weights):
            vec += word_vecs[idx] * w
        vec /= sum_weights
        vec.tofile(fout)

        if i % 10000 == 10000 - 1:
            print i + 1
        # break  # TODO
    fout.close()
    fin.close()
Пример #13
0
def close_wiki_pages_el_doc():
    wiki_page_id_file = 'e:/dc/el/wiki/wiki_page_ids.bin'
    fin = open(wiki_page_id_file, 'rb')
    num_pages = np.fromfile(fin, np.int32, 1)
    page_ids = np.fromfile(fin, np.int32, num_pages)
    fin.close()

    wiki_vec_file = 'e:/dc/el/vecs/wiki_dedw_vecs.bin'
    fin = open(wiki_vec_file, 'rb')
    num_vecs, dim = np.fromfile(fin, np.int32, 2)
    # wiki_vecs = np.zeros((num_vecs, 50), np.float32)
    wiki_vecs = np.zeros((num_vecs, 100), np.float32)
    for i in xrange(num_vecs):
        vec = np.fromfile(fin, np.float32, dim)
        # wiki_vecs[i][:] = vec[50:]
        wiki_vecs[i][:] = vec
    fin.close()

    # el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_train_doc_vecs_av.bin'
    # el_doc_vec_file = 'e:/dc/el/vecs/wiki_vecs_av.bin'
    el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_eval_vecs.bin'
    # el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_train_dw_vecs.bin'
    el_doc_vecs = ioutils.load_vec_list_file(el_doc_vec_file)
    top_list = close_vecs(wiki_vecs, el_doc_vecs[0])
    for idx, dist in top_list:
        print page_ids[idx], dist
Пример #14
0
def __cluster_nyt():
    # num_clusters_list = [5, 10, 15, 20]
    num_clusters_list = [10, 15, 20]
    # num_clusters_list = [5]
    method = 'RSM'

    datadir = 'e:/data/emadr/nyt-less-docs/world'
    result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower(
    )

    labels_file_name = os.path.join(datadir, 'bindata/test-labels.bin')
    # doc_vec_file_name = os.path.join(datadir, 'vecs/test-dedw-vecs.bin')
    doc_vec_file_name = os.path.join(datadir, 'bindata/test-pvdbow-vecs.bin')
    # doc_vec_file_name = os.path.join(datadir, 'rsm/test-rsm-vecs.bin')

    perf_list = list()
    # for num_clusters in [5, 10, 15, 20]:
    vec_list = ioutils.load_vec_list_file(doc_vec_file_name)
    labels = ioutils.load_labels_file(labels_file_name)
    for num_clusters in num_clusters_list:
        print '%d clusters' % num_clusters
        # nmi_score, purity_score, ri_score = clustering(doc_vec_file_name, labels_file_name, num_clusters)
        nmi_score, purity_score, ri_score = cluster_and_eval(
            vec_list, labels, num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
Пример #15
0
def close_docs(doc_vec_file, dst_idx=0, k=10):
    # doc_vec_file = 'e:/dc/20ng_bydate/vecs/dedw_vecs_0.bin'

    print 'loading', doc_vec_file
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file)
    print 'done'

    dst_vec = doc_vecs[dst_idx]
    return close_vecs(doc_vecs, dst_vec)
Пример #16
0
def close_docs(doc_vec_file, dst_idx=0, k=10):
    # doc_vec_file = 'e:/dc/20ng_bydate/vecs/dedw_vecs_0.bin'

    print 'loading', doc_vec_file
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file)
    print 'done'

    dst_vec = doc_vecs[dst_idx]
    return close_vecs(doc_vecs, dst_vec)
Пример #17
0
def close_words():
    words_dict_file = 'e:/dc/el/words_dict_proper.txt'
    word_vecs_file = 'e:/dc/el/vecs/word_vecs.bin'

    words = ioutils.load_words_dict_to_list(words_dict_file, False)
    print len(words)
    idx = 774318
    print words[idx]

    word_vecs = ioutils.load_vec_list_file(word_vecs_file)
    print len(word_vecs)
    close_list = close_vecs(word_vecs, word_vecs[idx])
    for idx, dist in close_list:
        print words[idx], dist
Пример #18
0
def close_words():
    words_dict_file = 'e:/dc/el/words_dict_proper.txt'
    word_vecs_file = 'e:/dc/el/vecs/word_vecs.bin'

    words = ioutils.load_words_dict_to_list(words_dict_file, False)
    print len(words)
    idx = 774318
    print words[idx]

    word_vecs = ioutils.load_vec_list_file(word_vecs_file)
    print len(word_vecs)
    close_list = close_vecs(word_vecs, word_vecs[idx])
    for idx, dist in close_list:
        print words[idx], dist
Пример #19
0
def close_words_of_docs():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    words = ioutils.load_words_dict_to_list(word_dict_file_name)

    bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(bow_docs_file_name)
    print num_words, 'words'

    doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name)
    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)

    def show_close_words(doc_vec, word_indices):
        dist_list = list()
        for word_idx in word_indices:
            dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx))
        dist_list.sort(key=lambda tup: tup[0])
        # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0])
        for dist, idx in dist_list:
            print dist, words[idx]

    show_close_words(doc_vecs[0], word_indices_list[0])
Пример #20
0
def close_words_of_docs():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    words = ioutils.load_words_dict_to_list(word_dict_file_name)

    bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(
        bow_docs_file_name)
    print num_words, 'words'

    doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name)
    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)

    def show_close_words(doc_vec, word_indices):
        dist_list = list()
        for word_idx in word_indices:
            dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx))
        dist_list.sort(key=lambda tup: tup[0])
        # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0])
        for dist, idx in dist_list:
            print dist, words[idx]

    show_close_words(doc_vecs[0], word_indices_list[0])
Пример #21
0
def __cluster_20ng():
    num_clusters = 20
    labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-dedw-vecs.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-0_8-50.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-cluster-0_15-50.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdbow-vecs.bin'
    doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdm-vecs.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/rsm/test-rsm-vecs.bin'

    vec_list = ioutils.load_vec_list_file(doc_vec_file)
    labels = ioutils.load_labels_file(labels_file)
    nmi_score, purity_score, ri_score = cluster_and_eval(
        vec_list, labels, num_clusters)
    print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)
Пример #22
0
def close_20ng_docs():
    # doc_vec_file = 'e:/dc/20ng_bydate/vecs/dedw_vecs_0.bin'
    doc_vec_file = 'e:/dc/20ng_bydate/vecs/dbow_doc_vecs.bin'
    # av_doc_vec_file = 'e:/dc/20ng_bydate/vecs/doc_vecs_av.bin'
    print 'loading', doc_vec_file
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file)
    print len(doc_vecs)
    # dw_vecs = np.zeros((len(doc_vecs), 50), np.float32)
    # for i in xrange(len(doc_vecs)):
    #     dw_vecs[i] = doc_vecs[i][50:]
    # print 'done'

    # print 'loading', av_doc_vec_file
    # av_doc_vecs = ioutils.load_vec_list_file(av_doc_vec_file)
    # print 'done'

    dst_vec = doc_vecs[0]
    close_list = close_vecs(doc_vecs, dst_vec)
    for idx, dist in close_list:
        print idx, dist
Пример #23
0
def close_20ng_docs():
    # doc_vec_file = 'e:/dc/20ng_bydate/vecs/dedw_vecs_0.bin'
    doc_vec_file = 'e:/dc/20ng_bydate/vecs/dbow_doc_vecs.bin'
    # av_doc_vec_file = 'e:/dc/20ng_bydate/vecs/doc_vecs_av.bin'
    print 'loading', doc_vec_file
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file)
    print len(doc_vecs)
    # dw_vecs = np.zeros((len(doc_vecs), 50), np.float32)
    # for i in xrange(len(doc_vecs)):
    #     dw_vecs[i] = doc_vecs[i][50:]
    # print 'done'

    # print 'loading', av_doc_vec_file
    # av_doc_vecs = ioutils.load_vec_list_file(av_doc_vec_file)
    # print 'done'

    dst_vec = doc_vecs[0]
    close_list = close_vecs(doc_vecs, dst_vec)
    for idx, dist in close_list:
        print idx, dist
Пример #24
0
def doc_vecs_file_for_tsne():
    """Export selected document vectors as text, one vector per line.

    The index file holds an int32 count followed by that many int32 document
    indices. For each index, the matching document vector is written as a
    line in which every value is formatted as three spaces followed by '%f'.
    """
    tag = 'all'
    vecs_file_tag = '012'
    all_doc_vecs_file = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-%s.bin' % vecs_file_tag
    doc_indices_file = 'e:/dc/nyt-world-full/processed/test/sne-doc-indices-%s.bin' % tag
    dst_doc_vecs_file = 'e:/dc/nyt-world-full/processed/test/sne-emadr-vecs-%s-%s.txt' % (vecs_file_tag, tag)

    # `with` closes the handle even if a read raises.
    with open(doc_indices_file, 'rb') as fin:
        # np.fromfile returns an array; use the scalar as the next read count.
        num_indices = int(np.fromfile(fin, np.int32, 1)[0])
        doc_indices = np.fromfile(fin, np.int32, num_indices)

    doc_vecs = ioutils.load_vec_list_file(all_doc_vecs_file)
    # Binary mode is kept so '\n' is written verbatim on every platform.
    with open(dst_doc_vecs_file, 'wb') as fout_vecs:
        for idx in doc_indices:
            # One write per line instead of one write per value.
            line = ''.join(['   %f' % v for v in doc_vecs[idx]])
            fout_vecs.write(line + '\n')
Пример #25
0
def doc_vecs_file_for_tsne():
    """Write the document vectors selected by an index file to a text file.

    Reads an int32 count and that many int32 document indices, loads all
    document vectors, and emits one text line per selected document where
    each component is rendered as three spaces plus '%f'.
    """
    tag = 'all'
    vecs_file_tag = '012'
    all_doc_vecs_file = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-%s.bin' % vecs_file_tag
    doc_indices_file = 'e:/dc/nyt-world-full/processed/test/sne-doc-indices-%s.bin' % tag
    dst_doc_vecs_file = 'e:/dc/nyt-world-full/processed/test/sne-emadr-vecs-%s-%s.txt' % (
        vecs_file_tag, tag)

    # The context manager closes the index file on every exit path.
    with open(doc_indices_file, 'rb') as fin:
        # Unwrap the one-element array so the count below is a plain int.
        count_field = np.fromfile(fin, np.int32, 1)
        num_indices = int(count_field[0])
        doc_indices = np.fromfile(fin, np.int32, num_indices)

    doc_vecs = ioutils.load_vec_list_file(all_doc_vecs_file)
    # Opened in binary mode so the '\n' line ending is not translated.
    with open(dst_doc_vecs_file, 'wb') as fout_vecs:
        for idx in doc_indices:
            # Assemble the whole line first to avoid a write call per value.
            pieces = ['   %f' % v for v in doc_vecs[idx]]
            pieces.append('\n')
            fout_vecs.write(''.join(pieces))
Пример #26
0
def split_vectors(all_vec_file_name, all_labels_file_name,
                  dst_train_vec_file_name, dst_train_labels_file_name,
                  dst_test_vec_file_name, dst_test_labels_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vec_file_name)
    all_labels = ioutils.load_labels_file(all_labels_file_name)

    train_vec_list = list()
    train_labels = list()
    test_vec_list = list()
    test_labels = list()
    for vec, label in zip(all_vec_list, all_labels):
        rand_val = random.randint(1, 10)
        if rand_val == 1:
            test_vec_list.append(vec)
            test_labels.append(label)
        else:
            train_vec_list.append(vec)
            train_labels.append(label)

    print len(train_labels), 'training samples'
    print len(test_labels), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    def save_labels(labels_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(labels_list)], np.int32).tofile(fout)
        np.asarray(labels_list, np.int32).tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vec_file_name)
    save_labels(train_labels, dst_train_labels_file_name)
    save_vecs(test_vec_list, dst_test_vec_file_name)
    save_labels(test_labels, dst_test_labels_file_name)
Пример #27
0
def split_vectors(all_vec_file_name, all_labels_file_name, dst_train_vec_file_name,
                  dst_train_labels_file_name, dst_test_vec_file_name, dst_test_labels_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vec_file_name)
    all_labels = ioutils.load_labels_file(all_labels_file_name)

    train_vec_list = list()
    train_labels = list()
    test_vec_list = list()
    test_labels = list()
    for vec, label in zip(all_vec_list, all_labels):
        rand_val = random.randint(1, 10)
        if rand_val == 1:
            test_vec_list.append(vec)
            test_labels.append(label)
        else:
            train_vec_list.append(vec)
            train_labels.append(label)

    print len(train_labels), 'training samples'
    print len(test_labels), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    def save_labels(labels_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(labels_list)], np.int32).tofile(fout)
        np.asarray(labels_list, np.int32).tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vec_file_name)
    save_labels(train_labels, dst_train_labels_file_name)
    save_vecs(test_vec_list, dst_test_vec_file_name)
    save_labels(test_labels, dst_test_labels_file_name)
Пример #28
0
def __lda_clustering():
    num_topics = 20
    min_occurrence = 30
    # datadir = 'e:/data/emadr/20ng_bydate/'
    # labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    # topic_vecs_file = os.path.join(datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))
    datadir = 'e:/data/emadr/nyt-less-docs/world'
    labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    topic_vecs_file = os.path.join(
        datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))

    topic_vecs = ioutils.load_vec_list_file(topic_vecs_file)
    gold_labels = ioutils.load_labels_file(labels_file)
    sys_labels = list()
    for i, topic_vec in enumerate(topic_vecs):
        cluster_idx = 0
        max_dist = 0
        for j, v in enumerate(topic_vec):
            if v > max_dist:
                cluster_idx = j
                max_dist = v
        # print cluster_idx, max_dist
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 5000 == 0:
            print len(sys_labels)

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    # ri_score = rand_index(gold_labels, sys_labels)
    ri_score = 0

    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score,
                                                 ri_score)
    # print 'Accuracy: %f' % cluster_accuracy(labels, model.labels_)

    print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)