def close_words_of_entities(): word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt' word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_joint_oml_100.bin' entity_dict_file_name = 'e:/dc/20ng_bydate/entity_names.txt' entity_vecs_file_name = 'e:/dc/20ng_bydate/vecs/entity_vecs_joint_oml_100.bin' word_vecs = ioutils.load_vec_list_file(word_vec_file_name) words = ioutils.load_words_dict_to_list(word_dict_file_name) entity_vecs = ioutils.load_vec_list_file(entity_vecs_file_name) entities = ioutils.load_entity_dict(entity_dict_file_name) print len(entity_vecs) print len(entities) def show_close_words(entity_idx): print 'entity: ', entities[entity_idx] entity_vec = entity_vecs[entity_idx] dist_list = list() for word_idx in xrange(len(word_vecs)): # dist = np.dot(entity_vec, word_vecs[word_idx]) dist = scipy.spatial.distance.cosine(entity_vec, word_vecs[word_idx]) dist_list.append((dist, word_idx)) # dist_list.sort(key=lambda tup: tup[0]) closest_words = heapq.nsmallest(100, dist_list, key=lambda tup: tup[0]) for dist, idx in closest_words: print dist, words[idx], idx show_close_words(25304)
def merge_vecs(): year = 2010 part = 'train' method = 3 # vecs_file0 = 'e:/dc/el/vecs/tac_' + file_tag + '_entity_vecs.bin' # vecs_file1 = 'e:/dc/el/vecs/tac_' + file_tag + '_dw_vecs.bin' # dst_file = 'e:/dc/el/vecs/tac_' + file_tag + '_vecs.bin' vecs_file0 = 'e:/dc/el/vecs/%d/%s_%d_de_vecs.bin' % (year, part, method) vecs_file1 = 'e:/dc/el/vecs/%d/%s_%d_dw_vecs.bin' % (year, part, method) dst_file = 'e:/dc/el/vecs/%d/%s_%d_vecs.bin' % (year, part, method) vecs0 = ioutils.load_vec_list_file(vecs_file0) vecs1 = ioutils.load_vec_list_file(vecs_file1) if len(vecs0) != len(vecs1): print 'number of vectors not equal!' return fout = open(dst_file, 'wb') np.asarray([len(vecs0), len(vecs0[0]) + len(vecs1[0])], np.int32).tofile(fout) for i in xrange(len(vecs0)): vecs0[i].tofile(fout) vecs1[i].tofile(fout) fout.close()
def close_wiki_pages(): dst_wid = 534366 wid_title_file = 'e:/el/tmpres/wiki/enwiki-20150403-id-title-list.txt' wid_title_dict = __load_id_title_dict(wid_title_file) wiki_page_id_file = 'e:/dc/el/wiki/wiki_page_ids.bin' fin = open(wiki_page_id_file, 'rb') num_pages = np.fromfile(fin, np.int32, 1) page_ids = np.fromfile(fin, np.int32, num_pages) fin.close() pos = 0 for i, wid in enumerate(page_ids): if wid == dst_wid: pos = i break wiki_vec_file = 'e:/dc/el/vecs/wiki_dedw_vecs_1.bin' # fin = open(wiki_vec_file, 'rb') # num_vecs, dim = np.fromfile(fin, np.int32, 2) # wiki_vecs = np.zeros((num_vecs, 50), np.float32) # for i in xrange(num_vecs): # vec = np.fromfile(fin, np.float32, dim) # wiki_vecs[i][:] = vec[50:] # fin.close() wiki_vecs = ioutils.load_vec_list_file(wiki_vec_file) top_list = close_vecs(wiki_vecs, wiki_vecs[pos]) for idx, dist in top_list: print wid_title_dict[page_ids[idx]], dist
def close_wiki_pages_el_doc(): wiki_page_id_file = 'e:/dc/el/wiki/wiki_page_ids.bin' fin = open(wiki_page_id_file, 'rb') num_pages = np.fromfile(fin, np.int32, 1) page_ids = np.fromfile(fin, np.int32, num_pages) fin.close() wiki_vec_file = 'e:/dc/el/vecs/wiki_dedw_vecs.bin' fin = open(wiki_vec_file, 'rb') num_vecs, dim = np.fromfile(fin, np.int32, 2) # wiki_vecs = np.zeros((num_vecs, 50), np.float32) wiki_vecs = np.zeros((num_vecs, 100), np.float32) for i in xrange(num_vecs): vec = np.fromfile(fin, np.float32, dim) # wiki_vecs[i][:] = vec[50:] wiki_vecs[i][:] = vec fin.close() # el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_train_doc_vecs_av.bin' # el_doc_vec_file = 'e:/dc/el/vecs/wiki_vecs_av.bin' el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_eval_vecs.bin' # el_doc_vec_file = 'e:/dc/el/vecs/tac_2014_train_dw_vecs.bin' el_doc_vecs = ioutils.load_vec_list_file(el_doc_vec_file) top_list = close_vecs(wiki_vecs, el_doc_vecs[0]) for idx, dist in top_list: print page_ids[idx], dist
def __split_vecs(all_vecs_file_name, split_labels_file_name, dst_train_vecs_file_name, dst_test_vecs_file_name): all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name) split_labels = ioutils.load_labels_file(split_labels_file_name) train_vec_list = list() test_vec_list = list() for vec, split_label in zip(all_vec_list, split_labels): # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32) # print split_label if split_label == 1: test_vec_list.append(vec) else: train_vec_list.append(vec) print len(train_vec_list), 'training samples' print len(test_vec_list), 'testing samples' def save_vecs(vec_list, dst_file_name): fout = open(dst_file_name, 'wb') np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout) for cur_vec in vec_list: cur_vec.tofile(fout) fout.close() save_vecs(train_vec_list, dst_train_vecs_file_name) save_vecs(test_vec_list, dst_test_vecs_file_name)
def averaging_vecs(doc_word_file, word_vecs_file, dst_file): word_vecs = ioutils.load_vec_list_file(word_vecs_file) dim = len(word_vecs[0]) fin = open(doc_word_file, 'rb') num_left, num_right = np.fromfile(fin, np.int32, 2) print num_left, num_right fout = open(dst_file, 'wb') np.asarray([num_left, dim], np.int32).tofile(fout) for i in xrange(num_left): num_vertices = np.fromfile(fin, np.int32, 1) indices = np.fromfile(fin, np.int32, num_vertices) weights = np.fromfile(fin, np.uint16, num_vertices) vec = np.zeros(dim, np.float32) sum_weights = np.sum(weights) for idx, w in zip(indices, weights): vec += word_vecs[idx] * w vec /= sum_weights vec.tofile(fout) if i % 10000 == 10000 - 1: print i + 1 # break # TODO fout.close() fin.close()
def cluster_nyt(): num_clusters_list = [5, 10, 15, 20] method = 'RSM' result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower() labels_file_name = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/glove-vecs.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw-vecs.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner-200.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-015.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw5-vecs-ner.bin' doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/rsm-vecs-20.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/drbm-vecs-30.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdm-vecs.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdbow-vecs.bin' # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/nvdm-nyt.bin' # doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/test-dedw-vecs.bin' # labels_file_name = 'e:/dc/20ng_bydate/test_labels.bin' perf_list = list() # for num_clusters in [5, 10, 15, 20]: vec_list = ioutils.load_vec_list_file(doc_vec_file_name) labels = ioutils.load_labels_file(labels_file_name) for num_clusters in num_clusters_list: print '%d clusters' % num_clusters # nmi_score, purity_score, ri_score = clustering(doc_vec_file_name, labels_file_name, num_clusters) nmi_score, purity_score, ri_score = cluster_and_eval(vec_list, labels, num_clusters) perf_list.append((num_clusters, nmi_score, purity_score, ri_score)) # break write_clustering_perf_to_csv(method, perf_list, result_file)
def split_vecs(all_vecs_file_name, split_labels_file_name, dst_train_vecs_file_name, dst_test_vecs_file_name, train_label=0, test_label=1): all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name) split_labels = ioutils.load_labels_file(split_labels_file_name) train_vec_list = list() test_vec_list = list() for vec, split_label in zip(all_vec_list, split_labels): # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32) # print split_label if split_label == test_label: test_vec_list.append(vec) elif split_label == train_label: train_vec_list.append(vec) print len(train_vec_list), 'training samples' print len(test_vec_list), 'testing samples' def save_vecs(vec_list, dst_file_name): fout = open(dst_file_name, 'wb') np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout) for cur_vec in vec_list: cur_vec.tofile(fout) fout.close() save_vecs(train_vec_list, dst_train_vecs_file_name) save_vecs(test_vec_list, dst_test_vecs_file_name)
def __cluster_nyt(): # num_clusters_list = [5, 10, 15, 20] num_clusters_list = [10, 15, 20] # num_clusters_list = [5] method = 'RSM' datadir = 'e:/data/emadr/nyt-less-docs/world' result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower( ) labels_file_name = os.path.join(datadir, 'bindata/test-labels.bin') # doc_vec_file_name = os.path.join(datadir, 'vecs/test-dedw-vecs.bin') doc_vec_file_name = os.path.join(datadir, 'bindata/test-pvdbow-vecs.bin') # doc_vec_file_name = os.path.join(datadir, 'rsm/test-rsm-vecs.bin') perf_list = list() # for num_clusters in [5, 10, 15, 20]: vec_list = ioutils.load_vec_list_file(doc_vec_file_name) labels = ioutils.load_labels_file(labels_file_name) for num_clusters in num_clusters_list: print '%d clusters' % num_clusters # nmi_score, purity_score, ri_score = clustering(doc_vec_file_name, labels_file_name, num_clusters) nmi_score, purity_score, ri_score = cluster_and_eval( vec_list, labels, num_clusters) perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
def close_docs(doc_vec_file, dst_idx=0, k=10): # doc_vec_file = 'e:/dc/20ng_bydate/vecs/dedw_vecs_0.bin' print 'loading', doc_vec_file doc_vecs = ioutils.load_vec_list_file(doc_vec_file) print 'done' dst_vec = doc_vecs[dst_idx] return close_vecs(doc_vecs, dst_vec)
def close_words(): words_dict_file = 'e:/dc/el/words_dict_proper.txt' word_vecs_file = 'e:/dc/el/vecs/word_vecs.bin' words = ioutils.load_words_dict_to_list(words_dict_file, False) print len(words) idx = 774318 print words[idx] word_vecs = ioutils.load_vec_list_file(word_vecs_file) print len(word_vecs) close_list = close_vecs(word_vecs, word_vecs[idx]) for idx, dist in close_list: print words[idx], dist
def close_words_of_docs(): word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt' words = ioutils.load_words_dict_to_list(word_dict_file_name) bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin' word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(bow_docs_file_name) print num_words, 'words' doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin' word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin' doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name) word_vecs = ioutils.load_vec_list_file(word_vec_file_name) def show_close_words(doc_vec, word_indices): dist_list = list() for word_idx in word_indices: dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx)) dist_list.sort(key=lambda tup: tup[0]) # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0]) for dist, idx in dist_list: print dist, words[idx] show_close_words(doc_vecs[0], word_indices_list[0])
def close_words_of_docs(): word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt' words = ioutils.load_words_dict_to_list(word_dict_file_name) bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin' word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file( bow_docs_file_name) print num_words, 'words' doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin' word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin' doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name) word_vecs = ioutils.load_vec_list_file(word_vec_file_name) def show_close_words(doc_vec, word_indices): dist_list = list() for word_idx in word_indices: dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx)) dist_list.sort(key=lambda tup: tup[0]) # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0]) for dist, idx in dist_list: print dist, words[idx] show_close_words(doc_vecs[0], word_indices_list[0])
def __cluster_20ng(): num_clusters = 20 labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin' # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-dedw-vecs.bin' # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-0_8-50.bin' # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-cluster-0_15-50.bin' # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdbow-vecs.bin' doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdm-vecs.bin' # doc_vec_file = 'e:/data/emadr/20ng_bydate/rsm/test-rsm-vecs.bin' vec_list = ioutils.load_vec_list_file(doc_vec_file) labels = ioutils.load_labels_file(labels_file) nmi_score, purity_score, ri_score = cluster_and_eval( vec_list, labels, num_clusters) print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)
def close_20ng_docs(): # doc_vec_file = 'e:/dc/20ng_bydate/vecs/dedw_vecs_0.bin' doc_vec_file = 'e:/dc/20ng_bydate/vecs/dbow_doc_vecs.bin' # av_doc_vec_file = 'e:/dc/20ng_bydate/vecs/doc_vecs_av.bin' print 'loading', doc_vec_file doc_vecs = ioutils.load_vec_list_file(doc_vec_file) print len(doc_vecs) # dw_vecs = np.zeros((len(doc_vecs), 50), np.float32) # for i in xrange(len(doc_vecs)): # dw_vecs[i] = doc_vecs[i][50:] # print 'done' # print 'loading', av_doc_vec_file # av_doc_vecs = ioutils.load_vec_list_file(av_doc_vec_file) # print 'done' dst_vec = doc_vecs[0] close_list = close_vecs(doc_vecs, dst_vec) for idx, dist in close_list: print idx, dist
def doc_vecs_file_for_tsne():
    """Dump the EMADR vectors of the selected documents to a text file for
    t-SNE visualization.

    NOTE(review): an identical function with the same name appears again
    later in this file and shadows this one at import time.
    """
    tag = 'all'
    vecs_file_tag = '012'
    all_doc_vecs_file = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-%s.bin' % vecs_file_tag
    all_doc_labels_file = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    doc_indices_file = 'e:/dc/nyt-world-full/processed/test/sne-doc-indices-%s.bin' % tag
    dst_doc_vecs_file = 'e:/dc/nyt-world-full/processed/test/sne-emadr-vecs-%s-%s.txt' % (vecs_file_tag, tag)
    dst_labels_file = 'e:/dc/nyt-world-full/processed/test/sne-doc-labels-%s.txt' % tag

    # selected document indices: one int32 count, then that many int32 indices
    fin = open(doc_indices_file, 'rb')
    num_indices = np.fromfile(fin, np.int32, 1)
    doc_indices = np.fromfile(fin, np.int32, num_indices)
    fin.close()

    doc_vecs = ioutils.load_vec_list_file(all_doc_vecs_file)
    fout_vecs = open(dst_doc_vecs_file, 'wb')
    for doc_idx in doc_indices:
        # one space-prefixed '%f' per component, one document per line
        fout_vecs.write(''.join(' %f' % v for v in doc_vecs[doc_idx]))
        fout_vecs.write('\n')
    fout_vecs.close()
def doc_vecs_file_for_tsne():
    """Write the EMADR vectors of the chosen document subset to a plain-text
    file (one document per line) for t-SNE plotting.

    NOTE(review): this is an exact redefinition of a function declared
    earlier in this file; at import time this later definition wins.
    """
    tag = 'all'
    vecs_file_tag = '012'
    all_doc_vecs_file = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-%s.bin' % vecs_file_tag
    all_doc_labels_file = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    doc_indices_file = 'e:/dc/nyt-world-full/processed/test/sne-doc-indices-%s.bin' % tag
    dst_doc_vecs_file = 'e:/dc/nyt-world-full/processed/test/sne-emadr-vecs-%s-%s.txt' % (
        vecs_file_tag, tag)
    dst_labels_file = 'e:/dc/nyt-world-full/processed/test/sne-doc-labels-%s.txt' % tag

    with open(doc_indices_file, 'rb') as fin:
        num_indices = np.fromfile(fin, np.int32, 1)
        doc_indices = np.fromfile(fin, np.int32, num_indices)

    doc_vecs = ioutils.load_vec_list_file(all_doc_vecs_file)
    with open(dst_doc_vecs_file, 'wb') as fout_vecs:
        for cur_idx in doc_indices:
            for component in doc_vecs[cur_idx]:
                fout_vecs.write(' %f' % component)
            fout_vecs.write('\n')
def split_vectors(all_vec_file_name, all_labels_file_name, dst_train_vec_file_name, dst_train_labels_file_name, dst_test_vec_file_name, dst_test_labels_file_name): all_vec_list = ioutils.load_vec_list_file(all_vec_file_name) all_labels = ioutils.load_labels_file(all_labels_file_name) train_vec_list = list() train_labels = list() test_vec_list = list() test_labels = list() for vec, label in zip(all_vec_list, all_labels): rand_val = random.randint(1, 10) if rand_val == 1: test_vec_list.append(vec) test_labels.append(label) else: train_vec_list.append(vec) train_labels.append(label) print len(train_labels), 'training samples' print len(test_labels), 'testing samples' def save_vecs(vec_list, dst_file_name): fout = open(dst_file_name, 'wb') np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout) for cur_vec in vec_list: cur_vec.tofile(fout) fout.close() def save_labels(labels_list, dst_file_name): fout = open(dst_file_name, 'wb') np.asarray([len(labels_list)], np.int32).tofile(fout) np.asarray(labels_list, np.int32).tofile(fout) fout.close() save_vecs(train_vec_list, dst_train_vec_file_name) save_labels(train_labels, dst_train_labels_file_name) save_vecs(test_vec_list, dst_test_vec_file_name) save_labels(test_labels, dst_test_labels_file_name)
def __lda_clustering(): num_topics = 20 min_occurrence = 30 # datadir = 'e:/data/emadr/20ng_bydate/' # labels_file = os.path.join(datadir, 'bindata/test-labels.bin') # topic_vecs_file = os.path.join(datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence)) datadir = 'e:/data/emadr/nyt-less-docs/world' labels_file = os.path.join(datadir, 'bindata/test-labels.bin') topic_vecs_file = os.path.join( datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence)) topic_vecs = ioutils.load_vec_list_file(topic_vecs_file) gold_labels = ioutils.load_labels_file(labels_file) sys_labels = list() for i, topic_vec in enumerate(topic_vecs): cluster_idx = 0 max_dist = 0 for j, v in enumerate(topic_vec): if v > max_dist: cluster_idx = j max_dist = v # print cluster_idx, max_dist sys_labels.append(cluster_idx) if len(sys_labels) % 5000 == 0: print len(sys_labels) nmi_score = normalized_mutual_info_score(gold_labels, sys_labels) purity_score = purity(gold_labels, sys_labels) # ri_score = rand_index(gold_labels, sys_labels) ri_score = 0 print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score) # print 'Accuracy: %f' % cluster_accuracy(labels, model.labels_) print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)