# Imports inferred from usage in this module (Python 2). Project helpers such
# as load_features, get_scores, doc_classes, cluster_and_eval, purity,
# bow_kmeans, write_clustering_perf_to_csv and the private __-helpers are
# assumed to be defined elsewhere in this repo.
import os
import random
from collections import Counter
from itertools import izip

import numpy as np
import gensim
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, normalized_mutual_info_score)

import ioutils
from ioutils import load_labels_file


def __doc_classification(classifier, train_vec_file, train_label_file,
                         test_vec_file, test_label_file, vec_beg=0, vec_end=-1):
    train_x = load_features(train_vec_file)
    train_y = load_labels_file(train_label_file)
    test_x = load_features(test_vec_file)
    test_y = load_labels_file(test_label_file)
    # print train_y[1000:1100]
    # print test_y[1000:1100]
    print train_x[0][50:60]

    # optionally keep only the [vec_beg, vec_end) slice of each feature vector
    if vec_beg != 0 or vec_end != -1:
        __truncate_vecs(train_x, vec_beg, vec_end)
        __truncate_vecs(test_x, vec_beg, vec_end)

    print 'training model ...'
    classifier.fit(train_x, train_y)
    print 'done.'
    y_pred = classifier.predict(test_x)
    get_scores(test_y, y_pred)

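# Hedged usage sketch for __doc_classification: any sklearn-style estimator
# with fit/predict works as the classifier. The file paths below are
# hypothetical placeholders, not files verified to exist in this repo.
def __example_doc_classification():
    classifier = svm.LinearSVC()
    __doc_classification(classifier,
                         'path/to/train-vecs.bin', 'path/to/train-labels.bin',
                         'path/to/test-vecs.bin', 'path/to/test-labels.bin',
                         vec_beg=0, vec_end=100)
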
def __emadr_vs_pvdbow():
    datadir = 'e:/data/emadr/20ng_bydate'
    doc_paths_file = os.path.join(datadir, 'all_doc_path_list.txt')
    dataset_split_labels_file = os.path.join(datadir, 'bindata/dataset-split-labels.bin')
    y_true_file = os.path.join(datadir, 'bindata/test-labels.bin')
    y_pred_pvdbow_file = os.path.join(datadir, 'bindata/ypred-pvdbow.bin')
    y_pred_emadr_file = os.path.join(datadir, 'bindata/ypred-emadr.bin')
    dst_file = os.path.join(datadir, 'example-candidates.txt')

    test_doc_paths = __get_test_doc_paths(doc_paths_file, dataset_split_labels_file)
    y_true = ioutils.load_labels_file(y_true_file)
    y_pvdbow = ioutils.load_labels_file(y_pred_pvdbow_file)
    y_emadr = ioutils.load_labels_file(y_pred_emadr_file)

    fout = open(dst_file, 'wb')
    cnt = 0
    for i, tup in enumerate(izip(y_true, y_pvdbow, y_emadr)):
        yt, y0, y1 = tup
        # keep documents that EMADR gets right but PV-DBOW gets wrong
        if yt != y0 and yt == y1:
            print test_doc_paths[i], doc_classes[yt], doc_classes[y0]
            doc_path = os.path.join(datadir, test_doc_paths[i][20:])
            fout.write('%s\t%s\t%s\n' % (doc_path, doc_classes[yt], doc_classes[y0]))
            doc_text = __get_doc_text(doc_path)
            fout.write('%s\n' % doc_text)
            cnt += 1
    print cnt, len(y_true), float(cnt) / len(y_true)
    fout.close()

def _bow_svm(train_bow_file_name, train_label_file_name,
             test_bow_file_name, test_label_file_name):
    print 'loading file ...'
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, False)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, False)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print idfs

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list,
                                      num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list,
                                     num_words, idfs)
    # print train_cm[0]

    train_y = ioutils.load_labels_file(train_label_file_name)
    test_y = ioutils.load_labels_file(test_label_file_name)

    print 'training svm ...'
    # clf = svm.SVC(decision_function_shape='ovo')
    clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print 'done.'

    y_pred = clf.predict(test_cm)
    ftmp = open('e:/dc/20ng_data/tmp_labels.txt', 'wb')
    for i in xrange(len(y_pred)):
        ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    ftmp.close()

    print 'accuracy', accuracy_score(test_y, y_pred)
    print 'precision', precision_score(test_y, y_pred, average='macro')
    print 'recall', recall_score(test_y, y_pred, average='macro')
    print 'f1', f1_score(test_y, y_pred, average='macro')

def __bow_svm(train_bow_file_name, train_label_file_name,
              test_bow_file_name, test_label_file_name):
    print 'loading file ...'
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print idfs

    train_y = ioutils.load_labels_file(train_label_file_name)
    # print train_y[:100]
    print Counter(train_y)
    test_y = ioutils.load_labels_file(test_label_file_name)
    # print test_y[:100]
    print Counter(test_y)

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list,
                                      num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list,
                                     num_words, idfs)
    # print train_cm[0]

    print 'training svm ...'
    # clf = svm.SVC(decision_function_shape='ovo', kernel='poly', degree=2)
    clf = svm.SVC(decision_function_shape='ovo', kernel='linear')
    # clf = svm.LinearSVC()
    clf.fit(train_cm, train_y)
    print 'done.'

    y_pred = clf.predict(test_cm)
    print y_pred[:100]
    print Counter(y_pred)

    # ftmp = open('e:/data/emadr/20ng_data/tmp_labels.txt', 'wb')
    # for i in xrange(len(y_pred)):
    #     ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    # ftmp.close()

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', recall
    print 'macro f1', f1
    print '%f\t%f\t%f\t%f' % (acc, prec, recall, f1)
    return acc, prec, recall, f1

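# Hedged usage sketch for __bow_svm: the arguments are the train/test
# bag-of-words files and their label files. The .bin names below follow the
# naming seen elsewhere in this module but are assumptions, not verified files.
def __example_bow_svm():
    datadir = 'e:/data/emadr/20ng_bydate'
    acc, prec, recall, f1 = __bow_svm(
        os.path.join(datadir, 'bindata/dw-train.bin'),
        os.path.join(datadir, 'bindata/train-labels.bin'),
        os.path.join(datadir, 'bindata/dw-test.bin'),
        os.path.join(datadir, 'bindata/test-labels.bin'))
    print 'macro f1: %f' % f1
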
def __bow_lr(train_bow_file_name, train_label_file_name,
             test_bow_file_name, test_label_file_name):
    print 'loading file ...'
    uint16_cnts = True
    train_word_indices_list, train_word_cnts_list, num_words = ioutils.load_bow_file(
        train_bow_file_name, uint16_cnts)
    test_word_indices_list, test_word_cnts_list, num_words = ioutils.load_bow_file(
        test_bow_file_name, uint16_cnts)
    print num_words, 'words'
    idfs = _get_idf_values(train_word_indices_list, train_word_cnts_list, num_words)
    print idfs

    train_y = ioutils.load_labels_file(train_label_file_name)
    # print train_y[:100]
    print Counter(train_y)
    test_y = ioutils.load_labels_file(test_label_file_name)
    # print test_y[:100]
    print Counter(test_y)

    print 'to sparse ...'
    train_cm = _word_cnts_to_bow_vecs(train_word_indices_list, train_word_cnts_list,
                                      num_words, idfs)
    test_cm = _word_cnts_to_bow_vecs(test_word_indices_list, test_word_cnts_list,
                                     num_words, idfs)
    # print train_cm[0]

    print 'training lr ...'
    lr = LogisticRegression(C=1000, multi_class='multinomial', solver='newton-cg')
    lr.fit(train_cm, train_y)
    print 'done.'

    y_pred = lr.predict(test_cm)
    # print y_pred[:100]
    print Counter(y_pred)

    # ftmp = open('e:/data/emadr/20ng_data/tmp_labels.txt', 'wb')
    # for i in xrange(len(y_pred)):
    #     ftmp.write(str(y_pred[i]) + '\t' + str(test_y[i]) + '\n')
    # ftmp.close()

    acc = accuracy_score(test_y, y_pred)
    prec = precision_score(test_y, y_pred, average='macro')
    recall = recall_score(test_y, y_pred, average='macro')
    f1 = f1_score(test_y, y_pred, average='macro')
    print 'accuracy', acc
    print 'precision', prec
    print 'recall', recall
    print 'macro f1', f1
    print '%f\t%f\t%f\t%f' % (acc, prec, recall, f1)

def __test():
    doc_labels_file = 'e:/data/emadr/nyt-world-full/processed/doc-labels.bin'
    data_split_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/data-split-labels.bin'
    train_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/train-labels.bin'
    test_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/test-labels.bin'

    doc_labels = load_labels_file(doc_labels_file)
    print len(doc_labels), doc_labels[:20]
    split_labels = load_labels_file(data_split_labels_file)
    print len(split_labels), split_labels[:20]
    train_labels = load_labels_file(train_labels_file)
    print len(train_labels), train_labels[:20]
    test_labels = load_labels_file(test_labels_file)
    print len(test_labels), test_labels[:20]

def cluster_nyt():
    num_clusters_list = [5, 10, 15, 20]
    method = 'RSM'
    result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower()
    labels_file_name = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/de-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/glove-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw2-vecs-ner-200.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw4-vecs-015.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/dedw5-vecs-ner.bin'
    doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/rsm-vecs-20.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/drbm-vecs-30.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdm-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/pvdbow-vecs.bin'
    # doc_vec_file_name = 'e:/dc/nyt-world-full/processed/vecs/nvdm-nyt.bin'
    # doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/test-dedw-vecs.bin'
    # labels_file_name = 'e:/dc/20ng_bydate/test_labels.bin'

    perf_list = list()
    vec_list = ioutils.load_vec_list_file(doc_vec_file_name)
    labels = ioutils.load_labels_file(labels_file_name)
    for num_clusters in num_clusters_list:
        print '%d clusters' % num_clusters
        nmi_score, purity_score, ri_score = cluster_and_eval(vec_list, labels,
                                                             num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
        # break
    write_clustering_perf_to_csv(method, perf_list, result_file)

def lda():
    num_clusters_list = [5, 10, 15, 20]
    result_file = 'd:/documents/lab/paper-data/plot/lda-results-ri.csv'
    text_doc_file = 'e:/dc/nyt-world-full/processed/docs_tokenized_lc.txt'
    dict_file = 'e:/dc/nyt-world-full/processed/lda/all-docs.dict'
    mm_file = 'e:/dc/nyt-world-full/processed/lda/all-docs.mm'
    gold_label_file = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    # __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_label_file)
    word_dict = gensim.corpora.Dictionary.load(dict_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        lda_model = gensim.models.ldamodel.LdaModel(mm_corpus, id2word=word_dict,
                                                    num_topics=num_clusters)
        lda_model_file = 'e:/dc/nyt-world-full/processed/lda/lda-model-%d' % num_clusters
        lda_model.save(lda_model_file)
        nmi_score, purity_score, ri_score = __eval_lda_clustering(lda_model, mm_corpus,
                                                                  gold_labels)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
    write_clustering_perf_to_csv('LDA', perf_list, result_file)

def __lda_clustering_nyt():
    num_clusters_list = [5, 10, 15, 20]
    min_occurrence = 30
    datadir = 'e:/data/emadr/nyt-less-docs/world/'
    result_file = 'd:/documents/lab/paper-data/plot/lda-results-ri.csv'
    dict_file = os.path.join(datadir, 'lda/all-docs-%d.dict' % min_occurrence)
    mm_file = os.path.join(datadir, 'lda/all-docs-%d.mm' % min_occurrence)
    gold_label_file = 'e:/data/emadr/nyt-less-docs/world/bindata/test-labels.bin'
    # __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_label_file)
    word_dict = gensim.corpora.Dictionary.load(dict_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        lda_model = gensim.models.ldamodel.LdaModel(mm_corpus, id2word=word_dict,
                                                    num_topics=num_clusters)
        lda_model_file = 'e:/dc/nyt-world-full/processed/lda/lda-model-%d' % num_clusters
        lda_model.save(lda_model_file)
        nmi_score, purity_score, ri_score = __eval_lda_clustering(
            lda_model, mm_corpus, gold_labels)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
    write_clustering_perf_to_csv('LDA', perf_list, result_file)

def __split_vecs(all_vecs_file_name, split_labels_file_name,
                 dst_train_vecs_file_name, dst_test_vecs_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name)
    split_labels = ioutils.load_labels_file(split_labels_file_name)

    train_vec_list = list()
    test_vec_list = list()
    for vec, split_label in zip(all_vec_list, split_labels):
        # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32)
        # print split_label
        if split_label == 1:
            test_vec_list.append(vec)
        else:
            train_vec_list.append(vec)
    print len(train_vec_list), 'training samples'
    print len(test_vec_list), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        # header: number of vectors followed by vector dimension
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vecs_file_name)
    save_vecs(test_vec_list, dst_test_vecs_file_name)

def __gen_class_labels():
    doc_paths_file = 'e:/data/emadr/20ng_bydate/docpaths.txt'
    dataset_labels_file = 'e:/data/emadr/20ng_bydate/bindata/dataset-split-labels.bin'
    all_docs_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/labels.bin'
    training_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/train-labels.bin'
    validation_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/val-labels.bin'
    testing_class_labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'

    fin = open(doc_paths_file, 'r')
    docpaths = list()
    for line in fin:
        docpaths.append(line.strip())
    fin.close()

    all_labels, train_labels, val_labels, test_labels = list(), list(), list(), list()
    dataset_split_labels = ioutils.load_labels_file(dataset_labels_file)
    for dataset_split_label, docpath in izip(dataset_split_labels, docpaths):
        # a document's class index comes from the class name found in its path
        class_label_idx = 0
        for lidx, cl in enumerate(doc_classes):
            if cl in docpath:
                class_label_idx = lidx
        print dataset_split_label, docpath, class_label_idx
        all_labels.append(class_label_idx)
        if dataset_split_label == 0:
            train_labels.append(class_label_idx)
        elif dataset_split_label == 1:
            val_labels.append(class_label_idx)
        else:
            test_labels.append(class_label_idx)

    ioutils.save_labels(all_labels, all_docs_class_labels_file)
    ioutils.save_labels(train_labels, training_class_labels_file)
    ioutils.save_labels(val_labels, validation_class_labels_file)
    ioutils.save_labels(test_labels, testing_class_labels_file)

def split_vecs(all_vecs_file_name, split_labels_file_name,
               dst_train_vecs_file_name, dst_test_vecs_file_name,
               train_label=0, test_label=1):
    all_vec_list = ioutils.load_vec_list_file(all_vecs_file_name)
    split_labels = ioutils.load_labels_file(split_labels_file_name)

    train_vec_list = list()
    test_vec_list = list()
    for vec, split_label in zip(all_vec_list, split_labels):
        # vec = np.random.uniform(0, 1, len(vec)).astype(np.float32)
        # print split_label
        if split_label == test_label:
            test_vec_list.append(vec)
        elif split_label == train_label:
            train_vec_list.append(vec)
    print len(train_vec_list), 'training samples'
    print len(test_vec_list), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vecs_file_name)
    save_vecs(test_vec_list, dst_test_vecs_file_name)

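# Hedged usage sketch for split_vecs: splits one vector file into train and
# test files using the dataset split labels (the 0/1/2 train/val/test
# convention used elsewhere in this module). Paths are illustrative
# placeholders only.
def __example_split_vecs():
    datadir = 'e:/data/emadr/20ng_bydate'
    split_vecs(os.path.join(datadir, 'bindata/dedw-vecs.bin'),
               os.path.join(datadir, 'bindata/dataset-split-labels.bin'),
               os.path.join(datadir, 'bindata/train-dedw-vecs.bin'),
               os.path.join(datadir, 'bindata/test-dedw-vecs.bin'),
               train_label=0, test_label=2)
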
def doc_classification_svm(train_vec_file, train_label_file, test_vec_file,
                           vec_beg=0, vec_end=-1):
    train_x = load_features(train_vec_file)
    train_y = load_labels_file(train_label_file)
    test_x = load_features(test_vec_file)
    # print train_y[1000:1100]
    print train_x[0][50:60]

    def trunc_vecs(vec_list):
        # keep only the [vec_beg, vec_end) slice of each feature vector
        for idx, vec in enumerate(vec_list):
            if vec_end != -1:
                vec_list[idx] = vec[vec_beg:vec_end]
            else:
                vec_list[idx] = vec[vec_beg:]

    if vec_beg != 0 or vec_end != -1:
        trunc_vecs(train_x)
        trunc_vecs(test_x)

    print 'training svm ...'
    clf = svm.SVC(decision_function_shape='ovo')
    # clf = svm.SVC(decision_function_shape='ovo', kernel='linear')
    # clf = svm.LinearSVC(dual=False)
    clf.fit(train_x, train_y)
    print 'done.'
    y_pred = clf.predict(test_x)
    return y_pred

def __cluster_nyt():
    # num_clusters_list = [5, 10, 15, 20]
    num_clusters_list = [10, 15, 20]
    # num_clusters_list = [5]
    method = 'RSM'
    datadir = 'e:/data/emadr/nyt-less-docs/world'
    result_file = 'd:/documents/lab/paper-data/plot/%s-results-ri.csv' % method.lower()
    labels_file_name = os.path.join(datadir, 'bindata/test-labels.bin')
    # doc_vec_file_name = os.path.join(datadir, 'vecs/test-dedw-vecs.bin')
    doc_vec_file_name = os.path.join(datadir, 'bindata/test-pvdbow-vecs.bin')
    # doc_vec_file_name = os.path.join(datadir, 'rsm/test-rsm-vecs.bin')

    perf_list = list()
    vec_list = ioutils.load_vec_list_file(doc_vec_file_name)
    labels = ioutils.load_labels_file(labels_file_name)
    for num_clusters in num_clusters_list:
        print '%d clusters' % num_clusters
        nmi_score, purity_score, ri_score = cluster_and_eval(
            vec_list, labels, num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
    # result_file was defined but never used in the original; writing the
    # results out as in cluster_nyt above is presumably what was intended
    write_clustering_perf_to_csv(method, perf_list, result_file)

def __gen_data_split_labels_tvt():
    # doc_labels_file = 'e:/data/emadr/nyt-world-full/processed/doc-labels.bin'
    # data_split_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/data-split-labels.bin'
    # train_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/train-labels.bin'
    # val_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/val-labels.bin'
    # test_labels_file = 'e:/data/emadr/nyt-world-full/processed/bin/test-labels.bin'
    main_class = 'business'
    doc_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/labels.bin' % main_class
    data_split_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/dataset-split-labels.bin' % main_class
    train_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/train-labels.bin' % main_class
    val_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/val-labels.bin' % main_class
    test_labels_file = 'e:/data/emadr/nyt-less-docs/%s/bindata/test-labels.bin' % main_class

    all_labels = load_labels_file(doc_labels_file)
    num_labels = len(all_labels)
    # assign each document uniformly at random to train (0), val (1) or test (2)
    split_labels = np.random.randint(0, 3, num_labels)

    fout_data_split = open(data_split_labels_file, 'wb')
    np.asarray([num_labels], np.int32).tofile(fout_data_split)
    split_labels.tofile(fout_data_split)
    fout_data_split.close()

    def write_labels(cur_labels, filename):
        fout = open(filename, 'wb')
        np.asarray([len(cur_labels)], dtype=np.int32).tofile(fout)
        np.asarray(cur_labels, dtype=np.int32).tofile(fout)
        fout.close()

    train_labels = [cl for cl, sl in izip(all_labels, split_labels) if sl == 0]
    val_labels = [cl for cl, sl in izip(all_labels, split_labels) if sl == 1]
    test_labels = [cl for cl, sl in izip(all_labels, split_labels) if sl == 2]
    write_labels(train_labels, train_labels_file)
    write_labels(val_labels, val_labels_file)
    write_labels(test_labels, test_labels_file)

def __gen_lda_features(data_split_labels_file, mm_file, lda_model_file,
                       dst_train_vecs_file, dst_val_vecs_file, dst_test_vecs_file):
    data_split_labels = ioutils.load_labels_file(data_split_labels_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)

    train_vecs, val_vecs, test_vecs = list(), list(), list()
    for i, (l, doc) in enumerate(izip(data_split_labels, mm_corpus)):
        # turn the sparse topic distribution into a dense feature vector
        topic_dist = lda_model[doc]
        vec = np.zeros(lda_model.num_topics, np.float32)
        for tup in topic_dist:
            vec[tup[0]] = tup[1]

        if l == 0:
            train_vecs.append(vec)
        elif l == 1:
            val_vecs.append(vec)
        else:
            test_vecs.append(vec)
        # print topic_dist
        # print vec
        if i % 1000 == 0:
            print i
        # break

    # print train_vecs[:5]
    __save_vecs(train_vecs, dst_train_vecs_file)
    __save_vecs(val_vecs, dst_val_vecs_file)
    __save_vecs(test_vecs, dst_test_vecs_file)

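# Hedged usage sketch for __gen_lda_features: converts a trained LDA model's
# per-document topic distributions into dense vectors for each data split.
# All file names here are assumptions following this module's conventions.
def __example_gen_lda_features():
    datadir = 'e:/data/emadr/20ng_bydate'
    __gen_lda_features(
        os.path.join(datadir, 'bindata/dataset-split-labels.bin'),
        os.path.join(datadir, 'lda/all-docs.mm'),
        os.path.join(datadir, 'lda/lda-model'),
        os.path.join(datadir, 'lda/train-vecs.bin'),
        os.path.join(datadir, 'lda/val-vecs.bin'),
        os.path.join(datadir, 'lda/test-vecs.bin'))
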
def __get_test_doc_paths(doc_paths_file, dataset_split_labels_file):
    test_doc_paths = list()
    doc_paths = __load_doc_paths(doc_paths_file)
    split_labels = ioutils.load_labels_file(dataset_split_labels_file)
    assert len(doc_paths) == len(split_labels)
    for doc_path, sl in izip(doc_paths, split_labels):
        if sl == 2:
            test_doc_paths.append(doc_path)
    return test_doc_paths

def bow_clustering():
    num_clusters_list = [5, 10, 15, 20]
    dw_file = 'e:/dc/nyt-world-full/processed/bin/dw-30.bin'
    gold_labels_file = 'e:/dc/nyt-world-full/processed/test/doc-labels.bin'
    result_file = 'd:/documents/lab/paper-data/plot/bow-results-ri.csv'

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_labels_file)
    bow_vecs = __get_bow_vecs(dw_file)
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        # the original loop body was truncated here; the following mirrors
        # __bow_clustering below, and the 'BOW' method label is an assumption
        nmi_score, purity_score, ri_score = bow_kmeans(bow_vecs, gold_labels,
                                                       num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
    write_clustering_perf_to_csv('BOW', perf_list, result_file)

def __eval_lda_clustering_20ng():
    text_doc_file = 'e:/dc/20ng_bydate/twe/docs-nl.txt'
    dict_file = 'e:/dc/20ng_bydate/lda/all-docs.dict'
    mm_file = 'e:/dc/20ng_bydate/lda/all-docs.mm'
    lda_model_file = 'e:/dc/20ng_bydate/lda/lda-model'
    dataset_label_file = 'e:/dc/20ng_bydate/doc_split_labels.bin'
    test_label_file = 'e:/dc/20ng_bydate/test_labels.bin'

    # __text_file_to_mm_corpus(text_doc_file, dict_file, mm_file)
    __train_lda_model(dict_file, mm_file, lda_model_file)

    dataset_labels = ioutils.load_labels_file(dataset_label_file)
    lda_model = gensim.models.ldamodel.LdaModel.load(lda_model_file)
    mm_corpus = gensim.corpora.MmCorpus(mm_file)
    sys_labels = list()
    for i, doc in enumerate(mm_corpus):
        if dataset_labels[i] == 0:
            continue
        # assign each test document to its highest-probability topic
        topic_dist = lda_model[doc]
        # print topic_dist
        cluster_idx = 0
        max_dist = 0
        for tup in topic_dist:
            if tup[1] > max_dist:
                cluster_idx = tup[0]
                max_dist = tup[1]
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 1000 == 0:
            print len(sys_labels)
        # if i > 10:
        #     break
    print len(sys_labels)

    gold_labels = ioutils.load_labels_file(test_label_file)
    print len(gold_labels)
    print normalized_mutual_info_score(gold_labels, sys_labels)
    print cluster_accuracy(gold_labels, sys_labels)

def __cluster_20ng():
    num_clusters = 20
    labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-dedw-vecs.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-0_8-50.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/vecs/dew-vecs-cluster-0_15-50.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdbow-vecs.bin'
    doc_vec_file = 'e:/data/emadr/20ng_bydate/bindata/test-pvdm-vecs.bin'
    # doc_vec_file = 'e:/data/emadr/20ng_bydate/rsm/test-rsm-vecs.bin'

    vec_list = ioutils.load_vec_list_file(doc_vec_file)
    labels = ioutils.load_labels_file(labels_file)
    nmi_score, purity_score, ri_score = cluster_and_eval(vec_list, labels, num_clusters)
    print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)

def split_docs_text_file_by_dataset_labels(doc_text_file, dataset_split_file,
                                           dst_train_doc_text_file,
                                           dst_test_doc_text_file):
    data_split_labels = load_labels_file(dataset_split_file)
    print data_split_labels[:10]
    print len(data_split_labels)

    fin = open(doc_text_file, 'r')
    ftrain = open(dst_train_doc_text_file, 'wb')
    ftest = open(dst_test_doc_text_file, 'wb')
    for l, line in izip(data_split_labels, fin):
        if l == 0:
            ftrain.write(line)
        else:
            ftest.write(line)
    fin.close()
    ftrain.close()
    ftest.close()

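# Hedged usage sketch for split_docs_text_file_by_dataset_labels: splits a
# one-document-per-line text file into train/test text files using the binary
# dataset split labels. The paths are illustrative placeholders.
def __example_split_docs_text_file():
    datadir = 'e:/data/emadr/20ng_bydate'
    split_docs_text_file_by_dataset_labels(
        os.path.join(datadir, 'docs-tokenized.txt'),
        os.path.join(datadir, 'bindata/dataset-split-labels.bin'),
        os.path.join(datadir, 'docs-train.txt'),
        os.path.join(datadir, 'docs-test.txt'))
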
def __bow_clustering():
    # num_clusters_list = [5, 10, 15, 20]
    num_clusters_list = [5]
    dw_file = 'e:/data/emadr/nyt-less-docs/world/bindata/dw-test-30.bin'
    gold_labels_file = 'e:/data/emadr/nyt-less-docs/world/bindata/test-labels.bin'
    result_file = 'd:/documents/lab/paper-data/plot/bow-results-ri-bak.csv'
    # dw_file = 'e:/data/emadr/20ng_bydate/bindata/dw-test-30.bin'
    # gold_labels_file = 'e:/data/emadr/20ng_bydate/bindata/test-labels.bin'
    # result_file = 'd:/documents/lab/paper-data/plot/bow-results-20ng.csv'

    perf_list = list()
    gold_labels = ioutils.load_labels_file(gold_labels_file)
    print len(gold_labels), gold_labels[:10]
    bow_vecs = __get_bow_vecs(dw_file)
    print bow_vecs.shape
    for num_clusters in num_clusters_list:
        print num_clusters, 'clusters'
        nmi_score, purity_score, ri_score = bow_kmeans(bow_vecs, gold_labels,
                                                       num_clusters)
        perf_list.append((num_clusters, nmi_score, purity_score, ri_score))
    # result_file was unused in the original; this final write (and the 'BOW'
    # method label) follows the pattern of the other clustering drivers
    write_clustering_perf_to_csv('BOW', perf_list, result_file)

def split_vectors(all_vec_file_name, all_labels_file_name,
                  dst_train_vec_file_name, dst_train_labels_file_name,
                  dst_test_vec_file_name, dst_test_labels_file_name):
    all_vec_list = ioutils.load_vec_list_file(all_vec_file_name)
    all_labels = ioutils.load_labels_file(all_labels_file_name)

    train_vec_list = list()
    train_labels = list()
    test_vec_list = list()
    test_labels = list()
    for vec, label in zip(all_vec_list, all_labels):
        # roughly one tenth of the samples go to the test set
        rand_val = random.randint(1, 10)
        if rand_val == 1:
            test_vec_list.append(vec)
            test_labels.append(label)
        else:
            train_vec_list.append(vec)
            train_labels.append(label)
    print len(train_labels), 'training samples'
    print len(test_labels), 'testing samples'

    def save_vecs(vec_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(vec_list), len(vec_list[0])], np.int32).tofile(fout)
        for cur_vec in vec_list:
            cur_vec.tofile(fout)
        fout.close()

    def save_labels(labels_list, dst_file_name):
        fout = open(dst_file_name, 'wb')
        np.asarray([len(labels_list)], np.int32).tofile(fout)
        np.asarray(labels_list, np.int32).tofile(fout)
        fout.close()

    save_vecs(train_vec_list, dst_train_vec_file_name)
    save_labels(train_labels, dst_train_labels_file_name)
    save_vecs(test_vec_list, dst_test_vec_file_name)
    save_labels(test_labels, dst_test_labels_file_name)

def __lda_clustering():
    num_topics = 20
    min_occurrence = 30
    # datadir = 'e:/data/emadr/20ng_bydate/'
    # labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    # topic_vecs_file = os.path.join(datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))
    datadir = 'e:/data/emadr/nyt-less-docs/world'
    labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    topic_vecs_file = os.path.join(
        datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))

    topic_vecs = ioutils.load_vec_list_file(topic_vecs_file)
    gold_labels = ioutils.load_labels_file(labels_file)

    sys_labels = list()
    for i, topic_vec in enumerate(topic_vecs):
        # assign each document to its highest-weight topic
        cluster_idx = 0
        max_dist = 0
        for j, v in enumerate(topic_vec):
            if v > max_dist:
                cluster_idx = j
                max_dist = v
        # print cluster_idx, max_dist
        sys_labels.append(cluster_idx)
        if len(sys_labels) % 5000 == 0:
            print len(sys_labels)

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    # ri_score = rand_index(gold_labels, sys_labels)
    ri_score = 0
    print 'NMI: %f Purity: %f Rand index: %f' % (nmi_score, purity_score, ri_score)
    # print 'Accuracy: %f' % cluster_accuracy(labels, model.labels_)
    print '%f\t%f\t%f' % (nmi_score, purity_score, ri_score)

def get_scores_label_file(true_label_file, y_pred):
    y_true = load_labels_file(true_label_file)
    return get_scores(y_true, y_pred)

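# Hedged end-to-end sketch: train an SVM on document vectors with
# doc_classification_svm, then score the predictions against the gold test
# labels with get_scores_label_file. All paths are placeholders only.
def __example_classify_and_score():
    datadir = 'e:/data/emadr/20ng_bydate'
    y_pred = doc_classification_svm(
        os.path.join(datadir, 'bindata/train-dedw-vecs.bin'),
        os.path.join(datadir, 'bindata/train-labels.bin'),
        os.path.join(datadir, 'bindata/test-dedw-vecs.bin'))
    get_scores_label_file(os.path.join(datadir, 'bindata/test-labels.bin'), y_pred)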