def __eval_lda_clustering(lda_model, mm_corpus, gold_labels):
    """Score an LDA model as a hard clustering of ``mm_corpus``.

    Every document is assigned the index of its highest-weighted entry in
    ``lda_model[doc]``; the resulting labels are compared with
    ``gold_labels`` using NMI, purity and the Rand index.

    Args:
        lda_model: object indexable by a document, yielding an iterable of
            items whose element 0 is an index and element 1 a weight
            (presumably a gensim ``LdaModel`` -- confirm against the caller).
        mm_corpus: iterable of documents accepted by ``lda_model[...]``.
        gold_labels: reference labels, passed straight to the metrics.

    Returns:
        The tuple ``(nmi_score, purity_score, ri_score)``.
    """
    predicted = []
    for bow in mm_corpus:
        # Start from (0, 0): a document with no weight above zero gets
        # label 0, and on ties the earliest entry wins (strict comparison).
        best_topic, best_weight = 0, 0
        for entry in lda_model[bow]:
            if entry[1] > best_weight:
                best_topic, best_weight = entry[0], entry[1]
        predicted.append(best_topic)

        # Progress indicator, emitted every 5000 documents.
        done = len(predicted)
        if done % 5000 == 0:
            print(done)

    scores = (
        normalized_mutual_info_score(gold_labels, predicted),
        purity(gold_labels, predicted),
        rand_index(gold_labels, predicted),
    )
    print('NMI: %f Purity: %f Rand index: %f' % scores)
    return scores
def bow_kmeans(bow_vecs, gold_labels, num_clusters, compute_rand_index=False):
    """Cluster bag-of-words vectors with k-means and score the result.

    This function used to be defined twice in a row; the second definition
    (which skipped the Rand index) silently replaced the first. Both are
    merged here, and the Rand index is opt-in through
    ``compute_rand_index`` so the default behaviour matches the definition
    that was actually in effect.

    Args:
        bow_vecs: data passed unchanged to ``KMeans.fit``.
        gold_labels: reference labels, passed straight to the metrics.
        num_clusters: number of clusters requested from ``KMeans``.
        compute_rand_index: when true, ``ri_score`` is
            ``rand_index(gold_labels, model.labels_)``; otherwise it is
            reported as ``0`` without being computed.

    Returns:
        The tuple ``(nmi_score, purity_score, ri_score)``.
    """
    print('performing kmeans ...')
    # NOTE(review): recent scikit-learn releases no longer accept ``n_jobs``
    # on KMeans -- confirm the pinned version before upgrading.
    model = KMeans(n_clusters=num_clusters, n_jobs=4, n_init=20)
    model.fit(bow_vecs)

    nmi_score = normalized_mutual_info_score(gold_labels, model.labels_)
    purity_score = purity(gold_labels, model.labels_)
    if compute_rand_index:
        ri_score = rand_index(gold_labels, model.labels_)
    else:
        ri_score = 0

    scores = (nmi_score, purity_score, ri_score)
    print('NMI: %f Purity: %f Rand index: %f' % scores)
    # Tab-separated copy of the same numbers.
    print('%f\t%f\t%f' % scores)
    return scores
def __lda_clustering(datadir='e:/data/emadr/nyt-less-docs/world',
                     num_topics=20, min_occurrence=30):
    """Score precomputed LDA topic vectors as a hard clustering.

    Loads ``lda/test-vecs-<num_topics>-<min_occurrence>.bin`` and
    ``bindata/test-labels.bin`` from ``datadir``, labels each vector with
    the position of its largest component, and prints NMI and purity
    against the loaded labels. The Rand index is not computed and is
    reported as ``0``.

    The dataset location and LDA settings used to be hard-coded, with the
    alternative dataset kept as commented-out code. They are now keyword
    arguments whose defaults reproduce the previous behaviour; for the
    other dataset pass ``datadir='e:/data/emadr/20ng_bydate/'``.

    Args:
        datadir: directory containing the ``bindata`` and ``lda`` folders.
        num_topics: first number in the topic-vector file name.
        min_occurrence: second number in the topic-vector file name.

    Returns:
        None. Results are only printed.
    """
    labels_file = os.path.join(datadir, 'bindata/test-labels.bin')
    topic_vecs_file = os.path.join(
        datadir, 'lda/test-vecs-%d-%d.bin' % (num_topics, min_occurrence))

    topic_vecs = ioutils.load_vec_list_file(topic_vecs_file)
    gold_labels = ioutils.load_labels_file(labels_file)

    sys_labels = []
    for topic_vec in topic_vecs:
        # Start from (0, 0): a vector with no component above zero gets
        # label 0, and on ties the earliest component wins.
        cluster_idx, max_weight = 0, 0
        for idx, weight in enumerate(topic_vec):
            if weight > max_weight:
                cluster_idx, max_weight = idx, weight
        sys_labels.append(cluster_idx)

        # Progress indicator, emitted every 5000 vectors.
        if len(sys_labels) % 5000 == 0:
            print(len(sys_labels))

    nmi_score = normalized_mutual_info_score(gold_labels, sys_labels)
    purity_score = purity(gold_labels, sys_labels)
    ri_score = 0  # Rand index intentionally not computed here.

    scores = (nmi_score, purity_score, ri_score)
    print('NMI: %f Purity: %f Rand index: %f' % scores)
    # Tab-separated copy of the same numbers.
    print('%f\t%f\t%f' % scores)