def test_entropy(self):
    """Check ``entropy.entropy`` on one valid input and four invalid ones.

    The valid probability vector must yield exactly the value returned by
    ``it_entropy`` for the same input. Each invalid vector must make
    ``entropy.entropy`` raise ``AssertionError``.
    """
    probs = np.array([0.1, 0.5, 0.01, 0.07, 0.02, 0.3, 0, 0, 0], dtype="d")
    self.assertEqual(entropy.entropy(probs), it_entropy(probs))

    # ``assertRaises`` is used instead of ``try / self.fail() / except
    # AssertionError: pass``: ``self.fail()`` itself raises AssertionError
    # (unittest's default failure exception), so that pattern swallowed its
    # own failure and could never report a missing exception.

    # Single negative entry.
    self.assertRaises(
        AssertionError, entropy.entropy, np.array([-1], dtype="d"))

    # Entries sum to 0.9 rather than 1.
    self.assertRaises(
        AssertionError, entropy.entropy, np.array([0.1, 0.8], dtype="d"))

    # Entries sum to 1 but individually fall outside [0, 1].
    self.assertRaises(
        AssertionError, entropy.entropy, np.array([2, -1], dtype="d"))

    # Empty vector.
    self.assertRaises(
        AssertionError, entropy.entropy, np.array([], dtype="d"))
def _summarize(data, vocabulary, labels_column, num_cluster):
    """Print per-cluster tag statistics and show a cluster similarity plot.

    The report consists of, in order:

    1. the number of songs per cluster (a ``Counter`` over ``labels_column``);
    2. for every cluster, the tags returned by ``top_10_frequency`` for the
       rows of that cluster, and the ten tags with the largest
       ``prob_Ct[tag][cluster]``;
    3. the entropy of ``prob_Tc[cluster]`` for every cluster;
    4. an image of the pairwise kernel matrix between the clusters'
       ``prob_Tc`` rows (diagonal zeroed), annotated on the diagonal with the
       cluster entropies;
    5. per cluster, its entropy and the mean of its row in that matrix,
       followed by the Pearson correlation between those two series.

    Parameters
    ----------
    data
        Song-by-tag data; it is indexed with an array of row indices and
        handed to ``compute_probs`` / ``top_10_frequency``.
        NOTE(review): exact type (dense/sparse) is not visible here.
    vocabulary
        Mapping or sequence from tag index to a printable tag name.
    labels_column
        Cluster label of every row of ``data``. It is compared element-wise
        with ``== clust``, so presumably a NumPy array -- confirm with caller.
    num_cluster
        Number of clusters; labels are taken to be ``0 .. num_cluster - 1``.

    Returns
    -------
    None
        Everything is printed or plotted; ``plt.show()`` is called.
    """
    # ``range`` instead of ``xrange`` so the function also runs on Python 3.
    cluster_ids = range(num_cluster)

    # Basic stats
    print("Number of songs per cluster")
    counter = Counter(labels_column)
    print(counter)
    print()

    prob_Ct, prob_Tc, prob_T = compute_probs(
        data, num_cluster, labels_column, counter)
    all_tags = range(len(prob_T))

    print("Top tags per cluster")
    for clust in cluster_ids:
        print(clust, "tags with max_freq_in_cluster")
        songs_in_cluster = np.where(labels_column == clust)[0]
        for tag in top_10_frequency(data[songs_in_cluster]):
            print("\t", vocabulary[tag])
        print()

        print(clust, "tags with max_prob_p(c|t)")
        # The key is consumed immediately by ``sorted``, so closing over the
        # loop variable ``clust`` is safe here.
        best_tags = sorted(
            all_tags, key=lambda tag: prob_Ct[tag][clust], reverse=True)[:10]
        for tag in best_tags:
            print("\t", vocabulary[tag])
        print()
    print()

    print("Term entropies for each cluster")
    term_entropies = []
    for clust in cluster_ids:
        h = entropy.entropy(prob_Tc[clust])
        term_entropies.append(h)
        print(clust, h)
    print()

    # Number of shared tags between clusters
    # One row per cluster holding that cluster's prob_Tc value for every tag.
    X = np.zeros((num_cluster, len(all_tags)))
    for clust in cluster_ids:
        X[clust] = [prob_Tc[clust][tag] for tag in all_tags]

    # NOTE(review): ``pairwise_kernels`` is presumably scikit-learn's, whose
    # default is a linear kernel -- so these are similarities despite the
    # variable name. Confirm against the file's imports.
    distances = pairwise_kernels(X)
    # Drop self-similarity so it dominates neither the colour scale nor the
    # row means computed below.
    np.fill_diagonal(distances, 0)

    plt.imshow(distances, cmap="bone_r", interpolation="nearest")
    ax = plt.gca()
    plt.xticks(np.arange(0, num_cluster))
    plt.yticks(np.arange(0, num_cluster))
    plt.colorbar()
    plt.title("Confusion Matrix for Cluster Similarities")
    plt.ylabel("ClusterID")
    plt.xlabel("ClusterID")
    # Write each cluster's entropy on its (zeroed) diagonal cell.
    for i in cluster_ids:
        ax.annotate("%.3f" % term_entropies[i], xy=(i, i),
                    horizontalalignment="center",
                    verticalalignment="center")
    plt.show()

    print("Mean difference")
    to_corr_1 = []
    to_corr_2 = []
    for clust in cluster_ids:
        # The row mean includes the zeroed diagonal entry.
        row_mean = np.mean(distances[clust])
        to_corr_1.append(term_entropies[clust])
        to_corr_2.append(row_mean)
        print(clust, term_entropies[clust], row_mean)

    from scipy.stats import pearsonr
    # ``pearsonr`` returns (correlation coefficient, p-value), not R squared;
    # the "R2 " label is kept unchanged for output compatibility.
    print("R2 ", pearsonr(to_corr_1, to_corr_2))