# Imports needed by this section (the visualization helper's module name is assumed):
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import renders as rs  # project visualization helpers (assumed module name)

def do_clustering(reduced_data, pca_samples):
    range_n_clusters = [2, 3, 4, 5, 6]
    best_score = 0
    best_cluster_size = 0
    for num_clusters in range_n_clusters:
        # Apply the clustering algorithm of choice to the reduced data
        clusterer = KMeans(n_clusters=num_clusters)
        clusterer.fit(reduced_data)

        # Predict the cluster for each data point
        preds = clusterer.predict(reduced_data)

        # Find the cluster centers (or means for GMM)
        centers = clusterer.cluster_centers_

        # Predict the cluster for each transformed sample data point
        sample_preds = clusterer.predict(pca_samples)

        # Calculate the mean silhouette coefficient for the number of clusters chosen
        score = silhouette_score(reduced_data, preds)
        if score > best_score:
            best_score = score
            best_cluster_size = num_clusters
        print "Silhouette score for", num_clusters, "clusters =", score
    print "Best cluster size =", best_cluster_size

    # Re-run the clustering with the best-scoring number of clusters
    clusterer = KMeans(n_clusters=best_cluster_size)
    clusterer.fit(reduced_data)
    preds = clusterer.predict(reduced_data)
    centers = clusterer.cluster_centers_
    sample_preds = clusterer.predict(pca_samples)

    # Display the results of the clustering from implementation
    rs.cluster_results(reduced_data, preds, centers, pca_samples)

    # Display the predictions
    for i, pred in enumerate(sample_preds):
        print "Sample point", i, "predicted to be in Cluster", pred
    return centers
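# Usage sketch for do_clustering (an assumption, not part of the original
# code): reduce the features with a 2-component PCA first, then cluster.
# `log_data` and `log_samples` are hypothetical log-scaled inputs.
import pandas as pd
from sklearn.decomposition import PCA

pca = PCA(n_components=2).fit(log_data)
reduced_data = pd.DataFrame(pca.transform(log_data),
                            columns=['Dimension 1', 'Dimension 2'])
pca_samples = pca.transform(log_samples)
centers = do_clustering(reduced_data, pca_samples)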
# TODO: Calculate the mean silhouette coefficient for the number of clusters chosen
# (fragment from inside nested loops: the inner loop accumulates per-run scores in
# scores_K_n / scores_G_n for KMeans and GMM predictions on the keypoint
# descriptors kp; the outer loop over n in n_range averages them.
# Assumed imports: numpy as np, pandas as pd, matplotlib.pyplot as plt, cv2,
# sklearn metrics, KMeans, GMM, and renders as rs.)
        scores_K_n += [metrics.silhouette_score(kp, preds_K)]
        scores_G_n += [metrics.silhouette_score(kp, preds_G)]
    # print n, scores_K_n
    scores_K += [np.average(scores_K_n)]
    scores_G += [np.average(scores_G_n)]

print pd.DataFrame(data={'KMeans': scores_K, 'GMM': scores_G}, index=n_range)
plt.plot(n_range, scores_K)
plt.plot(n_range, scores_G)
plt.legend(['KMeans', 'GMM'])
plt.show()

# clusterer = GMM(n_components=12)
clusterer = KMeans(n_clusters=n_range[np.argmax(scores_K)])
preds = clusterer.fit_predict(kp)
# centers = clusterer.means_   # (use means_ when clusterer is a GMM)
centers = clusterer.cluster_centers_
# print clusterer.weights_
rs.cluster_results(kp, preds, centers, np.asarray([(0, 0)]))

cv2.imwrite('sift_keypoints.jpg', image)
# cv2.imshow('keypoints', image)
plt.show()
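# Note: sklearn.mixture.GMM, referenced above, was deprecated in scikit-learn
# 0.18 and removed in 0.20. A minimal equivalent with the current API
# (a sketch, not part of the original code):
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=12, covariance_type='full')
preds_G = gmm.fit(kp).predict(kp)  # kp: the keypoint descriptors from above
centers_G = gmm.means_             # component means, analogous to cluster_centers_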
# (assumed context: pl is pylab, clr is a list of per-cluster colors defined
# earlier, and clf is the KMeans model fitted above with n_cl clusters)
for j in range(n_cl):
    ix = preds == j
    pl.scatter(reduced_data.ix[ix, 0], reduced_data.ix[ix, 1], color=clr[j])
# note: only the first two centers are plotted here, even when n_cl > 2
pl.plot(centers[0, 0], centers[0, 1], 'yo', markersize=20)
pl.plot(centers[1, 0], centers[1, 1], 'go', markersize=20)
pl.xlabel('PC1')
pl.ylabel('PC2')
pl.axhline(0, color='k', linestyle='--')
pl.axvline(0, color='k', linestyle='--')

# TODO: Calculate the mean silhouette coefficient for the number of clusters chosen
score_opt = silhouette_score(reduced_data, clf.labels_)
print "Score (# clusters = %i) = %1.2f" % (n_cl, score_opt)

# Display the results of the clustering from implementation
rs.cluster_results(reduced_data, preds, centers, pca_samples)

# --------------------------------
# GMM
# --------------------------------
from sklearn import mixture

clfGMM = mixture.GMM(n_components=2, covariance_type='full')
aicGMM = np.zeros_like(score)
bicGMM = np.zeros_like(score)
scoreGMM = np.zeros_like(score)
for i, n_cl in enumerate(n_clusters):
    print "Fitting with # clusters = %i" % n_cl
    clfGMM = mixture.GMM(n_components=n_cl, covariance_type='full')
    clfGMM.fit(reduced_data)
    # (the loop body is cut off here in the original; see the sketch below)
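# The GMM loop above is truncated after clfGMM.fit(...). A self-contained
# sketch of the likely bookkeeping, given the aicGMM/bicGMM/scoreGMM arrays it
# initializes, written with the modern GaussianMixture API (the old GMM also
# exposed aic()/bic(); the variable names here are assumptions):
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

n_clusters = [2, 3, 4, 5, 6]
aicGMM = np.zeros(len(n_clusters))
bicGMM = np.zeros(len(n_clusters))
scoreGMM = np.zeros(len(n_clusters))
for i, n_cl in enumerate(n_clusters):
    g = GaussianMixture(n_components=n_cl, covariance_type='full').fit(reduced_data)
    aicGMM[i] = g.aic(reduced_data)  # lower is better
    bicGMM[i] = g.bic(reduced_data)  # lower is better
    scoreGMM[i] = silhouette_score(reduced_data, g.predict(reduced_data))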
scores = []
for x in range(2, 4):
    gmm = GMM(n_components=x)
    clusterer = gmm.fit(reduced_data)
    preds = clusterer.predict(reduced_data)
    centers = clusterer.means_
    score = silhouette_score(reduced_data, preds)
    scores.append(score)
print scores

# Display the results of the clustering from implementation
# (note: preds and centers carry over from the last loop iteration, i.e. x = 3)
rs.cluster_results(reduced_data, preds, centers)
plt.show()

# Inverse transform the centers back to the original (log-scaled) feature space
log_centers = pca.inverse_transform(centers)

# Exponentiate the centers to undo the earlier log transform
true_centers = np.exp(log_centers)

# Display the true centers alongside the data's median and mean
segments = ['Segment {}'.format(i) for i in range(0, len(centers))]
true_centers = pd.DataFrame(np.round(true_centers), columns=data.keys())
true_centers.index = segments
true_centers = true_centers.append(data.describe().loc['50%'])
true_centers = true_centers.append(data.describe().loc['mean'])
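# Follow-up sketch (an assumption, not from the original code): subtracting
# the data's median from each segment center highlights which features a
# segment over- or under-indexes on relative to a typical customer.
delta = true_centers.loc[segments] - data.median()
print delta.round()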