Пример #1
0
def do_clustering(reduced_data, pca_samples):
    """Choose a K-Means cluster count by silhouette score and report the result.

    Fits K-Means for every candidate cluster count from 2 to 6, prints the mean
    silhouette coefficient of each fit and keeps the best-scoring model.  That
    model is then used to plot the clustering through ``rs.cluster_results``
    and to print the predicted cluster of every sample point.

    Args:
        reduced_data: Data to cluster; passed unchanged to ``KMeans.fit``,
            ``KMeans.predict`` and ``silhouette_score``.
        pca_samples: Sample points passed to ``predict`` on the model fitted
            on ``reduced_data``; their cluster assignments are printed.

    Returns:
        The ``cluster_centers_`` of the best-scoring K-Means model.
    """
    range_n_clusters = [2, 3, 4, 5, 6]
    # Silhouette coefficients lie in [-1, 1].  Starting from -inf (rather than
    # 0) guarantees that a candidate is selected even when every score is
    # <= 0, which would otherwise leave the cluster count at an invalid 0.
    best_score = float("-inf")
    best_cluster_size = None
    best_clusterer = None
    best_preds = None

    for num_clusters in range_n_clusters:
        # Fit K-Means with this candidate count and label every data point.
        clusterer = KMeans(n_clusters=num_clusters)
        clusterer.fit(reduced_data)
        preds = clusterer.predict(reduced_data)

        # Mean silhouette coefficient for this number of clusters.
        score = silhouette_score(reduced_data, preds)
        if score > best_score:
            best_score = score
            best_cluster_size = num_clusters
            best_clusterer = clusterer
            best_preds = preds
        # A single pre-formatted string prints identically whether ``print``
        # is a statement or a function.
        print("Silhouette score for %s clusters = %s" % (num_clusters, score))

    print("Best cluster size =  %s" % best_cluster_size)

    # Reuse the model that achieved the best score instead of refitting:
    # K-Means initialisation is random, so a fresh fit could produce a
    # different clustering from the one that was actually scored.
    centers = best_clusterer.cluster_centers_
    sample_preds = best_clusterer.predict(pca_samples)

    # Display the clustering of the data together with the sample points.
    rs.cluster_results(reduced_data, best_preds, centers, pca_samples)

    # Display the predicted cluster of each sample point.
    for i, pred in enumerate(sample_preds):
        print("Sample point %s predicted to be in Cluster %s" % (i, pred))

    return centers
Пример #2
0
def do_clustering(reduced_data, pca_samples):
	"""Choose a K-Means cluster count by silhouette score and report the result.

	Fits K-Means for every candidate cluster count from 2 to 6, prints the mean
	silhouette coefficient of each fit and keeps the best-scoring model.  That
	model is then used to plot the clustering through ``rs.cluster_results``
	and to print the predicted cluster of every sample point.

	Args:
		reduced_data: Data to cluster; passed unchanged to ``KMeans.fit``,
			``KMeans.predict`` and ``silhouette_score``.
		pca_samples: Sample points passed to ``predict`` on the model fitted
			on ``reduced_data``; their cluster assignments are printed.

	Returns:
		The ``cluster_centers_`` of the best-scoring K-Means model.
	"""
	range_n_clusters = [2, 3, 4, 5, 6]
	# Silhouette coefficients lie in [-1, 1].  Starting from -inf (rather than
	# 0) guarantees that a candidate is selected even when every score is
	# <= 0, which would otherwise leave the cluster count at an invalid 0.
	best_score = float("-inf")
	best_cluster_size = None
	best_clusterer = None
	best_preds = None

	for num_clusters in range_n_clusters:
		# Fit K-Means with this candidate count and label every data point.
		clusterer = KMeans(n_clusters=num_clusters)
		clusterer.fit(reduced_data)
		preds = clusterer.predict(reduced_data)

		# Mean silhouette coefficient for this number of clusters.
		score = silhouette_score(reduced_data, preds)
		if score > best_score:
			best_score = score
			best_cluster_size = num_clusters
			best_clusterer = clusterer
			best_preds = preds
		# A single pre-formatted string prints identically whether ``print``
		# is a statement or a function.
		print("Silhouette score for %s clusters = %s" % (num_clusters, score))

	print("Best cluster size =  %s" % best_cluster_size)

	# Reuse the model that achieved the best score instead of refitting:
	# K-Means initialisation is random, so a fresh fit could produce a
	# different clustering from the one that was actually scored.
	centers = best_clusterer.cluster_centers_
	sample_preds = best_clusterer.predict(pca_samples)

	# Display the clustering of the data together with the sample points.
	rs.cluster_results(reduced_data, best_preds, centers, pca_samples)

	# Display the predicted cluster of each sample point.
	for i, pred in enumerate(sample_preds):
		print("Sample point %s predicted to be in Cluster %s" % (i, pred))

	return centers
Пример #3
0
        # NOTE(review): this excerpt starts inside two nested loops whose
        # headers are not visible here; kp, preds_K, preds_G, n_range and the
        # score lists are all defined outside this excerpt.
        # Record the mean silhouette coefficient of the K-Means labels
        # (preds_K) and of the GMM labels (preds_G) for this run.
        scores_K_n += [metrics.silhouette_score(kp,preds_K)]
        scores_G_n += [metrics.silhouette_score(kp,preds_G)]
        
    #print n, scores_K_n
    # Average the per-run scores into one value per algorithm and append it
    # to the overall score lists.
    scores_K += [np.average(scores_K_n)]
    scores_G += [np.average(scores_G_n)]

# Tabulate the averaged scores of both algorithms, indexed by n_range.
print pd.DataFrame(data={'Kmeans':scores_K,'GMM':scores_G},index=n_range)

# Plot both score curves against n_range for visual comparison.
plt.plot(n_range,scores_K)
plt.plot(n_range,scores_G)
plt.legend(['KMeans','GMM'])
plt.show()

# Final model: K-Means with the n_range entry whose averaged K-Means score is
# highest.  The GMM alternative is left commented out.
#clusterer = GMM(n_components=12)
clusterer = KMeans(n_clusters=n_range[np.argmax(scores_K)])

# Fit on kp and label every point in one step.
preds = clusterer.fit_predict(kp)
#centers = clusterer.means_
centers = clusterer.cluster_centers_

#print clusterer.weights_

# A single (0, 0) point is passed as the samples argument -- presumably a
# placeholder because there are no real sample points here; confirm against
# rs.cluster_results.
rs.cluster_results(kp,preds,centers,np.asarray([(0,0)]))

# Save `image` (created outside this excerpt) to disk.
cv2.imwrite('sift_keypoints.jpg',image)
#cv2.imshow('keypoints',image)
plt.show()

Пример #4
0
# Scatter-plot the first two columns of reduced_data, one colour per cluster.
# n_cl, preds, clr, centers and clf are defined outside this excerpt.
for j in range(n_cl):
    # Select the rows whose predicted label is j.
    ix = preds==j
    # NOTE(review): the pandas `.ix` indexer is deprecated and was removed in
    # pandas 1.0; this line needs `.iloc`/`.loc` on newer pandas.
    pl.scatter(reduced_data.ix[ix,0], reduced_data.ix[ix,1], color=clr[j])
# Mark the first two cluster centres with large yellow and green circles.
# NOTE(review): only centres 0 and 1 are drawn, whatever the value of n_cl.
pl.plot(centers[0,0], centers[0,1], 'yo', markersize=20)
pl.plot(centers[1,0], centers[1,1], 'go', markersize=20)
pl.xlabel('PC1');
pl.ylabel('PC2');
# Dashed reference lines through the origin on both axes.
pl.axhline(0, color='k', linestyle='--');
pl.axvline(0, color='k', linestyle='--');

# Mean silhouette coefficient for the chosen number of clusters.
# NOTE(review): this scores clf.labels_ rather than preds; presumably both
# come from the same fitted model -- confirm.
score_opt = silhouette_score(reduced_data, clf.labels_)
print "Score (# clusters = %i) = %1.2f" % (n_cl, score_opt)

# Display the results of the clustering from implementation
rs.cluster_results(reduced_data, preds, centers, pca_samples)


# --------------------------------
# GMM
# --------------------------------
from sklearn import mixture
# NOTE(review): this instance is replaced inside the loop below before it is
# used anywhere in the visible code.
clfGMM = mixture.GMM(n_components=2,covariance_type='full')
# Zero-filled result arrays shaped like `score` (defined outside this excerpt).
aicGMM = np.zeros_like(score)
bicGMM = np.zeros_like(score)
scoreGMM = np.zeros_like(score)
# Fit one full-covariance GMM per candidate count in n_clusters.  The loop
# rebinds n_cl, which the plotting code above also uses.
# NOTE(review): the visible loop body ends after fit(); nothing is written to
# the result arrays within this excerpt.
for i, n_cl in enumerate(n_clusters):
    print "Fitting with # clusters = %i" % n_cl
    clfGMM = mixture.GMM(n_components=n_cl, covariance_type='full')
    clfGMM.fit(reduced_data)
Пример #5
0
# Fit a GMM with 2 and then 3 components on reduced_data and collect the mean
# silhouette coefficient of each fit.
scores = []
for x in range(2, 4):
    gmm = GMM(n_components=x)
    clusterer = gmm.fit(reduced_data)

    # Cluster label for every data point.
    preds = clusterer.predict(reduced_data)

    # Component means serve as the cluster centres.
    centers = clusterer.means_

    score = silhouette_score(reduced_data, preds)

    scores.append(score)
print scores

# NOTE(review): from here on preds and centers come from the last loop
# iteration (3 components), not from the best-scoring fit -- confirm that this
# is intended.
# Display the results of the clustering from implementation
rs.cluster_results(reduced_data, preds, centers)
plt.show()

# Inverse transform the centers
log_centers = pca.inverse_transform(centers)

# Exponentiate the centers
# (presumably undoing a log transform applied before the PCA -- confirm)
true_centers = np.exp(log_centers)

# Display the true centers
# One "Segment i" label per centre; values are rounded and the columns are
# taken from `data` (defined outside this excerpt).
segments = ['Segment {}'.format(i) for i in range(0, len(centers))]
true_centers = pd.DataFrame(np.round(true_centers), columns=data.keys())
true_centers.index = segments

# Append the '50%' and 'mean' rows of data.describe() below the segment rows
# for comparison.
# NOTE(review): DataFrame.append was removed in pandas 2.0; newer pandas needs
# pd.concat here.
true_centers = true_centers.append(data.describe().loc['50%'])
true_centers = true_centers.append(data.describe().loc['mean'])