def good_number_of_clusters(vals, max_clusters=30):
    """Plot the elbow curve (WCSS against number of clusters) for ``vals``.

    One KMeans model is fitted for every cluster count in
    ``1 .. max_clusters - 1``; each model's ``inertia_`` (within-cluster sum
    of squares) is recorded and the resulting curve is shown with matplotlib.

    Args:
        vals: Data handed unchanged to ``KMeans.fit``.
        max_clusters: Exclusive upper bound of the cluster counts tried.
            The default of 30 keeps the original 1..29 sweep.

    Returns:
        None. The figure is displayed through ``plt.show()``.
    """
    cluster_counts = list(range(1, max_clusters))

    wcss = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(n_clusters=n_clusters, init="k-means++", n_init=10, max_iter=300)
        # Only the fitted inertia is needed, so the labels are not requested.
        kmeans.fit(vals)
        wcss.append(kmeans.inertia_)

    # Pass the cluster counts explicitly: plotting ``wcss`` alone would use the
    # list indices 0..N-1 as x values and shift the elbow one cluster to the left
    # relative to the "Number of clusters" axis label.
    plt.plot(cluster_counts, wcss, 'ro-', label="WCSS")
    plt.title("Computing WCSS for KMeans++")
    plt.xlabel("Number of clusters")
    plt.ylabel("WCSS")
    plt.show()
def do_KMeans_clustering(N_cluster, X, device, max_over_coef=10):
    """
    Label the training data with KMeans and derive per-cluster oversampling
    coefficients.

    Input:
        N_cluster: number of clusters (e.g. as estimated by gap statistics);
                   anything convertible with int() is accepted, including a
                   0-dim tensor
        X: Training data for the input layer (a tensor; it is moved to `device`)
        device: torch device on which the clustering is run
        max_over_coef: upper bound applied to every coefficient (default 10,
                       the value that used to be hard-coded)
    Output:
        cluster_label: label assigned to every point (on CPU)
        over_coef: int32 tensor with one entry per cluster (on CPU), equal to
                   size_of_largest_cluster / size_of_this_cluster truncated to
                   an integer and capped at `max_over_coef`; used by the
                   oversampling method to add points in sparsely populated
                   clusters
    """
    n_clusters = int(N_cluster)
    X = X.to(device)

    # Instantiating kmeans object
    kmeans = KMeans(n_clusters=n_clusters, mode='euclidean', verbose=1)
    cluster_label = kmeans.fit_predict(X)

    # Number of points assigned to each cluster. `minlength` guarantees one
    # entry per cluster even when the highest-numbered clusters are empty.
    # NOTE(review): assumes fit_predict returns a 1-D integer label tensor.
    cluster_size = torch.bincount(cluster_label, minlength=n_clusters)

    # An empty cluster would otherwise divide by zero and push `inf` into an
    # integer tensor; treating its size as 1 gives it the largest ratio, which
    # the cap below then limits to `max_over_coef`.
    safe_size = cluster_size.clamp(min=1)
    ratio = cluster_size.max() / safe_size

    # Cap first, then truncate to int32 (same truncation the element-wise
    # assignment into an int32 tensor performed before).
    over_coef = ratio.clamp(max=max_over_coef).to(torch.int32)

    return cluster_label.cpu(), over_coef.cpu()
def do_gap_statistics(X, n_var, device, max_cluster=30, trials=10):
    """
    Estimate the number of clusters with a gap-statistics style search.

    For each trial, KMeans is fitted on the data and on a fixed random
    reference set for 1, 2, ... clusters. The gap is
    log(inertia_reference / inertia_data). The sweep stops as soon as the gap
    decreases, and the cluster count with the largest evaluated gap gets one
    vote. The count with the most votes over all trials is returned.

    Input:
        X: Training data for the input layer (a tensor; it is moved to `device`)
        n_var: The number of design variables of the problem (number of
               columns of the random reference set)
        device: torch device on which the computation is run
        max_cluster: largest number of clusters to try (default 30)
        trials: number of voting rounds (default 10)
    Output:
        N_cluster: 0-dim integer tensor holding the number of clusters that
                   maximizes the gap most often
    """
    X = X.to(device)

    # Never ask for more clusters than there are samples.
    max_cluster = min(max_cluster, len(X))

    count = torch.zeros(max_cluster, dtype=torch.int32).to(device)
    # Reference data: standard-normal samples, drawn once and shared by all trials.
    X_rnd = torch.randn(len(X), n_var).to(device)

    for _ in range(trials):
        gap = torch.zeros(max_cluster, dtype=torch.float32).to(device)
        evaluated = 0  # how many leading entries of `gap` hold real values

        for cluster in range(max_cluster):
            kmeans = KMeans(n_clusters=cluster + 1, mode='euclidean')
            labels = kmeans.fit_predict(X)

            kmeans_rnd = KMeans(n_clusters=cluster + 1, mode='euclidean')
            labels_rnd = kmeans_rnd.fit_predict(X_rnd)

            gap[cluster] = torch.log(
                kmeans_rnd.inertia_(X_rnd, labels_rnd) / kmeans.inertia_(X, labels)
            )
            evaluated = cluster + 1

            # Stop at the first decrease of the gap.
            if cluster > 0 and gap[cluster] - gap[cluster - 1] < 0.0:
                break

        # Only look at the entries that were actually computed: the untouched
        # zeros after an early break would otherwise beat negative gaps and
        # vote for a cluster count that was never evaluated.
        best = torch.argmax(gap[:evaluated])
        count[best] += 1

    N_cluster = torch.argmax(count) + 1  # +1 because cluster in the range(max_cluster) starts from zero
    return N_cluster
def test_score(self):
    """
    Checks that the within-cluster variance of a fitted model is small:
    sqrt(score) divided by the number of samples must stay below EPS.
    """
    X, _targets, centers = generate_cluster_samples()
    sample_count, _feature_count = X.shape[0], X.shape[1]
    n_centers = centers.shape[0]

    model = KMeans(n_centers, N_ITER)
    # The returned assignments are not inspected; the call is what fits the model.
    model.fit_predict(X)

    normalised_score = np.sqrt(model.score(X)) / sample_count
    self.assertLess(normalised_score, EPS)
def predict_line_label(self):
    """Cluster the smoothed signals into three groups and return a boolean label array.

    ``self.check()`` is called first. Depending on ``self.method``:

    * ``'1D'``: scikit-learn KMeans (3 clusters) is run on ``self.smoothed_TR``
      as a single column. The distances of every sample to the three centres
      are stored in ``self.distance1D`` and the index of the smallest centre in
      ``self.label1d``. A sample is labelled ``True`` when it does not belong
      to that smallest-centre cluster.
    * ``'2D'``: the ``KMeans`` class in scope is run with
      ``fixed_centroids={0: [0, 0]}`` on pairs of (smoothed_TR, smoothed_G
      rescaled to the norm of smoothed_TR). ``kmeans.distance`` is stored in
      ``self.distance2D``. A sample is labelled ``True`` when its predicted
      cluster is not 0, and ``self.alpha`` trailing ``False`` entries are
      appended.

    Returns:
        Boolean array of labels as described above.

    Raises:
        ValueError: if ``self.method`` is neither ``'1D'`` nor ``'2D'``.
    """
    self.check()

    if self.method == '1D':
        # Build the (n, 1) sample matrix once; it is needed for both the fit
        # and the distance computation.
        samples = np.array(self.smoothed_TR).reshape(-1, 1)
        kmeans = sklearn.cluster.KMeans(n_clusters=3)
        pred = kmeans.fit_predict(samples)
        self.distance1D = scipy.spatial.distance.cdist(
            samples, kmeans.cluster_centers_)
        self.label1d = np.argmin(kmeans.cluster_centers_)
        return pred != self.label1d

    if self.method == '2D':
        kmeans = KMeans(n_clusters=3, fixed_centroids={0: [0, 0]})
        # smoothed_TR is cut to the length of smoothed_G so both can be paired.
        trimmed_TR = self.smoothed_TR[:len(self.smoothed_G)]
        norm_TR = np.linalg.norm(trimmed_TR)
        norm_G = np.linalg.norm(self.smoothed_G)
        # Rescale G so that both coordinates have the same overall norm.
        # For 1-D inputs dstack gives shape (1, n, 2); [0] keeps the (n, 2) part.
        X = np.dstack((trimmed_TR, self.smoothed_G * norm_TR / norm_G))[0]
        pred = kmeans.fit_predict(X)
        self.distance2D = kmeans.distance
        # Pad with `alpha` False entries after the clustered samples.
        return np.hstack([pred != 0, [False] * self.alpha])

    # str() keeps the message intact for non-string values (e.g. None), where
    # plain concatenation would raise TypeError instead of the intended error.
    raise ValueError("Wrong method argument: " + str(self.method))
import matplotlib.pyplot as plt
import torch
from matplotlib.image import imread

from kmeans import KMeans

# Read the picture and flatten it to one row per pixel with 3 values each.
image = imread('images/IMG_0015.jpg')
X = image.reshape(-1, 3)

# Hand the pixels to KMeans as a float tensor.
X_t = torch.from_numpy(X).float()

# Cluster the pixels into 5 groups.
kmeans = KMeans(n_clusters=5)
labels = kmeans.fit_predict(X_t)

# Look up the centroid for every pixel's label and restore the original
# image layout.
segmented_img = kmeans.centroids[labels].view(image.shape)
new_img = segmented_img.numpy()

# Store the result as 8-bit values.
plt.imsave('images/5.jpg', new_img.astype('uint8'))