def get_centroids(data_pnts, k):
    """Choose ``k`` starting centroids, one seed first and the rest incrementally.

    The first centroid is obtained from ``utils.init_centroids(data_pnts, 1)``.
    Each of the remaining ``k - 1`` centroids is whatever ``new_center``
    returns when given the centroids picked so far together with the data
    points (per the original author's note, that choice depends on the
    distance to the already-selected centroids).

    Args:
        data_pnts: The data points handed to ``utils.init_centroids`` and
            ``new_center``.
        k: Total number of centroids to produce.

    Returns:
        The selected centroids converted to a NumPy array, ready to be used
        as the starting point of a regular k-means run.
    """
    # Work on a plain list so further centroids can be appended one by one.
    chosen = utils.init_centroids(data_pnts, 1).tolist()

    # One seed is already in place; add the other k - 1.
    for _ in range(k - 1):
        chosen.append(new_center(chosen, data_pnts))

    return np.array(chosen)
def main():
    """Run k-means on the toy data 20 times and plot the cost of every run.

    Reads ``toydata.txt`` through ``utils.read``, then calls ``run`` once with
    both ``"cost.png"`` and ``"k_means.png"`` and 19 more times with only
    ``"cost.png"``. Every call starts from a fresh
    ``utils.init_centroids`` draw so that the effect of random
    initialisation is visible. The results of all runs are stacked along
    axis 0, each stacked row is passed to ``plot_cost``, and the figure is
    saved as ``cost.png``.
    """
    datafile = "toydata.txt"
    k = 3

    # Read in file
    data_pnts = utils.read(datafile)

    # The first run is the only one that also receives "k_means.png".
    first_start = utils.init_centroids(data_pnts, k)
    results = [run(first_start, data_pnts, k, "cost.png", "k_means.png")]

    # Repeat 19 more times to account for the random initialisation.
    for _ in range(19):
        start = utils.init_centroids(data_pnts, k)
        results.append(run(start, data_pnts, k, "cost.png"))

    # Stack every run's output row-wise in one go.
    rv = np.concatenate(results, axis=0)

    # Draw all collected rows on one new figure and write it to disk.
    plt.figure()
    for row in rv:
        plot_cost(row[0], row[1])
    plt.savefig("cost.png")
def initialize(data_pnts, k):
    """Build the starting parameters for a mixture of ``k`` Gaussians.

    * Means: taken from ``utils.init_centroids(data_pnts, k)``.
    * Mixing weights: uniform, every component gets ``1/k``.
    * Covariances: each component starts with its own 2x2 identity matrix.

    NOTE(review): the covariance size is fixed at 2x2, so this presumably
    expects two-dimensional data points -- confirm against the caller.

    Args:
        data_pnts: The data points handed to ``utils.init_centroids``.
        k: Number of mixture components.

    Returns:
        A tuple ``(means, pi, covar)`` where ``pi`` is a list of ``k`` floats
        and ``covar`` is a list of ``k`` separate 2x2 identity arrays.
    """
    means = utils.init_centroids(data_pnts, k)

    pi = []
    covar = []
    for _ in range(k):
        # Equal prior probability for every component.
        pi.append(1 / float(k))
        # A fresh array per component so they can be updated independently.
        covar.append(np.identity(2))

    return means, pi, covar
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt

from utils import (
    init_centroids,
    get_point_centroid_indices,
    compute_centroids,
    calculate_cost,
)

if __name__ == "__main__":
    # Read the .mat file and wrap the array stored under its 'X' key.
    data = sio.loadmat('k_means_clustering/data/dataset.mat')
    X = np.matrix(data['X'])

    # Run configuration.
    K = 3
    max_iterations = 20
    max_clusters = 10

    # This draw is overwritten by the sweep below before it is used; it is
    # kept so the sequence of init_centroids calls stays unchanged.
    centroids = init_centroids(X, K)

    # Elbow method: record the final cost for every cluster count from 1 to
    # max_clusters so the optimal number of clusters can be read off the plot.
    costs = np.zeros((max_clusters, 1))
    for slot, num_clusters in enumerate(range(1, max_clusters + 1)):
        centroids = init_centroids(X, num_clusters)

        # Alternate assignment and centroid update for a fixed number of
        # iterations.
        for _ in range(max_iterations):
            indices = get_point_centroid_indices(X, centroids)
            centroids = compute_centroids(X, indices, num_clusters)

        # Only the cost after the last iteration is stored.
        costs[slot, 0] = calculate_cost(X, indices, centroids)

    # Plot the cost graph
    cluster_axis = np.arange(1, max_clusters + 1)
    plt.plot(cluster_axis, costs, c='r', marker='o')
    plt.grid()
    plt.xlabel('Number of clusters')