import numpy as np

import utils


def get_centroids(data_pnts, k):
	# the first centroid is picked at random by utils.init_centroids
	centroids = utils.init_centroids(data_pnts, 1).tolist()

	# the remaining centroids are chosen based on their distance to the existing ones
	for i in range(1, k):
		centroids.append(new_center(centroids, data_pnts))

	# finished initializing; continue with regular k-means
	centroids = np.array(centroids)
	return centroids
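new_center is defined elsewhere in the original file. As a minimal sketch, assuming a k-means++-style rule (the next centroid is sampled with probability proportional to the squared distance to its nearest existing centroid), it might look like the following; this implementation is an assumption, not the original helper:

def new_center(centroids, data_pnts):
	# Hypothetical k-means++-style selection (assumed implementation):
	# sample the next centroid with probability proportional to the
	# squared distance to the nearest already-chosen centroid.
	pnts = np.asarray(data_pnts)
	cents = np.asarray(centroids)
	# squared distance from every point to its nearest current centroid
	d2 = ((pnts[:, None, :] - cents[None, :, :]) ** 2).sum(axis=2).min(axis=1)
	probs = d2 / d2.sum()
	idx = np.random.choice(len(pnts), p=probs)
	return pnts[idx].tolist()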
Example #2
import numpy as np
import matplotlib.pyplot as plt

import utils


def main():

    datafile = "toydata.txt"
    k = 3

    # read in the data file and pick k random initial centroids
    data_pnts = utils.read(datafile)
    centroids = utils.init_centroids(data_pnts, k)

    # first run also saves the clustering plot ("k_means.png");
    # run and plot_cost are defined elsewhere in the original file
    rv = run(centroids, data_pnts, k, "cost.png", "k_means.png")

    # re-run k-means 19 more times (20 runs total) to account for the random initialization
    for i in range(19):
        centroids = utils.init_centroids(data_pnts, k)

        temp = run(centroids, data_pnts, k, "cost.png")
        rv = np.concatenate((rv, temp), axis=0)

    # overlay the cost curve of every run in a single figure
    plt.figure()
    for j in rv:
        plot_cost(j[0], j[1])
    plt.savefig("cost.png")
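run itself is not shown. A rough sketch consistent with how it is called above, assuming plain Lloyd iterations and a return value of shape (1, 2, n_iterations) so that each row j of the concatenated result yields iteration numbers in j[0] and per-iteration costs in j[1]; the iteration budget, the empty-cluster handling, and the meaning of the filename arguments are all assumptions:

def run(centroids, data_pnts, k, cost_png, cluster_png=None):
    # Hypothetical sketch of the Lloyd-iteration loop the caller assumes.
    # cost_png is unused here; main() overlays the cost curves itself.
    pnts = np.asarray(data_pnts)
    costs = []
    for it in range(20):  # fixed iteration budget (assumed)
        # assignment step: nearest centroid for every point
        d2 = ((pnts[:, None, :] - np.asarray(centroids)[None, :, :]) ** 2).sum(axis=2)
        labels = d2.argmin(axis=1)
        costs.append(d2[np.arange(len(pnts)), labels].sum())
        # update step: move each centroid to the mean of its assigned
        # points (empty clusters are not handled in this sketch)
        centroids = np.array([pnts[labels == j].mean(axis=0) for j in range(k)])
    if cluster_png is not None:
        # the optional fifth argument presumably names the clustering plot
        plt.figure()
        plt.scatter(pnts[:, 0], pnts[:, 1], c=labels, s=10)
        plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='x')
        plt.savefig(cluster_png)
    # one row per run: j[0] = iteration numbers, j[1] = per-iteration costs
    return np.array([[np.arange(len(costs)), costs]], dtype=float)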
Example #3
import numpy as np

import utils


def initialize(data_pnts, k):
    '''Randomly select k points to serve as the initial means.
       Each cluster gets an equal prior probability, i.e. the dataset
       is assumed to be divided evenly among the clusters. Each
       cluster z is described by a mean, a mixing probability pi[z],
       and the covariance matrix of the zth Gaussian (initialized
       here to the identity; an alternative is the covariance of the
       full training set).
    '''
    # randomly pick k means from the n data points, with equal probability
    means = utils.init_centroids(data_pnts, k)
    # mixing probabilities are uniform: pi[z] = 1/k for every cluster
    pi = [1.0 / k for _ in range(k)]
    # each covariance matrix starts as the 2x2 identity (the data is 2-D)
    covar = [np.identity(2) for _ in range(k)]

    return (means, pi, covar)
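These parameters feed the EM loop, which is not shown here. For context, a minimal sketch of the E-step such an initialization supports, computing each Gaussian's responsibility for each point with scipy.stats.multivariate_normal; the function name e_step is hypothetical:

from scipy.stats import multivariate_normal

def e_step(data_pnts, means, pi, covar):
    # Hypothetical E-step sketch: responsibility of each of the k
    # Gaussians for each data point under the current parameters.
    n, k = len(data_pnts), len(means)
    resp = np.zeros((n, k))
    for z in range(k):
        resp[:, z] = pi[z] * multivariate_normal.pdf(data_pnts, mean=means[z], cov=covar[z])
    resp /= resp.sum(axis=1, keepdims=True)  # normalize over clusters
    return resp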
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt

from utils import init_centroids, get_point_centroid_indices, compute_centroids, calculate_cost

if __name__ == "__main__":
    # Load data from the file
    data = sio.loadmat('k_means_clustering/data/dataset.mat')
    X = np.matrix(data['X'])

    # Initial setup
    K = 3
    max_iterations = 20
    max_clusters = 10
    centroids = init_centroids(X, K)

    # Elbow method: run k-means for every cluster count up to
    # max_clusters and record the final cost of each run
    costs = np.zeros((max_clusters, 1))
    for c in range(1, max_clusters + 1):
        centroids = init_centroids(X, c)
        # alternate assignment and update steps for a fixed number of iterations
        for i in range(max_iterations):
            indices = get_point_centroid_indices(X, centroids)
            centroids = compute_centroids(X, indices, c)
        costs[c - 1, 0] = calculate_cost(X, indices, centroids)

    # Plot the cost graph
    plt.plot(np.arange(1, max_clusters + 1), costs, c='r', marker='o')
    plt.grid()
    plt.xlabel('Number of clusters')
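The helpers imported from utils are not shown in this listing. Minimal sketches that match how the script calls them (the signatures follow the calls above; the internals, including the cost normalization, are assumptions):

def get_point_centroid_indices(X, centroids):
    # index of the nearest centroid for each row of X
    X, centroids = np.asarray(X), np.asarray(centroids)
    d2 = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    return d2.argmin(axis=1)

def compute_centroids(X, indices, k):
    # mean of the points assigned to each of the k clusters
    X = np.asarray(X)
    return np.array([X[indices == j].mean(axis=0) for j in range(k)])

def calculate_cost(X, indices, centroids):
    # mean squared distance between each point and its assigned centroid
    X, centroids = np.asarray(X), np.asarray(centroids)
    return ((X - centroids[indices]) ** 2).sum() / len(X)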