# Example no. 1
def kmeans(X, K, init='random', max_iter=100, do_plot=False, to_return=[1, 0, 0]):
	"""
	Perform K-means clustering on data X.

	Parameters
	----------
	X : numpy array
		N x M array containing data to be clustered.
	K : int
		Number of clusters.
	init : str or array (optional)
		Either a K x M numpy array containing initial cluster centers, or
		one of the following strings (case-insensitive) that specifies a
		cluster init method: 'random' (K random data points (uniformly) as
		clusters), 'farthest' (choose cluster 1 uniformly, then the point
		farthest from all clusters so far, etc.), or 'k++' (choose cluster 1
		uniformly, then points randomly proportional to distance from
		current clusters).
	max_iter : int (optional)
		Maximum number of optimization iterations.
	do_plot : bool (optional)
		Plot 2D data? Not used by the code in this function.
	to_return : [bool] (optional)
		Array of bools that specifies which values to return. The bool
		at to_return[0] indicates whether z should be returned; the bool
		at to_return[1] indicates whether c should be returned, etc.

	Returns
	-------
	z : numpy array
		Cluster numbers of the data at the corresponding indices in X.
		This is the optimizer's assignment vector minus one (presumably to
		make the labels zero-based -- confirm against __optimize).
	c : numpy array (optional)
		K x M array of cluster centers.
	sumd : scalar (optional)
		Sum of squared euclidean distances.

	Raises
	------
	ValueError
		If init is a string that names no known initialization method.
	"""
	# TODO: test more
	n, _ = twod(X).shape  # number of data points

	if isinstance(init, str):
		init = init.lower()
		if init == 'random':
			# K distinct data points, chosen uniformly at random
			perm = np.random.permutation(n)
			c = X[perm[0:K],:]
		elif init == 'farthest':
			c = k_init(X, K, True)
		elif init == 'k++':
			c = k_init(X, K, False)
		else:
			raise ValueError('kmeans: value for "init" ( ' + init +  ') is invalid')
	else:
		# caller supplied the initial cluster centers directly
		c = init

	z,c,sumd = __optimize(X, n, K, c,  max_iter)

	return optional_return(to_return, z - 1, c, sumd)
def em_cluster(X, K, init='random', max_iter=100, tol=1e-6, do_plot=False,
               to_return=[1, 0, 0, 0]):
    """
    Perform Gaussian mixture EM (expectation-maximization) clustering on data X.

    NOTE(review): the parameters between X and to_return are restored in the
    order in which they are documented below. The defaults for init, max_iter
    and do_plot mirror kmeans; the default tol of 1e-6 is an assumption --
    confirm against existing callers.

    Parameters
    ----------
    X : numpy array
        N x M array containing data to be clustered.
    K : int
        Number of clusters.
    init : str or array (optional)
        Either a K x M numpy array containing initial cluster means (it is
        copied, never modified), or one of the following strings
        (case-insensitive) that specifies a cluster init method:
        'random' (K random data points (uniformly) as clusters),
        'farthest' (choose cluster 1 uniformly, then the point farthest
        from all clusters so far, etc.), or
        'k++' (choose cluster 1 uniformly, then points randomly
        proportional to distance from current clusters).
    max_iter : int (optional)
        Maximum number of iterations.
    tol : scalar (optional)
        Stopping tolerance on the absolute change in log-likelihood between
        consecutive iterations.
    do_plot : bool (optional)
        Plot if do_plot == True. Not used by the code in this function.
    to_return : [bool] (optional)
        Array of bools that specifies which values to return. The bool
        at to_return[0] indicates whether z should be returned; the bool
        at to_return[1] indicates whether T should be returned, etc.

    Returns
    -------
    z : numpy array
        Cluster assignments (int indices) derived from the soft assignments
        via from_1_of_k, passed through twod(...).T (presumably N x 1 --
        confirm against twod).
    T : {str -> numpy array} (optional)
        Gaussian component parameters:
            'pi' : numpy array, length K, mixing weights
            'mu' : numpy array, K x M, component means
            'sig' : numpy array, M x M x K, component covariances
    soft : numpy array (optional)
        N x K soft assignment probabilities.
    ll : scalar (optional)
        Log-likelihood of the data as computed in the last E-step, i.e.
        under the parameters in effect before the final M-step update.

    Raises
    ------
    ValueError
        If init is a string that names no known initialization method.
    """
    # init
    N, D = twod(X).shape  # get data size

    if isinstance(init, str):
        init = init.lower()
        if init == 'random':
            perm = np.random.permutation(N)
            mu = X[perm[0:K], :]
        elif init == 'farthest':
            mu = k_init(X, K, True)
        elif init == 'k++':
            mu = k_init(X, K, False)
        else:
            raise ValueError('em_cluster: value for "init" ( ' + init +
                             ') is invalid')
    else:
        mu = init

    # Work on a private float copy: the M-step writes into mu row by row, which
    # must neither modify a caller-supplied init array nor truncate the
    # weighted means to integers when the data has an integer dtype.
    mu = np.array(mu, dtype=float)

    # every component starts with identity covariance and equal weight
    sig = np.zeros((D, D, K))
    for c in range(K):
        sig[:, :, c] = np.eye(D)
    alpha = np.ones(K) / K
    R = np.zeros((N, K))

    it, ll, ll_old = 1, np.inf, np.inf
    done = it > max_iter
    C = np.log(2 * np.pi) * D / 2  # Gaussian normalization constant (log domain)

    while not done:
        # E-step: log of (mixing weight * Gaussian density) per point/component
        for c in range(K):
            V = X - mu[c, :]
            cov = sig[:, :, c]
            R[:, c] = (-0.5 * np.sum(V.dot(np.linalg.inv(cov)) * V, axis=1)
                       - 0.5 * np.log(np.linalg.det(cov))
                       + np.log(alpha[c]) - C)

        # avoid numerical issues by removing the per-row maximum first
        mx = R.max(1)
        R -= mx[:, np.newaxis]
        # exponentiate and compute sum over components
        R = np.exp(R)
        nm = R.sum(1)
        # update log-likelihood of data (adds the removed maximum back in)
        ll = np.sum(np.log(nm) + mx)
        # normalize to give membership probabilities
        R /= nm[:, np.newaxis]

        # M-step
        alpha = R.sum(0)  # total weight for each component
        for c in range(K):
            w = R[:, c] / alpha[c]  # normalized responsibilities of component c
            # weighted mean estimate
            mu[c, :] = w.dot(X)
            centered = X - mu[c, :]
            # weighted covar estimate, plus a tiny diagonal term
            sig[:, :, c] = (centered.T.dot(centered * w[:, np.newaxis])
                            + 1e-32 * np.eye(D))
        alpha /= N

        # stopping criteria
        done = (it >= max_iter) or np.abs(ll - ll_old) < tol
        ll_old = ll
        it += 1

    z = from_1_of_k(R)
    soft = R
    T = {'pi': alpha, 'mu': mu, 'sig': sig}

    return optional_return(to_return, twod(z).T, T, soft, ll)
def agglom_cluster(X, n_clust, method='means', join=None, to_return=[1,0]):
	"""
	Perform hierarchical agglomerative clustering.

	Parameters
	----------
	X : numpy array
		N x M array of data to be clustered. It is not modified.
	n_clust : int
		The number of clusters into which data should be grouped.
	method : str (optional)
		str (case-insensitive) that specifies the method to use for
		calculating distance between clusters. Can be one of: 'min' (single
		linkage), 'max' (complete linkage), 'means' (distance between
		centroids), or 'average' (average linkage). Any other value leaves
		the stored distances unchanged when clusters are merged. Ignored
		when join is supplied.
	join : numpy array (optional)
		N - 1 x 3 array that contains a sequence of joining operations. Pass
		to avoid reclustering for new X.
	to_return : [bool] (optional)
		Array of bools that specifies which values to return. The bool
		at to_return[0] indicates whether z should be returned; the bool
		at to_return[1] indicates whether join should be returned.

	Returns
	-------
	z : numpy array
		Cluster assignments, relabelled to 0 .. (number of clusters - 1) and
		passed through twod(...).T (presumably N x 1 -- confirm against twod).
	join : numpy array (optional)
		N - 1 x 3 array that contains the sequence of joining operations
		performed by the clustering algorithm. Row c is [i, j, d]: cluster j
		was folded into cluster i when their distance was d.
	"""
	m, _ = twod(X).shape				# number of data points
	# pairwise distances b/w clusters; only the upper triangle (i < j) is
	# ever written, everything else stays at inf
	D = np.full((m, m), np.inf)
	z = arr(range(m))					# assignments of data
	num = np.ones(m)					# number of data in each cluster
	# Centroid of each cluster. An explicit float copy, so that the centroid
	# updates below can never write through to the caller's X.
	mu = np.array(X, dtype=float)
	method = method.lower()

	if join is None:					# if join not precomputed

		join = np.zeros((m - 1, 3))		# keep track of join sequence

		def dist(a, b):
			# squared Euclidean distance
			return np.sum(np.power(a - b, 2))

		for i in range(m):				# compute initial distances
			for j in range(i + 1, m):
				D[i, j] = dist(X[i,:], X[j,:])

		opn = np.ones(m)				# flags clusters still in consideration
		val, k = np.min(D), np.argmin(D)	# find first join (closest cluster pair)
		for c in range(m - 1):
			i, j = np.unravel_index(k, D.shape)
			join[c,:] = (i, j, val)

			# centroid of new cluster
			mu_new = (num[i] * mu[i,:] + num[j] * mu[j,:]) / (num[i] + num[j])

			# compute new distances to cluster i
			for jj in np.where(opn)[0]:
				if jj in (i, j):
					# the two clusters being merged have no distance to themselves
					continue

				# sort indices because D is an upper triangular matrix
				idxi = tuple(sorted((i, jj)))
				idxj = tuple(sorted((j, jj)))
				if method == 'min':
					D[idxi] = min(D[idxi], D[idxj])		# single linkage (min dist)
				elif method == 'max':
					D[idxi] = max(D[idxi], D[idxj])		# complete linkage (max dist)
				elif method == 'means':
					D[idxi] = dist(mu_new, mu[jj,:])	# mean linkage (dist b/w centroids)
				elif method == 'average':
					# average linkage
					D[idxi] = (num[i] * D[idxi] + num[j] * D[idxj]) / (num[i] + num[j])

			opn[j] = 0					# close cluster j (fold into i)
			num[i] += num[j]			# cluster i now also holds the members of j
			mu[i,:] = mu_new			# update centroid list

			# Remove cluster j from consideration as min. The lower triangle
			# and diagonal are inf already, so blanking the whole row and
			# column touches exactly the upper-triangle entries involving j.
			D[j,:] = np.inf
			D[:,j] = np.inf

			val, k = np.min(D), np.argmin(D)	# find next smallest pair

	# compute cluster assignments given sequence of joins
	for c in range(m - n_clust):
		z[z == join[c,1]] = join[c,0]

	# Relabel the surviving cluster ids as consecutive integers. np.unique
	# returns them sorted, so a new label never exceeds the old label it
	# replaces and cannot collide with one that is still to be processed.
	for new_label, old_label in enumerate(np.unique(z)):
		z[z == old_label] = new_label

	return optional_return(to_return, twod(z).T, join)
