Example #1

import numpy as np

# NOTE: twod, arr, k_init, from_1_of_k, optional_return, and __optimize are
# assumed to be defined elsewhere in this package (utility helpers).
def kmeans(X, K, init='random', max_iter=100, do_plot=False, to_return=[1, 0, 0]):
	"""
	Perform K-means clustering on data X.

	Parameters
	----------
	X : numpy array
		N x M array containing data to be clustered.
	K : int
		Number of clusters.
	init : str or array (optional)
		Either a K x M numpy array containing initial cluster centers, or
		one of the following strings specifying an initialization method:
		'random' (choose K data points uniformly at random as centers),
		'farthest' (choose center 1 uniformly, then each subsequent
		center as the point farthest from all centers so far), or 'k++'
		(choose center 1 uniformly, then subsequent centers randomly,
		with probability proportional to distance from the current
		centers).
	max_iter : int (optional)
		Maximum number of optimization iterations.
	do_plot : bool (optional)
		If True, plot the 2D data.
	to_return : [bool] (optional)
		Array of bools that specifies which values to return. The bool
		at to_return[0] indicates whether z should be returned; the bool
		at to_return[1] indicates whether c should be returned, etc.

	Returns
	-------
	z : numpy array
		N x 1 array of cluster assignments, one per data point (row) of X.
	c : numpy array (optional)
		K x M array of cluster centers.
	sumd : scalar (optional)
		Sum of squared Euclidean distances from each point to its assigned center.
		
	TODO: test more
	"""
	n,d = twod(X).shape							# get data size

	if type(init) is str:
		init = init.lower()
		if init == 'random':
			pi = np.random.permutation(n)
			c = X[pi[0:K],:]
		elif init == 'farthest':
			c = k_init(X, K, True)
		elif init == 'k++':
			c = k_init(X, K, False)
		else:
			raise ValueError('kmeans: value for "init" (' + init + ') is invalid')
	else:
		c = init

	z,c,sumd = __optimize(X, n, K, c, max_iter)

	return optional_return(to_return, z - 1, c, sumd)
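
# A minimal usage sketch for kmeans on synthetic two-blob data. The call
# follows the docstring above; the expected shapes in the comments are
# assumptions based on it, and _demo_kmeans is just an illustrative name.
def _demo_kmeans():
	np.random.seed(0)
	X = np.vstack((np.random.randn(50, 2), np.random.randn(50, 2) + 5.0))
	# request assignments, centers, and total squared distance
	z, c, sumd = kmeans(X, 2, init='k++', to_return=[1, 1, 1])
	print(z.shape, c.shape, sumd)		# expect (100, 1), (2, 2), scalar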
def agglom_cluster(X, n_clust, method='means', join=None, to_return=[1,0]):
	"""
	Perform hierarchical agglomerative clustering.

	Parameters
	----------
	X : numpy array
		N x M array of data to be clustered.
	n_clust : int
		The number of clusters into which data should be grouped.
	method : str (optional)
		str that specifies the method to use for calculating distance between
		clusters. Can be one of: 'min', 'max', 'means', or 'average'.
	join : numpy array (optional)
		(N - 1) x 3 array containing a precomputed sequence of joining
		operations. Pass this in to avoid re-running the clustering
		(e.g., to re-cut the same tree at a different n_clust).
	to_return : [bool] (optional)
		Array of bools that specifies which values to return. The bool
		at to_return[0] indicates whether z should be returned; the bool
		at to_return[1] indicates whether join should be returned.

	Returns
	-------
	z : numpy array
		N x 1 array of cluster assignments.
	join : numpy array (optional)
		(N - 1) x 3 array that contains the sequence of joining operations
		performed by the clustering algorithm.
	"""
	m,n = twod(X).shape					# get data size
	D = np.zeros((m,m)) + np.inf		# store pairwise distances b/w clusters (D is an upper triangular matrix)
	z = arr(range(m))					# assignments of data
	num = np.ones(m)					# number of data in each cluster
	mu = np.copy(X)						# centroid of each cluster (explicit copy so X is not mutated)
	method = method.lower()

	if join is None:					# if join not precomputed

		join = np.zeros((m - 1, 3))		# keep track of join sequence
		# use squared Euclidean distance
		dist = lambda a,b: np.sum(np.power(a - b, 2))
		for i in range(m):				# compute initial distances
			for j in range(i + 1, m):
				D[i][j] = dist(X[i,:], X[j,:])


		opn = np.ones(m)				# store list of clusters still in consideration
		val,k = np.min(D),np.argmin(D)	# find first join (closest cluster pair)
		
		for c in range(m - 1):
			i,j = np.unravel_index(k, D.shape)
			join[c,:] = arr([i, j, val])

			# centroid of new cluster
			mu_new = (num[i] * mu[i,:] + num[j] * mu[j,:]) / (num[i] + num[j])

			# compute new distances to cluster i
			for jj in np.where(opn)[0]:
				if jj in [i, j]:
					continue

				# sort indices because D is an upper triangular matrix
				idxi = tuple(sorted((i,jj)))	
				idxj = tuple(sorted((j,jj)))	
					
				if method == 'min':
					D[idxi] = min(D[idxi], D[idxj])		# single linkage (min dist)
				elif method == 'max':
					D[idxi] = max(D[idxi], D[idxj])		# complete linkage (max dist)
				elif method == 'means':
					D[idxi] = dist(mu_new, mu[jj,:])	# mean linkage (dist b/w centroids)
				elif method == 'average':
					# average linkage
					D[idxi] = (num[i] * D[idxi] + num[j] * D[idxj]) / (num[i] + num[j])

			opn[j] = 0						# close cluster j (fold into i)
			num[i] = num[i] + num[j]		# update total membership in cluster i to include j
			mu[i,:] = mu_new				# update centroid list

			# remove cluster j from consideration as min
			for ii in range(m):
				if ii != j:
					# sort indices because D is an upper triangular matrix
					idx = tuple(sorted((ii,j)))	
					D[idx] = np.inf

			val,k = np.min(D), np.argmin(D)	# find next smallest pair

	# compute cluster assignments given sequence of joins
	for c in range(m - n_clust):
		z[z == join[c,1]] = join[c,0]

	uniq = np.unique(z)
	for c in range(len(uniq)):
		z[z == uniq[c]] = c

	return optional_return(to_return, twod(z).T, join)
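
# A small usage sketch for agglom_cluster on made-up 2D points. Reusing the
# returned join sequence to re-cut the tree at a different n_clust follows
# the docstring; _demo_agglom is just an illustrative name.
def _demo_agglom():
	X = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0], [9.0, 0.0]])
	# single linkage into 2 clusters; also return the join sequence
	z, join = agglom_cluster(X, 2, method='min', to_return=[1, 1])
	# reuse the precomputed joins to cut the same tree into 3 clusters
	z3 = agglom_cluster(X, 3, join=join, to_return=[1, 0])
	print(z.T, z3.T)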
def em_cluster(X, K, init='random', max_iter=100, tol=1e-6, do_plot=False, to_return=[1,0,0,0]):
	"""
	Perform Gaussian mixture EM (expectation-maximization) clustering on data X.

	Parameters
	----------
	X : numpy array
		N x M array containing data to be clustered.
	K : int
		Number of clusters.
	init : str or array (optional)
		Either a K x M numpy array containing initial cluster centers, or
		one of the following strings specifying an initialization method:
		'random' (choose K data points uniformly at random as centers),
		'farthest' (choose center 1 uniformly, then each subsequent
		center as the point farthest from all centers so far), or 'k++'
		(choose center 1 uniformly, then subsequent centers randomly,
		with probability proportional to distance from the current
		centers).
	max_iter : int (optional)
		Maximum number of iterations.
	tol : scalar (optional)
		Stopping tolerance.
	do_plot : bool (optional)
		If True, plot the 2D data.
	to_return : [bool] (optional)
		Array of bools that specifies which values to return. The bool
		at to_return[0] indicates whether z should be returned; the bool
		at to_return[1] indicates whether T should be returned, etc.

	Returns
	-------
	z : numpy array
		N x 1 array of cluster assignments (int indices).
	T : dict (optional)
		Gaussian mixture parameters, keyed by:
			'pi'  : numpy array of K mixing weights
			'mu'  : K x M array of component means
			'sig' : M x M x K array of component covariances
	soft : numpy array (optional)
		N x K array of soft assignment probabilities (responsibilities).
	ll : scalar (optional)
		Log-likelihood under the returned model.
	"""
	# init
	N,D = twod(X).shape					# get data size

	if type(init) is str:
		init = init.lower()
		if init == 'random':
			pi = np.random.permutation(N)
			mu = X[pi[0:K],:]
		elif init == 'farthest':
			mu = k_init(X, K, True)
		elif init == 'k++':
			mu = k_init(X, K, False)
		else:
			raise ValueError('em_cluster: value for "init" (' + init + ') is invalid')
	else:
		mu = init

	sig = np.zeros((D,D,K))
	for c in range(K):
		sig[:,:,c] = np.eye(D)
	alpha = np.ones(K) / K
	R = np.zeros((N,K))

	it, ll, ll_old = 1, np.inf, np.inf
	done = it > max_iter
	C = np.log(2 * np.pi) * D / 2

	while not done:
		ll = 0
		for c in range(K):
			# compute log prob of all data under model c
			V = X - np.tile(mu[c,:], (N,1))			
			R[:,c] = -0.5 * np.sum((V.dot(np.linalg.inv(sig[:,:,c]))) * V, axis=1) - 0.5 * np.log(np.linalg.det(sig[:,:,c])) + np.log(alpha[c]) - C

		# avoid numerical issues by subtracting the per-row max first (log-sum-exp trick)
		mx = R.max(1)
		R -= np.tile(twod(mx).T, (1,K))
		# exponentiate and compute sum over components
		R = np.exp(R)
		nm = R.sum(1)
		# update log-likelihood of data
		ll = np.sum(np.log(nm) + mx)
		R /= np.tile(twod(nm).T, (1,K))		# normalize to give membership probabilities

		alpha = R.sum(0)					# total weight for each component
		for c in range(K):
			# weighted mean estimate
			mu[c,:] = (R[:,c] / alpha[c]).T.dot(X)
			tmp = X - np.tile(mu[c,:], (N,1))
			# weighted covar estimate
			sig[:,:,c] = tmp.T.dot(tmp * np.tile(twod(R[:,c]).T / alpha[c], (1,D))) + 1e-32 * np.eye(D)
		alpha /= N

		# stopping criteria
		done = (it >= max_iter) or np.abs(ll - ll_old) < tol
		ll_old = ll
		it += 1

	z = from_1_of_k(R)
	soft = R
	T = {'pi': alpha, 'mu': mu, 'sig': sig}
		
	return optional_return(to_return, twod(z).T, T, soft, ll)
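
# A short usage sketch for em_cluster on synthetic two-component Gaussian
# data. The unpacking below assumes optional_return yields the requested
# values in docstring order; _demo_em is just an illustrative name.
def _demo_em():
	np.random.seed(1)
	X = np.vstack((np.random.randn(60, 2), np.random.randn(60, 2) + 4.0))
	# hard assignments, mixture parameters, soft memberships, log-likelihood
	z, T, soft, ll = em_cluster(X, 2, init='k++', to_return=[1, 1, 1, 1])
	print(T['pi'], T['mu'])			# mixing weights and component means
	print(soft.sum(1)[:5])			# each row of soft sums to 1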