Example #1
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics
from sklearn import cluster
from sklearn.datasets import make_blobs


def KMeans_std():
    lst = []
    space = np.linspace(0.01, 10, 10)
    samples = 1000
    centers = 4
    dim = 2

    for i in space:
        x, y = make_blobs(n_samples=samples,
                          centers=centers,
                          n_features=dim,
                          random_state=1,
                          cluster_std=i)
        # predicted labels from k-means with 4 clusters
        _y = cluster.KMeans(n_clusters=4).fit_predict(x)

        acc = sklearn.metrics.homogeneity_score(y, _y)
        lst.append(acc)

    plt.plot(space, np.array(lst), 'r', label='homogeneity')
    plt.xlabel('Standard deviation')
    plt.ylabel('Homogeneity score')
    plt.legend()
    plt.title('Samples: {0} Centers: {1} Dimensions: {2}'.format(
        samples, centers, dim))
    plt.grid(color='#dddddd', linestyle='-', linewidth=1)
    plt.show()
Example #2
def split_words_bykeam(texteg, wn):
    # coordinates of all foreground pixels, as (x, y) pairs
    ret = np.where(texteg)
    pt = np.array(list(zip(ret[1], ret[0])))
    # cluster the pixel coordinates into wn groups
    ap = cluster.KMeans(n_clusters=wn).fit(pt)
    words = [[]] * wn
    for k in range(wn):
        # boolean mask of the pixels assigned to cluster k
        labelpt = pt[np.where(ap.labels_ == k)[0]]
        tmp = np.zeros(texteg.shape)
        tmp[labelpt[:, 1], labelpt[:, 0]] = 1
        words[k] = tmp > 0

    return words
Example #3
def KMeans_image(img):
    s = img.shape
    img = img.reshape((img.shape[0] * img.shape[1], 3))
    # cluster the pixels into 5 colour groups
    y = cluster.KMeans(n_clusters=5).fit_predict(img)
    # one display colour per cluster label
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 0, 255)]
    for i, v in enumerate(y):
        img[i] = colors[v]

    img = img.reshape(s)

    return img
Example #4
    def fit(self,
            X,
            n_iter=10,
            min_covar=1e-3,
            thresh=1e-2,
            params='wmc',
            init_params='wmc'):

        X = np.asanyarray(X)

        if hasattr(self, 'n_features') and self.n_features != X.shape[1]:
            raise ValueError('Unexpected number of dimensions, got %s but '
                             'expected %s' % (X.shape[1], self.n_features))

        self.n_features = X.shape[1]

        if 'm' in init_params:
            # k= was the early scikit-learn name for n_clusters
            self._means = cluster.KMeans(
                n_clusters=self._n_states).fit(X).cluster_centers_
        elif not hasattr(self, 'means'):
            self._means = np.zeros((self.n_states, self.n_features))

        if 'w' in init_params or not hasattr(self, 'weights'):
            self.weights = np.tile(1.0 / self._n_states, self._n_states)

        if 'c' in init_params:
            cv = np.cov(X.T)
            if not cv.shape:
                cv.shape = (1, 1)
            self._covars = _distribute_covar_matrix_to_match_cvtype(
                cv, self._cvtype, self._n_states)
        elif not hasattr(self, 'covars'):
            self.covars = _distribute_covar_matrix_to_match_cvtype(
                np.eye(self.n_features), self.cvtype, self.n_states)

        logprob = []
        for i in range(n_iter):

            curr_logprob, posteriors = self.eval(X)
            logprob.append(curr_logprob.sum())

            if i > 0 and abs(logprob[-1] - logprob[-2]) < thresh:
                break

            self._do_mstep(X, posteriors, params, min_covar)

        return self
Example #5
In [82]: from sklearn import cluster, datasets

In [83]: iris = datasets.load_iris()

In [84]: k_means = cluster.KMeans(k=3)

In [85]: k_means.fit(iris.data) 
Out[85]: 
KMeans(copy_x=True, init='k-means++', k=3, max_iter=300, n_init=10, n_jobs=1,
    precompute_distances=True,
    random_state=<mtrand.RandomState object at 0x7f4d860642d0>, tol=0.0001,
    verbose=0)

In [86]: print k_means.labels_[::10]
[1 1 1 1 1 2 2 2 2 2 0 0 0 0 0]

In [87]: print iris.target[::10]
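
# Note: the session above uses an older scikit-learn API in which the number
# of clusters was passed as k. A minimal sketch of the same steps against the
# current API (where the parameter is n_clusters and print is a function):
from sklearn import cluster, datasets

iris = datasets.load_iris()
k_means = cluster.KMeans(n_clusters=3, n_init=10)
k_means.fit(iris.data)
print(k_means.labels_[::10])  # cluster label of every 10th sample
print(iris.target[::10])      # ground-truth classes, for comparison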
Example #6
def has_converged(mu, oldmu):
    return set([tuple(a) for a in mu]) == set([tuple(a) for a in oldmu])
 
def find_centers(X, K):
    # Initialize to K random centers
    oldmu = random.sample(list(X), K)
    mu = random.sample(list(X), K)
    while not has_converged(mu, oldmu):
        oldmu = mu
        # Assign all points in X to clusters
        # (cluster_points and reevaluate_centers are defined in the
        # datasciencelab post linked below)
        clusters = cluster_points(X, mu)
        # Reevaluate centers
        mu = reevaluate_centers(oldmu, clusters)
    return (mu, clusters)

import random
 
def init_board(N):
    X = np.array([(random.uniform(-1, 1), random.uniform(-1, 1)) for i in range(N)])
    return X

def init_board_gauss(N, k):
    n = float(N)/k
    X = []
    for i in range(k):
        c = (random.uniform(-1, 1), random.uniform(-1, 1))
        s = random.uniform(0.05,0.5)
        x = []
        while len(x) < n:
            a, b = np.array([np.random.normal(c[0], s), np.random.normal(c[1], s)])
            # Continue drawing points from the distribution in the range [-1,1]
            if abs(a) < 1 and abs(b) < 1:
                x.append([a,b])
        X.extend(x)
    X = np.array(X)[:N]
    return X
###################################
#	https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/
def Wk(mu, clusters):
    K = len(mu)
    return sum([np.linalg.norm(mu[i] - c)**2 / (2 * len(clusters[i]))
                for i in range(K) for c in clusters[i]])

def bounding_box(X):
    xmin, xmax = min(X, key=lambda a: a[0])[0], max(X, key=lambda a: a[0])[0]
    ymin, ymax = min(X, key=lambda a: a[1])[1], max(X, key=lambda a: a[1])[1]
    return (xmin, xmax), (ymin, ymax)
 
def gap_statistic(X):
    (xmin, xmax), (ymin, ymax) = bounding_box(X)
    # Dispersion for the real distribution
    B = 10  # number of uniform reference datasets
    ks = range(1, 10)
    Wks = np.zeros(len(ks))
    Wkbs = np.zeros(len(ks))
    sk = np.zeros(len(ks))
    for indk, k in enumerate(ks):
        mu, clusters = find_centers(X, k)
        Wks[indk] = np.log(Wk(mu, clusters))
        # Create B reference datasets
        BWkbs = np.zeros(B)
        for i in range(B):
            Xb = []
            for n in range(len(X)):
                Xb.append([random.uniform(xmin, xmax),
                           random.uniform(ymin, ymax)])
            Xb = np.array(Xb)
            mu, clusters = find_centers(Xb, k)
            BWkbs[i] = np.log(Wk(mu, clusters))
        Wkbs[indk] = sum(BWkbs) / B
        sk[indk] = np.sqrt(sum((BWkbs - Wkbs[indk])**2) / B)
    sk = sk * np.sqrt(1 + 1.0 / B)
    return (ks, Wks, Wkbs, sk)

X = init_board_gauss(200,3)
ks, logWks, logWkbs, sk = gap_statistic(X)
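
# For reference, the quantity estimated above is the gap statistic of
# Tibshirani, Walther & Hastie: Gap(k) = (1/B) * sum_b log(W*_kb) - log(W_k),
# where W_k is the within-cluster dispersion Wk() on the data and W*_kb the
# same quantity on the b-th uniform reference set; sk is the standard error
# of the reference log-dispersions. The paper picks the smallest k with
# Gap(k) >= Gap(k+1) - sk[k+1].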

#http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

######################################################################################################################
#http://stats.stackexchange.com/questions/90769/using-bic-to-estimate-the-number-of-k-in-kmeans
from sklearn import cluster
from scipy.spatial import distance
import sklearn.datasets
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

def compute_bic(kmeans, X):
    """Compute the BIC metric for a given clustering.

    kmeans : fitted scikit-learn KMeans object
    X      : multidimensional np array of data points
    Returns: BIC value
    """
    # assign centers and labels
    centers = [kmeans.cluster_centers_]
    labels = kmeans.labels_
    # number of clusters
    m = kmeans.n_clusters
    # size of the clusters
    n = np.bincount(labels)
    # size of data set
    N, d = X.shape
    # compute variance for all clusters beforehand
    cl_var = (1.0 / (N - m) / d) * sum(
        [sum(distance.cdist(X[np.where(labels == i)],
                            [centers[0][i]], 'euclidean')**2)
         for i in range(m)])
    const_term = 0.5 * m * np.log(N) * (d + 1)
    BIC = np.sum([n[i] * np.log(n[i]) - n[i] * np.log(N)
                  - ((n[i] * d) / 2) * np.log(2 * np.pi * cl_var)
                  - ((n[i] - 1) * d / 2) for i in range(m)]) - const_term
    return BIC
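
# The expression above is the closed-form BIC of a spherical Gaussian mixture
# with a pooled variance estimate cl_var:
#   BIC = sum_i [ n_i*log(n_i) - n_i*log(N) - (n_i*d/2)*log(2*pi*cl_var)
#                 - (n_i - 1)*d/2 ] - (m/2)*(d + 1)*log(N)
# with m clusters of sizes n_i, N points and d dimensions; under this
# convention a larger (less negative) value indicates a better model.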

# IRIS DATA
iris = sklearn.datasets.load_iris()
X = iris.data[:, :4]  # extract only the features
#Xs = StandardScaler().fit_transform(X)
Y = iris.target
ks = range(1, 10)
# run k-means for each k in ks and keep every fitted model
kmeans_models = [cluster.KMeans(n_clusters=i, init="k-means++").fit(X) for i in ks]
# now run the BIC computation for each clustering
BIC = [compute_bic(kmeansi, X) for kmeansi in kmeans_models]
print(BIC)

#[-901.8088330799194, -562.67814893720902, -442.4179569307467, -401.31661808222532, -373.70396994638168, -367.27568113462917, -369.13543294596866, -351.7636856213748, -360.97885983416268]

plt.plot(ks, BIC, 'r-o')
plt.title("iris data (cluster vs BIC)")
plt.xlabel("# clusters")
plt.ylabel("BIC")
plt.show()

#######################################################################################################################################
# https://www.linkedin.com/pulse/finding-k-k-means-clustering-jaganadh-gopinadhan
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()
k = range(1, 11)
clusters = [KMeans(n_clusters=c, init='k-means++').fit(iris.data) for c in k]
centr_lst = [cc.cluster_centers_ for cc in clusters]
k_distance = [cdist(iris.data, cent, 'euclidean') for cent in centr_lst]
clust_indx = [np.argmin(kd, axis=1) for kd in k_distance]
distances = [np.min(kd, axis=1) for kd in k_distance]
avg_within = [np.sum(dist) / iris.data.shape[0] for dist in distances]
with_in_sum_square = [np.sum(dist ** 2) for dist in distances]
to_sum_square = np.sum(pdist(iris.data) ** 2) / iris.data.shape[0]
bet_sum_square = to_sum_square - np.array(with_in_sum_square)

kidx = 2
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(k, avg_within, 'g*-')
ax.plot(k[kidx], avg_within[kidx], marker='o', markersize=12,
        markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Average within-cluster sum of squares')
plt.title('Elbow for KMeans clustering (IRIS Data)')
plt.show()
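
# bet_sum_square is computed above but never plotted; a common companion to
# the elbow curve (a sketch, not part of the linked article's listing) is the
# percentage of total variance explained by the between-cluster sum of squares:
fig2 = plt.figure()
ax2 = fig2.add_subplot(111)
ax2.plot(k, bet_sum_square / to_sum_square * 100, 'b*-')
ax2.set_ylim((0, 100))
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')
plt.title('Variance explained vs. k (IRIS Data)')
plt.show()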

########################################################################################################################################
#	http://stanford.edu/~cpiech/cs221/handouts/kmeans.html
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(dataSet, k):

    # Initialize centroids randomly
    numFeatures = dataSet.getNumFeatures()
    centroids = getRandomCentroids(numFeatures, k)

    # Initialize book-keeping vars.
    iterations = 0
    oldCentroids = None

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        oldCentroids = centroids
        iterations += 1
        # Assign labels to each datapoint based on centroids
        labels = getLabels(dataSet, centroids)
        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, labels, k)

    # We can get the labels too by calling getLabels(dataSet, centroids)
    return centroids
# Function: Should Stop
# -------------
# Returns True or False if k-means is done. K-means terminates either
# because it has run a maximum number of iterations OR the centroids
# stop changing.
def shouldStop(oldCentroids, centroids, iterations):
    if iterations > MAX_ITERATIONS:
        return True
    return oldCentroids == centroids
# Function: Get Labels
# -------------
# Returns a label for each piece of data in the dataset.
def getLabels(dataSet, centroids):
    # For each element in the dataset, choose the closest centroid.
    # Make that centroid the element's label.
    pass

# Function: Get Centroids
# -------------
# Returns k centroids, each of dimension n.
def getCentroids(dataSet, labels, k):
    # Each centroid is the geometric mean of the points that
    # have that centroid's label. Important: If a centroid is empty (no points
    # have that centroid's label) you should randomly re-initialize it.
    pass
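
# The handout leaves getRandomCentroids, getLabels and getCentroids as
# exercises. A minimal numpy sketch of the three helpers (an assumption, not
# the handout's code; it treats dataSet as a plain (N, n) array, so the
# random-centroid helper's signature differs from the call above):
import numpy as np

MAX_ITERATIONS = 100  # the handout references this bound but never sets it

def getRandomCentroidsNP(dataSet, k):
    # k distinct data points as initial centroids
    idx = np.random.choice(len(dataSet), size=k, replace=False)
    return dataSet[idx].copy()

def getLabelsNP(dataSet, centroids):
    # index of the nearest centroid for every point
    dists = np.linalg.norm(dataSet[:, None, :] - centroids[None, :, :], axis=2)
    return np.argmin(dists, axis=1)

def getCentroidsNP(dataSet, labels, k):
    # mean of each cluster; re-seed empty clusters with a random point
    centroids = np.empty((k, dataSet.shape[1]))
    for j in range(k):
        members = dataSet[labels == j]
        centroids[j] = (members.mean(axis=0) if len(members)
                        else dataSet[np.random.randint(len(dataSet))])
    return centroids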
#######################################################################################################################
#######################################################################################################################
##	https://gist.github.com/jaganadhg/ddbf0956a7921b83ceef90b8a81dfaee
"""
Author : Jaganadh Gopinadhan
Licence : Apahce 2
e-mail jaganadhg at gmail dot com 
"""
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

import pandas as pd


class TWHGapStat(object):
    """
    Implementation of Gap Statistic from Tibshirani, Walther, Hastie to determine the 
    inherent number of clusters in a dataset with k-means clustering.
    Ref Paper : https://web.stanford.edu/~hastie/Papers/gap.pdf
    """
    
    def generate_random_data(self, X):
        """
        Populate reference data.

        Parameters
        ----------
        X : Numpy Array
            The base data from which the random sample has to be generated

        Returns
        -------
        reference : Numpy Array
            Reference data generated using the Numpy random utility.
            The number of dimensions in the data is the same as in the
            base dataset.
        """
        reference = np.random.random_sample(size=(X.shape[0], X.shape[1]))
        return reference
    
    def _fit_cluster(self, X, n_cluster, n_iter=5):
        """
        Fit clusters on reference data and return the mean inertia.


        Parameters
        ----------
        X : numpy array
            The base data

        n_cluster : int
            The number of clusters to form

        n_iter : int, default = 5
            Number of iterative clustering experiments to perform on the
            data. If the data is large, keep it at 5 or less so that the
            run time stays low.

        Returns
        -------
        mean_inertia : float
            The mean inertia value over the n_iter runs.
        """
        iterations = range(1, n_iter + 1)

        ref_inertias = pd.Series(index=iterations, dtype=float)

        for iteration in iterations:
            clusterer = KMeans(n_clusters=n_cluster, n_init=3)
            # Older scikit-learn accepted n_jobs=-1 here; on Windows servers
            # using all cores tended to exhaust memory, so a value of
            # max cores - 3 was advised instead.
            clusterer.fit(X)
            ref_inertias[iteration] = clusterer.inertia_

        mean_inertia = ref_inertias.mean()

        return mean_inertia
    
    def fit(self, X, max_k):
        """
        Compute the Gap Statistic.
        Parameters
        ----------
        X : numpy array
            The base data
        max_k : int
            Maximum value up to which we test the 'k' in the k-means
            algorithm
        Returns
        -------
        gap_stat : Pandas Series
            For each k up to max_k the gap statistic value is returned as a
            Pandas Series. The index is k and the values are the gap
            statistics for each k.
        """

        k_range = range(1, max_k + 1)
        gap_stat = pd.Series(index=k_range, dtype=float)

        ref_data = self.generate_random_data(X)

        for k in k_range:
            base_clusterer = KMeans(n_clusters=k, n_init=3)
            base_clusterer.fit(X)

            ref_inertia = self._fit_cluster(ref_data, k)

            # gap = E[log(W_ref)] - log(W_data)
            cur_gap = np.log(ref_inertia) - np.log(base_clusterer.inertia_)

            gap_stat[k] = cur_gap

        return gap_stat

if __name__ == "__main__":
    iris = load_iris()
    X = iris.data 
    
    gap_stat = TWHGapStat()
    gs = gap_stat.fit(X,5)
    print gs
	
Example #7
import pandas as pd
import matplotlib.pyplot as plt
# a local k-means module, not sklearn: its KMeans takes shift_tolerance and
# thread_capacity, and exposes inertia without the trailing underscore
import cluster as clt

import timeit

start = timeit.default_timer()

dataset = pd.read_csv('/home/neo/Desktop/kmeans/dataset.csv')
dataset = dataset.values

wcss = []
for i in range(1, 10):
    kmeans = clt.KMeans(n_clusters=i, shift_tolerance=0.02, thread_capacity=4)
    kmeans.fit(dataset)
    wcss.append(kmeans.inertia)
plt.plot(range(1, 10), wcss)
plt.show()

kmeans = clt.KMeans(n_clusters=2, shift_tolerance=0.005, thread_capacity=4)
kmeans.fit_showDetails(dataset)

plt.scatter([x[0] for x in kmeans.cluster[0]],
            [x[1] for x in kmeans.cluster[0]],
            s=2,
            color='blue')
plt.scatter([x[0] for x in kmeans.cluster[1]],
            [x[1] for x in kmeans.cluster[1]],
            s=2,
            color='red')
# kmeans.cluster[2] exists only if the fit above used n_clusters=3; with
# n_clusters=2 this third scatter (color assumed) would raise an IndexError.
plt.scatter([x[0] for x in kmeans.cluster[2]],
            [x[1] for x in kmeans.cluster[2]],
            s=2,
            color='green')
plt.show()
Example #8
def sort_split_word(texteg, wn):
    total_area = texteg.sum() * 1.0

    # connected components; connectivity=2 is 8-connectivity
    # (older skimage spelled this neighbors=8)
    labels = measure.label(texteg, connectivity=2)
    rgps = measure.regionprops(labels)

    sort_lab = np.zeros((len(rgps), 2))
    for i in range(len(rgps)):
        lab = rgps[i]
        rt = lab.area / total_area
        sort_lab[i] = [rt, lab.label]

    # component indices, largest relative area first
    label_sort_area = np.argsort(-sort_lab[:, 0])

    words = []

    bwhole = 0

    for i in range(len(label_sort_area)):
        ilabel = int(sort_lab[label_sort_area[i]][1])
        hword = (labels == ilabel)
        if i == 0:
            word = labels == ilabel
            rect = getMaxRect(word)
            words.extend([word])

            if rect[3] - rect[2] > 18:
                # whole body: don't break it
                bwhole = 1
                print('whole body')
                if rect[3] - rect[2] > 30 and sort_lab[label_sort_area[i]][0] > 0.85:
                    # complete whole body
                    bwhole = 2

            continue
        match = []
        for k in range(len(words)):
            tpwd = words[k]
            binword, dis = is_maybe_inword(ilabel, labels, tpwd, bwhole)
            if binword == 0 and len(words) < wn and bwhole < 2:
                word = labels == ilabel
                words.extend([word])
                break
            elif binword == 1 and dis == 0:
                words[k] = tpwd + (labels == ilabel)
                break
            elif binword == 1 and dis > 0:
                match.extend([[dis, k]])

        match = np.array(match)

        if len(match) == 0:
            continue
        if len(match) == 1:
            mk = int(match[0][1])
            words[mk] = words[mk] + (labels == ilabel)
        else:
            mk = np.argsort(match[:, 0])[0]
            words[mk] = words[mk] + (labels == ilabel)
    
    if bwhole > 0:
        if len(words) == 1:
            wholeword = words[0]
        else:
            wholeword = words[0] + words[1]
        # fall back to k-means on the foreground pixel coordinates
        ret = np.where(wholeword)
        pt = np.array(list(zip(ret[1], ret[0])))
        ap = cluster.KMeans(n_clusters=wn).fit(pt)
        words = [[]] * wn
        for k in range(wn):
            labelpt = pt[np.where(ap.labels_ == k)[0]]
            tmp = np.zeros(wholeword.shape)
            tmp[labelpt[:, 1], labelpt[:, 0]] = 1
            words[k] = tmp > 0

    return words