Example #1
from sklearn.cluster import KMeans

def cluster_images(images, cluster_count):
    """
    Cluster images into specified number of clusters using k-means
    """
    shapes = {image.data.shape for image in images}
    if len(shapes) > 1:
        raise ValueError("Images should have the same dimensions")
    image_shape = shapes.pop()

    # Do clustering
    vectors = [
        image.to_vector()
        for image in images
    ]
    kmeans = KMeans(n_clusters=cluster_count)
    kmeans.fit(vectors)
    centroids = kmeans.cluster_centers_

    # Return Image instances
    clustered_images = [
        Image(centroid.reshape(image_shape))
        for centroid in centroids
    ]
    return clustered_images
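A minimal usage sketch, assuming a hypothetical Image wrapper with a .data array and a .to_vector() method (no such class is shown in the excerpt):

import numpy as np

class Image:
    # Hypothetical stand-in for the Image class assumed by cluster_images.
    def __init__(self, data):
        self.data = np.asarray(data)

    def to_vector(self):
        # Flatten the pixel grid into one feature vector.
        return self.data.ravel()

images = [Image(np.random.rand(8, 8)) for _ in range(10)]
representatives = cluster_images(images, cluster_count=3)
print(len(representatives), representatives[0].data.shape)  # 3 (8, 8)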
Example #2
from sklearn.cluster import KMeans

def k_means(data, n_clusters=5):
    '''
    sklearn-based KMeans method.
        Inputs:
            - data: training data set containing the events to be processed (matrix [m, n])
            - n_clusters: number of clusters to be used
        Outputs: (Z, centroids, kmeans) - the cluster assigned to each event in the
            training data set (array [m]), the centroids or mean values
            (matrix [n_clusters, n]), and the fitted KMeans object
    '''
    kmeans = KMeans(init='k-means++', n_clusters=n_clusters, n_init=20)
    # testing kmedians and fuzzy kmeans...
    #kmeans = KMedians(k=n_clusters)
    # kmeans = FuzzyKMeans(k=7, m=2)
    kmeans.fit(data)

    # Obtain the cluster label assigned to each training event.
    #Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    #Z = kmeans.predict(data)
    Z = kmeans.labels_
    # obtain centroids
    centroids = kmeans.cluster_centers_
    inertia = kmeans.inertia_
    print('Sum of distances of events to their closest cluster center: ' +
          str(inertia))

    return Z, centroids, kmeans
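A quick smoke test on synthetic blobs (the dataset parameters here are illustrative, not from the original project):

from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=300, centers=5, n_features=4, random_state=0)
Z, centroids, model = k_means(data, n_clusters=5)
print(Z.shape, centroids.shape)  # (300,) (5, 4)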
Example #3
File: fca.py Project: hunanhd/vns
def fca(data,n_clusters=3):
    # st = time.time()
    # Ward was removed from scikit-learn in 0.17 (see the sketch after this example)
    ward = Ward(n_clusters=n_clusters).fit(data)
    label = ward.labels_
    # print("Elapsed time: ", time.time() - st)
    # print("Number of points: ", label.size)
    #print label

    centroids = []  # indices of each cluster's farthest-from-center member
    #reduced_data = PCA(n_components=).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=1, n_init=10)
    for i in range(n_clusters):
        # Gather the members (and original indices) of cluster i.
        c_data = []
        c_index = []
        for j in range(label.size):
            if label[j] == i:
                c_data.append(data[j])
                c_index.append(j)

        # A 1-cluster KMeans fit yields the mean of the cluster members.
        kmeans.fit(c_data)
        centroid = kmeans.cluster_centers_[0]

        # Pick the member farthest from the centroid (largest squared distance).
        c_dist = []
        for xdata in c_data:
            c_dist.append(sum((x - y) ** 2 for x, y in zip(xdata, centroid)))
        c_i = c_dist.index(max(c_dist))
        centroids.append(c_index[c_i])
    return centroids
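The Ward class used above was removed from scikit-learn in 0.17; a rough modern equivalent of fca, under the same assumptions about data, swaps in AgglomerativeClustering and vectorizes the per-cluster bookkeeping:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

def fca_modern(data, n_clusters=3):
    data = np.asarray(data)
    labels = AgglomerativeClustering(n_clusters=n_clusters,
                                     linkage='ward').fit(data).labels_
    picks = []
    for i in range(n_clusters):
        idx = np.where(labels == i)[0]
        centroid = data[idx].mean(axis=0)  # same as a 1-cluster KMeans fit
        dists = ((data[idx] - centroid) ** 2).sum(axis=1)
        # Mirror the original: pick the member *farthest* from the centroid.
        picks.append(int(idx[dists.argmax()]))
    return picks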
Example #4
from sklearn.cluster import KMeans

def calculate_wcss(data):
    wcss = []
    for n in range(2, 21):
        kmeans = KMeans(n_clusters=n)
        kmeans.fit(X=data)
        wcss.append(kmeans.inertia_)
    return wcss
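One way to turn the returned curve into a choice of k (a "knee" heuristic, not part of the original code): take the k whose WCSS point lies farthest from the straight line joining the first and last points of the elbow curve.

import numpy as np

def optimal_k_from_wcss(wcss, k_min=2):
    wcss = np.asarray(wcss, dtype=float)
    ks = np.arange(k_min, k_min + len(wcss))
    p1 = np.array([ks[0], wcss[0]])
    p2 = np.array([ks[-1], wcss[-1]])
    d = p2 - p1
    # Perpendicular distance of each (k, wcss) point to the chord p1 -> p2.
    dists = np.abs(d[1] * (ks - p1[0]) - d[0] * (wcss - p1[1])) / np.hypot(d[0], d[1])
    return int(ks[dists.argmax()])

# e.g. k = optimal_k_from_wcss(calculate_wcss(data))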
Example #5
import numpy as np
import pylab as pl
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

def visualize_clusters(data, target, problem, k):
    '''
    pca = PCA(n_components=2).fit(data)
    pca_2d = pca.transform(data)
    # now visualize classified data in new projected space
    pl.figure('Reference Plot ' + problem)
    pl.scatter(pca_2d[:, 0], pca_2d[:, 1], c=['black'])
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(data)
    pl.figure('K-means with 2 clusters ' + problem)
    pl.scatter(pca_2d[:, 0], pca_2d[:, 1], c=['navy', 'darkorange', 'green'], alpha=0.4)
    pl.legend()
    pl.show()
    '''

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure(1)
    pl.clf()
    pl.imshow(Z,
              interpolation='nearest',
              extent=(xx.min(), xx.max(), yy.min(), yy.max()),
              cmap=pl.cm.Paired,
              aspect='auto',
              origin='lower')

    pl.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    pl.scatter(centroids[:, 0],
               centroids[:, 1],
               marker='x',
               s=169,
               linewidths=3,
               color='w',
               zorder=10)
    pl.title('K-means clustering on the ' + problem +
             ' dataset (PCA-reduced data)\n'
             'Centroids are marked with white cross')
    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)
    pl.xticks(())
    pl.yticks(())
    pl.show()
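A usage sketch on a standard dataset (iris is just an illustrative choice):

from sklearn.datasets import load_iris

iris = load_iris()
visualize_clusters(iris.data, iris.target, problem='iris', k=3)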
Example #6
File: bow.py Project: PierreHao/QScode
    def MiniBatchKMeans(self, X, batch=10000):
        # Note: this method shadows sklearn's MiniBatchKMeans class by name;
        # the constructor call below resolves to the class imported at module level.
        print("in fit method", X.shape, self.k)
        kmeans = MiniBatchKMeans(init='k-means++', n_clusters=self.k, batch_size=batch)
        kmeans.fit(X)
        centers = kmeans.cluster_centers_
        clusters = kmeans.labels_
        print("shape of centers is ", centers.shape)
        return centers
Example #7
    def calcKMeans(self, data):
        if len(data) == 0: return None
        kmeans = KMeans(n_clusters=self.samplesize)
        labels = {}  # per-key cluster label assignments (not centroids)
        for key, val in data.items():
            kmeans.fit(np.squeeze(val))
            labels[key] = kmeans.labels_

        return labels
Example #8
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def cluster(x, y, n):
    kmeans = KMeans(n_clusters=n)
    kmeans.fit(x)
    y_kmeans = kmeans.predict(x)
    # Plot the first two feature columns of x, colored by cluster assignment.
    plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap='viridis')
    plt.show()
    centers = kmeans.cluster_centers_
    print(centers)
    plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
    plt.show()
Example #9
    def calcKMeans(self, data):
        if len(data) == 0: return None
        kmeans = KMeans(n_clusters=self.samplesize)
        histData = {}
        for key, val in data.items():
            kmeans.fit(np.squeeze(val))

            # the histogram of the data
            cnts, _ = np.histogram(kmeans.labels_, self.samplesize)
            histData[key] = cnts

        return histData
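The histogram step above counts how many samples land in each cluster; for integer labels, np.bincount counts each label id exactly, which np.histogram only approximates when its bin edges do not align with the label values. A small standalone check:

import numpy as np

labels = np.array([0, 2, 2, 1, 0, 2])
counts = np.bincount(labels, minlength=3)
print(counts)  # [2 1 3] - occupancy of clusters 0, 1, 2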
Example #10
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def clusteringKMeans(XVal):
    kmeans = KMeans(n_clusters=4, random_state=0)
    kmeans.fit(XVal)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    colors = ["g.", "r.", "y.", "c.", "b."]

    for i in range(len(XVal)):
        plt.plot(XVal[i][0], XVal[i][1], colors[labels[i]], markersize=10)
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker="x",
                s=150,
                linewidth=5,
                zorder=10)
    return kmeans, plt, labels
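A usage sketch with synthetic 2-D data (make_blobs is illustrative; the original XVal is not shown):

from sklearn.datasets import make_blobs

XVal, _ = make_blobs(n_samples=200, centers=4, random_state=0)
model, plot, labels = clusteringKMeans(XVal)
plot.show()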
Example #11
plt.show()

# Note: np.array([x, y, z, a]) stacks the four variables as rows (four samples);
# if they are per-point feature arrays, np.column_stack((x, y, z, a)) may be intended.
X = np.array([x, y, z, a])

kmeans = KMeans(n_clusters=4)
kmeans.fit(X)

# Centroid coordinates of the clusters
centroids = kmeans.cluster_centers_

labels = kmeans.labels_
Example #12
# For each K, compare the difference in error between current K and K-1 vs
# K and K+1 to determine where the most significant improvement in error rate is
    k, ratio = 1, 0
    for i in range(1, len(avgWithinSS[name]) - 1):
        diff = avgWithinSS[name][i - 1] - avgWithinSS[name][i]
        diff2 = avgWithinSS[name][i] - avgWithinSS[name][i + 1]
        ratio2 = diff - diff2
        if ratio2 > ratio:
            k = i
            ratio = ratio2

# k-means clustering by PC9 volume
# Re-Run K-Means clustering algorithm for the specific K value as determined by the Elbow Test
    list_k = [i for i in range(k)]

    kmeans = KMeans(n_clusters=k)
    kmeans.fit(X)

# Plot the results of the K-Means algorithm
    mglearn.discrete_scatter(X[:, 0], X[:, 1], kmeans.labels_, markers='o')
    mglearn.discrete_scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], list_k, 
        markers='^', markeredgewidth=2)

# Store the results of the algorithm back within the original data set
    PC9_Shipment_Qty['PC9_Vol_Cluster_Unsorted'] = kmeans.labels_ 

# Order Volume Clusters by Avg. Shipment Vol to order cluster from Smallest to Largest
    Volume_Cluster_Definitions = PC9_Shipment_Qty.groupby(['PC9','PC9_Vol_Cluster_Unsorted']).sum().groupby('PC9_Vol_Cluster_Unsorted').mean()
    Volume_Cluster_Definitions = Volume_Cluster_Definitions.sort_values(by=['PC9_Shipped_Qty'])
    Volume_Cluster_Definitions['Unsorted_Cluster'] = Volume_Cluster_Definitions.index.values
    Sorted_Grouping_List = [i for i in range(len(Volume_Cluster_Definitions.index))]
    Volume_Cluster_Definitions['Sorted_Cluster'] = Sorted_Grouping_List
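The remaining step, relabeling each row with its size-ordered cluster id, can be finished with a dictionary lookup; a sketch assuming the frames above (the PC9_Vol_Cluster column name is hypothetical):

    # Map each unsorted cluster id to its rank by average shipped quantity.
    sorted_map = dict(zip(Volume_Cluster_Definitions['Unsorted_Cluster'],
                          Volume_Cluster_Definitions['Sorted_Cluster']))
    PC9_Shipment_Qty['PC9_Vol_Cluster'] = (
        PC9_Shipment_Qty['PC9_Vol_Cluster_Unsorted'].map(sorted_map))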
Example #13
import sys
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list, optimal_leaf_ordering
import matplotlib.pyplot as plt
import pandas as pd
plt.style.use('ggplot')
from sklearn.cluster import KMeans
from scipy.cluster.vq import kmeans,vq

df=pd.read_table(sys.argv[1], sep = "\t", header = 0, index_col = 0).loc[:, ("CFU", "poly")]
array = df.values
col_names = df.columns.values.tolist()
#print(df)
Z = linkage(array, 'ward')
# Note: the next line rebinds the name 'kmeans', shadowing scipy's kmeans()
# imported above, and the fit runs on the rows of the linkage matrix Z rather
# than on the original measurements (see the sketch after this example).
kmeans = KMeans(n_clusters=4)
kmeans.fit(Z)
y_means = kmeans.predict(Z)
fig, ax = plt.subplots()
plt.scatter(Z[:, 0], Z[:, 1], c=y_means, s=50, cmap="viridis")
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c="black", s=200, alpha=0.5)
fig.savefig("kmeans.png")
plt.close(fig)



#kmeans = scipy.cluster.vq.kmeans(Z, 2)
#centroids, _ = kmeans(Z, 2)
#idx, _ = vq(Z, centroids)
#plot(data[idx==0,0], data[idx==0,1], "ob",
#     data[idx==1,0], data[idx==1,1], "or")
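The snippet above runs k-means on the rows of the Ward linkage matrix Z, which is unusual; a sketch of the more conventional route, clustering the (CFU, poly) measurements directly, under the same assumed input format:

import sys
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

df = pd.read_table(sys.argv[1], sep="\t", header=0, index_col=0).loc[:, ("CFU", "poly")]
X = df.values
km = KMeans(n_clusters=4).fit(X)
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=km.labels_, s=50, cmap="viridis")
ax.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
           c="black", s=200, alpha=0.5)
fig.savefig("kmeans_raw.png")
plt.close(fig)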
Example #14
    print('-' * 80)
    print("Benchmarking with several k values: ")
    rang = 10
    for k in range(max(2, num_clusters - rang),
                   min(len(instance_names) - 1, num_clusters + rang)):
        bench_k_means(KMeans(init='k-means++', n_clusters=k, n_init=10),
                      name="k-means++ (k=" + str(k) + ")",
                      data=features_data)
    print('-' * 80)
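bench_k_means is not defined in this excerpt; a minimal hypothetical stand-in, consistent with how it is called above, might look like:

from time import time
from sklearn import metrics

def bench_k_means(estimator, name, data):
    # Hypothetical stand-in: fit the estimator and report basic fit statistics.
    t0 = time()
    estimator.fit(data)
    print('%-25s time=%.2fs inertia=%.0f silhouette=%.3f'
          % (name, time() - t0, estimator.inertia_,
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean')))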

    # Prepare the data for visualization in a 2D plot.
    # (np.random.randint(1) always returns 0, so this PCA is deterministic.)
    reduced_data = PCA(
        n_components=2,
        random_state=np.random.randint(1)).fit_transform(features_data)
    kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .005  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # execute: python3 cluster-features-alberto.py seed file n_clusters
    # Plot the decision boundary. For that, we will assign a color to each point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot