Example #1
def compute_log_inertia(X, n_clusters, T, bb_min, bb_max, random_state=0):
    """Compute the log inertia of X and X_t.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        List of n_features-dimensional data points. Each row corresponds
        to a single data point.

    n_clusters: int
        The desired number of clusters.

    T: int
        Number of samples in each uniform reference draw X_t.

    bb_min: array, shape (n_features,)
        Inferior corner of the bounding box of X.

    bb_max: array, shape (n_features,)
        Superior corner of the bounding box of X.

    random_state: int, defaults to 0.
        Seed for the random number generator.

    Returns
    -------
    log_inertia: float
        Log of the inertia of the K-means applied to X.

    mean_log_inertia_rand: float
        Mean of the log of the inertia of the K-means applied to the different
        X_t.

    std_log_inertia_rand: float
        Standard deviation of the log of the inertia of the K-means applied to
        the different X_t.
    """
    nb_experiments = 100

    log_inertia = np.log(kmeans(X, n_clusters, show=False)[2])
    log_inertias_rand = []
    np.random.seed(random_state)
    for _ in range(nb_experiments):
        # draw a uniform reference data set inside the bounding box of X
        Xt = np.random.uniform(bb_min, bb_max, size=(T, len(bb_min)))
        log_inertias_rand.append(np.log(kmeans(Xt, n_clusters, show=False)[2]))
    mean_log_inertia_rand = np.mean(log_inertias_rand)
    std_log_inertia_rand = np.std(log_inertias_rand)

    return log_inertia, mean_log_inertia_rand, std_log_inertia_rand
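
A minimal sketch of how compute_log_inertia could drive a gap-statistic style choice of k. It assumes the same kmeans helper as above (inertia at index 2 of its return value) and numpy imported as np; pick_k itself is hypothetical.

def pick_k(X, k_range=range(2, 11), random_state=0):
    # uniform reference sets are drawn inside the bounding box of X
    bb_min, bb_max = X.min(axis=0), X.max(axis=0)
    gaps = []
    for k in k_range:
        log_i, mean_rand, _ = compute_log_inertia(
            X, k, len(X), bb_min, bb_max, random_state)
        # gap = expected log inertia under the uniform null minus the observed one
        gaps.append(mean_rand - log_i)
    return list(k_range)[int(np.argmax(gaps))]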
Example #2
    def test_kmeans(self):
        X = create_clusters([(20, 30), (20, 60), (30, 45), (40, 60)], 30, 8)
        dist_label, centroids = kmeans(X, 4)
        icons = ['b_', 'b.', 'bo', 'b+', 'b*']

        for idx, l in enumerate(dist_label):
            plt.plot(X[idx, 0], X[idx, 1], icons[int(l[1])])
Example #3
def main():
  points = [np.random.rand(2) for _ in range(50)]
  results = kmeans(points, 5)
  animations = [draw_points([r[0] for r in result],
                            [r[1] for r in result])
                for result in results]
  write_animation("kmeans", animations)
Example #4
def plot_color_clusters(colors, frequencies):
    centroids, clusters = kmeans(8, colors, frequencies, 1)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    r, g, b = zip(*centroids)
    centroid_colors = [norm_rgb(color) for color in centroids]
    ax.scatter(r, g, b, c=centroid_colors, s=100)
    for color, cluster in zip(centroid_colors, clusters):
        r, g, b = zip(*cluster)
        ax.scatter(r, g, b, c=color, s=10)
    ax.set_xlabel('R')
    ax.xaxis.label.set_color('red')
    ax.set_ylabel('G')
    ax.yaxis.label.set_color('green')
    ax.set_zlabel('B')
    ax.zaxis.label.set_color('blue')
    ax.set_xlim(0, 256)
    ax.set_xticks(range(0, 257, 32))
    ax.tick_params(axis='x', colors='red')
    ax.set_ylim(0, 256)
    ax.set_yticks(range(0, 257, 32))
    ax.tick_params(axis='y', colors='green')
    ax.set_zlim(0, 256)
    ax.set_zticks(range(0, 257, 32))
    ax.tick_params(axis='z', colors='blue')
    return fig, ax
Example #6
def main():
    # load url content if already downloaded,
    # otherwise use scraper to scrape contents on the fly.
    if os.path.isfile(webfilename) and reload:
        with open(webfilename, 'rb') as file:
            rawtexts = pickle.load(file)
    else:
        rawtexts = []
        for url in urls:
            rawtexts += [scrape_website(url)]

        with open(webfilename, "wb") as file:
            pickle.dump(rawtexts, file)

    # convert each raw text to a vector; each feature is hashed.
    text_vectors = []
    for text in rawtexts:
        features = text_to_words(text)
        text_vectors += [words_to_vector(features, ndim=N)]

    #print(text_vectors)

    # apply k means algorithm.
    clusters, labels = kmeans(text_vectors, 3)

    # print(labels)
    # Now show content from which url belongs to which cluster
    for clusterindex in labels:
        print("cluster:" + str(clusterindex) + "\n")
        for urlindex in labels[clusterindex]:
            print("\t" + urls[urlindex])
Example #7
File: main.py  Project: romickid/kmeans
def runKmeans(arrayP, arrayPclusters,
              arrayC, arrayCsum, arrayCnumpoint):

    # start timing
    start = time()

    for i in range(REPEAT):
        # initialize the centroids with the first k points
        for i1 in range(NUMBER_OF_CENTROIDS):
            arrayC[i1, 0] = arrayP[i1, 0]
            arrayC[i1, 1] = arrayP[i1, 1]

        arrayC, arrayCsum, arrayCnumpoint = kmeans(
            arrayP, arrayPclusters,
            arrayC, arrayCsum, arrayCnumpoint,
            NUMBER_OF_POINTS, NUMBER_OF_CENTROIDS
        )

        if i + 1 == REPEAT:
            printCentroid(arrayC, arrayCsum, arrayCnumpoint)

    # stop timing
    end = time()
    total = (end - start) * 1000 / REPEAT

    print("Iterations: {:d}".format(ITERATIONS))
    print("Average Time: {:.4f} ms".format(total))
Example #8
def make_kmeans():
    with open("./data/train.txt", 'r') as f:
        anchor_txt = open("anchor.txt", 'w')
        bo_list = []
        data_lines = f.readlines()
        for line in data_lines:
            line = line.strip('\n')
            line = line.split(',')
            _boxes = np.array([float(x) for x in line[1:]])
            # _boxes = np.array(list(map(float, strs[1:])))
            index_box = len(_boxes) // 5
            boxes = np.split(_boxes, index_box)
            for bo in boxes:
                w, h = bo[2], bo[3]
                bo_list.append([w, h])

        data_np = np.array(bo_list)
        out = kmeans(data_np, 9)
        area_data = out[:, 0] * out[:, 1]
        data = out[np.argsort(area_data)]
        data_list1 = [str(i) for k in data[:3] for i in k]
        data_list2 = [str(i) for k in data[3:6] for i in k]
        data_list3 = [str(i) for k in data[6:9] for i in k]
        list1_str = ','.join(data_list1)
        list2_str = ','.join(data_list2)
        list3_str = ','.join(data_list3)
        anchor_txt.write(list3_str + '\n' + list2_str + '\n' + list1_str)
        anchor_txt.close()
        print(out)
        print("Accuracy: {:.2f}%".format(avg_iou(data_np, out) * 100))
Example #9
def main(fileName, k):

    sourceData = readData.readData(fileName)

    result1 = kmeans(sourceData, k)
    result2 = kmeansPlusPlus(sourceData, k)

    return result1, result2
Example #10
def main():
    bdd = Bdd()
    bdd.connect("e13")
    bdd.change_default_timeout(3600)
    #handle_file.file_insertion_handler(Bdd)
    x = kmeans(bdd, "facts", "end", 5)
    print(x)
    answer(bdd)
    bdd.disconnect()
Example #11
File: iris.py  Project: chmcewan/Numnum
def iris(k=3):
    data = pd.read_csv("test/iris.dat")
    (means,clusts, err) = kmeans( data.loc[:, "sepal_length":"petal_width"].values, k)

    f = mp.figure(1)
    for c in data["class"].unique():
        points = data[ data["class"] == c ]
        mp.plot( points["sepal_length"], points["sepal_width"], "o", color=np.random.random(3))
    mp.plot( means[:,0], means[:,1], "ko", markersize=7)
    mp.show()
Example #12
def compress(image_path, n_color=2, n_iterations=10, n_images=3, err_tol=100):
    image = cv2.imread(image_path)
    height = image.shape[0]
    width = image.shape[1]
    image = image.reshape(height * width, RGB_SIZE)

    calculated_image = np.ndarray(image.shape)

    (centroids, clusters) = kmeans(n_color, image, n_iters=int(n_iterations))
    for key, value in clusters.items():
        calculated_image[key] = centroids[value]
    return calculated_image.reshape(height, width, RGB_SIZE)
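
Hypothetical usage of compress, assuming cv2 and the RGB_SIZE constant from the surrounding module; the float output array is cast back to uint8 before writing:

import cv2

compressed = compress("bird.png", n_color=16, n_iterations=10)
cv2.imwrite("bird_16colors.png", compressed.astype("uint8"))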
Example #13
def run(self):
    self.read_input()
    algo = self.options['algo']
    params = self.options['params']
    if algo == 'kmeans':
        model = kmeans(self.doc, params)
    elif algo == 'dbscan':
        model = dbscan(self.doc, params)
    elif algo == 'agglo':
        model = agglo(self.doc, params)
    elif algo == 'minib':
        model = minib(self.doc, params)
    else:
        raise ValueError("unknown algorithm: " + algo)
    model.evaluate()
Example #14
def iris(k=3):
    data = pd.read_csv("test/iris.dat")
    (means, clusts,
     err) = kmeans(data.loc[:, "sepal_length":"petal_width"].values, k)

    f = mp.figure(1)
    for c in data["class"].unique():
        points = data[data["class"] == c]
        mp.plot(points["sepal_length"],
                points["sepal_width"],
                "o",
                color=np.random.random(3))
    mp.plot(means[:, 0], means[:, 1], "ko", markersize=7)
    mp.show()
Example #15
def main():

    #Kmeans
    kmeans_machine = kmeans(sample, k)
    kmeans_machine.train()

    #KNN
    knn_machine = knn(test, sample, target, k)
    print knn_machine.train()

    #Decision tree

    #SVM
    svm_run(Dataset, Trainset)
Example #16
def restart_kmeans(data, k, times=10):
    minimum_objective = 0
    first = True
    for _ in range(times):
        clusters = initialize_cluster_centers(data, k)
        cluster_centers, objective = kmeans(data, clusters)
        # keep the lowest objective seen across the random restarts
        if first or objective < minimum_objective:
            minimum_objective = objective
            first = False
    print(
        f"Best objective of k value: {k} for {times} times: {minimum_objective}"
    )
    return minimum_objective
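
A small hypothetical driver for restart_kmeans: scan a range of k and keep each best objective for an elbow plot (data, initialize_cluster_centers and kmeans are assumed from the same project):

objectives = {k: restart_kmeans(data, k, times=10) for k in range(2, 9)}
# plot k against objectives[k] and look for the elbow; the objective alone
# keeps decreasing as k grows, so its minimum is not a good k by itself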
Example #17
    def test_success_valid_learning(self):
        K = 3
        kmeans_instance = KMeans(k = K)
        samples = np.array([[77.3,13.0,9.7,1.5,6.4],
                            [82.5,10.0,7.5,1.5,6.5],
                            [66.9,20.6,12.5,2.3,7.0],
                            [47.2,33.8,19.0,2.8,5.8],
                            [65.3,20.5,14.2,1.9,6.9],
                            [83.3,10.0,6.7,2.2,7.0],
                            [81.6,12.7,5.7,2.9,6.7],
                            [47.8,36.5,15.7,2.3,7.2], 
                            [48.6,37.1,14.3,2.1,7.2],
                            [61.6,25.5,12.9,1.9,7.3],
                            [58.6,26.5,14.9,2.4,6.7],
                            [69.3,22.3,8.4,4.0,7.0],
                            [61.8,30.8,7.4,2.7,6.4],
                            [67.7,25.3,7.0,4.8,7.3],
                            [57.2,31.2,11.6,2.4,6.5],
                            [67.2,22.7,10.1,3.3,6.2],
                            [59.2,31.2,9.6,2.4,6.0],
                            [80.2,13.2,6.6,2.0,5.8],
                            [82.2,11.1,6.7,2.2,7.2],
                            [69.7,20.7,9.6,3.1,5.9]])
        init_centroid = [np.array([82.5,10.0,7.5,1.5,6.5]),
                         np.array([47.8,36.5,15.7,2.3,7.2]),
                         np.array([67.2,22.7,10.1,3.3,6.2])]
        
        kmeans_instance.set_samples(samples)
        kmeans_instance.set_centroids(init_centroid)
        ok_(kmeans_instance.learn())

        # reference assignment computed with the library kmeans function
        book = np.array(([82.5,10.0,7.5,1.5,6.5], 
                         [47.8,36.5,15.7,2.3,7.2], 
                         [67.2,22.7,10.1,3.3,6.2]))
        np_kmeans_results = kmeans(samples, book)
        answer = np.array([
            np.argmin(np.sum((d - np_kmeans_results[0]) ** 2, axis=1))
            for d in samples])
        answer_each_size = [np.sum(answer == k) for k in range(K)]

        actual_each_size = kmeans_instance.get_each_cluster_size()
        for i, v in enumerate(answer_each_size):
            eq_(v, actual_each_size[i])
        
        actual_assign = kmeans_instance.get_assign_list()
        matched_results = (answer == actual_assign)
        ok_(np.all(matched_results))
Example #18
def improve_clast_by_disp(X, disp, matExpend, disp_eps, eps):
    currDisp = np.max(X - matExpend)
    if abs(currDisp - disp) < disp_eps:  # dispersion change below disp_eps: stop splitting this branch
        return None
    if (len(X) <= 1):
        return X
    clasters = kmeans(X, 2, eps)
    labels = assign_clusters(X, clasters)
    new_clusters_list = []
    for i in range(clasters.shape[0]):
        ret = improve_clast_by_disp(X[np.where(labels == i)],
                                    currDisp, clasters[i],
                                    disp_eps, eps)
        if (ret is None):
            new_clusters_list.append(clasters[i].reshape(1, X.shape[1]))
        else:
            new_clusters_list.append(ret)
    new_clusters = np.concatenate(new_clusters_list)
    return new_clusters
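
A hypothetical top-level call for improve_clast_by_disp: start from the global centroid and pass an infinite previous dispersion so the first split is never rejected (X and the helper names follow the snippet above):

import numpy as np

initial_center = X.mean(axis=0).reshape(1, -1)
clusters = improve_clast_by_disp(X, np.inf, initial_center, disp_eps=1e-3, eps=1e-4)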
Example #19
def plot_best():
    best = {
        "clustering1": 2,
        "clustering2": 3,
        "clustering3": 4,
        "clustering4": 5,
    }
    for key, value in best.items():
        data = eval(key)
        clusters = initialize_cluster_centers(data, value)
        cluster_centers, objective = kmeans(data, clusters)
        labels = assign_clusters(data, cluster_centers)
        plt.scatter([x[0] for x in data], [x[1] for x in data], c=labels)
        plt.scatter([x[0] for x in cluster_centers],
                    [x[1] for x in cluster_centers],
                    c='r',
                    marker='P')
        plt.savefig('report/kmeans-' + key + '-' + str(value) + '.png',
                    bbox_inches='tight')
        plt.clf()
Example #20
        j = j + 1
        if j > printEntries:
            print("\nVocab size = " + str(len(words)) + "\n")
            break


if __name__ == "__main__":
    #Assign values to all arguments.
    pathOfVectors = "data\\agentlogsVecsAscii.txt"  #By default
    vectorDimLen = 200  #By default
    regexs = ["diskhealthmonitor", "createcontainer"]  #By default
    if len(sys.argv) > 1:
        vectorDimLen = int(sys.argv[1])

    if len(sys.argv) > 2:
        pathOfVectors = sys.argv[2]

    if len(sys.argv) > 3:
        regexs = sys.argv[3:]

    print "vector dimension should be: " + str(vectorDimLen)
    pv = plotVecs(vectorDimLen, pathOfVectors, regexs)
    #Try k-means clustering
    k = 3
    res = kmeans(pv.X, np.array(sample(pv.X, k)))
    for i in xrange(k):
        splitClusterWords(np.array(pv.rawLogs)[res[1] == i])
    #Give optics a shot
    pv.X = np.array(pv.X)
    optics(pv.X)
Example #21
from pylab import *
from pyIOUtils import *
from kmeans import *

data = array(readMatFile("alldata.mat"))
allsmallsets = set()
l = 256
for i in range(20):
    members = kmeans(data, 2, l)[1]
    smallset = members[1]
    if len(members[0]) < len(members[1]):
        smallset = members[0]
    for j in smallset:
        allsmallsets.add(j)
print allsmallsets

m = 256
for l in range(1, 256, 3):
    allsmallsetscomp = set()
    avg = 0.0
    for t in range(20):
        R = randn(m, l)
        pdata = dot(data, R) * (1. / float(m))**.5
        members = kmeans(pdata, 2, l)[1]
        smallset = members[1]
        if len(members[0]) < len(members[1]):
            smallset = members[0]
        for j in smallset:
            allsmallsetscomp.add(j)
        uall = float(len(allsmallsetscomp.union(allsmallsets)))
        iall = float(len(allsmallsetscomp.intersection(allsmallsets)))
Example #22
# MAIN PART
# read data from the multi databases

data3G1 = readData('../Databases/SimpleOCR/DB3/G1',False)
data3G2 = readData('../Databases/SimpleOCR/DB3/G2',False)
data2G1 = readData('../Databases/SimpleOCR/DB2/G1',False)
data2G2 = readData('../Databases/SimpleOCR/DB2/G2',False)
data1G12 = readData('../Databases/SimpleOCR/DB1',True)

print len(data1G12[0])

data = data3G1 + data3G2
#data = [[1,2],[3,4],[1,1]]

clusters=kmeans(data,10,0.001)

for idx,i in enumerate(data2G1):
    print "Numero",idx
    print "classificado ", classify(clusters,i)

for idx,i in enumerate(data2G2):
    print "Numero",idx
    print "classificado ",classify(clusters,i)

for idx,i in enumerate(data1G12):
    print "Numero ",idx
    print  "classificado ",classify(clusters,i)


Example #23
from tf_idf import *
from data import *
from rfm import *
from evaluation import get_score
'''
Author : Wen-Han Hu
'''

# load the data after preprocessing
df = load_data()

# build typical rfm
print("Building typical RFM model")
typical_rfm = rfm(df)
matrix = rfm_matrix(typical_rfm)
clusters = kmeans(matrix=matrix, cluster_num=4)
#typical_rfm = rfm_write_back(typical_rfm,clusters)
result = get_score(matrix, clusters, 'Typical RFM')

# build stock_id rfm
print("Building StockID RFM model")
stock_rfm = rfm(df, model_type='StockID')
#stock_rfm = rfm_transform(stock_rfm)
matrix = rfm_matrix(stock_rfm, model_type=1)
clusters = kmeans(matrix=matrix, cluster_num=5)
#stock_rfm = rfm_write_back(stock_rfm,clusters)
result = get_score(matrix, clusters, 'StockID RFM', result, flag=1)

# build tf-idf rfm
print("Building TF-IDF RFM model")
matrix = tf_idf(df)
Example #24
def test_kmeans(dim, kc, kn, m):
	result = kmeans(dim, kc, kn, m)
	title = "prediccionesKmeans.csv"
	save_csv(result, title)
Example #25
def kMeansCol(mu_c, sig_c, n_iter = 100, n_clusters = 3, delta = 0.001, verbose = 2):
	"""
	mu_c and sig_c have the same shape as OSMatrix.
	mu_c and sig_c are the cropped mu and sigma for every region of the OSMatrix
	"""
	centroids = np.empty((mu_c.shape), dtype = 'object')
	weights = np.empty((mu_c.shape), dtype = 'object')

	for i in range(mu_c.shape[0]):
		for j in range(mu_c.shape[1]):
			mu_sigma = np.array([mu_c[i,j].ravel(), sig_c[i,j].ravel()]).T

			X = mu_sigma
			ncluster = n_clusters

			# fall back to a single cluster when there are too few points
			if X.shape[0] <= ncluster:
				ncluster = 1

			centres, xtoc, dist = kmeans(data=X, nclusters=ncluster, niter=n_iter, delta=delta, datatype=2, verbose=False)

			centroids[i,j] = centres


			# weight each centroid by the fraction of points assigned to it
			counts = Counter(xtoc)
			if len(counts) == 1:
				wt = [1.0]
			else:
				total = float(sum(counts.values()))
				wt = [c / total for c in counts.values()]

			weights[i,j] = wt
			# print wt
			# print centres
			# print xtoc
			mean_centroids = (centres[:,0])
			variance_centroids = (centres[:,1])
			mean_centroids = [x.flatten() for x in mean_centroids]
			#print mean_centroids

			#mean_centroids = np.reshape(mean_centroids, (len(centres), 2))
			#variance_centroids = np.reshape(variance_centroids, (len(centres),2,2))

			#print mean_centroids.shape, variance_centroids.shape

			colors = ['r', 'g', 'b'] # length of this should be `k`
			#fig = figure()
			#ax = fig.add_subplot(111, aspect='equal')

			#m,v = centres
			#print m
			#print wt

			##### Plot Clusters :-
			# wt = map(int, wt)
			# maxwt = np.max(wt)
			# minwt = np.min(wt)

			# mc =  [(x - minwt)/(maxwt - minwt) for x in wt]
			# mc = softmax(mc)
			# X = gmm.sample_gaussian_mixture(mean_centroids, variance_centroids, samples = 100)
			# plot(X[:,0], X[:,1], '.')

			# for j in range(len(mc)):
			# 	x1,x2 = gmm.gauss_ellipse_2d(mean_centroids[j], variance_centroids[j])
			# 	plot(x1,x2,colors[j], linewidth = 2)

			# show()
			##### Plotted! #####



	return centroids, weights
Example #26
import os
from stopwords import *
from tfidf import get_all_vector
from kmeans import *
import shutil

filepath = "F:\\PycharmProjects\\Crawl\\data.json"
savepath = "F:\\PycharmProjects\\Clustering\\data"
newspath = "F:\\PycharmProjects\\Clustering\\news"
historypath = "F:\\PycharmProjects\\Clustering\\history"

dividetotxt(filepath, savepath)
stop_words_set = stop_words("F:\\PycharmProjects\\Clustering\\stopwords.txt")
dataset = get_all_vector(savepath, historypath, stop_words_set)
result = kmeans(dataset[1], 8)
if os.path.exists(newspath):
    shutil.rmtree(newspath)
os.makedirs(newspath)
resultpaths = []
for i in range(result[1].shape[0]):
    temp = dataset[0][i].rfind("\\") + 1
    sort = int(result[1].tolist()[i][0])
    resultpath = newspath + "\\" + str(sort)
    if resultpath not in resultpaths:
        resultpaths.append(resultpath)
    if not os.path.exists(resultpath):
        os.makedirs(resultpath)
    shutil.copyfile(dataset[0][i], resultpath + "\\" + dataset[0][i][temp:])

resultpaths.sort()
Example #27
    [70, 140],
    [70, 160],
    [65, 132],
    [48, 75],
    [72, 175],
    [67, 167],
    [69, 140],
    [96, 285],
    [70, 172],
    [70, 185],
    [71, 168],
    [70, 180],
    [69, 170],
    [70, 150],
    [70, 170],
    [71, 144],
    [66, 140],
    [67, 175],
    [67, 165],
    [72, 175]])

matrix = matrix.astype(np.float64)

for row in matrix:
    print row

std_matrix = standardizeData(matrix)
for row in std_matrix:
    print row
kmeans(matrix, 2)
Example #28
        cls = np.argmax(gmm.resp, axis=1)
        # print(cls)

        clr = ["r", "g", "gold", "brown", "black"]
        for k in range(KK):
            index_k = np.where(cls == k)[0]
            x_k = gmm.data[index_k].T
            plt.scatter(x_k[0], x_k[1], s=15, c=clr[k])

        plt.title('em gmm')
        plt.show()

    if case == "kmeans":
        KK = 4
        gmm = EM_GMM(K=KK, data="40")
        gmm.means = kmeans(data=gmm.data, K=KK)
        pt = gmm.data.T
        fig, ax = plt.subplots()
        gmm.EM()
        print(gmm.coeff)
        print(gmm.means)
        print(gmm.covar)
        for i in range(KK):
            v, w = np.linalg.eigh(gmm.covar[i])
            v = 2. * np.sqrt(2.) * np.sqrt(v)
            u = w[0] / np.linalg.norm(w[0])
            angle = np.arctan(u[1] / u[0])
            angle = 180. * angle / np.pi  # convert to degrees
            ell = matplotlib.patches.Ellipse(gmm.means[i],
                                             v[0],
                                             v[1],
Example #29
#!/usr/bin/python

from kmeans import *
from numpy import *
import time
import matplotlib.pyplot as plt

## step 1: load data
print "step 1: load data..."
dataSet = []
fileIn = open('./txt/spec-429-100M.txt')
#fileIn = open('./123.txt')
for line in fileIn.readlines():
	lineArr = line.strip().split('\t')
	dataTmp = []
	for num in range(len(lineArr)):
		dataTmp.append(float(lineArr[num]))
	dataSet.append(dataTmp)

## step 2: clustering...
print "step 2: clustering..."
dataSet = mat(dataSet)
k = 1
centroids, clusterAssment = kmeans(dataSet, k)

## step 2.5: delete the furthest 0.1% points

# step 3: show the result
#print "step 3: show the result..."
#showCluster(dataSet, k, centroids, clusterAssment)
Example #30
from pylab import *
from pyIOUtils import *
from kmeans import *

data = array(readMatFile("alldata.mat"))
allsmallsets = set()
l = 256
for i in range(20):
    members = kmeans(data,2,l)[1]
    smallset = members[1] 
    if len(members[0])<len(members[1]):
        smallset = members[0]
    for j in smallset:
        allsmallsets.add(j)
print allsmallsets

m=256
for l in range(1,256,3):
    allsmallsetscomp = set()
    avg = 0.0
    for t in range(20):
        R = randn(m,l)
        pdata = dot(data,R)*(1./float(m))**.5
        members = kmeans(pdata,2,l)[1]
        smallset = members[1] 
        if len(members[0])<len(members[1]):
            smallset = members[0]
        for j in smallset:
            allsmallsetscomp.add(j)
        uall = float(len(allsmallsetscomp.union(allsmallsets)))
        iall = float(len(allsmallsetscomp.intersection(allsmallsets)))
Example #31
def kMeansInt(mu_c, sig_c, n_iter = 100, n_clusters = 3, delta = 0.001, verbose = 2):
	"""
	mu_c and sig_c have the same shape as OSMatrix.
	mu_c and sig_c are the cropped mu and sigma for every region of the OSMatrix
	"""
	centroids = np.empty((mu_c.shape), dtype = 'object')
	weights = np.empty((mu_c.shape), dtype = 'object')

	#print mu_c[0,0].shape, mu_c[1,0].shape, mu_c[2,0].shape

	for i in range(mu_c.shape[0]):
		for j in range(mu_c.shape[1]):
			mu_sigma = np.array([mu_c[i,j].ravel(), sig_c[i,j].ravel()]).T
			#print mu_sigma.shape
			X = mu_sigma
			ncluster = n_clusters

			# fall back to a single cluster when there are too few points
			if X.shape[0] <= ncluster:
				ncluster = 1

			centres, xtoc, dist = kmeans(data=X, nclusters=ncluster, niter=n_iter, delta=delta, datatype=1, verbose=False)
			

			centroids[i,j] = centres
			# weight each centroid by the fraction of points assigned to it
			counts = Counter(xtoc)
			print counts
			if len(counts) == 1:
				wt = [1.0]
			else:
				total = float(sum(counts.values()))
				wt = [c / total for c in counts.values()]

			weights[i,j] = wt
			print wt


			# idx = xtoc

			# centroids1 = centres

			# plot(data[idx==0,0],data[idx==0,1],'ob',
			#      data[idx==1,0],data[idx==1,1],'or',
			#      data[idx==2,0],data[idx==2,1],'og',
			#      data[idx==3,0],data[idx==3,1],'oy',
			#      data[idx==4,0],data[idx==4,1],'oc')

			# plot(centroids1[:,0],centroids1[:,1],'sg',markersize=8)
			# show()

	#print centroids[0,0].shape, weights[0,0].shape
	return centroids, weights
Example #32

            os.system("./a.out X.mat " + str(k))
            means = readMatFile("out.mat")
            rp2Avg.append(getLabelAccuracy(means,testX,testY,[k]))


            os.system("./rp1.out X.mat " + str(k))
            means = readMatFile("out.mat")
            rp1Avg.append(getLabelAccuracy(means,testX,testY,[k]))


            start = time.time()

            #standard kmeans
            means,clusters = kmeans(trainX,k,h)

            print time.time()-start

            kmAvg.append(getLabelAccuracy(means,testX,testY,[k]))

            del(X,Y,trainX,trainY,testX,testY)

        rp2AllhashPR.append(sum(rp2AllAvg)/float(av))
        rp2hashPR.append(sum(rp2Avg)/float(av))
        rp1hashPR.append(sum(rp1Avg)/float(av))
        kmeansPR.append(sum(kmAvg)/float(av))
        dimlist.append(h)

        mrp2All = sum(rp2AllAvg)/float(av)
        mrp2 = sum(rp2Avg)/float(av)
Example #33
# MAIN PART
# read data from the multi databases

data3G1 = readData('../Databases/SimpleOCR/DB3/G1', False)
data3G2 = readData('../Databases/SimpleOCR/DB3/G2', False)
data2G1 = readData('../Databases/SimpleOCR/DB2/G1', False)
data2G2 = readData('../Databases/SimpleOCR/DB2/G2', False)
data1G12 = readData('../Databases/SimpleOCR/DB1', True)

print len(data1G12[0])

data = data3G1 + data3G2
#data = [[1,2],[3,4],[1,1]]

clusters = kmeans(data, 10, 0.001)

for idx, i in enumerate(data2G1):
    print "Numero", idx
    print "classificado ", classify(clusters, i)

for idx, i in enumerate(data2G2):
    print "Numero", idx
    print "classificado ", classify(clusters, i)

for idx, i in enumerate(data1G12):
    print "Numero ", idx
    print "classificado ", classify(clusters, i)

showClusters(clusters)
Example #34
    cx = np.sum(cm, axis=0)
    cy = np.sum(cm, axis=1)
    tp, fp, fn, FM = [], [], [], []
    for j in range(10):
        tp.append(cm[j, j])
        fp.append(cx[j] - cm[j, j])
        fn.append(cy[j] - cm[j, j])
        FM.append(((cm[j, j] / cx[j]) * (cm[j, j] / cy[j]))**0.5)
    F_M = float(sum(FM))
    print("The Fowlkes–Mallows index is: ", float(F_M / 10))


centroids, clusterAssment = kmeans(dataSet, k)
pre = []
for i in range(numSamples):
    pre.append(clusterAssment[i, 0])
print("Result of kmeans:")
result(pre)

ward = cluster.AgglomerativeClustering(n_clusters=k, linkage='ward')
pre = ward.fit_predict(digits.data)
print("Result of Agglomerative clustering with Ward linkage:")
result(pre)

affinity_propagation = cluster.AffinityPropagation()
pre = affinity_propagation.fit_predict(digits.data)
print("Result of cluster fot AffinityPropagation:")
print(pre)
Example #35
import matplotlib.pyplot as plt
from kmeans import *

## step 1: load data
# print ("step 1: load data..." )
weight = np.load("./save_np/fc_w_noprune.npz")
fc1_w = np.mat(weight['fc3_w'])
fc1_w = fc1_w.reshape([192 * 10, 1])
print(fc1_w.shape)
print(type(fc1_w))
print(fc1_w)
## step 2: clustering...
print("step 2: clustering...")

k = 10
centroids, clusterAssment = kmeans(fc1_w, k)  # call the kmeans defined in the kmeans module
clusterAssment = clusterAssment[:, 0]
print(centroids)
print(clusterAssment)
np.savez("./save_kmeans/fc3_w_clusterAssment.npz",
         fc1_w_clusterAssment=clusterAssment)
## step 3: show the result
# print ("step 3: show the result..."  )
# showCluster(fc1_w, k, centroids, clusterAssment)

# fc1_w_mask = np.load("./save_kmeans/fc2_w_clusterAssment.npz")
# fc1_w_clusterAssment = np.mat(fc1_w_mask['fc1_w_clusterAssment'])
# a = fc1_w_clusterAssment.reshape([120, 84])
# print(a.shape)
# print(type(a))
# print(a)
Example #36
		if j>printEntries:
			print("\nVocab size = " + str(len(words)) + "\n")
			break


if __name__ == "__main__":
	#Assign values to all arguments.
	pathOfVectors = "data\\agentlogsVecsAscii.txt" #By default
	vectorDimLen = 200 #By default
	regexs = ["diskhealthmonitor", "createcontainer"] #By default
	if len(sys.argv) > 1:
		vectorDimLen = int(sys.argv[1])

	if len(sys.argv) > 2:
		pathOfVectors = sys.argv[2]

	if len(sys.argv) > 3:
		regexs = sys.argv[3:]

	print "vector dimension should be: " + str(vectorDimLen)
	pv = plotVecs(vectorDimLen, pathOfVectors, regexs)
	#Try k-means clustering
	k = 3
	res = kmeans(pv.X,np.array(sample(pv.X, k)))
	for i in xrange(k):
		splitClusterWords(np.array(pv.rawLogs)[res[1] == i])
	#Give optics a shot
	pv.X = np.array(pv.X)
	optics(pv.X)

Example #37
# plt.title('Cumulative variance vs PC')
# plt.show()

exit()

# question 3.3

from kmeans import *  # this module provides everything needed to run k-means

k = 4
initialIndices = np.array([0, 1, 2, 3])  # the first 4 data points are the initial cluster centers
# initialIndices = np.array(random.sample(range(len(data)), k))  # uncomment for a random start

# run kmeans
new_centers, _ = kmeans(data, initialIndices, k)

#adding centeres as new datapoints to the original dataset
data2 = np.vstack([data, new_centers])
#print data2.shape #[1279,1568]

#running PCA on the new dataset
_, _, Y_centers = apply_pca(data2)

#adding color labels for the cluster centers
col_labels.extend(['yellow', 'yellow', 'yellow', 'yellow'])

#####plotting
PC1_centers = Y_centers[:, 0]
PC2_centers = Y_centers[:, 1]
print Y_centers.shape
Example #38
if __name__ == '__main__':
    k = 16
    img = Image.open('bird_small.png').getdata()
    # picture size
    leng, wid = img.size  #128, 128
    # number of pixels
    m = leng * wid
    data = np.array(img, dtype=np.float64) / 255
    #print(data[0])  #[ 0.85882353  0.70588235  0.40392157]
    #print(len(data))  #16384
    # original picture pixel density
    orgpic = data.copy()
    # compressed picture pixel density
    compic = data.copy()
    # initialize centroids randomly
    centroids = centInit(data, k, 3)
    idx, history = kmeans(data, centroids)
    # get the final converged points
    centroids = np.array(history[len(history) - 1])

    # compress the picture: replace each pixel with the centroid of its cluster
    for i in range(0, k):
        compic[idx == i] = centroids[i]

    fig, ax = plt.subplots(2)
    # The value for each component of MxNx3 and MxNx4 float arrays should be
    # in the range 0.0 to 1.0; MxN float arrays may be normalised.
    ax[0].imshow(orgpic.reshape((leng, wid, 3)))
    ax[1].imshow(compic.reshape((leng, wid, 3)))
    plt.show()
Example #39
    n = len(dataset)
    number_of_clusters = 5
    x = dataset[:, 1]  # coordinate x
    y = dataset[:, 2]  # coordinate y
    pop = dataset[:, 3]  # population
    coordinates = [(x[i], y[i]) for i in range(n)]  # points

    # NORMAL EXECUTION
    # clusters = hierarchical_clustering(coordinates, number_of_clusters)
    # clusters = kmeans(coordinates, number_of_clusters, pop, q=6)

    # BEGIN DISTORTION
    kmeans_distortion = []
    for i in range(6, 21):
        clusters = kmeans(coordinates, i, pop, q=5)
        kmeans_distortion.append(distortion(clusters))

    clusters, distortion_clusters = hierarchical_clustering(coordinates, number_of_clusters)
    hierarchical_distortion = [distortion(c) for c in distortion_clusters]

    plt.plot(np.arange(6, 21), hierarchical_distortion[::-1])
    plt.plot(np.arange(6, 21), kmeans_distortion)
    plt.legend(labels=["Hierarchical", "K-means"])
    plt.xlabel("Number of clusters")
    plt.ylabel("Distortion")
    plt.title("Distortion graph of dataset: {} counties".format(f_code))
    plt.gca().invert_xaxis()
    plt.xticks(np.arange(6, 21, 1))
    plt.grid()
    # plt.savefig("./risposte/distortion_{}_domanda9.png".format(f_code))
Example #40
import numpy as np
from kmeans import *

if __name__ == '__main__':
    fig, ax = plt.subplots()
    ex7data2 = np.load('ex7data2.npz')
    x = ex7data2['X']
    m, n = x.shape
    k = 3
    """
    centroids = np.zeros((k,n)) 
    centroids[0] = [3, 3]
    centroids[1] = [6, 2]
    centroids[2] = [8, 5]
    """
    centroids = centInit(x, k, 2)
    #idx =  closestCentroids(x,centroids)
    #print computeCentroids(x,idx,k)
    idx, history = kmeans(x, centroids, tol=1e-5)
    plotResult(ax, x, idx)
    moveTrace(ax, history)
    plt.show()
Example #41
def split_tuple(centers):
    result = [[],[]]
    for i in centers:
        result[0].append(i[0])
        result[1].append(i[1])
    return result

data = []
print 'This is a demo of the k-means algorithm: it creates a random point set, clusters it with k-means, and shows a plot of the result (the algorithm itself does no plotting)'
print 'Enter the number of random points:'
n = input('>')
print 'Enter the number of clusters k:'
k = input('>')
for i in range(n):
    data.append(produce_random_point())
centers = kmeans(data, k)



centers[0] = split_tuple(centers[0])
for i in range(k):
    centers[1][i] = split_tuple(centers[1][i])


colors = random_color(k)
fig = plt.figure(figsize=(16,12), dpi=72, facecolor="white")
for i in range(k):
    plt.scatter(centers[1][i][0], centers[1][i][1], color=colors[i])
    plt.triplot(centers[1][i][0], centers[1][i][1], linewidth=0.1)
plt.scatter(centers[0][0], centers[0][1], marker='*', s=300)
plt.show()