def select_next(self, iterval):
    """ Select the next best data sample using robust map or simply
    the max of iterval ...
    """
    if self._robust_map:
        k = np.argsort(iterval)[::-1]
        d_sub = self.data[:, k[:self._robust_nselect]]
        self.sub.extend(k[:self._robust_nselect])

        # cluster d_sub
        kmeans_mdl = Kmeans(d_sub, num_bases=self._robust_cluster)
        kmeans_mdl.factorize(niter=10)

        # get largest cluster
        h = np.histogram(kmeans_mdl.assigned,
                         range(self._robust_cluster + 1))[0]
        largest_cluster = np.argmax(h)
        sel = pdist(kmeans_mdl.W[:, largest_cluster:largest_cluster + 1],
                    d_sub)
        sel = k[np.argmin(sel)]
    else:
        sel = np.argmax(iterval)

    return sel
def init_h(self):
    if not hasattr(self, "H"):
        # init basic matrices
        self.H = np.zeros((self._num_bases, self._num_samples))

        # initialize using k-means
        km = Kmeans(self.data[:, :], num_bases=self._num_bases)
        km.factorize(niter=10)
        assign = km.assigned

        num_i = np.zeros(self._num_bases)
        for i in range(self._num_bases):
            num_i[i] = len(np.where(assign == i)[0])

        self.H.T[range(len(assign)), assign] = 1.0
        self.H += 0.2 * np.ones((self._num_bases, self._num_samples))

    if not hasattr(self, "G"):
        self.G = np.zeros((self._num_samples, self._num_bases))
        self.G[range(len(assign)), assign] = 1.0
        self.G += 0.01
        self.G /= np.tile(np.reshape(num_i[assign], (-1, 1)), self.G.shape[1])

    if not hasattr(self, "W"):
        self.W = np.dot(self.data[:, :], self.G)
def create_splits(self, X):
    # get shape of dataset
    N, D = X.shape

    # thresholds are the k-means centres of each feature
    self.thresholds = []
    for d in range(D):
        # reshape (n,) to (n,1)
        feature = X[:, d]
        feature = np.reshape(feature, [feature.size, 1])

        # Initialize K-Means model
        # NOTE: `k` (clusters per feature) must be defined in the enclosing
        # scope, e.g. passed in through the constructor.
        k_means = Kmeans(k=k)
        min_err = np.inf
        min_err_means = None
        for i in range(50):
            k_means.fit(feature)
            error = k_means.error(feature)
            if error < min_err:
                min_err = error
                min_err_means = k_means.means
        self.thresholds.append(min_err_means)
class BagOfWords:
    def __init__(self, local_feature_extractor_name='hog', nclusters=256):
        self.nclusters = nclusters
        if local_feature_extractor_name == 'hog':
            self.feature_extractor = HOGFeatureExtractor()
        else:
            raise Exception("Unknown feature extractor")
        self.kmeans = None

    def extract(self, X):
        assert X.ndim == 4
        return self.feature_extractor.predict(X, unflatten=True)

    def fit(self, X):
        assert X.ndim == 3
        X_features = X.reshape(X.shape[0] * X.shape[1], -1)
        self.kmeans = Kmeans(self.nclusters)
        self.kmeans.fit(X_features)

    def predict(self, X):
        assert X.ndim == 3
        X_features = X.reshape(X.shape[0] * X.shape[1], -1)
        X_clustered = self.kmeans.predict(X_features)
        X_clustered = X_clustered.reshape(X.shape[0], X.shape[1])
        ret = numpy.zeros((X.shape[0], self.nclusters))
        for i, x in enumerate(X_clustered):
            for word in x:
                ret[i, word] += 1
        return ret
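# A hedged usage sketch for BagOfWords above, not part of the original source.
# It assumes extract() returns an (N, num_patches, feat_dim) array of local
# descriptors; the image shapes and variable names below are illustrative only.
import numpy
bow = BagOfWords(local_feature_extractor_name='hog', nclusters=256)
images = numpy.random.rand(10, 64, 64, 3)     # toy (N, H, W, C) image batch
local_features = bow.extract(images)          # per-image local HOG descriptors
bow.fit(local_features)                       # learn the visual vocabulary
histograms = bow.predict(local_features)      # (N, nclusters) visual-word counts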
def quantize(self, img):
    """
    Quantizes an image into 2^b clusters

    Parameters
    ----------
    img : a (H,W,3) numpy array

    Returns
    -------
    quantized_img : a (H,W,1) numpy array containing cluster indices

    Stores
    ------
    colours : a (2^b, 3) numpy array, each row is a colour
    """
    H, W, _ = img.shape
    pixels = img.reshape((-1, 3))

    model = Kmeans(2**self.b)
    model.fit(pixels)
    quantized_img = model.predict(pixels).reshape((H, W, 1))

    self.colours = model.means
    return quantized_img
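# A hedged usage sketch, not part of the original source. It assumes the
# quantize method above sits on a class (called ImageQuantizer here purely
# for illustration) with an attribute b, and that predict returns int labels.
import numpy as np
quantizer = ImageQuantizer(b=2)                 # hypothetical constructor
img = np.random.rand(32, 32, 3)                 # toy (H, W, 3) image
q = quantizer.quantize(img)                     # (32, 32, 1) cluster indices
reconstructed = quantizer.colours[q[:, :, 0]]   # look each pixel's colour back up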
def main():
    '''Train the model and evaluate performance'''

    '''Load the MNIST training data. Also flatten the images from 28x28 arrays to a single vector'''
    images, labels = amitgroup.io.mnist.load_mnist('training', path='./', asbytes=True)
    images = [image.ravel() for image in images]

    '''Find unique labels and the first images that correspond to them'''
    indices = unique(labels, return_index=True)[1]

    '''Create the clustering engine. Use the unique images found above as centers'''
    clustering = Kmeans()
    clustering.train(data=images, centers=take(images, indices, axis=0), max_iterations=100)

    '''Load the testing data set and flatten the images'''
    test_images, test_labels = amitgroup.io.load_mnist('testing', path='./', asbytes=True)
    test_images = [image.ravel() for image in test_images]

    '''Assign the test data to clusters and evaluate the performance'''
    predictions = [clustering.cluster(image) for image in test_images]
    success = (predictions == test_labels)
    correct, counts = unique(success, return_counts=True)
    print('{} of the testing data set were put in the wrong cluster'.format(counts[0]))

    plot_images_separately([reshape(center, (28, 28)) for center in clustering.centers])
def main():
    # Reading the training data
    path_train = './data/EMGaussian.data'
    path_test = './data/EMGaussian.test'
    data = dp.parse_data_wo_labels(path_train, 2, delimiter=' ')
    data_test = dp.parse_data_wo_labels(path_test, 2, delimiter=' ')

    # Initialization with K-means
    best_kmean_model = None
    min_distortion = float("inf")
    distortions = []
    for i in xrange(NB_INITIALIZATION_RETRIES):
        kmean_model = Kmeans(data, NB_CLUSTERS, MAX_K_MEAN_ITER)
        kmean_model.run()
        distortions.append(kmean_model.distortion)
        if kmean_model.distortion < min_distortion:
            best_kmean_model = kmean_model
            min_distortion = kmean_model.distortion

    # Showing the distortions
    plt.plot(range(1, NB_INITIALIZATION_RETRIES + 1), distortions)
    plt.xlabel("Initialization number")
    plt.ylabel("Distortion")
    plt.title("Running several k-means initializations and measuring the distortion of each")
    plt.show()

    # Plotting the result
    best_kmean_model.plot()

    # Case where the covariance matrix is proportional to identity
    run_em_model(data, data_test, best_kmean_model, sigma_prop_identity=True)

    # General case
    run_em_model(data, data_test, best_kmean_model, sigma_prop_identity=False)
def main():
    # 1. Read the data
    dataDF = getDF()

    # 2. Probe for the best K value; the first clear elbow in the curve marks the best K
    km = Kmeans()
    km.searchK(SAVAPATH, dataDF, 2, 12)  # inspect the saved plot and pick the best K
def init_h(self):
    if not hasattr(self, 'H'):
        # init basic matrices
        self.H = np.zeros((self._num_bases, self._num_samples))

        # initialize using k-means
        km = Kmeans(self.data[:, :], num_bases=self._num_bases, seed=self.seed)
        km.factorize(niter=10)
        assign = km.assigned

        num_i = np.zeros(self._num_bases)
        for i in range(self._num_bases):
            num_i[i] = len(np.where(assign == i)[0])

        self.H.T[range(len(assign)), assign] = 1.0
        self.H += 0.2 * np.ones((self._num_bases, self._num_samples))

    if not hasattr(self, 'G'):
        self.G = np.zeros((self._num_samples, self._num_bases))
        self.G[range(len(assign)), assign] = 1.0
        self.G += 0.01
        self.G /= np.tile(np.reshape(num_i[assign], (-1, 1)), self.G.shape[1])

    if not hasattr(self, 'W'):
        self.W = np.dot(self.data[:, :], self.G)
def kmeansselect(self):
    kmeans_mdl = Kmeans(self.data, num_bases=self._nsub)
    kmeans_mdl.initialization()
    kmeans_mdl.factorize()

    # pick data samples closest to the centres
    idx = dist.vq(kmeans_mdl.data, kmeans_mdl.W)
    return idx
def quantize(self, img):
    b = self.b
    C, R, D = img.shape
    self.img = img
    X = np.reshape(img, (C * R, D))

    model = Kmeans(k=pow(2, b))
    model.fit(X)
    self.model = model
    return model.means
class ClusteringTest(TestCase):
    def setUp(self):
        self.reader = Reader(filename)
        self.clusterer = Kmeans(3)

    def test_01_courses(self):
        courses = self.reader.courses  # returns list of courses
        self.assertEqual(courses[:3], ['Bioinformatik', 'Informatik', 'Mathematik'])

    def test_02_normalize(self):
        word = "(Studienrichtung"
        normalized_word = self.reader.normalize_word(word)  # returns the normalized word
        self.assertEqual(normalized_word, "studienrichtung")

    def test_03_vocabulary(self):
        words = self.reader.vocabulary
        self.assertEqual(words[:3], ['albanologie', 'allgemeine', 'als'])

    def test_04_vectorspaced(self):
        word_to_vectorspace = self.reader.vectorspaced("Slavische Philologie")
        vocab_size = len(self.reader.vocabulary)
        self.assertEqual(vocab_size, len(word_to_vectorspace))
        self.assertEqual(word_to_vectorspace, [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ])

    def test_05_distance(self):
        a = [1, 2, 3]
        b = [4, 5, 6]
        euclidean_dist = self.clusterer.distance(a, b)
        self.assertEqual(int(euclidean_dist), 5)

    def test_06_vector_mean(self):
        vectors = [[1, 2, 3], [4, 5, 6]]
        mean = self.clusterer.vector_mean(vectors)
        self.assertEqual(mean, [2.5, 3.5, 4.5])

    def test_07_classify(self):
        vectorspaced_data = self.reader.vector_spaced_data
        # clusters are always different
        self.clusterer.train(vectorspaced_data)
        clusters = [self.clusterer.classify(vec) for vec in vectorspaced_data]
        self.assertEqual(len(clusters), len(vectorspaced_data))
def create_splits(self, X):
    model = Kmeans(3)
    N, D = X.shape
    splits = np.empty((D * model.k, ))
    for d in range(D):
        # fit k-means on one feature column, reshaped to (N, 1)
        feature = X[:, d].reshape(N, 1)
        model.fit(feature)
        for i in range(model.k):
            # store all k means of this feature as candidate thresholds
            splits[d * model.k + i] = model.means[i]
    self.thresholds = np.unique(splits)
def quantize(self, img):
    """
    Quantizes an image into 2^b clusters

    Parameters
    ----------
    img : a (H,W,3) numpy array

    Returns
    -------
    quantized_img : a (H,W) numpy array containing cluster indices

    Stores
    ------
    colours : a (2^b, 3) numpy array, each row is a colour
    """
    H, W, D = img.shape

    # fit k-means on the flattened pixels
    model = Kmeans(k=2**self.b)
    X = np.reshape(img, (H * W, 3))
    model.fit(X)
    y = model.predict(X)

    # store the cluster centres as the colour palette
    self.colours = np.zeros((2**self.b, 3), dtype='uint8')
    for i in range(2**self.b):
        self.colours[i, :] = model.means[i, :]

    # reshape the 1-D cluster assignments back into an (H, W) index image
    quantized_img = np.reshape(y.astype('uint8'), (H, W))
    return quantized_img
def create_splits(self, X):
    # k value obtained via elbow method
    N, D = X.shape
    splits = []
    for i in range(D):
        model = Kmeans(k=10)
        # all values of feature i across the examples, as a single column
        vec = X[:, i].reshape(N, 1)
        model.fit(vec)
        threshs = model.means
        splits.append(np.squeeze(threshs))
    self.thresholds = splits
def quantize_image(self, img):
    w, h, d = img.shape
    resized_image = np.reshape(img, (w * h, d))

    model = Kmeans(k=(2**self.b))
    model.fit(resized_image)
    labels = model.predict(resized_image)
    self.means = getattr(model, "means")

    print("Cluster Assignments")
    print(labels)
    return labels
def main():
    KMEANS = leerModelo()
    if KMEANS is None:
        print('No model exists')
        spotify = SpotifyPro()
        df = spotify.iniciar(idPlaylist='1QP6tyANnZZ9bRTfQG4X7a')
        k = Kmeans(df)  # contains the network and the datasets
        if len(df):
            k.importarDatos()
            if k.red is not None:
                guardarModelo(k)
    else:
        print('A model already exists')
def dequantize_image(self, img):
    w, h, d = img.shape
    resized_image = np.reshape(img, (w * h, d))
    original_image = np.zeros(img.shape)

    model = Kmeans(k=(2**self.b))
    model.fit(resized_image)
    labels = model.predict(resized_image)
    self.means = getattr(model, "means")

    # replace every pixel with the mean colour of its assigned cluster
    label_idx = 0
    for i in range(w):
        for j in range(h):
            original_image[i][j] = self.means[labels[label_idx]]
            label_idx += 1

    return original_image
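# A hedged round-trip sketch combining quantize_image / dequantize_image above,
# not part of the original source. It assumes both methods live on one class
# (called ImageCompressor here purely for illustration) with an attribute b.
import numpy as np
compressor = ImageCompressor(b=3)            # hypothetical constructor
img = np.random.rand(20, 20, 3)              # toy (w, h, d) image
labels = compressor.quantize_image(img)      # flat array of 2^b cluster assignments
approx = compressor.dequantize_image(img)    # each pixel replaced by its cluster mean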
def main():
    dataset1 = np.genfromtxt(r'../data/new_dataset_1.txt', dtype=float, delimiter='\t')
    dataset2 = np.genfromtxt(r'../data/cho.txt', dtype=float, delimiter='\t')

    km1 = Kmeans(dataset1[:, 2:], dataset1[:, 1], 3)
    km2 = Kmeans(dataset2[:, 2:], dataset2[:, 1], 10)

    ic1 = km1.initial_centroids(3, 5, 9)
    #ic1 = km1.initial_random_centroids(5)
    ic2 = km2.initial_random_centroids(5)
    # km1.centroids = km1.init_centroids = np.loadtxt(r'../log/cho_ground_centroids.txt')

    # specify iteration as parameter here
    km1.kmeans_algorithm()
    km2.kmeans_algorithm()

    extr_index_validation1 = ExternalIndex(km1.ground_truth_clusters, km1.clusters)
    extr_index_validation2 = ExternalIndex(km2.ground_truth_clusters, km2.clusters)

    print('Rand Index of dataset1 clusters :', extr_index_validation1.rand_index())
    print('Jaccard Coefficient of dataset1 clusters :', extr_index_validation1.jaccard_coefficient())
    print('Rand Index of dataset2 clusters :', extr_index_validation2.rand_index())
    print('Jaccard Coefficient of dataset2 clusters :', extr_index_validation2.jaccard_coefficient())

    plot1 = Visualization(dataset1[:, 2:], km1.clusters, dataset1[:, 1])
    plot2 = Visualization(dataset2[:, 2:], km2.clusters, dataset2[:, 1])
    plot1.plot(r'../log/td1.jpg')
    plot2.plot(r'../log/cho2.jpg')

    # gene_cluster_matched = km1.cluster_validation()
    # print('Genes that matched in clusters: ', gene_cluster_matched)
    return
def closure_1_3_1():
    k = 4
    best_model = None
    min_error = np.inf

    # run k-means 50 times with random initializations and keep the lowest-error model
    for i in range(50):
        model = Kmeans(k)
        model.fit(X)
        error = model.error(X)
        if error < min_error:
            min_error = error
            best_model = model

    plt.figure()
    utils.plot_2dclustering(X, best_model.predict(X))

    fname = os.path.join("..", "figs", "kmeans_outliers_best_model.png")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
def _init_parameters(self, X, method='kmeans'):
    """
    Initialize the parameters of the Gaussian components.
    If method == 'kmeans', initialize with k-means;
    if method == 'random', initialize randomly.
    """
    n = X.shape[0]
    self.Guass = [Guass_distribution(dim=self.dim) for i in range(self.m)]

    if method == 'kmeans':
        try:
            kmeans = Kmeans()
            labels, centroids = kmeans.main(X, k=self.m, t=100, c_strategy='kmeans')
        except:
            centroids, labels = vq.kmeans2(X, self.m, minit='points', iter=1000)
        clusters = [[j for j in range(n) if labels[j] == i] for i in range(self.m)]
    elif method == 'random':
        time_seed = int(time.time())
        np.random.seed(time_seed)
        clusters = [[] for i in range(self.m)]
        centroids = random.sample(list(range(n)), self.m)  # randomly pick m centres
        for i in range(n):
            ci = np.argmin([la.norm(X[i] - X[c]) for c in centroids])
            clusters[ci].append(i)
    else:
        raise ValueError("Unknown method type!")

    for i in range(self.m):
        guass = self.Guass[i]
        data = X[clusters[i]]
        guass.init(data)
        guass.weight = len(clusters[i]) / n
def sliding_window_three_months(df, date):
    starting_date_obj = datetime.datetime.strptime(date, "%Y-%m-%d")
    preprocess_obj = preprocess()
    monthly_data = preprocess_obj.get_three_monthly_candlestick_data(df, date)

    kmeans = Kmeans(monthly_data)
    e = kmeans.get_clusters()
    print('original', e)

    ctut = []
    ctlt = []
    ctbl = []
    ctc = []
    for i in range(0, len(e)):
        ctut.append(e[i][0])
        ctlt.append(e[i][1])
        ctbl.append(e[i][2])
        ctc.append(e[i][3])

    candlestickst = candlestickState(ctut, ctlt, ctbl, ctc)
    return candlestickst
def _init_kmeans(self):
    """
    Initialize using k-means.

    Uses random initialization for k-means. This is a really bad idea.
    """
    data = self.data
    k = self.k

    # Estimate the means of the mixture components, using k-means
    km = Kmeans(data, k)
    return km.cluster.T, km.label
def initialization(self):
    # init basic matrices
    self.W = np.zeros((self._data_dimension, self._num_bases))
    self.H = np.zeros((self._num_bases, self._num_samples))
    self.G = np.zeros((self._num_samples, self._num_bases))

    # initialize using k-means
    km = Kmeans(self.data[:, :], num_bases=self._num_bases, show_progress=self._show_progress)
    km.initialization()
    km.factorize()
    assign = km.assigned

    num_i = np.zeros(self._num_bases)
    for i in range(self._num_bases):
        num_i[i] = len(np.where(assign == i)[0])

    self.G[range(len(assign)), assign] = 1.0
    self.G += 0.01
    self.G /= np.tile(np.reshape(num_i[assign], (-1, 1)), self.G.shape[1])

    self.H.T[range(len(assign)), assign] = 1.0
    self.H += 0.2 * np.ones((self._num_bases, self._num_samples))

    self.W = np.dot(self.data[:, :], self.G)
def kmeans(imagens, segmentadas, path):
    k = Kmeans(imagens)
    for qtd in range(len(imagens)):
        # read the image
        img = imread(imagens[qtd][0])
        #img = cv2.resize(img, (segmentadas[qtd][2], segmentadas[qtd][1]))

        res2 = k.kmeans_seg(img, 2) / 255
        res3 = k.kmeans_seg(img, 3) / 255
        res9 = k.kmeans_seg(img, 5) / 255

        cv2.imwrite("res2.png", res2)
        cv2.imwrite("res3.png", res3)
        cv2.imwrite("res9.png", res9)

        fig = plt.figure(figsize=(9, 3), dpi=200)
        k.add_image(fig, img, 1, 4, 1, 'original')
        k.add_image(fig, res2, 1, 4, 2, 'k=2')
        k.add_image(fig, res3, 1, 4, 3, 'k=3')
        k.add_image(fig, res9, 1, 4, 4, 'k=5')
def main(_argv):
    probki_string, nazwy_atr, czy_atr_symb = wczytaj_baze_probek_z_tekstem(
        'spirala.txt', 'spirala-type.txt')
    probki = probki_str_na_liczby(probki_string, (0, 1))

    grupy, osrodki = Kmeans(probki, FLAGS.groups, FLAGS.iterations, progress)

    fig = plt.figure(1)
    anim = animation.FuncAnimation(fig, Animate, frames=len(progress),
                                   repeat=False, interval=500)
    chart.show()
def quantize(self, img):
    """
    Quantizes an image into 2^b clusters

    Parameters
    ----------
    img : a (H,W,3) numpy array

    Returns
    -------
    quantized_img : a (H,W) numpy array containing cluster indices

    Stores
    ------
    colours : a (2^b, 3) numpy array, each row is a colour
    """
    H, W, _ = img.shape

    # collect all pixels into an (H*W, 3) array
    z = []
    for h in range(img.shape[0]):
        for w in range(img.shape[1]):
            z.append(img[h][w])
    z = np.array(z)

    model = Kmeans(k=2**self.b)
    model.fit(z)
    y = model.predict(z)

    # store the cluster centres as the colour palette and return the
    # cluster indices reshaped back to the image grid
    self.colours = model.means
    quantized_img = np.reshape(y, (H, W))
    return quantized_img
def quantize(self, X):
    N, D, C = X.shape
    X_reshaped = np.reshape(X, (N * D, C))

    model = Kmeans(np.power(2, self.b))
    model.fit(X_reshaped)
    y = np.reshape(model.predict(X_reshaped), (N, D))

    self.means = model.means
    self.y = y
    self.X = X
def closure_1_3_2():
    minErrs = []
    for k in range(1, 11):
        best_model = None
        min_error = np.inf
        for i in range(50):
            model = Kmeans(k)
            model.fit(X)
            error = model.error(X)
            if error < min_error:
                min_error = error
                best_model = model
        minErrs.append(min_error)

    plt.figure()
    plt.plot(list(range(1, 11)), minErrs)
    plt.xlabel('k')
    plt.ylabel('Error')
    plt.title('k-means training error as k increases')

    fname = os.path.join("..", "figs", "kmeans_err_k_outliers.png")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)
def main():
    km = Kmeans(tc.init_board_gauss(nb_points, nb_classe, mini, maxi, ecart_min, ecart_max),
                nb_cluster=nb_cluster, cpu=cpu, methode_dist=methode_dist, adr=img_dir)
    km.run_global(choose_nb_graph=True, grphq=True)
    km.save(km_path)
    print("\n{}".format(km))
    return None
def main():
    df = bdd.date_dir(path)
    idx, mtx = bdd.df2np(df)
    del df

    km = Kmeans(mtx, nb_cluster=nb_cluster, cpu=cpu, methode_dist=methode_dist,
                adr=img_dir, index=idx)
    km.run_global(grphq=True, choose_nb_graph=True)
    km.save(km_path)
    print("\n{}".format(km))
    return None
def main():
    df = bdd.concat_dir(path)
    df = bdd.drop_profile(df, drop_var)
    df = bdd.bdd2bow(df)
    idx, mtx = bdd.df2np(df)
    col = df.columns.values.astype(str)
    del df

    km = Kmeans(mtx, nb_cluster=nb_cluster, cpu=cpu, methode_dist=methode_dist,
                adr=img_dir, index=idx)
    km.run_global(choose_nb_graph=True)
    bdd.print_means_words(km, col)
    km.save(km_path)
    print("\n{}".format(km))
    return None
from kmeans import Kmeans
import numpy as np

data = np.array([[1.9, 1.5, 0.4, 0.4, 0.1, 0.2, 2.0, 0.3, 0.1],
                 [2.3, 2.5, 0.2, 1.8, 0.1, 1.8, 2.5, 1.5, 0.3]])
codes = 3
km = Kmeans(data, codes)
print 'Class labels = ', km.label
print('Due to the random initialization, different (wrong) labels\nare often returned')

x = np.array([0.25, 2.0])
print km.classify(x)
#km.label = lab2
km.plot()
print('Verify the answer using the graph.')

# Specify the initial cluster means.
codes = np.array([data[:, 0], data[:, 2], data[:, 3]]).T
km = Kmeans(data, codes)
print 'Clusters = ', km.cluster
print 'Class labels = ', km.label
print 'Normalizing frequencies...',
stdout.flush()

# Don't modify the original set
for i, doc in enumerate(parser.docset):
    normalize(doc, parser.words, idf)
    print i

gc.collect()
print 'done'

for chooser in [choose_initial]:  # choose_initial_pp, choose_initial
    for k in [10]:
        errors = []
        print '\nStemming words: %s' % stem
        print 'Using IDF: %s' % idf
        print 'Running with %d centroids' % k
        if chooser is choose_initial:
            print 'Chooser: normal'
        else:
            print 'Chooser: plusplus'
        stdout.flush()

        for _ in xrange(13):
            kmeans = Kmeans(parser.docset, k, distance, calc_centroid,
                            chooser, tol=0.001)
            clusters = get_clusters(kmeans.result(), parser.docset)
            freqs = get_docs_frequencies(clusters)
            errors.append(sum(calc_error(freqs)))

        print 'Error mean: %d and median: %d' % \
            (numpy.mean(errors), numpy.median(errors))
        gc.collect()