def run(self):
    """Drive Spectral Co-Clustering end to end.

    Normalizes the word-by-document matrix A, takes the SVD of the
    normalized matrix, builds the Z matrix from the singular vectors, and
    k-means clusters the columns of Z to produce co-clusters.

    Returns:
        dict with keys 'centroids', 'centroiddict', 'clusters',
        'clusterdict' describing the k-means result.
    """
    self.nfeatures = self.A.shape[0]
    self.ndocs = self.A.shape[1]
    self.logger.debug("Word By Documentmatrix A has dim:(%d,%d)",
                      self.nfeatures, self.ndocs)
    self.logger.debug("Generating normalized Adjacency Matrix, A_n")
    self.gen_An()
    self.logger.debug("Finding SVD of An")
    un, s, vnt = spla.svd(self.An.todense())
    self.logger.debug('Shape of un (%d,%d)', un.shape[0], un.shape[1])
    vn = vnt.T
    # BUG FIX: previously logged vn.shape[1] twice instead of (rows, cols).
    self.logger.debug('Shape of vn (%d,%d)', vn.shape[0], vn.shape[1])
    self.logger.debug("Generating Z matrix")
    self.get_Z(un, vn)
    data = (self.Z.T).tocsc()
    # BUG FIX: the local was previously named `kmeans`, which shadowed the
    # imported module and made `kmeans.KMeans(...)` raise UnboundLocalError.
    km_impl = kmeans.KMeans(data, self.k, self.n, self.delta, self.rc,
                            self.cl, self.verbose)
    result = km_impl.run()
    self.centroids = result['centroids']
    self.centroid_dict = result['centroiddict']
    self.clusters = result['clusters']
    self.cluster_dict = self._get_cluster_dict()
    self.logger.debug('Number of co-clusters produced: %d',
                      len(self.clusters))
    return {'centroids': self.centroids,
            'centroiddict': self.centroid_dict,
            'clusters': self.clusters,
            'clusterdict': self.cluster_dict}
def test_duplicate_dual_kmeans():
    """Two groups of identical duplicated points must split into two clusters."""
    data = np.array([
        [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
        [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1],
    ])
    impl = kmeans.KMeans(2, max_iter, rstate)
    impl.Fit(data)
    r_labels = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
    i_labels = impl.GetLabels(data.shape[0])
    # Plain assert: comparing a boolean result to True is redundant.
    assert t.check_clusters(r_labels, i_labels, 2)
def train(self, data, labels):
    """Fit the RBF network: k-means centers, per-class covariances, then a
    least-squares (normal-equation) solve for the output weights.

    Returns the k-means centers.
    """
    # K-means locates the hidden-unit centers.
    km = kmeans.KMeans(self.k)
    self.centers = km(data, error_target=.001)

    # One covariance estimate per distinct label.
    self.covariances = {}
    for lab in set(labels):
        samples = np.array([d for d, l in zip(data, labels) if l == lab])
        self.covariances[lab] = self.estimate_cov(np.transpose(samples, [1, 0]))
    # Per-sample covariance lookup, in label order.
    self.center_covs = [self.covariances[lab] for lab in labels]

    # Plot
    self.plot_stuff(data, self.centers)

    # Forward pass: hidden-layer activations for every sample.
    self.X = np.zeros((len(data), self.k))
    for row_idx, sample in enumerate(data):
        activations = np.zeros(self.k)
        for unit_idx, center in enumerate(self.centers):
            activations[unit_idx] = self.rbf(sample, center,
                                             self.center_covs[unit_idx])
        self.X[row_idx] = activations.copy()

    # Backward pass: normal-equation solve (X^T X)^-1 X^T y.
    gram_inv = np.linalg.inv(np.dot(self.X.T, self.X))
    self.hl_weights = np.dot(np.dot(gram_inv, self.X.T), labels)
    return self.centers
def TestLaws(mgnames, NJ=100):
    """Extract Laws-texture jets from a set of images and cluster them.

    Args:
        mgnames: sequence of image arrays (each 2-D).
        NJ: number of random jets sampled per image.

    Returns:
        (clust, cffs): the k-means cluster centers and their PCA mapping.
    """
    # Create the 25 Laws texture filters.
    filts = texture.BuildLawsFilters()
    # Allocate for jets.
    NI = len(mgnames)  # number of images
    jets = np.zeros((NJ * NI, 25))
    for i in range(NI):  # FIX: xrange is Python-2-only
        data = mgnames[i] + 0
        # FIX: map() returns an iterator in Python 3; materialize to index it.
        corrs = [correlate2d(data, f) for f in filts]
        for j in range(25):
            # FIX: the loop previously smoothed corrs[i] (image index) instead
            # of corrs[j] (filter index), leaving 24 responses unsmoothed.
            corrs[j] = cspline2d(abs(corrs[j]), 200)
        corrs = np.array(corrs)
        # Extract random jets at shuffled pixel coordinates.
        V, H = data.shape
        # FIX: np.random.shuffle needs a mutable sequence, not a range.
        vs = list(range(V))
        hs = list(range(H))
        np.random.shuffle(vs)
        np.random.shuffle(hs)
        for j in range(NJ):
            jets[i * NJ + j] = corrs[:, vs[j], hs[j]]
    # K-means clustering of the jets, then PCA for visualization.
    clust, mmb = kmeans.KMeans(NI, jets)
    cffs, evecs = pca.PCA(clust, 3)
    cffs = pca.Map2PCA(clust, evecs)
    gnu.Save('Laws_results.txt', cffs)
    return clust, cffs
def train(self, data, labels):
    """Fit an RBF network with a single shared sigma per hidden unit.

    Centers come from k-means; the output weights are solved in closed
    form via the normal equations. Returns the center array.
    """
    # K-means locates the hidden-unit centers.
    km = kmeans.KMeans(self.k)
    self.centers = np.array(km(data, error_target=.001))

    # One sigma estimate, replicated across all k hidden units.
    self.sigma = np.repeat(self.estimate_sigma(self.centers), self.k)

    # Forward pass: hidden activations per sample.
    self.X = np.zeros((len(data), self.k))
    for row_idx, sample in enumerate(data):
        activations = np.zeros(self.k)
        for unit_idx, center in enumerate(self.centers):
            activations[unit_idx] = self.rbf(sample, center,
                                             self.sigma[unit_idx])
        self.X[row_idx] = activations.copy()

    # Backward pass: closed-form least squares (X^T X)^-1 X^T y.
    xtx_inv = np.linalg.inv(np.dot(self.X.T, self.X))
    self.hl_weights = np.dot(np.dot(xtx_inv, self.X.T), labels)
    return self.centers
def word_clusters_neighbours(w2v, word, n_senses, *, window):
    """Cluster the 100 nearest w2v neighbours of *word* into n_senses groups.

    Returns (words, km): the neighbour words as an array and the fitted
    KMeans object (cosine metric).
    """
    assert window
    neighbours = [w for w, _ in w2v.most_similar(positive=[word], topn=100)]
    words = np.array(neighbours)
    vectors = np.array([w2v[w] for w in neighbours])
    km = kmeans.KMeans(vectors, k=n_senses, metric='cosine', verbose=0)
    return words, km
def test_large_kmeans():
    """A single blob of 1000 samples must collapse into one cluster."""
    data, r_labels = datasets.make_blobs(n_samples=1000, centers=1)
    impl = kmeans.KMeans(1, max_iter, rstate)
    impl.Fit(data)
    i_labels = impl.GetLabels(data.shape[0])
    # Plain assert: comparing a boolean result to True is redundant.
    assert t.check_clusters(r_labels, i_labels, 1)
def test_clear_blobs_kmeans():
    """Two well-separated blobs must map to two clusters."""
    centers = ((-5, -5), (5, 5))
    data, r_labels = datasets.make_blobs(n_samples=100, centers=centers)
    impl = kmeans.KMeans(2, max_iter, rstate)
    impl.Fit(data)
    i_labels = impl.GetLabels(data.shape[0])
    # Plain assert: comparing a boolean result to True is redundant.
    assert t.check_clusters(r_labels, i_labels, 2)
def test_dimensionality_kmeans():
    """Clustering must work on 16-dimensional data, not just 2-D."""
    data, r_labels = datasets.make_blobs(n_samples=288, n_features=16,
                                         cluster_std=0.2, random_state=31)
    impl = kmeans.KMeans(3, max_iter, rstate)
    impl.Fit(data)
    # Plain assert: Yoda-style comparison to True is redundant.
    assert t.check_clusters(r_labels, impl.GetLabels(data.shape[0]), 3)
def fit(self, x):
    """Scan cluster counts above mincluster and keep whichever estimator
    maximizes the Dunn index.

    Returns the best estimator found (may be the initial one).
    """
    self.dunn = _dunn_index(x, self.estimator, self.estimator.metric)
    for n_clusters in range(self.mincluster + 1, self.maxcluster):
        candidate = km.KMeans(n_clusters, metric=self.estimator.metric).fit(x)
        score = _dunn_index(x, candidate, self.estimator.metric)
        # Keep the candidate only if it strictly improves the index.
        if score > self.dunn:
            self.dunn = score
            self.estimator = candidate
    return self.estimator
def test_kmeans_get_colors(self):
    """Tests the get_colors method from the KMeans class."""
    # Fibonacci-flavoured pixel triples.
    pixels = [(0, 1, 1), (2, 3, 5), (8, 13, 21), (34, 55, 89)]
    clusterer = kmeans.KMeans(pixels, 4)
    extracted = clusterer.get_colors()
    # Centroids are picked randomly, so the colour order may differ;
    # compare the sorted lists instead.
    self.assertEqual(sorted(pixels), sorted(extracted))
def test_simple_single_kmeans():
    """A single point must form its own single cluster."""
    data = np.array([
        [1, 1],
    ])
    impl = kmeans.KMeans(1, max_iter, rstate)
    impl.Fit(data)
    r_labels = np.array([0])
    i_labels = impl.GetLabels(data.shape[0])
    # Plain assert: comparing a boolean result to True is redundant.
    assert t.check_clusters(r_labels, i_labels, 1)
def test_large_values_kmeans():
    """Clustering must stay numerically stable for coordinates near +/- 2**25."""
    max_int = 1 << 25
    data, r_labels = datasets.make_blobs(n_samples=144, centers=(
        (max_int, max_int),
        (0, 0),
        (-max_int, -max_int),
    ))
    impl = kmeans.KMeans(3, max_iter, rstate)
    impl.Fit(data)
    # Plain assert: Yoda-style comparison to True is redundant.
    assert t.check_clusters(r_labels, impl.GetLabels(data.shape[0]), 3)
def k_means_away(fit_predict, clusters, tolerance, max_it, error_loops,
                 plus_plus, file):
    """Load *file*, configure KMeans, and either fit-predict or fit-plot.

    Args:
        fit_predict: truthy -> run fit_predict; falsy -> run fit_plot.
        clusters, tolerance, max_it, error_loops, plus_plus: forwarded to KMeans.
        file: path handed to dp.prep_file.
    """
    print("Now we are clustering")
    X = dp.prep_file(file)
    k_m = km.KMeans(clusters=clusters,
                    tolerance=tolerance,
                    k_plus_plus=plus_plus,
                    max_iterations=max_it,
                    verification_loops=error_loops)
    if not fit_predict:
        # Fit and plot axes 0/1 in the 0..1 window.
        k_m.fit_plot(X, 0, 1, 0, 1)
    else:
        # Fit and report predictions.
        k_m.fit_predict(X)
    del k_m
def word_clusters_ctx(w2v, word, n_senses, min_weight=1.5, min_count=10, *,
                      window):
    """Cluster frequent, well-weighted context words of *word* by w2v vector.

    A context word qualifies when it occurs at least min_count times, has a
    weight above min_weight, and exists in the w2v vocabulary. Returns
    (words, km) where words is an array of the selected context words.
    """
    weights, contexts = utils.weights_contexts(word, window)
    counts = Counter(w for ctx in contexts for w in ctx)
    words = [
        w for w, cnt in counts.items()
        if cnt >= min_count and weights.get(w, 0) > min_weight and w in w2v
    ]
    print(len(words))
    vectors = np.array([w2v[w] for w in words])
    km = kmeans.KMeans(vectors, k=n_senses, metric='cosine', verbose=0)
    return np.array(words), km
def test_double_fit_kmeans():
    """Refitting the same estimator on new data must replace the old fit."""
    data1, r_labels1 = datasets.make_blobs(n_samples=288, centers=6,
                                           random_state=31)
    data2, r_labels2 = datasets.make_blobs(n_samples=288, centers=6,
                                           random_state=73)
    impl = kmeans.KMeans(6, max_iter, rstate)
    impl.Fit(data1)
    t1 = t.check_clusters_with_allowance(r_labels1,
                                         impl.GetLabels(data1.shape[0]), 6, .01)
    impl.Fit(data2)
    t2 = t.check_clusters_with_allowance(r_labels2,
                                         impl.GetLabels(data2.shape[0]), 6, .01)
    # Separate plain asserts: no '== True', and a failure pinpoints which
    # of the two fits regressed.
    assert t1
    assert t2
for unique in unique_elements: if unique not in text_digit_vals: text_digit_vals[unique] = x x += 1 df[col] = list(map(convert_to_int, df[col])) return df df = handle_non_numerical_data(df) X = np.array(df.drop(['survived'], 1).astype(float)) X = preprocessing.scale(X) y = np.array(df['survived']) #classifier = KMeans(n_clusters=2) classifier = kmeans.KMeans() classifier.fit(X) #labels = classifier.labels_ correct = 0 for i in range(len(X)): predict = np.array(X[i].astype(float)) predict = predict.reshape(-1, len(predict)) prediction = classifier.predict(predict) if prediction == y[i]: correct += 1 print("Accuracy: ", 1.0 * correct / len(X))
import numpy as np
import matplotlib.pyplot as plt
import kmeans

np.random.seed(0)

# Three Gaussian clouds: one at the origin, two shifted away from it.
points1 = np.random.randn(50, 2)
points2 = np.random.randn(50, 2) + np.array([5, 0])
points3 = np.random.randn(50, 2) + np.array([5, 5])
points = np.r_[points1, points2, points3]
np.random.shuffle(points)

# Fit a 3-cluster model to the shuffled points.
model = kmeans.KMeans(3)
model.fit(points)

# Draw each fitted cluster with its own marker.
markers = ["+", "*", "o"]
for idx, marker in enumerate(markers):
    members = points[model.labels_ == idx, :]
    plt.scatter(members[:, 0], members[:, 1], color="k", marker=marker)
plt.show()
import kmeans
import numpy as np
import sys
from sklearn.cluster import KMeans

# Custom (C++-backed) KMeans instance.
# NOTE(review): the positional args (3, 10, 1) presumably mean
# (clusters, iterations, something) -- confirm against the binding.
k = kmeans.KMeans(3, 10, 1)

# Three 5x4 integer blocks offset by +/-100 to form well-separated groups.
a = np.arange(20).reshape(5, 4) + 100
b = np.arange(20).reshape(5, 4)
c = np.arange(20).reshape(5, 4) - 100
a = np.concatenate((a, b, c))
print('np array:')
print(a[:5])
print('...')
print(a[-5:])

# Reference clustering with scikit-learn for comparison.
skk = KMeans(3)
skk.fit(a)
print('sklearns centroids:')
print(skk.cluster_centers_)

print('fitting c++ cluster')
k.Fit(a)

# Deliberately wrongly-shaped buffer to exercise the binding's validation.
print('trying to get clusters using invalid array')
clusters = np.ndarray((1, 9), dtype=float)
k.GetClusters(clusters)

print('c++ centroids:')
# Correctly-shaped (3 centroids x 4 features) output buffer.
clusters = np.ndarray((3, 4), dtype=float)
offspring2.append(father[0]) offspring2.append(mother[1]) offspring2.append(mother[2]) #child1 = mother[0] + father[1] + father[2] #child2 = father[0] + mother[1] + mother[2] print("child1: ", offspring1) print("child2: ", offspring2) cluster.populasi.append(offspring1) cluster.populasi.append(offspring2) jumlahChild += 2 print("jumlahChild: ", jumlahChild) return jumlahChild cluster = kmeans.KMeans('iris.csv', 3) make_populasi() fitness = [] for i in range(0, 50): evaluasi_populasi(i) cluster.kClustering() centroid, sse, akurasi = cluster.pengelompokanData() eval = centroid, sse, akurasi fitness.append(eval) #print("Data cluster: ", cluster.data) print("Chromosome: ", centroid) print("SSE chromosome: %.2f" % sse) print("AKurasi chromosome: %.2f" % akurasi) print("fitness: ", fitness) sortedFitness = sorted(fitness, key=itemgetter(2), reverse=True) print("sortedFitness: ", sortedFitness)
# ORB keypoint detector with `l` features per image.
# NOTE(review): `l`, `k`, `kp_list_h`, `kp_list_p` are defined earlier in
# the file (outside this view).
orb = cv2.ORB_create(nfeatures=l)
# Collect keypoint coordinates from the "H" images 1..40.
for i in range(1, 41):
    img = cv2.imread('processed_dataset/H%d.png' % i, 0)
    kp = orb.detect(img, None)
    kp_list_h.append(cv2.KeyPoint_convert(kp))
# Collect keypoint coordinates from the "P" images 52..91.
for i in range(52, 92):
    img = cv2.imread('processed_dataset/P%d.png' % i, 0)
    kp = orb.detect(img, None)
    kp_list_p.append(cv2.KeyPoint_convert(kp))
# Flatten the per-image keypoint arrays into single point clouds.
kp_list_h = np.concatenate(kp_list_h)
kp_list_p = np.concatenate(kp_list_p)
# Reduce each cloud to its k cluster centers.
cp, cluster = km.KMeans(kp_list_h, k)
cp2, cluster2 = km.KMeans(kp_list_p, k)
kp_list_h = cp
kp_list_p = cp2
# Evaluate the held-out "H" images 41..51 against both center sets.
for i in range(41, 52):
    img = cv2.imread('processed_dataset/H%d.png' % i, 0)
    kp = orb.detect(img, None)
    points = cv2.KeyPoint_convert(kp)
    n = points.shape[0]
    # Accumulators for distances and counts toward each class.
    dist_h = 0
    dist_p = 0
    nh = 0
    np0 = 0
from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt import random, math, kmeans print kmeans.KMeans elements = [] # Random elements for i in range(150): x = random.randint(0, 255) y = random.randint(0, 255) z = random.randint(0, 255) elements.append([x, y, z]) km = kmeans.KMeans(3, elements) Kcentroids = km.centroids print 'Cluster list', Kcentroids check_stop = True while check_stop: check_stop = km.step() #plot points #plt.plot(kmeans.get_column_values(elements,0),kmeans.get_column_values(elements,1), 'bs') for kcluster in Kcentroids: print 'Centroid:', kcluster print 'Position: ', kcluster.position print 'Elements: ', kcluster.elements for element in kcluster.elements:
def train(self, data, labels, alpha=.01, epochs=100, batch_size=50,
          dw_target=.01, save_config=False):
    """Train the RBF network with stochastic mini-batch gradient updates.

    Hidden-unit centers come from k-means and per-class covariances from
    the labelled data; the output weights are then refined by gradient
    steps, keeping the best (lowest |mean batch error|) weights seen.

    Args:
        data: indexable collection of training samples.
        labels: training targets, one per sample.
        alpha: learning rate forwarded to self.backward.
        epochs: maximum number of training epochs.
        batch_size: samples drawn with replacement per epoch.
        dw_target: stop early when the epoch's weight-change norm is below this.
        save_config: when True, persist the model via self.save_config().

    Returns:
        (epochs_error, centers): per-epoch mean batch error and the centers.
    """
    # Hidden layer output buffer.
    self.iota = np.zeros((self.k, self.o))
    # Output weights start as uniform noise scaled by the label range.
    self.hl_weights = np.random.uniform(-max(labels), max(labels),
                                        size=(self.k, self.o))
    # Init centers and covariances
    # K-means to find the hidden-unit centers.
    km = kmeans.KMeans(self.k)
    self.centers = km(data, error_target=.001)
    # Estimate one covariance matrix per distinct label.
    uniqs = list(set(sorted(labels)))
    self.covariances = {}
    for lab in uniqs:
        data_class = np.array(
            [d for d, l in zip(data, labels) if l == lab])
        self.covariances[lab] = self.estimate_cov(
            np.transpose(data_class, [1, 0]))
    # Per-sample covariance lookup, in label order.
    self.center_covs = []
    for lab in labels:
        self.center_covs.append(self.covariances[lab])
    best_weights = None
    best_error = np.inf
    self.plot_stuff(data, self.centers)
    ## Epochs
    last_20 = []  # NOTE(review): never appended to -- appears unused.
    epochs_error = []
    for k in tqdm(range(epochs)):
        labels_ep = []
        outs_ep = []
        # Snapshot to measure this epoch's total weight change.
        old_h1 = copy.deepcopy(self.hl_weights)
        batch_error = []
        for b in range(batch_size):
            # Sample with replacement.
            ind = np.random.randint(0, len(data))
            x = data[ind]
            y = labels[ind]
            # forward pass
            output = self.forward(x)
            labels_ep.append(y)
            outs_ep.append(output[0][0])
            # backward: one gradient step; returns the sample error.
            error = self.backward(output, y, alpha=alpha)
            batch_error.append(error)
        batch_avg_error = np.sum(batch_error) / batch_size
        # Track the best-scoring weights seen so far.
        if (abs(batch_avg_error) < best_error):
            best_error = abs(batch_avg_error)
            best_weights = self.hl_weights
        epochs_error.append(batch_avg_error)
        # Break if below error target
        if (np.linalg.norm(self.hl_weights - old_h1) < dw_target):
            break
    # Restore the best weights found during training.
    self.hl_weights = best_weights
    if (save_config):
        self.save_config()
    # # dist of outputs/labels
    # # fixed bin size
    # print('Hist of iota')
    # new_iota = [i for i in self.iota if i > 1e-5]  # take out zeros
    # print(len(new_iota))
    # bins = np.arange(-3, 3, .01)  # fixed bin size
    # plt.xlim([min(new_iota)-1, max(new_iota)+1])
    # plt.hist(new_iota, bins=bins, alpha=0.5)
    # plt.show()
    # print('Hist of outputs')
    # bins = np.arange(-5, 5, .1)  # fixed bin size
    # plt.xlim([min(outs_ep)-1, max(outs_ep)+1])
    # plt.hist(outs_ep, bins=bins, alpha=0.5)
    # plt.show()
    # # fixed bin size
    # print('Hist of weights')
    # bins = np.arange(-15, 15, .1)  # fixed bin size
    # plt.xlim([min(self.hl_weights)-1, max(self.hl_weights)+1])
    # plt.hist(self.hl_weights, bins=bins, alpha=0.5)
    # plt.show()
    print('Completed Training:', k + 1, 'epochs')
    print('Best Error:', best_error)
    return epochs_error, self.centers
def train(self, data, labels, alpha=.01, epochs=100, batch_size=50,
          dw_target=.01, save_config=False):
    """Train the RBF network (shared-sigma variant) with mini-batch updates.

    Hidden-unit centers come from k-means and a single sigma estimate is
    replicated across units; output weights are refined by gradient steps,
    keeping the best (lowest |mean batch error|) weights seen.

    Args:
        data: indexable collection of training samples.
        labels: training targets, one per sample.
        alpha: learning rate forwarded to self.backward.
        epochs: maximum number of training epochs.
        batch_size: samples drawn with replacement per epoch.
        dw_target: stop early when the epoch's weight-change norm is below this.
        save_config: when True, persist the model via self.save_config().

    Returns:
        (epochs_error, centers): per-epoch mean batch error and the centers.
    """
    # hidden layer output buffer
    self.iota = np.zeros((self.k, self.o))
    # Output weights start as uniform noise scaled by the label range.
    self.hl_weights = np.random.uniform(-max(labels), max(labels),
                                        size=(self.k, self.o))
    # Init centers
    # K-means to find the hidden-unit centers.
    km = kmeans.KMeans(self.k)
    self.centers = km(data, error_target=.001)
    # # Estimate Covariances (disabled in this shared-sigma variant)
    # uniqs = list(set(sorted(labels)))
    # self.covariances = {}
    # for lab in uniqs:
    #     data_class = np.array([d for d,l in zip(data,labels) if l==lab])
    #     self.covariances[lab] = self.estimate_cov(np.transpose(data_class,[1,0]))
    # self.center_covs = []
    # for lab in labels:
    #     self.center_covs.append(self.covariances[lab])
    # Estimate a single sigma, replicated across all k hidden units.
    sigma = self.estimate_sigma(self.centers)
    self.sigma = np.repeat(sigma, self.k)
    best_weights = None
    best_error = np.inf
    # self.plot_stuff(data, self.centers)
    ## Epochs
    last_20 = []  # NOTE(review): never appended to -- appears unused.
    epochs_error = []
    for k in tqdm(range(epochs)):
        labels_ep = []
        outs_ep = []
        # Snapshot to measure this epoch's total weight change.
        old_h1 = copy.deepcopy(self.hl_weights)
        batch_error = []
        for b in range(batch_size):
            # Sample with replacement.
            ind = np.random.randint(0, len(data))
            x = data[ind]
            y = labels[ind]
            # cov = self.center_covs[ind]
            # forward pass
            output = self.forward(x)
            labels_ep.append(y)
            outs_ep.append(output)
            # backward: one gradient step; returns the sample error.
            error = self.backward(x, output, y, alpha=alpha)
            batch_error.append(error)
        batch_avg_error = np.sum(batch_error) / batch_size
        # Track the best-scoring weights seen so far.
        if (abs(batch_avg_error) < best_error):
            best_error = abs(batch_avg_error)
            best_weights = self.hl_weights
        epochs_error.append(batch_avg_error)
        # Break if below error target
        if (np.linalg.norm(self.hl_weights - old_h1) < dw_target):
            break
    # Restore the best weights found during training.
    self.hl_weights = best_weights
    if (save_config):
        self.save_config()
    print('Completed Training:', k + 1, 'epochs')
    print('Best Error:', best_error)
    return epochs_error, self.centers
def result():
    """Flask view: parse an uploaded CSV of vectors, k-means cluster it, and
    render the result page (or error.html on bad input).

    Reads form fields c_cnt (required), method and n_init (optional, default
    2 and 30), plus a CSV file upload. Stores the detailed clustering in
    app.vars['result'].
    """
    if request.method == 'POST':
        # ------------------------
        # load in input data
        # ------------------------
        # if error, output error.html
        # get number of cluster
        try:
            c_cnt = int(request.form['c_cnt'])
        except ValueError:
            error_msg = 'Please provide the number of clusters'
            return render_template('error.html', error_msg=error_msg)
        # set default value for method and n_init if no value is set.
        try:
            method = int(request.form['method'])
            n_init = int(request.form['n_init'])
        except ValueError:
            method = 2
            n_init = 30
        # get the vector data
        # NOTE(review): upload is decoded as GBK -- confirm the expected
        # file encoding with the frontend.
        csvfile = io.TextIOWrapper(request.files['file'].stream,
                                   encoding='gbk')
        reader = csv.reader(csvfile)
        X = np.array([[float(x) for x in line] for line in reader])
        if X.shape[0] == 0:
            error_msg = "No node data has been uploaded."
            return render_template('error.html', error_msg=error_msg)
        if X.shape[0] < c_cnt:
            error_msg = "The number of cluster (k) is bigger than the number of vectors. Try a smaller k."
            return render_template('error.html', error_msg=error_msg)
        # -----------------
        # clustering all X
        # -----------------
        kmeans = kn.KMeans(c_cnt, method=method, n_init=n_init)
        kmeans.fit(X)
        # ------------------
        # output results
        # ------------------
        # the number of nodes for each cluster
        c_node_num = np.array([len(nodes) for nodes in kmeans.clusters.nodes])
        # the ordered cluster center based on the number of nodes
        centers = np.array([
            x for _, x in sorted(zip(c_node_num, kmeans.clusters.coords),
                                 key=lambda pair: pair[0])
        ])
        centers = np.round(centers, 1)
        centers = [', '.join(center.astype(str)) for center in centers]
        # save detailed clustering result, ordered the same way as centers
        app.vars['result'] = [
            x for _, x in sorted(zip(c_node_num, kmeans.clusters.nodes),
                                 key=lambda pair: pair[0])
        ]
        app.vars['result'] = [[str(node) for node in cluster]
                              for cluster in app.vars['result']]
        # sorted node number for each cluster
        # NOTE(review): this sorts the counts as strings, not numerically --
        # confirm that is intended for display.
        c_node_num = ','.join(sorted(c_node_num.astype(str)))
        return render_template('result.html',
                               c_cnt=c_cnt,
                               c_node_num=c_node_num,
                               centers=centers,
                               node_num=X.shape[0])
import kmeans
import numpy as np

if __name__ == "__main__":
    num_cluster = 20  # you can fix this
    print("With ", num_cluster, " cluster: ")
    model = kmeans.KMeans(num_cluster)
    model.load_data(data_path='./data/data_tf_idfs.txt',
                    word_idfs_path='./data/word_idfs.txt')
    # Initialize centroids several times
    # and choose the one with highest NMI (or purity).
    max_NMI = -1
    purity = -1
    best_seed = 17
    centroids = []
    print("Try to get the best seed: ")
    for time in range(5):
        # NOTE(review): seeding np.random with 17+time and then drawing a
        # randint(10) as the model seed can repeat values -- confirm intent.
        np.random.seed(17 + time)
        seed_value = np.random.randint(10)
        print("**Seed = ", seed_value)
        model.run(seed_value=seed_value, criterion='max_iters', threshold=25)
        NMI_value = model.compute_NMI()
        # Keep the best-NMI run's seed and purity.
        if NMI_value > max_NMI:
            max_NMI = NMI_value
            purity = model.compute_purity()
            best_seed = seed_value
    #evaluate clustering:
def calculateDistance(self, centroid):
    """Reset the distance vector, then return the distance between the given
    centroid's document vector and the query document vector."""
    self.distanceVector = []
    centroid_vector = self.docVectors[str(centroid)]
    return km.KMeans().findDistance(centroid_vector, self.queryDocVector)
# Benchmark: average wall-clock fit time over n runs per implementation.
# NOTE(review): `num` and `n` are defined earlier in the file (outside this
# view); `cluster` is presumably sklearn.cluster -- confirm.
blobs, _ = datasets.make_blobs(num)

# Reference KMeans.
t_total = 0.0
for i in range(n):
    c = cluster.KMeans(3)
    t0 = time.time()
    c.fit(blobs)
    t1 = time.time()
    t_total += (t1 - t0)
kref = t_total / n  # average seconds per reference KMeans fit

# Custom KMeans implementation (note the capitalized Fit).
t_total = 0.0
for i in range(n):
    c = kmeans.KMeans(3)
    t0 = time.time()
    c.Fit(blobs)
    t1 = time.time()
    t_total += (t1 - t0)
kimpl = t_total / n  # average seconds per custom KMeans fit

# Reference agglomerative clustering for comparison.
t_total = 0.0
for i in range(n):
    c = cluster.AgglomerativeClustering(3)
    t0 = time.time()
    c.fit(blobs)
    t1 = time.time()
    t_total += (t1 - t0)
aref = t_total / n  # average seconds per agglomerative fit
def test_kmeans(self):
    """Constructing KMeans with k=3 must yield a truthy instance."""
    instance = k.KMeans(3)
    self.assertTrue(instance)
import filereader
import filewriter
import kmeans as clstr
import matplotlib.pyplot as plt

# Load the raw data points from disk.
file_reader = filereader.FileReader("places.txt")
data_vector = file_reader.read_file_to_data_points()

kmeans = clstr.KMeans(data_vector, 3)

# Initial (pre-run) centers, kept for the "Start points" marker.
cluster_centers = kmeans.clusters
clusters = kmeans.cluster_points(cluster_centers)

plt.figure(figsize=(12, 5))
plt.plot(cluster_centers.get_values(1), cluster_centers.get_values(2),
         'yx', label="Start points")

# Run the algorithm, then re-fetch the converged centers and assignments.
kmeans.run()
cluster_centers = kmeans.clusters
clusters = kmeans.cluster_points(cluster_centers)

# One colour/marker per converged cluster.
for idx, fmt in enumerate(('co', 'ro', 'go')):
    plt.plot(clusters[idx].get_values(1), clusters[idx].get_values(2),
             fmt, label="Cluster %d" % (idx + 1))
plt.plot(cluster_centers.get_values(1), cluster_centers.get_values(2),
         'kx', label="Cluster centers")

plt.title("")
plt.suptitle("Cluster analysis with k-means algorithm")
plt.legend()

indexed = kmeans.cluster_numbered_index(cluster_centers)