def getKmeans(a, k, threshold=1, iter=40, thresh=1e-05, minit="random", missing="warn"):
    """input  : a, k, threshold
    output : atk
    """
    if minit == "matrix":
        seeds, k = k, len(k)
    a.k = k
    # initialise (could move it to __init__ but not bothered for the moment)
    height, width = a.matrix.shape
    pixels = a.matrix > threshold
    print "width, height:", width, height              # debug
    print "sum of relevant pixels:", sum(sum(pixels))   # debug
    dataPoints = [[(i, j) for i in range(width) if pixels[j, i]] for j in range(height)]
    dataPoints = sum(dataPoints, [])
    dataPoints = np.array(dataPoints)
    print dataPoints[:20]
    if minit == "matrix":
        a.centroids = kmeans2(data=dataPoints, k=seeds, iter=iter, thresh=thresh,
                              minit=minit, missing=missing)
    else:
        a.centroids = kmeans2(data=dataPoints, k=k, iter=iter, thresh=thresh,
                              minit=minit, missing=missing)
    a.data = dataPoints
    resultPattern = ma.zeros((height, width))
    resultPattern.mask = True
    resultPattern.fill_value = -999
    for i in range(len(dataPoints)):
        resultPattern[dataPoints[i][1], dataPoints[i][0]] = a.centroids[1][i]
    resultPattern = dbz(name="Clustering for %s with %d clusters" % (a.name, k + 1),
                        matrix=resultPattern, vmin=0, vmax=k)
    atk = {"centroids": a.centroids, "data": a.data, "pattern": resultPattern}
    return atk

def _discover_centroids(self, dataset_input):
    self.centroids, labels = kmeans2(dataset_input, self.n_centroids)
    while np.unique(labels).shape[0] != self.n_centroids:
        # print "Empty cluster found. Retrying kmeans.."
        self.centroids, labels = kmeans2(dataset_input, self.n_centroids)
    return (self.centroids, labels)

def RunClustering(self, N, vector, K0):
    data = vector.reshape(N**2, 3)
    import scipy.cluster.vq as vq
    resmap, indexmap = vq.kmeans2(data, K0, iter=50, minit='random')
    newresmap, indexmap = vq.kmeans2(data, resmap, iter=50, minit='matrix')
    self.indexmap = indexmap.reshape(N, N)
    self.CheckTopology(N)

def _init_responsibilities(self, data):
    '''
    Initialise responsibilities via k-means clustering.
    '''
    a_1 = np.asarray(data.a['normal'], dtype=np.float64)
    b_1 = np.asarray(data.b['normal'], dtype=np.float64)
    p_1 = a_1 / (a_1 + b_1)

    a_2 = np.asarray(data.a['tumour'], dtype=np.float64)
    b_2 = np.asarray(data.b['tumour'], dtype=np.float64)
    p_2 = a_2 / (a_2 + b_2)

    shape = (data.nrows, 9)
    responsibilities = np.zeros(shape)

    init_centers = np.array((1., 0.5, 0.))
    cluster_centers_1, labels_1 = kmeans2(p_1, init_centers, minit='matrix')
    cluster_centers_2, labels_2 = kmeans2(p_2, init_centers, minit='matrix')

    labels = 3 * labels_1 + labels_2

    for id in range(9):
        index = labels == id
        responsibilities[index, id] = 1.

    self.responsibilities = responsibilities

def kMeansCluster(x, k, trials):
    """kMeansCluster performs k-means clustering on a dataset

    :param x: a data object (must contain field 'data')
    :type x: dict
    :param k: the number of centroids to cluster to
    :type k: int
    :param trials: the number of times to run kmeans2 (it will be run with both
        'random' and 'points' initialisation; the better of the two is used)
    :type trials: int
    :returns: a dictionary with keys idx and cents. idx is the group number for
        each protein (in the order given in the x data object); cents is a list
        of row vectors with the centroids for each cluster
    """
    data = x['data']
    centsR, idxR = scv.kmeans2(data.copy(), k, iter=trials, minit='random')
    centsP, idxP = scv.kmeans2(data.copy(), k, iter=trials, minit='points')
    distR = calcDistortion(centsR, idxR, data)
    distP = calcDistortion(centsP, idxP, data)
    if distR > distP:
        centsR = centsP
        idxR = idxP
        distR = distP
    return {'idx': idxR, 'cents': centsR}

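# Usage sketch for kMeansCluster above; illustration only, not part of the original
# module. The real calcDistortion helper is not shown in the snippet, so a plausible
# version (sum of squared distances to the assigned centroid) is assumed here, and
# scv is assumed to be scipy.cluster.vq.
import numpy as np
import scipy.cluster.vq as scv

def calcDistortion(cents, idx, data):
    # total squared distance from each point to its assigned centroid
    return float(np.sum((data - cents[idx]) ** 2))

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    x = {'data': np.vstack([rng.normal(0, 1, (50, 2)),
                            rng.normal(5, 1, (50, 2))])}
    result = kMeansCluster(x, k=2, trials=10)
    print(result['cents'])      # the 2 centroids (better of 'random'/'points' init)
    print(result['idx'][:10])   # cluster index for the first 10 rows of x['data']
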
def cluster(dataArray):
    warnings.filterwarnings('error')
    bestKmeans = None
    # Gross code to handle warning from numpy for an empty cluster
    while bestKmeans is None:
        try:
            bestKmeans, bestMapping = kmeans2(dataArray, 5)
        except:
            pass
    minDB = DaviesBouldinIndex(bestKmeans, bestMapping, dataArray).getDBindex()
    for numClusters in range(5, 11):
        kmeans = None
        while kmeans is None:
            try:
                kmeans, mapping = kmeans2(dataArray, numClusters)
            except:
                pass
        # print "Valid cluster created with numClusters:%i." % numClusters
        db = DaviesBouldinIndex(kmeans, mapping, dataArray).getDBindex()
        if db < minDB:
            minDB = db
            bestKmeans = kmeans
            bestMapping = mapping
    return bestKmeans, minDB, bestMapping

def test_kmeans2_simple(self):
    initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
    code = initc.copy()
    code1 = kmeans2(X, code, iter=1)[0]
    code2 = kmeans2(X, code, iter=2)[0]

    assert_array_almost_equal(code1, CODET1)
    assert_array_almost_equal(code2, CODET2)

def test_kmeans2_simple(self):
    initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
    for tp in np.array, np.matrix:
        code1 = kmeans2(tp(X), tp(initc), iter=1)[0]
        code2 = kmeans2(tp(X), tp(initc), iter=2)[0]

        assert_array_almost_equal(code1, CODET1)
        assert_array_almost_equal(code2, CODET2)

def test_kmeans2_rank1(self):
    data = TESTDATA_2D
    data1 = data[:, 0]

    initc = data1[:3]
    code = initc.copy()
    kmeans2(data1, code, iter=1)[0]
    kmeans2(data1, code, iter=2)[0]

def train(self, white=False):
    '''
    each call to train changes everything
    '''
    if white:
        self.centroids, self.labels = kmeans2(whiten(self.X), self.K,
                                              minit='random', missing='warn')
    else:
        self.centroids, self.labels = kmeans2(self.X, self.K,
                                              minit='random', missing='warn')

def test_kmeans2_empty(self):
    """Ticket #505."""
    try:
        kmeans2([], 2)
        raise AssertionError("This should not succeed.")
    except ValueError, e:
        # OK, that's what we expect
        pass

def test_kmeans2_simple(self):
    """Testing simple call to kmeans2 and its results."""
    initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
    code = initc.copy()
    code1 = kmeans2(X, code, iter=1)[0]
    code2 = kmeans2(X, code, iter=2)[0]

    assert_array_almost_equal(code1, CODET1)
    assert_array_almost_equal(code2, CODET2)

def test_kmeans2_rank1(self):
    data = np.fromfile(DATAFILE1, sep=", ")
    data = data.reshape((200, 2))
    data1 = data[:, 0]

    initc = data1[:3]
    code = initc.copy()
    kmeans2(data1, code, iter=1)[0]
    kmeans2(data1, code, iter=2)[0]

def test_kmeans2_rank1(self):
    """Testing simple call to kmeans2 with rank 1 data."""
    data = np.fromfile(DATAFILE1, sep=", ")
    data = data.reshape((200, 2))
    data1 = data[:, 0]
    data2 = data[:, 1]

    initc = data1[:3]
    code = initc.copy()
    code1 = kmeans2(data1, code, iter=1)[0]
    code2 = kmeans2(data1, code, iter=2)[0]

def test_kmeans_lost_cluster(self):
    # This will cause kmeans to have a cluster with no points.
    data = np.fromfile(DATAFILE1, sep=", ")
    data = data.reshape((200, 2))
    initk = np.array([[-1.8127404, -0.67128041],
                      [2.04621601, 0.07401111],
                      [-2.31149087, -0.05160469]])

    kmeans(data, initk)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", UserWarning)
        kmeans2(data, initk, missing="warn")

    assert_raises(ClusterError, kmeans2, data, initk, missing="raise")

def test_kmeans_lost_cluster(self):
    # This will cause kmeans to have a cluster with no points.
    data = TESTDATA_2D
    initk = np.array([[-1.8127404, -0.67128041],
                      [2.04621601, 0.07401111],
                      [-2.31149087, -0.05160469]])

    kmeans(data, initk)
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        kmeans2(data, initk, missing='warn')

    assert_raises(ClusterError, kmeans2, data, initk, missing='raise')

def gencode(image, k, oldcenters=None):
    t1 = time.time()
    npix = image.size / 3
    P = np.reshape(image, (npix, 3), order='F')
    Pw = vq.whiten(P)
    if oldcenters is None:
        (centers, label) = vq.kmeans2(Pw, k, iter=30)
    else:
        (centers, label) = vq.kmeans2(Pw, oldcenters, iter=5)
    (code, distortion) = vq.vq(Pw, centers)
    code = np.reshape(code, image.shape[0:2], order='F')
    print time.time() - t1
    return code, centers

def kmeans(self, id, k=5, is_row=True):
    """
    K-means clustering. http://en.wikipedia.org/wiki/K-means_clustering

    Clusters the (column) values of a given row, or vice versa

    :param id: row (or col) id to cluster its values
    :param k: number of clusters
    :param is_row: is param *id* a row (or a col)?
    :type is_row: Boolean
    """
    # TODO: switch to Pycluster?
    # http://pypi.python.org/pypi/Pycluster
    if VERBOSE:
        sys.stdout.write('Computing k-means, k=%s, for id %s\n' % (k, id))
    point = None
    if is_row:
        point = self.get_matrix().get_row(id)
    else:
        point = self.get_matrix().get_col(id)
    points = []
    points_id = []
    for i in point.nonzero_entries():
        label = point.label(i)
        points_id.append(label)
        if not is_row:
            points.append(self.get_matrix().get_row(label))
        else:
            points.append(self.get_matrix().get_col(label))
    #return kmeans(array(points), k)
    if VERBOSE:
        sys.stdout.write('id %s has %s points\n' % (id, len(points)))
    M = array(points)
    MAX_POINTS = 150
    # Only apply matrix initialization if num. points is not that big!
    if len(points) <= MAX_POINTS:
        centers = self._kinit(array(points), k)
        centroids, labels = kmeans2(M, centers, minit='matrix')
    else:
        centroids, labels = kmeans2(M, k, minit='random')
    i = 0
    clusters = dict()
    for cluster in labels:
        if not clusters.has_key(cluster):
            clusters[cluster] = dict()
            clusters[cluster]['centroid'] = centroids[cluster]
            clusters[cluster]['points'] = []
        clusters[cluster]['points'].append(points_id[i])
        i += 1
    return clusters

def test_kmeans_lost_cluster(self):
    # This will cause kmeans to have a cluster with no points.
    data = TESTDATA_2D
    initk = np.array([[-1.8127404, -0.67128041],
                      [2.04621601, 0.07401111],
                      [-2.31149087, -0.05160469]])

    with suppress_warnings() as sup:
        sup.filter(UserWarning,
                   "One of the clusters is empty. Re-run kmean with a different initialization")
        kmeans(data, initk)
        kmeans2(data, initk, missing='warn')

    assert_raises(ClusterError, kmeans2, data, initk, missing='raise')

def seq_cluster(traj, seq_len, stride, K):
    '''
    Put several frames together to be clustered as a sequence

    Args:
    - traj: the trajectory object
    - seq_len: length of each sequence
    - stride: steps for moving the sequence window
    - K: number of clusters

    Return: labels
    '''
    # get flattened coordinates of each frame
    coords = traj.xyz
    coords = np.reshape(coords, (len(coords), -1))
    # print np.shape(coords)

    # compute the covariance of each sequence as the features
    seqs = []
    for i in xrange(0, len(coords), stride):
        # covariance matrix of the coordinates
        covm = np.cov(np.transpose(coords[i:i + seq_len]))
        # print np.shape(covm)
        seqs.append(np.diag(covm))
    # print np.shape(seq_data)

    centroids, labels = kmeans2(np.asarray(seqs), K, iter=100)

    # test clustering consistency using classification
    clf = KNeighborsClassifier()  # knn works best for alanine coords
    data_scores = cross_val_score(clf, seqs, labels, cv=5)
    print("Accuracy with 5 folds: %0.2f (+/- %0.2f)" % (data_scores.mean(), data_scores.std()))

    return labels

def _init_responsibilities(self, data):
    '''
    Initialise responsibilities via k-means clustering.
    '''
    shape = (data.nrows, self.ncomponents)
    responsibilities = np.zeros(shape)

    labels = {}
    for genome in constants.genomes:
        a = np.asarray(data.a[genome], dtype=np.float64)
        b = np.asarray(data.b[genome], dtype=np.float64)
        d = a + b
        p = a / d

        init_centers = np.linspace(1, 0, self.nclass[genome])
        clustering_result = kmeans2(p, init_centers, minit='matrix')
        labels[genome] = clustering_result[1]
        print "Initial class centers : ", clustering_result[0]

    labels = self.nclass['normal'] * labels['normal'] + labels['tumour']

    for id in range(self.ncomponents):
        indices = (labels == id)
        responsibilities[indices, id] = 1.

    self.responsibilities = responsibilities

def _init_kmeans(self, num_comp):
    """Initialize using k-means"""
    (init_mean, labels) = kmeans2(self.data, num_comp)
    init_covar = self._get_covar(self.data, labels)
    init_mixweights = element_weights(labels)
    return (init_mean, labels, init_covar, init_mixweights)

def kmeans(dataset, n_cluster=625):
    from scipy.cluster.vq import kmeans2, whiten
    feature_matrix = numpy.asarray(dataset)
    whitened = whiten(feature_matrix)
    _, cluster_labels = kmeans2(whitened, n_cluster, iter=100)
    return cluster_labels

def find_freq_clusters(freqs):
    # first make a histogram
    minf, maxf = freqs.min(), freqs.max()
    maxbins = 8  # related to the max colors defined...
    df = 4.0  # MHz
    if (maxf - minf) < df:
        # Only a single freq to our resolution
        return [[0.0, 'inf']]
    numbins = int((maxf - minf) / df) + 2
    lobound = minf - 0.5 * df
    hibound = lobound + numbins * df
    hist, edges = _np.histogram(freqs, numbins, [lobound, hibound])
    # Now choose the maxbins biggest bins where there are TOAs
    hibins = hist.argsort()[::-1]
    hibins = hibins[hist[hibins] > 0]
    if len(hibins) > maxbins:
        hibins = hibins[:maxbins]
    ctrs = edges[hibins] + 0.5 * df
    ctrs.sort()
    # and use these as starting points for kmeans
    kmeans, indices = kmeans2(freqs, ctrs)
    if len(kmeans) == 1:
        return [[0.0, 'inf']]
    elif len(kmeans) == 2:
        return [[0.0, kmeans.mean()], [kmeans.mean(), 'inf']]
    else:
        freqbands = [[0.0, kmeans[0:2].mean()]]
        for ii in range(len(kmeans) - 2):
            freqbands.append([kmeans[ii:ii + 2].mean(), kmeans[ii + 1:ii + 3].mean()])
        freqbands.append([kmeans[-2:].mean(), 'inf'])
        return freqbands

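# Usage sketch for find_freq_clusters above; illustration only. It assumes numpy is
# imported as _np and kmeans2 is in scope, as in the snippet. The input is a synthetic
# array of observing frequencies in MHz with two well-separated groups; the result is
# a list of [low, high] frequency bands whose edges fall between the cluster centers,
# with 'inf' as the upper edge of the last band.
import numpy as _np

freqs = _np.concatenate([_np.random.normal(430.0, 0.5, 40),
                         _np.random.normal(1400.0, 0.5, 60)])
print(find_freq_clusters(freqs))
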
def clustering2(img, clusters):
    "another clustering method - no major differences"
    # Reshaping image into a list of pixels to allow the kmeans algorithm
    # From 1792x1792x3 to 1792^2 x 3 pixels
    pixels = np.reshape(img, (img.shape[0] * img.shape[1], 3))
    centroids, _ = kmeans2(pixels, clusters, iter=3, minit='random')
    #print ("Centroids : ", centroids.dtype, centroids.shape, type(centroids))
    #print centroids

    # quantization
    # Assigns a code from a code book to each observation
    # code : A length N array holding the code book index for each observation.
    # dist : The distortion (distance) between the observation and its nearest code.
    code, _ = vq(pixels, centroids)
    #print ("Code : ", code.dtype, code.shape, type(code))
    #print code

    # reshaping the result of the quantization
    reshaped = np.reshape(code, (img.shape[0], img.shape[1]))
    #print ("reshaped : ", reshaped.dtype, reshaped.shape, type(reshaped))
    clustered = centroids[reshaped]
    #print ("clustered : ", clustered.dtype, clustered.shape, type(clustered))
    #scatter3D(centroids)
    return clustered

def train_classifier(train_inds, dict_size=300, shuffle=False):
    # load OFH descriptors from training videos from all-but-two classes
    train_action_n, train_video_n, train_descs, train_labels = load_actions(actions[train_inds])

    # cluster and quantize to produce BoW descriptors
    print 'clustering...'
    print 'train_descs:', train_descs.shape
    if path.exists(path.join(savedir, 'clusters.npy')):
        clusters = np.load(path.join(savedir, 'clusters.npy'))
        cluster_inds = np.load(path.join(savedir, 'cluster_inds.npy'))
    else:
        clusters, cluster_inds = vq.kmeans2(train_descs, dict_size, iter=20, minit='points')
        np.save(path.join(savedir, 'clusters.npy'), clusters)
        np.save(path.join(savedir, 'cluster_inds.npy'), cluster_inds)

    if shuffle:
        random.shuffle(train_labels)

    # produce quantized histograms for each training video
    print 'quantizing...'
    f = path.join(savedir, 'train_hists.npy')
    if path.exists(f):
        train_hists = np.load(f)
    else:
        train_hists = get_desc_hists(clusters, train_descs, train_video_n)
        np.save(f, train_hists)

    # linearly regress for each attribute based on manually produced labels
    print 'training regressors...'
    cls = lin_reg.train(train_hists, train_labels)

    return clusters, cls

def test_kmeans2_rank1_2(self):
    """Testing simple call to kmeans2 with rank 1 data."""
    data = np.fromfile(DATAFILE1, sep=", ")
    data = data.reshape((200, 2))
    data1 = data[:, 0]

    code1 = kmeans2(data1, 2, iter=1)

def find_color(image, rargs):
    MAX_SIZE = 250
    priority = (1, 1.7, 1.8)
    NUM_CLUSTERS = 4
    if "p1" in rargs:
        priority = (float(rargs["p1"]), float(rargs["p2"]), float(rargs["p3"]))
    if "clusters" in rargs:
        NUM_CLUSTERS = int(rargs["clusters"])
    if image.size[0] > MAX_SIZE:
        resize_factor = image.size[0] / MAX_SIZE
        image = image.resize((MAX_SIZE / 8, MAX_SIZE / 8), Image.BICUBIC)
    image_data = list(image.getdata())
    image_data = map(lambda x: rgb_to_hsv(*x), image_data)
    np_array = np.asarray(image_data) * priority
    clusters = vq.kmeans2(np_array, NUM_CLUSTERS, minit="points")[0]
    clusters /= priority
    out_colors = []
    for color in clusters:
        rgb = colorsys.hsv_to_rgb(*color)
        rgb = tuple([int(255 * x) for x in rgb])
        out_colors.append("%02x%02x%02x" % rgb)
    return out_colors

def scipy_labels(data, clusters, nReps):
    # run scipy.cluster.vq.kmeans2 on data using the initial clusters;
    # the number of iterations is one less than used for mpi, since the
    # starting clusters are the result after one mpi iteration
    codebook, dist = kmeans2(data, clusters, nReps, 1e-6)
    labels, dist = vq(data, codebook)
    return labels, codebook

def computeClustering(data, k=k, textureFolder=textureFolder):
    t0 = time.time()
    outputFolder = textureFolder  # self-reminding alias
    height, width, depth = data.shape
    data = data.reshape(height * width, depth)
    clust = kmeans2(data=data, k=k, iter=10, thresh=1e-05,
                    minit='random', missing='warn')
    # output to textureFolder
    try:
        os.makedirs(textureFolder)
    except:
        """don't crash. fail gracefully or not at all"""
        print 'folder already exists!'
        os.rename(textureFolder, textureFolder[:-1] + 'pre' + str(timestamp))
        os.makedirs(textureFolder)
    texturelayer = []
    for i in range(k):
        print i
        texturelayer.append((clust[1] == i).reshape(881, 921))
        #plt.imshow(cluster[i])
        #plt.show()
        if texturelayer[i].sum() == 0:
            continue
        pic = dbz(name='texture layer' + str(i),
                  matrix=np.flipud(texturelayer[i]),
                  vmin=-2, vmax=1,
                  imagePath=textureFolder + '/texturelayer' + str(i) + '.png')
        #pic.show()
        pic.saveImage()
    timespent = time.time() - t0
    print "time spent:", timespent
    pickle.dump({'content': texturelayer,
                 'notes': "%d texture layers from 'armor/filter/gaborFilterVectorField.pydump' " % k},
                open(textureFolder + '/texturelayer.pydump', 'w'))
    return clust, texturelayer

def initkmeans(data, k):
    d = data.shape[1]
    # XXX: This initialization can be better
    (code, label) = kmeans2(data, data[:k], 5, minit='matrix')
    w = np.ones(k) / k
    mu = code.copy()
    va = np.zeros((k, d))
    for c in range(k):
        for i in range(d):
            va[c, i] = np.cov(data[np.where(label == c), i], rowvar=0)
    return w, mu, va

def classify_embeddings(embeddings, support_labels, support_embeddings, actual_labels=None):
    num_centroids = len(set(support_labels))
    centroid, labels = kmeans2(embeddings, num_centroids, minit='points')

    # labeling the centroids with knn on test embeddings
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(support_embeddings, support_labels)
    predicted_centroid_labels = knn.predict(centroid)

    if actual_labels is not None:
        translated_labels = translate_labels(actual_labels, labels, predicted_centroid_labels)
        draw_embeddings_cluster('comp_num_vs_text.png', embeddings, translated_labels, centroid)

    return [predicted_centroid_labels[label] for label in labels]

def kmeans_clust(vecs, words, K):
    if VERBOSE:
        print 'Running kmeans!'
    if np.mean(vecs[:, -1] == 1) == 1:
        # exclude the column of 1s
        vex = deepcopy(vecs[:, :-1])
    else:
        vex = deepcopy(vecs)
    # normalise (kmeans uses euclidean distance, so this is required for cosine)
    vex /= np.linalg.norm(vex, axis=1).reshape(-1, 1)
    centroids, cluster_assignments = kmeans2(vex, K)
    assignments = pd.DataFrame({'word': words, 'cluster': cluster_assignments})
    csizes, indices = eval_assignments(assignments, K, None)
    return assignments, csizes, indices

def init_inducing_points(X, m):
    """
    initialize m inducing points by using k-means on X
    inputs:
        X : data points
        m : number of clusters
    """
    seed = int(np.abs(X.flatten()[0]))
    numpy_rand_state = np.random.get_state()
    np.random.seed(seed)
    Z_init = kmeans2(X, k=m)[0]
    np.random.set_state(numpy_rand_state)
    return Z_init

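# Usage sketch for init_inducing_points above; illustration only. It assumes numpy is
# imported as np and scipy.cluster.vq.kmeans2 is in scope, as in the snippet. Seeding
# from the data and then restoring the RNG state makes the inducing locations
# reproducible for a given X without disturbing the global random stream.
import numpy as np

X = np.random.randn(500, 2)
Z = init_inducing_points(X, m=20)
print(Z.shape)  # (20, 2): one k-means centroid per inducing point
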
def _init_posterior(self, obs):
    """
    Initialize posterior parameters
    """
    nmix = self._nstates
    nobs, ndim = obs.shape
    # initialize hidden states
    self.z = np.ones((nobs, nmix)) / float(nmix)
    # initialize mixing coefficients
    self.pi = np.ones(nmix) / float(nmix)
    # initialize mean vectors with K-Means clustering
    self.mu, temp = vq.kmeans2(obs, nmix)
    # initialize covariance matrices with sample covariance matrix
    self.cv = np.tile(np.atleast_2d(np.cov(obs.T)), (nmix, 1, 1))

def gen_codebook(k_means, codebook_file):
    out_vector = run_get_train_vector()
    logger.info('Finished get train vector')
    out_vector = vq.whiten(out_vector)
    codebook, distortion = vq.kmeans2(out_vector, k=k_means, minit='++')
    # with open('C:\\Users\\TienHai\\Desktop\\iDT\\run_LLC\\output\\test_codebook.txt', 'wb') as f:
    #     np.savetxt(f, codebook, fmt='%7f', delimiter='\t')
    logger.info('Finished gen codebook')
    with open(codebook_file, 'wb') as f:
        np.savetxt(f, codebook, fmt='%7f', delimiter='\t')

def __init__(self, kern, outputs, n_inducing, fixed_mean, X):
    self.inputs, self.outputs, self.kernel = kern.input_dim, outputs, kern
    self.M, self.fixed_mean = n_inducing, fixed_mean
    self.Z = tf.Variable(kmeans2(X, self.M, minit='points')[0],
                         dtype=tf.float64, name='Z')
    if self.inputs == outputs:
        self.mean = np.eye(self.inputs)
    elif self.inputs < self.outputs:
        self.mean = np.concatenate([np.eye(self.inputs),
                                    np.zeros((self.inputs, self.outputs - self.inputs))],
                                   axis=1)
    else:
        _, _, V = np.linalg.svd(X, full_matrices=False)
        self.mean = V[:self.outputs, :].T
    self.U = tf.Variable(np.zeros((self.M, self.outputs)),
                         dtype=tf.float64, trainable=False, name='U')

def __init__(self, kernel, d_out, n_inducing, X):
    self.d_in, self.d_out = kernel.d, d_out
    self.kernel = kernel
    self.n_inducing = n_inducing
    self.Z = tf.Variable(kmeans2(X, self.n_inducing)[0], dtype=tf.float64)
    self.mean = np.zeros((self.d_in, self.d_out))
    for i in range(min(self.d_in, self.d_out)):
        self.mean[i, i] = 1
    self.U = tf.Variable(np.zeros((self.n_inducing, self.d_out)),
                         dtype=tf.float64, trainable=False)

def classifyChunks(self, chType):
    if chType == Chunk.LFH:
        chunks = [i for i in self.fileHeaders]
    elif chType == Chunk.CD:
        chunks = [i for i in self.CDHeaders]
    number = len(self.zipFiles)
    # Eliminate chunks with invalid datetime.
    chunks = [c for c in chunks if c.last_mod_datetime is not None]
    centroids, classes = kmeans2([i.sig_vector() for i in chunks], number, minit='points')
    silos = [[j[1] for j in zip(classes, chunks) if j[0] == i] for i in range(number)]
    return zip(silos, centroids)

def subcluster(cluster):
    data = getvalidrows(cluster.waves)
    try:
        c, l = vq.kmeans2(data, k, it)
    except:
        l = np.array([i % k for i in range(np.shape(data)[0])])
    result = []
    for i in range(k):
        mask = np.tile(l != i, (np.shape(cluster.waves)[1], 1)).T
        fmask = np.tile(True, np.shape(cluster.waves))
        fmask[~cluster.waves.mask[:, 0]] = mask
        waves = np.ma.masked_array(cluster.waves, fmask)
        result.append(wavecluster(waves, cluster, cluster.label))
    cluster.subclusters = result

def select_Z_mbs(nZ, mbs, XP_tr):
    """Select inducing point locations with k-means from the training data
    XP_tr, and the minibatch size.

    n_tr = number of training points. If nZ < 1, there will be nZ * n_tr
    inducing points; otherwise there will be nZ inducing points. The same
    applies to the minibatch size mbs, except that if mbs = 0 or mbs > n_tr,
    mbs is set to n_tr."""
    n_tr = XP_tr.shape[0]
    if nZ < 1:
        nZ = int(np.ceil(nZ * n_tr))
    Z = kmeans2(XP_tr, nZ, minit='points')[0]
    if mbs == 0 or mbs > n_tr:
        mbs = n_tr  # use all data
    elif mbs < 1:
        mbs = int(np.ceil(mbs * n_tr))
    return Z, mbs

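# Usage sketch for select_Z_mbs above; illustration only. It assumes numpy is imported
# as np and scipy.cluster.vq.kmeans2 is in scope, as in the snippet. nZ < 1 is treated
# as a fraction of the training set; mbs = 0 means "use all data".
import numpy as np

XP_tr = np.random.randn(1000, 4)
Z, mbs = select_Z_mbs(nZ=0.05, mbs=0, XP_tr=XP_tr)
print(Z.shape)  # (50, 4): 5% of the 1000 training points become inducing locations
print(mbs)      # 1000
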
def _set_posterior(self, obs, use_emgmm=False):
    nobs = len(obs)
    nmix = self._nstates
    # hidden states
    self.z = dirichlet(np.tile(1.0 / nmix, nmix), nobs)
    # mixing coefficients
    self.u = np.tile(self._u0, nmix)
    # posterior mean vector
    self.m, temp = vq.kmeans2(obs, nmix)
    self.beta = np.tile(self._beta0, nmix)
    # posterior degree of freedom
    self.nu = np.tile(float(nobs) / nmix, nmix)
    # posterior precision
    self.s = np.tile(self._s0, nmix)

def kMeansCluster(self):
    """
    Creates a simple 5-segment k-means cluster based on a small number of
    league parameters. The top-ranked teams are then given a k-means score of
    1.0, the middle ranks 0.5, and the losers are assigned 0.0. Due to random
    fluctuation the same team may fall into a nearby group.
    """
    # Put the teams into a numpy array
    clusterList = []
    # List to record the order of the teams
    kMeansTeams = []
    # Check each team has more than min games
    for index, team in enumerate(self.leagueTable):
        points = team.getPoints()
        recentForm = team.getForm(5)
        probWin = team.getProbWin()
        goalsFor = team.getGF()
        kMeansTeams.append(team.getTeamName())
        row = [points, recentForm, probWin, goalsFor]
        clusterList.append(row)
    # Get the cluster array
    clusterArray = np.array(clusterList)
    # Normalize this array
    rows, cols = clusterArray.shape
    for col in xrange(cols):
        clusterArray[:, col] /= abs(clusterArray[:, col]).max()
    # Randomize over several kmeans
    res, groupIDs = kmeans2(clusterArray, 5)
    # Set each team's k-means score
    index = 0
    topGroup = groupIDs[0]
    bottomGroup = groupIDs[-1]
    for groupID in groupIDs:
        if groupID == topGroup:
            self.setKMeans(kMeansTeams[index], 1.0)
        elif groupID == bottomGroup:
            self.setKMeans(kMeansTeams[index], 0.0)
        else:
            self.setKMeans(kMeansTeams[index], 0.5)
        index += 1

def get_walkers_cluster(N, bounds_fcn, samples=None, data=None):
    walkers = []
    if samples is not None:
        # Get a bunch of samples at random
        cand_samp_idx = np.random.randint(low=0, high=samples.shape[0], size=50000)
        cand_samp = samples[cand_samp_idx]
        # Cluster this subset of samples and evaluate likelihood
        k = N
        cents, lab = kmeans2(cand_samp, k)
        csps = []
        for cs in cents:
            csp = lnlike(cs, data)
            csps.append(csp)
        # Then sort by lnlike
        srt_idx = range(len(csps))
        srt_idx.sort(key=csps.__getitem__, reverse=True)
        srt_samp = map(cents.__getitem__, srt_idx)
        # Keep first N (implicitly, dropping N-k clusters)
        walkers = srt_samp[0:N / 2]
        i = 0
        M = samples.shape[1]
        for w in srt_samp[0:N / 2]:
            w2 = np.zeros(M)
            for j in range(M):
                w2[j] = w[j] + np.random.normal(scale=np.abs(w[j]) * .01)
            walkers.append(w2)
        for w in walkers:
            pp = lnlike(w, data)
            print "Walker ", i, " at ", w, " lnlike = ", pp
            i += 1
    else:
        lb, ub = bounds_fcn()
        print "D0", lb[0], ub[0]
        print "K0", lb[1], ub[1]
        print "D1", lb[2], ub[2]
        print "K1", lb[3], ub[3]
        for i in range(N):
            this_walker = []
            for l, u in zip(lb, ub):
                t = np.random.uniform()
                this_walker.append(l + t * (u - l))
            walkers.append(np.array(this_walker))
    return walkers

def process_template(template, k=3, use_kmeans=True):
    """Process timecourse template into time bins."""
    df = pd.read_csv(template)
    df_copy = df.copy(deep=True)
    df.drop('plate_well_neuron', 1, inplace=True)
    ordered_columns = np.argsort(np.asarray([int(x) for x in df.columns]))
    mat_df = df.as_matrix()
    mat_df = mat_df[:, ordered_columns]
    raveled_mat = mat_df.ravel()
    # raveled_mat = raveled_mat[np.isnan(raveled_mat) == 0]
    raveled_mat[raveled_mat == 0] = np.nan
    masked_data = raveled_mat[np.isnan(raveled_mat) == 0]
    if use_kmeans:
        bin_lengths, groups = kmeans2(masked_data, k, iter=10000)
        fixed_groups = np.zeros((raveled_mat.shape)) * np.nan
        fixed_groups[np.isnan(raveled_mat) == 0] = groups
        groups = fixed_groups
    else:
        sorted_inds = np.argsort(masked_data)
        group_ids = np.array_split(sorted_inds, k)
        groups = np.zeros((len(raveled_mat)))
        for idx, g in enumerate(group_ids):
            for gr in g:
                groups[gr] = idx
        bin_lengths = np.asarray(
            [raveled_mat[sorted_inds == x] for x in range(k)]).ravel()
    sort_idx = np.argsort(bin_lengths)
    sorted_groups = np.zeros((len(groups)), dtype=int) * np.nan
    for idx, g in enumerate(groups):
        if not np.isnan(g):
            sorted_groups[idx] = sort_idx[int(g)]
    print 'Timecourse group means: %s' % np.sort(bin_lengths)
    group_maps = {
        k: v for k, v in zip(raveled_mat, sorted_groups) if not np.isnan(v)
    }
    group_maps[0.0] = np.nan
    proc_mat = np.zeros((mat_df.shape))
    for r in range(proc_mat.shape[0]):
        for c in range(proc_mat.shape[1]):
            entry = mat_df[r, c]
            if np.isnan(entry):
                proc_mat[r, c] = entry
            else:
                proc_mat[r, c] = group_maps[entry]
    proc_columns = [str(x) for x in ordered_columns]
    proc_df = pd.DataFrame(proc_mat, columns=proc_columns)
    proc_df['plate_well_neuron'] = df_copy['plate_well_neuron']
    return proc_df

def svgp(args, dataloader, test_x, kernel=None):
    N = len(dataloader.dataset)
    inducing_points, _ = kmeans2(dataloader.dataset.train_x.numpy(),
                                 args.n_inducing, minit='points')
    inducing_points = torch.from_numpy(inducing_points).squeeze(-1)
    model = SVGP(inducing_points, kernel)
    # p(y|f)
    likelihood = GaussianLikelihood()

    model.train()
    likelihood.train()
    optimizer = optim.Adam([{'params': model.parameters()},
                            {'params': likelihood.parameters()}],
                           lr=args.learning_rate)
    mll = VariationalELBO(likelihood, model, N, combine_terms=False)

    for epoch in range(args.n_iters):
        for train_x, train_y in dataloader:
            train_x, train_y = train_x.squeeze(), train_y.squeeze()
            optimizer.zero_grad()
            output = model(train_x)
            log_ll, kl_div, log_prior = mll(output, train_y)
            loss = -(log_ll - kl_div + log_prior)
            loss.backward()
            optimizer.step()
        if epoch % 50 == 0:
            print("Iter {}, lower bound = {:.4f}, obs_var = {:.4f}".format(
                epoch, -loss.item(), likelihood.noise.item()))

    test_stats = TestStats(None, None)
    model.eval()
    likelihood.eval()
    with torch.no_grad():
        observed_pred = likelihood(model(test_x))
        test_y_mean = observed_pred.mean
        test_y_var = observed_pred.variance
        test_stats = test_stats._replace(test_y_mean=test_y_mean,
                                         test_y_var=test_y_var)
    return test_stats

def kmeans2(d, headers=None, K=None, whiten=True):
    if 'numpy' in str(type(d)):
        A = d
    else:
        A = d.get_data(headers)

    if whiten:
        W = vq.whiten(A)
    else:
        W = A

    codebook, bookerror = vq.kmeans2(W, K)
    codes, errors = vq.vq(W, codebook)
    return codebook, codes, errors

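# Usage sketch for the kmeans2 wrapper above; illustration only. It assumes numpy is
# imported as np and scipy.cluster.vq is imported as vq, as in the snippet. Passing a
# plain numpy array means the headers argument is ignored.
import numpy as np

A = np.random.rand(200, 3)
codebook, codes, errors = kmeans2(A, K=4)
print(codebook.shape)  # (4, 3): centroids in whitened coordinates
print(codes[:10])      # nearest-centroid index per row
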
def surf_img(img):
    imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    surf = cv2.SURF()
    surf.extended = True
    surf.hessianThreshold = 1000
    kp, des = surf.detectAndCompute(imgray, None)
    features = np.asarray(des)
    centroid, label = kmeans2(features, cluster_n, iter=10, thresh=1e-05,
                              minit='random', missing='warn')
    return features, centroid, label, kp

def PFA(df):
    corrMat = df.corr()
    eigen_values, eigen_vectors = np.linalg.eig(corrMat)

    # Using kmeans2 to get the cluster centroids and an array which assigns
    # each eigenvector row to a cluster
    centroids, labels = kmeans2(eigen_vectors[:, :7], 7)
    clusterVectors = [[] for i in range(7)]
    count = 0
    for i in labels:
        clusterVectors[i].append(count)
        count = count + 1

    # Getting vectors closest to each cluster centroid
    closest, _ = pairwise_distances_argmin_min(centroids, eigen_vectors[:, :7])
    return closest, clusterVectors

def kmeans(img):
    height, width, channels = img.shape
    lab_img = color.rgb2lab(img.astype(np.float32) / 255)
    ab_img = lab_img[:, :, 1:3].flatten()
    ab_img.shape = (ab_img.size / 2, 2)

    cluster_count = 2
    centroid, clusters = kmeans2(whiten(ab_img), cluster_count)
    clusters = 255 * clusters
    clusters.shape = (height, width)

    sumBorders = sum(clusters[0, :]) + sum(clusters[:, 0]) + \
        sum(clusters[-1, :]) + sum(clusters[:, -1])
    if sumBorders / (2 * (height + width)) > 127:
        clusters = 255 - clusters

    mask = np.array(clusters, dtype=np.uint8)
    return mask

def form_groups(points, estimated_size=10, iter=1):
    if len(points) < 1:
        return []
    points = array(points)
    centroids, variance = kmeans2(points, estimated_size, iter=iter, minit='points')
    group_indicies, dist = vq(points, centroids)
    group = {}
    for i, index in enumerate(group_indicies):
        if index not in group:
            group[index] = []
        group[index].append(points[i])
    return group.values()

def surf_img(img1):
    print "-> calculating SURF"
    # Calculate SURF descriptors, and apply the kmeans algorithm to create clusters
    surf = cv2.SURF()
    surf.extended = True
    #kp = surf.detect(img1)
    kp, descript = surf.detectAndCompute(img1, None)
    descriptors = np.asarray(descript)
    centroid, label = kmeans2(descriptors, cluster_n, iter=10, thresh=1e-05,
                              minit='random', missing='warn')
    return descript, centroid, label, kp

def init_layers(X, dims_in, dims_out, M, final_inducing_points,
                share_inducing_inputs):
    q_mus, q_sqrts, mean_functions, Zs = [], [], [], []
    X_running = X.copy()

    for dim_in, dim_out in zip(dims_in[:-1], dims_out[:-1]):
        if dim_in == dim_out:
            # identity for same dims
            W = np.eye(dim_in)
        elif dim_in > dim_out:
            # use PCA mf for stepping down
            _, _, V = np.linalg.svd(X_running, full_matrices=False)
            W = V[:dim_out, :].T
        elif dim_in < dim_out:
            # identity + pad with zeros for stepping up
            I = np.eye(dim_in)
            zeros = np.zeros((dim_out - dim_in, dim_in))
            W = np.concatenate([I, zeros], 0).T

        mean_functions.append(Linear(A=W))
        Zs.append(kmeans2(X_running, M, minit='points')[0])
        if share_inducing_inputs:
            q_mus.append([np.zeros((M, dim_out))])
            q_sqrts.append([np.eye(M)[:, :, None] * np.ones((1, 1, dim_out))])
        else:
            q_mus.append([np.zeros((M, 1))] * dim_out)
            q_sqrts.append([np.eye(M)[:, :, None] * np.ones((1, 1, 1))] * dim_out)

        X_running = X_running.dot(W)

    # final layer (as before but no mean function)
    mean_functions.append(Zero())
    Zs.append(kmeans2(X_running, final_inducing_points, minit='points')[0])
    q_mus.append([np.zeros((final_inducing_points, 1))])
    q_sqrts.append([np.eye(final_inducing_points)[:, :, None] * np.ones((1, 1, 1))])

    return q_mus, q_sqrts, Zs, mean_functions

def spectral_clustering(G, k):
    GU = G.to_undirected()
    A = nx.adjacency_matrix(GU).toarray()

    # Create degree matrix
    D = np.diag(np.sum(A, axis=0))
    # Create Laplacian matrix
    L = D - A

    eigval, eigvec = np.linalg.eig(L)  # Calculate eigenvalues and eigenvectors
    eigval = eigval.real  # Keep the real part
    eigvec = eigvec.real  # Keep the real part

    idx = eigval.argsort()   # Get indices of sorted eigenvalues
    eigvec = eigvec[:, idx]  # Sort eigenvectors according to eigenvalues
    Y = eigvec[:, :k]        # Keep the first k vectors

    centroids, labels = kmeans2(Y, k)
    return labels

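# Usage sketch for spectral_clustering above; illustration only. It assumes networkx
# is imported as nx and numpy/kmeans2 are in scope, as in the snippet. The karate
# club graph is a small built-in example graph.
import networkx as nx

G = nx.karate_club_graph()
labels = spectral_clustering(G, 2)
print(labels[:10])  # one cluster id (0 or 1) per node
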
def spectral(X, n_clusters=3, verbose=False):
    m = len(X)
    labels = np.zeros((m, 1))
    simi_matrix = build_simi_matrix(X)
    d_matrix = np.sum(simi_matrix, axis=1)
    d2 = np.sqrt(1 / d_matrix)
    d2 = np.diag(d2)
    lap_matrix = np.dot((np.dot(d2, simi_matrix)), d2)
    U, s, V = np.linalg.svd(lap_matrix, full_matrices=True)
    kerN = U[:, m - n_clusters + 1:]
    for i in range(m):
        kerN[i, :] = kerN[i, :] / np.linalg.norm(kerN[i, :])
    _, labels = kmeans2(kerN, n_clusters, iter=100)
    return labels

def run_and_publish_kmeans(self, data):
    kmeans_output = clusteralgos.kmeans2(data, self.num_agents)
    center_points = kmeans_output[0]
    '''
    The following two lines are for debugging.
    They make sure node0 gets all the features.
    '''
    # data_mean = np.average(data, 0)
    # center_points = np.array([data_mean, np.array([-999 for i in range(128)]), np.array([-999 for i in range(128)])])
    center_points_flattened = center_points.flatten()
    msg = Feature()
    msg.data = center_points_flattened
    msg.header.frame_id = str(self.node_id)
    self.pub_kmeans.publish(msg)

def _init_posterior(self, obs):
    """
    Initialize posterior parameters
    """
    nmix = self._nstates
    nobs, ndim = obs.shape
    avr_N = float(nobs) / float(nmix)

    # parameters of posterior mixing coefficients
    self._u = np.ones(nmix) * (self._u0 + avr_N)

    # parameters of posterior precision matrices
    self._nu = np.ones(nmix) * (self._nu0 + avr_N)
    self._V = np.tile(np.array(self._V0), (nmix, 1, 1))

    # parameters of posterior mean vectors
    self._beta = np.ones(nmix) * (self._beta0 + avr_N)
    self._m, temp = vq.kmeans2(obs, nmix)  # initialize by K-Means

def _find_orientation(x, y, eye_color_index=4):
    """Find the orientation of the face."""
    old = np.seterr(all='raise')
    try:
        eyes, _ = vq.kmeans2(x[y == eye_color_index], 2)
    except:
        return 0
    finally:
        np.seterr(**old)
    eye_line = eyes[0] - eyes[1]
    rad = np.arctan2(*eye_line) + np.pi / 2
    eyes_rot = _rot_mat(rad).dot(eyes.T).T
    if eyes_rot[0, 1] < 0:
        rad += np.pi
    return rad

def performKMeansClustering(vector_matrix):
    kmeans_results = dict()
    whitened = whiten(vector_matrix)
    std_devs = numpy.std(vector_matrix, axis=0)
    for k in range(2, 11):
        centroids, labels = kmeans2(whitened, k, minit='points')
        kmeans_results[k] = {
            'centroids': centroids.tolist(),
            'labels': labels.tolist()
        }
        for i, centroid in enumerate(kmeans_results[k]['centroids']):
            for j, val in enumerate(centroid):
                kmeans_results[k]['centroids'][i][j] = \
                    val * std_devs[j]
    return kmeans_results

def fit(self, X, Y):
    Z = kmeans2(X, self.ARGS.num_inducing, minit='points')[0] \
        if X.shape[0] > self.ARGS.num_inducing else X.copy()

    if not self.model:
        # NB mb_size does not change once the model is created
        mb_size = self.ARGS.minibatch_size if X.shape[0] >= self.ARGS.minibatch_size else None

        if self.K == 2:
            lik = gpflow.likelihoods.Bernoulli()
            num_latent = 1
        else:
            lik = gpflow.likelihoods.MultiClass(self.K)
            num_latent = self.K

        kern = gpflow.kernels.RBF(X.shape[1], lengthscales=float(X.shape[1]) ** 0.5)
        self.model = gpflow.models.SVGP(X, Y, kern, lik,
                                        feat=Z,
                                        whiten=False,
                                        num_latent=num_latent,
                                        minibatch_size=mb_size)

        self.opt = gpflow.train.AdamOptimizer(self.ARGS.adam_lr)
        self.sess = self.model.enquire_session()
        iters = self.ARGS.iterations
    else:
        iters = self.ARGS.small_iterations

    # we might have new data
    self.model.X.assign(X, session=self.sess)
    self.model.Y.assign(Y, session=self.sess)
    self.model.feature.Z.assign(Z, session=self.sess)

    num_outputs = self.model.q_sqrt.shape[0]
    self.model.q_mu.assign(np.zeros((self.ARGS.num_inducing, num_outputs)), session=self.sess)
    self.model.q_sqrt.assign(np.tile(np.eye(self.ARGS.num_inducing)[None], [num_outputs, 1, 1]),
                             session=self.sess)

    self.opt.minimize(self.model, maxiter=iters, session=self.sess)