def kmeans(tsne_model, vz, data, cat):
    num_clusters = 2
    kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                                   init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
    kmeans = kmeans_model.fit(vz)
    kmeans_clusters = kmeans.predict(vz)
    kmeans_distances = kmeans.transform(vz)
    sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    for i in range(num_clusters):
        print("Cluster %d:" % i, end='')
        for j in sorted_centroids[i, :10]:
            print(' %s' % terms[j], end='')
        print()
    tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:10000])
    output_file(cat + ".html", title="Euro 2016")
    plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="Euro 2016 (k-means)",
                            tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                            x_axis_type=None, y_axis_type=None, min_border=1)
    plot_kmeans.scatter(x=tsne_kmeans[:, 0], y=tsne_kmeans[:, 1],
                        color=colormap[kmeans_clusters][:10000],
                        source=bp.ColumnDataSource({
                            "tweet": data['text'][:10000],
                            "processed": data['processed'][:10000],
                            "cluster": kmeans_clusters[:10000]
                        }))
    hover = plot_kmeans.select(dict(type=HoverTool))
    hover.tooltips = {"tweet": "@tweet (processed: \"@processed\" - cluster: @cluster)"}
    show(plot_kmeans)
def test_predict_minibatch_dense_sparse(init):
    # check that models trained on sparse input also work for dense input at
    # predict time
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init=init,
                                 n_init=10, random_state=0).fit(X_csr)
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)
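# A minimal, self-contained sketch of the sparse-fit / dense-predict round trip that
# the test above exercises; the random data and the cluster count are illustrative assumptions.
import numpy as np
from scipy import sparse
from sklearn.cluster import MiniBatchKMeans

rng = np.random.RandomState(0)
X_dense = rng.rand(200, 5)                 # dense sample matrix
X_sparse = sparse.csr_matrix(X_dense)      # the same data in CSR form

model = MiniBatchKMeans(n_clusters=3, n_init=10, random_state=0).fit(X_sparse)
assert np.array_equal(model.predict(X_dense), model.labels_)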
def mbkm_wrapper(full_dissimilarity_matrix, n_clusters, streamlines_ids): """Wrapper of MBKM with API compatible to the Manipulator. streamlines_ids can be set or list. """ sids = np.array(list(streamlines_ids)) dissimilarity_matrix = full_dissimilarity_matrix[sids] print "MBKM clustering time:", init = 'random' mbkm = MiniBatchKMeans(init=init, n_clusters=n_clusters, batch_size=1000, n_init=10, max_no_improvement=5, verbose=0) t0 = time.time() mbkm.fit(dissimilarity_matrix) t_mini_batch = time.time() - t0 print t_mini_batch print "exhaustive smarter search of the medoids:", medoids_exhs = np.zeros(n_clusters, dtype=np.int) t0 = time.time() idxs = [] for i, centroid in enumerate(mbkm.cluster_centers_): idx_i = np.where(mbkm.labels_==i)[0] if idx_i.size == 0: idx_i = [0] tmp = full_dissimilarity_matrix[idx_i] - centroid medoids_exhs[i] = sids[idx_i[(tmp * tmp).sum(1).argmin()]] idxs.append(set(sids[idx_i].tolist())) t_exhs_query = time.time() - t0 print t_exhs_query, "sec" clusters = dict(zip(medoids_exhs, idxs)) return clusters
class DocDescriptor(object): def __init__(self, word_descriptor, n_clusters = 1000): self._n_clusters = n_clusters self._cluster = MiniBatchKMeans(n_clusters=n_clusters,verbose=1,max_no_improvement=None,reassignment_ratio=1.0) self._word_descriptor = word_descriptor def get_word_descriptor(self, img): X = get_features_from_image(img) words = [] for i in X: words.append(self._word_descriptor.transform(i)) return words def partial_fit(self, img): X = self.get_word_descriptor(img) self._cluster.partial_fit(X) def transform(self, img): X = self.get_word_descriptor(img) Y = self._cluster.predict(X) desc = [0]*self._n_clusters unit = 1.0/self._n_clusters for i in range(0, len(Y)): desc[Y[i]] += unit return desc
def _run_cluster(origin_list, cluster_num=8, batch_size=100, resize=(64, 64)):
    clf = MiniBatchKMeans(n_clusters=cluster_num, batch_size=batch_size)

    def next_batch(allfiles, batch_size):
        imgs = []
        inds = []
        for ind, (path, label) in enumerate(allfiles):
            img = Image.open(path).convert("L")
            img = img.resize(resize, Image.ANTIALIAS)
            img = np.reshape(np.array(img), (1, -1)).astype(np.float32) / 255.0
            imgs.append(img)
            inds.append(ind)
            if len(imgs) >= batch_size:
                yield inds, np.vstack(imgs)
                imgs = []
                inds = []
        if len(inds) > 0:
            # yield (not return) the final partial batch so it is not lost
            yield inds, np.vstack(imgs)

    for _, batch in next_batch(origin_list, batch_size):
        clf.partial_fit(batch)
    cluster_dict = defaultdict(list)
    for inds, batch in next_batch(origin_list, batch_size):
        Ys = clf.predict(batch)
        for y, ind in zip(Ys, inds):
            path, label = origin_list[ind]
            cluster_dict.setdefault(y, []).append((path, label))
    return cluster_dict
def correct_y(X, Y):
    # Correct wrongly assigned ZIP codes
    print "Correcting wrong ZIP codes..."
    [N, Nfeats] = X.shape
    NZIP = 857
    # use K-means clustering to make it faster
    cluster = MiniBatchKMeans(NZIP, init_size=2000, max_iter=500)
    cluster_distance = cluster.fit_transform(X)
    cluster_values = cluster.predict(X)
    clstr = np.zeros((N, 2))
    min_dist = 1000 * np.ones(NZIP)
    Y_min = np.zeros(NZIP)
    # clstr contains, for each row, the assigned cluster and the distance to its center
    for i in xrange(N):
        idx = int(cluster_values[i])
        clstr[i][0] = idx
        clstr[i][1] = cluster_distance[i][idx]
        if clstr[i][1] < min_dist[idx]:
            min_dist[idx] = clstr[i][1]
            Y_min[idx] = Y[i]
    counter = 0
    for i in xrange(N):
        idx = int(clstr[i][0])
        if (clstr[i][1] < 1.5) & (int(Y[i] / 1000) == int(Y_min[idx] / 1000)):
            Y[i] = Y_min[idx]
            counter += 1
    print "%s ZIP codes corrected." % counter
    return Y
class ClusteringEnsemble(BaseEstimator):
    def __init__(self, estimator_const=LinearRegression, n_clusters=2):
        self.estimator_const_ = estimator_const
        self.n_clusters_ = n_clusters
        self.clustering = MiniBatchKMeans(n_clusters=self.n_clusters_)

    def get_params(self, deep=True):
        return {"n_clusters": self.n_clusters_}

    def fit(self, X, y):
        print("Training KMeans")
        colors = self.clustering.fit_predict(X).reshape(X.shape[0])
        print("Training Estimators")
        # each estimator is assigned to one cluster
        self.estimators = [self.estimator_const_() for i in range(self.n_clusters_)]
        for i in range(self.n_clusters_):
            rows = colors == i
            self.estimators[i].fit(X[rows], y[rows])

    def predict(self, X):
        y = np.zeros(X.shape[0])
        print("Predicting clusters")
        colors = self.clustering.predict(X)
        print("Estimating results")
        for i in range(self.n_clusters_):
            rows = colors == i
            y[rows] = self.estimators[i].predict(X[rows])
        return y
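# A small, hedged usage sketch for the ClusteringEnsemble above; the synthetic
# regression data and sizes are illustrative assumptions, not part of the original.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X_demo, y_demo = make_regression(n_samples=500, n_features=4, noise=0.1, random_state=0)
ensemble = ClusteringEnsemble(estimator_const=LinearRegression, n_clusters=2)
ensemble.fit(X_demo, y_demo)
print("train MSE:", np.mean((ensemble.predict(X_demo) - y_demo) ** 2))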
def clusterSurfFeatures(surf_all_hist, n_clusters): # all_hists = [] for imagename in surf_all_hist: all_hists.append(surf_all_hist[imagename]) # X_train_surf_features = np.concatenate(all_hists) # print 'Clustering', len(X_train_surf_features), 'features (k=' + str(n_clusters) + ')' estimator = MiniBatchKMeans(n_clusters=n_clusters) estimator.fit_transform(X_train_surf_features) # final_features = {} for imagename in surf_all_hist: instance = surf_all_hist[imagename] # clusters = estimator.predict(instance) features = np.bincount(clusters) # if len(features) < n_clusters: features = np.append(features, np.zeros((1, n_clusters-len(features)))) #print features # final_features[imagename] = features return final_features
def extract_spatial_pyramid(images, dataset, vq=None, n_words=1000): descriptors, locations = sift_descriptors(images, dataset) if vq is None: vq = MiniBatchKMeans(n_clusters=n_words, verbose=1, init='random', batch_size=2 * n_words, compute_labels=False, reassignment_ratio=0.0, random_state=1, n_init=3) #vq = KMeans(n_clusters=n_words, verbose=10, init='random') vq.fit(shuffle(np.vstack(descriptors))) else: n_words = vq.n_clusters pyramids = [] for descr, locs in zip(descriptors, locations): words = vq.predict(descr) global_ = np.bincount(words, minlength=n_words).astype(np.float) global_ /= max(global_.sum(), 1) third_of_image = locs[1].max() // 3 + 1 stripe_indicator = locs[1] // third_of_image inds = np.vstack([stripe_indicator, words]) stripe_hists = sparse.coo_matrix((np.ones(len(words)), inds), shape=(3, n_words)).toarray() stripe_hists = [x / max(x.sum(), 1) for x in stripe_hists] pyramids.append(np.hstack([np.hstack(stripe_hists), global_])) return vq, np.vstack(pyramids)
def generateCodebook(self, features): """ Generate codebook using extracted features """ codebook = None if self._codebookGenerateMethod == 'k-means': # # Codebook generation using scipy k-means # while run: # try: # # Set missing = 'raise' to raise exception # # when one of the clusters is empty # whitenedFeatures = whiten(features) # codebook, _ = kmeans2(whitenedFeatures, # self._codebookSize, # missing = 'raise') # # # No empty clusters # run = False # except ClusterError: # # If one of the clusters is empty, re-run k-means # run = True # Codebook generation using sklearn k-means whitenedFeatures = whiten(features) kmeans = MiniBatchKMeans(n_clusters = config.codebookSize) kmeans.fit(whitenedFeatures) codebook = kmeans.cluster_centers_ else: pass self._codebook = codebook
def project(self, ndim=None): """ Projects the data object given to the constructor onto `ndim` dimensions Parameters ---------- ndim : int The number of dimensions we want to project the data on. Returns ------- dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data Example ------- >>> tri = KMeansTri(data) >>> datatri = tri.project(5) """ import scipy.spatial.distance as scidist from sklearn.cluster import MiniBatchKMeans from htmd.metricdata import MetricData datconcat = np.concatenate(self.data.dat) mb = MiniBatchKMeans(n_clusters=ndim) mb.fit(datconcat) # TODO: Could make it into a loop to waste less memory dist = scidist.cdist(datconcat, mb.cluster_centers_) dist = np.mean(dist, axis=1)[:, np.newaxis] - dist dist[dist < 0] = 0 return MetricData(dat=self.data.deconcatenate(dist), ref=self.data.ref, simlist=self.data.simlist, fstep=self.data.fstep, parent=self.data)
def cluster_function(user_np): ############################################################################## # Compute clustering with Means if len(user_np) < 10 : n_cl = 2 elif len(user_np) <= 100 : n_cl = 10 elif len(user_np) <= 500 : n_cl = 15 elif len(user_np) <= 1000 : n_cl = 20 else : n_cl = 30 k_means = MiniBatchKMeans(n_clusters=n_cl, init='k-means++', max_iter=100, batch_size=100, verbose=0, compute_labels=True, random_state=None, tol=0.0, max_no_improvement=10, init_size=None, n_init=3, reassignment_ratio=0.01) t0 = time.time() k_means.fit(user_np) t_batch = time.time() - t0 print "Batch running time : ", t_batch k_means_labels = k_means.labels_ #prediction = k_means.predict(user_np) return k_means_labels
def make_cluster(datasets):
    num_clusters = 5
    lsa_dim = 500
    max_df = 0.8
    max_features = 10000
    minibatch = True
    print("datasets are %(datasets)s" % locals())

    km = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', batch_size=1000,
                         n_init=10, max_no_improvement=10, verbose=True)
    km.fit(datasets)
    labels = km.labels_
    # distance of each sample to the centre of its assigned cluster
    transformed = km.transform(datasets)
    dists = np.zeros(labels.shape)
    for i in range(len(labels)):
        dists[i] = transformed[i, labels[i]]

    clusters = []
    for i in range(num_clusters):
        cluster = []
        ii = np.where(labels == i)[0]
        dd = dists[ii]
        di = np.vstack([dd, ii]).transpose().tolist()
        di.sort()
        for d, j in di:
            cluster.append(datasets[int(j)])
        clusters.append(cluster)
    return clusters
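# Hedged usage sketch for make_cluster(); the random 2-D points are an
# illustrative assumption.
import numpy as np

points = np.random.RandomState(0).rand(200, 2)
groups = make_cluster(points)
print([len(g) for g in groups])   # sizes of the 5 clusters, nearest-to-centre first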
def getCluster(X,k,M,opts): # M: knnNum # t0 = time() # print("knn graph") knn_graph = None # knn_graph = kneighbors_graph(X, M) # print("knn graph done in %0.3fs" % (time() - t0)) # outfile.write("knn graph done in %0.3fs\n" % (time() - t0)) # aggl = AgglomerativeClustering(linkage='ward', connectivity=knn_graph, n_clusters=k) if opts.minibatch: km = MiniBatchKMeans(n_clusters=k, init='k-means++', n_init=50, init_size=1000, batch_size=1000, verbose=opts.verbose) else: km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=50, verbose=opts.verbose) #aggl = AgglomerativeClustering(linkage='ward', n_clusters=k) print("Clustering sparse data with %s" % km) # outfile.write("Clustering sparse data with %s\n" % aggl) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) # outfile.write("clustering done in %0.3fs\n" % (time() - t0)) print() labels = km.labels_ clus2doc = {} for i in range(len(labels)): clus2doc[labels[i]] = clus2doc.get(labels[i],set()) clus2doc[labels[i]].add(i) return (km,clus2doc,knn_graph)
def initializeWeight(D, type, N_OUT): # Here we first whiten the data (PCA or ZCA) and then optionally run k-means # on this whitened data. import numpy as np if D.shape[0] < N_OUT: print( " Not enough data for '%s' estimation, using elwise"%type ) return np.random.normal(0, 1, (N_OUT,D.shape[1])) D = D - np.mean(D, axis=0, keepdims=True) # PCA, ZCA, K-Means assert type in ['pca', 'zca', 'kmeans', 'rand'], "Unknown initialization type '%s'"%type C = D.T.dot(D) s, V = np.linalg.eigh(C) # order the eigenvalues ids = np.argsort(s)[-N_OUT:] s = s[ids] V = V[:,ids] s[s<1e-6] = 0 s[s>=1e-6] = 1. / np.sqrt(s[s>=1e-6]+1e-3) S = np.diag(s) if type == 'pca': return S.dot(V.T) elif type == 'zca': return V.dot(S.dot(V.T)) # Whiten the data wD = D.dot(V.dot(S)) wD /= np.linalg.norm(wD, axis=1)[:,None] if type == 'kmeans': # Run k-means from sklearn.cluster import MiniBatchKMeans km = MiniBatchKMeans(n_clusters = wD.shape[1], batch_size=10*wD.shape[1]).fit(wD).cluster_centers_ elif type == 'rand': km = wD[np.random.choice(wD.shape[0], wD.shape[1], False)] C = km.dot(S.dot(V.T)) C /= np.std(D.dot(C.T), axis=0, keepdims=True).T return C
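# Hedged usage sketch for initializeWeight() above; the random patch matrix and
# the output size are illustrative assumptions.
import numpy as np

D = np.random.RandomState(0).randn(5000, 64)      # e.g. flattened image patches
W = initializeWeight(D, 'kmeans', N_OUT=32)       # whitened k-means filters
print(W.shape)                                    # (32, 64)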
def init_all(K,X,DT): km = MiniBatchKMeans(n_clusters=K, init='k-means++', n_init=10,init_size=1000, batch_size=1000,verbose=True) # km = KMeans(n_clusters=K, init='k-means++', max_iter=100, n_init=50) km.fit(X) labels = km.labels_ centers = km.cluster_centers_ # print number of doc in each cluster clus2doc = {} for i in range(len(labels)): clus2doc[labels[i]] = clus2doc.get(labels[i],set()) clus2doc[labels[i]].add(i) if len(clus2doc) < K: K_ = len(clus2doc) print (str(K_)+" clusters") print("kmeans reduce K to "+str(K_)) return init_all(K_,X,DT) #print("kmeans reduce K to "+str(K-1)) #return init_all(K-1,X,DT) for i in clus2doc: print (str(i+1)+"\t"+str(len(clus2doc[i]))) # init nDocs,nWords = X.shape Pz_d_km = np.zeros((K,nDocs)) for i in range(nDocs): Pz_d_km[labels[i],i] = 1 Pz_d_km = Pz_d_km +0.01; Pz_d_km = Pz_d_km / np.tile(sum(Pz_d_km),(K,1)) C = centers.T+1/nWords/nWords Pw_z_km = C/np.tile(sum(C),(nWords,1)) mu_km, sigma_km= inittime(DT,K,labels) return (K,[Pz_d_km,Pw_z_km,mu_km, sigma_km])
def train(self, featurefiles, k=100, subsampling=10): nbr_images = len(featurefiles) descr = [] descr.append(sift.read_features_from_file(featurefiles[0])[1]) descriptors = descr[0] print "begin loading image feature files..." for i in np.arange(1, nbr_images): descr.append(sift.read_features_from_file(featurefiles[i])[1]) # descriptors = np.vstack((descriptors, descr[i])) descriptors = np.vstack((descriptors, descr[i][::subsampling,:])) if i%100 == 0: print i, "images have been loaded..." print "finish loading image feature files!" # self.voc, distortion = cluster.kmeans(descriptors[::subsampling,:], k, 1) print "begin MiniBatchKMeans cluster....patient" mbk = MiniBatchKMeans(k, init="k-means++", compute_labels=False, n_init=3, init_size=3*k) # mbk.fit(descriptors[::subsampling,:]) mbk.fit(descriptors) self.voc = mbk.cluster_centers_ print "cluster finish!" self.nbr_word = self.voc.shape[0] imwords = np.zeros((nbr_images, self.nbr_word)) for i in xrange(nbr_images): imwords[i] = self.project(descr[i]) nbr_occurences = np.sum((imwords > 0)*1, axis=0) self.idf = np.log( (1.0*nbr_images) / (1.0*nbr_occurences+1) ) self.traindata = featurefiles
def main(): with open("aas/corpus.json") as f: corpus = json.loads(f.read()) corpus = [(k,v) for k,v in corpus.items() if v > 5] corpus = sorted(corpus, key=lambda x: x[1]) corpus = corpus[:-6] Ncorpus = len(corpus) with open("aas/abstracts.json") as f: abstracts = json.loads(f.read()) X = np.zeros((len(abstracts),Ncorpus)) for jj,abstract in enumerate(abstracts): for ii in range(Ncorpus): try: X[jj,ii] = abstract['counts'][corpus[ii][0]] except KeyError: continue X = bsr_matrix(X) print("Initializing k-means") km = MiniBatchKMeans(n_clusters=50, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=True) print("fitting") t0 = time.time() km.fit(X) # X is nsamples, nfeatures print("Took {} seconds".format(time.time()-t0)) return km
def aggregate(self, track_dataset): """ An example implementation of the k-means algorithm implemented in DSI Studio. This function is automatically applied to all TrackDatasets returned from a query. Parameters: ----------- track_dataset:dsi2.streamlines.track_dataset.TrackDataset """ # extract the streamline data tracks = track_dataset.tracks # Make a matrix of downsampled streamlines points = np.array([ downsample(trk, 3).flatten() \ for trk in tracks]) # Calculate the length of each streamline lengths = np.array([len(trk) for trk in tracks]).reshape(-1,1) # Concatenate the points and the track lengths features = np.hstack((points, lengths)) # Initialize the k-means algorithm kmeans = MiniBatchKMeans(n_clusters=self.k, compute_labels=True) kmeans.fit(features) # Return the labels return kmeans.labels_
def cluster_tfidf(tfidf):
    kmeans = MiniBatchKMeans(n_clusters=10, init='k-means++', n_init=1,
                             init_size=1000, batch_size=1000)
    kmeans.fit(tfidf)
    return kmeans.cluster_centers_
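# Hedged usage sketch for cluster_tfidf(); the 20 newsgroups corpus and the
# vectorizer settings are illustrative assumptions.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

docs = fetch_20newsgroups(subset='train', categories=['sci.space'],
                          remove=('headers', 'footers', 'quotes')).data
tfidf_matrix = TfidfVectorizer(stop_words='english', max_features=5000).fit_transform(docs)
centers = cluster_tfidf(tfidf_matrix)
print(centers.shape)   # (10, n_features): one centre per cluster in TF-IDF space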
def obtainCodebook(self, sampled_x, x): print 'Obatining codebook using online k-means...' sampled_x = np.array(sampled_x) sampled_x = sampled_x.astype(float) x = np.array(x) x = x.astype(float) #normalize scaled_x_sampled = StandardScaler().fit_transform(sampled_x) scaled_x = StandardScaler().fit_transform(x) des_vector_suffled = scaled_x_sampled #shuffle list of descriptors np.random.shuffle(des_vector_suffled) minibatch = MiniBatchKMeans(n_clusters=self.size, init='k-means++', batch_size=self.batch_size, n_init=10, max_no_improvement=10, verbose=0, random_state=0) codebook = minibatch.fit(des_vector_suffled, y=None) #for n in range(0,len(des_vector_suffled)/batchsize+1): #if n!=len(des_vector_suffled)/batchsize: #data = des_vector_suffled[n*batchsize:n*batchsize+batchsize] #else: #data = des_vector_suffled[n*batchsize:] #kmeans.partial_fit(data) projections = minibatch.predict(scaled_x) print 'Codebook obtained.' return codebook.cluster_centers_, projections
class MiniCluster: def __init__(self, nclusters=1000, psize=16): self.psize = 16 self.patch_size = (self.psize, self.psize) self.nclusters = nclusters self.rng = np.random.RandomState(0) self.kmeans = MiniBatchKMeans(n_clusters=nclusters, random_state=self.rng, verbose=True) def fit(self, images): buffer = [] index = 1 t0 = time.time() # The online learning part: cycle over the whole dataset 4 times index = 0 passes = 10 for _ in range(passes): for img in images: data = extract_patches_2d(img, self.patch_size, max_patches=15, random_state=self.rng) data = np.reshape(data, (len(data), -1)) #This casting is only needed for RGB data #buffer.append(data.astype(float)) buffer.append(data) index += 1 #if index % 1000 == 0: if index % (self.nclusters * 2) == 0: data = np.concatenate(buffer, axis=0) data = gcn(data) data = whiten(data) self.kmeans.partial_fit(data) buffer = [] dt = time.time() - t0 print('done in %.2fs.' % dt)
def clustering(self, X, NUM_CLUSTERS, MINIBATCH):
    '''Cluster the data with k-means.'''
    if MINIBATCH:
        km = MiniBatchKMeans(n_clusters=NUM_CLUSTERS, init='k-means++',
                             batch_size=1000, n_init=10, max_no_improvement=10)
    else:
        km = KMeans(n_clusters=NUM_CLUSTERS, init='k-means++', n_init=1)
    km.fit(X)
    transformed = km.transform(X)  # distance from each item to every cluster centre
    labels = km.labels_
    dists = []
    for i in range(len(labels)):
        dists.append(transformed[i, labels[i]])  # distance to the centre of its own cluster
    labels = DataFrame(labels)
    dists = DataFrame(dists)
    labels.columns = ['label']
    dists.columns = ['dists']
    self.data = pd.concat([labels, dists, self.data], axis=1)  # add the labels to the original data
    return km
def clusterize(self): X = np.ndarray((len(self.real_power_data), 2)) X[:, 0] = self.real_power_data X[:, 1] = self.reac_power_data clustering = MiniBatchKMeans(self.spinbox_cluster.value()) clustering.fit(X) # Identifica os centróides dos clusteres centroids = clustering.cluster_centers_.tolist() # Conta quantos elementos cada cluster possui predictions = clustering.predict(X) occurrences = Counter(predictions) # Identifica os clusteres que possuem somente 1 elemento # (serão tratados como clusteres de transição) transition_clusters = [k for k, v in occurrences.iteritems() if v < 2] # Remove os centróides de clusteres de transição centroids = [e for i, e in enumerate(centroids) if i not in transition_clusters] predictions = [-1 if v in transition_clusters else v for v in predictions] self.prototypes = centroids self.plot_clusterized(predictions)
def main(): if len(sys.argv) != 4: print(__doc__) return 1 infiles = glob(sys.argv[1]) outfile = sys.argv[2] K = int(sys.argv[3]) print("Reading in", len(infiles), "files") fullarr = np.loadtxt(fileinput.input(infiles), delimiter = '\t')[:,:-7] summary_stats = None stats_file = '/n/fs/gcf/dchouren-repo/COS513-Finance/summary_stats/stats2' with open(stats_file, 'rb') as inf: summary_stats = np.loadtxt(inf) stds = summary_stats[:len(summary_stats)/2] means = summary_stats[len(summary_stats)/2:] fullarr = (fullarr - means) / stds print("Learning MiniBatchKMeans with K =", K) km = MiniBatchKMeans(n_clusters = K, verbose = True) # TODO max_iter km.fit(fullarr) print("KMeans trained, saving") with open(outfile, 'wb') as out_model: pickle.dump(km, out_model) print("Score:", km.score(fullarr)) return 0
def color_quantization_sk(image, clusters):
    # load the image and grab its width and height
    (h, w) = image.shape[:2]
    # convert the image from the RGB color space to the L*a*b*
    # color space -- since we will be clustering using k-means
    # which is based on the euclidean distance, we'll use the
    # L*a*b* color space where the euclidean distance implies
    # perceptual meaning
    image = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    # reshape the image into a feature vector so that k-means
    # can be applied
    image = image.reshape((image.shape[0] * image.shape[1], 3))
    # apply k-means using the specified number of clusters and
    # then create the quantized image based on the predictions
    clt = MiniBatchKMeans(n_clusters=clusters)
    labels = clt.fit_predict(image)
    quant = clt.cluster_centers_.astype("uint8")[labels]
    # reshape the feature vectors to images
    quant = quant.reshape((h, w, 3))
    # convert from L*a*b* to RGB
    quant = cv2.cvtColor(quant, cv2.COLOR_LAB2BGR)
    return quant
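# Hedged usage example for color_quantization_sk(); the input and output file
# names are illustrative assumptions.
import cv2

bgr_image = cv2.imread("input.jpg")                 # BGR image, as OpenCV loads it
quantized = color_quantization_sk(bgr_image, clusters=8)
cv2.imwrite("quantized.jpg", quantized)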
def do_clustering(): keys = request.get_json() points = [] values = [] types = set(CLUSTER_SENSOR_TYPES) for id, timestamp in keys: time = datetime.fromtimestamp(timestamp) point = MeasurePoint.query.get((id, time)) value = {v.type_name: v.value for v in point.values.all()} if not set(value.keys()) >= types: continue # Normalization: value['OxidizingGas'] /= 1000 value['ReducingGas'] /= 10000 points.append(point) values.append(value) X = [[v[t] for t in CLUSTER_SENSOR_TYPES] for v in values] X = np.array(X) k = KMeans(n_clusters=2, init='k-means++') k.fit(X) score = silhouette_score(X, k.labels_) groups = [[_point_to_json_dict(p) for p, l in zip(points, k.labels_) if l == i] for i in range(max(k.labels_) + 1)] return jsonify(score=score, groups=groups)
def define_clusters(projections):
    """
    Creates several different clusterings of the data in projections.

    :param projections: dict(string, (2 x Num_Samples) numpy.ndarray)
        dictionary mapping the projection type (e.g. "tSNE") to an array containing
        the two-dimensional coordinates for each sample in the projection.

    :return: dict of string (projection name) =>
        (dict of string (cluster technique) =>
         np.ndarray of size N_Samples (cluster assignments))
    """
    pbar = ProgressBar(4 * len(projections))

    out_clusters = dict()

    for key in projections:
        proj_data = projections[key]
        proj_clusters = dict()

        # K-means for k = 2-5
        for k in range(2, 6):
            clust_name = "K-Means, k=" + str(k)
            kmeans = MiniBatchKMeans(n_clusters=k)
            clust_assignments = kmeans.fit_predict(proj_data.T)
            proj_clusters.update({clust_name: clust_assignments})
            pbar.update()

        out_clusters.update({key: proj_clusters})

    pbar.complete()
    return out_clusters
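# Illustrative call to define_clusters(); the fake 2 x N "tSNE" projection is an
# assumption, and ProgressBar must be importable as in the original module.
import numpy as np

fake_projections = {"tSNE": np.random.RandomState(0).randn(2, 300)}
cluster_sets = define_clusters(fake_projections)
print(sorted(cluster_sets["tSNE"].keys()))   # "K-Means, k=2" ... "K-Means, k=5"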
def test_minibatch_reassign(): # Give a perfect initialization, but a large reassignment_ratio, # as a result all the centers should be reassigned and the model # should not longer be good for this_X in (X, X_csr): mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=1, random_state=42) mb_k_means.fit(this_X) centers_before = mb_k_means.cluster_centers_.copy() try: old_stdout = sys.stdout sys.stdout = StringIO() # Turn on verbosity to smoke test the display code _mini_batch_step(this_X, (X ** 2).sum(axis=1), mb_k_means.cluster_centers_, mb_k_means.counts_, np.zeros(X.shape[1], np.double), False, random_reassign=True, random_state=42, reassignment_ratio=1, verbose=True) finally: sys.stdout = old_stdout centers_after = mb_k_means.cluster_centers_.copy() # Check that all the centers have moved assert_greater(((centers_before - centers_after)**2).sum(axis=1).min(), .2)
def VideoFrameReaders(VideoDirectory):
    cap = cv2.VideoCapture(VideoDirectory)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    fgbg = cv2.createBackgroundSubtractorMOG2()
    timestamp = []
    count = 0
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            time = cap.get(0)  # get the frame time in seconds
            timestamp.append(time)
            print timestamp
            if frame is None:
                break
            # frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
            image = frame.reshape((frame.shape[0] * frame.shape[1], 3))
            K = 4
            clf = MiniBatchKMeans(K)
            # predict cluster labels and quantize each color based on the labels
            cls_labels = clf.fit_predict(image)
            print cls_labels
            cls_quant = clf.cluster_centers_.astype("uint8")[cls_labels]
    except EOFError:
        pass
print(dirname) for (direcpath, direcnames, files) in os.walk(path + "\\" + dirname): for file in files: actual_path = path + "\\\\" + dirname + "\\\\" + file print(actual_path) des = func(actual_path) img_descs.append(des) y.append(label) label = label + 1 #finding indexes of test train and validate y = np.array(y) training_idxs, test_idxs, val_idxs = train_test_val_split_idxs( len(img_descs), 0.4, 0.0) #creating histogram using kmeans minibatch cluster model X, cluster_model = cluster_features(img_descs, training_idxs, MiniBatchKMeans(n_clusters=150)) #splitting data into test, train, validate using the indexes X_train, X_test, X_val, y_train, y_test, y_val = perform_data_split( X, y, training_idxs, test_idxs, val_idxs) #using classification methods predict_knn(X_train, X_test, y_train, y_test) #predict_mlp(X_train, X_test,y_train, y_test) predict_svm(X_train, X_test, y_train, y_test) predict_lr(X_train, X_test, y_train, y_test) predict_nb(X_train, X_test, y_train, y_test)
print '-----feature size-----' print TFIDFvectorizer.get_feature_names() print len(TFIDFvectorizer.get_feature_names()) print '-----feature mapping-----' print '-----start SVD-----' svd = TruncatedSVD(n_components=20, n_iter = 30, random_state = 50) normalizer = Normalizer(copy = False) U = svd.fit_transform(x) #fit: create V in SVD #transform: create U*S in SVD, which is need U = normalizer.fit_transform(U) print svd.explained_variance_ print '-----start kmeans-----' minikmeans = MiniBatchKMeans(n_clusters = 22, init = 'k-means++', n_init = 1,init_size = 500) minikmeans.fit(U) #print minikmeans.cluster_centers_ print minikmeans.inertia_ label = minikmeans.labels_ ''' print '-----save labels-----' fout = open('checklabel','w') for i in label: fout.write(' '+str(i)+' \n') fout.close() ''' print '-----loading check_index.csv-----' f = open(str(path)+'check_index.csv','r') index = []
    kmeans.fit(word_embeddings)
    y_kmeans = kmeans.predict(word_embeddings)
    # generateCSV(y_kmeans, etichette)
    pca(y_kmeans)

if (algo == 'GaussianMM'):
    gmm = GaussianMixture(n_components=n_cluster).fit(word_embeddings)
    labels = gmm.predict(word_embeddings)
    pca(labels)

if (algo == 'MiniBatch100'):
    kmeans = MiniBatchKMeans(
        n_clusters=n_cluster,
        batch_size=100,
    ).fit(word_embeddings)
    kmeans.fit(word_embeddings)
    y_kmeans = kmeans.predict(word_embeddings)
    print(y_kmeans)
    pca(y_kmeans)

if (algo == 'MiniBatch250'):
    kmeans = MiniBatchKMeans(
        n_clusters=n_cluster,
        batch_size=250,
    ).fit(word_embeddings)
    kmeans.fit(word_embeddings)
def createKMeans(self, n_clusters, max_iter):
    self.kmeansClf = MiniBatchKMeans(n_clusters=n_clusters, max_iter=max_iter,
                                     random_state=self.randomSeed)
class RecommendMovie(): remarks = None randomSeed = None kmeansClf = None numU = None #用户数 numM = None #电影数 userGroup = None #用于记录所有用户的类别 userNeighSize = 100 movieNeighSize = 10 reductSize = 50 zeroImputer = None movieTitle = None #一些公用参数的初始化 def __init__(self, sparseMatrix_file, movieTitle, randomSeed = None): self.remarks = sparse.load_npz(sparseMatrix_file) self.remarks = self.remarks.tocsr() self.randomSeed = randomSeed self.numU,self.numM = self.remarks.shape self.movieTitle = movieTitle #修改协同过滤参数 def tunningCollParams(self, userNeighSize, movieNeighSize, reductSize): self.userNeighSize = userNeighSize self.movieNeighSize = movieNeighSize self.reductSize = reductSize #初始化kmeans模型 def createKMeans(self, n_clusters,max_iter): self.kmeansClf = MiniBatchKMeans(n_clusters= n_clusters, max_iter = max_iter, random_state=self.randomSeed) #训练kmeans模型 def fitKMeans(self, trainsetsize, modelSavePath=None): #抽取训练kmeans所用的集合 np.random.seed(self.randomSeed) trainset_dense = self.remarks[np.random.choice(np.arange(self.numU), size = trainsetsize)].toarray() #平均值填补missing和用户(按行)标准化 self.zeroImputer = Imputer(missing_values=0, strategy='mean', axis=1, copy = False) trainset_dense = self.zeroImputer.fit_transform(trainset_dense) trainset_dense = scale(X=trainset_dense, axis=1) self.kmeansClf.fit(trainset_dense) if modelSavePath!=None: joblib.dump(self.kmeansClf, filename=modelSavePath) # def updateKMeans(self, updatesetsize, modelSavePath=None) #给定一个稀疏矩阵,预测所属类别 def __predictGroup(self, sparseArr): denseArr = self.zeroImputer.fit_transform(sparseArr.toarray()) denseArr = scale(denseArr, axis=1) return self.kmeansClf.predict(denseArr) #预测所有用户所属的类别 def findUsersGroup(self): userGroup = np.array([]) for i in range(int(self.numU/10000)): temp = self.__predictGroup(self.remarks[(i*10000):((i+1)*10000)]) userGroup = np.hstack([userGroup, temp]) temp = self.__predictGroup(self.remarks[((i+1)*10000):]) self.userGroup = np.hstack([userGroup,temp]) print(self.userGroup, len(self.userGroup)) print(pd.Series(self.userGroup).value_counts()) #标准化单个用户的评分 def __userRemarkScaler(self, aUser): tempaUser = self.zeroImputer.fit_transform(aUser.toarray()) return scale(tempaUser,axis = 1).flatten() #找出同类成员并抽取子样 def __findNeigbor(self, aUser, neighborsize=1000): g = self.__predictGroup(aUser)[0] np.random.seed(self.randomSeed) gindex = np.random.choice(np.where(self.userGroup==g)[0],size=neighborsize) groupMember = self.remarks[gindex].toarray() groupMember = self.zeroImputer.fit_transform(groupMember) groupMember = scale(groupMember, axis=1) return groupMember,gindex #按照pearson correlation遴选用户近邻 def __pearsonRNeigh(self, aUser): groupMember, gindex = self.__findNeigbor(aUser,neighborsize=self.userNeighSize*10) pearsonDis = np.zeros(shape=self.userNeighSize*10) tempaUser = self.__userRemarkScaler(aUser) for i in range(self.userNeighSize*10): pearsonDis[i] = stats.pearsonr(tempaUser.flatten(),groupMember[i])[0] cutpoint = np.percentile(pearsonDis,90) maxindex = np.where(pearsonDis>=cutpoint) return groupMember[maxindex], gindex[maxindex] #给定 用户近邻-子矩阵和目标电影ID,计算电影间距离 def __getScoreAndDist(self, movieindex, subRemarkMat, ratedIndex): u,s,vt = slinalg.svds(subRemarkMat, k=self.reductSize, which='LM') movieScore = vt.transpose()[movieindex,:].reshape(1,-1) dist = np.array([distance.cosine(movieScore, vt.transpose()[i]) for i in list(ratedIndex)]) return dist #给定 电影间距离 计算用户对某电影的评分 def __scorePredict(self, aUser, dist, ratedIndex): distRated = dist distSort = np.argsort(distRated)[1:min(self.movieNeighSize+1,len(distRated))] userSort 
= ratedIndex[distSort] similarity = 1-(dist[distSort]) similarity[similarity<0]=0 userRemarkP = (aUser.toarray()[0][userSort].dot(similarity.reshape(-1,1)))/(np.sum(similarity)+0.001) return userRemarkP #给定用户ID&电影ID预测得分 def __user2Movie(self, userID, movieID,subRemarkMat, gindex, ratedIndex): #aUser = self.remarks[userID] #subRemarkMat,gindex = self.__pearsonRNeigh(aUser) dist = self.__getScoreAndDist(movieID, subRemarkMat, ratedIndex) #ratedIndex = np.where(aUser[0].toarray().flatten()!=0)[0]#等会写到外层去 score = self.__scorePredict(self.remarks[userID], dist, ratedIndex) return score #给定用户ID预测对候选电影的评分 def recommend2User(self, userID, toPredict): #toPredict=a list of movieID aUser = self.remarks[userID] subRemarkMat,gindex = self.__pearsonRNeigh(aUser) ratedIndex=np.where(self.remarks[userID].toarray().flatten()!=0)[0] movieScoreP = [] for movieID in toPredict: movieScoreP.append(self.__user2Movie(userID, movieID,subRemarkMat,gindex,ratedIndex)) movieRecommend = pd.DataFrame(index=toPredict, data={'Movie Title':self.movieTitle[toPredict], 'Estimate':movieScoreP}) movieRecommend.sort_values(by='Estimate', ascending=False, inplace=True) return movieRecommend
def eval_batch(x_train, y_train, x_test, y_test, classifier, components, no_clusters, dimensionality): cluster_finder = cluster.KMeans(n_clusters=no_clusters) if classifier == 'mbk': cluster_finder = MiniBatchKMeans(init='k-means++', n_clusters=no_clusters, batch_size=16, n_init=10, max_no_improvement=10, verbose=0) cluster_finder.fit(x) cddd = str(cluster_finder.score) clll = str(cluster_finder) log = str( components) + 'score' + cddd + 'algo=' + clll + 'comp=' + str( components) + dimensionality labels = cluster_finder.labels_ else: cluster_finder = cluster.KMeans(n_clusters=no_clusters) cluster_finder.fit(x) cddd = str(cluster_finder.score) clll = str(cluster_finder) log = str( components) + 'score' + cddd + 'algo=' + clll + 'comp=' + str( components) + dimensionality labels = cluster_finder.labels_ clustered_x = [] clustered_y = [] test_clustered_x = [] test_clustered_y = [] for c in range(0, no_clusters): clustered_x.append([]) clustered_y.append([]) test_clustered_x.append([]) test_clustered_y.append([]) for i, item in enumerate(x_train): item = item.reshape(1, item.shape[0]) # print('item', item) # sys.exit(0) predicted = cluster_finder.predict(item) clustered_x[predicted[0]].append(item) clustered_y[predicted[0]].append(y[i]) for i, item in enumerate(x_test): item = item.reshape(1, item.shape[0]) predicted = cluster_finder.predict(item) test_clustered_x[predicted[0]].append(item) test_clustered_y[predicted[0]].append(y_test[i]) print(len(clustered_x)) print(len(clustered_y)) print(len(test_clustered_x)) print(len(test_clustered_y)) for j, jtem in enumerate(clustered_x): file_name = "./clusters/all_train/cluster" + str(j) + ".txt" c_file = open(file_name, 'w') for m, mtem in enumerate(clustered_x[j]): ii = str(decode_sequence(mtem[0])) oo = str(decode_sequence(clustered_y[j][m])) c_file.writelines(ii + "===" + oo + "\n") c_file.close() for j, jtem in enumerate(test_clustered_x): file_name = "./clusters/all_test/cluster" + str(j) + ".txt" c_file = open(file_name, 'w') for m, mtem in enumerate(test_clustered_x[j]): ii = str(decode_sequence(mtem[0])) oo = str(decode_sequence(test_clustered_y[j][m])) c_file.writelines(ii + "===" + oo + "\n") c_file.close() # print(test_clustered_x) sys.exit(0) '''
plt.yticks([]) plt.tight_layout(rect=[0, 0.03, 1, 0.95]) if inertia: label = label + ", inertia={0:0.2f}".format(inertia) plt.title(label) plt.show() plot_clustering('Ground truth', y, centers, inertia=inertia(X, centers)) ############################################################################## # We run a regular MiniBatchKMeans. KMeans would be more suited for this kind # of small dataset but we are aiming at using Increment KMeans on large # datasets so its implementation relies on MiniBatchKMeans. kmeans = MiniBatchKMeans(n_clusters=8, random_state=2) kmeans.fit(X) plot_clustering('KMeans', kmeans.predict(X), kmeans.cluster_centers_, inertia=kmeans.inertia_) ############################################################################## # We now consider that we are aware of 4 of the 8 clusters. We fix them in # the IncrementalMiniBatchKMeans so that they are strictly enforced. ikmeans = IncrementalMiniBatchKMeans(n_clusters=8, random_state=2) ikmeans.fit(X, fixed_cluster_centers=centers[:n_fixed_clusters]) plot_clustering('Incremental KMeans', ikmeans.predict(X), centers[n_fixed_clusters:],
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

if __name__ == "__main__":
    dataset = pd.read_csv('./data/candy.csv')
    print(dataset.head(10))

    X = dataset.drop('competitorname', axis=1)

    kmeans = MiniBatchKMeans(n_clusters=4, batch_size=8).fit(X)
    print("Total number of centers: ", len(kmeans.cluster_centers_))
    print("=" * 64)
    print(kmeans.predict(X))

    dataset['group'] = kmeans.predict(X)
    print(dataset)
def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                 random_state=42, tol=.01).fit(X)
    _check_fitted_model(mb_k_means)
def _init_classifier(self, opt): if "base_estimator" in opt: b_est = self._init_classifier(opt["base_estimator"]) else: b_est = None if "n_estimators" in opt: n_estimators = opt["n_estimators"] else: n_estimators = 200 if "max_iter" in opt: max_iter = opt["max_iter"] else: max_iter = 100000 if "num_parallel_tree" in opt: num_parallel_tree = opt["num_parallel_tree"] else: num_parallel_tree = 5 if "layer_structure" in opt: layer_structure = opt["layer_structure"] else: layer_structure = (100, ) if "n_clusters" in opt: n_clusters = opt["n_clusters"] else: n_clusters = 8 if opt["type"] in ["random_forrest", "rf"]: return RandomForestClassifier(n_estimators=n_estimators, class_weight="balanced", n_jobs=-1) elif opt["type"] == "ada_boost": return AdaBoostClassifier(base_estimator=b_est, n_estimators=n_estimators) elif opt["type"] in ["logistic_regression", "lr"]: return LogisticRegression(class_weight='balanced', max_iter=max_iter) elif opt["type"] == "sgd": return SGDClassifier(class_weight='balanced', max_iter=max_iter) elif opt["type"] in ["gaussian_bayes", "bayes", "gaussian_nb"]: return GaussianNB() elif opt["type"] in ["support_vector_machine", "svm"]: return SVC(kernel='rbf', class_weight='balanced', gamma="scale") elif opt["type"] in ["multilayer_perceptron", "mlp"]: return MLPClassifier(hidden_layer_sizes=layer_structure, max_iter=max_iter) elif opt["type"] in ["decision_tree", "dt", "tree"]: return DecisionTreeClassifier() elif opt["type"] in ["b_decision_tree", "b_dt", "b_tree"]: return DecisionTreeClassifier(class_weight="balanced") elif opt["type"] in ["neighbours", "knn"]: return KNeighborsClassifier(n_neighbors=opt["n_neighbours"]) elif opt["type"] == "extra_tree": return ExtraTreesClassifier(n_estimators=n_estimators, class_weight="balanced", n_jobs=-1) elif opt["type"] == "xgboost": return XGBClassifier(objective='binary:logistic', n_estimators=n_estimators, num_parallel_tree=num_parallel_tree, tree_method="hist", booster="gbtree", n_jobs=-1) elif opt["type"] in ["b_random_forrest", "b_rf"]: return BalancedRandomForestClassifier(n_estimators=n_estimators, n_jobs=-1) elif opt["type"] == "b_bagging": return BalancedBaggingClassifier(base_estimator=b_est, n_estimators=n_estimators) elif opt["type"] == "b_boosting": return RUSBoostClassifier(base_estimator=b_est, n_estimators=n_estimators) elif opt["type"] == "kmeans": return MiniBatchKMeans(n_clusters=n_clusters) else: raise ValueError("type: {} not recognised".format(opt["type"]))
def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
def test_mini_match_k_means_invalid_init():
    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)
def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)
def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X_csr)
    _check_fitted_model(mb_k_means)
# exclude 'comp.os.ms-windows.misc' categories = ['alt.atheism', 'comp.graphics', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'] data = get_data() vectorizer = TfidfVectorizer(stop_words='english', min_df=5, tokenizer=number_aware_tokenizer) cocluster = SpectralCoclustering(n_clusters=len(categories), svd_method='arpack', random_state=0) kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=100, random_state=0) print("Vectorizing...") X = vectorizer.fit_transform(data) print("Coclustering...") start_time = time() cocluster.fit(X) y_cocluster = cocluster.row_labels_ print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_cocluster, y_true))) print("MiniBatchKMeans...") start_time = time() y_kmeans = kmeans.fit_predict(X)
def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10, n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)
print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print() # ############################################################################# # Do the actual clustering if opts.minibatch: km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose) else: km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=opts.verbose) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) print()
def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)
class KMeansSMOTE(BaseSMOTE): """Apply a KMeans clustering before to over-sample using SMOTE. This is an implementation of the algorithm described in [1]_. Read more in the `User Guide <https://imbalanced-learn.org/stable/over_sampling.html#smote-adasyn>`_. Parameters ---------- {sampling_strategy} {random_state} k_neighbors : int or object, default=2 If ``int``, number of nearest neighbours to used to construct synthetic samples. If object, an estimator that inherits from :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to find the k_neighbors. {n_jobs} kmeans_estimator : int or object, default=None A KMeans instance or the number of clusters to be used. By default, we used a :class:`~sklearn.cluster.MiniBatchKMeans` which tend to be better with large number of samples. cluster_balance_threshold : "auto" or float, default="auto" The threshold at which a cluster is called balanced and where samples of the class selected for SMOTE will be oversampled. If "auto", this will be determined by the ratio for each class, or it can be set manually. density_exponent : "auto" or float, default="auto" This exponent is used to determine the density of a cluster. Leaving this to "auto" will use a feature-length based exponent. Attributes ---------- kmeans_estimator_ : estimator The fitted clustering method used before to apply SMOTE. nn_k_ : estimator The fitted k-NN estimator used in SMOTE. cluster_balance_threshold_ : float The threshold used during ``fit`` for calling a cluster balanced. See Also -------- SMOTE : Over-sample using SMOTE. SVMSMOTE : Over-sample using SVM-SMOTE variant. BorderlineSMOTE : Over-sample using Borderline-SMOTE variant. ADASYN : Over-sample using ADASYN. Notes ----- See the original papers: [1]_ for more details. Supports multi-class resampling. A one-vs.-rest scheme is used. References ---------- .. 
[1] Felix Last, Georgios Douzas, Fernando Bacao, "Oversampling for Imbalanced Learning Based on K-Means and SMOTE" https://arxiv.org/abs/1711.00837 Examples -------- >>> import numpy as np >>> from imbalanced_ensemble.sampler.over_sampling import KMeansSMOTE >>> from sklearn.datasets import make_blobs >>> blobs = [100, 800, 100] >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)]) >>> # Add a single 0 sample in the middle blob >>> X = np.concatenate([X, [[0, 0]]]) >>> y = np.append(y, 0) >>> # Make this a binary classification problem >>> y = y == 1 >>> sm = KMeansSMOTE(random_state=42) >>> X_res, y_res = sm.fit_resample(X, y) >>> # Find the number of new samples in the middle blob >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum() >>> print("Samples in the middle blob: %s" % n_res_in_middle) Samples in the middle blob: 801 >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1)) Middle blob unchanged: True >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum())) More 0 samples: True """ @_deprecate_positional_args def __init__( self, *, sampling_strategy="auto", random_state=None, k_neighbors=2, n_jobs=None, kmeans_estimator=None, cluster_balance_threshold="auto", density_exponent="auto", ): super().__init__( sampling_strategy=sampling_strategy, random_state=random_state, k_neighbors=k_neighbors, n_jobs=n_jobs, ) self.kmeans_estimator = kmeans_estimator self.cluster_balance_threshold = cluster_balance_threshold self.density_exponent = density_exponent def _validate_estimator(self): super()._validate_estimator() if self.kmeans_estimator is None: self.kmeans_estimator_ = MiniBatchKMeans( batch_size=4096, random_state=self.random_state, ) elif isinstance(self.kmeans_estimator, int): self.kmeans_estimator_ = MiniBatchKMeans( batch_size=4096, n_clusters=self.kmeans_estimator, random_state=self.random_state, ) else: self.kmeans_estimator_ = clone(self.kmeans_estimator) # validate the parameters for param_name in ("cluster_balance_threshold", "density_exponent"): param = getattr(self, param_name) if isinstance(param, str) and param != "auto": raise ValueError( f"'{param_name}' should be 'auto' when a string is passed." 
f" Got {repr(param)} instead.") self.cluster_balance_threshold_ = ( self.cluster_balance_threshold if self.kmeans_estimator_.n_clusters != 1 else -np.inf) def _find_cluster_sparsity(self, X): """Compute the cluster sparsity.""" euclidean_distances = pairwise_distances(X, metric="euclidean", n_jobs=self.n_jobs) # negate diagonal elements for ind in range(X.shape[0]): euclidean_distances[ind, ind] = 0 non_diag_elements = (X.shape[0]**2) - X.shape[0] mean_distance = euclidean_distances.sum() / non_diag_elements exponent = (math.log(X.shape[0], 1.6)**1.8 * 0.16 if self.density_exponent == "auto" else self.density_exponent) return (mean_distance**exponent) / X.shape[0] def _fit_resample(self, X, y, sample_weight=None): self._validate_estimator() X_resampled = X.copy() y_resampled = y.copy() total_inp_samples = sum(self.sampling_strategy_.values()) for class_sample, n_samples in self.sampling_strategy_.items(): if n_samples == 0: continue # target_class_indices = np.flatnonzero(y == class_sample) # X_class = _safe_indexing(X, target_class_indices) X_clusters = self.kmeans_estimator_.fit_predict(X) valid_clusters = [] cluster_sparsities = [] # identify cluster which are answering the requirements for cluster_idx in range(self.kmeans_estimator_.n_clusters): cluster_mask = np.flatnonzero(X_clusters == cluster_idx) X_cluster = _safe_indexing(X, cluster_mask) y_cluster = _safe_indexing(y, cluster_mask) cluster_class_mean = (y_cluster == class_sample).mean() if self.cluster_balance_threshold_ == "auto": balance_threshold = n_samples / total_inp_samples / 2 else: balance_threshold = self.cluster_balance_threshold_ # the cluster is already considered balanced if cluster_class_mean < balance_threshold: continue # not enough samples to apply SMOTE anticipated_samples = cluster_class_mean * X_cluster.shape[0] if anticipated_samples < self.nn_k_.n_neighbors: continue X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample)) valid_clusters.append(cluster_mask) cluster_sparsities.append( self._find_cluster_sparsity(X_cluster_class)) cluster_sparsities = np.array(cluster_sparsities) cluster_weights = cluster_sparsities / cluster_sparsities.sum() cluster_n_samples_list = np.zeros_like(cluster_weights) # if class_sample == 1: # print (n_samples) # print (cluster_weights) # print ([math.ceil( # n_samples * cluster_weights[valid_cluster_idx] - 1e-3 # ) for valid_cluster_idx, _ in enumerate(valid_clusters)]) # print (cluster_n_samples) if not valid_clusters: raise RuntimeError( f"No clusters found with sufficient samples of " f"class {class_sample}. 
Try lowering the " f"cluster_balance_threshold or increasing the number of " f"clusters.") for valid_cluster_idx, valid_cluster in enumerate(valid_clusters): X_cluster = _safe_indexing(X, valid_cluster) y_cluster = _safe_indexing(y, valid_cluster) X_cluster_class = _safe_indexing( X_cluster, np.flatnonzero(y_cluster == class_sample)) self.nn_k_.fit(X_cluster_class) nns = self.nn_k_.kneighbors(X_cluster_class, return_distance=False)[:, 1:] if valid_cluster_idx == self.kmeans_estimator_.n_clusters - 1: cluster_n_samples = int(n_samples - sum(cluster_n_samples_list)) else: cluster_n_samples = math.floor( n_samples * cluster_weights[valid_cluster_idx]) cluster_n_samples_list[valid_cluster_idx] = cluster_n_samples X_new, y_new = self._make_samples( X_cluster_class, y.dtype, class_sample, X_cluster_class, nns, cluster_n_samples, 1.0, ) stack = [np.vstack, sparse.vstack][int(sparse.issparse(X_new))] X_resampled = stack((X_resampled, X_new)) y_resampled = np.hstack((y_resampled, y_new)) # If given sample_weight if sample_weight is not None: # sample_weight is already validated in self.fit_resample() sample_weight_new = \ np.empty(y_resampled.shape[0] - y.shape[0], dtype=np.float64) sample_weight_new[:] = np.mean(sample_weight) sample_weight_resampled = np.hstack( [sample_weight, sample_weight_new]).reshape(-1, 1) sample_weight_resampled = \ np.squeeze(normalize(sample_weight_resampled, axis=0, norm='l1')) return X_resampled, y_resampled, sample_weight_resampled else: return X_resampled, y_resampled
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

dataset = pd.read_csv('studentinfoCHK.csv')
X = dataset.iloc[10000:20000, [6, 7]].values

mb_clustering = clustering = MiniBatchKMeans(n_clusters=2)
y_mb_clustering = mb_clustering.fit_predict(X)

plt.scatter(X[y_mb_clustering == 0, 0], X[y_mb_clustering == 0, 1],
            s=20, c='red', label='high score and high study credits')
plt.scatter(X[y_mb_clustering == 1, 0], X[y_mb_clustering == 1, 1],
            s=20, c='blue', label='high score and low study credits')
plt.title('Clusters of STUDENTS')
plt.xlabel('SCORE')
plt.ylabel('STUDIED CREDITS')
plt.plot()
True), ('Sparse comp. - MiniBatchSparsePCA', decomposition.MiniBatchSparsePCA(n_components=n_components, alpha=0.8, n_iter=100, batch_size=3, random_state=rng), True), ('MiniBatchDictionaryLearning', decomposition.MiniBatchDictionaryLearning(n_components=15, alpha=0.1, n_iter=50, batch_size=3, random_state=rng), True), ('Cluster centers - MiniBatchKMeans', MiniBatchKMeans(n_clusters=n_components, tol=1e-3, batch_size=20, max_iter=50, random_state=rng), True), ('Factor Analysis components - FA', decomposition.FactorAnalysis(n_components=n_components, max_iter=20), True), ] # ############################################################################# # Plot a sample of the input data plot_gallery("First centered Olivetti faces", faces_centered[:n_components]) # ############################################################################# # Do the estimation and plot it
def partialAddingLearn(feature_extracted_model, n_cluster, channal, ex_epoches, batch_size=141, epoches=32): # 获得训练数据 Kmeans_ds = ReadImage.getKmeansDataSet(batch_size) print('数据提取完成') # 获得预训练模型 # feature_extracted_model = getPre_trainedModel(checkpoint_path, image_height, image_weight) # print('预训练模型提取完成') # 使用tensorflow每次训练数据 iterator = Kmeans_ds.make_initializable_iterator() data_element = iterator.get_next() sess = tf.Session() sess.run(iterator.initializer) Kmeans_label = [] #增量式学习不需要生成全体样本特征 Kmeans_feature = [] print('进入训练特征提取+增量式kmeans学习过程') kmeans_mode = MiniBatchKMeans(n_clusters=n_cluster, batch_size=batch_size * channal, random_state=0) #初始化增量式学习模型 for cur_epoch in range(ex_epoches): for i in range(epoches): Kmeans_image, curKmeans_label = sess.run(data_element) cur_image_feature = feature_extracted_model.predict( Kmeans_image) #得到一个batchsize的深度特征2维矩阵,可以放入增量式学习当中 cur_image_vectors = getFeatureVector(cur_image_feature) #转为一维变量 if cur_epoch == 0: #只有在第一次循环时收集样本信息 Kmeans_feature.extend(cur_image_vectors) Kmeans_label.extend(curKmeans_label) kmeans_mode.partial_fit(cur_image_vectors) print('第%d轮增量式学习完成' % i) sess.close() # 保存kmeans模型 Kmeans_feature = np.asarray(Kmeans_feature) Kmeans_label = np.asarray(Kmeans_label) CSV.csvWrite('./BoW/data_csv/train_label.csv', enumerate(Kmeans_label)) joblib.dump(kmeans_mode, filename='./BoW/result/kmeans_' + str(n_cluster) + '_.pkl') # 保存文件 print('增量式学习完成,kmeans——%d保存完成' % n_cluster) # 获得相应的特征标签 print('获得相应的特征标签') feature_labels = kmeans_mode.predict(Kmeans_feature) print('特征标签获取完成') # 进入特征编码过程 print('进入特征编码过程') m = len(Kmeans_label) #样本数 print(m) histogram_code = [] for cur_image in range(m): cur_code = np.zeros((1, n_cluster)) for cur_feature in range(cur_image * channal, (cur_image + 1) * channal): cur_cluster = feature_labels[cur_feature] cur_code[0, cur_cluster] += 1 cur_code = cur_code / np.sum(cur_code) # 归一化 histogram_code.append(cur_code[0]) #数组只要在创造后就变为两维(ones,zeros) histogram_code = np.asarray(histogram_code) print(np.shape(histogram_code)) print('训练集histogram编码完成') CSV.csvWrite('./BoW/data_csv/train_code_' + str(n_cluster) + '.csv', histogram_code) print('直方图编码完成') return kmeans_mode, histogram_code, Kmeans_label
from sklearn.cluster import KMeans, MiniBatchKMeans

"""
######### K MEANS ################################################################
"""

num_clusters = 5

km = MiniBatchKMeans(
    n_clusters=num_clusters,
    max_iter=300,
    n_init=10,
    init="k-means++",
    batch_size=100,
    compute_labels=True,
)

result = km.fit_predict(soft)

soft_label_pred = km.labels_
centroids = km.cluster_centers_
inertia = km.inertia_
print("fatto in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print() # ############################################################################# # Do the actual clustering if opts.minibatch: print("*******MINI BATCH KMEANS***********") km = MiniBatchKMeans(n_clusters=num_cluster, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose) #elif opts.use_spectral: #print("*******SPECTRAL CLUSTERING***********") #km = SpectralClustering(n_clusters=num_cluster, affinity='precomputed', n_init=100, assign_labels = 'discretize') #elif opts.use_agglomerative: # print("************AGGLOMERATIVE CLUSTERING*********") #km = AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto', # connectivity=None, linkage='ward', memory=None, n_clusters=num_cluster, # pooling_func='deprecated') else: print("*******K-MEANS***********") km = KMeans(n_clusters=num_cluster, init='k-means++', max_iter=100,
from sklearn.cluster import MiniBatchKMeans # this now clusters in 100 or 400 clusters # I also used cluster numbers of 8, 20 and 40 cluster_names = [(8, 'cl0'), (20, 'cl1'), (40, 'cl2')] ckeys = [c[1] for c in cluster_names] kms = {} for nclust, key in cluster_names: km = MiniBatchKMeans(n_clusters=nclust) X = None count = 0 for e in event_info.find(): if X is None: X = np.array(e['words']) else: X = np.vstack((X, e['words'])) count += 1 if count % 10000 == 0: km.partial_fit(X) X = None print(count) kms[key] = km event_clusters = {} for e in event_info.find(): clusters = {key: int(km.predict(e['words'])[0]) for key, km in kms.items()} event_info.update({'id': e['id']}, {'$set': clusters}) # load database data, for fast access
import numpy as np
import pandas as pd
import pymysql
from joblib import dump  # dump(obj, filename) usage implies joblib here
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
from sklearn.preprocessing import QuantileTransformer


class Offbeatr(object):

    def __init__(self, random_state=823):
        self.rng = np.random.RandomState(random_state)
        self.keepers = ['danceability', 'energy', 'loudness', 'speechiness',
                        'acousticness', 'liveness', 'valence', 'tempo']

    def get_songs(self, songfile=None, host='35.196.88.209', user='******',
                  password='******', database='SPOTIFY'):
        """As a security measure, the client IP must be whitelisted in Google Cloud
        before song data can be fetched."""
        if not songfile:
            conn = pymysql.connect(host=host, user=user,
                                   password=password, database=database)
            query = """ SELECT * FROM songs """
            print('fetching songs from database')
            self.songs = pd.read_sql(query, conn)
            conn.close()
        else:
            print('reading songs from local file')
            self.songs = pd.read_csv(songfile, skiprows=[1])
        self.N = self.songs.shape[0]
        self.songs_labeled_ = self.songs[['song_id']].copy()
        qt = QuantileTransformer(output_distribution='normal', random_state=self.rng)
        self.raw_data = qt.fit_transform(np.array(self.songs[self.keepers]))
        # save the fitted transformer so new songs can be scaled the same way later
        dump(qt, 'qt.pickle')
        print("Saved transformer to file: 'qt.pickle'")

    def get_starting_clusters(self, mb_kmeans_n_clusters=25000, random_state=0,
                              batch_size=100000, verbose=0):
        print('computing starting clusters')
        self.mb_kmeans = MiniBatchKMeans(n_clusters=mb_kmeans_n_clusters,
                                         random_state=random_state,
                                         batch_size=batch_size,
                                         verbose=verbose)
        self.preds = self.mb_kmeans.fit_predict(self.raw_data)

    def agglom_cluster(self, cluster_sizes=[3800, 2528, 1264, 632]):
        """Run the agglomerative clustering algorithm on the mini-batch centroids."""
        # inits
        num_levels = len(cluster_sizes)
        centroids = np.zeros((sum(cluster_sizes), len(self.keepers)))
        fit_list = []
        colnames = []
        # calculate the agglomerative cluster labels, one level per cluster size
        print('performing agglomerative clustering')
        for i in range(num_levels):
            agglom = AgglomerativeClustering(n_clusters=cluster_sizes[i])
            fit = agglom.fit_predict(self.mb_kmeans.cluster_centers_)
            fit_list.append(fit + sum(cluster_sizes[:i]))
            level_labels = [fit_list[i][self.preds[j]] for j, _ in enumerate(self.preds)]
            colnames.append("level" + str(i))
            self.songs_labeled_[colnames[i]] = level_labels
        print("DataFrame created: 'songs_labeled_'")
        # calculate the centroid of every agglomerative cluster at every level
        for i in range(num_levels):
            colname = colnames[i]
            val = sum(cluster_sizes[:i])
            for j in range(val, val + cluster_sizes[i]):
                centroids[j, :] = np.mean(
                    self.raw_data[self.songs_labeled_[self.songs_labeled_[colname] == j].index],
                    axis=0)
        self.centroids_ = pd.DataFrame(centroids)
        self.centroids_.columns = self.keepers
        print("DataFrame created: 'centroids_'")

    def export_csv(self, save_songs=True, save_centroids=True, num_parts=20):
        if save_songs:
            print("creating 'songs_labeled_{id}.csv'")
            for id, df_i in enumerate(np.array_split(self.songs_labeled_, num_parts)):
                df_i.to_csv('songs_labeled_{id}.csv'.format(id=id), index=False)
        if save_centroids:
            print("creating 'centroids.csv'")
            self.centroids_.to_csv('centroids.csv', index=False)
        if not save_songs and not save_centroids:
            print('nothing to do')

    def beat_master(self, songfile=None, host='35.196.88.209', user='******',
                    password='******', database='SPOTIFY',
                    mb_kmeans_n_clusters=25000, random_state=0,
                    batch_size=100000, verbose=0,
                    cluster_sizes=[3800, 2528, 1264, 632],
                    save_songs=True, save_centroids=True, num_parts=20):
        """Run the whole self-contained pipeline (roughly 2.5 hours end to end)."""
        self.get_songs(songfile, host, user, password, database)
        self.get_starting_clusters(mb_kmeans_n_clusters, random_state, batch_size, verbose)
        self.agglom_cluster(cluster_sizes)
        self.export_csv(save_songs, save_centroids, num_parts)
        print('done!')
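# A minimal usage sketch for the class above, assuming a local CSV export of the
# songs table is available ('songs.csv' is a hypothetical file containing a
# song_id column plus the eight audio-feature columns in self.keepers); the
# smaller cluster counts are only meant to make a quick test run feasible.
ob = Offbeatr(random_state=823)
ob.beat_master(songfile='songs.csv',
               mb_kmeans_n_clusters=2500,
               cluster_sizes=[380, 252, 126, 63])
print(ob.songs_labeled_.head())
print(ob.centroids_.head())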
# (tail of a helper that counts how many items disagree with the majority true
#  label and returns the resulting mistake rate; its header is not part of this excerpt)
    _, p = np.unique(true_labels, return_inverse=True)
    # Counts the number of each index
    counts = np.bincount(p)
    # Gets the index with the highest count
    maxpos = counts.argmax()
    mistakes += (p != maxpos).sum()
    return mistakes / datapoints


#print "Preparing Tfidf vectorizer"
# prepare features
vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2,
                             stop_words='english', use_idf=True)
X = vectorizer.fit_transform(data)

#print "Fitting K-means for clusters 1 through 20"
#print "___________________"
#print '% 9s' % 'clusters time inertia homo compl v-meas ARI AMI Mistake Rate'
print("numc,homo,comp,v-meas,mr")
for numc in range(1, true_k + 1):
    mbkm = MiniBatchKMeans(n_clusters=numc, init='k-means++', max_iter=100,
                           n_init=5, verbose=False)
    assess_mbkm(mbkm, numc, X, labels)
print('Loading data and getting representations')
model = BoAW(100)
model.load(data_dir)
model.fit()

print('Building lookup')
lkp = model.toLookup()

print('Loading auditory space')
asp = AggSpace(lkp, 'mean')

the_data, labels_true = [], []
for instrument in instruments:
    the_data.append(asp.space[instrument])
    labels_true.append(instclass[instrument])
the_data = np.array(the_data)

mbk = MiniBatchKMeans(n_clusters=len(classes), batch_size=2, verbose=True,
                      compute_labels=True, max_iter=10000, n_init=25)
mbk.fit(the_data)
centroids = mbk.cluster_centers_
#labels_pred, _ = vq(the_data, centroids)

score = v_measure_score(labels_true, mbk.labels_)
print('V-measure:', score)

for instrument, label in zip(instruments, mbk.labels_):
    print('Instrument=%s,\tcluster=%d' % (instrument, label))
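# For context, a self-contained sketch of the same evaluate-against-known-labels
# pattern (cluster, then score with v_measure_score), using random data in place
# of the BoAW/AggSpace instrument vectors; shapes and class counts are
# illustrative assumptions only.
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import v_measure_score

rng = np.random.RandomState(42)
X_demo = rng.rand(30, 16)               # stand-in for the instrument representations
y_true = rng.randint(0, 3, size=30)     # stand-in for the instrument classes

mbk_demo = MiniBatchKMeans(n_clusters=3, batch_size=2, n_init=25, random_state=42)
mbk_demo.fit(X_demo)
print('V-measure:', v_measure_score(y_true, mbk_demo.labels_))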
# (tail of a helper that returns the per-cluster mean and standard deviation of DT)
    sigma = np.zeros(K)
    for i in range(K):
        ts = np.array(DT)[labels == i]
        mu[i] = np.mean(ts)
        sigma[i] = np.std(ts)
    return mu, sigma


# input args: K display
with open('test30.pickle', 'rb') as f:
    [X, Xp, Xl, Xo, X_all, K, Learn, Pz_d_km, Pw_z_km, Pw_z, Pz_d, Pd, Li,
     labels, terms, termsp, termsl, termso, terms_all, DT, ind2obj, clusModel] = pickle.load(f)

if K != int(sys.argv[1]):
    # re-cluster with the K requested on the command line
    K = int(sys.argv[1])
    km = MiniBatchKMeans(n_clusters=K, init='k-means++', n_init=100,
                         init_size=1000, batch_size=1000, verbose=True)
    km.fit(X)
    labels = km.labels_
    centers = km.cluster_centers_

clus2doc = {}
for i in range(len(labels)):
    clus2doc[labels[i]] = clus2doc.get(labels[i], set())
    clus2doc[labels[i]].add(i)

## print number of docs in each cluster
for i in clus2doc:
    print(str(i + 1) + "\t" + str(len(clus2doc[i])))

t0 = time()
Learn = (1, 10)
import logging
from collections import defaultdict

import numpy as np
import tqdm
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer


class TopicDocs:

    def __init__(self, ndim=128, random_seed=1965123, topic_tokens=8196, verbose=True):
        """
        Class initialization method.

        :param ndim: Number of latent dimensions (the square root is used as the number of topic clusters)
        :param random_seed: The random seed used
        :param topic_tokens: Maximum vocabulary size for the tf-idf term space
        :param verbose: Whether to have the printouts
        """
        self.ndim = int(np.sqrt(ndim))
        self.verbose = verbose
        self.random_seed = random_seed
        self.topic_tokens = topic_tokens

    def fit(self, text_list):
        """
        The fit method.

        :param text_list: List of input texts
        """
        if not type(text_list) == list:
            text_list = text_list.values.tolist()

        self.clx = TfidfVectorizer(max_features=self.topic_tokens)
        # term-document matrix: one row per token, one column per document
        docspace = self.clx.fit_transform(text_list).T

        # recover the token that corresponds to each row of docspace
        fnames = [(x, y) for x, y in self.clx.vocabulary_.items()]
        fnames = [x[0] for x in sorted(fnames, key=lambda x: x[1])]

        # cluster the tokens into "topics"
        self.clustering_algo = MiniBatchKMeans(n_clusters=self.ndim)
        clusters = self.clustering_algo.fit(docspace)
        assert len(clusters.labels_) == docspace.shape[0]
        cluster_assignments = clusters.labels_
        assert len(clusters.labels_) == len(fnames)

        self.topic_features = defaultdict(set)
        for k, v in zip(fnames, cluster_assignments):
            self.topic_features[v].add(k)

    def transform(self, new_documents):
        """
        Transform method.

        :param new_documents: The new set of documents to be transformed.
        :return new_features: The final embedding matrix
        """
        if not type(new_documents) == list:
            new_documents = new_documents.values.tolist()

        if self.verbose:
            logging.info("Transforming new documents.")

        new_features = np.zeros((len(new_documents), self.ndim))
        for enx, doc in tqdm.tqdm(enumerate(new_documents), total=len(new_documents)):
            parts = set(doc.lower().strip().split())
            for k, v in self.topic_features.items():
                # fraction of this topic's tokens that appear in the document
                overlap = len(parts.intersection(v)) / len(v)
                new_features[enx, k] = overlap
        return new_features

    def fit_transform(self, documents, b=None):
        """
        The sklearn-like fit-transform method.
        """
        self.fit(documents)
        return self.transform(documents)

    def get_feature_names(self):
        """
        Get feature names.
        """
        return list(["topic_" + str(x) for x in range(self.ndim)])
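# A small usage sketch for TopicDocs; the toy corpus below is made up purely for
# illustration, and ndim=9 yields int(sqrt(9)) = 3 token clusters ("topics").
docs = [
    "minibatch kmeans clusters tokens into topics",
    "tf idf weights tokens before clustering",
    "topic overlap scores become document features",
    "audio features are a different story entirely",
]

td = TopicDocs(ndim=9)
emb = td.fit_transform(docs)
print(emb.shape)                 # (4, 3)
print(td.get_feature_names())    # ['topic_0', 'topic_1', 'topic_2']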