def mean_shift(model_data, prediction_data=None):
    """Fit a default ``MeanShift`` model and label data with it.

    Args:
        model_data: Samples used to fit the clustering model.
        prediction_data: Optional samples to label with the fitted model.
            When omitted, ``model_data`` itself is labelled.

    Returns:
        A ``(labels, means)`` tuple: the predicted cluster index of every
        labelled sample and the fitted ``cluster_centers_``.
    """
    t0 = time()
    ms = MeanShift().fit(model_data)
    # Identity test on purpose: ``== None`` against an array is evaluated
    # element-wise and cannot be used as a single truth value.
    if prediction_data is None:
        prediction_data = model_data
    labels = ms.predict(prediction_data)
    means = ms.cluster_centers_
    # Single-argument print calls emit the same text on Python 2 and 3.
    print("Number of Means: %s" % means.shape[0])
    print("Mean Shift Time: %0.3f" % (time() - t0))
    return labels, means
def mean_shift(model_data, prediction_data=None):
    """Cluster ``model_data`` with a default ``MeanShift`` and predict labels.

    Args:
        model_data: Samples the model is fitted on.
        prediction_data: Samples to label; defaults to ``model_data`` when
            left as ``None``.

    Returns:
        ``(labels, means)`` where ``labels`` are the predicted cluster
        indices and ``means`` is the fitted ``cluster_centers_`` array.
    """
    t0 = time()
    ms = MeanShift().fit(model_data)
    # ``is None`` rather than ``== None``: comparing an array with ``==``
    # yields an element-wise result that has no single truth value.
    target = model_data if prediction_data is None else prediction_data
    labels = ms.predict(target)
    means = ms.cluster_centers_
    # Parenthesised single-argument prints behave identically on Python 2/3.
    print("Number of Means: %s" % means.shape[0])
    print("Mean Shift Time: %0.3f" % (time() - t0))
    return labels, means
def meanshift(df):
    """Fit mean shift on ``X`` and show a 2-D PCA scatter of ``features``.

    The points are coloured by the cluster predicted for each row of
    ``features``. Nothing is returned; the plot is shown with ``plt.show()``.

    Args:
        df: Currently unused.
    """
    from sklearn.cluster import MeanShift

    # NOTE(review): ``df`` is never read; ``X`` and ``features`` are looked up
    # in the enclosing scope. Presumably both should come from ``df`` --
    # confirm with the caller before changing.
    model = MeanShift()
    model.fit(X)

    pca = PCA(n_components=2, random_state=40)
    reduced_features = pca.fit_transform(features)
    plt.scatter(reduced_features[:, 0],
                reduced_features[:, 1],
                c=model.predict(features))
    plt.show()
class TMeanshiftClus(Discretize):
    """Turn timestamps into ``"temp_<label>"`` tokens via mean-shift clusters."""

    def __init__(self, bandwidth):
        self.ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)

    @staticmethod
    def _created_at_features(created_at):
        """Map each timestamp through ``disc_time_to_sec_day``.

        The timestamps are first wrapped in a ``DatetimeIndex`` and cast to
        ``int64`` before being handed to the helper one by one.
        """
        stamps = pd.DatetimeIndex(created_at).values.astype(np.int64)
        return [disc_time_to_sec_day(stamp) for stamp in stamps]

    def fit_transform(self, created_at):
        """Fit the clusterer on ``created_at`` and return its training tokens."""
        features = np.array(self._created_at_features(created_at))
        self.ms.fit(features)
        return ["temp_" + str(label) for label in list(self.ms.labels_)]

    def transform(self, created_at):
        """Return tokens for ``created_at`` using the already fitted clusterer."""
        labels = self.ms.predict(self._created_at_features(created_at))
        return ["temp_" + str(label) for label in list(labels)]
def _select(X, bandwidth=None, min_bin_freq=1): min_ = min(x[0] for x in X) max_ = max(x[0] for x in X) if min_ == max_: return [min_, min_ + 1] if bandwidth is None: bandwidth = estimate_bandwidth(X, quantile=0.1) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=min_bin_freq) try: ms.fit(X) split_points = {} for x in X: label = ms.predict([x])[0] val = x[0] if label not in split_points: split_points[label] = val else: split_points[label] = min(val, split_points[label]) sp = list(split_points.values()) except: sp = [min_] sp += [max_ + 1] return sorted(sp)
def mean_shift(x_train, y_train, x_test, y_test, range_bandwidth):
    """Print the micro-averaged F1 score of mean shift for several bandwidths.

    For every bandwidth in ``range_bandwidth`` a fresh ``MeanShift`` is fitted
    on ``(x_train, y_train)``, used to predict ``x_test``, and scored against
    ``y_test``. Nothing is returned.
    """
    report = 'mean shift n_bandwidth = {}, f1_score = {}'
    for n_bandwidth in range_bandwidth:
        model = MeanShift(bandwidth=n_bandwidth)
        model.fit(x_train, y_train)
        score = f1_score(y_test, model.predict(x_test), average='micro')
        print(report.format(n_bandwidth, str(score)))
def __call__(self, data, n):
    """Cluster ``data`` with mean shift and group sample indices by cluster.

    The bandwidth is estimated from ``data`` (``quantile=0.2``,
    ``n_samples=500``). ``n`` is accepted but not used.

    Returns:
        Dict mapping each predicted cluster label to the array of row
        indices assigned to it.
    """
    width = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    model = MeanShift(bandwidth=width, bin_seeding=True)
    model.fit(data)
    assignments = model.predict(data)

    clusters = {}
    for label in np.unique(assignments):
        clusters[label] = np.where(assignments == label)[0]
    return clusters
def mean_shift(im):
    """Segment an image by mean-shift clustering of its pixels.

    The image is flattened to rows of 3 values, clustered, and every pixel is
    replaced by the centre of its predicted cluster. The recoloured image is
    also written to ``outfile.jpg``.

    Args:
        im: Array that can be reshaped to ``(-1, 3)``.

    Returns:
        ``(labels, area, cluster_centers, ms, imNew)``: training labels
        reshaped to rows of ``im.shape[1]`` entries, per-cluster pixel share
        in percent as a column vector, the cluster centres, the fitted model
        and the recoloured image in the original shape.
    """
    original_shape = im.shape
    im = im.reshape((-1, 3))
    bandwidth = estimate_bandwidth(im, quantile=0.1, n_samples=1500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(im)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    n_clusters_ = len(np.unique(labels))
    # Single-argument print call: same output on Python 2 and 3.
    print("number of estimated cluster :%d" % n_clusters_)

    assigned = ms.predict(im)
    # Recolour all pixels with one fancy-indexing operation instead of a
    # Python loop over every pixel.
    imNew = np.zeros(im.shape)
    imNew[:] = cluster_centers[assigned]
    # ``minlength`` keeps one row per cluster even if a cluster receives no
    # pixel; bincount grows past it if predict() returns a higher index.
    area = np.bincount(assigned, minlength=n_clusters_).astype(float)
    area = area.reshape(-1, 1)
    imNew = imNew.reshape(original_shape)
    area = area * 1.0 / area.sum() * 100
    # NOTE(review): scipy.misc.imsave no longer exists in current SciPy
    # releases; replacing it needs an imaging dependency this block does not
    # have in scope.
    scipy.misc.imsave('outfile.jpg', imNew)
    labels = labels.reshape((-1, original_shape[1]))
    return labels, area, cluster_centers, ms, imNew
def cluster(features):
    """Cluster ``features`` with a default ``MeanShift`` and persist the result.

    The predicted labels are printed and saved to ``meanshift_labels.npy``;
    the fitted model is pickled to ``meanshift.pkl``. Nothing is returned.
    """
    model = MeanShift()
    model.fit(features)
    meanshift_labels = model.predict(features)

    print(meanshift_labels)
    np.save('meanshift_labels.npy', meanshift_labels)
    with open('meanshift.pkl', 'wb') as handle:
        pickle.dump(model, handle)
def cropImage(image):
    """Split an image into vertical strips at low-ink column runs.

    A smoothed per-column count of zero-valued pixels is computed, columns
    below a scaled threshold are grouped with mean shift, and the image is
    cut at the last fully-empty column of each inner group. The raw histogram
    plot is saved to ``histogram.pdf`` as a side effect.

    Returns:
        List of image slices covering ``[0, width)`` from left to right.
    """
    img = image.copy()
    height, width = img.shape[:2]
    # NOTE(review): 11675 looks like a reference image height that the
    # thresholds below were tuned for -- confirm.
    sf = float(height) / float(11675)

    # Zero-valued pixels per column, smoothed with a 5-column rolling mean.
    zero_counts = [height - cv2.countNonZero(img[:, col]) for col in range(width)]
    histogram = pd.Series(zero_counts).rolling(5).mean()
    histogram.plot()
    #ax.set_ylim([0,200])
    plt.savefig('histogram.pdf', bbox_inches='tight')

    low_columns = histogram < sf * 150
    dip_df = histogram[low_columns].to_frame().rename(columns={0: 'count'})
    dip_df.loc[dip_df['count'] < sf * 25, 'count'] = 0

    indices = np.array(dip_df.index.tolist()).reshape(-1, 1)
    ms = MeanShift()
    ms.fit(indices)
    dip_df = dip_df.assign(group=ms.predict(indices))

    def last_empty_column(group):
        return max(group[group['count'] == 0].index)

    per_group_cuts = dip_df.groupby('group').apply(last_empty_column).tolist()
    # The smallest and largest group cuts are dropped; the image borders are
    # used instead.
    cut_points = [0] + sorted(per_group_cuts)[1:-1] + [width]

    return [
        img[0:height, left:right]
        for left, right in zip(cut_points[:-1], cut_points[1:])
    ]
def extract_texts(
        blocks_dict: Dict[int, List[TextBlockInfo]]
) -> Tuple[List[str], List[int]]:
    """
    Reconstructs texts from each group of text blocks; computes lines in each group

    Blocks of a group are sorted in place: first by the centre of the
    mean-shift row cluster their ``bounds.y`` falls into, then by ``bounds.x``.
    A group without blocks contributes an empty text and zero lines.

    :param blocks_dict: result returned by calling sift_ocr
    :return: tuple of (texts, lines) for each group
    """
    texts = []
    # How many lines are in each group
    lines = []
    # Start from group 1, since group 0 is every group combined
    for grp in range(1, len(blocks_dict)):
        blocks = blocks_dict[grp]
        if not blocks:
            # Nothing to cluster; keep texts/lines aligned with the groups.
            texts.append('')
            lines.append(0)
            continue

        # Mean-shift cluster text blocks to normalize rows
        ys = np.array([block.bounds.y for block in blocks]).reshape(-1, 1)
        model = MeanShift(bandwidth=5)
        model.fit(ys)
        centers = model.cluster_centers_
        lines.append(len(centers))

        # Resolve every block's row centre with one batched predict() rather
        # than one call per block inside the sort key.
        row_centers = centers[model.predict(ys), 0]
        order = sorted(range(len(blocks)),
                       key=lambda i: (row_centers[i], blocks[i].bounds.x))
        # Slice assignment keeps the sort in place, as callers may rely on it.
        blocks[:] = [blocks[i] for i in order]

        texts.append(' '.join(block.text for block in blocks).lower())
    return texts, lines
def clusterMeanshift(placeDb):
    """Tag every place with a mean-shift ``cluster_id`` and sort in place.

    scikit-learn needs plain array data, so the ``x``/``y`` coordinates are
    extracted into a list, clustered, and the predicted label is written back
    to each dict. ``placeDb`` ends up ordered by ``cluster_id`` and, within a
    cluster, by ``y``.

    Args:
        placeDb: List of dicts with ``'x'`` and ``'y'`` keys; modified in
            place. An empty list is left untouched.
    """
    if not placeDb:
        return

    X = [(node['x'], node['y']) for node in placeDb]

    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    if bandwidth <= 0:
        # Fall back to a fixed bandwidth when the estimate is not positive.
        bandwidth = 100
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    # Predict once for all places; the previous loop re-ran predict() on the
    # full data set for every single element.
    for node, cluster_id in zip(placeDb, ms.predict(X)):
        node['cluster_id'] = cluster_id

    # Equivalent to sorting by 'y' and then stably by 'cluster_id'.
    placeDb.sort(key=lambda s: (s['cluster_id'], s['y']))
def test_meanshift_predict(global_dtype):
    """Check that ``predict`` on the training data matches ``fit_predict``."""
    data = X.astype(global_dtype, copy=False)
    estimator = MeanShift(bandwidth=1.2)

    fitted_labels = estimator.fit_predict(data)
    predicted_labels = estimator.predict(data)

    assert_array_equal(fitted_labels, predicted_labels)
def getMS_repx_data(data_x, data_y):
    """Split the data and post-process it according to mean-shift clusters.

    Mean shift is fitted on the training part of the PCA-transformed features
    and used to label the test part. Both label sets are passed, together
    with the matching split of the original features, to ``replace_Cluster``.

    Side effect: the module-level ``error_number`` is set to the number of
    train plus test samples whose cluster label is not 0.

    Returns:
        ``(deal_train_x, deal_test_x, old_y_train, old_y_test)``.
    """
    global error_number

    pca_x = PCA_mars.getPcaComponent(data_x, n_components=0.9)

    # Both splits use identical, unshuffled settings.
    split_options = dict(test_size=0.3, random_state=0, shuffle=False)
    old_x_train, old_x_test, old_y_train, old_y_test = train_test_split(
        data_x, data_y, **split_options)
    x_train, x_test, _, _ = train_test_split(pca_x, data_y, **split_options)

    # Compute clustering with MeanShift on an estimated bandwidth.
    bandwidth = estimate_bandwidth(x_train, quantile=0.2, random_state=1)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=False)
    ms.fit(x_train)
    test_labels = ms.predict(x_test)
    train_labels = ms.labels_

    error_number = (train_labels[train_labels != 0].size +
                    test_labels[test_labels != 0].size)

    # Replace the X values that fall into the unusual clusters
    # (translated from the original comment).
    deal_train_x = replace_Cluster(old_x_train, train_labels)
    deal_test_x = replace_Cluster(old_x_test, test_labels)
    return (deal_train_x, deal_test_x, old_y_train, old_y_test)
class MSSelector:
    """Select per-column split points from mean-shift clusters of ``traces``."""

    def __init__(self, traces, bandwidth=None, min_bin_freq=None):
        """Store ``traces`` and configure the clusterer.

        Args:
            traces: Data set exposing ``count()`` and ``select(col)``.
            bandwidth: Mean-shift bandwidth, passed through unchanged.
            min_bin_freq: Minimum bin population for seeding. When falsy it
                defaults to 1% of the row count.
        """
        if not min_bin_freq:
            # scikit-learn documents ``min_bin_freq`` as an int. Rounding the
            # 1% default up admits exactly the bins the fractional threshold
            # admitted, because bin populations are whole numbers.
            min_bin_freq = max(1, int(np.ceil(traces.count() * 0.01)))
        self.traces = traces
        self.ms = MeanShift(bandwidth=bandwidth,
                            bin_seeding=True,
                            min_bin_freq=min_bin_freq)

    def select(self, col):
        """Return sorted split points for column ``col``.

        Each cluster contributes its smallest value; the column maximum plus
        one is appended as the closing boundary.
        """
        it = map(itemgetter(col), self.traces.select(col).collect())
        X = np.fromiter(it, float).reshape(-1, 1)
        self.ms.fit(X)
        # One batched predict() instead of one call per row.
        labels = self.ms.predict(X)
        split_points = {}
        for label, val in zip(labels, X[:, 0]):
            if label not in split_points:
                split_points[label] = val
            else:
                split_points[label] = min(val, split_points[label])
        max_ = self.traces.select(col).rdd.max()[0]
        sp = list(split_points.values())
        sp.append(max_ + 1)
        return sorted(sp)

    def select_foreach(self, cols):
        """Return ``{column: split points}`` for every column in ``cols``."""
        return {c: self.select(c) for c in cols}
def cluster(X, number_cluster, bandwidth=None, alg="kmeans"):
    """Cluster ``X`` with the selected algorithm.

    Args:
        X: Array of samples; cast to ``float32`` before clustering.
        number_cluster: Number of clusters for ``"kmeans"`` and
            ``"spectral"``; unused by ``"meanshift"``.
        bandwidth: Mean-shift bandwidth; estimated from ``X`` when falsy.
        alg: One of ``"kmeans"``, ``"spectral"`` or ``"meanshift"``.

    Returns:
        The predicted labels for ``"kmeans"`` and ``"spectral"``; the tuple
        ``(labels, cluster_centers, bandwidth)`` for ``"meanshift"``.

    Raises:
        ValueError: If ``alg`` is not one of the supported names.
    """
    # Fail fast with a clear message; previously an unknown name only
    # surfaced as an UnboundLocalError at the final return.
    if alg not in ("kmeans", "spectral", "meanshift"):
        raise ValueError(
            "unknown clustering algorithm %r; expected 'kmeans', "
            "'spectral' or 'meanshift'" % (alg,))

    X = X.astype(np.float32)
    if alg == "kmeans":
        return KMeans(n_clusters=number_cluster,
                      random_state=random_state).fit_predict(X)
    if alg == "spectral":
        return SpectralClustering(n_clusters=number_cluster,
                                  random_state=random_state,
                                  n_jobs=10).fit_predict(X)

    # There is a little insight here, the number of neighbors are somewhat
    # dependent on the number of neighbors used in the dynamic graph network.
    if not bandwidth:
        bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    # 5000 seed rows drawn with replacement from X.
    seeds = X[np.random.choice(np.arange(X.shape[0]), 5000)]
    clustering = MeanShift(bandwidth=bandwidth, seeds=seeds, n_jobs=32).fit(X)
    y_pred = clustering.predict(X)
    return y_pred, clustering.cluster_centers_, bandwidth
def get_cluster_assignments(data):
    """Return the mean-shift cluster index predicted for every row of ``data``.

    A ``MeanShift`` with bin seeding is fitted on ``data`` and then used to
    predict the same samples.
    """
    meanshift = MeanShift(bin_seeding=True).fit(data)
    return meanshift.predict(data)
def assign_variants_to_clonal_cluster(vafs, ids):
    """Flag each variant as clonal or subclonal from its allele frequency.

    The VAFs are clustered with mean shift. Clusters are then visited from
    highest to lowest mean VAF and marked clonal until at least
    ``max(10% of variants, 10)`` mutations are covered.

    Args:
        vafs: Sequence of variant allele frequencies.
        ids: Variant identifiers, indexable by position and aligned with
            ``vafs``.

    Returns:
        Dict mapping every variant id to ``True`` (clonal) or ``False``.
        Empty input yields an empty dict.
    """

    # Mark which clusters returned by the clustering are clonal by iterating
    # over clusters by mean vaf and returning the clusters once we have at
    # least minClonalMuts mutations accumulated.
    def assign_clonal_subclonal_clusters(clusterDf, minClonalMuts=10):
        summary = []
        for cluster in set(clusterDf['cluster']):
            members = clusterDf[clusterDf['cluster'] == cluster]
            summary.append(
                (np.nanmean(members['vaf']), members.shape[0], cluster))
        runningMutSum = 0
        clonalClusters = []
        for _meanVaf, nMut, cluster in sorted(summary, reverse=True):
            clonalClusters.append(cluster)
            runningMutSum += nMut
            if runningMutSum >= minClonalMuts:
                break
        # When the threshold is never reached (fewer variants than
        # minClonalMuts) every cluster has been collected; return them
        # instead of falling through to None.
        return clonalClusters

    a = np.array(vafs).reshape(-1, 1)
    n_variants = a.shape[0]
    if n_variants == 0:
        return {}

    prediction = MeanShift().fit(a).predict(a)

    df = pd.DataFrame({
        'vaf': a[:, 0],
        'cluster': prediction,
        'varUuid': [ids[i] for i in range(n_variants)],
    })
    # at least 10% of mutation in every case are called clonal
    minCMut = max(.1 * df.shape[0], 10)
    clonalClusters = assign_clonal_subclonal_clusters(df, minClonalMuts=minCMut)
    df['clonal'] = df['cluster'].isin(clonalClusters)
    return dict(zip(df['varUuid'], df['clonal']))
def meanshift(data, bandwidth, min_bin_freq):
    """Fit mean shift on ``data``, print and return its ``sil_score``.

    Returns:
        ``(model, labels, score)``: the fitted ``MeanShift``, the labels it
        predicts for ``data`` and the value ``sil_score(data, labels)``.
    """
    model = MeanShift(bandwidth=bandwidth,
                      min_bin_freq=min_bin_freq,
                      n_jobs=-1)
    model.fit(data)
    assignments = model.predict(data)
    quality = sil_score(data, assignments)
    print(quality)
    return model, assignments, quality
def ClusterDetection(df_preprocessed):
    """Project the data to two PCA components and cluster it with mean shift.

    Returns:
        The cluster label predicted for every projected sample.
    """
    reduced_data = PCA(n_components=2).fit_transform(df_preprocessed)
    clusterer = MeanShift()
    clusterer.fit(reduced_data)
    return clusterer.predict(reduced_data)
def _save_histogram_plot(series, path):
    """Plot ``series`` with the y-axis fixed to 0..200 and save it to ``path``."""
    fig = plt.figure()
    ax = series.plot()
    ax.set_ylim([0, 200])
    fig.savefig(path, bbox_inches='tight')
    plt.close(fig)


def cropImage(image, file, do_plots):
    """Split an image into vertical strips at low-ink column runs.

    Args:
        image: Image array; a copy is sliced, the input is not modified.
        file: File name used to derive the plot names (text before
            ``'.png'``).
        do_plots: When true, the raw and a smoothed column histogram are
            saved as ``<name>.histogram.pdf`` and
            ``<name>.histogram.smooth.pdf``.

    Returns:
        List of image slices from left to right; empty when no cut points
        could be determined.
    """
    croppedImages = []
    img = image.copy()
    height, width = img.shape[:2]
    # NOTE(review): 11675 x 7820 looks like the reference image size the
    # thresholds below were tuned for -- confirm.
    sf = float(height) / 11675.0
    sfw = float(width) / 7820.0

    # Rolling mean of zero-valued pixels per column.
    histogram = pd.Series([
        height - cv2.countNonZero(img[:, i]) for i in range(width)
    ]).rolling(5, center=True).mean()

    if do_plots:
        stem = file.partition('.png')[0]
        _save_histogram_plot(histogram, stem + '.histogram.pdf')
        smoothed = histogram.rolling(50, center=True).mean().rolling(
            10, center=True).mean()
        _save_histogram_plot(smoothed, stem + '.histogram.smooth.pdf')

    # Keep the columns whose smoothed count is below 150 (scaled).
    dip_df = histogram[histogram < sf * 150].to_frame().rename(
        columns={0: 'count'})
    # Treat counts below 50 (scaled) as completely empty.
    dip_df.loc[dip_df['count'] < sf * 50, 'count'] = 0
    # NOTE(review): this runs after dip_df was derived and histogram is not
    # read again, so it has no visible effect here -- confirm the intent.
    histogram.iloc[0] = 0

    # Group the candidate columns with mean shift.
    indices = np.array(dip_df.index.tolist()).reshape(-1, 1)
    ms = MeanShift()
    ms.fit(indices)
    dip_df = dip_df.assign(group=ms.predict(indices))

    # Pick the rightmost empty column of each group, shifted left by a scaled
    # margin. Might not work if the image is tilted.
    try:
        cut_points = [0] + sorted(
            dip_df.groupby('group').apply(
                lambda x: max(x[x['count'] == 0].index - int(sfw * 35.0))
            ).tolist())[1:-1] + [width]
    except Exception:
        # Best-effort: e.g. a group without any empty column makes max()
        # fail. Unlike a bare ``except`` this does not swallow
        # KeyboardInterrupt/SystemExit.
        cut_points = [0]

    for left, right in zip(cut_points[:-1], cut_points[1:]):
        croppedImages.append(img[0:height, left:right])
    return croppedImages
class mean_shift_algo_wrapper:
    """Thin adapter that forwards ``fit``/``predict`` to a default ``MeanShift``."""

    def __init__(self):
        self.wrapped = MeanShift()

    def fit(self, data):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        fitted = self.wrapped.fit(data)
        return fitted

    def predict(self, data):
        """Return the wrapped estimator's predictions for ``data``."""
        labels = self.wrapped.predict(data)
        return labels
def meanshiftt(data):
    """Cluster ``data`` with mean shift on an estimated bandwidth.

    The bandwidth is estimated with ``quantile=0.2`` and ``n_samples=10``.

    Returns:
        ``(idx, ctrs)``: predicted cluster index per sample and the fitted
        cluster centres.
    """
    model = MeanShift(
        bandwidth=estimate_bandwidth(data, quantile=0.2, n_samples=10),
        bin_seeding=True,
    )
    model.fit(data)
    return model.predict(data), model.cluster_centers_
def _test_mean_shift(self, bandwidth=None, backend="torch", extra_config=None):
    """Check that a converted ``MeanShift`` predicts like the original.

    Runs once with ``cluster_all=True`` and once with ``False`` on seeded
    random ``float32`` data of shape ``(100, 200)``.

    Args:
        bandwidth: Bandwidth handed to ``MeanShift``.
        backend: Target backend passed to ``hummingbird.ml.convert``.
        extra_config: Optional conversion options; a fresh empty dict is used
            when omitted.
    """
    # A ``{}`` default would be one dict shared by every call of this helper.
    if extra_config is None:
        extra_config = {}

    for cluster_all in [True, False]:
        model = MeanShift(bandwidth=bandwidth, cluster_all=cluster_all)
        np.random.seed(0)
        X = np.random.rand(100, 200)
        X = np.array(X, dtype=np.float32)
        model.fit(X)
        torch_model = hummingbird.ml.convert(model,
                                             backend,
                                             X,
                                             extra_config=extra_config)
        self.assertIsNotNone(torch_model)
        np.testing.assert_allclose(model.predict(X),
                                   torch_model.predict(X),
                                   rtol=1e-6,
                                   atol=1e-6)
def clustering_mean_shift(data_res, b):
    """
    Runs sklearn's mean shift with bandwidth ``b`` on ``data_res`` and returns
    the predicted labels together with the cluster centers.
    """
    model = MeanShift(bandwidth=b)
    model.fit(data_res)
    return model.predict(data_res), model.cluster_centers_
def get_final_ans(self, X_test_proba, h=0.3):
    """Reduce every row to the mean of its most populated mean-shift cluster.

    Each row of ``X_test_proba.values`` is clustered on its own (bandwidth
    ``h``); the values belonging to the largest cluster are averaged. On a
    tie the cluster with the smallest label wins.

    Returns:
        1-D array with one value per row.
    """
    ans = np.zeros(X_test_proba.shape[0])
    for row, pred in enumerate(X_test_proba.values):
        column = pred.reshape(-1, 1)
        model = MeanShift(bandwidth=h)
        model.fit(column)
        assigned = model.predict(column)
        labels, sizes = np.unique(assigned, return_counts=True)
        # argmax returns the first maximum, i.e. the smallest label on ties.
        dominant = labels[np.argmax(sizes)]
        ans[row] = pred[assigned == dominant].mean()
    return ans
def evaluate_learners(trainData, testData):
    '''
    Run several clustering learners to compare their behaviour.

    Generator yielding one tuple per result:
    (title, predicted classes, index), where index is 0 for clusters of
    trainData and 1 for clusters of testData. Only Mean Shift and K Means
    also produce a test result.
    '''
    from sklearn.cluster import (MeanShift, MiniBatchKMeans,
                                 SpectralClustering, AgglomerativeClustering)

    # Learners that are also asked to label testData after fitting.
    with_test_prediction = (
        # bandwidth=None: let the learner use its own heuristic for
        # determining the number of clusters to create
        ('Mean Shift clusters', lambda: MeanShift(bandwidth=None)),
        ('K Means clusters', lambda: MiniBatchKMeans(n_clusters=2)),
    )
    for title, make_learner in with_test_prediction:
        learner = make_learner()
        yield title + ' train', learner.fit_predict(trainData), 0
        yield title + ' test', learner.predict(testData), 1

    # Learners that are only run on trainData.
    train_only = (
        ('Spectral clusters train',
         lambda: SpectralClustering(n_clusters=2)),
        ('Agglo clusters (N=2) train',
         lambda: AgglomerativeClustering(n_clusters=2)),
        ('Agglo clusters (N=5) train',
         lambda: AgglomerativeClustering(n_clusters=5)),
    )
    for title, make_learner in train_only:
        yield title, make_learner().fit_predict(trainData), 0
def inner_band_ratio(self):
    '''Return the median of the shifted mean-shift labels of a synthetic band.

    A band of 30 terms is built from an array shaped like ``self.data``,
    flattened into a single sample, clustered with mean shift, and the
    median of ``labels + 0.3`` for the band shifted by 0.2 is returned.
    '''
    from sklearn.neighbors import NearestNeighbors
    from sklearn.cluster import MeanShift, estimate_bandwidth

    n_terms = 30
    weights = np.zeros(self.data.shape)
    for row in range(len(self.data)):
        weights[row] += 1

    # Term k is row (k mod 5) of the weights scaled by k.
    terms = [weights[k % 5] * k for k in range(n_terms)]
    energy_band = np.asarray(terms).reshape((1, -1))

    width = estimate_bandwidth(energy_band, quantile=0.1)
    model = MeanShift(bandwidth=width + 0.1)
    model.fit(energy_band)
    ys = model.predict(energy_band + 0.2)
    return np.median(ys + 0.3)
class MeanshiftClus(Clus):
    """Mean-shift based clusterer that keeps the fitted centroids."""

    def __init__(self, pd, bandwidth, kernel_bandwidth):
        super(MeanshiftClus, self).__init__(pd)
        self.kernel_bandwidth = kernel_bandwidth
        self.ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)

    def fit(self, X):
        """Fit mean shift on ``X`` and refit ``self.nbrs`` on the centroids."""
        self.ms.fit(np.array(X))
        self.centroids = self.ms.cluster_centers_
        # ``self.nbrs`` is not created in this class; presumably the base
        # class sets it up -- confirm.
        self.nbrs.fit(self.centroids)

    def predict(self, x):
        """Return the cluster label predicted for the single sample ``x``."""
        labels = self.ms.predict([x])
        return labels[0]
def clustering(emb):
    """Cluster a 2-D t-SNE embedding of ``emb`` with mean shift and plot it.

    ``emb`` is scaled with the module-level ``scaler``, embedded with t-SNE,
    clustered, and shown as a scatter plot coloured by predicted cluster.

    Returns:
        The cluster label predicted for every embedded sample.
    """
    temp = scaler.fit_transform(emb)
    Y = TSNE(n_components=2).fit_transform(temp)
    # ``max_iter`` must be an integer; it was previously the string '200'.
    cluster_ms = MeanShift(bandwidth=3, max_iter=200,
                           cluster_all=False).fit(Y)
    y_ms = cluster_ms.predict(Y)
    # Actually open a new figure; the bare attribute ``plt.figure`` did nothing.
    plt.figure()
    plt.scatter(Y[:, 0], Y[:, 1], c=y_ms, s=50, cmap='viridis')
    plt.show()
    return y_ms
def detect_text_meanShift(file_names, image_path):
    """Mean-shift cluster the grey levels of each image and display it.

    For every file name the image is loaded from ``image_path``, converted to
    greyscale, its pixels are clustered by intensity (bandwidth 3), the
    training labels are printed and the greyscale image is shown until a key
    is pressed.

    Args:
        file_names: Iterable of image file names.
        image_path: Directory prefix joined to each name with ``"/"``.
    """
    for name in tqdm(file_names):
        imageNameFile = image_path + "/" + name
        # NOTE(review): no check is made that the image could be loaded
        # before it is converted -- confirm how missing files should be
        # handled.
        image = cv.imread(imageNameFile)
        image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        imageArray = np.reshape(image, (-1, 1))
        clustering = MeanShift(bandwidth=3).fit(imageArray)
        # The extra predict() pass over every pixel was dropped: its result
        # was never used.
        print(clustering.labels_)
        cv.imshow('blackthr', image)
        cv.waitKey()
class TMeanshiftClus(Discretize):
    """Turn samples into ``"coord_<label>"`` tokens via mean-shift clusters."""

    def __init__(self, bandwidth):
        self.ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=10)

    def fit_transform(self, X):
        """Fit on ``X``, print the number of labels and return their tokens."""
        self.ms.fit(np.array(X))
        labels = list(self.ms.labels_)
        print(len(labels))
        return ["coord_" + str(label) for label in labels]

    def transform(self, x):
        """Return the tokens predicted for ``x`` by the fitted clusterer."""
        labels = list(self.ms.predict(x))
        return ["coord_" + str(label) for label in labels]
def executaMeanShift2(cluster_atual, similaridade, kmer):
    """Pick a new centroid for ``cluster_atual`` using mean-shift labels.

    A ``MeanShift`` model is fitted on the distinct ``[index, value]`` pairs
    taken from ``histo.T[1:2][0]`` of every non-centroid member, then used to
    label the centroid's own pairs. Each member is scored with
    ``return_intersection`` and the member with the highest score replaces
    the centroid; the current centroid is kept when nobody scores higher.

    ``similaridade`` and ``kmer`` are accepted but not used here.

    Returns:
        The selected member of ``cluster_atual.cluster`` (or the current
        centroid).
    """
    # print("#### MeanShift running...")
    x = []
    # print("Current cluster contains: ", len(cluster_atual.cluster))
    # Training set: unique [position, value] pairs of all non-centroid members.
    for k in cluster_atual.cluster:
        if (k != cluster_atual.centroid):
            aux = np.array(k.histo.T[1:2][0])
            for i in range(len(aux)):
                if ([i, aux[i]] not in x):
                    x.append([i, aux[i]])
    # Query set: every [position, value] pair of the current centroid.
    aux = cluster_atual.centroid.histo.T[1:2][0]
    p = []
    for i in range(len(aux)):
        p.append([i, aux[i]])
    ms = MeanShift(bandwidth=2, bin_seeding=True)
    ms.fit(x)
    predit = np.array([ms.predict(p)])
    centroid_novo = cluster_atual.centroid
    # From here on ``x`` is the centroid's value row and ``p`` holds the
    # predicted label per position.
    x = cluster_atual.centroid.histo.T[1:2][0]
    p = predit[0]
    aux = []
    # Keep the predicted label only where the value is non-zero.
    for i in range(len(x)):
        if (x[i] == 0):
            aux.append(0)
        else:
            aux.append(p[i])
    # NOTE(review): ``np.array(aux)[0]`` passes only the first entry of the
    # masked labels to return_intersection -- confirm this is intended.
    maior = return_intersection(
        np.array(aux)[0], cluster_atual.centroid.histo.T[1:2][0])
    for k in cluster_atual.cluster:
        x = k.histo.T[1:2][0]
        aux = []
        for i in range(len(x)):
            if (x[i] == 0):
                aux.append(0)
            else:
                aux.append(p[i])
        x = k.histo.T[1:2]
        r = return_intersection(np.array(aux)[0], x[0])
        if (r > maior):
            centroid_novo = k  # Updating the centroid
            maior = r
    return centroid_novo
def pipeline(chunks, directory, chunks_file_name, chunks_centers_file_name,
             n_clus, bw):
    """
    Main pipeline for the first phase of data mining. Chunks clustering.

    Columns 15..24 (by position) of ``chunks`` are clustered with KMeans; the
    resulting centers are clustered again with MeanShift, and the MeanShift
    model labels every chunk. The labelled chunks and the MeanShift centers
    are written as CSV files below ``directory``.

    Args:
        chunks: DataFrame of chunks; passed through ``calc_proportions`` first.
        directory: Path prefix concatenated with the file names.
        chunks_file_name: Output file name for the labelled chunks.
        chunks_centers_file_name: Output file name for the cluster centers.
        n_clus: Number of KMeans clusters.
        bw: MeanShift bandwidth.
    """
    # calculate the proportion of events
    chunks = calc_proportions(chunks)

    # ``.iloc`` replaces the ``.ix`` indexer, which pandas no longer provides;
    # the slice is positional.
    features = chunks.iloc[:, 15:25]

    # Single-argument print calls emit the same text on Python 2 and 3.
    print('Clustering first model...')
    # NOTE(review): recent scikit-learn releases no longer accept ``n_jobs``
    # for KMeans -- confirm the targeted version before removing it.
    first_model = KMeans(n_clusters=n_clus, n_jobs=8)
    first_model.fit(features)
    centers = first_model.cluster_centers_

    print('Clustering second model...')
    second_model = MeanShift(bandwidth=bw)
    second_model.fit(centers)
    print("Final number of clusters of chunks with MeanShift: " +
          str(len(second_model.cluster_centers_)))

    chunks['label'] = second_model.predict(features)
    centers = DataFrame(second_model.cluster_centers_,
                        columns=TIME_SERIES_NAMES)
    centers.to_csv(directory + chunks_centers_file_name, index=False)
    chunks.to_csv(directory + chunks_file_name, index=False)
def test_meanshift_predict():
    """Check that ``predict`` on the training data matches ``fit_predict``."""
    estimator = MeanShift(bandwidth=1.2)
    fitted_labels = estimator.fit_predict(X)
    predicted_labels = estimator.predict(X)
    assert_array_equal(fitted_labels, predicted_labels)
# Draw every sample the classifier has grouped, using one colour per
# classification key (``colors`` is indexed by that key).
for classification in clf.classifications:
    color = colors[classification]
    for featureset in clf.classifications[classification]:
        plt.scatter(featureset[0], featureset[1], marker='x', color=color, s=150, linewidths=5)
# Classify a few hand-picked points and draw them as stars in the colour of
# the classification returned by ``clf.predict``.
unknowns = np.array([[1,3], [8,9], [0,3], [5,4], [6,4]])
for unknown in unknowns:
    classification = clf.predict(unknown)
    plt.scatter(unknown[0], unknown[1], marker='*', color=colors[classification])
plt.show()
# KMeans seeded with the given centres; labels are shifted from 0..2 to 1..3
# before being compared with seed_res.
kmeans2 = KMeans(n_clusters=3, init=km_clcentr[:])
kmeans2.fit(seed_data)
for i in range(datacount):
    kmeans2.labels_[i] += 1
# Single-argument print calls emit the same text on Python 2 and 3.
print(kmeans2.labels_[:])

# meanshift clustering
bw = estimate_bandwidth(seed_data, quantile=0.2)
ms = MeanShift(bandwidth=bw, bin_seeding=True)
ms.fit(seed_data)
pred = ms.predict(seed_data)
# Relabel cluster 0 as 3; the other labels are left unchanged.
for i in range(datacount):
    if pred[i] == 0:
        pred[i] = 3
print(ms.labels_[:])
print("seedresult-Kmeans accuracy: %s" % accuracy_score(seed_res, kmeans2.labels_))
print("seedresult-Meanshift accuracy: %s" % accuracy_score(seed_res, pred))
print("Kmeans-Meanshift accuracy: %s" % accuracy_score(kmeans2.labels_, pred))