print('quantile', 'n_clusters_', 'n_noise', 'silhouette_score',
      'davies_bouldin_score', 'calinski_harabasz_score', sep=',')
r = range(0, 11, 1)
for quantile in r:
    quantile = quantile / 10
    # logger.info('Estimating Bandwidth')
    bandwidth = estimate_bandwidth(df, quantile=quantile, n_jobs=-1)
    logger.info('Bandwidth estimate: %f, quantile: %f' % (bandwidth, quantile))
    if bandwidth > 0.0:
        # logger.info('Clustering')
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, n_jobs=-1).fit(df)
        labels = ms.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        print(n_clusters_)
        if n_clusters_ > 1:
            silhouette_score = metrics.silhouette_score(df, ms.labels_)
            davies_bouldin_score = metrics.davies_bouldin_score(df, ms.labels_)
            calinski_harabasz_score = metrics.calinski_harabasz_score(df, ms.labels_)
            print(quantile, n_clusters_, n_noise_, silhouette_score,
                  davies_bouldin_score, calinski_harabasz_score, sep=',')
    return df


print(". Loading CSV file ({})...".format(config.db_path))
df = read_data(config.db_path)

#################################################################################
# Clustering
X = df[['latitude', 'longitude']].values

print(". Estimating MeanShift's bandwidth...")
bandwidth = estimate_bandwidth(X, quantile=0.0005, n_samples=10000)

print(". Clustering with MeanShift...")
clustering = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                       cluster_all=False, min_bin_freq=10)
clustering.fit(X)
labels = clustering.labels_
df['cluster'] = clustering.labels_
cluster_centers = clustering.cluster_centers_
labels_unique = np.unique(labels[labels != -1])  # exclude the -1 "orphan" label; np.unique needs an array, not a generator
n_clusters_ = len(labels_unique)
cluster_count = df[df['cluster'] != -1].groupby('cluster').size()

# Building a DataFrame describing each cluster
print(". Fetching cluster data with Google Places...")
c_data = list()
for i, cluster in enumerate(labels_unique):
avg_emb_zip = zip(sense_keys_temp, hom_types, avg_embs) avg_emb_df = pd.DataFrame(avg_emb_zip, columns=['sense_key', 'hom_type', 'avg_emb']) len(h_embs) from sklearn.cluster import DBSCAN from sklearn.cluster import MeanShift from sklearn.cluster import AgglomerativeClustering from sklearn.metrics import pairwise_distances from scipy.spatial.distance import cosine bert_vecs = np.array(avg_embs) len(bert_vecs) cluster = MeanShift().fit(bert_vecs) cluster.labels_ clustering = DBSCAN(eps=0.25, min_samples=2, metric='cosine').fit(bert_vecs) clustering.labels_ ag_cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55).fit(bert_vecs) ag_cluster.labels_ # relevant from sklearn.manifold import TSNE from sklearn.manifold import LocallyLinearEmbedding # Will be used for LLE, LTSA, Hessian LLE, and Modified LLE. from sklearn.manifold import MDS from sklearn.manifold import SpectralEmbedding from sklearn.manifold import Isomap
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
import utilities

# Load data from input file
X = utilities.load_data('data_multivar.txt')

# Estimating the bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Compute clustering with MeanShift
meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_estimator.fit(X)
labels = meanshift_estimator.labels_
centroids = meanshift_estimator.cluster_centers_
num_clusters = len(np.unique(labels))
print("Number of clusters in input data =", num_clusters)

###########################################################
# Plot the points and centroids
import matplotlib.pyplot as plt
from itertools import cycle

plt.figure()

# specify marker shapes for different clusters
markers = '.*xv'
def node(): global frontiers,mapData,global1,global2,global3,globalmaps,litraIndx,n_robots,namespace_init_count rospy.init_node('filter', anonymous=False) # fetching all parameters map_topic= rospy.get_param('~map_topic','/map') threshold= rospy.get_param('~costmap_clearing_threshold',70) info_radius= rospy.get_param('~info_radius',1.0) #this can be smaller than the laser scanner range, >> smaller >>less computation time>> too small is not good, info gain won't be accurate goals_topic= rospy.get_param('~goals_topic','/detected_points') n_robots = rospy.get_param('~n_robots',1) namespace = rospy.get_param('~namespace','') namespace_init_count = rospy.get_param('namespace_init_count',1) rateHz = rospy.get_param('~rate',100) litraIndx=len(namespace) rate = rospy.Rate(rateHz) #------------------------------------------- rospy.Subscriber(map_topic, OccupancyGrid, mapCallBack) #--------------------------------------------------------------------------------------------------------------- for i in range(0,n_robots): globalmaps.append(OccupancyGrid()) if len(namespace) > 0: for i in range(0,n_robots): rospy.Subscriber(map_topic, OccupancyGrid, globalMap) elif len(namespace)==0: rospy.Subscriber(map_topic, OccupancyGrid, globalMap) #wait if map is not received yet while (len(mapData.data)<1): pass #wait if any of robots' global costmap map is not received yet for i in range(0,n_robots): while (len(globalmaps[i].data)<1): pass global_frame="/"+mapData.header.frame_id tfLisn=tf.TransformListener() if len(namespace) > 0: for i in range(0,n_robots): tfLisn.waitForTransform(global_frame[1:], '/odom', rospy.Time(0),rospy.Duration(10.0)) elif len(namespace)==0: tfLisn.waitForTransform(global_frame[1:], '/odom', rospy.Time(0),rospy.Duration(10.0)) rospy.Subscriber(goals_topic, PointStamped, callback=callBack,callback_args=[tfLisn,global_frame[1:]]) pub = rospy.Publisher('frontiers', Marker, queue_size=10) pub2 = rospy.Publisher('centroids', Marker, queue_size=10) filterpub = rospy.Publisher('filtered_points', PointArray, queue_size=10) rospy.loginfo("the map and global costmaps are received") # wait if no frontier is received yet while len(frontiers)<1: pass points=Marker() points_clust=Marker() #Set the frame ID and timestamp. See the TF tutorials for information on these. points.header.frame_id= mapData.header.frame_id points.header.stamp= rospy.Time.now() points.ns= "markers2" points.id = 0 points.type = Marker.POINTS #Set the marker action for latched frontiers. Options are ADD, DELETE, and new in ROS Indigo: 3 (DELETEALL) points.action = Marker.ADD; points.pose.orientation.w = 1.0 points.scale.x=0.2 points.scale.y=0.2 points.color.r = 255.0/255.0 points.color.g = 255.0/255.0 points.color.b = 0.0/255.0 points.color.a=1; points.lifetime = rospy.Duration(); p=Point() p.z = 0; pp=[] pl=[] points_clust.header.frame_id= mapData.header.frame_id points_clust.header.stamp= rospy.Time.now() points_clust.ns= "markers3" points_clust.id = 4 points_clust.type = Marker.POINTS #Set the marker action for centroids. 
Options are ADD, DELETE, and new in ROS Indigo: 3 (DELETEALL) points_clust.action = Marker.ADD; points_clust.pose.orientation.w = 1.0; points_clust.scale.x=0.2; points_clust.scale.y=0.2; points_clust.color.r = 0.0/255.0 points_clust.color.g = 255.0/255.0 points_clust.color.b = 0.0/255.0 points_clust.color.a=1; points_clust.lifetime = rospy.Duration(); temppoint=PointStamped() temppoint.header.frame_id= mapData.header.frame_id temppoint.header.stamp=rospy.Time(0) temppoint.point.z=0.0 arraypoints=PointArray() tempPoint=Point() tempPoint.z=0.0 #------------------------------------------------------------------------- #--------------------- Main Loop ------------------------------- #------------------------------------------------------------------------- while not rospy.is_shutdown(): #------------------------------------------------------------------------- #Clustering frontier points centroids=[] front=copy(frontiers) if len(front)>1: ms = MeanShift(bandwidth=0.3) ms.fit(front) centroids= ms.cluster_centers_ #centroids array is the centers of each cluster #if there is only one frontier no need for clustering, i.e. centroids=frontiers if len(front)==1: centroids=front frontiers=copy(centroids) #------------------------------------------------------------------------- #clearing old frontiers z=0 while z<len(centroids): cond=False temppoint.point.x=centroids[z][0] temppoint.point.y=centroids[z][1] for i in range(0,n_robots): transformedPoint=tfLisn.transformPoint(globalmaps[i].header.frame_id,temppoint) x=array([transformedPoint.point.x,transformedPoint.point.y]) cond=(gridValue(globalmaps[i],x)>threshold) or cond if (cond or (informationGain(mapData,[centroids[z][0],centroids[z][1]],info_radius*0.5))<0.2): centroids=delete(centroids, (z), axis=0) z=z-1 z+=1 #------------------------------------------------------------------------- #publishing arraypoints.points=[] for i in centroids: tempPoint.x=i[0] tempPoint.y=i[1] arraypoints.points.append(copy(tempPoint)) filterpub.publish(arraypoints) pp=[] for q in range(0,len(frontiers)): p.x=frontiers[q][0] p.y=frontiers[q][1] pp.append(copy(p)) points.points=pp pp=[] for q in range(0,len(centroids)): p.x=centroids[q][0] p.y=centroids[q][1] pp.append(copy(p)) points_clust.points=pp pub.publish(points) pub2.publish(points_clust) rate.sleep()
Std_RGB = np.array([38.55379149, 35.64913446, 39.07419321])
Data_Norm = (Img_Data - Mean_RGB) / Std_RGB
Data_NFlat = np.reshape(Data_Norm, (Size_Data[0], 32 * 32 * 3))

# Shuffle the data
per = np.random.permutation(Data_Norm.shape[0])
Shuf_Data_Norm = Data_NFlat[per, :]
Shuf_Label_Data = Label_Data[per]

for i in range(F_n):
    DataN_te = Shuf_Data_Norm[Fold_size * i:Fold_size * (i + 1), :]
    DataN_tr_1 = Shuf_Data_Norm[0:(Fold_size * i), :]
    DataN_tr_2 = Shuf_Data_Norm[Fold_size * (i + 1):, :]
    DataN_tr = np.concatenate((DataN_tr_1, DataN_tr_2))
    DataN_te_y = Shuf_Label_Data[Fold_size * i:Fold_size * (i + 1)]
    DataN_tr_y_1 = Shuf_Label_Data[0:(Fold_size * i)]
    DataN_tr_y_2 = Shuf_Label_Data[Fold_size * (i + 1):]
    DataN_tr_y = np.concatenate((DataN_tr_y_1, DataN_tr_y_2))
    model_name_rbf = 'Model_' + str(i + 1) + '_rbf.model'
    model_name_linear = 'Model_' + str(i + 1) + '_linear.model'

    # Mean shift clustering
    clustering_Mf = MeanShift(n_jobs=-1)
    clustering_Mf.fit(DataN_tr)
    Cluster_predict = clustering_Mf.predict(DataN_te)
    print(Cluster_predict[0:40])
    '''
    score_rbf = clf_rbf.score(DataN_te, DataN_te_y)
    print("The score of rbf is : %f" % score_rbf)
    joblib.dump(clf_rbf, model_name_rbf)
    '''
# for production
taxi_df = pd.read_sql_query('SELECT * FROM tripdata', engine)
taxi_data = taxi_df[['pickup_datetime', 'pickup_longitude', 'pickup_latitude']].copy()
taxi_data.columns = ['datetime', 'lng', 'lat']
taxi_df = None
data_set = taxi_data
data_set['dayofweek'] = data_set.datetime.dt.weekday
data_set['hourofday'] = data_set.datetime.dt.hour
data_set['weekdays'] = (data_set.dayofweek < 5) * 1

from sklearn.cluster import MeanShift

ms = MeanShift(bandwidth=0.003, cluster_all=False, min_bin_freq=5)
all_clusters = None

# Loop over weekdays/weekends and each 2-hour time window:
# train MeanShift on the data, read the cluster_centers_, and pack each center
# with 'items' (the number of points belonging to that center), which will be
# used as a weight in the display (see the sketch after this snippet).
for d in [0, 1]:
    for h in [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13],
              [14, 15], [16, 17], [18, 19], [20, 21], [22, 23]]:
        print('Clusters for weekdays=', d, '; hour=', h)
        # train only on lng and lat
        X = data_set[(data_set.weekdays == d) & (data_set.hourofday.isin(h))][['lng', 'lat']]
        ms.fit(X)
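# ---------------------------------------------------------------------------
# Sketch of the packing step described in the comment above (an assumption, not
# part of the original script): after each ms.fit(X), read cluster_centers_ and
# labels_ and attach an 'items' count per center. `pack_centers` is a
# hypothetical helper name introduced only for illustration.
import numpy as np
import pandas as pd

def pack_centers(ms, weekdays, hours):
    """Collect cluster centers with an 'items' count used later as a display weight."""
    labels = ms.labels_
    # cluster_all=False marks orphan points with label -1; exclude them from the counts
    counts = pd.Series(labels[labels != -1]).value_counts()
    rows = []
    for k, (lng, lat) in enumerate(ms.cluster_centers_):
        rows.append({'weekdays': weekdays, 'hours': tuple(hours),
                     'lng': lng, 'lat': lat, 'items': int(counts.get(k, 0))})
    return pd.DataFrame(rows)

# Inside the loop, after ms.fit(X), the centers could be accumulated like this:
#     packed = pack_centers(ms, d, h)
#     all_clusters = packed if all_clusters is None else pd.concat(
#         [all_clusters, packed], ignore_index=True)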
def color_analysis(preffixName, suffixName, textNum, textLines):
    textLinesLeft = []
    backColors = []
    maxSize = 0
    index_maxSize = 0
    for i in range(textNum + 1):
        textImg = Image.open(preffixName + 'text' + str(i) + suffixName)
        textImg = textImg.convert('RGB')
        textImg = np.asarray(textImg)
        if (textImg.shape[0] * textImg.shape[1]) > maxSize:
            maxSize = textImg.shape[0] * textImg.shape[1]
            index_maxSize = i
        saliencyImg = Image.open(preffixName + 'saliency' + str(i) + suffixName)
        saliencyImg = np.asarray(saliencyImg)
        textBack_color = textImg[saliencyImg < 10]
        backColors.append(np.mean(textBack_color, axis=0))
        # Alternative: cluster the background pixels instead of averaging them
        # backColor = getMainColor(textBack_color, 5)
        # backColors.append(backColor)
    if len(backColors) > 0:
        backColors = np.array(backColors)
        # bandwidth is the separation threshold; this value can be tuned
        ms = MeanShift(bandwidth=50, bin_seeding=True)
        ms.fit(backColors)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
        binCount = np.bincount(labels).tolist()
        Label_index = []
        for i in range(len(binCount)):
            eachLabel_index = []
            for j in range(len(labels)):
                if labels[j] == i:
                    eachLabel_index.append(j)
            Label_index.append(eachLabel_index)
        maxLabel = labels[index_maxSize]
        results = []
        for i in range(len(Label_index)):
            if i != maxLabel:
                texts_eachlabel = Label_index[i]
                if len(texts_eachlabel) > 1:
                    yAxis = []
                    for index in texts_eachlabel:
                        yAxis.append(textLines[index][1])
                    yAxis_copy = list(yAxis)
                    yAxis_copy.sort(reverse=False)
                    results.append(textLines[texts_eachlabel[yAxis.index(yAxis_copy[0])]])
                    for i in range(1, len(yAxis_copy)):
                        index = texts_eachlabel[yAxis.index(yAxis_copy[i])]
                        eachline = results[-1]
                        eachline_next = textLines[index]
                        if overlap_degree(eachline, eachline_next) > -1:
                            # merge the overlapping lines
                            results[-1][0] = min(eachline[0], eachline_next[0])
                            results[-1][1] = min(eachline[1], eachline_next[1])
                            results[-1][2] = max(eachline[2], eachline_next[2])
                            results[-1][3] = max(eachline[3], eachline_next[3])
                        else:
                            results.append(eachline_next)
                else:
                    results.append(textLines[texts_eachlabel[0]])
            else:
                texts_eachlabel = Label_index[i]
                for index in texts_eachlabel:
                    textLinesLeft.append(textLines[index])
        return results, textLinesLeft
    else:
        return [], textLines
A = pd.read_excel(r'D:\Program Files\JetBrains\PyCharmFile\Cluster\Data\e2.xlsx',
                  sheet_name="Sheet1", header=0)
tabtop = A.columns.values.tolist()  # read the header row
B = np.array(A)
temp = ['2016年', '2017年', '2018年']
x = 1
y = 13
for j in range(3):  # cluster one industry at a time (three passes)
    C = B[0:109, x:y]
    D = DataFrame(C)
    E = D.loc[~(D == 0).all(axis=1), :]  # drop rows that are all zeros
    bw = estimate_bandwidth(E, quantile=0.3)  # set the bandwidth
    model = MeanShift(bandwidth=bw, bin_seeding=True)  # configure the clustering model
    model.fit(E)  # run the clustering
    r1 = (Series(model.labels_)).value_counts()  # count the samples in each cluster
    r2 = DataFrame(model.cluster_centers_)  # extract the cluster centers
    r = pd.concat([r2, r1], axis=1)  # concatenate horizontally
    r.columns = tabtop[x:y] + [u'类别数目']  # last column: "cluster size"
    name1 = temp[j] + '.xlsx'
    print(name1)
    file1 = r.to_excel(name1)
    g = pd.concat([E, Series(model.labels_, index=E.index)], axis=1)
    g.columns = tabtop[x:y] + [u'聚类类别']  # last column: "cluster label"
    name2 = 'A' + temp[j] + '.xlsx'
    print(name2)
    file2 = g.to_excel(name2)
    x = y
    y = y + 12
# import libraries
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs  # samples_generator was removed from scikit-learn

# create data
centers = [[3, 3, 3], [4, 5, 5], [3, 10, 10]]
X, _ = make_blobs(n_samples=700, centers=centers, cluster_std=0.5)

# create model
MSh = MeanShift()

# train model
MSh.fit(X)
labels = MSh.labels_
cluster_centers = MSh.cluster_centers_
print(cluster_centers)
#load libraries from sklearn import datasets from sklearn.preprocessing import StandardScaler from sklearn.cluster import MeanShift #load data iris = datasets.load_iris() features = iris.data #standardize features scaler = StandardScaler() features_std = scaler.fit_transform(features) #create meanshift object cluster = MeanShift(n_jobs=-1) #train model model = cluster.fit(features_std)
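# Follow-up sketch for the recipe above: `model` is the fitted MeanShift object,
# and the printed values depend on the standardized iris data.
print(model.labels_[:10])             # cluster assignment of the first ten observations
print(model.cluster_centers_)         # one center per discovered cluster (standardized units)
print('clusters found:', len(model.cluster_centers_))

# Assign a new, already-standardized observation to its nearest cluster
new_observation = [[0.8, 0.8, 0.8, 0.8]]
print(model.predict(new_observation))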
"Confidence in national government" ]] subdf.fillna(0, inplace=True) scaler = preprocessing.StandardScaler() scaled_df = scaler.fit_transform(subdf) reducer_p = PCA(n_components=2) pca_df = reducer_p.fit_transform(scaled_df) # ===================================================================== # 5. Clustering # ===================================================================== learner = MeanShift(bandwidth=None) ms = learner.fit_predict(pca_df) learner = MiniBatchKMeans(n_clusters=3) mbkm = learner.fit_predict(pca_df) learner = SpectralClustering(n_clusters=3) sc = learner.fit_predict(pca_df) learner = AgglomerativeClustering(n_clusters=3) ac = learner.fit_predict(pca_df) # ===================================================================== # 5. Cluster graphs # =====================================================================
print("n_samples: %d, n_features: %d" % matrix.shape)
print()

# Dimensionality reduction
print("Performing dimensionality reduction using LSA")
t0 = time()
svd = TruncatedSVD(2)  # number of output dimensions
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
matrix_l = lsa.fit_transform(matrix)

# #############################################################################
# Do the actual clustering
bandwidth = estimate_bandwidth(matrix_l, quantile=0.2, n_samples=500)
t0 = time()
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(matrix_l)
print("done in %0.3fs" % (time() - t0))
print()

labels_Pred = ms.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_ture, labels_Pred))
print("Completeness: %0.3f" % metrics.completeness_score(labels_ture, labels_Pred))
print("NMI: %0.3f" % metrics.normalized_mutual_info_score(labels_ture, labels_Pred,
                                                          average_method='arithmetic'))
    elif i == 2:
        y_corrected.append(1)
    elif i == 3:
        y_corrected.append(2)
    elif i == 4:
        y_corrected.append(5)
    else:
        y_corrected.append(3)

# establish a KNN model
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(x, y)
KNN.predict([[5, 0]])
y_predict2 = KNN.predict(x)
print("KNN accuracy", accuracy_score(y, y_predict2))
print(pd.value_counts(y_predict2))

# establish a meanshift model
bw = estimate_bandwidth(x, n_samples=120)
print(bw)
ms = MeanShift(bandwidth=bw)
ms.fit(x)
y_predict_ms = ms.predict(x)
print("meanshift cluster counts", pd.value_counts(y_predict_ms))

fig5 = plt.figure()
label1 = plt.scatter(x['x'][y_predict_ms == 1], x['y'][y_predict_ms == 1])
label2 = plt.scatter(x['x'][y_predict_ms == 0], x['y'][y_predict_ms == 0])
plt.legend((label1, label2), ('label1', 'label2'))
plt.show()
import numpy
import pickle as cPickle  # Python 3 replacement for the old cPickle module
import scipy.misc
import numpy as np
import pandas as pd
import json
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs  # samples_generator was removed from scikit-learn
from itertools import cycle
import data

print("Generating Clusters")
df = pd.read_csv('taxi_data/train.csv',
                 converters={'POLYLINE': lambda x: json.loads(x)[-1:]})
dests = []
for p in df['POLYLINE']:
    if len(p) > 0:
        dests.append([p[0][1], p[0][0]])
pts = numpy.array(dests)

bw = 0.001
means = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
means.fit(pts)
cluster_centers = means.cluster_centers_
print("Clusters shape: ", cluster_centers.shape)
print(cluster_centers)
centerNorms[nanIdx] = -1 centers = centers[(centerNorms < 100) & (centerNorms > 0)] fig = plt.figure() ax = Axes3D(fig) ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='red') ax.axis('equal') plt.show() res = 200 bandwidthList = list(np.linspace(0.01, 0.045, res)) nCenters = np.zeros((res)) i = 0 for bandwidth in bandwidthList: meanShiftClusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=5, cluster_all=False) wasError = False try: meanShiftClusterer.fit(centers) except ValueError: print('No point for bandwidth {}'.format(bandwidth)) wasError = True if not wasError: clusterCenters = meanShiftClusterer.cluster_centers_ nCenters[i] = clusterCenters.shape[0] else: nCenters[i] = 0 i += 1 plt.plot(np.array(bandwidthList), nCenters)
final = []
for i in UserId:
    if i in list(Q1['userId']) and i in list(Q2['userId']) and i in list(FinalExam['userId']):
        a = float(Q1[Q1['userId'] == i].get('score'))
        q1.append(a)
        q2.append(float(Q2[Q2['userId'] == i].get('score')))
        final.append(float(FinalExam[FinalExam['userId'] == i].get('score')))

d = {'userId': "", 'score_Q1': "", 'scoreQ2': "", 'score_final': ""}
x = np.array([q1, q2, final]).T  # one row per student: (q1, q2, final); a plain reshape would scramble the pairing
kmeans = km(n_clusters=3, random_state=0).fit(x)
clusteringMS = MeanShift(bandwidth=2).fit(x)
clusteringDB = DBSCAN(eps=3, min_samples=2).fit(x)
clusteringDB.labels_
# kmeans.labels_ holds the assigned cluster for each row
# K-means on quizAndExam performed very poorly, so it is not used.
quizAndExam = pd.DataFrame(data=x, columns=["q1Score", "q2Score", "finalScore"])  # a list keeps the column order deterministic (the original used a set)
quizAndExam['class'] = clusteringDB.labels_
quizAndExam['label'] = kmeans.labels_
# d
quizAndExam.to_excel('output.xlsx', engine='xlsxwriter')
"""
The students fall into three grade levels: 20 TB, 36 B, and 47 F.
Our university's students really aren't that strong.
"""
X = X.to_numpy() return X def Dataset(): np.random.seed(10) X, _ = datasets.make_blobs(n_samples=1500, n_features=2, centers=3, cluster_std=2.1) # dumpfile(X,'fileSet') # data = loadfile('fileSet') # print(data) return X np.set_printoptions(threshold=sys.maxsize) ## clustering_algorithms MeanShift = MeanShift(bandwidth=2) KMeans = KMeans(9, verbose=1000) SpectralClustering = SpectralClustering(n_clusters=9, assign_labels="discretize", random_state=0) DBSCAN = DBSCAN(eps=3, min_samples=2) OPTICS = OPTICS(min_samples=2) AffinityPropagation = AffinityPropagation() # clustering_algorithms = [['MeanShift', MeanShift], ['KMeans', KMeans], # ['SpectralClustering', SpectralClustering], ['DBSCAN', DBSCAN], ['OPTICS', OPTICS],['AffinityPropagation',AffinityPropagation]] file = [['1k','0-0-1000'],['10k','0-0-10000'],['100k','0-0-100k'],['1m','0-0-1m']] clustering_algorithms = [['KMeans', KMeans]] # file = [['1k','0-0-1000']] for n, f in file: X = readdata(f) for c, model in clustering_algorithms: print(c, "Start.......!") z = model.fit_predict(X)
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs  # samples_generator was removed from scikit-learn

centers = [[1, 1], [-1, -1], [1, -1]]
X, _ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)

bandwidth = estimate_bandwidth(X, n_samples=500)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)

# Plot result
import matplotlib.pyplot as plt

for k in range(n_clusters_):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.scatter(X[my_members, 0], X[my_members, 1])
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markeredgecolor='b', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
def func_meanshift(): print('\nCLUSTERING: MEAN SHIFT\n') #Creating the model using photometric data print('Fitting the model...') print( 'If you want to choose the bandwidth, please write it. Else, write 0') bandwidth = input() if bandwidth == '0': bandwidth = estimate_bandwidth(dataused) else: bandwidth = float(bandwidth) print('Using a bandwidth of', bandwidth) ms = MeanShift(bandwidth=bandwidth) ms.fit(dataused) print('Mean Shift problem solved') # Model parameters centers = ms.cluster_centers_ print('POSITIONS OF THE CENTERS:\n', centers) # Adjusting the data to the model print('Obtaining the labels...') labels = ms.labels_ labeledlabels = ms.predict(labeleddataused) # Saving data in ASCII format plotfile = (root + '/MeanShift/' + n + root_folder + root_file + '_MeanShift') print('Saving data with labels in ASCII format...') np.savetxt(plotfile + '.txt', np.c_[data['id_2MASS'], data['id_AllWISE'], dataset, labels], header=dataheading, delimiter='\t', fmt='%s') np.savetxt(plotfile + '_labeled.txt', np.c_[labeleddataset, labeledlabels, labeleddata['z'], labeleddata['class'], labeleddata['subClass']], header=dataheading[20:] + '\tz\tclass\tsubClass', delimiter='\t', fmt='%s') print('Data file saved successfully, check your MeanShift folder') # Saving data in FITS format print('Saving data with labels in FITS format...') bashorder = ('sh stilts tcopy ifmt=ascii ofmt=fits in=' + plotfile + '.txt out=' + plotfile + '.fits') subprocess.run(bashorder, shell=True) bashorder = ('sh stilts tcopy ifmt=ascii ofmt=fits in=' + plotfile + '_labeled.txt out=' + plotfile + '_labeled.fits') subprocess.run(bashorder, shell=True) print('Data file saved successfully, check your KMeans folder') # Saving centers information plotfile = root + '/MeanShift/' + n + root_folder + 'Centers' print('Saving centers position...') np.savetxt(plotfile + '.txt', centers, header=dataheading[20:-6], delimiter='\t', fmt='%s') print('Centers positions saved successfully, check your MeanShift folder') print('\nMEAN SHIFT TECHNIQUE APPLIED\n')
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs  # samples_generator was removed from scikit-learn
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style

style.use('ggplot')

centers = [[1, 1, 1], [5, 5, 5], [3, 10, 10]]
X, _ = make_blobs(n_samples=100, centers=centers, cluster_std=1)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print(cluster_centers)

n_clusters_ = len(np.unique(labels))
print('Number of estimated clusters: ', n_clusters_)

colors = 10 * ['g', 'r', 'c', 'b', 'k', 'y', 'm']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2],
           marker='x', color='k',
# mean shift clustering from numpy import unique from numpy import where from sklearn.datasets import make_classification from sklearn.cluster import MeanShift from matplotlib import pyplot # define dataset X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_clusters_per_class=1, random_state=4) # define the model model = MeanShift() # fit model and predict clusters yhat = model.fit_predict(X) # retrieve unique clusters clusters = unique(yhat) # create scatter plot for samples from each cluster for cluster in clusters: # get row indexes for samples with this cluster row_ix = where(yhat == cluster) # create scatter of these samples pyplot.scatter(X[row_ix, 0], X[row_ix, 1]) # show the plot pyplot.show() # optics clustering from numpy import unique
def window_analysis(Windows, ref_labels, labels1, Chr=1, ncomp=4, amova=True, supervised=True, include_who=[], range_sample=[130, 600], rand_sample=0, clsize=15, cl_freqs=5, Bandwidth_split=20): kde_class_labels = labels1 kde_label_dict = { z: [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z] for z in list(set(kde_class_labels)) } if include_who: include = [ x for x in range(len(kde_class_labels)) if kde_class_labels[x] in include_who ] ref_labels = include_who kde_class_labels = [kde_class_labels[x] for x in include] kde_label_dict = { z: [ x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z ] for z in include_who } if rand_sample: sample = rand_sample sample_range = [0, sample] Freq_extract = { Chr: { bl: Windows[Chr][bl] for bl in np.random.choice( list(Windows[Chr].keys()), sample, replace=True) } } if range_sample: sample_range = range_sample Freq_extract = { Chr: { bl: Windows[Chr][bl] for bl in list(sorted(Windows[Chr].keys())) [sample_range[0]:sample_range[1]] } } Results = {'header': ['Chr', 'window'], 'info': []} Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []} Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []} PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []} pc_density = [] pc_coords = [] sim_fst = [] for c in Freq_extract[Chr].keys(): Sequences = Windows[Chr][c] if Sequences.shape[1] <= 3: Results[Chr][c] = [0, 0] print('hi') continue Sequences = np.nan_to_num(Sequences) pca = PCA(n_components=ncomp, whiten=False, svd_solver='randomized').fit(Sequences) data = pca.transform(Sequences) if include_who: data = data[include, :] ##### PC density PC = 0 pc_places = data[:, PC] X_plot = np.linspace(-8, 8, 100) kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit( np.array(pc_places).reshape(-1, 1)) log_dens = kde.score_samples(X_plot.reshape(-1, 1)) pc_density.append(np.exp(log_dens)) pc_coords.append(pc_places) PC_var['coords'].append([Chr, c]) PC_var['info'].append([x for x in pca.explained_variance_]) ### params = { 'bandwidth': np.linspace(np.min(data), np.max(data), Bandwidth_split) } grid = GridSearchCV(KernelDensity(algorithm="ball_tree", breadth_first=False), params, verbose=0) ###################################### ####### TEST global Likelihood ####### ###################################### Focus_labels = list(range(data.shape[0])) #### Mean Shift approach ## from sklearn.cluster import MeanShift, estimate_bandwidth bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=len(Focus_labels)) if bandwidth <= 1e-3: bandwidth = 0.1 ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=clsize) ms.fit(data[Focus_labels, :]) labels = ms.labels_ Tree = { x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x] for x in [g for g in list(set(labels)) if g != -1] } Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize] Tree = {x: Tree[x] for x in Keep} Ngps = len(Tree) SpaceX = {x: data[Tree[x], :] for x in Tree.keys()} these_freqs = [] ### Extract MScluster likelihood by sample for hill in SpaceX.keys(): if len(Tree[hill]) >= cl_freqs: if supervised == False: print('hi') cl_seqs = Sequences[Tree[hill], :] freq_vector = [ float(x) / (cl_seqs.shape[0] * 2) for x in np.sum(cl_seqs, axis=0) ] Frequencies['coords'].append([Chr, c, hill]) Frequencies['info'].append(freq_vector) these_freqs.append(freq_vector) grid.fit(data[Tree[hill], :]) # use the best estimator to compute the kernel density estimate kde = grid.best_estimator_ P_dist = 
kde.score_samples(data[Tree[hill], :]) Dist = kde.score_samples(data) P_dist = np.nan_to_num(P_dist) Dist = np.nan_to_num(Dist) if np.std(P_dist) == 0: Dist = np.array( [int(Dist[x] in P_dist) for x in range(len(Dist))]) else: Dist = scipy.stats.norm(np.mean(P_dist), np.std(P_dist)).cdf(Dist) Dist = np.nan_to_num(Dist) Construct['coords'].append([Chr, c, hill]) Construct['info'].append(Dist) ######################################### ############# AMOVA ################ ######################################### if supervised: labels = [x for x in kde_class_labels if x in ref_labels] Who = [ z for z in it.chain(*[kde_label_dict[x] for x in ref_labels]) ] Ngps = len(ref_labels) print(ref_labels) for hill in ref_labels: if len(kde_label_dict[hill]) >= cl_freqs: if include_who: Seq_specific = Sequences[include, :] cl_seqs = Seq_specific[kde_label_dict[hill], :] freq_vector = [ float(x) / (cl_seqs.shape[0] * 2) for x in np.sum(cl_seqs, axis=0) ] Frequencies['coords'].append([Chr, c, hill]) Frequencies['info'].append(freq_vector) these_freqs.append(freq_vector) else: Who = [ x for x in range(len(labels)) if labels[x] != -1 and labels[x] in Keep ] labels = [labels[x] for x in Who] Who = [Focus_labels[x] for x in Who] # Pairwise = return_fsts2(np.array(these_freqs)) sim_fst.extend(Pairwise.fst) if len(list(set(labels))) == 1: Results['coords'].append([Chr, c]) Results['info'].append([AMOVA, Ngps]) continue if amova: clear_output() AMOVA, Cig = AMOVA_FM42(data[Who, :], labels, n_boot=0, metric='euclidean') print('counting: {}, Ngps: {}'.format(AMOVA, Ngps)) Results['info'].append([Chr, c, AMOVA, Ngps]) Results['info'] = pd.DataFrame( np.array(Results['info']), columns=['chrom', 'window', 'AMOVA', 'Ngps']) X_plot = np.linspace(0, .3, 100) freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit( np.array(sim_fst).reshape(-1, 1)) log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1)) fig_roost_dens = [ go.Scatter(x=X_plot, y=np.exp(log_dens), mode='lines', fill='tozeroy', name='', line=dict(color='blue', width=2)) ] ## layout = go.Layout(title='allele frequency distribution across clusters', yaxis=dict(title='density'), xaxis=dict(title='fst')) fig = go.Figure(data=fig_roost_dens, layout=layout) return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig
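# ---------------------------------------------------------------------------
# Standalone sketch of the bandwidth-guard pattern used inside the window loop
# above. The 1e-3 threshold and 0.1 fallback simply mirror the values in that
# snippet and are not general-purpose defaults.
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

def mean_shift_with_fallback(data, clsize=15, quantile=0.2):
    """Fit MeanShift, guarding against a degenerate bandwidth estimate."""
    bandwidth = estimate_bandwidth(data, quantile=quantile, n_samples=len(data))
    if bandwidth <= 1e-3:
        bandwidth = 0.1  # fallback value taken from the snippet above
    ms = MeanShift(bandwidth=bandwidth, cluster_all=False, min_bin_freq=clsize)
    ms.fit(data)
    return ms

# Example on random 2-D data
rng = np.random.default_rng(0)
ms = mean_shift_with_fallback(rng.normal(size=(200, 2)))
print(np.unique(ms.labels_))  # -1, if present, marks points left out by cluster_all=False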
import codecs import json import numpy as np from sklearn.cluster import MeanShift fileObj = codecs.open('../data/connections.json', "r", "utf_8_sig") data = json.loads(fileObj.read()) fileObj.close() X = [] for coord in data: X.append([coord['latitude'], coord['longitude']]) ap = MeanShift() ap.fit(X) labels = ap.labels_ cluster_centers_ = ap.cluster_centers_ sample_count = len(X) coords_labeled = [] for i in range(0, sample_count): latitude = X[i][0] longitude = X[i][1] label = labels[i] coords_labeled.append({ 'point': [latitude, longitude], 'label': str(label), 'recall_count': data[i]['connection_count'] })
def tracker(path): #initialization for default value if path=='0': path=0; cap = cv2.VideoCapture(path) ip_method = ip.get_instace(ip.IPMethod.TOMASI); #FLANN Properties MIN_FRAMES_COUNT = 120 SKIP_FRAMES = 60 MIN_MERGE_FRAMES = 5; FLANN_INDEX_KDTREE = 0 index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 10) search_params = dict(checks = 50) flann = cv2.FlannBasedMatcher(index_params, search_params) DO_RESIZE=False new_sz = (180,120) #Initialization of inputs frames =[]; #Frames kp = []; #Key points all_matches = []; #All good matches match_count = []; #match_count labels = []; frame_cnt=0; print "Extracting frames...................." ret, prev_frame = cap.read() kp1,desc1 = ip_method.detectAndCompute(prev_frame); num_matches = np.zeros(kp1.__len__()) #storing frames frames.append(prev_frame); kp.append(kp1) match_count.append(num_matches); while(cap.isOpened()): SKIP_FRAMES=SKIP_FRAMES-1; ret, prev_frame = cap.read() if not ret or SKIP_FRAMES<0: break; while(cap.isOpened()): ret, cur_frame = cap.read() if not ret: break; kp2,desc2 = ip_method.detectAndCompute(cur_frame); matches = flann.knnMatch(desc1,desc2,k= 2) # Ratio test as per Lowe's paper good_matches = []; distances = [] for (m,n) in matches: if m.distance < 0.7*n.distance and m.distance > 4: good_matches.append(m); distances.append(m.distance); # Bashart's Displacement filtering mean = np.mean(distances); std = np.std(distances) good_matches[:] = [match for match in good_matches if abs(match.distance - mean) < 5 * std] kp1 = kp2; desc1 = desc2; num_matches = np.zeros(kp1.__len__()) for match in good_matches: num_matches[match.trainIdx]=match_count[-1][match.queryIdx]+1 all_matches.append(good_matches); #storing frames frames.append(cur_frame); kp.append(kp1) match_count.append(num_matches); if frame_cnt > MIN_FRAMES_COUNT: break; frame_cnt = frame_cnt +1; cap.release() print "Labeling the keypoints................." max_label=0; MIN_POINTS_TO_CLUSTER = 20 MAX_CLUSTERS = 100 #Forward Labeling Pass for rng in xrange(0,MIN_MERGE_FRAMES+1): labels.append([-1]*kp[rng].__len__()); for rng in xrange(MIN_MERGE_FRAMES+1,frame_cnt): motion_feats = []; feat_indices = []; labels.append([-1]*kp[rng].__len__()); for match in all_matches[rng-1]: if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES: if labels[rng-1][match.queryIdx]==-1: src_pt = np.int32(kp[rng-1][match.queryIdx].pt) dst_pt = np.int32(kp[rng][match.trainIdx].pt) motion_feats.append(motion.get_features(src_pt,dst_pt)); feat_indices.append(match.trainIdx) else : labels[rng][match.trainIdx]=labels[rng-1][match.queryIdx] if(motion_feats.__len__()>=MIN_POINTS_TO_CLUSTER): #Clustering mean-shift motion_feats = np.asarray(motion_feats) bandwidth = estimate_bandwidth(motion_feats, quantile=0.1,random_state=200) ms = MeanShift(bandwidth=bandwidth, bin_seeding=True) ms.fit(motion_feats); for idx,lbl in zip(feat_indices,ms.labels_): labels[rng][idx]=lbl+max_label; max_label = max(labels[rng])+1; random_colors = np.random.randint(256, size=(MAX_CLUSTERS, 3)) print "Writing the video................." 
fourcc = cv2.cv.CV_FOURCC(*'XVID') w = prev_frame.shape[0]; h = prev_frame.shape[1] if DO_RESIZE: vidout = cv2.VideoWriter('out.avi',fourcc,20,new_sz) else: vidout = cv2.VideoWriter('out.avi',fourcc,20,(h,w)) for frame_idx in xrange(MIN_MERGE_FRAMES*2,frame_cnt): cur_frame = frames[frame_idx]; for rng in xrange(frame_idx-MIN_MERGE_FRAMES,frame_idx): for match in all_matches[rng-1]: if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES \ and not (labels[rng-1][match.queryIdx]==-1 or labels[rng-1][match.queryIdx]>=MAX_CLUSTERS): #print "i m not here" src_pt = np.int32(kp[rng-1][match.queryIdx].pt) dst_pt = np.int32(kp[rng][match.trainIdx].pt) color = tuple(random_colors[labels[rng-1][match.queryIdx]]) cv2.line(cur_frame,tuple(src_pt),tuple(dst_pt),color,2); if DO_RESIZE: cur_frame=cv2.resize(cur_frame,new_sz); vidout.write(cur_frame); vidout.release() cv2.destroyAllWindows()
def create_dataset(self): self.data_list = [] abstracts = [] titles = [] #print("VOCAB: ", self.vocab) if self.vocab == []: count_vectorizer = CountVectorizer(stop_words='english') else: count_vectorizer = CountVectorizer(vocabulary=self.vocab) tfid_transformer = TfidfTransformer() abstract_data = None title_data = None graph_data = None for paper in self.papers: abstracts.append(paper.paper.abstract) titles.append(paper.paper.title) if self.using_abstracts == True: #print("using abstract data") abstract_count = count_vectorizer.fit_transform(abstracts) abstract_tfid = tfid_transformer.fit_transform(abstract_count) abstract_data = abstract_tfid #print(abstract_data) if self.using_titles == True: #print("using title data") title_count = count_vectorizer.fit_transform(titles) abstract_tfid = tfid_transformer.fit_transform(title_count) title_data = abstract_tfid #print(title_data.toarray(), len(self.papers)) if self.using_graph_data == True: #print("using graph data") graph_data = scipy.sparse.csr_matrix(np.matrix(self.graph_dataset)) #print(graph_data) self.data = [] for paper in self.papers: self.data.append([1]) self.data = scipy.sparse.csr_matrix(self.data) #print("MAT ", self.data) if abstract_data != None: self.data = scipy.sparse.hstack([self.data, abstract_data]) if title_data != None: self.data = scipy.sparse.hstack([self.data, title_data]) if graph_data != None: self.data = scipy.sparse.hstack([self.data, graph_data]) # Reduce data to two dimensions #print("nd data: ", self.data) self.nd_data = self.data.toarray() #print("SHAPE ", self.data.shape[1]) if self.data.shape[1] > 50: svd_data = TruncatedSVD(n_components=50).fit_transform(self.data) tsne_data = TSNE(n_components=2, metric='cosine').fit_transform(svd_data) elif self.data.shape[1] < 10: #print("PCA") svd_data = PCA(n_components=2).fit_transform(self.data.toarray()) tsne_data = svd_data if len(tsne_data) == 1: tsne_data = [[1, 0]] else: svd_data = self.data tsne_data = TSNE(n_components=2, metric='cosine').fit_transform(svd_data) self.data = scipy.sparse.csr_matrix(tsne_data) self.data = self.data.toarray() # Normalize data to max x and y == 1 #print("before normalization: ", self.data) max_x = 0 max_y = 0 min_x = 0 min_y = 0 for row in self.data: x = row[0] y = row[1] if x > max_x: max_x = x if y > max_y: max_y = y if x < min_x: min_x = x if y < min_y: min_y = y for row in self.data: if (max_x - min_x != 0): row[0] = (row[0] - min_x) / (max_x - min_x) else: row[0] = 0 if (max_y - min_y != 0): row[1] = (row[1] - min_y) / (max_y - min_y) else: row[1] = 0 if (self.func == "msh"): try: band = estimate_bandwidth(self.data) except ValueError: return False band = band * (self.bandwith_factor / 100) if band == 0: return False self.cluster_function = MeanShift(bandwidth=band) return True
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))
    return df


df = handle_non_numerical(df)

df.drop(['boat'], axis=1, inplace=True)  # you can tweak the dataset to see what variables have an impact

X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)
labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df['cluster_group'] = np.nan
for i in range(len(X)):
    original_df.loc[original_df.index[i], 'cluster_group'] = labels[i]  # refs rows in df; .loc avoids chained-assignment issues

n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
# Fit and predict the data birch.fit(scaledData) predictions = birch.predict(scaledData) # Scatterplot between two features to check the clustering plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions) plt.xlabel("Height") plt.ylabel("Shell weight") plt.title("Clustering using Birch clustering algorithm") plt.show() ##################################### Mean Shift Clustering ################################# # Determine optimal bandwidth value bandwidth = estimate_bandwidth(scaledData, quantile=0.2, n_samples=500) # Instantiate the clustering model mnShift = MeanShift(bandwidth=bandwidth) # Fit and predict the data mnShift.fit(scaledData) predictions = mnShift.predict(scaledData) # Scatterplot between two features to check the clustering plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions) plt.xlabel("Height") plt.ylabel("Shell weight") plt.title("Clustering using Mean shift clustering algorithm") plt.show()
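# Optional numeric check of the mean-shift result (a sketch: `scaledData` and
# `predictions` are the arrays from the snippet above, and the silhouette score
# is only defined when more than one cluster was found).
from sklearn.metrics import silhouette_score

if len(set(predictions)) > 1:
    # Silhouette ranges from -1 (poor separation) to 1 (dense, well-separated clusters)
    print("Silhouette score:", silhouette_score(scaledData, predictions))
else:
    print("Only one cluster found; silhouette score is undefined")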
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'], y=versicolor['Sepal width'], color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
# plt.show()

print("Self band: ", estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print("Self MeanShift: ", analyzer.fit(iris_data))
print("Function mean_shift: ", mean_shift(iris_data))

labels, cluster_centers, n_clusters = mean_shift(iris_data)
fig = plt.figure()
ax = fig.add_subplot(111)
colors = cycle('bgrcmy')
for k, col in zip(range(n_clusters), colors):
    my_members = (labels == k)
    cluster_center = cluster_centers[k]
    # .values: this boolean/positional indexing needs a NumPy array, not a DataFrame
    x, y = iris_data.values[my_members, 0], iris_data.values[my_members, 1]
    ax.scatter(x=iris_data.values[my_members, 0], y=iris_data.values[my_members, 1],
import pandas as pd
from sklearn.cluster import MeanShift

if __name__ == '__main__':
    df = pd.read_csv('./Datasets/candy.csv')
    print(df.head())

    X = df.drop('competitorname', axis=1)

    meanshift = MeanShift().fit(X)
    print('Number of clusters: ', max(meanshift.labels_) + 1)
    print('==' * 64)
    print('Centers: ', meanshift.cluster_centers_)

    df['meanshift'] = meanshift.labels_
    print('==' * 64)
    print(df.head())
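# A possible follow-up (a sketch assuming the `df` and `meanshift` objects from
# the snippet above): summarize each cluster by the mean of its numeric features.
print(df.groupby('meanshift').mean(numeric_only=True))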