def meanShift(flat_image):
    # Estimate the kernel bandwidth from the data
    bandwidth = estimate_bandwidth(flat_image, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(flat_image)
    return ms.labels_, ms.cluster_centers_
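A minimal usage sketch for the helper above, assuming an image loaded with OpenCV and the usual sklearn imports; the file name is hypothetical.

import cv2
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

# Load an image (hypothetical path) and flatten it to an (n_pixels, 3) float array
img = cv2.imread('input.png')
flat_image = np.reshape(img, [-1, 3]).astype(np.float64)

labels, centers = meanShift(flat_image)

# Repaint each pixel with its cluster centre and restore the original shape
segmented = centers[labels].reshape(img.shape).astype(np.uint8)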
 def meanshift_for_hough_line(self):
     # init mean shift
     pixels_of_label = {}
     points_of_label = {}
     for hough_line in self.points_of_hough_line:
         pixels = self.pixels_of_hough_line[hough_line]
         pixels = np.array(pixels)
         bandwidth = estimate_bandwidth(pixels, quantile=QUANTILE, n_samples=500)
         if bandwidth == 0:
             bandwidth = 2
         ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
         ms.fit(pixels)
         labels = ms.labels_
         labels_unique = np.unique(labels)
         n_clusters_ = len(labels_unique)
         for k in range(n_clusters_):
             label = list(hough_line)
             label.append(k)
             pixels_of_label[tuple(label)] = map(tuple, pixels[labels==k])
     for label in pixels_of_label:
         pixels = pixels_of_label[label]
         points = map(self.img.get_bgr_value, pixels)
         points_of_label[label] = points
     self.pixels_of_hough_line = pixels_of_label
     self.points_of_hough_line = points_of_label
Example #3
    def _fit_mean_shift(self, x):
        for c in xrange(len(self.crange)):
            quant = 0.015 * (c + 1)
            for r in xrange(self.repeats):
                bandwidth = estimate_bandwidth(
                    x, quantile=quant, random_state=r)
                idx = c * self.repeats + r
                model = MeanShift(
                    bandwidth=bandwidth, bin_seeding=True)
                model.fit(x)
                self._labels[idx] = model.labels_
                self._parameters[idx] = model.cluster_centers_

                # build equivalent gmm
                k = model.cluster_centers_.shape[0]
                model_gmm = GMM(n_components=k, covariance_type=self.cvtype,
                                init_params='c', n_iter=0)
                model_gmm.means_ = model.cluster_centers_
                model_gmm.weights_ = sp.array(
                    [(model.labels_ == i).sum() for i in xrange(k)])
                model_gmm.fit(x)

                # evaluate goodness of fit
                self._ll[idx] = model_gmm.score(x).sum()
                if self.gof_type == 'aic':
                    self._gof[idx] = model_gmm.aic(x)
                if self.gof_type == 'bic':
                    self._gof[idx] = model_gmm.bic(x)

                print quant, k, self._gof[idx]
    def cluster_pixels_ms(self):
        # reshape
        """
        Cluster point descriptors by mean shift.
        :type self: ColorRemover
        """
        fg_pixels = self.img.fg_pixels.keys()
        descriptors = []
        for r, c in fg_pixels:
            descriptors.append(self.descriptor_map[r][c])
        descriptors = np.array(descriptors)
        descriptors = PCA(n_components=int(VECTOR_DIMENSION)/2).fit_transform(descriptors)
        # descriptors = self.descriptor_map.reshape(descriptors_rows, 1, VECTOR_DIMENSION)
        bandwidth = estimate_bandwidth(descriptors, quantile=0.05)
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(descriptors)
        labels = ms.labels_

        for i in range(len(labels)):
            xy = fg_pixels[i]
            label = labels[i]
            self.labels_map.itemset(xy, label)
        # save the indices and BGR values of each cluster as a dictionary keyed by label
        for label in np.unique(labels):
            self.pixels_of_hough_line_in_sphere[label] = map(tuple, np.argwhere((self.labels_map == label)))
            self.cluster_bgr[label] = map(tuple, self.img.bgr[self.labels_map == label])
Example #5
def cluster_data(data,clustering_method,num_clusters):
    cluster_centers = labels_unique = labels = extra = None
    if clustering_method == 'KMeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
        k_means = KMeans(n_clusters=num_clusters,init='k-means++',n_init=10,max_iter=100,tol=0.0001,
                                precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1)
        k_means.fit(data)
        labels = k_means.labels_
        cluster_centers = k_means.cluster_centers_
    elif clustering_method == 'MeanShift':
        ms = MeanShift(bin_seeding=True, cluster_all=False)
        ms.fit(data)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_
    elif clustering_method == 'AffinityPropagation':
        af = AffinityPropagation().fit(data)
        cluster_centers = [data[i] for i in  af.cluster_centers_indices_]
        labels = af.labels_
    elif clustering_method == "AgglomerativeClustering":
        n_neighbors=min(10,len(data)/2)
        connectivity = kneighbors_graph(data, n_neighbors=n_neighbors)
        ward = AgglomerativeClustering(n_clusters=num_clusters, connectivity=connectivity,
                               linkage='ward').fit(data)
        labels = ward.labels_
    elif clustering_method == "DBSCAN":
        db = DBSCAN().fit(data)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        extra = core_samples_mask
        labels = db.labels_

    if labels is not None:
        labels_unique = np.unique(labels)
    return labels,cluster_centers,labels_unique,extra
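A quick driver for the dispatcher above on synthetic blobs; the data here are hypothetical and the imports mirror the ones the snippet already relies on.

import numpy as np
from sklearn.datasets import make_blobs

# Three well-separated 2-D blobs as toy input
data, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.6, random_state=0)

labels, centers, labels_unique, extra = cluster_data(data, 'MeanShift', num_clusters=3)
print("found %d unique labels" % len(labels_unique))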
Example #6
def mean_shift_cluster_analysis(x,y,quantile=0.2,n_samples=1000):
    # ADAPTED FROM:
    # http://scikit-learn.org/stable/auto_examples/cluster/plot_mean_shift.html#example-cluster-plot-mean-shift-py
    # The following bandwidth can be automatically detected using estimate_bandwidth()
    X = np.hstack((x.reshape((x.shape[0],1)),y.reshape((y.shape[0],1))))
    bandwidth = estimate_bandwidth(X, quantile=quantile, n_samples=n_samples)
    
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    #print("number of estimated clusters : %d" % n_clusters_)
    colors = 'bgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmykbgrcmyk' #cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for i in xrange(n_clusters_):
        my_members = labels == i
        cluster_center = cluster_centers[i]
        plt.scatter(X[my_members, 0], X[my_members, 1],s=90,c=colors[i],alpha=0.7)
        plt.scatter(cluster_center[0], cluster_center[1],marker='+',s=280,c=colors[i])
    tolx = (X[:,0].max()-X[:,0].min())*0.03
    toly = (X[:,1].max()-X[:,1].min())*0.03
    plt.xlim(X[:,0].min()-tolx,X[:,0].max()+tolx)
    plt.ylim(X[:,1].min()-toly,X[:,1].max()+toly)
    plt.show()
    return labels
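For reference, a small driver for the plotting helper above; the coordinates are hypothetical, and matplotlib/numpy are assumed to be imported as in the snippet.

import numpy as np

# Two clumps of 2-D points, passed as separate x and y arrays
x = np.concatenate([np.random.normal(0, 1, 200), np.random.normal(8, 1, 200)])
y = np.concatenate([np.random.normal(0, 1, 200), np.random.normal(8, 1, 200)])

labels = mean_shift_cluster_analysis(x, y, quantile=0.2, n_samples=400)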
Example #7
def Mean_Shift(path):
    # import the data
    data = pandas.read_csv(filepath_or_buffer=path, delimiter=',', encoding='utf-8')
    data = data.drop_duplicates()
    print (data)
    # read the data
    values=data[['latitude', 'longitude']].values
    print("printing values")
    print (values)
    #Mean shift
    print ("Clustering data with the MeanShift algorithm")
    bandwidth = estimate_bandwidth(values, quantile=0.003, n_samples=None)
    #ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, min_bin_freq=20, cluster_all=False)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,min_bin_freq=25,cluster_all=False)
    ms.fit(values)
    data['cluster'] = ms.labels_
    data = data.sort_values(by='cluster')
    data = data[(data['cluster'] != -1)]
    print (data['cluster'])
    data['cluster'] = data['cluster'].apply(lambda x:"cluster" +str(x))
    labels_unique = np.unique(ms.labels_).tolist()
    del labels_unique[0]
    # Filtering clusters centers according to data filter
    cluster_centers = DataFrame(ms.cluster_centers_, columns=['latitude', 'longitude'])
    cluster_centers['cluster'] = labels_unique
    print (cluster_centers)
    n_centers_ = len(cluster_centers)
    print("number of clusters is :%d" % n_centers_)
    # print ("Exporting clusters to {}...'.format(clusters_file)")
    data.to_csv(path_or_buf="output/points.csv", columns=['user','latitude','longitude','cluster','picture','datetaken'], encoding='utf-8')
    #print ("Exporting clusters centers to {}...'.format(centers_file)")
    cluster_centers['cluster'] = cluster_centers['cluster'].apply(lambda x:"cluster" +str(x))
    cluster_centers.to_csv(path_or_buf="output/centers.csv", columns=['latitude', 'longitude','cluster'], encoding='utf-8')
    plot_meanshift(data, cluster_centers, n_centers_)
    return 0
def meanShift(points):
  # perform meanshift clustering of data
  meanshift = MeanShift()
  meanshift.fit(points.T)
  labels = meanshift.labels_
  centers = meanshift.cluster_centers_
  return np.array(labels)
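Note that this variant fits on points.T, so it expects the input as an (n_features, n_samples) array. A minimal, hypothetical call:

import numpy as np

# 2-D points stored column-wise: shape (2, n_samples)
points = np.random.rand(2, 100)
labels = meanShift(points)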
Example #9
def simplify_data1(x):
	X = np.array(zip(x,np.zeros(len(x))), dtype=np.float)
	bandwidth = estimate_bandwidth(X, quantile=0.2)
	ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
	ms.fit(X)
	labels = ms.labels_
	cluster_centers = ms.cluster_centers_
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)
	#print n_clusters_
	#exit()
	start=0
	value=0
	print x
	for k in range(n_clusters_):
	    my_members = labels == k
	    print "cluster {0}: {1}".format(k, X[my_members, 0]),np.average(X[my_members, 0])
	    value=np.average(X[my_members, 0])
	    val2=0
	    for i in xrange(start,start+len(X[my_members, 0])):
		val2+=X[i][0]
		print val2,X[i][0],i
		X[i][0]=value
	    print "FINAL",val2/len(X[my_members, 0])
	    start+=len(X[my_members, 0])
	return X[:,0]
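A tiny, hypothetical call for the routine above: each 1-D sample is replaced by the average of the cluster it falls into (note the routine assumes samples belonging to the same cluster are contiguous in x).

import numpy as np

# Values scattered around two levels; each group is flattened to its mean
x = np.concatenate([np.random.normal(1.0, 0.05, 20), np.random.normal(5.0, 0.05, 20)])
simplified = simplify_data1(x)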
Example #10
 def centers_y_clusters(self,graph_db,nodes,consulta,cyprop):
     group = []
     todo = []
     rr = []
     for n in nodes:
         tiene = neo4j.CypherQuery(graph_db, consulta+" where id(n) ="+str(n.id)+" return count(distinct(e))"+cyprop+" as cuenta").execute()
         for r in tiene:
             todo.append([r.cuenta])
             rr.append(r.cuenta)
         
     ms = MeanShift(bin_seeding=True)
     ms.fit(np.asarray(todo))
     labels = ms.labels_
     cluster_centers = sorted(ms.cluster_centers_ , key=lambda x: x[0])
     for idx,cl in enumerate(cluster_centers):
         cluster_centers[idx] = float(cl[0])
     for u in cluster_centers:
         group.append([])
     for n in nodes:
         tiene = neo4j.CypherQuery(graph_db, consulta+" where id(n) ="+str(n.id)+" return count(distinct(e))"+cyprop+" as cuenta").execute()
         for r in tiene:
             valor = r.cuenta
         for idx,v in enumerate(cluster_centers):
             if idx == 0:
                 temp1 = -9999
             else:
                 temp1 = (cluster_centers[idx-1] + cluster_centers[idx])/2
             if idx == len(cluster_centers) - 1:
                 temp2 = 99999
             else:
                 temp2 = (cluster_centers[idx+1] + cluster_centers[idx])/2
             if temp1 <= valor < temp2:
                 group[idx].append(n)
     return cluster_centers, group
Example #11
def make(filename, precision):
    with open('test.geojson') as f:
        data = json.load(f)

    features = data['features']
    points = [
        geo['geometry']["coordinates"]
        for geo in features if pred(geo)
    ]
    print points
    ar_points = array(points).reshape(len(points) * 2, 2)
    print ar_points
    bandwidth = estimate_bandwidth(ar_points) / precision
    cluster = MeanShift(bandwidth=bandwidth)
    cluster.fit(ar_points)
    labels = cluster.labels_
    cluster_centers = cluster.cluster_centers_
    print 'clusters:', len(unique(labels))

    for i, geo in enumerate(filter(pred, features)):
        geo['geometry']["coordinates"] = [
            list(cluster_centers[labels[i*2 + j]])
            for j in range(2)
        ]

    with open(filename, 'w') as f:
        json.dump(data, f)
Example #12
def meanShift(mtx, **kw):
    """
    meanShift(mtx, **kw) uses scikit-learn's meanshift clustering implementation to
    cluster infoDistance matrices.

    Call with the distance matrix as the first parameter. 
        Available Keyword arguments:
        startingbandwidth:  the lowest bandwidth to begin the estimation with (defaults to 0.1)
        bandwidthincrement:  the amount by which to increment bandwidth in between rounds of
                            meanshift (defaults to 0.01)
    """
    H = kw.get('startingbandwidth', 0.1)
    dH = kw.get('bandwidthincrement', 0.01)
    clustercenters = None
    nnonunary = []
    minH = None
    nclusters = mtx.shape[0]  # start as if every point were its own cluster
    while nclusters > 1:
        ms = MeanShift(bandwidth=H)
        ms.fit(mtx)
        centers   = ms.cluster_centers_
        clusters  = ms.labels_
        nclusters = len(np.unique(clusters))
        nonunary  = np.shape(np.where(np.bincount(clusters) > 1))[1]
        if nonunary:
            H = H + dH
Example #13
def do_meanshift(s_path, band1, band2, band3, band4, colour1, colour2,
                 make_plot):
    '''Meanshift clustering to determine the number of clusters in the
        data, which is passed to KMEANS function'''
    # Truncate data
    X = np.vstack([colour1, colour2]).T
    '''Compute clustering with MeanShift'''
    # Scale data because meanshift generates circular clusters
    X_scaled = preprocessing.scale(X)
    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(X). Bandwidth can also be set manually.
    bandwidth = estimate_bandwidth(X)
    #bandwidth = 0.65
    # Meanshift clustering
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)
    labels_unique = np.unique(ms.labels_)

    objects = ms.labels_[ms.labels_ >= 0]
    n_clusters = len(labels_unique[labels_unique >= 0])
    # Make plot
    if "meanshift" in make_plot:
        make_ms_plots(s_path, colour1, colour2, n_clusters, X, ms,
                      band1, band2, band3, band4, objects)
    return(n_clusters, bandwidth)
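A rough driver for the function above, with hypothetical colour arrays and band names; passing an empty make_plot list skips the external make_ms_plots helper (sklearn's preprocessing and estimate_bandwidth are assumed imported, as in the snippet).

import numpy as np

colour1 = np.random.normal(0.0, 0.3, 500)
colour2 = np.random.normal(1.0, 0.3, 500)

n_clusters, bandwidth = do_meanshift('plots/', 'u', 'g', 'r', 'i',
                                     colour1, colour2, make_plot=[])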
Example #14
def meanshift(raw_data, t):
    # Compute clustering with MeanShift
    # The bandwidth can be estimated automatically with estimate_bandwidth()
    #data = [ [(raw_data[i, 1]+raw_data[i, 5]), (raw_data[i, 2]+raw_data[i,6])] for i in range(raw_data.shape[0]) ]
    X = raw_data[:,1] + raw_data[:,5]
    Y = raw_data[:,2] + raw_data[:,6]
    #X = raw_data[:,1] ; Y = raw_data[:,2];
    data = np.transpose(np.concatenate((np.mat(X),np.mat(Y)), axis=0))
    bandwidth = estimate_bandwidth(data, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(data)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    print("number of estimated clusters : %d" % n_clusters_) 
    # Plot result
    plt.figure(t)
    plt.clf()
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(data[my_members, 0], data[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.axis('equal')
    plt.show()    
Example #15
def ms_algo(X, bandwidth=None):
    if bandwidth is None:
        n_samples = X.shape[0]
        bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=n_samples)

    # Apply the mean shift algorithm from the sklearn library
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

    # collect from the meanshift algorithm the labels and the centers of the clusters
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_


    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique) #Number of clusters

    # Print section
    print("The number of clusters is: %d" % n_clusters_)

    print("The centers are:")
    for i in range(n_clusters_):
        print i,
        print cluster_centers[i]

    return cluster_centers    
Example #16
def train(trainingData, pklFile, clusteringAll, numberOfClusters=None):
	# ========================================================================= #
	# =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= #
	# ========================================================================= #
	if (pklFile == ''):
		os.system('rm -rf learntModel && mkdir learntModel')
		pklFile = 'learntModel/learntModel.pkl'
	
	# ========================================================================= #
	# =============== STEP 2. PERFORM CLUSTERING TO THE DATA ================== #
	# ========================================================================= #
	if (numberOfClusters == None):
		print "Running MeanShift Model..."
		bandwidth = estimate_bandwidth(trainingData)
		ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=clusteringAll)
		ms.fit(trainingData)
		joblib.dump(ms, pklFile)
		return {"numberOfClusters":len(ms.cluster_centers_), "labels": ms.labels_, "clusterCenters":ms.cluster_centers_}
	
	elif (numberOfClusters != None):
		print "Running K-Means Model..."
		kMeans = KMeans(init='k-means++', n_clusters=numberOfClusters)
		kMeans.fit(trainingData)
		joblib.dump(kMeans, pklFile)
		return {"numberOfClusters":len(kMeans.cluster_centers_), "labels": kMeans.labels_, "clusterCenters":kMeans.cluster_centers_}
Example #17
def mean_shift(X):
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    return labels, cluster_centers
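A short usage sketch for the wrapper above on synthetic blobs; with cluster_all=False, orphan points keep the label -1.

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=0)
labels, centers = mean_shift(X)
print("clusters: %d" % len(np.unique(labels[labels >= 0])))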
Example #18
def BA_meanshift_cluster(mark, chrom):
    '''
    @param:
    @return:
    perform mean shift cluster on 2D data:
        ((chromStart+chromEnd)*0.5, chromEnd-chromStart)
    '''
    path = os.path.join(get_data_dir(), "tmp", mark,"{0}-{1}.csv".format(chrom, mark))
    DF = pd.read_csv(path, sep='\t')
    S_x = 0.5*(DF.loc[:, 'chromEnd'].values+DF.loc[:, 'chromStart'].values)
    S_y = DF.loc[:, 'chromEnd'].values-DF.loc[:, 'chromStart'].values
    X = np.hstack((np.atleast_2d(S_x[7000:8000]).T, np.atleast_2d(S_y[7000:8000]).T))
    print X
    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=1000)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    print list(set(labels))
    import matplotlib.pyplot as plt
    from itertools import cycle
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(len(list(set(labels)))), colors):
        my_members = labels == k
        plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.title('Estimated number of clusters: %d' % len(list(set(labels))))
    plt.show()
Example #19
    def get_clusters(self, in_file, cc_file, clf_file, arrivals_file, chunk_size=1710671):
        df = pd.read_csv(open(in_file), chunksize=chunk_size)
        dests = []
        part = 1
        lines = 1710671 / chunk_size
        try:
            dest = cPickle.load(open(arrivals_file))
        except IOError:
            for d in df:
                print "%d / %d" % (part, lines)
                part += 1
                for row in d.values:
                    # print eval(row[-1])
                    tmp = eval(row[-1])
                    if len(tmp) > 0:
                        dests.append(tmp[-1])
            dest = np.array(dests)
            cPickle.dump(dest, open(arrivals_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
        print "Destination points loaded"

        try:
            ms = cPickle.load(open(clf_file))
        except IOError:
            bw = 0.001
            ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5, n_jobs=-2)
            ms.fit(dest)
            cPickle.dump(ms, open(clf_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
        print "Mean shift loaded"
        cluster_centers = ms.cluster_centers_
        cPickle.dump(cluster_centers, open(cc_file, "w"), protocol=cPickle.HIGHEST_PROTOCOL)
        print "Clusters dumped"
Example #20
def hart85_means_shift_cluster(pair_buffer_df, features):

    from sklearn.cluster import MeanShift, estimate_bandwidth

    # Creating feature vector
    cluster_df = pd.DataFrame()
    if 'active' in features:
        cluster_df['active'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                   ((np.fabs(row['T1 Active']) + np.fabs(row['T2 Active'])) / 2), axis=1), index=pair_buffer_df.index)
    if 'reactive' in features:
        cluster_df['reactive'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                     ((np.fabs(row['T1 Reactive']) + np.fabs(row['T2 Reactive'])) / 2), axis=1), index=pair_buffer_df.index)
    if 'delta' in features:
        cluster_df['delta'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                  (row['T2 Time'] - row['T1 Time']), axis=1), index=pair_buffer_df.index)
        cluster_df['delta'] = cluster_df[
            'delta'].apply(lambda x: int(x) / 6e10)

    if 'hour_of_use' in features:
        cluster_df['hour_of_use'] = pd.DatetimeIndex(
            pair_buffer_df['T1 Time']).hour

    if 'sd_event' in features:
        cluster_df['sd_event'] = pd.Series(pair_buffer_df.apply(lambda row:
                                                                     (df.power[row['T1 Time']:row['T2 Time']]).std(), axis=1), index=pair_buffer_df.index)

    X = cluster_df.values.reshape((len(cluster_df.index), len(features)))
    ms = MeanShift(bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    return pd.DataFrame(cluster_centers, columns=features)
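A toy call for the routine above, using only the 'active' feature; the column names follow the T1/T2 convention the function expects, and the values are made up.

import numpy as np
import pandas as pd

pair_buffer_df = pd.DataFrame({
    'T1 Active': [100.0, 102.0, 98.0, 1500.0, 1480.0, 1510.0],
    'T2 Active': [-99.0, -101.0, -97.0, -1495.0, -1475.0, -1505.0],
})
centroids = hart85_means_shift_cluster(pair_buffer_df, ['active'])
print(centroids)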
Example #21
def find_clusters(feature, items, bandwidth=None, min_bin_freq=None, cluster_all=True, n_jobs=1):
    """
    Cluster list of items based on feature using meanshift algorithm (Binning).

    :param feature: key used to retrieve item to cluster on
    :param items:
    :param bandwidth:
    :param min_bin_freq:
    :param cluster_all:
    :return:
    """
    x = [item[feature] for item in items]
    X = np.array(list(zip(x, np.zeros(len(x)))), dtype=np.float)
    ms = MeanShift(bandwidth=bandwidth, min_bin_freq=min_bin_freq, cluster_all=cluster_all, n_jobs=n_jobs)
    ms.fit(X)

    labels = ms.labels_
    labels_unique = np.unique(labels)

    n_clusters_ = len(labels_unique)

    clusters = []

    for k in range(n_clusters_):
        if k != -1:
            my_members = labels == k
            cluster_center = np.median(X[my_members, 0])
            cluster_sd = np.std(X[my_members, 0])
            clusters.append({
                'center': cluster_center,
                'sd': cluster_sd,
                'items': X[my_members, 0]
            })

    return clusters
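A small, hypothetical example for the helper above: items are dicts and the clustering runs on a single numeric key.

items = [{'mass': 100.01}, {'mass': 100.04}, {'mass': 99.98},
         {'mass': 250.50}, {'mass': 250.55}]
clusters = find_clusters('mass', items, bandwidth=1.0)
for c in clusters:
    print("center=%.2f n=%d" % (c['center'], len(c['items'])))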
Example #22
def mean(X, save_fig=False, params_labels=None, prefix='clusters'):
    '''
    Compute clustering with MeanShift
    '''
    logger.debug('Calculating MeanShift clusters using %d parameters'%len(X[0]))
    
    X = np.array( X )
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bandwidth = estimate_bandwidth(X, quantile=0.2)
    
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(X)
        
    labels = ms.labels_
    
    if save_fig:
        plotClusters(X, ms, method='mean', prefix=prefix,
                     params=params_labels)
    
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)
    
    logger.debug('Found %d clusters with MeanShift algorithm'%n_clusters_)
    
    return labels
Example #23
def weekhour(lst,day,hour,num):

    l = [ ]
    for dicts in lst:
        latlong = dicts["latlong"]
        l.append(latlong)
    l = np.array(l)
    l = np.array([x for x in l if x[0] < 40])
    l = np.array([x for x in l if x[1] < -102.0])
    l = np.array([x for x in l if x[0] > 39])
    l = np.array([x for x in l if x[1] > -105.5])

    bandwidth = .001
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(l)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)


    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_), colors):
        my_members = labels == k
        cluster_center = cluster_centers[k]
        plt.plot(l[my_members,1], l[my_members,0], col + '.')
        plt.plot(cluster_center[1], cluster_center[0], 'x', markerfacecolor=col,\
    markeredgecolor='k', markersize=14)

    num_samples = len(labels)
    list_clust_cents = cluster_centers.tolist()
    num_labels = Counter(labels).most_common()
    top = tuple(num_labels)

    if num > n_clusters_:
        num = n_clusters_

    for i in range(num):
        densest = top[i][1]
        percent = round((float(densest)/float(num_samples))*100,3)
        if densest >= 60:
            import geocoder
            g = geocoder.google(list_clust_cents[i], method='reverse')
            address = g.address
        else:
            address = 0

        with open('weekdayclusterstest.csv', 'a') as csvfile:
            fieldnames = ['day', 'hour', 'densest cluster', 'address', 'percent',
                          'number of samples', 'number of estimated clusters']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            # write the header only once, while the file is still empty
            csvfile.seek(0, 2)
            if csvfile.tell() == 0:
                writer.writeheader()
            writer.writerow({'densest cluster': densest,
                             'day': day,
                             'hour': hour,
                             'address': address,
                             'percent': percent,
                             'number of samples': num_samples,
                             'number of estimated clusters': n_clusters_})
Example #24
def meanshiftUsingPCA(path):
	# Load original image given the image path
	im = cv.LoadImageM(path)
	#convert image to YUV color space
	cv.CvtColor(im,im,cv.CV_BGR2YCrCb)
	# Load bank of filters
	filterBank = lmfilters.loadLMFilters()
	# Resize image to decrease dimensions during clustering
	resize_factor = 1
	thumbnail = cv.CreateMat(im.height / resize_factor, im.width / resize_factor, cv.CV_8UC3)
	cv.Resize(im, thumbnail)
	# now work with resized thumbnail image
	response = np.zeros(shape=((thumbnail.height)*(thumbnail.width),51), dtype=float)
	for f in xrange(0,48):
		filter = filterBank[f]
		# Resize the filter with the same factor for the resized image
		dst = cv.CreateImage(cv.GetSize(thumbnail), cv.IPL_DEPTH_32F, 3)
		resizedFilter = cv.CreateMat(filter.height / resize_factor, filter.width / resize_factor, filter.type)
		cv.Resize(filter, resizedFilter)
		# Apply the current filter
		cv.Filter2D(thumbnail,dst,resizedFilter)
		for j in xrange(0,thumbnail.height):
			for i in xrange(0,thumbnail.width):
				# Select the max. along the three channels
				maxRes = max(dst[j,i])
				if math.isnan(maxRes):
					maxRes = 0.0
				if maxRes > response[thumbnail.width*j+i,f]:
					# Store the max. response for the given feature index
					response[thumbnail.width*j+i,f] = maxRes

	#YUV features
	count = 0
	for j in xrange(0,thumbnail.height):
		for i in xrange(0,thumbnail.width):
			response[count,48] = thumbnail[j,i][0]
			response[count,49] = thumbnail[j,i][1]
			response[count,50] = thumbnail[j,i][2]
			count+=1

	#get the first 4 principal components using pca
	pca = PCA(response)
	pcaResponse = zeros([thumbnail.height*thumbnail.width,4])

	for i in xrange(0,thumbnail.height*thumbnail.width):
		pcaResponse[i] = pca.getPCA(response[i],4)

	# Create new mean shift instance
	ms = MeanShift(bandwidth=10,bin_seeding=True)
	# Apply the mean shift clustering algorithm
	ms.fit(pcaResponse)
	labels = ms.labels_
	n_clusters_ = np.unique(labels)
	print "Number of clusters: ", len(n_clusters_)
	repaintImage(thumbnail,labels)
	cv.Resize(thumbnail, im)
	return im
Example #25
def do_meanshift (band1, band2, band3, band4, colour1, colour2, make_plots):
    '''Does meanshift clustering to determine a number of clusters in the 
        data, which is passed to KMEANS function'''

    data = np.loadtxt(inputdata)

    #Input Checking
    #if band1 == band2 or band3 == band4: 
        #print "Not a good idea to use the same band in one colour, try again"
        #return
    #for band in [band1, band2, band3, band4]:
        #if band not in band_names.keys():
            #print "Can't find %s in band_name list" %band
            #return
        
    #Import 4 different wavelengths
    #Colour 1: 05_mag
    wave1 = data[:, band_names[band1]]
    wave2 = data[:, band_names[band2]]
    
    #Colour 2: 05_mag
    wave3 = data[:, band_names[band3]]
    wave4 = data[:, band_names[band4]]
    
    gooddata1 = np.logical_and(np.logical_and(wave1!=badval, wave2!=badval), np.logical_and(wave3!=badval, wave4!=badval)) # Remove data pieces with no value 
    gooddata2 = np.logical_and(np.logical_and(wave1<maglim, wave2<maglim), np.logical_and(wave3<maglim, wave4<maglim))
    greatdata = np.logical_and(gooddata1, gooddata2)
    
    colour1 = wave1[greatdata] - wave2[greatdata]
    colour2 = wave3[greatdata] - wave4[greatdata]
    
      
    #Truncate data
    X = np.vstack([colour1, colour2]).T

    #Scale data because meanshift generates circular clusters 
    X_scaled = preprocessing.scale(X)

    # The following bandwidth can be automatically detected using
    # the routine estimate_bandwidth(). Bandwidth can also be set
    # as a value.

    bandwidth = estimate_bandwidth(X)

    # Meanshift clustering 
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X_scaled)

    labels_unique = np.unique(ms.labels_)
    n_clusters = len(labels_unique[labels_unique >= 0])
    
    #Make plot of clusters if needed
    
    if "MSplot" in make_plots:
        make_ms_plots(colour1, colour2, n_clusters, X, ms, band1, band2, band3, band4)
    
    return(n_clusters)
Example #26
def meanShiftClustering(centers_df,subject):
    #estimate the bandwidth to use with the mean shift algorithm. The quantile represents the distance used between the box centers to define a cluster; a smaller quantile means a smaller distance between points that end up in the same cluster
    centers_df=centers_df.reset_index()
    bandwidth=estimate_bandwidth(centers_df[['center_x','center_y']].as_matrix(), quantile=0.0055)
    #instantiate the mean shift algorithm
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    #fit the algorithm on the box center coordinates
    ms.fit(centers_df[['center_x','center_y']])
    #get the resulting cluster labels
    labels = ms.labels_
    #get the resulting centers of each *cluster*
    cluster_centers = ms.cluster_centers_

    labels_unique = np.unique(labels)
    #calculate the number of clusters by using the length of the list that contains all the unique labels
    n_clusters_ = len(labels_unique)

    #concatenate the centers data frame (which contains all the box coordinates, their dimensions, and their centers) with the clustering labels generated by the clustering
    boxes_df = pd.concat([centers_df,pd.DataFrame(labels,columns=['cluster_label'])],axis=1)

    #the aggregate function in the groupby, includes two functions: count and median
    f = {'Number of boxes in a cluster': ['count'],'Median': ['median']}
    #group by the label of each cluster and aggregate the boxes' top left coordinates and dimensions by applying the median
    aggregated_df = boxes_df.groupby('cluster_label')['cluster_label','tl_x','tl_y','width','height'].agg(f).reset_index()
    #change column names for a more descriptive name
    aggregated_df.columns = ['cluster_label','median_cluster_label','agg_tl_x','agg_tl_y','agg_width','agg_height','boxes_in_cluster','count_tl_x','count_tl_y','count_width','count_height']
    #leave out the unnecessary columns
    aggregated_df = aggregated_df[['cluster_label','agg_tl_x','agg_tl_y','agg_width','agg_height','boxes_in_cluster']]
    
    #Look at the output of the plotBoxes function (svg file) and determine at which THRESHOLD value there is a desired number of clusters (appears at the top of the plot) and that it visually matches the actual grid
    THRESHOLD = 5

    #filter out all the clusters that have less than a certain number of boxes in each cluster
    #use the old-weather-aggregator-with-plot.py script to check what the best threshold is
    aggregated_df = aggregated_df.loc[aggregated_df.boxes_in_cluster>THRESHOLD,:]
    good_clusters = np.unique(aggregated_df.cluster_label.values)

    print "for subject_id:"+str(subject)

    print "number of estimated clusters overall: %d" % n_clusters_

    print "number of estimated clusters, after small clusters were filtered out: %d" % len(good_clusters)

    print "clusters with more than %d boxes per cluster:" % THRESHOLD
    print aggregated_df.columns
    print aggregated_df.head()

    #save the aggregated boxes and their clusters into a csv file, separate file for each subject
    print "Saving the output/aggregated_df_%s.csv file..." % str(subject)
    aggregated_df.to_csv("output/aggregated_df_"+str(subject)+".csv",index=False)

    #make sure that only the boxes that belong to the good_clusters (have more boxes than the threshold) remain in the boxes_df dataframe and then save the dataframe
    boxes_df = boxes_df.loc[boxes_df['cluster_label'].isin(good_clusters),:]
    print "Saving the output/clustered_df_%s.csv file..." % str(subject)
    boxes_df.to_csv("output/clustered_df_"+str(subject)+".csv",index=False)

    plotBoxes(aggregated_df,boxes_df,cluster_centers)
Example #27
def test_parallel():
    ms1 = MeanShift(n_jobs=2)
    ms1.fit(X)

    ms2 = MeanShift()
    ms2.fit(X)

    assert_array_equal(ms1.cluster_centers_, ms2.cluster_centers_)
    assert_array_equal(ms1.labels_, ms2.labels_)
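The test above relies on a module-level X fixture; a rough stand-in if you want to run it on its own (the blob parameters here are illustrative, not necessarily scikit-learn's own), with assert_array_equal taken from numpy.testing.

import numpy as np
from numpy.testing import assert_array_equal
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs

centers = np.array([[1, 1], [-1, -1], [1, -1]]) + 10
X, _ = make_blobs(n_samples=300, centers=centers, cluster_std=0.4, random_state=11)

test_parallel()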
Example #28
def checkForClustering(catalog):
    debug("Checking for data clustering")
    Xfull = catalog.view(np.float64).reshape(catalog.shape + (-1,))[:,1:]
    X = Xfull[:,2:]
    
    
    debug("Using DBSCAN")
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_DBSCAN = len(set(labels)) - (1 if -1 in labels else 0)
    debug('Estimated number of clusters with DBSCAN: %d' % n_clusters_DBSCAN)
        
    unique_labelsDBSCAN = set(labels)
    colorsDBSCAN = plt.cm.rainbow(np.linspace(0, 1, len(unique_labelsDBSCAN)))
    
    debug("Estimating clusters using MeanShift")
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labelsMS = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_uniqueMS = np.unique(labelsMS)
    n_clusters_MS = len(labels_uniqueMS)
    debug("Estimated number of clusters with MeanShift: %d" % n_clusters_MS)
    
    # Plot result
    fig = plt.figure(figsize=(12,12))
    ax0 = fig.add_subplot(2,2,1)
    ax1 = fig.add_subplot(2,2,2)
    ax2 = fig.add_subplot(2,2,3)
    ax3 = fig.add_subplot(2,2,4)
    for k, col in zip(unique_labelsDBSCAN, colorsDBSCAN):
        if k == -1:
            col = 'k'
        class_member_mask = (labels == k)
        mask = class_member_mask & core_samples_mask
        xy = Xfull[mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][mask], catalog['CLASS_STAR'][mask], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        xy = Xfull[class_member_mask & ~core_samples_mask]
        ax0.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)
        ax2.plot(catalog['MAG_APER(1)'][class_member_mask & ~core_samples_mask], catalog['CLASS_STAR'][class_member_mask & ~core_samples_mask], 'o', markerfacecolor=col, markeredgecolor='k', markersize=5)

        ax0.set_title('DBSCAN: # clusters: %d' % n_clusters_DBSCAN)
        
        
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
    for k, col in zip(range(n_clusters_MS), colors):
        my_members = labelsMS == k
        cluster_center = cluster_centers[k]
        ax1.plot(Xfull[my_members, 0], Xfull[my_members, 1], col + '.')
        ax3.plot(catalog['MAG_APER(1)'][my_members], catalog['CLASS_STAR'][my_members], col + '.')
        #ax1.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
    ax1.set_title('MeanShift: # clusters: %d' % n_clusters_MS)
    plt.show()
Example #29
File: gc.py Project: biocryst/gc
def evaluate_candidate(options, work, top_frag, candidate):
    combined = []
    top_score,top_ind,top_support = top_frag
    cand_score,cand_ind,cand_support = candidate

    min_support = options.support_rmsd

    comb_ind = sorted(list(set(top_ind) | set(cand_ind)))
    comb_support = sorted(list(set(top_support) & set(cand_support)))
    n_comb_support=len(comb_support)
    if n_comb_support < min_support:
        return []

    if work.use_scores:
        comb_scores = [work.scores[i] for i in comb_support]

    aln_models = work.CA.take(comb_support, 0).take(comb_ind, 1)
    calculator = RMSDCalculator.RMSDCalculator("QCP_OMP_CALCULATOR", aln_models)
    dist = squareform(calculator.pairwiseRMSDMatrix())

    mds = manifold.MDS(n_components=2, dissimilarity="precomputed", n_jobs=1, n_init = 5)
    pos = mds.fit(dist).embedding_
    try:
        ms = MeanShift(bandwidth=options.bnd_rmsd, cluster_all=False, bin_seeding=True, min_bin_freq = min_support)
        ms.fit(pos)
    except Exception:
        try:
            ms = MeanShift(bandwidth=options.bnd_rmsd, cluster_all=False, bin_seeding=False)
            ms.fit(pos)
        except Exception:
            return []

    labels = ms.labels_
    labels_unique = np.unique(labels)
    for label in labels_unique:
        if label == -1:
            continue
        class_members = [index[0] for index in np.argwhere(labels == label)]
        class_support = [comb_support[i] for i in class_members]
        n_class_support = len(class_support)
        if n_class_support < min_support:
            continue
        
        class_dist = dist.take(class_members,0).take(class_members,1)
        mean_dist = np.mean(squareform(class_dist))
        if work.use_scores:
            class_scores = [comb_scores[i] for i in class_members]
            class_score = sum(class_scores)/(1+mean_dist)
        else:
            class_score = (-n_class_support)/(1+mean_dist)
            
        heapq.heappush(combined, (class_score, comb_ind, class_support))

    if combined:
        return heapq.heappop(combined)
    return []
Example #30
def MSclusterer(X):
	X = X.toarray()
	bandwidth = estimate_bandwidth(X, quantile=0.04, n_samples=500)
	ms = MeanShift(bandwidth=bandwidth, bin_seeding=False, cluster_all=False)
	ms.fit(X)
	labels = ms.labels_
	labels_unique = np.unique(labels)
	n_clusters_ = len(labels_unique)
	print(n_clusters_)
	return ms.labels_
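The function above expects a SciPy sparse matrix (it calls X.toarray() first), e.g. a document-term matrix; a hypothetical call:

import numpy as np
from scipy.sparse import csr_matrix

X_sparse = csr_matrix(np.random.rand(300, 5))
cluster_labels = MSclusterer(X_sparse)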
Example #31
from sklearn.cluster import MeanShift

from utils import load_bilateral_image, whiten
import matplotlib.pyplot as plt

# Get vectorized image
feat, im = load_bilateral_image()
H, W = im.shape[:2]
feat = whiten(feat)

ms = MeanShift(bandwidth=1, bin_seeding=True)
ms.fit(feat.reshape(-1, feat.shape[2]))
labels = ms.labels_

plt.subplot(1, 2, 1)
plt.imshow(im)
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(labels.reshape(H, W))
plt.axis('off')
plt.show()
Example #32
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle
import sys

filename = sys.argv[1]
print filename
filebuf = open(filename)
points = []
for line in filebuf.readlines():
    line = line.rstrip().split()
    tmp = [float(line[0]), float(line[1]), float(line[2])]
    points.append(tmp)
points = np.array(points)
bandwidth = estimate_bandwidth(points, quantile=0.1)
ms = MeanShift(bandwidth, bin_seeding=True)
ms.fit(points)
labels = ms.labels_
centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

wfiles = []
for i in range(n_clusters_):
    filename = 'file'
    filename += str(i)
    filename += '.sp'
    wfiles.append(filename)

filebuf = []
for f in wfiles:
Example #33
#Estimate bandwidth
# quantile : smoothing parameter. Quantiles are cut points dividing the range of a
# probability distribution into continuous intervals with equal probabilities (or
# dividing the observations in a sample in the same way).
# It should be in [0, 1]; 0.5 means that the median of all pairwise distances is used.

# n_samples : the number of samples to use. If not given, all samples are used.
# Increasing n_samples changes the estimated bandwidth only slightly, so there is
# little visible difference in the result.
bandwidth1 = estimate_bandwidth(flat_image, quantile=.1, n_samples=500)

#print(bandwidth1)


ms1 = MeanShift(bandwidth1, bin_seeding=True)

#Performing meanshift on flatImg
ms1.fit(flat_image)

#(r,g,b) vectors corresponding to the different clusters after meanshift
labels1=ms1.labels_


#Remaining colors after meanshift
cluster_centers1 = ms1.cluster_centers_


#Finding and displaying the number of clusters
labels_unique1 = np.unique(labels1)
n_clusters_1 = len(labels_unique1)
#print("number of estimated clusters : %d" % n_clusters_1)

# Displaying segmented image
Example #34
virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')

#plt.show()

print("Self band: ", estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print("Self MeanShift: ", analyzer.fit(iris_data))
print("Function mean_shift: ", mean_shift(iris_data))

labels, cluster_centers, n_clusters = mean_shift(iris_data)
fig = plt.figure()
ax = fig.add_subplot(111)

colors = cycle('bgrcmy')
print(labels)
for k, col in zip(range(n_clusters), colors):
    cluster_center = cluster_centers[k]
    if (labels == k):
        x, y = iris_data[0], iris_data[1]

    ax.scatter(x, y, c=col, linewidth=0.2)
    ax.scatter(x=cluster_center[0],
Example #35
from sklearn import datasets

iris = datasets.load_iris()
data = iris.data

from sklearn.cluster import MeanShift

clsfr = MeanShift(bandwidth=0.85)
clsfr.fit(data)
labels = clsfr.labels_

centroids = clsfr.cluster_centers_

print(len(centroids))
print(centroids)
Example #36
#----------------------------------------------------------------------
# Compute clustering with MeanShift
#
# We'll work with the scaled data, because MeanShift finds circular clusters

X_scaled = preprocessing.scale(X)

# The following bandwidth can be automatically detected using
# the routine estimate_bandwidth().  Because bandwidth estimation
# is very expensive in memory and computation, we'll skip it here.

#bandwidth = estimate_bandwidth(X)
bandwidth = 0.4

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
ms.fit(X_scaled)

labels_unique = np.unique(ms.labels_)
n_clusters = len(labels_unique[labels_unique >= 0])
print labels_unique
print bandwidth
print "number of estimated clusters : %d" % n_clusters

#------------------------------------------------------------
# Plot the results
fig = plt.figure(figsize=(5, 5))
ax = fig.add_subplot(111)

# plot density
H, C1_bins, C2_bins = np.histogram2d(colour1, colour2, 51)
Example #37
# It can be hard to tell from the data what the optimal window size is,
# so this method gives us (based on our data)
# the estimated best-suited window size
print()
print("Estimate Bandwidth")
bandwidth = estimate_bandwidth(titanic_data)
print(bandwidth)

# Task: Fit data to a meanshift model
from sklearn.cluster import MeanShift
import numpy as np

# So the "mean shift" is the center of the circle with radius "bandwidth" that covers the most points in a plot
analyzer = MeanShift(bandwidth=30)
fit = analyzer.fit(titanic_data)
print()
print("fit\n", fit)
labels = analyzer.labels_
print()
print("labels\n", labels)
uniqueLabels = np.unique(labels)
print("\n\nnp.unique(labels)\n", uniqueLabels)

# Task: How many clusters do we get
print()
print("Number of clusters:")
numberOfClusters = len(uniqueLabels)
print(numberOfClusters)

# Task: Add a column to the titanic dataframe with the cluster label for each person
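One way to finish that last task, as a minimal sketch: titanic_data is the DataFrame the model was fitted on above, and the column name is arbitrary.

# attach the cluster label assigned to each person as a new column
titanic_data['cluster'] = labels
print(titanic_data['cluster'].value_counts())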
Example #38
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.cluster import MeanShift

iris_data = pd.read_excel('iris_data.xlsx')
print(iris_data.head())

iris_data = pd.get_dummies(iris_data, columns=['Species'])
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')
#plt.show()

from sklearn.cluster import estimate_bandwidth
print(estimate_bandwidth(virginica, quantile=0.2))
print(estimate_bandwidth(versicolor, quantile=0.2))
print(estimate_bandwidth(setosa, quantile=0.2))
print(estimate_bandwidth(iris_data, quantile=1))
analyzer = MeanShift(bandwidth=1)
print(analyzer.fit(iris_data))
Example #39
            # replace the values using the dictionary
            df[col] = list(map(convert_to_int, df[col]))

    return df


df = handle_non_numerical_data(df)

# removing the survived column because that's what we are testing
X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = df['survived']

model = MeanShift()
model.fit(X)

# get the groups the model created
labels = model.labels_

# adding new col to original df with textual values for readability
original_df['cluster_group'] = np.nan

# adding the group value to the new col for all rows in df
for i in range(len(X)):
    # iloc references the row at index i
    original_df['cluster_group'].iloc[i] = labels[i]

# the number of groups we got from the model
n_clusters_ = len(np.unique(labels))
Example #40
# Z = linkage(face_encodings, 'ward')
# fig = plt.figure(figsize=(25, 10))
# dn = dendrogram(Z)

#mean-shift
if True:
    nuke_people()
    faces = list(Face.objects.all())
    face_encodings = np.array(
        [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces])
    X = StandardScaler().fit_transform(face_encodings)

    bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=500)

    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)

#DBSCAN
if False:
    nuke_people()
    faces = list(Face.objects.all())
    face_encodings = np.array(
        [np.frombuffer(bytes.fromhex(f.encoding)) for f in faces])
    X = StandardScaler().fit_transform(face_encodings)

    # #############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=5, min_samples=2).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
Example #41
import numpy as np
from sklearn.cluster import MeanShift  # as ms
from sklearn.datasets.samples_generator import make_blobs
import matplotlib.pyplot as pt
from mpl_toolkits.mplot3d import Axes3D
#from matplotlib import style

#style.use("ggplot")

points = [[3, 3, 3], [9, 9, 9], [2, 9, 9]]

mo, _ = make_blobs(n_samples=1500, centers=points, cluster_std=0.5)

shift_calculating = MeanShift()
shift_calculating.fit(mo)
datalabeling = shift_calculating.labels_
centersofpoints = shift_calculating.cluster_centers_

print(centersofpoints)

clustertospot = len(np.unique(datalabeling))

print("estimated clustering groups", clustertospot)

penning = 20 * ['y', 'c', 'm', 'p', 's', 'd', 't']

print(penning)
print(datalabeling)

fig = pt.figure()
axised = fig.add_subplot(111, projection='3d')
Example #42
                    1: np.float64
                })

print(X.head())

## This is the bit where it fits the data

ms = MeanShift(cluster_all=False)

# Convert the columns of interest to a NumPy array
# Multi-dimensional so could be anything really
msX = np.array(X.iloc[:, col_int])

# print (msX)

ms.fit(msX)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

n_clusters_ = len(np.unique(labels))

# print("Number of estimated clusters:", n_clusters_)
# print(labels)

## Add the labels to the original dataframe and output to csv for analysis

labels_df = pd.DataFrame(labels, columns=['LABELS'])
X = pd.concat([X, labels_df], axis=1)

X.to_csv(myPath + testout)
Example #43
    def do_work(self, train, uid, url):
        self.cap = cv2.VideoCapture(url)
        print(uid)

        self.kernel = np.ones((3, 3), np.uint8)

        self.frameWidth = int(self.cap.get(3))
        self.frameHeight = int(self.cap.get(4))

        self.outOriginal = cv2.VideoWriter(
            'cache/original.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'),
            24, (self.frameWidth, self.frameHeight))
        self.outDetect = cv2.VideoWriter(
            'cache/detect.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
            (self.frameWidth, self.frameHeight))
        self.outSkel = cv2.VideoWriter(
            'cache/skel.avi', cv2.VideoWriter_fourcc('X', 'V', 'I', 'D'), 24,
            (self.frameWidth, self.frameHeight))

        self.fgbg = cv2.bgsegm.createBackgroundSubtractorMOG()

        self.frameCount = 0

        cacheDir = os.path.join(os.getcwd(), 'cache')
        sourceDir = os.path.join(os.getcwd(), 'sources')
        try:
            pass
            os.remove(os.path.abspath(os.path.join(cacheDir, 'test.csv')))
        except OSError as e:
            pass
        try:
            if train:
                os.remove(
                    os.path.abspath(os.path.join(sourceDir,
                                                 str(uid) + '.csv')))
        except OSError as e:
            pass

        while self.frameCount < 240:

            status, frame = self.cap.read()

            if not status:
                break

            blur = cv2.GaussianBlur(frame, (9, 9), 0)
            fgmask = self.fgbg.apply(blur)

            img = cv2.dilate(fgmask, self.kernel, iterations=1)

            x, y, height, length = self.contourDetect(img)
            boxImg = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            boxImg = cv2.rectangle(boxImg, (x, y), (x + length, y + height),
                                   (0, 0, 255), 2)
            cv2.line(boxImg, (0, int(y + 0.75 * height)),
                     (640, int(y + 0.75 * height)), (0, 255, 0), 2)
            cv2.line(boxImg, (0, int(y + 0.15 * height)),
                     (640, int(y + 0.15 * height)), (255, 0, 0), 2)

            skel, hip, shoulder = self.skelRegion(img, x, y, height, length)

            if self.frameCount > 50 and self.frameCount < 151:
                if train:
                    with open('sources/' + str(uid) + '.csv', 'a',
                              newline='') as csvfile:
                        with open('cache/target.csv', 'a',
                                  newline='') as targetfile:
                            fieldnames = [
                                'height', 'stride', 'lowerbody', 'upperbody',
                                'hipangle', 'shoulderx', 'shouldery'
                            ]
                            writer = csv.DictWriter(csvfile,
                                                    fieldnames=fieldnames)

                            targetnames = ['class']
                            targetWriter = csv.DictWriter(
                                targetfile, fieldnames=targetnames)

                            writer.writerow({
                                'height':
                                height,
                                'stride':
                                length,
                                'lowerbody':
                                round(0.53 * height, 2),
                                'upperbody':
                                round(0.4 * height, 2),
                                'hipangle':
                                round(hip, 2),
                                'shoulderx':
                                shoulder[0],
                                'shouldery':
                                shoulder[1]
                            })

                            targetWriter.writerow({'class': uid})
                            targetWriter.writerow({'class': 0})
                else:
                    with open('cache/test.csv', 'a', newline='') as csvfile:
                        fieldnames = [
                            'height', 'stride', 'lowerbody', 'upperbody',
                            'hipangle', 'shoulderx', 'shouldery'
                        ]
                        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

                        writer.writerow({
                            'height': height,
                            'stride': length,
                            'lowerbody': round(0.53 * height, 2),
                            'upperbody': round(0.4 * height, 2),
                            'hipangle': round(hip, 2),
                            'shoulderx': shoulder[0],
                            'shouldery': shoulder[1]
                        })

            self.outOriginal.write(frame)

            self.outDetect.write(boxImg)
            self.outSkel.write(skel)

            self.frameCount += 1

            if self.frameCount % 10 == 0:
                if train:
                    self.trackProgress(self.frameCount / 240, True)
                else:
                    self.trackProgress(self.frameCount / 240, False)

        print("processing done!")
        self.cap.release()
        self.outDetect.release()
        self.outOriginal.release()
        self.outSkel.release()

        verify = False

        if train:
            pass
        else:
            csv_files = glob.glob('sources/*.csv')
            for cfile in csv_files:
                cf = pd.read_csv(cfile)
                master_array = cf.as_matrix()

                df = pd.read_csv('cache/test.csv')
                numpy_array = df.as_matrix()
                print(numpy_array)

                bandwidth = estimate_bandwidth(master_array, quantile=0.1)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(master_array)
                master_labels = ms.labels_
                master_centers = ms.cluster_centers_
                print("Master centroids:\n", master_centers)
                print("Number of Master clusters: ",
                      len(np.unique(master_labels)))

                bandwidth = estimate_bandwidth(numpy_array, quantile=0.1)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(numpy_array)
                labels = ms.labels_
                cluster_centers = ms.cluster_centers_
                print("Test centroids:\n", cluster_centers)
                print("Number of Test clusters: ", len(np.unique(labels)))

                bandwidth = estimate_bandwidth(master_centers, quantile=0.9)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(master_centers)
                master_centers = ms.cluster_centers_

                bandwidth = estimate_bandwidth(cluster_centers, quantile=0.9)
                ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
                ms.fit(cluster_centers)
                cluster_centers = ms.cluster_centers_

                # new_centers = np.concatenate((master_centers, cluster_centers))
                LIMIT = np.matrix([[5, 5, 5, 5, 5, 5, 5]])
                if abs(master_centers - cluster_centers).all() < LIMIT.all():
                    verify = True
                    uid = cfile.split('.')[0].split('/')[1]
                    data = self.fetchDatabase(uid)
                    img = open('cache/image.png', 'wb')
                    img.write(data[4])
                    img.close()
                    self.verifyDone.emit(str(data[0]), data[1], data[2],
                                         str(data[3]))
                    print(master_centers)
                    print(cluster_centers)

                    break

                print(master_centers)
                print(cluster_centers)

        if not verify and not train:
            self.unauthVerify.emit()
        self.threadCompleted.emit(bool(train))
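
The verification step above boils down to comparing two sets of MeanShift centroids feature by feature. A minimal standalone sketch of that comparison, assuming both runs produced centers of the same shape (the function name and the 5-unit tolerance are illustrative only):

import numpy as np

def centroids_match(master_centers, test_centers, limit=5.0):
    # sort both sets by their first feature so corresponding rows line up
    a = master_centers[np.argsort(master_centers[:, 0])]
    b = test_centers[np.argsort(test_centers[:, 0])]
    if a.shape != b.shape:
        return False
    # accept only if every coordinate differs by less than the tolerance
    return bool(np.all(np.abs(a - b) < limit))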
Пример #44
0
 def __call__(self, data):
     data = self.normer(data[self.keys].to_numpy())
     bandwidth = estimate_bandwidth(data, self.quantile, self.n_samples)
     ms = MeanShift(bandwidth)
     ms.fit(data)
     return [int(label) for label in ms.predict(data)]
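
MeanShift.fit already stores one label per input row in labels_, so fit_predict gives the same assignments as fitting and then predicting on the same data. A minimal sketch of the callable's core, assuming data is the normalized array built above:

from sklearn.cluster import MeanShift, estimate_bandwidth

def cluster_rows(data, quantile=0.3, n_samples=None):
    bandwidth = estimate_bandwidth(data, quantile=quantile, n_samples=n_samples)
    ms = MeanShift(bandwidth=bandwidth)
    # labels_ (returned by fit_predict) already holds one cluster id per row
    return [int(label) for label in ms.fit_predict(data)]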
Пример #45
0
kp2, des2 = sift.detectAndCompute(img2, None)

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

# collect the (x, y) coordinate of every keypoint into an (n, 2) array
x = np.array([kp.pt for kp in kp2])

bandwidth = estimate_bandwidth(x, quantile=0.1, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=True)
ms.fit(x)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)

s = [None] * n_clusters_
for i in range(n_clusters_):
    d, = np.where(labels == i)
    print(len(d))
    s[i] = [kp2[idx] for idx in d]

des2_ = des2
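
To work with the clusters afterwards, the keypoints can be grouped by label in one pass; a short sketch, assuming x, labels and kp2 are the objects defined above:

# label -> (n, 2) array of keypoint coordinates, and label -> list of cv2.KeyPoint objects
coord_groups = {k: x[labels == k] for k in np.unique(labels)}
keypoint_groups = {k: [kp2[i] for i in np.where(labels == k)[0]] for k in np.unique(labels)}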
Пример #46
0
def main(file_name):
    t0 = time.time()

    mat = cv2.imread(file_name, 0)
    ret, binary = cv2.threshold(mat, 100, 255, cv2.THRESH_BINARY)
    # OpenCV 3.x returns (image, contours, hierarchy); OpenCV 4.x returns only (contours, hierarchy)
    _, cnts, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
                                  cv2.CHAIN_APPROX_SIMPLE)

    print(" ### Len contour : %d ###" % len(cnts))

    mat = cv2.cvtColor(mat, cv2.COLOR_GRAY2BGR)

    for i in range(len(cnts)):

        centroid = getCentroid(cnts[i])

        # approxy contours
        length = cv2.arcLength(cnts[i], True)

        epsilon = 0.01 * length
        if epsilon < 3:
            epsilon = 3

        approx = cv2.approxPolyDP(cnts[i], epsilon, True)
        approx = approx.reshape(len(approx), 2)

        # The bandwidth can be estimated automatically from the approximated contour points
        bandwidth = estimate_bandwidth(approx,
                                       quantile=0.2,
                                       n_samples=len(approx))
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
        ms.fit(approx)

        labels = ms.labels_
        cluster_centers = ms.cluster_centers_

        for p in cluster_centers:
            p = (int(p[0]), int(p[1]))

            cv2.circle(mat, p, 5, (0, 255, 0), -1)

        # labels_unique = np.unique(labels)
        # n_clusters_ = len(labels_unique)

        # print("number of estimated clusters : %d" % n_clusters_)

        # # Plot result
        # import matplotlib.pyplot as plt
        # from itertools import cycle

        # plt.figure(1)
        # plt.clf()

        # colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        # for k, col in zip(range(n_clusters_), colors):
        #     my_members = labels == k
        #     cluster_center = cluster_centers[k]
        #     plt.plot(approx[my_members, 0], approx[my_members, 1], col + '.')
        #     plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
        #              markeredgecolor='k', markersize=14)
        # plt.title('Estimated number of clusters: %d' % n_clusters_)
        # plt.show()

        print("-Contours[%d]" % i)
        print("\t*centroid : ", centroid, "in contour : ",
              list(centroid) in arrContour2ListPoints(approx), "\n")
        print("\t*length : %.2f , epsilon : %.2f\n" % (length, epsilon))
        print("\t*approx : ", approx.shape, "\n")
        print("\t*cluster : ", cluster_centers.shape, "\n")

        cv2.putText(mat, "%d" % i, centroid, cv2.FONT_HERSHEY_COMPLEX, 1,
                    (0, 255, 255), 2)
    #     cv2.circle(mat,centroid,5,(0,255,),-1)
    #     for p in approx:
    #         cv2.circle(mat,tuple(p),3,(0,0,255),-1)

    dt = time.time() - t0
    print("* total time : %.2f\n" % dt)

    cv2.imshow("", mat)
    k = cv2.waitKey(0)

    cv2.destroyAllWindows()
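
The quantile passed to estimate_bandwidth controls how local the kernel is: smaller quantiles produce a smaller bandwidth and therefore more clusters. A minimal sketch on synthetic 2-D points (the data is illustrative only):

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

pts, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)
for q in (0.1, 0.2, 0.4):
    bw = estimate_bandwidth(pts, quantile=q)
    n = len(MeanShift(bandwidth=bw, bin_seeding=True).fit(pts).cluster_centers_)
    print("quantile=%.1f  bandwidth=%.2f  clusters=%d" % (q, bw, n))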
Пример #47
0
    data, y = ds.make_blobs(N,
                            n_features=2,
                            centers=centers,
                            cluster_std=[0.5, 0.25, 0.7, 0.5],
                            random_state=0)

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 9), facecolor='w')
    m = euclidean_distances(data, squared=True)
    bw = np.median(m)
    print(bw)
    for i, mul in enumerate(np.linspace(0.1, 0.4, 4)):
        band_width = mul * bw
        model = MeanShift(bin_seeding=True, bandwidth=band_width)
        ms = model.fit(data)
        centers = ms.cluster_centers_
        y_hat = ms.labels_
        n_clusters = np.unique(y_hat).size
        print('bandwidth factor:', mul, band_width, 'number of clusters:', n_clusters)

        plt.subplot(2, 2, i + 1)
        plt.title(u'bandwidth: %.2f, number of clusters: %d' % (band_width, n_clusters))
        clrs = []
        for c in np.linspace(16711680, 255, n_clusters):
            clrs.append('#%06x' % int(c))  # '%x' needs an int; linspace yields floats
        # clrs = plt.cm.Spectral(np.linspace(0, 1, n_clusters))
        for k, clr in enumerate(clrs):
            cur = (y_hat == k)
            plt.scatter(data[cur, 0], data[cur, 1], c=clr, edgecolors='none')
        plt.scatter(centers[:, 0],
            df[column] = list(map(convert_to_int, df[column]))

    return df


df = handle_non_numerical_data(df)
df.drop(['boat'], axis=1, inplace=True)
#print(df.head())

X = np.array(df.drop(['survived'], axis=1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
clusters_centers = clf.cluster_centers_
n_clusters = len(np.unique(labels))

original_df['cluster_group'] = np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]

survival_rates = {}
for i in range(n_clusters):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
    survival_rate = float(len(survival_cluster)) / len(temp_df)
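    # The example is truncated here; a typical continuation (a sketch, not part of
    # the original) records each cluster's rate and prints the mapping:
    survival_rates[i] = survival_rate

print(survival_rates)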
Пример #49
0
class aplicateClustering(object):

    def __init__(self, dataSet):
        self.dataSet = dataSet

    # method that applies k-means; produces different partitions of the data depending on the requested number of clusters...
    def aplicateKMeans(self, numberK):

        try:
            self.model = KMeans(n_clusters=numberK, random_state=1).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except:
            pass
            return 1

    # method that applies Birch clustering
    def aplicateBirch(self, numberK):

        try:
            self.model = Birch(threshold=0.2, branching_factor=50, n_clusters=numberK, compute_labels=True, copy=True).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except:
            pass
            return 1

    # method that applies hierarchical (agglomerative) clustering
    def aplicateAlgomerativeClustering(self, linkage, affinity, numberK):

        try:
            self.model = AgglomerativeClustering(n_clusters=numberK, affinity=affinity, memory=None, connectivity=None, compute_full_tree='auto', linkage=linkage).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except:
            pass
            return 1

    # method that applies AffinityPropagation with its various parameters...
    def aplicateAffinityPropagation(self):

        try:
            self.model = AffinityPropagation().fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except:
            pass
            return 1

    # method that applies DBSCAN
    def aplicateDBSCAN(self):

        try:
            self.model = DBSCAN(eps=0.3, min_samples=10).fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except:
            pass
            return 1

    # method that applies MeanShift clustering...
    def aplicateMeanShift(self):

        try:
            bandwidth = estimate_bandwidth(self.dataSet, quantile=0.2)
            self.model = MeanShift(bandwidth=bandwidth, bin_seeding=True)
            self.model = self.model.fit(self.dataSet)
            self.labels = self.model.labels_
            return 0
        except:
            return 1
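
A brief usage sketch for the class above, assuming its sklearn imports are in scope; the random data and variable names are illustrative only:

import numpy as np

demo_data = np.random.RandomState(0).rand(100, 4)
clusterer = aplicateClustering(demo_data)
if clusterer.aplicateMeanShift() == 0:   # the methods return 0 on success, 1 on failure
    print("clusters found:", len(np.unique(clusterer.labels)))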
Пример #50
0
def main():
    (options, args) = parseArguments()
    chunk_name = options.filename
    sample_transfer_params = np.array([
        options.sample_concurrency, options.sample_parallelism,
        options.sample_pipelining
    ])
    sample_transfer_throughput = options.sample_throughput
    if options.maxcc is not None:
        global maxcc
        maxcc = options.maxcc

    file_name = os.path.join(os.getcwd(), '../../target', chunk_name)

    resource_package = __name__  # Could be any module/package name
    resource_path = '/' + chunk_name  # Do not use os.path.join(), see below
    print(resource_package, sys.path)
    fin = pkg_resources.resource_stream(resource_package, resource_path)

    #print file_name
    #sys.exit()
    discarded_data_counter = 0
    all_experiments = []
    #fin = open(file_name, 'r')
    data, name, size, similarity = read_data_from_file(fin)
    while data is not None:
        data_copy = np.array(data)
        regression, degree, optimal_point = run_modelling(
            data_copy, name, data[0, :])
        if regression is None:
            #print "Skipped", name, size
            discarded_data_counter += 1
        elif name.startswith("SB") or name.startswith("sg"):
            discarded_data_counter += 1
        else:
            all_experiments.append(
                TransferExperiment(name, size, similarity, regression, degree,
                                   optimal_point, data[0, :]))
            #print "Read data point ", name, data[0,:], data
            #sys.exit(-1)
        data, name, size, similarity = read_data_from_file(fin)
    #print "Skipped:", discarded_data_counter,  "/", (len(all_experiments) + discarded_data_counter)
    fin.close()

    for experiment in all_experiments:
        poly = PolynomialFeatures(degree=experiment.poly_degree)
        experiment.estimated_troughput = experiment.regression.predict(
            poly.fit_transform(sample_transfer_params.reshape(1, -1)))
        experiment.set_closeness(
            abs(experiment.estimated_troughput - sample_transfer_throughput))

    all_experiments.sort(key=lambda x: x.closeness, reverse=True)
    for experiment in all_experiments:
        experiment.run_parameter_relaxation(options.cc_rate, options.p_rate,
                                            options.ppq_rate)
        #print experiment.name, experiment.estimated_troughput, " diff:", experiment.closeness, experiment.similarity, experiment.relaxed_params

    all_experiments.sort(key=lambda x: x.closeness, reverse=True)
    attrs = [experiment.closeness for experiment in all_experiments]

    #print attrs
    X = np.array(attrs, dtype=float).reshape(-1, 1)  # sklearn expects a 2-D (n_samples, n_features) array
    #print X
    bandwidth = estimate_bandwidth(X, quantile=0.2)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(X)
    labels = ms.labels_
    cluster_centers = ms.cluster_centers_
    labels_unique = np.unique(labels)
    n_clusters_ = len(labels_unique)

    label = labels[0]

    weight = 0
    #for experiment in all_experiments:

    sorted_centers = sorted(cluster_centers[:, 0], reverse=True)
    #print cluster_centers, sorted_centers

    #print "Sorted indexes"
    #for cluster_center in cluster_centers:
    #    print sorted_centers.index(cluster_center)

    #print labels
    for k in range(n_clusters_):
        my_members = labels == k
        #print my_members
        #print "cluster {0}: {1}".format(k, X[my_members, 0])

    my_members = labels == labels[-1]
    #similar_experiments = [experiment for experiment in all_experiments if experiment.closeness in  X[my_members]]
    #all_experiments = similar_experiments

    for experiment, label in zip(all_experiments, labels):
        rank = sorted_centers.index(cluster_centers[label, 0])
        # print "Traffic similar:", experiment.name, experiment.closeness, rank
        experiment.closeness_weight = 2**rank

    all_experiments.sort(key=lambda x: x.similarity)
    attrs = [experiment.similarity for experiment in all_experiments]
    db1 = DBSCAN(eps=2, min_samples=1).fit(np.array(attrs).reshape(-1, 1))
    similarity_labels = db1.labels_
    # print attrs, similarity_labels
    for experiment, similarity_label in zip(all_experiments,
                                            similarity_labels):
        experiment.similarity_weight = 2**similarity_label
    # print attrs, similarity

    all_experiments.sort(key=lambda x: x.closeness)

    for experiment in all_experiments:
        experiment.run_parameter_relaxation(options.cc_rate, options.p_rate,
                                            options.ppq_rate)

    total_weight = 0
    total_thr = 0
    total_params = [0, 0, 0]

    for experiment in all_experiments:
        if experiment.similarity_weight < 1:
            # print experiment.name, experiment.closeness, experiment.closeness_weight, experiment.similarity, experiment.similarity_weight
            continue
        weight = experiment.similarity_weight * experiment.closeness_weight
        total_weight += weight
        weighted_params = [
            param * weight for param in experiment.relaxed_params
        ]
        print "HEYYY", experiment.name, experiment.closeness, experiment.closeness_weight, experiment.similarity_weight, weight, experiment.relaxed_params, experiment.first_row
        total_params = list(map(add, total_params, weighted_params))
        total_thr += weight * experiment.relaxed_throughput

    final_params = [x / (total_weight * 1.0) for x in total_params]
    final_throughput = total_thr / total_weight
    return final_params, final_throughput
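
estimate_bandwidth and MeanShift.fit both expect a 2-D array of shape (n_samples, n_features), so a single scalar feature such as closeness has to be reshaped into a column. A minimal sketch with illustrative values:

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

rng = np.random.RandomState(0)
closeness = np.concatenate([rng.normal(0.5, 0.05, 20),
                            rng.normal(3.0, 0.05, 20),
                            rng.normal(7.0, 0.05, 20)]).reshape(-1, 1)
bw = estimate_bandwidth(closeness, quantile=0.2)
labels = MeanShift(bandwidth=bw, bin_seeding=True).fit_predict(closeness)
print("clusters:", len(np.unique(labels)))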
Пример #51
0
def meanShiftClustering(array1, array2, array3, numberOfMinSamples):
    convertedArray = convertArrayFormat(array1, array2, array3)
    arrayToSave = convertedArray
    convertedArray = StandardScaler().fit_transform(convertedArray)
    if (len(convertedArray) > 3):
        # Compute MeanShift
        # bandwidth = estimate_bandwidth(convertedArray, quantile = 0.2, n_samples = 500)
        # ms = MeanShift(bandwidth = bandwidth, bin_seeding = True)
        ms = MeanShift()
        ms.fit(convertedArray)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_

        # counting the number of members in each cluster and assigning clusters with fewer than 'numberOfMinSamples' members to noise
        counterVec = ([])
        maxLabel = max(labels)
        labels = list(labels)
        for i in range(0, maxLabel + 1):
            counterVec = np.append(counterVec, labels.count(i))

        # checking the labels whose count is less than 'numberOfMinSamples'
        noiseLabelVec = ([])
        for i in range(0, len(counterVec)):
            if (counterVec[i] < numberOfMinSamples):
                noiseLabelVec = np.append(noiseLabelVec, i)

        # creating new labels, with excluding those whose number is less than 'numberOfMinSamples' and handle them as noise
        newLabels = []
        for i in range(0, len(labels)):
            if (np.any(noiseLabelVec == labels[i])):
                newLabels.append(-1)
            else:
                newLabels.append(labels[i])

        # changing labels to match those of other clustering algorithms
        for i in range(0, len(newLabels)):
            # decrease labelling uniformly by 1
            newLabels[i] -= 1

        # changing the -1 labels to max, and -2 labelling back to the original -1
        maxLabel = max(newLabels)
        for i in range(0, len(newLabels)):
            if (newLabels[i] == -1):
                newLabels[i] = (maxLabel + 1)
            if (newLabels[i] == -2):
                newLabels[i] = -1
            else:
                pass

        # Number of clusters in labels, ignoring noise if present
        labels_unique = np.unique(newLabels)
        n_clusters_ = len(set(labels_unique)) - (1 if -1 in newLabels else 0)
    else:
        # too few points to cluster; fall back to a single noise group so the plot and return below still work
        n_clusters_ = 0
        newLabels = [-1] * len(convertedArray)
        cluster_centers = np.empty((0, convertedArray.shape[1]))

    print ('number of estimated clusters : %d' % n_clusters_)

    fig = plt.figure(figsize = (16,12)) # in inches
    colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')

    newLabels = np.asarray(newLabels) # for accurate plotting
    for k, col in zip(range(n_clusters_), colors):
        my_members = newLabels == k
        cluster_center = cluster_centers[k]
        plt.plot(convertedArray[my_members, 0], convertedArray[my_members, 1], col + '.')
        plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col, markeredgecolor='k', markersize=14)
    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()

    return arrayToSave, newLabels
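
The small-cluster-to-noise post-processing above can also be written with numpy counting; a compact sketch, where min_samples plays the role of numberOfMinSamples:

import numpy as np

def drop_small_clusters(labels, min_samples):
    labels = np.asarray(labels)
    values, counts = np.unique(labels, return_counts=True)
    small = values[counts < min_samples]
    out = labels.copy()
    out[np.isin(labels, small)] = -1   # members of under-populated clusters become noise
    return out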
Пример #52
0
def cropEntries(image, file, padding):
    #t1 = time.time()
    croppedImages = []
    crop_points = []
    img = image.copy()
    height, width = img.shape[:2]
    sf = float(width) / float(2611)
    pad = int(padding / float(height) * float(11675))
    histogram = pd.Series(
        [width - cv2.countNonZero(img[i, :]) for i in list(range(height))])
    # do plots.
    #fig = plt.figure()
    #ax = histogram.plot()
    #ax.set_ylim([0,150])
    #ax.set_xlim([10500,11500])
    #plt.savefig('histogram' + file + '.pdf', bbox_inches='tight')
    #plt.close(fig)

    dip_df = histogram[histogram < sf * 25].to_frame().rename(
        columns={0: 'count'})
    indices = np.array(dip_df.index.tolist()).reshape(-1, 1)
    #pkl.dump(indices, open('indices.pkl', 'wb'))
    #t2 = time.time()
    #print('Prep time: ' + str(round(t2-t1, 2)) + ' s')

    # find indices to cut the entries
    #tf1 = time.time()
    ms = MeanShift(bandwidth=sf * 50, bin_seeding=True)
    ms.fit(indices)
    dip_group = ms.predict(indices)
    #tf2 = time.time()
    #print('Fit time: ' + str(round(tf2-tf1, 2)) + ' s')

    # add new column
    #t1 = time.time()
    dip_df = dip_df.assign(group=dip_group)
    #cut_points = [0] + sorted(dip_df.groupby('group').apply(lambda x: int(np.mean(x.index))).tolist())[1:-1] + [height]

    #calculate where to cut
    cut_points = [0] + sorted(
        dip_df.groupby('group').idxmin()['count'].tolist())[1:-1] + [height]
    median_height = np.median([
        cut_points[i + 1] - cut_points[i]
        for i in list(range(len(cut_points) - 1))
    ])
    #t2 = time.time()
    #print('Sort time: ' + str(round(t2-t1, 2)) + ' s')

    #for each pair of cut points found
    for i in list(range(len(cut_points) - 1)):
        start, end = cut_points[i], cut_points[i + 1]

        # if we suspect an entry is too large
        if end - start > 1.5 * median_height:
            # do the algorithm over again
            entry_hist = pd.DataFrame(data={
                'count': [
                    float(width - cv2.countNonZero(img[j, :]))
                    for j in list(range(start, end))
                ]
            },
                                      index=list(range(start, end)))
            entry_dip_df = entry_hist[entry_hist['count'] < sf * 100]
            entry_indices = np.array(entry_dip_df.index.tolist()).reshape(
                -1, 1)
            entry_ms = MeanShift(bandwidth=sf * 50, bin_seeding=True)
            entry_ms.fit(entry_indices)
            entry_dip_group = entry_ms.predict(entry_indices)
            entry_dip_df = entry_dip_df.assign(entry_group=entry_dip_group)
            entry_cut_points = [start] + sorted(
                entry_dip_df.groupby('entry_group').idxmin()
                ['count'].tolist())[1:-1] + [end]

            # if you have too many cut points for one entry
            if len(entry_cut_points) > 2:
                #print(entry_cut_points)
                #fig2 = plt.figure()
                #ax = entry_hist['count'].plot()
                #for xval in entry_cut_points:
                #ax2 = plt.axvline(x = xval, linestyle = ':', color = 'r')
                #ax.set_ylim([0,300])
                #plt.savefig('entry_hist' + file + str(i+1) + '.pdf', bbox_inches='tight')
                #plt.close(fig2)

                for entry_i in list(range(len(entry_cut_points) - 1)):
                    # adjust the cut points
                    if histogram.iloc[entry_cut_points[entry_i]:
                                      entry_cut_points[entry_i +
                                                       1]].sum() > sf * 20:
                        adjusted_start = entry_cut_points[entry_i]
                        adjusted_end = entry_cut_points[entry_i + 1]
                        while (histogram.iloc[adjusted_start]
                               == 0) and (adjusted_start < (adjusted_end - 1)):
                            adjusted_start += 1
                        while (histogram.iloc[adjusted_end - 1]
                               == 0) and ((adjusted_end - 1) > adjusted_start):
                            adjusted_end -= 1
                        adjusted_start = max(adjusted_start - pad, 0)
                        adjusted_end = min(adjusted_end + pad, height)
                        croppedImages.append(img[adjusted_start:adjusted_end,
                                                 0:width])
                        crop_points.append([adjusted_start, adjusted_end])
            else:
                if entry_hist['count'].sum() > sf * 20:
                    # adjust cut points
                    adjusted_start = start + 0
                    adjusted_end = end - 0
                    while (histogram.iloc[adjusted_start]
                           == 0) and (adjusted_start < (adjusted_end - 1)):
                        adjusted_start += 1
                    while (histogram.iloc[adjusted_end - 1]
                           == 0) and ((adjusted_end - 1) > adjusted_start):
                        adjusted_end -= 1
                    adjusted_start = max(adjusted_start - pad, 0)
                    adjusted_end = min(adjusted_end + pad, height)
                    croppedImages.append(img[adjusted_start:adjusted_end,
                                             0:width])
                    crop_points.append([adjusted_start, adjusted_end])
        else:
            # if the cut points end up possibly cutting words
            if histogram.iloc[start:end].sum() > sf * 20:
                # adjust cut points
                adjusted_start = start + 0
                adjusted_end = end - 0
                while (histogram.iloc[adjusted_start]
                       == 0) and (adjusted_start < (adjusted_end - 1)):
                    adjusted_start += 1
                while (histogram.iloc[adjusted_end - 1]
                       == 0) and ((adjusted_end - 1) > adjusted_start):
                    adjusted_end -= 1
                adjusted_start = max(adjusted_start - pad, 0)
                adjusted_end = min(adjusted_end + pad, height)
                croppedImages.append(img[adjusted_start:adjusted_end, 0:width])
                crop_points.append([adjusted_start, adjusted_end])
    #pkl.dump(crop_points, open('crop_points.' + file + '.pkl', 'wb'))
    return croppedImages, crop_points
Пример #53
0
    def postionInSmallMapExtract(self,mapPic):
        # input pic is a 0/1 matrix
        # the input is the white-box region of the minimap; every value is either 0 or 1
        if np.max(mapPic) > 1:
            print('image data contains values greater than 1')
        if (mapPic == 0).all():
            print('image is all zeros')
            return

        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))  # structuring element
        mapPic = cv2.morphologyEx(mapPic, cv2.MORPH_OPEN, kernel)  # morphological opening to remove speckle noise
        # self.picDisplay(mapPic,'map2')
        # exit()

        lieHe = np.sum(mapPic, axis=0)   # sum of each column -> one row
        hangHe = np.sum(mapPic, axis=1)  # sum of each row -> one column

        lieKuangXianPos = []
        hangKuangXianPos = []
        lieAreaList = []
        hangAreaList = []
        for i in range(len(lieHe)):
            if lieHe[i]> 6:
                lieKuangXianPos.append(i)
            elif lieHe[i]>0:
                lieAreaList.append(i)

        for i in range(len(hangHe)):
            if hangHe[i] > 6:
                hangKuangXianPos.append(i)
            elif hangHe[i]>0:
                hangAreaList.append(i)


        # print('row coordinates: %s  column coordinates: %s' % (hangKuangXianPos, lieKuangXianPos))
        # print('center point: %s' % ([np.mean(hangKuangXianPos), np.mean(lieKuangXianPos)]))
        # print(lieAreaList)
        # print(hangAreaList)

        # row coordinates: [143, 144, 170, 171]  (actually the vertical positions)
        # column coordinates: [89, 90, 137, 138]  (actually the horizontal positions)
        # center point: [157.0, 113.5]
        # vertical spacing 13.5, horizontal spacing 24: half the rectangle size, i.e. the distance to its center


        # handle the case where part of the rectangle's border falls outside the minimap
        centerPos = []
        centerPos.append(np.mean(lieAreaList))   # x coordinate of the rough center
        centerPos.append(np.mean(hangAreaList))  # y coordinate of the rough center

        hangChangdu = len(hangKuangXianPos)
        lieChangdu = len(lieKuangXianPos)
        hangKuangXianPos = np.array(hangKuangXianPos).reshape(1,hangChangdu)
        lieKuangXianPos = np.array(lieKuangXianPos).reshape(1,lieChangdu)

        # use clustering to find the mid-lines of the border; this pass recovers the vertical (y) coordinates
        zeros = np.zeros([1, hangChangdu])
        points = np.array([hangKuangXianPos, zeros]).T.reshape(hangChangdu,2)
        # print(points)
        # print(points.shape)
        ms = MeanShift(bandwidth=2)
        ms.fit(points)

        cluster_centers = ms.cluster_centers_
        print(cluster_centers)
        ys = []
        for i in cluster_centers:
            for j in i:
                if j!=0:
                    ys.append(j)

        # use clustering to find the mid-lines of the border; this pass recovers the horizontal (x) coordinates
        zeros = np.zeros([1, lieChangdu])
        points = np.array([lieKuangXianPos, zeros]).T.reshape(lieChangdu,2)
        # print(points)
        # print(points.shape)
        ms = MeanShift(bandwidth=4)
        ms.fit(points)

        cluster_centers = ms.cluster_centers_
        # print(cluster_centers)
        xs = []
        for i in cluster_centers:
            for j in i:
                if j!=0:
                    xs.append(j)

        centerPoint = [0,0]

        if len(xs) == 2:
            centerPoint[0] = np.mean(xs)
        elif len(xs) == 1:
            if xs[0] > centerPos[0]:
                centerPoint[0] = xs[0] - 24
            else:
                centerPoint[0] = xs[0] + 24
        else:
            print('error: clustering returned neither 1 nor 2 points, xs: %s' % (xs))

        if len(ys) == 2:
            centerPoint[1] = np.mean(ys)
        elif len(ys) == 1:
            if ys[0] > centerPos[1]:
                centerPoint[1] = ys[0] - 13.5
            else:
                centerPoint[1] = ys[0] + 13.5
        else:
            print('error: clustering returned neither 1 nor 2 points, ys: %s' % (ys))

        # print('estimated center: %s  refined center: %s' % (centerPos, centerPoint))
        return centerPoint
Пример #54
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the 3d projection
from sklearn.cluster import MeanShift
data = pd.read_csv("data.csv")
s = data[["G3", "Dalc", "failures"]]
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(s["Dalc"], s["failures"], s["G3"])
ax.set_xlabel("Daily Alchol consumption")
a = [1, 2, 3, 4]
ax.set_ylabel("Failures")  #Check the pics
ax.set_zlabel("Grade point")  #Check the pic
plt.show()

colors = np.array(["Red", "Blue", "Green"])
kmeans = MeanShift()
kmeans.fit(s)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(s["Dalc"], s["failures"], s["G3"], color=colors[kmeans.labels_])
ax.set_ylabel("Free time")  #Check the pics
ax.set_zlabel("Grade point")  #Check the pic
ax.set_xlabel("Daily alcholic consumption")
#s["Fjobn"]
plt.show()

s = s.groupby(["Dalc", "failures"])["G3"].mean()
s = s.reset_index()
print(s)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(s["Dalc"], s["failures"], s["G3"])
Пример #55
0
	def combined(self,cluster_num=None,isPlot=False):
		'''Parameter
		      - cluster_num: input "int", manually determine number of clusters
		      - isPlot: whether to plot the result
		   Explanation
		      - utilize Mean-Shift method to make initial centroids of K-Means
		'''

		print(" [*] starting meanshift-kmeans combind method ")

		self.data_preprocessing()
		print(" [*] data pre-processing done ")

		ind = 0
		while self.data.shape[1] > self.data.shape[0]:

			self.dimension_reduction(cont_rate=self.cont_rate_list[ind])

			if self.data.shape[1] > self.data.shape[0]:
				print("  -  reduced data dimension larger then data amount, alter another contribution rate ")
				ind += 1

			else:
				print("  -  reduced data dimension smaller then data amount ")
				break

		print(" [*] dimensionality reduction done ")

		X = np.array(self.data)
		bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=10000, random_state=42, n_jobs=2) 
		ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
		ms.fit(X)
		labels = ms.labels_
		cluster_centers = ms.cluster_centers_

		labels_unique = np.unique(labels)
		n_clusters_ = len(labels_unique)

		dict_1 = []
		for i in labels:
			dict_1.append((i,str(i)))

		np.random.seed(42)
		data = np.array(cluster_centers) 

		if n_clusters_ > 2:
			if not cluster_num:

				range_n_clusters = list(range(2,n_clusters_))
				silhouette_avg = []

				for n in range_n_clusters:
					estimator = KMeans(init='random', n_clusters=n, max_iter=1000, n_init=10)
					cluster_labels = estimator.fit_predict(data)
					silhouette_avg.append(silhouette_score(data, cluster_labels))
					print("  -  For n_clusters = " + str(n) + ", the average silhouette_score is : " + str(silhouette_avg[-1]) + ".")

				# use K-Means to cluster Mean-Shift centroids
				n_digits = range_n_clusters[silhouette_avg.index(max(silhouette_avg))] #
				print("  -  Choose n_clusters = " + str(n_digits) + " with max average silhouette score as final clusters number.")
				kmeans = KMeans(init='random', n_clusters=n_digits, max_iter=1000, n_init=10)
				output_label = kmeans.fit_predict(data) 
				cluster_centers = kmeans.cluster_centers_

				# use clustered Mean-Shift centroids as initial centroids of K-Means
				kmeans = KMeans(init=cluster_centers, n_clusters=len(cluster_centers), max_iter=1000)
				output_label = kmeans.fit_predict(self.data) 

			else:
				# use K-Means to cluster Mean-Shift centroids
				kmeans = KMeans(init='random', n_clusters=min(cluster_num,n_clusters_), max_iter=1000, n_init=10)
				output_label = kmeans.fit_predict(data) 
				cluster_centers = kmeans.cluster_centers_

				# use clustered Mean-Shift centroids as initial centroids of K-Means
				kmeans = KMeans(init=cluster_centers, n_clusters=len(cluster_centers), max_iter=1000)
				output_label = kmeans.fit_predict(self.data) 

		else:
			output_label = labels

		print(" [*] end ")

		if isPlot and X.shape[1] >= 2:
			plt.figure(2)
			plt.clf()
			colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
			plot_shape = list('.^*o+dp.^*o+dp.^*o+dp^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.')

			for k, col in zip(list(set(output_label)), colors):
				my_members = output_label == k
				cluster_center = cluster_centers[k]
				plt.plot(X[my_members, 0], X[my_members, 1], col + plot_shape[k])
				plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
						markeredgecolor='k', markersize=14)

			plt.title('MeanShift-KMeans')
			plt.show()

		dict_2 = {}
		for i in range(len(output_label)):
			dict_2.update({str(i):output_label[i]})

		new_labels = []
		for i in range(len(dict_1)):
			new_labels.append(dict_2[dict_1[i][1]])
		new_labels = np.array(new_labels).astype(int)

		labels_unique = np.unique(new_labels)
		n_clusters_ = len(labels_unique)

		new_labels_pd = pd.DataFrame(new_labels,index=self.data.index.tolist(),columns=['labels'])
		data_cluster_dict = {}
		output = pd.DataFrame(index=list(set(new_labels_pd['labels'])),\
							columns=self.data.columns)

		for label in set(new_labels_pd['labels']):
			a = list(new_labels_pd[(new_labels_pd['labels'] == label)].index)
			tmp = self.data.loc[a]
			for col in list(self.data.columns):
				output.loc[label][col] = tmp[col].mean()
				data_cluster_dict.update({label:a})

		if isPlot and X.shape[1] == 1:
			x = []
			for i in range(len(data_cluster_dict)):
				x.extend(data_cluster_dict[i])

			self.data = self.data.reindex(x)
			self.data = self.data.reset_index(drop=True)

			j = 0
			plot_shape = list('.^*o+dp.^*o+dp.^*o+dp^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.^*o+dp.')
			plot_color = list('bgrcmykbgrcmykbgrcmykbgrcmyk')
			for i in range(len(data_cluster_dict)):
				plt.plot(self.data.loc[j:j+len(data_cluster_dict[i])-1], color=plot_color[i], marker=plot_shape[i], \
					linestyle='', linewidth=2.0)
				j += len(data_cluster_dict[i])
			plt.show()

		return output, data_cluster_dict
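
The core of the combined method fits in a few lines; a minimal sketch, assuming X is the preprocessed feature matrix and k the desired number of clusters:

from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth

def meanshift_then_kmeans(X, k):
    bw = estimate_bandwidth(X, quantile=0.2)
    centers = MeanShift(bandwidth=bw, bin_seeding=True).fit(X).cluster_centers_
    # reduce the Mean-Shift centroids to k seeds, then use them to initialise K-Means on the full data
    seeds = KMeans(n_clusters=min(k, len(centers)), n_init=10, random_state=0).fit(centers).cluster_centers_
    final = KMeans(init=seeds, n_clusters=len(seeds), n_init=1, max_iter=1000).fit(X)
    return final.labels_, final.cluster_centers_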
Пример #56
0
def cluster(x, y, n_class):
    plt.figure()

    plt.subplot(241)
    plt.title('RAW')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=y)

    kmeans_pre = KMeans(n_clusters=n_class, random_state=9).fit_predict(x)
    plt.subplot(242)
    plt.title('K-Means')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=kmeans_pre)

    aff_pre = AffinityPropagation(preference=-50).fit(x)
    cluster_centers_indices = aff_pre.cluster_centers_indices_
    labels = aff_pre.labels_
    n_clusters_ = len(cluster_centers_indices)
    plt.subplot(243)
    plt.title('AffinityPropagation@{}'.format(unique(labels)))
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels)

    bandwidth = estimate_bandwidth(x, quantile=0.2, n_samples=500)
    ms = MeanShift(bandwidth=bandwidth / 2)
    ms.fit(x)
    labels1 = ms.labels_
    cluster_centers = ms.cluster_centers_
    plt.subplot(244)
    plt.title('MeanShift@{}'.format(unique(labels1)))
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels1)

    sc_pre = SpectralClustering(n_clusters=n_class).fit_predict(x)
    plt.subplot(245)
    plt.title('SpectralClustering')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=sc_pre)

    clustering = AgglomerativeClustering(n_clusters=n_class).fit(x)
    labels2 = clustering.labels_
    plt.subplot(246)
    plt.title('AgglomerativeClustering')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels2)

    labels3 = DBSCAN(eps=abs(np.max(x) - np.min(x)) / n_class / 2, min_samples=1).fit_predict(x)
    plt.subplot(247)
    plt.title('DBSCAN@{}'.format(unique(labels3)))
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels3)

    gmm = GaussianMixture(n_components=n_class)
    gmm.fit(x)
    labels4 = gmm.predict(x)
    plt.subplot(248)
    plt.title('GaussianMixture')
    plt.axis('off')
    plt.scatter(x[:, 0], x[:, 1], c=labels4)

    plt.show()

    preds = [kmeans_pre, labels, labels1, sc_pre, labels2, labels3, labels4]
    names = ['K-Means',
             'AffinityPropagation@{}'.format(unique(labels)),
             'MeanShift@{}'.format(unique(labels1)),
             'SpectralClustering',
             'AgglomerativeClustering',
             'DBSCAN@{}'.format(unique(labels3)),
             'GaussianMixture']
    print('Method:', 'NMI', 'Homogeneity', 'Completeness')
    for name, pred in zip(names, preds):
        m1, m2, m3 = metric(y, pred)
        print(name + ':', m1, m2, m3)
Пример #57
0
#!/usr/bin/python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle

# Load data from input file
X = np.loadtxt('data_clustering.txt', delimiter=',')

# Estimate the bandwidth of X
bandwidth_X = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Cluster data with MeanShift
meanshift_model = MeanShift(bandwidth=bandwidth_X, bin_seeding=True)
meanshift_model.fit(X)

# Extract the centers of clusters
cluster_centers = meanshift_model.cluster_centers_
print('\nCenters of clusters:\n', cluster_centers)

# Estimate the number of clusters
labels = meanshift_model.labels_
num_clusters = len(np.unique(labels))
print("\nNumber of clusters in input data =", num_clusters)

# Plot the points and cluster centers
plt.figure()
markers = 'o*xvs'
for i, marker in zip(range(num_clusters), markers):
    # Plot points that belong to the current cluster
    plt.scatter(X[labels == i, 0],
import numpy as np
from scipy.fftpack import dct
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.cluster import MeanShift, estimate_bandwidth

image = Image.open(r'C:\Users\Cris\Desktop\mountain_color.jpg')  # raw string: \U in a normal string is an invalid escape
image = np.array(image)

#Need to convert image into feature array based
#on rgb intensities
flat_image=np.reshape(image, [-1, 3])
 
#Estimate bandwidth
bandwidth2 = estimate_bandwidth(flat_image,
                                quantile=.2, n_samples=5000)
ms = MeanShift(bandwidth2, bin_seeding=True)
ms.fit(flat_image)
labels=ms.labels_

# Example of how to use discrete cosine transform. 
# We will apply it to luminance, rather than labels.
discrete_cosine_transform = dct(np.array(labels, dtype = 'float'))

np.savetxt(r'C:\Users\Cris\Desktop\labels.csv', labels, delimiter=',')
 
# Plot image vs segmented image

plt.figure(2)
plt.subplot(2, 1, 1)
plt.imshow(image)
plt.axis('off')
plt.subplot(2, 1, 2)
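# The example is cut off here; a short sketch (not from the original) that maps each
# pixel's label back to its cluster's mean colour and shows the segmented image:
segmented = ms.cluster_centers_[labels].reshape(image.shape).astype(np.uint8)
plt.imshow(segmented)
plt.axis('off')
plt.show()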
Пример #59
0
bandwidth3 = estimate_bandwidth(flat_image, quantile=.3, n_samples=500)
bandwidth4 = estimate_bandwidth(flat_image, quantile=.4, n_samples=500)

print(bandwidth1)
print(bandwidth2)
print(bandwidth3)
print(bandwidth4)

ms1 = MeanShift(bandwidth1, bin_seeding=True)
ms2 = MeanShift(bandwidth2, bin_seeding=True)
ms3 = MeanShift(bandwidth3, bin_seeding=True)
ms4 = MeanShift(bandwidth4, bin_seeding=True)
#print(ms1)

#Performing meanshift on flatImg
ms1.fit(flat_image)
ms2.fit(flat_image)
ms3.fit(flat_image)
ms4.fit(flat_image)

#(r,g,b) vectors corresponding to the different clusters after meanshift
labels1 = ms1.labels_
labels2 = ms2.labels_
labels3 = ms3.labels_
labels4 = ms4.labels_
#print(labels)

#Remaining colors after meanshift
cluster_centers1 = ms1.cluster_centers_
cluster_centers2 = ms2.cluster_centers_
cluster_centers3 = ms3.cluster_centers_
Пример #60
0
plt.scatter(X["Longitude"],
            X["Latitude"],
            c=kmeans.labels_,
            cmap='rainbow',
            zorder=0)
plt.ylim(20, 55)
plt.xlim(-130, -60)
plt.title("KMeans clustering k=50")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()
"""""" """""" """""
Mean_shift Clustering

""" """""" """""" ""
#bandwidth = estimate_bandwidth([X["Longitude"], X["Latitude"]])
meanshift = MeanShift()
meanshift.fit(distance_matrix)

print("number of estimated clusters in Mean Shift Clustering",
      len(np.unique(meanshift.labels_)))

plt.scatter(X["Longitude"], X["Latitude"], c=meanshift.labels_, cmap='rainbow')
#plt.ylim(20,55)
#plt.xlim(-130,-60)
plt.title("MeanShift Clustering")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()
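
estimate_bandwidth expects one row per sample, so the commented-out call above needs the longitude/latitude columns stacked column-wise first. A minimal sketch, assuming X is the dataframe used above:

import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

coords = np.column_stack([X["Longitude"], X["Latitude"]])
bandwidth = estimate_bandwidth(coords, quantile=0.1)
ms_geo = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(coords)
print("number of estimated clusters:", len(np.unique(ms_geo.labels_)))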