Example #1
print('quantile',
      'n_clusters_',
      'n_noise',
      'silhouette_score',
      'davies_bouldin_score',
      'calinski_harabasz_score',
      sep=',')
r = range(0, 11, 1)
for quantile in r:
    quantile = quantile / 10
    # logger.info('Estimating Bandwidth')
    bandwidth = estimate_bandwidth(df, quantile=quantile, n_jobs=-1)
    logger.info('Bandwidth estimate: %f, quantile: %f' % (bandwidth, quantile))
    if bandwidth > 0.0:
        # logger.info('Clustering')
        ms = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                       n_jobs=-1).fit(df)
        labels = ms.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        print(n_clusters_)
        if n_clusters_ > 1:
            silhouette_score = metrics.silhouette_score(df, ms.labels_)
            davies_bouldin_score = metrics.davies_bouldin_score(df, ms.labels_)
            calinski_harabasz_score = metrics.calinski_harabasz_score(
                df, ms.labels_)
            print(quantile,
                  n_clusters_,
                  n_noise_,
                  silhouette_score,
                  davies_bouldin_score,
                  calinski_harabasz_score,
                  sep=',')
Example #2
    return df


print ". Loading CSV file ({})...".format(config.db_path)

df = read_data(config.db_path)
#################################################################################
# Clustering
X = df[['latitude', 'longitude']].values
print ". Estimating MeanShift's bandwidth..."
bandwidth = estimate_bandwidth(X, quantile=0.0005, n_samples=10000)

print ". Clustering with MeanShift..."
clustering = MeanShift(bandwidth=bandwidth,
                       bin_seeding=True,
                       cluster_all=False,
                       min_bin_freq=10)
clustering.fit(X)
labels = clustering.labels_
df['cluster'] = clustering.labels_
cluster_centers = clustering.cluster_centers_

labels_unique = np.unique([l for l in labels if l != -1])
n_clusters_ = len(labels_unique)

cluster_count = df[df['cluster'] != -1].groupby('cluster').size()

# Building a DataFrame describing each cluster
print ". Fetching cluster data with Google Places..."
c_data = list()
for i, cluster in enumerate(labels_unique):
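    # (added sketch, not part of the original excerpt) a minimal loop body that records
    # each cluster's centre and size; the Google Places lookup mentioned above is omitted
    # and the dictionary keys are assumptions.
    center = cluster_centers[cluster]
    c_data.append({'cluster': int(cluster),
                   'latitude': center[0],
                   'longitude': center[1],
                   'n_points': int(cluster_count[cluster])})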
avg_emb_zip = zip(sense_keys_temp, hom_types, avg_embs)
avg_emb_df = pd.DataFrame(avg_emb_zip, columns=['sense_key', 'hom_type', 'avg_emb'])

len(h_embs)

from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

bert_vecs = np.array(avg_embs)

len(bert_vecs)

cluster = MeanShift().fit(bert_vecs)
cluster.labels_

clustering = DBSCAN(eps=0.25, min_samples=2, metric='cosine').fit(bert_vecs)
clustering.labels_

ag_cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=0.55).fit(bert_vecs)
ag_cluster.labels_

# relevant

from sklearn.manifold import TSNE
from sklearn.manifold import LocallyLinearEmbedding # Will be used for LLE, LTSA, Hessian LLE, and Modified LLE.
from sklearn.manifold import MDS
from sklearn.manifold import SpectralEmbedding
from sklearn.manifold import Isomap
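
# (added sketch, not part of the original notebook cells) a minimal 2-D t-SNE projection
# of the BERT vectors for visual inspection; perplexity=5 is an assumption that only
# makes sense while the number of vectors is small.
tsne_2d = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(bert_vecs)
tsne_2d.shape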
Example #4
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

import utilities

# Load data from input file
X = utilities.load_data('data_multivar.txt')

# Estimating the bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Compute clustering with MeanShift
meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_estimator.fit(X)
labels = meanshift_estimator.labels_
centroids = meanshift_estimator.cluster_centers_
num_clusters = len(np.unique(labels))

print "Number of clusters in input data =", num_clusters

###########################################################
# Plot the points and centroids

import matplotlib.pyplot as plt
from itertools import cycle

plt.figure()

# specify marker shapes for different clusters
markers = '.*xv'
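
# (added sketch, not part of the original listing) plot each cluster with its own marker
# and overlay its centroid, cycling through the marker shapes defined above; this assumes
# X is a 2-column NumPy array.
for i, marker in zip(range(num_clusters), cycle(markers)):
    # points belonging to the current cluster
    plt.scatter(X[labels == i, 0], X[labels == i, 1], marker=marker, color='k')
    # the corresponding centroid
    centroid = centroids[i]
    plt.plot(centroid[0], centroid[1], marker='o', markerfacecolor='k',
             markeredgecolor='k', markersize=15)
plt.title('Clusters and their centroids')
plt.show()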
Example #5
def node():
	global frontiers,mapData,global1,global2,global3,globalmaps,litraIndx,n_robots,namespace_init_count
	rospy.init_node('filter', anonymous=False)
	
	# fetching all parameters
	map_topic= rospy.get_param('~map_topic','/map')
	threshold= rospy.get_param('~costmap_clearing_threshold',70)
	info_radius= rospy.get_param('~info_radius',1.0)  # this can be smaller than the laser scanner range; smaller means less computation time, but too small makes the information gain inaccurate
	goals_topic= rospy.get_param('~goals_topic','/detected_points')	
	n_robots = rospy.get_param('~n_robots',1)
	namespace = rospy.get_param('~namespace','')
	namespace_init_count = rospy.get_param('namespace_init_count',1)
	rateHz = rospy.get_param('~rate',100)
	litraIndx=len(namespace)
	rate = rospy.Rate(rateHz)
#-------------------------------------------
	rospy.Subscriber(map_topic, OccupancyGrid, mapCallBack)
	

#---------------------------------------------------------------------------------------------------------------
	

	for i in range(0,n_robots):
		globalmaps.append(OccupancyGrid())

	if len(namespace) > 0:
		for i in range(0,n_robots):
			rospy.Subscriber(map_topic, OccupancyGrid, globalMap)
	elif len(namespace)==0:
		rospy.Subscriber(map_topic, OccupancyGrid, globalMap)
#wait if map is not received yet
	while (len(mapData.data)<1):
		pass
#wait if any of robots' global costmap map is not received yet
	for i in range(0,n_robots):
		while (len(globalmaps[i].data)<1):
			pass
	
	global_frame="/"+mapData.header.frame_id


	tfLisn=tf.TransformListener()
	if len(namespace) > 0:
		for i in range(0,n_robots):
			tfLisn.waitForTransform(global_frame[1:], '/odom', rospy.Time(0),rospy.Duration(10.0))
	elif len(namespace)==0:
			tfLisn.waitForTransform(global_frame[1:], '/odom', rospy.Time(0),rospy.Duration(10.0))
	
	rospy.Subscriber(goals_topic, PointStamped, callback=callBack,callback_args=[tfLisn,global_frame[1:]])
	pub = rospy.Publisher('frontiers', Marker, queue_size=10)
	pub2 = rospy.Publisher('centroids', Marker, queue_size=10)
	filterpub = rospy.Publisher('filtered_points', PointArray, queue_size=10)

	rospy.loginfo("the map and global costmaps are received")
	
	
	# wait if no frontier is received yet 
	while len(frontiers)<1:
		pass
	
	
	points=Marker()
	points_clust=Marker()
#Set the frame ID and timestamp.  See the TF tutorials for information on these.
	points.header.frame_id= mapData.header.frame_id
	points.header.stamp= rospy.Time.now()

	points.ns= "markers2"
	points.id = 0
	
	points.type = Marker.POINTS
	
#Set the marker action for latched frontiers.  Options are ADD, DELETE, and new in ROS Indigo: 3 (DELETEALL)
	points.action = Marker.ADD;

	points.pose.orientation.w = 1.0

	points.scale.x=0.2
	points.scale.y=0.2 

	points.color.r = 255.0/255.0
	points.color.g = 255.0/255.0
	points.color.b = 0.0/255.0

	points.color.a=1;
	points.lifetime = rospy.Duration();

	p=Point()

	p.z = 0;

	pp=[]
	pl=[]
	
	points_clust.header.frame_id= mapData.header.frame_id
	points_clust.header.stamp= rospy.Time.now()

	points_clust.ns= "markers3"
	points_clust.id = 4

	points_clust.type = Marker.POINTS

#Set the marker action for centroids.  Options are ADD, DELETE, and new in ROS Indigo: 3 (DELETEALL)
	points_clust.action = Marker.ADD;

	points_clust.pose.orientation.w = 1.0;

	points_clust.scale.x=0.2;
	points_clust.scale.y=0.2; 
	points_clust.color.r = 0.0/255.0
	points_clust.color.g = 255.0/255.0
	points_clust.color.b = 0.0/255.0

	points_clust.color.a=1;
	points_clust.lifetime = rospy.Duration();

		
	temppoint=PointStamped()
	temppoint.header.frame_id= mapData.header.frame_id
	temppoint.header.stamp=rospy.Time(0)
	temppoint.point.z=0.0
	
	arraypoints=PointArray()
	tempPoint=Point()
	tempPoint.z=0.0
#-------------------------------------------------------------------------
#---------------------     Main   Loop     -------------------------------
#-------------------------------------------------------------------------
	while not rospy.is_shutdown():
#-------------------------------------------------------------------------	
#Clustering frontier points
		centroids=[]
		front=copy(frontiers)
		if len(front)>1:
			ms = MeanShift(bandwidth=0.3)   
			ms.fit(front)
			centroids= ms.cluster_centers_	 #centroids array is the centers of each cluster		

		#if there is only one frontier no need for clustering, i.e. centroids=frontiers
		if len(front)==1:
			centroids=front
		frontiers=copy(centroids)
#-------------------------------------------------------------------------	
#clearing old frontiers  
      
		z=0
		while z<len(centroids):
			cond=False
			temppoint.point.x=centroids[z][0]
			temppoint.point.y=centroids[z][1]
						
			for i in range(0,n_robots):
				
				
				transformedPoint=tfLisn.transformPoint(globalmaps[i].header.frame_id,temppoint)
				x=array([transformedPoint.point.x,transformedPoint.point.y])
				cond=(gridValue(globalmaps[i],x)>threshold) or cond
			if (cond or (informationGain(mapData,[centroids[z][0],centroids[z][1]],info_radius*0.5))<0.2):
				centroids=delete(centroids, (z), axis=0)
				z=z-1
			z+=1
#-------------------------------------------------------------------------
#publishing
		arraypoints.points=[]
		for i in centroids:
			tempPoint.x=i[0]
			tempPoint.y=i[1]
			arraypoints.points.append(copy(tempPoint))
		filterpub.publish(arraypoints)
		pp=[]	
		for q in range(0,len(frontiers)):
			p.x=frontiers[q][0]
			p.y=frontiers[q][1]
			pp.append(copy(p))
		points.points=pp
		pp=[]	
		for q in range(0,len(centroids)):
			p.x=centroids[q][0]
			p.y=centroids[q][1]
			pp.append(copy(p))
		points_clust.points=pp
		pub.publish(points)
		pub2.publish(points_clust) 
		rate.sleep()
Example #6
Std_RGB = np.array([38.55379149, 35.64913446, 39.07419321])
Data_Norm = (Img_Data - Mean_RGB)/Std_RGB
Data_NFlat = np.reshape(Data_Norm, (Size_Data[0], 32*32*3))
# Shuffle data
per = np.random.permutation(Data_Norm.shape[0])
Shuf_Data_Norm = Data_NFlat[per, :]
Shuf_Label_Data = Label_Data[per]

for i in range(F_n):
    DataN_te = Shuf_Data_Norm[Fold_size*i:Fold_size*(i+1), :]
    DataN_tr_1 = Shuf_Data_Norm[0:(Fold_size*i), :]
    DataN_tr_2 = Shuf_Data_Norm[Fold_size*(i+1):, :]
    DataN_tr = np.concatenate((DataN_tr_1,DataN_tr_2))
    DataN_te_y = Shuf_Label_Data[Fold_size*i:Fold_size*(i+1)]
    DataN_tr_y_1 = Shuf_Label_Data[0:(Fold_size*i)]
    DataN_tr_y_2 = Shuf_Label_Data[Fold_size*(i+1):]
    DataN_tr_y = np.concatenate((DataN_tr_y_1,DataN_tr_y_2))
    model_name_rbf = 'Model_' + str(i+1) + '_rbf.model'
    model_name_linear = 'Model_' + str(i+1) + '_linear.model'
    # Mean shift cluster
    clustering_Mf = MeanShift(n_jobs=-1)
    clustering_Mf.fit(DataN_tr)
    Cluster_predict = clustering_Mf.predict(DataN_te)
    print(Cluster_predict[0:40])
    '''
    score_rbf = clf_rbf.score(DataN_te,DataN_te_y)
    print("The score of rbf is : %f"%score_rbf)
    joblib.dump(clf_rbf, model_name_rbf)
    '''
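    # (added sketch, not part of the original loop) one hedged way to sanity-check the
    # mean-shift assignment on this fold against the held-out labels.
    from sklearn.metrics import adjusted_rand_score
    print('fold %d adjusted Rand index: %.3f'
          % (i + 1, adjusted_rand_score(DataN_te_y, Cluster_predict)))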

Example #7
# for production
taxi_df = pd.read_sql_query('SELECT * FROM tripdata', engine)

taxi_data=taxi_df[['pickup_datetime', 'pickup_longitude', 'pickup_latitude']].copy()
taxi_data.columns=['datetime', 'lng', 'lat']
taxi_df = None


data_set = taxi_data
data_set['dayofweek']=data_set.datetime.dt.weekday
data_set['hourofday']=data_set.datetime.dt.hour
data_set['weekdays']=(data_set.dayofweek < 5)*1


from sklearn.cluster import MeanShift
ms=MeanShift(bandwidth=0.003, cluster_all=False, min_bin_freq=5)

all_clusters=None


# loop through weekdays and weekends, and each 2-hours time duration
# train the data with MeanShift
# get the cluster_centers
# and packed with 'items' equals to the number of items belongs to the center
# which will be used as a weighting in display
for d in [0, 1]:
    for h in [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13],[14,15],[16,17],[18,19],[20,21],[22,23]]:
        print('Clusters for weekdays=', d, '; hour=', h)
        # train only on lng and lat
        X = data_set[(data_set.weekdays==d) & (data_set.hourofday.isin(h))][['lng', 'lat']]
        ms.fit(X)
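        # (added sketch, not part of the original snippet) collect this bucket's centers
        # together with the number of points assigned to each one; the 'items' column is
        # the weighting mentioned above, and the column names are assumptions.
        labels = ms.labels_
        centers = pd.DataFrame(ms.cluster_centers_, columns=['lng', 'lat'])
        centers['items'] = [(labels == k).sum() for k in range(len(centers))]
        centers['weekdays'] = d
        centers['hours'] = str(h)
        all_clusters = centers if all_clusters is None else pd.concat([all_clusters, centers])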
Example #8
def color_analysis(preffixName, suffixName, textNum, textLines):

    textLinesLeft = []
    backColors = []
    maxSize = 0
    index_maxSize = 0
    for i in range(textNum + 1):
        textImg = Image.open(preffixName + 'text' + str(i) + suffixName)
        textImg = textImg.convert('RGB')
        textImg = np.asarray(textImg)
        if (textImg.shape[0] * textImg.shape[1]) > maxSize:
            maxSize = textImg.shape[0] * textImg.shape[1]
            index_maxSize = i

        saliencyImg = Image.open(preffixName + 'saliency' + str(i) +
                                 suffixName)
        saliencyImg = np.asarray(saliencyImg)

        textBack_color = textImg[saliencyImg < 10]
        backColors.append(np.mean(textBack_color, axis=0))

        # try handling this with clustering
        #backColor = getMainColor(textBack_color,5)
        #backColors.append(backColor)

    if len(backColors) > 0:
        backColors = np.array(backColors)

        # threshold for separating clusters; this value can be tuned
        ms = MeanShift(bandwidth=50, bin_seeding=True)
        ms.fit(backColors)
        labels = ms.labels_
        cluster_centers = ms.cluster_centers_

        binCount = np.bincount(labels).tolist()

        Label_index = []
        for i in range(len(binCount)):
            eachLabel_index = []
            for j in range(len(labels)):
                if labels[j] == i:
                    eachLabel_index.append(j)
            Label_index.append(eachLabel_index)

        maxLabel = labels[index_maxSize]

        results = []

        for i in range(len(Label_index)):
            if i != maxLabel:
                texts_eachlabel = Label_index[i]
                if len(texts_eachlabel) > 1:
                    yAxis = []
                    for index in texts_eachlabel:
                        yAxis.append(textLines[index][1])

                    yAxis_copy = list(yAxis)
                    yAxis_copy.sort(reverse=False)

                    results.append(textLines[texts_eachlabel[yAxis.index(
                        yAxis_copy[0])]])

                    for i in range(1, len(yAxis_copy)):

                        index = texts_eachlabel[yAxis.index(yAxis_copy[i])]
                        eachline = results[-1]
                        eachline_next = textLines[index]

                        if overlap_degree(eachline, eachline_next) > -1:
                            # merge the two lines
                            results[-1][0] = min(eachline[0], eachline_next[0])
                            results[-1][1] = min(eachline[1], eachline_next[1])
                            results[-1][2] = max(eachline[2], eachline_next[2])
                            results[-1][3] = max(eachline[3], eachline_next[3])
                        else:
                            results.append(eachline_next)
                else:
                    results.append(textLines[texts_eachlabel[0]])
            else:
                texts_eachlabel = Label_index[i]
                for index in texts_eachlabel:
                    textLinesLeft.append(textLines[index])

        return results, textLinesLeft
    else:
        return [], textLines
Example #9
A = pd.read_excel(
    r'D:\Program Files\JetBrains\PyCharmFile\Cluster\Data\e2.xlsx',
    sheet_name="Sheet1",
    header=0)
tabtop = A.columns.values.tolist()  # read out the header row
B = np.array(A)
temp = ['2016年', '2017年', '2018年']

x = 1
y = 13
for j in range(3):  # cluster a given industry (three passes)
    C = B[0:109, x:y]
    D = DataFrame(C)
    E = D.loc[~(D == 0).all(axis=1), :]  # drop rows that are all zeros
    bw = estimate_bandwidth(E, quantile=0.3)  # estimate the bandwidth
    model = MeanShift(bandwidth=bw, bin_seeding=True)  # configure the clustering model
    model.fit(E)  # run the clustering
    r1 = (Series(model.labels_)).value_counts()  # count the members of each cluster
    r2 = DataFrame(model.cluster_centers_)  # extract the cluster centers
    r = pd.concat([r2, r1], axis=1)  # concatenate the two horizontally
    r.columns = tabtop[x:y] + [u'类别数目']
    name1 = temp[j] + '.xlsx'
    print(name1)
    file1 = r.to_excel(name1)
    g = pd.concat([E, Series(model.labels_, index=E.index)], axis=1)
    g.columns = tabtop[x:y] + [u'聚类类别']
    name2 = 'A' + temp[j] + '.xlsx'
    print(name2)
    file2 = g.to_excel(name2)
    x = y
    y = y + 12
Example #10
# import libraries
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs

# create data
centers = [[3,3,3],[4,5,5],[3,10,10]]
X, _ = make_blobs(n_samples=700, centers = centers, cluster_std=0.5)

# create model
MSh = MeanShift()

# train model
MSh.fit(X)
labels = MSh.labels_
cluster_centers = MSh.cluster_centers_
print(cluster_centers)
#load libraries
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift

#load data
iris = datasets.load_iris()
features = iris.data

#standardize features
scaler = StandardScaler()
features_std = scaler.fit_transform(features)

#create meanshift object
cluster = MeanShift(n_jobs=-1)

#train model
model = cluster.fit(features_std)
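
# (added follow-up, not part of the original recipe) inspect the fitted model: cluster
# assignments and the number of clusters found on the standardized features.
print(model.labels_)
print('clusters found:', len(set(model.labels_)))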
Example #12
    "Confidence in national government"
]]
subdf.fillna(0, inplace=True)

scaler = preprocessing.StandardScaler()

scaled_df = scaler.fit_transform(subdf)

reducer_p = PCA(n_components=2)
pca_df = reducer_p.fit_transform(scaled_df)

# =====================================================================
# 5. Clustering
# =====================================================================

learner = MeanShift(bandwidth=None)
ms = learner.fit_predict(pca_df)

learner = MiniBatchKMeans(n_clusters=3)
mbkm = learner.fit_predict(pca_df)

learner = SpectralClustering(n_clusters=3)
sc = learner.fit_predict(pca_df)

learner = AgglomerativeClustering(n_clusters=3)
ac = learner.fit_predict(pca_df)

# =====================================================================
# 5. Cluster graphs
# =====================================================================

print("n_samples: %d, n_features: %d" % matrix.shape)
print()

# dimensionality reduction
print("Performing dimensionality reduction using LSA")
t0 = time()
svd = TruncatedSVD(2)  # number of output dimensions
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

matrix_l = lsa.fit_transform(matrix)

# #############################################################################
# Do the actual clustering
bandwidth = estimate_bandwidth(matrix_l, quantile=0.2, n_samples=500)
t0 = time()
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(matrix_l)
print("done in %0.3fs" % (time() - t0))
print()

labels_Pred = ms.labels_

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_ture, labels_Pred))
print("Completeness: %0.3f" % metrics.completeness_score(labels_ture, labels_Pred))
print("NMI: %0.3f" % metrics.normalized_mutual_info_score(labels_ture, labels_Pred, average_method='arithmetic'))



    elif i == 2:
        y_corrected.append(1)
    elif i == 3:
        y_corrected.append(2)
    elif i == 4:
        y_corrected.append(5)
    else:
        y_corrected.append(3)

#establish a KNN model
KNN = KNeighborsClassifier(n_neighbors=5)
KNN.fit(x, y)
KNN.predict([[5, 0]])
y_predict2 = KNN.predict(x)

print("KNN accuracy", accuracy_score(y, y_predict2))
print(pd.value_counts(y_predict2))

#establish a meanshift model

bw = estimate_bandwidth(x, n_samples=120)
print(bw)
ms = MeanShift(bandwidth=bw)
ms.fit(x)
y_predict_ms = ms.predict(x)
print("meanshift 归类结果", pd.value_counts(y_predict_ms))
fig5 = plt.figure()
label1 = plt.scatter(x['x'][y_predict_ms == 1], x['y'][y_predict_ms == 1])
label2 = plt.scatter(x['x'][y_predict_ms == 0], x['y'][y_predict_ms == 0])
plt.legend((label1, label2), ('label1', 'label2'))
plt.show()
Example #15
import numpy
import pickle
import scipy.misc
import numpy as np
import pandas as pd
import json
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
from itertools import cycle

import data

print "Generating Clusters"
df = pd.read_csv('taxi_data/train.csv',
                 converters={'POLYLINE': lambda x: json.loads(x)[-1:]})
dests = []
for p in df['POLYLINE']:

    if len(p) > 0:
        dests.append([p[0][1], p[0][0]])
pts = numpy.array(dests)

bw = 0.001

means = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
means.fit(pts)
cluster_centers = means.cluster_centers_

print "Clusters shape: ", cluster_centers.shape
print cluster_centers
Example #16
centerNorms[nanIdx] = -1
centers = centers[(centerNorms < 100) & (centerNorms > 0)]

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='red')
ax.axis('equal')
plt.show()

res = 200
bandwidthList = list(np.linspace(0.01, 0.045, res))
nCenters = np.zeros((res))
i = 0
for bandwidth in bandwidthList:
    meanShiftClusterer = MeanShift(bandwidth=bandwidth,
                                   bin_seeding=True,
                                   min_bin_freq=5,
                                   cluster_all=False)
    wasError = False
    try:
        meanShiftClusterer.fit(centers)
    except ValueError:
        print('No point for bandwidth {}'.format(bandwidth))
        wasError = True
    if not wasError:
        clusterCenters = meanShiftClusterer.cluster_centers_
        nCenters[i] = clusterCenters.shape[0]
    else:
        nCenters[i] = 0
    i += 1

plt.plot(np.array(bandwidthList), nCenters)
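
# (added follow-up, not part of the original snippet) label the bandwidth sweep so the
# plateau in the cluster count is easier to read off.
plt.xlabel('bandwidth')
plt.ylabel('number of clusters')
plt.show()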
Example #17
final = []
for i in UserId:
    if i in list(Q1['userId']) and i in list(Q2['userId']) and i in list(
            FinalExam['userId']):

        a = float(Q1[Q1['userId'] == i].get('score'))
        q1.append(a)
        q2.append(float(Q2[Q2['userId'] == i].get('score')))
        final.append(float(FinalExam[FinalExam['userId'] == i].get('score')))

d = {'userId': "", 'score_Q1': "", 'scoreQ2': "", 'score_final': ""}

x = np.column_stack((q1, q2, final))  # one row per student: (q1, q2, final)
kmeans = km(n_clusters=3, random_state=0).fit(x)
clusteringMS = MeanShift(bandwidth=2).fit(x)

clusteringDB = DBSCAN(eps=3, min_samples=2).fit(x)
clusteringDB.labels_
# kmeans.labels_ holds the cluster assigned to each row
# KMeans worked very poorly on quizAndExam, so it is not used.
quizAndExam = pd.DataFrame(data=x,
                           columns=["q1Score", "q2Score", "finalScore"])
quizAndExam['class'] = clusteringDB.labels_
quizAndExam['label'] = kmeans.labels_

#d
quizAndExam.to_excel('output.xlsx', engine='xlsxwriter')
"""
三个学生的成绩等级。TB20个,B36个,F47个。咱大学的学生水平不行啊。
"""
Example #18
    X = X.to_numpy()
    return X


def Dataset():
    np.random.seed(10)
    X, _ = datasets.make_blobs(n_samples=1500, n_features=2, centers=3, cluster_std=2.1)
    # dumpfile(X,'fileSet')
    # data = loadfile('fileSet')
    # print(data)
    return X


np.set_printoptions(threshold=sys.maxsize)
## clustering_algorithms
MeanShift = MeanShift(bandwidth=2)
KMeans = KMeans(9, verbose=1000)
SpectralClustering = SpectralClustering(n_clusters=9, assign_labels="discretize", random_state=0)
DBSCAN = DBSCAN(eps=3, min_samples=2)
OPTICS = OPTICS(min_samples=2)
AffinityPropagation = AffinityPropagation()
# clustering_algorithms  = [['MeanShift', MeanShift], ['KMeans', KMeans],
#               ['SpectralClustering', SpectralClustering], ['DBSCAN', DBSCAN], ['OPTICS', OPTICS],['AffinityPropagation',AffinityPropagation]]
file = [['1k','0-0-1000'],['10k','0-0-10000'],['100k','0-0-100k'],['1m','0-0-1m']]
clustering_algorithms = [['KMeans', KMeans]]
# file = [['1k','0-0-1000']]
for n, f in file:
    X = readdata(f)
    for c, model in clustering_algorithms:
        print(c, "Start.......!")
        z = model.fit_predict(X)
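        # (added follow-up, not part of the original loop) report how many clusters the
        # current model produced on this file before moving on.
        print(c, 'on', n, ':', len(np.unique(z)), 'clusters')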
Example #19
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs

centers = [[1, 1], [-1, -1], [1, -1]]
X,_ = make_blobs(n_samples=10000, centers=centers, cluster_std=0.6)


bandwidth = estimate_bandwidth(X, n_samples=500)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)


# Plot result
import matplotlib.pyplot as plt

for k in range(n_clusters_):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.scatter(X[my_members, 0], X[my_members, 1])
    plt.plot(cluster_center[0], cluster_center[1], 'o',
             markeredgecolor='b', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
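
# (added follow-up, not part of the original example) render the figure.
plt.show()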
Example #20
def func_meanshift():
    print('\nCLUSTERING: MEAN SHIFT\n')
    #Creating the model using photometric data
    print('Fitting the model...')
    print(
        'If you want to choose the bandwidth, please write it. Else, write 0')
    bandwidth = input()
    if bandwidth == '0':
        bandwidth = estimate_bandwidth(dataused)
    else:
        bandwidth = float(bandwidth)
    print('Using a bandwidth of', bandwidth)
    ms = MeanShift(bandwidth=bandwidth)
    ms.fit(dataused)
    print('Mean Shift problem solved')

    # Model parameters
    centers = ms.cluster_centers_
    print('POSITIONS OF THE CENTERS:\n', centers)

    # Adjusting the data to the model
    print('Obtaining the labels...')
    labels = ms.labels_
    labeledlabels = ms.predict(labeleddataused)

    # Saving data in ASCII format
    plotfile = (root + '/MeanShift/' + n + root_folder + root_file +
                '_MeanShift')
    print('Saving data with labels in ASCII format...')
    np.savetxt(plotfile + '.txt',
               np.c_[data['id_2MASS'], data['id_AllWISE'], dataset, labels],
               header=dataheading,
               delimiter='\t',
               fmt='%s')
    np.savetxt(plotfile + '_labeled.txt',
               np.c_[labeleddataset, labeledlabels, labeleddata['z'],
                     labeleddata['class'], labeleddata['subClass']],
               header=dataheading[20:] + '\tz\tclass\tsubClass',
               delimiter='\t',
               fmt='%s')
    print('Data file saved successfully, check your MeanShift folder')

    # Saving data in FITS format
    print('Saving data with labels in FITS format...')
    bashorder = ('sh stilts tcopy ifmt=ascii ofmt=fits in=' + plotfile +
                 '.txt out=' + plotfile + '.fits')
    subprocess.run(bashorder, shell=True)
    bashorder = ('sh stilts tcopy ifmt=ascii ofmt=fits in=' + plotfile +
                 '_labeled.txt out=' + plotfile + '_labeled.fits')
    subprocess.run(bashorder, shell=True)
    print('Data file saved successfully, check your MeanShift folder')

    # Saving centers information
    plotfile = root + '/MeanShift/' + n + root_folder + 'Centers'
    print('Saving centers position...')
    np.savetxt(plotfile + '.txt',
               centers,
               header=dataheading[20:-6],
               delimiter='\t',
               fmt='%s')
    print('Centers positions saved successfully, check your MeanShift folder')

    print('\nMEAN SHIFT TECHNIQUE APPLIED\n')
Example #21
import numpy as np
from sklearn.cluster import MeanShift
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import style
style.use('ggplot')

centers = [[1, 1, 1], [5, 5, 5], [3, 10, 10]]
X, _ = make_blobs(n_samples=100, centers=centers, cluster_std=1)

ms = MeanShift()
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
print(cluster_centers)
n_clusters_ = len(np.unique(labels))
print('Number of estimated clusters: ', n_clusters_)

colors = 10 * ['g', 'r', 'c', 'b', 'k', 'y', 'm']
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for i in range(len(X)):
    ax.scatter(X[i][0], X[i][1], X[i][2], c=colors[labels[i]], marker='o')

ax.scatter(cluster_centers[:, 0],
           cluster_centers[:, 1],
           cluster_centers[:, 2],
           marker='x',
           color='k')
plt.show()
# mean shift clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import MeanShift
from matplotlib import pyplot
# define dataset
X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           random_state=4)
# define the model
model = MeanShift()
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()

# optics clustering
from numpy import unique
Example #23
def window_analysis(Windows,
                    ref_labels,
                    labels1,
                    Chr=1,
                    ncomp=4,
                    amova=True,
                    supervised=True,
                    include_who=[],
                    range_sample=[130, 600],
                    rand_sample=0,
                    clsize=15,
                    cl_freqs=5,
                    Bandwidth_split=20):

    kde_class_labels = labels1
    kde_label_dict = {
        z:
        [x for x in range(len(kde_class_labels)) if kde_class_labels[x] == z]
        for z in list(set(kde_class_labels))
    }

    if include_who:
        include = [
            x for x in range(len(kde_class_labels))
            if kde_class_labels[x] in include_who
        ]
        ref_labels = include_who
        kde_class_labels = [kde_class_labels[x] for x in include]

        kde_label_dict = {
            z: [
                x for x in range(len(kde_class_labels))
                if kde_class_labels[x] == z
            ]
            for z in include_who
        }

    if rand_sample:
        sample = rand_sample
        sample_range = [0, sample]
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in np.random.choice(
                    list(Windows[Chr].keys()), sample, replace=True)
            }
        }

    if range_sample:
        sample_range = range_sample
        Freq_extract = {
            Chr: {
                bl: Windows[Chr][bl]
                for bl in list(sorted(Windows[Chr].keys()))
                [sample_range[0]:sample_range[1]]
            }
        }

    Results = {'header': ['Chr', 'window'], 'info': []}

    Frequencies = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    Construct = {'header': ['Chr', 'window', 'cl'], 'coords': [], 'info': []}

    PC_var = {'header': ['Chr', 'window'], 'coords': [], 'info': []}

    pc_density = []
    pc_coords = []

    sim_fst = []

    for c in Freq_extract[Chr].keys():
        Sequences = Windows[Chr][c]

        if Sequences.shape[1] <= 3:
            Results[Chr][c] = [0, 0]
            print('hi')
            continue

        Sequences = np.nan_to_num(Sequences)

        pca = PCA(n_components=ncomp, whiten=False,
                  svd_solver='randomized').fit(Sequences)
        data = pca.transform(Sequences)

        if include_who:
            data = data[include, :]

        ##### PC density
        PC = 0

        pc_places = data[:, PC]

        X_plot = np.linspace(-8, 8, 100)

        kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
            np.array(pc_places).reshape(-1, 1))

        log_dens = kde.score_samples(X_plot.reshape(-1, 1))

        pc_density.append(np.exp(log_dens))
        pc_coords.append(pc_places)

        PC_var['coords'].append([Chr, c])
        PC_var['info'].append([x for x in pca.explained_variance_])
        ###
        params = {
            'bandwidth': np.linspace(np.min(data), np.max(data),
                                     Bandwidth_split)
        }
        grid = GridSearchCV(KernelDensity(algorithm="ball_tree",
                                          breadth_first=False),
                            params,
                            verbose=0)

        ######################################
        ####### TEST global Likelihood #######
        ######################################
        Focus_labels = list(range(data.shape[0]))

        #### Mean Shift approach
        ## from sklearn.cluster import MeanShift, estimate_bandwidth

        bandwidth = estimate_bandwidth(data,
                                       quantile=0.2,
                                       n_samples=len(Focus_labels))
        if bandwidth <= 1e-3:
            bandwidth = 0.1

        ms = MeanShift(bandwidth=bandwidth,
                       cluster_all=False,
                       min_bin_freq=clsize)
        ms.fit(data[Focus_labels, :])
        labels = ms.labels_

        Tree = {
            x: [Focus_labels[y] for y in range(len(labels)) if labels[y] == x]
            for x in [g for g in list(set(labels)) if g != -1]
        }
        Keep = [x for x in Tree.keys() if len(Tree[x]) > clsize]

        Tree = {x: Tree[x] for x in Keep}
        Ngps = len(Tree)
        SpaceX = {x: data[Tree[x], :] for x in Tree.keys()}

        these_freqs = []
        ### Extract MScluster likelihood by sample

        for hill in SpaceX.keys():

            if len(Tree[hill]) >= cl_freqs:
                if supervised == False:
                    print('hi')
                    cl_seqs = Sequences[Tree[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

            grid.fit(data[Tree[hill], :])

            # use the best estimator to compute the kernel density estimate
            kde = grid.best_estimator_

            P_dist = kde.score_samples(data[Tree[hill], :])
            Dist = kde.score_samples(data)
            P_dist = np.nan_to_num(P_dist)
            Dist = np.nan_to_num(Dist)
            if np.std(P_dist) == 0:
                Dist = np.array(
                    [int(Dist[x] in P_dist) for x in range(len(Dist))])
            else:
                Dist = scipy.stats.norm(np.mean(P_dist),
                                        np.std(P_dist)).cdf(Dist)
            Dist = np.nan_to_num(Dist)
            Construct['coords'].append([Chr, c, hill])
            Construct['info'].append(Dist)

            #########################################
        ############# AMOVA ################
        #########################################

        if supervised:
            labels = [x for x in kde_class_labels if x in ref_labels]
            Who = [
                z for z in it.chain(*[kde_label_dict[x] for x in ref_labels])
            ]
            Ngps = len(ref_labels)

            print(ref_labels)
            for hill in ref_labels:

                if len(kde_label_dict[hill]) >= cl_freqs:
                    if include_who:
                        Seq_specific = Sequences[include, :]

                    cl_seqs = Seq_specific[kde_label_dict[hill], :]

                    freq_vector = [
                        float(x) / (cl_seqs.shape[0] * 2)
                        for x in np.sum(cl_seqs, axis=0)
                    ]

                    Frequencies['coords'].append([Chr, c, hill])
                    Frequencies['info'].append(freq_vector)
                    these_freqs.append(freq_vector)

        else:
            Who = [
                x for x in range(len(labels))
                if labels[x] != -1 and labels[x] in Keep
            ]
            labels = [labels[x] for x in Who]
            Who = [Focus_labels[x] for x in Who]

        #
        Pairwise = return_fsts2(np.array(these_freqs))
        sim_fst.extend(Pairwise.fst)

        if len(list(set(labels))) == 1:
            Results['coords'].append([Chr, c])
            Results['info'].append([AMOVA, Ngps])
            continue

        if amova:
            clear_output()
            AMOVA, Cig = AMOVA_FM42(data[Who, :],
                                    labels,
                                    n_boot=0,
                                    metric='euclidean')
            print('counting: {}, Ngps: {}'.format(AMOVA, Ngps))
            Results['info'].append([Chr, c, AMOVA, Ngps])

    Results['info'] = pd.DataFrame(
        np.array(Results['info']),
        columns=['chrom', 'window', 'AMOVA', 'Ngps'])

    X_plot = np.linspace(0, .3, 100)

    freq_kde = KernelDensity(kernel='gaussian', bandwidth=0.01).fit(
        np.array(sim_fst).reshape(-1, 1))

    log_dens = freq_kde.score_samples(X_plot.reshape(-1, 1))

    fig_roost_dens = [
        go.Scatter(x=X_plot,
                   y=np.exp(log_dens),
                   mode='lines',
                   fill='tozeroy',
                   name='',
                   line=dict(color='blue', width=2))
    ]
    ##

    layout = go.Layout(title='allele frequency distribution across clusters',
                       yaxis=dict(title='density'),
                       xaxis=dict(title='fst'))

    fig = go.Figure(data=fig_roost_dens, layout=layout)

    return Frequencies, sim_fst, Results, Construct, pc_density, pc_coords, fig
Example #24
import codecs
import json

import numpy as np
from sklearn.cluster import MeanShift

fileObj = codecs.open('../data/connections.json', "r", "utf_8_sig")
data = json.loads(fileObj.read())
fileObj.close()

X = []
for coord in data:
    X.append([coord['latitude'], coord['longitude']])

ap = MeanShift()
ap.fit(X)
labels = ap.labels_
cluster_centers_ = ap.cluster_centers_

sample_count = len(X)
coords_labeled = []

for i in range(0, sample_count):
    latitude = X[i][0]
    longitude = X[i][1]
    label = labels[i]
    coords_labeled.append({
        'point': [latitude, longitude],
        'label': str(label),
        'recall_count': data[i]['connection_count']
    })
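
# (added sketch, not part of the original script) persist the labelled points; the output
# path 'connections_labeled.json' is an assumption.
with open('../data/connections_labeled.json', 'w') as out_file:
    json.dump(coords_labeled, out_file, indent=2)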
Example #25
def tracker(path):
	#initialization for default value
	if path=='0':
		path=0;
	
	cap = cv2.VideoCapture(path)
	ip_method = ip.get_instace(ip.IPMethod.TOMASI);
	
	#FLANN Properties
	MIN_FRAMES_COUNT = 120
	SKIP_FRAMES = 60
	MIN_MERGE_FRAMES = 5;
	FLANN_INDEX_KDTREE = 0
	index_params = dict(algorithm = FLANN_INDEX_KDTREE, trees = 10)
	search_params = dict(checks = 50)
	flann = cv2.FlannBasedMatcher(index_params, search_params)
	DO_RESIZE=False
	new_sz = (180,120)
	#Initialization of inputs
	frames =[];					#Frames
	kp = [];					#Key points
	all_matches = []; 			#All good matches
	match_count = [];			#match_count
	labels = [];
	frame_cnt=0;	
	
	print "Extracting frames...................."
	ret, prev_frame = cap.read()
	kp1,desc1 = ip_method.detectAndCompute(prev_frame);
	num_matches = np.zeros(kp1.__len__())
	
	#storing frames
	frames.append(prev_frame);
	kp.append(kp1)
	match_count.append(num_matches);
	
	
	while(cap.isOpened()):
		SKIP_FRAMES=SKIP_FRAMES-1;
		ret, prev_frame = cap.read()
		if not ret or SKIP_FRAMES<0:
			break;
	
	while(cap.isOpened()):
			
		ret, cur_frame = cap.read()
		if not ret:
			break;
		kp2,desc2 = ip_method.detectAndCompute(cur_frame);
		matches = flann.knnMatch(desc1,desc2,k=	2)		
		# Ratio test as per Lowe's paper 
		good_matches = []; distances = []
		for (m,n) in matches:
			
			if m.distance < 0.7*n.distance and m.distance > 4:
				good_matches.append(m);
				distances.append(m.distance);
				
		# Bashart's Displacement filtering
		mean = np.mean(distances); std = np.std(distances)
		good_matches[:] = [match for match in good_matches if abs(match.distance - mean) <  5 * std]
		kp1 = kp2; desc1 = desc2;
		
		num_matches = np.zeros(kp1.__len__())
		for match in good_matches:
			num_matches[match.trainIdx]=match_count[-1][match.queryIdx]+1
	
		all_matches.append(good_matches);		
		
		#storing frames
		frames.append(cur_frame);
		kp.append(kp1)
		match_count.append(num_matches);
		
		if frame_cnt > MIN_FRAMES_COUNT:
			break;
		frame_cnt = frame_cnt +1;
	cap.release()
	
	
	print "Labeling the keypoints................."
	max_label=0;
	MIN_POINTS_TO_CLUSTER = 20
	MAX_CLUSTERS = 100
	#Forward Labeling Pass
	for rng in xrange(0,MIN_MERGE_FRAMES+1):
		labels.append([-1]*kp[rng].__len__());
	for rng in xrange(MIN_MERGE_FRAMES+1,frame_cnt):
		motion_feats = []; feat_indices = [];
		labels.append([-1]*kp[rng].__len__());
		for match in all_matches[rng-1]:
			if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES: 
				if labels[rng-1][match.queryIdx]==-1:
					src_pt = np.int32(kp[rng-1][match.queryIdx].pt)
					dst_pt = np.int32(kp[rng][match.trainIdx].pt)
					motion_feats.append(motion.get_features(src_pt,dst_pt));
					feat_indices.append(match.trainIdx)
				else :
					labels[rng][match.trainIdx]=labels[rng-1][match.queryIdx]
		
		if(motion_feats.__len__()>=MIN_POINTS_TO_CLUSTER):
			#Clustering mean-shift
			motion_feats = np.asarray(motion_feats)
			bandwidth = estimate_bandwidth(motion_feats, quantile=0.1,random_state=200)
			ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
			ms.fit(motion_feats);
			for idx,lbl in zip(feat_indices,ms.labels_):
				labels[rng][idx]=lbl+max_label;
			max_label = max(labels[rng])+1;
	
	
	random_colors = np.random.randint(256, size=(MAX_CLUSTERS, 3))
	print "Writing the video................."
	fourcc = cv2.cv.CV_FOURCC(*'XVID')
	w = prev_frame.shape[0]; h = prev_frame.shape[1]
	if DO_RESIZE:
		vidout = cv2.VideoWriter('out.avi',fourcc,20,new_sz)
	else:
		vidout = cv2.VideoWriter('out.avi',fourcc,20,(h,w))
	for frame_idx in xrange(MIN_MERGE_FRAMES*2,frame_cnt):
		cur_frame = frames[frame_idx];
		for rng in xrange(frame_idx-MIN_MERGE_FRAMES,frame_idx):
			for match in all_matches[rng-1]:
				if match_count[rng-1][match.queryIdx]>=MIN_MERGE_FRAMES \
						and not (labels[rng-1][match.queryIdx]==-1 or labels[rng-1][match.queryIdx]>=MAX_CLUSTERS):
					#print "i m not here"
					src_pt = np.int32(kp[rng-1][match.queryIdx].pt)
					dst_pt = np.int32(kp[rng][match.trainIdx].pt)
					color = tuple(random_colors[labels[rng-1][match.queryIdx]])
					cv2.line(cur_frame,tuple(src_pt),tuple(dst_pt),color,2);	
		if DO_RESIZE:
			cur_frame=cv2.resize(cur_frame,new_sz);
		vidout.write(cur_frame);
	vidout.release()
	cv2.destroyAllWindows()
 def create_dataset(self):
     self.data_list = []
     abstracts = []
     titles = []
     #print("VOCAB: ", self.vocab)
     if self.vocab == []:
         count_vectorizer = CountVectorizer(stop_words='english')
     else:
         count_vectorizer = CountVectorizer(vocabulary=self.vocab)
     tfid_transformer = TfidfTransformer()
     abstract_data = None
     title_data = None
     graph_data = None
     for paper in self.papers:
         abstracts.append(paper.paper.abstract)
         titles.append(paper.paper.title)
     if self.using_abstracts == True:
         #print("using abstract data")
         abstract_count = count_vectorizer.fit_transform(abstracts)
         abstract_tfid = tfid_transformer.fit_transform(abstract_count)
         abstract_data = abstract_tfid
         #print(abstract_data)
     if self.using_titles == True:
         #print("using title data")
         title_count = count_vectorizer.fit_transform(titles)
         abstract_tfid = tfid_transformer.fit_transform(title_count)
         title_data = abstract_tfid
         #print(title_data.toarray(), len(self.papers))
     if self.using_graph_data == True:
         #print("using graph data")
         graph_data = scipy.sparse.csr_matrix(np.matrix(self.graph_dataset))
         #print(graph_data)
     self.data = []
     for paper in self.papers:
         self.data.append([1])
     self.data = scipy.sparse.csr_matrix(self.data)
     #print("MAT ", self.data)
     if abstract_data is not None:
         self.data = scipy.sparse.hstack([self.data, abstract_data])
     if title_data is not None:
         self.data = scipy.sparse.hstack([self.data, title_data])
     if graph_data is not None:
         self.data = scipy.sparse.hstack([self.data, graph_data])
     # Reduce data to two dimensions
     #print("nd data: ", self.data)
     self.nd_data = self.data.toarray()
     #print("SHAPE ", self.data.shape[1])
     if self.data.shape[1] > 50:
         svd_data = TruncatedSVD(n_components=50).fit_transform(self.data)
         tsne_data = TSNE(n_components=2,
                          metric='cosine').fit_transform(svd_data)
     elif self.data.shape[1] < 10:
         #print("PCA")
         svd_data = PCA(n_components=2).fit_transform(self.data.toarray())
         tsne_data = svd_data
         if len(tsne_data) == 1:
             tsne_data = [[1, 0]]
     else:
         svd_data = self.data
         tsne_data = TSNE(n_components=2,
                          metric='cosine').fit_transform(svd_data)
     self.data = scipy.sparse.csr_matrix(tsne_data)
     self.data = self.data.toarray()
     # Normalize data to max x and y == 1
     #print("before normalization: ", self.data)
     max_x = 0
     max_y = 0
     min_x = 0
     min_y = 0
     for row in self.data:
         x = row[0]
         y = row[1]
         if x > max_x:
             max_x = x
         if y > max_y:
             max_y = y
         if x < min_x:
             min_x = x
         if y < min_y:
             min_y = y
     for row in self.data:
         if (max_x - min_x != 0):
             row[0] = (row[0] - min_x) / (max_x - min_x)
         else:
             row[0] = 0
         if (max_y - min_y != 0):
             row[1] = (row[1] - min_y) / (max_y - min_y)
         else:
             row[1] = 0
     if (self.func == "msh"):
         try:
             band = estimate_bandwidth(self.data)
         except ValueError:
             return False
         band = band * (self.bandwith_factor / 100)
         if band == 0:
             return False
         self.cluster_function = MeanShift(bandwidth=band)
     return True
                    x += 1
            df[column] = list(map(convert_to_int, df[column]))

    return df


df = handle_non_numerical(df)

df.drop(['boat'], 1, inplace=True)  # you can tweak the dataset to see what variables have an impact

X = np.array(df.drop(['survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['survived'])

clf = MeanShift()
clf.fit(X)

labels = clf.labels_
cluster_centers = clf.cluster_centers_

original_df['cluster_group'] = np.nan

for i in range(len(X)):
    original_df['cluster_group'].iloc[i] = labels[i]  #refs rows in df

n_clusters_ = len(np.unique(labels))
survival_rates = {}
for i in range(n_clusters_):
    temp_df = original_df[(original_df['cluster_group'] == float(i))]
    survival_cluster = temp_df[(temp_df['survived'] == 1)]
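    # (added sketch, not part of the original excerpt) finish the loop: each cluster's
    # survival rate is the share of survivors within that group.
    survival_rates[i] = len(survival_cluster) / len(temp_df)
print(survival_rates)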
# Fit and predict the data
birch.fit(scaledData)
predictions = birch.predict(scaledData)

# Scatterplot between two features to check the clustering
plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions)
plt.xlabel("Height")
plt.ylabel("Shell weight")
plt.title("Clustering using Birch clustering algorithm")
plt.show()

##################################### Mean Shift Clustering #################################

# Determine optimal bandwidth value
bandwidth = estimate_bandwidth(scaledData, quantile=0.2, n_samples=500)

# Instantiate the clustering model
mnShift = MeanShift(bandwidth=bandwidth)

# Fit and predict the data
mnShift.fit(scaledData)
predictions = mnShift.predict(scaledData)

# Scatterplot between two features to check the clustering
plt.scatter(scaledData[:, 2], scaledData[:, 6], c=predictions)
plt.xlabel("Height")
plt.ylabel("Shell weight")
plt.title("Clustering using Mean shift clustering algorithm")
plt.show()
Example #29
print(iris_data.head())

virginica = iris_data.loc[iris_data['Species_I. virginica'] == 1]
versicolor = iris_data.loc[iris_data['Species_I. versicolor'] == 1]
setosa = iris_data.loc[iris_data['Species_I. setosa'] == 1]

plt.scatter(x=virginica['Sepal length'], y=virginica['Sepal width'], color='r')
plt.scatter(x=versicolor['Sepal length'],
            y=versicolor['Sepal width'],
            color='g')
plt.scatter(x=setosa['Sepal length'], y=setosa['Sepal width'], color='b')

#plt.show()

print("Self band: ", estimate_bandwidth(iris_data, quantile=0.2))
analyzer = MeanShift(bandwidth=1)
print("Self MeanShift: ", analyzer.fit(iris_data))
print("Function mean_shift: ", mean_shift(iris_data))

labels, cluster_centers, n_clusters = mean_shift(iris_data)
fig = plt.figure()
ax = fig.add_subplot(111)

colors = cycle('bgrcmy')
for k, col in zip(range(n_clusters), colors):
    my_members = (labels == k)
    cluster_center = cluster_centers[k]

    x, y = iris_data[my_members, 0], iris_data[my_members, 1]
    ax.scatter(x=iris_data[my_members, 0],
               y=iris_data[my_members, 1])
Example #30
import pandas as pd

from sklearn.cluster import MeanShift

if __name__ == '__main__':

    df = pd.read_csv('./Datasets/candy.csv')

    print(df.head())

    X = df.drop('competitorname', axis=1)

    meanshift = MeanShift().fit(X)
    print('Numero de clusters: ', max(meanshift.labels_) + 1)
    print('==' * 64)

    print('Centros: ', meanshift.cluster_centers_)

    df['meanshift'] = meanshift.labels_

    print('==' * 64)
    print(df.head())