示例#1
0
 def __k_mean(cls, df, k_clusters, xnorm, label_name):
     """Cluster the rows of ``xnorm`` with k-means and store the labels in ``df``.

     Args:
         cls: Class object; not used by this method (the enclosing class
             and any decorator are outside this view).
         df: Object supporting ``df[name] = values`` (e.g. a pandas
             DataFrame). It is modified in place.
         k_clusters: Number of clusters, forwarded to
             ``KMeans(n_clusters=...)``.
         xnorm: 2-D feature matrix exposing ``.shape``; presumably already
             normalised, judging by the name -- confirm with the caller.
         label_name: Name of the column that receives the cluster labels.

     Returns:
         A ``(df, label_name, cluster_center)`` tuple. ``cluster_center`` is
         the fitted estimator's ``cluster_centers_``, or an empty list when
         ``xnorm`` has no columns and no clustering was performed.
     """
     if xnorm.shape[1] > 0:
         # Imported lazily so the "no split" path below does not require
         # scikit-learn to be importable.
         from sklearn.cluster import KMeans

         # NOTE: k_clusters is expected to be the elbow point picked upstream.
         start_time = time.perf_counter()  # monotonic clock for durations
         estimator = KMeans(n_clusters=k_clusters)
         estimator.fit(xnorm)
         y_pred = estimator.predict(xnorm)
         # Use the public fitted attribute rather than poking at __dict__.
         cluster_center = estimator.cluster_centers_
         print("training time: ", time.perf_counter() - start_time, "(sec)")
     else:
         # No feature columns: skip clustering and put every row in one group.
         y_pred = 0
         cluster_center = []
         print("no kmeans split")
     # Labels are stored 1-based, so the no-split case yields 1 for every row.
     df[label_name] = y_pred + 1
     return df, label_name, cluster_center
# ss.to_csv("../feature/cf_clustering_20.txt",index=False)
# Progress marker printed before the user factors are inspected.
print("saddddddd")
# Rebuild the "features" column of the model's user factors via array2Vec.
# NOTE(review): array2Vec is defined elsewhere; presumably a UDF that turns
# an array column into a vector column -- confirm against its definition.
userFactors = model.userFactors.withColumn(
    "features",
    array2Vec("features"),
)
# Print the resulting frame for a quick visual check.
userFactors.show()

############################################
# data = pd.read_csv("../feature/cf_clustering_20.txt")
# Cluster the rows of `ss` into 30 groups using every column except 'id',
# then write the labelled frame to disk.
data = ss  # alias, not a copy: the label column added below also appears on `ss`
y = data['id']
x = data[[col for col in data.columns if col != 'id']]
from sklearn.cluster import KMeans

# fit() returns the estimator itself, so construction and fitting are chained.
kmeans = KMeans(n_clusters=30, max_iter=200).fit(x)

lables = kmeans.predict(x)
# The column name (spelling included) is part of the CSV written below.
data['culster30'] = lables
print("##################")
print(data.head())
print(kmeans.cluster_centers_)

# data = data[[x for x in data.columns if x in ('id','lable')]]
data.to_csv("../feature/cf_clustering_app30.txt", index=False)


# kmeans = KMeans(n_clusters=100,max_iter=300)
# kmeans = kmeans.fit(x)
#
# lables = kmeans.predict(x)
# data['culster100'] = lables
# print("##################")