def __k_mean(cls, df, k_clusters, xnorm, label_name):
    """Cluster the rows of ``xnorm`` with k-means and label ``df`` in place.

    Args:
        cls: Unused by this function.
        df: Frame-like object supporting column assignment.  It receives the
            label column and is therefore mutated in place.
        k_clusters: Number of clusters, passed to ``KMeans(n_clusters=...)``.
        xnorm: 2-D feature matrix.  Only ``xnorm.shape[1]`` is inspected here
            before the matrix is handed to scikit-learn.  (Presumably already
            normalised, going by the name -- this is not checked.)
        label_name: Name of the column that receives the cluster labels.

    Returns:
        A ``(df, label_name, cluster_center)`` tuple.  ``cluster_center`` is
        the estimator's ``cluster_centers_`` array, or an empty list when
        ``xnorm`` has no feature columns (every row then gets label ``1``).
    """
    kmeans_cat = label_name
    if xnorm.shape[1] > 0:
        # Imported only on the path that actually clusters, so the
        # "no features" fallback below does not require scikit-learn.
        from sklearn.cluster import KMeans

        start_time = time.time()
        estimator = KMeans(n_clusters=k_clusters)
        # fit_predict is documented as equivalent to fit(X) followed by
        # predict(X), without the second pass over the data.
        y_pred = estimator.fit_predict(xnorm)
        # Use the public fitted attribute rather than reaching into __dict__.
        cluster_center = estimator.cluster_centers_
        print("training time: ", time.time()-start_time, "(sec)")
    else:
        # No feature columns: nothing to split on, so every row shares one label.
        y_pred = 0
        cluster_center = []
        print("no kmeans split")
    # scikit-learn labels start at 0; the stored labels start at 1.
    df[kmeans_cat] = y_pred + 1
    return df, kmeans_cat, cluster_center
# NOTE: `ss` was previously dumped to "../feature/cf_clustering_20.txt" and
# read back with pd.read_csv; both steps are disabled and the frame is now
# consumed directly.
print("saddddddd")

# Rewrite the "features" column of model.userFactors through array2Vec.
# (A userFactors.show() debug call used to follow this line.)
userFactors = model.userFactors.withColumn("features", array2Vec("features"))

############################################
# K-means over every column of `ss` except 'id'.
data = ss  # alias, not a copy: the label column added below lands on `ss` too
y = data['id']
feature_columns = [column for column in data.columns if column != 'id']
x = data[feature_columns]

from sklearn.cluster import KMeans

# fit() returns the estimator itself, so construction and fitting can be chained.
kmeans = KMeans(n_clusters=30, max_iter=200).fit(x)
lables = kmeans.predict(x)
data['culster30'] = lables

print("##################")
print(data.head())
print(kmeans.cluster_centers_)

# A disabled step here used to narrow `data` to the ('id', 'lable') columns
# before saving; the full frame is written instead.
data.to_csv("../feature/cf_clustering_app30.txt", index=False)

# Disabled experiment: the same procedure with
# KMeans(n_clusters=100, max_iter=300), storing labels in 'culster100'.