def test_kmeans(self):
    """Round-trip an unfitted KMeans estimator through save() and load().

    Checks that the reloaded estimator keeps the same uid (value and type),
    that its ``k`` Param is parented to that uid, and that the default value
    recorded for ``k`` matches the original's.
    """
    kmeans = KMeans(k=2, seed=1)
    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km"
        kmeans.save(km_path)
        kmeans2 = KMeans.load(km_path)
        self.assertEqual(kmeans.uid, kmeans2.uid)
        self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
        self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                         "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                         % (kmeans2.uid, kmeans2.k.parent))
        self.assertEqual(kmeans._defaultParamMap[kmeans.k],
                         kmeans2._defaultParamMap[kmeans2.k],
                         "Loaded KMeans instance default params did not match " +
                         "original defaults")
    finally:
        # Clean up even when an assertion above fails, so a failing run does
        # not leave the temporary directory behind. Removal stays best-effort.
        try:
            rmtree(path)
        except OSError:
            pass
def test_kmeans(self):
    """Round-trip an unfitted KMeans estimator through save() and load().

    Checks that the reloaded estimator keeps the same uid (value and type),
    that its ``k`` Param is parented to that uid, and that the default value
    recorded for ``k`` matches the original's.
    """
    kmeans = KMeans(k=2, seed=1)
    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km"
        kmeans.save(km_path)
        kmeans2 = KMeans.load(km_path)
        self.assertEqual(kmeans.uid, kmeans2.uid)
        self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
        self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                         "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                         % (kmeans2.uid, kmeans2.k.parent))
        self.assertEqual(kmeans._defaultParamMap[kmeans.k],
                         kmeans2._defaultParamMap[kmeans2.k],
                         "Loaded KMeans instance default params did not match " +
                         "original defaults")
    finally:
        # Clean up even when an assertion above fails, so a failing run does
        # not leave the temporary directory behind. Removal stays best-effort.
        try:
            rmtree(path)
        except OSError:
            pass
def k_means():
    """Walk through the KMeans estimator/model API on a four-point data set.

    Builds (or reuses) a SparkSession, fits KMeans with k=2 and seed=1 on four
    2-D vectors, inspects the fitted model and its summary, then saves and
    reloads both the estimator and the model under the current directory.

    The bare expressions are kept from the original walkthrough; their values
    are discarded and the trailing comments record the expected results.
    Returns None.
    """
    session = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .getOrCreate()
    )

    from pyspark.ml.linalg import Vectors

    # Two points near the origin and two near (8.5, 8.5).
    coordinates = [[0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0]]
    records = [(Vectors.dense(point),) for point in coordinates]
    frame = session.createDataFrame(records, ["features"])

    estimator = KMeans(k=2, seed=1)
    fitted = estimator.fit(frame)

    cluster_centers = fitted.clusterCenters()
    len(cluster_centers)  # expected: 2
    fitted.computeCost(frame)  # expected: 2.000...

    predicted = fitted.transform(frame).select("features", "prediction")
    predicted_rows = predicted.collect()
    predicted_rows[0].prediction == predicted_rows[1].prediction  # expected: True
    predicted_rows[2].prediction == predicted_rows[3].prediction  # expected: True

    fitted.hasSummary  # expected: True
    training_summary = fitted.summary
    training_summary.k  # expected: 2
    training_summary.clusterSizes  # expected: [2, 2]

    # Persist and reload the (unfitted) estimator.
    base_dir = "./"
    estimator_path = base_dir + "/kmeans"
    estimator.save(estimator_path)
    reloaded_estimator = KMeans.load(estimator_path)
    reloaded_estimator.getK()  # expected: 2

    # Persist and reload the fitted model.
    fitted_path = base_dir + "/kmeans_model"
    fitted.save(fitted_path)
    reloaded_model = KMeansModel.load(fitted_path)
    reloaded_model.hasSummary  # expected: False

    # expected: array([ True, True], dtype=bool)
    fitted.clusterCenters()[0] == reloaded_model.clusterCenters()[0]
    fitted.clusterCenters()[1] == reloaded_model.clusterCenters()[1]
from pyspark.ml.clustering import KMeans

# Find the best k: fit one model per candidate k in 2..10 and print its
# training cost so the candidates can be compared.
for i in range(2, 11):
    km = KMeans().setK(i).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction')
    res_kmval = km.fit(clsdata_model).summary.trainingCost
    print(i, ': ', res_kmval)

# Final model uses k = 3 (matches setK(3) and the saved model name below).
model = KMeans().setK(3).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction').fit(clsdata_model)
res_km = model.transform(clsdata_model)

# Inspect the fitted model. The values in the trailing comments are the output
# recorded from an earlier interactive run; they were previously pasted in as
# code, which made the script unparsable.
summary = model.summary
print(summary.clusterSizes)   # recorded: [739011, 463649, 807578]
print(summary.trainingCost)   # recorded: 7632810.723481619
print(model.clusterCenters())
model.save('kmeans3_model')

# Attach each row's cluster prediction to the original data by id, dropping
# the feature vector column from the result.
clsdata_vecform.createOrReplaceTempView('clsdata')
res_km.createOrReplaceTempView('reskm')
res4 = spark.sql('select c.*, r.prediction as prediction from clsdata c, reskm r where c.id = r.id').drop('feature')
def train_model(self, messages, k, path_to_model=None, ft_col='features',
                distance="cosine", initSteps=10, tol=0.0001, maxIter=30,
                mode="new", log_path=None):
    """Train a K-Means model, report its quality and optionally save it.

    -- params:
    messages (pyspark.sql.dataframe.DataFrame): data frame with a vector
        column with features for the kmeans algorithm
    k (int): number of clusters
    path_to_model (string): directory in which the trained model is saved as
        "kmeans_K=<k>". Default None (no saving)
    ft_col (string): name of the features column
    distance ("euclidean" or "cosine"): distance measure for the kmeans
        algorithm
    initSteps (int): number of steps for the "k-means||" initialization
    tol (float): tolerance for kmeans algorithm convergence
    maxIter (int): maximum number of iterations for the kmeans algorithm
    mode ("new" or "overwrite"): whether to save new file or overwrite
        pre-existing one.
    log_path (string): file the training stats are appended to.
        Default None (stats are printed instead)

    Returns:
        dict with keys "model" (the fitted
        pyspark.ml.clustering.KMeansModel), "wsse" (training cost of the
        fitted model) and "asw" (silhouette score from ClusteringEvaluator)
    """
    from pyspark.ml.evaluation import ClusteringEvaluator
    from pyspark.ml.clustering import KMeans
    from pathlib import Path
    import time
    import datetime

    # KMeans calls its Euclidean measure "euclidean", but ClusteringEvaluator
    # only accepts "squaredEuclidean" or "cosine"; translate for the evaluator.
    eval_distance = "squaredEuclidean" if distance == "euclidean" else distance
    # The evaluator must read the same features column the model was fit on.
    evaluator = ClusteringEvaluator(featuresCol=ft_col,
                                    distanceMeasure=eval_distance)

    start_time = time.time()
    start_time_string = datetime.datetime.fromtimestamp(
        start_time).strftime('%Y-%m-%d %H:%M:%S')

    model = KMeans(featuresCol=ft_col, k=k, initMode='k-means||',
                   initSteps=initSteps, tol=tol, maxIter=maxIter,
                   distanceMeasure=distance)
    model_fit = model.fit(messages)

    wsse = model_fit.summary.trainingCost
    silhouette = evaluator.evaluate(model_fit.summary.predictions)

    # Take one elapsed-time reading so minutes and seconds are consistent.
    minutes, seconds = divmod(int(time.time() - start_time), 60)

    wsse_line = ("Within Cluster Sum of Squared Errors = "
                 + str(round(wsse, 4)))
    # Name the measure actually used instead of always claiming "cosine".
    silhouette_line = ("Silhouette with {} distance = ".format(eval_distance)
                       + str(round(silhouette, 4)))
    elapsed_line = "Time elapsed: {} minutes and {} seconds.".format(
        minutes, seconds)

    if log_path:
        log_path = Path(log_path)
        log_path.parent.mkdir(parents=True, exist_ok=True)
        print("Saving training metrics to: {}".format(log_path))
        with open(log_path, "a", encoding="utf-8") as log:
            log.write("With K={}\n\n".format(k))
            log.write("Started at: {}\n".format(start_time_string))
            log.write(wsse_line)
            log.write("\n" + silhouette_line)
            log.write("\n" + elapsed_line + "\n")
            log.write('--' * 30 + "\n\n")
    else:
        print("With K={}\n".format(k))
        print("Started at: {}\n".format(start_time_string))
        print(wsse_line)
        print(silhouette_line)
        print("\n" + elapsed_line)
        print('--' * 30)

    if path_to_model:
        outname = "{}/kmeans_K={}".format(path_to_model, k)
        print("Saving K-Means model to: {}".format(outname))
        # Persist the fitted model (not the untrained estimator), as the
        # path_to_model contract describes.
        if mode == "overwrite":
            model_fit.write().overwrite().save(outname)
        else:
            model_fit.save(outname)

    return {"model": model_fit, "wsse": wsse, "asw": silhouette}