示例#1
0
 def test_kmeans(self):
     """Check that a KMeans estimator survives a save/load round trip.

     The estimator is saved into a fresh temporary directory, loaded back,
     and compared with the original on uid, uid type, the parent recorded
     on the ``k`` Param, and the default value stored for ``k``.
     """
     kmeans = KMeans(k=2, seed=1)
     path = tempfile.mkdtemp()
     try:
         km_path = path + "/km"
         kmeans.save(km_path)
         kmeans2 = KMeans.load(km_path)
         self.assertEqual(kmeans.uid, kmeans2.uid)
         self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
         self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                          "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                          % (kmeans2.uid, kmeans2.k.parent))
         self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                          "Loaded KMeans instance default params did not match " +
                          "original defaults")
     finally:
         # Clean up even when an assertion or the save/load itself fails, so a
         # failing run does not leave the temporary directory behind.
         try:
             rmtree(path)
         except OSError:
             pass
示例#2
0
 def test_kmeans(self):
     """Check that a KMeans estimator survives a save/load round trip.

     The estimator is saved into a fresh temporary directory, loaded back,
     and compared with the original on uid, uid type, the parent recorded
     on the ``k`` Param, and the default value stored for ``k``.
     """
     kmeans = KMeans(k=2, seed=1)
     path = tempfile.mkdtemp()
     try:
         km_path = path + "/km"
         kmeans.save(km_path)
         kmeans2 = KMeans.load(km_path)
         self.assertEqual(kmeans.uid, kmeans2.uid)
         self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
         self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                          "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                          % (kmeans2.uid, kmeans2.k.parent))
         self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                          "Loaded KMeans instance default params did not match " +
                          "original defaults")
     finally:
         # Clean up even when an assertion or the save/load itself fails, so a
         # failing run does not leave the temporary directory behind.
         try:
             rmtree(path)
         except OSError:
             pass
示例#3
0
def k_means():
    """Walk through fitting, inspecting, saving and reloading a KMeans model.

    Gets a SparkSession, fits ``KMeans(k=2, seed=1)`` on four 2-D points,
    evaluates several properties of the model and its summary, then saves
    the estimator and the fitted model under ``./`` and loads both back.

    The values of the inspection expressions are discarded; the comments
    next to them record the results the example expects. Nothing is
    returned.
    """
    from pyspark.ml.linalg import Vectors

    builder = SparkSession.builder.appName("Python Spark SQL basic example")
    builder = builder.config("spark.some.config.option", "some-value")
    spark = builder.getOrCreate()

    # One single-column row per point.
    points = ([0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0])
    samples = [(Vectors.dense(point), ) for point in points]
    frame = spark.createDataFrame(samples, ["features"])

    estimator = KMeans(k=2, seed=1)
    fitted = estimator.fit(frame)

    centers = fitted.clusterCenters()
    len(centers)  # 2
    fitted.computeCost(frame)  # 2.000...

    predicted = fitted.transform(frame).select("features", "prediction").collect()
    predicted[0].prediction == predicted[1].prediction  # True
    predicted[2].prediction == predicted[3].prediction  # True

    fitted.hasSummary  # True
    fit_summary = fitted.summary
    fit_summary.k  # 2
    fit_summary.clusterSizes  # [2, 2]

    # Round-trip the estimator through disk.
    out_dir = "./"
    estimator_path = out_dir + "/kmeans"
    estimator.save(estimator_path)
    reloaded_estimator = KMeans.load(estimator_path)
    reloaded_estimator.getK()  # 2

    # Round-trip the fitted model through disk.
    fitted_path = out_dir + "/kmeans_model"
    fitted.save(fitted_path)
    reloaded_model = KMeansModel.load(fitted_path)
    reloaded_model.hasSummary  # False
    fitted.clusterCenters()[0] == reloaded_model.clusterCenters()[0]
    # array([ True,  True], dtype=bool)
    fitted.clusterCenters()[1] == reloaded_model.clusterCenters()[1]
示例#4
0
from pyspark.ml.clustering import KMeans
# Search for the best value of k: print the training cost for k = 2..10.
for i in range(2, 11):
    km = KMeans().setK(i).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction')
    res_kmval = km.fit(clsdata_model).summary.trainingCost
    print(i,': ',res_kmval)


# Fit the final model with k = 3 and compute a prediction for every row.
model = KMeans().setK(3).setSeed(4603).setFeaturesCol('feature').setPredictionCol('prediction').fit(clsdata_model)
res_km = model.transform(clsdata_model)
summary = model.summary
summary.clusterSizes
# recorded output: [739011, 463649, 807578]
summary.trainingCost
# recorded output: 7632810.723481619
model.clusterCenters()

model.save('kmeans3_model')
# Register both frames as temp views so they can be joined on id in SQL.
clsdata_vecform.createOrReplaceTempView('clsdata')
res_km.createOrReplaceTempView('reskm')
# Attach the prediction to the source rows, then drop the 'feature' column.
res4 = spark.sql('select c.*, r.prediction as prediction from clsdata c, reskm r where c.id = r.id').drop('feature')
    def train_model(self,
                    messages,
                    k,
                    path_to_model=None,
                    ft_col='features',
                    distance="cosine",
                    initSteps=10,
                    tol=0.0001,
                    maxIter=30,
                    mode="new",
                    log_path=None):
        """Train a K-Means model and report its clustering metrics.

        Fits ``pyspark.ml.clustering.KMeans`` on ``messages``, then reads the
        training cost from the model summary and computes the silhouette score
        of the resulting predictions. The metrics are appended to ``log_path``
        when it is given and printed otherwise. When ``path_to_model`` is given
        the fitted model is saved under ``<path_to_model>/kmeans_K=<k>``.

        -- params:
        messages (pyspark.sql.dataframe.DataFrame): data frame with a vector column with features for the kmeans algorithm
        k (int): number of clusters
        path_to_model (string): directory in which to save the trained kmeans model. Default None (no saving)
        ft_col (string): name of the features column
        distance ("euclidean" or "cosine"): distance measure for the kmeans algorithm
        initSteps (int): value passed as ``initSteps`` to the 'k-means||' initialization
        tol (float): tolerance for kmeans algorithm convergence
        maxIter (int): maximum number of iterations for the kmeans algorithm
        mode ("new" or "overwrite"): whether to save new file or overwrite pre-existing one.
        log_path (string): where to save optimization stats. Default None (no saving)

        Returns:
        dict with keys "model" (the fitted pyspark.ml.clustering.KMeansModel),
        "wsse" (training cost) and "asw" (silhouette score).
        """
        from pyspark.ml.evaluation import ClusteringEvaluator
        from pyspark.ml.clustering import KMeans
        from pathlib import Path
        import time
        import datetime

        # KMeans names the Euclidean measure "euclidean" while
        # ClusteringEvaluator names it "squaredEuclidean"; translate so the
        # same `distance` argument is accepted by both.
        evaluator_distance = ("squaredEuclidean"
                              if distance == "euclidean" else distance)
        # Point the evaluator at the same features column the model is fitted
        # on; its default column name only matches when ft_col is 'features'.
        evaluator = ClusteringEvaluator(featuresCol=ft_col,
                                        distanceMeasure=evaluator_distance)

        start_time = time.time()
        start_time_string = datetime.datetime.fromtimestamp(
            start_time).strftime('%Y-%m-%d %H:%M:%S')

        estimator = KMeans(featuresCol=ft_col,
                           k=k,
                           initMode='k-means||',
                           initSteps=initSteps,
                           tol=tol,
                           maxIter=maxIter,
                           distanceMeasure=distance)

        model_fit = estimator.fit(messages)

        wsse = model_fit.summary.trainingCost
        silhouette = evaluator.evaluate(model_fit.summary.predictions)

        # Sample the clock once so minutes and seconds describe the same instant.
        minutes, seconds = divmod(int(time.time() - start_time), 60)

        wsse_line = ("Within Cluster Sum of Squared Errors = " +
                     str(round(wsse, 4)))
        # Label the score with the distance actually requested.
        silhouette_line = ("Silhouette with {} distance = ".format(distance) +
                           str(round(silhouette, 4)))
        elapsed_line = "Time elapsed: {} minutes and {} seconds.".format(
            minutes, seconds)
        separator = '--' * 30

        if log_path:
            log_path = Path(log_path)
            log_path.parent.mkdir(parents=True, exist_ok=True)
            print("Saving training metrics to: {}".format(log_path))
            with open(log_path, "a") as log:
                log.write("With K={}\n\n".format(k))
                log.write("Started at: {}\n".format(start_time_string))
                log.write(wsse_line)
                log.write("\n" + silhouette_line)
                log.write("\n" + elapsed_line + "\n")
                log.write(separator + "\n\n")
        else:
            print("With K={}\n".format(k))
            print("Started at: {}\n".format(start_time_string))
            print(wsse_line)
            print(silhouette_line)
            print("\n" + elapsed_line)
            print(separator)

        if path_to_model:
            outname = "{}/kmeans_K={}".format(path_to_model, k)
            print("Saving K-Means model to: {}".format(outname))
            # Persist the fitted model, not the unfitted estimator, so the
            # saved artifact is the trained model the docstring promises.
            if mode == "overwrite":
                model_fit.write().overwrite().save(outname)
            else:
                model_fit.save(outname)

        return {"model": model_fit, "wsse": wsse, "asw": silhouette}