Example #1
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older matplotlib


def chartShow():
    data = train()  # train() is defined elsewhere; it returns a DataFrame with a 'feature' vector column
    # ---- K-means clustering ----
    print("------------------K-means clustering--------------------")
    print("------------Fit the model for different values of K and compute the sum of squared errors------------")

    errors = []
    results = []
    centers = []

    for k in range(2, 10):
        # Fit the model
        kmeansmodel = KMeans().setK(k).setFeaturesCol('feature').setPredictionCol('prediction').fit(data)
        print("With K={}".format(k))

        # Dataset with predicted cluster labels
        kmeans_results = kmeansmodel.transform(data).collect()
        results.append(kmeans_results)
        #     for item in kmeans_results:
        #         print(item)
        #         print(str(item[0]) + ' is predicted as cluster ' + str(item[1]))

        # Get all of the model's cluster centers
        kmeans_centers = kmeansmodel.clusterCenters()
        centers.append(kmeans_centers)
        center_seq = 0
        print(len(kmeans_centers))
        for item in kmeans_centers:
            print(item)
            #         print("Cluster" + str(center_seq) + "  Center" + str(item))
            center_seq = center_seq + 1

        # Compute the Within Set Sum of Squared Errors (WSSSE).
        # Note: computeCost was deprecated in Spark 2.4 and removed in 3.0; see the sketch after this example.
        WSSSE = kmeansmodel.computeCost(data)
        errors.append(WSSSE)
        print("Within Set Sum of Squared Error = " + str(WSSSE))

        print('--' * 30 + '\n')

    # ---- WSSSE visualization ----
    plt.figure()
    k_number = range(2, 10)
    plt.plot(k_number, errors)
    plt.xlabel('Number of clusters k')
    plt.ylabel('WSSSE')
    plt.title('K-WSSSE')

    # ---- Cluster visualization ----
    print("---------Convert the data to a pandas structure and view the cluster centers in a 3D plot-----------")
    # Pick k from the K-WSSSE (elbow) plot above; k = 4 is used here
    k = 4

    cluster_vis = plt.figure(figsize=(10, 10)).add_subplot(projection='3d')

    # One color per predicted cluster label: blue, yellow, magenta, black, green, cyan
    colors = ['b', 'y', 'm', 'k', 'g', 'c']
    for item in results[k - 2]:
        cluster_vis.scatter(item[0][0], item[0][1], item[0][2], c=colors[item[1]])

    for item in centers[k - 2]:
        cluster_vis.scatter(item[0], item[1], item[2], c='r', marker='p')  # red pentagon marks each center

    plt.show()
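
Since computeCost was deprecated in Spark 2.4 and removed in 3.0, the WSSSE metric above stops working on newer clusters. A minimal sketch of the replacement, assuming a kmeansmodel fitted as in chartShow() with the same 'feature' and 'prediction' columns:

from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette (range [-1, 1], higher is better) replaces the WSSSE elbow curve on Spark 3.x
evaluator = ClusteringEvaluator(featuresCol='feature', predictionCol='prediction',
                                metricName='silhouette')
silhouette = evaluator.evaluate(kmeansmodel.transform(data))
print("Silhouette with squared euclidean distance = " + str(silhouette))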
Example #3
from pyspark.ml.clustering import KMeans

est2 = KMeans(featuresCol='features', predictionCol='pred2', k=2, seed=1)
est3 = KMeans(featuresCol='features', predictionCol='pred3', k=3, seed=1)

#%%
from pyspark.ml import Pipeline

# assembler, scaler and selector are feature stages defined earlier (not shown here)
pipeline = Pipeline(stages=[assembler, scaler, selector,
                            est2, est3])

#%%
pipeline_model = pipeline.fit(data_raw)
result = pipeline_model.transform(data_raw)

result.show()

#%%
# computeCost lives on the fitted KMeansModel, not on the KMeans estimator,
# so pull the fitted models out of the PipelineModel's stages first
model2 = pipeline_model.stages[-2]  # fitted KMeansModel for est2
model3 = pipeline_model.stages[-1]  # fitted KMeansModel for est3

wssse2 = model2.computeCost(result)
print(wssse2)

wssse3 = model3.computeCost(result)
print(wssse3)
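
On Spark 2.4 and later the same WSSSE is also exposed on the training summary, which avoids a second pass over the data and still works on 3.x, where computeCost is gone; a sketch using the fitted models above:

#%%
# trainingCost is the WSSSE on the training data (Spark 2.4+)
print(model2.summary.trainingCost)
print(model3.summary.trainingCost)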

Example #4

# evaluator and predictionAndLabels are defined in earlier cells of this notebook (not shown)
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

evaluator = MulticlassClassificationEvaluator(metricName="f1")
print("F1 Score = " + str(evaluator.evaluate(predictionAndLabels)))

# COMMAND ----------

from pyspark.ml.clustering import KMeans

# Trains a k-means model.
model = KMeans().setK(20).setSeed(1).fit(df_)

# Evaluate clustering by computing Within Set Sum of Squared Errors
# (computeCost was deprecated in Spark 2.4 and removed in 3.0).
wssse = model.computeCost(df_)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

# COMMAND ----------

from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg

pca = PCAml(k=2, inputCol="features", outputCol="pca")
model = pca.fit(df_)
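
The fitted PCAModel can then project the data and report how much variance the two components retain; a short sketch on the same df_:

# COMMAND ----------

# Project features onto the top-2 principal components
model.transform(df_).select("pca").show(5, truncate=False)

# Fraction of variance captured by each component
print(model.explainedVariance)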
Example #5

from datetime import datetime
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import abs

data_url = "gs://bigdatasystems_alex_bucket/project/user_info16/part*"
raw_data = (spark.read.option("header",
                              "true").option("inferschema", "true").option(
                                  "mode", "DROPMALFORMED").csv(data_url))

assembler = VectorAssembler(
    inputCols=["followers", "friends", "favorited", "status_count",
               "region_id", "user_desc_rating", "count"],
    outputCol="feat_vector")
featured_data = assembler.transform(raw_data.na.fill(0))
featuresScaler = StandardScaler(inputCol="feat_vector", outputCol="features")
featuresModel = featuresScaler.fit(featured_data)
scFeatData = featuresModel.transform(featured_data)

scFeatData.cache()  # the loop refits k-means many times; avoid re-reading from GCS each iteration
for k in range(2, 25, 2):
    model = KMeans().setK(k).setSeed(0).fit(scFeatData)
    wssse = model.computeCost(scFeatData)
    print(k, "\t", wssse)
Example #6
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20
from pyspark.sql import SparkSession, Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans


def f(x):
    rel = {}
    rel['features'] = Vectors.dense(float(x[1]), float(x[2]), float(x[3]),
                                    float(x[4]))
    rel['label'] = str(x[5]).strip("\"")
    return rel


spark = SparkSession.builder.appName("logistic_regression").getOrCreate()

df = spark.sparkContext.textFile("iris.txt").map(
    lambda line: line.split(",")).map(lambda p: Row(**f(p))).toDF()
"""创建Estimator并调用其fit()方法来生成相应的Transformer对象,
很显然,在这里KMeans类是Estimator,而用于保存训练后模型的KMeansModel类则属于Transformer"""
kmeans_model = KMeans().setK(3).setFeaturesCol("features").setPredictionCol(
    "prediction").fit(df)

results = kmeans_model.transform(df).collect()
for item in results:
    print(str(item[0]) + " is predicted as cluster " + str(item[1]))
"""有可以通过KMeansModel类自带的clusterCenter属性获取到模型的所有聚类中心情况"""
results2 = kmeans_model.clusterCenters()
for item in results2:
    print(item)
"""与MLLib下的实现相同,KMeansModel类也提供了计算集合内误差平方和(Within Set Sum of Squared Error, WSSSE)
的方法来度量聚类的有效性,在真实K值未知的情况下,该值的变化可以作为选取合适K值的一个重要参考"""
print(kmeans_model.computeCost(df))
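
The GridSearchCV imported above is scikit-learn's grid search; the Spark-native counterpart is ParamGridBuilder with CrossValidator. A minimal sketch, assuming the df built above, that searches over k using the silhouette metric:

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans(featuresCol="features", predictionCol="prediction")
grid = ParamGridBuilder().addGrid(kmeans.k, [2, 3, 4, 5]).build()
evaluator = ClusteringEvaluator(featuresCol="features", predictionCol="prediction")

# 3-fold cross-validation over the k grid, scored by silhouette (higher is better)
cv = CrossValidator(estimator=kmeans, estimatorParamMaps=grid,
                    evaluator=evaluator, numFolds=3)
best_model = cv.fit(df).bestModel
print("best k =", len(best_model.clusterCenters()))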