# Example #1
# 0
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

# Load the clustering dataset from CSV. header=True and inferSchema=True are
# passed to the reader (first row used as column names, column types inferred,
# per the Spark CSV reader options).
# NOTE(review): `spark` is not defined in the visible code -- presumably a
# SparkSession supplied by the environment (pyspark shell / notebook); confirm.
cluster_df = spark.read.csv(
    'clustering_dataset.csv', header=True, inferSchema=True
)
cluster_df.show()

# Pack the three input columns into a single vector column named 'features'.
vectorAssembler = VectorAssembler(
    inputCols=['col1', 'col2', 'col3'], outputCol='features'
)
vcluster_df = vectorAssembler.transform(cluster_df)

# Display the assembled DataFrame, now including the 'features' column.
vcluster_df.show()

# Configure k-means with k=3 and seed=1, then fit it on the assembled data.
# No features column is set explicitly here, so the estimator's default is
# used -- presumably 'features', matching the assembler's outputCol; confirm.
kmeans = KMeans().setK(3).setSeed(1)
kmodel = kmeans.fit(vcluster_df)

# Cluster centers of the fitted k-means model.
centers = kmodel.clusterCenters()

# Hierarchical clustering via bisecting k-means, fitted on the same assembled
# DataFrame that the k-means model above uses.
vcluster_df.show()

# Configure bisecting k-means with k=3 and seed=1, then fit it.
# BisectingKMeans is imported with the other pyspark imports at the top of
# the file rather than in the middle of the script.
bkmeans = BisectingKMeans().setK(3).setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)

# Cluster centers of the fitted bisecting k-means model.
bkcenters = bkmodel.clusterCenters()