import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


def clustering_tuning(self):
    # Load the preprocessed feature matrix and move it into Spark.
    df_raw = pd.read_csv(f"{self.DEFAULT_PREPROCESSING_OUTPUT}", header=None)
    spark = SparkSession \
        .builder \
        .appName("PySparkKMeans") \
        .getOrCreate()
    df = spark.createDataFrame(df_raw)

    # Assemble all input columns into a single "features" vector column.
    assembler = VectorAssembler(inputCols=df.columns, outputCol="features")
    # df_sample = df.sample(withReplacement=False, fraction=0.1)
    df_vec = assembler.transform(df).select("features")

    # Sweep over candidate cluster counts.
    K_lst = list(range(100, 10001, 50))
    for k in K_lst:
        kmeans = KMeans(k=k)
        kmeans.setSeed(1)
        kmeans.setMaxIter(5000)
        kmeans.setPredictionCol("newPrediction")
        model = kmeans.fit(df_vec)
        centers = model.clusterCenters()
        transformed = model.transform(df_vec).select(
            "features", "newPrediction")

        # Evaluate clustering by computing the Silhouette score.
        # ClusteringEvaluator reads the "prediction" column by default,
        # so alias the model output accordingly. (The original also called
        # reset_index(), which is a pandas method and fails on a Spark
        # DataFrame, and collected every row without using the result.)
        evaluator = ClusteringEvaluator()
        transformed = transformed.withColumn("prediction", func.col("newPrediction"))
        silhouette = evaluator.evaluate(transformed)

    return transformed
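# The loop above recomputes the silhouette for every k but keeps only the
# last iteration's result. A minimal sketch of one way to track the scores
# and pick the best k (function and variable names here are illustrative,
# not from the original code):

def pick_best_k(df_vec, candidate_ks, seed=1, max_iter=5000):
    """Fit KMeans for each candidate k and return (best_k, scores)."""
    evaluator = ClusteringEvaluator()  # silhouette with squared euclidean by default
    scores = {}
    for k in candidate_ks:
        model = KMeans(k=k, seed=seed, maxIter=max_iter).fit(df_vec)
        predictions = model.transform(df_vec)  # adds a "prediction" column
        scores[k] = evaluator.evaluate(predictions)
    best_k = max(scores, key=scores.get)  # silhouette: higher is better
    return best_k, scores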
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, BisectingKMeans

spark = SparkSession.builder.appName("clustering").getOrCreate()

cluster_df = spark.read.csv('clustering_dataset.csv', header=True, inferSchema=True)
cluster_df.show()

# Combine the three input columns into a single feature vector.
vectorAssembler = VectorAssembler(inputCols=['col1', 'col2', 'col3'],
                                  outputCol='features')
vcluster_df = vectorAssembler.transform(cluster_df)
vcluster_df.show()

# K-means clustering with three clusters.
kmeans = KMeans().setK(3)
kmeans = kmeans.setSeed(1)
kmodel = kmeans.fit(vcluster_df)
centers = kmodel.clusterCenters()

# Hierarchical (bisecting) k-means clustering.
vcluster_df.show()
bkmeans = BisectingKMeans().setK(3)
bkmeans = bkmeans.setSeed(1)
bkmodel = bkmeans.fit(vcluster_df)
bkcenters = bkmodel.clusterCenters()
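# Neither model above is evaluated. A small sketch, assuming the same
# vcluster_df as above, that compares the two clusterings by silhouette score:
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol='features')
for name, m in [('kmeans', kmodel), ('bisecting', bkmodel)]:
    preds = m.transform(vcluster_df)        # adds a "prediction" column
    print(name, evaluator.evaluate(preds))  # higher silhouette is better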
from pyspark.ml.feature import StandardScaler
from pyspark.ml.clustering import KMeans

# The original constructor was truncated at "withMean=True)"; the column
# names below are assumptions filled in from how the columns are used later.
stand_scaled = StandardScaler(inputCol="features", outputCol="scaled_features",
                              withStd=True, withMean=True)

# Fit the scaler on the training data only, then apply that same
# transformation to both test sets. (The original refit the scaler on each
# test set, which gives every split a different scaling.)
scaled_model = stand_scaled.fit(train_df)
train_df = scaled_model.transform(train_df)
test1_df = scaled_model.transform(test1_df)
test2_df = scaled_model.transform(test2_df)

kmeans = KMeans().setK(2)   # set number of clusters
kmeans = kmeans.setSeed(1)  # fix the random start point
kmodel = kmeans.fit(train_df)

centers = kmodel.clusterCenters()
print(centers)

test1_df = kmodel.transform(test1_df)
test1_df.select("features", "Occupancy", "prediction").show(5)
test2_df = kmodel.transform(test2_df)
test2_df.select("features", "Occupancy", "prediction").show(5)

# Count how often the assigned cluster id disagrees with the Occupancy label.
count1 = test1_df.filter("prediction != Occupancy").count()
total1 = test1_df.count()
count2 = test2_df.filter("prediction != Occupancy").count()
total2 = test2_df.count()
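# The counts above compare raw cluster ids to the binary Occupancy label, but
# k-means assigns the 0/1 ids arbitrarily. A sketch that reports agreement
# under either labelling (an assumption about how the counts were meant to be
# used, not part of the original code):
def cluster_accuracy(mismatches, total):
    agreement = 1.0 - mismatches / total
    return max(agreement, 1.0 - agreement)  # best of the two id assignments

print(cluster_accuracy(count1, total1))
print(cluster_accuracy(count2, total2))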
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.clustering import KMeans

# The original table name carried a stray "from " prefix, which would have
# produced "FROM from lookalike_..." in the query below.
score_table = 'lookalike_application_score_vector_08192021_1m'
num_clusters = 100

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# num_features = 10
# num_rows = 10000
# data = np.random.rand(num_rows, num_features)
# spark.createDataFrame(data)

did_bucket = 0
command = "SELECT did, score_vector, did_bucket FROM {} WHERE did_bucket = {}".format(
    score_table, did_bucket)
df = spark.sql(command)

# Convert the array-typed score_vector column into an ML DenseVector.
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
df = df.withColumn('score_vec', list_to_vector_udf(df.score_vector))

first_time = True
if first_time:
    kmeans = KMeans(k=num_clusters, featuresCol='score_vec')
    kmeans.setSeed(1)
    kmeans.setPredictionCol('cluster_id')
    model = kmeans.fit(df)
    first_time = False

df2 = model.transform(df)
df2.show()
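# The first_time flag suggests the model is meant to be fitted once and then
# reused across did_bucket partitions. A sketch of that loop (the bucket
# count is a hypothetical placeholder, not from the original code):
NUM_BUCKETS = 10  # hypothetical; use the table's actual bucket count
model = None
for bucket in range(NUM_BUCKETS):
    bucket_df = spark.sql(
        "SELECT did, score_vector, did_bucket FROM {} WHERE did_bucket = {}".format(
            score_table, bucket))
    bucket_df = bucket_df.withColumn('score_vec',
                                     list_to_vector_udf(bucket_df.score_vector))
    if model is None:
        # Fit on the first bucket only; later buckets reuse the same centroids.
        model = KMeans(k=num_clusters, featuresCol='score_vec',
                       predictionCol='cluster_id', seed=1).fit(bucket_df)
    model.transform(bucket_df).show()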