def initializeModels(self):
    """Fit Word2Vec + KMeans over the bash-command DataFrame and cache results.

    Tokenizes each command string in ``self.bashDF``, embeds the token lists
    with Word2Vec (100-dim vectors), clusters the embeddings with KMeans
    (k=650), persists the clustered DataFrame to Parquet, and stores the rows
    belonging to small clusters (< 10 members) in ``self.outlierCmds``.

    Side effects:
        - caches three intermediate DataFrames
        - appends the clustered DataFrame to ``/user/jleaniz/ml/kmeans``
        - sets ``self.kmeansDF`` and ``self.outlierCmds``
    """
    try:
        # EAFP guard: self.kmeansDF only exists once a previous run finished.
        if self.kmeansDF is not None:
            logger.info("Already loaded this DataFrame")
            # Bug fix: the original used `pass` here and fell through,
            # recomputing the whole pipeline anyway. Return early instead.
            return
    except AttributeError:
        self.kmeansDF = None

    # Split each raw command string into tokens for Word2Vec.
    commandsDF = self.bashDF.map(
        lambda row: Row(
            date=row.date,
            source=row.source,
            username=row.username,
            exec_as=row.exec_as,
            srcip=row.srcip,
            command=row.command.split(" "),
        )
    ).toDF()
    commandsDF.cache()

    # Embed token lists as dense vectors; minCount=1 keeps rare commands.
    word2Vec = Word2Vec(vectorSize=100, minCount=1,
                        inputCol="command", outputCol="features")
    w2model = word2Vec.fit(commandsDF)
    resultDF = w2model.transform(commandsDF)
    resultDF.cache()

    kmeans = KMeans(k=650, seed=42, featuresCol="features",
                    predictionCol="prediction", maxIter=10, initSteps=3)
    kmodel = kmeans.fit(resultDF)
    kmeansDF = kmodel.transform(resultDF)
    kmeansDF.cache()
    # Bug fix: persist the result on the instance so the guard above can
    # short-circuit subsequent calls (original left self.kmeansDF as None).
    self.kmeansDF = kmeansDF
    kmeansDF.coalesce(1).write.parquet('/user/jleaniz/ml/kmeans', mode='append')

    # Clusters with fewer than 10 members are treated as outliers; join the
    # cluster ids back to the full rows so the raw commands are available.
    outliers = kmeansDF.groupBy("prediction").count().filter(
        'count < 10').withColumnRenamed("prediction", "cluster")
    self.outlierCmds = outliers.join(
        kmeansDF, kmeansDF.prediction == outliers.cluster)
# Standardize the assembled feature vectors with the fitted scaler, then
# preview the first rows.
scaler_model = scaler.fit(final_df)
final_df = scaler_model.transform(final_df)
final_df.show(6)

# In[28]:
import numpy as np
import matplotlib.pyplot as plt
from time import time

# Elbow sweep: record the WSSSE cost for k = 2..19. Slots 0-1 of `cost`
# stay zero and are never plotted. Each model is fit on a deterministic
# 10% sample (seed=42) but scored against the full DataFrame.
cost = np.zeros(20)
for k in range(2, 20):
    start = time()
    kmeans = KMeans(featuresCol="features", k=k, seed=1)
    model = kmeans.fit(final_df.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(final_df)
    end = time()
    print("K means from spark took {:.4f} seconds(k = {:.4f})".format(
        end - start, k))

# In[8]:
# Elbow curve: pick the k where the cost stops dropping sharply.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')

# In[39]:
best_k = 15  # choose best k from elbow curve
# Batting feature vectors, one row per player.
df_clus = vec.transform(data).select('Player', 'features')
df_clus.show()

# In[20]:
# Bowling feature vectors, one row per player.
features_ball = ['Runs', 'Wkts', 'Ave', 'Econ', 'SR']
vec_ball = VectorAssembler(inputCols=features_ball, outputCol="features_ball")
df_clus_ball = vec_ball.transform(dataBowl).select('Player', 'features_ball')
df_clus_ball.show()

# In[21]:
# Elbow sweep (batting): WSSSE for k = 2..14; slots 0-1 stay zero.
# Fit on a deterministic 25% sample, score against the full DataFrame.
error = np.zeros(15)
for k in range(2, 15):
    kmeans = KMeans(k=k, seed=1, featuresCol="features")
    model = kmeans.fit(df_clus.sample(False, 0.25, seed=1))
    error[k] = model.computeCost(df_clus)

# In[22]:
# Elbow sweep (bowling), same scheme on the bowling features.
errorBowl = np.zeros(15)
for k in range(2, 15):
    kmeans_ball = KMeans(k=k, seed=1, featuresCol="features_ball")
    model_ball = kmeans_ball.fit(df_clus_ball.sample(False, 0.25, seed=1))
    errorBowl[k] = model_ball.computeCost(df_clus_ball)

# In[23]:
get_ipython().magic('matplotlib inline')
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
bowling.show()

# Assemble batting and bowling stats into vector columns, one per player.
vecAssembler = VectorAssembler(inputCols=FEATURES_COL, outputCol="features")
df_kmeans = vecAssembler.transform(batting).select('player_name', 'features')
df_kmeans.show()

vecAssembler_bowl = VectorAssembler(inputCols=FEATURES_COL_BOWL,
                                    outputCol="features_bowl")
df_kmeans_bowl = vecAssembler_bowl.transform(bowling).select(
    'player_name', 'features_bowl')
df_kmeans_bowl.show()

# Elbow sweep (batting): WSSSE for k = 2..19; slots 0-1 stay zero.
cost = np.zeros(20)
for k in range(2, 20):
    kmeans = KMeans(k=k, seed=1, featuresCol="features")
    model = kmeans.fit(df_kmeans.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(df_kmeans)  # requires Spark 2.0 or later

# Elbow sweep (bowling), same scheme on the bowling features.
cost_bowl = np.zeros(20)
for k in range(2, 20):
    kmeans = KMeans(k=k, seed=1, featuresCol="features_bowl")
    model = kmeans.fit(df_kmeans_bowl.sample(False, 0.1, seed=42))
    cost_bowl[k] = model.computeCost(
        df_kmeans_bowl)  # requires Spark 2.0 or later

# Batting elbow curve.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')

# Bowling elbow curve (plot continues below).
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
# NOTE(review): the `pyspark.ml` import below shadows the `pyspark.mllib`
# KMeans imported first; only ml.KMeans is used here. `KMeansModel`, `array`
# and `sqrt` are imported but unused in this visible chunk — they may be
# used by unseen code; verify before removing.
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans

# Two-cluster model over the weather feature vectors.
kmeans = KMeans(k=2, seed=1)


def mapper(line):
    # Project the first 13 fields of an input record into a plain tuple
    # (trailing comma keeps this a tuple).
    # NOTE(review): this emits 13 elements (line[0]..line[12]), yet the
    # selectExpr below references columns _1 through _14 — one of the two
    # looks off by one. Confirm against the shape of `latlongagain` records.
    return line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[
        7], line[8], line[9], line[10], line[11], line[12],


weather_features = latlongagain.map(mapper)
weather_features_df = weather_features.toDF()
# Rename positional columns (_1, _2, ...) to descriptive names.
weather_df = weather_features_df.selectExpr(
    "_1 as datetime1", "_2 as day", "_3 as month", "_4 as lat", "_5 as lng",
    "_6 as base", "_7 as humidity", "_8 as wind", "_9 as temp", "_10 as desc",
    "_11 as rain", "_12 as latlng", "_13 as borough", "_14 as features")
# udf_foo presumably converts the features column into an ml Vector so
# KMeans can consume it — TODO confirm its definition.
test1 = weather_df.withColumn("features", udf_foo("features"))
test1.printSchema()
model = kmeans.fit(test1.select('features'))
# In[2]: #I.Generate the two dimensional dataset #with three cluster centroid np.random.seed(0) centers = [[1, 1], [-1, -1], [1, -1]] n_clusters = len(centers) X, labels_true = make_blobs(n_samples=3000, centers=centers, cluster_std=0.7) # In[3]: #Clustering with KMeans K_means = KMeans(init='k-means++', n_clusters=3, n_init=10) K_means.fit(X) #Return labels and cluster centers K_means_labels = K_means.labels_ K_means_cluster_centers = K_means.cluster_centers_ K_means_inertia = K_means.inertia_ #find the unique elements K_means_labels_unique = np.unique(K_means_labels) # In[4]: # Plot result fig = plt.figure(figsize=(7, 7)) colors = ['#4EACC5', '#FF9C34', '#4E9A06'] ax = fig.add_subplot(111)