def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
    # Distribute the input matrix, train an MLlib k-means model,
    # and return the predicted cluster id for every row.
    cluster_data = self.sc.parallelize(Matrix)
    model = KMeans.train(cluster_data, Kcluster, maxIterations=MaxIterations, runs=runs)
    results = model.predict(cluster_data).collect()
    return results
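# A minimal, self-contained sketch of what spark_means wraps, calling the
# RDD-based MLlib API directly (an illustration, not the original code;
# the `runs` argument was dropped in later Spark releases, so it is
# omitted here).
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext(appName="spark-means-demo")
matrix = [[1.0, 1.0], [1.2, 0.8], [9.0, 9.0], [8.8, 9.1]]

cluster_data = sc.parallelize(matrix)
model = KMeans.train(cluster_data, 2, maxIterations=10)
print(model.predict(cluster_data).collect())  # one cluster id per input row
sc.stop()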
import datetime as dt
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.linalg import Vectors

spark = SparkSession \
    .builder \
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
sc = spark.sparkContext
today = dt.datetime.today()

# Read the Yelp business data and keep three numeric columns
spark_df = sc.parallelize(spark.read.json("Data/yelp_academic_dataset_business.json")
                          .select("stars", "review_count", "is_open").take(1700))

# Scale each feature into [0, 1]
scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)) \
                   .map(lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1") \
                       .rdd.map(lambda x: Vectors.dense(x))

num_clusters = 3  # input into the algorithm
km = KMeans()
kme = km.train(vector_df, k=num_clusters, maxIterations=10, seed=2018)
centers = kme.clusterCenters
err = vector_df.map(lambda x: (x[0], findCenter(x[0], centers))).collect()

# Silhouette value comparison
ag = 0
agi = 0
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    a = avg[er[1]] / avgi[er[1]]
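# The snippet above calls a findCenter helper that is not shown. A
# plausible sketch (an assumption, not the original helper): return the
# index of the center nearest to the point by squared Euclidean distance.
def findCenter(point, centers):
    return min(range(len(centers)),
               key=lambda i: Vectors.squared_distance(point, centers[i]))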
# print("done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % X.shape) print() done in 5.232165s n_samples: 7540, n_features: 6638 # # KMeans Clustering # Initial clusters = 7 # Maximum iteration = 100 # In [47]: km = KMeans(n_clusters=7, init='k-means++', max_iter=100, n_init=1, verbose=1) print("Clustering sparse data with %s" % km) t0 = time() km.fit(X) print("done in %0.3fs" % (time() - t0)) Clustering sparse data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=7, n_init=1, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=1) Initialization complete Iteration 0, inertia 13635.141 Iteration 1, inertia 6943.485 Iteration 2, inertia 6924.093 Iteration 3, inertia 6915.004 Iteration 4, inertia 6909.212
def fit(self, data, n_components, n_iter, ct):
    """
    Estimate model parameters with the expectation-maximization algorithm.

    Parameters
    ----------
    data : RDD of data points
    n_components : Number of components
    n_iter : Number of iterations. Defaults to 100

    Attributes
    ----------
    covariance_type : Type of covariance matrix.
        Supports only diagonal covariance matrices.
    ct : Threshold value to check the convergence criteria.
        Defaults to 1e-3
    min_covar : Floor on the diagonal of the covariance matrix to
        prevent overfitting. Defaults to 1e-3.
    converged : True once converged, False otherwise.
    Weights : array of shape (1, n_components)
        Weights for each mixture component.
    Means : array of shape (n_components, n_dim)
        Mean parameters for each mixture component.
    Covars : array of shape (n_components, n_dim)
        Covariance parameters for each mixture component.
    """
    sc = data.context
    self.covariance_type = 'diag'
    self.converged = False
    self.min_covar = 1e-3
    # observation statistics
    self.s0 = 0
    self.s1 = 0
    # Number of data points
    n_points = data.count()
    # Number of dimensions
    n_dim = data.first().size

    if n_points == 0:
        raise ValueError('Dataset cannot be empty')
    if n_points < n_components:
        raise ValueError(
            'Not possible to make (%s) components from (%s) datapoints'
            % (n_components, n_points))

    # Initialize Covars (diagonal covariance matrix)
    if hasattr(data.first(), 'indices'):
        self.isSparse = 1

        def convert_to_kvPair(eachV):
            # Emit (index, (value, value^2)) pairs for each nonzero entry
            g = []
            for i in range(eachV.indices.size):
                g.append((eachV.indices[i],
                          (eachV.values[i], eachV.values[i] * eachV.values[i])))
            return g

        def computeVariance(x):
            # Per-dimension variance: Var(X) = E[X^2] - E[X]^2
            mean = x[1][0] / n_points
            sumSq = x[1][1] / n_points
            return x[0], sumSq - mean * mean

        kvPair = data.flatMap(convert_to_kvPair)
        res = kvPair.reduceByKey(np.add).map(computeVariance)
        cov = Vectors.sparse(n_dim, res.collectAsMap()).toArray() + self.min_covar
        self.Covars = np.tile(cov, (n_components, 1))
    else:
        self.isSparse = 0
        cov = []
        for i in range(n_dim):
            cov.append(data.map(lambda m: m[i]).variance() + self.min_covar)
        self.Covars = np.tile(cov, (n_components, 1))

    # Initialize Means using MLlib KMeans
    self.Means = np.array(KMeans.train(data, n_components).clusterCenters)
    # Initialize Weights with the value 1/n_components for each component
    self.Weights = np.tile(1.0 / n_components, n_components)

    # EM algorithm:
    # loop until the iteration budget or the convergence criterion is met
    for i in range(n_iter):
        logging.info("GMM running iteration %s " % i)
        # Broadcast means, covars and weights to the workers
        self.meansBc = sc.broadcast(self.Means)
        self.covarBc = sc.broadcast(self.Covars)
        self.weightBc = sc.broadcast(self.Weights)
        # Expectation step
        EstepOut = data.map(self.scoreOnePoint)
        # Maximization step (sum the four sufficient statistics element-wise)
        MstepIn = EstepOut.reduce(
            lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]))
        self.s0 = self.s1
        self.mStep(MstepIn[0], MstepIn[1], MstepIn[2], MstepIn[3])
        # Check for convergence
        if i > 0 and abs(self.s1 - self.s0) < ct:
            self.converged = True
            logging.info("Converged at iteration %s" % i)
            break
    return self
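# Hypothetical driver for fit() above. "GMM" is an assumed class name
# wrapping this method; scoreOnePoint and mStep are its helpers, defined
# elsewhere in the original source.
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors

sc = SparkContext(appName="gmm-em-demo")
points = sc.parallelize([Vectors.dense([1.0, 2.0]),
                         Vectors.dense([1.1, 1.9]),
                         Vectors.dense([8.0, 8.2]),
                         Vectors.dense([7.9, 8.4])])
model = GMM().fit(points, n_components=2, n_iter=100, ct=1e-3)
print(model.Weights)  # mixture weights, shape (n_components,)
print(model.Means)    # component means, shape (n_components, n_dim)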
def costs_movies(cluster, train, test):
    # Train a model for each candidate k and report the within-set
    # sum of squared errors on the held-out set.
    for c in cluster:
        m = KMeans.train(train, k=c, maxIterations=10, runs=3)
        wscc = m.computeCost(test)
        print("WSCC for k=" + str(c) + ": " + str(wscc))
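# Example sweep over candidate cluster counts; train_rdd and test_rdd are
# assumed RDDs of MLlib dense vectors (not part of the original snippet).
costs_movies([2, 3, 5, 10, 20], train_rdd, test_rdd)  # prints one WSCC line per k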
# Fitting the vector data and transforming with the scaler transformation
scaler_model = scaler.fit(final_df)
final_df = scaler_model.transform(final_df)
final_df.show(6)

# In[28]:
import numpy as np
import matplotlib.pyplot as plt
from time import time

cost = np.zeros(20)
for k in range(2, 20):
    start = time()
    kmeans = KMeans().setK(k).setSeed(1).setFeaturesCol("features")
    # Fit on a 10% sample, then score the full DataFrame
    model = kmeans.fit(final_df.sample(False, 0.1, seed=42))
    cost[k] = model.computeCost(final_df)
    end = time()
    print("K means from spark took {:.4f} seconds (k = {})".format(end - start, k))

# In[8]:
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.plot(range(2, 20), cost[2:20])
ax.set_xlabel('k')
ax.set_ylabel('cost')

# In[39]:
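# Note: KMeansModel.computeCost is deprecated in newer Spark releases; on
# those versions the sweep above can be scored with ClusteringEvaluator
# instead (a sketch reusing the final_df and model names from above).
from pyspark.ml.evaluation import ClusteringEvaluator

evaluator = ClusteringEvaluator(featuresCol="features")  # silhouette by default
predictions = model.transform(final_df)
print("Silhouette = {:.4f}".format(evaluator.evaluate(predictions)))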
row_ratings.cache()
# als_model = ALS.train(row_ratings, 50, 10, 0.1)

# Turn the ALS factor matrices into RDDs of dense vectors
movie_factors = als_model.productFeatures() \
    .map(lambda id_factor: (id_factor[0], Vectors.dense(id_factor[1])))
movie_vectors = movie_factors.map(lambda id_vector: id_vector[1])
# print(movie_vectors.first())
user_factors = als_model.userFeatures() \
    .map(lambda id_factor: (id_factor[0], Vectors.dense(id_factor[1])))
user_vectors = user_factors.map(lambda id_vector: id_vector[1])
# print(user_vectors.first())

# Train one k-means model on the movie factors and one on the user factors
movie_cluster_model = KMeans.train(movie_vectors, k=5, maxIterations=10, runs=3)
print("movie cluster model kmeans:")
print(movie_cluster_model)
user_cluster_model = KMeans.train(user_vectors, k=5, maxIterations=10, runs=3)
print("user cluster model kmeans:")
print(user_cluster_model)

# Predict the cluster for the first movie vector
movie_1 = movie_vectors.first()
movie_cluster = movie_cluster_model.predict(movie_1)
print(movie_cluster)
def K_means(self, data):
    # Distribute the data, train k-means with the configured parameters,
    # and return the predicted cluster id for every point.
    cluster_data = self.sc.parallelize(data)
    model = KMeans.train(cluster_data, self.k, self.iteration, self.runs)
    results = model.predict(cluster_data).collect()
    return results
from numpy import array
from math import sqrt
from pyspark.ml.clustering import KMeans  # DataFrame-based KMeans (pyspark.ml)

kmeans = KMeans(k=2, seed=1)

def mapper(line):
    # Keep the 14 fields that the selectExpr below renames
    return (line[0], line[1], line[2], line[3], line[4], line[5], line[6],
            line[7], line[8], line[9], line[10], line[11], line[12], line[13])

weather_features = latlongagain.map(mapper)
weather_features_df = weather_features.toDF()
weather_df = weather_features_df.selectExpr(
    "_1 as datetime1", "_2 as day", "_3 as month", "_4 as lat", "_5 as lng",
    "_6 as base", "_7 as humidity", "_8 as wind", "_9 as temp", "_10 as desc",
    "_11 as rain", "_12 as latlng", "_13 as borough", "_14 as features")
# udf_foo is a UDF defined elsewhere that rebuilds the features column
test1 = weather_df.withColumn("features", udf_foo("features"))
test1.printSchema()
model = kmeans.fit(test1.select('features'))
def kmeans_label(mat, scoring=False):
    # Cluster the matrix with scikit-learn k-means and return the labels
    kmeans_model = KMeans(n_clusters=NUMBER_CLUSTERS, random_state=1).fit(mat)
    labels = kmeans_model.labels_
    if scoring:
        print("kmeans score: ",
              silhouette_score(mat, labels, metric='euclidean'))
    return labels
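# Quick smoke test for kmeans_label with random data (NUMBER_CLUSTERS and
# the imports are assumptions; the original defines them elsewhere).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

NUMBER_CLUSTERS = 3
mat = np.random.RandomState(0).rand(100, 4)
labels = kmeans_label(mat, scoring=True)
print(labels[:10])  # one cluster id per row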
# print(first_part[:10])
first_part = sc.parallelize(first_part).repartition(5) \
    .persist(pyspark.StorageLevel.DISK_ONLY)

# Add the first sample's ids to data_used and remove those rows from originalRDD
data_used = first_part.map(lambda line: int(line[0])).collect()
data_used = set(data_used)
originalRDD = originalRDD.filter(lambda line: int(line[0]) not in data_used)

# Train a k-means model with 5 * the target number of clusters,
# then predict a cluster for every point in the first part
train_data = first_part.map(lambda line: array(line[2:]))
train_data = np.array(train_data.collect())
kmeans = KMeans(n_clusters=input_clusters * 5, random_state=0).fit(train_data)
print(kmeans.labels_)
results = first_part.map(lambda line: (kmeans.predict([line[2:]]), [int(line[0])])) \
    .map(lambda line: (line[0].tolist()[0], line[1])) \
    .reduceByKey(lambda a, b: a + b) \
    .persist(pyspark.StorageLevel.DISK_ONLY)

# Separate the clusters with only one point and add them to the retained set
RetainedSetRDD = results.filter(lambda line: len(line[1]) == 1) \
    .map(lambda line: line[1][0])
retained_set.update(set(RetainedSetRDD.collect()))
# print(retained_set)

# Running k-means on the candidates for the discard set
remaining = results.filter(lambda line: len(line[1]) > 1) \
    .flatMap(lambda line: line[1])