def test_bisecting_kmeans(self):
    from numpy import array
    from pyspark.mllib.clustering import BisectingKMeans

    data = array([0.0, 0.0, 1.0, 1.0, 9.0, 8.0, 8.0, 9.0]).reshape(4, 2)
    bskm = BisectingKMeans()
    model = bskm.train(self.sc.parallelize(data, 2), k=4)
    p = array([0.0, 0.0])
    rdd_p = self.sc.parallelize([p])
    # predict() and computeCost() accept both a single point and an RDD
    self.assertEqual(model.predict(p), model.predict(rdd_p).first())
    self.assertEqual(model.computeCost(p), model.computeCost(rdd_p))
    self.assertEqual(model.k, len(model.clusterCenters))
def doublekmeans(data, year):
    data = data.loc[year, 'pm2.5']

    # k-means on the pm2.5 readings for the given year
    data = sc.parallelize(data)
    cluster_no = 2
    max_iter = 30
    clusters = KMeans.train(data, cluster_no, max_iter)

    # find the rows assigned to cluster 1
    tdata = data.collect()
    cluster_info = np.zeros(len(tdata))
    label = []
    for i in range(len(tdata)):
        cluster_info[i] = clusters.predict(np.array(tdata[i]))
        if cluster_info[i] == 1.0:
            label.append(i)

    # select the cluster-1 rows and prepare the feature matrix
    data1 = clean_data.drop(['Is', 'Ir', 'cbwd', 'No', 'month', 'year', 'hour', 'day'], axis=1)
    data2 = data1.iloc[label]
    data2 = sc.parallelize(data2.values)  # .values replaces the deprecated .as_matrix()

    # bisecting k-means on the selected rows
    cluster_no = 2
    max_iter = 30
    model = BisectingKMeans.train(data2, cluster_no, max_iter)
    return clusters.clusterCenters, model.clusterCenters, model.computeCost(data2)
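# A possible invocation of doublekmeans, assuming `clean_data` is the pandas
# DataFrame the function body references (indexed so that .loc[year, 'pm2.5']
# selects one year's readings) and a SparkContext `sc` is in scope; the year
# 2014 is a placeholder value.
km_centers, bkm_centers, bkm_cost = doublekmeans(clean_data, 2014)
print("k-means centers:", km_centers)
print("bisecting k-means centers:", bkm_centers)
print("bisecting k-means cost:", bkm_cost)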
def BisectingKMeans_clustering(data):
    # the parameter is the raw text RDD; the original shadowed it and
    # mapped over an undefined name
    parsedData = data.map(
        lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    print("\n-----------------------------------------------------------------------------")
    print("\n Cluster Centers (BisectingKMeans)")
    print("\n-----------------------------------------------------------------------------")
    print(model.clusterCenters)
def bisecting_k_means(unclustered_data, number_of_clusters,
                      max_iterations=5, seed=None,
                      min_divisible_cluster_size=1.0):
    if number_of_clusters < 1:
        raise ValueError("While clustering with BisectingKMeans, "
                         "the given number of clusters is not positive")
    model = BisectingKMeans.train(
        rdd=unclustered_data,
        k=number_of_clusters,
        maxIterations=max_iterations,
        seed=seed,
        minDivisibleClusterSize=min_divisible_cluster_size)
    cost = model.computeCost(unclustered_data)
    return [model, cost]
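# Minimal usage sketch for the helper above, assuming a local SparkContext;
# the sample points are made up for illustration.
from numpy import array
from pyspark import SparkContext

sc = SparkContext(appName="BisectingKMeansHelper")
points = sc.parallelize([array([0.0, 0.0]), array([1.0, 1.0]),
                         array([9.0, 8.0]), array([8.0, 9.0])])
model, cost = bisecting_k_means(points, number_of_clusters=2)
print("centers:", model.clusterCenters)
print("cost:", cost)
sc.stop()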
def main():
    compounds = load_data(sc, dataFile)
    # partitionBy returns a new RDD; assign the result (the original call
    # discarded it). Note that partitionBy requires a key-value RDD.
    compounds = compounds.partitionBy(executor_num)
    fingerprints = select_fingerprints(compounds).cache()
    # (id, smi, fp, name) -> fp; Python 3 lambdas cannot unpack tuples
    fp_only = fingerprints.map(lambda t: t[2])

    for x in [1500, 2000]:
        start_time = time.time()
        model = BisectingKMeans.train(fp_only, k=x)
        # print(model.clusterCenters)
        # print("Clusters ", len(model.clusterCenters))
        cost = model.computeCost(fp_only)
        # model.save(sc, baseFile + '/btreemodel')
        print("Bisecting " + str(cost))

        all_fps = fingerprints.collect()
        cluster_assignment = []
        end_time1 = time.time()
        print("Clustering Time taken ", x, end_time1 - start_time)
        for fp in all_fps:
            cluster_assignment.append('{} {} {}'.format(
                fp[1], fp[3], model.predict(fp[2])))
            # print("FP ", fp[0], " SMI: ", fp[1], " ", model.predict(fp[2]))
        end_time = time.time()
        print("Total Time taken ", x, end_time - start_time)

        if EVALUATION:
            header = sc.parallelize(["smiles Name Cluster"])
            clusters = sc.parallelize(cluster_assignment)
            output_file = header.union(clusters)
            # output_file.saveAsTextFile("../mols/resultsSpark/result")
            current_time_milli = int(round(time.time() * 1000))
            outputextension = str(current_time_milli)
            output_file.coalesce(1).saveAsTextFile(
                baseFile + "/output" + str(x) + "/result" + outputextension)
    sc.stop()
today = dt.datetime.today()
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(
            lambda x: (x[0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])

scaler = MinMaxScaler(inputCol="_1", outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))
num_clusters = 4

# Input into the algorithm
km = BisectingKMeans()
start = timer()
kme = km.train(vector_df, k=num_clusters, maxIterations=20, seed=2018)
end = timer()
print(end - start)

centers = kme.clusterCenters
err = vector_df.map(lambda x: (x[0], findCenter(x[0], centers))).collect()
per_clus = [0] * num_clusters
per_clus_num = [0] * num_clusters

# Silhouette value comparison
ag = 0
agi = 1200
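# The snippet above calls findCenter, which is not defined here. A plausible
# sketch, inferred from the call site: return the index of the nearest center
# by Euclidean distance (the exact contract of the original helper is an
# assumption).
import numpy as np

def findCenter(point, centers):
    distances = [np.linalg.norm(np.asarray(point) - np.asarray(c))
                 for c in centers]
    return int(np.argmin(distances))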
from numpy import array

from pyspark import SparkContext
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel

# spark-submit mllib_k_means.py

# Bisecting K-Means
# K-Means is one of the most widely used clustering algorithms: it is simple,
# easy to understand, and fast.
# K-Means is an unsupervised classification algorithm.

if __name__ == "__main__":
    sc = SparkContext(appName="KMeansExample")

    data = sc.textFile("kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    cost = model.computeCost(parsedData)
    print("Final centers: " + str(model.clusterCenters))
    print("Bisecting K-means Cost = " + str(cost))

    sc.stop()
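# Both this script and the following example read a whitespace-separated text
# file of points, one point per line. Spark's bundled
# data/mllib/kmeans_data.txt looks roughly like this (reproduced from memory,
# so treat it as illustrative):
#
#   0.0 0.0 0.0
#   0.1 0.1 0.1
#   0.2 0.2 0.2
#   9.0 9.0 9.0
#   9.1 9.1 9.1
#   9.2 9.2 9.2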
# from __future__ import print_function

# $example on$
from numpy import array
# $example off$

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import BisectingKMeans, BisectingKMeansModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonBisectingKMeansExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/kmeans_data.txt")
    parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

    # Build the model (cluster the data)
    model = BisectingKMeans.train(parsedData, 2, maxIterations=5)

    # Evaluate clustering
    cost = model.computeCost(parsedData)
    print("Bisecting K-means Cost = " + str(cost))
    # $example off$

    sc.stop()
tfidf = idf.transform(tf)

# In[84]:

# tfidf.collect()

# In[85]:

if algorithm == "K":
    clusters = KMeans.train(tfidf, 8, maxIterations=20,
                            initializationMode="random", seed=42)
else:
    clusters = BisectingKMeans.train(tfidf, 8, maxIterations=20, seed=42)

clusterCenters = clusters.clusterCenters

# In[86]:

documentModel = documents1.zip(tfidf)
# cluster_broadcast = sc.broadcast(clusters)

# In[87]:

def findErrorWC(document, clusters):
    documentWords = document[0]
    documentTfidf = document[1]
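# The notebook cell above is cut off mid-function. A possible body for
# findErrorWC, assuming it returns the squared distance from a document's
# tf-idf vector to its assigned cluster center; this completion is inferred
# from the surrounding code, not taken from the notebook itself.
import numpy as np

def findErrorWC_sketch(document, clusters):
    documentTfidf = document[1]
    center = clusters.clusterCenters[clusters.predict(documentTfidf)]
    return float(np.sum((documentTfidf.toArray() - np.asarray(center)) ** 2))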