from pyspark.mllib.clustering import PowerIterationClustering


def power_iteration_clustering(unclustered_data, number_of_clusters,
                               max_iterations=10, init_mode='random'):
    """Train a PowerIterationClustering model and collect its assignments."""
    if number_of_clusters < 1:
        raise ValueError("While clustering with PowerIterationClustering, "
                         "the given number of clusters is not positive")
    model = PowerIterationClustering.train(rdd=unclustered_data,
                                           k=number_of_clusters,
                                           maxIterations=max_iterations,
                                           initMode=init_mode)
    assignments = model.assignments().collect()
    return [model, assignments]
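# A minimal usage sketch for the helper above. The (srcId, dstId, similarity)
# triples and the SparkContext name `sc` are illustrative assumptions, not
# part of the original snippet:
similarities = sc.parallelize([
    (0, 1, 0.9), (0, 2, 0.9), (1, 2, 0.9),   # tightly connected triple
    (3, 4, 0.9), (3, 5, 0.9), (4, 5, 0.9),   # second tight triple
    (2, 3, 0.1),                              # weak bridge between them
])
model, assignments = power_iteration_clustering(similarities, number_of_clusters=2)
for a in assignments:
    print("%d -> %d" % (a.id, a.cluster))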
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import PowerIterationClustering


def Power_Iteration_Clustering(X, K, Adj=False, Lp_norm=2, sigma=1, max_Iter=20):
    '''
    Input:
        X: a [n_samples, n_samples] numpy array if Adj=True, or a
           [n_samples_a, n_features] array otherwise;
        K: int, the number of clusters;
        Adj: boolean, indicating whether the adjacency matrix is
             pre-computed. Default: False;
        Lp_norm: int, indicating which L^p norm is used. Default: 2;
        sigma: float, the variance for the Gaussian (aka RBF) kernel.
               Default: 1;
        max_Iter: int, maximum number of iterations of the PIC algorithm.
                  Default: 20.
    Output:
        cluster labels: a [n_samples, ] numpy array;
        node ids: a list of length "n_samples".
    '''
    # Set up the PySpark context
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    if Adj:
        # Concatenate the point ID as the last column of the array
        X1 = np.concatenate((X, np.array(range(X.shape[0]), ndmin=2).T), axis=1)
        data = sc.parallelize(X1.tolist())
        # Reshape the RDD so each entry is a tuple of the form (ID, distance_list)
        Adj_matRDD = data.map(lambda x: (int(x[len(x) - 1]), x[:(len(x) - 1)]))
    else:
        X1 = np.concatenate((X, np.array(range(X.shape[0]), ndmin=2).T), axis=1)
        data = sc.parallelize(X1.tolist())
        data = data.map(lambda x: (int(x[len(x) - 1]), x[:(len(x) - 1)]))
        # Compute the pairwise distances between points
        Adj_matRDD = data.map(lambda item: Distance_Computing(item, DF=X, p=Lp_norm))
    # Transform the affinity matrix so each element has the form (i, j, s_{ij})
    A_RDD = Adj_matRDD.flatMap(lambda item: Affinity(item, sigma=sigma))
    # Cluster the data into K classes using PowerIterationClustering
    # (the original passed a hard-coded 100 here, ignoring max_Iter)
    model = PowerIterationClustering.train(A_RDD, K, max_Iter)
    cluster_id = model.assignments().collect()
    sc.stop()
    IDs = [k.id for k in cluster_id]
    clusters = [k.cluster for k in cluster_id]
    # Sort the cluster label list by ascending ID
    IDs_sorted = sorted(IDs)
    clusters_sorted = np.array(clusters)[np.argsort(IDs)]
    return clusters_sorted, IDs_sorted
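# `Distance_Computing` and `Affinity` are referenced above but not defined in
# this snippet. A minimal sketch of what they presumably do, assuming
# `Distance_Computing` maps (ID, point) to (ID, distance_list) and `Affinity`
# expands one row into (i, j, s_ij) Gaussian-kernel triples; the signatures
# and bodies here are guesses, not the original implementations:
def Distance_Computing(item, DF, p=2):
    # L^p distance from this point to every row of the full data matrix DF
    i, point = item
    dists = np.sum(np.abs(DF - np.asarray(point)) ** p, axis=1) ** (1.0 / p)
    return (i, dists.tolist())


def Affinity(item, sigma=1):
    # Gaussian (RBF) kernel on the distances; keep only the upper triangle
    # (j > i) since PIC expects each undirected edge to appear once
    i, dists = item
    return [(i, j, float(np.exp(-d ** 2 / (2 * sigma ** 2))))
            for j, d in enumerate(dists) if j > i]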
import sys

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.clustering import PowerIterationClustering

# This fragment opens mid-expression: the lines that build `trial_df` (an RDD
# mapped with `lambda x: (x, )` then converted via .toDF()) and the `scaler`
# estimator are missing from the source.
scalerModel = scaler.fit(trial_df)
vec_df = spark.createDataFrame(
    scalerModel.transform(trial_df).select("scaled_1").rdd.map(
        lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2]))))

# Create a RowMatrix from the transpose of vec_df
spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd
vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect())
mat = RowMatrix(vector_df)
bun = mat.rows.collect()
num_clusters = 3

# Column similarities become the (i, j, similarity) input for PIC
pre = sc.parallelize(mat.columnSimilarities().entries.map(
    lambda e: (e.i, e.j, e.value)).collect())
model = PowerIterationClustering.train(pre, 3, 20, "random")
err = model.assignments().map(lambda x: (Vectors.dense(bun[0][x.id], bun[1][
    x.id], bun[2][x.id]), x.cluster)).collect()

# Silhouette value
ag = 0
agi = 1700
for er in err:
    avg = [0] * num_clusters
    avgi = [0] * num_clusters
    for e in err:
        avg[e[1]] += Vectors.squared_distance(er[0], e[0])
        avgi[e[1]] += 1
    a = avg[er[1]] / avgi[er[1]]
    b = sys.maxsize  # sys.maxint is Python 2 only
    for i in range(len(avg)):
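# The fragment above cuts off inside the silhouette loop. A minimal sketch of
# how it presumably continues, following the standard silhouette definition
# (a = mean distance within the point's own cluster, b = smallest mean
# distance to any other cluster); the use of `ag` to accumulate per-point
# scores is an assumption:
#     for i in range(len(avg)):
#         if i != er[1] and avgi[i] > 0:
#             b = min(b, avg[i] / avgi[i])
#     ag += (b - a) / max(a, b)
# silhouette = ag / len(err)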
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
# $example on$
from pyspark.mllib.clustering import PowerIterationClustering, PowerIterationClusteringModel
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PowerIterationClusteringExample")  # SparkContext

    # $example on$
    # Load and parse the data
    data = sc.textFile("data/mllib/pic_data.txt")
    similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))

    # Cluster the data into two classes using PowerIterationClustering
    model = PowerIterationClustering.train(similarities, 2, 10)

    model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))

    # Save and load model
    model.save(sc, "target/org/apache/spark/PythonPowerIterationClusteringExample/PICModel")
    sameModel = PowerIterationClusteringModel\
        .load(sc, "target/org/apache/spark/PythonPowerIterationClusteringExample/PICModel")
    # $example off$

    sc.stop()
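# For reference, each line of pic_data.txt is expected to parse into a
# (srcId, dstId, similarity) triple; the literal values below are
# illustrative, not the shipped dataset:
#     0 1 1.0
#     0 2 1.0
#     1 2 1.0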
meta = raw_meta.map(parse_meta)
auths = raw_auth.map(parse_auth)

# Assign a unique integer ID to every distinct author name
uniq_auths = auths.flatMap(lambda d: d["authors"]).distinct()
uniq_auths_int = uniq_auths.zipWithIndex().collectAsMap()
int_auth = sc.parallelize(int_auth_map(uniq_auths_int), 100)

# Count co-authorship pairs and sort them by descending count;
# `lambda (a, b): ...` tuple unpacking is Python 2 only, so index explicitly
authID_pairs = auths.flatMap(lambda x: auth_pairs(x, uniq_auths_int))
auth_ct = authID_pairs.map(auth_pairs_ct).reduceByKey(lambda a, b: a + b, numPartitions=100)
auth_ct = auth_ct.map(lambda kv: (kv[1], kv[0])) \
                 .sortByKey(ascending=False, numPartitions=100) \
                 .map(lambda kv: (kv[1], kv[0]))

# Cluster the co-authorship graph with PowerIterationClustering
auth_net_edges = auth_ct.map(auth_ct_to_three_tpl)
auth_cluster_mdl = PowerIterationClustering.train(auth_net_edges, 200, 25)
result = sc.parallelize(sorted(auth_cluster_mdl.assignments().collect(), key=lambda x: x.id), 100)
result = result.map(result_to_tpl)

# Join the integer IDs back to author names for both nodes and edges
auth_net_clust = int_auth.join(result).map(lambda d: (d[1][0], d[1][1]))
auth_net_edges_names = auth_net_edges.map(lambda d: (d[0], (d[1], d[2])))
auth_net_edges_names = int_auth.join(auth_net_edges_names).map(lambda d: (d[1][1][0], (d[1][0], d[1][1][1])))
auth_net_edges_names = int_auth.join(auth_net_edges_names).map(lambda d: (d[1][1][0], d[1][0], d[1][1][1]))

auth_nodes_out = "s3://XX/auth_nodes"
auth_nodes_lines = auth_net_clust.map(auths_nodes_clusters_to_csv)
auth_nodes_lines.coalesce(1).saveAsTextFile(auth_nodes_out)
auth_edges_out = "s3://XX/auth_edges"
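# `auth_ct_to_three_tpl` and `result_to_tpl` are used above but defined
# elsewhere. A minimal sketch of plausible implementations, assuming
# co-author counts arrive as ((idA, idB), count) pairs; these bodies are
# guesses, not the original code:
def auth_ct_to_three_tpl(pair_count):
    # ((idA, idB), count) -> (idA, idB, count) edge triple for PIC
    (a, b), ct = pair_count
    return (a, b, float(ct))


def result_to_tpl(assignment):
    # PIC Assignment -> (node_id, cluster_label) tuple for joining
    return (assignment.id, assignment.cluster)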
if __name__ == "__main__":
    # Create the Spark context
    sc = SparkContext(appName="Spectral Clustering in Spark")

    # Read the input file and pair every point with its row index;
    # `lambda (k, v): ...` is Python 2 only, so index the tuple explicitly
    input_file_RDD = sc.textFile(input_file)
    withIndex = input_file_RDD.map(split_function).zipWithIndex()
    indexKey = withIndex.map(lambda kv: (kv[1], kv[0]))

    # Build the full pairwise affinity matrix and cluster it
    C = indexKey.cartesian(indexKey)
    input_affinities = C.map(affinities)
    model = PowerIterationClustering.train(input_affinities, num_clusters, upper_bound)
    joined = sc.parallelize(sorted(indexKey.join(model.assignments()).collect()))

    # two_clusters / three_clusters populate the x*, y* coordinate lists
    if num_clusters == 2:
        two_clusters(joined)
        plt.scatter(x1, y1, c='r')
        plt.scatter(x2, y2, c='g')
        plt.show()
    elif num_clusters == 3:
        three_clusters(joined)
        plt.scatter(x1, y1, c='r')
        plt.scatter(x2, y2, c='g')
        plt.scatter(x3, y3, c='b')
        plt.show()
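# The `affinities` function mapped over the cartesian pairs above is not
# shown. A minimal sketch under the usual spectral-clustering assumption of a
# Gaussian kernel on Euclidean distance; the signature and sigma value are
# guesses, not the original helper:
import math

def affinities(pair, sigma=1.0):
    # ((i, point_i), (j, point_j)) -> (i, j, similarity)
    (i, p1), (j, p2) = pair
    sq_dist = sum((a - b) ** 2 for a, b in zip(p1, p2))
    return (i, j, math.exp(-sq_dist / (2 * sigma ** 2)))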
# Keep only the papers whose subject is "math" and give each a unique integer ID
meta_math = meta.map(lambda d: (d["id"], d["subj"])).filter(lambda x: x[1] == u"math")
uniq_papers = meta_math.map(lambda d: d[0]).distinct()
uniq_papers_int = uniq_papers.zipWithIndex().collectAsMap()
int_paper = sc.parallelize(int_paper_map(uniq_papers_int), numSlices=numParts)
paper_int = int_paper.map(lambda x: (x[1], x[0]))

# Pair every paper's word set with every other and compute Jaccard similarities
intID_words = paper_int.join(id_words, numPartitions=numParts).map(lambda x: (x[1][0], x[1][1]))
pairs = intID_words.cartesian(intID_words)
sims = pairs.map(lambda x: (x[0][0], x[1][0], compute_jaccard(x[0][1], x[1][1])))
# Drop self-pairs and identical documents (similarity exactly 1.0)
sims = sims.filter(lambda x: x[2] < 1.0)

# Cluster the similarity graph and join cluster labels back to paper IDs
subj_cluster_mdl = PowerIterationClustering.train(sims, 5, 25)
result = sc.parallelize(sorted(subj_cluster_mdl.assignments().collect(), key=lambda x: x.id), numSlices=numParts)
result = result.map(result_to_tpl)
words_net_clust = int_paper.join(result, numPartitions=numParts).map(lambda d: (d[1][0], d[1][1]))
words_net_clust_subj = words_net_clust.join(subj, numPartitions=numParts).map(lambda d: (d[0], d[1][1], d[1][0]))

# Keep edges with nonzero similarity and map integer IDs back to paper names
words_edges = sims.filter(lambda x: x[2] > 0.0)
words_edges_names = words_edges.map(lambda d: (d[0], (d[1], d[2])))
words_edges_names = int_paper.join(words_edges_names, numPartitions=numParts).map(lambda d: (d[1][1][0], (d[1][0], d[1][1][1])))
words_edges_names = int_paper.join(words_edges_names, numPartitions=numParts).map(lambda d: (d[1][1][0], d[1][0], d[1][1][1]))
words_nodes_out = "s3://XX/words_nodes"
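# `compute_jaccard` is called above but defined elsewhere. A minimal sketch,
# assuming each paper's words arrive as an iterable of tokens; this body is a
# guess, not the original helper:
def compute_jaccard(words_a, words_b):
    # Jaccard similarity: |intersection| / |union| of the two word sets
    sa, sb = set(words_a), set(words_b)
    union = sa | sb
    return float(len(sa & sb)) / len(union) if union else 0.0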
# Start working with Spark
app_name = "PIC_Amazon_20030601"
source_path = "/home/ophidian/dataset/Amazon20030601_transform.txt"
my_model_path = "/home/ophidian/pyspark_models/PIC_amazon0601"
out_path = "/home/ophidian/pyspark_results/PIC_amazon0601.result"  # leading "/" was missing
conf = SparkConf().setAppName(app_name)
sc = SparkContext(conf=conf)

"""
# example in document
data = sc.textFile("data/mllib/pic_data.txt")
similarities = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))
model = PowerIterationClustering.train(similarities, 2, 10)
"""

# Use PySpark PIC clustering on the vertices
# Load data
data = sc.textFile(source_path)
weighted_edges = data.map(lambda line: tuple([float(x) for x in line.split(' ')]))

# Cluster the data into 10 classes using PowerIterationClustering
model = PowerIterationClustering.train(weighted_edges, 10, 100)

#model.assignments().foreach(lambda x: print(str(x.id) + " -> " + str(x.cluster)))
# foreach() runs on the executors, where the driver's file handle is not
# available (and the original appended "\n" to write()'s return value), so
# collect the assignments to the driver and write them there
with open(out_path, "w") as out_file:
    for x in model.assignments().collect():
        out_file.write(str(x.id) + " -> " + str(x.cluster) + "\n")

# Save and load model
model.save(sc, my_model_path)
#sameModel = PowerIterationClusteringModel.load(sc, my_model_path)
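# A minimal sketch of reloading the saved model and tallying cluster sizes,
# mirroring the commented-out load above; illustrative only:
# from pyspark.mllib.clustering import PowerIterationClusteringModel
# sameModel = PowerIterationClusteringModel.load(sc, my_model_path)
# cluster_sizes = sameModel.assignments().map(lambda a: (a.cluster, 1)) \
#                          .reduceByKey(lambda x, y: x + y).collect()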