Example #1
 def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
     # Requires: from pyspark.mllib.clustering import KMeans
     # Distribute the rows of Matrix as an RDD, fit a k-means model,
     # and return the predicted cluster index for every row.
     cluster_data = self.sc.parallelize(Matrix)
     model = KMeans.train(cluster_data, Kcluster, MaxIterations, runs)
     results = model.predict(cluster_data).collect()
     return results
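For readers trying this outside of a wrapper class, a minimal standalone sketch of the same train/predict pattern might look like the following. The SparkContext, the toy matrix, and the parameter values are illustrative assumptions, not part of the original snippet; note also that the runs argument used in these examples is ignored from Spark 2.0 onwards.

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans

sc = SparkContext("local", "kmeans-sketch")  # hypothetical local context

# Made-up data: two well-separated groups of 2-D points.
matrix = [[0.0, 0.0], [0.1, 0.2], [9.0, 9.0], [9.1, 8.8]]
cluster_data = sc.parallelize(matrix)

# KMeans.train(rdd, k, maxIterations, ...) returns a KMeansModel.
model = KMeans.train(cluster_data, 2, maxIterations=10)
print(model.predict(cluster_data).collect())  # e.g. [0, 0, 1, 1]
sc.stop()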
Example #2
# Requires: from pyspark.mllib.clustering import KMeans
movie_cluster_model = KMeans.train(movie_vectors, k=5, maxIterations=10, runs=3)
print("movie cluster model kmeans :")
print(movie_cluster_model)
user_cluster_model = KMeans.train(user_vectors, k=5, maxIterations=10, runs=3)
print("user cluster model kmeans :")
print(user_cluster_model)

# predict
movie_1 = movie_vectors.first()
movie_cluster = movie_cluster_model.predict(movie_1)
print(movie_cluster)

# evaluation
movie_cost = movie_cluster_model.computeCost(movie_vectors)
print("WCSS for movies :" + str(movie_cost))
train_test_split_movies = movie_vectors.randomSplit((0.6, 0.4), 123)
train_movies = train_test_split_movies[0]
test_movies = train_test_split_movies[1]


def costs_movies(cluster, train, test):
    # Train one model per candidate k and report its WCSS on the held-out split.
    for c in cluster:
        m = KMeans.train(train, k=c, maxIterations=10, runs=3)
        wcss = m.computeCost(test)
        print("WCSS for k=" + str(c) + ": " + str(wcss))
Example #3
	def K_means(self, data):
		# Requires: from pyspark.mllib.clustering import KMeans
		# self.k, self.iteration and self.runs come from the enclosing class.
		cluster_data = self.sc.parallelize(data)
		model = KMeans.train(cluster_data, self.k, self.iteration, self.runs)
		results = model.predict(cluster_data).collect()
		return results
Example #4
 def K_means(self, data):
     # Requires: from pyspark.mllib.clustering import KMeans
     cluster_data = self.sc.parallelize(data)
     model = KMeans.train(cluster_data, self.k, self.iteration, self.runs)
     results = model.predict(cluster_data).collect()
     return results
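Examples #1, #3 and #4 all assume an enclosing class that provides a SparkContext and the clustering parameters, but that class is not shown in the snippets. A hypothetical minimal wrapper, with the class name and constructor inferred from the method bodies (and assuming a Spark 1.x/2.x build where KMeans.train still accepts the runs argument), might look like this:

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans


class ClusterJob(object):  # hypothetical class name, not from the originals
    def __init__(self, k=2, iteration=10, runs=10):
        self.sc = SparkContext("local", "kmeans-wrapper")
        self.k = k
        self.iteration = iteration
        self.runs = runs

    def K_means(self, data):
        cluster_data = self.sc.parallelize(data)
        model = KMeans.train(cluster_data, self.k, self.iteration, self.runs)
        return model.predict(cluster_data).collect()


job = ClusterJob(k=2)
print(job.K_means([[0.0, 0.0], [0.2, 0.1], [8.0, 9.0], [8.5, 9.2]]))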
Example #5
    pyspark.StorageLevel.DISK_ONLY)

# adding the first sample's data to data_used and removing those rows from originalRDD
data_used = first_part.map(lambda line: int(line[0])).collect()
data_used = set(data_used)
originalRDD = originalRDD.filter(lambda line: int(line[0]) not in data_used)

# Requires: import pyspark; import numpy as np; from numpy import array;
# from sklearn.cluster import KMeans  (this block uses scikit-learn, not MLlib)
# Train a k-means model with 5 * the requested number of clusters,
# then predict a cluster for every point in the first part.
train_data = first_part.map(lambda line: array(line[2:]))
train_data = np.array(train_data.collect())
kmeans = KMeans(n_clusters=input_clusters * 5, random_state=0).fit(train_data)
print(kmeans.labels_)
results = (first_part
           .map(lambda line: (kmeans.predict([line[2:]]), [int(line[0])]))
           .map(lambda line: (line[0].tolist()[0], line[1]))
           .reduceByKey(lambda a, b: a + b)
           .persist(pyspark.StorageLevel.DISK_ONLY))

# separating the clusters with only one point and adding those points to the retained set
RetainedSetRDD = (results
                  .filter(lambda line: len(line[1]) == 1)
                  .map(lambda line: line[1][0]))
retained_set.update(set(RetainedSetRDD.collect()))
# print(retained_set)

# Running k-means on the candidates for the discard set
remaining = (results
             .filter(lambda line: len(line[1]) > 1)
             .flatMap(lambda line: line[1]))
remaining = set(remaining.collect())
# print(remaining)
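As a side note, the singleton test that feeds the retained set can be reproduced without Spark. The sketch below is illustrative only; the toy points and cluster count are made up and are not part of the original script.

import numpy as np
from collections import defaultdict
from sklearn.cluster import KMeans

# Two points close together plus one isolated point.
points = np.array([[0.0, 0.0], [0.1, 0.0], [10.0, 10.0]])
labels = KMeans(n_clusters=2, random_state=0).fit_predict(points)

# Group point indices by cluster label, mirroring the reduceByKey step above.
clusters = defaultdict(list)
for idx, label in enumerate(labels):
    clusters[label].append(idx)

# Clusters with exactly one member become retained-set candidates.
retained = {members[0] for members in clusters.values() if len(members) == 1}
print(retained)  # {2}: the isolated point at (10, 10)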