from operator import itemgetter
from itertools import groupby
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.linalg import Vectors


def clusteringTake0(rawData):
    # Count how often each label (the last CSV field) occurs, most frequent first
    countsByLabel = rawData.map(lambda x: x.split(',').pop()).countByValue().items()
    countSorted = sorted(countsByLabel, key=itemgetter(1), reverse=True)
    for val in countSorted:
        print(val)

    def preprocessing(line):
        values = line.split(",")
        # Drop the three non-numeric (categorical) columns, keep the label separately
        del values[1:4]
        label = values.pop()
        vector = Vectors.dense([float(x) for x in values])
        return (label, vector)

    labelsAndData = rawData.map(preprocessing)
    data = labelsAndData.values().cache()

    model = KMeans.train(data, 2)
    for centerpoint in model.clusterCenters:
        print(centerpoint)

    # Count how many points of each label land in each cluster
    clusterLabelCount = labelsAndData.map(
        lambda x: (model.predict(x[1]), x[0])).countByValue()
    for labelCount in clusterLabelCount.items():
        print(str(labelCount[0][0]) + " " + str(labelCount[0][1]) + " " + str(labelCount[1]))

    data.unpersist()

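# A minimal driver sketch, not part of the original listing: it assumes an existing
# SparkContext named `sc`, and the input path below is hypothetical (one CSV
# connection record per line, label in the last field).
rawData = sc.textFile("file:///user/ds/kddcup.data")
clusteringTake0(rawData)
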
def clusteringScore3(normalizedLabelsAndData, k):
    # Cluster on the feature vectors only
    model = KMeans.train(normalizedLabelsAndData.values(), k,
                         maxIterations=10, epsilon=1.0e-6)
    # Assign each labeled point to a cluster
    labelsAndClusters = normalizedLabelsAndData.mapValues(model.predict)
    # Flip to (cluster, label) and gather the labels that end up in each cluster
    clustersAndLabels = labelsAndClusters.map(lambda t: (t[1], t[0]))
    labelsInCluster = clustersAndLabels.groupByKey().values()
    # Per cluster, count occurrences of each distinct label
    # (groupby only groups consecutive equal items, so sort first)
    labelCounts = labelsInCluster.map(
        lambda a: [len(list(g)) for _, g in groupby(sorted(a))])
    n = normalizedLabelsAndData.count()
    # Average entropy of the label distribution, weighted by cluster size
    return labelCounts.map(lambda m: sum(m) * entropy(m)).sum() / n

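# clusteringScore3 relies on an entropy() helper that is not defined in this section.
# A minimal sketch of what it is assumed to compute: the Shannon entropy of a list of
# label counts, i.e. how mixed the labels are inside one cluster.
from math import log

def entropy(counts):
    counts = [c for c in counts if c > 0]
    total = float(sum(counts))
    # -sum(p * log(p)) over the label proportions
    return -sum((c / total) * log(c / total) for c in counts)
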
def buildAnomalyDetector(data, normalizeFunction):
    normalizedData = data.map(normalizeFunction)
    normalizedData.cache()

    model = KMeans.train(normalizedData, 150, maxIterations=10, epsilon=1.0e-6)

    # Use the 100th-farthest distance to a centroid as the anomaly threshold
    distances = normalizedData.map(lambda datum: distToCentroid(datum, model))
    threshold = distances.top(100).pop()

    normalizedData.unpersist()

    # A point is anomalous if it lies farther from its nearest centroid than the threshold
    def f(datum):
        return distToCentroid(normalizeFunction(datum), model) > threshold
    return f

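# A usage sketch, not in the original listing: `parsedData` and `normalizeFunction`
# are assumed to exist elsewhere, e.g. parsed feature vectors and a standard-score
# normalization built from per-feature means and standard deviations.
def anomalies(parsedData, normalizeFunction):
    isAnomaly = buildAnomalyDetector(parsedData, normalizeFunction)
    # Keep only the points the detector flags as beyond the distance threshold
    return parsedData.filter(isAnomaly)
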
def visualizationInR(rawData):
    def preprocessing(line):
        values = line.split(",")
        # Drop the categorical columns and the trailing label
        del values[1:4]
        values.pop()
        return Vectors.dense([float(x) for x in values])

    data = rawData.map(preprocessing).cache()

    model = KMeans.train(data, 100, maxIterations=10, epsilon=1.0e-6)

    # Write "cluster,feature1,feature2,..." lines for a 5% sample, to be plotted in R
    sample = data.map(
        lambda datum: str(model.predict(datum)) + "," + ",".join(str(x) for x in datum)
    ).sample(False, fraction=0.05, seed=None)
    sample.saveAsTextFile("file:///user/ds/sample")

# Check results: display(results)

# Without using Pipelines:
# Clustering
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations.
clusters = KMeans.train(parsedData, 2, 10)

# Evaluate the clustering
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Saving and loading the model
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")

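# For comparison, a sketch of the equivalent DataFrame-based API (pyspark.ml). This
# block is illustrative, not part of the original listing; it assumes an existing
# SparkSession named `spark`, and the tiny inline dataset is hypothetical.
from pyspark.ml.clustering import KMeans as MLKMeans
from pyspark.ml.linalg import Vectors as MLVectors

df = spark.createDataFrame(
    [(MLVectors.dense([0.0, 0.0]),), (MLVectors.dense([9.0, 9.0]),)],
    ["features"])
mlModel = MLKMeans(k=2, maxIter=10).fit(df)
for center in mlModel.clusterCenters():
    print(center)
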
def clusteringScore2(data, k):
    # Same score as clusteringScore, but with an explicit iteration cap and a
    # tighter convergence threshold than the defaults
    model = KMeans.train(data, k, maxIterations=10, epsilon=1.0e-6)
    return data.map(lambda datum: distToCentroid(datum, model)).mean()

def clusteringScore(data, k):
    # Score a choice of k by the mean distance from each point to its nearest centroid
    model = KMeans.train(data, k)
    return data.map(lambda datum: distToCentroid(datum, model)).mean()

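# distToCentroid is used by clusteringScore, clusteringScore2, and buildAnomalyDetector
# but is not defined in this section. A minimal sketch of the assumed helper: the
# Euclidean distance from a point to the center of the cluster it is assigned to.
from math import sqrt

def distToCentroid(datum, model):
    cluster = model.predict(datum)
    center = model.clusterCenters[cluster]
    return sqrt(sum((a - b) ** 2 for a, b in zip(datum, center)))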