from operator import itemgetter
from itertools import groupby

from pyspark.mllib.clustering import KMeans
from pyspark.mllib.linalg import Vectors


def clusteringTake0(rawData):
    # Count records per label and print them, most frequent first.
    countsByLabel = rawData.map(lambda x: x.split(',').pop()).countByValue().items()
    countSorted = sorted(countsByLabel, key=itemgetter(1), reverse=True)
    for val in countSorted:
        print(val)

    def preprocessing(line):
        # Drop the three non-numeric columns and separate the label from the
        # numeric feature vector.
        values = line.split(",")
        del values[1:4]
        label = values.pop()
        vector = Vectors.dense([float(x) for x in values])
        return (label, vector)

    labelsAndData = rawData.map(preprocessing)

    data = labelsAndData.values().cache()

    model = KMeans.train(data, 2)

    for centerpoint in model.clusterCenters:
        print(centerpoint)

    # For each point, pair its assigned cluster with its original label and
    # count occurrences of each (cluster, label) combination.
    clusterLabelCount = labelsAndData.map(lambda x: (model.predict(x[1]), x[0])).countByValue()

    for (cluster, label), count in clusterLabelCount.items():
        print(str(cluster) + " " + label + " " + str(count))

    data.unpersist()
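

# clusteringScore, clusteringScore2, and buildAnomalyDetector below rely on a
# distToCentroid helper that is not defined in this file; a minimal sketch,
# assuming points are NumPy arrays or MLlib dense vectors:
from math import sqrt

import numpy as np


def distToCentroid(datum, model):
    # Euclidean distance from a point to the center of its assigned cluster.
    center = model.clusterCenters[model.predict(datum)]
    return sqrt(sum((np.array(datum) - np.array(center)) ** 2))
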
def clusteringScore3(normalizedLabelsAndData, k):
    # Cluster the feature vectors, then score the clustering by the
    # size-weighted average entropy of the labels within each cluster.
    model = KMeans.train(normalizedLabelsAndData.values(), k, maxIterations=10, epsilon=1.0e-6)
    labelsAndClusters = normalizedLabelsAndData.mapValues(model.predict)
    clustersAndLabels = labelsAndClusters.map(lambda t: (t[1], t[0]))
    labelsInCluster = clustersAndLabels.groupByKey().values()
    # Count occurrences of each distinct label within a cluster
    # (groupby needs sorted input to group equal labels together).
    labelCounts = labelsInCluster.map(
        lambda a: [len(list(g)) for _, g in groupby(sorted(a))])
    n = normalizedLabelsAndData.count()
    return labelCounts.map(lambda m: sum(m) * entropy(m)).sum() / n
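

# clusteringScore3 weights each cluster's label entropy by cluster size; the
# entropy helper itself is not defined in this file. A minimal sketch,
# assuming its input is a list of label counts:
from math import log


def entropy(counts):
    # Shannon entropy of the distribution described by a list of counts.
    counts = [c for c in counts if c > 0]
    total = float(sum(counts))
    return sum(-(c / total) * log(c / total) for c in counts)
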
def buildAnomalyDetector(data, normalizeFunction):
    normalizedData = data.map(normalizeFunction)
    normalizedData.cache()
    model = KMeans.train(normalizedData, 150, maxIterations=10, epsilon=1.0e-6)
    # Use the distance of the 100th-farthest point from its centroid as the
    # anomaly threshold; unpersist only after the cached RDD is no longer needed.
    distances = normalizedData.map(lambda datum: distToCentroid(datum, model))
    threshold = distances.top(100).pop()
    normalizedData.unpersist()

    def f(datum):
        # Flag a point as anomalous if it lies farther from its nearest
        # centroid than the threshold.
        return distToCentroid(normalizeFunction(datum), model) > threshold
    return f
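

# buildAnomalyDetector expects a normalizeFunction; a possible sketch of one
# (an assumption, not part of this file) that standardizes each feature to
# zero mean and unit variance, reusing the NumPy import above:
def buildNormalizationFunction(data):
    arrays = data.map(lambda v: np.array(v, dtype=float)).cache()
    n = arrays.count()
    means = arrays.reduce(lambda a, b: a + b) / n
    sumSquares = arrays.map(lambda v: v * v).reduce(lambda a, b: a + b)
    stdevs = np.sqrt(np.maximum(sumSquares / n - means * means, 0.0))
    stdevs[stdevs == 0] = 1.0  # guard against constant features
    arrays.unpersist()

    def normalize(datum):
        return (np.array(datum, dtype=float) - means) / stdevs
    return normalize
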
def visualizationInR(rawData):
    def preprocessing(line):
        # Keep only the numeric features: drop the three non-numeric columns
        # and the trailing label.
        values = line.split(",")
        del values[1:4]
        values.pop()
        return Vectors.dense([float(x) for x in values])

    data = rawData.map(preprocessing).cache()
    model = KMeans.train(data, 100, maxIterations=10, epsilon=1.0e-6)

    # Write a 5% sample as CSV lines of "cluster,feature1,feature2,..." for
    # plotting with external tools such as R.
    sample = data.map(lambda datum: str(model.predict(datum)) + "," + ",".join(str(x) for x in datum)) \
                 .sample(False, fraction=0.05, seed=None)
    sample.saveAsTextFile("file:///user/ds/sample")
# Check results:
display(results)

# Without using Pipelines:

# Clustering
from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array
from math import sqrt

# Load and parse input data
data = sc.textFile("data/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# Build and train the model: K=2, 10 iterations.
clusters = KMeans.train(parsedData, 2, 10)


# Evaluate the clustering
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))


WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)

print("Within Set Sum of Squared Error = " + str(WSSSE))

# Saving and loading the model
clusters.save(sc, "MyModels")
sameModel = KMeansModel.load(sc, "MyModels")
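
# A quick sanity check (a sketch): the reloaded model should assign the same
# cluster as the original model for any given point.
point = parsedData.first()
assert clusters.predict(point) == sameModel.predict(point)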


def clusteringScore2(data, k):
    # Average distance to the nearest centroid, trained with tighter
    # convergence settings.
    model = KMeans.train(data, k, maxIterations=10, epsilon=1.0e-6)
    return data.map(lambda datum: distToCentroid(datum, model)).mean()


def clusteringScore(data, k):
    # Average distance to the nearest centroid, trained with default settings.
    model = KMeans.train(data, k)
    return data.map(lambda datum: distToCentroid(datum, model)).mean()
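

# Example of choosing k by sweeping over candidate values with the scoring
# functions above (a sketch; `data` is assumed to be an RDD of numeric feature
# vectors such as the one built in clusteringTake0):
# for k in [10, 20, 40, 80, 160]:
#     print(k, clusteringScore2(data, k))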