import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import StreamingKMeansModel


def test_streaming_kmeans():
    # get_data_from_db() and `scaler` are defined elsewhere in the project:
    # the former loads the training records, the latter is an already-fitted scaler.
    records = get_data_from_db()

    conf = SparkConf().setAppName("testingClusters").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    # Start from three hand-picked centers with equal weights.
    initCenters = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.5, 0.5, 0.5]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)

    # First batch update with the full training set.
    recordRDD = sc.parallelize(records)
    stkm = stkm.update(data=recordRDD, decayFactor=1.0, timeUnit="batches")
    for center in stkm.centers:
        print(center)

    # Score a new record against the updated centers.
    sample_data = np.array([46.5, 23.0, 1034.0]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)  # similarity decays with distance
        print(center, distance, similarity)

    # Fold the new point into the model, then score a second record.
    stkm = stkm.update(sc.parallelize(record_scaled), 1.0, "points")
    sample_data = np.array([46.2, 23.5, 1034.32]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)
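# The test above assumes two names defined elsewhere: get_data_from_db(),
# which loads the training records, and a fitted `scaler`. A minimal sketch
# of what those could look like, assuming scikit-learn's StandardScaler and
# synthetic three-feature data (both the helper and the feature ranges are
# illustrative, not the project's actual implementation):
import numpy as np
from sklearn.preprocessing import StandardScaler

def get_data_from_db():
    # Hypothetical stand-in for the real database read: random
    # three-dimensional records roughly in the ranges used by the test.
    rng = np.random.default_rng(42)
    return rng.uniform([40.0, 20.0, 1000.0], [50.0, 26.0, 1100.0], size=(100, 3))

scaler = StandardScaler().fit(get_data_from_db())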
import sys
import logging

from pyspark.mllib.clustering import StreamingKMeansModel
from sklearn.metrics import recall_score, precision_score

# Tail of the loop that reads the ground-truth labels; the loop header
# defining `c` (and the initialization of `truePredictions` and `points`)
# is elided in this fragment.
truePredictions.append(int(c[0]))

# Load the test points; the dataset name comes from the command line.
with open("datasets/" + sys.argv[1] + ".txt", "r") as fichero:
    for linea in fichero:
        points.append([float(x) for x in linea.strip("\n").split()])

# Rebuild the model from each persisted snapshot (centers and weights) and
# classify every test point; the metrics below reflect the last snapshot
# yielded by the cursor.
for document in cursor:
    centers = document["clusterCenters"]
    weights = document["clusterWeights"]
    stkm = StreamingKMeansModel(centers, weights)
    predictions = []
    for point in points:
        predictions.append(stkm.predict(point))

recall = recall_score(truePredictions, predictions, average='weighted')
precision = precision_score(truePredictions, predictions, average='weighted')
f1Score = 2 * (precision * recall) / (precision + recall)
logging.info("Recall = " + str(recall))
logging.info("Precision = " + str(precision))
logging.info("F1-Score = " + str(f1Score))

with open(sys.argv[1] + "Resultados.txt", "w") as f:
    f.write("Recall = " + str(recall))
    f.write("\n")
    f.write("Precision = " + str(precision))
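# `cursor` above is expected to yield documents holding persisted model
# snapshots with "clusterCenters" and "clusterWeights" fields. A minimal
# sketch of obtaining such a cursor with pymongo (the database and
# collection names are assumptions, not the project's actual configuration):
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
cursor = client["models"]["streaming_kmeans"].find()  # hypothetical db/collection names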
from json import loads

import matplotlib.pyplot as plt
from kafka import KafkaConsumer
from pyspark.mllib.clustering import StreamingKMeansModel
from pyspark.mllib.linalg import Vectors

# Seed the streaming model with the centers learned offline; `offlineModel`,
# `initWeights`, and `ssc` are defined earlier in the script.
stkm = StreamingKMeansModel(offlineModel.clusterCenters, initWeights)

# Alternative DStream-based ingestion, kept for reference:
# kafkaStream = KafkaUtils.createDirectStream(ssc, ['test'], {'metadata.broker.list': 'localhost:9092'})
# lines = kafkaStream.map(lambda line: array([float(x) for x in line.split('\t')]))

consumer = KafkaConsumer('test',
                         bootstrap_servers=['localhost:9092'],
                         value_deserializer=lambda x: loads(x.decode('utf-8')),
                         consumer_timeout_ms=10000)

# One color per cluster index.
colors = ['r', 'k', 'b', 'grey', 'darkorange', 'm', 'y', 'c', 'gold',
          'slateblue', 'beige', 'coral', 'g', 'peru', 'pink']

print("Collecting data")
for message in consumer:
    testData = Vectors.dense([float(x) for x in message.value.strip().split()])
    # Color each incoming point by the cluster the model assigns it to.
    plt.plot(testData[0], testData[1], 'o', color=colors[stkm.predict(testData)])
print("About to plot")
plt.show()

# Only needed for the commented-out DStream path above.
ssc.start()
ssc.awaitTermination()
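# The consumer above decodes each message as a JSON-encoded string of
# whitespace-separated floats. A minimal producer sketch matching that wire
# format (the topic name and the record contents are illustrative):
from json import dumps
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                         value_serializer=lambda v: dumps(v).encode('utf-8'))
producer.send('test', "46.5 23.0 1034.0")  # one space-separated record per message
producer.flush()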