def test_predictOn_model(self):
    """Test that the model predicts correctly on toy data."""
    stkm = StreamingKMeans()
    stkm._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    # One point per batch, each nearest to a different initial center.
    predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
    predict_stream = self.ssc.queueStream(predict_data)
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        rdd_collect = rdd.collect()
        if rdd_collect:
            result.append(rdd_collect)

    predict_val.foreachRDD(update)
    t = time()
    self.ssc.start()
    self._ssc_wait(t, 6.0, 0.01)
    self.assertEqual(result, [[0], [1], [2], [3]])
def test_predictOn_model(self):
    """Test that the model predicts correctly on toy data."""
    stkm = StreamingKMeans()
    stkm._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
    predict_stream = self.ssc.queueStream(predict_data)
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        rdd_collect = rdd.collect()
        if rdd_collect:
            result.append(rdd_collect)

    predict_val.foreachRDD(update)
    self.ssc.start()

    def condition():
        self.assertEqual(result, [[0], [1], [2], [3]])
        return True

    eventually(condition, catch_assertions=True)
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import StreamingKMeansModel

# `get_data_from_db` and `scaler` (a fitted scikit-learn scaler) are assumed
# to be defined elsewhere in the surrounding module.


def test_streaming_kmeans():
    records = get_data_from_db()

    conf = SparkConf().setAppName("testingClusters").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    initCenters = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.5, 0.5, 0.5]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)

    # First update: treat the whole record set as one batch.
    recordRDD = sc.parallelize(records)
    stkm = stkm.update(data=recordRDD, decayFactor=1.0, timeUnit=u"batches")
    for center in stkm.centers:
        print(center)

    sample_data = np.array([46.5, 23.0, 1034.0]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)

    # Second update: feed the new point individually, weighted per point.
    stkm = stkm.update(sc.parallelize(record_scaled), 1.0, u"points")

    sample_data = np.array([46.2, 23.5, 1034.32]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)
import dautil as dl
from pyspark.mllib.clustering import StreamingKMeansModel
from pyspark import SparkContext

csv_file = dl.data.get_direct_marketing_csv()
csv_rows = dl.data.read_csv(csv_file)

# 28 initial centers in a 3-dimensional feature space, each with weight 1.
stkm = StreamingKMeansModel(28 * [[0., 0., 0.]], 28 * [1.])
sc = SparkContext()

for row in csv_rows:
    spend = dl.data.centify(row['spend'])

    if spend > 0:
        history = dl.data.centify(row['history'])
        data = sc.parallelize([[int(row['recency']), history, spend]])
        stkm = stkm.update(data, 0., 'points')

print(stkm.centers)
if sys.argv[1]==("sset1"): truePredictions.append(int(c[0])-1) if sys.argv[1]==("asset1"): truePredictions.append(int(c[0])-1) else: truePredictions.append(int(c[0])) with open("datasets/"+sys.argv[1]+".txt","r") as fichero: for linea in fichero: points.append(linea.strip("\n").split()) for document in cursor: centers = document["clusterCenters"] weights = document["clusterWeights"] stkm = StreamingKMeansModel(centers, weights) predictions = [] for point in points: predictions.append(stkm.predict(point)) recall = recall_score(truePredictions,predictions, average='weighted') precision = precision_score(truePredictions, predictions, average='weighted') f1Score = 2 * (precision * recall) / (precision + recall) logging.info("Recall = " + str(recall) ) logging.info("Precision = " + str(precision) ) logging.info("F1-Score = " + str(f1Score))
import os
from json import loads

from kafka import KafkaConsumer
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import KMeansModel, StreamingKMeansModel

os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'
)

sc = SparkContext(appName="PythonSparkStreamingKafka")
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 120)

# Seed the streaming model with the centers of a previously trained batch
# model; each of the 15 clusters starts with weight 1.0.
initWeights = [1.0] * 15
offlineModel = KMeansModel.load(sc, "KMeansModel")
stkm = StreamingKMeansModel(offlineModel.clusterCenters, initWeights)

# kafkaStream = KafkaUtils.createDirectStream(ssc, ['test'], {'metadata.broker.list': 'localhost:9092'})
# lines = kafkaStream.map(lambda line: array([float(x) for x in line.split('\t')]))
consumer = KafkaConsumer('test',
                         bootstrap_servers=['localhost:9092'],
                         value_deserializer=lambda x: loads(x.decode('utf-8')),
                         consumer_timeout_ms=10000)

# One plotting color per cluster.
colors = ['r', 'k', 'b', 'grey', 'darkorange', 'm', 'y', 'c', 'gold',
          'slateblue', 'beige', 'coral', 'g', 'peru', 'pink']