def test_predictOn_model(self):
    """Check that a hand-built model labels each toy point with its cluster.

    The model is given four fixed centers, one per quadrant, and the
    stream carries four single-point batches, one point per quadrant in
    the same order as the centers. The predictions gathered from the
    stream are therefore expected to be ``[[0], [1], [2], [3]]``.
    """
    quadrant_centers = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    kmeans = StreamingKMeans()
    # Inject the model directly so that no training step is involved.
    kmeans._model = StreamingKMeansModel(
        clusterCenters=quadrant_centers,
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    point_batches = [
        [[1.5, 1.5]],
        [[-1.5, 1.5]],
        [[-1.5, -1.5]],
        [[1.5, -1.5]],
    ]
    # One single-partition RDD per batch, queued in order.
    rdd_queue = [self.sc.parallelize(points, 1) for points in point_batches]
    predictions = kmeans.predictOn(self.ssc.queueStream(rdd_queue))

    collected = []

    def record(rdd):
        # Ignore empty batches so only real predictions are compared.
        labels = rdd.collect()
        if not labels:
            return
        collected.append(labels)

    predictions.foreachRDD(record)
    self.ssc.start()

    def all_points_labelled():
        self.assertEqual(collected, [[0], [1], [2], [3]])
        return True

    # The assertion is handed to the class's polling helper instead of being
    # checked once, because results arrive asynchronously.
    self._eventually(all_points_labelled, catch_assertions=True)
def test_trainOn_predictOn(self):
    """Check that predictions are made with the model as updated by training.

    Two batches are streamed through a model that is both trained and
    queried on the same stream; the labels collected per batch must be
    ``[[0, 1, 1], [1, 0, 1]]``.
    """
    kmeans = StreamingKMeans(decayFactor=0.0, k=2)
    kmeans.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

    # With a decay factor of zero the first batch moves the cluster centers
    # to [-0.5, 0.7]. After that, 0.2 and 0.3 are assigned to cluster 1,
    # whereas the initial centers would have put them in cluster 0 -- which
    # shows that the updated model is the one doing the predicting.
    raw_batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
    rdd_queue = [sc.parallelize(points) for points in raw_batches]
    input_stream = self.ssc.queueStream(rdd_queue)

    predict_results = []

    def collect(rdd):
        # Only non-empty batches contribute to the comparison below.
        labels = rdd.collect()
        if not labels:
            return
        predict_results.append(labels)

    # Training is registered before prediction on the same stream.
    kmeans.trainOn(input_stream)
    kmeans.predictOn(input_stream).foreachRDD(collect)

    started_at = time()
    self.ssc.start()
    # Let the streaming context run via the class helper before asserting.
    self._ssc_wait(started_at, 6.0, 0.01)
    self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
def test_trainOn_predictOn(self):
    """Check that predictions are made with the model as updated by training.

    The same queued stream feeds both ``trainOn`` and ``predictOn``; the
    labels gathered per batch are expected to be
    ``[[0, 1, 1], [1, 0, 1]]``.
    """
    kmeans = StreamingKMeans(decayFactor=0.0, k=2)
    kmeans.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

    # Because the decay factor is zero, the first batch moves the cluster
    # centers to [-0.5, 0.7]. 0.2 and 0.3 then fall into cluster 1, although
    # the initial model would have assigned them to cluster 0 -- evidence
    # that prediction uses the updated model.
    first_batch = [[-0.5], [0.6], [0.8]]
    second_batch = [[0.2], [-0.1], [0.3]]
    rdd_queue = [
        self.sc.parallelize(points) for points in (first_batch, second_batch)
    ]
    input_stream = self.ssc.queueStream(rdd_queue)

    predict_results = []

    def collect(rdd):
        # Skip empty batches; they carry no predictions.
        labels = rdd.collect()
        if not labels:
            return
        predict_results.append(labels)

    # Training is registered before prediction on the same stream.
    kmeans.trainOn(input_stream)
    kmeans.predictOn(input_stream).foreachRDD(collect)
    self.ssc.start()

    def predictions_match():
        self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
        return True

    # Results arrive asynchronously, so the check is delegated to the
    # `eventually` helper rather than asserted once.
    eventually(predictions_match, catch_assertions=True)
def test_predictOn_model(self):
    """Check that a preset model labels each toy point with its cluster.

    Four centers (one per quadrant) are installed directly on the model,
    then four single-point batches are streamed in the same quadrant
    order. The collected predictions must be ``[[0], [1], [2], [3]]``.
    """
    kmeans = StreamingKMeans()
    # Assign the model by hand; this test performs no training.
    kmeans._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0],
    )

    toy_batches = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    # Each batch becomes a single-partition RDD.
    queued_rdds = [self.sc.parallelize(points, 1) for points in toy_batches]
    toy_stream = self.ssc.queueStream(queued_rdds)
    label_stream = kmeans.predictOn(toy_stream)

    seen_labels = []

    def remember(rdd):
        # Empty batches are dropped so they do not disturb the comparison.
        labels = rdd.collect()
        if not labels:
            return
        seen_labels.append(labels)

    label_stream.foreachRDD(remember)
    self.ssc.start()

    def labels_match():
        self.assertEqual(seen_labels, [[0], [1], [2], [3]])
        return True

    # Predictions arrive asynchronously; `eventually` is given the check.
    eventually(labels_match, catch_assertions=True)
def test_predictOn_model(self):
    """Test that the model predicts correctly on toy data.

    A model with four fixed centers (one per quadrant) is installed
    directly, and four single-point batches are streamed in the same
    quadrant order. After the wait, the collected predictions must equal
    ``[[0], [1], [2], [3]]``.
    """
    stkm = StreamingKMeans()
    # Set the model by hand; no training happens in this test.
    stkm._model = StreamingKMeansModel(
        clusterCenters=[[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]],
        clusterWeights=[1.0, 1.0, 1.0, 1.0])

    predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]], [[1.5, -1.5]]]
    # One single-partition RDD per batch, queued in order.
    predict_data = [sc.parallelize(batch, 1) for batch in predict_data]
    predict_stream = self.ssc.queueStream(predict_data)
    predict_val = stkm.predictOn(predict_stream)

    result = []

    def update(rdd):
        # Record only non-empty batches.
        rdd_collect = rdd.collect()
        if rdd_collect:
            result.append(rdd_collect)

    predict_val.foreachRDD(update)
    t = time()
    self.ssc.start()
    # Let the streaming context run via the class helper before asserting.
    self._ssc_wait(t, 6.0, 0.01)
    # `assertEquals` is a deprecated alias that was removed in Python 3.12;
    # `assertEqual` is the supported spelling with identical semantics.
    self.assertEqual(result, [[0], [1], [2], [3]])
# NOTE(review): the `return` below is the tail of a definition whose header
# lies above this excerpt (presumably the `parse` helper used further down)
# -- confirm against the full file.
    return LabeledPoint(label, vec)


# Driver script: train a streaming k-means model on one Kafka topic and print
# its predictions for both the training stream and a labelled test topic.
sc = SparkContext(appName="StreamingKmeansKafka")
# 20 is the batch duration (seconds, per the PySpark StreamingContext docs).
ssc = StreamingContext(sc, 20)

# Exactly three command-line arguments are required; the unpacking raises
# ValueError for any other count.
zkQuorum, trainTopic, testTopic = sys.argv[1:]

# Training stream: keep element [1] of each Kafka record and parse it as a
# comma-separated list of floats into a dense vector.
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                              {trainTopic: 1})
lines = kvs.map(lambda x: x[1])
trainingData = lines.map(lambda line: Vectors.dense(
    [float(x) for x in line.strip().split(',')]))

# Test stream: `kvs` and `lines` are deliberately rebound here; each record is
# turned into a labelled point by `parse`.
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                              {testTopic: 1})
lines = kvs.map(lambda x: x[1])
testingData = lines.map(parse)

# k=3 clusters; setRandomCenters takes (dim, weight, seed) per the PySpark
# API, so the centers start as random 4-dimensional points.
model = StreamingKMeans(k=3, decayFactor=1.0).setRandomCenters(4, 1.0, 0)
model.trainOn(trainingData)

# Print cluster assignments for the training stream itself ...
result = model.predictOn(trainingData)
result.pprint()

# ... and (label, prediction) pairs for the labelled test stream.
result = model.predictOnValues(
    testingData.map(lambda lp: (lp.label, lp.features)))
result.pprint()

# Nothing runs until the context is started; then block until it terminates.
ssc.start()
ssc.awaitTermination()
stkm.trainOn(parsed)


def sendPartition(rdd):
    """Write the latest cluster centers and weights to MongoDB.

    Invoked once per batch through ``foreachRDD``. The ``rdd`` argument is
    not read; the batch only serves as a trigger. The document matching
    ``{"name": "sset1"}`` in collection ``sset1`` of database ``sparkDB``
    has its ``clusterCenters`` and ``clusterWeights`` fields set from
    ``stkm.latestModel()``.

    :param rdd: the batch RDD supplied by ``foreachRDD`` (unused).
    """
    connection = MongoClient("mongodb://localhost:27017/")
    try:
        test_db = connection.get_database('sparkDB')
        collection = test_db.get_collection('sset1')
        model = stkm.latestModel()
        centers = model.centers
        weights = model.clusterWeights
        myquery = {"name": "sset1"}
        newvalues = {
            "$set": {
                # `centers` is converted with .tolist() before being stored.
                "clusterCenters": centers.tolist(),
                "clusterWeights": weights,
            }
        }
        collection.update_one(myquery, newvalues)
    finally:
        # Close the client even when reading the model or the update fails,
        # so a failing batch does not leak a connection.
        connection.close()


# The count stream is used only to call sendPartition once per batch.
parsed.count().foreachRDD(sendPartition)
stkm.predictOn(parsed).pprint()
parsed.count().pprint()
ssc.start()
ssc.awaitTermination()