def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [
            self.sc.parallelize(batch, 1) for batch in predict_data
        ]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        self.ssc.start()

        def condition():
            self.assertEqual(result, [[0], [1], [2], [3]])
            return True

        eventually(condition, catch_assertions=True)
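
For reference, StreamingKMeans.predictOn maps every RDD of the input DStream through the current model and returns a DStream of predicted cluster indices, which the foreachRDD callback above collects one batch at a time. A rough sketch of the equivalent transformation (not Spark's implementation):

# Sketch only: predictOn(dstream) behaves like a per-element map through the latest model.
predicted = predict_stream.map(lambda point: stkm.latestModel().predict(point))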
Example #2
File: tests.py    Project: rajsingh7/spark
    def test_predictOn_model(self):
        """Test that the model predicts correctly on toy data."""
        stkm = StreamingKMeans()
        stkm._model = StreamingKMeansModel(clusterCenters=[[1.0, 1.0],
                                                           [-1.0, 1.0],
                                                           [-1.0, -1.0],
                                                           [1.0, -1.0]],
                                           clusterWeights=[1.0, 1.0, 1.0, 1.0])

        predict_data = [[[1.5, 1.5]], [[-1.5, 1.5]], [[-1.5, -1.5]],
                        [[1.5, -1.5]]]
        predict_data = [self.sc.parallelize(batch, 1) for batch in predict_data]
        predict_stream = self.ssc.queueStream(predict_data)
        predict_val = stkm.predictOn(predict_stream)

        result = []

        def update(rdd):
            rdd_collect = rdd.collect()
            if rdd_collect:
                result.append(rdd_collect)

        predict_val.foreachRDD(update)
        t = time()
        self.ssc.start()
        self._ssc_wait(t, 6.0, 0.01)
        self.assertEqual(result, [[0], [1], [2], [3]])
Example #3

import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.clustering import StreamingKMeansModel


def test_streaming_kmeans():
    # get_data_from_db() and scaler are assumed to be defined elsewhere in the
    # project: the former returns an iterable of 3-dimensional feature vectors,
    # the latter is a fitted feature scaler (e.g. a scikit-learn scaler).
    records = get_data_from_db()

    conf = SparkConf().setAppName("testingClusters").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    initCenters = [[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.5, 0.5, 0.5]]
    initWeights = [1.0, 1.0, 1.0]
    stkm = StreamingKMeansModel(initCenters, initWeights)
    recordRDD = sc.parallelize(records)
    stkm = stkm.update(data=recordRDD, decayFactor=1.0, timeUnit=u"batches")

    for center in stkm.centers:
        print(center)

    sample_data = np.array([46.5, 23.0, 1034.0]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)

    stkm = stkm.update(sc.parallelize(record_scaled), 1.0, u"points")

    sample_data = np.array([46.2, 23.5, 1034.32]).reshape(1, -1)
    record_scaled = scaler.transform(sample_data)
    print("Scaled Record: ", record_scaled)
    ret = stkm.predict(sc.parallelize(record_scaled))
    print("******** Predicted cluster index: ")
    ret.foreach(print)
    for center in stkm.centers:
        distance = np.linalg.norm(center - record_scaled)
        similarity = 1 / (1 + distance)
        print(center, distance, similarity)
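
For reference, StreamingKMeansModel.update applies a forgetful weighted average: each affected center moves toward the centroid of the points assigned to it in the batch, while the previously accumulated weight is discounted by the decay factor (and, when timeUnit is "points", the decay factor is first raised to the number of new points). A minimal single-cluster sketch of that rule, using NumPy rather than Spark:

import numpy as np

def updated_center(old_center, old_weight, batch_points, decay=1.0):
    # Sketch of the per-cluster streaming k-means update rule (not Spark's code).
    batch_points = np.asarray(batch_points, dtype=float)
    m = len(batch_points)                        # points assigned to this cluster
    batch_centroid = batch_points.mean(axis=0)
    discounted = old_weight * decay              # forget part of the old evidence
    new_weight = discounted + m
    new_center = (np.asarray(old_center, dtype=float) * discounted
                  + batch_centroid * m) / new_weight
    return new_center, new_weight

# With decay 1.0 nothing is forgotten: a unit-weight center at the origin moves
# halfway toward a single new point.
print(updated_center([0.0, 0.0, 0.0], 1.0, [[1.0, 1.0, 1.0]]))
# -> (array([0.5, 0.5, 0.5]), 2.0)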
Example #4
import dautil as dl
from pyspark.mllib.clustering import StreamingKMeansModel
from pyspark import SparkContext

csv_file = dl.data.get_direct_marketing_csv()
csv_rows = dl.data.read_csv(csv_file)

stkm = StreamingKMeansModel(28 * [[0., 0., 0.]], 28 * [1.])
sc = SparkContext()

for row in csv_rows:
    spend = dl.data.centify(row['spend'])

    if spend > 0:
        history = dl.data.centify(row['history'])
        data = sc.parallelize([[int(row['recency']), history, spend]])
        stkm = stkm.update(data, 0., 'points')

print(stkm.centers)
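
A note on the decayFactor of 0. used above: with a zero decay factor the previously accumulated weight is discounted away entirely, so each update recenters the affected cluster on the centroid of the newest mini-batch (here a single row). A minimal standalone sketch of that behavior with the same API:

from pyspark import SparkContext
from pyspark.mllib.clustering import StreamingKMeansModel

sc = SparkContext.getOrCreate()
model = StreamingKMeansModel([[0.0, 0.0, 0.0]], [5.0])
# decayFactor=0.0 discards the old weight, so the single center jumps to the new point.
model = model.update(sc.parallelize([[10.0, 20.0, 30.0]]), 0.0, "points")
print(model.centers)  # approximately [[10., 20., 30.]]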
Example #6
              # Labels for the sset1/asset1 datasets are shifted to be 0-based.
              if sys.argv[1] == "sset1":
                 truePredictions.append(int(c[0]) - 1)
              elif sys.argv[1] == "asset1":
                 truePredictions.append(int(c[0]) - 1)
              else:
                 truePredictions.append(int(c[0]))

     with open("datasets/"+sys.argv[1]+".txt","r") as fichero:
    	  for linea in fichero:
              points.append(linea.strip("\n").split())

     # "cursor" is assumed to come from earlier in the script (e.g. a database
     # query) and to yield documents holding the persisted model parameters.
     for document in cursor:
         centers = document["clusterCenters"]
         weights = document["clusterWeights"]

     stkm = StreamingKMeansModel(centers, weights)

     predictions = []

     for point in points:
         predictions.append(stkm.predict(point))

     recall = recall_score(truePredictions,predictions, average='weighted')

     precision = precision_score(truePredictions, predictions, average='weighted')

     f1Score = 2 * (precision * recall) / (precision + recall)

     logging.info("Recall = " + str(recall) )
     logging.info("Precision = " + str(precision) )
     logging.info("F1-Score = " + str(f1Score))
Example #7
import os
from json import loads  # "loads" below is assumed to be json.loads

from kafka import KafkaConsumer
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel, StreamingKMeansModel
from pyspark.streaming import StreamingContext

# The Kafka package must be on the classpath before the SparkContext is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell')

sc = SparkContext(appName="PythonSparkStreamingKafka")

sc.setLogLevel("WARN")

ssc = StreamingContext(sc, 120)

# 15 unit weights, one per cluster center expected in the offline model loaded below.
initWeights = [1.0] * 15

offlineModel = KMeansModel.load(sc, "KMeansModel")

stkm = StreamingKMeansModel(offlineModel.clusterCenters, initWeights)

#kafkaStream = KafkaUtils.createDirectStream(ssc, ['test'], {'metadata.broker.list': 'localhost:9092'})

#lines = kafkaStream.map(lambda line: array([float(x) for x in line.split('\t')]))

consumer = KafkaConsumer('test',
                         bootstrap_servers=['localhost:9092'],
                         value_deserializer=lambda x: loads(x.decode('utf-8')),
                         consumer_timeout_ms=10000)

colors = [
    'r', 'k', 'b', 'grey', 'darkorange', 'm', 'y', 'c', 'gold', 'slateblue',
    'beige', 'coral', 'g', 'peru', 'pink'
]
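
The snippet ends before the consumption loop. One possible continuation (a sketch only, assuming each Kafka message decodes to a list of floats with the same dimensionality as the offline centers) batches the decoded messages and feeds them to the streaming model:

# Sketch: not part of the original example.
batch = []
for message in consumer:                     # stops after consumer_timeout_ms of silence
    batch.append([float(x) for x in message.value])
    if len(batch) >= 100:                    # hypothetical mini-batch size
        rdd = sc.parallelize(batch)
        stkm = stkm.update(rdd, 1.0, "batches")  # fold the batch into the model
        print(stkm.predict(rdd).collect())       # cluster index for each point
        batch = []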