def test_trainOn_model(self):
    """Test the model on toy data with four clusters.

    Four batches are queued; each batch holds one point per initial center,
    shifted by one of four small offsets.  The offsets sum to zero, and the
    test asserts that the trained centers equal the initial centers and
    that every cluster weight equals 5.0.
    """
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(
        centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Create a toy dataset by setting a tiny offset for each point.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = [
        [[dx + cx, dy + cy] for cx, cy in initCenters]
        for dx, dy in offsets
    ]
    batches = [self.sc.parallelize(batch, 1) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    stkm.trainOn(input_stream)
    t = time()
    self.ssc.start()

    # Give enough time to train the model.
    self._ssc_wait(t, 6.0, 0.01)
    finalModel = stkm.latestModel()
    # NOTE(review): `all` must be numpy's `all` here (the comparison yields
    # a 2-D boolean array) -- confirm against the file's imports.
    self.assertTrue(all(finalModel.centers == array(initCenters)))
    # assertEqual, not the deprecated assertEquals alias (gone in 3.12).
    self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
def test_trainOn_predictOn(self):
    """Test that prediction happens on the updated model.

    Two single-dimension batches are queued, the model is trained and asked
    to predict on the same stream, and the non-empty prediction batches are
    collected and compared against the expected cluster assignments.
    """
    stkm = StreamingKMeans(decayFactor=0.0, k=2)
    stkm.setInitialCenters([[0.0], [1.0]], [1.0, 1.0])

    # Since decay factor is set to zero, once the first batch
    # is passed the clusterCenters are updated to [-0.5, 0.7]
    # which causes 0.2 & 0.3 to be classified as 1, even though the
    # classification based in the initial model would have been 0
    # proving that the model is updated.
    batches = [[[-0.5], [0.6], [0.8]], [[0.2], [-0.1], [0.3]]]
    # Use the test case's own SparkContext, as the sibling tests do, rather
    # than relying on a module-level `sc` happening to exist.
    batches = [self.sc.parallelize(batch) for batch in batches]
    input_stream = self.ssc.queueStream(batches)
    predict_results = []

    def collect(rdd):
        # Skip empty batches so only real predictions are recorded.
        rdd_collect = rdd.collect()
        if rdd_collect:
            predict_results.append(rdd_collect)

    stkm.trainOn(input_stream)
    predict_stream = stkm.predictOn(input_stream)
    predict_stream.foreachRDD(collect)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 6.0, 0.01)
    self.assertEqual(predict_results, [[0, 1, 1], [1, 0, 1]])
def test_trainOn_model(self):
    """Test the model on toy data with four clusters.

    One point per initial center is emitted in each of four batches, each
    batch displaced by a different small offset (the offsets sum to zero).
    After waiting for the stream, the test asserts that the model centers
    equal the initial centers and that each cluster weight is 5.0.
    """
    stkm = StreamingKMeans()
    initCenters = [[1.0, 1.0], [-1.0, 1.0], [-1.0, -1.0], [1.0, -1.0]]
    stkm.setInitialCenters(centers=initCenters, weights=[1.0, 1.0, 1.0, 1.0])

    # Create a toy dataset by setting a tiny offset for each point.
    offsets = [[0, 0.1], [0, -0.1], [0.1, 0], [-0.1, 0]]
    batches = []
    for offset in offsets:
        shifted = [[offset[0] + center[0], offset[1] + center[1]]
                   for center in initCenters]
        batches.append(self.sc.parallelize(shifted, 1))

    input_stream = self.ssc.queueStream(batches)
    stkm.trainOn(input_stream)
    t = time()
    self.ssc.start()

    # Give enough time to train the model.
    self._ssc_wait(t, 6.0, 0.01)
    finalModel = stkm.latestModel()
    # NOTE(review): relies on `all` being numpy's `all`, since the
    # comparison below produces a 2-D boolean array -- confirm the import.
    self.assertTrue(all(finalModel.centers == array(initCenters)))
    # The assertEquals alias is deprecated and removed in Python 3.12.
    self.assertEqual(finalModel.clusterWeights, [5.0, 5.0, 5.0, 5.0])
def main():
    """Cluster tweets read from a local socket with streaming k-means.

    Each incoming line is parsed as JSON; records carrying a ``text`` field
    are UTF-8 encoded, featurized and hashed into a ``DIM``-sized vector.
    A ``StreamingKMeans`` model with ``N`` clusters is trained on those
    vectors and, for every cluster, up to three ``"<cluster>: <tweet>"``
    lines are pretty-printed per batch.  The call blocks until the
    streaming context terminates.
    """
    conf = SparkConf().setAppName("twitterclassifier")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)

    tweets = ssc.socketTextStream("localhost", PORT) \
        .map(json.loads) \
        .filter(lambda x: 'text' in x) \
        .map(lambda x: x['text'].encode('utf-8'))

    hasher = HashingTF(DIM)
    # Keep the original text next to its vector so predictions can be
    # printed with the tweet they belong to; cached because the stream is
    # consumed both for training and for prediction.
    features = tweets.map(lambda x: (x, hasher.transform(featurize(x)))).cache()

    # We create a model with random clusters and specify the number of
    # clusters to find.
    # decay = 1: total memory; decay = 0: no memory
    model = StreamingKMeans(k=N, decayFactor=0.1).setRandomCenters(DIM, 1.0, 0)
    model.trainOn(features.map(lambda x: x[1]))
    results = model.predictOnValues(features).cache()

    # Need a closure over i here: a lambda built directly in the loop would
    # see only the final value of the loop variable.
    def print_group(i):
        results.filter(lambda x: x[1] == i) \
            .map(lambda x: '%i: %s' % (x[1], x[0])) \
            .pprint(3)

    # `range` instead of `xrange`, which does not exist on Python 3.
    for i in range(N):
        print_group(i)

    ssc.start()
    ssc.awaitTermination()
def test_accuracy_for_single_center(self):
    """Check the learned parameters when the data has a single center.

    Generates five batches of five 5-dimensional points around one center,
    trains a one-cluster model starting from the origin with zero weight,
    waits until the cluster weight reaches 25.0, and then compares every
    coordinate of the learned center with the generated one.
    """
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)

    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0.0] * 5], [0.0])

    rdd_queue = [self.sc.parallelize(batch, 1) for batch in batches]
    stkm.trainOn(self.ssc.queueStream(rdd_queue))
    self.ssc.start()

    def condition():
        # All 5 x 5 points must have been absorbed by the only cluster.
        self.assertEqual(stkm.latestModel().clusterWeights, [25.0])
        return True

    eventually(condition, catch_assertions=True)

    realCenters = array_sum(array(centers), axis=0)
    for dim in range(5):
        learned = stkm.latestModel().centers[0][dim]
        self.assertAlmostEqual(centers[0][dim], learned, 1)
        self.assertAlmostEqual(realCenters[dim], learned, 1)
def perform_training(sc: SparkContext, params_dict: dict):
    """Train a streaming k-means model on the 'normal-ekg-stream' Kafka topic.

    Args:
        sc: Spark context the streaming context is built on; it is left
            running when training finishes.
        params_dict: optional settings. ``batch_duration`` (default 1) is
            passed to ``StreamingContext``; ``training_duration``
            (default 20) is the timeout passed to
            ``awaitTerminationOrTimeout``.

    Returns:
        The latest model held by the ``StreamingKMeans`` instance after the
        streaming context has been stopped.
    """
    batch_duration = params_dict.get('batch_duration', 1)
    training_duration = params_dict.get('training_duration', 20)

    ssc = StreamingContext(sc, batch_duration)

    def _decode_value(val):
        # Kafka values arrive as UTF-8 encoded JSON documents.
        return json.loads(val.decode('utf-8'))

    def _to_vector(msg):
        # msg is a (key, value) pair; the decoded value carries the samples.
        return Vectors.dense(
            [float(value) for value in msg[1]['signal_values']])

    kvs = KafkaUtils.createDirectStream(
        ssc,
        ['normal-ekg-stream'],
        kafkaParams={'metadata.broker.list': 'localhost:9092'},
        valueDecoder=_decode_value)
    windowed_signal = kvs.map(_to_vector)
    # windowed_signal.foreachRDD(Plotter.plot_signal_window)

    model = StreamingKMeans(k=20, decayFactor=1.0)
    model = model.setRandomCenters(188, 1.0, 0)
    model.trainOn(windowed_signal)

    ssc.start()
    ssc.awaitTerminationOrTimeout(training_duration)
    # Stop only the streaming layer, letting in-flight batches finish.
    ssc.stop(stopSparkContext=False, stopGraceFully=True)
    return model.latestModel()
class StreamingUpdate(object):
    """Streaming update of a clustering model (intended to work on a DStream).

    Holds the configuration for a PySpark ``StreamingKMeans`` model plus the
    Spark contexts, and offers helpers to seed the model from a pre-fitted
    clusterer, persist/reload the TF-IDF matrix and print the top terms of
    each predicted cluster.
    """

    # File used to hand the TF-IDF matrix from save_matrix_update to
    # load_dstream.
    _MATRIX_PATH = '/home/ducvu/input_streaming.txt'

    def __init__(self, init_clusters, decay_factor, time_unit, sc, ssc):
        """Store the model hyper-parameters and the Spark contexts.

        Args:
            init_clusters: number of clusters for the streaming model.
            decay_factor: decay factor forwarded to ``StreamingKMeans``.
            time_unit: time unit forwarded to ``StreamingKMeans``.
            sc: SparkContext used to read the saved matrix.
            ssc: StreamingContext (stored; not used by the visible methods).
        """
        self.init_clusters = init_clusters
        self.decay_factor = decay_factor
        self.time_unit = time_unit
        self.sc = sc
        self.ssc = ssc

    def streaming(self, mnk, clusters, init_clusters):
        """Create the streaming model and seed it from a fitted clusterer.

        Args:
            mnk: fitted model exposing ``cluster_centers_``; its centers
                become the initial centers, each with weight 1.
            clusters: stored on the instance as ``self.clusters``.
            init_clusters: number of clusters; overrides the constructor
                value.
        """
        self.mnk = mnk
        self.clusters = clusters
        self.init_clusters = init_clusters
        self.streaming_kmeans = StreamingKMeans(
            self.init_clusters, self.decay_factor, self.time_unit)
        self.streaming_kmeans.setInitialCenters(
            self.mnk.cluster_centers_, np.ones([self.init_clusters]))

    # Update the shape of the centers in the StreamingContext.
    #
    # Author's note (translated from Vietnamese): the dictionary is updated
    # whenever new articles arrive, so the size of the centroids is updated
    # as well.  Example: the initial dictionary has 10 words and a 5-word
    # sentence is represented by a 5x10 sparse vector; once the dictionary
    # grows to 15 words the same sentence has to be re-encoded as a 5x15
    # sparse vector.  Open question: is there another representation that
    # does not require re-encoding the sentences?
    def update_shape(self, docs, dictionary):
        """Re-initialise the centers randomly for the current TF-IDF width."""
        dim = matrix_tfidf(docs, dictionary).shape[1]
        self.streaming_kmeans.setRandomCenters(dim, 1.0, 0)

    def save_matrix_update(self, docs, dictionary):
        """Write the TF-IDF matrix of ``docs`` to the exchange file."""
        np.savetxt(self._MATRIX_PATH, matrix_tfidf(docs, dictionary))

    def load_dstream(self):
        """Load the saved matrix as dense vectors, one per line.

        The result is stored in ``self.dstream`` and also returned, so that
        callers such as ``make_predict`` can pass it on directly (the method
        used to return ``None``).
        """
        # NOTE(review): sc.textFile yields an RDD, while
        # StreamingKMeans.trainOn expects a DStream -- confirm whether this
        # should be wrapped (e.g. via ssc.queueStream) before training.
        self.dstream = self.sc.textFile(self._MATRIX_PATH) \
            .map(lambda line: Vectors.dense(
                [float(x) for x in line.strip().split(' ')]))
        return self.dstream

    def make_predict(self, docs, dictionary):
        """Train on the saved matrix, predict clusters and print top terms.

        For every document row the latest model predicts a cluster; rows are
        averaged per cluster and the 15 highest-scoring feature names of
        each cluster are printed.
        """
        self.streaming_kmeans.trainOn(self.load_dstream())

        # Pass the dictionary, as every other matrix_tfidf call does.
        matrix = matrix_tfidf(docs, dictionary)
        # latestModel() is the StreamingKMeans accessor; there is no
        # `latest_model` attribute.
        model = self.streaming_kmeans.latestModel()
        self.pred_stream = np.array([model.predict(x) for x in matrix])

        df = pd.DataFrame(matrix).groupby(self.pred_stream).mean()
        # Loop-invariant, so look the feature names up once.
        feature_names = get_tfidf(dictionary).get_feature_names()
        for i, r in df.iterrows():
            print('\nCluster {0}:'.format(i))
            print(','.join([feature_names[t] for t in np.argsort(r)[-15:]]))
def test_accuracy_for_single_center(self):
    """Test that parameters obtained are correct for a single center.

    Five batches of five 5-dimensional points are generated around one
    center and streamed into a one-cluster model that starts at the origin
    with zero weight.  After waiting, the test asserts a cluster weight of
    25.0 and compares each learned coordinate with the generated center to
    one decimal place.
    """
    centers, batches = self.streamingKMeansDataGenerator(
        batches=5, numPoints=5, k=1, d=5, r=0.1, seed=0)
    stkm = StreamingKMeans(1)
    stkm.setInitialCenters([[0., 0., 0., 0., 0.]], [0.])
    input_stream = self.ssc.queueStream(
        [self.sc.parallelize(batch, 1) for batch in batches])
    stkm.trainOn(input_stream)

    t = time()
    self.ssc.start()
    self._ssc_wait(t, 10.0, 0.01)
    # assertEqual, not the deprecated assertEquals alias (gone in 3.12).
    self.assertEqual(stkm.latestModel().clusterWeights, [25.0])

    realCenters = array_sum(array(centers), axis=0)
    for i in range(5):
        modelCenters = stkm.latestModel().centers[0][i]
        self.assertAlmostEqual(centers[0][i], modelCenters, 1)
        self.assertAlmostEqual(realCenters[i], modelCenters, 1)
.filter(lambda tpl: tpl[0] != 0)\ .filter(lambda tpl: tpl[2] != '')\ .map(lambda tpl: (tpl[0],tpl[1],tokenize(tpl[2])))\ .map(lambda tpl:(tpl[0],tpl[1],tpl[2],doc2vec(tpl[2]))) #dstream_tweets.pprint() trainingData = dstream_tweets.map( lambda tpl: [tpl[0], tpl[1]] + tpl[3].tolist()) #trainingData.pprint() testdata = dstream_tweets.map(lambda tpl: ( ([tpl[0], tpl[1]], tpl[2]), [tpl[0], tpl[1]] + tpl[3].tolist())) #testdata.pprint() # model = StreamingKMeans(k=clusterNum, decayFactor=0.6).setRandomCenters(102, 1.0, 3) model.trainOn(trainingData) clust = model.predictOnValues(testdata) #clust.pprint() #words = lines.flatMap(lambda line: line.split(" ")) topic = clust.map(lambda x: (x[1], x[0][1])) #topic.pprint() topicAgg = topic.reduceByKey(lambda x, y: x + y) #wordCollect.pprint() topicAgg.map(lambda x: (x[0], freqcount(x[1]))).pprint() clust.foreachRDD(lambda time, rdd: q.put(rdd.collect())) # Run! ssc.start() ssc.awaitTermination()
from pyspark.mllib.clustering import StreamingKMeans

if __name__ == "__main__":
    # Streaming k-means over comma-separated "x,y" lines dropped into
    # /files, printing the cluster centres every 10 seconds.
    sc = SparkContext(appName="sai twitter feed")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("chkpfile")

    def parserData(line):
        """Turn a "x,y,..." text line into a 2-element dense vector."""
        fields = line.split(",")
        first, second = float(fields[0]), float(fields[1])
        return Vectors.dense([first, second])

    trainingStream = ssc.textFileStream("/files").map(parserData)

    model = StreamingKMeans(k=2, decayFactor=1.0)
    model = model.setRandomCenters(2, 1.0, 0)
    print("Initial Centres" + str(model.latestModel().centers))

    model.trainOn(trainingStream)
    ssc.start()

    s = sched.scheduler(time.time, time.sleep)

    def print_cluster_centres(scheduler, model):
        """Print the current centres and re-arm the timer for 10 s later."""
        print(model.latestModel().centers)
        scheduler.enter(10, 1, print_cluster_centres, (scheduler, model))

    # First run after 10 s at priority 1; the callback keeps rescheduling
    # itself from then on.
    s.enter(10, 1, print_cluster_centres, (s, model))
    s.run()

    ssc.awaitTermination()

# to make this work
#spark-submit "C:\SaiStudy - LEarn It All - Version9\Saistudy - split-csv.py"
ssc = StreamingContext(sc, 5) # Kafka Stream ks = KafkaUtils.createDirectStream( ssc, ["test"], {"metadata.broker.list": "localhost:9092"}) trainingData = sc.textFile("data/datatraining.txt")\ .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr])) # Supplied to Streaming KMeans as the centers by StreamingKmeans are not giving good predictions init_centers = KMeans.train(trainingData, 2).centers model = StreamingKMeans(k=2, decayFactor=0.1)\ .setInitialCenters(init_centers, [1.0, 1.0, 1.0, 1.0, 1.0]) model.trainOn(ssc.queueStream([trainingData])) def parse(lp): arr = lp.split(',')[2:-1] label = lp.split(',')[0] vec = Vectors.dense([float(x) for x in arr]) return LabeledPoint(label, vec) test_stream = ks.map(lambda x: x[1]).map(parse) result = model.predictOnValues( test_stream.map(lambda lp: (lp.label, lp.features))) # Prints Prediction Prediction and Cluster Centers def current_centers(time, rdd): print("\n--------------------- %s --------------------------" %
ssc.checkpoint("file:///tmp/spark")


def parseTrainingData(line):
    """Build a 2-element dense vector from the first two CSV fields."""
    fields = line.split(",")
    first, second = float(fields[0]), float(fields[1])
    return Vectors.dense([first, second])


# Every new file appearing in the training directory is parsed line by line.
trainingStream = ssc.textFileStream(
    "file:///Users/jananiravi/spark/spark-2.1.0-bin-without-hadoop/tweets/training"
).map(parseTrainingData)
trainingStream.pprint()

model = StreamingKMeans(k=2, decayFactor=1.0)
model = model.setRandomCenters(2, 1.0, 0)
print("Initial centers: " + str(model.latestModel().centers))

model.trainOn(trainingStream)
ssc.start()

s = sched.scheduler(time.time, time.sleep)


def print_cluster_centers(scheduler, model):
    """Print the current centers and schedule the next print in 10 s."""
    print("Cluster centers: " + str(model.latestModel().centers))
    s.enter(10, 1, print_cluster_centers, (scheduler, model))


# First run after 10 s at priority 1; the callback then re-arms itself.
s.enter(10, 1, print_cluster_centers, (s, model))
s.run()

ssc.awaitTermination()
# Set up the streaming k-means algorithm to run over the data added to the
# streaming directory.
#   k=2: number of clusters the dataset is split into.
#   decayFactor=1.0: all data, from the very beginning, stays relevant.
#               0.0: only the most recent data is used.
# k-means needs random cluster centers to start from; the arguments of
# setRandomCenters are:
#   2: dimension of each random center
#   1.0 and 0: weight and seed
model = StreamingKMeans(k=2, decayFactor=1.0)
model = model.setRandomCenters(2, 1.0, 0)

# Print the centers.
print('Initial centers: ' + str(model.latestModel().centers))

# Train the model.
model.trainOn(training_stream)

# Start the stream.
ssc.start()

# Schedule periodic printing of the center values.
s = sched.scheduler(time.time, time.sleep)


# Prints the centers and re-schedules itself, so it runs every 10 s.
def print_cluster_centers(scheduler, model):
    print('Cluster centers: ' + str(model.latestModel().centers))
    s.enter(10, 1, print_cluster_centers, (scheduler, model))

# The function that prints the clusters (print_cluster_centers) will be
# run every 10 s with priority 1. It takes two arguments, the scheduler s
# and the model represented by the variable
# clus=row[0] # #ptext[clus].set_text(str(clus)+ ':'+str([x[0] for x in row[1][1]])) # ptext[clus].set_text(str(clus)+ ':'+str(row[1][1])) # ptext[clus].set_color(colors[clus]) # plt.pause(0.0001) # q = multiprocessing.Queue() f = multiprocessing.Queue() job_for_another_core2 = multiprocessing.Process(target=data_plotting,args=(q,)) job_for_another_core2.start() sc = SparkContext('local[4]', 'Social Panic Analysis') # Create a local StreamingContext with two working thread and batch interval of 1 second ssc = StreamingContext(sc, 10) dstream = ssc.socketTextStream("localhost", 9998) trainingData = dstream.map(Vectors.parse) trainingData.pprint() testData=trainingData.map(lambda x: (x,x)) testData.pprint() model = StreamingKMeans(k=clusterNum, decayFactor=0.1).setRandomCenters(2, 1.0, 0) model.trainOn(trainingData) print(model.latestModel().clusterCenters) clust=model.predictOnValues(testData) clust.pprint() #print(model.predictOnValues(testData.map(lambda lp: (lp.label, lp.features)))) clust.foreachRDD(lambda time, rdd: q.put(rdd.collect())) ssc.start() ssc.awaitTermination()
initialCenters, initialWeights) #stkm = StreamingKMeans(k=int(numberClusters),decayFactor=1.0).setRandomCenters(2,1.0,100) directKafkaStream = KafkaUtils.createDirectStream( ssc, ['StreamingKMeansTFG'], { "metadata.broker.list": "localhost:9092", "auto_offset_reset": 'earliest' }) parsed = directKafkaStream.map(lambda v: loads(v[1])) parsed = parsed.map( lambda line: Vectors.dense([float(x) for x in line.strip().split()])) stkm.trainOn(parsed) def sendPartition(rdd): connection = MongoClient(mongoIP) test_db = connection.get_database(mongoDataBase) collection = test_db.get_collection(mongoCollection) model = stkm.latestModel() centers = model.centers weights = model.clusterWeights myquery = {"name": mongoCollection} newvalues = { "$set": { "clusterCenters": centers.tolist(), "clusterWeights": weights }