def perform_training(sc: SparkContext, params_dict: dict):
    """Train a streaming k-means model on the ``normal-ekg-stream`` topic.

    Args:
        sc: Spark context the streaming context is created on. It is left
            running when this function returns (``stopSparkContext=False``).
        params_dict: Optional settings. ``batch_duration`` (default 1) is
            the batch interval given to ``StreamingContext``;
            ``training_duration`` (default 20) is the timeout passed to
            ``awaitTerminationOrTimeout``.

    Returns:
        Whatever ``model.latestModel()`` yields once streaming has stopped.
    """
    batch_duration = params_dict.get('batch_duration', 1)
    training_duration = params_dict.get('training_duration', 20)

    streaming_ctx = StreamingContext(sc, batch_duration)

    # Message values arrive as UTF-8 encoded JSON and are decoded up front.
    ekg_stream = KafkaUtils.createDirectStream(
        streaming_ctx,
        ['normal-ekg-stream'],
        kafkaParams={'metadata.broker.list': 'localhost:9092'},
        valueDecoder=lambda raw: json.loads(raw.decode('utf-8')))

    def to_vector(record):
        # record[1] is the decoded message; its 'signal_values' entries are
        # converted to floats and packed into one dense vector.
        samples = record[1]['signal_values']
        return Vectors.dense([float(sample) for sample in samples])

    windowed_signal = ekg_stream.map(to_vector)

    model = StreamingKMeans(k=20, decayFactor=1.0)
    model = model.setRandomCenters(188, 1.0, 0)
    model.trainOn(windowed_signal)

    streaming_ctx.start()
    streaming_ctx.awaitTerminationOrTimeout(training_duration)
    streaming_ctx.stop(stopSparkContext=False, stopGraceFully=True)

    return model.latestModel()
def main():
    """Stream tweets from Kafka, attach a sentiment to each and print them.

    Word lists are loaded from ``./Dataset``. Tweets are read from the
    ``twitter-topic1`` topic on ``localhost:9092`` by a ``local[2]`` Spark
    job using 10-second batches. The job is stopped gracefully once
    ``awaitTerminationOrTimeout(100)`` returns.
    """
    positive_words = load_wordlist("./Dataset/positive.txt")
    negative_words = load_wordlist("./Dataset/negative.txt")

    spark_conf = SparkConf().setMaster("local[2]").setAppName("TweeStreamer")
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("WARN")

    # Streaming context with a batch interval of 10 seconds.
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kafka_stream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    def parse_tweet(record):
        # The second tuple element is parsed as JSON as-is (no explicit decode).
        return json.loads(record[1])

    def add_sentiment(tweet):
        return tweetwithSentiment(tweet, positive_words, negative_words)

    scored_tweets = kafka_stream.map(parse_tweet).map(add_sentiment)
    # Results are only printed; persisting to Cassandra is not enabled here.
    scored_tweets.pprint()

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
def main():
    """Count positive/negative feeling words in a Kafka tweet stream and plot them.

    Uses a ``local[2]`` Spark job with 15-second batches reading the
    ``twitterStream`` topic on ``localhost:9092``. Per-batch counts are
    collected on the driver and handed to ``constructPlot`` after the
    stream has been stopped.
    """
    spark_conf = SparkConf().setMaster("local[2]").setAppName("twitterStream")
    sc = SparkContext(conf=spark_conf)
    ssc = StreamingContext(sc, 15)  # 15-second batches
    ssc.checkpoint("checkpoint")

    # Word lists used for the membership tests below.
    negative_vocab = load("./Dataset/nFeeling.txt")
    positive_vocab = load("./Dataset/pFeeling.txt")

    kafka_stream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitterStream'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    def ascii_only(record):
        # Drop every character that cannot be encoded as ASCII.
        return record[1].encode("ascii", "ignore")

    def flag_negative(word):
        return ('nfeelings', 1 if word in negative_vocab else 0)

    def flag_positive(word):
        return ('pfeelings', 1 if word in positive_vocab else 0)

    words = kafka_stream.map(ascii_only).flatMap(lambda line: line.split(" "))
    negatives = words.map(flag_negative)
    positives = words.map(flag_positive)

    # One (label, count) pair per label and batch.
    batch_totals = positives.union(negatives).reduceByKey(lambda x, y: x + y)

    # Running totals across batches, maintained through ``sumCount``.
    running_totals = batch_totals.updateStateByKey(sumCount)
    running_totals.pprint()

    counts = []

    def remember_batch(batch_time, rdd):
        counts.append(rdd.collect())

    batch_totals.foreachRDD(remember_batch)

    ssc.start()
    ssc.awaitTerminationOrTimeout(45)
    ssc.stop(stopGraceFully=True)

    constructPlot(counts)
def start(port, duration=40, jobID='', batch_interval=20):
    """Stream posts from a socket and save their timestamped text to disk.

    Args:
        port: Port on ``HOST`` that the socket text stream connects to.
        duration: Timeout passed to ``awaitTerminationOrTimeout``.
        jobID: Suffix used for the text files written under ``./text``.
        batch_interval: Batch interval handed to ``StreamingContext``.
    """
    # Local Spark context on master 'local[2]'; the batch length comes
    # from ``batch_interval``.
    sc = SparkContext('local[2]', 'NetworkWordCount')
    ssc = StreamingContext(sc, batch_interval)

    def usable(post):
        # Keep only posts that parsed and carry a creation timestamp.
        return post is not None and 'created_at' in post

    def has_valid_text(post):
        return is_valid_string_format(post['text'])

    def render(post):
        return post['created_at'] + ' | ' + post['text']

    # DStream connected to HOST:port.
    lines = ssc.socketTextStream(HOST, port)
    posts = lines.map(lambda raw: get_json(raw)).filter(usable)
    text = posts.filter(has_valid_text).map(render)

    # The text is written to disk here. Computing sentiment counts on the
    # fly (without the intermediate text files) is a possible alternative,
    # but is not enabled in this function.
    text.saveAsTextFiles('./text', suffix=jobID)

    ssc.start()
    ssc.awaitTerminationOrTimeout(duration)
    ssc.stop()
def main():
    """Stream tweets from Kafka, attach a sentiment and save them to Cassandra.

    Tweets come from the ``twitter-topic1`` topic on ``localhost:9092`` and
    are written to table ``tweettable`` in keyspace ``tweetdb``. The job
    runs in ``local[2]`` mode with 10-second batches and is stopped
    gracefully once ``awaitTerminationOrTimeout(100)`` returns.
    """
    positive_words = load_wordlist("./Dataset/positive.txt")
    negative_words = load_wordlist("./Dataset/negative.txt")

    spark_conf = SparkConf()
    spark_conf = spark_conf.setMaster("local[2]")
    spark_conf = spark_conf.setAppName("TweeStreamer")
    spark_conf = spark_conf.set(
        "spark.cassandra.connection.host",
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")

    sc = CassandraSparkContext(conf=spark_conf)
    sc.setLogLevel("WARN")
    # The SQL context is created but not referenced again in this function.
    sql = SQLContext(sc)

    # Streaming context with a batch interval of 10 seconds.
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kafka_stream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    def parse_tweet(record):
        # The second tuple element is parsed as JSON as-is (no explicit decode).
        return json.loads(record[1])

    def add_sentiment(tweet):
        return tweetwithSentiment(tweet, positive_words, negative_words)

    scored_tweets = kafka_stream.map(parse_tweet).map(add_sentiment)
    scored_tweets.saveToCassandra("tweetdb", "tweettable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
def net_streaming():
    """Connect to a TCP server and word-count the incoming socket text stream.

    Command-line arguments (read from ``sys.argv``):
        host: IP address of the TCP server.
        port: Port of the TCP server.

    Returns:
        -1 after printing a usage message when the argument count is wrong;
        otherwise None once the stream has been stopped.
    """
    if len(sys.argv) != 3:
        # Parenthesised single-argument print emits the same text on
        # Python 2 and Python 3 (the bare statement form is Python 2 only).
        print("usage: chapter9_streaming.py <tcp host> <tcp port>")
        return -1

    host, port = sys.argv[1], sys.argv[2]

    sc = SparkContext(appName="pyspark_net_streaming")
    stream_sc = StreamingContext(sc, 1)  # 1-second batches

    socketTexts = stream_sc.socketTextStream(host, int(port))
    counts = socketTexts.flatMap(lambda line: line.split(" "))\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)
    # Print up to 24 elements of every batch.
    counts.pprint(24)

    stream_sc.start()
    stream_sc.awaitTerminationOrTimeout(timeout=30)
    # Only the streaming context is stopped; the SparkContext stays alive.
    stream_sc.stop(stopSparkContext=False, stopGraceFully=True)
def kafka_streaming():
    """Receive Kafka messages and word-count them.

    Command-line arguments (read from ``sys.argv``):
        zookeeper host: ZooKeeper quorum passed to ``KafkaUtils.createStream``.
        topic name: Kafka topic to consume (with one consumer thread).

    Returns:
        -1 after printing a usage message when the argument count is wrong;
        otherwise None once the stream has been stopped.
    """
    if len(sys.argv) != 3:
        # Parenthesised single-argument print emits the same text on
        # Python 2 and Python 3 (the bare statement form is Python 2 only).
        print("Usage: chapter9_streaming.py <zookeepr host> <topic name>")
        return -1

    sc = SparkContext(appName="kafka_streaming")
    ssc = StreamingContext(sc, 1)  # 1-second batches

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: 1})

    # Only the second element of every received tuple is used.
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    # Print up to 24 elements of every batch.
    counts.pprint(24)

    ssc.start()
    ssc.awaitTerminationOrTimeout(timeout=30)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
def main():
    """Count feeling words in the ``twitter-stream`` Kafka topic and plot them.

    Runs with 1-second batches for at most 10 seconds. The per-batch
    ``("pfeelings", n)`` / ``("nfeelings", n)`` results are collected on
    the driver and passed to ``construct_plot``.
    """
    negative_vocab = load_word_list(
        "/home/peterli2he1/spark/Dataset/nFeeling.txt")
    positive_vocab = load_word_list(
        "/home/peterli2he1/spark/Dataset/pFeeling.txt")

    # Spark streaming context with checkpointing enabled.
    spark_conf = SparkConf().setAppName("TwitterStreamApplication")
    sc = SparkContext(conf=spark_conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint_TwitterStreamApplication")

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, ["twitter-stream"], {"metadata.broker.list": "localhost:9092"})

    def ascii_only(record):
        # Keep the message text, dropping characters outside ASCII.
        return record[1].encode("ascii", "ignore")

    def flag_negative(word):
        return ("nfeelings", 1 if word in negative_vocab else 0)

    def flag_positive(word):
        return ("pfeelings", 1 if word in positive_vocab else 0)

    words = kafka_stream.map(ascii_only).flatMap(lambda line: line.split(" "))
    negatives = words.map(flag_negative)
    positives = words.map(flag_positive)
    feeling_counts = positives.union(negatives).reduceByKey(
        lambda x, y: x + y)

    counts = []

    def remember_batch(batch_time, rdd):
        counts.append(rdd.collect())

    feeling_counts.foreachRDD(remember_batch)

    ssc.start()
    ssc.awaitTerminationOrTimeout(10)
    ssc.stop(stopGraceFully=True)

    construct_plot(counts)
def start(self, port, keyword, timeout=60):
    """Run the tweet word-count and prediction stream for one keyword.

    Args:
        port: Local port (on 127.0.0.1) the tweet text is read from; also
            appended to the Spark application name.
        keyword: Passed to ``DBFireBase``; the instance is stored on
            ``self.db``.
        timeout: Timeout handed to ``awaitTerminationOrTimeout``
            (default 60).
    """
    print("port", port)
    print("keyword", keyword)

    app_name = "TwitterStreamApp" + str(port)
    sc = SparkContext('local[3]', app_name)
    sc.setLogLevel("ERROR")

    # Streaming context with 3-second batches; checkpointing is not enabled.
    ssc = StreamingContext(sc, 3)

    self.db = DBFireBase(keyword)
    dataStream = ssc.socketTextStream("127.0.0.1", port)

    def tokenize(line):
        return wc_tokenize(_preprocess(line))

    def classify(line):
        # Pair every raw line with the prediction computed from its features.
        features = get_feats(np.asarray([line], dtype='U'))
        return (line, predict(features))

    # Word counts per batch.
    word_counts = dataStream.flatMap(tokenize) \
        .map(lambda token: (token, 1)) \
        .reduceByKey(lambda a, b: a + b)

    # One (line, prediction) pair per incoming line.
    result = dataStream.map(classify)

    # Both result streams are handed to this object's RDD handlers.
    word_counts.foreachRDD(self.process_rdd)
    result.foreachRDD(self.process_rdd2)

    ssc.start()
    ssc.awaitTerminationOrTimeout(timeout)
    ssc.stop()
def Stream():
    """Consume the ``twitter`` Kafka topic, save tweets and sentiment counts.

    Runs 10-second batches for at most 60 seconds. Tweets and per-batch
    sentiment counts are written as text files; cumulative counts
    (maintained through ``cumu``) are collected on the driver.

    Returns:
        A list with one ``rdd.collect()`` result per processed batch of the
        cumulative sentiment stream.
    """
    sc = SparkContext(appName='NewsTwitter')
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint('checkpoint')

    kafka_stream = KafkaUtils.createStream(
        ssc, 'localhost:2181', 'spark-streaming', {'twitter': 1})

    def decode(record):
        return json.loads(record[1])

    parsed = kafka_stream.map(decode).map(parseText)
    parsed.cache()

    # All tweets of a batch are joined into one newline-terminated string.
    joined_tweets = parsed.map(lambda tweet: tweet + '\n')
    tweets_saver = joined_tweets.reduce(lambda x, y: x + y)
    tweets_saver.saveAsTextFiles(
        'file:///home/lmh/Downloads/temp/lmh/text/tweets/t')

    def label(tweet):
        return [analize_sentiment(tweet), 1]

    sentiment_count = parsed.map(label).reduceByKey(add)
    sentiment_count.cache()
    sentiment_count.saveAsTextFiles(
        'file:///home/lmh/Downloads/temp/lmh/text/sentiment/s')
    sentiment_count.pprint()

    sentiment_fig_data = sentiment_count.updateStateByKey(cumu)

    counts = []

    def remember_batch(batch_time, rdd):
        counts.append(rdd.collect())

    sentiment_fig_data.foreachRDD(remember_batch)

    ssc.start()
    ssc.awaitTerminationOrTimeout(60)
    ssc.stop(stopGraceFully=True)

    return counts
def file_streaming_dynamic():
    """Process CSV files that appear in a local directory and save the output.

    Watches ``file:///home/hadoop/stream/`` with 1-second batches, splits
    every line on commas, prints up to 24 elements per batch and writes
    the batches under ``/home/hadoop/output/``. Streaming and the Spark
    context are stopped once ``awaitTerminationOrTimeout(timeout=100)``
    returns.
    """
    sc = SparkContext.getOrCreate()
    stream_sc = StreamingContext(sc, 1)

    raw_lines = stream_sc.textFileStream("file:///home/hadoop/stream/")
    file_stream = raw_lines.map(lambda line: line.split(","))

    file_stream.pprint(24)
    file_stream.saveAsTextFiles("/home/hadoop/output/")

    stream_sc.start()
    stream_sc.awaitTerminationOrTimeout(timeout=100)
    stream_sc.stop(stopSparkContext=True, stopGraceFully=True)
def spark_analysis(topic_name):
    """Aggregate sentiment for one Kafka topic and print it per batch.

    Args:
        topic_name: Kafka topic to read from ``localhost:9092``. It is also
            used as the single reduce key and passed to
            ``analyze_sentiment``.

    Every batch (60 seconds) prints ``(topic_name, [sum, count])``, where
    ``sum`` adds up the ``analyze_sentiment`` results and ``count`` the
    number of lines.
    """
    sc = SparkContext(appName="PythhonSpark")
    ssc = StreamingContext(sc, 60)

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": "localhost:9092"})

    def ascii_only(record):
        # Keep the message text, dropping characters outside ASCII.
        return record[1].encode("ascii", "ignore")

    def score(line):
        return (topic_name, [analyze_sentiment(topic_name, line), 1])

    def merge(left, right):
        return [left[0] + right[0], left[1] + right[1]]

    lines = kafka_stream.map(ascii_only)
    reviews = lines.flatMap(lambda text: text.split("\n")).map(score)
    answer = reviews.reduceByKey(merge)
    answer.pprint()

    ssc.start()
    # NOTE(review): the context is not stopped after the timeout below, so
    # the job keeps running when this function returns; confirm that this
    # is intended.
    ssc.awaitTerminationOrTimeout(600)
def file_streaming_static():
    """Monitor files on HDFS, read the data and hand each batch to ``file_save``.

    Watches ``hdfs://127.0.0.1:9000/test_data`` with 1-second batches,
    splits every line on commas and prints up to 24 elements per batch.
    Streaming and the Spark context are stopped once
    ``awaitTerminationOrTimeout(timeout=30)`` returns.
    """
    sc = SparkContext.getOrCreate()
    # The session is created but not referenced again in this function.
    spark = SparkSession(sc)
    stream_sc = StreamingContext(sc, 1)

    raw_lines = stream_sc.textFileStream("hdfs://127.0.0.1:9000/test_data")
    file_data = raw_lines.map(lambda line: line.split(","))

    file_data.pprint(24)
    file_data.foreachRDD(file_save)

    stream_sc.start()
    stream_sc.awaitTerminationOrTimeout(timeout=30)
    stream_sc.stop(stopSparkContext=True, stopGraceFully=True)
def startStreaming(socketio):
    """Classify tweets from Kafka and push bot/human counts to the front-end.

    Args:
        socketio: Object whose ``emit`` method receives an
            ``'update_values'`` event with the ``bots`` and ``humans``
            counts of every non-empty batch.

    Reads the ``tweet_stream`` topic on ``localhost:9092`` with 0.1-second
    batches for at most 600 seconds. Predictions come from the
    module-level ``clf``.
    """

    def process_batch(batch):
        # The message value is JSON text that pandas reads into a Series.
        payload = json.loads(batch[1])
        return pd.read_json(payload, typ='series', orient='records')

    def process_RDD(rdd):
        global clf
        tweets = rdd.collect()
        if not tweets:
            return
        labels = clf.predict(tweets).tolist()
        # Label 0 is reported as a bot, label 1 as a human.
        bots = labels.count(0)
        humans = labels.count(1)
        print("Sending data to front-end")
        socketio.emit('update_values', {'bots': bots, 'humans': humans})

    sc = SparkContext.getOrCreate()
    ssc = StreamingContext(sc, 0.1)

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, ["tweet_stream"], {'metadata.broker.list': "localhost:9092"})
    tweets = kafka_stream.map(process_batch)
    tweets.foreachRDD(process_RDD)

    ssc.start()
    ssc.awaitTerminationOrTimeout(600)
    ssc.stop()
def main():
    """Count feeling words in the ``twitter-stream`` Kafka topic and plot them.

    Runs 1-second mini batches for at most 10 seconds. Each batch yields
    ``("pfeelings", n)`` and ``("nfeelings", n)`` totals; these are
    collected on the driver and passed to ``construct_plot``.
    """
    negative_vocab = load_word_list("/home/xxx/spark/Dataset/nFeeling.txt")
    positive_vocab = load_word_list("/home/xxx/spark/Dataset/pFeeling.txt")

    # Spark streaming context: 1-second mini batches, checkpointing enabled.
    spark_conf = SparkConf().setAppName("TwitterStreamApplication")
    sc = SparkContext(conf=spark_conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint_TwitterStreamApplication")

    # "twitter-stream" is the Kafka topic; "metadata.broker.list" holds the
    # broker address.
    kafka_stream = KafkaUtils.createDirectStream(
        ssc, ["twitter-stream"], {"metadata.broker.list": "localhost:9092"})

    def ascii_only(record):
        # Only the second tuple element (the message content) is analysed;
        # characters that cannot be encoded as ASCII are dropped.
        return record[1].encode("ascii", "ignore")

    def flag_negative(word):
        return ("nfeelings", 1 if word in negative_vocab else 0)

    def flag_positive(word):
        return ("pfeelings", 1 if word in positive_vocab else 0)

    # Split every tweet on single spaces and flatten into one stream of words.
    words = kafka_stream.map(ascii_only).flatMap(lambda line: line.split(" "))
    negatives = words.map(flag_negative)
    positives = words.map(flag_positive)

    # Merge both flag streams and add up the flags per label.
    feeling_counts = positives.union(negatives).reduceByKey(
        lambda x, y: x + y)

    counts = []

    def remember_batch(batch_time, rdd):
        # collect() brings the per-batch totals to the driver for plotting.
        counts.append(rdd.collect())

    feeling_counts.foreachRDD(remember_batch)

    ssc.start()
    ssc.awaitTerminationOrTimeout(10)
    ssc.stop(stopGraceFully=True)

    construct_plot(counts)
def main():
    """Aggregate tweet sentiment per search term and store it in Cassandra.

    Tweets are read from the ``twitter-topic1`` topic on ``localhost:9092``
    in 10-second batches. For every batch the number of tweets is printed,
    each tweet is scored with ``tweetwithSentiment``, the
    ``searchTermFunction`` results are summed per key, and one row per
    search term is printed and saved to ``tweetdb.searchtermtable``. The
    job is stopped gracefully once ``awaitTerminationOrTimeout(1000)``
    returns.
    """
    pwords = load_wordlist("../Dataset/positive.txt")
    nwords = load_wordlist("../Dataset/negative.txt")
    sterms = load_wordlist("../Dataset/keyWords.txt")

    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",
            "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms))

    searchTermUsentiment = tweetsUsentiment.flatMap(
        lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey(
            lambda a, b: a + b)

    def to_row(pair):
        # Explicit unpacking replaces the Python 2-only ``lambda (key, value)``
        # tuple-parameter syntax, which is a SyntaxError on Python 3.
        key, value = pair
        return {
            "searchterm": "_" + key,
            "insertion_time": datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S'),
            "sentiment": value
        }

    searchTermUsentiment = searchTermUsentiment.map(to_row)
    searchTermUsentiment.pprint()
    searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000)
    ssc.stop(stopGraceFully=True)
def main():
    """Count feeling words in the ``twitter-stream`` Kafka topic and plot them.

    The word lists are loaded from relative paths (switch to absolute
    paths if necessary). The job runs 1-second batches for at most
    10 seconds, then passes the collected per-batch counts to
    ``construct_plot``.
    """
    negative_vocab = load_word_list("../word_monitor/dataset/nFeeling.txt")
    positive_vocab = load_word_list("../word_monitor/dataset/pFeeling.txt")

    # Spark streaming context with checkpointing enabled.
    spark_conf = SparkConf().setAppName("TwitterStreamApplication")
    sc = SparkContext(conf=spark_conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("checkpoint_TwitterStreamApplication")

    kafka_stream = KafkaUtils.createDirectStream(
        ssc, ["twitter-stream"], {"metadata.broker.list": "localhost:9092"})

    def ascii_only(record):
        # Keep the message text, dropping characters outside ASCII.
        return record[1].encode("ascii", "ignore")

    def split_words(line):
        return line.split(" ")

    def flag_negative(word):
        return ("nfeelings", 1 if word in negative_vocab else 0)

    def flag_positive(word):
        return ("pfeelings", 1 if word in positive_vocab else 0)

    words = kafka_stream.map(ascii_only).flatMap(split_words)
    negatives = words.map(flag_negative)
    positives = words.map(flag_positive)

    # Summing per key gives one total for "nfeelings" and one for "pfeelings".
    feeling_counts = positives.union(negatives).reduceByKey(
        lambda x, y: x + y)

    counts = []

    def remember_batch(batch_time, rdd):
        # A two-parameter callback receives (time, rdd) from foreachRDD.
        counts.append(rdd.collect())

    feeling_counts.foreachRDD(remember_batch)

    ssc.start()
    ssc.awaitTerminationOrTimeout(10)  # running time
    ssc.stop(stopGraceFully=True)

    construct_plot(counts)
def main():
    """Stream stock messages from Kafka, print them and save them to Cassandra.

    Messages are read from the ``stock-topic1`` topic on ``localhost:9092``
    in 10-second batches, parsed as JSON, printed and written to
    ``tweetdb.stocktable``. The job is stopped gracefully once
    ``awaitTerminationOrTimeout(10000)`` returns.
    """
    # Loaded but not referenced again in this function.
    tickerSymbols = load_wordlist("../Dataset/tickerSymbols.txt")

    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("StockStreamer").\
        set("spark.cassandra.connection.host",
            "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['stock-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    # Index the (key, value) pair instead of using the Python 2-only
    # ``lambda (key, value)`` syntax, which is a SyntaxError on Python 3.
    stock = kstream.map(lambda kv: json.loads(kv[1]))
    stock.pprint()
    stock.saveToCassandra("tweetdb", "stocktable")

    # Start the computation
    ssc.start()
    ssc.awaitTerminationOrTimeout(10000)
    ssc.stop(stopGraceFully=True)
def main():
    """Stream stock messages from Kafka and print them.

    Messages are read from the ``stock-topic1`` topic on ``localhost:9092``
    in 10-second batches and parsed as JSON. Checkpointing is not enabled.
    The job is stopped gracefully once ``awaitTerminationOrTimeout(100)``
    returns.
    """
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['stock-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    # Index the (key, value) pair instead of using the Python 2-only
    # ``lambda (key, value)`` syntax, which is a SyntaxError on Python 3.
    stock = kstream.map(lambda kv: json.loads(kv[1]))
    stock.pprint()

    # Start the computation
    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
#Step 7-4-3. Creating sum of each row of numbers.
def stringToNumberSum(data):
    """Return the sum of the whitespace-separated numbers in *data*.

    Args:
        data: Text such as ``'10 10 20 '``.

    Returns:
        The sum as a float, or None when *data* is empty or contains only
        whitespace.

    Raises:
        ValueError: If a token cannot be converted with ``float``.
    """
    # split() without arguments tolerates repeated spaces and tabs, which
    # split(' ') turned into empty tokens that float() rejects.
    tokens = data.split()
    if not tokens:
        return None
    return sum(float(token) for token in tokens)


dataInString = '10 10 20 '
stringToNumberSum(dataInString)

#Step 7-4-4. Reading data from Kafka and getting sum of each row.
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext

# 10-second batches on the existing ``sc`` Spark context.
bookStreamContext = StreamingContext(sc, 10)
bookKafkaStream = KafkaUtils.createStream(ssc=bookStreamContext,
                                          zkQuorum='localhost:2185',
                                          groupId='pysparkBookGroup',
                                          topics={'pysparkBookTopic': 1})

# Sum the numbers found in the second element of every received tuple.
sumedData = bookKafkaStream.map(lambda data: stringToNumberSum(data[1]))
sumedData.pprint()

bookStreamContext.start()
bookStreamContext.awaitTerminationOrTimeout(30)
class StreamingContextTests(PySparkStreamingTestCase):
    """Tests for StreamingContext: lifecycle, input streams, union/transform
    and the getActive()/getActiveOrCreate() helpers."""

    # Batch duration used whenever a test builds its own StreamingContext.
    duration = 0.1
    # Set by the factory in test_get_active_or_create to show it was invoked.
    setupCalled = False

    def _add_input_stream(self):
        # Register a queue stream with an output operation so the context
        # can be started.
        inputs = [range(1, x) for x in range(101)]
        queue_stream = self.ssc.queueStream(inputs)
        self._collect(queue_stream, 1, block=False)

    def _attach_count_job(self, ssc):
        # Minimal single-element stream with an output operation on *ssc*.
        ssc.queueStream([[1]]).foreachRDD(lambda rdd: rdd.count())

    def test_stop_only_streaming_context(self):
        self._add_input_stream()
        self.ssc.start()
        self.ssc.stop(False)
        # The SparkContext must still be usable after stop(False).
        partitions = self.sc.parallelize(range(5), 5).glom().collect()
        self.assertEqual(len(partitions), 5)

    def test_stop_multiple_times(self):
        self._add_input_stream()
        self.ssc.start()
        # Stopping twice in a row must not raise.
        for _ in range(2):
            self.ssc.stop(False)

    def test_queue_stream(self):
        batches = [list(range(i + 1)) for i in range(3)]
        queue_stream = self.ssc.queueStream(batches)
        collected = self._collect(queue_stream, 3)
        self.assertEqual(batches, collected)

    def test_text_file_stream(self):
        watch_dir = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        numbers = self.ssc.textFileStream(watch_dir).map(int)
        collected = self._collect(numbers, 2, block=False)
        self.ssc.start()
        # Two files are written one second apart after the stream started.
        for name in ('a', 'b'):
            time.sleep(1)
            path = os.path.join(watch_dir, name)
            with open(path, "w") as f:
                f.writelines(["%d\n" % i for i in range(10)])
        self.wait_for(collected, 2)
        expected = [list(range(10)), list(range(10))]
        self.assertEqual(expected, collected)

    def test_binary_records_stream(self):
        watch_dir = tempfile.mkdtemp()
        self.ssc = StreamingContext(self.sc, self.duration)
        # Every 10-byte record is unpacked into ten signed bytes.
        records = self.ssc.binaryRecordsStream(watch_dir, 10).map(
            lambda v: struct.unpack("10b", bytes(v)))
        collected = self._collect(records, 2, block=False)
        self.ssc.start()
        for name in ('a', 'b'):
            time.sleep(1)
            path = os.path.join(watch_dir, name)
            with open(path, "wb") as f:
                f.write(bytearray(range(10)))
        self.wait_for(collected, 2)
        expected = [list(range(10)), list(range(10))]
        self.assertEqual(expected, [list(v[0]) for v in collected])

    def test_union(self):
        batches = [list(range(i + 1)) for i in range(3)]
        first = self.ssc.queueStream(batches)
        second = self.ssc.queueStream(batches)
        combined = self.ssc.union(first, second)
        collected = self._collect(combined, 3)
        # Each batch is expected twice in a row (list repetition).
        expected = [batch * 2 for batch in batches]
        self.assertEqual(expected, collected)

    def test_transform(self):
        sources = [self.ssc.queueStream([[value]]) for value in (1, 2, 3)]

        def rotate(rdds):
            head, middle, tail = rdds
            return middle.union(tail).union(head)

        rotated = self.ssc.transform(sources, rotate)
        self.assertEqual([2, 3, 1], self._take(rotated, 3))

    def test_transform_pairrdd(self):
        # This regression test case is for SPARK-17756.
        source = self.ssc.queueStream([[1], [2], [3]])
        pairs = source.transform(lambda rdd: rdd.cartesian(rdd))
        self.assertEqual([(1, 1), (2, 2), (3, 3)], self._take(pairs, 3))

    def test_get_active(self):
        self.assertIsNone(StreamingContext.getActive())

        # A started context is reported as the active one.
        self._attach_count_job(self.ssc)
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)

        # After stop() there is no active context.
        self.ssc.stop(False)
        self.assertIsNone(StreamingContext.getActive())

        # Stopping only the Java context also clears the active context.
        self.ssc = StreamingContext(self.sc, self.duration)
        self._attach_count_job(self.ssc)
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.assertIsNone(StreamingContext.getActive())

    def test_get_active_or_create(self):
        # Covers StreamingContext.getActiveOrCreate() without checkpoint
        # data; see CheckpointTests for tests with checkpoint data.
        self.ssc = None
        self.assertIsNone(StreamingContext.getActive())

        def setupFunc():
            created = StreamingContext(self.sc, self.duration)
            self._attach_count_job(created)
            self.setupCalled = True
            return created

        # No active context: the factory must be called.
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Active context: it is returned and the factory is not called.
        self.ssc.start()
        self.setupCalled = False
        self.assertEqual(
            StreamingContext.getActiveOrCreate(None, setupFunc), self.ssc)
        self.assertFalse(self.setupCalled)

        # Stopped context: the factory is called again.
        self.ssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

        # Java context stopped behind Python's back: the factory is called
        # again as well.
        self.ssc = StreamingContext(self.sc, self.duration)
        self._attach_count_job(self.ssc)
        self.ssc.start()
        self.assertEqual(StreamingContext.getActive(), self.ssc)
        self.ssc._jssc.stop(False)
        self.setupCalled = False
        self.ssc = StreamingContext.getActiveOrCreate(None, setupFunc)
        self.assertTrue(self.setupCalled)

    def test_await_termination_or_timeout(self):
        self._add_input_stream()
        self.ssc.start()
        # Still running: the wait times out and reports False.
        self.assertFalse(self.ssc.awaitTerminationOrTimeout(0.001))
        self.ssc.stop(False)
        # Already stopped: the wait reports True.
        self.assertTrue(self.ssc.awaitTerminationOrTimeout(0.001))
reverse=True) for i in range(3): print(" " + str(word_count_dict_tuple[i])) # -------------------------------------------------------------------------- # DStream processing # -------------------------------------------------------------------------- textDataRDD = ssc.textFileStream(Directory) textDataRDD = textDataRDD.map(lambda row: row.split(SPACE_CHARACTER)[3]) count_barrage = textDataRDD.flatMap(splitWord).map(lambda word: (word, 1)) count_word = count_barrage.reduceByKey(lambda a, b: a + b) count_word.foreachRDD(topBarrage) # print(count) # -------------------------------------------------------------------------- # DStream main() # -------------------------------------------------------------------------- ssc.start() ssc.awaitTerminationOrTimeout(64) ssc.stop() print("DStream finished.") f = open('result.txt', 'a') f.write("result:") f.write( str( sorted(accumulate_barrage.items(), key=lambda kv: kv[1], reverse=True)[:10])) f.close()
def run_spark_job(queue: Queue,
                  _agg_function: AggregationFunction,
                  _agg_window_millis: int,
                  _spark_opts: dict = {},
                  _environment: dict = {}):
    """Run a Spark Streaming job that aggregates Pub/Sub messages per window.

    Args:
        queue: Receives one ``message.to_json()`` entry per aggregated row.
        _agg_function: Selects the SQL aggregate applied to the ``value``
            column (AVG, SUM, COUNT, P50, P75, P95, P99); any other value
            leaves the plain ``value`` expression.
        _agg_window_millis: Not read in this body. The batch interval is
            taken from the enclosing scope's ``agg_window_millis`` and the
            grouping window from ``self.agg_window_millis``.
        _spark_opts: Every entry is passed to the session builder's
            ``config``. An optional ``"timeout"`` entry bounds the run;
            without it the job runs until terminated.
        _environment: Merged into ``os.environ`` before Spark is set up.

    Both default dicts are only read here, never mutated.

    Raises:
        Any exception raised while building or running the job propagates
        unchanged. The streaming context and Spark session are stopped
        first.
    """
    os.environ.update(_environment)

    try:
        import findspark
        findspark.init()
    except Exception as ex:
        # Best effort: Spark may already be importable without findspark.
        self.logger.warn("Cannot import Spark pyspark with"
                         " findspark. Message: {}".format(str(ex)))

    from pyspark.sql import SparkSession
    from pyspark.streaming import StreamingContext
    from pyspark.sql.functions import expr, window
    from pyspark.serializers import NoOpSerializer
    from pyspark.streaming import DStream
    from pyspark.streaming.kafka import utf8_decoder

    spark_builder = SparkSession.builder
    for k in _spark_opts:
        spark_builder = spark_builder.config(k, _spark_opts[k])

    # Keep the result of the chained calls, as the Kafka variant of this
    # job does, instead of relying on the builder being mutated in place.
    spark_builder = spark_builder \
        .appName(str(self)) \
        .config("spark.jars.packages",
                "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
        .config("spark.jars",
                BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

    spark = spark_builder.getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    # NOTE(review): this reads ``agg_window_millis`` from the enclosing
    # scope rather than the ``_agg_window_millis`` parameter; confirm which
    # one is intended.
    ssc = StreamingContext(spark.sparkContext, (agg_window_millis / 1000))

    agg = expr("value")
    if _agg_function == AggregationFunction.AVG:
        agg = expr("avg(value)")
    elif _agg_function == AggregationFunction.SUM:
        agg = expr("sum(value)")
    elif _agg_function == AggregationFunction.COUNT:
        agg = expr("count(value)")
    elif _agg_function == AggregationFunction.P50:
        agg = expr("percentile(value, 0.5)")
    elif _agg_function == AggregationFunction.P75:
        agg = expr("percentile(value, 0.75)")
    elif _agg_function == AggregationFunction.P95:
        agg = expr("percentile(value, 0.95)")
    elif _agg_function == AggregationFunction.P99:
        agg = expr("percentile(value, 0.99)")

    # JVM-side helpers for the Pub/Sub receiver.
    deserializer = \
        ssc._jvm.org.apache.spark.streaming.pubsub.SparkPubsubMessageSerializer()  # noqa: E501
    pubsub_utils = \
        ssc._jvm.org.apache.spark.streaming.pubsub.PubsubUtils
    credentials = \
        ssc._jvm.org.apache.spark.streaming.pubsub.SparkGCPCredentials
    storage_level = \
        ssc._jvm.org.apache.spark.storage.StorageLevel

    _pubsub_stream = pubsub_utils \
        .createStream(ssc._jssc, project_id, subscription,
                      credentials.Builder().build(),
                      storage_level.DISK_ONLY())
    _pubsub_stream_des = _pubsub_stream.transform(deserializer)

    # Wrap the JVM stream as a Python DStream and decode payloads as UTF-8.
    ser = NoOpSerializer()
    pubsub_stream = DStream(_pubsub_stream_des, ssc, ser).map(utf8_decoder)

    def aggregate_rdd(_queue, _agg, df, ts):
        # Aggregate one batch per (application, time window) and enqueue
        # each resulting row as JSON.
        secs = int(self.agg_window_millis / 1000)
        win = window("ts", "{} seconds".format(secs))
        if df.first():
            aggs = df \
                .groupBy("application", win) \
                .agg(_agg.alias("value")) \
                .collect()
            for row in aggs:
                message = InputMessage(row["application"],
                                       value=row["value"],
                                       ts=ts)
                self.logger.debug("Enqueue: {}".format(
                    message.to_json()))
                try:
                    _queue.put(message.to_json())
                except AssertionError as ex:
                    self.logger.warn(str(ex))
        else:
            self.logger.warn("Empty RDD")

    # Register the per-batch aggregation on the Pub/Sub stream.
    pubsub_stream \
        .foreachRDD(lambda ts, rdd: aggregate_rdd(queue, agg,
                                                  spark.read.json(rdd),
                                                  ts))

    # Run; the contexts are stopped even when starting or waiting raises.
    try:
        ssc.start()
        if "timeout" in _spark_opts:
            ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
        else:
            ssc.awaitTermination()
    finally:
        ssc.stop()
        spark.stop()
dstream = dstream.flatMap(extract_carr_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average) elif args.task == 'q13': dstream = dstream.flatMap(extract_weekday_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average) elif args.task == 'q21': dstream = dstream.flatMap(extract_origin_carrier_dep_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average) elif args.task == 'q22': dstream = dstream.flatMap(extract_origin_destination_dep_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average) elif args.task == 'q23': dstream = dstream.flatMap(extract_route_carrier_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average) elif args.task == 'q24': dstream = dstream.flatMap(extract_route_arr_delay).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average) elif args.task == 'q32': get_cass().execute('truncate %s' % schema['table']) dstream = dstream.flatMap(extract_trip_info).foreachRDD(save_trip) else: print("Unknown task") # runner ts_last_data = time.time() ssc.start() while True: res = ssc.awaitTerminationOrTimeout(args.run_interval) if res: # stopped elsewhere break else: # still running if time.time() - ts_last_data > args.idle_time: dump("No data received for %d seconds, stopping..." % args.idle_time) ssc.stop(stopSparkContext=True, stopGraceFully=False)
def run_spark_job(queue: Queue,
                  _agg_function: AggregationFunction,
                  _agg_window_millis: int,
                  _spark_opts: dict = None,
                  _environment: dict = None):
    """Consume JSON messages from Kafka, aggregate them per batch and enqueue the results.

    Each streaming batch is parsed with ``spark.read.json``, grouped by the
    ``application`` column and a time window over the ``ts`` column, reduced
    with the selected aggregation over the ``value`` column, and every
    resulting row is put on ``queue`` as the JSON form of an ``InputMessage``.

    NOTE(review): ``self`` is not a parameter of this function; it is
    presumably captured from an enclosing method (it supplies ``logger``,
    ``input_topic`` and ``broker_servers``) -- confirm against the caller.

    :param queue: queue receiving one JSON string per aggregated row.
    :param _agg_function: ``AggregationFunction`` member selecting the
        aggregation; unrecognised values fall back to the raw ``value``
        expression.
    :param _agg_window_millis: batch interval / window size in milliseconds.
    :param _spark_opts: extra Spark configuration entries. A ``"timeout"``
        key, when present, bounds how long the stream runs (it is also
        forwarded to the session builder like every other entry).
    :param _environment: variables merged into ``os.environ`` before Spark
        is initialised.
    :raises Exception: any error raised by Spark propagates to the caller
        after the streaming context and the session have been stopped.
    """
    # Fresh dicts per call: the previous ``{}`` defaults were shared between calls.
    _spark_opts = {} if _spark_opts is None else _spark_opts
    _environment = {} if _environment is None else _environment

    os.environ.update(_environment)

    try:
        import findspark
        findspark.init()
    except Exception as ex:
        # Best effort: pyspark may already be importable without findspark.
        self.logger.warn("Cannot import Spark pyspark with"
                         " findspark. Message: {}".format(str(ex)))

    from pyspark.sql import SparkSession
    from pyspark.streaming import StreamingContext
    from pyspark.streaming.kafka import KafkaUtils
    from pyspark.sql.functions import expr, window

    spark_builder = SparkSession.builder
    for k in _spark_opts:
        spark_builder = spark_builder.config(k, _spark_opts[k])

    spark_builder = spark_builder \
        .appName(str(self)) \
        .config("spark.jars.packages",
                "org.apache.spark:spark-streaming-kafka-0-8_2.11:2.2.1,"
                "org.apache.bahir:spark-streaming-pubsub_2.11:2.2.1") \
        .config("spark.jars",
                BASE_PATH + "/lib/streaming-pubsub-serializer_2.11-0.1.jar")

    spark = spark_builder.getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    # Use the value passed in, not a name that is not defined in this function.
    window_secs = _agg_window_millis / 1000
    ssc = StreamingContext(spark.sparkContext, window_secs)

    # Compared with ``==`` in declaration order, exactly like the if/elif
    # chain it replaces; "value" is the fallback expression.
    agg_sql_by_function = (
        (AggregationFunction.AVG, "avg(value)"),
        (AggregationFunction.SUM, "sum(value)"),
        (AggregationFunction.COUNT, "count(value)"),
        (AggregationFunction.P50, "percentile(value, 0.5)"),
        (AggregationFunction.P75, "percentile(value, 0.75)"),
        (AggregationFunction.P95, "percentile(value, 0.95)"),
        (AggregationFunction.P99, "percentile(value, 0.99)"),
    )
    agg_sql = next((sql for function, sql in agg_sql_by_function
                    if _agg_function == function), "value")
    agg = expr(agg_sql)

    kafka_stream = KafkaUtils.createDirectStream(
        ssc,
        [self.input_topic],
        {"metadata.broker.list": ",".join(self.broker_servers)})

    def aggregate_rdd(_queue, _agg, df, ts):
        """Aggregate one batch DataFrame and enqueue one message per row."""
        if not df.first():
            # Same logger as every other diagnostic in this job.
            self.logger.warn("Empty RDD")
            return

        win = window("ts", "{} seconds".format(int(window_secs)))
        aggs = df \
            .groupBy("application", win) \
            .agg(_agg.alias("value")) \
            .collect()

        for row in aggs:
            message = InputMessage(row["application"],
                                   value=row["value"],
                                   ts=ts)
            payload = message.to_json()
            self.logger.debug("Enqueue: {}".format(payload))
            try:
                _queue.put(payload)
            except AssertionError as ex:
                self.logger.warn(str(ex))

    # Kafka records arrive as (key, value) pairs; only the value is JSON.
    kafka_stream \
        .map(lambda x: x[1]) \
        .foreachRDD(lambda ts, rdd: aggregate_rdd(queue, agg,
                                                  spark.read.json(rdd), ts))

    ssc.start()
    try:
        if "timeout" in _spark_opts:
            ssc.awaitTerminationOrTimeout(_spark_opts["timeout"])
        else:
            ssc.awaitTermination()
    finally:
        # Release Spark resources on normal exit and on failure alike.
        ssc.stop()
        spark.stop()
class BiliSparkStreaming():
    """Streaming job that accumulates per-show play and danmaku (``dm``) counts.

    New text files in ``monitor_directory`` are read as comma-separated
    records. Every batch is appended to an in-memory pandas DataFrame
    (``total_data``); the totals per show, overall and per month, are then
    upserted into MongoDB and written to ranking text files under
    ``writeDirectory``.
    """

    def __init__(self, master):
        """Create the Spark contexts and the MongoDB client.

        :param master: Spark master URL passed to ``SparkConf.setMaster``.
        """
        self.master = master
        scf = SparkConf().setAppName("BiliSpark").setMaster(self.master).set("spark.cores.max", "3")
        self.sc = SparkContext(conf=scf)
        # Directory watched for new input files.
        self.monitor_directory = "/Users/chenhao/Documents/BiliSpark/data"
        # Directory the ranking files are written to.
        self.writeDirectory = '/Users/chenhao/Documents/BiliData/data/'
        self.streamingContext = StreamingContext(self.sc, 10)
        # Create the session up front; it is looked up again per batch via
        # getSparkSessionInstance, so the result is not kept here.
        SparkSession.builder.config(conf=scf).getOrCreate()
        self.mongo = MongoDB("mongodb://localhost:27017/", "biliSpark")
        self.months = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
        # All records received so far (pandas DataFrame), or None before the first batch.
        self.total_data = None

    def getSparkSessionInstance(self, sparkConf):
        """Return the process-wide SparkSession, creating it on first use."""
        if ('sparkSessionSingletonInstance' not in globals()):
            globals()['sparkSessionSingletonInstance'] = SparkSession \
                .builder \
                .config(conf=sparkConf) \
                .getOrCreate()
        return globals()['sparkSessionSingletonInstance']

    def monitor_process_data(self):
        """Watch the input directory and process every batch for up to 2000 seconds."""

        def parse_line(line):
            # Split once per record; fields 1..5 are tv, label, play, dm, month.
            # Kept as a local function so the closure does not capture ``self``.
            fields = line.split(",")
            return (fields[1], fields[2], int(fields[3]), int(fields[4]), fields[5])

        bili_data = self.streamingContext.textFileStream(self.monitor_directory)
        bili_data = bili_data.map(parse_line)
        bili_data.foreachRDD(self.process_data)
        self.streamingContext.start()
        self.streamingContext.awaitTerminationOrTimeout(2000)

    def process_data(self, time, rdd):
        """Append one batch to ``total_data`` and publish the refreshed totals.

        :param time: batch time supplied by ``foreachRDD``.
        :param rdd: batch RDD of ``(tv, label, play, dm, month)`` tuples.
        """
        print("时间:" + str(time))
        if not rdd.isEmpty():
            try:
                self._append_batch(rdd)
            except Exception as e:
                # Best effort: a bad batch is reported and skipped. Only
                # ``Exception`` is caught so Ctrl-C / SystemExit still stop the job.
                print(e)
        if self.total_data is not None:
            self._publish_totals()

    def _append_batch(self, rdd):
        """Convert one batch RDD to pandas and append it to ``total_data``."""
        spark = self.getSparkSessionInstance(rdd.context.getConf())
        rowRDD = rdd.map(lambda x: Row(tv=x[0], label=x[1], play=x[2], dm=x[3], month=x[4]))
        temp_bili_data = spark.createDataFrame(rowRDD)
        temp_bili_data.show()
        batch = temp_bili_data.toPandas()
        if self.total_data is None:
            self.total_data = batch
        else:
            self.total_data = pd.concat([self.total_data, batch],
                                        axis=0, sort=True, ignore_index=True)

    @staticmethod
    def _sum_by_tv(frame):
        """Return total ``play`` and ``dm`` per ``tv`` as a flat DataFrame.

        The numeric columns are selected explicitly: recent pandas versions
        no longer drop string columns on ``groupby().sum()`` but concatenate
        them, which would leak ``label``/``month`` into the stored totals.
        """
        return frame.groupby(['tv'])[['play', 'dm']].sum().reset_index()

    def _publish_totals(self):
        """Store the overall and per-month totals in MongoDB and in ranking files."""
        # Totals per show over all months.
        tv_bili = self._sum_by_tv(self.total_data)
        for record in self.db2Json(tv_bili):
            self.insert_mongo('allmonth_play_dm', {'tv': record['tv']}, record)

        # Totals per show for each month that has data.
        for tempmonth in self.months:
            month_rows = self.total_data[self.total_data['month'] == tempmonth]
            if month_rows.empty:
                continue
            month_totals = self._sum_by_tv(month_rows)
            # The group-by keeps only numeric columns, so restore the month.
            month_totals['month'] = tempmonth
            month_records = self.db2Json(month_totals)
            print(month_records)
            for record in month_records:
                # The month must be part of the upsert key.
                query = {'tv': record['tv'], 'month': tempmonth}
                self.insert_mongo('bymonth_play_dm', query, record)
            for column in ('play', 'dm'):
                ranked = month_totals.sort_values(by=column, ascending=False)
                self.write_playOrdm_bymonth(ranked, tempmonth, column)

        self.write_playOrdm_Allmonth(tv_bili, 'play')
        self.write_playOrdm_Allmonth(tv_bili, 'dm')

    def db2Json(self, db):
        """Convert a DataFrame to a list of record dicts via its JSON form."""
        json_record = db.to_json(orient='records')
        return json.loads(json_record)

    def _write_ranking(self, filepath, data, column):
        """Write one ``tv:value`` line per row of ``data`` to ``filepath``."""
        # Explicit UTF-8: show names and the file name contain Chinese text.
        with open(filepath, 'w', encoding='utf-8') as w:
            w.writelines("{}:{}\n".format(tv, value)
                         for tv, value in zip(data['tv'], data[column]))

    def write_playOrdm_bymonth(self, data, month, dmorplay):
        """Write the per-show ``dmorplay`` values of one month to its ranking file."""
        filepath = self.writeDirectory + month + '月-' + dmorplay + ".txt"
        self._write_ranking(filepath, data, dmorplay)

    def write_playOrdm_Allmonth(self, data, dmorPlay):
        """Write the per-show ``dmorPlay`` values over all months to a file."""
        filepath = self.writeDirectory + 'tv' + "- " + dmorPlay + ".txt"
        self._write_ranking(filepath, data, dmorPlay)

    def insert_mongo(self, collection, query, data):
        """Insert ``data`` into ``collection``, updating the document matching ``query``."""
        self.mongo.insertOrUpdate(collection, query, data)
temp[i].append(p.encode('utf-8', 'ignore')) temp[i].append(q) temp[i].append(r) i += 1 i = 0 for p in prediction.collect(): temp[i].append(p) i += 1 print(temp) for i in temp: insert_tweet(str(i[0]), str(i[1]), "0", int(i[3]), int(i[2])) else: print("Empty RDD !!!") pass twitter = tweets.map(lambda tweet: tweet['user']['screen_name']) tweet_text = tweets.map(lambda tweet: tweet['text']) txt = tweets.map(lambda x: (x['text'], x['user']['screen_name'], x['id'])) txt.foreachRDD(process_data) #text = tweet_text.map(lambda x: x.encode('utf-8','ignore')) #text.foreachRDD(process_data) ssc.start() ssc.awaitTerminationOrTimeout(1000) ssc.stop(stopGraceFully=True)
# Checkpoint directory, for stable state.
sc.setCheckpointDir("/tmp")
ssc = StreamingContext(sc, 0.01)

# One RDD per split file; each becomes one batch of the queue stream.
rddQ = [sc.textFile("data/split/" + filename)
        for filename in os.listdir("data/split")]
# rddQ.append(sc.textFile("data/split/aa"))

result = []


def update_result(rdd):
    """Remember the ten largest elements of the most recent batch."""
    global result
    result = rdd.top(10)


# Processing pipeline: clean, count, then swap each pair so that top()
# orders by the second element of the counted pairs.
dstream = scount(sclean(ssc.queueStream(rddQ)))
swapped = dstream.map(lambda x: (x[1], x[0]))
swapped.foreachRDD(lambda rdd: update_result(rdd))

ssc.start()
ssc.awaitTerminationOrTimeout(30)
ssc.stop()

for (k, v) in result:
    print(" ".join((str(k), str(v))))
dataFilePathOnHdfs = "hdfs://{}/btsdata/aviation/ontime/".format(master)

conf = SparkConf().setAppName(APP_NAME).setMaster('spark://{}:7077'.format(master))
# The configuration must be passed by keyword: the first positional
# parameter of SparkContext is the master URL, not the conf.
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, STREAMING_INTERVAL)
ssc.checkpoint('/tmp/ccc')

lines = ssc.textFileStream(dataFilePathOnHdfs)

# Average departure delay per carrier for flights leaving originAirport,
# computed independently for every batch.
res2_2 = (
    lines.map(lambda line: line.split(","))
    # originAirport is the 2nd argument: 'SRQ', 'CMH', 'JFK', 'SEA', or 'BOS'
    .filter(lambda fields: fields[6] == originAirport)
    # (Carrier, Departure Delay)
    .map(lambda fields: (fields[7], float(fields[12])))
    # Accumulate (sum, count) per carrier.
    .combineByKey(lambda delay: (delay, 1),
                  lambda acc, delay: (acc[0] + delay, acc[1] + 1),
                  lambda left, right: (left[0] + right[0], left[1] + right[1]))
    # kv is (carrier, (sum, count)); the sum is a float, so this is a true division.
    .map(lambda kv: (kv[0], kv[1][0] / kv[1][1]))
    # DStream has no sortByKey; sort each batch RDD through transform().
    .transform(lambda rdd: rdd.sortByKey(ascending=True))
)

# An output operation has to be registered before start(); DStream has no
# take(), so print the first ten results of every batch instead.
res2_2.pprint(10)

ssc.start()
while True:
    # Returns True once the context has been stopped elsewhere.
    if ssc.awaitTerminationOrTimeout(10):
        break

print("Gracefully stopping Spark Streaming Application")
# PySpark spells this keyword "stopGraceFully".
ssc.stop(stopSparkContext=True, stopGraceFully=True)
print("Application stoppped")
class BasicOperationTests(PySparkStreamingTestCase):
    """Tests of the basic DStream operations.

    Most tests feed a list of input batches through ``self._test_func``
    together with a function that builds the DStream under test and the
    batches expected as output.
    """

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)
        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)

    def test_flatMap(self):
        """Basic operation test for DStream.flatMap."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.flatMap(lambda x: (x, x * 2))
        expected = [list(chain.from_iterable((map(lambda y: [y, y * 2], x))))
                    for x in input]
        self._test_func(input, func, expected)

    def test_filter(self):
        """Basic operation test for DStream.filter."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.filter(lambda x: x % 2 == 0)
        expected = [[y for y in x if y % 2 == 0] for x in input]
        self._test_func(input, func, expected)

    def test_count(self):
        """Basic operation test for DStream.count."""
        input = [range(5), range(10), range(20)]

        def func(dstream):
            return dstream.count()
        expected = [[len(x)] for x in input]
        self._test_func(input, func, expected)

    def test_slice(self):
        """Basic operation test for DStream.slice."""
        import datetime as dt
        # Replace the context with one using 1-second batches that remembers
        # RDDs for 4 seconds, so that earlier batches can still be sliced.
        self.ssc = StreamingContext(self.sc, 1.0)
        self.ssc.remember(4.0)
        input = [[1], [2], [3], [4]]
        stream = self.ssc.queueStream([self.sc.parallelize(d, 1) for d in input])

        time_vals = []

        def get_times(t, rdd):
            # Record the batch time of each batch, up to one per input batch.
            if rdd and len(time_vals) < len(input):
                time_vals.append(t)

        stream.foreachRDD(get_times)

        self.ssc.start()
        self.wait_for(time_vals, 4)
        begin_time = time_vals[0]

        def get_sliced(begin_delta, end_delta):
            # Collect every element of the batches between the two offsets
            # (in seconds) from the first recorded batch time.
            begin = begin_time + dt.timedelta(seconds=begin_delta)
            end = begin_time + dt.timedelta(seconds=end_delta)
            rdds = stream.slice(begin, end)
            result_list = [rdd.collect() for rdd in rdds]
            return [r for result in result_list for r in result]

        self.assertEqual(set([1]), set(get_sliced(0, 0)))
        self.assertEqual(set([2, 3]), set(get_sliced(1, 2)))
        self.assertEqual(set([2, 3, 4]), set(get_sliced(1, 4)))
        self.assertEqual(set([1, 2, 3, 4]), set(get_sliced(0, 4)))

    def test_reduce(self):
        """Basic operation test for DStream.reduce."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.reduce(operator.add)
        expected = [[reduce(operator.add, x)] for x in input]
        self._test_func(input, func, expected)

    def test_reduceByKey(self):
        """Basic operation test for DStream.reduceByKey."""
        input = [[("a", 1), ("a", 1), ("b", 1), ("b", 1)],
                 [("", 1), ("", 1), ("", 1), ("", 1)],
                 [(1, 1), (1, 1), (2, 1), (2, 1), (3, 1)]]

        def func(dstream):
            return dstream.reduceByKey(operator.add)
        expected = [[("a", 2), ("b", 2)], [("", 4)], [(1, 2), (2, 2), (3, 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_mapValues(self):
        """Basic operation test for DStream.mapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 2), (3, 3)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.mapValues(lambda x: x + 10)
        expected = [[("a", 12), ("b", 12), ("c", 11), ("d", 11)],
                    [(0, 14), (1, 11), (2, 12), (3, 13)],
                    [(1, 11), (2, 11), (3, 11), (4, 11)]]
        self._test_func(input, func, expected, sort=True)

    def test_flatMapValues(self):
        """Basic operation test for DStream.flatMapValues."""
        input = [[("a", 2), ("b", 2), ("c", 1), ("d", 1)],
                 [(0, 4), (1, 1), (2, 1), (3, 1)],
                 [(1, 1), (2, 1), (3, 1), (4, 1)]]

        def func(dstream):
            return dstream.flatMapValues(lambda x: (x, x + 10))
        expected = [[("a", 2), ("a", 12), ("b", 2), ("b", 12),
                     ("c", 1), ("c", 11), ("d", 1), ("d", 11)],
                    [(0, 4), (0, 14), (1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11)],
                    [(1, 1), (1, 11), (2, 1), (2, 11), (3, 1), (3, 11), (4, 1), (4, 11)]]
        self._test_func(input, func, expected)

    def test_glom(self):
        """Basic operation test for DStream.glom."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        # Two partitions per batch, so glom yields two lists per batch.
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.glom()
        expected = [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]]
        self._test_func(rdds, func, expected)

    def test_mapPartitions(self):
        """Basic operation test for DStream.mapPartitions."""
        input = [range(1, 5), range(5, 9), range(9, 13)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            def f(iterator):
                yield sum(iterator)
            return dstream.mapPartitions(f)
        expected = [[3, 7], [11, 15], [19, 23]]
        self._test_func(rdds, func, expected)

    def test_countByValue(self):
        """Basic operation test for DStream.countByValue."""
        input = [list(range(1, 5)) * 2,
                 list(range(5, 7)) + list(range(5, 9)),
                 ["a", "a", "b", ""]]

        def func(dstream):
            return dstream.countByValue()
        expected = [[(1, 2), (2, 2), (3, 2), (4, 2)],
                    [(5, 2), (6, 2), (7, 1), (8, 1)],
                    [("a", 2), ("b", 1), ("", 1)]]
        self._test_func(input, func, expected, sort=True)

    def test_groupByKey(self):
        """Basic operation test for DStream.groupByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            return dstream.groupByKey().mapValues(list)

        expected = [[(1, [1]), (2, [1]), (3, [1]), (4, [1])],
                    [(1, [1, 1, 1]), (2, [1, 1]), (3, [1])],
                    [("a", [1, 1]), ("b", [1]), ("", [1, 1, 1])]]
        self._test_func(input, func, expected, sort=True)

    def test_combineByKey(self):
        """Basic operation test for DStream.combineByKey."""
        input = [[(1, 1), (2, 1), (3, 1), (4, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1), (2, 1), (3, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1), ("", 1)]]

        def func(dstream):
            def add(a, b):
                return a + str(b)
            return dstream.combineByKey(str, add, add)
        expected = [[(1, "1"), (2, "1"), (3, "1"), (4, "1")],
                    [(1, "111"), (2, "11"), (3, "1")],
                    [("a", "11"), ("b", "1"), ("", "111")]]
        self._test_func(input, func, expected, sort=True)

    def test_repartition(self):
        """Check that repartition(1) followed by glom yields one list per batch."""
        input = [range(1, 5), range(5, 9)]
        rdds = [self.sc.parallelize(r, 2) for r in input]

        def func(dstream):
            return dstream.repartition(1).glom()
        expected = [[[1, 2, 3, 4]], [[5, 6, 7, 8]]]
        self._test_func(rdds, func, expected)

    def test_union(self):
        """Check DStream.union of two streams with different numbers of batches."""
        input1 = [range(3), range(5), range(6)]
        input2 = [range(3, 6), range(5, 6)]

        def func(d1, d2):
            return d1.union(d2)

        expected = [list(range(6)), list(range(6)), list(range(6))]
        self._test_func(input1, func, expected, input2=input2)

    def test_cogroup(self):
        """Check DStream.cogroup, with the grouped values converted to lists."""
        input = [[(1, 1), (2, 1), (3, 1)],
                 [(1, 1), (1, 1), (1, 1), (2, 1)],
                 [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 1)]]
        input2 = [[(1, 2)],
                  [(4, 1)],
                  [("a", 1), ("a", 1), ("b", 1), ("", 1), ("", 2)]]

        def func(d1, d2):
            return d1.cogroup(d2).mapValues(lambda vs: tuple(map(list, vs)))

        expected = [[(1, ([1], [2])), (2, ([1], [])), (3, ([1], []))],
                    [(1, ([1, 1, 1], [])), (2, ([1], [])), (4, ([], [1]))],
                    [("a", ([1, 1], [1, 1])), ("b", ([1], [1])), ("", ([1, 1], [1, 2]))]]
        self._test_func(input, func, expected, sort=True, input2=input2)

    def test_join(self):
        """Check DStream.join: only the key present in both inputs is expected."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.join(b)

        expected = [[('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_left_outer_join(self):
        """Check DStream.leftOuterJoin: left-only keys are paired with None."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.leftOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3))]]
        self._test_func(input, func, expected, True, input2)

    def test_right_outer_join(self):
        """Check DStream.rightOuterJoin: right-only keys are paired with None."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.rightOuterJoin(b)

        expected = [[('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_full_outer_join(self):
        """Check DStream.fullOuterJoin: keys from either input are expected."""
        input = [[('a', 1), ('b', 2)]]
        input2 = [[('b', 3), ('c', 4)]]

        def func(a, b):
            return a.fullOuterJoin(b)

        expected = [[('a', (1, None)), ('b', (2, 3)), ('c', (None, 4))]]
        self._test_func(input, func, expected, True, input2)

    def test_update_state_by_key(self):
        """Check DStream.updateStateByKey with a state that accumulates all values."""

        def updater(vs, s):
            # Start from an empty list when there is no previous state.
            if not s:
                s = []
            s.extend(vs)
            return s

        input = [[('k', i)] for i in range(5)]

        def func(dstream):
            return dstream.updateStateByKey(updater)

        expected = [[0], [0, 1], [0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_update_state_by_key_initial_rdd(self):
        """Check DStream.updateStateByKey when an initial state RDD is supplied."""

        def updater(vs, s):
            # Start from an empty list when there is no previous state.
            if not s:
                s = []
            s.extend(vs)
            return s

        initial = [('k', [0, 1])]
        initial = self.sc.parallelize(initial, 1)

        input = [[('k', i)] for i in range(2, 5)]

        def func(dstream):
            return dstream.updateStateByKey(updater, initialRDD=initial)

        expected = [[0, 1, 2], [0, 1, 2, 3], [0, 1, 2, 3, 4]]
        expected = [[('k', v)] for v in expected]
        self._test_func(input, func, expected)

    def test_failed_func(self):
        """Check that an error raised inside a map function reaches the driver."""
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            raise ValueError("This is a special error")

        input_stream.map(failed_func).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            # Any failure is accepted as long as its traceback carries the
            # message raised by failed_func.
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func2(self):
        """Check that an error raised inside a transformWith function reaches the driver."""
        # Test failure in
        # TransformFunction.apply(rdd: Option[RDD[_]], rdd2: Option[RDD[_]], time: Time)
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream1 = self.ssc.queueStream(input)
        input_stream2 = self.ssc.queueStream(input)

        def failed_func(rdd1, rdd2):
            raise ValueError("This is a special error")

        input_stream1.transformWith(failed_func, input_stream2, True).pprint()
        self.ssc.start()
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            # Any failure is accepted as long as its traceback carries the
            # message raised by failed_func.
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")

    def test_failed_func_with_reseting_failure(self):
        """Check that batches after a failed one still produce results and that
        the failure is still reported by awaitTerminationOrTimeout."""
        input = [self.sc.parallelize([d], 1) for d in range(4)]
        input_stream = self.ssc.queueStream(input)

        def failed_func(i):
            if i == 1:
                # Make it fail in the second batch
                raise ValueError("This is a special error")
            else:
                return i

        # We should be able to see the results of the 3rd and 4th batches even if the second batch
        # fails
        expected = [[0], [2], [3]]
        self.assertEqual(expected, self._collect(input_stream.map(failed_func), 3))
        try:
            self.ssc.awaitTerminationOrTimeout(10)
        except:
            # Any failure is accepted as long as its traceback carries the
            # message raised by failed_func.
            import traceback
            failure = traceback.format_exc()
            self.assertTrue("This is a special error" in failure)
            return

        self.fail("a failed func should throw an error")
# Checkpoint every result stream at a 60-second interval and attach its
# output function, in the same order as before.
for result_stream, output_func in ((airportAirports, outputQ2N2),
                                   (carriersA2A, outputQ2N3),
                                   (topHopFlights, outputQ3N2)):
    result_stream.checkpoint(60)
    result_stream.foreachRDD(output_func)

print("STARTED!")
ssc.start()

# runStatus drops to 0 (and stays there) once all six save flags are set.
runStatus = 1
while True:
    res = ssc.awaitTerminationOrTimeout(10)  # 10 seconds timeout
    if all((dataSaved1, dataSaved2, dataSaved3,
            dataSaved4, dataSaved5, dataSaved6)):
        runStatus = 0
    if res:
        # stopped elsewhere
        break
    # still running
    timerCount += 1
    print("still running...%d" % timerCount)
    if runStatus != 0:
        continue
    print("Finish saving data. Stopping streaming...")
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
    break
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
import functions as app

# Spark context: local master with two threads, Spark logging switched off.
conf = SparkConf()
conf.setMaster('local[2]')
conf.setAppName('ApacheWebLogsStream')
sc = SparkContext(conf=conf)
sc.setLogLevel('OFF')

# Streaming context with 10-second batches reading text lines from a socket.
ssc = StreamingContext(sc, 10)
input_stream = ssc.socketTextStream('localhost', 9999)

# Hand every batch RDD to the processing function of the `functions` module.
input_stream.foreachRDD(app.process)

# Run for at most 900 seconds, then shut the contexts down.
ssc.start()
ssc.awaitTerminationOrTimeout(900)
ssc.stop()
#datamap = tx_fee_rdd.map(lambda x: ("tx_fee",x) ) #( rowkey , [ row key , column family , column name , value ] ) datamap = tx_fee_rdd.map(lambda x: (str(x[0]), [str(x[0]),"tx_fee_col","tx_fee",str(x[1])]) ) datamap.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv) lines = ssc.socketTextStream("localhost", 8888) dump_rdd = lines.map(lambda x: json.dumps(x)) load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore')) #load_rdd.pprint(2) split_blk_rdd = load_rdd.map(lambda x: x.split(":")) #split_blk_rdd.pprint() tx_fee_rdd = split_blk_rdd.map(lambda x : (x[14][1:7],x[15][1:-15])) #this gets transaction fee #tx_fee_rdd.pprint(200) #works tx_fee_rdd.foreachRDD(SaveRecord) #function call ssc.start() # Start the computation #ssc.awaitTermination() # Wait for the computation to terminate ssc.awaitTerminationOrTimeout(15000) #13000#time out in 3 hours #ssc.stop() # Wait for the computation to terminate
dstream = dstream.flatMap( extract_origin_destination_dep_delay).reduceByKey(lambda a, b: (a[ 0] + b[0], a[1] + b[1])).foreachRDD(top_complex_average) elif args.task == 'q23': dstream = dstream.flatMap(extract_route_carrier_arr_delay).reduceByKey( lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD( top_complex_average) elif args.task == 'q24': dstream = dstream.flatMap(extract_route_arr_delay).reduceByKey( lambda a, b: (a[0] + b[0], a[1] + b[1])).foreachRDD(top_average) elif args.task == 'q32': get_cass().execute('truncate %s' % schema['table']) dstream = dstream.flatMap(extract_trip_info).foreachRDD(save_trip) else: print("Unknown task") # runner ts_last_data = time.time() ssc.start() while True: res = ssc.awaitTerminationOrTimeout(args.run_interval) if res: # stopped elsewhere break else: # still running if time.time() - ts_last_data > args.idle_time: dump("No data received for %d seconds, stopping..." % args.idle_time) ssc.stop(stopSparkContext=True, stopGraceFully=False)
#print gen_tx_json return gen_tx_json #get lines RDD lines = ssc.socketTextStream("localhost", 9999) dump_rdd = lines.map(lambda x: json.dumps(x)) #print dump_rdd.take(2) load_rdd = dump_rdd.map(lambda x: json.loads(x)).map(lambda x : x.decode('unicode_escape').encode('ascii','ignore')) #print load_rdd.take(2) #load_rdd.pprint(100) #tx = load_rdd.flatMap(lambda x: x.split(":")) #this works split_blk_rdd = load_rdd.map(lambda x: x.split(":")) #split_blk_rdd.pprint() gen_tx_rdd = split_blk_rdd.map(lambda x : (x[8][1:7],x[6][4:68]) ) #this gets generation transactions #gen_tx_rdd.pprint() #works tx_json_rdd = gen_tx_rdd.map(lambda x: (x[0],get_tx_fee(x[1])) ) #function call tx_fee_rdd = tx_json_rdd.map(lambda x : (x[0],x[1].items() [3][1][0]["value"]-25) )#.filter(lambda x : "value" in x) tx_fee_rdd.foreachRDD(SaveRecord) #function call ssc.start() # Start the computation #ssc.awaitTermination() # Wait for the computation to terminate ssc.awaitTerminationOrTimeout(12000) #time out 3.33 hours #ssc.stop() # Wait for the computation to terminate