from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener


class MyStreamingListener(StreamingListener):
    """
    PySpark uses the py4j framework to deliver Java objects to the Python
    process. The parameters passed to the callbacks are py4j proxies for
    Java objects whose member variables are themselves objects; they are
    not sent as primitive data types, so values must be extracted with
    method calls.
    """

    def onBatchStarted(self, batchStarted):
        # 'batchStarted' is an instance of
        # org.apache.spark.streaming.api.java.JavaStreamingListenerBatchStarted
        print('>>> Batch started...number of records: ',
              batchStarted.batchInfo().numRecords())

    def onBatchCompleted(self, batchCompleted):
        # 'batchCompleted' is an instance of
        # org.apache.spark.streaming.api.java.JavaStreamingListenerBatchCompleted
        print('>>> Batch completed...time taken (ms) = ',
              batchCompleted.batchInfo().totalDelay())


if __name__ == '__main__':
    ssc = StreamingContext(
        SparkContext(conf=SparkConf().setAppName('TestStreamingListenerJob')),
        5)
    ssc.addStreamingListener(MyStreamingListener())

    ssc.socketTextStream('localhost', 9999)\
       .flatMap(lambda line: line.split(' '))\
       .count()\
       .pprint()

    ssc.start()
    ssc.awaitTermination()
def start(self):
    # tune spark-streaming performance
    conf = SparkConf()
    conf.set("spark.locality.wait", 10)
    conf.set("spark.streaming.backpressure.enabled", True)
    conf.set("spark.streaming.kafka.consumer.poll.ms", 512)
    conf.set("spark.streaming.receiver.maxRate", 1000)
    sc = SparkContext(conf=conf, appName="spark_streaming_kafka")
    sc.setLogLevel("WARN")

    ssc = StreamingContext(
        sc, self.config["TWITTER_STREAMING"]["MINI_BATCH_TIME_INTERVAL_SEC"])
    listener = Listener(sc)
    ssc.addStreamingListener(listener)

    self.num_subreddit = len(get_tweet_count_dict(sc))
    print("classify to %d" % self.num_subreddit)

    # union of receiver-based streams
    numStreams = 8
    kafkaStreams = [KafkaUtils.createStream(
                        ssc,
                        self.config["DEFAULT"]["KAFKA_PUBLIC_IP"] + ':2181',
                        'spark-streaming', {'twitter': 1})
                    for _ in range(numStreams)]
    unifiedStream = ssc.union(*kafkaStreams)

    # alternative: a direct stream
    # kafkaStream = KafkaUtils.createDirectStream(
    #     ssc, ['twitter'],
    #     {"metadata.broker.list": self.config["DEFAULT"]["KAFKA_BROKER_LIST"]})

    # load streaming messages from kafka
    parsed = unifiedStream.map(lambda v: json.loads(v[1]))

    # debug usage
    # parsed.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

    # process and classify tweets
    # alternative mapping kept from the original:
    # subreddit_topic = parsed.map(
    #     lambda tweet: self.get_word_set(tweet['text'], tweet['user']['name']))
    subreddit_topic = parsed.map(
        lambda tweet: (tweet['user']['name'], self.get_word_set(tweet['text'])))
    subreddit_topic = subreddit_topic.map(self.get_top_topic)
    subreddit_topic.pprint()

    ssc.start()
    ssc.awaitTermination()
    return


def main():
    process = twitterStreamingProcess()
    process.start()


if __name__ == '__main__':
    main()
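The Listener class attached above is not defined in this snippet. A minimal sketch of what it might look like, assuming it only reports per-batch metrics so the effect of the backpressure settings can be observed (the class body and constructor signature are assumptions):

# Hypothetical sketch: the real Listener class is not shown in the snippet above.
from pyspark.streaming.listener import StreamingListener

class Listener(StreamingListener):
    def __init__(self, sc):
        super(Listener, self).__init__()
        self.sc = sc  # kept only for parity with Listener(sc) in the snippet

    def onBatchCompleted(self, batchCompleted):
        # a scheduling delay that rises batch over batch suggests the job
        # cannot keep up with the configured ingestion rate
        info = batchCompleted.batchInfo()
        print("records: %d, scheduling delay (ms): %s"
              % (info.numRecords(), info.schedulingDelay()))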
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener

sc = SparkContext(appName="PythonTwitterStreaming")
ssc = StreamingContext(sc, 1)


def batchStarted(input):
    print('batch started')
    print(input)


# attach the callback to a bare listener instance instead of subclassing
listener = StreamingListener()
listener.onBatchStarted = batchStarted
ssc.addStreamingListener(listener)

stream = ssc.socketTextStream("54.213.33.240", 9002)
stream.pprint()

ssc.start()
ssc.awaitTermination()
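A note on the pattern above: assigning batchStarted onto a bare StreamingListener instance appears to work because the Java side calls back into Python through py4j, which looks the handler up by name on the listener object, so the instance attribute shadows the base class's no-op onBatchStarted. Subclassing StreamingListener, as in the first snippet, is the more conventional approach.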
        pprint(outputOperationCompleted)


# conf = (SparkConf()
#         .setMaster("spark://192.168.1.33:7077")
#         .setAppName("PythonStreamingExample"))
# sc = SparkContext(conf=conf)
sc = SparkContext(appName="PythonStreamingNetworkWordCount2")
ssc = StreamingContext(sc, 5)  # second argument is the batch interval in seconds

# port 9999: 1MB text file
# port 9998: tiny text file
# port 9997: nc with stdin
# IP address that the worker node will connect to (don't use localhost or 127.0.0.1)
lines = ssc.socketTextStream('192.168.1.33', 9999)
# lines = ssc.socketTextStream('localhost', 9999)
lines.flatMap(lambda line: line.split(" "))\
     .map(lambda word: (word, 1))\
     .reduceByKey(lambda a, b: a + b)\
     .pprint()

streamingListener = DebugStreamingListener()
ssc.addStreamingListener(streamingListener=streamingListener)
ssc.start()
ssc.awaitTermination()
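DebugStreamingListener is not defined in this snippet. Given the dangling pprint(outputOperationCompleted) fragment at the top, a plausible sketch is a listener that pretty-prints lifecycle events (the class body is an assumption):

# Hypothetical sketch of DebugStreamingListener; the original class is not shown.
from pprint import pprint
from pyspark.streaming.listener import StreamingListener

class DebugStreamingListener(StreamingListener):
    def onBatchStarted(self, batchStarted):
        pprint(batchStarted.batchInfo().numRecords())

    def onBatchCompleted(self, batchCompleted):
        pprint(batchCompleted.batchInfo().totalDelay())

    def onOutputOperationCompleted(self, outputOperationCompleted):
        # matches the dangling pprint(...) at the top of the snippet
        pprint(outputOperationCompleted)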
            'place': tweet['place'],
            'hash_tags': hashtag['text']
        }
        res_list.append(temp)
    print(res_list)
    if res_list:
        insert_into_hashtags(res_list)


if __name__ == "__main__":
    # create the spark configuration
    conf = SparkConf()
    conf.setAppName(SparkStream.APP_NAME.value)
    sc = SparkContext(conf=conf)
    sc.setLogLevel("ERROR")

    ssc = StreamingContext(sc, int(SparkStream.STREAM_INTERVAL.value))
    ssc.checkpoint(SparkStream.CHECKPOINT.value)

    dataStream = ssc.socketTextStream(SparkStream.TCP_IP.value,
                                      SparkStream.TCP_PORT.value)
    cv = dataStream.map(lambda x: json.loads(x))
    cv.foreachRDD(process_rdd)

    # add a listener to check whether the stream has closed
    ssc.addStreamingListener(CustomListener())

    ssc.start()
    ssc.awaitTermination()
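CustomListener is not shown either; since the comment says it checks whether the stream has closed, a plausible sketch overrides the receiver-lifecycle callbacks (names and behavior are assumptions):

# Hypothetical sketch of CustomListener; the original class is not shown.
from pyspark.streaming.listener import StreamingListener

class CustomListener(StreamingListener):
    def onReceiverStopped(self, receiverStopped):
        # fires when the socket receiver shuts down, e.g. because the
        # TCP source closed the connection
        print("receiver stopped:",
              receiverStopped.receiverInfo().lastErrorMessage())

    def onReceiverError(self, receiverError):
        print("receiver error:",
              receiverError.receiverInfo().lastErrorMessage())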
#output_file.write("KMeans Model Update, %d, %s, %.5f\n"%(count,NUMBER_PARTITIONS, end_train-start)) output_file.flush() appName = "PythonSparkStreamingKafkaKMeans" conf = SparkConf().setAppName(appName).set( 'spark.metrics.conf.*.sink.csv.class', 'org.apache.spark.metrics.sink.CsvSink').set( 'spark.metrics.conf.*.sink.csv.directory', './') sc = SparkContext(conf=conf) ssc_start = time.time() ssc = StreamingContext(sc, STREAMING_WINDOW) batch_collector = BatchInfoCollector() ssc.addStreamingListener(batch_collector) kafka_dstream = KafkaUtils.createDirectStream( ssc, [TOPIC], {"metadata.broker.list": METABROKER_LIST}) ssc_end = time.time() #output_file.write("Spark SSC Startup, %d, %s\n"%( NUMBER_PARTITIONS, str(ssc_end-ssc_start))) kafka_dstream.count().pprint() points = kafka_dstream.transform(pre_process) points.pprint() points.foreachRDD(model_update) try: ssc.start() ssc.awaitTermination()
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.listener import StreamingListener

sc = SparkContext(appName="PythonTwitterStreaming")
ssc = StreamingContext(sc, 1)

sl = StreamingListener()
ssc.addStreamingListener(sl)

stream = ssc.socketTextStream("54.213.33.240", 9002)
stream.pprint()

ssc.start()
ssc.awaitTermination()

# Scala equivalent of the setup above:
# val ssc = new StreamingContext("local[2]", "NodejsTcpClient", Seconds(1))
# val lines = ssc.socketTextStream("127.0.0.1", 1337, StorageLevel.MEMORY_AND_DISK_SER)
#output_file.write("KMeans Prediction, %.3f\n"%(end_pred-end_train)) #return predictions def model_prediction(rdd): pass ########################################################################################################################## # Start Streaming App ssc_start = time.time() ssc = StreamingContext(sc, STREAMING_WINDOW) batch_collector = BatchInfoCollector() ssc.addStreamingListener(batch_collector) #kafka_dstream = KafkaUtils.createStream(ssc, KAFKA_ZK, "spark-streaming-consumer", {TOPIC: 1}) #kafka_param: "metadata.broker.list": brokers # "auto.offset.reset" : "smallest" # start from beginning kafka_dstream = KafkaUtils.createDirectStream(ssc, [TOPIC], {"metadata.broker.list": METABROKER_LIST, "auto.offset.reset" : "smallest"}) #, fromOffsets=fromOffset) ssc_end = time.time() output_file.write("Spark SSC Startup, %d, %d, %s, %.5f\n"%(spark_cores, -1, NUMBER_PARTITIONS, ssc_end-ssc_start)) ##################################################################### # Scenario Count #global counts