      spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \
      localhost:2181 test`
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    # Words to watch for in the incoming stream.
    key_words = ['python', 'bad']

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])

    # Keep only the words that appear in key_words.
    counts = lines.flatMap(lambda line: line.split(" ")).filter(
        lambda word: word in key_words)

    # The original `counts.map(lambda word: println("key words is detected"))`
    # never runs: println is not a Python builtin and a bare map() is lazy, so
    # the detection message is printed through an output operation instead.
    def report(rdd):
        for word in rdd.collect():
            print("key word detected:", word)

    counts.foreachRDD(report)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext


def tmp(x):
    # Keep the eighth ';'-separated field of the record.
    y = x.split(';')[7]
    #z = y.split(',')
    #for el in z:
    return y


def split(x):
    return (x, 1)


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, int(sys.argv[2]))  # batch interval from the command line
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
#tweet = dataStream.filter(lambda w: "Android" in w.split(';')[3])
tweet = dataStream.map(tmp)
tweet = tweet.flatMap(lambda line: line.split(","))
tweet = tweet.map(split)
#tweet = tweet.map(lambda w: [w.split(',')[i], 1])
#tweet.pprint()
window = tweet.reduceByKeyAndWindow(lambda x, y: x + y, int(sys.argv[1]), 1)
#totalcount = window.updateStateByKey(aggregate_tweets_count)
#totalcount.pprint()
'''def parse(line):
    try:
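# The commented-out updateStateByKey call above refers to an
# aggregate_tweets_count helper that is not part of this fragment. A minimal
# sketch of what such a state-update function typically looks like (the name
# and behaviour here are assumptions, not the original author's code):
def aggregate_tweets_count(new_values, total_sum):
    # Add this batch's counts to the running total kept by updateStateByKey;
    # total_sum is None the first time a key is seen.
    return sum(new_values) + (total_sum or 0)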
###################
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="PythonStreamingKafkaWordCount")
ssc = StreamingContext(sc, 1)
print(11111)

# Note: createStream expects a ZooKeeper quorum as its second argument, but the
# address below uses 9092, which is the default Kafka broker port.
zkQuorum, topic = "192.168.52.79:9092", "test"
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
print(22222)
print('kvs', kvs)

lines = kvs.map(lambda x: x[1])
print('lines', type(lines), lines)
counts = lines.flatMap(lambda line: line.split(" ")).map(
    lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
#counts = lines.flatMap(lambda line: line.split(" "))
print('counts', counts)
#print('cv', counts.value)
counts.pprint()
print(33333)

ssc.start()
ssc.awaitTermination()

###################
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
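# The second fragment breaks off right after creating the
# "PythonStreamingDirectKafkaWordCount" context. A minimal sketch of how the
# direct (receiver-less) Kafka word count usually continues; the broker address
# and topic below are assumptions, not part of the original source:
ssc = StreamingContext(sc, 1)

# The direct stream reads from the Kafka brokers themselves, no ZooKeeper receiver.
brokers = "192.168.52.79:9092"  # assumed broker list
directKvs = KafkaUtils.createDirectStream(
    ssc, ["test"], {"metadata.broker.list": brokers})

directCounts = directKvs.map(lambda x: x[1]) \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)
directCounts.pprint()

ssc.start()
ssc.awaitTermination()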
    # Tail of the foreachRDD callback (its definition begins above this fragment):
    # drop empty keys and print the first five entries, comma-separated.
    sorted_rdd = sorted_rdd1.filter(lambda y: y[0] != '')
    s_list = sorted_rdd.collect()
    if s_list != []:
        print(s_list[0][0], s_list[1][0], s_list[2][0], s_list[3][0],
              s_list[4][0], sep=",")


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, int(sys.argv[2]))
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
hashtag1 = dataStream.window(int(sys.argv[1]), 1)

# The original branched on `',' in hashtag1.select(...)`, but DStream has no
# select() and cannot be tested with `in`; both branches reduce to extracting
# the eighth ';'-separated field and splitting it on commas (flatMap, not flatmap).
hashtag2 = hashtag1.map(lambda w: w.split(";")[7])
hashtag3 = hashtag2.flatMap(lambda p: p.split(","))

hashtag4 = hashtag3.map(lambda x: (x, 1))
#hashtags = hashtag4.reduceByKey(add)
# updateStateByKey passes (new_values, previous_total) to the update function.
hashtags = hashtag4.updateStateByKey(lambda new, total: sum(new) + (total or 0))
hashtags.foreachRDD(func)
import fixpath

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

from src.models.Tweet import Tweet

conf = SparkConf().setAppName("Twitter Application").setMaster("local")
spark_context = SparkContext(conf=conf)
spark_streaming_context = StreamingContext(spark_context, 20)
spark_streaming_context.checkpoint("checkpoint_TwitterApp")
spark_sql_context = SQLContext(spark_context)

socket_stream = spark_streaming_context.socketTextStream("127.0.0.1", 5555)
lines = socket_stream.window(20)

# Count hashtags in each 20-second window and keep the ten most frequent in a
# temp table named "tweets".
(lines.flatMap(lambda text: text.split(" "))
      .filter(lambda word: word.lower().startswith("#"))
      .map(lambda word: (word.lower(), 1))
      .reduceByKey(lambda prev, curr: prev + curr)
      .map(lambda rec: Tweet(rec[0], rec[1]))
      .foreachRDD(lambda rdd: rdd.toDF()
                  .sort(desc("count"))
                  .limit(10)
                  .registerTempTable("tweets")))

spark_streaming_context.start()
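# The foreachRDD above only refreshes the "tweets" temp table. A minimal sketch
# of how the driver could poll that table after start(); the loop and the
# 20-second sleep are assumptions, not part of the original source:
import time

while True:
    time.sleep(20)
    try:
        spark_sql_context.sql("SELECT * FROM tweets").show()
    except Exception:
        # The table does not exist until the first window has been processed.
        pass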
    # Tail of the foreachRDD callback: compute the false-positive rate for this
    # batch and append it to the CSV log (fp, tn and the open file f are set
    # earlier in the function, above this fragment).
    fpr = 0
    try:
        fpr = fp / (fp + tn)
    except ZeroDivisionError:
        pass
    f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "," +
            str(fpr) + "\n")
    f.close()


if __name__ == "__main__":
    sc = SparkContext()
    sc.setLogLevel(logLevel="ERROR")
    scc = StreamingContext(sc, 10)
    streaming_c = scc.socketTextStream("localhost", port)

    # Write the CSV header once; the callback appends one "Time,FPR" row per batch.
    f = open(filename, "w+")
    f.write("Time,FPR\n")
    f.close()

    streaming_c.foreachRDD(test)
    scc.start()
    scc.awaitTermination()
def init(self):
    sc = SparkContext()
    ssc = StreamingContext(sc, 1)
    sc.setLogLevel("ERROR")
    return sc, ssc
        # Tail of the process_rdd callback: comma-join the hashtags collected in
        # `maximum` (set earlier in the function, above this fragment) and print them.
        if i == (len(maximum) - 1):
            hashh = hashh + str(maximum[i][0])
        else:
            hashh = hashh + str(maximum[i][0]) + ","
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))


wind_size = int(sys.argv[1])
batch_duration = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, batch_duration)
ssc.checkpoint("home/hduser/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
tweet = dataStream.map(lambda w: w.split(';')[7])
hashtag = tweet.flatMap(lambda w: w.split(','))
hasht = hashtag.map(lambda w: (w, 1))
counts = hasht.filter(lambda x: x[0] != '')

# Count hashtags over a sliding window, then sort each windowed RDD by
# descending count, breaking ties alphabetically.
totalcount = counts.reduceByKeyAndWindow(
    lambda a, b: a + b, wind_size, batch_duration).transform(
        lambda rdd: rdd.sortBy(lambda y: (-y[1], y[0])))
#print(totalcount)
totalcount.foreachRDD(process_rdd)
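# process_rdd is registered with foreachRDD above, but only its tail appears at
# the top of this fragment. A minimal sketch of what the full callback might
# look like, assuming it prints the five most frequent hashtags; the structure
# is inferred, not the original author's code:
def process_rdd(rdd):
    # Take the five highest-ranked (hashtag, count) pairs and print the hashtags
    # as one comma-separated line.
    maximum = rdd.take(5)
    hashh = ""
    i = 0
    while i < len(maximum):
        if i == (len(maximum) - 1):
            hashh = hashh + str(maximum[i][0])
        else:
            hashh = hashh + str(maximum[i][0]) + ","
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))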
    # Tail of the getSqlContextInstance singleton helper (its definition starts
    # above this fragment).
    return globals()['sqlContextSingletonInstance']


def splitJson(time, rdd):
    # Convert each micro-batch into a DataFrame and append it to the Kudu table.
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = sqc.createDataFrame(rdd, schema)
    kudu_df.write.format('org.apache.kudu.spark.kudu') \
        .option('kudu.master', kudu_master) \
        .option('kudu.table', kudu_table) \
        .mode("append") \
        .save()


if __name__ == '__main__':
    sc = SparkContext(appName="SparkStreaming_IoT")
    ssc = StreamingContext(sc, 5)  # 5-second batch interval

    kvs = KafkaUtils.createStream(ssc, zk_broker, "meetup_comment_ws", {kafka_topic: 1})
    kafka_stream = kvs.map(lambda x: x[1]) \
        .map(lambda l: json.loads(l)) \
        .map(lambda p: (p['dateandtime'], p['country'], p['event'],
                        p['member'], p['sentiment'], p['comment']))

    kafka_stream.foreachRDD(splitJson)

    ssc.start()
    ssc.awaitTermination()
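# The fragment uses schema, zk_broker, kafka_topic, kudu_master and kudu_table
# without defining them. A minimal sketch of what those definitions might look
# like; the addresses and names are placeholders, and the schema is inferred
# from the tuple built in the map() above:
from pyspark.sql.types import StructType, StructField, StringType

zk_broker = "localhost:2181"                    # assumed ZooKeeper quorum
kafka_topic = "meetup_comments"                 # assumed Kafka topic
kudu_master = "kudu-master:7051"                # assumed Kudu master address
kudu_table = "impala::default.meetup_comments"  # assumed Kudu table name

# Column order matches the tuple produced in the main block.
schema = StructType([
    StructField("dateandtime", StringType(), True),
    StructField("country", StringType(), True),
    StructField("event", StringType(), True),
    StructField("member", StringType(), True),
    StructField("sentiment", StringType(), True),
    StructField("comment", StringType(), True),
])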