spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \
      localhost:2181 test`
"""
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)
    key_words = ['python', 'bad']
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                                  {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")).filter(
        lambda word: word in key_words)
    # map each matched keyword to an alert message; pprint() is the output
    # action that actually triggers the computation
    counts.map(lambda word: "keyword detected: " + word).pprint()
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()

###################
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext


def tmp(x):
    # extract the 8th ';'-separated field (the comma-separated hashtag list)
    return x.split(';')[7]


def split(x):
    # pair each token with an initial count of 1
    return (x, 1)


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, int(sys.argv[2]))  # batch interval taken from the command line
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
#tweet=dataStream.filter(lambda w:"Android" in w.split(';')[3])
tweet = dataStream.map(tmp)
tweet = tweet.flatMap(lambda line: line.split(","))
tweet = tweet.map(split)
#tweet = tweet.map(lambda w: [w.split(',')[i], 1]
#tweet.pprint()

# no inverse-reduce function is supplied, so pass None for invFunc; the window
# length comes from the command line and slides every batch
window = tweet.reduceByKeyAndWindow(lambda x, y: x + y, None, int(sys.argv[1]), 1)
#totalcount=window.updateStateByKey(aggregate_tweets_count)
#totalcount.pprint()
# def parse(line):
#     try:
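
# the listing cuts off before any output or before the context is started; a
# minimal completion, assuming the windowed counts should simply be printed:
window.pprint()
ssc.start()
ssc.awaitTermination()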
Example #3
###################
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="PythonStreamingKafkaWordCount")
ssc = StreamingContext(sc, 1)
print(11111)
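# note: the receiver-based KafkaUtils.createStream expects the ZooKeeper
# quorum (usually host:2181); 9092 is normally the Kafka broker port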
zkQuorum, topic = "192.168.52.79:9092", "test"
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer",
                              {topic: 1})
print(22222)
print('kvs', kvs)
lines = kvs.map(lambda x: x[1])
print('lines', type(lines), lines)
counts = lines.flatMap(lambda line: line.split(" ")).map(
    lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
#counts = lines.flatMap(lambda line: line.split(" "))
print('counts', counts)
#print('cv',counts.value)
counts.pprint()
print(33333)
ssc.start()
ssc.awaitTermination()

###################
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
Example #4
    sorted_rdd = sorted_rdd1.filter(lambda y: y[0] != '')
    s_list = sorted_rdd.collect()
    if s_list:
        # print up to the top five keys, comma-separated (fewer if the RDD has less than five)
        print(",".join(str(rec[0]) for rec in s_list[:5]))


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, int(sys.argv[2]))
ssc.checkpoint("/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)
hashtag1 = dataStream.window(int(sys.argv[1]), 1)

# extract the 8th ';'-separated field and split it on commas to get individual
# hashtags; a value without commas simply yields itself, so no branching is needed
hashtag2 = hashtag1.map(lambda w: w.split(";")[7])
hashtag3 = hashtag2.flatMap(lambda p: p.split(","))
hashtag4 = hashtag3.map(lambda x: (x, 1))
#hashtags=hashtag4.reduceByKey(add)
# updateStateByKey passes (new_values, previous_state) to the update function
hashtags = hashtag4.updateStateByKey(lambda new, total: sum(new) + (total or 0))
hashtags.foreachRDD(func)
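
# the listing cuts off here; as in the other examples, the streaming context
# would presumably still need to be started:
ssc.start()
ssc.awaitTermination()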
Example #5
import fixpath

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc

from src.models.Tweet import Tweet

conf = SparkConf().setAppName("Twitter Application").setMaster("local")
spark_context = SparkContext(conf=conf)
spark_streaming_context = StreamingContext(spark_context, 20)
spark_streaming_context.checkpoint("checkpoint_TwitterApp")

spark_sql_context = SQLContext(spark_context)

socket_stream = spark_streaming_context.socketTextStream("127.0.0.1", 5555)

lines = socket_stream.window(20)

(lines.flatMap(lambda text: text.split(" "))
      .filter(lambda word: word.lower().startswith("#"))
      .map(lambda word: (word.lower(), 1))
      .reduceByKey(lambda prev, curr: prev + curr)
      .map(lambda rec: Tweet(rec[0], rec[1]))
      .foreachRDD(lambda rdd: rdd.toDF()
                                 .sort(desc("count"))
                                 .limit(10)
                                 .registerTempTable("tweets")))

spark_streaming_context.start()
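
# the listing ends here; a query loop over the registered "tweets" temp table
# (via spark_sql_context) or spark_streaming_context.awaitTermination() would
# typically follow

###################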
    fpr = 0

    try:
        fpr = fp / (fp + tn)
    except ZeroDivisionError:
        pass

    f.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "," +
            str(fpr) + "\n")

    f.close()


if __name__ == "__main__":

    sc = SparkContext()
    sc.setLogLevel(logLevel="ERROR")

    scc = StreamingContext(sc, 10)
    streaming_c = scc.socketTextStream("localhost", port)

    f = open(filename, "w+")
    f.write("Time,FPR\n")
    f.close()

    streaming_c.foreachRDD(test)

    scc.start()
    scc.awaitTermination()

###################
    def init(self):
        sc = SparkContext()
        ssc = StreamingContext(sc, 1)
        sc.setLogLevel("ERROR")
        return sc, ssc
Example #8
        if i == (len(maximum) - 1):
            hashh = hashh + str(maximum[i][0])
        else:
            hashh = hashh + str(maximum[i][0]) + ","
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))


wind_size = int(sys.argv[1])
batch_duration = int(sys.argv[2])
conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)

ssc = StreamingContext(sc, batch_duration)
ssc.checkpoint("home/hduser/checkpoint_BIGDATA")

dataStream = ssc.socketTextStream("localhost", 9009)

tweet = dataStream.map(lambda w: (w.split(';')[7]))

hashtag = tweet.flatMap(lambda w: (w.split(',')))
hasht = hashtag.map(lambda w: (w, 1))
counts = hasht.filter(lambda x: x[0] != '')

# pass None for invFunc since no inverse-reduce function is used
totalcount = counts.reduceByKeyAndWindow(
    lambda a, b: a + b, None, wind_size,
    batch_duration).transform(lambda rdd: rdd.sortBy(lambda y: (-y[1], y[0])))
#print(totalcount)
totalcount.foreachRDD(process_rdd)
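
# the source cuts off at this point; presumably the job is then started the
# same way as in the earlier examples:
ssc.start()
ssc.awaitTermination()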
Example #9
        return globals()['sqlContextSingletonInstance']


def splitJson(time, rdd):
    # write each micro-batch out to the Kudu table as a DataFrame
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = sqc.createDataFrame(rdd, schema)

    kudu_df.write.format('org.apache.kudu.spark.kudu') \
                 .option('kudu.master', kudu_master) \
                 .option('kudu.table', kudu_table) \
                 .mode("append") \
                 .save()
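
# the top of this listing (imports and configuration) is cut off in the source;
# purely hypothetical placeholders for the names it references could look like:
#
#   import json
#   from pyspark import SparkContext
#   from pyspark.streaming import StreamingContext
#   from pyspark.streaming.kafka import KafkaUtils
#   from pyspark.sql.types import StructType, StructField, StringType
#
#   zk_broker = "localhost:2181"    # ZooKeeper quorum used by createStream
#   kafka_topic = "iot_events"      # hypothetical topic name
#   kudu_master = "localhost:7051"  # Kudu master address
#   kudu_table = "iot_table"        # hypothetical Kudu table name
#   schema = StructType([StructField(f, StringType()) for f in
#                        ("dateandtime", "country", "event",
#                         "member", "sentiment", "comment")])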

if __name__ == '__main__':
    sc = SparkContext(appName="SparkStreaming_IoT")
    ssc = StreamingContext(sc, 5)  # 5 second batch interval
    kvs = KafkaUtils.createStream(ssc, zk_broker, "meetup_comment_ws", {kafka_topic: 1})

    kafka_stream = kvs.map(lambda x: x[1]) \
                           .map(lambda l: json.loads(l)) \
                           .map(lambda p: (p['dateandtime'],
                                           p['country'],
                                           p['event'],
                                           p['member'],
                                           p['sentiment'],
                                           p['comment']))


    kafka_stream.foreachRDD(splitJson)
    ssc.start()
    ssc.awaitTermination()