def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)  # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    geolocator = Nominatim()
    stream(ssc, geolocator, 100)
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc, [config.get('kafka', 'topic')], kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)

    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
def createStreamingContext():
    # Create a local StreamingContext with two working threads and batch interval of 1 second
    sc = SparkContext("spark://%s:7077" % MASTER_NAME, appName="GlutenTweet", pyFiles=PYFILES)
    ssc = StreamingContext(sc, 2)

    # Create a DStream of raw data
    raw = ssc.socketTextStream(MASTER_IP, 9999)

    # Convert into models
    tweets = raw.map(lambda r: Tweet(raw_json=r))

    # Store models
    tweets.foreachRDD(storeTweetsRDD)

    # Sliding window analysis
    window = tweets.window(20 * 60, 30)
    hashtagCounts = analysisHahtagCount(window)
    streamTop(hashtagCounts).pprint()

    # Keyword extraction - note tweets is immutable
    tweetsKeyword = tweets.map(lambda t: keywordExtraction(t))

    # Update models
    tweetsKeyword.foreachRDD(updateTweetsRDD)

    # Sliding window analysis
    window2 = tweetsKeyword.window(20 * 60, 30)
    keywordCounts = analysisKeywordCount(window2)
    streamTop(keywordCounts).pprint()

    ssc.checkpoint(CHECKPOINT_DIR)
    return ssc
def main():
    conf = SparkConf().setMaster("local[2]").setAppName("Streamer")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 10)  # Create a streaming context with batch interval of 10 sec
    ssc.checkpoint("checkpoint")
    pwords = load_wordlist("positive.txt")
    nwords = load_wordlist("negative.txt")
    counts = stream(ssc, pwords, nwords, 100)
    make_plot(counts)
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
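# A minimal driver sketch, not part of the snippet above: a factory like
# functionToCreateContext is normally handed to StreamingContext.getOrCreate so that a
# restarted driver recovers from the checkpoint instead of building a fresh context.
# The "checkpoint" path simply mirrors the directory used above; getOrCreate, start
# and awaitTermination are standard PySpark Streaming API calls.
ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)
ssc.start()
ssc.awaitTermination()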
def main(): conf = SparkConf() conf.setAppName("TopAirports") conf.set("spark.streaming.kafka.maxRatePerPartition", "0") conf.set("spark.dynamicAllocation.enabled", "true") sc = SparkContext(conf = conf) ssc = StreamingContext(sc, 1) # Stream every 1 second ssc.checkpoint("checkpoint") # Clear the cassandra table init_cassandra().execute('TRUNCATE {}'.format(top_airports_table)) stream_kafka(ssc)
def createContext():
    uBATCH_INTERVAL = 10
    sc = SparkContext(SPARK_MASTER, appName="StreamingKafka")
    sc.broadcast(batchUserPostDict)
    sc.broadcast(batchPostUserDict)
    #sc = SparkContext("local[*]", appName="StreamingKafka")
    # streaming batch interval of 5 sec first, and reduce later to 1 sec or lower
    ssc = StreamingContext(sc, uBATCH_INTERVAL)
    ssc.checkpoint(CHECKPOINT_DIR)  # set checkpoint directory in HDFS
    #ssc.checkpoint(10 * uBATCH_INTERVAL)
    return ssc

ssc = StreamingContext.getOrCreate(CHECKPOINT_DIR, createContext)
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)

    # main split-combine-apply logic put here
    pairs = lines.map(lambda x: x.split(",")).map(lambda x: (x[8], 1))
    runningCounts = pairs.updateStateByKey(updateFunction)
    sortedCounts = runningCounts.transform(
        lambda rdd: rdd.sortBy(lambda (airport, freq): freq, ascending=False))
def createStreamingContext(): conf = SparkConf().setMaster("local[2]").setAppName("amqp_temperature") conf.set("spark.streaming.receiver.writeAheadLog.enable", "true") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 1) ssc.checkpoint("/tmp/spark-streaming-amqp") receiveStream = AMQPUtils.createStream(ssc, "localhost", 5672, "temperature") temperature = receiveStream.map(getTemperature) max = temperature.reduceByWindow(getMax, None, 5, 5) max.pprint() return ssc
def main():
    global ssc
    conf = SparkConf()
    conf.setAppName("TopAirports")
    conf.set("spark.streaming.kafka.maxRatePerPartition", "0")
    conf.set('spark.streaming.stopGracefullyOnShutdown', True)
    sc = SparkContext(conf=conf)

    ssc = StreamingContext(sc, 1)  # Stream every 1 second
    ssc.checkpoint("/tmp/checkpoint")

    signal.signal(signal.SIGINT, stop_streaming)

    stream_kafka()
def functionToCreateContext(): sc = SparkContext("local[*]", "streaming_part") sc.setLogLevel("ERROR") ssc = StreamingContext(sc, 5) data_from_ticket_mechine = ssc.socketTextStream("localhost", 9999) data_from_camera_mechine = ssc.socketTextStream("localhost", 9998) #meat data_from_ticket_mechine.map(ticket_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(ticket_mechine_RDD_handler) data_from_camera_mechine.map(camera_mechine_pre_process).updateStateByKey(updateFunction).foreachRDD(camera_mechine_RDD_handler) ssc.checkpoint(checkpointDirectory) # set checkpoint directory return ssc
def functionToCreateContext():
    # spark context config
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    # kafka
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    # processing
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .updateStateByKey(updateFunction) \
        .map(toStringList) \
        .foreachRDD(lambda rdd: rdd.saveAsNewAPIHadoopDataset(conf=conf, keyConverter=keyConv, valueConverter=valueConv))
    return ssc
def functionToCreateContext():
    # new context
    conf = SparkConf()
    conf = conf.setAppName(APP_NAME)
    sc = SparkContext(conf=conf)

    # http://stackoverflow.com/questions/24686474/shipping-python-modules-in-pyspark-to-other-nodes
    sc.addPyFile("common.py")

    # takes the SparkContext and the batch interval as arguments
    ssc = StreamingContext(sc, 10)

    # set checkpoint directory
    ssc.checkpoint(CHECKPOINT_DIR)

    # return the streaming spark context
    return ssc
def createContext(brokers, topic, checkpointDir):
    # If you do not see this printed, that means the StreamingContext has been loaded
    # from the new checkpoint
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    wordCounts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    #wordCounts.foreachRDD(echo)
    wordCounts.pprint()

    ssc.checkpoint(checkpointDir)
    return ssc
def createContext():
    conf = SparkConf().setMaster('spark://{}:7077'.format(MASTER_URL)).set('spark.executor.memory', '2g')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, STREAMING_INTERVAL)
    lines = ssc.textFileStream('hdfs://{}/data/on_time/streaming/'.format(MASTER_URL))
    ssc.checkpoint(CHECKPOINT_DIR)

    # main split-combine-apply logic put here
    # filter out header and other invalid rows
    rdd = lines.map(lambda line: line.split(',')).filter(lambda words: len(words) > 56)
    # extract first field (for filtering header), Carrier, Orig, Dest, and delay fields
    rdd2 = rdd.map(lambda x: (x[0], x[8], x[11], x[18], x[52], x[53], x[54], x[55], x[56])) \
        .map(lambda line: [str(w.replace('"', '')) for w in line]) \
        .filter(lambda row: row[0] != 'Year' and any(row[4:]))
    rdd2.pprint()
    # sum up delay fields for each row
    sum_delay_rdd = rdd2.map(sum_delay)
    sum_delay_rdd.pprint()
    # sum up delay for each (orig, dest, carrier) pair
    combined_rdd = sum_delay_rdd.updateStateByKey(updateFunction)
    combined_rdd.pprint()
    # calculate avg delay
    avg_rdd = combined_rdd.transform(
        lambda rdd: rdd.map(lambda (x, y): ((x[0], x[1]), (y[0] / float(y[1]), x[2]))))
    avg_rdd.pprint()
    # group by (orig, dest)
    avg_rdd_by_route = avg_rdd.groupByKey()
    # sort by on time performance for each (orig, dest) route and take top 10
    route_sorted_carrier = avg_rdd_by_route.mapValues(lambda x: sorted(list(x))[:10])
    aa = route_sorted_carrier.flatMapValues(lambda x: x)
    aa.pprint()
    aa.foreachRDD(process)
    return ssc
def creatingfunc():
    # create streaming context
    ssc = StreamingContext(sc, batchIntervalSeconds)
    LogToKinesis("creatingfunc", "StreamingContext", str(dir(ssc)))
    ssc.remember(10 * batchIntervalSeconds)

    # setup streams
    try:
        #paxRecords = ssc.textFileStream(SOURCE).map(ParsePassengerRecord)  # parse and enrich pax data
        kinesisStream = KinesisUtils.createStream(
            ssc, KINESIS_APPNAME, KINESIS_STREAM, KINESIS_ENDPOINT_URL, KINESIS_REGION,
            InitialPositionInStream.TRIM_HORIZON, 10, StorageLevel.MEMORY_AND_DISK_2,
            ACCESS_KEY, SECRET_KEY)
        LogToKinesis("kinesisStream", "KinesisUtils.createStream", str(dir(kinesisStream)))
        # track total boarding and alighting per train/ownmoduleno
        # Note: rdd returned by updateStateByKey is (ownmoduleno, (alight, board));
        # for easy conversion to dataframe we map this rdd to (ownmoduleno, alight, board).
        # (Not sure why the following did not work: map(lambda k, v: (k, v[0], v[1])))
        """
        noOfPassengersOwnModuleToday = paxRecords.map(lambda record: (record[OWN_MODULE_NO], (record[TOTAL_ALIGHTING], record[TOTAL_BOARDING]))) \
            .updateStateByKey(updatePassengerCount) \
            .map(lambda v: (v[0], v[1][0], v[1][1]))
        paxRecordsWindowStationLine = paxRecords.window(1800, 20)  # compute aggregates on a 30 min window updated every 20 sec
        paxRecordsTable = paxRecords.window(900, 900)  # save to permanent storage every 15 min (how large/small amounts of data is optimal to save at a time?)
        LogToKinesis("creatingfunc", "Streams set up OK")
        """
    except Exception as e:
        LogToKinesis("creatingfunc", "EXCEPTION", str(e))

    # output streams
    try:
        #paxRecords.foreachRDD(processPax)
        #noOfPassengersOwnModuleToday.foreachRDD(processOwnModuleState)  # send sum of alightings and boardings and pax present onboard for each train to Kinesis
        #paxRecordsWindowStationLine.foreachRDD(processStationLineWindow)  # send aggregates to Kinesis periodically, i.e. last 30 mins updated every 20 secs
        #paxRecordsTable.foreachRDD(processTable)  # save to permanent table periodically
        kinesisStream.foreachRDD(processKinesisPax)
    except Exception as e:
        LogToKinesis("mainLoop", "EXCEPTION", str(e))

    ssc.checkpoint(CHECKPOINTDIR)
    return ssc
    response = requests.post(url, data=post_data)


if __name__ == "__main__":
    # os.environ["PYSPARK_PYTHON"] = "python3"
    # os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"
    os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'

    # create spark configuration
    conf = SparkConf()
    conf.setAppName("TwitterSentiment")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    # sc.setLogLevel("ERROR")

    # streaming data will be divided into batches every 10s
    ssc = StreamingContext(sc, 10)

    # dataStream = ssc.socketTextStream(HOST, PORT).window(windowDuration=10, slideDuration=10)
    dataStream = KafkaUtils.createStream(ssc, ZOOKEEPER, 'spark-streaming', {'china': 1}) \
        .window(windowDuration=10, slideDuration=10)
    dataStream.pprint()

    (dataStream.map(lambda line: line[1].lower())
        .filter(lambda word: len(word) > 0)
        .map(lambda word: (word,))  # map to a tuple (word,)
        .foreachRDD(predict))

    ssc.checkpoint("checkpoints_sentiment")
    ssc.start()
    ssc.awaitTermination()
    rediscon = redis.StrictRedis(host='ec2-52-40-47-83.us-west-2.compute.amazonaws.com', port=6379, db=0, password='')
    pipe = rediscon.pipeline()
    for i in s:
        key = 'stream'
        value = i[1]
        val = str(i[0]) + ',' + str(value[0]) + ',' + str(value[1])
        pipe.lpush(key, val)
    pipe.execute()


appName = 'Stream_APP'
master = 'spark://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:7077'
conf = SparkConf().setAppName(appName).setMaster(master)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 1)
ssc.checkpoint("hdfs://ec2-50-112-193-115.us-west-2.compute.amazonaws.com:9000/user/spark_checkpoint")

brokers = "52.41.140.111:9092,52.41.90.5:9092,52.41.120.152:9092"
kvs = KafkaUtils.createDirectStream(ssc, ["stream_test"], {"metadata.broker.list": brokers})

user_metric = kvs.map(split_features)
user_count = kvs.map(split_count)

count_stream = user_count.reduceByKeyAndWindow(lambda x, y: x + y, 3, 1)
sum_stream = user_metric.reduceByKeyAndWindow(window_sum_hr_speed, 3, 1)

joined_stream = sum_stream.join(count_stream)
smoothened_stream = joined_stream.map(computations)
# Import libs
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Begin
if __name__ == "__main__":
    sc = SparkContext(appName="StreamingErrorCount")

    # 2 is the batch interval : 2 seconds
    ssc = StreamingContext(sc, 2)

    # Checkpoint for backups
    ssc.checkpoint("file:///tmp/spark")

    # Define the socket where the system will listen
    # Lines is not a rdd but a sequence of rdds, not static, constantly changing
    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))

    # Update function
    def countWords(newValues, lastSum):
        if lastSum is None:
            lastSum = 0
        return sum(newValues, lastSum)

    word_counts = lines.flatMap(lambda line: line.split(" "))\
        .map(lambda word: (word, 1))\
        .updateStateByKey(countWords)

    ## Display the counts
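    # A minimal completion sketch, not part of the original snippet, which cuts off at
    # "Display the counts": it assumes the standard DStream.pprint() output and the
    # usual start/awaitTermination driver calls.
    word_counts.pprint()
    ssc.start()
    ssc.awaitTermination()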
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SQLContext
from pyspark.sql.functions import lit
from pyspark.sql import SparkSession
import sys
import json

configuration = SparkConf()
configuration.setAppName('Project')
spark_context = SparkContext(conf=configuration)
spark = SparkSession(spark_context)
streaming_context = StreamingContext(spark_context, 4)
streaming_context.checkpoint('Project Checkpoint')
input_stream = streaming_context.socketTextStream('localhost', 6100)

##########################################################
chemistry = dict()
sql = SQLContext(spark_context)
##########################################################

## Player Profile
player_profile = {}

# Open a csv reader called DictReader
# 3. spark-submit --jars spark-streaming-kafka-assembly_2.10-1.4.1.jar ./alerts/pairs_corr.py vsu-01:2181
# http://stackoverflow.com/questions/3425439/why-does-corrcoef-return-a-matrix
#
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: pairs_corr_redis.py <broker_list ex: vsu-01:2181> <queue_name - for saving the correlations series>"
        )
        exit(-1)

    app_name = "IbMarketDataStream"
    sc = SparkContext(appName=app_name)  #, pyFiles = ['./cep/redisQueue.py'])
    ssc = StreamingContext(sc, 2)
    ssc.checkpoint('./checkpoint')

    brokers, qname = sys.argv[1:]
    #
    # demonstrate how to use broadcast variable
    Q = sc.broadcast({
        'qname': qname,
        'namespace': 'mdq',
        'host': 'localhost',
        'port': 6379,
        'db': 3
    })
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    hostname = "localhost"
    port_number = 8888

    sc = SparkContext(appName="ErrorCounter")
    ssc = StreamingContext(sparkContext=sc, batchDuration=2)
    ''' batchDuration=2 --> all messages received within a 2s window form one RDD within the DStream. '''

    ''' Create a new directory, this one will be used as a checkpoint storage point.'''
    ssc.checkpoint("/home/wesley/Documents/spark/checkpoints")

    ''' Lines is a DStream of RDDs, it's NOT a static collection of RDDs as it's constantly being updated'''
    lines = ssc.socketTextStream(hostname=hostname, port=port_number)

    error_count = lines.flatMap(lambda line: line.split(" ")) \
        .filter(lambda word: "ERROR" in word) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda sum_occurrences, next_occurrence: sum_occurrences + next_occurrence)

    ''' Print out the result.'''
    error_count.pprint()

    ''' Start listening for streaming data.'''
    ssc.start()

    ''' Wait infinitely for streaming data unless you explicitly terminate the application.'''
    ssc.awaitTermination()
# get hostname and port of data source from application arguments
hostname = sys.argv[1]
port = int(sys.argv[2])

# Create a new SparkContext
sc = SparkContext()

# Set log level to ERROR to avoid distracting extra output
sc.setLogLevel("ERROR")

# Create and configure a new Streaming Context
# with a 1 second batch duration
ssc = StreamingContext(sc, 1)

# Enable checkpointing (required for window operations)
ssc.checkpoint("logcheckpt")

# Create a DStream of log data from the server and port specified
logStream = ssc.socketTextStream(hostname, port)

# Every two seconds, display the total number of requests over the
# last 5 seconds
# countStream = logStream.countByWindow(5,2)
# countStream.pprint()

# ---------------------
# Bonus: Display the top 5 users every second

# Count requests by user ID for every batch
userreqStream = logStream \
    .map(lambda line: (line.split(' ')[2], 1)) \
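# A possible continuation sketch, not from the original (the snippet above is cut off
# mid-pipeline): count requests per user over a sliding 5-second window and display the
# top 5 users every second. The window and slide durations are assumptions.
topUsers = logStream \
    .map(lambda line: (line.split(' ')[2], 1)) \
    .reduceByKeyAndWindow(lambda a, b: a + b, None, 5, 1) \
    .transform(lambda rdd: rdd.sortBy(lambda pair: pair[1], ascending=False))
topUsers.pprint(5)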
        r2.sort(key=lambda x: x[1], reverse=True)
        if len(r2) > 4:
            print("%s,%s,%s,%s,%s" % (r2[0][0], r2[1][0], r2[2][0], r2[3][0], r2[4][0]))
    except:
        pass


if __name__ == "__main__":
    conf = SparkConf()
    conf.setAppName("BigData")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, int(sys.argv[2]))  # sys.argv[2] is the batch duration
    ssc.checkpoint("/checkpoint_BIGDATA")  # checkpointing is for recovering lost data
    dataStream = ssc.socketTextStream("localhost", 9009)
    tweet = dataStream.map(lambda w: (w.split(';')[7]))
    #tweet.pprint()
    hashtag = tweet.flatMap(lambda w: compute(w))
    h = hashtag.window(int(sys.argv[1]), 1)
    count = hashtag.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, int(sys.argv[1]), 1)
    #count.pprint()
    #To perform an operation on each RDD
    count.foreachRDD(process_rdd)
    return (msg.topic, data.PulseId())
    #return data.PulseId()


tlist = []
numTopic = 2
topicHead = 'SparkTest-'
for i in range(numTopic):
    tlist.append(topicHead + str(i))

sc = SparkContext(appName="mytstApp")
sc.setLogLevel("ERROR")  # reduce shell log output
ssc = StreamingContext(sc, 1)

#tlist = ['Spark_1','Spark_2']
checkpoint_dir = './Checkpoint/spark'
ssc.checkpoint(checkpoint_dir)

kafka_params = {
    "bootstrap.servers": "localhost:9092",
    "group.id": "myUserGroup",
    "enable.auto.commit": "false",
    "auto.offset.reset": "smallest"
}

dstream = [KafkaUtils.createDirectStream(ssc, [tlist[0]], kafka_params,
                                         keyDecoder=spot_decoder,
                                         valueDecoder=spot_decoder,
                                         messageHandler=setHandler)
           for i in range(len(tlist))]

countList = []
    fin = interval * duracion
    return (fin - duracion, fin)


def fUpdate(newValues, history):
    return set((history or [])).union(newValues)


conf = SparkConf().setMaster("local[2]").setAppName("ContarCoordenadas")
sc = SparkContext(conf=conf)
sc.setLogLevel("OFF")
ssc = StreamingContext(sc, 5)
initial = sc.parallelize([])
stream = ssc.socketTextStream("localhost", 7777)
ssc.checkpoint(persistent + "counts")

counts = stream.map(lambda line: line.split(";")).map(
    lambda x: (get_interval(int(x[3])), x[0]))
history = counts.updateStateByKey(fUpdate, initialRDD=initial)

# https://spark.apache.org/docs/latest/streaming-programming-guide.html#dataframe-and-sql-operations
history.foreachRDD(
    lambda time, rdd: print(" duracion {} -- (intervalo - autos) {}".format(
        duracion,
        rdd.mapValues(lambda a: len(a)).sortByKey(False).map(
            lambda a: (as_tuple_range(a[0]), a[1])).collect(),
    )))

ssc.start()
            x += ','
        print(x[:-1])
        x = ""
    except:
        e = sys.exc_info()[0]


window_size = int(sys.argv[1])
batch_size = int(sys.argv[2])
conf = SparkConf()
conf.setAppName("FifaApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, batch_size)
ssc.checkpoint("/usr/local/hadoop/checkpoint_FifaApp")
dataStream = ssc.socketTextStream("localhost", 9009)
#tweet = dataStream.window(window_size, 1)
#tweet = dataStream.map(lambda w : w.split(';')[7])
tweet = dataStream.map(lambda w: w.split(';')[7])
tweet = tweet.flatMap(lambda x: x.split(','))
tweet = tweet.map(lambda y: (y, 1))
#tagsTotal = tweet.reduceByKey(aggregate_tags_count)
#tagsTotal = tweet.reduceByKeyAndWindow(aggregate_tags_count,None,window_size,1)
tagsTotal = tweet.updateStateByKey(aggregate_tags_count)
tagsTotal.foreachRDD(process_rdd)
#tagsTotal.pprint()
ssc.start()
    else:
        return 0


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: kafka-stream-find-word-example.py <broker_list> <topic> <word>", file=sys.stderr)
        exit(-1)

    file_path = "/path/to/output_file/file_name"
    checkpointDirectoryLocation = "/path/to/checkpoint/"

    sc = SparkContext(appName="StreamingKafkaExample")
    ssc = StreamingContext(sc, 5)  # read every 5 sec
    ssc.checkpoint(checkpointDirectoryLocation)  # required for window functions

    brokers, topic, word = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    lines = kvs.map(lambda x: x[1])
    ct = lines.map(lambda line: check_for_word(line, word, file_path))\
        .reduceByWindow(lambda a, b: a + b, lambda a, b: a - b, 30, 30)
    ct.pprint()

    ssc.start()
    ssc.awaitTermination()
from __future__ import print_function
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    # Create the Spark context
    sc = SparkContext(appName="PythonStreamingApp")
    # Necessary log4j logging level settings are done
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)
    # Create the Spark Streaming Context with 10 seconds batch interval
    ssc = StreamingContext(sc, 10)
    # Set the checkpoint directory for saving the data to recover when there is a crash
    ssc.checkpoint("/tmp")
    # The quorum of Zookeeper hosts
    zooKeeperQuorum = "localhost"
    # Message group name
    messageGroup = "sfb-consumer-group"
    # Kafka topics list, separated by commas if there are multiple topics to be listened on
    topics = "sfb"
    # Number of threads per topic
    numThreads = 1
    # Create a Kafka DStream
    kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup, {topics: numThreads})
    # Create the Kafka stream
    appLogLines = kafkaStream.map(lambda x: x[1])
    # Count each log message line containing the word ERROR
    errorLines = appLogLines.filter(lambda appLogLine: "ERROR" in appLogLine)
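    # A minimal completion sketch, not part of the original snippet: print a per-batch
    # count of the ERROR lines and start the job. DStream.count(), pprint(), start() and
    # awaitTermination() are standard PySpark Streaming API; how the original author
    # consumed errorLines is an assumption.
    errorLines.count().pprint()
    ssc.start()
    ssc.awaitTermination()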
    return flipped_sorted.map(lambda (x, y): (y, x))


stream = KafkaUtils.createDirectStream(
    ssc, [config.TOPIC], {"metadata.broker.list": config.KAFKA_SERVER})
lines = stream.map(lambda x: x[1])
# lines.pprint()

# Reduce by author's timezone and effective user mentions over a sliding window.
tweet_locations = lines.map(extract_location).filter(
    lambda loc: loc != 'None').map(lambda x: (x, 1))
tweet_mentions = lines.flatMap(extract_mentions).filter(lambda user: user[0])

locations_agg = tweet_locations \
    .reduceByKeyAndWindow(lambda a, b: (a + b), lambda x, y: x - y,
                          config.WINDOW_SIZE, config.SLIDE_INTERVAL) \
    .transform(lambda rdd: sort_by_value(rdd))

mentions_agg = tweet_mentions \
    .reduceByKeyAndWindow(lambda a, b: (a + b), lambda x, y: x - y,
                          config.WINDOW_SIZE, config.SLIDE_INTERVAL) \
    .transform(lambda rdd: sort_by_value(rdd))

locations_agg.pprint()
mentions_agg.pprint()

# Metadata checkpointing is primarily needed for recovery from driver failures,
# whereas data or RDD checkpointing is necessary even for basic functioning if
# stateful transformations are used.
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#checkpointing
ssc.checkpoint(config.SPARK_CHECKPOINT_DIR)

ssc.start()
ssc.awaitTermination()
    for record in taken[:num]:
        print(record)
        result.append(record)

    ws = create_connection(url)
    ws.send(json.dumps(result))
    ws.close()

    if len(taken) > num:
        print("...")
    print("")


def updateFunc(new_values, last_sum):
    return sum(new_values) + (last_sum or 0)


sc = SparkContext(appName="PythonTwitterStreaming")
ssc = StreamingContext(sc, 1)

tweets = ssc.socketTextStream('localhost', 9999)
ssc.checkpoint("./checkpoint-tweet")

running_counts = tweets.flatMap(lambda line: line.split(" "))\
    .map(lambda word: (word, 1))\
    .updateStateByKey(updateFunc)\
    .transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

running_counts.foreachRDD(takeAndPrint)

ssc.start()
ssc.awaitTermination()
        print(hashtag_counts_df)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)


def tmp(x):
    return (x.split(';')[0], 1)


conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 2)
ssc.checkpoint("/home/anjali/Desktop/sem5/")
dataStream = ssc.socketTextStream("localhost", 9009)
# dataStream.pprint()
tweet = dataStream.map(tmp)
# OR tweet = dataStream.map(lambda w: (w.split(';')[0], 1))
count = tweet.reduceByKey(lambda x, y: x + y)
#count.pprint()
#To maintain state
totalcount = tweet.updateStateByKey(aggregate_tweets_count)
#totalcount.pprint()
#To perform an operation on each RDD
totalcount.foreachRDD(process_rdd)
totalcount.pprint()
Description:
'''
import sys

from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: WindowedSocketStreaming.py <hostname> <port>", file=sys.stderr)
        exit(-1)

    conf = SparkConf().setAppName('Windowed Streaming').setMaster("local[3]")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("file:///mnt/data1/workspace/data_analysis_mining/Python_Spark/spark_tutorial/data/socket/stateful")

    def update_func(new_value, last_num):
        return sum(new_value) + (last_num or 0)

    initialState = sc.parallelize([('hello', 1), ('world', 1)])

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    running_counts = lines.flatMap(lambda line: line.split(' '))\
        .map(lambda word: (word, 1))\
        .updateStateByKey(update_func, initialRDD=initialState)

    running_counts.saveAsTextFiles("file:///mnt/data1/workspace/data_analysis_mining/Python_Spark/spark_tutorial/data/socket/output")
    running_counts.pprint()

    ssc.start()
######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

dynamo = dynamodb2.connect_to_region(AWS_REGION)
out_table = Table(DB_TABLE, connection=dynamo)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')
sc = SparkContext(appName='g2ex2', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g2ex2')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
filtered = lines.map(lambda line: line.split(","))\
    .map(lambda fields: Flight(fields))\
    .filter(lambda fl: fl.Cancelled == 0)\
    .map(lambda fl: ((fl.Origin, fl.Dest), (fl.DepDelay, 1)))\
    .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: rdd.foreachPartition(save_partition))

# start streaming process
ssc.start()
try:
        i = i + 1
    if hashh != "":
        print("%s" % (hashh))


if len(sys.argv) != 3:
    print("Should enter file, Window Size, Batch Duration", file=sys.stderr)
    sys.exit(-1)

wind_size = int(sys.argv[1])
batch_duration = int(sys.argv[2])

conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, batch_duration)
ssc.checkpoint("home/hduser/checkpoint_BIGDATA")
dataStream = ssc.socketTextStream("localhost", 9009)
tweet = dataStream.map(lambda w: (w.split(';')[7]))
hashtag = tweet.flatMap(lambda w: (w.split(',')))
hasht = hashtag.map(lambda w: (w, 1))
counts = hasht.filter(lambda x: x[0] != '')
totalcount = counts.reduceByKeyAndWindow(
    lambda a, b: a + b, None, wind_size, batch_duration).transform(
        lambda rdd: rdd.sortBy(lambda y: (-y[1], y[0])))
#print(totalcount)
totalcount.foreachRDD(process_rdd)
import sys
import json

import redis

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def publishToRedis(tup):
    tweet = tup
    pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
    r = redis.StrictRedis(connection_pool=pool)
    r.publish("twitterchannel", tweet)


if __name__ == '__main__':
    sc = SparkContext(appName="PythonTwitterStreaming")
    ssc = StreamingContext(sc, 1)

    tweetStream = KafkaUtils.createStream(ssc, 'localhost:2181', "kafka-stream-redis", {'tweets': 1})
    tweets = tweetStream.map(lambda x: x[1])
    ssc.checkpoint("./checkpoint-tweet")

    tweets.foreachRDD(lambda rdd: rdd.foreach(publishToRedis))

    ssc.start()
    ssc.awaitTermination()
wordPairs = words.map(lambda word: (word, 1))
hashtagPairs = hashtags.map(lambda hashtag: (hashtag, 1))

# Complete a wordcount using a key and 20 minute window
wordCounts = wordPairs \
    .reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), 1200, 10)  # Last 20 minutes, updates every 10 seconds
hashtagCount = hashtagPairs \
    .reduceByKeyAndWindow(lambda x, y: int(x) + int(y), lambda x, y: int(x) - int(y), 1200, 10)  # Last 20 minutes, updates every 10 seconds

# Sort the words and hashtags in descending order
sortedWordCount = wordCounts.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))
sortedHashtagCount = hashtagCount.transform(lambda rdd: rdd.sortBy(lambda x: x[1], False))

# Send word and hashtag counts to the api
sortedWordCount.foreachRDD(lambda rdd: send_data_to_api(rdd.collect(), 'http://localhost:5000/incomingWordCount', 'count'))
sortedHashtagCount.foreachRDD(lambda rdd: send_data_to_api(rdd.collect(), 'http://localhost:5000/incomingHashtagCount', 'count'))

# Store filtered tweets to the database and send them to the api
tweet_objects.foreachRDD(lambda rdd: store_and_send_tweet(rdd))

# Save counts to file
sortedWordCount.saveAsTextFiles("./spark_data/word_counts/".format(str(datetime.now()) + ".json"))
sortedHashtagCount.saveAsTextFiles("./spark_data/hashtag_counts/".format(str(datetime.now()) + ".json"))

# Start the streaming context
ssc.checkpoint("./spark_data/checkpoints/")
ssc.start()
ssc.awaitTermination()
if __name__ == "__main__": stopwords = read_stopwords("./stopwords.txt") ss = SparkSession.builder \ .appName("Spark Structured Streaming from Twitter") \ .getOrCreate() sc = ss.sparkContext ssc = StreamingContext(sc, 2) # setting a checkpoint to allow RDD recovery ssc.checkpoint( f"hdfs://{os.environ['HDFS_HOST_NAME']}:{os.environ['HDFS_HOST_PORT']}/checkpoint" ) # read data from port 9009 dataStream = ssc.socketTextStream(os.environ["TWITTER_CLIENT"], os.environ["TWITTER_PORT"]) # split each tweet into words words = dataStream.flatMap(lambda line: line.lower().split(" ")) # filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1) hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1)) # adding the count of each hashtag to its last count tags_totals = hashtags.updateStateByKey(aggregate_tags_count) # do processing for each RDD generated in each interval tags_totals.foreachRDD(process_hashtags_rdd)
def input_preporcess(line):
    fields = line.split(",")
    # return an empty tuple (instead of None) for invalid rows so flatMap can iterate it
    return ((str(fields[ORIGIN_COL]), 1), (str(fields[DEST_COL]), 1)) if fields[DEST_COL] != "" else ()


def updateFunction(newValues, runningCount):
    return sum(newValues) + (runningCount or 0)


if __name__ == '__main__':
    # set up
    sc = SparkContext(appName="q11")
    ssc = StreamingContext(sc, TimeOut)
    brokers = BootStarpServers
    topic = TopicName
    sc.setLogLevel("WARN")
    ssc.checkpoint("/tmp/q11")
    kvs = KafkaUtils.createDirectStream(ssc, [topic], KafkaParams)

    # key logic
    lines = kvs.map(lambda x: x[1])
    rst = lines.flatMap(input_preporcess).filter(
        lambda x: x != None).updateStateByKey(updateFunction)
    rst.foreachRDD(output)

    # start program
    ssc.start()
    ssc.awaitTermination()
## Developer should start REPL using:
# pyspark --master local[2]

#3a
from pyspark.streaming import StreamingContext
#3b
ssc = StreamingContext(sc, 2)
#3c
inputDS = ssc.socketTextStream("sandbox", 9999)
#3d
ssc.checkpoint("hdfs:///user/root/checkpointDir")
#3e
windowDS = inputDS.window(10, 2).flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
#3f
windowDS.pprint()
#3g
sc.setLogLevel("ERROR")
#3h
ssc.start()
TCP_IP = 'localhost'
TCP_PORT = 9006

# Pyspark
# create spark configuration
config = SparkConf()
config.setAppName('TwitterApp')
config.setMaster('local[2]')
config.set("spark.network.timeout", "4200s")
config.set("spark.executor.heartbeatInterval", "4000s")
# create spark context with the above configuration
spark_con = SparkContext(conf=config)
# create the Streaming Context from the spark context with a 4 second batch interval
sparkstreamconf = StreamingContext(spark_con, 4)
sparkstreamconf.checkpoint("checkpoint_TwitterApp")

elastic_search = Elasticsearch([{'host': 'localhost', 'port': 9200}])


def filter_emoji(text_json):
    text = text_json['text']
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"
                               u"\U0001F300-\U0001F5FF"
                               u"\U0001F680-\U0001F6FF"
                               u"\U0001F1E0-\U0001F1FF"
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="AirportRank")
ssc = StreamingContext(sc, 10)
ssc.checkpoint("checkpoint")

# lines = ssc.textFileStream("/user/otp")
kvs = KafkaUtils.createDirectStream(ssc, ["flights"], {"metadata.broker.list": "hdp-master:9092"})


def print_top_list(rdd):
    for (count, word) in rdd.take(10):
        print("%s: %i" % (word, count))
    # print '[%s]' % ', '.join(map(str, rdd.take(10)))


def updateFunc(new_values, last_sum):
    return sum(new_values) + (last_sum or 0)


lines = kvs.map(lambda x: x[1])
running_counts = lines.flatMap(lambda line: line.split(",")[4:6]) \
    .map(lambda apt: (apt, 1)) \
    .updateStateByKey(updateFunc)  # reduceByKey(lambda x, y: x + y)

top = running_counts.map(lambda x: (x[1], x[0])).transform(lambda rdd: rdd.sortByKey(False))
top.foreachRDD(print_top_list)
# top.pprint()
    temp_ave = sum(i) / len(i)
    ave_list.append(temp_ave)
    # print(len(ave_list))
    # print(sorted(ave_list))
    return sorted(ave_list)[-2]


# port_num = 9999
port_num = int(sys.argv[1])
# output_name = "task2.csv"
output_name = sys.argv[2]

output = open(output_name, 'w')
output.write("Time, Ground Truth, Estimation\n")
output.flush()
os.fsync(output.fileno())

sc = SparkContext("local[2]", "hw6")
ssc = StreamingContext(sc, 5)
ssc.checkpoint('checkpoint')

initialStateRDD = sc.parallelize([('temp_key', {'city': []})])

lines = ssc.socketTextStream("localhost", port_num)
key_city = lines.map(lambda x: (
    'temp_key',
    int(binascii.hexlify(json.loads(x)['city'].encode('utf8')), 16)
)).updateStateByKey(updateFunc, initialRDD=initialStateRDD)
key_city.pprint()

ssc.start()
ssc.awaitTermination()

# int(binascii.hexlify(new_value[1].encode('utf8')), 16)
import sys

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext

APP_NAME = "Group2_2_Top10DestinationsOntimeDepartureFromX"
STREAMING_INTERVAL = 1

master = str(sys.argv[1])  # 'ec2-xx-xx-xx-xx.compute-1.amazonaws.com'
originAirport = str(sys.argv[2])  # 2nd argument: 'SRQ', 'CMH', 'JFK', 'SEA', or 'BOS'
dataFilePathOnHdfs = "hdfs://{}/btsdata/aviation/ontime/".format(master)

conf = SparkConf().setAppName(APP_NAME).setMaster('spark://{}:7077'.format(master))
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, STREAMING_INTERVAL)
ssc.checkpoint('/tmp/ccc')

lines = ssc.textFileStream(dataFilePathOnHdfs)
res2_2 = (lines.map(lambda line: line.split(","))
          .filter(lambda line: line[6] == originAirport)
          .map(lambda line: (line[7], float(line[12])))   # (Carrier, Departure Delay)
          .combineByKey(lambda x: (x, 1),
                        lambda x, y: (x[0] + y, x[1] + 1),   # (sum, count)
                        lambda x, y: (x[0] + y[0], x[1] + y[1]))
          .map(lambda (key, (valueSum, count)): (key, valueSum / count))
          .sortByKey('ascending'))
    if process_state == 'DEAD_PROCESS':
        return PROC_STOPPED
    return PROC_UNKNOWN


def event_count_to_text((event, count)):
    if event == PROC_STARTED:
        return "Started: " + str(count)
    if event == PROC_STOPPED:
        return "Stopped: " + str(count)
    return ""


BATCH_DURATION = 10
WINDOW_DURATION = 60
SLIDE_DURATION = 20

sc = SparkContext(appName='PythonStreamingQueueStream')
ssc = StreamingContext(sc, BATCH_DURATION)
ssc.checkpoint('ckpt')

ssc.socketTextStream("localhost", 9999)\
    .map(line_to_event)\
    .filter(lambda event: event == PROC_STARTED or event == PROC_STOPPED)\
    .map(lambda event: (event, 1))\
    .reduceByKey(lambda count1, count2: count1 + count2)\
    .pprint()
#    .reduceByKeyAndWindow(
#        func=lambda count1, count2: count1 + count2,
#        invFunc=lambda count1, count2: count1 - count2,
#        windowDuration=WINDOW_DURATION,
#        slideDuration=SLIDE_DURATION)\
#    .map(event_count_to_text)\
#    .transform(lambda rdd: rdd.sortBy(lambda text: text))\
#    .saveAsTextFiles('process-stats')
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    sc = SparkContext(appName="StreamingErrorCount")
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("hdfs:///user/hdp/streaming")

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
    counts = lines.flatMap(lambda line: line.split(" "))\
        .filter(lambda word: "ERROR" in word)\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext, Row

title = "Iteration #3"

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: " + sys.argv[0] + " <zk> <topic> </checkpoint/dir>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic, checkpointDirectory = sys.argv[1:]
    sc = SparkContext(appName=title)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(checkpointDirectory)

    kafkaStream = KafkaUtils.createStream(ssc, zkQuorum, "iter-consumer", {topic: 1})
    event = kafkaStream.map(lambda x: x[1])
    lines = event.flatMap(lambda line: line.split("\n"))
    fields = lines.map(lambda f: f.split(","))
    hexandcall = fields.map(lambda f: (f[4], f[10]), 1)
    #callsign = fields.map(lambda c: (c[10]))
    #notnulls = hexandcall.filter(lambda n: (n != ''))
    #joined = notnulls.join(hexandcall)
    #callsigns = notnulls.map(lambda c: (c, 1))
    #hexidents = hexident.map(lambda h: (h, 1))
    #counts = callsigns.reduceByKeyAndWindow(lambda a, b: a+b, lambda a, b: a-b, 600, 10)
    counts = hexandcall.reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, 600, 10)
    counts.pprint()
#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = "wxmimperio"

from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("window_streaming_demo")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 3)
    # set the checkpoint directory on HDFS
    ssc.checkpoint("hdfs://localhost:9000/checkpiont/streaming_cp_log")

    lines = ssc.socketTextStream("spark-master", 9999)
    wordCounts = (
        lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda x, y: (x + y))
    )
    # every 3s, compute counts over the previous 6s of data
    windows = wordCounts.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 6, 3)
    windows.pprint()

    ssc.start()
    ssc.awaitTermination()
    if len(group) > 10:
        group = group[:10]
    return group


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: script.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic = sys.argv[1:]
    sc = SparkContext(appName="KafkaSparkStreaming")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint("/tmp/q22")  # mandatory for updateStateByKey

    ks = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 21})
    st1 = ks.map(lambda x: x[1].split(','))
    st2 = st1.flatMap(lambda x: [x[3], x[4]])
    st3 = st2.map(lambda x: (x, 1))
    st4 = st3.reduceByKey(lambda x, y: x + y)
    st5 = st4.map(lambda x: (x[0], x[1])).updateStateByKey(updateFunction)
    st6 = st5.map(lambda (key, value): (True, [(key, value)]))
    st7 = st6.reduceByKey(gettop10)
    st7.pprint()

    ssc.start()
    ssc.awaitTermination()
import sys

from pyspark import SparkContext  # lets us work with Spark
from pyspark.streaming import StreamingContext  # lets us work with streams in Spark

if __name__ == "__main__":
    sc = SparkContext("local[2]", "StreamingCount")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 2)  # 2 is the batch interval of the DStreams created by this StreamingContext
    ssc.checkpoint('file:///tmp/spark')

    lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))  # hostname and port
    counts = lines.flatMap(lambda line: line.split(" "))\
        .filter(lambda word: "ERROR" in word)\
        .map(lambda word: (word, 1))\
        .reduceByKeyAndWindow(lambda a, b: a + b, lambda a, b: a - b, 20, 2)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()

# ncat -lk 9999
# spark-submit .\reduce_by_key_and_window.py localhost 9999
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext, Row

title = "Iteration #3"

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: " + sys.argv[0] + " <zk> <topic> </checkpoint/dir>", file=sys.stderr)
        exit(-1)

    zkQuorum, topic, checkpointDirectory = sys.argv[1:]
    sc = SparkContext(appName=title)
    ssc = StreamingContext(sc, 1)
    ssc.checkpoint(checkpointDirectory)

    kafkaStream = KafkaUtils.createStream(ssc, zkQuorum, "iter-consumer", {topic: 1})
    event = kafkaStream.map(lambda x: x[1])
    lines = event.flatMap(lambda line: line.split("\n"))
    fields = lines.map(lambda f: f.split(","))
    hexandcall = fields.map(lambda f: (f[4], f[10]), 1)
    #callsign = fields.map(lambda c: (c[10]))
    #notnulls = hexandcall.filter(lambda n: (n != ''))
    #joined = notnulls.join(hexandcall)
    #callsigns = notnulls.map(lambda c: (c, 1))
    #hexidents = hexident.map(lambda h: (h, 1))
    #counts = callsigns.reduceByKeyAndWindow(lambda a, b: a+b, lambda a, b: a-b, 600, 10)
    counts = hexandcall.reduceByKeyAndWindow(lambda a, b: a + b,
            new_vals1 + last_vals1)


######
###### Main script #######
######

signal.signal(signal.SIGINT, signal_handler)

config = SparkConf()
config.set('spark.streaming.stopGracefullyOnShutdown', True)
#config.set('spark.yarn.executor.memoryOverhead', '2g')
sc = SparkContext(appName='g1ex3', conf=config, pyFiles=['flight.py'])
ssc = StreamingContext(sc, 1)
ssc.checkpoint('/tmp/g1ex3')

lines = ssc.socketTextStream(sys.argv[1], int(sys.argv[2]))
filtered = lines.map(lambda line: line.split(","))\
    .map(lambda fields: Flight(fields))\
    .filter(lambda fl: fl.Cancelled == 0)\
    .map(lambda fl: (fl.DayOfWeek, (fl.ArrDelay, 1)))\
    .updateStateByKey(updateFunction)

filtered.foreachRDD(lambda rdd: print_rdd(rdd))

# start streaming process
ssc.start()
try:
        print word


if __name__ == '__main__':
    l = StdOutListener()
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = tweepy.Stream(auth, l)
    stream.filter(track=['cricket'], async=True)

    timeInterval = int(sys.argv[1])
    windowInterval = int(sys.argv[2])
    if windowInterval > timeInterval:
        print "Error: Window interval should not be greater than time interval"
        sys.exit(1)

    time.sleep(timeInterval)
    ssc = StreamingContext(sc, 1)
    stream = ssc.queueStream(l.rdds)
    ssc.checkpoint("/Users/highlight/sparkvagrant")
    stream = stream.map(lambda line: (line, 1))
    stream = stream.reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, windowInterval, windowInterval, 20)
    stream.foreachRDD(get_output)
    ssc.start()
    ssc.awaitTermination()
os.environ["PYSPARK_PYTHON"]='/usr/bin/python3' os.environ["PYSPARK_DRIVER_PYTHON"]='/usr/bin/python3' def output(rdd): mlist = [] for i in rdd.take(5): if (i != []): mlist.append(i) alist = [x[0] for x in mlist if x[0] != ''] #blist = [(x[0],x[1]) for x in mlist] if (alist != []): print(",".join(alist)) #print(blist) conf=SparkConf() conf.setAppName("BigData") sc=SparkContext(conf=conf) sc.setLogLevel("ERROR") batch_interval = int(sys.argv[2]) window_size = int(sys.argv[1]) ssc=StreamingContext(sc,batch_interval) ssc.checkpoint("/home/hadoop/checkpoint_BIGDATA424") dataStream=ssc.socketTextStream("localhost",9009) tweet=dataStream.map(lambda x: x.split(';')[7]).flatMap(lambda x: x.split(',')) tweet = tweet.countByValueAndWindow(window_size,batch_interval) sortedtweets = tweet.transform(lambda rdd: rdd.sortBy(lambda x : (-x[1],x[0]),ascending=True)) sortedtweets.foreachRDD(lambda rdd: output(rdd)) ssc.start() ssc.awaitTermination(100) ssc.stop()
import sys from pyspark import SparkContext from pyspark.streaming import StreamingContext if __name__ == "__main__": # Create a SparkSession sc = SparkContext(appName="Sai-Streaming") ssc = StreamingContext(sc, 2) ssc.checkpoint("checkpoint") lines = ssc.socketTextStream('192.168.56.1', 23456) #lines = ssc.socketTextStream("localhost", 9999) lines.pprint() def countWords(newVaues, lastSum): if lastSum is None: lastSum = 0 return sum(newVaues, lastSum) word_counts = lines.flatMap(lambda line: line.split(" "))\ .filter(lambda w: w.startswith("#"))\ .map(lambda word: (word,1))\ .updateStateByKey(countWords) word_counts.pprint() ssc.start() ssc.awaitTermination() #for starting this go to Terminal #spark-submit "C:\SaiStudy - LEarn It All - Version9\SaiStudy - Spark_STream_Count.py" 192.168.56.1 9999 #nc -Lp 9999
def process_rdd(rdd):
    sort = rdd.sortBy(lambda x: (-x[1], x[0])).filter(lambda y: y[0] != '').collect()
    if len(sort) > 4:
        #print(sorted_list[:5])
        print(sort[0][0], sort[1][0], sort[2][0], sort[3][0], sort[4][0], sep=",")


def out(l):
    o = l.split(";")[7]
    if ',' not in o:
        return [o]
    return o.split(",")


conf = SparkConf()
conf.setAppName("BD3")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, int(sys.argv[2]))
ssc.checkpoint("~/checkpoint_BIGDATA2")
dataStream = ssc.socketTextStream("localhost", 9009)
hashtags = dataStream.window(int(sys.argv[1]), 1)
all_hashtags = hashtags.flatMap(out)
result = all_hashtags.map(lambda h: (h, 1))
final_result = result.reduceByKey(lambda x, y: int(x) + int(y))
final_result.foreachRDD(process_rdd)
ssc.start()
ssc.awaitTermination(25)
ssc.stop()
from __future__ import print_function
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    # Create the Spark context
    sc = SparkContext(appName="DataIngestionApp")
    log4j = sc._jvm.org.apache.log4j
    log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN)
    # Create the Spark Streaming Context with 10 seconds batch interval
    ssc = StreamingContext(sc, 10)
    # Checkpoint directory setting
    ssc.checkpoint("/tmp")
    # Zookeeper host
    zooKeeperQuorum = "localhost"
    # Kafka message group
    messageGroup = "sfb-consumer-group"
    # Kafka topic where the program is listening for the data
    # Reader TODO: Here only one topic is included; it can take a comma-separated string containing the list of topics.
    # Reader TODO: When using multiple topics, use your own logic to extract the right message and persist it to its data store
    topics = "message"
    numThreads = 1
    # Create a Kafka DStream
    kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup, {topics: numThreads})
    messageLines = kafkaStream.map(lambda x: x[1])
    # This is where the messages are printed to the console. Instead of this, implement your own persistence logic
    messageLines.pprint()
    # Start the streaming
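    # A minimal completion sketch, not part of the original snippet: the trailing
    # "Start the streaming" comment suggests the usual driver calls, added here as an
    # assumption.
    ssc.start()
    ssc.awaitTermination()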
        counter += 1
    if len(string) > 0:
        print(string)


def tmp(x):
    for i in (x.split(';')[7]).split(','):
        return (i, 1)


windowDuration = int(sys.argv[1])
slideInterval = int(sys.argv[2])
conf = SparkConf()
conf.setAppName("BigData")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 1)
ssc.checkpoint("/checkpoint_BIGDATA")
dataStream = ssc.socketTextStream("localhost", 9009)
temp = dataStream.window(windowDuration, slideInterval)
# dataStream.pprint()
tweet = temp.flatMap(lambda x: (x.split(';')[7]).split(","))
tweet = tweet.map(lambda x: (x, 1))
# OR
#tweet = dataStream.map(lambda w: (w.split(';')[0], 1))
count = tweet.reduceByKey(lambda x, y: x + y)
#count.pprint()
#To maintain state
totalcount = tweet.updateStateByKey(aggregate_tweets_count)
# totalcount.pprint()
#To perform an operation on each RDD
    rdd.foreachPartition(partitionOfRecordsFun)


if __name__ == "__main__":
    # checkpoint_path = "hdfs://spark-master:9000/checkpiont/streaming_cp_log"
    checkpoint_path = "tachyon-ft://spark-master:19998/checkpoint/streaming_log"
    kafka_topic_list = ["realdata_receive"]
    broker_list_dit = {"metadata.broker.list": "192.168.108.222:9092"}

    conf = SparkConf().setAppName("streaming_kafka_send")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 5)
    # setDefaultEncoding()
    # ssc = initStreamingContext("streaming_kafka_deltaT", "local[2]", 7)
    ssc.checkpoint(checkpoint_path)

    kvs = KafkaUtils.createDirectStream(ssc, kafka_topic_list, broker_list_dit)
    deltaT = kvs.flatMap(lambda lines: toJson(lines)).map(lambda x: (x["oid"], x)). \
        updateStateByKey(updateFun).foreachRDD(foreachPartitionFun)
    # ensureOffset(kvs=kvs)

    offsetRanges = []

    def storeOffsetRange(rdd):
        global offsetRanges
        offsetRanges = rdd.offsetRanges()
        return rdd

    def printOffsetRange(rdd):
# # do processing for each RDD generated in each interval
# tags_totals.foreachRDD(su.process_rdd)
# # start the streaming computation
# ssc.start()
# # wait for the streaming to finish
# ssc.awaitTermination()

conf = SparkConf()
conf.setAppName("TwitterStreamApp")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# create the Streaming Context from the above spark context with interval size 2 seconds
ssc = StreamingContext(sc, 2)
# setting a checkpoint to allow RDD recovery
ssc.checkpoint("checkpoint_TwitterApp")
# read data from port 9009
dataStream = ssc.socketTextStream("localhost", 9009)
# split each tweet into words
words = dataStream.flatMap(lambda line: line.split(" "))
# filter the words to get only hashtags, then map each hashtag to be a pair of (hashtag,1)
hashtags = words.filter(lambda w: '#' in w).map(lambda x: (x, 1))
# adding the count of each hashtag to its last count
tags_totals = hashtags.updateStateByKey(su.aggregate_tags_count)
# do processing for each RDD generated in each interval
tags_totals.foreachRDD(su.process_rdd)
# start the streaming computation
ssc.start()
# wait for the streaming to finish
ssc.awaitTermination()