def main(ssc):
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
def stream(ssc):
    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    # Note: this direct stream is created but never used below.
    kstream = KafkaUtils.createDirectStream(ssc, topics=['topic1'],
                                            kafkaParams={"metadata.broker.list": "localhost:9092"})
    tweets = tweets.map(lambda x: x[1].encode("ascii", "ignore"))
    return tweets
def ss_direct_kafka_bucket_counter(brokers, topic, bucket_interval, output_msg, message_parse, valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time

    WARNING!! This function only works for spark 1.4.0+

    Args:
        brokers: the kafka broker that we look at for the topic
        topic: the kafka topic for input
        bucket_interval: the time interval in seconds (int) that the job will bucket

    Returns:
        None
    """
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers},
                                            valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})

    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)).reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)
    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
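A minimal usage sketch for the function above. The `message_parse` and `output_msg` helpers shown here are hypothetical illustrations (they are not part of the original code): `message_parse` is assumed to map a raw message to a bucket key, and `output_msg` is assumed to be a factory that returns a foreachRDD callback.

# Hypothetical helpers for ss_direct_kafka_bucket_counter (illustrative sketch only).
def message_parse(line):
    # assume the first whitespace-separated field is an epoch-seconds timestamp; bucket by minute
    return int(line.split(" ", 1)[0]) // 60

def output_msg(sc, ssc):
    def print_counts(rdd):
        for bucket, count in rdd.collect():
            print("bucket %s -> %d messages" % (bucket, count))
    return print_counts

ss_direct_kafka_bucket_counter("localhost:9092", "events", 60, output_msg, message_parse)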
def get_kafka_stream(topic, streaming_context):
    offset_specifications = simport.load(cfg.CONF.repositories.offsets)()
    app_name = streaming_context.sparkContext.appName
    saved_offset_spec = offset_specifications.get_kafka_offsets(app_name)
    if len(saved_offset_spec) < 1:
        MonMetricsKafkaProcessor.log_debug(
            "No saved offsets available..."
            "connecting to kafka without specifying offsets")
        kvs = KafkaUtils.createDirectStream(
            streaming_context, [topic],
            {"metadata.broker.list": cfg.CONF.messaging.brokers})
        return kvs
    else:
        from_offsets = {}
        for key, value in saved_offset_spec.items():
            if key.startswith("%s_%s" % (app_name, topic)):
                # spec_app_name = value.get_app_name()
                spec_topic = value.get_topic()
                spec_partition = int(value.get_partition())
                # spec_from_offset = value.get_from_offset()
                spec_until_offset = value.get_until_offset()
                # composite_key = "%s_%s_%s" % (spec_app_name,
                #                               spec_topic,
                #                               spec_partition)
                # partition = saved_offset_spec[composite_key]
                from_offsets[
                    TopicAndPartition(spec_topic, spec_partition)
                ] = long(spec_until_offset)

        MonMetricsKafkaProcessor.log_debug(
            "get_kafka_stream: calling createDirectStream :"
            " topic:{%s} : start " % topic)
        for key, value in from_offsets.items():
            MonMetricsKafkaProcessor.log_debug(
                "get_kafka_stream: calling createDirectStream : "
                "offsets : TopicAndPartition:{%s,%s}, value:{%s}" %
                (str(key._topic), str(key._partition), str(value)))
        MonMetricsKafkaProcessor.log_debug(
            "get_kafka_stream: calling createDirectStream : "
            "topic:{%s} : done" % topic)

        kvs = KafkaUtils.createDirectStream(
            streaming_context, [topic],
            {"metadata.broker.list": cfg.CONF.messaging.brokers},
            from_offsets)
        return kvs
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) pword_rdd=tweets.flatMap(lambda line: line.split(" ")).map(lambda word: ("positive",1) if word in pwords else ("positive",0)).reduceByKey(lambda a,b:a+b) nword_rdd=tweets.flatMap(lambda line: line.split(" ")).map(lambda word: ("negative",1) if word in nwords else ("negative",0)).reduceByKey(lambda a,b:a+b) # Each element of tweets will be the text of a tweet. # You need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). # make the plot on this rdd -combined_rdd combined_rdd=pword_rdd.union(nword_rdd) running_counts=combined_rdd.updateStateByKey(updateFunction) # Let the counts variable hold the word counts for all time steps # You will need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] combined_rdd.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) # print "printing dstream" running_counts.pprint() # Start the computation ssc.start() ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) return counts
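Several of these snippets call an `updateFunction` that is not shown. A minimal sketch, mirroring the inline update functions defined in later examples in this collection, keeps a running total per key across micro-batches:

# Sketch of the updateFunction assumed by the snippets above and below (illustrative only).
def updateFunction(newValues, runningCount):
    if runningCount is None:
        runningCount = 0
    return sum(newValues, runningCount)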
def start(): sconf = SparkConf() sconf.set('spark.cores.max', 2) sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf) ssc = StreamingContext(sc, 2) brokers = "localhost:9092" topics = ['test'] kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers}) lines1 = kafkaStreams_lines.map(lambda x: x[1]) # 注意 取tuple下的第二个即为接收到的kafka流 words = lines1.flatMap(lambda line: line.split(" ")) pairs = words.map(lambda word: (word, 1)) wordcounts = pairs.reduceByKey(lambda x, y: x + y) print(wordcounts) kafkaStreams_lines.transform(storeOffsetRanges).foreachRDD(printOffsetRanges) wordcounts.pprint() # 统计生成的随机数的分布情况 ssc.start() # Start the computation ssc.awaitTermination() # Wait for the computation to terminate
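The `storeOffsetRanges` and `printOffsetRanges` helpers referenced above are not defined in this snippet; later examples in this collection define them, and a minimal sketch of the same pattern looks like this:

# Sketch of the offset-tracking helpers used above: stash the offset ranges in a
# module-level variable inside transform(), then report them in foreachRDD().
offsetRanges = []

def storeOffsetRanges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print("%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset))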
def bro_parse(zk, topic, db, db_table, num_of_workers):
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, 1)
    sqc = HiveContext(sc)

    # create DStream for each topic partition.
    topic_dstreams = [
        KafkaUtils.createStream(ssc, zk, app_name, {topic: 1},
                                keyDecoder=oni_decoder, valueDecoder=oni_decoder)
        for _ in range(wrks)
    ]
    tp_stream = ssc.union(*topic_dstreams)

    # Parallelism in Data Processing
    # processingDStream = tp_stream(wrks)

    # parse the RDD content.
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save RDD into hive.
    proxy_logs.foreachRDD(lambda x: save_to_hive(x, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream( ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) # Each element of tweets will be the text of a tweet. # You need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). # YOUR CODE HERE words=tweets.flatMap(lambda x: x.split(" ")).filter(lambda x: x in pwords or x in nwords) wordPairs=words.map(lambda x: ("positive",1) if x in pwords else ("negative",1)) wordCount=wordPairs.reduceByKey(lambda x, y: x + y) runningCounts = wordPairs.updateStateByKey(updateFunction) runningCounts.pprint() # Let the counts variable hold the word counts for all time steps # You will need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] wordCount.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) return counts
def test_kafka_direct_stream_transform_get_offsetRanges(self):
    """Test the Python direct Kafka stream transform get offsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)

    offsetRanges = []

    def transformWithOffsetRanges(rdd):
        for o in rdd.offsetRanges():
            offsetRanges.append(o)
        return rdd

    # Test whether it is ok mixing KafkaTransformedDStream and TransformedDStream together,
    # only the TransformedDstreams can be folded together.
    stream.transform(transformWithOffsetRanges).map(lambda kv: kv[1]).count().pprint()
    self.ssc.start()
    self.wait_for(offsetRanges, 1)

    self.assertEqual(offsetRanges, [OffsetRange(topic, 0, long(0), long(6))])
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream( ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) #print "HELOKOJOJEORUBEORUBOUBEROUBNOUONEROJOEJRNOJENROJENFOJEFOEJFNOEFUNOEUFN" #tweets.pprint() # Each element of tweets will be the text of a tweet. # You need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). # YOUR CODE HERE words = tweets.flatMap(lambda line: line.split(" ")) pairs = words.map(classifier).map(lambda word: (word, 1)).filter(lambda x: x[0] != 'none').reduceByKey(lambda a,b: a+b) runningCounts = pairs.updateStateByKey(updateFunction) runningCounts.pprint() # Let the counts variable hold the word counts for all time steps # You will need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] pairs.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) #print counts return counts
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream( ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) # Each element of tweets will be the text of a tweet. # You need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). # YOUR CODE HERE words = tweets.flatMap(lambda line: line.split(' ')) \ .map(lambda word: ('positive', 1) if word in pwords else ('negative', 1) if word in nwords else ('none', 1)) \ .filter(lambda x: x[0]=='positive' or x[0]=='negative') \ .reduceByKey(lambda x, y: x + y) # Print the first ten elements of each RDD generated in this DStream to the console def updateValues(values, count): if count is None: count = 0 return sum(values, count) updatedWords = words.updateStateByKey(updateValues) updatedWords.pprint() # Let the counts variable hold the word counts for all time steps # You will need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] # YOURDSTREAMOBJECT.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) words.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) return counts
def main():
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <zk> <topic> <timeout>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]

    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    # Note: the map/reduceByKey calls originally sat inside the flatMap lambda, which
    # would call .map() on a Python list; they are chained on the DStream instead.
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout

    ssc.start()
    ssc.awaitTermination(**kwargs)
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list":'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) # Each element of tweets will be the text of a tweet. # Need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). pnTweets = tweets.flatMap(lambda line: line.split(" ")) pnTweetsPairs = pnTweets.map(lambda x: determine(x,pwords,nwords)) wordCounts = pnTweetsPairs.reduceByKey(lambda x, y: x + y) totalCounts = pnTweetsPairs.updateStateByKey(updateFunction) totalCounts.pprint() # Let the counts variable hold the word counts for all time steps # Need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] wordCounts.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) # becaue counts include those neither ones newCounts = [] for count in counts: newCount = [item for item in count if item[0] == "positive" or item[0] =="negative"] newCounts.insert(len(newCounts),newCount) return newCounts
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream( ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) # Print the first ten elements of each RDD generated in this DStream to the console #tweets.pprint() words = tweets.flatMap(lambda line: line.split(" ")) posNegPairs = words.map(lambda word: myMapping(word, pwords, nwords)) filteredPairs = posNegPairs.filter(lambda x: x[0] != "na") posNegCounts = filteredPairs.reduceByKey(lambda x, y: x + y) # Each element of tweets will be the text of a tweet. # You need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). cumulativeCounts = posNegCounts.updateStateByKey(myRunningUpdate) cumulativeCounts.pprint() # Let the counts variable hold the word counts for all time steps # You will need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] posNegCounts.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) return counts
def start_spark(timeout=None, max_items_per_rdd_sent=None):
    sc = SparkContext("local[4]", "twitter.trending")
    ssc = StreamingContext(sc, 5)
    ssc.checkpoint('hdfs://localhost:9000/user/spark/checkpoint/')

    kafka_params = {
        'zookeeper.connect': config.get('zookeeper', 'host'),
        'group.id': config.get('kafka', 'group_id'),
        'metadata.broker.list': config.get('kafka', 'hosts')
    }

    ksc = KafkaUtils.createDirectStream(ssc,
                                        [config.get('kafka', 'topic')],
                                        kafka_params)

    hashtag_counts = get_word_counts(ksc)
    filtered_tweet_count = filter_tweets(hashtag_counts)
    send_dstream_data(filtered_tweet_count, max_items_per_rdd_sent)

    ssc.start()
    if timeout:
        ssc.awaitTermination(timeout)
        ssc.stop(stopSparkContext=True, stopGraceFully=True)
    else:
        ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream(ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) tweets = tweets.flatMap(lambda line: line.split(" ")) words = tweets.flatMap(lambda line: line.split(" ")) tweets = tweets.filter(lambda x: x in pwords or x in nwords) tweets = tweets.map(lambda x: ("positive",1) if x in pwords else ("negative",1)) tweets = tweets.reduceByKey(lambda x,y: x+y) tweets = tweets.updateStateByKey(updateFunction) tweets.pprint() pds = words.filter(lambda x: x in pwords) nds = words.filter(lambda x: x in nwords) plist=[] nlist=[] pds.foreachRDD(lambda t,rdd: plist.append(rdd.count())) nds.foreachRDD(lambda t,rdd: nlist.append(rdd.count())) counts = [] ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) for i in range(0,len(plist)): counts.append((plist[i],nlist[i])) return counts
def start(): sconf = SparkConf() sconf.set('spark.cores.max', 2) sc = SparkContext(appName='KafkaDirectWordCount', conf=sconf) ssc = StreamingContext(sc, 2) brokers = "192.192.0.27:9092" topics = ['topic7'] kafkaStreams_lines = KafkaUtils.createDirectStream(ssc, topics, kafkaParams={"metadata.broker.list": brokers}) lines1 = kafkaStreams_lines.map(lambda x: x[1]) # 注意 取tuple下的第二个即为接收到的kafka流 words = lines1.flatMap(lambda line: line.split(" ")) pairs = words.map(lambda word: (word, 1)) wordcounts = pairs.reduceByKey(lambda x, y: x + y) wordcounts.saveAsTextFiles("/var/lib/hadoop-hdfs/spark-libin/kafka") wordcounts.pprint() # 统计生成的随机数的分布情况 ssc.start() # Start the computation ssc.awaitTermination() # Wait for the computation to terminate
def main(): sc = SparkContext(appName="IntrusionDetector") ssc = StreamingContext(sc, batch_durations) kvs = KafkaUtils.createDirectStream(ssc, [input_topic], {"metadata.broker.list": broker}) kvs.foreachRDD(processRDD) ssc.start() ssc.awaitTermination()
def readSource(ssc, di_in_conf_with_ds_conf, app_conf):
    sourceType = di_in_conf_with_ds_conf['source.type']
    if sourceType == 'kafka':
        kafkaSimpleConsumerApiUsed = app_conf.get('kafka.simple.consumer.api.used', True)
        if kafkaSimpleConsumerApiUsed:
            topics = di_in_conf_with_ds_conf['topics']
            if not isinstance(topics, list):
                raise TypeError("topic should be list")
            brokers = di_in_conf_with_ds_conf['metadata.broker.list']
            kafkaParams = {"metadata.broker.list": brokers}
            stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams).map(lambda x: x[1])
        else:
            zkConnect = di_in_conf_with_ds_conf['zookeeper.connect']
            groupId = app_conf['group.id']
            numReceivers = app_conf.get('num.receivers', 1)
            numConsumerFetchers = app_conf.get('num.consumer.fetchers')
            topics = di_in_conf_with_ds_conf['topics']
            topic_map = dict(zip(topics, numConsumerFetchers))
            # streams = reduce(lambda x, y: x.union(y),
            #                  map(KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map),
            #                      range(0, numReceivers)))
            streams = [KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map)
                       for i in range(0, numReceivers)]
            # StreamingContext.union takes *dstreams, so unpack the list
            stream = ssc.union(*streams).map(lambda x: x[1])
    elif sourceType == 'hdfs':
        path = di_in_conf_with_ds_conf['fs.defaultFS'] + '/' + di_in_conf_with_ds_conf['path']
        stream = ssc.textFileStream(path)
    else:
        raise Exception('Error: unsupported source.type = ' + sourceType)

    num_repartition = app_conf.get('dataInterface.stream.repatition.partitions')
    if num_repartition is None or not isinstance(num_repartition, int):
        stream2 = stream
    else:
        stream2 = stream.repartition(num_repartition)

    # format with the formatting plugin class, if one is configured
    format_class_path = di_in_conf_with_ds_conf.get('format.class', '')
    if format_class_path.strip() == '':
        stream3 = stream2
    else:
        format_class_obj = get_class_obj(format_class_path)
        stream3 = format_class_obj.format(stream2)

    return stream3
def kafka_spark_streaming_sql_main(app_name, brokers, topic, interval_seconds, sql_function):
    sc = SparkContext(appName=app_name)
    sqlContext = SQLContext(sc)
    # ssc = StreamingContext(sc, interval_seconds)
    ssc = StreamingContext(sc, 10)
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
    kvs.foreachRDD(sql_function)
    ssc.start()
    ssc.awaitTermination()
def stream(ssc, pwords, nwords, duration): kstream = KafkaUtils.createDirectStream( ssc, topics = ['twitterstream'], kafkaParams = {"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: x[1].encode("ascii","ignore")) # Each element of tweets will be the text of a tweet. # You need to find the count of all the positive and negative words in these tweets. # Keep track of a running total counts and print this at every time step (use the pprint function). #tweets.pprint() words = tweets.flatMap(lambda tweet:tweet.split(" ")) #words.pprint() positive = words.filter(lambda x: (x in pwords)) negative = words.filter(lambda x: (x in nwords)) #positive.pprint() #negative.pprint() ppairs = positive.map(lambda p: ('positive', 1)) npairs = negative.map(lambda n: ('negative', 1)) pwordCounts = ppairs.reduceByKey(lambda x, y: x + y) nwordCounts = npairs.reduceByKey(lambda x, y: x + y) count = pwordCounts.union(nwordCounts) #count.pprint() #pwordCounts.pprint() #nwordCounts.pprint() def updateFunction(newValues, runningCount): if runningCount is None: runningCount = 0 return sum(newValues, runningCount) prunningCounts = pwordCounts.updateStateByKey(updateFunction) nrunningCounts = nwordCounts.updateStateByKey(updateFunction) #prunningCounts.pprint() #nrunningCounts.pprint() total = prunningCounts.union(nrunningCounts) total.pprint() # Let the counts variable hold the word counts for all time steps # You will need to use the foreachRDD function. # For our implementation, counts looked like: # [[("positive", 100), ("negative", 50)], [("positive", 80), ("negative", 60)], ...] counts = [] count.foreachRDD(lambda t,rdd: counts.append(rdd.collect())) ssc.start() # Start the computation ssc.awaitTerminationOrTimeout(duration) ssc.stop(stopGraceFully=True) return counts
def fetch_pre_hourly_data(spark_context, offset_range_list):
    """get metrics pre hourly data from offset range list."""
    # get kafka stream over the same offsets
    pre_hourly_rdd = KafkaUtils.createRDD(spark_context,
                                          {"metadata.broker.list": cfg.CONF.messaging.brokers},
                                          offset_range_list)
    return pre_hourly_rdd
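A short usage sketch for the helper above, building an `offset_range_list` the same way the test snippets later in this collection do; the topic name and offsets are illustrative, not values from the original code:

from pyspark.streaming.kafka import OffsetRange

# Illustrative only: read partition 0 of a hypothetical topic between two known offsets.
offset_range_list = [OffsetRange("metrics_pre_hourly", 0, 0, 1000)]
pre_hourly_rdd = fetch_pre_hourly_data(spark_context, offset_range_list)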
def functionToCreateContext():
    sc = SparkContext(appName="StreamingExampleWithKafka")
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")
    opts = {"metadata.broker.list": "node1.example.com:6667,node2.example.com:6667"}
    kvs = KafkaUtils.createDirectStream(ssc, ["mytopic"], opts)
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .updateStateByKey(updateFunction)
    counts.pprint()
    return ssc
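A factory like this is normally handed to `StreamingContext.getOrCreate`, which rebuilds the context from the checkpoint directory if one exists; a minimal sketch reusing the "checkpoint" path set above (not shown in the original snippet):

# Recover from the checkpoint if it exists, otherwise build a fresh context.
ssc = StreamingContext.getOrCreate("checkpoint", functionToCreateContext)
ssc.start()
ssc.awaitTermination()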
def read_tweets():
    sc = SparkContext(appName="sentimentProducer")
    ssc = StreamingContext(sc, 600)  # test batch interval (600 seconds)
    brokers = "localhost:9092"
    kvs = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": brokers})
    kvs.foreachRDD(create_format)
    producer.flush()
    ssc.start()
    ssc.awaitTermination()
def test_kafka_direct_stream(self):
    """Test the Python direct Kafka stream API."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress(),
                   "auto.offset.reset": "smallest"}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams)
    self._validateStreamResult(sendData, stream)
def test_kafka_rdd(self):
    """Test the Python direct Kafka RDD API."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
    self._validateRddResult(sendData, rdd)
def main():
    conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered")
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 1)
    try:
        kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181",
                                                "spark-streaming-consumer", {"splash_json": 2})
        kafka_streams.foreachRDD(process_rdd)
    except Exception as e:
        print e
    ssc.start()
    ssc.awaitTermination()
def test_kafka_rdd_get_offsetRanges(self):
    """Test Python direct Kafka RDD get OffsetRanges."""
    topic = self._randomTopic()
    sendData = {"a": 3, "b": 4, "c": 5}
    offsetRanges = [OffsetRange(topic, 0, long(0), long(sum(sendData.values())))]
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    rdd = KafkaUtils.createRDD(self.sc, kafkaParams, offsetRanges)
    self.assertEqual(offsetRanges, rdd.offsetRanges())
def test_kafka_direct_stream_from_offset(self):
    """Test the Python direct Kafka stream API with start offset specified."""
    topic = self._randomTopic()
    sendData = {"a": 1, "b": 2, "c": 3}
    fromOffsets = {TopicAndPartition(topic, 0): long(0)}
    kafkaParams = {"metadata.broker.list": self._kafkaTestUtils.brokerAddress()}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createDirectStream(self.ssc, [topic], kafkaParams, fromOffsets)
    self._validateStreamResult(sendData, stream)
def test_kafka_stream(self):
    """Test the Python Kafka stream API."""
    topic = self._randomTopic()
    sendData = {"a": 3, "b": 5, "c": 10}

    self._kafkaTestUtils.createTopic(topic)
    self._kafkaTestUtils.sendMessages(topic, sendData)

    stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(),
                                     "test-streaming-consumer", {topic: 1},
                                     {"auto.offset.reset": "smallest"})
    self._validateStreamResult(sendData, stream)
print(type(value_key))
print(type(value_all))
print("before executing")
schema_registry_client = CachedSchemaRegistryClient(
    url='http://ashaplq00003:8081')
print("after executing")
print(schema_registry_client)
serializer = MessageSerializer(schema_registry_client)
print("after executing")
print(serializer)

spark = SparkSession.builder \
    .appName('SparkCassandraApp') \
    .config('spark.cassandra.connection.host', 'localhost') \
    .config('spark.cassandra.connection.port', '9042') \
    .config('spark.cassandra.output.consistency.level', 'ONE') \
    .master('local[2]') \
    .getOrCreate()

sc = spark.sparkContext
ssc = StreamingContext(sc, 5)

kvs = KafkaUtils.createDirectStream(
    ssc, ['NBC_APPS.TBL_MS_ADVERTISER'],
    {"metadata.broker.list": 'ashaplq00003:9192'},
    valueDecoder=serializer.decode_message)
kvs.pprint()
kvs.foreachRDD(handler)

ssc.start()
ssc.awaitTermination()
import json
import csv
from json import loads
from flatten_json import flatten
from time import sleep
import pandas as pd
# imports used below but not present in the original snippet
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")
print("PROGRAM START!!!")

sc = SparkContext()
ssc = StreamingContext(sc, 10)
sqlc = SQLContext(sc)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ["kafka_spark"], {"metadata.broker.list": "localhost:9094"})
lines = directKafkaStream.map(lambda x: x[1])

print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")
print("LINES START!!!")

def transformer(rdd):
    my_obj = json.loads(rdd)
    return (my_obj["group"]["id"],
            my_obj["group"]["name"],
            my_obj["group"]["latest"]["text"],
            my_obj["group"]["members"],
            my_obj["group"]["unread_count"])
def setup_kafka_stream():
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers})
    return kvs
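This helper relies on module-level `ssc`, `topic`, and `brokers` that are not shown; a minimal sketch of how it might be wired up (all values here are illustrative assumptions):

# Illustrative globals assumed by setup_kafka_stream().
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="KafkaStreamSetup")
ssc = StreamingContext(sc, 10)
brokers = "localhost:9092"
topic = "mytopic"

kvs = setup_kafka_stream()
kvs.pprint()
ssc.start()
ssc.awaitTermination()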
if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: kafka_wordcount.py <zk> <EventsTopic> ", file=sys.stderr) exit(-1) sc = SparkContext(appName="biddingStream") ssc = StreamingContext(sc, 10) sqlContext = SQLContext(sc) bidprice=sqlContext.read\ .format("org.apache.spark.sql.cassandra")\ .options(keyspace="ad_flow", table="bidprice")\ .load().rdd tmp = {} for item in bidprice.collect(): tmp[item['pid']] = item['price'] bidPriceBC = sc.broadcast(tmp) # print(tmp) zkQuorum, topic1 = sys.argv[1:] kvs = KafkaUtils.createStream(ssc, zkQuorum, "sparkStreamingGetNewEvents", {topic1: 1}) lines = kvs.map(lambda x: json.loads(x[1])) # lines.pprint() # uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\ # uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\ # .reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 60, 10) # ssc.checkpoint('hdfs://ec2-52-2-60-169.compute-1.amazonaws.com:9000/checkpoint/') lines.foreachRDD(process) ssc.start() ssc.awaitTermination()
# added import: SparkContext is used below but was missing from the original snippet
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils, TopicAndPartition
from pyspark.sql import HiveContext, Row
import json

sc = SparkContext("local[*]", "mysqltohive")
ssc = StreamingContext(sc, 1)
hsql = HiveContext(sc)

topic = "test"
brokers = "datanode1:9092,datanode3:9092,datanode6:9092"
partition = 0
start = 8390
topicpartition = TopicAndPartition(topic, partition)
fromoffset = {topicpartition: long(start)}

dkafka = KafkaUtils.createDirectStream(ssc, [topic],
                                       {"metadata.broker.list": brokers},
                                       fromOffsets=fromoffset)

offsetRanges = []

def storeOffsetRanges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print "%s %s %s %s" % (o.topic, o.partition, o.fromOffset, o.untilOffset)

dkafka.transform(storeOffsetRanges).foreachRDD(printOffsetRanges)
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler
# imports used below but not present in the original snippet
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

n_secs = 1
topic = "test2"

conf = SparkConf().setAppName("RealTimeDetector").setMaster("local[*]")
sc = SparkContext(conf=conf)
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, n_secs)
spSession = SparkSession.builder.master("local").appName("MineCap") \
    .config("spark.some.config.option", "some-value").getOrCreate()

kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'flow-streaming',
    'fetch.message.max.bytes': '15728640',
    'auto.offset.reset': 'largest'})  # Group ID is completely arbitrary

lines = kafkaStream.map(lambda x: x[1])
flows = lines.flatMap(lambda line: line.split(" "))  # .map(lambda word: (word[1:-1].split(",")))

##### data preparation
# fluxoRDD = sc.textFile("/home/administrador/MineCap/process-layer/dataset_fluxo_bc.csv")
fluxoRDD = sc.textFile("/home/helio/MineCap/process-layer/dataset_novo.csv")

# Remove the first line of the file (header)
firstLine = fluxoRDD.first()
fluxoRDD2 = fluxoRDD.filter(lambda x: x != firstLine)
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: consumer.py <zk> <topic>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafka")
    ssc = StreamingContext(sc, 1)

    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()

    ssc.start()
    ssc.awaitTermination()
conf.set("spark.streaming.backpressure.enabled", "true") conf.set("spark.streaming.backpressure.initialRate", "100") conf.set("spark.streaming.kafka.maxRatePerPartition", "100") sc = SparkContext(appName="Most Popular Airports", batchSize=50, conf=conf) sc.setLogLevel("WARN") ssc = StreamingContext(sc, 60) broker = sys.argv[1] source_topic = sys.argv[2] # destionation_topic = sys.argv[3] print("The broker is " + broker) print("The source topic is " + source_topic) df = KafkaUtils.createDirectStream(ssc, [source_topic], {"metadata.broker.list": broker}, valueDecoder=decoder) df \ .map(get_original_airport_and_destination_airport) \ .filter(lambda line: len(line) > 3) \ .filter(lambda line: Helpers.is_airport(line)) \ .flatMap(lambda line: line.split(",")) \ .countByValue() \ .transform(lambda airports: airports.sortBy(lambda t: t[1], ascending=False)) \ .foreachRDD(handler) ssc.start() ssc.awaitTermination()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import json
import re
# imports used below but not present in the original snippet
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# reload(sys)
# sys.setdefaultencoding('utf-8')

if __name__ == "__main__":
    sc = SparkContext(appName="SparkStreaming")
    sqlContext = SQLContext(sc)
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 5)  # 5 seconds window

    broker, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": broker})
    data = kvs.map(lambda x: x[1])

    def readRdd4rmKafkaStream(readRDD):
        if not readRDD.isEmpty():
            # Put RDD into a dataframe
            df = sqlContext.read.json(readRDD)
            df.show()
            # df.registerTempTable("Businesses_data")

    data.foreachRDD(lambda rdd: readRdd4rmKafkaStream(rdd))

    print("\n\n\n\n\n\n\nHEY, CAN YOU SEE ME\n\n\n\n\n\n\n")

    ssc.start()
    ssc.awaitTermination()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('topic_name', help='the name of the topic')
    parser.add_argument('kafka_broker', help='the location of the kafka')
    parser.add_argument('target_topic', help='the new topic to write to')

    args = parser.parse_args()
    topic_name = args.topic_name
    kafka_broker = args.kafka_broker
    target_topic = args.target_topic

    sc = SparkContext('local[2]', 'stock-price-analysis')
    sc.setLogLevel('WARN')
    ssc = StreamingContext(sc, 5)  # batch second

    # direct stream
    directKafkaStream = KafkaUtils.createDirectStream(ssc, [topic_name],
                                                      {'metadata.broker.list': kafka_broker})
    process_stream(directKafkaStream)

    # create kafka producer
    kafka_producer = KafkaProducer(
        bootstrap_servers=kafka_broker
    )
    atexit.register(shutdown_hook, kafka_producer)

    ssc.start()
    ssc.awaitTermination()
### This streaming job takes data from Apache Kafka, performs a basic analysis, and saves the result to a Cassandra database.

def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']

# Create a StreamingContext with a batch interval of 3 seconds
conf = SparkConf().setAppName("asd").set("spark.cassandra.connection.host", "127.0.0.1")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 3)

topic = "topic_cassandraam"
kafkaStream = KafkaUtils.createStream(ssc, "localhost:2181", "topic", {topic: 4})

def process(time, rdd):
    try:
        # Get the singleton instance of SparkSession
        sqlContext = getSqlContextInstance(rdd.context)

        # Convert RDD[String] to RDD[Row] to DataFrame
        # d.map(lambda w: Row(word=w))
        df = sqlContext.createDataFrame(rdd)
        # df.registerTempTable("words")
        # wordCountsDataFrame = sqlContext.sql("select * from words")
    producer.send_messages('top_carriers_by_airports', message.encode())

# MAIN
sc = SparkContext(appName="TopCarriersByAirports")
sc.setLogLevel('ERROR')
# Create a local StreamingContext
ssc = StreamingContext(sc, 1)
ssc.checkpoint(
    "s3a://cloudcapstone-checkpoints/checkpoints/checkpoint-top-carriers-by-airports/"
)

lines = KafkaUtils.createDirectStream(ssc, ['input'], {
    "metadata.broker.list": sys.argv[1],
    "auto.offset.reset": "smallest"
})

# Split each line by separator
lines = lines.map(lambda tup: tup[1])
rows = lines.map(lambda line: line.split())

# Get the airports
rows = rows.filter(lambda row: len(row) > 7)
airports_and_carriers = rows.map(lambda row: ((row[0], row[3]), float(row[7])))

# Count averages
airports_and_carriers = airports_and_carriers.updateStateByKey(updateFunction)

# Change key to just airports
airports = airports_and_carriers.map(lambda row: (row[0][0], (row[0][1], row[1][2])))
    df = spark.createDataFrame(rowRdd)
    df.write\
        .format("org.apache.spark.sql.cassandra")\
        .mode('overwrite')\
        .options(table="airport_carrier_departure", keyspace="aviation")\
        .save()

# MAIN
sc = SparkContext(appName="TopCarriersByAirportsToCassandra")
sc.setLogLevel('ERROR')
# Create a local StreamingContext
ssc = StreamingContext(sc, 1)

lines = KafkaUtils.createDirectStream(ssc, ['top_carriers_by_airports'], {
    "metadata.broker.list": sys.argv[1],
    "auto.offset.reset": "smallest"
})

# Transform
lines = lines.map(lambda message: message[1])
lines = lines.map(lambda line: line.split())
lines = lines.map(lambda tuple: (tuple[0], tuple[1], float(tuple[2])))

# Save to Cassandra
lines.foreachRDD(printResults)
lines.foreachRDD(saveToCassandra)

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: main.py <broker_list> <topic>", file=sys.stderr) exit(-1) # read command line arguments brokers, topic = sys.argv[1:] sparkSession = ( SparkSession.builder.appName('Santiago Meetup').getOrCreate()) ssc = StreamingContext(sparkSession.sparkContext, BATCH_INTERVAL) kvs = KafkaUtils.createDirectStream( ssc, [topic], { "metadata.broker.list": brokers, "auto.offset.reset": STREAM_KAFKA_OFFSET }) result = kvs.map(lambda record: map_tweet(record)).filter( lambda record: record[0] is not None).cache() #Send twitter reply result.foreachRDD(lambda rdd: rdd.foreachPartition(reply_to_tweet)) #Save meetup_tags data to hdfs result.flatMap(lambda record: map_scores(record)).foreachRDD( lambda rdd: save_meetup_tags_to_hbase(rdd)) #Save meetup data to hdfs result.map(lambda r: (r[0], r[1], r[2], r[3], r[4], r[5])).foreachRDD(
print(e) print("Database insertion unsuccessful!!") finally: conn.close() from pyspark.streaming import StreamingContext conf = SparkConf().setMaster("local[2]").setAppName("Streamer") sc = SparkContext(conf=conf) sc.setLogLevel("ERROR") val = sc.parallelize("abd") ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") kstream = KafkaUtils.createDirectStream( ssc, topics=['twitterstream'], kafkaParams={"metadata.broker.list": 'localhost:9092'}) tweets = kstream.map(lambda x: json.loads(x[1])) with open('IRModel1', 'rb') as f: loadedModel = pickle.load(f) bc_model = sc.broadcast(loadedModel) def process_data(data): print("Processing data ...") if (not data.isEmpty()): nbModel = bc_model.value
from datetime import datetime
import json

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SparkSession
from pyspark.sql import HiveContext

sc = SparkContext()
ssc = StreamingContext(sc, 10)  # 10 second window to get the stream from the source
sqlc = SQLContext(sc)

kvs = KafkaUtils.createDirectStream(ssc, ['bigdata'],
                                    {'metadata.broker.list': 'localhost:9096'})  # 'sandbox-hdp.hortonworks.com:6667'
lines = kvs.map(lambda x: x[1])

def transformer(rdd):
    my_obj = json.loads(rdd)
    now = datetime.now()
    dt_string = now.strftime("%m/%d/%Y %H:%M:%S")  # mm/dd/YY H:M:S
    # return str(dt_string), str(my_obj['country_name']), str(my_obj['cases']), str(my_obj['region']), str(my_obj['new_cases']), str(my_obj['serious_critical'])
    return dt_string, my_obj['country_name'], my_obj['cases'], my_obj['region'], my_obj['new_cases'], my_obj['serious_critical']

transform = lines.map(transformer)

def build_df(rdd):
    if not rdd.isEmpty():
        global sqlc
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
    ssc = StreamingContext(sc, 2)

    # brokers, topic = sys.argv[1:]
    # Note: the original passed the literal 3 as the broker list, which is not a valid
    # value; metadata.broker.list must be a host:port string such as "localhost:9092".
    kvs = KafkaUtils.createDirectStream(ssc, ["kafkaTwitterSpark"],
                                        {"metadata.broker.list": "localhost:9092"})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    ssc.start()
    ssc.awaitTermination()
                  value=json.dumps(data).encode('utf-8'))
    producer.flush()
    producer.close()

# Initialize our list of topics and their associated keywords
topic_list = {"sport": ['doping', 'olympic', 'injury', 'medal', 'record', 'title', 'world', 'athletic'],
              "tech": ['technology', 'author', 'article', 'computer', 'hi-tech', 'software', 'network', 'security', 'phone']}

# Create our stream.
sc = SparkContext(appName='PythonStreamingRecieverKafka')
ssc = StreamingContext(sc, 2)  # 2 second window
zookeeper_broker = "localhost:2181"
topic = "queue1"
kvs = KafkaUtils.createStream(ssc,
                              zookeeper_broker,
                              'streaming-consumer',
                              {topic: 1})

# Split the stream and filter it before sending to queue2 and queue3
queue2 = kvs.filter(lambda x: queue2_filter(x))
# queue2.pprint(num=10)
queue2.foreachRDD(lambda rdd: rdd.foreachPartition(lambda x: send_to_queue2(x)))

queue3 = kvs.filter(lambda x: queue3_filter(x))
# queue3.pprint(num=10)
queue3.foreachRDD(lambda rdd: rdd.foreachPartition(lambda x: send_to_queue3(x)))

ssc.start()
ssc.awaitTermination()
    producer.flush()
    producer.close()

# Read data files with a custom schema - CDR data was obtained from the Open Big Data project by Dandelion.
# It is available at https://dandelion.eu/datamine/open-big-data/
DataSchema = StructType([StructField("square_id", FloatType(), True),
                         StructField("time", StringType(), True),
                         StructField("country", FloatType(), True),
                         StructField("sms_in", FloatType(), True),
                         StructField("sms_out", FloatType(), True),
                         StructField("call_in", FloatType(), True),
                         StructField("call_out", FloatType(), True),
                         StructField("internet", FloatType(), True)])

kvs = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming', {'realtime': 1})
lines = kvs.map(lambda x: x[1])
rows = lines.map(lambda line: line.split("\n")[0])
rows = rows.map(lambda line: line.split("\t"))

Data = rows.transform(lambda rdd: PreProcessData(rdd))
Result = Data.transform(lambda rdd: ComputeDistances(rdd))
Data = Data.transform(lambda rdd: rdd.zipWithIndex().map(lambda e: (e[1], [e[0]])))
Data.pprint()
Result.pprint()

Final = Result.join(Data)
Final.pprint()
Final.foreachRDD(lambda rdd: rdd.foreachPartition(SendResult))

ssc.start()
    '/user/hadoop/POC/FraudAnalytics/data/transactions_history/Transaction_DB_jupyter.csv'

transactionsDataDF = spark.read.csv(transactionsDataHDFSPath, header=True,
                                    inferSchema=True)
print 'Printing transactions history data'
transactionsDataDF.show(1)
transactionsDataDF.registerTempTable('tranHistoryTbl')
# sqlContext.cacheTable("tranHistoryTbl")

transRecCount = \
    transactionsDataDF.filter("Card_Number = '7470510000000000'").select('*').count()
print '========= Printing transactionsDataDF count ========='
print transRecCount

ssc = StreamingContext(sc, 10)

# zkQuorum = "34.201.235.134:2181"
# topic = "fraud_analytics"
(zkQuorum, topic) = sys.argv[1:]
kvs = KafkaUtils.createStream(ssc, zkQuorum, 'fraud_test1_consumer', {topic: 1})
lines = kvs.map(lambda x: x[1])
lines.pprint()
lines.foreachRDD(lambda rdd:
                 process(time.strftime('%Y-%m-%d %H:%M:%S'), transactionsDataDF, rdd))

ssc.start()
ssc.awaitTermination()
# Spark
from pyspark import SparkContext
# Spark Streaming
from pyspark.streaming import StreamingContext
# Kafka
from pyspark.streaming.kafka import KafkaUtils
# json parsing
import json

sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 60)

kafkaStream = KafkaUtils.createStream(ssc, 'cdh57-01-node-01.moffatt.me:2181',
                                      'spark-streaming', {'twitter': 1})
parsed = kafkaStream.map(lambda v: json.loads(v[1]))
parsed.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()

authors_dstream = parsed.map(lambda tweet: tweet['user']['screen_name'])
author_counts = authors_dstream.countByValue()
author_counts.pprint()

author_counts_sorted_dstream = author_counts.transform(
    lambda foo: foo.sortBy(lambda x: -x[1]))
author_counts_sorted_dstream.pprint()
    except:
        pass

if __name__ == "__main__":
    try:
        # Create Spark Context to connect to the Spark cluster
        sc = SparkContext(appName="PythonStreamingKafkaTweetCount")

        # Set the batch interval of the Streaming Context to 2 seconds
        ssc = StreamingContext(sc, 2)
        # sqlContext = sql.SQLContext(sc)

        # Create Kafka stream to consume data coming from the Twitter topic
        # localhost:2181 = default Zookeeper consumer address
        kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'spark-streaming', {'twitter': 1})

        # Parse Twitter data as json
        parsed = kafkaStream.map(lambda v: forEachBatch(v))
        # parsed = kafkaStream.map(lambda x: x[1])
        # kafkaStream.saveAsTextFiles('test.txt')

        # Count the number of tweets per user
        # lines = parsed.map(lambda x: x[1])
        # tweets = parsed.map(getFeature)
        # print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
        # print('asdljaslkdjaslkjdasjdlkajdlasjkd', len(tweets))
        parsed.pprint()
        # tweets.saveDataToFile("1")
        # vector = np.array(tweets)
        # rdd = tweets.foreachRDD(getRDD)
        # turnIntoVector(rdd)
if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: kafka_wordcount.py <zk> <topic>", file=sys.stderr) exit(-1) sc = SparkContext(appName="biddingStream") ssc = StreamingContext(sc, 10) # load product vector pm=sc.textFile('hdfs://ec2-52-2-60-169.compute-1.amazonaws.com:9000/data/pidModel.txt')\ .map(lambda x: x.strip().split(' '))\ .map(lambda x: (x[0], np.asarray([float(i) for i in x[1:]]))) # print(pm.take(100)) bv = sc.broadcast(pm.collect()) zkQuorum, topic = sys.argv[1:] kvs = KafkaUtils.createStream(ssc, zkQuorum, "sparkStreamingConsumerDetectEvents", {topic: 1}) lines = kvs.map(lambda x: json.loads(x[1])) # lines.pprint() uidVec = lines.map(lambda x: ((x['timestamp'], x[ 'uid']), np.asarray([float(i) for i in x['topicVec']]))) # uidVec=lines.map(lambda x: ((x['uid'], x['tick']), np.asarray([float(i) for i in x['topic']])))\ # .reduceByKeyAndWindow(lambda x, y: x + y, lambda x, y: x - y, 60, 10) # ssc.checkpoint('hdfs://ec2-52-2-60-169.compute-1.amazonaws.com:9000/checkpoint/') window60rdd = uidVec.foreachRDD(process) ssc.start() ssc.awaitTermination()
# print(""" # Usage: SparkTest-sql-kafka-.py <bootstrap-servers> <subscribe-type> <topics> # """, file=sys.stderr) # exit(-1) subscribeType = "subscribe" bootstrapServers = "10.0.10.10:2181" # 打开一个TCP socket 地址 和 端口号 topic = {"test5": 1} # 要列举出分区 groupid = "sprk-consumer-group" sc = SparkContext(appName="PythonStreamingKafkaWordCount") # 处理时间间隔为2s ssc = StreamingContext(sc, 15) kafkaStream = KafkaUtils.createStream(ssc, bootstrapServers, groupid, topic) words = kafkaStream.map(lambda x: x[1]) words.pprint() # .trigger(processingTime='5 seconds') \ # Convert RDDs of the words DStream to DataFrame and run SQL query def process(time, rdd): print("========= %s =====执行时间====" % str(time)) try: # Get the singleton instance of SparkSession spark = getSparkSessionInstance(rdd.context.getConf()) schemaString = "c1 c2 c3" fields = [ StructField(field_name, StringType(), True)
    # Data to insert to the DB
    data = {'recordID': recordID,
            'c1': mac_address_enc,
            'c2': access_point_enc,
            'c3': internalNodesStr,
            'c4': stay_enc}
    insertSQL = "insert into t_range_m (recordID, c1, c2, c3, c4) values (%(recordID)s, %(c1)s, %(c2)s, %(c3)s, %(c4)s)"
    save_to_db(data, insertSQL)

# Set up the contexts
conf = SparkConf().setAppName("Smart Buildings")
sc = SparkContext(conf=conf)
stream = StreamingContext(sc, 1)  # 1 second window

# Returns a DStream (Discretized Stream) object
kafka_stream = KafkaUtils.createStream(stream,                 # StreamingContext object
                                       'localhost:2181',       # Zookeeper quorum
                                       'my-test-group',        # The group id for this consumer
                                       {'smartBuildings': 1})  # Dict of (topic_name -> numPartitions) to consume;
                                                               # each partition is consumed in its own thread

val_tup = kafka_stream.map(lambda x: x[1])
itemsStream = val_tup.map(lambda s: make_item(s))
encStream = itemsStream.map(lambda e: encrypt_record(e))
encStream.pprint()

stream.start()  # start the streaming application
stream.awaitTermination()
tokenizer = RegexpTokenizer(r'\w+')

client = MongoClient()
collection = client.streams.nba
# collection.drop()
print("Before stream count is {}".format(collection.count()))
client.close()

# a new ssc needs to be started after a previous ssc is stopped
ssc = StreamingContext(sc, PERIOD)

# create stream receivers
stream = KafkaUtils.createDirectStream(
    ssc,
    [TOPIC],
    {
        "metadata.broker.list": BROKERS,
    }
)

tweets = stream.map(lambda x: json.loads(x[1])).map(lambda x: json.loads(x))

# filter commercials
filtered_tweets = tweets.filter(lambda x: 'https' not in x['text'])

# DataFrame operations inside your streaming program
features = filtered_tweets.map(lambda x: {'id': x['id'],
                                          'screen_name': x['user']['screen_name'],
                                          'text': x['text'],
                                          'followers': x['user']['followers_count'],
                                          'created_at': x['created_at'],
                                          'teams': find_teams(x['text'])})
# tweets.pprint()
features.pprint()

# find trending topic
filtered_tweets.foreachRDD(lambda x: find_trends(x['text']))
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("Streaming")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 3)

directKafkaStream = KafkaUtils.createDirectStream(
    ssc, ['test'], {"bootstrap.servers": 'localhost:9092'})

offsetRanges = []

def storeOffsetRanges(rdd):
    global offsetRanges
    offsetRanges = rdd.offsetRanges()
    return rdd

def printOffsetRanges(rdd):
    for o in offsetRanges:
        print(o.topic, o.partition, o.fromOffset, o.untilOffset)

# TODO: could be extended to fancier numeric metrics such as counts, means, etc.
def storeRdd(rdd):
    global kafkaData
    # kafkaData = rdd.KafkaMessageAndMetadata()
    return rdd
em(mail["mail_host"], mail["mail_user"], mail["mail_pass"]).send('*****@*****.**', rddstr) def finds(word, keyword): #查找字符串 word = word.lower() for i in keyword: n = word.count(i) if n > 0: return True break if __name__ == '__main__': sc = SparkContext("local[2]") # 处理时间间隔为5s ssc = StreamingContext(sc, 5) # 打开一个TCP socket 地址 和 端口号 lines = KafkaUtils.createStream(ssc, kfk['host'], kfk['group'], kfk['top']) lines1 = lines.map(lambda x: x[1]).map( lambda s: "".join(i for i in s if 31 < ord(i) < 127)).filter( lambda word: finds(word, keyword)) # 注意 取tuple下的第二个即为接收到的kafka流 # #对数据进行存储 lines1.foreachRDD(save1) ssc.start() ssc.awaitTerminationOrTimeout(90) ssc.stop()
    put_total_hits(event_id, total_hits)

    corrupted_count = 0
    for hit in hits:
        hit_id = hit['hit_id']
        corrupted_columns = verify_hit(hit)
        if len(corrupted_columns) != 0:
            corrupted_count += 1
            put_corrupted_columns(event_id, hit_id, corrupted_columns)
        put_hit(event_id, hit_id, hit)

    corrupted_count += get_corrupted_count(event_id)
    put_corrupted_count(event_id, corrupted_count)

    ac = get_collected(event_id)
    new_ac = ac + len(event['data'])
    put_collected(event_id, new_ac)

    if new_ac == event['total_hits'] and get_corrupted_count(event_id) == 0:
        send_completion_msg_to_kafka_topic(event_id)

sparkContext = SparkContext.getOrCreate()
streamingContext = StreamingContext(sparkContext, 5)
sqlContext = SQLContext(sparkContext)

kafkaStream = KafkaUtils.createStream(streamingContext, 'sandbox.hortonworks.com:2181',
                                      'defaultGroup', {'events': 1})
kafkaStream \
    .map(lambda event: json.loads(event[1].encode('utf-8'))) \
    .foreachRDD(put_in_hbase)

streamingContext.start()
    parser.add_argument('new_topic', help='new topic to send data to')

    # - get arguments
    args = parser.parse_args()
    kafka_brokers = args.kafka_brokers
    topic = args.topic
    new_topic = args.new_topic

    # - setup spark streaming utility
    conf = SparkConf() \
        .setMaster("local[2]") \
        .setAppName("StockAveragePrice")
    sc = SparkContext(conf=conf)
    sc.addFile('spark/stream-process.py')
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, 5)

    # - instantiate a kafka stream for processing
    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic],
                                                 {'metadata.broker.list': kafka_brokers})
    kafka_stream.foreachRDD(process)

    # - instantiate a simple kafka producer
    kafka_producer = KafkaProducer(bootstrap_servers=kafka_brokers.split(','))

    # - setup proper shutdown hook
    atexit.register(shutdown_hook, kafka_producer)

    # - start streaming processing
    ssc.start()
    ssc.awaitTermination()