def main(ssc): zkQuorum, topic = sys.argv[1:] kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1}) lines = kvs.map(lambda x: x[1]) counts = lines.flatMap(lambda line: line.split(" ")) \ .map(lambda word: (word, 1)) \ .reduceByKey(lambda a, b: a+b) counts.pprint() ssc.start() ssc.awaitTermination()
def stream(ssc):
    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    kstream = KafkaUtils.createDirectStream(ssc, topics=['topic1'],
                                            kafkaParams={"metadata.broker.list": "localhost:9092"})
    tweets = tweets.map(lambda x: x[1].encode("ascii", "ignore"))
    return tweets
def main():
    # Create a StreamingContext on the cluster with a batch interval of 5 seconds
    sc = SparkContext("spark://ip-172-31-29-29:7077", "MyKafkaStream")
    ssc = StreamingContext(sc, 5)

    kafkaStream = KafkaUtils.createStream(ssc, "52.3.61.194:2181", "GroupNameDoesntMatter",
                                          {"parking_sensor_data": 2})
    messages = kafkaStream.flatMap(lambda s: create_tuple(s[1])) \
                          .reduceByKey(lambda a, b: (int(a) + int(b)) / 2)
    messages1 = messages.filter(lambda s: s[1] > 0)
    messages1.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def bro_parse(zk, topic, db, db_table, num_of_workers):
    app_name = "ONI-INGEST-{0}".format(topic)
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, 1)
    sqc = HiveContext(sc)

    # create one receiver-based DStream per worker for the topic
    topic_dstreams = [
        KafkaUtils.createStream(ssc, zk, app_name, {topic: 1},
                                keyDecoder=oni_decoder, valueDecoder=oni_decoder)
        for _ in range(wrks)
    ]
    tp_stream = ssc.union(*topic_dstreams)

    # parse the RDD content
    proxy_logs = tp_stream.map(lambda x: proxy_parser(x[1]))

    # save each RDD into Hive
    proxy_logs.foreachRDD(lambda x: save_to_hive(x, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
def main():
    if len(sys.argv) not in (3, 4):
        print("Usage: kafka_wordcount.py <zk> <topic> [timeout]", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="PythonStreamingKafkaWordCount")
    ssc = StreamingContext(sc, 1)

    timeout = None
    if len(sys.argv) == 4:
        zk, topic, timeout = sys.argv[1:]
        timeout = int(timeout)
    else:
        zk, topic = sys.argv[1:]

    kvs = KafkaUtils.createStream(
        ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
                  .map(lambda word: (word, 1)) \
                  .reduceByKey(lambda a, b: a + b)
    counts.pprint()

    kwargs = {}
    if timeout:
        kwargs['timeout'] = timeout
    ssc.start()
    ssc.awaitTermination(**kwargs)
def main(): conf = SparkConf().setAppName("kafka_source_mongo_sink_pymongo_filtered") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 1) try: kafka_streams = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2}) kafka_streams.foreachRDD(process_rdd) except Exception as e: print e ssc.start() ssc.awaitTermination()
def test_kafka_stream(self): """Test the Python Kafka stream API.""" topic = self._randomTopic() sendData = {"a": 3, "b": 5, "c": 10} self._kafkaTestUtils.createTopic(topic) self._kafkaTestUtils.sendMessages(topic, sendData) stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(), "test-streaming-consumer", {topic: 1}, {"auto.offset.reset": "smallest"}) self._validateStreamResult(sendData, stream)
def main(ssc): zkQuorum, topic = sys.argv[1:] kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 2}) lines = kvs.map(lambda x: x[1]) # Convert RDDs of the words DStream to DataFrame and run SQL query def process(time, rdd): print("========= %s =========" % str(time)) try: # Get the singleton instance of SQLContext sqlContext = getSqlContextInstance(rdd.context) # Convert RDD[String] to RDD[Row] to DataFrame parts = rdd.map(lambda line: line.split(",")) delays_rdd= parts.map(lambda w: Row(carrier=w[0], origin=w[1], delay=float(w[2]))) delays = sqlContext.createDataFrame(delays_rdd, samplingRatio=1) avg_delays = delays.groupBy("origin", "carrier").agg(F.avg(delays.delay).alias('average')) avg_delays.write.format("org.apache.spark.sql.cassandra").\ options(table="task2_part2_group2_1", keyspace="mykeyspace").\ save(mode="append") # Register as table #dataFrame.registerTempTable("origin_carrier_delays") # Do word count on table using SQL and print it #carrier_delays_df = \ # sqlContext.sql("SELECT origin, carrier, avg(delay) AS average FROM origin_carrier_delays GROUP BY origin, carrier") #carrier_delays_df.registerTempTable("origin_carrier_avg_delays") #carrier_avg_delays_df = \ # sqlContext.sql("SELECT origin, carrier, avg_delay FROM origin_carrier_avg_delays GROUP BY origin ORDER BY avg_delay LIMIT 10") #for i in carrier_delays_df.rdd.takeOrderedByKey(10, sortValue=lambda x: x[2], reverse=False).map(lambda x: x[1]).collect(): # print (i) #dataFrame.select("origin", "carrier", "delay").write \ #carrier_delays_df.write \ # .format("org.apache.spark.sql.cassandra") \ # .options( table = "task2_part2_group2_1", keyspace = "mykeyspace") \ # .save(mode="append") #carrier_delays_df.show() except Exception as e: print (e) #except: # pass #data = lines.map(lambda line: line.split(",")) \ # .map(lambda word: (word[0], float(word[1])) ) \ # .aggregateByKey((0,0), lambda a,b: (a[0] + b, a[1] + 1), lambda a,b: (a[0] + b[0], a[1] + b[1])) \ # .mapValues(lambda v: v[0]/v[1]) \ # .updateStateByKey(updateFunc) \ # .transform(lambda rdd: rdd.sortBy(lambda (word, count): -count)) #data.pprint() lines.foreachRDD(process) ssc.start() ssc.awaitTermination()
def main(ssc): zkQuorum, topic = sys.argv[1:] kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1}) lines = kvs.map(lambda x: x[1]) data = lines.map(lambda line: line.split(",")) \ .flatMap(lambda word: [(word[0], 1), (word[1], 1)]) \ .reduceByKey(lambda a, b: a+b) \ .updateStateByKey(updateFunc) \ .transform(lambda rdd: rdd.sortBy(lambda (word, count): -count)) data.pprint() ssc.start() ssc.awaitTermination()
def main(): sc = SparkContext(appName="PythonStreamingKafkaWordCount") ssc = StreamingContext(sc, 1) zkQuorum = "localhost:2181" topic = "twitter_raw" kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1}) lines = kvs.map(lambda x: pickle.loads(x[1].decode("utf-8"))["text"]) # fetch the text count = lines.map(lambda line: len(line.split())).reduce(add) # split into words and count count.foreachRDD(publishToRedis) # publish to redis count.pprint() ssc.start() ssc.awaitTermination()
def ss_kafka_bucket_counter(broker, topic, bucket_interval, output_msg, message_parse,
                            valueDecoder=None):
    """Starts a Spark Streaming job from a Kafka input and parses message time.

    Args:
        broker: the Kafka broker that we look at for the topic
        topic: the Kafka topic for input
        bucket_interval: the time interval in seconds (int) that the job will bucket
        output_msg: a function that takes a SparkContext (sc) and StreamingContext (ssc)
            and returns a function that takes an RDD and performs the output task
        message_parse: how the message is to be parsed
        valueDecoder: same as Spark's valueDecoder

    Returns:
        None
    """
    sc = SparkContext(appName="PythonKafkaBucketCounter")
    ssc = StreamingContext(sc, bucket_interval + 5)

    if valueDecoder:
        kvs = KafkaUtils.createStream(ssc, broker, "spark-streaming-consumer", {topic: 1},
                                      valueDecoder=valueDecoder)
    else:
        kvs = KafkaUtils.createStream(ssc, broker, "spark-streaming-consumer", {topic: 1})

    # I assume that we do not store kafka keys
    lines = kvs.map(lambda x: x[1])
    interval_counts = lines.map(lambda line: (message_parse(line), 1)) \
                           .reduceByKey(lambda a, b: a + b)

    output_msg_func = output_msg(sc, ssc)
    interval_counts.foreachRDD(output_msg_func)

    ssc.start()
    ssc.awaitTermination()
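# A minimal usage sketch for ss_kafka_bucket_counter as defined above. The broker
# address, topic name, and the message_parse / output_msg helpers below are
# illustrative assumptions, not part of the original code.
def bucket_by_first_field(line):
    # assume the bucket key is the first comma-separated field of the message
    return line.split(",")[0]

def make_printer(sc, ssc):
    # per-RDD output task: just print the bucket counts of each interval
    return lambda rdd: print(rdd.collect())

ss_kafka_bucket_counter("localhost:2181", "events", 60,
                        output_msg=make_printer, message_parse=bucket_by_first_field)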
def consume(self):
    messages = KafkaUtils.createStream(self.ssc, self.zookeeper,
                                       "spark-streaming-consumer", {self.topic: 1})
    lines = messages.map(lambda x: x[1])
    rows = lines.map(lambda x: {
        "data": json.loads(x)['data'],
        "time": json.loads(x)['time']
    })
    rows.foreachRDD(lambda x: self.check_and_write(x))
    self.ssc.start()
    self.ssc.awaitTermination()
def main(): #main function to execute code sqlContext = SQLContext(sc) zk_host = zk_ip+":2181" consumer_group = "reading-consumer-group" kafka_partitions={topic:1} #create kafka stream kvs = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions,valueDecoder=decoder) lines = kvs.map(lambda x: x[1]) readings = lines.map(lambda x: Row(device_id=x["device_id"],\ metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),\ metric_name=x["metric_name"],\ metric_value=float(x["metric_value"]))) readings.foreachRDD(process) ssc.start() ssc.awaitTermination()
def bluecoat_parse(zk, topic, db, db_table, num_of_workers, batch_size):
    app_name = topic
    wrks = int(num_of_workers)

    # create spark context
    sc = SparkContext(appName=app_name)
    ssc = StreamingContext(sc, int(batch_size))
    sqc = HiveContext(sc)

    tp_stream = KafkaUtils.createStream(ssc, zk, app_name, {topic: wrks},
                                        keyDecoder=spot_decoder, valueDecoder=spot_decoder)

    proxy_data = tp_stream.map(lambda row: row[1]) \
        .flatMap(lambda row: row.split("\n")) \
        .filter(lambda row: rex_date.match(row)) \
        .map(lambda row: row.strip("\n").strip("\r").replace("\t", " ").replace("  ", " ")) \
        .map(lambda row: split_log_entry(row)) \
        .map(lambda row: proxy_parser(row))

    saved_data = proxy_data.foreachRDD(lambda row: save_data(row, sqc, db, db_table, topic))

    ssc.start()
    ssc.awaitTermination()
def readSource(ssc, di_in_conf_with_ds_conf, app_conf):
    sourceType = di_in_conf_with_ds_conf['source.type']
    if sourceType == 'kafka':
        kafkaSimpleConsumerApiUsed = app_conf.get('kafka.simple.consumer.api.used', True)
        if kafkaSimpleConsumerApiUsed:
            topics = di_in_conf_with_ds_conf['topics']
            if not isinstance(topics, list):
                raise TypeError("topics should be a list")
            brokers = di_in_conf_with_ds_conf['metadata.broker.list']
            kafkaParams = {"metadata.broker.list": brokers}
            stream = KafkaUtils.createDirectStream(ssc, topics, kafkaParams).map(lambda x: x[1])
        else:
            zkConnect = di_in_conf_with_ds_conf['zookeeper.connect']
            groupId = app_conf['group.id']
            numReceivers = app_conf.get('num.receivers', 1)
            numConsumerFetchers = app_conf.get('num.consumer.fetchers')
            topics = di_in_conf_with_ds_conf['topics']
            topic_map = dict(zip(topics, numConsumerFetchers))
            streams = [KafkaUtils.createStream(ssc, zkConnect, groupId, topic_map)
                       for i in range(0, numReceivers)]
            stream = ssc.union(*streams).map(lambda x: x[1])
    elif sourceType == 'hdfs':
        path = di_in_conf_with_ds_conf['fs.defaultFS'] + '/' + di_in_conf_with_ds_conf['path']
        stream = ssc.textFileStream(path)
    else:
        raise Exception('Error: unsupported source.type = ' + sourceType)

    num_repartition = app_conf.get('dataInterface.stream.repatition.partitions')
    if num_repartition is None or not isinstance(num_repartition, int):
        stream2 = stream
    else:
        stream2 = stream.repartition(num_repartition)

    # optionally format the stream with a configured formatter plugin class
    format_class_path = di_in_conf_with_ds_conf.get('format.class', '')
    if format_class_path.strip() == '':
        stream3 = stream2
    else:
        format_class_obj = get_class_obj(format_class_path)
        stream3 = format_class_obj.format(stream2)
    return stream3
def test_kafka_stream(self): """Test the Python Kafka stream API.""" topic = "topic1" sendData = {"a": 3, "b": 5, "c": 10} self._kafkaTestUtils.createTopic(topic) self._kafkaTestUtils.sendMessages(topic, sendData) stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(), "test-streaming-consumer", {topic: 1}, {"auto.offset.reset": "smallest"}) result = {} for i in chain.from_iterable(self._collect(stream.map(lambda x: x[1]), sum(sendData.values()))): result[i] = result.get(i, 0) + 1 self.assertEqual(sendData, result)
def main(): #main function to execute code sc = SparkContext(appName="ReadingWriter") ssc = StreamingContext(sc,10) sqlContext = SQLContext(sc) zk_host = zk_ip+":2181" consumer_group = "reading-consumer-group" kafka_partitions={"amtest":1} #create kafka stream kvs = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions,valueDecoder=decoder) lines = kvs.map(lambda x: x[1]) #readings = lines.map(lambda x: {"device_id":x["device_id"],"metric_time":x["metric_time"],"metric_name":x["metric_name"],"metric_value":x["metric_value"]}) readings = lines.map(lambda x: {"device_id":x["device_id"],"metric_time":datetime.datetime.fromtimestamp(int(x["metric_time"])),"metric_name":x["metric_name"],"metric_value":float(x["metric_value"])}) readings.foreachRDD(lambda rdd: rdd.saveToCassandra("metrics", "raw_metrics")) #readingdf.show() #readings.pprint() #lines.saveToCassandra("metrics", "raw_metrics") ssc.start() ssc.awaitTermination()
def main(self):
    # load configuration parameters (from a config file when working on a project)
    zk, topic, app_name, batch_duration, master = self.setConfiguration()

    # initiate the Spark context / streaming context
    conf = SparkConf().setMaster(master)
    sc = SparkContext(appName=app_name, conf=conf)
    ssc = StreamingContext(sc, batch_duration)

    # read data from Kafka
    kvs = KafkaUtils.createStream(ssc, zk, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
    sc.stop()
def start():
    # sc = SparkContext(appName='txt', conf=sconf)
    sc = SparkContext("spark://192.168.1.148:7077", "NetworkWordCount")
    ssc = StreamingContext(sc, 3)

    brokers = "192.168.1.148:2181"
    topic = 'taimei'
    user_data = KafkaUtils.createStream(ssc, brokers, "spark-streaming-consumer", {topic: 1})
    # fromOffsets sets the starting offsets to consume from (direct stream only):
    # user_data = KafkaUtils.createDirectStream(ssc, [topic],
    #     kafkaParams={"metadata.broker.list": brokers},
    #     fromOffsets={TopicAndPartition(topic, partition): long(start)})

    user_fields = user_data.map(lambda line: line[1].split('|'))
    gender_users = user_fields.map(lambda fields: fields[3]) \
                              .map(lambda gender: (gender, 1)) \
                              .reduceByKey(lambda a, b: a + b)
    # user_data.foreachRDD(offset)  # store the offset information

    print("---------")
    gender_users.pprint()
    gender_users.foreachRDD(lambda rdd: rdd.foreach(echo))  # emits (gender, count) tuples

    ssc.start()
    ssc.awaitTermination()
def main(): #main function to execute code sqlContext = SQLContext(sc) zk_host = zk_ip + ":2181" consumer_group = "reading-consumer-group" kafka_partitions = {topic: 1} #create kafka stream kvs = KafkaUtils.createStream(ssc, zk_host, consumer_group, kafka_partitions, valueDecoder=decoder) lines = kvs.map(lambda x: x[1]) readings = lines.map(lambda x: Row(device_id=x["device_id"],\ metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),\ metric_name=x["metric_name"],\ metric_value=float(x["metric_value"]))) readings.foreachRDD(process) ssc.start() ssc.awaitTermination()
def __init__(self, config):
    self._server = config.content["input"]["options"]["server"]
    self._port = config.content["input"]["options"]["port"]
    self._topic = config.content["input"]["options"]["topic"]
    self._consumer_group = config.content["input"]["options"]["consumer_group"]
    self._batchDuration = config.content["input"]["options"]["batchDuration"]
    self._sep = config.content["input"]["options"]["sep"]

    self._spark = SparkSession.builder.appName("StreamingDataKafka").getOrCreate()
    sc = self._spark.sparkContext
    sc.addFile(config.content["databases"]["country"])
    sc.addFile(config.content["databases"]["city"])
    sc.addFile(config.content["databases"]["asn"])
    self._ssc = StreamingContext(sc, self._batchDuration)

    # build one type-conversion function per configured field, then a function
    # that converts a full list of string fields into typed values
    list_conversion_function = list(map(lambda x: type_to_func(x.dataType),
                                        config.data_structure_pyspark))
    ranked_pointer = list(enumerate(list_conversion_function))
    functions_list = list(map(lambda x: lambda list_string: x[1](list_string[x[0]]),
                              ranked_pointer))
    function_convert = lambda x: list(map(lambda func: func(x), functions_list))

    try:
        dstream = KafkaUtils.createStream(
            self._ssc, "{0}:{1}".format(self._server, self._port),
            self._consumer_group, {self._topic: 1})
        self._dstream = dstream.map(lambda x: function_convert(x[1].split(",")))
    except Exception:
        raise KafkaConnectError(
            "Kafka error: Connection refused: server={} port={} consumer_group={} topic={}"
            .format(self._server, self._port, self._consumer_group, self._topic))
def main(): #main function to execute code sc = SparkContext(appName="CouponCounterPySpark") ssc = StreamingContext(sc,10) zk_host = "localhost:2181" consumer_group = "coupon-event-consumers" kafka_partitions={"test":1} #create kafka stream lines = KafkaUtils.createStream(ssc,zk_host,consumer_group,kafka_partitions) events = lines.map(lambda line: line[1].split(',')) tmpagg = events.map(lambda event: ((event[1]),1) ) coupon_counts = tmpagg.reduceByKey(lambda x,y: x+y) coupon_records = coupon_counts.map(lambda x: {"offer_id" : x[0], "bucket" : str(datetime.datetime.now().strftime("%s")), "count" : int(x[1])}) #coupon_records.pprint() #coupon_records.registerTempTable("coupon_counters") #coupon_records.select("offer_id","bucket","count").show() #coupon_records = coupon_counts.map(lambda record: {"offer_id" : record[0],"bucket" : str(int(datetime.datetime.now().strftime("%s"))*1000),"count" : int(record[1])} coupon_records.pprint() coupon_records.foreachRDD(lambda rdd: rdd.saveToCassandra("loyalty","coupon_counters")) ssc.start() ssc.awaitTermination()
def createContext(zkQuorum, topic):
    # If you do not see this printed, the StreamingContext has been loaded
    # from an existing checkpoint
    print("Creating new context")
    sc = SparkContext(appName="PythonStreamingRecoverableNetworkWordCount")
    ssc = StreamingContext(sc, 1)

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: json.loads(x[1]))
    # simple variant: count messages per uid
    # (a richer variant would pair uid with topic: lines.map(lambda x: (x['uid'], x['topic'])))
    pairs = lines.map(lambda x: (x['uid'], 1))
    windowedWordCounts = pairs.reduceByKeyAndWindow(lambda x, y: x + y,
                                                    lambda x, y: x - y, 30, 10)
    windowedWordCounts.pprint()
    return ssc
def main():
    global topic
    topic = "topic_name"
    global errortopic
    errortopic = 'error_topic_data'

    sc = CreateSparkContext()
    ssc = StreamingContext(sc, 10)
    try:
        kafka_stream = KafkaUtils.createStream(ssc, "192.168.0.1:2181",
                                               "spark-streaming-consumer", {topic: 12})
        raw = kafka_stream.flatMap(lambda kafkaS: [kafkaS])
        lines = raw.flatMap(lambda xs: xs[1].split(","))
        counts = lines.map(lambda word: (str(datetime.now()), "api", word))
        counts.foreachRDD(lambda k: saveToCassandra(k, sc, ssc))
    except Exception as e:
        print('error: ' + str(e))
def main(): conf = SparkConf().setAppName("pyspark read") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 1) kafkaStream = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2}) stream = kafkaStream.map(lambda xs:xs) stream.foreachRDD(lambda rdd: rdd.foreach(printRdd)) # stream.foreachRDD(lambda rdd: rdd.saveToMongodb(mongodb_uri)) # stream.pprint() # filter out flights not departing from United States # "depAirportCntry": "United States" # messages = kafkaStream.map(lambda xs: json.load(xs)) # jsonmessages = messages.map(lambda x: json.loads(x)) # usdepartures = jsonmessages.map(lambda x: x['depAirportCntry'].filter(lambda x: "United States" in x))\ ssc.start() # Start the computation ssc.awaitTermination() # Wait for the computation to terminate
def test_kafka_stream(self): """Test the Python Kafka stream API.""" topic = "topic1" sendData = {"a": 3, "b": 5, "c": 10} jSendData = MapConverter().convert( sendData, self.ssc.sparkContext._gateway._gateway_client) self._kafkaTestUtils.createTopic(topic) self._kafkaTestUtils.sendMessages(topic, jSendData) stream = KafkaUtils.createStream(self.ssc, self._kafkaTestUtils.zkAddress(), "test-streaming-consumer", {topic: 1}, {"auto.offset.reset": "smallest"}) result = {} for i in chain.from_iterable( self._collect(stream.map(lambda x: x[1]), sum(sendData.values()))): result[i] = result.get(i, 0) + 1 self.assertEqual(sendData, result)
def KafkaWordCount(zkQuorum, group, topics, numThreads):
    spark_conf = SparkConf().setAppName("KafkaWordCount")
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel("ERROR")
    ssc = StreamingContext(sc, 1)
    # To write checkpoints to HDFS instead, start Hadoop first and use something like:
    # ssc.checkpoint("file:///usr/local/spark/checkpoint")
    ssc.checkpoint(".")

    # convert the comma-separated topics string into a dict (topic -> number of threads)
    topicAry = topics.split(",")
    topicMap = {}
    for topic in topicAry:
        topicMap[topic] = numThreads

    lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(lambda x: x[1])
    words = lines.flatMap(lambda x: x.split(" "))
    wordcount = words.map(lambda x: (x, 1)).reduceByKeyAndWindow(
        (lambda x, y: x + y), (lambda x, y: x - y), 1, 1, 1)
    wordcount.foreachRDD(lambda x: sendmsg(x))
    wordcount.pprint()

    ssc.start()
    ssc.awaitTermination()
def main(): zkQuorum = "localhost:2181" topic = "meetup-rsvps-topic" sc = SparkContext("local[*]") sc.setLogLevel("ERROR") ssc = StreamingContext(sc, BATCH_DUR) # 5 sec batch duration # utf-8 text stream from kafka kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark_consumer", {topic: 1}).cache() # recent top N event stream event_count = kvs.map(extract_event_count).filter(lambda line: line is not None) event_count.reduceByKeyAndWindow(func=lambda x,y:x+y, invFunc=lambda x,y:x-y, windowDuration=WIN_DUR, slideDuration=SLIDE_DUR) \ .filter(lambda pair: pair[1] > 0) \ .transform(take_top_rdd) \ .map(lambda pair: (pair[0][1], pair[1])) \ .foreachRDD(lambda rdd: rdd.foreachPartition(send_recent_top)) # running response count stream response_count = kvs.map(extract_response).filter(lambda line: line is not None) # TODO: may use countByValueAndWindow instead of updateStateByKey response_count.updateStateByKey(update_count) \ .foreachRDD(lambda rdd: rdd.foreachPartition(send_response_count)) # count recent rsvps rsvp_count = kvs.countByWindow(windowDuration=WIN_DUR, slideDuration=SLIDE_DUR) \ .foreachRDD(lambda rdd: rdd.foreachPartition(send_rsvp_count)) # event_count.pprint() ssc.checkpoint("rsvps_checkpoint_dir") ssc.start() ssc.awaitTermination()
def initialize_and_parse_input_stream(input_zookeeper, input_topic, microbatch_duration):
    """
    Initialize the Spark context, streaming context, and input DStream, and parse JSON from the DStream.

    :param input_zookeeper: input zookeeper hostname:port
    :param input_topic: input kafka topic
    :param microbatch_duration: duration of micro batches in seconds
    :return sc, ssc, parsed_stream: initialized spark and streaming contexts, and the JSON data parsed from the DStream
    """
    # Application name used as identifier
    application_name = os.path.basename(sys.argv[0])

    # Spark context initialization (application name used as the appName)
    sc = SparkContext(appName=application_name + ' ' + ' '.join(sys.argv[1:]))
    ssc = StreamingContext(sc, microbatch_duration)

    # Initialize the input DStream of flows from the specified Zookeeper server and Kafka topic
    input_stream = KafkaUtils.createStream(ssc, input_zookeeper,
                                           'spark-consumer-' + application_name + str(time.time()),
                                           {input_topic: 1})

    # Parse the input stream in the JSON format
    parsed_stream = input_stream.map(lambda line: json.loads(line[1]))

    return sc, ssc, parsed_stream
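# Hedged usage sketch for initialize_and_parse_input_stream as defined above; the
# Zookeeper address, topic name, and micro-batch duration are illustrative assumptions.
sc, ssc, parsed_stream = initialize_and_parse_input_stream("localhost:2181", "ipfix.entry", 10)
parsed_stream.pprint()  # print a sample of the parsed JSON records per micro batch
ssc.start()
ssc.awaitTermination()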
def streaming_profile_toDB(topic, re_table, conn, attributes, rule, job_type):
    sc = spark.sparkContext
    ssc = StreamingContext(sc, 5)
    numThread = 3
    print('in streaming profile to db')
    print(zkQuorum)
    print(topic)

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: numThread})
    if job_type == 'accuracy':
        print('here in accuracy')
        kvs.foreachRDD(lambda t, rdd: streaming_accuracy(
            t, rdd, topic, re_table, conn, attributes, rule))
    elif job_type == 'profile':
        if rule == 'profile':
            kvs.foreachRDD(lambda t, rdd: streaming_profile(
                t, rdd, topic, re_table, conn, attributes))
        else:
            kvs.foreachRDD(lambda t, rdd: streaming_user_define_profile(
                t, rdd, topic, re_table, conn, attributes, rule))

    ssc.start()
    ssc.awaitTermination()
def main(): conf = SparkConf().setAppName("pyspark read") sc = SparkContext(conf=conf) ssc = StreamingContext(sc, 1) kafkaStream = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {"splash_json": 2}) stream = kafkaStream.map(lambda xs: xs) stream.foreachRDD(lambda rdd: rdd.foreach(printRdd)) # stream.foreachRDD(lambda rdd: rdd.saveToMongodb(mongodb_uri)) # stream.pprint() # filter out flights not departing from United States # "depAirportCntry": "United States" # messages = kafkaStream.map(lambda xs: json.load(xs)) # jsonmessages = messages.map(lambda x: json.loads(x)) # usdepartures = jsonmessages.map(lambda x: x['depAirportCntry'].filter(lambda x: "United States" in x))\ ssc.start() # Start the computation ssc.awaitTermination() # Wait for the computation to terminate
def streaming_data(topic_name, window_size):
    '''
    Get a data stream from the Kafka broker
    '''
    # Spark context
    sc = SparkContext(appName="PythonSparkStreamingKafka")
    # sc.setLogLevel("WARN")

    # Streaming context
    batch_duration = 60  # batch duration in seconds
    stc = StreamingContext(sc, batch_duration)

    # Connect to Kafka and get a DStream of the input stream data
    kafkaStream = KafkaUtils.createStream(stc, 'localhost:2181',
                                          'raw-event-streaming-consumer',
                                          {topic_name: 1})

    # windowed stream
    windowedStream = kafkaStream.window(window_size)

    # Start the streaming context
    stc.start()
    stc.awaitTermination()
def start():
    sconf = SparkConf()
    # sconf.set('spark.streaming.blockInterval', '100')
    sconf.set('spark.cores.max', 8)
    sc = SparkContext(appName='KafkaWordCount', conf=sconf)
    ssc = StreamingContext(sc, 2)

    numStreams = 3
    kafkaStreams = [
        KafkaUtils.createStream(
            ssc,
            "server1-2-5-24-138:2181,server1-3-5-24-139:2181,server1-4-5-24-140:2181",
            "streaming_test_group",
            {"spark_streaming_test_topic": 1}) for _ in range(numStreams)
    ]
    unifiedStream = ssc.union(*kafkaStreams)
    print(unifiedStream)

    # count the distribution of the generated random numbers
    result = unifiedStream.map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y)
    result.pprint()

    ssc.start()             # Start the computation
    ssc.awaitTermination()  # Wait for the computation to terminate
def Stream(KafkaTopic, ssc):
    # global Num
    print(KafkaTopic)
    StrTopic = KafkaTopic.split('-')
    Num = StrTopic[0]

    # receive data from Kafka; "localhost:2181" is the ZooKeeper address
    kvs1 = KafkaUtils.createStream(ssc, "localhost:2181", KafkaTopic, {KafkaTopic: 1})

    # x[1] holds the Kafka message value
    lines1 = kvs1.map(lambda x: eval(x[1])['battery'])

    # sliding window over the battery readings
    batterychange1 = lines1.window(6, 3)
    batterymax1 = batterychange1.reduce(lambda x, y: x + y)
    batteryfinal = batterymax1.map(lambda x: {"Exo_ID": Num, "battery": x})

    batteryfinal.pprint()
    # send the data out for each windowed RDD
    batteryfinal.foreachRDD(lambda rdd: rdd.foreach(sendMsg))
def main():
    try:
        SetParameters()
        sc = CreateSparkContext()
        ssc = StreamingContext(sc, 5)

        kafka_stream = KafkaUtils.createStream(ssc, "192.168.0.1:2181", "topic", {topic: 12})
        raw = kafka_stream.flatMap(lambda kafkaS: [kafkaS])
        lines = raw.flatMap(lambda xs: xs[1].split(","))
        counts = lines.map(lambda word: (str(datetime.now()), "api", word))
        counts.foreachRDD(lambda k: saveToCassandra(k, sc, ssc))

        ssc.start()
        ssc.awaitTermination()
    except Exception as e:
        print('error: ' + str(e))
def getStreamingData():
    sc = SparkContext(appName="spark_temperature_processor")
    ssc = StreamingContext(sc, 1)
    zkQuorum = 'localhost:2181'
    topic = 'test'

    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    lines.pprint()
    fo.write(str(lines))

    # counts = lines.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    # counts.pprint()
    maxt = lines.map(lambda x: x[0])
    maxt.pprint()

    ssc.start()
    ssc.awaitTermination()
def createContext(): sc = SparkContext(appName="PythonSparkStreamingKafka_RM_02") sc.setLogLevel("WARN") ssc = StreamingContext(sc, 5) # Define Kafka Consumer kafkaStream = KafkaUtils.createStream(ssc, 'cdh57-01-node-01.moffatt.me:2181', 'spark-streaming2', {'twitter':1}) ## --- Processing # Extract tweets parsed = kafkaStream.map(lambda v: json.loads(v[1])) # Count number of tweets in the batch count_this_batch = kafkaStream.count().map(lambda x:('Tweets this batch: %s' % x)) # Count by windowed time period count_windowed = kafkaStream.countByWindow(60,5).map(lambda x:('Tweets total (One minute rolling count): %s' % x)) # Get authors authors_dstream = parsed.map(lambda tweet: tweet['user']['screen_name']) # Count each value and number of occurences count_values_this_batch = authors_dstream.countByValue() .transform(lambda rdd:rdd .sortBy(lambda x:-x[1])) .map(lambda x:"Author counts this batch:\tValue %s\tCount %s" % (x[0],x[1])) # Count each value and number of occurences in the batch windowed count_values_windowed = authors_dstream.countByValueAndWindow(60,5) .transform(lambda rdd:rdd .sortBy(lambda x:-x[1])) .map(lambda x:"Author counts (One minute rolling):\tValue %s\tCount %s" % (x[0],x[1])) # Write total tweet counts to stdout # Done with a union here instead of two separate pprint statements just to make it cleaner to display count_this_batch.union(count_windowed).pprint() # Write tweet author counts to stdout count_values_this_batch.pprint(5) count_values_windowed.pprint(5) return ssc
def create_dstream(ssc, zk_quorum, group_id, topics):
    '''
    Create an input stream that pulls ids packet messages from Kafka.

    :param ssc      : :class:`pyspark.streaming.context.StreamingContext` object.
    :param zk_quorum: Zookeeper quorum (host[:port],...).
    :param group_id : The group id for this consumer.
    :param topics   : Dictionary of topic -> numOfPartitions to consume. Each
                      partition is consumed in its own thread.
    :returns        : A DStream of comma-separated packet fields.
    :rtype          : :class:`pyspark.streaming.dstream.DStream`
    '''
    from pyspark.streaming.kafka import KafkaUtils
    from ..serializer import deserialize

    dstream = KafkaUtils.createStream(ssc, zk_quorum, group_id, topics,
                                      keyDecoder=lambda x: x, valueDecoder=deserialize)
    return dstream.map(lambda x: x[1]).flatMap(lambda x: x).map(lambda x: x.split(','))
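# Hedged usage sketch wiring create_dstream (defined above) into a StreamingContext;
# the Zookeeper quorum, group id, and topic map below are illustrative assumptions.
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext(appName="ids-packet-consumer")
ssc = StreamingContext(sc, 5)
fields = create_dstream(ssc, "localhost:2181", "ids-consumer-group", {"ids-packets": 1})
fields.pprint()  # each element is a list of comma-separated packet fields
ssc.start()
ssc.awaitTermination()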
def main(tag):
    sc = create_sc("UnitTest")
    sc.setLogLevel("INFO")
    print('SPARK CONTEXT INFO :')
    print('  VERSION :', sc.version)
    print('  DRIVER MEMORY :', sc._conf.get('spark.driver.memory'))

    stream = StreamingContext(sc, 60)
    kafka_stream = KafkaUtils.createStream(
        stream, 'localhost:2181', 'spark-historian-consumer',
        {'historian-topic-CRY-TGBT-NORMAL-CRY-act-cons-pow': 1})
    # kafka_stream = KafkaUtils.createStream(stream, 'victoria.com:2181', 'spark-streaming', {'imagetext': 1})

    # parsed = kafka_stream.map(lambda v: json.loads(v[1]))
    parsed = kafka_stream.map(lambda v: analyzeLog(v[1]))
    parsed.foreachRDD(lambda k: process(k))
    parsed.pprint()

    stream.start()
    stream.awaitTermination()
def main():
    global zookeeper_IP
    global cassandra_IP
    global cassandra_keyspace
    global cassandra_table
    global kafka_handling_api
    global seconds_per_job
    global topic
    global c_dao

    zookeeper_IP = GetConfig('zookeeper_IP')
    cassandra_IP = GetConfig('cassandra_IP')
    cassandra_keyspace = GetConfig('cassandra_keyspace')
    cassandra_table = GetConfig('cassandra_table')
    kafka_handling_api = GetConfig('kafka_handling_api')
    seconds_per_job = GetConfig('seconds_per_job')
    topic = GetConfig('topic')
    c_dao = CassandraDAO(CassandraType.PRODUCTION)

    sc = CreateSparkContext(cassandra_IP)
    ssc = StreamingContext(sc, int(float(seconds_per_job)))
    try:
        kafka_stream = KafkaUtils.createStream(ssc, zookeeper_IP,
                                               'spark-streaming-consumer', {topic: 12})
        raw = kafka_stream.flatMap(lambda kafkaS: [kafkaS])
        lines = raw.filter(lambda xs: xs[1].split(','))
        counts = lines.map(lambda word: (str(datetime.now()), 'api', word[1]))
        counts.foreachRDD(lambda k: saveToCassandra(sc, ssc, k))
    except Exception as e:
        print('error:' + str(e))
def streaming():
    os.environ['PYSPARK_SUBMIT_ARGS'] = \
        '--jars spark-streaming-kafka-assembly_2.10-1.6.0.jar pyspark-shell'
    spark = SparkSession.builder.master("spark://t3.dev:7077").appName("test") \
        .config('spark.jars.packages',
                'org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.4.5') \
        .getOrCreate()
    print(dir(spark._jvm))

    ssc = StreamingContext(spark.sparkContext, 5)
    kafkaParams = {
        "bootstrap_servers": "t3.dev:9092",
        "kafka.bootstrap.servers": "t3.dev:9092",
        "brokers": "t3.dev:9092",
        "host": "t3.dev:9092"
    }
    topics = {'spark-test': 1}
    lines = KafkaUtils.createStream(ssc, 't3.dev:2181', 'local-test', topics, kafkaParams)
    print(lines.pprint(10))

    ssc.start()
    ssc.awaitTermination(60)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext, SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Note: a receiver-based Kafka stream needs at least 2 local cores (one for the
# receiver, one for processing), so local[1] will not make progress.
conf = SparkConf().setMaster("local[1]").setAppName("StreamProcessor_1")
sc = SparkContext(conf=conf)
print("Setting LOG LEVEL as ERROR")
sc.setLogLevel("ERROR")

ssc = StreamingContext(sparkContext=sc, batchDuration=1)
# createStream requires a consumer group id and a {topic: numPartitions} dict;
# the group id below is an arbitrary placeholder.
kafkaStream = KafkaUtils.createStream(ssc=ssc, zkQuorum='localhost:2181',
                                      groupId='stream-processor-group',
                                      topics={'test': 1})
print(sc)
from pyspark.streaming import StreamingContext from pyspark.streaming.kafka import KafkaUtils if __name__ == "__main__": # Create the Spark context sc = SparkContext(appName="DataIngestionApp") log4j = sc._jvm.org.apache.log4j log4j.LogManager.getRootLogger().setLevel(log4j.Level.WARN) # Create the Spark Streaming Context with 10 seconds batch interval ssc = StreamingContext(sc, 10) # Check point directory setting ssc.checkpoint("\tmp") # Zookeeper host zooKeeperQuorum="localhost" # Kaka message group messageGroup="sfb-consumer-group" # Kafka topic where the programming is listening for the data # Reader TODO: Here only one topic is included, it can take a comma separated string containing the list of topics. # Reader TODO: When using multiple topics, use your own logic to extract the right message and persist to its data store topics = "message" numThreads = 1 # Create a Kafka DStream kafkaStream = KafkaUtils.createStream(ssc, zooKeeperQuorum, messageGroup, {topics: numThreads}) messageLines = kafkaStream.map(lambda x: x[1]) # This is where the messages are printed to the console. Instead of this, implement your own persistence logic messageLines.pprint() # Start the streaming ssc.start() # Wait till the application is terminated ssc.awaitTermination()
    kstream = KafkaUtils.createDirectStream(ssc, topics=['topic1'],
                                            kafkaParams={"metadata.broker.list": "localhost:9092"})
    tweets = tweets.map(lambda x: x[1].encode("ascii", "ignore"))
    return tweets


def process_rdd_queue(twitter_stream):
    # Create the queue through which RDDs can be pushed to a QueueInputDStream
    rddQueue = []
    for i in range(3):
        rddQueue += [ssc.sparkContext.parallelize([get_next_tweet(twitter_stream)], 5)]
    lines = ssc.queueStream(rddQueue)
    lines.pprint()


if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingQueueStream")
    ssc = StreamingContext(sc, 10)

    # Instantiate the twitter stream
    # twitter_stream = connect_twitter()
    # Get the RDD queue of the streams, json or parsed
    # process_rdd_queue(twitter_stream)

    zkQuorum = "localhost:2181"
    topic = "topic1"
    tweets = KafkaUtils.createStream(ssc, zkQuorum, "PythonStreamingQueueStream", {topic: 1})
    # tweets = stream(ssc)
    tweets.pprint()

    ssc.start()
    time.sleep(100)
    ssc.stop(stopSparkContext=True, stopGraceFully=True)
    rdd.foreachPartition(output_partition)


if __name__ == "__main__":
    client = pyhdfs.HdfsClient(hosts="10.120.14.120,9000", user_name="cloudera")

    # set the producer for topic "utime"
    topic = "utime"
    broker_list = '10.120.14.120:9092,10.120.14.120:9093'

    sc = SparkContext()
    ssc = StreamingContext(sc, 3)

    # set the consumer Kafka stream that takes from topic "Pdata"
    lines = KafkaUtils.createStream(ssc, "10.120.14.120:2182", "Pdata_for_model", {"Pdata": 3})

    # load the pre-trained model and broadcast it to the workers
    load_file = open("/home/cloudera/HA_ML_prdict_project/predict_model/rfr_0910_df.pkl", 'rb')
    MRI_Model = joblib.load(load_file)
    load_file.close()
    rfr_bc = sc.broadcast(MRI_Model)

    r = lines.map(lambda x: x[0])
    r0 = lines.map(lambda x: x[1])
    r1 = r0.map(lambda x: (int(x.split(",")[0]), int(x.split(",")[1]), int(x.split(",")[2]),
                           int(x.split(",")[3]), int(x.split(",")[4]), int(x.split(",")[5]),
                           int(x.split(",")[6]), int(x.split(",")[7])))
    r2 = r1.map(lambda x: np.array(x, dtype=int))
    r3 = r2.map(lambda x: x.reshape(1, -1))
RDB_HOST = os.environ.get('RDB_HOST')
RDB_PORT = os.environ.get('RDB_PORT')
RDB_DB = "avrotopic1db"

zkQuorum, topic, stream_window, RDB_TABLE = sys.argv[1:]
stream_window = int(stream_window)

sc = SparkContext(appName="PythonStreamingKafkaSums")
ssc = StreamingContext(sc, batchDuration=stream_window)

streams = []
schema = avro.schema.parse(open("WaterSensor.avsc").read())
reader = DatumReader(schema)

numStreams = 4
kafkaStreams = [KafkaUtils.createStream(ssc=ssc, zkQuorum=zkQuorum,
                                        groupId="avro-topic1-consumer",
                                        valueDecoder=io.BytesIO,
                                        topics={topic: 1})
                for _ in range(numStreams)]
# kvs = kafkaStreams[1]
# kkvvss = ssc.union(*kafkaStreams)  # .partitionBy(numPartitions=20)
# kvs = KafkaUtils.createStream(ssc, zkQuorum, "my-topic2-consumer", {topic: 1})


def sendRDDCount(count):
    connection = createNewConnection()  # TODO: use a connection pool
    # r.table(RDB_TABLE).filter(r.row["partition"] == index).update({"count": count}).run(connection)
    r.table(RDB_TABLE).insert({"count": count, "time": time.time()}).run(connection)
    connection.close()


def sendPartitionCount(index, count):
    try:
        s = db.session.query(Station).filter(Station.id == station['station_id'])
        s.update({Station.num_bikes_available: station['num_bikes_available'],
                  Station.num_docks_available: station['num_docks_available']})
        db.session.commit()
    except exc.IntegrityError:
        db.session.rollback()


def consume_data():
    sc = SparkContext(appName="Lets Go")
    ssc = StreamingContext(sc, 1)
    zkQuorum, topic = sys.argv[1:]
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 1})
    lines = kvs.map(lambda x: x[1])
    counts = lines.flatMap(lambda line: line.split(" ")) \
        .map(lambda row: (row, update_station_status())) \
        .reduceByKey(lambda a, b: a + b)
    ssc.start()
    ssc.awaitTermination()


def system_alerts():
    """Get alerts about the system.

    https://gbfs.citibikenyc.com/gbfs/en/system_alerts.json"""
    pass
"mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"} keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter" valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter" #execute send rdd.map(writeHbase).saveAsNewAPIHadoopDataset( conf=conf, keyConverter=keyConv, valueConverter=valueConv) if __name__ == "__main__": sc = SparkContext(appName = "Hbase") ssc = StreamingContext(sc, 1) # ssc.checkpoint(checkpointDir) # ssc = StreamingContext.getOrCreate(checkpointDir, buildContext) zkQuorum = "localhost:2181" kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {KAFKA_TOPIC: 1}) #data stream of data dictionaries ds = kvs.map(lambda data: ast.literal_eval(data[1])) ds.pprint() if ds is not None: ds.foreachRDD(sendRecord) ssc.start() ssc.awaitTermination() # sumstats = ds.map(partitionCount).updateStateByKey(partitionCount) # ssc.stop(stopGraceFully=True)
                               schema=['MinPrice', 'Direct', 'OutboundLeg'])
    df.show()
    df.write.saveAsTable(name="default.flights", format="hive", mode="append")


sc = SparkContext("local[*]", "FlightData")
ssc = StreamingContext(sc, 5)
ss = SparkSession.builder.appName("FlightData") \
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse") \
    .config("hive.metastore.uris", "thrift://localhost:9083") \
    .enableHiveSupport().getOrCreate()

kafkaStream = KafkaUtils.createStream(ssc, 'localhost:2181', 'Flights', {'flights': 1})
parsed = kafkaStream.map(lambda v: json.loads(v[1]))
# user_counts = parsed.map(lambda tweet: (tweet['user']["screen_name"], 1)).reduceByKey(lambda x, y: x + y)
# user_counts.pprint()

longest_duration = parsed.flatMap(lambda v: v.get("Quotes"))
# longest_duration.pprint()
table = longest_duration.map(
    lambda v: (v.get("MinPrice"), v.get("Direct"), v.get("OutboundLeg")))
longest_duration.pprint()
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

sc = SparkContext(appName="kafkaTest")
ssc = StreamingContext(sc, 5)

kvs = KafkaUtils.createStream(ssc, "localhost:2181", "spark_streaming", {"inter_transact": 1})
kvs.pprint(10)

ssc.start()
ssc.awaitTermination()
'''
Created on Jul 7, 2016

@author: rbhat
'''
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Create a local StreamingContext with a 1-second batch interval
sc = SparkContext("local[*]", "MyKafkaStream")
ssc = StreamingContext(sc, 1)

kafkaStream = KafkaUtils.createStream(ssc, "deepc04.acis.ufl.edu:2181",
                                      "GroupNameDoesntMatter", {"test": 2})
messages = kafkaStream.map(lambda xs: xs)
messages.pprint()

ssc.start()             # Start the computation
ssc.awaitTermination()  # Wait for the computation to terminate
stream_window = int(stream_window)

sc = SparkContext(appName="PythonStreamingKafkaJSONSums")
ssc = StreamingContext(sc, batchDuration=stream_window)


def createNewConnection():
    return r.connect(host=RDB_HOST, port=RDB_PORT, db=RDB_DB)


# delete any existing data in the table
connection = createNewConnection()
r.table(RDB_TABLE).delete().run(connection)
connection.close()

streams = []
numStreams = 6  # read parallelism
# set up kafkaStreams as a list
kafkaStreams = [KafkaUtils.createStream(ssc, zkQuorum, "JSON-consumer", {topic: 1})
                for _ in range(numStreams)]


def sendRDDCount(count):
    connection = createNewConnection()
    r.table(RDB_TABLE).insert(count).run(connection)
    connection.close()


for idx, kvs in enumerate(kafkaStreams):
    countsDstream = kvs.count()
    countsDstream = countsDstream.map(lambda x: {"count": x, "time": time.time()})
    records = kvs.map(lambda x: bytesDecoder(x[1]))
    sums = records.map(lambda obj: (obj['unique_id'], obj['quantity'])) \
        .reduceByKey(lambda a, b: a + b)
    countsDstream.foreachRDD(lambda rdd: sendRDDCount(rdd.take(1)))

ssc.start()
ssc.awaitTermination()
# Section 6.2.4, Example 6-12
from pyspark import SparkContext, SparkConf, storagelevel
from pyspark.streaming.context import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

## When running inside pyspark, do not create the SparkContext yourself!
# ./pyspark --packages org.apache.spark:spark-streaming-kafka-0-8-assembly_2.11:2.0.2
conf = SparkConf()
sc = SparkContext(master="local[*]", appName="KafkaSample", conf=conf)
ssc = StreamingContext(sc, 3)

ds1 = KafkaUtils.createStream(ssc, "localhost:2181", "test-consumer-group1", {"test": 3})
ds2 = KafkaUtils.createDirectStream(ssc, ["test"], {"metadata.broker.list": "localhost:9092"})

ds1.pprint()
ds2.pprint()

ssc.start()
ssc.awaitTermination()
    record_id = colleciton.insert(push_user)
    colleciton = mongodb.notify_record
    notify_user = {"createtime": int(time.time()), "sendid": sendid, "type": channel,
                   "purchaseinfoid": purchaseinfoid, "recordid": record_id}
    colleciton.insert(notify_user)
    # reply_wx_notify(sendid, str(num), purchaseinfo["name"], purchaseinfo["price"],
    #                 purchaseinfo["unit"], str(purchaseinfoid), str(purchaseinfo["purchaseid"]))
    thread.start_new_thread(reply_wx_notify,
                            (sendid, str(num), purchaseinfo["name"], purchaseinfo["price"],
                             purchaseinfo["unit"], str(purchaseinfoid),
                             str(purchaseinfo["purchaseid"]), uuid, sendtype))


def handlestream(kvs):
    # parse each message value as JSON
    parsed = kvs.map(lambda kv: json.loads(kv[1]))
    # handle the send tasks
    send = parsed.filter(lambda x: x["messagetype"] == 2)
    send.foreachRDD(sendPush)


if __name__ == "__main__":
    sc = SparkContext(appName="sendKafka")
    ssc = StreamingContext(sc, 1)
    kvs = KafkaUtils.createStream(ssc, zk_server, "send-group", {send_task_topic: 1})
    handlestream(kvs)
    ssc.start()
    ssc.awaitTermination()
from pyspark.streaming import StreamingContext from pyspark.streaming.kafka import KafkaUtils if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: script.py <zk> <topic>", file=sys.stderr) exit(-1) zkQuorum, topic = sys.argv[1:] sc = SparkContext(appName="KafkaSparkStreaming") sc.setLogLevel("WARN") ssc = StreamingContext(sc, 10) ssc.checkpoint("checkpoint") ks = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming-consumer", {topic: 42}) def processInput(line): fields = line[1].split("\t") return ((str(fields[6]), 1), (str(fields[7]), 1)) def updateFunction(newValues, runningCount): return sum(newValues, runningCount or 0) digest = ks.flatMap(processInput)\ .updateStateByKey(updateFunction)\ .transform(lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)\ .map(lambda (x, y): y).zipWithIndex().map(lambda (x, y): (y, x)) ) def toCSVLine(data):
    return cv2.imdecode(np.frombuffer(s, dtype=np.uint8), -1)  # the dtype of the input data (.jpg) is np.uint8


def write_img(rdd):
    imgs = rdd.collect()
    for img in imgs:
        cv2.imwrite('res1.jpg', img)


if __name__ == '__main__':
    player_dict = getDict()
    # spark = SparkSession.builder.getOrCreate()
    sc = SparkContext()
    ssc = StreamingContext(sc, 1)

    # Kafka's default valueDecoder is str.decode('utf-8'); use the image decoder instead
    raw_stream = KafkaUtils.createStream(ssc, 'localhost:2182', 'dl', {'dl_input': 3},
                                         valueDecoder=imdecoder)
    imgs = raw_stream.map(lambda x: x[1])

    # load model
    model = get_testing_model()
    keras_weights_file = 'model/keras/model.h5'
    model.load_weights(keras_weights_file)

    # load config
    params, model_params = config_reader()

    # process as rdd
    info_list, temp = imgs.transform(body_and_num_recog)
    temp.foreachRDD(write_img)
    # num = temp.map(lambda x: int(x[1]))
if __name__ == "__main__": # SparkContext represents connection to a Spark cluster. conf = SparkConf() conf.setAppName("Kafka Spark App") conf.setMaster('local[2]') sc = SparkContext(conf=conf) sc.setLogLevel("WARN") # StreamingContext represents connection to a Spark cluster from existing SparkContext. ssc = StreamingContext( sc, 60) # the number indicates how many seconds each batch lasts. # Creates an input stream that pulls events from Kafka. kvs = KafkaUtils.createStream(ssc, "streamsetApp:2181", "spark-streaming-consumer", {"NETFLOW": 1}) parsed = kvs.map(lambda x: json.loads(x[1])) # Get only elements that are needed and rename to make it clear. netflow_dict = parsed.map(lambda x: ({ 'srcAddr': x['srcaddr_s'], 'srcPort': x['srcport'], 'dstAddr': x['dstaddr_s'], 'dstPort': x['dstport'], 'tcpFlags': x['tcp_flags'], 'protocol': x['proto'], 'timestampStart': x['first'], 'timestampEnd': x['last'], 'numBytes': x['dOctets'], 'numFlows': x['count'] }))
def my_decoder(s):
    return s


def eye_aspect_ratio(eye):
    A = distance.euclidean(eye[1], eye[5])
    B = distance.euclidean(eye[2], eye[4])
    C = distance.euclidean(eye[0], eye[3])
    ear = (A + B) / (2.0 * C)
    return ear


kafkaStream = KafkaUtils.createStream(ssc, brokers, 'test-consumer-group-1',
                                      {input_topic: 15}, valueDecoder=my_decoder)
producer = KafkaProducer(bootstrap_servers='G01-01:9092',
                         compression_type='gzip',
                         batch_size=163840,
                         buffer_memory=33554432,
                         max_request_size=20485760)

thresh = 0.25
frame_check = 20
detect = dlib.get_frontal_face_detector()
predict = dlib.shape_predictor(predictor_path)  # the .dat predictor file is the crux of the code
(lStart, lEnd) = face_utils.FACIAL_LANDMARKS_68_IDXS["left_eye"]
(rStart, rEnd) = face_utils.FACIAL_LANDMARKS_68_IDXS["right_eye"]
virtualMachine = 'local'
if socket.gethostname() == 'ubuntu':
    virtualMachine = socket.gethostname()

if virtualMachine == 'local':
    dirTrainingModel = config.get('StreamingProperties', 'URLTrainingModelLocal')
else:
    dirTrainingModel = config.get('StreamingProperties', 'URLTrainingModelHDFS')

if virtualMachine == 'ubuntu':
    ssc = StreamingContext(sc, 2)
    brokers = "localhost:2181"
    kvs = KafkaUtils.createStream(ssc, "localhost:2181", topicName, {"topic": 1})
    # kvs = KafkaUtils.createDirectStream(ssc, [topicName], {"metadata.broker.list": brokers})
    kvs.pprint()
    # kvs.foreachRDD(saveData)
    # brokers = "localhost:9092"
    # kvs = KafkaUtils.createDirectStream(ssc, [topicName], {"metadata.broker.list": brokers})
    # kvs.foreachRDD(saveStream)
    # rowData = data.map(lambda row: row.asDict())
    # rowData.saveToMongoDB(mongodb_connection + 'test.resultsStreaming')

    ssc.start()