def main(args):
    spark_context = pyspark.SparkContext(appName='update-analyzer')
    streaming_context = streaming.StreamingContext(spark_context, 1)
    kafka_stream = kstreaming.KafkaUtils.createDirectStream(
        streaming_context, [args.topic], {'bootstrap.servers': args.brokers})

    def analyze_updates(rdd):
        def run_analyzer(u):
            # run sentence segmentation and per-sentence sentiment scoring
            english = spacy.load('en_core_web_sm')
            nu = json.loads(u)
            result = english(nu.get('text', ''))
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
            analyzer = SentimentIntensityAnalyzer()
            sentiment = [
                analyzer.polarity_scores(str(s)) for s in list(result.sents)
            ]
            nu.update(sentiment=sentiment)
            return nu

        def post_update(u):
            # push the enriched update to the visualizer service
            try:
                con = httplib.HTTPConnection(host=args.vhost, port=args.vport)
                con.request('POST', '/', body=json.dumps(u))
                con.close()
            except Exception as e:
                logging.warn('unable to POST to visualizer, error:')
                logging.warn(e)

        rdd.map(run_analyzer).foreach(post_update)

    # the Kafka message value (m[1]) carries the JSON-encoded update
    messages = kafka_stream.map(lambda m: m[1])
    messages.foreachRDD(analyze_updates)
    streaming_context.start()
    streaming_context.awaitTermination()

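
# A minimal sketch of the argument parsing this entry point appears to expect
# (args.brokers, args.topic, args.vhost, args.vport). The defaults and help
# text below are assumptions for illustration, not taken from the original
# source.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description='analyze updates from a kafka topic')
    parser.add_argument('--brokers', default='localhost:9092',
                        help='the kafka brokers')
    parser.add_argument('--topic', default='social-firehose',
                        help='the kafka topic to read updates from')
    parser.add_argument('--vhost', default='visualizer',
                        help='hostname of the visualizer service to POST to')
    parser.add_argument('--vport', default=8080, type=int,
                        help='port of the visualizer service')
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())
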
def main():
    if len(sys.argv) == 2 and sys.argv[1] == "noop":
        return
    if len(sys.argv) != 6:
        print("Usage: spark_stream_analytics.py <spark_master> <zk_quorum> <topic_name> <batch_duration> <save_to>")
        print("Example: spark_stream_analytics.py local[4] zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181 video-stream 5 hdfs://hdfs-namenode:8020/demo")
        print("<spark_master> - spark master to use: local[4] or spark://HOST:PORT")
        print("<zk_quorum> - zk quorum to connect: zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181")
        print("<topic_name> - kafka topic name: video-stream")
        print("<batch_duration> - spark streaming batch duration ~ how often data will be written")
        print("<save_to> - save as text files to: hdfs://hdfs-namenode:8020/demo")
        exit(-1)
    spark_master = sys.argv[1]
    zk_quorum = sys.argv[2]
    topic_name = sys.argv[3]
    batch_duration = int(sys.argv[4])

    sc = pyspark.SparkContext(spark_master, appName="VideoTics")
    ssc = streaming.StreamingContext(sc, batch_duration)
    video = kafka.KafkaUtils.createStream(
        ssc, zk_quorum, "video-consumer",
        {topic_name: 1}).map(lambda x: json.loads(x[1]))
    # foreachRDD returns None, so print the decoded stream itself
    video.pprint()
    video.foreachRDD(process_frame)
    ssc.start()
    ssc.awaitTermination()

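
# `process_frame` is not shown in this snippet. A minimal sketch of what it
# might look like, assuming each record is a dict describing one decoded video
# frame; this is an illustration only, not the original implementation.
def process_frame(rdd):
    if rdd.isEmpty():
        return
    for frame in rdd.collect():
        # e.g. log the frame metadata received in this batch
        print("frame received: %s" % frame)
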
def main(hashtags):
    global IP
    # start connection
    # configure spark instance to default
    config = SparkConf()
    config.setAppName("Twitter_Stream_Analasys")
    # to avoid drowning the terminal in output, only log error messages
    s_context = SparkContext(conf=config)
    s_context.setLogLevel("ERROR")
    # use the spark context to create the stream context
    # interval size = 2 seconds
    s_stream_context = pss.StreamingContext(s_context, 2)
    s_stream_context.checkpoint("checkpoint_TSA")
    # connect to port 9009 (the one used by twitter_trends)
    socket_ts = s_stream_context.socketTextStream("twitter", 9009)
    print("Clear setup\n\n\n\n\n\n\n")
    # retrieve the streamed text and split the input into an array of words
    # tweet_text = socket_ts
    words = socket_ts.flatMap(lambda line: line.split(" "))
    # keep only the hashtags we are tracking
    i_hashtags = words.filter(check_topic)
    # map each hashtag to (hashtag, 1) so we can reduce to a count
    hashtag_count = i_hashtags.map(lambda x: (x.lower(), 1))
    # do the aggregation; note that this is now a sequence of RDDs
    hashtag_totals = hashtag_count.updateStateByKey(aggregate_tags_count)
    # do this for every single interval
    hashtag_totals.foreachRDD(process_interval)
    # set up sql
    sql_context = get_sql_context_instance(s_context)
    # start the streaming computation
    s_stream_context.start()
    try:
        # wait for the streaming to finish
        s_stream_context.awaitTermination()
    except KeyboardInterrupt:
        print("\nSpark shutting down\n")

def main():
    global hashtags
    global IP
    hashtags = ['#youtube', '#google', '#microsoft', '#amazon', '#oracle']
    # start connection
    # configure spark instance to default
    config = SparkConf()
    s_context = SparkContext(conf=config)
    # only log error messages
    s_context.setLogLevel("ERROR")
    # use the spark context to create the stream context
    # interval size = 2 seconds
    s_stream_context = pss.StreamingContext(s_context, 2)
    s_stream_context.checkpoint("checkpoint_TSA")
    # connect to port 9009
    socket_ts = s_stream_context.socketTextStream("twitter", 9009)
    # words that are related to tweets
    words = socket_ts.flatMap(lambda line: line.split(" "))
    company_hashtags = words.filter(check_word)
    # map each hashtag to (hashtag, 1) so we can reduce to a count
    hashtag_count = company_hashtags.map(lambda x: (x.lower(), 1))
    # do the aggregation; note that this is now a sequence of RDDs
    hashtag_totals = hashtag_count.updateStateByKey(aggregate_tags_count)
    # process every interval
    hashtag_totals.foreachRDD(process_interval)
    # set up sql
    sql_context = get_sql_context_instance(s_context)
    # start the streaming
    s_stream_context.start()
    try:
        # wait for the streaming to finish
        s_stream_context.awaitTermination()
    except KeyboardInterrupt:
        print("\nSpark shutting down\n")

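
# The helpers used above (check_word / check_topic, aggregate_tags_count,
# process_interval, get_sql_context_instance) are defined elsewhere in these
# projects. The sketches below show plausible implementations and are
# assumptions, not the original code.
from pyspark.sql import Row, SQLContext


def check_word(word):
    # keep only the hashtags we are tracking
    return word.lower() in hashtags


def aggregate_tags_count(new_values, total_sum):
    # running total for updateStateByKey
    return sum(new_values) + (total_sum or 0)


def get_sql_context_instance(spark_context):
    # lazily create a single SQLContext per driver
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']


def process_interval(time, rdd):
    print("----------- %s -----------" % str(time))
    try:
        sql_context = get_sql_context_instance(rdd.context)
        rows = rdd.map(lambda t: Row(hashtag=t[0], count=t[1]))
        df = sql_context.createDataFrame(rows)
        df.orderBy(df['count'].desc()).show(10)
    except Exception:
        # an empty batch has no schema to infer; skip it
        pass
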
def __init__(self, servers, duration, spark_context, sql_context,
             model_save_path):
    """Create a KafkaConnector object.

    Keyword arguments:
    servers -- A list of Kafka brokers
    duration -- The window duration to sample the Kafka stream in seconds
    spark_context -- The main entry point for Spark functionality
    sql_context -- The entry point for working with structured data
    model_save_path -- The path where the trained model is saved
    """
    self.servers = servers
    self.spark_context = spark_context
    self.streaming_context = streaming.StreamingContext(
        self.spark_context, duration)
    self.sql_context = sql_context
    self.model_save_path = model_save_path
    self.es_output_host = os.environ.get('ES_HOST')
    self.es_output_port = os.environ.get('ES_PORT')
    self.es_output_index = os.environ.get('ES_OUTPUT_INDEX')

def create_streaming_context(spark_context, config):
    """Create a streaming context with a custom StreamingListener
    that will log every event.

    :param spark_context: Spark context
    :type spark_context: pyspark.SparkContext
    :param config: Configuration dictionary
    :type config: dict
    :return: Returns a new streaming context from the given context.
    :rtype: pyspark.streaming.StreamingContext
    """
    ssc = streaming.StreamingContext(
        spark_context,
        config["spark_config"]["streaming"]["batch_interval"])
    ssc.addStreamingListener(DriverStreamingListener)
    directory = os_path.expanduser("~/checkpointing")
    logger.info("Checkpointing to `{}`".format(directory))
    # Commented out to fix a crash occurring when phase 1 is used.
    # The reason for the crash is still unclear, but Spark complains
    # about the SSC being transferred to workers.
    # ssc.checkpoint(directory)
    return ssc

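
# DriverStreamingListener is defined elsewhere in this project. A minimal
# sketch of such a listener, assuming PySpark's StreamingListener base class
# and reusing this module's `logger`; the events logged here are illustrative,
# not the original implementation.
from pyspark.streaming.listener import StreamingListener


class LoggingStreamingListener(StreamingListener):

    def onBatchStarted(self, batchStarted):
        logger.info("batch started: %s", batchStarted.batchInfo())

    def onBatchCompleted(self, batchCompleted):
        logger.info("batch completed: %s", batchCompleted.batchInfo())

    def onReceiverError(self, receiverError):
        logger.error("receiver error: %s", receiverError.receiverInfo())
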
def __init__(self, input_topic, output_topic, servers, duration):
    """Create a new StreamProcessor.

    Keyword arguments:
    input_topic -- Kafka topic to read messages from
    output_topic -- Kafka topic to write messages to
    servers -- A list of Kafka brokers
    duration -- The window duration to sample the Kafka stream in seconds
    """
    self.input_topic = input_topic
    self.output_topic = output_topic
    self.servers = servers
    self.spark_context = pyspark.SparkContext(appName='flight-listener')
    self.streaming_context = streaming.StreamingContext(
        self.spark_context, duration)
    self.kafka_stream = kstreaming.KafkaUtils.createDirectStream(
        self.streaming_context, [self.input_topic],
        {'bootstrap.servers': self.servers})

def main():
    parser = argparse.ArgumentParser(
        description='filter some words on a kafka topic')
    parser.add_argument('--in', default='word-fountain', dest='intopic',
                        help='the kafka topic to read words from')
    parser.add_argument('--out', default='word-filter',
                        help='the kafka topic to publish filtered words on')
    parser.add_argument('--regex', default='.*',
                        help='the regular expression to use as a filter')
    parser.add_argument('--servers', default='localhost:9092',
                        help='the kafka brokers')
    args = parser.parse_args()
    intopic = args.intopic
    outtopic = args.out
    regexp = args.regex
    servers = args.servers
    print('using the following parameters:')
    print('input topic: {}'.format(intopic))
    print('output topic: {}'.format(outtopic))
    print('regexp: "{}"'.format(regexp))
    print('servers: {}'.format(servers))

    sc = pyspark.SparkContext(appName='word-filter')
    ssc = streaming.StreamingContext(sc, 3)
    kds = kstreaming.KafkaUtils.createDirectStream(
        ssc, [intopic], {'bootstrap.servers': servers})
    words = kds.map(lambda x: x[1])
    filterwords = words.filter(lambda x: re.search(regexp, x) is not None)

    def send_response(rdd):
        # publish every matching word from this batch to the output topic
        producer = kafka.KafkaProducer(bootstrap_servers=servers)
        for r in rdd.collect():
            producer.send(outtopic, str(r))
        producer.flush()

    filterwords.pprint()
    filterwords.foreachRDD(send_response)
    ssc.start()
    ssc.awaitTermination()

def main():
    if len(sys.argv) == 2 and sys.argv[1] == "noop":
        return
    if len(sys.argv) != 8:
        print("Usage: spark_hashtags_count.py <spark_master> <zk_quorum> <topic_name> <min_hashtag_counts> <batch_duration> <save_to> <storage>")
        print("Example: spark_hashtags_count.py local[4] zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181 twitter-stream 0 5 hdfs://hdfs-namenode:8020/demo hdfs")
        print("<spark_master> - spark master to use: local[4] or spark://HOST:PORT")
        print("<zk_quorum> - zk quorum to connect: zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181")
        print("<topic_name> - kafka topic name: twitter-stream")
        print("<min_hashtag_counts> - filter out hashtags with less than the specified count")
        print("<batch_duration> - spark streaming batch duration ~ how often data will be written")
        print("<save_to> - save as text files to: hdfs://hdfs-namenode:8020/demo or to database: <host>:<keyspace>:<table>")
        print("<storage> - hdfs or cassandra")
        exit(-1)
    spark_master = sys.argv[1]
    zk_quorum = sys.argv[2]
    topic_name = sys.argv[3]
    min_hashtag_counts = int(sys.argv[4])
    batch_duration = int(sys.argv[5])
    save_to = sys.argv[6]
    storage = sys.argv[7]

    sc = pyspark.SparkContext(spark_master, appName="TweeTics")
    ssc = streaming.StreamingContext(sc, batch_duration)
    sql = SQLContext(sc)
    tweets = kafka.KafkaUtils.createStream(
        ssc, zk_quorum, "tweetics-consumer",
        {topic_name: 1}).map(lambda x: x[1])
    counts = tweets.flatMap(get_hashtags) \
        .map(lambda hashtag: (hashtag, 1)) \
        .reduceByKey(lambda a, b: a + b)
    sorted_counts = counts.transform(
        lambda rdd: rdd.sortByKey(ascending=False, keyfunc=lambda x: x[1]))
    output = sorted_counts.map(lambda x: "%s %s" % (x[0], x[1]))
    output.pprint()
    save(output, save_to, storage)
    ssc.start()
    ssc.awaitTermination()

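
# `get_hashtags` and `save` are defined elsewhere in this project. The
# sketches below are assumptions about their behavior, not the original code.
def get_hashtags(tweet):
    # pull '#...' tokens out of the raw tweet text
    return [word.lower() for word in tweet.split() if word.startswith('#')]


def save(output, save_to, storage):
    if storage == 'hdfs':
        # one directory of text files per batch, prefixed with save_to
        output.saveAsTextFiles(save_to)
    else:
        # the 'cassandra' branch is omitted here; the original presumably
        # writes each RDD to the configured <host>:<keyspace>:<table>
        pass
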
        if cur == 1:
            count += 1
        bit_win = bit_win[1:]
    if len(whole_queue) >= 1000:
        true_count = sum(whole_queue[-1000:])
        predict_count = sum(bit_win) + the_last_bucket / 2
        print('Estimated number of ones in the last 1000 bits: %s' % predict_count)
        print('Actual number of ones in the last 1000 bits: %s' % true_count)


def main(ssc):
    line = ssc.socketTextStream("localhost", 9999)
    line.foreachRDD(calculate)
    ssc.start()  # start the computation
    ssc.awaitTermination()


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sc.setLogLevel(logLevel="OFF")
    ssc = streaming.StreamingContext(sc, 10)
    whole_queue = []
    bit_win = []
    win_size = []
    the_last_bucket = 0
    # execute the main functionality
    main(ssc)

def main():
    # start connection
    # configure spark instance to default
    global s_context
    global Logger
    global mylogger
    config = SparkConf()
    config.setAppName("Gait-Realtime-Analysis")
    s_context = SparkContext(conf=config)
    s_context.setLogLevel("ERROR")
    sys.path.insert(0, SparkFiles.getRootDirectory())
    s_context.addFile('./model/cnn_modell.h5')
    s_context.addFile("./data_transformation.py")
    # TODO: add logger to spark

    # use the spark context to create the stream context;
    # 5 seconds ensures that we get two overlapping samples of 4 seconds
    interval_seconds = 10
    s_stream_context = pss.StreamingContext(s_context, interval_seconds)
    s_stream_context.checkpoint("checkpoint_TSA")

    # with tf.gfile.GFile('./frozenInferenceGraphIdentification.pb', "rb") as f:
    #     model_data = f.read()
    # model_data_bc = s_context.broadcast(model_data)
    # model_data_bc = s_context.broadcast(loaded_model)

    # connect to port 9009, i.e. the twitter-client style socket server
    print(API_SERVICE_URL + ' ' + SPARK_SOCKET_PORT)
    socket_ts = s_stream_context.socketTextStream(API_SERVICE_URL,
                                                  int(SPARK_SOCKET_PORT))
    print("\n################################\n")

    line = socket_ts.flatMap(lambda line: line.split("\n"))
    gait = line.map(lambda g: (getUserId(g).strip(), g.strip()))
    gaitByUserId = gait.groupByKey()
    sortedGaitByUserId = gaitByUserId.transform(
        lambda foo: foo.sortBy(lambda x: (x[0])))
    # sortedGaitByUserId = gaitByUserId.sortByKey()
    # author_counts_sorted_dstream = author_counts.transform(
    #     lambda foo: foo.sortBy(lambda x: (-x[1])))
    # author_counts_sorted_dstream.pprint()
    # sortedGaitByUserId.foreachRDD(another)
    segmentedData = sortedGaitByUserId.mapPartitions(partition_mapper_func)
    # x = cogrouped.mapValues(iterate)
    # for e in x.collect():
    #     print(e)
    # segmentedData.pprint()

    # DO NOT CHANGE THE LOCATION OF THIS FUNCTION
    def infer(data_rdd):
        # print("ATTEMPTING DEEP LEARNING")
        try:
            datas = data_rdd.collect()
            if len(datas) > 0:
                # print("INSIDE TRY BEFORE WITH")
                # with tf.Graph().as_default() as graph:
                #     graph_def = tf.GraphDef()
                #     graph_def.ParseFromString(model_data_bc.value)
                #     tf.import_graph_def(graph_def, name="prefix")
                # print("INSIDE TRY AFTER WITH")
                # x = graph.get_tensor_by_name('prefix/Placeholder:0')
                # y = graph.get_tensor_by_name('prefix/Softmax:0')
                for data in datas:
                    for id_xyz in data:
                        if id_xyz:
                            id = id_xyz[0]
                            # a dummy axis of all-zero readings used to pad the
                            # input channels; the original inlined the literal
                            # zeros, and 128 samples per axis is assumed here
                            dummy_axis = ' '.join(
                                ['0.000000000000000000e+00'] * 128)
                            input_signals = []
                            input_signals.extend(id_xyz[1:])
                            for i in range(3):
                                input_signals.append(dummy_axis)
                            X_signals = []
                            for each in input_signals:
                                X_signals.append([
                                    np.array(cell, dtype=np.float32)
                                    for cell in [each.strip().split(' ')]
                                ])
                            X_test = np.transpose(np.array(X_signals), (1, 2, 0))

                            from pyspark import SparkFiles
                            from tensorflow.keras.models import load_model
                            path = SparkFiles.get('cnn_modell.h5')
                            model = load_model(path)
                            print("Loaded model from disk")

                            preds = model.predict(X_test)
                            for p in preds:
                                inferred_user_id = str(np.argmax(p) + 1)
                                results = {
                                    'confidency': str(np.amax(p)),
                                    'inferred_user_id': inferred_user_id,
                                    'actual_user_id': str(id)
                                }
                                print(results)
                                requests.post(back_end_url, json=results)
                            # with tf.Session(graph=graph) as sess:
                            #     y_out = sess.run(y, feed_dict={x: X_test})
                            # for each in y_out:
                            #     inferred_user_id = str(np.argmax(each) + 1)
                            #     confidency = str(np.amax(each))
                            #     actual_user_id = str(id)
                            #     results = {'confidency': confidency,
                            #                'inferred_user_id': inferred_user_id,
                            #                'actual_user_id': actual_user_id}
                            #     print(results)
                            #     requests.post(back_end_url, json=results)
        except:
            e = sys.exc_info()
            print("Error: %s" % e)

    print('infer:', 'running inference on segmented data')
    segmentedData.foreachRDD(infer)

    # start the streaming computation
    s_stream_context.start()
    try:
        # wait for the streaming to finish
        s_stream_context.awaitTermination()
    except KeyboardInterrupt:
        print("\nSpark shutting down\n")

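
# getUserId and partition_mapper_func are project helpers that are not shown
# in this snippet. A sketch of getUserId, assuming each streamed record is a
# comma-separated line that starts with the user id; the record format is an
# assumption, not taken from the original source.
def getUserId(record):
    return record.split(',')[0]
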
import pyspark
import pyspark.streaming as pyspark_streaming
import pyspark.streaming.kafka as pyspark_kafka
import scapy.all as scapy

import common

# -----------------------------------------------------------------------------
# Main program
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    #
    # Setup
    #
    #-- define the usual and streaming spark contexts
    cont_0 = pyspark.SparkContext(appName="pkt_dissector")
    cont_0.setLogLevel("ERROR")
    s_cont_0 = pyspark_streaming.StreamingContext(cont_0, 5)

    #-- kafka integration (note that we receive packets as a bytes struct)
    brokers = "192.168.122.71:9092,192.168.122.72:9092,192.168.122.73:9092"
    kafka_dstream = pyspark_kafka.KafkaUtils.createDirectStream(
        s_cont_0, ["test1"], {"metadata.broker.list": brokers},
        valueDecoder=lambda x: bytes(x))

    #
    # Lazy evaluation rules
    #
    #-- A Kafka message comes as a 2-tuple: (key, value). The code below
    #-- selects the actual message (i.e. the packet) and dissects it.
    pkts = kafka_dstream.map(lambda x: scapy.Ether(x[1]))
    filtered_pkts = pkts.filter(common._pkt_filter). \
        map(lambda x: (x, x.summary()))

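
# common._pkt_filter comes from a project-local module that is not shown here.
# A sketch of a packet predicate in the same spirit, assuming we only keep
# IP/TCP traffic; the actual filter may differ.
def _pkt_filter(pkt):
    return pkt.haslayer(scapy.IP) and pkt.haslayer(scapy.TCP)
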
print "<zk_quorum> - zk quorum to connect: zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181" print "<topic_name> - kafka topic name: twitter-stream" print "<min_hashtag_counts> - filter out hashtags with less then specified count" print "<batch_duration> - spark streaming batch duration ~ how often data will be written" print "<save_to> - save as text files to: hdfs://hdfs-namenode:8020/demo" exit(-1) spark_master = sys.argv[1] zk_quorum = sys.argv[2] topic_name = sys.argv[3] min_hashtag_counts = int(sys.argv[4]) batch_duration = int(sys.argv[5]) save_to = sys.argv[6] sc = pyspark.SparkContext("local[2]", appName="TweeTics") ssc = streaming.StreamingContext(sc, batch_duration) tweets = kafka.KafkaUtils.createStream(ssc, zk_quorum, "tweetics-consumer", { topic_name: 1 }).map(lambda x: x[1]) counts = tweets.flatMap(get_hashtags).map( lambda hashtag: (hashtag, 1)).reduceByKey(lambda a, b: a + b) sorted_counts = counts.transform( lambda rdd: rdd.sortByKey(ascending=False, keyfunc=lambda x: x[1])) output = sorted_counts.map(lambda x: "%s %s" % (x[0], x[1])) output.pprint() output.saveAsTextFiles(save_to) ssc.start()