Example #1
def main(args):
    spark_context = pyspark.SparkContext(appName='update-analyzer')
    streaming_context = streaming.StreamingContext(spark_context, 1)
    kafka_stream = kstreaming.KafkaUtils.createDirectStream(
        streaming_context, [args.topic], {'bootstrap.servers': args.brokers})

    def analyze_updates(rdd):
        # For each JSON update, split the text into sentences with spaCy and
        # attach a VADER polarity score for every sentence.
        def run_analyzer(u):
            english = spacy.load('en_core_web_sm')
            nu = json.loads(u)
            result = english(nu.get('text', ''))
            from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
            analyzer = SentimentIntensityAnalyzer()
            sentiment = [
                analyzer.polarity_scores(str(s)) for s in list(result.sents)
            ]
            nu.update(sentiment=sentiment)
            return nu

        def post_update(u):
            # POST the enriched update to the visualizer service.
            try:
                con = httplib.HTTPConnection(host=args.vhost, port=args.vport)
                con.request('POST', '/', body=json.dumps(u))
                con.close()
            except Exception as e:
                logging.warning('unable to POST to visualizer, error:')
                logging.warning(str(e))

        rdd.map(run_analyzer).foreach(post_update)

    messages = kafka_stream.map(lambda m: m[1])
    messages.foreachRDD(analyze_updates)
    streaming_context.start()
    streaming_context.awaitTermination()
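
The function above expects an args object with brokers, topic, vhost, and vport attributes, which the snippet does not define. A minimal sketch of a hypothetical entry point that supplies them (flag names and defaults are assumptions, not part of the original):

import argparse


def parse_args():
    # Hypothetical argument parser; only the attribute names used above
    # (brokers, topic, vhost, vport) come from the original code, the flag
    # names and defaults here are illustrative.
    parser = argparse.ArgumentParser(description='analyze updates from a Kafka topic')
    parser.add_argument('--brokers', default='localhost:9092',
                        help='the Kafka bootstrap servers')
    parser.add_argument('--topic', default='social-firehose',
                        help='the Kafka topic to read updates from')
    parser.add_argument('--vhost', default='visualizer',
                        help='host of the visualizer service')
    parser.add_argument('--vport', default=8080, type=int,
                        help='port of the visualizer service')
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())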
Example #2
def main():
    if len(sys.argv) == 2 and sys.argv[1] == "noop":
        return
    if len(sys.argv) != 6:
        print "Usage: spark_stream_analytics.py <spark_master> <zk_quorum> <topic_name> <batch_duration> <save_to>"
        print "Example: spark_stream_analytics.py local[4] zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181 video-stream 5 hdfs://hdfs-namenode:8020/demo"
        print "<spark_master> - spark master to use: local[4] or spark://HOST:PORT"
        print "<zk_quorum> - zk quorum to connect: zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181"
        print "<topic_name> - kafka topic name: twitter-stream"
        print "<batch_duration> - spark streaming batch duration ~ how often data will be written"
        exit(-1)

    spark_master = sys.argv[1]
    zk_quorum = sys.argv[2]
    topic_name = sys.argv[3]
    batch_duration = int(sys.argv[4])

    sc = pyspark.SparkContext(spark_master, appName="VideoTics")
    ssc = streaming.StreamingContext(sc, batch_duration)

    video = kafka.KafkaUtils.createStream(ssc, zk_quorum, "video-consumer", {
        topic_name: 1
    }).map(lambda x: json.loads(x[1]))

    video.pprint()
    video.foreachRDD(process_frame)

    ssc.start()
    ssc.awaitTermination()
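
process_frame is not shown in this example. A stand-in handler for local experiments might look like the following (an illustration only; the real implementation presumably does the actual frame analytics):

def process_frame(rdd):
    # Hypothetical per-batch handler: report how many decoded frame records
    # arrived in this batch.
    count = rdd.count()
    if count > 0:
        print("received %d frame records in this batch" % count)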
Example #3
def main(hashtags):
    global IP

    # start connection
    # configure spark instance to default
    config = SparkConf()
    config.setAppName("Twitter_Stream_Analasys")
    s_context = SparkContext(conf=config)
    # To avoid drowning the terminal in output, only log error messages
    s_context.setLogLevel("ERROR")

    # use spark context to create the stream context
    # interval size = 2 seconds
    s_stream_context = pss.StreamingContext(s_context, 2)
    s_stream_context.checkpoint("checkpoint_TSA")

    # connect to port 9009 (the one used by twitter_trends)
    socket_ts = s_stream_context.socketTextStream("twitter", 9009)

    print("Clear setup\n\n\n\n\n\n\n")

    # retrieve the streamed text and split each line into an array of words
    words = socket_ts.flatMap(lambda line: line.split(" "))

    # keep only the words that match the topics of interest
    i_hashtags = words.filter(check_topic)

    # map each hashtag (map reduce to count)
    hashtag_count = i_hashtags.map(lambda x: (x.lower(), 1))

    # do the aggregation, note that now this is a sequence of RDDs
    hashtag_totals = hashtag_count.updateStateByKey(aggregate_tags_count)

    # do this for every single interval
    hashtag_totals.foreachRDD(process_interval)

    #set up sql
    sql_context = get_sql_context_instance(s_context)

    # start the streaming computation
    s_stream_context.start()

    try:
        # wait for the streaming to finish
        s_stream_context.awaitTermination()
    except KeyboardInterrupt:
        print("\nSpark shutting down\n")
Example #4
def main():
    global hashtags
    global IP
    hashtags = ['#youtube', '#google', '#microsoft', '#amazon', '#oracle']

    # start connection
    # configure spark instance to default
    config = SparkConf()
    s_context = SparkContext(conf=config)
    # only log error messages
    s_context.setLogLevel("ERROR")

    # use spark context to create the stream context
    # interval size = 2 seconds
    s_stream_context = pss.StreamingContext(s_context, 2)
    s_stream_context.checkpoint("checkpoint_TSA")

    # connect to port 9009
    socket_ts = s_stream_context.socketTextStream("twitter", 9009)

    # keep only the words containing the tracked company hashtags
    words = socket_ts.flatMap(lambda line: line.split(" "))

    company_hashtags = words.filter(check_word)

    # map each hashtag (map reduce to count)
    hashtag_count = company_hashtags.map(lambda x: (x.lower(), 1))

    # do the aggregation, note that now this is a sequence of RDDs
    hashtag_totals = hashtag_count.updateStateByKey(aggregate_tags_count)

    # set intervals
    hashtag_totals.foreachRDD(process_interval)

    #set up sql
    sql_context = get_sql_context_instance(s_context)

    # start the streaming
    s_stream_context.start()

    try:
        # wait for the streaming
        s_stream_context.awaitTermination()
    except KeyboardInterrupt:
        print("\nSpark shutting down\n")
Example #5
    def __init__(self, servers, duration, spark_context, sql_context,
                 model_save_path):
        """ Create a KafakConnector object.

        Keyword arguments:
        servers -- A list of Kafka brokers
        duration -- The window duration to sample the Kafka stream in
                    seconds
        spark_context -- main entry point for Spark functionality
        sql_context -- The entry point for working with structured data
        model_save_path -- Path where the trained model is saved
        """
        self.servers = servers
        self.spark_context = spark_context
        self.streaming_context = streaming.StreamingContext(
            self.spark_context, duration)
        self.sql_context = sql_context
        self.model_save_path = model_save_path
        self.es_output_host = os.environ.get('ES_HOST')
        self.es_output_port = os.environ.get('ES_PORT')
        self.es_output_index = os.environ.get('ES_OUTPUT_INDEX')
Example #6
def create_streaming_context(spark_context, config):
    """
    Create a streaming context with a custom Streaming Listener
    that will log every event.
    :param spark_context: Spark context
    :type spark_context: pyspark.SparkContext
    :param config: Application configuration; the batch interval is read from
        config["spark_config"]["streaming"]["batch_interval"]
    :type config: dict
    :return: Returns a new streaming context from the given context.
    :rtype: pyspark.streaming.StreamingContext
    """
    ssc = streaming.StreamingContext(spark_context, config[
        "spark_config"]["streaming"]["batch_interval"])
    ssc.addStreamingListener(DriverStreamingListener)
    directory = os_path.expanduser("~/checkpointing")
    logger.info("Checkpointing to `{}`".format(directory))
    # Commented out to fix a crash occurring when
    # phase 1 is used. The reason for the crash is still unclear
    # but Spark complains about the SSC being transferred
    # to workers.
    # ssc.checkpoint(directory)
    return ssc
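
DriverStreamingListener is project code that is not shown. A minimal sketch of such a listener built on PySpark's StreamingListener hooks (the events logged here are illustrative, not the project's actual listener):

import logging

from pyspark.streaming.listener import StreamingListener

logger = logging.getLogger(__name__)


class DriverStreamingListener(StreamingListener):
    # Log the main batch lifecycle events on the driver.
    def onBatchSubmitted(self, batchSubmitted):
        logger.info("Batch submitted: %s", batchSubmitted)

    def onBatchStarted(self, batchStarted):
        logger.info("Batch started: %s", batchStarted)

    def onBatchCompleted(self, batchCompleted):
        logger.info("Batch completed: %s", batchCompleted)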
Example #7
    def __init__(self, input_topic, output_topic, servers, duration):
        """Create a new StreamProcessor

        Keyword arguments:
        input_topic -- Kafka topic to read messages from
        output_topic -- Kafka topic to write messages to
        servers -- A list of Kafka brokers
        duration -- The window duration to sample the Kafka stream in
                    seconds
        """
        self.input_topic = input_topic
        self.output_topic = output_topic
        self.servers = servers
        self.spark_context = pyspark.SparkContext(
            appName='flight-listener')
        self.streaming_context = streaming.StreamingContext(
            self.spark_context, duration)
        self.kafka_stream = kstreaming.KafkaUtils.createDirectStream(
            self.streaming_context,
            [self.input_topic],
            {'bootstrap.servers': self.servers})
Example #8
def main():
    parser = argparse.ArgumentParser(
        description='filter some words on a kafka topic')
    parser.add_argument('--in', default='word-fountain', dest='intopic',
        help='the kafka topic to read words from')
    parser.add_argument('--out', default='word-filter',
        help='the kafka topic to publish filtered words on')
    parser.add_argument('--regex', default='.*',
        help='the regular expression to use as a filter')
    parser.add_argument('--servers', default='localhost:9092',
        help='the kafka brokers')
    args = parser.parse_args()
    intopic = args.intopic
    outtopic = args.out
    regexp = args.regex
    servers = args.servers

    print('using the following parameters:')
    print('input topic: {}'.format(intopic))
    print('output topic: {}'.format(outtopic))
    print('regexp: "{}"'.format(regexp))
    print('servers: {}'.format(servers))

    sc = pyspark.SparkContext(appName='word-filter')
    ssc = streaming.StreamingContext(sc, 3)
    kds = kstreaming.KafkaUtils.createDirectStream(
            ssc, [intopic], {'bootstrap.servers': servers})
    words = kds.map(lambda x: x[1])
    filterwords = words.filter(lambda x: re.search(regexp, x) is not None)

    def send_response(rdd):
        # Publish each filtered word back to Kafka; KafkaProducer expects
        # bytes values when no serializer is configured.
        producer = kafka.KafkaProducer(bootstrap_servers=servers)
        for r in rdd.collect():
            producer.send(outtopic, str(r).encode('utf-8'))
        producer.flush()

    filterwords.pprint()
    filterwords.foreachRDD(send_response)
    ssc.start()
    ssc.awaitTermination()
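
For a quick local test, the input topic can be fed with the same kafka-python producer API that send_response uses; the topic name and broker address below just mirror the defaults above:

import kafka


def feed_words(words, topic='word-fountain', servers='localhost:9092'):
    # Push a few test words into the input topic so the filter has data to read.
    producer = kafka.KafkaProducer(bootstrap_servers=servers)
    for w in words:
        producer.send(topic, w.encode('utf-8'))
    producer.flush()


feed_words(['apple', 'banana', 'cherry'])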
Example #9
def main():
    if len(sys.argv) == 2 and sys.argv[1] == "noop":
        return
    if len(sys.argv) != 8:
        print "Usage: spark_hashtags_count.py <spark_master> <zk_quorum> <topic_name> <min_hashtag_counts> <batch_duration> <save_to>"
        print "Example: spark_hashtags_count.py local[4] zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181 twitter-stream 0 5 hdfs://hdfs-namenode:8020/demo"
        print "<spark_master> - spark master to use: local[4] or spark://HOST:PORT"
        print "<zk_quorum> - zk quorum to connect: zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181"
        print "<topic_name> - kafka topic name: twitter-stream"
        print "<min_hashtag_counts> - filter out hashtags with less then specified count"
        print "<batch_duration> - spark streaming batch duration ~ how often data will be written"
        print "<save_to> - hdfs or cassandra"
        print "<storage> - save as text files to: hdfs://hdfs-namenode:8020/demo or to database: <host>:<keyspace>:<table>"
        exit(-1)

    spark_master = sys.argv[1]
    zk_quorum = sys.argv[2]
    topic_name = sys.argv[3]
    min_hashtag_counts = int(sys.argv[4])
    batch_duration = int(sys.argv[5])
    save_to = sys.argv[6]
    storage = sys.argv[7]
    sc = pyspark.SparkContext(spark_master, appName="TweeTics")
    ssc = streaming.StreamingContext(sc, batch_duration)
    sql = SQLContext(sc)

    tweets = kafka.KafkaUtils.createStream(ssc, zk_quorum, "tweetics-consumer", {topic_name: 1}).map(lambda x: x[1])
    counts = tweets.flatMap(get_hashtags).map(lambda hashtag: (hashtag, 1)).reduceByKey(lambda a, b: a + b)
    sorted_counts = counts.transform(lambda rdd: rdd.sortByKey(ascending=False, keyfunc=lambda x: x[1]))
    output = sorted_counts.map(lambda x: "%s %s" % (x[0], x[1]))

    output.pprint()
    save(output, save_to, storage)

    ssc.start()
    ssc.awaitTermination()
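
get_hashtags and save are defined elsewhere in the script. A plausible sketch of the hashtag extractor, inferred from how it is used above (an assumption, not the original code):

def get_hashtags(tweet):
    # Split the raw tweet text and keep only hashtag tokens, lower-cased.
    return [word.lower() for word in tweet.split() if word.startswith('#')]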
Example #10
            if cur == 1:
                count += 1
        bit_win = bit_win[1:]
    if len(whole_queue) >= 1000:
        true_count = sum(whole_queue[-1000:])
        predict_count = sum(bit_win) + the_last_bucket/2
        print('Estimated number of ones in the last 1000 bits: %s' % predict_count)
        print('Actual number of ones in the last 1000 bits: %s' % true_count)


def main(ssc):
    line = ssc.socketTextStream("localhost", 9999)
    line.foreachRDD(calculate)
    ssc.start()  # Start the computation
    ssc.awaitTermination()




if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc   = SparkContext(conf=conf)
    sc.setLogLevel(logLevel="OFF")
    ssc  = streaming.StreamingContext(sc,10)
    whole_queue = []
    bit_win = []
    win_size = []
    the_last_bucket = 0
    # Execute Main functionality
    main(ssc)
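
The stream is read from localhost:9999 with socketTextStream, so something has to serve lines of bits on that port. A tiny stand-in data source for local testing (an illustration only, not part of the original):

import random
import socket
import time


def serve_random_bits(host='localhost', port=9999):
    # Accept one connection and stream random "0"/"1" lines that the
    # socketTextStream above will consume.
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind((host, port))
    server.listen(1)
    conn, _ = server.accept()
    try:
        while True:
            conn.sendall(("%d\n" % random.randint(0, 1)).encode('utf-8'))
            time.sleep(0.01)
    finally:
        conn.close()
        server.close()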
Example #11
def main():
    # start connection
    # configure spark instance to default
    global s_context
    global Logger
    global mylogger
    config = SparkConf()
    config.setAppName("Gait-Realtime-Analysis")
    s_context = SparkContext(conf=config)
    s_context.setLogLevel("ERROR")

    sys.path.insert(0, SparkFiles.getRootDirectory())

    s_context.addFile('./model/cnn_modell.h5')
    s_context.addFile("./data_transformation.py")
    # TODO: add logger to spark

    # use spark context to create the stream context; the batch interval is
    # chosen long enough to capture overlapping gait samples
    interval_seconds = 10
    s_stream_context = pss.StreamingContext(s_context, interval_seconds)
    s_stream_context.checkpoint("checkpoint_TSA")

    # with tf.gfile.GFile('./frozenInferenceGraphIdentification.pb', "rb") as f:
    #     model_data = f.read()

    # model_data_bc = s_context.broadcast(model_data)
    # model_data_bc = s_context.broadcast(loaded_model)

    # connect to port 9009 i.e. twitter-client
    print(API_SERVICE_URL + ' ' + SPARK_SOCKET_PORT)
    socket_ts = s_stream_context.socketTextStream(API_SERVICE_URL,
                                                  int(SPARK_SOCKET_PORT))

    print("\n################################\n")

    line = socket_ts.flatMap(lambda line: line.split("\n"))
    gait = line.map(lambda g: (getUserId(g).strip(), g.strip()))
    gaitByUserId = gait.groupByKey()

    sortedGaitByUserId = gaitByUserId.transform(
        lambda foo: foo.sortBy(lambda x: (x[0])))

    # sortedGaitByUserId = gaitByUserId.sortByKey()

    #     author_counts_sorted_dstream = author_counts.transform(\
    #   (lambda foo:foo\
    #    .sortBy(lambda x:( -x[1])) )
    #    )
    # author_counts_sorted_dstream.pprint()

    # sortedGaitByUserId.foreachRDD(another)

    segmentedData = sortedGaitByUserId.mapPartitions(partition_mapper_func)

    # x = cogrouped.mapValues(iterate)
    # for e in x.collect():
    #     print (e)

    # segmentedData.pprint()

    # DO NOT CHANGE THE LOCATION OF THIS FUNCTION
    def infer(data_rdd):
        # print("ATTEMPTING DEEP LEARNING")
        try:
            datas = data_rdd.collect()
            if len(datas) > 0:
                # print("INSIDE TRY BEFORE WITH")
                # with tf.Graph().as_default() as graph:
                #     graph_def = tf.GraphDef()
                #     graph_def.ParseFromString(model_data_bc.value)
                #     tf.import_graph_def(graph_def, name="prefix")
                # print("INSIDE TRY AFTER WITH")
                # x = graph.get_tensor_by_name('prefix/Placeholder:0')
                # y = graph.get_tensor_by_name('prefix/Softmax:0')

                for data in datas:
                    for id_xyz in data:
                        if id_xyz:
                            id = id_xyz[0]
                            dummy_axis = "0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00 0.000000000000000000e+00"
                            input_signals = []
                            input_signals.extend(id_xyz[1:])
                            for i in range(3):
                                input_signals.append(dummy_axis)

                            X_signals = []
                            for each in input_signals:
                                X_signals.append([
                                    np.array(cell, dtype=np.float32)
                                    for cell in [each.strip().split(' ')]
                                ])
                            X_test = np.transpose(np.array(X_signals),
                                                  (1, 2, 0))

                            from pyspark import SparkFiles
                            from tensorflow.keras.models import load_model
                            path = SparkFiles.get('cnn_modell.h5')
                            model = load_model(path)
                            print("Loaded model from disk")
                            preds = model.predict(X_test)
                            for p in preds:
                                inferred_user_id = str(np.argmax(p) + 1)
                                results = {
                                    'confidency': str(np.amax(p)),
                                    'inferred_user_id': inferred_user_id,
                                    'actual_user_id': str(id)
                                }
                                print(results)
                                requests.post(back_end_url, json=results)
                            # with tf.Session(graph=graph) as sess:
                            #     y_out = sess.run(y, feed_dict={
                            #         x: X_test
                            #     })

                            #     for each in y_out:
                            #         inferred_user_id = str(np.argmax(each) + 1)
                            #         confidency = str(np.amax(each))
                            #         actual_user_id = str(id)
                            #         results = {'confidency': confidency, 'inferred_user_id': inferred_user_id,
                            #                    'actual_user_id': actual_user_id}
                            #         print(results)
                            #         requests.post(back_end_url, json=results)
        except Exception:
            e = sys.exc_info()
            print("Error: %s" % e)

    print('infer:', 'running inference on segmented data')
    segmentedData.foreachRDD(infer)

    # start the streaming computation
    s_stream_context.start()
    try:
        # wait for the streaming to finish
        s_stream_context.awaitTermination()
    except KeyboardInterrupt:
        print("\nSpark shutting down\n")
Example #12
import pyspark
import pyspark.streaming as pyspark_streaming
import pyspark.streaming.kafka as pyspark_kafka

import scapy.all as scapy

# -----------------------------------------------------------------------------
# Main program
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    #
    # Setup
    #

    #-- define spark usual and streaming contexts
    cont_0 = pyspark.SparkContext(appName="pkt_dissector")
    cont_0.setLogLevel("ERROR")
    s_cont_0 = pyspark_streaming.StreamingContext(cont_0, 5)

    #-- kafka integration (notice, that we receive packets as a bytes struct)
    brokers = "192.168.122.71:9092,192.168.122.72:9092,192.168.122.73:9092"
    kafka_dstream = pyspark_kafka.KafkaUtils.createDirectStream(
        s_cont_0, ["test1"], {"metadata.broker.list": brokers},
        valueDecoder=lambda x: bytes(x))

    #
    # Lazy evaluation rules
    #
    #-- Kafka message comes as a 2-tuple: (key, value). The code below will
    #-- select the actual message (i.e. packet) and dissects it.
    pkts = kafka_dstream.map(lambda x: scapy.Ether(x[1]))
    filtered_pkts = (pkts.filter(common._pkt_filter)
                     .map(lambda x: (x, x.summary())))
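
common._pkt_filter comes from a local helper module that is not shown. A plausible stand-in that keeps only TCP packets (an assumption about what the real filter does):

def _pkt_filter(pkt):
    # Keep only dissected frames that carry a TCP segment.
    return pkt.haslayer(scapy.TCP)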
Example #13
        print "<zk_quorum> - zk quorum to connect: zk-kafka-1-0.zk-kafka-1:2181,zk-kafka-1-1.zk-kafka-1:2181,zk-kafka-1-2.zk-kafka-1:2181"
        print "<topic_name> - kafka topic name: twitter-stream"
        print "<min_hashtag_counts> - filter out hashtags with less then specified count"
        print "<batch_duration> - spark streaming batch duration ~ how often data will be written"
        print "<save_to> - save as text files to: hdfs://hdfs-namenode:8020/demo"
        exit(-1)

    spark_master = sys.argv[1]
    zk_quorum = sys.argv[2]
    topic_name = sys.argv[3]
    min_hashtag_counts = int(sys.argv[4])
    batch_duration = int(sys.argv[5])
    save_to = sys.argv[6]

    sc = pyspark.SparkContext("local[2]", appName="TweeTics")
    ssc = streaming.StreamingContext(sc, batch_duration)

    tweets = kafka.KafkaUtils.createStream(ssc, zk_quorum, "tweetics-consumer",
                                           {
                                               topic_name: 1
                                           }).map(lambda x: x[1])
    counts = tweets.flatMap(get_hashtags).map(
        lambda hashtag: (hashtag, 1)).reduceByKey(lambda a, b: a + b)
    sorted_counts = counts.transform(
        lambda rdd: rdd.sortByKey(ascending=False, keyfunc=lambda x: x[1]))
    output = sorted_counts.map(lambda x: "%s %s" % (x[0], x[1]))

    output.pprint()
    output.saveAsTextFiles(save_to)

    ssc.start()