from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark_cassandra import CassandraSparkContext


def streaming_logic():
    """
    Initialize the Spark context and run all the streaming logic.

    :return: None
    """

    # - read configuration from file
    spark_config, kafka_config, cassandra_config = read_config()

    # - initialize the Spark context
    conf = SparkConf() \
        .setMaster(spark_config['master']) \
        .setAppName(spark_config['app_name']) \
        .set('spark.cassandra.connection.host', cassandra_config['cluster'])
    csc = CassandraSparkContext(conf=conf)
    csc.setLogLevel(spark_config['log_level'])
    ssc = StreamingContext(sparkContext=csc, batchDuration=spark_config['time_window'])

    # - create the Kafka direct stream
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, [kafka_config['topic_in']],
        {'metadata.broker.list': kafka_config['cluster']})

    # - start to process data
    # - output data structure: MetadData
    structured_stock_data = directKafkaStream.map(lambda data: preprocess_data(data=data))
    structured_stock_data.pprint(20)

    stock_data_list = structured_stock_data.reduceByKey(lambda a, b: aggregate_list(a, b))
    stock_data_list.pprint(20)

    # - get history data from cassandra
    alert_user_data = stock_data_list.mapValues(lambda dictlist: compute_stock_tending_in_window(dict_list=dictlist))
    alert_user_data.pprint(20)

    # - send alert to user
    alert_user_data.foreachRDD(
        lambda rdd: rdd.foreachPartition(
            lambda iter: send_alert_to_kafka(iterator=iter, kafka_config=kafka_config)))

    ssc.start()
    ssc.awaitTermination()
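
The helpers used above (read_config, preprocess_data, aggregate_list, compute_stock_tending_in_window, send_alert_to_kafka) are defined elsewhere in the project. A minimal sketch of the per-partition Kafka sink, assuming kafka-python and a hypothetical topic_out key in kafka_config:

import json
from kafka import KafkaProducer

def send_alert_to_kafka(iterator, kafka_config):
    # one producer per partition, reused for every record in the iterator
    producer = KafkaProducer(bootstrap_servers=kafka_config['cluster'])
    for record in iterator:
        # 'topic_out' is a hypothetical config key; records are assumed JSON-serializable
        producer.send(kafka_config['topic_out'], json.dumps(record).encode('utf-8'))
    producer.flush()
    producer.close()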
Example #2
def main():
    pwords = load_wordlist("./Dataset/positive.txt")
    nwords = load_wordlist("./Dataset/negative.txt")

    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")
    sql = SQLContext(sc)
    # Creating a streaming context with a batch interval of 10 seconds
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    #tweets = kstream.map(lambda x: json.loads( x[1].decode('utf-8')))
    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords))

    tweetsUsentiment.pprint()

    tweetsUsentiment.saveToCassandra("tweetdb", "tweettable")

    ssc.start()
    ssc.awaitTerminationOrTimeout(100)
    ssc.stop(stopGraceFully=True)
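
load_wordlist and tweetwithSentiment are not shown; a minimal sketch under the assumption that the word lists hold one word per line and that the returned keys match the tweetdb.tweettable columns:

def load_wordlist(path):
    # one word per line; a set gives O(1) membership tests
    with open(path) as f:
        return set(line.strip() for line in f if line.strip())

def tweetwithSentiment(tweet, pwords, nwords):
    # naive lexicon score: +1 per positive word, -1 per negative word
    words = tweet.get('text', '').lower().split()
    score = sum(1 for w in words if w in pwords) - sum(1 for w in words if w in nwords)
    # field names here are assumptions; they must match the Cassandra table schema
    return {'tweet_id': tweet.get('id_str'), 'text': tweet.get('text'), 'sentiment': score}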
Example #3
def main():
    pwords = load_wordlist("../Dataset/positive.txt")
    nwords = load_wordlist("../Dataset/negative.txt")
    sterms = load_wordlist("../Dataset/keyWords.txt")
    conf = SparkConf().\
        setMaster("local[2]").\
        setAppName("TweeStreamer").\
        set("spark.cassandra.connection.host",\
        "52.25.173.31, 35.165.251.179, 52.27.187.234, 52.38.246.84")
    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel("WARN")

    # Creating a streaming context with batch interval of 10 sec
    ssc = StreamingContext(sc, 10)
    ssc.checkpoint("checkpoint")

    kstream = KafkaUtils.createDirectStream(
        ssc,
        topics=['twitter-topic1'],
        kafkaParams={"metadata.broker.list": 'localhost:9092'})

    tweets = kstream.map(lambda x: json.loads(x[1]))
    tweets.count().map(lambda x: 'Tweets in this batch: %s' % x).pprint()
    tweetsUsentiment = tweets.map(
        lambda tweet: tweetwithSentiment(tweet, pwords, nwords, sterms))

    searchTermUsentiment = tweetsUsentiment.flatMap(
        lambda tweet: searchTermFunction(tweet, sterms)).reduceByKey(
            lambda a, b: a + b)
    # tuple-unpacking lambdas were removed in Python 3, so index into the pair instead
    searchTermUsentiment = searchTermUsentiment.map(
        lambda kv: {
            "searchterm": "_" + kv[0],
            "insertion_time": datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S'),
            "sentiment": kv[1]
        })
    searchTermUsentiment.pprint()

    searchTermUsentiment.saveToCassandra("tweetdb", "searchtermtable")
    # searchTermSentiment = tweetsUsentiment.map(lambda tweet: searchTermFunction(tweet,sterms))

    ssc.start()
    ssc.awaitTerminationOrTimeout(1000)
    ssc.stop(stopGraceFully=True)
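
searchTermFunction is also defined elsewhere; the reduceByKey above implies it emits (search term, sentiment) pairs, so a sketch under that assumption (the 'text' and 'sentiment' field names are guesses):

def searchTermFunction(tweet, sterms):
    # emit one (term, sentiment) pair for every search term found in the tweet,
    # so reduceByKey can sum the sentiment per term across the batch
    text = tweet.get('text', '').lower()
    return [(term, tweet.get('sentiment', 0)) for term in sterms if term.lower() in text]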
Example #4
def transfer_time(text):
    #return "2018-06-25"
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")

def process(rdd):
    from pyspark.sql.functions import lit

    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    # func is a UDF defined elsewhere that extracts the hashtag from the tweet text
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    # transfer_time() returns a plain string, so wrap it in lit() to get a Column
    df = df.withColumn('time', lit(transfer_time(tweetsDataFrame.time)))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql("SELECT MAX(time) AS time,hashtag, count(*) AS count FROM historicaltweets WHERE hashtag IS NOT NULL GROUP BY hashtag ORDER BY count DESC")
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter","tweet")
    df.show()

if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 600)
    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": '*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)
    #text_counts = lines.map(lambda tweet: (tweet['hashtag'],1)).reduceByKey(lambda x,y: x + y)
    ssc.start() 
    ssc.awaitTermination()
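
getSparkSessionInstance and the hashtag-extracting column function func are not shown. A sketch using the lazily instantiated singleton pattern from the Spark Streaming guide, with a simple regex-based UDF as an assumed implementation of func:

import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def getSparkSessionInstance(sparkConf):
    # lazily instantiated global SparkSession, reused across micro-batches
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']

def extract_hashtag(text):
    # assumed behaviour: return the first hashtag in the text, or None
    match = re.search(r'#(\w+)', text or '')
    return match.group(1) if match else None

func = udf(extract_hashtag, StringType())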



Example #5
def transfer_time(text):
    return datetime.today().strftime("%Y-%m-%d %H:%M:%S")


def process(rdd):
    from pyspark.sql.functions import lit

    spark = getSparkSessionInstance(rdd.context.getConf())
    tweetsDataFrame = spark.read.json(rdd)
    df = tweetsDataFrame.withColumn('hashtag', func(tweetsDataFrame.text))
    # as in the previous example, wrap the plain string in lit() to get a Column
    df = df.withColumn('date', lit(transfer_time(tweetsDataFrame.time)))
    df.createOrReplaceTempView("historicaltweets")
    df = spark.sql(
        "SELECT MAX(date) AS date,hashtag,count(*) AS count FROM historicaltweets WHERE hashtag IS NOT NULL GROUP BY hashtag ORDER BY count DESC"
    )
    rdd = df.rdd.map(tuple)
    rdd.saveToCassandra("twitter", "tweet")
    df.show()


if __name__ == "__main__":
    sc = CassandraSparkContext(appName="tweet")
    sc.setLogLevel("WARN")
    ssc = StreamingContext(sc, 600)
    topic_name = "twitter"
    streamFromKafka = KafkaUtils.createDirectStream(
        ssc, [topic_name], {"metadata.broker.list": '*'})
    lines = streamFromKafka.map(lambda x: x[1])
    lines.count().pprint()
    lines.foreachRDD(process)
    #text_counts = lines.map(lambda tweet: (tweet['hashtag'],1)).reduceByKey(lambda x,y: x + y)
    ssc.start()
    ssc.awaitTermination()


if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("Usage: consumer.py <kafka-host> <topic-name> <seconds>")
        exit(-1)

    kafka_host = sys.argv[1]
    topic_name = sys.argv[2]
    seconds = int(sys.argv[3])

    conf = SparkConf() \
        .setAppName("data_challenge")

    from pyspark_cassandra import CassandraSparkContext

    sc = CassandraSparkContext(conf=conf)
    sc.setLogLevel('ERROR')
    ssc = StreamingContext(sc, seconds)
    ssc.checkpoint('./output')

    d = dict()
    d['bootstrap.servers'] = kafka_host
    d['group.id'] = 'test-id'
    d['enable.auto.commit'] = 'false'

    kafka_stream = KafkaUtils.createDirectStream(ssc, [topic_name], d)

    # Parse messages as json
    tweets = kafka_stream.map(lambda v: json.loads(v[1]))
    # tweets already holds parsed dicts, so read the 'text' field directly
    tweets_text = tweets.map(
        lambda tweet: tweet['text'].encode('ascii', 'ignore'))