-
Notifications
You must be signed in to change notification settings - Fork 0
/
spark_app.py
101 lines (69 loc) · 2.62 KB
/
spark_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import yaml
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import Row, SparkSession
from database_app import create_table, insert_table
def get_sql_context_instance(sparkConf):
    """Return the process-wide SparkSession singleton, creating it on first use.

    The session is cached in ``globals()`` so that every micro-batch
    callback reuses one SparkSession instead of building a new one per RDD.
    """
    cache_key = 'sqlContextSingletonInstance'
    if cache_key not in globals():
        session = SparkSession.builder.config(conf=sparkConf).getOrCreate()
        globals()[cache_key] = session
    return globals()[cache_key]
def process_rdd(time, rdd):
    """Process one streaming micro-batch: word-count it and persist to the DB.

    Invoked by Spark Streaming via ``foreachRDD`` once per batch interval.
    Any exception is caught and printed so a single bad batch does not
    kill the streaming job (deliberate best-effort boundary).

    Args:
        time: batch timestamp supplied by Spark Streaming.
        rdd: RDD of individual words for this batch.
    """
    print("----------- %s -----------" % str(time))
    if rdd.isEmpty():
        print('RDD is empty')
        return
    try:
        # Get the spark sql singleton session from the current context.
        spark = get_sql_context_instance(rdd.context.getConf())
        # Wrap each word in a Row so the RDD can become a one-column DataFrame.
        row_rdd = rdd.map(lambda w: Row(word=w))
        words_df = spark.createDataFrame(row_rdd)
        # Expose the batch as a temp view and aggregate counts per word,
        # stamping every row with the current wall-clock time.
        words_df.createOrReplaceTempView('words')
        word_counts_df = spark.sql(
            """
            select
                from_unixtime(unix_timestamp()) as date_time,
                word,
                count(*) as word_count
            from words
            group by word
            """
        )
        # Create the target table on first use. .get() avoids a KeyError
        # when this module is imported without running the __main__ block.
        if not globals().get('table_created', False):
            create_table()
            globals()['table_created'] = True
        # Insert this batch's counts into the table.
        insert_table(word_counts_df.toPandas())
    except Exception as e:
        # Best-effort: log and keep the stream alive for the next batch.
        print('Error:', e)
if __name__ == "__main__":
global table_created
table_created = False
# create spark context with the above configuration
sc = SparkContext(appName='TwitterStream')
sc.setLogLevel('ERROR')
# create the Streaming Context from the above spark context with interval size 2 seconds
ssc = StreamingContext(sc, 2)
# read data from port
with open('config.yaml', 'r') as stream:
details = yaml.safe_load(stream)
lines = ssc.socketTextStream(
details['host'],
details['port']
)
# split each tweet into words
words = lines.flatMap(lambda line: line.split(' '))
# do processing for each RDD generated in each interval
words.foreachRDD(process_rdd)
# start the streaming computation
ssc.start()
# wait for the streaming to finish
ssc.awaitTermination()