/
assignment-3-covid_analysis.py
68 lines (49 loc) · 3 KB
/
assignment-3-covid_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from pyspark import SparkConf, SparkContext, SQLContext
from hdfs import Config
from pyspark.sql import functions as func
from pyspark.sql.types import IntegerType
import sys, os, math
# Constants
APP_NAME = "Covid Analysis"
HDFS_RAWFILE_DIR = "/ASSIGNMENT/"
HDFS_OUTPUT_DIR = "/OUTPUT/"
HDFS_BASE_URL = "hdfs://bdrenfdludcf01:9000"
if __name__ == "__main__":
# Folder creation for placing all the spark data
cmd_a = "mkdir -p " + "/tmp/SPARK_PROCESS/"
os.system(cmd_a)
# Configure Spark
conf = SparkConf().setAppName(APP_NAME).set("spark.local.dir", "/tmp/SPARK_PROCESS/")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
client = Config().get_client('bdrenhdfs')
files = client.list(HDFS_RAWFILE_DIR)
totalfilecount = len(files)
if totalfilecount == 0:
print("There is no files to be processed, application exiting...")
sys.exit(0)
filecount = 0
for filename in files:
print(filename)
if filename.find("Covid_Analysis_DataSet.csv") >= 0:
filecount = filecount + 1
df_covid = sqlContext.read.format("csv").option("delimiter", ":").option("header", 'true').load(HDFS_BASE_URL + HDFS_RAWFILE_DIR + filename)
if filecount == 0:
print("The desired file is not present here, application exiting...")
system.exit(0)
#converting month and year to integer
df_covid_month_year_int = df_covid.withColumn("month", df_covid["month"].cast(IntegerType())).withColumn("year", df_covid["year"].cast(IntegerType()))
#creating temporary view
df_covid_month_year_int.createOrReplaceTempView("covid")
#main query
sqlDF = sqlContext.sql("SELECT month Month, year Year, countryterritoryCode CountryCode, (sum(cases)/sum(TestPerformed))*100 InfectionRate, (sum(deaths)/sum(cases))*100 DeathRate FROM covid group by month, year, countryterritoryCode").orderBy('CountryCode', 'year', 'month')
#changing month format
sqlDF_month_updated = sqlDF.withColumn("Month", func.when(sqlDF["Month"]==1, "January").when(sqlDF["Month"]==2, "February").when(sqlDF["Month"]==3, "March").when(sqlDF["Month"]==4, "April").when(sqlDF["Month"]==5, "May").when(sqlDF["Month"]==6, "June").when(sqlDF["Month"]==7, "July").when(sqlDF["Month"]==8, "August").when(sqlDF["Month"]==9, "September").when(sqlDF["Month"]==10, "October").when(sqlDF["Month"]==11, "November").when(sqlDF["Month"]==12, "December").otherwise(sqlDF["Month"]))
#rounding the InfectionRate and DeathRate
sqlDF_rounded = sqlDF_month_updated.withColumn("InfectionRate", func.round(sqlDF_month_updated["InfectionRate"], 2)).withColumn("DeathRate", func.round(sqlDF_month_updated["DeathRate"], 2))
#showing the output
sqlDF_rounded.show()
#writing the output in a output.csv file
sqlDF_rounded.coalesce(1).write.mode("overwrite").option("delimiter", ";").option("header", 'true').csv(HDFS_BASE_URL + HDFS_OUTPUT_DIR + "output.csv")
#call shutdown to release all resources properly
sc.stop()