-
Notifications
You must be signed in to change notification settings - Fork 0
/
runner.py
92 lines (62 loc) · 3.27 KB
/
runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import csv,re
import sys,os
sys.path.append(os.getcwd())
import DBUtil, Pipeline
class Runner:
global insert_columns
insert_columns = None
def __init__(self, data_file):
self.data_file = data_file
self.running_total_dict = {}
self.dbobj = DBUtil.DBUtil(r"pythonsqlite.db", None)
def run_pipeline(self):
with open(self.data_file, encoding='utf8') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_num = 0
for row in csv_reader:
if line_num == 0:
self.setup_tables(row, insert_columns) # Create both table here with column names with headers available in 0th row
line_num += 1
else:
cleaned_row = Pipeline.data_cleanup(row) # Step 1: data cleanup for first column
if cleaned_row is not None: # Step 2: Normalize the date column
date_range = Pipeline.normalize_row(cleaned_row[21])
insert_stmt = "INSERT INTO data_normalized(" + insert_columns_a + ") VALUES(" \
+ '"{0}"'.format('", "'.join(cleaned_row)) + ",\"" \
+ date_range[0]+ "\", \"" + date_range[1] +"\")"
self.dbobj.insert_table(insert_stmt)
Pipeline.running_total(cleaned_row, self.running_total_dict) # Step 3: Calculate running_total
line_num += 1
#if line_num == 2000:
#break;
# Populate classification_totals table with calculated running totals
for classification, totals in self.running_total_dict.items():
insert_stmt = "INSERT INTO classification_totals(Classification, Totals) VALUES (\"" \
+ classification +"\",\""+ str(totals) +"\")"
self.dbobj.insert_table(insert_stmt)
self.dbobj.commit()
#self.dbobj.select_table("data_normalized")
#self.dbobj.select_table("classification_totals")
self.dbobj.close_connection()
def setup_tables(self, row, insert_columns):
self.dbobj.create_connection()
# Create query statement for data_normalized table with two additional columns
tup = tuple(word.replace(" ", "_") for word in row)
global insert_columns_a
insert_columns_a = f'{", ".join(tup)}' + ', Object_Start_date' + ', Object_End_date'
create_columns = f'{" VARCHAR(255), ".join(tup)}'
create_stmt = "CREATE TABLE data_normalized (" + create_columns + \
" VARCHAR(255), Object_Start_date VARCHAR(255), Object_End_date VARCHAR(255))"
# Create data_normalized table
self.dbobj.drop_table('data_normalized')
self.dbobj.create_table(create_stmt)
# Create classification_totals table
self.dbobj.drop_table('classification_totals')
create_stmt = "CREATE TABLE classification_totals (Classification VARCHAR(255), Totals VARCHAR(255))"
self.dbobj.create_table(create_stmt)
# ------------- Main Program -------------#
def main():
runobj = Runner(r"DataEngineerDataSet.csv")
runobj.run_pipeline()
if __name__ == "__main__":
main()