forked from peerlogic/DataWarehouseETL
-
Notifications
You must be signed in to change notification settings - Fork 0
/
etl_table.py
148 lines (124 loc) · 4.84 KB
/
etl_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import pymysql
import petl as etl
from conf import Conf
ACTOR_PARTICIPANTS = 'actor_participants'
ACTORS = 'actors'
ANSWERS = 'answers'
ARTIFACTS = 'artifacts'
CRITERIA = 'criteria'
EVAL_MODES = 'eval_modes'
ITEMS = 'items'
PARTICIPANTS = 'participants'
TASKS = 'tasks'
class ETLTable(object):
# TODO: Fix error: pymysql.err.IntegrityError: (1451, 'Cannot delete or update a parent row: a foreign key constraint fails ("staging_warehouse"."actor_participants", CONSTRAINT "actor_participant_participant_id" FOREIGN KEY ("participant_id") REFERENCES "participants" ("id") ON DELETE NO ACTION ON UPDATE NO ACTION)'
# Cannot handle uploading old data, can handle that loading from staging to data warehouse
def __init__(self, app_name, abbrev=None):
self.app_name = app_name
self.abbrev = abbrev if abbrev else self.app_name[:4]
self.conf = Conf()
self.TABLES = {
ACTOR_PARTICIPANTS: self.get_actor_pariticipants,
ACTORS: self.get_actors,
ANSWERS: self.get_answers,
ARTIFACTS: self.get_artifacts,
CRITERIA: self.get_criteria,
EVAL_MODES: self.get_eval_modes,
ITEMS: self.get_items,
PARTICIPANTS: self.get_participants,
TASKS: self.get_tasks,
}
# TODO: It should be possible to grab this from the schema
self.UPDATE_ORDER = [
PARTICIPANTS, ACTORS, ACTOR_PARTICIPANTS, CRITERIA, EVAL_MODES,
TASKS, ITEMS, ARTIFACTS, ANSWERS,
]
self._convert_id = lambda r: self.abbrev + '-' + '0' * (8 - len(r)) + r if r!='None' else None
def load_to_staging_warehouse(self):
db_info = self.conf.get_staging_db_info()
connection = pymysql.connect(
host=db_info['host'], user=db_info['user'],
password=db_info['passwd'], db=db_info['db'],
)
connection.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
# TODO: ask Yang about peerlogic set up, why is there an error for duplicate
# entries in actor IDs, can't different roles share primary keys for IDs
for table in self.UPDATE_ORDER:
data = self.TABLES[table]()
if data:
print(f'Loading {table}...\n{self.TABLES[table]()}')
etl.todb(data, connection, table)
connection.close()
def get_actor_pariticipants(self):
return (
self._get_actor_pariticipants()
.convert('actor_id', str)
.convert('participant_id', str)
.convert('actor_id', self._convert_id)
.convert('participant_id', self._convert_id)
)
def get_actors(self):
return (
self._get_actors()
.convert('id', str)
.convert('role', str)
.convert('id', self._convert_id)
)
def get_answers(self):
return (
self._get_answers()
.convert('id', str)
.convert('assessee_artifact_id', str)
.convert('assessee_actor_id', str)
.convert('assessor_actor_id', str)
.convert('comment', str)
.convert('criterion_id', str)
.convert('id', self._convert_id)
.convert('assessor_actor_id', self._convert_id)
.convert('assessee_actor_id', self._convert_id)
.convert('assessee_artifact_id', self._convert_id)
.convert('criterion_id', self._convert_id)
)
def get_artifacts(self):
return self._get_artifacts()
def get_criteria(self):
return (
self._get_criteria()
.convert('id', str)
.convert('title', str)
.convert('id', self._convert_id)
)
def get_eval_modes(self):
return self._get_eval_modes()
def get_items(self):
return self._get_items()
def get_participants(self):
return (
self._get_participants()
.addcolumn('app_name', [], missing=self.app_name)
.convert('id', str)
.convert('app_name', str)
.convert('id', self._convert_id)
)
def get_tasks(self):
return self._get_tasks()
def _get_actor_pariticipants(self):
raise UnImplementedMethodError()
def _get_actors(self):
raise UnImplementedMethodError()
def _get_answers(self):
raise UnImplementedMethodError()
def _get_artifacts(self):
raise UnImplementedMethodError()
def _get_criteria(self):
raise UnImplementedMethodError()
def _get_eval_modes(self):
raise UnImplementedMethodError()
def _get_items(self):
raise UnImplementedMethodError()
def _get_participants(self):
raise UnImplementedMethodError()
def _get_tasks(self):
raise UnImplementedMethodError()
class UnImplementedMethodError(Exception):
pass