/
executor.py
174 lines (158 loc) · 6.99 KB
/
executor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
Module builds configurable FIFO data pipelines by building and executing tasks in the configured
order. All tasks must extend the "pipeline.task.Task" class. The order of the tasks is determined
by the order in which the arguments are passed to the module on the command line.
"""
import datetime
import importlib
from sys import argv as args

import commons.config as config
import task as pline_task
# Module-level logger obtained from the project's config helper; the pipeline functions below use
# it to report exceptions.
LOGGER = config.get_logger()
# Registry of the tasks that can be placed in a pipeline.
#   key   - the token passed on the command line to add the task to the pipeline
#   value - the fully qualified path of the class that implements the task
# The registry is kept in code for now; a future implementation could load it from a metadata
# table instead. "WRITE" and "FILE_WRITE" intentionally resolve to the same class.
TASKS = {
    "HDFS_READ": "pipeline.task.Reader",
    "HTTP_READ": "pipeline.task.HttpReader",
    "WRITE": "pipeline.task.Writer",
    "PARTITION_WRITE": "pipeline.task.PartitionWriter",
    "FILE_WRITE": "pipeline.task.Writer",
    "RECIPES_TRANSFORM": "pipeline.task.TransformRecipes",
    "BEEF_RECIPE_TRANSFORM": "pipeline.task.BeefRecipes",
    "EMAIL": "pipeline.task.EmailTask",
    "SLACK": "pipeline.task.SlackTask",
    "KAFKA": "pipeline.task.KafkaTask",
}
def execute_pipeline(task_list):
    """
    Run the given tasks one after another, in list order, handing the result of each task to the
    task that follows it.
    A task only receives a data frame when the previous result is not None, so the first task,
    and any task following one that returned None, is executed without set_data_frame() being
    called on it.
    Any exception raised by a task is logged and then re-raised to the caller.
    :param task_list: list
        Task objects in the order in which they are to be executed.
    :return: None
    """
    previous_result = None
    for current_task in task_list:
        try:
            # Hand over the upstream result, when there is one, before running the task.
            if previous_result is not None:
                current_task.set_data_frame(previous_result)
            previous_result = current_task.execute()
        except Exception as exception:
            LOGGER.exception(
                f"{datetime.datetime.now()} - An exception occurred during the execution of pipeline : {exception}")
            raise
def _load_task_class(qualified_name):
    """
    Resolve a fully qualified class path, such as "pipeline.task.Reader", to the class object.
    :param qualified_name: str
        Dotted path; the last component is the class name, everything before it is the module.
    :return: class
        The attribute of the imported module named by the last path component.
    """
    module_name, _, class_name = qualified_name.rpartition('.')
    return getattr(importlib.import_module(module_name), class_name)


def _read_task_arguments(task_iter, arg_names, task_name):
    """
    Consume one value from the argument iterator for every name in arg_names.
    :param task_iter: iterator of str
        The remaining pipeline arguments; it is advanced by len(arg_names) items.
    :param arg_names: tuple of str
        Keyword names expected by the task constructor, in the order they appear in the arguments.
    :param task_name: str
        Registered name of the task, used in the error message.
    :return: dict
        Mapping of keyword name to the value read for it.
    Raises TypeError, if the arguments run out before every name has received a value.
    """
    values = {}
    for position, arg_name in enumerate(arg_names):
        try:
            values[arg_name] = next(task_iter)
        except StopIteration:
            missing = ", ".join(arg_names[position:])
            raise TypeError(
                f"{task_name} missing required positional arguments: {missing}") from None
    return values


def build_pipeline(tasks, is_spark=True):
    """
    Function takes an argument list, identifies the tasks named in it and instantiates the task
    objects in the order in which they are to be executed. The argument list itself is not
    modified.
    Raises IndexError, if the argument list is empty (the application name is mandatory).
    Raises ValueError, if a given task is not in the registered tasks list TASKS.
    Raises TypeError, if a task is missing its arguments, cannot be constructed from them, or if
    the registered class is not a Task subclass.
    Exceptions raised while the tasks are being built are logged before they are re-raised.
    :param tasks: list of string
        The application name followed by the tasks and their parameters in the order to be
        executed. Task names are matched case-insensitively.
    :param is_spark: bool
        A flag to indicate spark job. Default set to true. When set, a SparkSession is obtained
        for the application name and injected into every task.
    :return: list
        A list containing task object in the sequence to be executed.
    """
    # Read instead of pop(): the caller's list (sys.argv when started from the command line) must
    # not change as a side effect of building the pipeline.
    app_name = tasks[0]
    task_iter = iter(tasks[1:])
    spark_session = config.get_spark_session(app_name) if is_spark else None
    task_list = []
    try:
        # The loop body advances the same iterator, so the arguments that belong to a task are
        # consumed together with the task name and never mistaken for the next task.
        for raw_name in task_iter:
            next_task = raw_name.upper()
            if next_task not in TASKS:
                raise ValueError(f"{next_task} is not a registered task.")
            # Getting class corresponding to the string.
            p_task = _load_task_class(TASKS[next_task])
            # PartitionWriter is tested before Writer: it takes an extra argument and would
            # otherwise be caught by the Reader/Writer branch if it derives from Writer.
            if issubclass(p_task, pline_task.PartitionWriter):
                # File format, location and the partition-by column.
                arg_names = ("file_format", "location", "partition_by")
            elif issubclass(p_task, (pline_task.Reader, pline_task.Writer)):
                # File format and location.
                arg_names = ("file_format", "location")
            elif issubclass(p_task, pline_task.Task):
                # Transformation and other plain tasks take no arguments.
                arg_names = ()
            else:
                # Fail here rather than letting a None "task" reach the pipeline.
                raise TypeError(f"{TASKS[next_task]} is not a pipeline Task subclass.")
            if arg_names:
                task_kwargs = _read_task_arguments(task_iter, arg_names, next_task)
                try:
                    task_instance = p_task(**task_kwargs)
                except Exception as error:
                    # Construction failures of argument-taking tasks have always surfaced as
                    # TypeError; that is kept, with the real cause chained for diagnosis.
                    raise TypeError(f"{next_task} could not be created: {error}") from error
            else:
                task_instance = p_task()
            if is_spark:
                # Injecting a SparkSession object to the class
                task_instance.set_spark_session(spark_session)
            # Position on the list determines the position of execution.
            task_list.append(task_instance)
    except Exception as exception:
        LOGGER.exception(f"{datetime.datetime.now()} - An exception occurred while building pipeline {exception}")
        raise
    # Returning task list for execution.
    return task_list
def initiate_pipeline():
    """
    Entry routine of the module: builds the data pipeline described by the command line arguments
    and executes it.
    Any exception raised while building or executing the pipeline is handed to notify() and is not
    re-raised.
    :return: NoneType
    """
    # Drop the script name; what remains is passed on to build_pipeline().
    args.pop(0)
    try:
        pipeline = build_pipeline(args)
        # Nothing to run when no task was built.
        if not pipeline:
            return
        execute_pipeline(pipeline)
    except Exception as exception:
        # NOTE(review): the failure is only reported, so this function returns normally after a
        # failed run - confirm that this is what the job scheduler expects.
        notify(exception)
def notify(exception):
    """
    Function sends notification to users with the failure report using the configured channels.
    It reads the comma separated task names stored under the "notify" key of
    config.get_notifications(), builds a non-spark pipeline from them and executes it.
    Whitespace around the names and empty entries are ignored, so a value such as "EMAIL, SLACK"
    works and an empty value results in no notification rather than an error.
    :param exception:
        The exception to be notified
    :return: None
    """
    # NOTE(review): `exception` is not referenced below, so it is not handed to the notification
    # tasks by this function - confirm how the tasks obtain the failure details.
    app_name = "Notification Task"
    configured = config.get_notifications()["notify"]
    stripped_names = (name.strip() for name in configured.split(','))
    channel_names = [name for name in stripped_names if name]
    # build_pipeline() expects the application name as the first element of the list.
    task_list = build_pipeline([app_name, *channel_names], False)
    execute_pipeline(task_list)
if __name__ == "__main__":
    # Entry point when the module is run as a script: build and execute the pipeline described
    # by the command line arguments.
    initiate_pipeline()