class HeftExecutor(FailRandom, BaseExecutor):
    """Event-driven executor that follows a HEFT schedule and re-plans on node events.

    A started task may randomly be turned into a node failure; node-failed and
    node-up events update the node state and trigger ``_reschedule``.

    ``post``, ``_check_fail``, ``_post_new_events`` and ``_reschedule`` are not
    defined in this class and are expected to be provided by the base classes.
    """

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion,
                 initial_schedule=None, logger=None):
        """Store the planner, the failure-duration parameters and the optional schedule.

        :param heft_planner: planner used to build (and rebuild) the schedule
        :param base_fail_duration: minimal downtime of a failed node
        :param base_fail_dispersion: upper bound of the random addition to the downtime
        :param initial_schedule: schedule to start from; ``None`` means "plan from scratch"
        :param logger: optional logger, only stored here
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule
        self.logger = logger

    def init(self):
        """Build the working schedule and post the events for it.

        Without an initial schedule the planner is run on an empty schedule.
        Otherwise the initial schedule is copied item by item, binding every
        item to the task with the same id taken from the planner's workflow.
        """
        if self.initial_schedule is None:
            empty_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(empty_schedule)
        else:
            id_to_task = {tsk.id: tsk
                          for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)}
            mapping = {
                node: [ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time)
                       for item in items]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self._post_new_events()

    def _generate_failtime_and_duration(self, item):
        """Return ``(time_of_fail, duration)`` for a failure during ``item``.

        ``time_of_fail`` is an offset relative to the current time, drawn
        uniformly from the remaining run time of ``item``; ``duration`` is the
        downtime of the node. The duration is drawn first, then the offset.
        """
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        return (time_of_fail, duration)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, post the failure.

        On failure a ``NodeFailed`` and a ``NodeUp`` event are posted and the
        already queued ``TaskFinished`` event of the task is dropped.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        (time_of_fail, duration) = self._generate_failtime_and_duration(item)
        # A non-positive offset would put the failure at or before "now";
        # push it slightly into the future instead.
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)

        # The task will not finish normally: remove its TaskFinished event.
        self.queue = deque(
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)
        )

    def _task_finished_handler(self, event):
        """Mark the executing item of the finished task as FINISHED."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, fail the task running on it and re-plan.

        :raises RuntimeError: if the node does not hold exactly one EXECUTING
            item for the failed task
        """
        self.heft_planner.resource_manager.node(event.node).state = Node.Down

        ## TODO: ambiguous choice
        ##self.current_schedule.change_state(event.task, ScheduleItem.FAILED)
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id
                     and item.state == ScheduleItem.EXECUTING]
        if len(executing) != 1:
            # Previously an empty result surfaced as a bare IndexError below.
            raise RuntimeError(
                "Expected exactly one executing item for task {0} on the failed node, "
                "found {1}".format(event.task.id, len(executing)))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Reset the state of the recovered node to ``Node.Unknown`` and re-plan."""
        self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
        self._reschedule(event)
class GAExecutor(FailRandom, BaseExecutor):
    """Executor that replays a precomputed (initial) schedule task by task.

    Tasks are started on the node assigned by ``initial_schedule`` once their
    parents and the earlier items of that node are finished; the actually
    observed timings are collected in ``current_schedule``.

    ``post`` and ``_check_fail`` are not defined in this class and are
    expected to be provided by the base classes.

    NOTE(review): this file defines ``GAExecutor`` a second time further down;
    that later definition rebinds the name. Confirm which copy should stay.
    """

    def __init__(self, workflow, resource_manager, estimator, base_fail_duration,
                 base_fail_dispersion, initial_schedule):
        """Store the collaborators and create an empty working schedule.

        :param workflow: workflow whose ``head_task`` is treated as already finished
        :param resource_manager: gives access to node objects via ``node(...)``
        :param estimator: used for ``estimate_transfer_time``
        :param base_fail_duration: minimal downtime of a failed node
        :param base_fail_dispersion: upper bound of the random addition to the downtime
        :param initial_schedule: the schedule to replay; its nodes become the keys
            of the working schedule
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        self.workflow = workflow
        self.resource_manager = resource_manager
        self.estimator = estimator
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = Schedule({key: [] for key in initial_schedule.mapping.keys()})
        # ids of finished tasks; the head task counts as finished from the start
        self.finished_tasks = [self.workflow.head_task.id]
        ## TODO: correct this stub later
        self.logger = None

    def init(self):
        """Post start/finish events for the tasks that are ready right after the head task."""
        unstarted_tasks = self.get_ready_tasks(self.workflow.head_task, None)
        self.post_new_events(unstarted_tasks)

    def is_ready(self, task):
        """Return True when every parent of ``task`` is in ``finished_tasks``."""
        return all(p.id in self.finished_tasks for p in task.parents)

    def is_next_to_run(self, task):
        """Return True when all items scheduled earlier on the task's node are finished.

        "Earlier" means a strictly smaller ``start_time`` in the initial schedule.
        """
        (node, item) = self.initial_schedule.place(task)
        return all(it.job.id in self.finished_tasks
                   for it in self.initial_schedule.mapping[node]
                   if it.start_time < item.start_time)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, post the failure.

        On failure a ``NodeFailed`` and a ``NodeUp`` event are posted and the
        already queued ``TaskFinished`` event of the task is dropped.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # The downtime is drawn first, then the failure offset.
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        # A non-positive offset would put the failure at or before "now".
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)

        # The task will not finish normally: remove its TaskFinished event.
        self.queue = deque(
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)
        )

    def _task_finished_handler(self, event):
        """Record the task as finished and start whatever became ready because of it."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
        self.finished_tasks.append(event.task.id)

        unstarted_items = self.get_ready_tasks(event.task, event.node)
        self.post_new_events(unstarted_items)

    def _node_failed_handler(self, event):
        """Mark the node as down and the task running on it as failed.

        :raises RuntimeError: if the node does not hold exactly one EXECUTING
            item for the failed task
        """
        self.resource_manager.node(event.node).state = Node.Down

        ## TODO: ambiguous choice
        ##self.current_schedule.change_state(event.task, ScheduleItem.FAILED)
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id
                     and item.state == ScheduleItem.EXECUTING]
        if len(executing) != 1:
            # Previously an empty result surfaced as a bare IndexError below.
            raise RuntimeError(
                "Expected exactly one executing item for task {0} on the failed node, "
                "found {1}".format(event.task.id, len(executing)))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

    def _node_up_handler(self, event):
        """Reset the recovered node's state and restart its first unfinished task.

        The task keeps the run time it has in the initial schedule and starts
        at the current time. Nothing is posted when every task of the node is
        already finished.
        """
        self.resource_manager.node(event.node).state = Node.Unknown

        # First item of this node (in initial-schedule order) that is not finished.
        # NOTE(review): readiness of its parents is not checked here - confirm intended.
        next_sched_item = next(
            (item for item in self.initial_schedule.mapping[event.node]
             if item.job.id not in self.finished_tasks),
            None)
        if next_sched_item is None:
            # Used to crash with AttributeError on the old ``[]`` sentinel.
            return

        runtime = next_sched_item.end_time - next_sched_item.start_time
        start_time = self.current_time
        end_time = start_time + runtime

        actual_sched_item = ScheduleItem(next_sched_item.job, start_time, end_time)
        self.post_new_events([actual_sched_item])

    def get_ready_tasks(self, ptask, pnode):
        """Return new schedule items for the tasks that can start after ``ptask``.

        Candidates are the children of ``ptask`` plus the item following
        ``ptask`` in the initial schedule, provided they are ready and next in
        line on their node. Finished or executing tasks and tasks whose node
        is down are skipped.

        :param ptask: the task that has just finished (or the head task)
        :param pnode: node of ``ptask``; currently unused
        :return: list of ``ScheduleItem`` with actual start and end times
        """
        next_for_ptask = self.initial_schedule.get_next_item(ptask)
        tsks = [tsk for tsk in ptask.children
                if self.is_ready(tsk) and self.is_next_to_run(tsk)]

        ##TODO: refactor it later
        if (next_for_ptask is not None
                and next_for_ptask.job not in tsks
                and self.is_ready(next_for_ptask.job)
                and self.is_next_to_run(next_for_ptask.job)):
            tsks.append(next_for_ptask.job)

        def appropriate_to_run(tsk):
            # tasks mustn't be finished or executing, and their node mustn't be down
            if tsk.id in self.finished_tasks:
                return False
            if self.current_schedule.is_executing(tsk):
                return False
            nd = self.initial_schedule.place(tsk)[0]
            if self.resource_manager.node(nd).state == Node.Down:
                return False
            return True

        unstarted_items = []
        for child in tsks:
            if not appropriate_to_run(child):
                continue
            (node, item) = self.initial_schedule.place(child)

            ## TODO: remake it later
            # Finished parents of the child, keyed by task id so that every
            # parent is counted once (the last matching item wins).
            pids = [p.id for p in child.parents]
            finished_parents = {
                it.job.id: (pnd, it)
                for (pnd, items) in self.current_schedule.mapping.items()
                for it in items
                if (it.job.id in pids) and (it.state == ScheduleItem.FINISHED)
            }
            # Moment at which the output of each finished parent reaches ``node``.
            estms = [it.end_time + self.estimator.estimate_transfer_time(pnd, node, it.job, child)
                     for (pnd, it) in finished_parents.values()]
            transf_end = max(estms, default=0)

            runtime = item.end_time - item.start_time
            start_time = max(self.current_time, transf_end)
            end_time = start_time + runtime

            unstarted_items.append(ScheduleItem(item.job, start_time, end_time))
        return unstarted_items

    def post_new_events(self, unstarted_items):
        """Post TaskStart/TaskFinished events for the items and record them.

        Every item is appended to ``current_schedule`` under the node that the
        initial schedule assigns to its job.
        """
        for item in unstarted_items:
            node = self.initial_schedule.place(item.job)[0]

            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

            self.current_schedule.mapping[node].append(item)
class GAExecutor(FailRandom, BaseExecutor):
    """Executor that replays a precomputed (initial) schedule task by task.

    Tasks are started on the node assigned by ``initial_schedule`` once their
    parents and the earlier items of that node are finished; the actually
    observed timings are collected in ``current_schedule``.

    ``post`` and ``_check_fail`` are not defined in this class and are
    expected to be provided by the base classes.

    NOTE(review): this file also defines ``GAExecutor`` earlier; this later
    definition rebinds the name. Confirm which copy should stay.
    """

    def __init__(self, workflow, resource_manager, estimator, base_fail_duration,
                 base_fail_dispersion, initial_schedule):
        """Store the collaborators and create an empty working schedule.

        :param workflow: workflow whose ``head_task`` is treated as already finished
        :param resource_manager: gives access to node objects via ``node(...)``
        :param estimator: used for ``estimate_transfer_time``
        :param base_fail_duration: minimal downtime of a failed node
        :param base_fail_dispersion: upper bound of the random addition to the downtime
        :param initial_schedule: the schedule to replay; its nodes become the keys
            of the working schedule
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        self.workflow = workflow
        self.resource_manager = resource_manager
        self.estimator = estimator
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = Schedule({key: [] for key in initial_schedule.mapping.keys()})
        # ids of finished tasks; the head task counts as finished from the start
        self.finished_tasks = [self.workflow.head_task.id]
        ## TODO: correct this stub later
        self.logger = None

    def init(self):
        """Post start/finish events for the tasks that are ready right after the head task."""
        unstarted_tasks = self.get_ready_tasks(self.workflow.head_task, None)
        self.post_new_events(unstarted_tasks)

    def is_ready(self, task):
        """Return True when every parent of ``task`` is in ``finished_tasks``."""
        return all(p.id in self.finished_tasks for p in task.parents)

    def is_next_to_run(self, task):
        """Return True when all items scheduled earlier on the task's node are finished.

        "Earlier" means a strictly smaller ``start_time`` in the initial schedule.
        """
        (node, item) = self.initial_schedule.place(task)
        return all(it.job.id in self.finished_tasks
                   for it in self.initial_schedule.mapping[node]
                   if it.start_time < item.start_time)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, post the failure.

        On failure a ``NodeFailed`` and a ``NodeUp`` event are posted and the
        already queued ``TaskFinished`` event of the task is dropped.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # The downtime is drawn first, then the failure offset.
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        # A non-positive offset would put the failure at or before "now".
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)

        # The task will not finish normally: remove its TaskFinished event.
        self.queue = deque(
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)
        )

    def _task_finished_handler(self, event):
        """Record the task as finished and start whatever became ready because of it."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
        self.finished_tasks.append(event.task.id)

        unstarted_items = self.get_ready_tasks(event.task, event.node)
        self.post_new_events(unstarted_items)

    def _node_failed_handler(self, event):
        """Mark the node as down and the task running on it as failed.

        :raises RuntimeError: if the node does not hold exactly one EXECUTING
            item for the failed task
        """
        self.resource_manager.node(event.node).state = Node.Down

        ## TODO: ambiguous choice
        ##self.current_schedule.change_state(event.task, ScheduleItem.FAILED)
        executing = [item for item in self.current_schedule.mapping[event.node]
                     if item.job.id == event.task.id
                     and item.state == ScheduleItem.EXECUTING]
        if len(executing) != 1:
            # Previously an empty result surfaced as a bare IndexError below.
            raise RuntimeError(
                "Expected exactly one executing item for task {0} on the failed node, "
                "found {1}".format(event.task.id, len(executing)))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

    def _node_up_handler(self, event):
        """Reset the recovered node's state and restart its first unfinished task.

        The task keeps the run time it has in the initial schedule and starts
        at the current time. Nothing is posted when every task of the node is
        already finished.
        """
        self.resource_manager.node(event.node).state = Node.Unknown

        # First item of this node (in initial-schedule order) that is not finished.
        # NOTE(review): readiness of its parents is not checked here - confirm intended.
        next_sched_item = next(
            (item for item in self.initial_schedule.mapping[event.node]
             if item.job.id not in self.finished_tasks),
            None)
        if next_sched_item is None:
            # Used to crash with AttributeError on the old ``[]`` sentinel.
            return

        runtime = next_sched_item.end_time - next_sched_item.start_time
        start_time = self.current_time
        end_time = start_time + runtime

        actual_sched_item = ScheduleItem(next_sched_item.job, start_time, end_time)
        self.post_new_events([actual_sched_item])

    def get_ready_tasks(self, ptask, pnode):
        """Return new schedule items for the tasks that can start after ``ptask``.

        Candidates are the children of ``ptask`` plus the item following
        ``ptask`` in the initial schedule, provided they are ready and next in
        line on their node. Finished or executing tasks and tasks whose node
        is down are skipped.

        :param ptask: the task that has just finished (or the head task)
        :param pnode: node of ``ptask``; currently unused
        :return: list of ``ScheduleItem`` with actual start and end times
        """
        next_for_ptask = self.initial_schedule.get_next_item(ptask)
        tsks = [tsk for tsk in ptask.children
                if self.is_ready(tsk) and self.is_next_to_run(tsk)]

        ##TODO: refactor it later
        if (next_for_ptask is not None
                and next_for_ptask.job not in tsks
                and self.is_ready(next_for_ptask.job)
                and self.is_next_to_run(next_for_ptask.job)):
            tsks.append(next_for_ptask.job)

        def appropriate_to_run(tsk):
            # tasks mustn't be finished or executing, and their node mustn't be down
            if tsk.id in self.finished_tasks:
                return False
            if self.current_schedule.is_executing(tsk):
                return False
            nd = self.initial_schedule.place(tsk)[0]
            if self.resource_manager.node(nd).state == Node.Down:
                return False
            return True

        unstarted_items = []
        for child in tsks:
            if not appropriate_to_run(child):
                continue
            (node, item) = self.initial_schedule.place(child)

            ## TODO: remake it later
            # Finished parents of the child, keyed by task id so that every
            # parent is counted once (the last matching item wins).
            pids = [p.id for p in child.parents]
            finished_parents = {
                it.job.id: (pnd, it)
                for (pnd, items) in self.current_schedule.mapping.items()
                for it in items
                if (it.job.id in pids) and (it.state == ScheduleItem.FINISHED)
            }
            # Moment at which the output of each finished parent reaches ``node``.
            estms = [it.end_time + self.estimator.estimate_transfer_time(pnd, node, it.job, child)
                     for (pnd, it) in finished_parents.values()]
            transf_end = max(estms, default=0)

            runtime = item.end_time - item.start_time
            start_time = max(self.current_time, transf_end)
            end_time = start_time + runtime

            unstarted_items.append(ScheduleItem(item.job, start_time, end_time))
        return unstarted_items

    def post_new_events(self, unstarted_items):
        """Post TaskStart/TaskFinished events for the items and record them.

        Every item is appended to ``current_schedule`` under the node that the
        initial schedule assigns to its job.
        """
        for item in unstarted_items:
            node = self.initial_schedule.place(item.job)[0]

            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

            self.current_schedule.mapping[node].append(item)
class CloudHeftExecutor(EventMachine):
    """HEFT executor that replicates tasks on public (cloud) nodes for reliability.

    When a task starts on a dedicated node, free cloud nodes are added as
    replicas until the combined reliability reaches ``desired_reliability``.
    ``register`` remembers, per task, whether it is still running or finished.

    ``post`` is not defined in this class and is expected to come from
    ``EventMachine``.

    NOTE(review): the nesting of a few statements was reconstructed from a
    whitespace-mangled source; the spots that could not be determined with
    certainty are marked with ``NOTE(review)`` below.
    """

    STATUS_RUNNING = 'running'
    STATUS_FINISHED = 'finished'

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion,
                 desired_reliability, public_resource_manager, initial_schedule=None):
        """Store the planner, the failure parameters and the cloud resource manager.

        :param heft_planner: planner used to build (and rebuild) the schedule
        :param base_fail_duration: minimal downtime of a failed node
        :param base_fail_dispersion: upper bound of the random addition to the downtime
        :param desired_reliability: reliability level to reach by adding cloud replicas
        :param public_resource_manager: manager of the cloud nodes
        :param initial_schedule: schedule to start from; ``None`` means "plan from scratch"
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.desired_reliability = desired_reliability
        self.public_resources_manager = public_resource_manager
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        # task -> STATUS_RUNNING / STATUS_FINISHED
        self.register = dict()

    def init(self):
        """Build the working schedule and post the events for it.

        Without an initial schedule the planner is run on an empty schedule.
        Otherwise the initial schedule is copied item by item, binding every
        item to the task with the same id taken from the planner's workflow.
        """
        if self.initial_schedule is None:
            empty_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(empty_schedule)
        else:
            id_to_task = {tsk.id: tsk
                          for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)}
            mapping = {
                node: [ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time)
                       for item in items]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self.post_new_events()

    def event_arrived(self, event):
        """Handle one event: TaskStart, TaskFinished, NodeFailed or NodeUp.

        Always returns ``None``.

        :raises RuntimeError: on NodeFailed of a dedicated node that does not
            hold exactly one EXECUTING item for the failed task
        """

        def reschedule(event):
            # Re-plan everything that has not started yet and post its events.
            self.heft_planner.current_time = self.current_time
            current_cleaned_schedule = self.clean_events(event)
            self.current_schedule = self.heft_planner.run(current_cleaned_schedule)
            self.post_new_events()

        def check_fail(reliability):
            # A failure happens when the random draw exceeds the reliability.
            res = random.random()
            if res > reliability:
                return True
            return False

        if isinstance(event, TaskStart):
            # TODO: if node is cloud node, do nothing
            prm = self.public_resources_manager
            if prm.isCloudNode(event.node):
                return None

            (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
            item.state = ScheduleItem.EXECUTING

            # public_resources_manager:
            #   determine nodes of proper soft type
            #   check and determine free nodes
            #   determine reliability of every node
            #   determine time_of_execution probability for (task, node) pair

            # try to find nodes in cloud
            if event.task not in self.register:
                proper_nodes = prm.get_by_softreq(event.task.soft_reqs)
                proper_nodes = [candidate for candidate in proper_nodes
                                if not prm.isBusy(candidate)]
                sorted_proper_nodes = sorted(proper_nodes,
                                             key=lambda x: prm.get_reliability(x.name))
                current_set = []

                base_reliability = self.heft_planner.estimator.estimate_reliability(
                    event.task, event.node)
                dt = item.end_time - item.start_time

                def calc(node, dt):
                    # (dt, task, node, transfer_estimation)
                    # TODO: add proper transfer time here
                    fp = prm.get_reliability(node.name)
                    comp_time = self.heft_planner.estimator.estimate_runtime(event.task, node)
                    cp = prm.probability_estimator(dt, comp_time, 0)
                    return (node, fp, cp)

                # Add cloud nodes one by one until the combined reliability
                # of the dedicated node and the replicas is high enough.
                for pnode in sorted_proper_nodes:
                    common_reliability = 1 - base_reliability
                    #TODO: refactor this later
                    if 1 - common_reliability >= self.desired_reliability:
                        break
                    res = calc(pnode, dt)
                    current_set.append(res)
                    #TODO: add density law of probability for dedicated resource
                    for (nd, fp, cp) in current_set:
                        common_reliability *= (1 - fp * cp)
                    common_reliability = 1 - common_reliability
                    if common_reliability >= self.desired_reliability:
                        break

                def frange(x, y, jump):
                    while x < y:
                        yield x
                        x += jump

                for (nd, fp, cp) in current_set:
                    comp_time = self.heft_planner.estimator.estimate_runtime(event.task, nd)
                    #sigma 0.1*M lets take 0.6*M
                    #TODO: uncomment it later
                    ints = [(i, calc(nd, i))
                            for i in frange(0, comp_time + 0.2 * comp_time, 0.05 * comp_time)]

                    # Pick the first run time whose estimated probability
                    # exceeds the random draw; fall back to the estimate.
                    rd = random.random()
                    generated_comp_time = comp_time
                    for (i, p) in ints:
                        if p[2] > rd:
                            generated_comp_time = i
                            break

                    if check_fail(fp):
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd
                        self.post(event_start)

                        duration = (self.base_fail_duration
                                    + self.base_fail_dispersion * random.random())
                        time_of_fail = generated_comp_time * random.random()
                        time_of_fail = self.current_time + (
                            time_of_fail if time_of_fail > 0 else 0.01)

                        event_failed = NodeFailed(nd, event.task)
                        event_failed.time_happened = time_of_fail

                        event_nodeup = NodeUp(nd)
                        event_nodeup.time_happened = time_of_fail + duration

                        self.post(event_failed)
                        self.post(event_nodeup)
                    else:
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd

                        event_finish = TaskFinished(event.task)
                        event_finish.time_happened = self.current_time + generated_comp_time
                        event_finish.node = nd

                        self.post(event_start)
                        self.post(event_finish)

                    prm.checkBusy(nd, True)

                # NOTE(review): it could not be determined from the original
                # text whether this assignment sits inside the loop above or
                # after it; it only matters when no cloud node was chosen.
                self.register[event.task] = CloudHeftExecutor.STATUS_RUNNING

            reliability = self.heft_planner.estimator.estimate_reliability(event.task, node)
            if check_fail(reliability):
                # generate fail time, post it
                duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                time_of_fail = (item.end_time - self.current_time) * random.random()
                time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

                event_failed = NodeFailed(node, event.task)
                event_failed.time_happened = time_of_fail

                event_nodeup = NodeUp(node)
                event_nodeup.time_happened = time_of_fail + duration

                self.post(event_failed)
                self.post(event_nodeup)

                # remove the TaskFinished event of the dedicated node only
                self.queue = deque(
                    ev for ev in self.queue
                    if not (isinstance(ev, TaskFinished)
                            and ev.task.id == event.task.id
                            and not prm.isCloudNode(ev.node))
                )
            return None

        if isinstance(event, TaskFinished):
            # check if it cloud task
            # if task cloud and first: register as finished, check node in dedicated as finish,
            #   remove appropriate event of failure or task finished for dedicated,
            #   free cloud node, reschedule, end_of_function
            # if task cloud and not first: free cloud node, end_of_function
            # if task not cloud and first: register as finished, check node in dedicated
            #   as finish, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)

            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_RUNNING:
                self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED

                ## TODO: correct it
                ## if event.task failed and went through rescheduling,
                ## it would be possible that currently ScheduleItem of event.task
                ## on dedicated resource has UNSTARTED state.
                ## TODO: add additional functional to schedule to record such situations
                ## and validate it after
                self.current_schedule.change_state_executed_with_end_time(
                    event.task, ScheduleItem.FINISHED, self.current_time)

                pair = self.current_schedule.place_single(event.task)
                # NOTE(review): the ``else`` below is attached to the outer
                # ``if`` and the queue filter to the inner one; the original
                # nesting could not be recovered with certainty - confirm.
                if pair is not None:
                    ## TODO: The bug is here. Fix it later.
                    ## the unstarted case must be taken into account in schedule
                    ## and in the validity check procedure too
                    (nd, item) = pair
                    if item.state == ScheduleItem.EXECUTING:
                        item.start_time = event.time_happened
                        item.end_time = event.time_happened
                        item.state = ScheduleItem.FINISHED
                        # Keep the queue a deque, as everywhere else in this
                        # class (it used to be rebound to a plain list here).
                        self.queue = deque(
                            ev for ev in self.queue
                            if not (not isinstance(ev, NodeUp)
                                    and ev.task.id == event.task.id)
                        )
                else:
                    prm.checkBusy(event.node, False)
                    return None

                def check(ev):
                    # Drop the finish/failure events of this task on dedicated nodes.
                    if isinstance(ev, TaskFinished) or isinstance(ev, NodeFailed):
                        if ev.task.id == event.task.id and not prm.isCloudNode(ev.node):
                            return False
                    ## TODO: make it later
                    ##if isinstance(ev, NodeUp):
                    return True

                # Same as above: preserve the deque type of the queue.
                self.queue = deque(ev for ev in self.queue if check(ev))

                prm.checkBusy(event.node, False)
                reschedule(event)
                return None

            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_FINISHED:
                prm.checkBusy(event.node, False)
                return None

            # check task finished
            self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
            self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
            return None

        if isinstance(event, NodeFailed):
            # check if cloud node
            # if cloud node: check as down, free node, end_of_function
            # if not cloud node: check as down, reschedule, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)

            if from_cloud:
                prm.checkDown(event.node.name, True)
                prm.checkBusy(event.node, False)
                return None

            # check node down
            self.heft_planner.resource_manager.node(event.node).state = Node.Down

            ## TODO: ambiguous choice
            ##self.current_schedule.change_state(event.task, ScheduleItem.FAILED)
            executing = [item for item in self.current_schedule.mapping[event.node]
                         if item.job.id == event.task.id
                         and item.state == ScheduleItem.EXECUTING]
            if len(executing) != 1:
                # Previously an empty result surfaced as a bare IndexError below.
                raise RuntimeError(
                    "Expected exactly one executing item for task {0} on the failed node, "
                    "found {1}".format(event.task.id, len(executing)))

            failed_item = executing[0]
            failed_item.state = ScheduleItem.FAILED
            failed_item.end_time = self.current_time

            reschedule(event)
            return None

        if isinstance(event, NodeUp):
            # check if cloud
            # if cloud: check node up, end_of_function
            # if not cloud: check as up, reschedule, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, False)
                return None

            # check node up
            self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
            reschedule(event)
            return None

        return None

    def post_new_events(self):
        """Post TaskStart/TaskFinished events for every UNSTARTED item of the schedule."""
        unstarted_items = set()
        for (node, items) in self.current_schedule.mapping.items():
            for item in items:
                if item.state == ScheduleItem.UNSTARTED:
                    unstarted_items.add((node, item))

        for (node, item) in unstarted_items:
            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

    def clean_events(self, event):
        """Drop UNSTARTED items and their queued events; return the cleaned schedule.

        For a NodeFailed event the failed task is cleaned as well. Only
        TaskStart/TaskFinished events on non-cloud nodes are removed from the
        queue.

        :return: a new ``Schedule`` holding only the items that are not UNSTARTED
        """
        # remove all unstarted tasks
        cleaned_task = set()
        if isinstance(event, NodeFailed):
            cleaned_task = set([event.task])

        new_mapping = dict()
        for (node, items) in self.current_schedule.mapping.items():
            new_mapping[node] = []
            for item in items:
                if item.state != ScheduleItem.UNSTARTED:
                    new_mapping[node].append(item)
                else:
                    cleaned_task.add(item.job)
        clean_schedule = Schedule(new_mapping)

        # remove all events associated with these tasks
        prm = self.public_resources_manager

        def check(ev):
            if (isinstance(ev, TaskStart) and ev.task in cleaned_task
                    and not prm.isCloudNode(ev.node)):
                return False
            if (isinstance(ev, TaskFinished) and ev.task in cleaned_task
                    and not prm.isCloudNode(ev.node)):
                return False
            return True

        self.queue = deque(evnt for evnt in self.queue if check(evnt))
        return clean_schedule
class GaHeftExecutor(FailRandom, BaseExecutor): #@trace def __init__(self, **kwargs): super().__init__(**kwargs) self.workflow = kwargs["wf"] self.resource_manager = kwargs["resource_manager"] # DynamicHeft # both planners have acess to resource manager and estimator self.heft_planner = kwargs["heft_planner"] self.base_fail_duration = kwargs["base_fail_duration"] self.base_fail_dispersion = kwargs["base_fail_dispersion"] self.current_schedule = None self.fixed_interval_for_ga = kwargs["fixed_interval_for_ga"] self.ga_builder = kwargs["ga_builder"] self.replace_anyway = kwargs.get("replace_anyway", True) self.back_cmp = None pass def init(self): self.current_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()}) initial_schedule = self.heft_planner.run(Schedule({node: [] for node in self.heft_planner.get_nodes()})) # print("heft solution!") # fsh = [hash(key) for key in initial_schedule.mapping.keys()] # rm_hashes = [hash(node) for node in self.resource_manager.get_nodes()] # if any(((h not in fsh) for h in rm_hashes)): # raise Exception("Fixed schedule is broken") # TODO: change these two ugly records result = self.ga_builder()(self.current_schedule, initial_schedule) # print("Ga solution is broken!") # fsh = [hash(key) for key in result[0][2].mapping.keys()] # rm_hashes = [hash(node) for node in self.resource_manager.get_nodes()] # if any(((h not in fsh) for h in rm_hashes)): # raise Exception("Fixed schedule is broken") if not self._apply_mh_if_better(None, heuristic_resulted_schedule=initial_schedule, metaheuristic_resulted_schedule=result[0][2]): self.current_schedule = initial_schedule self._post_new_events() # print("Before Before!") # fsh = [hash(key) for key in self.current_schedule.mapping.keys()] # rm_hashes = [hash(node) for node in self.resource_manager.get_nodes()] # if any(((h not in fsh) for h in rm_hashes)): # raise Exception("Fixed schedule is broken") #self.current_schedule = result[0][2] #self._post_new_events() return result 
def _task_start_handler(self, event):
    """Mark the started task as executing and possibly schedule a node failure.

    If a background GA result is due at the current time and it replaces the
    current schedule, the event is dropped without further processing.
    """
    if self._check_event_for_ga_result(event):
        return

    node, item = self.current_schedule.place_by_time(event.task, event.time_happened)
    item.state = ScheduleItem.EXECUTING

    if not self._is_a_fail_possible():
        return
    if not self._check_fail(event.task, node):
        return

    # The outage length is drawn first, then the moment of failure inside the
    # remaining execution window of the item.
    outage = self.base_fail_duration + self.base_fail_dispersion * random.random()
    offset = (item.end_time - self.current_time) * random.random()
    fail_time = self.current_time + (offset if offset > 0 else 0.01)

    node_failed = NodeFailed(node, event.task)
    node_failed.time_happened = fail_time
    node_up = NodeUp(node)
    node_up.time_happened = fail_time + outage

    self.post(node_failed)
    self.post(node_up)


def _task_finished_handler(self, event):
    """Mark the task as finished, then check whether a GA result is due."""
    self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
    self._check_event_for_ga_result(event)


def _node_failed_handler(self, event):
    """Handle a node failure: fail the running item, reschedule, restart GA.

    Raises:
        Exception: if the failed node does not hold exactly one EXECUTING
            item of the failed task.
    """
    if not self._is_a_fail_possible():
        return

    def is_unrelated(ev):
        # Keep everything except the TaskFinished event of the failed task.
        return not isinstance(ev, TaskFinished) or ev.task.id != event.task.id

    self._remove_events(is_unrelated)

    ## interrupt ga
    self._stop_ga()

    self.resource_manager.node(event.node).state = Node.Down

    ## TODO: ambigious choice
    executing = [
        item for item in self.current_schedule.mapping[event.node]
        if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING
    ]
    if len(executing) != 1:
        raise Exception(" Trouble in finding of the task: count of found tasks {0}".format(len(executing)))

    failed_item = executing[0]
    failed_item.state = ScheduleItem.FAILED
    failed_item.end_time = self.current_time

    # run HEFT first, then start GA on top of the new schedule
    self._reschedule(event)
    self._run_ga_in_background(event)


def _node_up_handler(self, event):
    """Handle a node recovery: mark the node Unknown, reschedule, restart GA."""
    ## interrupt ga
    self._stop_ga()

    self.heft_planner.resource_manager.node(event.node).state = Node.Unknown

    self._reschedule(event)
    self._run_ga_in_background(event)


def _stop_ga(self):
    """Drop the pending background GA computation, if any."""
    self.back_cmp = None


def _actual_ga_run(self):
    """Run GA on the data stored in ``self.back_cmp`` and return its result.

    The fixed schedule stored in ``back_cmp`` is still valid here: any
    invalidating event (such as a node failure) drops the pending computation
    before this method can be reached.
    """
    planner = self.ga_builder()
    result = planner(self.back_cmp.fixed_schedule,
                     self.back_cmp.current_schedule,
                     self.current_time)
    print("CURRENT MAKESPAN: {0}".format(Utility.makespan(result[0][2])))
    return result


def _check_event_for_ga_result(self, event):
    """Apply the background GA result if it is due at the current time.

    Returns:
        The value of ``_apply_mh_if_better`` when a result was obtained,
        otherwise False.
    """
    if self.back_cmp is None:
        return False
    if self.back_cmp.time_to_stop != self.current_time:
        return False

    print("Event {0}".format(event))
    if isinstance(event, TaskStart):
        print("Task id {0}".format(event.task.id))

    result = self._actual_ga_run()
    if result is None:
        return False
    return self._apply_mh_if_better(event,
                                    heuristic_resulted_schedule=self.current_schedule,
                                    metaheuristic_resulted_schedule=result[0][2])


def _replace_current_schedule(self, event, new_schedule):
    """Swap in ``new_schedule`` and post its events.

    ``new_schedule`` is assumed to be already synchronized with the fixed
    part of the old schedule. Events of the old schedule are cleaned only
    when an event is given.
    """
    if event is not None:
        self._clean_events(event)
    self.current_schedule = new_schedule
    self._post_new_events()
    self.back_cmp = None


def _apply_mh_if_better(self, event, heuristic_resulted_schedule, metaheuristic_resulted_schedule):
    """Adopt the metaheuristic schedule if forced or if its makespan is lower.

    Returns:
        True when the schedule was replaced (for a TaskStart event this means
        further processing must be skipped), False otherwise.
    """
    mh_makespan = Utility.makespan(metaheuristic_resulted_schedule)
    h_makespan = Utility.makespan(heuristic_resulted_schedule)
    print("Replace anyway - {0}".format(self.replace_anyway))

    replace = self.replace_anyway is True or mh_makespan < h_makespan
    if not replace:
        ## TODO: run_ga_yet_another_with_old_genome
        self.back_cmp = None
        return False

    ## generate new events
    self._replace_current_schedule(event, metaheuristic_resulted_schedule)
    return True


def _is_a_fail_possible(self):
    """Tell whether failures may be generated; currently always True."""
    return True


def _run_ga_in_background(self, event):
    """Register a new background GA computation for the current schedule.

    Nothing is registered when every node is down, when no schedule item ends
    after the fixed GA interval, or when the fixed part of the schedule
    already contains all tasks of the workflow.

    Raises:
        Exception: if the fixed schedule lacks a node known to the resource
            manager.
    """
    if not any(nd.state != Node.Down for nd in self.resource_manager.get_nodes()):
        return

    schedule_now = self.current_schedule
    now = self.current_time
    ## TODO: replace by log call
    print("Time: " + str(now) + " Creating reschedule point ")

    ## there can be several events in one time
    ## we choose the first to handle background GA run
    def find_front_item(schedule, start_time, fixed_interval):
        deadline = start_time + fixed_interval
        front = ScheduleItem.MIN_ITEM()
        for (_node, items) in schedule.mapping.items():
            ## It accounts case when event_time appears in a transfer gap(rare situation for all nodes)
            ## TODO: compare with some precison
            candidate = next(
                (it for it in items if deadline < it.end_time < front.end_time),
                None)
            if candidate is not None:
                front = candidate
        if front.job is None:
            return None
        print("Time: " + str(start_time) + " reschedule point have been founded st:" +
              str(front.start_time) + " end:" + str(front.end_time))
        return front

    def build_fixed_schedule(schedule, front_event):
        def belongs_to_fixed_part(item):
            # hard to resolve corner case. The simulator doesn't guarantee the order of appearing events.
            ## TODO: Urgent!!! experimental change (the state check). Perhaps, It should be removed from here later.
            return (item.start_time < front_event.end_time
                    or item.state == ScheduleItem.FINISHED
                    or item.state == ScheduleItem.FAILED)

        def with_projected_state(item):
            # Works on a copy so that the live schedule item is not modified.
            projected = ScheduleItem.copy(item)
            unfinished = (projected.state == ScheduleItem.EXECUTING
                          or projected.state == ScheduleItem.UNSTARTED)
            ## TODO: Urgent!: dangerous place
            if unfinished:
                if projected.end_time <= front_event.end_time:
                    projected.state = ScheduleItem.FINISHED
                elif projected.end_time > front_event.end_time:
                    projected.state = ScheduleItem.EXECUTING
            return projected

        fixed_mapping = {}
        for (key, items) in schedule.mapping.items():
            fixed_mapping[key] = [with_projected_state(item)
                                  for item in items if belongs_to_fixed_part(item)]
        return Schedule(fixed_mapping)

    ## TODO: make previous_result used
    def start_background_ga(schedule):
        front_event = find_front_item(schedule, now, self.fixed_interval_for_ga)
        # we can't meet the end of computation so we do nothing
        if front_event is None:
            print("GA's computation isn't able to meet the end of computation")
            return

        fixed_schedule = build_fixed_schedule(schedule, front_event)

        #TODO: It isn't a good reliable solution. It should be reconsider later.
        fixed_ids = set(fixed_schedule.get_all_unique_tasks_id())
        all_ids = set(task.id for task in self.workflow.get_all_unique_tasks())

        ## TODO: urgent bugfix to correctly run GaHeftvsHeft
        if len(fixed_ids) == len(all_ids):
            print("Fixed schedule is complete. There is no use to run ga.")
            return

        fixed_hashes = [hash(key) for key in fixed_schedule.mapping.keys()]
        for known_node in self.resource_manager.get_nodes():
            if hash(known_node) not in fixed_hashes:
                raise Exception("Fixed schedule is broken")

        self.back_cmp = BackCmp(fixed_schedule, None, self.current_schedule,
                                event, now, front_event.end_time)

    # A computation that is still pending is superseded by the new one.
    if self.back_cmp is not None:
        self.back_cmp = None
    start_background_ga(schedule_now)
# CloudHeftExecutor: an EventMachine subclass that executes a HEFT-planned
# schedule on dedicated nodes and, when a task starts, additionally launches
# replicas of it on free public ("cloud") nodes until the combined reliability
# estimate reaches ``desired_reliability``.
#
# Constructor arguments are stored as-is; ``self.register`` maps a task to
# STATUS_RUNNING / STATUS_FINISHED once cloud replicas were considered for it.
# ``init()`` builds the schedule (via ``heft_planner.run`` when no initial
# schedule is given, otherwise by re-binding the initial schedule's items to
# the planner's workflow tasks) and posts the events of all UNSTARTED items.
#
# ``event_arrived`` handles four event types:
#   * TaskStart    - ignored for cloud nodes. Otherwise the schedule item is
#                    marked EXECUTING; if the task is not yet in the register,
#                    cloud replicas are posted (TaskStart plus either
#                    TaskFinished or NodeFailed/NodeUp); finally a
#                    NodeFailed/NodeUp pair is posted for the dedicated node
#                    when the random reliability check fails, and the
#                    dedicated TaskFinished event of the task is removed.
#   * TaskFinished - a cloud finish for a task still registered as running
#                    marks the task finished in the schedule; a cloud finish
#                    for an already finished task only frees the cloud node;
#                    a dedicated finish marks the item FINISHED.
#   * NodeFailed   - a cloud node is flagged down and freed; a dedicated node
#                    is set to Node.Down, its executing item is marked FAILED
#                    and the schedule is rebuilt.
#   * NodeUp       - a cloud node is flagged up; a dedicated node is set to
#                    Node.Unknown and the schedule is rebuilt.
# ``clean_events`` drops UNSTARTED items from the schedule and removes the
# non-cloud TaskStart/TaskFinished events of the dropped tasks from the queue.
#
# NOTE(review): line breaks and indentation of this class were lost. In the
#   TaskFinished branch the nesting of the first ``self.queue = [...]`` filter
#   and of the following ``else:`` cannot be determined from this text, so the
#   code below is left untouched - restore the layout from version control
#   before changing behaviour.
# NOTE(review): the TaskFinished branch rebinds ``self.queue`` to a plain list,
#   while ``__init__``, the TaskStart branch and ``clean_events`` use a deque -
#   confirm EventMachine does not rely on deque-only methods.
# NOTE(review): in the NodeFailed branch ``if len(it) != 1: pass`` ignores a
#   failed lookup and ``it[0]`` then raises IndexError when nothing matched;
#   the TODO there asks for an explicit exception.
# NOTE(review): in ``post_new_events`` the list ``events_to_post`` is never
#   filled and the bare ``events_to_post`` expression has no effect.
class CloudHeftExecutor(EventMachine): STATUS_RUNNING = 'running' STATUS_FINISHED = 'finished' def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion, desired_reliability, public_resource_manager, initial_schedule=None): ## TODO: remake it later self.queue = deque() self.current_time = 0 # DynamicHeft self.heft_planner = heft_planner self.base_fail_duration = base_fail_duration self.base_fail_dispersion = base_fail_dispersion self.desired_reliability = desired_reliability self.public_resources_manager = public_resource_manager #self.current_schedule = Schedule({node: [] for node in heft_planner.get_nodes()}) self.initial_schedule = initial_schedule self.current_schedule = initial_schedule self.register = dict() def init(self): #self.current_schedule = self.heft_planner.run(self.current_schedule) if self.initial_schedule is None: self.current_schedule = Schedule( {node: [] for node in self.heft_planner.get_nodes()}) self.current_schedule = self.heft_planner.run( self.current_schedule) else: id_to_task = { tsk.id: tsk for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow) } mapping = { node: [ ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time) for item in items ] for (node, items) in self.initial_schedule.mapping.items() } self.current_schedule = Schedule(mapping) self.post_new_events() def event_arrived(self, event): def reschedule(event): self.heft_planner.current_time = self.current_time current_cleaned_schedule = self.clean_events(event) self.current_schedule = self.heft_planner.run( current_cleaned_schedule) self.post_new_events() def check_fail(reliability): res = random.random() if res > reliability: return True return False if isinstance(event, TaskStart): # TODO: if node is cloud node, do nothing prm = self.public_resources_manager if prm.isCloudNode(event.node): return None # check if failed and post (node, item) = self.current_schedule.place_by_time( event.task, event.time_happened) item.state = 
ScheduleItem.EXECUTING # check task as executing # self.current_schedule.change_state(event.task, ScheduleItem.EXECUTING) # public_resources_manager: # determine nodes of proper soft type # check and determine free nodes # determine reliability of every nodes # determine time_of_execution probability for (task,node) pair # try to find nodes in cloud if event.task not in self.register: proper_nodes = prm.get_by_softreq(event.task.soft_reqs) proper_nodes = [ node for node in proper_nodes if not prm.isBusy(node) ] sorted_proper_nodes = sorted( proper_nodes, key=lambda x: prm.get_reliability(x.name)) current_set = [] base_reliability = self.heft_planner.estimator.estimate_reliability( event.task, event.node) obtained_reliability = base_reliability dt = item.end_time - item.start_time def calc(node, dt): #(dt, task, node, transfer_estimation) # TODO: add proper transfer time here fp = prm.get_reliability(node.name) comp_time = self.heft_planner.estimator.estimate_runtime( event.task, node) cp = prm.probability_estimator(dt, comp_time, 0) #TODO: remove it later #cp = 0.95 #print("cp: " + str(cp)) return (node, fp, cp) it_comm_buf = 0 for pnode in sorted_proper_nodes: common_reliability = 1 - base_reliability #TODO: refactor this later if 1 - common_reliability >= self.desired_reliability: break res = calc(pnode, dt) current_set.append(res) #TODO: add dencity law of probability for dedicated resource for (nd, fp, cp) in current_set: common_reliability *= (1 - fp * cp) common_reliability = 1 - common_reliability #print("common_reliability: " + str(common_reliability)) it_comm_buf = common_reliability if common_reliability >= self.desired_reliability: #print("Commmon: "+ str(common_reliability)) break #print("Comm " + str(it_comm_buf) + " task: " + str(event.task.id)) #print(" Obtained reliability " + str(obtained_reliability) + " for task: " + str(event.task)) def frange(x, y, jump): while x < y: yield x x += jump for (nd, fp, cp) in current_set: comp_time = 
self.heft_planner.estimator.estimate_runtime( event.task, nd) #sigma 0.1*M lets take 0.6*M #TODO: uncomment it later ints = [(i, calc(nd, i)) for i in frange(0, comp_time + 0.2 * comp_time, 0.05 * comp_time)] rd = random.random() generated_comp_time = comp_time for (i, p) in ints: if p[2] > rd: generated_comp_time = i break #comp_time + 0.6*comp_time # TODO: remove it later #generated_comp_time = comp_time + (0.2 * comp_time * random.random() - 0.1 * comp_time) #generated_comp_time = comp_time - (0.2 * comp_time * (random.random() - 0.95)) #print("cloud reliability: " + str(fp)) if check_fail(fp): event_start = TaskStart(event.task) event_start.time_happened = self.current_time event_start.node = nd self.post(event_start) duration = self.base_fail_duration + self.base_fail_dispersion * random.random( ) time_of_fail = generated_comp_time * random.random() time_of_fail = self.current_time + ( time_of_fail if time_of_fail > 0 else 0.01 ) ##(item.end_time - self.current_time)*0.01 event_failed = NodeFailed(nd, event.task) event_failed.time_happened = time_of_fail event_nodeup = NodeUp(nd) event_nodeup.time_happened = time_of_fail + duration self.post(event_failed) self.post(event_nodeup) else: event_start = TaskStart(event.task) event_start.time_happened = self.current_time event_start.node = nd event_finish = TaskFinished(event.task) event_finish.time_happened = self.current_time + generated_comp_time event_finish.node = nd self.post(event_start) self.post(event_finish) prm.checkBusy(nd, True) self.register[event.task] = CloudHeftExecutor.STATUS_RUNNING pass reliability = self.heft_planner.estimator.estimate_reliability( event.task, node) if check_fail(reliability): # generate fail time, post it duration = self.base_fail_duration + self.base_fail_dispersion * random.random( ) time_of_fail = (item.end_time - self.current_time) * random.random() time_of_fail = self.current_time + ( time_of_fail if time_of_fail > 0 else 0.01 ) ##(item.end_time - self.current_time)*0.01 
event_failed = NodeFailed(node, event.task) event_failed.time_happened = time_of_fail event_nodeup = NodeUp(node) event_nodeup.time_happened = time_of_fail + duration self.post(event_failed) self.post(event_nodeup) # remove TaskFinished event self.queue = deque([ ev for ev in self.queue if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id and not prm.isCloudNode(ev.node)) ]) pass return None if isinstance(event, TaskFinished): # check if it cloud task # if task cloud and first: register as finished, check node in dedicated as finish, remove appropriate event of failure or task finished for dedicated, free cloud node, reschedule, end_of_function # if task cloud and not first: free cloud node, end_of_function # if task not cloud and first: register as finished, check node in dedicated as finish, end_of_function prm = self.public_resources_manager from_cloud = prm.isCloudNode(event.node) if from_cloud and self.register[ event.task] == CloudHeftExecutor.STATUS_RUNNING: # print("gotcha task: " + str(event.task)) self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED ## TODO: correct it ## if event.task failed and went through rescheduling, ## it would be possible that currently ScheduleItem of event.task on dedicated resource ## has UNSTARTED state. ## TODO: add additional functional to schedule to record such situations and validate it after found = self.current_schedule.change_state_executed_with_end_time( event.task, ScheduleItem.FINISHED, self.current_time) pair = self.current_schedule.place_single(event.task) if pair is not None: ## TODO: The bug is here. Fix it later. 
## the unstarted case must be taken into account in schedule and in the validity check procedure too (nd, item) = pair if item.state == ScheduleItem.EXECUTING: item.start_time = event.time_happened item.end_time = event.time_happened item.state = ScheduleItem.FINISHED self.queue = [ ev for ev in self.queue if not (not isinstance(ev, NodeUp) and ev.task.id == event.task.id) ] else: prm.checkBusy(event.node, False) return None def check(ev): if isinstance(ev, TaskFinished) or isinstance( ev, NodeFailed): if ev.task.id == event.task.id and not prm.isCloudNode( ev.node): return False ## TODO: make it later ##if isinstance(ev, NodeUp): return True self.queue = [ev for ev in self.queue if check(ev)] prm.checkBusy(event.node, False) reschedule(event) return None if from_cloud and self.register[ event.task] == CloudHeftExecutor.STATUS_FINISHED: prm.checkBusy(event.node, False) return None # check task finished self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED self.current_schedule.change_state_executed( event.task, ScheduleItem.FINISHED) return None if isinstance(event, NodeFailed): # check if cloud node # if cloud node: check as down, free node, end_of_function # if not cloud node: check as down, reschedule, end_of_function prm = self.public_resources_manager from_cloud = prm.isCloudNode(event.node) if from_cloud: prm.checkDown(event.node.name, True) prm.checkBusy(event.node, False) return None # check node down self.heft_planner.resource_manager.node( event.node).state = Node.Down # check failed event in schedule ## TODO: ambigious choice ##self.current_schedule.change_state(event.task, ScheduleItem.FAILED) it = [ item for item in self.current_schedule.mapping[event.node] if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING ] if len(it) != 1: ## TODO: raise exception here pass it[0].state = ScheduleItem.FAILED it[0].end_time = self.current_time reschedule(event) return None if isinstance(event, NodeUp): # check if cloud # if cloud: check 
node up, end_of_function # if not cloud: check as up, reschedule end_of_function prm = self.public_resources_manager from_cloud = prm.isCloudNode(event.node) if from_cloud: prm.checkDown(event.node.name, False) return None # check node up self.heft_planner.resource_manager.node( event.node).state = Node.Unknown reschedule(event) return None return None def post_new_events(self): unstarted_items = set() for (node, items) in self.current_schedule.mapping.items(): for item in items: if item.state == ScheduleItem.UNSTARTED: unstarted_items.add((node, item)) events_to_post = [] for (node, item) in unstarted_items: event_start = TaskStart(item.job) event_start.time_happened = item.start_time event_start.node = node event_finish = TaskFinished(item.job) event_finish.time_happened = item.end_time event_finish.node = node events_to_post self.post(event_start) self.post(event_finish) pass def clean_events(self, event): # remove all unstarted tasks cleaned_task = set() if isinstance(event, NodeFailed): cleaned_task = set([event.task]) new_mapping = dict() for (node, items) in self.current_schedule.mapping.items(): new_mapping[node] = [] for item in items: if item.state != ScheduleItem.UNSTARTED: new_mapping[node].append(item) else: cleaned_task.add(item.job) clean_schedule = Schedule(new_mapping) # remove all events associated with these tasks prm = self.public_resources_manager def check(event): if isinstance( event, TaskStart ) and event.task in cleaned_task and not prm.isCloudNode( event.node): return False if isinstance( event, TaskFinished ) and event.task in cleaned_task and not prm.isCloudNode( event.node): return False return True new_queue = deque([evnt for evnt in self.queue if check(evnt)]) self.queue = new_queue return clean_schedule
class GaOldPopExecutor(FailOnce, BaseExecutor):
    """Executor that reschedules with a GA seeded by its previous population.

    On every node failure or recovery the remaining tasks are scheduled
    twice: once with the (cleaned) population of the previous GA run and once
    with a random initial population. The first result becomes the current
    schedule; both results are reported to ``stat_saver`` for comparison.

    Required keyword arguments: ``estimator``, ``base_fail_duration``,
    ``base_fail_dispersion``, ``wf``, ``resource_manager``, ``stat_saver``,
    ``task_id_to_fail`` and ``ga_builder``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.estimator = kwargs["estimator"]
        self.base_fail_duration = kwargs["base_fail_duration"]
        self.base_fail_dispersion = kwargs["base_fail_dispersion"]
        self.workflow = kwargs["wf"]
        self.resource_manager = kwargs["resource_manager"]
        self.stat_saver = kwargs["stat_saver"]
        self.task_id_to_fail = kwargs["task_id_to_fail"]
        self.ga_builder = kwargs["ga_builder"]

        self.current_schedule = None
        # population of the latest GA run, reused as a seed on rescheduling
        self.past_pop = None

    def init(self):
        """Build the initial schedule with GA and post its events."""
        ## TODO: replace it with logging
        print("Working with initial state of nodes: {0}".format(
            [n.flops for n in self.resource_manager.get_nodes()]))

        ga_planner = self.ga_builder()
        self.current_schedule = Schedule(
            {node: [] for node in self.resource_manager.get_nodes()})
        (result, logbook) = ga_planner(self.current_schedule, None)
        self.past_pop = ga_planner.get_pop()
        print("Result makespan: " + str(Utility.makespan(result[2])))
        self.current_schedule = result[2]
        self._post_new_events()

        self.failed_once = False

    def _task_start_handler(self, event):
        """Mark the task as executing and possibly schedule a node failure.

        Only a NodeFailed event is posted; posting of the matching NodeUp
        event is disabled in this executor.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # The outage duration is not needed without a NodeUp event; the draw
        # is kept so that the sequence of consumed random numbers is unchanged.
        random.random()
        offset = (item.end_time - self.current_time) * random.random()
        time_of_fail = self.current_time + (offset if offset > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail
        self.post(event_failed)

        # The task will not finish on this node: remove its TaskFinished event.
        ##TODO: make a function for this purpose in the base class
        self.queue = deque([
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)
        ])

    def _task_finished_handler(self, event):
        """Mark the finished task in the current schedule."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node down, fail its executing item and reschedule.

        Raises:
            Exception: if the failed node does not hold exactly one EXECUTING
                item of the failed task.
        """
        self.resource_manager.node(event.node).state = Node.Down
        it = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING
        ]
        if len(it) != 1:
            # The old message said "several items" even when none was found.
            raise Exception(
                "Expected exactly one executing item of task {0} on the failed node, found {1}"
                .format(event.task.id, len(it)))

        it[0].state = ScheduleItem.FAILED
        it[0].end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Mark the node as Unknown (available again) and reschedule."""
        self.resource_manager.node(event.node).state = Node.Unknown
        self._reschedule(event)

    #@timing
    def _clean_chromosome(self, chromosome, event, current_cleaned_schedule):
        """Adapt a chromosome of the previous population to the current state.

        Task ids that are already FINISHED or EXECUTING in
        ``current_cleaned_schedule`` are removed. For a NodeFailed event the
        tasks of the failed node are moved to random positions on the other
        nodes of the chromosome.

        Args:
            chromosome: dict ``node name -> list of task ids``; modified in
                place.
            event: the event that caused rescheduling.
            current_cleaned_schedule: schedule without unstarted items.

        Returns:
            The same ``chromosome`` object.

        Raises:
            ValueError: if tasks of a failed node have to be moved but the
                chromosome has no other node.
        """
        # A set gives O(1) membership tests; ids are hashable (they are put
        # into frozensets in _reschedule).
        already_placed = {
            item.job.id
            for (_node, items) in current_cleaned_schedule.mapping.items()
            for item in items
            if item.state == ScheduleItem.FINISHED or item.state == ScheduleItem.EXECUTING
        }
        for ids in chromosome.values():
            # slice assignment keeps the list object of the chromosome
            ids[:] = [task_id for task_id in ids if task_id not in already_placed]

        if isinstance(event, NodeFailed):
            failed_name = event.node.name
            orphaned = chromosome[failed_name]
            ## TODO: here must be a procedure of getting currently alive nodes
            # dict order is used so the result does not depend on hash order
            working_nodes = [name for name in chromosome if name != failed_name]
            if orphaned and not working_nodes:
                raise ValueError(
                    "No working nodes left to take tasks of the failed node {0}".format(failed_name))

            last_node_index = len(working_nodes) - 1
            for task_id in orphaned:
                node_index = 0 if last_node_index == 0 else random.randint(0, last_node_index)
                target = chromosome[working_nodes[node_index]]
                # TODO: correct 0 and length
                position = 0 if len(target) == 0 else random.randint(0, len(target))
                target.insert(position, task_id)
            chromosome[failed_name] = []

        return chromosome

    def _reschedule(self, event):
        """Reschedule unfinished tasks with GA and post the new events.

        The schedule obtained from the old population replaces the current
        one. The run with a random population is used for statistics only.
        """
        current_cleaned_schedule = self._clean_events(event)

        task_id = "" if not hasattr(event, 'task') else " " + str(event.task.id)
        event_name = str(event.__class__.__name__)

        ## scheduling with initial population created of the previous population by moving elements from a downed node
        print("Scheduling with the old pop: " + event_name + task_id)
        ga_planner = self.ga_builder()

        cleaned_chromosomes = [
            self._clean_chromosome(ch, event, current_cleaned_schedule)
            for ch in self.past_pop
        ]
        # drop chromosomes without any task; no seed at all becomes None
        cleaned_chromosomes = [ch for ch in cleaned_chromosomes if any(ch.values())]
        cleaned_chromosomes = cleaned_chromosomes or None

        curr_ids = frozenset(current_cleaned_schedule.get_all_unique_tasks_id())
        all_ids = frozenset(t.id for t in self.workflow.get_all_unique_tasks())
        if all_ids == curr_ids:
            print("Schedule alleady has all unique tasks")
            return

        ((_, _, resulted_schedule, iter_old_pop), logbook_old_pop) = ga_planner(
            current_cleaned_schedule, None, self.current_time,
            initial_population=cleaned_chromosomes)

        #checking
        Utility.check_and_raise_for_fixed_part(
            resulted_schedule, current_cleaned_schedule, self.current_time)

        makespan_old_pop = Utility.makespan(resulted_schedule)
        print("Result makespan: " + str(makespan_old_pop))

        self.current_schedule = resulted_schedule
        self.past_pop = ga_planner.get_pop()

        ## scheduling with random initial population
        print("Scheduling with a random pop: " + event_name + task_id)
        ga_planner_with_random_init_population = self.ga_builder()
        ((_, _, schedule_with_random, iter_random), logbook_random) = \
            ga_planner_with_random_init_population(
                current_cleaned_schedule, None, self.current_time,
                initial_population=None)

        Utility.check_and_raise_for_fixed_part(
            schedule_with_random, current_cleaned_schedule, self.current_time)

        makespan_random = Utility.makespan(schedule_with_random)
        print("Result makespan: " + str(makespan_random))

        # creating and writing some stat data
        # Note: it can be rewritten with using of events
        if self.stat_saver is not None:
            stat_data = {
                "wf_name": self.workflow.name,
                "event_name": event.__class__.__name__,
                "task_id": task_id,
                "with_old_pop": {
                    "iter": iter_old_pop,
                    "makespan": makespan_old_pop,
                    "pop_aggr": logbook_old_pop,
                },
                "with_random": {
                    "iter": iter_random,
                    "makespan": makespan_random,
                    "pop_aggr": logbook_random,
                },
            }
            self.stat_saver(stat_data)

        self._post_new_events()
class HeftExecutor(FailRandom, BaseExecutor):
    """Executor that follows a HEFT schedule and reschedules on node events.

    Args:
        resource_manager: resource manager stored on the executor.
        heft_planner: planner used to build and rebuild the schedule.
        base_fail_duration: minimal duration of a node outage.
        base_fail_dispersion: upper bound of the random addition to the
            outage duration.
        fail_count_upper_limit: passed to the base class and stored as
            ``_fail_count_upper_limit``.
        initial_schedule: optional schedule to start from instead of
            planning from scratch.
        logger: optional logger object.
    """

    def __init__(self,
                 resource_manager,
                 heft_planner,
                 base_fail_duration,
                 base_fail_dispersion,
                 fail_count_upper_limit=None,
                 initial_schedule=None,
                 logger=None):
        super().__init__(heft_planner, base_fail_duration,
                         base_fail_dispersion, fail_count_upper_limit,
                         initial_schedule, logger)
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule
        self.resource_manager = resource_manager
        self._fail_count_upper_limit = fail_count_upper_limit

        self.logger = logger

    def init(self):
        """Build the starting schedule and post events of its items.

        Without an initial schedule the planner is run on an empty schedule.
        Otherwise the items of the initial schedule are re-created around the
        task objects of the planner's workflow (matched by task id).
        """
        if self.initial_schedule is None:
            self.current_schedule = Schedule(
                {node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(self.current_schedule)
        else:
            id_to_task = {
                tsk.id: tsk
                for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)
            }
            mapping = {
                node: [
                    ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time)
                    for item in items
                ]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)

        self._post_new_events()

    def _generate_failtime_and_duration(self, item):
        """Draw a random outage for ``item``.

        Returns:
            Tuple ``(time_of_fail, duration)``: the offset of the failure from
            the current time and the length of the outage.
        """
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        return (time_of_fail, duration)

    def _task_start_handler(self, event):
        """Mark the task as executing and possibly schedule a node failure."""
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        (time_of_fail, duration) = self._generate_failtime_and_duration(item)
        # a non-positive offset is replaced by a small positive one
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)

        # The task will not finish on this node: remove its TaskFinished event.
        self.queue = deque([
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)
        ])

    def _task_finished_handler(self, event):
        """Mark the finished task in the current schedule."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node down, fail its executing item and reschedule.

        Raises:
            Exception: if the failed node does not hold exactly one EXECUTING
                item of the failed task.
        """
        self.heft_planner.resource_manager.node(event.node).state = Node.Down

        ## TODO: ambigious choice
        it = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING
        ]
        if len(it) != 1:
            # Previously ignored: an empty result crashed on it[0] with a bare
            # IndexError and several results silently used the first one.
            raise Exception(
                "Expected exactly one executing item of task {0} on the failed node, found {1}"
                .format(event.task.id, len(it)))

        it[0].state = ScheduleItem.FAILED
        it[0].end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Mark the node as Unknown (available again) and reschedule."""
        self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
        self._reschedule(event)