def init(self):
    """Prepare ``self.current_schedule`` and then call ``self._post_new_events()``.

    Without an initial schedule, an empty per-node schedule is created and
    passed to ``self.heft_planner.run``. Otherwise every item of the initial
    schedule is rebuilt as a new ``ScheduleItem`` that points at the task
    with the same id taken from the planner's workflow.
    """
    planner = self.heft_planner
    if self.initial_schedule is not None:
        tasks_by_id = {}
        for task in HeftHelper.get_all_tasks(planner.workflow):
            tasks_by_id[task.id] = task

        rebuilt = {}
        for node, items in self.initial_schedule.mapping.items():
            rebuilt[node] = [
                ScheduleItem(tasks_by_id[entry.job.id],
                             entry.start_time,
                             entry.end_time)
                for entry in items
            ]
        self.current_schedule = Schedule(rebuilt)
    else:
        self.current_schedule = Schedule(
            {node: [] for node in planner.get_nodes()})
        self.current_schedule = planner.run(self.current_schedule)
    self._post_new_events()
def mapping(self, sorted_jobs, existing_plan, nodes, commcost, compcost):
    """Put every task on the node that gives it the smallest finish time.

    Operates in place: ``existing_plan`` itself receives the new items (via
    ``Schedule.insert_item``) and is the mapping wrapped by the returned
    ``Schedule``.

    :param sorted_jobs: iterable of ``(workflow, tasks)`` pairs, handled in order
    :param existing_plan: mapping ``node -> list of schedule items``
    :param nodes: not read by this method
    :param commcost: forwarded to ``self.start_time``
    :param compcost: callable ``(task, node)`` giving the task runtime on a node
    :return: ``Schedule`` wrapping the updated plan
    """
    ## TODO: add finished tasks
    def _already_placed(entry):
        return (entry.state == ScheduleItem.FINISHED
                or entry.state == ScheduleItem.EXECUTING)

    # Tasks that are finished or running keep the node they are on.
    jobson = {
        entry.job: node
        for (node, entries) in existing_plan.items()
        for entry in entries
        if _already_placed(entry)
    }

    plan = existing_plan
    for wf, tasks in sorted_jobs:
        prec = reverse_dict(HeftHelper.convert_to_parent_children_map(wf))
        for task in tasks:
            start_on = partial(self.start_time, wf, task, plan, jobson,
                               prec, commcost)

            def finish_on(machine):
                # finish time = start time on the machine + runtime there
                duration = compcost(task, machine)
                return start_on(machine, duration) + duration

            agent = min(plan.keys(), key=finish_on)
            runtime = compcost(task, agent)
            start = start_on(agent, runtime)
            end = finish_on(agent)
            Schedule.insert_item(plan, agent, ScheduleItem(task, start, end))
            jobson[task] = agent

    return Schedule(plan)
def schedule(self):
    """Build one schedule for all workflows, ordered by ascending priority.

    A workflow whose ``priority`` is ``None`` is sorted as priority 0. Each
    workflow is ranked with ``self.make_ranking`` and then mapped with
    ``self.mapping`` onto the plan produced by the previous workflow.

    :return: the ``Schedule`` returned by the last ``self.mapping`` call, or
        an empty per-node ``Schedule`` when there are no workflows
    """
    def by_priority(wf):
        return 0 if wf.priority is None else wf.priority

    ## simple inter priority sorting
    sorted_wfs = sorted(self.workflows, key=by_priority)

    resources = self.resource_manager.get_resources()
    nodes = HeftHelper.to_nodes(resources)

    # The dict is filled in priority order and iterated below in insertion
    # order, so workflows are mapped in the order produced by the sort.
    wf_jobs = {wf: self.make_ranking(wf, nodes) for wf in sorted_wfs}

    new_schedule = Schedule({node: [] for node in nodes})
    for (wf, jobs) in wf_jobs.items():
        # Each pass continues from the mapping built by the previous one.
        new_schedule = self.mapping([(wf, jobs)], new_schedule.mapping, nodes,
                                    self.commcost, self.compcost)
    return new_schedule
def build_schedule(workflow, estimator, resource_manager, solution):
    """Assemble a ``Schedule`` from the mapping and ordering parts of ``solution``.

    ``solution`` is indexed by ``MAPPING_SPECIE`` and ``ORDERING_SPECIE``.
    The mapping part is iterated as ``(task_id, node_name)`` pairs; the
    ordering part is iterated as task ids. Every task is placed with
    ``place_task_to_schedule`` in the order given by the ordering part.

    :raises AssertionError: when ``check_precedence`` rejects the ordering
    """
    raw_mapping = solution[MAPPING_SPECIE]
    ordering = solution[ORDERING_SPECIE]

    assert check_precedence(workflow, ordering), "Precedence is violated"

    # task id -> node object resolved by name
    node_of = {}
    for task_id, node_name in raw_mapping:
        node_of[task_id] = resource_manager.byName(node_name)

    schedule_mapping = {}
    for used_node in set(node_of.values()):
        schedule_mapping[used_node] = []

    task_to_node = {}
    for task_id in ordering:
        node = node_of[task_id]
        task = workflow.byId(task_id)
        start_time, end_time = place_task_to_schedule(
            workflow, estimator, schedule_mapping, task_to_node, node_of,
            task, node, 0)
        task_to_node[task.id] = (node, start_time, end_time)

    return Schedule(schedule_mapping)
def clean_events(self, event):
    """Drop unstarted items from the current schedule and their queued events.

    The task of a ``NodeFailed`` event is treated as removed as well.
    ``TaskStart`` / ``TaskFinished`` events of removed tasks are taken out
    of ``self.queue`` unless ``isCloudNode`` of the public resources manager
    is true for the event's node.

    :return: a new ``Schedule`` holding only the items that were not unstarted
    """
    # remove all unstarted tasks
    dropped_tasks = {event.task} if isinstance(event, NodeFailed) else set()

    kept_mapping = {}
    for node, items in self.current_schedule.mapping.items():
        survivors = []
        kept_mapping[node] = survivors
        for entry in items:
            if entry.state != ScheduleItem.UNSTARTED:
                survivors.append(entry)
                continue
            dropped_tasks.add(entry.job)
    clean_schedule = Schedule(kept_mapping)

    # remove all events associated with these tasks
    prm = self.public_resources_manager

    def stays_in_queue(queued):
        is_task_event = (isinstance(queued, TaskStart)
                         or isinstance(queued, TaskFinished))
        if (is_task_event and queued.task in dropped_tasks
                and not prm.isCloudNode(queued.node)):
            return False
        return True

    self.queue = deque([evnt for evnt in self.queue if stays_in_queue(evnt)])
    return clean_schedule
def _clean_events(self, event):
    """Return the current schedule without unstarted items and purge their events.

    Jobs of unstarted items (plus the task of a ``NodeFailed`` event) are
    collected; every ``TaskStart`` / ``TaskFinished`` event in ``self.queue``
    that refers to one of them is removed.

    :return: a new ``Schedule`` containing only the items that were not unstarted
    """
    # remove all unstarted tasks
    removed = set()
    if isinstance(event, NodeFailed):
        removed.add(event.task)

    retained = {}
    for node, items in self.current_schedule.mapping.items():
        node_items = retained[node] = []
        for entry in items:
            if entry.state != ScheduleItem.UNSTARTED:
                node_items.append(entry)
            else:
                removed.add(entry.job)
    clean_schedule = Schedule(retained)

    # remove all events associated with these tasks
    def is_stale(queued):
        if isinstance(queued, TaskStart) and queued.task in removed:
            return True
        return isinstance(queued, TaskFinished) and queued.task in removed

    ##TODO: refactor it later
    self.queue = deque([queued for queued in self.queue
                        if not is_stale(queued)])
    return clean_schedule
def fnc():
    """Call ``ga`` with an empty per-node schedule and print what it returns."""
    blank_mapping = {}
    for node in resource_manager.get_nodes():
        blank_mapping[node] = []
    outcome = ga(Schedule(blank_mapping), None)
    print(outcome)
def gaheft_reschedule(wf_added_time):
    """Reschedule with HEFT followed by GA after a workflow is added, return makespan.

    The added-workflow time is ``all_initial_wf_time * wf_added_time``. The
    resulting schedule is validated with three ``Utility`` checks before its
    makespan is returned.

    :raises Exception: when one of the validation checks does not return ``True``
    """
    schedule_copy = Schedule({
        node: list(items)
        for (node, items) in ga_initial_schedule.mapping.items()
    })

    added_time = all_initial_wf_time * wf_added_time

    mark_finished(schedule_copy)

    heft_planner = DynamicHeft(added_wf, resource_manager, estimator)
    heft_planner.current_time = added_time
    rescheduled = heft_planner.run(schedule_copy)

    ga_manager = GAComputationManager(15, added_wf, resource_manager,
                                      estimator)
    rescheduled = ga_manager.run(rescheduled, added_time, False)[2]

    mark_finished(rescheduled)

    # Checks run lazily, one after another, and stop at the first failure.
    checks = (
        ("nodes_seq_validaty",
         lambda: Utility.validateNodesSeq(rescheduled)),
        ("initial_wf_validaty",
         lambda: Utility.validateParentsAndChildren(rescheduled, initial_wf)),
        ("added_wf_validaty",
         lambda: Utility.validateParentsAndChildren(rescheduled, added_wf)),
    )
    for label, run_check in checks:
        if run_check() is not True:
            raise Exception("Check for " + label + " didn't pass")

    return Utility.makespan(rescheduled)
def default_fixed_schedule_part(resource_manager):
    """Return a ``Schedule`` with an empty item list for each node.

    Nodes are obtained by passing ``resource_manager.get_resources()`` to
    ``HeftHelper.to_nodes``.
    """
    resources = resource_manager.get_resources()
    empty_mapping = {}
    for node in HeftHelper.to_nodes(resources):
        empty_mapping[node] = []
    return Schedule(empty_mapping)
def __init__(self, workflow, resource_manager, estimator,
             base_fail_duration, base_fail_dispersion, initial_schedule):
    """Store the collaborators and start from an empty current schedule.

    ``current_schedule`` gets one empty list per key of
    ``initial_schedule.mapping``; ``finished_tasks`` starts with the id of
    the workflow's head task.
    """
    ## TODO: remake it later
    self.queue = deque()
    self.current_time = 0

    self.workflow = workflow
    self.resource_manager = resource_manager
    self.estimator = estimator

    self.base_fail_duration = base_fail_duration
    self.base_fail_dispersion = base_fail_dispersion

    self.initial_schedule = initial_schedule
    empty_mapping = {}
    for key in initial_schedule.mapping.keys():
        empty_mapping[key] = []
    self.current_schedule = Schedule(empty_mapping)

    head_id = self.workflow.head_task.id
    self.finished_tasks = [head_id]

    ## TODO: correct this stub later
    self.logger = None
def do_exp(wf_name):
    """Run the GA for the named workflow and return the makespan of its result.

    A HEFT schedule is computed first and passed to the population generator
    as ``initial_schedule``. The schedule built from the best individual is
    checked with ``Utility.validate_static_schedule`` before its makespan is
    returned.
    """
    _wf = wf(wf_name)
    rm = ExperimentResourceManager(rg.r([10, 15, 25, 30]))
    estimator = SimpleTimeCostEstimator(comp_time_cost=0,
                                        transf_time_cost=0,
                                        transferMx=None,
                                        ideal_flops=20,
                                        transfer_time=100)

    empty_fixed_schedule_part = Schedule(
        {node: [] for node in rm.get_nodes()})

    heft_schedule = run_heft(_wf, rm, estimator)
    ga_functions = GAFunctions2(_wf, rm, estimator)

    generate = partial(ga_generate,
                       ga_functions=ga_functions,
                       fixed_schedule_part=empty_fixed_schedule_part,
                       current_time=0.0,
                       init_sched_percent=0.05,
                       initial_schedule=heft_schedule)

    def first_fitness_value(ind):
        return ind.fitness.values[0]

    stats = tools.Statistics(first_fitness_value)
    for label, reducer in (("avg", numpy.mean), ("std", numpy.std),
                           ("min", numpy.min), ("max", numpy.max)):
        stats.register(label, reducer)

    logbook = tools.Logbook()
    logbook.header = ["gen", "evals"] + stats.fields

    toolbox = Toolbox()
    toolbox.register("generate", generate)
    fitness = ga_functions.build_fitness(empty_fixed_schedule_part, 0.0)
    toolbox.register("evaluate", fit_converter(fitness))
    toolbox.register("clone", deepcopy)
    toolbox.register("mate", ga_functions.crossover)
    toolbox.register("sweep_mutation", ga_functions.sweep_mutation)
    toolbox.register("mutate", ga_functions.mutation)
    toolbox.register("select", tools.selRoulette)

    ga_outcome = run_ga(toolbox=toolbox, logbook=logbook, stats=stats,
                        **GA_PARAMS)
    pop, logbook, best = ga_outcome

    resulted_schedule = ga_functions.build_schedule(
        best, empty_fixed_schedule_part, 0.0)

    Utility.validate_static_schedule(_wf, resulted_schedule)

    return Utility.makespan(resulted_schedule)
def run_heft(workflow, resource_manager, estimator):
    """Run HEFT starting from an empty schedule and return the full schedule."""
    planner = DynamicHeft(workflow, resource_manager, estimator)
    blank_mapping = {}
    for node in resource_manager.get_nodes():
        blank_mapping[node] = []
    return planner.run(Schedule(blank_mapping))
def clean_unfinished(schedule):
    """Return a new ``Schedule`` keeping only finished or executing items.

    The node keys of ``schedule.mapping`` are preserved; nodes whose items
    are all filtered out keep an empty list.
    """
    def is_kept(entry):
        if entry.state == ScheduleItem.FINISHED:
            return True
        return entry.state == ScheduleItem.EXECUTING

    trimmed = {}
    for node, items in schedule.mapping.items():
        trimmed[node] = [entry for entry in items if is_kept(entry)]
    return Schedule(trimmed)
def init(self):
    """Compute the starting schedule with HEFT followed by the GA.

    HEFT runs on a deep copy of the empty schedule; the GA callable obtained
    from ``self.ga_builder()`` receives the empty schedule and the HEFT
    result. ``result[0][2]`` becomes the current schedule, after which
    ``self._post_new_events()`` is called.

    :return: the value returned by the GA callable
    """
    planner = self.heft_planner
    self.current_schedule = Schedule(
        {node: [] for node in planner.get_nodes()})

    heft_schedule = planner.run(deepcopy(self.current_schedule))

    # TODO: change these two ugly records
    run_ga_planner = self.ga_builder()
    result = run_ga_planner(self.current_schedule, heft_schedule)

    self.current_schedule = result[0][2]
    self._post_new_events()
    return result
def init(self):
    """Set up ``self.current_schedule`` and then call ``self._post_new_events()``.

    When ``self.initial_schedule`` is ``None`` the HEFT planner is run on an
    empty per-node schedule. Otherwise the initial schedule is copied item
    by item, each copy referring to the planner-workflow task that has the
    same id as the original item's job.
    """
    source = self.initial_schedule
    if source is None:
        blank = {}
        for node in self.heft_planner.get_nodes():
            blank[node] = []
        self.current_schedule = Schedule(blank)
        self.current_schedule = self.heft_planner.run(self.current_schedule)
    else:
        id_to_task = dict(
            (tsk.id, tsk)
            for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow))

        def rebind(item):
            # same timing, but bound to the planner's own task object
            return ScheduleItem(id_to_task[item.job.id],
                                item.start_time, item.end_time)

        self.current_schedule = Schedule({
            node: [rebind(item) for item in items]
            for (node, items) in source.mapping.items()
        })
    self._post_new_events()
def init(self):
    """Compute HEFT and GA schedules and keep the HEFT one unless the GA one is applied.

    ``self._apply_mh_if_better`` decides whether the GA schedule
    (``result[0][2]``) is applied. When it returns a false value the HEFT
    schedule becomes the current schedule and ``self._post_new_events()`` is
    called.

    :return: the value returned by the GA callable
    """
    def empty_schedule():
        return Schedule({node: [] for node in self.heft_planner.get_nodes()})

    self.current_schedule = empty_schedule()
    initial_schedule = self.heft_planner.run(empty_schedule())

    # TODO: change these two ugly records
    ga_planner = self.ga_builder()
    result = ga_planner(self.current_schedule, initial_schedule)

    mh_applied = self._apply_mh_if_better(
        None,
        heuristic_resulted_schedule=initial_schedule,
        metaheuristic_resulted_schedule=result[0][2])
    if not mh_applied:
        self.current_schedule = initial_schedule
        # NOTE(review): events are posted only on this fallback branch;
        # presumably _apply_mh_if_better posts them itself when it applies
        # the GA schedule - confirm.
        self._post_new_events()

    return result
def init(self):
    """Build the starting schedule with the GA planner alone.

    The planner from ``self.ga_builder()`` is called with an empty per-node
    schedule and ``None``. Element ``[2]`` of its result becomes the current
    schedule, the planner's population is stored in ``self.past_pop``, and
    ``self.failed_once`` is reset to ``False``.
    """
    ## TODO: replace it with logging
    node_flops = [n.flops for n in self.resource_manager.get_nodes()]
    print("Working with initial state of nodes: {0}".format(node_flops))

    ga_planner = self.ga_builder()
    blank_mapping = {}
    for node in self.resource_manager.get_nodes():
        blank_mapping[node] = []
    self.current_schedule = Schedule(blank_mapping)

    planner_output = ga_planner(self.current_schedule, None)
    (result, logbook) = planner_output
    self.past_pop = ga_planner.get_pop()

    makespan_text = str(Utility.makespan(result[2]))
    print("Result makespan: " + makespan_text)

    self.current_schedule = result[2]
    self._post_new_events()
    self.failed_once = False
def _run_heft():
    """Run ``DynamicHeft`` on an empty schedule, validate it and return it.

    The schedule is also drawn with ``viz.visualize_task_node_mapping`` when
    ``is_visualized`` is true.
    """
    dynamic_planner = DynamicHeft(wf, resource_manager, estimator)

    blank_mapping = {}
    for node in HeftHelper.to_nodes(resource_manager.resources):
        blank_mapping[node] = []

    schedule_dynamic_heft = dynamic_planner.run(Schedule(blank_mapping))

    self._validate(wf, estimator, schedule_dynamic_heft)

    if is_visualized:
        viz.visualize_task_node_mapping(wf, schedule_dynamic_heft)
    return schedule_dynamic_heft
def __init__(self, workflow, resource_manager, estimator, base_fail_duration,
             base_fail_dispersion, initial_schedule):
    """Remember the inputs and create an empty current schedule.

    The current schedule has the same keys as ``initial_schedule.mapping``,
    each with its own empty list. The head task of the workflow is recorded
    as already finished.
    """
    ## TODO: remake it later
    self.queue = deque()
    self.current_time = 0
    self.workflow = workflow
    # DynamicHeft
    self.resource_manager = resource_manager
    self.estimator = estimator
    self.base_fail_duration = base_fail_duration
    self.base_fail_dispersion = base_fail_dispersion
    self.initial_schedule = initial_schedule

    node_keys = initial_schedule.mapping.keys()
    self.current_schedule = Schedule(dict((key, []) for key in node_keys))

    self.finished_tasks = [self.workflow.head_task.id]
    ## TODO: correct this stub later
    self.logger = None
def as_schedule(dct):
    """Turn a decoded dict into the object named by its ``'__cls__'`` entry.

    Recognised values are ``'Node'``, ``'ScheduleItem'``, ``'Schedule'``,
    ``'Resource'`` and ``'SaveBundle'``. A dict without ``'__cls__'`` or with
    any other value is returned unchanged.
    """
    kind = dct['__cls__'] if '__cls__' in dct else None

    if kind == 'Node':
        node = Node(dct['name'], dct['resource'], dct['soft'])
        node.flops = dct['flops']
        return node

    if kind == 'ScheduleItem':
        # the 'job' entry is looked up in task_dict to get the task object
        sc_item = ScheduleItem(task_dict[dct['job']], dct['start_time'],
                               dct['end_time'])
        sc_item.state = dct['state']
        return sc_item

    if kind == 'Schedule':
        mapping = {}
        for node_values in dct['mapping']:
            mapping[node_values['node']] = node_values['value']
        return Schedule(mapping)

    if kind == 'Resource':
        resource = Resource(dct['name'])
        resource.nodes = dct['nodes']
        return resource

    if kind == 'SaveBundle':
        # Point every node back at its resource and index nodes by name.
        all_nodes = set()
        for resource in dct['dedicated_resources']:
            for node in resource.nodes:
                node.resource = resource
            all_nodes.update(resource.nodes)
        nodes_by_name = {node.name: node for node in all_nodes}

        # Re-key the schedule mapping from node names to node objects.
        ga_schedule = dct['ga_schedule']
        ga_schedule.mapping = {
            nodes_by_name[node_name]: values
            for (node_name, values) in ga_schedule.mapping.items()
        }

        return SaveBundle(dct['name'], dct['dedicated_resources'],
                          dct['transfer_mx'], dct['ideal_flops'],
                          dct['ga_schedule'], dct['wf_name'])

    return dct
def __init__(self, workflow, resource_manager, estimator, ranking=None):
    """Store the inputs and determine the job ranking.

    :param ranking: ranking to use as ``self.wf_jobs``; when ``None`` it is
        computed with ``self.make_ranking`` over ``self.get_nodes()``
    """
    self.current_schedule = Schedule(dict())
    self.workflow = workflow
    self.resource_manager = resource_manager
    self.estimator = estimator
    self.current_time = 0

    nodes = self.get_nodes()
    if ranking is None:
        self.wf_jobs = self.make_ranking(self.workflow, nodes)
    else:
        self.wf_jobs = ranking
def generate(wf, rm, estimator, schedule=None, fixed_schedule_part=None,
             current_time=0.0):
    """Build a ``CompoundParticle`` from a schedule.

    When ``schedule`` is ``None`` one is produced by
    ``SimpleRandomizedHeuristic``. If a fixed schedule part is given, items
    whose job id is among its unmoveable tasks and items in the ``FAILED``
    state are left out before the particle is built.

    :raises Exception: when the schedule was generated here and
        ``validate_mapping_with_alive_nodes`` rejects the resulting mapping
    """
    if schedule is None:
        heuristic = SimpleRandomizedHeuristic(wf, rm.get_nodes(), estimator)
        sched = heuristic.schedule(fixed_schedule_part, current_time)
    else:
        sched = schedule

    if fixed_schedule_part is None:
        clean_sched = sched
    else:
        un_tasks = unmoveable_tasks(fixed_schedule_part)

        def is_movable(item):
            return (item.job.id not in un_tasks
                    and item.state != ScheduleItem.FAILED)

        clean_sched = Schedule({
            node: [item for item in items if is_movable(item)]
            for node, items in sched.mapping.items()
        })

    mapping, ordering = ord_and_map(clean_sched)
    ordering_numseq = ordering_to_numseq(ordering)
    ordering_map = dict(zip(ordering, ordering_numseq))

    ord_p = OrderingParticle(ordering_map)
    map_p = MappingParticle(mapping)
    ord_p.velocity = OrderingParticle.Velocity({})
    map_p.velocity = MappingParticle.Velocity({})

    result = CompoundParticle(map_p, ord_p)

    generated_here = schedule is None
    if generated_here and not validate_mapping_with_alive_nodes(
            result.mapping.entity, rm):
        raise Exception("found invalid solution in generated array")
    return result
def _get_fixed_schedule(schedule, front_event):
    """Return the part of ``schedule`` that is fixed relative to ``front_event``.

    An item is kept when it starts before ``front_event.end_time`` or when
    its state is ``FINISHED`` or ``FAILED``. Kept items are copied with
    ``ScheduleItem.copy``; a copy that is ``EXECUTING`` or ``UNSTARTED``
    becomes ``FINISHED`` if it ends no later than ``front_event.end_time``
    and ``EXECUTING`` if it ends after it.
    """
    def belongs_to_fixed_part(item):
        # hard to resolve corner case: the simulator doesn't guarantee the
        # order in which events appear.
        ## TODO: Urgent!!! experimental change. Perhaps, it should be removed from here later.
        return bool(item.start_time < front_event.end_time
                    or item.state == ScheduleItem.FINISHED
                    or item.state == ScheduleItem.FAILED)

    ##TODO: it's dangerous operation.
    def copy_with_state(item):
        new_item = ScheduleItem.copy(item)
        if (new_item.state == ScheduleItem.EXECUTING
                or new_item.state == ScheduleItem.UNSTARTED):
            ## TODO: Urgent!: dangerous place
            if new_item.end_time <= front_event.end_time:
                new_item.state = ScheduleItem.FINISHED
            elif new_item.end_time > front_event.end_time:
                new_item.state = ScheduleItem.EXECUTING
        return new_item

    fixed_mapping = {}
    for key, items in schedule.mapping.items():
        fixed_mapping[key] = [copy_with_state(item) for item in items
                              if belongs_to_fixed_part(item)]
    return Schedule(fixed_mapping)
class HeftExecutor(FailRandom, BaseExecutor):
    """Executor that follows a HEFT schedule and reschedules on node events.

    Event handlers update item and node states; node failure and node-up
    events both end with a call to ``self._reschedule``.
    """

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion,
                 initial_schedule=None, logger=None):
        """Store the planner, the failure parameters and the optional schedule."""
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule
        self.logger = logger

    def init(self):
        """Create the current schedule and call ``self._post_new_events()``.

        With no initial schedule the HEFT planner runs on an empty schedule;
        otherwise the initial schedule is rebuilt around the planner
        workflow's task objects (matched by task id).
        """
        planner = self.heft_planner
        if self.initial_schedule is not None:
            id_to_task = {}
            for tsk in HeftHelper.get_all_tasks(planner.workflow):
                id_to_task[tsk.id] = tsk
            mapping = {}
            for (node, items) in self.initial_schedule.mapping.items():
                mapping[node] = [
                    ScheduleItem(id_to_task[item.job.id], item.start_time,
                                 item.end_time)
                    for item in items
                ]
            self.current_schedule = Schedule(mapping)
        else:
            self.current_schedule = Schedule(
                {node: [] for node in planner.get_nodes()})
            self.current_schedule = planner.run(self.current_schedule)
        self._post_new_events()

    def _generate_failtime_and_duration(self, item):
        """Return a random ``(time_of_fail, duration)`` pair for ``item``.

        ``time_of_fail`` is an offset within the time left until
        ``item.end_time``; ``duration`` is the base fail duration plus a
        random share of the dispersion.
        """
        # generate fail time, post it
        jitter = self.base_fail_dispersion * random.random()
        duration = self.base_fail_duration + jitter
        remaining = item.end_time - self.current_time
        time_of_fail = remaining * random.random()
        return (time_of_fail, duration)

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, post the failure."""
        (node, item) = self.current_schedule.place_by_time(
            event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        (fail_offset, duration) = self._generate_failtime_and_duration(item)
        if fail_offset > 0:
            time_of_fail = self.current_time + fail_offset
        else:
            # a non-positive offset is replaced by a small positive one
            time_of_fail = self.current_time + 0.01

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)

        # remove TaskFinished event
        def is_finish_of_this_task(ev):
            return isinstance(ev, TaskFinished) and ev.task.id == event.task.id

        self.queue = deque([ev for ev in self.queue
                            if not is_finish_of_this_task(ev)])

    def _task_finished_handler(self, event):
        """Mark the finished task as ``FINISHED`` in the current schedule."""
        self.current_schedule.change_state_executed(event.task,
                                                    ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, the running item as failed, and reschedule."""
        # check node down
        failed_node = self.heft_planner.resource_manager.node(event.node)
        failed_node.state = Node.Down
        # check failed event in schedule
        ## TODO: ambigious choice
        it = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id
            and item.state == ScheduleItem.EXECUTING
        ]
        ## TODO: raise exception here when len(it) != 1
        failed_item = it[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Set the node state to ``Node.Unknown`` and reschedule."""
        # check node up
        restored_node = self.heft_planner.resource_manager.node(event.node)
        restored_node.state = Node.Unknown
        self._reschedule(event)
class HeftExecutor(FailRandom, BaseExecutor):
    """HEFT-driven executor with an explicit resource manager and fail limit.

    Handlers keep ``self.current_schedule`` and node states up to date and
    call ``self._reschedule`` after a node fails or comes back up.
    """

    def __init__(self, resource_manager, heft_planner, base_fail_duration,
                 base_fail_dispersion, fail_count_upper_limit=None,
                 initial_schedule=None, logger=None):
        """Initialise the base classes and store all collaborators."""
        super().__init__(heft_planner, base_fail_duration,
                         base_fail_dispersion, fail_count_upper_limit,
                         initial_schedule, logger)

        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule
        self.resource_manager = resource_manager
        self._fail_count_upper_limit = fail_count_upper_limit
        self.logger = logger

    def init(self):
        """Create the current schedule and call ``self._post_new_events()``.

        An absent initial schedule means "run HEFT from scratch"; a present
        one is copied item by item onto the planner workflow's task objects.
        """
        if self.initial_schedule is None:
            blank = {}
            for node in self.heft_planner.get_nodes():
                blank[node] = []
            self.current_schedule = Schedule(blank)
            self.current_schedule = self.heft_planner.run(
                self.current_schedule)
        else:
            workflow_tasks = HeftHelper.get_all_tasks(
                self.heft_planner.workflow)
            id_to_task = dict((tsk.id, tsk) for tsk in workflow_tasks)

            def rebind(item):
                return ScheduleItem(id_to_task[item.job.id], item.start_time,
                                    item.end_time)

            self.current_schedule = Schedule({
                node: [rebind(item) for item in items]
                for (node, items) in self.initial_schedule.mapping.items()
            })
        self._post_new_events()

    def _generate_failtime_and_duration(self, item):
        """Draw a random ``(time_of_fail, duration)`` pair for ``item``."""
        # generate fail time, post it
        duration = (self.base_fail_duration
                    + self.base_fail_dispersion * random.random())
        time_left = item.end_time - self.current_time
        return (time_left * random.random(), duration)

    def _task_start_handler(self, event):
        """Mark the task as executing; when it is to fail, post the node events."""
        placed = self.current_schedule.place_by_time(event.task,
                                                     event.time_happened)
        (node, item) = placed
        item.state = ScheduleItem.EXECUTING

        will_fail = self._check_fail(event.task, node)
        if will_fail:
            (offset, duration) = self._generate_failtime_and_duration(item)
            # a non-positive offset is replaced by a small positive one
            time_of_fail = self.current_time + (offset if offset > 0 else 0.01)

            event_failed = NodeFailed(node, event.task)
            event_failed.time_happened = time_of_fail
            event_nodeup = NodeUp(node)
            event_nodeup.time_happened = time_of_fail + duration

            for new_event in (event_failed, event_nodeup):
                self.post(new_event)

            # remove TaskFinished event
            remaining = deque()
            for ev in self.queue:
                if isinstance(ev, TaskFinished) and ev.task.id == event.task.id:
                    continue
                remaining.append(ev)
            self.queue = remaining

    def _task_finished_handler(self, event):
        """Mark the finished task as ``FINISHED`` in the current schedule."""
        # check task finished
        self.current_schedule.change_state_executed(event.task,
                                                    ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, the running item as failed, and reschedule."""
        # check node down
        self.heft_planner.resource_manager.node(event.node).state = Node.Down
        # check failed event in schedule
        ## TODO: ambigious choice
        node_items = self.current_schedule.mapping[event.node]
        it = [
            item for item in node_items
            if item.job.id == event.task.id
            and item.state == ScheduleItem.EXECUTING
        ]
        ## TODO: raise exception here when len(it) != 1
        it[0].state = ScheduleItem.FAILED
        it[0].end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Set the node state to ``Node.Unknown`` and reschedule."""
        # check node up
        self.heft_planner.resource_manager.node(
            event.node).state = Node.Unknown
        self._reschedule(event)
class GAExecutor(FailRandom, BaseExecutor):
    """Executor that replays a precomputed ``initial_schedule``.

    Tasks are started in the per-node order of the initial schedule once
    their parents are finished. Actual start and end times are recomputed
    when the start events are generated and recorded in
    ``self.current_schedule``.
    """

    def __init__(self, workflow, resource_manager, estimator,
                 base_fail_duration, base_fail_dispersion, initial_schedule):
        """Store the collaborators and start from an empty current schedule.

        ``finished_tasks`` starts with the id of the workflow's head task.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        self.workflow = workflow
        self.resource_manager = resource_manager
        self.estimator = estimator
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        self.current_schedule = Schedule(
            {key: [] for key in initial_schedule.mapping.keys()})
        self.finished_tasks = [self.workflow.head_task.id]
        ## TODO: correct this stub later
        self.logger = None

    def init(self):
        """Post start/finish events for the tasks that are ready after the head task."""
        unstarted_tasks = self.get_ready_tasks(self.workflow.head_task, None)
        # run ready tasks
        self.post_new_events(unstarted_tasks)

    def is_ready(self, task):
        """Return ``True`` when every parent of ``task`` is in ``finished_tasks``."""
        return all(p.id in self.finished_tasks for p in task.parents)

    def is_next_to_run(self, task):
        """Return ``True`` when all earlier items on the task's node are finished.

        "Earlier" means an item of the initial schedule on the same node with
        a smaller start time than the item of ``task``.
        """
        (node, item) = self.initial_schedule.place(task)
        return all(
            it.job.id in self.finished_tasks
            for it in self.initial_schedule.mapping[node]
            if it.start_time < item.start_time
        )

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, post the failure."""
        (node, item) = self.current_schedule.place_by_time(
            event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._check_fail(event.task, node):
            return

        # generate fail time, post it
        duration = (self.base_fail_duration
                    + self.base_fail_dispersion * random.random())
        time_of_fail = (item.end_time - self.current_time) * random.random()
        # a non-positive offset is replaced by a small positive one
        time_of_fail = self.current_time + (
            time_of_fail if time_of_fail > 0 else 0.01)

        event_failed = NodeFailed(node, event.task)
        event_failed.time_happened = time_of_fail

        event_nodeup = NodeUp(node)
        event_nodeup.time_happened = time_of_fail + duration

        self.post(event_failed)
        self.post(event_nodeup)

        # remove TaskFinished event
        self.queue = deque([
            ev for ev in self.queue
            if not (isinstance(ev, TaskFinished)
                    and ev.task.id == event.task.id)
        ])

    def _task_finished_handler(self, event):
        """Record the task as finished and start whatever became ready."""
        # check task finished
        self.current_schedule.change_state_executed(event.task,
                                                    ScheduleItem.FINISHED)
        self.finished_tasks.append(event.task.id)

        unstarted_items = self.get_ready_tasks(event.task, event.node)
        # generate new task start events
        self.post_new_events(unstarted_items)

    def _node_failed_handler(self, event):
        """Mark the node as down and the item that was executing on it as failed."""
        # check node down
        self.resource_manager.node(event.node).state = Node.Down
        # check failed event in schedule
        ## TODO: ambigious choice
        it = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id
            and item.state == ScheduleItem.EXECUTING
        ]
        if len(it) != 1:
            ## TODO: raise exception here
            pass
        it[0].state = ScheduleItem.FAILED
        it[0].end_time = self.current_time

    def _node_up_handler(self, event):
        """Mark the node as available and restart its first unfinished item.

        The item is taken from the initial schedule of the node and is
        re-posted with its original runtime starting at the current time.
        Nothing is posted when every item of the node is already finished.
        """
        # check node up
        self.resource_manager.node(event.node).state = Node.Unknown

        # get next task for this node
        next_sched_item = next(
            (item for item in self.initial_schedule.mapping[event.node]
             if item.job.id not in self.finished_tasks),
            None)
        if next_sched_item is None:
            # Every item on this node is finished. Previously the code fell
            # through and read ``.end_time`` on an empty list placeholder,
            # which raised AttributeError.
            return

        runtime = next_sched_item.end_time - next_sched_item.start_time
        start_time = self.current_time
        end_time = start_time + runtime

        actual_sched_item = ScheduleItem(next_sched_item.job, start_time,
                                         end_time)
        self.post_new_events([actual_sched_item])

    def get_ready_tasks(self, ptask, pnode):
        """Return new schedule items for the tasks that can start after ``ptask``.

        Candidates are the children of ``ptask`` and the job of the item
        returned by ``initial_schedule.get_next_item(ptask)``. A candidate
        must be ready, next to run on its node, not finished, not executing,
        and its node must not be down.

        :param ptask: the task that has just finished (or the head task)
        :param pnode: not read by this method
        :return: list of ``ScheduleItem`` with recomputed start and end times
        """
        unstarted_items = []

        next_for_ptask = self.initial_schedule.get_next_item(ptask)

        tsks = [tsk for tsk in ptask.children
                if self.is_ready(tsk) and self.is_next_to_run(tsk)]
        ##TODO: refactor it later
        if (next_for_ptask is not None
                and next_for_ptask.job not in tsks
                and self.is_ready(next_for_ptask.job)
                and self.is_next_to_run(next_for_ptask.job)):
            tsks.append(next_for_ptask.job)

        # tsks mustn't be finished, executing or their node is Down
        def appropriate_to_run(tsk):
            if tsk.id in self.finished_tasks:
                return False
            if self.current_schedule.is_executing(tsk):
                return False
            nd = self.initial_schedule.place(tsk)[0]
            if self.resource_manager.node(nd).state == Node.Down:
                return False
            return True

        tsks = [tsk for tsk in tsks if appropriate_to_run(tsk)]

        for child in tsks:
            (node, item) = self.initial_schedule.place(child)

            ## TODO: remake it later
            sitems = self.current_schedule.mapping.items()
            pids = [p.id for p in child.parents]
            # finished parent items of the child, keyed by parent task id
            mp = {
                it.job.id: (pnd, it)
                for (pnd, items) in sitems
                for it in items
                if (it.job.id in pids) and (it.state == ScheduleItem.FINISHED)
            }
            # time at which each parent's output is estimated to reach `node`
            estms = [
                it.end_time + self.estimator.estimate_transfer_time(
                    pnd, node, it.job, child)
                for (pnd, it) in mp.values()
            ]
            transf_end = max(estms) if estms else 0

            runtime = item.end_time - item.start_time
            start_time = max(self.current_time, transf_end)
            end_time = start_time + runtime

            unstarted_items.append(
                ScheduleItem(item.job, start_time, end_time))
        return unstarted_items

    def post_new_events(self, unstarted_items):
        """Post start/finish events for each item and add it to the current schedule.

        The node of every item is taken from the initial schedule.
        """
        for item in unstarted_items:
            (node, it) = self.initial_schedule.place(item.job)

            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)
            self.current_schedule.mapping[node].append(item)
class GAExecutor(FailRandom, BaseExecutor):
    """Event-driven executor that runs a workflow along a fixed schedule.

    Every task keeps the node and the relative order it has in
    ``initial_schedule``; only the actual start/end times are recomputed
    while events are processed. ``current_schedule`` records what was
    actually started.

    NOTE(review): ``post`` and ``_check_fail`` are not defined here;
    presumably they come from ``BaseExecutor`` / ``FailRandom`` - confirm.
    """

    def __init__(self, workflow, resource_manager, estimator,
                 base_fail_duration, base_fail_dispersion, initial_schedule):
        """Store collaborators and create an empty ``current_schedule``.

        :param workflow: workflow to execute; its ``head_task`` is treated
            as already finished.
        :param resource_manager: used to look up and update node state.
        :param estimator: provides ``estimate_transfer_time``.
        :param base_fail_duration: constant part of a node outage length.
        :param base_fail_dispersion: random part of a node outage length
            (multiplied by ``random.random()``).
        :param initial_schedule: the schedule to follow.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        self.workflow = workflow
        self.resource_manager = resource_manager
        self.estimator = estimator
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.initial_schedule = initial_schedule
        # Same nodes as the initial schedule, but nothing started yet.
        self.current_schedule = Schedule(
            {key: [] for key in initial_schedule.mapping.keys()})
        # Ids of finished tasks; the synthetic head task counts as done.
        self.finished_tasks = [self.workflow.head_task.id]
        ## TODO: correct this stub later
        self.logger = None

    def init(self):
        """Post start/finish events for the tasks ready right after the head task."""
        unstarted_tasks = self.get_ready_tasks(self.workflow.head_task, None)
        # run ready tasks
        self.post_new_events(unstarted_tasks)

    def is_ready(self, task):
        """Return True when every parent of ``task`` is in ``finished_tasks``."""
        return all(p.id in self.finished_tasks for p in task.parents)

    def is_next_to_run(self, task):
        """Return True when all items planned earlier on the task's node are finished.

        "Earlier" means a strictly smaller ``start_time`` in the initial
        schedule on the same node.
        """
        (node, item) = self.initial_schedule.place(task)
        return all(
            it.job.id in self.finished_tasks
            for it in self.initial_schedule.mapping[node]
            if it.start_time < item.start_time
        )

    def _task_start_handler(self, event):
        """Mark the started item EXECUTING and possibly simulate a node failure.

        When ``_check_fail`` reports a failure, NodeFailed and NodeUp events
        are posted and the pending TaskFinished event of the task is removed
        from the queue.
        """
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if self._check_fail(event.task, node):
            # generate fail time, post it
            duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
            time_of_fail = (item.end_time - self.current_time) * random.random()
            # Guarantee the failure happens strictly after the current moment.
            time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

            event_failed = NodeFailed(node, event.task)
            event_failed.time_happened = time_of_fail

            event_nodeup = NodeUp(node)
            event_nodeup.time_happened = time_of_fail + duration

            self.post(event_failed)
            self.post(event_nodeup)

            # remove TaskFinished event
            self.queue = deque([
                ev for ev in self.queue
                if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)
            ])

    def _task_finished_handler(self, event):
        """Record the task as finished and start whatever became runnable."""
        # check task finished
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
        self.finished_tasks.append(event.task.id)

        unstarted_items = self.get_ready_tasks(event.task, event.node)

        # generate new task start events
        self.post_new_events(unstarted_items)

    def _node_failed_handler(self, event):
        """Mark the node Down and the task that was executing on it FAILED.

        :raises Exception: when the node does not hold exactly one EXECUTING
            item for ``event.task``. The original stub fell through to
            ``it[0]``, which failed with a bare IndexError when nothing
            matched.
        """
        # check node down
        self.resource_manager.node(event.node).state = Node.Down
        # check failed event in schedule
        ## TODO: ambigious choice
        it = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING
        ]
        if len(it) != 1:
            raise Exception(" Trouble in finding of the task: count of found tasks {0}".format(len(it)))

        it[0].state = ScheduleItem.FAILED
        it[0].end_time = self.current_time

    def _node_up_handler(self, event):
        """Mark the node as up again and restart its first unfinished item.

        The first item of the initial schedule on this node whose task is
        not finished is started at ``current_time`` with its planned
        runtime. If every item on the node is already finished there is
        nothing to restart and the handler returns (previously this case
        crashed on attribute access of an empty-list placeholder).
        """
        # check node up
        self.resource_manager.node(event.node).state = Node.Unknown

        # get next task for this node
        next_sched_item = next(
            (item for item in self.initial_schedule.mapping[event.node]
             if item.job.id not in self.finished_tasks),
            None,
        )
        if next_sched_item is None:
            return

        runtime = next_sched_item.end_time - next_sched_item.start_time
        start_time = self.current_time
        end_time = start_time + runtime

        actual_sched_item = ScheduleItem(next_sched_item.job, start_time, end_time)
        self.post_new_events([actual_sched_item])

    def get_ready_tasks(self, ptask, pnode):
        """Build schedule items for tasks that can start now that ``ptask`` is done.

        Candidates are the children of ``ptask`` plus the item that follows
        ``ptask`` in the initial schedule (as returned by ``get_next_item``).
        A candidate is kept when it is ready, next to run on its node, not
        finished, not executing and its node is not Down.

        :param ptask: the task that has just finished (or the head task).
        :param pnode: node of ``ptask``; currently unused.
        :return: list of new ``ScheduleItem`` objects with actual start/end
            times; the start is the later of ``current_time`` and the moment
            all finished parents' data has been transferred.
        """
        unstarted_items = []

        next_for_ptask = self.initial_schedule.get_next_item(ptask)

        tsks = [
            tsk for tsk in ptask.children
            if self.is_ready(tsk) and self.is_next_to_run(tsk)
        ]
        ##TODO: refactor it later
        if (next_for_ptask is not None
                and next_for_ptask.job not in tsks
                and self.is_ready(next_for_ptask.job)
                and self.is_next_to_run(next_for_ptask.job)):
            tsks.append(next_for_ptask.job)

        # tsks mustn't be finished, executing or their node is Down
        def appropriate_to_run(tsk):
            if tsk.id in self.finished_tasks:
                return False
            if self.current_schedule.is_executing(tsk):
                return False
            nd = self.initial_schedule.place(tsk)[0]
            if self.resource_manager.node(nd).state == Node.Down:
                return False
            return True

        tsks = [tsk for tsk in tsks if appropriate_to_run(tsk)]

        for child in tsks:
            (node, item) = self.initial_schedule.place(child)

            ## TODO: remake it later
            sitems = self.current_schedule.mapping.items()
            pids = [p.id for p in child.parents]
            # Finished parent items keyed by task id (a later duplicate for
            # the same id overrides an earlier one).
            mp = {
                it.job.id: (pnd, it)
                for (pnd, items) in sitems
                for it in items
                if (it.job.id in pids) and (it.state == ScheduleItem.FINISHED)
            }
            estms = [
                it.end_time + self.estimator.estimate_transfer_time(pnd, node, it.job, child)
                for (pnd, it) in mp.values()
            ]
            transf_end = 0 if len(estms) == 0 else max(estms)

            runtime = item.end_time - item.start_time
            start_time = max(self.current_time, transf_end)
            end_time = start_time + runtime

            actual_sched_item = ScheduleItem(item.job, start_time, end_time)
            unstarted_items.append(actual_sched_item)

        return unstarted_items

    def post_new_events(self, unstarted_items):
        """Post TaskStart/TaskFinished events for each item and record it.

        The node is taken from the initial schedule; the item itself is
        appended to that node's list in ``current_schedule``.
        """
        for item in unstarted_items:
            (node, it) = self.initial_schedule.place(item.job)

            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

            self.current_schedule.mapping[node].append(item)
class CloudHeftExecutor(EventMachine):
    """Executor that backs dedicated-node tasks with replicas on cloud nodes.

    Dedicated nodes follow the schedule produced by ``heft_planner``. When a
    task starts on a dedicated node, free cloud nodes (queried through
    ``public_resources_manager``) may additionally receive the same task
    until ``desired_reliability`` is reached. ``register`` remembers, per
    task, whether such replicas are running or one has already finished.

    NOTE(review): ``post`` is not defined here; presumably it comes from
    ``EventMachine`` - confirm.
    """

    STATUS_RUNNING = 'running'
    STATUS_FINISHED = 'finished'

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion, desired_reliability, public_resource_manager, initial_schedule = None):
        """Store collaborators; the schedule itself is built in ``init``.

        :param heft_planner: planner used for (re)scheduling; also gives
            access to the estimator, the workflow and the resource manager.
        :param base_fail_duration: constant part of a node outage length.
        :param base_fail_dispersion: random part of a node outage length.
        :param desired_reliability: reliability level that cloud replicas
            should bring a task up to.
        :param public_resource_manager: manager of the cloud nodes.
        :param initial_schedule: optional schedule to start from instead of
            running the planner.
        """
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.desired_reliability = desired_reliability
        self.public_resources_manager = public_resource_manager
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        # task -> STATUS_RUNNING / STATUS_FINISHED for tasks sent to cloud
        self.register = dict()

    def init(self):
        """Create ``current_schedule`` and post events for its unstarted items.

        Without an initial schedule the planner is run on an empty schedule.
        Otherwise the initial schedule is copied item by item, re-binding
        each item to the task object of the planner's workflow (looked up
        by task id).
        """
        if self.initial_schedule is None:
            self.current_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(self.current_schedule)
        else:
            id_to_task = {tsk.id: tsk for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)}
            mapping = {
                node: [ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time) for item in items]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self.post_new_events()

    def event_arrived(self, event):
        """Dispatch one event (TaskStart, TaskFinished, NodeFailed, NodeUp).

        Events on cloud nodes only update the cloud bookkeeping; events on
        dedicated nodes update ``current_schedule`` and may trigger a
        reschedule. Always returns None.

        :raises Exception: on NodeFailed for a dedicated node that does not
            hold exactly one EXECUTING item of the failed task.
        """

        def reschedule(event):
            # Drop unstarted work, let the planner rebuild it from "now".
            self.heft_planner.current_time = self.current_time
            current_cleaned_schedule = self.clean_events(event)
            self.current_schedule = self.heft_planner.run(current_cleaned_schedule)
            self.post_new_events()

        def check_fail(reliability):
            # A fresh uniform draw above the reliability means "fails".
            return random.random() > reliability

        if isinstance(event, TaskStart):
            # TODO: if node is cloud node, do nothing
            prm = self.public_resources_manager
            if prm.isCloudNode(event.node):
                return None

            # check if failed and post
            (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
            item.state = ScheduleItem.EXECUTING

            # public_resources_manager:
            #   determine nodes of proper soft type
            #   check and determine free nodes
            #   determine reliability of every nodes
            #   determine time_of_execution probability for (task,node) pair

            # try to find nodes in cloud
            if event.task not in self.register:
                proper_nodes = prm.get_by_softreq(event.task.soft_reqs)
                proper_nodes = [nd for nd in proper_nodes if not prm.isBusy(nd)]
                sorted_proper_nodes = sorted(proper_nodes, key=lambda x: prm.get_reliability(x.name))
                current_set = []

                base_reliability = self.heft_planner.estimator.estimate_reliability(event.task, event.node)
                dt = item.end_time - item.start_time

                def calc(node, dt):
                    # TODO: add proper transfer time here
                    fp = prm.get_reliability(node.name)
                    comp_time = self.heft_planner.estimator.estimate_runtime(event.task, node)
                    cp = prm.probability_estimator(dt, comp_time, 0)
                    return (node, fp, cp)

                # Add cloud nodes one by one until the combined reliability
                # of the dedicated node plus the chosen replicas is enough.
                for pnode in sorted_proper_nodes:
                    common_reliability = 1 - base_reliability
                    #TODO: refactor this later
                    if 1 - common_reliability >= self.desired_reliability:
                        break
                    res = calc(pnode, dt)
                    current_set.append(res)
                    #TODO: add dencity law of probability for dedicated resource
                    for (nd, fp, cp) in current_set:
                        common_reliability *= (1 - fp * cp)
                    common_reliability = 1 - common_reliability
                    if common_reliability >= self.desired_reliability:
                        break

                def frange(x, y, jump):
                    while x < y:
                        yield x
                        x += jump

                for (nd, fp, cp) in current_set:
                    comp_time = self.heft_planner.estimator.estimate_runtime(event.task, nd)
                    #sigma 0.1*M lets take 0.6*M
                    # Sample a runtime: the first grid point whose estimated
                    # probability exceeds a uniform draw; otherwise the
                    # estimated runtime itself.
                    ints = [(i, calc(nd, i)) for i in frange(0, comp_time + 0.2 * comp_time, 0.05 * comp_time)]
                    rd = random.random()
                    generated_comp_time = comp_time
                    for (i, p) in ints:
                        if p[2] > rd:
                            generated_comp_time = i
                            break

                    if check_fail(fp):
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd
                        self.post(event_start)

                        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                        time_of_fail = generated_comp_time * random.random()
                        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

                        event_failed = NodeFailed(nd, event.task)
                        event_failed.time_happened = time_of_fail

                        event_nodeup = NodeUp(nd)
                        event_nodeup.time_happened = time_of_fail + duration

                        self.post(event_failed)
                        self.post(event_nodeup)
                    else:
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd

                        event_finish = TaskFinished(event.task)
                        event_finish.time_happened = self.current_time + generated_comp_time
                        event_finish.node = nd

                        self.post(event_start)
                        self.post(event_finish)
                    prm.checkBusy(nd, True)

                self.register[event.task] = CloudHeftExecutor.STATUS_RUNNING

            reliability = self.heft_planner.estimator.estimate_reliability(event.task, node)
            if check_fail(reliability):
                # generate fail time, post it
                duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                time_of_fail = (item.end_time - self.current_time) * random.random()
                time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

                event_failed = NodeFailed(node, event.task)
                event_failed.time_happened = time_of_fail

                event_nodeup = NodeUp(node)
                event_nodeup.time_happened = time_of_fail + duration

                self.post(event_failed)
                self.post(event_nodeup)
                # remove TaskFinished event (dedicated nodes only)
                self.queue = deque([
                    ev for ev in self.queue
                    if not (isinstance(ev, TaskFinished)
                            and ev.task.id == event.task.id
                            and not prm.isCloudNode(ev.node))
                ])
            return None

        if isinstance(event, TaskFinished):
            # check if it cloud task
            # if task cloud and first: register as finished, check node in dedicated as finish,
            #   remove appropriate event of failure or task finished for dedicated,
            #   free cloud node, reschedule, end_of_function
            # if task cloud and not first: free cloud node, end_of_function
            # if task not cloud and first: register as finished, check node in dedicated as finish, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)

            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_RUNNING:
                self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED

                ## TODO: correct it
                ## if event.task failed and went through rescheduling,
                ## it would be possible that currently ScheduleItem of event.task on dedicated resource
                ## has UNSTARTED state.
                ## TODO: add additional functional to schedule to record such situations and validate it after
                self.current_schedule.change_state_executed_with_end_time(event.task, ScheduleItem.FINISHED, self.current_time)
                pair = self.current_schedule.place_single(event.task)
                # NOTE(review): nesting of the inner `if` / `else` below is
                # reconstructed from the surrounding comments - confirm.
                if pair is not None:
                    ## TODO: The bug is here. Fix it later.
                    ## the unstarted case must be taken into account in schedule and in the validity check procedure too
                    (nd, item) = pair
                    if item.state == ScheduleItem.EXECUTING:
                        item.start_time = event.time_happened
                        item.end_time = event.time_happened
                        item.state = ScheduleItem.FINISHED
                        # Keep the queue a deque, like every other
                        # assignment to self.queue in this class.
                        self.queue = deque([
                            ev for ev in self.queue
                            if not (not isinstance(ev, NodeUp) and ev.task.id == event.task.id)
                        ])
                else:
                    prm.checkBusy(event.node, False)
                    return None

                def check(ev):
                    if isinstance(ev, TaskFinished) or isinstance(ev, NodeFailed):
                        if ev.task.id == event.task.id and not prm.isCloudNode(ev.node):
                            return False
                    ## TODO: make it later
                    ##if isinstance(ev, NodeUp):
                    return True

                # Was a plain list in the original; a deque keeps the queue
                # type stable for whoever consumes it next.
                self.queue = deque([ev for ev in self.queue if check(ev)])
                prm.checkBusy(event.node, False)
                reschedule(event)
                return None

            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_FINISHED:
                prm.checkBusy(event.node, False)
                return None

            # check task finished
            self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
            self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
            return None

        if isinstance(event, NodeFailed):
            # check if cloud node
            # if cloud node: check as down, free node, end_of_function
            # if not cloud node: check as down, reschedule, end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, True)
                prm.checkBusy(event.node, False)
                return None

            # check node down
            self.heft_planner.resource_manager.node(event.node).state = Node.Down
            # check failed event in schedule
            ## TODO: ambigious choice
            it = [
                item for item in self.current_schedule.mapping[event.node]
                if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING
            ]
            if len(it) != 1:
                # The original stub fell through to it[0] and died with a
                # bare IndexError when nothing matched.
                raise Exception(" Trouble in finding of the task: count of found tasks {0}".format(len(it)))

            it[0].state = ScheduleItem.FAILED
            it[0].end_time = self.current_time

            reschedule(event)
            return None

        if isinstance(event, NodeUp):
            # check if cloud
            # if cloud: check node up, end_of_function
            # if not cloud: check as up, reschedule end_of_function
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, False)
                return None

            # check node up
            self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
            reschedule(event)
            return None

        return None

    def post_new_events(self):
        """Post TaskStart/TaskFinished events for every UNSTARTED item."""
        unstarted_items = {
            (node, item)
            for (node, items) in self.current_schedule.mapping.items()
            for item in items
            if item.state == ScheduleItem.UNSTARTED
        }

        for (node, item) in unstarted_items:
            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

    def clean_events(self, event):
        """Return a schedule without UNSTARTED items and purge their events.

        TaskStart/TaskFinished events on dedicated (non-cloud) nodes are
        removed from the queue for every dropped task and, when ``event``
        is a NodeFailed, for the failed task as well.

        :param event: the event that triggered the reschedule.
        :return: a new ``Schedule`` holding only items that are not UNSTARTED.
        """
        # remove all unstarted tasks
        cleaned_task = set()
        if isinstance(event, NodeFailed):
            cleaned_task = set([event.task])

        new_mapping = dict()
        for (node, items) in self.current_schedule.mapping.items():
            new_mapping[node] = []
            for item in items:
                if item.state != ScheduleItem.UNSTARTED:
                    new_mapping[node].append(item)
                else:
                    cleaned_task.add(item.job)
        clean_schedule = Schedule(new_mapping)

        # remove all events associated with these tasks
        prm = self.public_resources_manager

        def keep(ev):
            if isinstance(ev, TaskStart) and ev.task in cleaned_task and not prm.isCloudNode(ev.node):
                return False
            if isinstance(ev, TaskFinished) and ev.task in cleaned_task and not prm.isCloudNode(ev.node):
                return False
            return True

        self.queue = deque([evnt for evnt in self.queue if keep(evnt)])
        return clean_schedule
def schedule(self, fixed_schedule_part=None, current_time=0.0):
    """Build a randomized schedule that respects task dependencies.

    Repeatedly picks a random ready task, a random node that is not Down
    and a random free time slot on that node, until no ready tasks remain.

    :param fixed_schedule_part: optional ``Schedule`` whose items are kept
        as they are; items in state EXECUTING, FINISHED or UNSTARTED are
        treated as already placed.
    :param current_time: no new item starts before this moment (with a
        0.01 tolerance).
    :return: a ``Schedule`` built from the resulting node -> items mapping.
    :raises ValueError: when a task has to be placed but no node is alive.
    """
    estimate = self.estimator.estimate_transfer_time

    # TODO: make common utility function with ScheduleBuilder
    def is_last_version_of_task_executing(item):
        return item.state in (ScheduleItem.EXECUTING,
                              ScheduleItem.FINISHED,
                              ScheduleItem.UNSTARTED)

    def is_child_ready(child):
        # Ready means every parent id is already in finished_tasks
        # (finished_tasks is bound below, before the first call).
        return all(p.id in finished_tasks for p in child.parents)

    if fixed_schedule_part is None:
        schedule_mapping = {node: [] for node in self.nodes}
        ready_tasks = [child.id for child in self.workflow.head_task.children]
        task_to_node = dict()
        finished_tasks = set()
    else:
        schedule_mapping = {
            node: list(items)
            for (node, items) in fixed_schedule_part.mapping.items()
        }
        placed_ids = [
            item.job.id
            for (node, items) in fixed_schedule_part.mapping.items()
            for item in items
            if is_last_version_of_task_executing(item)
        ]
        finished_tasks = set([self.workflow.head_task.id] + placed_ids)
        unfinished = [
            task for task in self.workflow.get_all_unique_tasks()
            if task.id not in finished_tasks
        ]
        ready_tasks = [task.id for task in unfinished if is_child_ready(task)]
        # task id -> (node, start, end) for everything already placed
        task_to_node = {
            item.job.id: (node, item.start_time, item.end_time)
            for (node, items) in fixed_schedule_part.mapping.items()
            for item in items
            if is_last_version_of_task_executing(item)
        }

    def find_slots(node, comm_ready, runtime):
        node_schedule = schedule_mapping.get(node, list())
        free_time = 0 if len(node_schedule) == 0 else node_schedule[-1].end_time
        ## TODO: refactor it later
        f_time = max(free_time, comm_ready)
        f_time = max(f_time, current_time)
        # Open-ended slot after the node's last item.
        base_variant = [(f_time, f_time + runtime + 1)]
        zero_interval = [] if len(node_schedule) == 0 else [(0, node_schedule[0].start_time)]
        # Gaps between consecutive items on the node.
        middle_intervals = [
            (prev.end_time, nxt.start_time)
            for (prev, nxt) in zip(node_schedule, node_schedule[1:])
        ]
        intervals = zero_interval + middle_intervals + base_variant

        ## TODO: rethink rounding
        return [
            (st, end) for (st, end) in intervals
            if (current_time < st or abs(current_time - st) < 0.01)
            and st >= comm_ready
            and (runtime < (end - st) or abs((end - st) - runtime) < 0.01)
        ]

    def comm_ready_func(task, node):
        ##TODO: remake this stub later.
        if len(task.parents) == 1 and self.workflow.head_task.id == list(task.parents)[0].id:
            return 0
        return max([
            task_to_node[p.id][2] + estimate(node, task_to_node[p.id][0], task, p)
            for p in task.parents
        ])

    def get_possible_execution_times(task, node):
        ## pay attention to the last element in the resulted seq
        ## it represents all available time of node after it completes all its work
        ## (if such interval can exist)
        ## time_slots = [(st1, end1),(st2, end2,...,(st_last, st_last + runtime)]
        runtime = self.estimator.estimate_runtime(task, node)
        comm_ready = comm_ready_func(task, node)
        time_slots = find_slots(node, comm_ready, runtime)
        return time_slots, runtime

    # Node states are not modified by this function, so compute this once.
    alive_nodes = [node for node in self.nodes if node.state != Node.Down]

    while len(ready_tasks) > 0:
        choosed_index = random.randint(0, len(ready_tasks) - 1)
        task = self.task_map[ready_tasks[choosed_index]]

        if not alive_nodes:
            # Previously surfaced as an opaque "empty range" ValueError
            # raised from inside random.randint.
            raise ValueError(
                "no alive nodes available to schedule task {0}".format(task.id))

        choosed_node_index = random.randint(0, len(alive_nodes) - 1)
        node = alive_nodes[choosed_node_index]

        time_slots, runtime = get_possible_execution_times(task, node)
        choosed_time_index = 0 if len(time_slots) == 1 else random.randint(0, len(time_slots) - 1)
        time_slot = time_slots[choosed_time_index]

        start_time = time_slot[0]
        end_time = start_time + runtime

        item = ScheduleItem(task, start_time, end_time)
        Schedule.insert_item(schedule_mapping, node, item)
        task_to_node[task.id] = (node, start_time, end_time)

        ready_tasks.remove(task.id)
        finished_tasks.add(task.id)

        ready_tasks.extend(
            child.id for child in task.children if is_child_ready(child))

    return Schedule(schedule_mapping)
wf_added_times = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] #wf_added_times = [0.1] initial_wf_name = "Montage_30" added_wf_name = "Montage_25" initial_wf = ExecutorRunner.get_wf(initial_wf_name, "00") added_wf = ExecutorRunner.get_wf(added_wf_name, "10") bundle = Utility.get_default_bundle() (estimator, resource_manager, initial_schedule) = ExecutorRunner.get_infrastructure(bundle, 1.0, False) ## planning for initial wf heft = DynamicHeft(initial_wf, resource_manager, estimator) empty_schedule = Schedule({node: [] for node in heft.get_nodes()}) ga = GAComputationManager(15, initial_wf, resource_manager, estimator) ga_initial_schedule = ga._get_ga_alg()(empty_schedule, None)[2] all_initial_wf_time = Utility.makespan(ga_initial_schedule) print("Initial time: " + str(all_initial_wf_time)) n = 5 ## planning for added wf def gaheft_reschedule(wf_added_time): copy_gaheft_schedule = Schedule({
class GaHeftExecutor(FailRandom, BaseExecutor):
    """Executor that combines a HEFT planner with a background GA run.

    After a node failure or recovery the schedule is rebuilt through
    ``_reschedule`` and a "background" GA computation is registered in
    ``back_cmp``. When simulation time reaches ``back_cmp.time_to_stop``
    the GA is actually executed and its schedule may replace the current
    one.

    NOTE(review): ``_reschedule``, ``_post_new_events``, ``_clean_events``,
    ``_remove_events`` and ``_check_fail`` are not defined in this class;
    presumably inherited from ``BaseExecutor`` / ``FailRandom`` - confirm.
    """

    #@trace
    def __init__(self, **kwargs):
        """Read the configuration from keyword arguments.

        Required keys: ``wf``, ``resource_manager``, ``heft_planner``,
        ``base_fail_duration``, ``base_fail_dispersion``,
        ``fixed_interval_for_ga``, ``ga_builder``. Optional:
        ``replace_anyway`` (defaults to True).
        """
        super().__init__(**kwargs)

        self.workflow = kwargs["wf"]
        self.resource_manager = kwargs["resource_manager"]
        # DynamicHeft
        # both planners have acess to resource manager and estimator
        self.heft_planner = kwargs["heft_planner"]
        self.base_fail_duration = kwargs["base_fail_duration"]
        self.base_fail_dispersion = kwargs["base_fail_dispersion"]
        self.current_schedule = None
        self.fixed_interval_for_ga = kwargs["fixed_interval_for_ga"]
        self.ga_builder = kwargs["ga_builder"]
        self.replace_anyway = kwargs.get("replace_anyway", True)

        # Pending background GA computation, or None.
        self.back_cmp = None

    def init(self):
        """Build the starting schedule with HEFT and GA; return the GA result."""
        self.current_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
        empty_schedule = Schedule({node: [] for node in self.heft_planner.get_nodes()})
        heft_schedule = self.heft_planner.run(empty_schedule)

        # TODO: change these two ugly records
        result = self.ga_builder()(self.current_schedule, heft_schedule)

        applied = self._apply_mh_if_better(
            None,
            heuristic_resulted_schedule=heft_schedule,
            metaheuristic_resulted_schedule=result[0][2])
        if not applied:
            # GA result rejected: fall back to the HEFT schedule.
            self.current_schedule = heft_schedule
            self._post_new_events()

        return result

    def _task_start_handler(self, event):
        """Mark the item EXECUTING and possibly simulate a node failure.

        Processing stops early when a due GA result replaced the schedule.
        """
        if self._check_event_for_ga_result(event):
            return

        # check if failed and post
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if not self._is_a_fail_possible():
            return
        if not self._check_fail(event.task, node):
            return

        # generate fail time, post it
        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
        time_of_fail = (item.end_time - self.current_time) * random.random()
        time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

        failure = NodeFailed(node, event.task)
        failure.time_happened = time_of_fail

        recovery = NodeUp(node)
        recovery.time_happened = time_of_fail + duration

        self.post(failure)
        self.post(recovery)

    def _task_finished_handler(self, event):
        """Mark the task FINISHED, then check whether a GA result is due."""
        # check task finished
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)
        self._check_event_for_ga_result(event)

    def _node_failed_handler(self, event):
        """Handle a node failure: mark it, reschedule, register a new GA run.

        :raises Exception: when the node does not hold exactly one EXECUTING
            item of the failed task.
        """
        if not self._is_a_fail_possible():
            return

        self._remove_events(lambda ev: not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id))

        ## interrupt ga
        self._stop_ga()
        # check node down
        self.resource_manager.node(event.node).state = Node.Down
        # check failed event in schedule
        ## TODO: ambigious choice
        executing = [
            item for item in self.current_schedule.mapping[event.node]
            if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING
        ]
        if len(executing) != 1:
            raise Exception(" Trouble in finding of the task: count of found tasks {0}".format(len(executing)))

        failed_item = executing[0]
        failed_item.state = ScheduleItem.FAILED
        failed_item.end_time = self.current_time

        # run HEFT
        self._reschedule(event)
        #run GA
        self._run_ga_in_background(event)

    def _node_up_handler(self, event):
        """Handle a node recovery: mark it, reschedule, register a new GA run."""
        ## interrupt ga
        self._stop_ga()
        # check node up
        self.heft_planner.resource_manager.node(event.node).state = Node.Unknown

        self._reschedule(event)
        #run GA
        self._run_ga_in_background(event)

    def _stop_ga(self):
        """Forget the pending background GA computation, if any."""
        self.back_cmp = None

    def _actual_ga_run(self):
        """Run the GA on the data stored in ``back_cmp`` and return its result."""
        ## this way makes it possible to calculate what time
        ## ga actually has to find solution
        ## this value is important when you need account events between
        ## planned start and stop points
        # ga_interval = self.current_time - self.back_cmp.creation_time

        ## fixed_schedule is actual because
        ## we can be here only if there haven't been any invalidate events
        ## such as node failures
        ## in other case current ga background computation would be dropped
        ## and we wouldn't get here at all
        pending = self.back_cmp
        result = self.ga_builder()(pending.fixed_schedule,
                                   pending.current_schedule,
                                   self.current_time)
        print("CURRENT MAKESPAN: {0}".format(Utility.makespan(result[0][2])))
        return result

    def _check_event_for_ga_result(self, event):
        """Run the pending GA if its stop time is now; report a replacement.

        :return: True when the GA schedule replaced the current one,
            False otherwise (including when nothing is due).
        """
        # check for time to get result from GA running background
        if self.back_cmp is None:
            return False
        if self.back_cmp.time_to_stop != self.current_time:
            return False

        print("Event {0}".format(event))
        if isinstance(event, TaskStart):
            print("Task id {0}".format(event.task.id))

        result = self._actual_ga_run()
        if result is None:
            return False
        return self._apply_mh_if_better(
            event,
            heuristic_resulted_schedule=self.current_schedule,
            metaheuristic_resulted_schedule=result[0][2])

    def _replace_current_schedule(self, event, new_schedule):
        """Swap in ``new_schedule`` and post its events.

        Events of the old schedule are cleaned first unless ``event`` is
        None. The pending GA computation is dropped.
        """
        # syncrhonize fixed part of new_schedule with the old schedule - lets assume new_schedule already synchonized
        # remove all events related with the old schedule
        # replace current with new
        # generate events of new schedule and post their
        if event is not None:
            self._clean_events(event)
        self.current_schedule = new_schedule
        self._post_new_events()
        self.back_cmp = None

    def _apply_mh_if_better(self, event, heuristic_resulted_schedule, metaheuristic_resulted_schedule):
        """Adopt the metaheuristic schedule when allowed; return whether it was.

        The schedule is adopted when ``replace_anyway`` is True or its
        makespan is strictly smaller than the heuristic one. In either case
        ``back_cmp`` ends up cleared.
        """
        mh_makespan = Utility.makespan(metaheuristic_resulted_schedule)
        h_makespan = Utility.makespan(heuristic_resulted_schedule)
        print("Replace anyway - {0}".format(self.replace_anyway))

        if not (self.replace_anyway is True or mh_makespan < h_makespan):
            ## TODO: run_ga_yet_another_with_old_genome
            self.back_cmp = None
            return False

        ## generate new events
        self._replace_current_schedule(event, metaheuristic_resulted_schedule)
        ## if event is TaskStarted event the return value means skip further processing
        return True

    def _is_a_fail_possible(self):
        """Always True; hook used before simulating or handling a failure."""
        return True

    def _run_ga_in_background(self, event):
        """Register a background GA computation starting from the current state.

        Nothing is registered when every node is Down, when no schedule item
        ends after ``current_time + fixed_interval_for_ga``, or when the
        fixed part already covers as many task ids as the workflow has.

        :raises Exception: when the fixed schedule lacks a node known to the
            resource manager (compared by hash).
        """
        if not any(nd.state != Node.Down for nd in self.resource_manager.get_nodes()):
            return

        current_schedule = self.current_schedule
        current_time = self.current_time

        ## TODO: replace by log call
        print("Time: " + str(current_time) + " Creating reschedule point ")

        ## there can be several events in one time
        ## we choose the first to handle background GA run
        def _get_front_line(schedule, current_time, fixed_interval):
            event_time = current_time + fixed_interval
            min_item = ScheduleItem.MIN_ITEM()
            for (node, items) in schedule.mapping.items():
                for candidate in items:
                    ## It accounts case when event_time appears in a transfer gap(rare situation for all nodes)
                    ## TODO: compare with some precison
                    if event_time < candidate.end_time < min_item.end_time:
                        min_item = candidate
                        break
            if min_item.job is None:
                return None
            print("Time: " + str(current_time) + " reschedule point have been founded st:" + str(min_item.start_time) + " end:" + str(min_item.end_time))
            return min_item

        def _get_fixed_schedule(schedule, front_event):
            def is_before_event(item):
                # hard to resolve corner case. The simulator doesn't guranteed the order of appearing events.
                if item.start_time < front_event.end_time:
                    return True
                ## TODO: Urgent!!! experimental change. Perhaps, It should be removed from here later.
                if item.state == ScheduleItem.FINISHED or item.state == ScheduleItem.FAILED:
                    return True
                return False

            ##TODO: it's dangerous operation.
            ## TODO: need create new example of ScheduleItem.
            def set_proper_state(item):
                # Work on a copy; predict the state at the front line.
                new_item = ScheduleItem.copy(item)
                non_finished = new_item.state == ScheduleItem.EXECUTING or new_item.state == ScheduleItem.UNSTARTED
                ## TODO: Urgent!: dangerous place
                if non_finished:
                    if new_item.end_time <= front_event.end_time:
                        new_item.state = ScheduleItem.FINISHED
                    elif new_item.end_time > front_event.end_time:
                        new_item.state = ScheduleItem.EXECUTING
                return new_item

            fixed_mapping = {
                key: [set_proper_state(item) for item in items if is_before_event(item)]
                for (key, items) in schedule.mapping.items()
            }
            return Schedule(fixed_mapping)

        ## TODO: make previous_result used
        def run_ga(current_schedule):
            front_event = _get_front_line(current_schedule, current_time, self.fixed_interval_for_ga)
            # we can't meet the end of computation so we do nothing
            if front_event is None:
                print("GA's computation isn't able to meet the end of computation")
                return
            fixed_schedule = _get_fixed_schedule(current_schedule, front_event)

            #TODO: It isn't a good reliable solution. It should be reconsider later.
            fixed_ids = set(fixed_schedule.get_all_unique_tasks_id())
            all_ids = set(task.id for task in self.workflow.get_all_unique_tasks())

            ## TODO: urgent bugfix to correctly run GaHeftvsHeft
            if len(fixed_ids) == len(all_ids):
                print("Fixed schedule is complete. There is no use to run ga.")
                return

            fixed_hashes = [hash(key) for key in fixed_schedule.mapping.keys()]
            rm_hashes = [hash(node) for node in self.resource_manager.get_nodes()]
            if any(h not in fixed_hashes for h in rm_hashes):
                raise Exception("Fixed schedule is broken")

            self.back_cmp = BackCmp(fixed_schedule, None, self.current_schedule, event, current_time, front_event.end_time)

        # A computation that is still pending is discarded before a new one
        # is registered.
        if self.back_cmp is not None:
            self.back_cmp = None
        run_ga(current_schedule)
class GaOldPopExecutor(FailOnce, BaseExecutor):
    """Executor that reschedules with a GA seeded from its previous population.

    On every node failure / node recovery the remaining work is rescheduled
    twice: once starting from the cleaned previous population and once from a
    random population. The first result becomes the current schedule; both
    are reported to ``stat_saver`` (when given) for comparison.

    Helpers such as ``_check_fail``, ``_clean_events``, ``_post_new_events``,
    ``post``, ``queue`` and ``current_time`` are not defined here; presumably
    they come from the base classes - verify there.
    """

    def __init__(self, **kwargs):
        """Read the configuration from keyword arguments.

        Required keys: ``estimator``, ``base_fail_duration``,
        ``base_fail_dispersion``, ``wf``, ``resource_manager``,
        ``stat_saver``, ``task_id_to_fail`` and ``ga_builder``.
        A missing key raises ``KeyError``.
        """
        super().__init__()
        self.estimator = kwargs["estimator"]
        self.base_fail_duration = kwargs["base_fail_duration"]
        self.base_fail_dispersion = kwargs["base_fail_dispersion"]
        self.workflow = kwargs["wf"]
        self.resource_manager = kwargs["resource_manager"]
        self.stat_saver = kwargs["stat_saver"]
        self.task_id_to_fail = kwargs["task_id_to_fail"]
        self.ga_builder = kwargs["ga_builder"]

        self.current_schedule = None
        # Population of the last GA run; used to seed the next reschedule.
        self.past_pop = None

    def init(self):
        """Build the initial schedule with the GA and post its events."""
        ## TODO: replace it with logging
        print("Working with initial state of nodes: {0}".format([n.flops for n in self.resource_manager.get_nodes()]))

        ga_planner = self.ga_builder()
        self.current_schedule = Schedule({node: [] for node in self.resource_manager.get_nodes()})
        (result, logbook) = ga_planner(self.current_schedule, None)
        self.past_pop = ga_planner.get_pop()
        # result[2] is used as the resulting schedule.
        print("Result makespan: " + str(Utility.makespan(result[2])))
        self.current_schedule = result[2]
        self._post_new_events()
        self.failed_once = False

    def _task_start_handler(self, event):
        """Mark the task as executing and, if it is chosen to fail, post the failure."""
        (node, item) = self.current_schedule.place_by_time(event.task, event.time_happened)
        item.state = ScheduleItem.EXECUTING

        if self._check_fail(event.task, node):
            # `duration` is only needed by the disabled NodeUp event below,
            # but the draw is kept so the random sequence stays unchanged.
            duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
            time_of_fail = (item.end_time - self.current_time) * random.random()
            # A non-positive offset is replaced by a small positive one.
            time_of_fail = self.current_time + (time_of_fail if time_of_fail > 0 else 0.01)

            event_failed = NodeFailed(node, event.task)
            event_failed.time_happened = time_of_fail

            # event_nodeup = NodeUp(node)
            # event_nodeup.time_happened = time_of_fail + duration

            self.post(event_failed)
            # self.post(event_nodeup)

            # The task will fail, so its TaskFinished event must not fire.
            ##TODO: make a function for this purpose in the base class
            self.queue = deque([ev for ev in self.queue
                                if not (isinstance(ev, TaskFinished) and ev.task.id == event.task.id)])

    def _task_finished_handler(self, event):
        """Mark the executed task as finished in the current schedule."""
        self.current_schedule.change_state_executed(event.task, ScheduleItem.FINISHED)

    def _node_failed_handler(self, event):
        """Mark the node as down, fail its executing item and reschedule.

        Raises:
            Exception: if the node does not hold exactly one executing item
                for the failed task.
        """
        self.resource_manager.node(event.node).state = Node.Down
        it = [item for item in self.current_schedule.mapping[event.node]
              if item.job.id == event.task.id and item.state == ScheduleItem.EXECUTING]
        if len(it) != 1:
            raise Exception("several items founded")

        it[0].state = ScheduleItem.FAILED
        it[0].end_time = self.current_time

        self._reschedule(event)

    def _node_up_handler(self, event):
        """Mark the node as ``Node.Unknown`` and reschedule."""
        self.resource_manager.node(event.node).state = Node.Unknown
        self._reschedule(event)

    def _clean_chromosome(self, chromosome, event, current_cleaned_schedule):
        """Adapt a chromosome of the previous population to the new state.

        The chromosome (a mapping of node name to a list of task ids) is
        modified in place and returned. Ids of tasks that are FINISHED or
        EXECUTING in ``current_cleaned_schedule`` are removed. For a
        ``NodeFailed`` event the tasks of the failed node are moved to
        randomly chosen positions on the other nodes.

        Raises:
            ValueError: if tasks have to be moved off the failed node but the
                chromosome contains no other node.
        """
        # A set gives O(1) membership checks; the ids are only tested, never
        # iterated, so ordering is irrelevant.
        fixed_ids = {
            item.job.id
            for (node, items) in current_cleaned_schedule.mapping.items()
            for item in items
            if item.state == ScheduleItem.FINISHED or item.state == ScheduleItem.EXECUTING
        }

        for ids in chromosome.values():
            # Slice assignment keeps the same list object inside the chromosome.
            ids[:] = [task_id for task_id in ids if task_id not in fixed_ids]

        if isinstance(event, NodeFailed):
            tasks = chromosome[event.node.name]
            ## TODO: here must be a procedure of getting currently alive nodes
            working_nodes = list(chromosome.keys() - {event.node.name})
            if tasks and not working_nodes:
                # Previously surfaced as an opaque error from random.randint(0, -1).
                raise ValueError(
                    "There is no other node to move tasks of the failed node to")
            for t in tasks:
                lt = len(working_nodes) - 1
                new_node = 0 if lt == 0 else random.randint(0, lt)
                node_name = working_nodes[new_node]
                length = len(chromosome[node_name])
                # TODO: correct 0 and length
                new_place = 0 if length == 0 else random.randint(0, length)
                chromosome[node_name].insert(new_place, t)
            chromosome[event.node.name] = []
            return chromosome

        # NodeUp (and any other event) needs no task relocation.
        return chromosome

    def _reschedule(self, event):
        """Reschedule the unfinished part after ``event``.

        Runs the GA with the cleaned previous population (its result becomes
        ``self.current_schedule``) and once more with a random population for
        comparison, reports both to ``self.stat_saver`` when it is not None,
        and posts the events of the new schedule. Returns early, without
        posting, when the cleaned schedule already contains every task.
        """
        current_cleaned_schedule = self._clean_events(event)

        task_id = "" if not hasattr(event, 'task') else " " + str(event.task.id)

        ## scheduling with initial population created of the previous
        ## population by moving elements from a downed node
        print("Scheduling with the old pop: " + str(event.__class__.__name__) + task_id )
        ga_planner = self.ga_builder()

        cleaned_chromosomes = [self._clean_chromosome(ch, event, current_cleaned_schedule)
                               for ch in self.past_pop]

        def is_empty(ch):
            return all(len(items) == 0 for items in ch.values())

        cleaned_chromosomes = [ch for ch in cleaned_chromosomes if not is_empty(ch)]
        # None is passed to the planner instead of an empty population.
        cleaned_chromosomes = None if len(cleaned_chromosomes) == 0 else cleaned_chromosomes

        curr_ids = frozenset(current_cleaned_schedule.get_all_unique_tasks_id())
        all_ids = frozenset(t.id for t in self.workflow.get_all_unique_tasks())
        if all_ids == curr_ids:
            print("Schedule alleady has all unique tasks")
            return

        ((v1, v2, resulted_schedule, iter_old_pop), logbook_old_pop) = ga_planner(
            current_cleaned_schedule, None, self.current_time,
            initial_population=cleaned_chromosomes)
        # checking
        Utility.check_and_raise_for_fixed_part(resulted_schedule, current_cleaned_schedule, self.current_time)
        makespan_old_pop = Utility.makespan(resulted_schedule)
        print("Result makespan: " + str(makespan_old_pop))

        self.current_schedule = resulted_schedule
        self.past_pop = ga_planner.get_pop()

        ## scheduling with random initial population
        print("Scheduling with a random pop: " + str(event.__class__.__name__)+ task_id)
        ga_planner_with_random_init_population = self.ga_builder()
        ((v3, v4, schedule_with_random, iter_random), logbook_random) = ga_planner_with_random_init_population(
            current_cleaned_schedule, None, self.current_time, initial_population=None)

        Utility.check_and_raise_for_fixed_part(schedule_with_random, current_cleaned_schedule, self.current_time)
        makespan_random = Utility.makespan(schedule_with_random)
        print("Result makespan: " + str(makespan_random))

        # creating and writing some stat data
        # Note: it can be rewritten with using of events
        if self.stat_saver is not None:
            stat_data = {
                "wf_name": self.workflow.name,
                "event_name": event.__class__.__name__,
                "task_id": task_id,
                "with_old_pop": {
                    "iter": iter_old_pop,
                    "makespan": makespan_old_pop,
                    "pop_aggr": logbook_old_pop
                },
                "with_random": {
                    "iter": iter_random,
                    "makespan": makespan_random,
                    "pop_aggr": logbook_random
                }
            }
            self.stat_saver(stat_data)

        # NOTE(review): posting is treated as unconditional (outside the
        # stat_saver branch); the flattened source is ambiguous - confirm.
        self._post_new_events()
class CloudHeftExecutor(EventMachine):
    """Event-driven executor that backs dedicated nodes with cloud nodes.

    When a task starts on a dedicated node, copies of it may be launched on
    free cloud nodes until the combined reliability reaches
    ``desired_reliability``. ``self.register`` remembers, per task, whether
    it is still running or has already finished somewhere.

    ``post`` is not defined here; presumably it comes from ``EventMachine``.
    """

    STATUS_RUNNING = 'running'
    STATUS_FINISHED = 'finished'

    def __init__(self, heft_planner, base_fail_duration, base_fail_dispersion,
                 desired_reliability, public_resource_manager,
                 initial_schedule=None):
        """Store the collaborators; the base initialiser is not called."""
        ## TODO: remake it later
        self.queue = deque()
        self.current_time = 0
        # DynamicHeft
        self.heft_planner = heft_planner
        self.base_fail_duration = base_fail_duration
        self.base_fail_dispersion = base_fail_dispersion
        self.desired_reliability = desired_reliability
        self.public_resources_manager = public_resource_manager
        self.initial_schedule = initial_schedule
        self.current_schedule = initial_schedule

        # task -> STATUS_RUNNING / STATUS_FINISHED
        self.register = dict()

    def init(self):
        """Create the starting schedule and post its events.

        Without an initial schedule the planner is run on an empty one.
        Otherwise the initial schedule is rebuilt with fresh ``ScheduleItem``
        objects bound to the planner workflow's own task objects (matched by
        task id).
        """
        if self.initial_schedule is None:
            self.current_schedule = Schedule(
                {node: [] for node in self.heft_planner.get_nodes()})
            self.current_schedule = self.heft_planner.run(self.current_schedule)
        else:
            id_to_task = {
                tsk.id: tsk
                for tsk in HeftHelper.get_all_tasks(self.heft_planner.workflow)
            }
            mapping = {
                node: [
                    ScheduleItem(id_to_task[item.job.id], item.start_time, item.end_time)
                    for item in items
                ]
                for (node, items) in self.initial_schedule.mapping.items()
            }
            self.current_schedule = Schedule(mapping)
        self.post_new_events()

    def event_arrived(self, event):
        """Handle one simulation event; always returns None.

        Dispatches on the event type (TaskStart, TaskFinished, NodeFailed,
        NodeUp); any other event is ignored.
        """

        def reschedule(event):
            # Re-plan everything that has not started yet.
            self.heft_planner.current_time = self.current_time
            current_cleaned_schedule = self.clean_events(event)
            self.current_schedule = self.heft_planner.run(current_cleaned_schedule)
            self.post_new_events()

        def check_fail(reliability):
            # A failure happens when the random draw exceeds the reliability.
            res = random.random()
            if res > reliability:
                return True
            return False

        if isinstance(event, TaskStart):
            prm = self.public_resources_manager
            # Starts on cloud nodes need no handling.
            if prm.isCloudNode(event.node):
                return None

            (node, item) = self.current_schedule.place_by_time(
                event.task, event.time_happened)
            item.state = ScheduleItem.EXECUTING

            # Cloud copies are launched only the first time a task is seen.
            if event.task not in self.register:
                proper_nodes = prm.get_by_softreq(event.task.soft_reqs)
                proper_nodes = [
                    cloud_node for cloud_node in proper_nodes
                    if not prm.isBusy(cloud_node)
                ]
                sorted_proper_nodes = sorted(
                    proper_nodes, key=lambda x: prm.get_reliability(x.name))
                current_set = []

                base_reliability = self.heft_planner.estimator.estimate_reliability(
                    event.task, event.node)
                dt = item.end_time - item.start_time

                def calc(node, dt):
                    # Returns (node, node reliability, probability estimate).
                    # TODO: add proper transfer time here
                    fp = prm.get_reliability(node.name)
                    comp_time = self.heft_planner.estimator.estimate_runtime(
                        event.task, node)
                    cp = prm.probability_estimator(dt, comp_time, 0)
                    return (node, fp, cp)

                # Add cloud nodes one by one until the desired reliability
                # is reached or the candidates are exhausted.
                for pnode in sorted_proper_nodes:
                    common_reliability = 1 - base_reliability
                    #TODO: refactor this later
                    if 1 - common_reliability >= self.desired_reliability:
                        break
                    res = calc(pnode, dt)
                    current_set.append(res)
                    #TODO: add density law of probability for dedicated resource
                    for (nd, fp, cp) in current_set:
                        common_reliability *= (1 - fp * cp)
                    common_reliability = 1 - common_reliability
                    if common_reliability >= self.desired_reliability:
                        break

                def frange(x, y, jump):
                    # Float range: x, x + jump, ... while below y.
                    while x < y:
                        yield x
                        x += jump

                for (nd, fp, cp) in current_set:
                    comp_time = self.heft_planner.estimator.estimate_runtime(
                        event.task, nd)
                    # Sample a runtime: the first grid point whose
                    # probability estimate exceeds a random draw.
                    ints = [(i, calc(nd, i))
                            for i in frange(0, comp_time + 0.2 * comp_time,
                                            0.05 * comp_time)]
                    rd = random.random()
                    generated_comp_time = comp_time
                    for (i, p) in ints:
                        if p[2] > rd:
                            generated_comp_time = i
                            break

                    if check_fail(fp):
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd
                        self.post(event_start)

                        duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                        time_of_fail = generated_comp_time * random.random()
                        time_of_fail = self.current_time + (
                            time_of_fail if time_of_fail > 0 else 0.01)

                        event_failed = NodeFailed(nd, event.task)
                        event_failed.time_happened = time_of_fail

                        event_nodeup = NodeUp(nd)
                        event_nodeup.time_happened = time_of_fail + duration

                        self.post(event_failed)
                        self.post(event_nodeup)
                    else:
                        event_start = TaskStart(event.task)
                        event_start.time_happened = self.current_time
                        event_start.node = nd

                        event_finish = TaskFinished(event.task)
                        event_finish.time_happened = self.current_time + generated_comp_time
                        event_finish.node = nd

                        self.post(event_start)
                        self.post(event_finish)

                    # NOTE(review): marking the node busy is read as applying
                    # to both branches above (flattened source) - confirm.
                    prm.checkBusy(nd, True)

                # NOTE(review): read as executed once per first start, even
                # when no cloud node was selected - confirm.
                self.register[event.task] = CloudHeftExecutor.STATUS_RUNNING

            reliability = self.heft_planner.estimator.estimate_reliability(
                event.task, node)
            if check_fail(reliability):
                # generate fail time, post it
                duration = self.base_fail_duration + self.base_fail_dispersion * random.random()
                time_of_fail = (item.end_time - self.current_time) * random.random()
                time_of_fail = self.current_time + (
                    time_of_fail if time_of_fail > 0 else 0.01)

                event_failed = NodeFailed(node, event.task)
                event_failed.time_happened = time_of_fail

                event_nodeup = NodeUp(node)
                event_nodeup.time_happened = time_of_fail + duration

                self.post(event_failed)
                self.post(event_nodeup)

                # The dedicated run will fail: drop its TaskFinished event,
                # but keep the ones posted for cloud copies.
                self.queue = deque([
                    ev for ev in self.queue
                    if not (isinstance(ev, TaskFinished)
                            and ev.task.id == event.task.id
                            and not prm.isCloudNode(ev.node))
                ])
            return None

        if isinstance(event, TaskFinished):
            # cloud and first:  register as finished, finish the dedicated
            #                   item, drop the dedicated events, free the
            #                   cloud node, reschedule
            # cloud, not first: free the cloud node
            # not cloud:        register as finished, finish the item
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)

            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_RUNNING:
                self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED

                ## TODO: correct it
                ## if event.task failed and went through rescheduling, the
                ## ScheduleItem of event.task on the dedicated resource may
                ## currently be in the UNSTARTED state.
                ## TODO: let the schedule record such situations and validate them
                self.current_schedule.change_state_executed_with_end_time(
                    event.task, ScheduleItem.FINISHED, self.current_time)

                pair = self.current_schedule.place_single(event.task)
                if pair is not None:
                    ## TODO: The bug is here. Fix it later.
                    ## the unstarted case must be taken into account in the
                    ## schedule and in the validity check procedure too
                    (nd, item) = pair
                    if item.state == ScheduleItem.EXECUTING:
                        item.start_time = event.time_happened
                        item.end_time = event.time_happened
                        item.state = ScheduleItem.FINISHED
                        # Kept as a deque, like every other rebuild of the queue.
                        self.queue = deque([
                            ev for ev in self.queue
                            if not (not isinstance(ev, NodeUp)
                                    and ev.task.id == event.task.id)
                        ])
                else:
                    # NOTE(review): this `else` is read as pairing with
                    # `if pair is not None`; the flattened source would also
                    # allow pairing with the inner `if` - confirm.
                    prm.checkBusy(event.node, False)
                    return None

                def check(ev):
                    # Drop finish/failure events of this task on dedicated nodes.
                    if isinstance(ev, TaskFinished) or isinstance(ev, NodeFailed):
                        if ev.task.id == event.task.id and not prm.isCloudNode(ev.node):
                            return False
                    ## TODO: make it later
                    ##if isinstance(ev, NodeUp):
                    return True

                self.queue = deque([ev for ev in self.queue if check(ev)])
                prm.checkBusy(event.node, False)
                reschedule(event)
                return None

            if from_cloud and self.register[event.task] == CloudHeftExecutor.STATUS_FINISHED:
                prm.checkBusy(event.node, False)
                return None

            # check task finished
            self.register[event.task] = CloudHeftExecutor.STATUS_FINISHED
            self.current_schedule.change_state_executed(
                event.task, ScheduleItem.FINISHED)
            return None

        if isinstance(event, NodeFailed):
            # cloud node:     mark as down, free the node
            # dedicated node: mark as down, fail the item, reschedule
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, True)
                prm.checkBusy(event.node, False)
                return None

            self.heft_planner.resource_manager.node(event.node).state = Node.Down
            ## TODO: ambiguous choice
            ##self.current_schedule.change_state(event.task, ScheduleItem.FAILED)
            it = [
                item for item in self.current_schedule.mapping[event.node]
                if item.job.id == event.task.id
                and item.state == ScheduleItem.EXECUTING
            ]
            if len(it) != 1:
                ## TODO: raise exception here
                pass

            it[0].state = ScheduleItem.FAILED
            it[0].end_time = self.current_time

            reschedule(event)
            return None

        if isinstance(event, NodeUp):
            # cloud node:     mark as up
            # dedicated node: mark as up, reschedule
            prm = self.public_resources_manager
            from_cloud = prm.isCloudNode(event.node)
            if from_cloud:
                prm.checkDown(event.node.name, False)
                return None

            self.heft_planner.resource_manager.node(event.node).state = Node.Unknown
            reschedule(event)
            return None
        return None

    def post_new_events(self):
        """Post TaskStart and TaskFinished events for every unstarted item."""
        unstarted_items = set()
        for (node, items) in self.current_schedule.mapping.items():
            for item in items:
                if item.state == ScheduleItem.UNSTARTED:
                    unstarted_items.add((node, item))

        for (node, item) in unstarted_items:
            event_start = TaskStart(item.job)
            event_start.time_happened = item.start_time
            event_start.node = node

            event_finish = TaskFinished(item.job)
            event_finish.time_happened = item.end_time
            event_finish.node = node

            self.post(event_start)
            self.post(event_finish)

    def clean_events(self, event):
        """Drop unstarted items and their queued events; return the cleaned schedule.

        The returned ``Schedule`` holds every item that is not UNSTARTED.
        TaskStart / TaskFinished events on dedicated (non-cloud) nodes are
        removed from the queue for the dropped tasks and, for a
        ``NodeFailed`` event, for the failed task as well.
        """
        cleaned_task = set()
        if isinstance(event, NodeFailed):
            cleaned_task = set([event.task])

        new_mapping = dict()
        for (node, items) in self.current_schedule.mapping.items():
            new_mapping[node] = []
            for item in items:
                if item.state != ScheduleItem.UNSTARTED:
                    new_mapping[node].append(item)
                else:
                    cleaned_task.add(item.job)
        clean_schedule = Schedule(new_mapping)

        # remove all events associated with these tasks
        prm = self.public_resources_manager

        def check(event):
            if isinstance(event, TaskStart) and event.task in cleaned_task and not prm.isCloudNode(event.node):
                return False
            if isinstance(event, TaskFinished) and event.task in cleaned_task and not prm.isCloudNode(event.node):
                return False
            return True

        self.queue = deque([evnt for evnt in self.queue if check(evnt)])
        return clean_schedule
def schedule(self, fixed_schedule_part=None, current_time=0.0):
    """Build a random schedule that respects task dependencies.

    Ready tasks are placed one at a time: a random ready task goes to a
    random node that is not down, into a randomly chosen free time slot.
    When ``fixed_schedule_part`` is given, its items are kept and only the
    remaining tasks are placed; no new item starts before ``current_time``
    (within a 0.01 tolerance).

    Returns a ``Schedule`` built from the resulting node -> items mapping.
    """
    transfer_time = self.estimator.estimate_transfer_time

    # TODO: make common utility function with ScheduleBuilder
    def counts_as_scheduled(item):
        state = item.state
        return (state == ScheduleItem.EXECUTING
                or state == ScheduleItem.FINISHED
                or state == ScheduleItem.UNSTARTED)

    def parents_done(child, done_ids):
        # A task is ready once every parent id is among the done ids.
        return all(parent_id in done_ids
                   for parent_id in set([p.id for p in child.parents]))

    if fixed_schedule_part is not None:
        schedule_mapping = {node: list(items)
                            for (node, items) in fixed_schedule_part.mapping.items()}
        already_placed = [item.job.id
                          for (node, items) in fixed_schedule_part.mapping.items()
                          for item in items
                          if counts_as_scheduled(item)]
        # The head task is treated as finished.
        finished_tasks = set([self.workflow.head_task.id] + already_placed)
        unfinished = [task for task in self.workflow.get_all_unique_tasks()
                      if task.id not in finished_tasks]
        ready_tasks = [task.id for task in unfinished
                       if parents_done(task, finished_tasks)]
        task_to_node = {item.job.id: (node, item.start_time, item.end_time)
                        for (node, items) in fixed_schedule_part.mapping.items()
                        for item in items
                        if counts_as_scheduled(item)}
    else:
        schedule_mapping = {node: [] for node in self.nodes}
        ready_tasks = [child.id for child in self.workflow.head_task.children]
        task_to_node = dict()
        finished_tasks = set()

    def find_slots(node, comm_ready, runtime):
        """Return the (start, end) gaps on ``node`` that can host the task."""
        placed = schedule_mapping.get(node, list())
        if len(placed) == 0:
            free_time = 0
            leading_gap = []
        else:
            free_time = placed[-1].end_time
            leading_gap = [(0, placed[0].start_time)]

        ## TODO: refactor it later
        earliest = max(free_time, comm_ready, current_time)
        # Open-ended slot after the last item of the node.
        tail_slot = [(earliest, earliest + runtime + 1)]
        inner_gaps = [(prev.end_time, nxt.start_time)
                      for prev, nxt in zip(placed, placed[1:])]

        ## TODO: rethink rounding
        suitable = []
        for (st, end) in leading_gap + inner_gaps + tail_slot:
            not_in_past = current_time < st or abs((current_time - st)) < 0.01
            long_enough = runtime < (end - st) or abs((end - st) - runtime) < 0.01
            if not_in_past and st >= comm_ready and long_enough:
                suitable.append((st, end))
        return suitable

    def comm_ready_func(task, node):
        ##TODO: remake this stub later.
        # A task whose only parent is the head task has nothing to wait for.
        if len(task.parents) == 1 and self.workflow.head_task.id == list(task.parents)[0].id:
            return 0
        arrivals = []
        for p in task.parents:
            parent_node, _, parent_end = task_to_node[p.id]
            arrivals.append(parent_end + transfer_time(node, parent_node, task, p))
        return max(arrivals)

    while len(ready_tasks) > 0:
        picked_id = ready_tasks[random.randint(0, len(ready_tasks) - 1)]
        task = self.task_map[picked_id]

        #TODO: check for all nodes being dead (very rare, not considered for now)
        alive_nodes = [nd for nd in self.nodes if nd.state != Node.Down]
        node = alive_nodes[random.randint(0, len(alive_nodes) - 1)]

        ## the last slot represents all available time of the node after it
        ## completes all its work
        runtime = self.estimator.estimate_runtime(task, node)
        comm_ready = comm_ready_func(task, node)
        time_slots = find_slots(node, comm_ready, runtime)

        # No random draw is made when there is exactly one slot.
        if len(time_slots) == 1:
            slot_index = 0
        else:
            slot_index = random.randint(0, len(time_slots) - 1)
        start_time = time_slots[slot_index][0]
        end_time = start_time + runtime

        Schedule.insert_item(schedule_mapping, node,
                             ScheduleItem(task, start_time, end_time))
        task_to_node[task.id] = (node, start_time, end_time)

        ready_tasks.remove(task.id)
        finished_tasks.add(task.id)

        for child in [c for c in task.children if parents_done(c, finished_tasks)]:
            ready_tasks.append(child.id)

    return Schedule(schedule_mapping)
def __call__(self, chromo, current_time):
    """Turn a chromosome into a ``Schedule``.

    ``chromo`` maps a node name to an ordered list of task ids. Nodes that
    are not down are visited round-robin; on each visit the first task of
    the node's list that is currently ready is placed.

    Raises:
        ValueError: if a task is assigned to a node that is not alive.
        Exception: if the chromosome plus the fixed part do not cover the
            whole workflow, if there are no alive nodes, or if a full pass
            over the nodes places no task.
    """
    def total_tasks(mapping):
        return sum(len(task_ids) for task_ids in mapping.values())

    alive_nodes = [nd for nd in self.nodes if nd.state != Node.Down]
    alive_names = [nd.name for nd in alive_nodes]

    for node_name, task_ids in chromo.items():
        if node_name not in alive_names and len(task_ids) > 0:
            raise ValueError(
                "Chromo is invalid. There is a task assigned to a dead node")

    chromo_size = total_tasks(chromo)
    fixed_size = len(self.fixed_schedule_part.get_unfailed_tasks_ids())
    workflow_size = len(self.workflow.get_all_unique_tasks())
    if chromo_size + fixed_size != workflow_size:
        print("==Chromosome==================================")
        print(chromo)
        print("=fixed_schedule_part===================================")
        print(self.fixed_schedule_part)
        raise Exception(
            "The chromosome not a full. Chromo length: {0}, Fixed part length: {1}, workflow size: {2}"
            .format(chromo_size, fixed_size, workflow_size))

    # TODO: add not to schedule
    (schedule_mapping, finished_tasks, ready_tasks,
     chrmo_mapping, task_to_node) = self._create_helping_structures(chromo)

    # Work on a copy: placed tasks are removed from it as the build proceeds.
    chromo_copy = deepcopy(chromo)

    if len(alive_nodes) == 0:
        raise Exception("There are not alive nodes")

    while len(ready_tasks) > 0:
        remaining_before = total_tasks(chromo_copy)

        if len(alive_nodes) == 0:
            raise ValueError("Count of alive_nodes is zero")

        for node in alive_nodes:
            node_queue = chromo_copy[node.name]
            if len(node_queue) == 0:
                continue

            ## TODO: Urgent! completely rethink this procedure
            tsk_id = next((cand for cand in node_queue if cand in ready_tasks), None)
            if tsk_id is None:
                continue

            task = self.task_map[tsk_id]
            node_queue.remove(tsk_id)
            ready_tasks.remove(tsk_id)

            (start_time, end_time) = place_task_to_schedule(
                self.workflow, self.estimator, schedule_mapping,
                task_to_node, chrmo_mapping, task, node, current_time)
            task_to_node[task.id] = (node, start_time, end_time)

            finished_tasks.add(task.id)
            for child in self._get_ready_tasks(task.children, finished_tasks):
                ready_tasks.append(child.id)

        # A pass that placed nothing would repeat forever.
        if total_tasks(chromo_copy) == remaining_before:
            raise Exception(
                "Unable to properly process a chromosome."
                " Perhaps, due to invalid fixed_schedule_part or the chromosome."
            )

    return Schedule(schedule_mapping)
def __call__(self, chromo, current_time):
    """Turn a chromosome into a ``Schedule``.

    ``chromo`` maps a node name to an ordered list of task ids. Nodes that
    are not down are visited round-robin; on each visit the first task of
    the node's list that is currently ready is placed into the first time
    slot offered by ``_get_possible_execution_times``.

    Raises:
        Exception: if there are no alive nodes, or if a full pass over the
            alive nodes places no task while ready tasks remain (previously
            this case looped forever).
    """
    (schedule_mapping, finished_tasks, ready_tasks,
     chrmo_mapping, task_to_node) = self._create_helping_structures(chromo)

    # Shallow per-node copies: placed tasks are removed from them as the
    # build proceeds, the caller's chromosome stays untouched.
    chromo_copy = {nd_name: list(items) for (nd_name, items) in chromo.items()}

    alive_nodes = [node for node in self.nodes if node.state != Node.Down]
    if len(alive_nodes) == 0:
        raise Exception("There are not alive nodes")

    while len(ready_tasks) > 0:
        placed_in_pass = False

        for node in alive_nodes:
            pending = chromo_copy[node.name]
            if len(pending) == 0:
                continue

            ## TODO: Urgent! completely rethink this procedure
            tsk_id = next((cand for cand in pending if cand in ready_tasks), None)
            if tsk_id is None:
                continue

            task = self.task_map[tsk_id]
            pending.remove(tsk_id)
            ready_tasks.remove(tsk_id)

            time_slots, runtime = self._get_possible_execution_times(
                schedule_mapping, task_to_node, chrmo_mapping,
                task, node, current_time)

            # The first offered slot is taken.
            time_slot = next(time_slots)
            start_time = time_slot[0]
            end_time = start_time + runtime

            item = ScheduleItem(task, start_time, end_time)
            # need to account current time
            Schedule.insert_item(schedule_mapping, node, item)
            task_to_node[task.id] = (node, start_time, end_time)

            finished_tasks.add(task.id)
            placed_in_pass = True

            for child in self._get_ready_tasks(task.children, finished_tasks):
                ready_tasks.append(child.id)

        # Nothing changes between passes that place nothing, so stop here
        # instead of spinning.
        if not placed_in_pass:
            raise Exception(
                "Unable to properly process a chromosome."
                " Perhaps, due to invalid fixed_schedule_part or the chromosome."
            )

    return Schedule(schedule_mapping)