def __init__(self, hy=None, hy_rid=None):
    """Store a hierarchy description, building it from resource ids if given.

    ``hy_rid`` takes precedence over ``hy``. It is iterated as a mapping
    ``label -> mapping``; every value of the inner mapping is passed to
    ``ordered_ids2itvs`` and the results are stored, per label, as a list.
    When ``hy_rid`` is falsy, ``hy`` is stored unchanged.

    Raises:
        Exception: if both ``hy_rid`` and ``hy`` are falsy.
    """
    if not hy_rid:
        if not hy:
            raise Exception("Hierarchy description must be provided")
        self.hy = hy
        return

    self.hy = {}
    for label, level_roids in iteritems(hy_rid):
        # Only the id lists are used; the keys of the inner mapping are dropped.
        self.hy[label] = [ordered_ids2itvs(roids)
                          for _, roids in iteritems(level_roids)]
def sched(self):
    """Generator driving the simulated scheduling loop.

    Each iteration waits (via ``yield``) until either the next job-arrival
    event or one of the running-job events fires, updates the waiting /
    running / completed bookkeeping accordingly, calls ``schedule_cycle`` and
    then starts every assigned job whose ``start_time`` equals the current
    simulation time.

    The loop ends through ``self.env.exit()`` once there is no pending
    arrival, no waiting job and no running job.

    NOTE(review): ``AnyOf``, ``env.timeout`` and ``env.exit`` look like the
    SimPy API -- confirm against the file's imports.
    """
    next_job_arrival = self.job_arrival()

    while True:
        print('Wait for job arrivals or job endings', self.env.now)

        # Wait on every running-job event plus, if any, the next arrival.
        events = list(self.evt_running_jobs)
        if next_job_arrival is not None:
            print("append next_job_arrival evt")
            events.append(next_job_arrival)
        any_of_events = AnyOf(self.env, events)
        ev = yield any_of_events

        # ev.todict() maps each triggered event to its value.
        for k, v in iteritems(ev.todict()):
            if k == next_job_arrival:
                # Arrival event: its value is an iterable of job ids.
                print("job arrives !", v)
                for jid in v:
                    self.waiting_jids.add(jid)
                next_job_arrival = self.job_arrival()
            else:
                # Any other event is a job ending; its value is the job id
                # that was given to env.timeout() at launch time.
                print("job endings !", k, v)
                # if k in self.evt_running_jobs:
                # print("remove ev: ", k)
                self.evt_running_jobs.remove(k)
                self.jobs[v].state = "Terminated"
                self.platform.completed_jids.append(v)
                self.platform.running_jids.remove(v)

        now = self.env.now

        # Nothing left to submit, wait for or run: stop the process.
        if ((next_job_arrival is None) and not self.waiting_jids and not self.evt_running_jobs):
            print("All job submitted, no more waiting or running jobs ...", now)
            self.env.exit()

        print("call schedule_cycle.... ", now)
        schedule_cycle(self.platform, now, "test")

        # launch jobs if needed
        for jid, job in iteritems(self.platform.assigned_jobs):
            if job.start_time == now:
                self.waiting_jids.remove(jid)
                job.state = "Running"
                print("launch:", jid)
                # The timeout event carries the job id as its value so the
                # "job endings" branch above can identify the job.
                evt_running_job = self.env.timeout(job.run_time, jid)
                self.evt_running_jobs.add(evt_running_job)
                self.platform.running_jids.append(jid)
def parse(self):
    """Parse the request arguments declared in ``self.argmap``.

    For every ``(argname, argobj)`` pair the raw value is fetched with
    ``self.parse_arg``. A value that is not ``self.MISSING`` is converted with
    ``self.convert`` to ``argobj.type``; a missing value falls back to
    ``argobj.default``. The result is stored under ``argobj.dest`` when it is
    set, otherwise under the argument name.

    Returns:
        tuple: ``(parsed_kwargs, raw_kwargs)``. ``raw_kwargs`` holds, per
        argument name, the non-``None`` result of ``argobj.raw_value``.

    Raises:
        The exception raised by ``abort(400)`` when a conversion fails; the
        explanatory message is attached to it as ``.data``.
    """
    parsed_kwargs = {}
    raw_kwargs = {}
    for argname, argobj in iteritems(self.argmap):
        dest = argobj.dest if argobj.dest is not None else argname
        parsed_value = self.parse_arg(argname, argobj)
        if parsed_value is not self.MISSING:
            try:
                parsed_kwargs[dest] = self.convert(parsed_value, argobj.type)
            except Exception as e:
                msg = ("The parameter '%s' specified in the request "
                       "URI is not supported. %s" % (argname, e))
                # abort() is called only to obtain the HTTP error object, so
                # that the message can be attached to it before re-raising.
                try:
                    abort(400)
                except Exception:
                    # Narrowed from a bare ``except:`` so KeyboardInterrupt
                    # and SystemExit are never decorated and re-raised here.
                    exc_type, exc_value, tb = sys.exc_info()
                    exc_value.data = msg
                    reraise(exc_type, exc_value, tb.tb_next)
        else:
            parsed_kwargs[dest] = argobj.default
        raw_value = argobj.raw_value(parsed_kwargs[dest])
        if raw_value is not None:
            raw_kwargs[argname] = raw_value
    return parsed_kwargs, raw_kwargs
def __str__(self):
    """Return a framed text rendering with one ``[id] slot`` line per slot.

    The first line is the title `` SlotSet `` centred in dashes and the last
    line is a dash rule; both are as wide as the longest slot line. With no
    slots at all the frame has width 0 instead of raising.
    """
    lines = ["[%s] %s" % (i, slot) for i, slot in iteritems(self.slots)]
    # max() raises ValueError on an empty sequence; ``or [0]`` keeps an empty
    # slot set printable.
    max_length = max([len(line) for line in lines] or [0])
    lines.append("%s" % ("-" * max_length))
    lines.insert(0, ('{:-^%d}' % max_length).format(' SlotSet '))
    return '\n'.join(lines)
def plot_slots_and_job(slots_set, jobs, nb_res, t_max):
    """Draw slots and jobs as rectangles on a time/resource chart and show it.

    Args:
        slots_set: object whose ``slots`` mapping yields ``sid -> slot``; each
            slot provides ``b``, ``e`` and ``itvs`` (pairs ``(y0, y1)``).
            Skipped when falsy.
        jobs: mapping ``jid -> job``; each job provides ``start_time``,
            ``walltime``, ``res_set`` and optionally ``run_time``, which takes
            precedence over ``walltime`` for the drawn duration. Skipped when
            falsy.
        nb_res: upper bound of the y axis.
        t_max: upper bound of the x axis.

    The call blocks in ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatch

    _, ax = plt.subplots()

    if slots_set:
        for sid, slot in iteritems(slots_set.slots):
            # Alternate colours so adjacent slots can be told apart.
            col = "red" if (sid % 2) else "blue"
            for i, itv in enumerate(slot.itvs):
                (y0, y1) = itv
                rect = mpatch.Rectangle((slot.b, y0 - 0.4), slot.e - slot.b,
                                        y1 - y0 + 0.9, alpha=0.1, color=col)
                if i == 0:
                    # Label only the first rectangle of each slot.
                    annotate(ax, rect, 's' + str(sid))
                ax.add_artist(rect)

    if jobs:
        for jid, job in iteritems(jobs):
            col = RGB_tuples[random.randint(0, NB_COLORS - 1)]
            duration = job.walltime
            if hasattr(job, 'run_time'):
                duration = job.run_time
            for i, itv in enumerate(job.res_set):
                (y0, y1) = itv
                rect = mpatch.Rectangle((job.start_time, y0 - 0.4), duration,
                                        y1 - y0, alpha=0.2, color=col)
                if i == 0:
                    annotate(ax, rect, 'j' + str(jid))
                ax.add_artist(rect)

    ax.set_xlim((0, t_max))
    ax.set_ylim((0, nb_res))
    # ax.set_aspect('equal')
    ax.grid(True)

    mng = plt.get_current_fig_manager()
    try:
        mng.resize(*mng.window.maxsize())
        # mng.window.showMaximized()
    except Exception:
        # Best effort: maximising depends on the GUI backend. Narrowed from a
        # bare ``except:`` so Ctrl-C / SystemExit still propagate.
        pass
    plt.show()
def save_assigns_simu(self, jobs, resource_set):
    """Rewrite each job's ``res_set`` through ``rid_o2i`` and keep the jobs.

    Every id produced by ``itvs2ids(job.res_set)`` is looked up in
    ``resource_set.rid_o2i``; the resulting ids are turned back into
    intervals with ``unordered_ids2itvs`` and stored on the job in place.
    ``jobs`` is then stored as ``self.assigned_jobs``.
    """
    print("save_assigns_simu")
    for jid, job in iteritems(jobs):
        ordered_itvs = job.res_set
        print("job.res_set before", jid, job.res_set)
        job.res_set = unordered_ids2itvs(
            [resource_set.rid_o2i[roid] for roid in itvs2ids(ordered_itvs)])
    self.assigned_jobs = jobs
def check_besteffort_jobs_to_kill(jobs_to_launch, rid2jid_to_launch, current_time_sec,
                                  besteffort_rid2job, resource_set):
    """Detect best-effort jobs that occupy resources needed by jobs to launch.

    For every resource id in ``rid2jid_to_launch`` that is also held by a
    best-effort job (``besteffort_rid2job``), the best-effort job is left
    alone when ``is_timesharing_for_two_jobs`` accepts the pair. Otherwise
    it is handled once: if it has a checkpoint delay, a checkpoint signal is
    sent while no CHECKPOINT event exists yet or the delay has not elapsed;
    if no signal was sent it is fragged.

    Args:
        jobs_to_launch: mapping job id -> job about to be launched.
        rid2jid_to_launch: mapping resource id -> id of the job to launch.
        current_time_sec: current time, compared with event dates.
        besteffort_rid2job: mapping resource id -> running best-effort job.
        resource_set: unused here; kept for interface compatibility.

    Returns:
        int: 1 if at least one best-effort job had to be handled, else 0.
    """
    return_code = 0

    logger.debug("Begin processing of besteffort jobs to kill")

    fragged_jobs = []

    for rid, job_id in iteritems(rid2jid_to_launch):
        if rid not in besteffort_rid2job:
            continue

        be_job = besteffort_rid2job[rid]
        job_to_launch = jobs_to_launch[job_id]

        if is_timesharing_for_two_jobs(be_job, job_to_launch):
            logger.debug("Resource " + str(rid) +
                         " is needed for job " + str(job_id) +
                         ", but besteffort job " + str(be_job.id) +
                         " can live, because timesharing compatible")
            continue

        if be_job.id in fragged_jobs:
            # Already handled through another of its resources.
            continue

        skip_kill = 0
        checkpoint_first_date = sys.maxsize
        # Check if we must checkpoint the besteffort job
        if be_job.checkpoint > 0:
            for ev in get_job_events(be_job.id):
                if ev.type == 'CHECKPOINT':
                    if checkpoint_first_date > ev.date:
                        checkpoint_first_date = ev.date

            # sys.maxsize still in place means no CHECKPOINT event was found.
            if (checkpoint_first_date == sys.maxsize) or\
               (current_time_sec <= (checkpoint_first_date + be_job.checkpoint)):
                skip_kill = 1
                send_checkpoint_signal(be_job)
                logger.debug("Send checkpoint signal to the job " + str(be_job.id))

        if not skip_kill:
            # The two ids were swapped in this message (and a space was
            # missing): the resource is freed *for* the job to launch by
            # killing the best-effort job.
            logger.debug("Resource " + str(rid) +
                         " need to be freed for job " + str(job_id) +
                         ": killing besteffort job " + str(be_job.id))
            add_new_event('BESTEFFORT_KILL', be_job.id,
                          "kill the besteffort job " + str(be_job.id))
            frag_job(be_job.id)

        fragged_jobs.append(be_job.id)
        return_code = 1

    logger.debug("End processing of besteffort jobs to kill\n")

    return return_code
def scheduleJobs(self):
    """Run one scheduling round and hand the jobs to launch to ``self.bs``.

    In "simu" mode ``schedule_cycle`` is called and every assigned job that
    is "Waiting" and starts now is launched. In any other mode
    ``meta_schedule('internal')`` is called and the jobs found in state
    "toLaunch" in the database are launched. The scheduling time (measured,
    or ``self.sched_delay`` when it is not -1) is then consumed on
    ``self.bs`` and the selected jobs are started with their resources.
    """
    print("Sheduling Round")
    real_time = time.time()

    # Initialised before branching: it used to be created only in the "simu"
    # branch, so the database branch raised NameError on its first append.
    jids_to_launch = []

    if self.platform_model == "simu":
        schedule_cycle(self.platform, self.env.now, "default")

        # retrieve jobs to launch
        for jid, job in iteritems(self.platform.assigned_jobs):
            print("job.start_time %s" % job.start_time)
            if (job.start_time == self.env.now) and (job.state == "Waiting"):
                self.waiting_jids.remove(jid)
                jids_to_launch.append(jid)
                job.state = "Running"
                print("tolaunch: %s" % jid)
                self.platform.running_jids.append(jid)
    else:
        print("call meta_schedule('internal')")
        meta_schedule("internal", plt)

        result = db.query(Job).filter(Job.state == "toLaunch").order_by(Job.id).all()
        for job_db in result:
            set_job_state(job_db.id, "Running")
            # Translate the database job id into the simulator job id.
            jid = self.db_jid2s_jid[job_db.id]
            self.waiting_jids.remove(jid)
            jids_to_launch.append(jid)
            self.jobs[jid].state = "Running"
            print("_tolaunch: %s" % jid)
            self.platform.running_jids.append(jid)

    print("Ids of jobs to launch: ", *jids_to_launch)
    print("Time befort scheduling round: ", self.bs._current_time, self.sched_delay)

    # update time
    real_sched_time = time.time() - real_time
    if self.sched_delay == -1:
        self.bs.consume_time(real_sched_time)  # TODO
    else:
        self.bs.consume_time(self.sched_delay)

    self.env.now = self.bs._current_time

    print("Time after scheduling round: ", self.bs._current_time)

    # send to uds
    if len(jids_to_launch) > 0:
        scheduled_jobs = []
        jobs_res = {}
        for jid in jids_to_launch:
            ds_job = self.jobs[jid].ds_job
            res = itvs2batsim_str0(self.jobs[jid].res_set)
            scheduled_jobs.append(ds_job)
            jobs_res[ds_job.id] = res

        self.bs.start_jobs(scheduled_jobs, jobs_res)
def check(self, job):
    """Check the counters of this object against every quota rule.

    Each rule of the module-level ``quotas_rules`` is a pair
    ``(queue, project, job_type, user) -> (nb_resources, nb_jobs,
    resources_time)``. A rule is applied to a counter entry when all four
    fields match: ``'*'`` matches a ``'*'`` counter field, an equal value
    matches when ``job`` carries that value, and ``'/'`` always matches
    (not accepted for the job type). A limit of -1 or less disables the
    corresponding test.

    Returns:
        tuple: ``(True, 'quotas ok', '', 0)`` when no limit is exceeded,
        otherwise ``(False, reason, rule_fields, limit)`` for the first
        exceeded limit.
    """
    global quotas_rules
    # self.show_counters('before check, job id: ' + str(job.id))
    for rl_fields, rl_quotas in iteritems(quotas_rules):
        rl_queue, rl_project, rl_job_type, rl_user = rl_fields
        rl_nb_resources, rl_nb_jobs, rl_resources_time = rl_quotas

        for fields, counters in iteritems(self.counters):
            queue, project, job_type, user = fields
            nb_resources, nb_jobs, resources_time = counters

            # match queue
            if not (((rl_queue == '*') and (queue == '*')) or
                    ((rl_queue == queue) and (job.queue_name == queue)) or
                    (rl_queue == '/')):
                continue
            # match project
            if not (((rl_project == '*') and (project == '*')) or
                    ((rl_project == project) and (job.project == project)) or
                    (rl_project == '/')):
                continue
            # match job_typ
            if not (((rl_job_type == '*') and (job_type == '*')) or
                    ((rl_job_type == job_type) and (job_type in job.types))):
                continue
            # match user
            if not (((rl_user == '*') and (user == '*')) or
                    ((rl_user == user) and (job.user == user)) or
                    (rl_user == '/')):
                continue

            # test quotas values plus job's ones, in this order:
            # 1) nb_resources, 2) nb_jobs, 3) resources_time (work)
            limits = (
                (rl_nb_resources, nb_resources, 'nb resources quotas failed'),
                (rl_nb_jobs, nb_jobs, 'nb jobs quotas failed'),
                (rl_resources_time, resources_time, 'resources hours quotas failed'),
            )
            for limit, used, reason in limits:
                if (limit > -1) and (limit < used):
                    return (False, reason, rl_fields, limit)

    return (True, 'quotas ok', '', 0)
def set_slots_with_prev_scheduled_jobs(slots_sets, jobs, job_security_time, now=0,
                                       filter_besteffort=True, only_besteffort=False):
    """Insert already scheduled jobs into their slot sets.

    A job is considered when it is best-effort and ``filter_besteffort`` is
    false, or when it is not best-effort and ``only_besteffort`` is false.

    * A "container" job gets its own slot set (named by the container type
      value, or by the job id when that value is empty), created on demand,
      into which a pseudo job covering the container's resources is split.
    * Every considered job is queued for the slot set named by its "inner"
      type, or for ``'default'``.

    Finally each slot set of ``slots_sets`` that has queued jobs receives
    them through ``split_slots_jobs``. ``slots_sets`` is modified in place.
    """
    jobs_slotsets = {'default': []}

    for job in jobs:
        logger.debug("job.id:" + str(job.id))
        # print("job.id:", str(job.id))
        selected = ((not filter_besteffort) and ("besteffort" in job.types)) or \
                   ((not only_besteffort) and ("besteffort" not in job.types))
        if not selected:
            continue

        if "container" in job.types:
            # Computed by the original code but not used afterwards.
            t_e = job.start_time + job.walltime - job_security_time

            container_name = job.types["container"]
            ss_name = container_name if container_name != "" else str(job.id)

            logger.debug("container:" + ss_name)

            if ss_name not in slots_sets:
                slots_sets[ss_name] = SlotSet(([], 1))

            # A container that already started is inserted from ``now`` on.
            start_time = max(job.start_time, now)

            # NOTE(review): ``ph`` receives ``job.ts``; ``job.ph`` may have
            # been intended -- confirm before changing.
            pseudo_job = JobPseudo(id=0, start_time=start_time,
                                   walltime=job.walltime - job_security_time,
                                   res_set=job.res_set,
                                   ts=job.ts, ph=job.ts)

            # add job's resources
            slots_sets[ss_name].split_slots_jobs([pseudo_job], False)

        target = job.types["inner"] if "inner" in job.types else 'default'
        jobs_slotsets.setdefault(target, []).append(job)

    for ss_name, slot_set in iteritems(slots_sets):
        logger.debug(" slots_sets.iteritems():" + ss_name)
        if ss_name in jobs_slotsets:
            slot_set.split_slots_jobs(jobs_slotsets[ss_name])
def save_assigns_simu_and_default(self, jobs, resource_set):
    """Mirror assignments onto the simulator jobs, then call ``save_assigns``.

    For each ``jid -> job`` the matching simulator job (found through
    ``self.db_jid2s_jid``) receives the job's resources, translated through
    ``resource_set.rid_o2i``, together with its ``start_time`` and
    ``walltime``.

    Returns:
        Whatever ``save_assigns(jobs, resource_set)`` returns.
    """
    print("save_assigns_simu_and_default........................")
    # assigned_jobs = {}
    for jid, job in iteritems(jobs):
        sid = self.db_jid2s_jid[jid]
        sim_job = self.jobs[sid]

        internal_ids = [resource_set.rid_o2i[roid] for roid in itvs2ids(job.res_set)]
        sim_job.res_set = unordered_ids2itvs(internal_ids)
        print("save assign jid, sid, res_set: ", jid, " ", sid, " ", sim_job.res_set)

        sim_job.start_time = job.start_time
        sim_job.walltime = job.walltime
        # assigned_jobs[sid] = jobsimu
    # self.assigned_jobs = assigned_jobs
    return save_assigns(jobs, resource_set)
def set_jobs_cache_keys(jobs):
    """Set the keys used by the slot_set cache to speed up slot search.

    Jobs with timesharing, placeholder or dependency requirements are not
    suitable for the cache. Jobs in a container may still benefit from it
    because a container is linked to one particular slot_set. Jobs with
    dependencies do not update the cache entries.

    For each eligible job (no timesharing, no placeholder) and each of its
    moldable requests, ``job.key_cache[moldable_id]`` is set to the string
    form of the walltime followed by that of the hierarchy request.
    """
    for _, job in iteritems(jobs):
        if job.ts or (job.ph != NO_PLACEHOLDER):
            continue
        for moldable_id, walltime, hy_res_rqts in job.mld_res_rqts:
            job.key_cache[int(moldable_id)] = str(walltime) + str(hy_res_rqts)
def load_quotas_rules():
    """Load quota rules from the JSON file named by ``config['QUOTAS_FILE']``.

    Expected file layout::

        {
            "quotas": {
                "*,*,*,*": [120, -1, -1],
                "*,*,*,john": [150, -1, -1]
            },
            "quotas_job_types": ["besteffort", "deploy", "console"]
        }

    Each key of ``"quotas"`` is split on ``','`` into a tuple used as key of
    the module-level ``quotas_rules``; the third value of each rule is
    multiplied by 3600 and truncated to an int. The optional
    ``"quotas_job_types"`` list is appended to the module-level
    ``quotas_job_types`` (repeated calls append again).
    """
    # The description above used to sit after the ``global`` statements,
    # where it was a dead string expression instead of a docstring.
    global quotas_rules
    global quotas_job_types

    quotas_rules_filename = config['QUOTAS_FILE']
    with open(quotas_rules_filename) as json_file:
        json_quotas = json.load(json_file)

    for k, v in iteritems(json_quotas['quotas']):
        quotas_rules[tuple(k.split(','))] = [v[0], v[1], int(3600 * v[2])]
    if 'quotas_job_types' in json_quotas:
        quotas_job_types.extend(json_quotas['quotas_job_types'])
def assign_resources(self, *proxy_args, **proxy_kwargs):
    """Proxy entry point: decode the arguments and delegate to the app.

    Positional arguments, as used below:
        0: pickled slot set,
        1: job description as a dict,
        2: mapping ``resource label -> iterable of interval pairs``,
        3+: forwarded unchanged to ``self.app.assign_resources``.

    Returns:
        tuple: ``(prev_sid_left, prev_sid_right, job_as_dict)``.
    """
    self.app.logger.info("┳ OAR ask to assign resources")
    # NOTE(security): pickle.loads executes arbitrary code when fed untrusted
    # data; this is only acceptable if the peer is trusted.
    slots_set = pickle.loads(proxy_args[0])
    job_dict = proxy_args[1]
    job = SimpleNamespace(job_dict)

    # iteritems() yields (key, value) pairs. The previous single-variable
    # loop used the whole pair as a dict key and failed on any non-empty
    # mapping.
    hy = {}
    for res_label, itvs in iteritems(proxy_args[2]):
        hy[res_label] = [tuple(i) for i in itvs]
    # NOTE(review): ``hy`` is rebuilt here but not passed on below -- confirm
    # whether the application is expected to receive it.

    self.app.logger.debug("┃ Before COORM scheduling")
    for line in ("%s" % slots_set).split('\n'):
        self.app.logger.debug("┃ %s" % line)

    prev_sid_left, prev_sid_right, job = \
        self.app.assign_resources(slots_set, job, *proxy_args[3:])

    self.app.logger.debug("┃ After COORM scheduling")
    for line in ("%s" % slots_set).split('\n'):
        self.app.logger.debug("┃ %s" % line)

    self.app.logger.info("┻ Returns : [%s, %s]" % (prev_sid_left, prev_sid_right))
    self.app.logger.debug("JOBRET: %s %s %s" % (str(job.id), str(job.res_set),
                                                str(job.start_time)))
    return prev_sid_left, prev_sid_right, dict(job)
def show_counters(self, msg=''):  # pragma: no cover
    """Print ``msg`` followed by one ``key = value`` line per counter."""
    print('show_counters:', msg)
    for fields, values in iteritems(self.counters):
        print(fields, ' = ', values)
def sched_loop(self):
    """Main loop exchanging messages over ``self.sock`` until all jobs end.

    Each iteration reads one message with ``read_bat_msg`` (current time,
    submitted job ids, completed job ids), updates the waiting / running
    bookkeeping (and the database job states when the platform model is
    "batsim-db"), runs one scheduling pass and answers with ``send_bat_msg``
    carrying the jobs to launch. The loop stops once ``self.nb_jobs`` jobs
    have completed.

    NOTE(review): ``jobs_completed`` and ``plt`` are not defined in this
    method; presumably module-level names -- confirm.
    """
    nb_completed_jobs = 0
    while nb_completed_jobs < self.nb_jobs:
        now_float, jobs_submitted, new_jobs_completed = read_bat_msg(self.sock)

        # now_str = "10"
        # jobs_submitted = [1]
        # new_jobs_completed = []

        if jobs_submitted:
            for jid in jobs_submitted:
                self.waiting_jids.add(jid)
                if self.platform_model == "batsim-db":
                    print('set_job_state("Waiting"):', self.jobs[jid].db_jid)
                    set_job_state(self.jobs[jid].db_jid, "Waiting")

        nb_completed_jobs += len(new_jobs_completed)

        print("new job completed: %s" % new_jobs_completed)

        for jid in new_jobs_completed:
            jobs_completed.append(jid)
            if jid in self.platform.running_jids:
                self.platform.running_jids.remove(jid)
            if self.platform_model == "batsim-db":
                set_job_state(self.jobs[jid].db_jid, "Terminated")

        # The scheduler works on whole seconds; the float is kept for the reply.
        now = int(now_float)
        self.env.now = now  # TODO can be remove ???
        # Wall-clock start of the scheduling pass, used for the delay below.
        real_time = time.time()

        print("jobs running: %s" % self.platform.running_jids)
        print("jobs waiting: %s" % self.waiting_jids)
        print("jobs completed: %s" % jobs_completed)

        jids_to_launch = []

        if self.platform_model == "simu":
            print("call schedule_cycle.... %s" % now)
            schedule_cycle(self.platform, now, "default")

            # retrieve jobs to launch
            jids_to_launch = []
            for jid, job in iteritems(self.platform.assigned_jobs):
                print("job.start_time %s" % job.start_time)
                if (job.start_time == now) and (job.state == "Waiting"):
                    self.waiting_jids.remove(jid)
                    jids_to_launch.append(jid)
                    job.state = "Running"
                    print("tolaunch: %s" % jid)
                    self.platform.running_jids.append(jid)
        else:
            print("call meta_schedule('internal')")
            meta_schedule("internal", plt)
            # Launching phase
            # Retrieve job to Launch
            result = db.query(Job).filter(Job.state == "toLaunch").order_by(Job.id).all()
            for job_db in result:
                set_job_state(job_db.id, "Running")
                # Translate the database job id into the simulator job id.
                jid = self.db_jid2s_jid[job_db.id]
                self.waiting_jids.remove(jid)
                jids_to_launch.append(jid)
                self.jobs[jid].state = "Running"
                print("_tolaunch: %s" % jid)
                self.platform.running_jids.append(jid)

        # The reply is time-stamped after the scheduling delay: the measured
        # one when sched_delay is -1, otherwise the configured constant.
        real_sched_time = time.time() - real_time
        if self.sched_delay == -1:
            now_float += real_sched_time
        else:
            now_float += self.sched_delay
        send_bat_msg(self.sock, now_float, jids_to_launch, self.jobs)
def __init__(self):
    """Load the resources from the database and build the lookup structures.

    Resources are read in the order given by
    ``config["SCHEDULER_RESOURCE_ORDER"]``; the position of a resource in
    that result is its ordered id (``roid``). Only resources in state
    "Alive" or "Absent" are registered. The following attributes are set:

    * ``rid_i2o`` / ``rid_o2i``: resource id <-> ordered id tables,
    * ``resources_db``: the raw query result,
    * ``roid_itvs``: intervals of all registered ordered ids,
    * ``hierarchy``: per hierarchy label, the intervals of each value,
    * ``available_upto``: ``available_upto`` value -> intervals,
    * ``suspendable_roid_itvs``: intervals of resources whose type is listed
      in ``SCHEDULER_AVAILABLE_SUSPENDED_RESOURCE_TYPE``,
    * ``default_resource_itvs``: intervals of resources of type 'default'.

    Side effects: sets a default for
    ``SCHEDULER_AVAILABLE_SUSPENDED_RESOURCE_TYPE`` in ``config`` when it is
    missing, and updates the module-level ``default_resource_itvs``.
    """
    # Without this declaration the final assignment only bound a local name,
    # so the "update global variable" step had no effect.
    global default_resource_itvs

    # prepare resource order/indirection stuff
    order_by_clause = config["SCHEDULER_RESOURCE_ORDER"]
    self.rid_i2o = array("i", [0] * MAX_NB_RESOURCES)
    self.rid_o2i = array("i", [0] * MAX_NB_RESOURCES)

    # suspend
    suspendable_roids = []
    if "SCHEDULER_AVAILABLE_SUSPENDED_RESOURCE_TYPE" not in config:
        config["SCHEDULER_AVAILABLE_SUSPENDED_RESOURCE_TYPE"] = "default"

    res_suspend_types = (
        config["SCHEDULER_AVAILABLE_SUSPENDED_RESOURCE_TYPE"]).split()

    # prepare hierarchy stuff
    # "HIERARCHY_LABELS" = "resource_id,network_address"
    conf_hy_labels = config[
        "HIERARCHY_LABELS"] if "HIERARCHY_LABELS" in config else "resource_id,network_address"

    hy_labels = conf_hy_labels.split(",")
    # The label "resource_id" is read from the resource attribute named "id".
    hy_labels_w_id = ["id" if v == "resource_id" else v for v in hy_labels]

    hy_roid = {}
    for hy_label in hy_labels_w_id:
        hy_roid[hy_label] = OrderedDict()

    # available_upto for pseudo job in slot
    available_upto = {}
    self.available_upto = {}

    roids = []
    default_rids = []

    # retrieve resources in order from DB
    self.resources_db = db.query(Resource).order_by(text(order_by_clause)).all()

    # fill the different structures
    for roid, r in enumerate(self.resources_db):
        if (r.state == "Alive") or (r.state == "Absent"):
            rid = int(r.id)
            roids.append(roid)
            if r.type == 'default':
                default_rids.append(rid)

            self.rid_i2o[rid] = roid
            self.rid_o2i[roid] = rid

            # fill hy_rid structure
            for hy_label in hy_labels_w_id:
                v = getattr(r, hy_label)
                if v in hy_roid[hy_label]:
                    hy_roid[hy_label][v].append(roid)
                else:
                    hy_roid[hy_label][v] = [roid]

            # fill available_upto structure
            if r.available_upto in available_upto:
                available_upto[r.available_upto].append(roid)
            else:
                available_upto[r.available_upto] = [roid]

            # fill resource available for suspended job
            if r.type in res_suspend_types:
                suspendable_roids.append(roid)

    # global ordered resources intervals
    # print roids
    self.roid_itvs = ordered_ids2itvs(roids)

    # Expose the hierarchy under the configured label name again.
    if "id" in hy_roid:
        hy_roid["resource_id"] = hy_roid["id"]
        del hy_roid["id"]

    # create hierarchy
    self.hierarchy = Hierarchy(hy_rid=hy_roid).hy

    # transform available_upto
    for k, v in iteritems(available_upto):
        self.available_upto[k] = ordered_ids2itvs(v)

    #
    self.suspendable_roid_itvs = ordered_ids2itvs(suspendable_roids)

    default_roids = [self.rid_i2o[i] for i in default_rids]
    self.default_resource_itvs = unordered_ids2itvs(default_roids)
    # update global variable
    default_resource_itvs = self.default_resource_itvs
def meta_schedule(mode='internal', plt=Platform()):
    """Run one meta-scheduling pass over every active queue.

    Steps, in order: optional quota loading, Gantt initialisation with the
    running jobs, one scheduler call per active queue (by descending
    priority) followed by reservation handling, best-effort kill detection
    or job launching, Gantt visualisation update, energy-saving node
    halt / wake-up, handling of "Resuming" jobs (not entirely implemented),
    start-prediction notifications and finally the processing of jobs in
    the "toError", "toAckReservation" and "toLaunch" states.

    Args:
        mode: ``'external'`` calls ``call_external_scheduler``; any other
            value calls ``call_internal_scheduler``.
        plt: platform object used for the Gantt initialisation and the
            resource set. NOTE(review): the default ``Platform()`` is built
            once, when the function is defined, and shared by all calls --
            confirm this is intended.

    Returns:
        int: 0 by default; 2 when best-effort jobs must be killed or an
        advance-reservation acknowledgement failed; 1 when an acknowledged
        reservation is due (``start_time - 1 <= now``) and nothing else
        changed the code.
    """
    exit_code = 0

    job_security_time = int(config['SCHEDULER_JOB_SECURITY_TIME'])

    if ('QUOTAS' in config) and (config['QUOTAS'] == 'yes'):
        if 'QUOTAS_FILE' not in config:
            config['QUOTAS_FILE'] = './quotas_conf.json'
        load_quotas_rules()

    tools.init_judas_notify_user()
    tools.create_almighty_socket()

    # Written as adjacent literals: the former backslash continuation left a
    # stray backslash/indentation inside the message.
    logger.debug(
        "Retrieve information for already scheduled reservations from "
        "database before flush (keep assign resources)")

    # reservation ??.

    initial_time_sec = tools.get_date()  # time.time()
    initial_time_sql = local_to_sql(initial_time_sec)

    current_time_sec = initial_time_sec
    current_time_sql = initial_time_sql

    gantt_init_results = gantt_init_with_running_jobs(plt, initial_time_sec,
                                                      job_security_time)
    all_slot_sets, scheduled_jobs, besteffort_rid2jid = gantt_init_results
    resource_set = plt.resource_set()

    # Path for user of external schedulers
    if 'OARDIR' in os.environ:
        binpath = os.environ['OARDIR'] + '/'
    else:
        binpath = '/usr/local/lib/oar/'
        logger.warning(
            "OARDIR env variable must be defined, " + binpath + " is used by default")

    for queue in db.query(Queue).order_by(text('priority DESC')).all():
        if queue.state == 'Active':
            logger.debug("Queue " + queue.name + ": Launching scheduler " +
                         queue.scheduler_policy + " at time " + initial_time_sql)

            if mode == 'external':  # pragma: no cover
                call_external_scheduler(binpath, scheduled_jobs, all_slot_sets,
                                        resource_set, job_security_time, queue,
                                        initial_time_sec, initial_time_sql)
            else:
                call_internal_scheduler(plt, scheduled_jobs, all_slot_sets,
                                        job_security_time, queue, initial_time_sec)

            handle_waiting_reservation_jobs(queue.name, resource_set,
                                            job_security_time, current_time_sec)

            # handle_new_AR_jobs
            check_reservation_jobs(
                plt, resource_set, queue.name, all_slot_sets, current_time_sec)

    jobs_to_launch, jobs_to_launch_lst, rid2jid_to_launch = \
        get_gantt_jobs_to_launch(resource_set, job_security_time, current_time_sec)

    if check_besteffort_jobs_to_kill(jobs_to_launch, rid2jid_to_launch,
                                     current_time_sec, besteffort_rid2jid,
                                     resource_set) == 1:
        # We must kill some besteffort jobs
        tools.notify_almighty('ChState')
        exit_code = 2
    elif handle_jobs_to_launch(jobs_to_launch_lst, current_time_sec,
                               current_time_sql) == 1:
        exit_code = 0

    # Update visu gantt tables
    update_gantt_visualization()

    # Manage dynamic node feature
    flag_hulot = False
    timeout_cmd = int(config['SCHEDULER_TIMEOUT'])

    if ((('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config) or
         ((config['ENERGY_SAVING_INTERNAL'] == 'yes') and
          ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))) and
        (('SCHEDULER_NODE_MANAGER_SLEEP_TIME' in config) and
         ('SCHEDULER_NODE_MANAGER_IDLE_TIME' in config))):

        # Look at nodes that are unused for a duration
        idle_duration = int(config['SCHEDULER_NODE_MANAGER_IDLE_TIME'])
        sleep_duration = int(config['SCHEDULER_NODE_MANAGER_SLEEP_TIME'])

        idle_nodes = search_idle_nodes(current_time_sec)
        tmp_time = current_time_sec - idle_duration

        node_halt = []

        # The loop variable used to be called ``idle_duration`` too, which
        # shadowed the configured duration read above.
        for node, node_idle_mark in iteritems(idle_nodes):
            if node_idle_mark < tmp_time:
                # Search if the node has enough time to sleep
                tmp = get_next_job_date_on_node(node)
                if (tmp is None) or (tmp - sleep_duration > current_time_sec):
                    # Search if node has not been woken up recently
                    wakeup_date = get_last_wake_up_date_of_node(node)
                    if (wakeup_date is None) or (wakeup_date < tmp_time):
                        node_halt.append(node)

        if node_halt != []:
            logger.debug("Powering off some nodes (energy saving): " + str(node_halt))
            # Using the built-in energy saving module to shut down nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('HALT', ' '.join(node_halt)):
                    logger.error("Communication problem with the energy saving module (Hulot)\n")
                flag_hulot = True
            else:
                # Not using the built-in energy saving module to shut down nodes
                cmd = config['SCHEDULER_NODE_MANAGER_SLEEP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, node_halt):
                    logger.error("Command " + cmd + "timeouted (" + str(timeout_cmd) +
                                 "s) while trying to poweroff some nodes")

    if (('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config) or
        ((config['ENERGY_SAVING_INTERNAL'] == 'yes') and
         ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))):
        # Get nodes which the scheduler wants to schedule jobs to,
        # but which are in the Absent state, to wake them up
        wakeup_time = int(config['SCHEDULER_NODE_MANAGER_WAKEUP_TIME'])
        nodes = get_gantt_hostname_to_wake_up(current_time_sec, wakeup_time)

        if nodes != []:
            logger.debug("Awaking some nodes: " + str(nodes))
            # Using the built-in energy saving module to wake up nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('WAKEUP', ' '.join(nodes)):
                    logger.error("Communication problem with the energy saving module (Hulot)")
                flag_hulot = True
            else:
                # Not using the built-in energy saving module to wake up nodes
                cmd = config['SCHEDULER_NODE_MANAGER_WAKE_UP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, nodes):
                    logger.error("Command " + cmd + "timeouted (" + str(timeout_cmd) +
                                 "s) while trying to wake-up some nodes ")

    # Send CHECK signal to Hulot if needed
    if not flag_hulot and (config['ENERGY_SAVING_INTERNAL'] == 'yes'):
        if kao_tools.send_to_hulot('CHECK', []):
            logger.error("Communication problem with the energy saving module (Hulot)")

    # Retrieve jobs according to their state and excluding job in 'Waiting' state.
    jobs_by_state = get_current_not_waiting_jobs()

    #
    # Search jobs to resume
    #

    #
    # TODO: TOFINISH
    #
    if 'Resuming' in jobs_by_state:
        logger.warning("Resuming job is NOT ENTIRELY IMPLEMENTED")
        for job in jobs_by_state['Resuming']:
            other_jobs = get_jobs_on_resuming_job_resources(job.id)
            # TODO : look for timesharing other jobs. What do we do?????
            if other_jobs == []:
                # We can resume the job
                logger.debug("[" + str(job.id) + "] Resuming job")
                if 'noop' in job.types:
                    resume_job_action(job.id)
                    logger.debug("[" + str(job.id) + "] Resume NOOP job OK")
                else:
                    script = config['JUST_BEFORE_RESUME_EXEC_FILE']
                    # int() never returns None, so the former
                    # ``if timeout is None`` fallback could not trigger; test
                    # for the key instead, as done for SUSPEND_RESUME_FILE.
                    if 'SUSPEND_RESUME_SCRIPT_TIMEOUT' in config:
                        timeout = int(config['SUSPEND_RESUME_SCRIPT_TIMEOUT'])
                    else:
                        timeout = kao_tools.get_default_suspend_resume_script_timeout()
                    skip = 0
                    logger.debug("[" + str(job.id) + "] Running post suspend script: `" +
                                 script + " " + str(job.id) + "'")
                    # The job id is an argument of the script: it needs the
                    # separating space shown in the log message above.
                    cmd_str = script + " " + str(job.id)
                    return_code = -1
                    try:
                        return_code = call(cmd_str, shell=True, timeout=timeout)
                    except TimeoutExpired as e:
                        logger.error(str(e) + "[" + str(job.id) + "] Suspend script timeouted")
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, "Suspend script timeouted")
                    if return_code != 0:
                        str_error = "[" + str(job.id) + "] Suspend script error, return code = "\
                            + str(return_code)
                        logger.error(str_error)
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, str_error)
                        frag_job(job.id)
                        tools.notify_almighty('Qdel')
                        skip = 1

                    cpuset_nodes = None
                    if 'JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD' in config:
                        cpuset_field = config['JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD']
                    else:
                        cpuset_field = ""
                    if cpuset_field and (skip == 0):
                        # TODO
                        cpuset_name = job.user + "_" + str(job.id)
                        cpuset_nodes = get_cpuset_values(cpuset_field,
                                                         job.assigned_moldable_id)
                        # TODO
                        suspend_data_hash = {'name': cpuset_name,
                                             'job_id': job.id,
                                             'oarexec_pid_file':
                                             kao_tools.get_oar_pid_file_name(job.id)}
                    if cpuset_nodes:
                        # TODO
                        taktuk_cmd = config['TAKTUK_CMD']
                        if 'SUSPEND_RESUME_FILE' in config:
                            suspend_file = config['SUSPEND_RESUME_FILE']
                        else:
                            # TODO
                            suspend_file = kao_tools.get_default_suspend_resume_file()

    #
    # TODO: TOFINISH
    #

    # Notify oarsub -I when they will be launched
    for j_info in get_gantt_waiting_interactive_prediction_date():
        job_id, job_info_type, job_start_time, job_message = j_info
        addr, port = job_info_type.split(':')
        new_start_prediction = local_to_sql(job_start_time)
        logger.debug("[" + str(job_id) + "] Notifying user of the start prediction: " +
                     new_start_prediction + "(" + job_message + ")")
        tools.notify_tcp_socket(addr, port, "[" + initial_time_sql + "] Start prediction: " +
                                new_start_prediction + " (" + job_message + ")")

    # Run the decisions
    # Process "toError" jobs
    if 'toError' in jobs_by_state:
        for job in jobs_by_state['toError']:
            addr, port = job.info_type.split(':')
            if job.type == 'INTERACTIVE' or\
               (job.type == 'PASSIVE' and job.reservation == 'Scheduled'):
                logger.debug("Notify oarsub job (num:" + str(job.id) + ") in error; jobInfo=" +
                             job.info_type)

                nb_sent1 = tools.notify_tcp_socket(addr, port, job.message + '\n')
                nb_sent2 = tools.notify_tcp_socket(addr, port, 'BAD JOB' + '\n')
                if (nb_sent1 == 0) or (nb_sent2 == 0):
                    logger.warning(
                        "Cannot open connection to oarsub client for" + str(job.id))
            logger.debug("Set job " + str(job.id) + " to state Error")
            set_job_state(job.id, 'Error')

    # Process toAckReservation jobs
    if 'toAckReservation' in jobs_by_state:
        for job in jobs_by_state['toAckReservation']:
            addr, port = job.info_type.split(':')
            logger.debug(
                "Treate job" + str(job.id) + " in toAckReservation state")

            nb_sent = tools.notify_tcp_socket(addr, port, 'GOOD RESERVATION' + '\n')

            if nb_sent == 0:
                logger.warning(
                    "Frag job " + str(job.id) + ", I cannot notify oarsub for the reservation")
                add_new_event('CANNOT_NOTIFY_OARSUB', str(
                    job.id), "Can not notify oarsub for the job " + str(job.id))

                # TODO ???
                # OAR::IO::lock_table / OAR::IO::unlock_table($base)
                frag_job(job.id)

                exit_code = 2
            else:
                logger.debug("Notify oarsub for a RESERVATION (idJob=" +
                             str(job.id) + ") --> OK; jobInfo=" + job.info_type)
                set_job_state(job.id, 'Waiting')
                if ((job.start_time - 1) <= current_time_sec) and (exit_code == 0):
                    exit_code = 1

    # Process toLaunch jobs
    if 'toLaunch' in jobs_by_state:
        for job in jobs_by_state['toLaunch']:
            notify_to_run_job(job.id)

    logger.debug("End of Meta Scheduler")

    return exit_code
def __init__(self, **kwargs):
    """Start with an empty ``mld_res_rqts`` list, then set every keyword
    argument as an attribute (a ``mld_res_rqts`` keyword overrides the list).
    """
    self.mld_res_rqts = []
    for attr_name, attr_value in iteritems(kwargs):
        setattr(self, attr_name, attr_value)
def combine(self, quotas):
    """Merge the counters of ``quotas`` into this object's counters.

    For every key of ``quotas.counters`` the first two counter values become
    the maximum of both sides and the third one is accumulated. Each key
    must already be present in ``self.counters``.
    """
    # self.show_counters('combine before')
    for key, other in iteritems(quotas.counters):
        for idx in (0, 1):
            self.counters[key][idx] = max(self.counters[key][idx], other[idx])
        self.counters[key][2] += other[2]
def insert_job(**kwargs):
    """Insert a job and its resource request into the database.

    Special keyword arguments (removed before the ``Job`` row is built):

    * ``res``: list of ``(walltime, [(hierarchy, properties), ...])``, one
      entry per moldable description; ``hierarchy`` is a ``/``-separated
      list of ``type=value`` items. Defaults to
      ``[(60, [('resource_id=1', "")])]``.
    * ``types``: list of job type strings. Defaults to ``[]``.
    * ``user``: stored as the ``job_user`` column.

    ``launching_directory``, ``checkpoint_signal``, ``properties`` and
    ``queue_name`` receive defaults when absent; every remaining keyword is
    passed as a column value of the job row.

    Example::

        insert_job(
            res=[(60, [("switch=2/nodes=20", ""),
                       ("licence=20", "lic_type = 'mathlab'")])],
            types=["besteffort", "container"],
            user="")

    Returns:
        The primary key of the inserted job.
    """
    default_values = {'launching_directory': "", 'checkpoint_signal': 0, 'properties': ""}
    for name, default in iteritems(default_values):
        kwargs.setdefault(name, default)

    res = kwargs.pop('res', [(60, [('resource_id=1', "")])])
    types = kwargs.pop('types', [])
    kwargs.setdefault('queue_name', 'default')
    if 'user' in kwargs:
        kwargs['job_user'] = kwargs.pop('user')

    result = db.session.execute(Job.__table__.insert().values(**kwargs))
    job_id = result.inserted_primary_key[0]

    # One moldable description per entry of ``res``.
    mld_jid_walltimes = [{'moldable_job_id': job_id, 'moldable_walltime': walltime}
                         for walltime, _ in res]
    res_grps = [res_grp for _, res_grp in res]

    result = db.session.execute(MoldableJobDescription.__table__.insert(),
                                mld_jid_walltimes)

    if len(mld_jid_walltimes) == 1:
        mld_ids = [result.inserted_primary_key[0]]
    else:
        # After a multi-row insert the ids are read back from the table.
        rows = db.query(MoldableJobDescription.id)\
                 .filter(MoldableJobDescription.job_id == job_id).all()
        mld_ids = [value for row in rows for value in row]

    for mld_idx, res_grp in enumerate(res_grps):
        # job_resource_groups
        moldable_id = mld_ids[mld_idx]
        mld_id_property = [{'res_group_moldable_id': moldable_id,
                            'res_group_property': properties}
                           for _, properties in res_grp]
        res_hys = [res_hy for res_hy, _ in res_grp]

        result = db.session.execute(JobResourceGroup.__table__.insert(),
                                    mld_id_property)

        if len(mld_id_property) == 1:
            grp_ids = [result.inserted_primary_key[0]]
        else:
            rows = db.query(JobResourceGroup.id)\
                     .filter(JobResourceGroup.moldable_id == moldable_id).all()
            grp_ids = [value for row in rows for value in row]

        # job_resource_descriptions
        for grp_idx, res_hy in enumerate(res_hys):
            res_description = []
            for order, item in enumerate(res_hy.split('/')):
                type_value = item.split('=')
                res_description.append({'res_job_group_id': grp_ids[grp_idx],
                                        'res_job_resource_type': type_value[0],
                                        'res_job_value': type_value[1],
                                        'res_job_order': order})

            db.session.execute(JobResourceDescription.__table__.insert(),
                               res_description)

    if types:
        type_rows = [{'job_id': job_id, 'type': typ} for typ in types]
        db.session.execute(JobType.__table__.insert(), type_rows)

    return job_id
def __init__(self, **kwargs):
    """Set every keyword argument as an attribute of the new object."""
    for attr_name, attr_value in iteritems(kwargs):
        setattr(self, attr_name, attr_value)