def bipbip_leon_executor(*args, **command):
    '''Launch the leon or bipbip helper command for the given job.'''
    job_id = command["job_id"]
    if command["cmd"] == "LEONEXTERMINATE":
        cmd_arg = [leon_command, str(job_id)]
    else:
        cmd_arg = [bipbip_command, str(job_id)] + command["args"]
    logger.debug("Launching: " + str(cmd_arg))
    # TODO returncode
    tools.call(cmd_arg)
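
# Illustrative note on the executor above (the call shape is an assumption about
# its caller, not taken from this file): it receives the command as keyword
# arguments, e.g.
#
#     bipbip_leon_executor(job_id=42, cmd='LEONEXTERMINATE', args=[])
#
# which runs the leon helper for job 42; any other cmd value runs bipbip with
# the extra arguments appended.
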
def launch_command(command):
    '''Launch the command line passed as parameter.'''
    # TODO move to oar.lib.tools
    global finishTag

    logger.debug('Launching command : [' + command + ']')

    status = tools.call(command)

    # Decode the status word (exit code, signal number, core-dump flag)
    exit_value = status >> 8
    signal_num = status & 127
    dumped_core = status & 128

    logger.debug(command + ' terminated')
    logger.debug('Exit value : ' + str(exit_value))
    logger.debug('Signal num : ' + str(signal_num))
    logger.debug('Core dumped : ' + str(dumped_core))

    if signal_num or dumped_core:
        logger.error('Something wrong occurred (signal or core dumped) '
                     'when trying to call [' + command + '] command')
        finishTag = 1

    return exit_value
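
# Note on the status decoding in launch_command (illustrative, assuming
# tools.call() returns a raw 16-bit wait()-style status as os.system() does):
# the high byte holds the exit code, bits 0-6 the terminating signal and bit 7
# the core-dump flag. For example, 0x0200 decodes to exit_value=2, signal_num=0,
# dumped_core=0, while 0x008B means "killed by signal 11, core dumped".
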
def meta_schedule(mode='internal', plt=Platform()):
    '''Entry point of the kao meta-scheduler.

    Runs the scheduler of every active queue (internally or through an
    external scheduler command), handles advance reservations, besteffort
    jobs, energy saving and job state transitions, then returns an exit
    code describing the decisions taken.
    '''
    exit_code = 0

    job_security_time = int(config['SCHEDULER_JOB_SECURITY_TIME'])

    if ('QUOTAS' in config) and (config['QUOTAS'] == 'yes'):
        if 'QUOTAS_FILE' not in config:
            config['QUOTAS_FILE'] = './quotas_conf.json'
        load_quotas_rules()

    tools.init_judas_notify_user()
    tools.create_almighty_socket()

    logger.debug("Retrieve information for already scheduled reservations from "
                 "database before flush (keep assigned resources)")

    # reservation ??.

    initial_time_sec = tools.get_date()  # time.time()
    initial_time_sql = local_to_sql(initial_time_sec)

    current_time_sec = initial_time_sec
    current_time_sql = initial_time_sql

    gantt_init_results = gantt_init_with_running_jobs(plt, initial_time_sec,
                                                      job_security_time)
    all_slot_sets, scheduled_jobs, besteffort_rid2jid = gantt_init_results
    resource_set = plt.resource_set()

    # Path for user of external schedulers
    if 'OARDIR' in os.environ:
        binpath = os.environ['OARDIR'] + '/'
    else:
        binpath = '/usr/local/lib/oar/'
        logger.warning("OARDIR env variable must be defined, " + binpath +
                       " is used by default")

    for queue in db.query(Queue).order_by(text('priority DESC')).all():
        if queue.state == 'Active':
            logger.debug("Queue " + queue.name + ": Launching scheduler " +
                         queue.scheduler_policy + " at time " + initial_time_sql)

            if mode == 'external':  # pragma: no cover
                call_external_scheduler(binpath, scheduled_jobs, all_slot_sets,
                                        resource_set, job_security_time, queue,
                                        initial_time_sec, initial_time_sql)
            else:
                call_internal_scheduler(plt, scheduled_jobs, all_slot_sets,
                                        job_security_time, queue, initial_time_sec)

            handle_waiting_reservation_jobs(queue.name, resource_set,
                                            job_security_time, current_time_sec)

            # handle_new_AR_jobs
            check_reservation_jobs(
                plt, resource_set, queue.name, all_slot_sets, current_time_sec)

    jobs_to_launch, jobs_to_launch_lst, rid2jid_to_launch = get_gantt_jobs_to_launch(
        resource_set, job_security_time, current_time_sec)

    if check_besteffort_jobs_to_kill(jobs_to_launch, rid2jid_to_launch,
                                     current_time_sec, besteffort_rid2jid,
                                     resource_set) == 1:
        # We must kill some besteffort jobs
        tools.notify_almighty('ChState')
        exit_code = 2
    elif handle_jobs_to_launch(jobs_to_launch_lst, current_time_sec,
                               current_time_sql) == 1:
        exit_code = 0

    # Update visu gantt tables
    update_gantt_visualization()

    # Manage dynamic node feature
    flag_hulot = False
    timeout_cmd = int(config['SCHEDULER_TIMEOUT'])

    if ((('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config)
         or ((config['ENERGY_SAVING_INTERNAL'] == 'yes')
             and ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config)))
            and (('SCHEDULER_NODE_MANAGER_SLEEP_TIME' in config)
                 and ('SCHEDULER_NODE_MANAGER_IDLE_TIME' in config))):

        # Look at nodes that are unused for a duration
        idle_duration = int(config['SCHEDULER_NODE_MANAGER_IDLE_TIME'])
        sleep_duration = int(config['SCHEDULER_NODE_MANAGER_SLEEP_TIME'])

        idle_nodes = search_idle_nodes(current_time_sec)
        tmp_time = current_time_sec - idle_duration

        node_halt = []
        for node, idle_duration in iteritems(idle_nodes):
            if idle_duration < tmp_time:
                # Search if the node has enough time to sleep
                tmp = get_next_job_date_on_node(node)
                if (tmp is None) or (tmp - sleep_duration > current_time_sec):
                    # Search if node has not been woken up recently
                    wakeup_date = get_last_wake_up_date_of_node(node)
                    if (wakeup_date is None) or (wakeup_date < tmp_time):
                        node_halt.append(node)

        if node_halt != []:
            logger.debug("Powering off some nodes (energy saving): " + str(node_halt))
            # Using the built-in energy saving module to shut down nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('HALT', ' '.join(node_halt)):
                    logger.error("Communication problem with the energy saving "
                                 "module (Hulot)")
                    flag_hulot = 1
            else:
                # Not using the built-in energy saving module to shut down nodes
                cmd = config['SCHEDULER_NODE_MANAGER_SLEEP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, node_halt):
                    logger.error("Command " + cmd + " timed out (" + str(timeout_cmd) +
                                 "s) while trying to power off some nodes")

    if (('SCHEDULER_NODE_MANAGER_SLEEP_CMD' in config)
        or ((config['ENERGY_SAVING_INTERNAL'] == 'yes')
            and ('ENERGY_SAVING_NODE_MANAGER_SLEEP_CMD' in config))):
        # Get nodes which the scheduler wants to schedule jobs to,
        # but which are in the Absent state, to wake them up
        wakeup_time = int(config['SCHEDULER_NODE_MANAGER_WAKEUP_TIME'])
        nodes = get_gantt_hostname_to_wake_up(current_time_sec, wakeup_time)

        if nodes != []:
            logger.debug("Waking up some nodes: " + str(nodes))
            # Using the built-in energy saving module to wake up nodes
            if config['ENERGY_SAVING_INTERNAL'] == 'yes':
                if kao_tools.send_to_hulot('WAKEUP', ' '.join(nodes)):
                    logger.error("Communication problem with the energy saving "
                                 "module (Hulot)")
                    flag_hulot = 1
            else:
                # Not using the built-in energy saving module to wake up nodes
                cmd = config['SCHEDULER_NODE_MANAGER_WAKE_UP_CMD']
                if kao_tools.fork_and_feed_stdin(cmd, timeout_cmd, nodes):
                    logger.error("Command " + cmd + " timed out (" + str(timeout_cmd) +
                                 "s) while trying to wake up some nodes")

    # Send CHECK signal to Hulot if needed
    if not flag_hulot and (config['ENERGY_SAVING_INTERNAL'] == 'yes'):
        if kao_tools.send_to_hulot('CHECK', []):
            logger.error("Communication problem with the energy saving module (Hulot)")

    # Retrieve jobs according to their state, excluding jobs in the 'Waiting' state.
    jobs_by_state = get_current_not_waiting_jobs()

    #
    # Search jobs to resume
    #

    #
    # TODO: TOFINISH
    #
    if 'Resuming' in jobs_by_state:
        logger.warn("Resuming job is NOT ENTIRELY IMPLEMENTED")
        for job in jobs_by_state['Resuming']:
            other_jobs = get_jobs_on_resuming_job_resources(job.id)
            # TODO: look for timesharing other jobs. What do we do?????
            if other_jobs == []:
                # We can resume the job
                logger.debug("[" + str(job.id) + "] Resuming job")
                if 'noop' in job.types:
                    resume_job_action(job.id)
                    logger.debug("[" + str(job.id) + "] Resume NOOP job OK")
                else:
                    script = config['JUST_BEFORE_RESUME_EXEC_FILE']
                    if 'SUSPEND_RESUME_SCRIPT_TIMEOUT' in config:
                        timeout = int(config['SUSPEND_RESUME_SCRIPT_TIMEOUT'])
                    else:
                        timeout = kao_tools.get_default_suspend_resume_script_timeout()
                    skip = 0
                    logger.debug("[" + str(job.id) + "] Running post suspend script: `" +
                                 script + " " + str(job.id) + "'")
                    cmd_str = script + " " + str(job.id)
                    return_code = -1
                    try:
                        return_code = call(cmd_str, shell=True, timeout=timeout)
                    except TimeoutExpired as e:
                        logger.error(str(e) + " [" + str(job.id) +
                                     "] Suspend script timed out")
                        add_new_event('RESUME_SCRIPT_ERROR', job.id,
                                      "Suspend script timed out")

                    if return_code != 0:
                        str_error = "[" + str(job.id) + "] Suspend script error, return code = " \
                                    + str(return_code)
                        logger.error(str_error)
                        add_new_event('RESUME_SCRIPT_ERROR', job.id, str_error)
                        frag_job(job.id)
                        tools.notify_almighty('Qdel')
                        skip = 1

                    cpuset_nodes = None
                    if 'JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD' in config:
                        cpuset_field = config['JOB_RESOURCE_MANAGER_PROPERTY_DB_FIELD']
                    else:
                        cpuset_field = ""

                    if cpuset_field and (skip == 0):
                        # TODO
                        cpuset_name = job.user + "_" + str(job.id)
                        cpuset_nodes = get_cpuset_values(cpuset_field,
                                                         job.assigned_moldable_id)
                        # TODO
                        suspend_data_hash = {'name': cpuset_name,
                                             'job_id': job.id,
                                             'oarexec_pid_file':
                                                 kao_tools.get_oar_pid_file_name(job.id)}
                    if cpuset_nodes:
                        # TODO
                        taktuk_cmd = config['TAKTUK_CMD']
                        if 'SUSPEND_RESUME_FILE' in config:
                            suspend_file = config['SUSPEND_RESUME_FILE']
                        else:
                            # TODO
                            suspend_file = kao_tools.get_default_suspend_resume_file()

    #
    # TODO: TOFINISH
    #

    # Notify oarsub -I when they will be launched
    for j_info in get_gantt_waiting_interactive_prediction_date():
        job_id, job_info_type, job_start_time, job_message = j_info
        addr, port = job_info_type.split(':')
        new_start_prediction = local_to_sql(job_start_time)
        logger.debug("[" + str(job_id) + "] Notifying user of the start prediction: " +
                     new_start_prediction + " (" + job_message + ")")
        tools.notify_tcp_socket(addr, port, "[" + initial_time_sql + "] Start prediction: " +
                                new_start_prediction + " (" + job_message + ")")

    # Run the decisions
    # Process "toError" jobs
    if 'toError' in jobs_by_state:
        for job in jobs_by_state['toError']:
            addr, port = job.info_type.split(':')
            if job.type == 'INTERACTIVE' or \
               (job.type == 'PASSIVE' and job.reservation == 'Scheduled'):
                logger.debug("Notify oarsub job (num:" + str(job.id) +
                             ") in error; jobInfo=" + job.info_type)

                nb_sent1 = tools.notify_tcp_socket(addr, port, job.message + '\n')
                nb_sent2 = tools.notify_tcp_socket(addr, port, 'BAD JOB' + '\n')
                if (nb_sent1 == 0) or (nb_sent2 == 0):
                    logger.warn("Cannot open connection to oarsub client for " +
                                str(job.id))

            logger.debug("Set job " + str(job.id) + " to state Error")
            set_job_state(job.id, 'Error')

    # Process toAckReservation jobs
    if 'toAckReservation' in jobs_by_state:
        for job in jobs_by_state['toAckReservation']:
            addr, port = job.info_type.split(':')
            logger.debug("Treating job " + str(job.id) + " in toAckReservation state")

            nb_sent = tools.notify_tcp_socket(addr, port, 'GOOD RESERVATION' + '\n')

            if nb_sent == 0:
                logger.warn("Frag job " + str(job.id) +
                            ", I cannot notify oarsub for the reservation")
                add_new_event('CANNOT_NOTIFY_OARSUB', str(job.id),
                              "Can not notify oarsub for the job " + str(job.id))

                # TODO ???
                # OAR::IO::lock_table / OAR::IO::unlock_table($base)
                frag_job(job.id)

                exit_code = 2
            else:
                logger.debug("Notify oarsub for a RESERVATION (idJob=" + str(job.id) +
                             ") --> OK; jobInfo=" + job.info_type)
                set_job_state(job.id, 'Waiting')
                if ((job.start_time - 1) <= current_time_sec) and (exit_code == 0):
                    exit_code = 1

    # Process toLaunch jobs
    if 'toLaunch' in jobs_by_state:
        for job in jobs_by_state['toLaunch']:
            notify_to_run_job(job.id)

    logger.debug("End of Meta Scheduler")

    return exit_code
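
# Minimal invocation sketch (not part of the original module; assumes this file
# lives at oar.kao.meta_sched and that configuration and database are set up):
#
#     from oar.kao.meta_sched import meta_schedule
#     exit_code = meta_schedule(mode='internal')  # 'external' shells out to the
#                                                 # scheduler commands in $OARDIR
#
# exit_code is 2 when besteffort jobs must be killed or a reservation could not
# be acknowledged, 1 when an acknowledged reservation is due to start now, and 0
# otherwise.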