def __init__(self, config):
    """
    Construct the application manager on top of a MySQL connection.

    Unless a `_mysql` handle already exists on the instance, one is created from
    config.db_params with connection reuse switched on. The constructor then makes
    sure that the `applications` table has a row with id 0, creating it if absent.
    """

    AppManager.__init__(self, config)

    if not hasattr(self, '_mysql'):
        params = Configuration(config.db_params)
        params.reuse_connection = True # we use locks
        self._mysql = MySQL(params)

    # Nothing more to do if the id = 0 row is already in place
    n_rows = self._mysql.query('SELECT COUNT(*) FROM `applications` WHERE `id` = 0')[0]
    if n_rows != 0:
        return

    # Cannot insert with id = 0 (will be interpreted as next auto_increment id unless
    # server-wide setting is changed)
    # Insert with an implicit id first and update later
    new_id = self._mysql.insert_get_id(
        'applications',
        columns = ('auth_level', 'title', 'path', 'status', 'user_id', 'user_host'),
        values = (AppManager.LV_WRITE, 'wsgi', '', 'done', 0, '')
    )
    self._mysql.query('UPDATE `applications` SET `id` = 0 WHERE `id` = %s', new_id)
def __init__(self, config):
    """
    Initialize the Authorizer and AppManager parts of this object (in that order)
    with the same configuration, then set up the local state: the object starts
    out as not connected and owns a reentrant multiprocessing lock.
    """

    for base in (Authorizer, AppManager):
        base.__init__(self, config)

    self.connected = False

    self._master_server_lock = multiprocessing.RLock()
def update_application(self, app_id, **kwd): #override
    """
    Update columns of the `applications` row whose `id` is app_id.

    Recognized keywords and the columns they set:
      status    -> `status` (converted through AppManager.status_name)
      hostname  -> `server`
      exit_code -> `exit_code`
      path      -> `path`
    Any other keyword is ignored. If none of the recognized keywords is given,
    no query is issued.
    """

    assignments = []
    args = []

    if 'status' in kwd:
        assignments.append('`status` = %s')
        args.append(AppManager.status_name(kwd['status']))

    # keyword name -> column name, for the values that are stored verbatim
    for key, column in (('hostname', 'server'), ('exit_code', 'exit_code'), ('path', 'path')):
        if key in kwd:
            assignments.append('`%s` = %%s' % column)
            args.append(kwd[key])

    if not assignments:
        # An empty SET clause ("UPDATE ... SET  WHERE ...") is a syntax error on the server
        return

    sql = 'UPDATE `applications` SET ' + ', '.join(assignments) + ' WHERE `id` = %s'
    args.append(app_id)

    self._mysql.query(sql, *args)
def _serve_synch_app(self, app_id, path, addr):
    """
    Stream the output of a synchronous application to a client and report how it ended.

    Two connections are opened to addr. One daemon thread per connection runs tail_follow
    on path/_stdout and path/_stderr respectively. The function then blocks on the
    synch-app queue of app_id until a message of the form
    {'status': status, 'exit_code': exit_code} arrives.

    The reader threads are signalled to stop and both sockets are closed on every exit
    path, including when connecting or waiting raises.

    Returns {'status': <status name>, 'exit_code': <exit code>}.
    """

    conns = []
    stop_reading = threading.Event()

    try:
        # Open both connections before starting any reader, as the order of the
        # connections tells stdout from stderr
        for _ in range(2):
            conns.append(socket.create_connection(addr))

        for conn, name in zip(conns, ('stdout', 'stderr')):
            args = (path + '/_' + name, conn, stop_reading)
            th = threading.Thread(target = tail_follow, name = name, args = args)
            th.daemon = True
            th.start()

        msg = self.wait_synch_app_queue(app_id) # {'status': status, 'exit_code': exit_code}
        self.remove_synch_app_queue(app_id)

        # not an elegant solution but we need to keep the reader threads alive for just a bit longer
        time.sleep(1)

    finally:
        stop_reading.set()

        for conn in conns:
            try:
                conn.shutdown(socket.SHUT_RDWR)
            except socket.error:
                # best effort: the peer may already have closed the connection
                pass

            conn.close()

    return {'status': AppManager.status_name(msg['status']), 'exit_code': msg['exit_code']}
def _poll_app(self, app_id):
    """
    Look up an application record.

    Returns (True, record) with the record's 'status' replaced by its name as given by
    AppManager.status_name, or (False, message) when _get_app finds nothing for app_id.
    """

    record = self._get_app(app_id)

    if record is not None:
        record['status'] = AppManager.status_name(record['status'])
        return True, record

    return False, 'Unknown appid %d' % app_id
def _kill_app(self, app_id):
    """
    Request that an application be aborted.

    Returns (False, message) when app_id is unknown. Otherwise returns (True, result)
    where result['result'] is 'success' if the application was in status NEW or RUN
    (its status is then set to KILLED on the master server), and 'noaction' for any
    other status.
    """

    app = self._get_app(app_id)
    if app is None:
        return False, 'Unknown appid %d' % app_id

    if app['status'] not in (AppManager.STAT_NEW, AppManager.STAT_RUN):
        detail = 'Task already completed with status %s (exit code %s).' % \
            (AppManager.status_name(app['status']), app['exit_code'])
        return True, {'result': 'noaction', 'detail': detail}

    self.dynamo_server.manager.master.update_application(app_id, status = AppManager.STAT_KILLED)

    return True, {'result': 'success', 'detail': 'Task aborted.'}
def _schedule_app(self, app_data):
    """
    Call schedule_application on the master server.
    If mode == 'synch', create a communication queue and register it under synch_app_queues. The server should then
    wait on this queue before starting the application.

    app_data must contain 'mode' plus exactly the keys title, path, args, user_id, host,
    auth_level, timeout. It may additionally contain either 'exec_path' (a file to copy to
    <path>/exec.py) or 'exec' (source text to write to <path>/exec.py). The argument itself
    is not modified.

    Returns (True, info) on success, where info has 'appid' and 'path' (and 'pid' in synch
    mode), or (False, message) on any failure, including missing parameters and a failure
    to place the executable.
    """

    app_data = dict(app_data)

    # Report missing essentials in the same way as any other missing parameter
    if 'mode' not in app_data:
        return False, 'Missing parameter(s): %s' % (str(['mode']))

    if ('exec_path' in app_data or 'exec' in app_data) and 'path' not in app_data:
        return False, 'Missing parameter(s): %s' % (str(['path']))

    # place the executable in the work area
    if 'exec_path' in app_data:
        exec_path = app_data.pop('exec_path')
        try:
            shutil.copyfile(exec_path, app_data['path'] + '/exec.py')
        except Exception as exc:
            return False, 'Could not copy executable %s to %s (%s)' % (exec_path, app_data['path'], str(exc))

    elif 'exec' in app_data:
        source = app_data.pop('exec')
        try:
            with open(app_data['path'] + '/exec.py', 'w') as out:
                out.write(source)
        except Exception as exc:
            return False, 'Could not write executable to %s (%s)' % (app_data['path'], str(exc))

    mode = app_data.pop('mode')

    # The remaining keys must match the arguments of schedule_application exactly
    keys = set(app_data.keys())
    args = set(['title', 'path', 'args', 'user_id', 'host', 'auth_level', 'timeout'])
    if len(keys - args) != 0:
        return False, 'Extra parameter(s): %s' % (str(list(keys - args)))
    if len(args - keys) != 0:
        return False, 'Missing parameter(s): %s' % (str(list(args - keys)))

    # schedule the app on master
    # Scheduling and queue registration happen under the same lock
    with self.notify_lock:
        app_id = self.dynamo_server.manager.master.schedule_application(**app_data)
        if mode == 'synch':
            self.synch_app_queues[app_id] = Queue.Queue()

    if mode == 'synch':
        msg = self.wait_synch_app_queue(app_id)

        if msg['status'] != AppManager.STAT_RUN:
            # this app is not going to run
            return False, 'Application status: %s.' % AppManager.status_name(msg['status'])

        return True, {'appid': app_id, 'path': msg['path'], 'pid': msg['pid']} # msg['path'] should be == app_data['path']
    else:
        return True, {'appid': app_id, 'path': app_data['path']}
def _scheduler(self):
    """
    A function to be run as a thread. Rotates through the scheduler sequence directories and execute whatever is up next.
    Perhaps we want an independent logger for this thread

    On entry, every enabled sequence flagged for restart is shifted back to line 0. The
    loop then inspects the first row of each sequence's sequence.db every 10 seconds until
    the stop flag is set:
      EXECUTE rows are scheduled if no application is attached yet, left alone while the
      application is NEW / ASSIGNED / RUN, and otherwise advanced, repeated or restarted
      according to the row's criticality.
      WAIT rows advance once the unix timestamp stored in `arguments` has passed.
    A sequence whose database cannot be opened or read is logged and skipped for this round.
    """

    for sequence_name in self.dynamo_server.manager.master.get_sequences(enabled_only = True):
        _, _, restart, _ = self.dynamo_server.manager.master.find_sequence(sequence_name)
        if restart:
            self._shift_sequence_to(sequence_name, 0)
            LOG.info('[Scheduler] Starting sequence %s from line 0.', sequence_name)
        else:
            LOG.info('[Scheduler] Starting sequence %s.', sequence_name)

    while not self._stop_flag.is_set():
        for sequence_name in self.dynamo_server.manager.master.get_sequences(enabled_only = True):
            if self._stop_flag.is_set():
                break

            work_dir = self.scheduler_base + '/' + sequence_name

            try:
                db = sqlite3.connect(work_dir + '/sequence.db')
                try:
                    cursor = db.cursor()
                    cursor.execute('SELECT `line`, `command`, `title`, `arguments`, `criticality`, `app_id` FROM `sequence` ORDER BY `id` LIMIT 1')
                    row = cursor.fetchone()
                finally:
                    # release the connection whether or not the query worked
                    db.close()

                if row is None:
                    raise RuntimeError('Sequence is empty')

            except Exception as ex:
                LOG.error('[Scheduler] Failed to fetch the current command for sequence %s (%s).', sequence_name, str(ex))
                continue

            iline, command, title, arguments, criticality, app_id = row

            if command == AppServer.EXECUTE:
                if app_id is None:
                    self._schedule_from_sequence(sequence_name, iline)
                    continue

                # poll the app_id
                app = self._get_app(app_id)

                if app is None:
                    LOG.error('[Scheduler] Application %s in sequence %s disappeared.', title, sequence_name)
                    self._schedule_from_sequence(sequence_name, iline)
                    continue

                if app['status'] in (AppManager.STAT_NEW, AppManager.STAT_ASSIGNED, AppManager.STAT_RUN):
                    # still in progress
                    continue

                # The application has ended; terminate the current line of both logs (best effort)
                for log_name in ('log.out', 'log.err'):
                    try:
                        with open(work_dir + '/' + log_name, 'a') as out:
                            out.write('\n')
                    except (IOError, OSError):
                        pass

                if app['status'] == AppManager.STAT_DONE:
                    LOG.info('[Scheduler] Application %s in sequence %s completed.', title, sequence_name)
                    self._schedule_from_sequence(sequence_name, iline + 1)

                else:
                    LOG.warning('[Scheduler] Application %s in sequence %s terminated with status %s.', title, sequence_name, AppManager.status_name(app['status']))

                    if criticality == AppServer.PASS:
                        self._schedule_from_sequence(sequence_name, iline + 1)
                    else:
                        self._send_failure_notice(sequence_name, app)

                    if criticality == AppServer.REPEAT_SEQ:
                        LOG.warning('[Scheduler] Restarting sequence %s.', sequence_name)
                        self._schedule_from_sequence(sequence_name, 0)
                    elif criticality == AppServer.REPEAT_LINE:
                        LOG.warning('[Scheduler] Restarting application %s of sequence %s.', title, sequence_name)
                        self._schedule_from_sequence(sequence_name, iline)

            elif command == AppServer.WAIT:
                # title is the number of seconds expressed in a decimal string
                # arguments is set to the unix timestamp (string) until when the sequence should wait
                wait_until = int(arguments)
                if time.time() < wait_until:
                    continue
                else:
                    self._schedule_from_sequence(sequence_name, iline + 1)

        # all sequences processed; now sleep for 10 seconds
        self._stop_flag.wait(10)
def _scheduler(self):
    """
    A function to be run as a thread. Rotates through the scheduler sequence directories and execute whatever is up next.
    Perhaps we want an independent logger for this thread

    On entry, every enabled sequence flagged for restart is shifted back to line 0. The
    loop then inspects the first row of each sequence's sequence.db every 10 seconds until
    the stop flag is set:
      EXECUTE rows are scheduled if no application is attached yet, left alone while the
      application is NEW / ASSIGNED / RUN, and otherwise advanced, repeated or restarted
      according to the row's criticality.
      WAIT rows advance once the unix timestamp stored in `arguments` has passed.
    A sequence whose database cannot be opened or read is logged and skipped for this round.
    """

    for sequence_name in self.dynamo_server.manager.master.get_sequences(
            enabled_only=True):
        _, _, restart, _ = self.dynamo_server.manager.master.find_sequence(
            sequence_name)
        if restart:
            self._shift_sequence_to(sequence_name, 0)
            LOG.info('[Scheduler] Starting sequence %s from line 0.',
                     sequence_name)
        else:
            LOG.info('[Scheduler] Starting sequence %s.', sequence_name)

    while not self._stop_flag.is_set():
        for sequence_name in self.dynamo_server.manager.master.get_sequences(
                enabled_only=True):
            if self._stop_flag.is_set():
                break

            work_dir = self.scheduler_base + '/' + sequence_name

            try:
                db = sqlite3.connect(work_dir + '/sequence.db')
                try:
                    cursor = db.cursor()
                    cursor.execute(
                        'SELECT `line`, `command`, `title`, `arguments`, `criticality`, `app_id` FROM `sequence` ORDER BY `id` LIMIT 1'
                    )
                    row = cursor.fetchone()
                finally:
                    # release the connection whether or not the query worked
                    db.close()

                if row is None:
                    raise RuntimeError('Sequence is empty')

            except Exception as ex:
                LOG.error(
                    '[Scheduler] Failed to fetch the current command for sequence %s (%s).',
                    sequence_name, str(ex))
                continue

            iline, command, title, arguments, criticality, app_id = row

            if command == AppServer.EXECUTE:
                if app_id is None:
                    self._schedule_from_sequence(sequence_name, iline)
                    continue

                # poll the app_id
                app = self._get_app(app_id)

                if app is None:
                    LOG.error(
                        '[Scheduler] Application %s in sequence %s disappeared.',
                        title, sequence_name)
                    self._schedule_from_sequence(sequence_name, iline)
                    continue

                if app['status'] in (AppManager.STAT_NEW,
                                     AppManager.STAT_ASSIGNED,
                                     AppManager.STAT_RUN):
                    # still in progress
                    continue

                # The application has ended; terminate the current line of both logs (best effort)
                for log_name in ('log.out', 'log.err'):
                    try:
                        with open(work_dir + '/' + log_name, 'a') as out:
                            out.write('\n')
                    except (IOError, OSError):
                        pass

                if app['status'] == AppManager.STAT_DONE:
                    LOG.info(
                        '[Scheduler] Application %s in sequence %s completed.',
                        title, sequence_name)
                    self._schedule_from_sequence(sequence_name, iline + 1)

                else:
                    LOG.warning(
                        '[Scheduler] Application %s in sequence %s terminated with status %s.',
                        title, sequence_name,
                        AppManager.status_name(app['status']))

                    if criticality == AppServer.PASS:
                        self._schedule_from_sequence(sequence_name, iline + 1)
                    else:
                        self._send_failure_notice(sequence_name, app)

                    if criticality == AppServer.REPEAT_SEQ:
                        LOG.warning('[Scheduler] Restarting sequence %s.',
                                    sequence_name)
                        self._schedule_from_sequence(sequence_name, 0)
                    elif criticality == AppServer.REPEAT_LINE:
                        LOG.warning(
                            '[Scheduler] Restarting application %s of sequence %s.',
                            title, sequence_name)
                        self._schedule_from_sequence(sequence_name, iline)

            elif command == AppServer.WAIT:
                # title is the number of seconds expressed in a decimal string
                # arguments is set to the unix timestamp (string) until when the sequence should wait
                wait_until = int(arguments)
                if time.time() < wait_until:
                    continue
                else:
                    self._schedule_from_sequence(sequence_name, iline + 1)

        # all sequences processed; now sleep for 10 seconds
        self._stop_flag.wait(10)
def _collect_processes(self, child_processes):
    """
    Loop through child processes and make state machine transitions.
    Processes come in this function in status RUN or KILLED. It is also possible that
    the master server somehow lost the record of the process (which we considered KILLED).
    If the process times out, status is set to KILLED.
    KILLED jobs will be terminated and popped out of the child_processes list.
    RUN jobs will be polled. If not alive, status changes to DONE or FAILED depending on
    the exit code. If alive, nothing happens.
    In either case, for write-enabled processes, updates are collected from the queue.
    If the status is RUN and collection fails, the subprocess is terminated and the status
    is set to FAILED.

    child_processes is a list of (app_id, process, start time) tuples and is modified in
    place: entries of processes that are no longer tracked are removed. For each removed
    entry the synchronous-application listener is notified and the application record on
    the master server is updated with the final status and exit code.
    """

    writing_process = self.manager.master.get_writing_process_id()

    # Index-based loop because entries are popped while iterating; the index only advances
    # when the current entry is kept.
    ichild = 0
    while ichild != len(child_processes):
        app_id, proc, time_start = child_processes[ichild]

        apps = self.manager.master.get_applications(app_id = app_id)
        if len(apps) == 0:
            # No record on the master server: treat as KILLED and apply the server default timeout
            status = AppManager.STAT_KILLED
            id_str = '%s from unknown (AID %d PID %d)' % (proc.name, app_id, proc.pid)
            timeout = 0
        else:
            status = apps[0]['status']
            id_str = '%s (%s) from %s@%s (AID %d PID %d)' % (proc.name, apps[0]['path'], \
                apps[0]['user_name'], apps[0]['user_host'], app_id, proc.pid)
            timeout = apps[0]['timeout']

        # Kill processes running for too long (server default timeout given in seconds)
        # timeout == 0: use the server default; timeout < 0: never time out;
        # timeout > 0: multiplied by 3600 before being compared with times in seconds
        if timeout == 0:
            min_start_time = time.time() - self.applications_config.timeout
        elif timeout < 0:
            min_start_time = 0
        else:
            min_start_time = time.time() - timeout * 3600

        if time_start < min_start_time:
            LOG.warning('Application %s timed out.', id_str)
            status = AppManager.STAT_KILLED

        if app_id == writing_process:
            # If this is the writing process, read data from the queue
            # read_state: 0 -> nothing written yet (process is running), 1 -> read OK, 2 -> failure
            read_state, update_commands = self._collect_updates()

            # The queue is read regardless of the status, but the result is only acted upon for RUN
            if status == AppManager.STAT_RUN:
                if read_state == 1 and len(update_commands) != 0:
                    self._update_inventory(update_commands)

                elif read_state == 2:
                    status = AppManager.STAT_FAILED
                    serverutils.killproc(proc, LOG, 60)

        if status == AppManager.STAT_KILLED and proc.is_alive():
            LOG.warning('Terminating %s.', id_str)
            serverutils.killproc(proc, LOG, 60)

        if proc.is_alive():
            if status == AppManager.STAT_RUN:
                # Still running normally: keep the entry and move on to the next one
                ichild += 1
                continue
            else:
                # The process must be complete but did not join within 60 seconds
                LOG.error('Application %s is stuck (Status %s).', id_str, AppManager.status_name(status))
        else:
            if status == AppManager.STAT_RUN:
                # Ended on its own: the exit code decides between DONE and FAILED
                if proc.exitcode == 0:
                    status = AppManager.STAT_DONE
                else:
                    status = AppManager.STAT_FAILED

            LOG.info('Application %s completed (Exit code %d Status %s).', id_str, proc.exitcode, AppManager.status_name(status))

        # Reached for every process that is not (alive and RUN): stop tracking it.
        # The index is not advanced because the next entry has moved into this slot.
        child_processes.pop(ichild)

        self.appserver.notify_synch_app(app_id, {'status': status, 'exit_code': proc.exitcode})

        self.manager.master.update_application(app_id, status = status, exit_code = proc.exitcode)
def _run_application_cycles(self):
    """
    Infinite-loop main body of the daemon.
    Step 1: Poll the applications list for one uploaded script.
    Step 2: If a script is found, check the authorization of the script.
    Step 3: Spawn a child process for the script.
    Step 4: Apply updates sent by other servers.
    Step 5: Collect completed child processes. Get updates from the write-enabled child process if there is one.
    Step 6: Clean up.
    Step 7: Sleep for N seconds.

    In the code, steps 4 to 7 are placed at the top of the loop body, ahead of steps 1 to 3,
    so that they are executed on every iteration even when a later step issues "continue".

    The loop has no normal exit; it ends only through an exception, which is re-raised.
    On the way out, all remaining child processes are killed, their records are set to
    KILLED (best effort) and the application server is stopped.
    """

    # Start the application collector thread
    self.appserver.start()

    # List of (app_id, process, start time) for the applications spawned by this server
    child_processes = []

    LOG.info('Start polling for applications.')

    try:
        # first_wait: limits the "Waiting for applications" message to once per idle period
        # do_sleep: set when the last poll found nothing, so that the next iteration sleeps
        # cleanup_timer: iteration counter for the periodic cleanup
        first_wait = True
        do_sleep = False
        cleanup_timer = 0
        while True:
            LOG.debug('Check status and connection')
            self.check_status_and_connection()

            ## Step 4 (easier to do here because we use "continue"s)
            LOG.debug('Read updates')
            self._read_updates()

            ## Step 5 (easier to do here because we use "continue"s)
            LOG.debug('Collect processes')
            self._collect_processes(child_processes)

            if self.webserver is not None:
                self._collect_updates_from_web()

            ## Step 6 (easier to do here because we use "continue"s)
            # Cleanup is triggered once every 100000 iterations of this loop
            cleanup_timer += 1
            if cleanup_timer == 100000:
                LOG.info('Triggering cleanup of old applications.')
                self._cleanup()
                cleanup_timer = 0

            ## Step 7 (easier to do here because we use "continue"s)
            if do_sleep:
                # one successful cycle - reset the error counter
                LOG.debug('Sleep ' + str(self.poll_interval))
                time.sleep(self.poll_interval)

            ## Step 1: Poll
            LOG.debug('Polling for applications.')

            # Fetching the next application and marking it ASSIGNED are done under the master lock
            self.manager.master.lock()
            try:
                # Cannot run a write process if
                #  . I am supposed to be updating my inventory
                #  . There is a server starting
                #  . There is already a write process
                read_only = self.manager.master.inhibit_write()

                app = self.manager.master.get_next_application(read_only)
                if app is not None:
                    self.manager.master.update_application(app['appid'], status = AppManager.STAT_ASSIGNED, hostname = self.manager.hostname)

            finally:
                self.manager.master.unlock()

            if app is None:
                if len(child_processes) == 0 and first_wait:
                    LOG.info('Waiting for applications.')
                    first_wait = False

                do_sleep = True

                LOG.debug('No application found, sleeping for %.1f second(s).' % self.poll_interval)
                continue

            ## Step 2: If a script is found, check the authorization of the script.
            first_wait = True
            do_sleep = False

            if not os.path.exists(app['path'] + '/exec.py'):
                LOG.info('Application %s from %s@%s (auth level: %s) not found.', app['title'], app['user_name'], app['user_host'], AppManager.auth_level_name(app['auth_level']))
                self.manager.master.update_application(app['appid'], status = AppManager.STAT_NOTFOUND)
                self.appserver.notify_synch_app(app['appid'], {'status': AppManager.STAT_NOTFOUND})
                continue

            LOG.info('Found application %s from %s (AID %s, auth level: %s)', app['title'], app['user_name'], app['appid'], AppManager.auth_level_name(app['auth_level']))

            is_local = (app['user_host'] == self.manager.hostname)

            if app['auth_level'] == AppManager.LV_WRITE:
                # check authorization
                # The MD5 hex digest of the executable is passed to the authorization check
                with open(app['path'] + '/exec.py') as source:
                    checksum = hashlib.md5(source.read()).hexdigest()

                if not self.manager.master.check_application_auth(app['title'], app['user_name'], checksum):
                    LOG.warning('Application %s from %s is not authorized for write access.', app['title'], app['user_name'])
                    # TODO send a message

                    self.manager.master.update_application(app['appid'], status = AppManager.STAT_AUTHFAILED)
                    self.appserver.notify_synch_app(app['appid'], {'status': AppManager.STAT_AUTHFAILED})
                    continue

                # NOTE(review): this local is not read anywhere else in this function
                writing_process = app['appid']

            ## Step 3: Spawn a child process for the script
            self.manager.master.update_application(app['appid'], status = AppManager.STAT_RUN)

            proc = self._start_subprocess(app, is_local)

            self.appserver.notify_synch_app(app['appid'], {'status': AppManager.STAT_RUN, 'path': app['path'], 'pid': proc.pid})

            LOG.info('Started application %s (%s) from %s@%s (AID %d PID %d).', app['title'], app['path'], app['user_name'], app['user_host'], app['appid'], proc.pid)

            child_processes.append((app['appid'], proc, time.time()))

    except KeyboardInterrupt:
        if len(child_processes) != 0:
            LOG.info('Terminating all child processes..')

        raise

    except:
        # Any other exception: log, flag this server as being in error, and re-raise
        if len(child_processes) != 0:
            LOG.error('Exception (%s) in server process. Terminating all child processes..', sys.exc_info()[0].__name__)
        else:
            LOG.error('Exception (%s) in server process.', sys.exc_info()[0].__name__)

        if self.manager.status not in [ServerHost.STAT_OUTOFSYNC, ServerHost.STAT_ERROR]:
            try:
                self.manager.set_status(ServerHost.STAT_ERROR)
            except:
                # best effort; the original exception is re-raised below
                pass

        raise

    finally:
        # If the main process was interrupted by Ctrl+C:
        # Ctrl+C will pass SIGINT to all child processes (if this process is the head of the
        # foreground process group). In this case calling terminate() will duplicate signals
        # in the child. Child processes have to always ignore SIGINT and be killed only from
        # SIGTERM sent by the line below.

        for app_id, proc, time_start in child_processes:
            # The lookup is only used to build the log message; a failure is treated as "unknown"
            try:
                apps = self.manager.master.get_applications(app_id = app_id)
            except:
                apps = []

            if len(apps) == 0:
                id_str = '%s from unknown (AID %d PID %d)' % (proc.name, app_id, proc.pid)
            else:
                id_str = '%s (%s) from %s@%s (AID %d PID %d)' % (proc.name, apps[0]['path'], \
                    apps[0]['user_name'], apps[0]['user_host'], app_id, proc.pid)

            LOG.warning('Terminating %s', id_str)

            serverutils.killproc(proc, LOG)

            # Failure to update the record is ignored here
            try:
                self.manager.master.update_application(app_id, status = AppManager.STAT_KILLED)
            except:
                pass

        LOG.info('Stopping application server.')
        # Close the application collector. The collector thread will terminate
        self.appserver.stop()