class TestStatus(object): """Supervisor and Phantom status information container. Attributes: state: str, human readable status. started_at: datetime obj, time when work was started. duration: int, time delta in seconds from start. uid: str, process owner. owner: str, responsible person total_dur: int, theoretical test duration in seconds wd: str, working directory if process. pid: int, supervisor PID. phantom_pid: int, Phantom process PID. answ_mtime: str, time when answ.txt was last time writen by Phantom. answ_mago: int, number of seconds since answ_mtime moment. retcode: int, Phantom process exit code. stdout: str, Phantom process STDOUT. stderr: str, Phantom process STDERR. eggs: An integer count of the eggs we have laid. """ def __init__(self, test_cfg, main_cfg=None, state='Starting'): self.test_cfg = test_cfg self.logger = logging.getLogger('supervisor') self.pid = os.getpid() self.state = state self.started_at = datetime.datetime.utcnow() self.duration = 0 self.uid = getpass.getuser() self.wd = os.getcwd() if main_cfg: self.main_cfg = main_cfg self.fires = {} for idx, fire in enumerate(self.test_cfg['fire']): fire_up = { 'state': 'Created', 'end_status': None, 'phantom_pid': None, 'answ_mtime': None, 'answ_mago': None, 'retcode': None, 'stdout': None, 'stderr': None, } self.test_cfg['fire'][idx].update(fire_up) def jobs_completed(self): '''Check is there currentlly running phantom jobs or not.''' for fire in self.test_cfg['fire']: if fire['end_status'] == None: return False self.state = 'phantoms terminated' self.test_cfg['ended_at'] = datetime.datetime.utcnow() return True def run_jobs(self, sleep_time=1): '''Fork phantom jobs according to test_cfg and monitor them till the termination. ''' # fork phantom for each fire in test. for idx, fire in enumerate(self.test_cfg['fire']): new_job = Popen(PHANTOM_CMD, stdout=PIPE, stderr=PIPE, cwd=fire['wd']) self.test_cfg['fire'][idx]['job'] = new_job self.test_cfg['fire'][idx]['state'] = 'phantom launched' self.logger.info('%s > %s' % (fire['name'], 'Phantom launched')) self.test_cfg['fire'][idx]['phantom_pid'] = new_job.pid self.logger.info('%s > %s %s' % (fire['name'], 'Phantom PID:', new_job.pid)) self.state = 'Phantoms forked' # sleep and pool each child process. end_status = 0 # by default All is ok. if self.main_cfg: self.oc = FirebatOverlordClient(self.main_cfg['FIERBAT_API_URL'], self.main_cfg['FIERBAT_API_TO']) if self.oc: # push updates to overlord if needed. retcode, resp = self.oc.push_test_updates(self.test_cfg, status='running') self.logger.info('Push test status update to Overlord side.') while not self.jobs_completed(): self.duration_update() self.log_mtime_update() for idx, fire in enumerate(self.test_cfg['fire']): if fire['end_status'] != None: # fire allready terminated continue job = self.test_cfg['fire'][idx]['job'] retcode = job.poll() if retcode is not None: # Process finished. self.test_cfg['fire'][idx]['retcode'] = retcode self.test_cfg['fire'][idx]['stdout'] = job.stdout.read() self.test_cfg['fire'][idx]['stderr'] = job.stderr.read() self.test_cfg['fire'][idx]['ended_at'] = datetime.datetime.utcnow() self.test_cfg['fire'][idx]['state'] = 'phantom terminated' self.logger.info( '%s > phantom terminated with retcode: %s' %\ (fire['name'], retcode)) self.test_cfg['fire'][idx]['end_status'] = end_status if self.oc: # push updates to overlord if needed. f = self.test_cfg['fire'][idx] #self.logger.info('Try to patch fire on Overlord side.') self.logger.info('%s > %s' % (f['name'], 'Try to patch fire on Overlord side.')) # Push fire termination details to remote side. self.oc.push_fire_updates(f) # Calculate metrics from phout and push them after. result, msg = proc_whole_phout(f, oc=self.oc) if result: self.logger.info('%s > %s' % (f['name'], 'Fire updated on remote side successfully.')) else: self.logger.info('%s > %s' % (f['name'], 'Remote call failed: %s' % msg)) else: # No process is done, wait a bit and check again. self.state = 'Phantoms waiting' time.sleep(sleep_time) self.msg_update() try: astop_mtime =\ self.test_cfg['fire'][idx]['autostop']['astop_mtime'] except KeyError: astop_mtime = None if astop_mtime and \ self.test_cfg['fire'][idx]['answ_mago'] > astop_mtime: self.logger.error('phantom answ file hasn\'t been' + ' updated to long') self.logger.error('Try to stop the fire') job.kill() end_status = -1 if self.oc: # push updates to overlord if needed. #self.test_cfg['ended_at'] = datetime.datetime.utcnow() retcode, resp = self.oc.push_test_updates(self.test_cfg, status='finished') self.logger.info('Push test status update to Overlord side.') def msg_update(self): '''Update JSON obj translated to other process''' global msg msg = json.dumps(self.repr_as_dict()) def log_result(self): '''Log info about how phantom was terminated.''' for idx, fire in enumerate(self.test_cfg['fire']): if fire['end_status'] == None: continue result = 'undefined' if fire['end_status'] == 0: result = 'phantom terminated normally' if fire['retcode'] == -9: result = 'phantom killed by user with SIGNUM 9' elif fire['end_status'] == -1: result = 'phantom terminated by supervisor, reason: answ mtime' self.logger.info('%s > %s' % (fire['name'], result)) def duration_update(self): delta = datetime.datetime.utcnow() - self.started_at self.duration = delta.seconds def log_mtime_update(self, log_name='answ.txt'): '''Check when Phantom write one of his log files last time, helps to check, is Phantom alive or not.''' for idx, fire in enumerate(self.test_cfg['fire']): log_path = '%s/%s' % (fire['wd'], log_name) try: mtime = os.path.getmtime(log_path) self.test_cfg['fire'][idx]['answ_mtime'] = time.ctime(mtime) delta = datetime.datetime.now() -\ datetime.datetime.fromtimestamp(int(mtime)) self.test_cfg['fire'][idx]['answ_mago'] = delta.seconds except OSError: self.logger.warning('Can\'t read mtime of file: %s' % log_path) def repr_as_dict(self): '''Represent current test status as a dict.''' self.duration_update() self.log_mtime_update() result = { 'test_name': self.test_cfg['title']['test_name'], 'pid': self.pid, 'id': self.test_cfg.get('id', None), 'task': self.test_cfg['title']['task'], 'duration': self.duration, 'started_at': self.started_at.isoformat(), 'state': self.state, 'owner': self.test_cfg.get('owner', 'uid'), 'uid': self.uid, 'fires': [], } fire_allowed = [ 'answ_mago', 'name', 'phantom_pid', 'total_dur', ] for idx, fire in enumerate(self.test_cfg['fire']): f = dict((key,value) for key, value in fire.iteritems() if key in\ fire_allowed) result['fires'].append(f) return result
def run_jobs(self, sleep_time=1): '''Fork phantom jobs according to test_cfg and monitor them till the termination. ''' # fork phantom for each fire in test. for idx, fire in enumerate(self.test_cfg['fire']): new_job = Popen(PHANTOM_CMD, stdout=PIPE, stderr=PIPE, cwd=fire['wd']) self.test_cfg['fire'][idx]['job'] = new_job self.test_cfg['fire'][idx]['state'] = 'phantom launched' self.logger.info('%s > %s' % (fire['name'], 'Phantom launched')) self.test_cfg['fire'][idx]['phantom_pid'] = new_job.pid self.logger.info('%s > %s %s' % (fire['name'], 'Phantom PID:', new_job.pid)) self.state = 'Phantoms forked' # sleep and pool each child process. end_status = 0 # by default All is ok. if self.main_cfg: self.oc = FirebatOverlordClient(self.main_cfg['FIERBAT_API_URL'], self.main_cfg['FIERBAT_API_TO']) if self.oc: # push updates to overlord if needed. retcode, resp = self.oc.push_test_updates(self.test_cfg, status='running') self.logger.info('Push test status update to Overlord side.') while not self.jobs_completed(): self.duration_update() self.log_mtime_update() for idx, fire in enumerate(self.test_cfg['fire']): if fire['end_status'] != None: # fire allready terminated continue job = self.test_cfg['fire'][idx]['job'] retcode = job.poll() if retcode is not None: # Process finished. self.test_cfg['fire'][idx]['retcode'] = retcode self.test_cfg['fire'][idx]['stdout'] = job.stdout.read() self.test_cfg['fire'][idx]['stderr'] = job.stderr.read() self.test_cfg['fire'][idx]['ended_at'] = datetime.datetime.utcnow() self.test_cfg['fire'][idx]['state'] = 'phantom terminated' self.logger.info( '%s > phantom terminated with retcode: %s' %\ (fire['name'], retcode)) self.test_cfg['fire'][idx]['end_status'] = end_status if self.oc: # push updates to overlord if needed. f = self.test_cfg['fire'][idx] #self.logger.info('Try to patch fire on Overlord side.') self.logger.info('%s > %s' % (f['name'], 'Try to patch fire on Overlord side.')) # Push fire termination details to remote side. self.oc.push_fire_updates(f) # Calculate metrics from phout and push them after. result, msg = proc_whole_phout(f, oc=self.oc) if result: self.logger.info('%s > %s' % (f['name'], 'Fire updated on remote side successfully.')) else: self.logger.info('%s > %s' % (f['name'], 'Remote call failed: %s' % msg)) else: # No process is done, wait a bit and check again. self.state = 'Phantoms waiting' time.sleep(sleep_time) self.msg_update() try: astop_mtime =\ self.test_cfg['fire'][idx]['autostop']['astop_mtime'] except KeyError: astop_mtime = None if astop_mtime and \ self.test_cfg['fire'][idx]['answ_mago'] > astop_mtime: self.logger.error('phantom answ file hasn\'t been' + ' updated to long') self.logger.error('Try to stop the fire') job.kill() end_status = -1 if self.oc: # push updates to overlord if needed. #self.test_cfg['ended_at'] = datetime.datetime.utcnow() retcode, resp = self.oc.push_test_updates(self.test_cfg, status='finished') self.logger.info('Push test status update to Overlord side.')