def monitorEvents(options, events):
    """This is the main state machine for the Tegra monitor.

    Respond to the events sent via queue and also monitor the
    state of the buildslave if it's been started.
    """
    pidFile = os.path.join(options.bbpath, 'twistd.pid')
    flagFile = os.path.join(options.bbpath, 'proxy.flg')
    errorFile = os.path.join(options.bbpath, 'error.flg')

    bbEnv = {'PATH': os.getenv('PATH'),
             'SUT_NAME': options.tegra,
             'SUT_IP': options.tegraIP,
             }

    event = None
    bbActive = False
    tegraActive = False
    connected = False
    nChatty = 0
    maxChatty = 10
    hbFails = 0
    maxFails = 50
    sleepFails = 5
    softCount = 0      # how many times tegraActive is True
                       # but errorFlag is set
    softCountMax = 5   # how many active events to wait before
                       # triggering a soft reset
    softResets = 0
    softResetMax = 5   # how many soft resets do we try before
                       # waiting for a hard reset
    hardResets = 0
    hardResetsMax = 3
    lastHangCheck = time.time()

    log.info('monitoring started (process pid %s)' % current_process().pid)

    while True:
        if not connected:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((options.tegraIP, sutDataPort))
                connected = True
            except:
                connected = False
                hbFails += 1
                log.info('Error connecting to data port - sleeping for %d seconds' % sleepFails)
                time.sleep(sleepFails)

        try:
            event = events.get(False)
        except Empty:
            event = None

        if event is None:
            if connected:
                try:
                    hbData = hbSocket.recv(4096)
                except:
                    hbData = ''
                    connected = False
                    dumpException('hbSocket.recv()')

                if len(hbData) > 1:
                    log.debug('socket data [%s]' % hbData[:-1])

                    if 'ebooting ...' in hbData:
                        log.warning('device is rebooting')
                        events.put(('reboot',))
                        if os.path.isfile(flagFile):
                            time.sleep(5)
                        hbSocket.close()
                        connected = False
                    else:
                        log.info('heartbeat detected')
                        events.put(('active',))
                        hbFails = 0
                else:
                    hbFails += 1
        else:
            state = event[0]
            s = 'event %s hbFails %d / %d' % (state, hbFails, maxFails)
            nChatty += 1
            if nChatty > maxChatty:
                log.info(s)
            else:
                log.debug(s)
            if nChatty > maxChatty:
                nChatty = 0

            if state == 'reboot':
                tegraActive = False
                if not os.path.isfile(flagFile):
                    log.warning('Tegra rebooting, stopping buildslave')
                    events.put(('stop',))
            elif state == 'stop' or state == 'offline':
                stopSlave(pidFile)
                bbActive = False
                if connected and state == 'offline':
                    hbSocket.close()
                    connected = False
            elif state == 'active' or state == 'dialback':
                tegraActive = True
                if not bbActive:
                    if os.path.isfile(errorFile):
                        log.warning('Tegra active but error flag set [%d/%d]' %
                                    (softCount, softResets))
                        softCount += 1
                        if softCount > softCountMax:
                            softCount = 0
                            if softResets < softResetMax:
                                softResets += 1
                                log.warning('removing error flag to see if tegra comes back')
                                os.remove(errorFile)
                            else:
                                hardResets += 1
                                log.warning('hard reset reboot check [%d/%d]' %
                                            (hardResets, hardResetsMax))
                                if hardResets < hardResetsMax:
                                    sendReboot(options.tegraIP, sutDataPort)
                                else:
                                    events.put(('offline',))
                    else:
                        events.put(('start',))
            elif state == 'start':
                if tegraActive and not bbActive:
                    log.debug('starting buildslave in %s' % options.bbpath)
                    bbProc, _ = runCommand(['twistd', '--no_save',
                                            '--rundir=%s' % options.bbpath,
                                            '--pidfile=%s' % pidFile,
                                            '--python=%s' % os.path.join(options.bbpath, 'buildbot.tac')],
                                           env=bbEnv)
                    log.info('buildslave start returned %s' % bbProc.returncode)
                    if bbProc.returncode == 0 or bbProc.returncode == 1:
                        # pause to give twistd a chance to generate the pidfile
                        # before the code that follows goes off killing it because
                        # it thinks that it didn't start properly
                        # OMGRACECONDITIONWTF
                        nTries = 0
                        while nTries < 20:
                            nTries += 1
                            if os.path.isfile(pidFile):
                                log.debug('pidfile found, setting bbActive to True')
                                bbActive = True
                                break
                            else:
                                time.sleep(5)
            elif state == 'dialback':
                softCount = 0
                softResets = 0
                hardResets = 0
            elif state == 'terminate':
                break

        if hbFails > maxFails:
            hbFails = 0
            sleepFails += 5
            if sleepFails > 300:
                sleepFails = 300
            if os.path.isfile(flagFile):
                log.debug('install flag found, resetting error count')
            else:
                events.put(('offline',))
            if connected:
                hbSocket.close()
                connected = False

        log.debug('bbActive %s tegraActive %s' % (bbActive, tegraActive))

        if os.path.isfile(errorFile):
            if bbActive:
                log.error('errorFile detected - sending stop request')
                events.put(('stop',))

        if bbActive:
            if os.path.isfile(pidFile):
                n = time.time()
                if not checkSlaveAlive(options.bbpath):
                    log.warning('buildslave should be active but pid is not alive')
                if int(n - lastHangCheck) > 300:
                    lastHangCheck = n
                    logTD = checkSlaveActive(options.bbpath)
                    if logTD.days > 0 or (logTD.days == 0 and logTD.seconds > options.hangtime):
                        log.error('last activity was %d days %d seconds ago - marking as hung slave' %
                                  (logTD.days, logTD.seconds))
                        events.put(('offline',))
            else:
                log.warning('buildslave should be active but pidfile not found, marking as offline')
                events.put(('offline',))
        else:
            if os.path.isfile(pidFile):
                if checkSlaveAlive(options.bbpath):
                    log.error('buildslave should NOT be active but pidfile found, killing buildbot')
                    events.put(('stop',))
                else:
                    log.warning('buildslave not active but pidfile found, removing pidfile')
                    os.remove(pidFile)

    if bbActive:
        stopSlave(pidFile)

    log.info('monitor stopped')
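

# Illustrative helper (not part of the original module): a minimal sketch of
# how monitorEvents() might be wired up. The use of multiprocessing is an
# assumption suggested by the current_process() call above; the real launcher
# and the exact shape of the 'options' object (bbpath, tegra, tegraIP,
# hangtime) may differ.
from multiprocessing import Process, Queue


def startMonitor(options):
    """Spawn the monitor in its own process and return (process, queue).

    The queue is the same 'events' queue that monitorEvents() polls;
    putting ('terminate',) on it makes the state machine exit its loop.
    """
    events = Queue()
    monitor = Process(target=monitorEvents, args=(options, events))
    monitor.start()
    return monitor, events


# Hypothetical shutdown sequence:
#     monitor, events = startMonitor(options)
#     ...
#     events.put(('terminate',))
#     monitor.join()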
def checkTegra(master, tegra):
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    errorFlag = os.path.isfile(errorFile)
    sTegra = 'OFFLINE'
    sutFound = False
    logTD = None

    status = {'tegra': tegra,
              'active': False,
              'cp': 'OFFLINE',
              'bs': 'OFFLINE',
              'msg': '',
              }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (master['hostname'], master['http_port'])

    fPing, lPing = pingDevice(tegra)
    if fPing:
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            sutFound = True

            time.sleep(2)
            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True

            hbSocket.close()
        except:
            status['active'] = False
            dumpException('socket')

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    # Cheat until we have a better check solution for new watch_devices.sh
    status['cp'] = 'active'  # pretend all is well

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None:
            if (logTD.days > 0) or (logTD.days == 0 and logTD.seconds > 3600):
                status['bs'] = 'INACTIVE'
                status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
            else:
                status['bs'] = 'active'
        else:
            status['bs'] = 'INACTIVE'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files
        # exist but ignore buildbot.tac file itself (except to note that it is
        # missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'], status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    open(exportFile, 'a+').write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra,
            status['cp'], status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                hbSocket.close()
                log.info('rebooting tegra')
            except:
                dumpException('socket')

        if errorFlag:
            log.info('clearing error.flg')
            os.remove(errorFile)

    # Here we try to catch the state where sutagent and cp are inactive,
    # which is determined by:
    #     sTegra == 'INACTIVE' and
    #     status['cp'] == 'INACTIVE'
    # status['cp'] will be set to INACTIVE only if logTD.seconds (last time
    # clientproxy updated its log file) is > 3600
    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)
        else:
            if sTegra == 'OFFLINE' and status['bs'] != 'active':
                log.info('power cycling tegra')
                reboot_device(tegra)

    if options.reset and sTegra == 'INACTIVE' and status['cp'] == 'INACTIVE':
        log.info('stopping hung clientproxy')
        stopDevice(tegra)
        time.sleep(5)
        log.info('starting clientproxy for %s' % tegra)
        os.chdir(tegraPath)
        runCommand(['python', 'clientproxy.py', '-b', '--device=%s' % tegra])
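

# Illustrative driver (not part of the original module): a minimal sketch of
# how checkTegra() might be called for a pool of devices. The 'tegras' list
# and the 'masters' mapping are assumptions made for the example; only the
# (master, tegra) call signature and the master dict keys ('environment',
# 'hostname', 'http_port') come from the code above.
def checkAllTegras(tegras, masters):
    """Run checkTegra() once per named device (e.g. 'tegra-042').

    'masters' is assumed to map a tegra name to its master dict, or to
    None when the device is not assigned to a buildbot master.
    """
    for tegra in tegras:
        checkTegra(masters.get(tegra), tegra)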
def checkTegra(master, tegra):
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    proxyFile = os.path.join(tegraPath, 'proxy.flg')
    errorFlag = os.path.isfile(errorFile)
    proxyFlag = os.path.isfile(proxyFile)
    sTegra = 'OFFLINE'
    sutFound = False

    status = {'tegra': tegra,
              'active': False,
              'cp': 'OFFLINE',
              'bs': 'OFFLINE',
              'msg': '',
              }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (master['hostname'], master['http_port'])

    fPing, lPing = pingTegra(tegra)
    if fPing:
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            sutFound = True

            time.sleep(2)
            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True

            hbSocket.close()
        except:
            status['active'] = False
            dumpException('socket')

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    if checkCPAlive(tegraPath):
        logTD = checkCPActive(tegraPath)
        # guard the attribute access so a None result from checkCPActive()
        # doesn't raise before the comparison
        if logTD is not None and (logTD.days > 0 or
                                  (logTD.days == 0 and logTD.seconds > 300)):
            status['cp'] = 'INACTIVE'
            status['msg'] += 'CP %dd %ds;' % (logTD.days, logTD.seconds)
        else:
            status['cp'] = 'active'
    else:
        if os.path.isfile(os.path.join(tegraPath, 'clientproxy.pid')):
            status['msg'] += 'clientproxy.pid found;'

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None and (logTD.days > 0 or
                                  (logTD.days == 0 and logTD.seconds > 3600)):
            status['bs'] = 'INACTIVE'
            status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
        else:
            status['bs'] = 'active'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files exist
        # but ignore buildbot.tac file itself (except to note that it is missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)
    if proxyFlag:
        status['msg'] += 'REBOOTING '

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'], status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    open(exportFile, 'a+').write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra,
            status['cp'], status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                hbSocket.close()
                log.info('rebooting tegra')
            except:
                dumpException('socket')

        if errorFlag:
            log.info('clearing error.flg')
            os.remove(errorFile)
        if proxyFlag:
            log.info('clearing proxy.flg')
            os.remove(proxyFile)

    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_tegra(tegra)
        else:
            if sTegra == 'OFFLINE' and status['bs'] != 'active':
                log.info('power cycling tegra')
                reboot_tegra(tegra)