def watchdog_loop():
    """Keep AMF running until a SAFplus stop is requested.

    On entry the reboot file is removed from the run directory and
    ``asp.remove_stop_file()`` is called.  The loop then polls
    ``asp.get_amf_pid()``; when it reports 0 the function either returns
    (the stop file exists) or calls ``start_ams()``.  Every pass that does
    not return is followed by a ``SAFPLUS_RESTART_DELAY`` pause, including
    passes in which the poll itself raised.

    Returns:
        None, once AMF has no pid and the stop file is present.
    """
    run_dir = asp.get_asp_run_dir()
    reboot_file = run_dir + '/' + asp.ASP_REBOOT_FILE
    stop_file = run_dir + '/' + asp.SAFPLUS_STOP_FILE

    # Markers left behind by a previous run must not steer this one.
    asp.safe_remove(reboot_file)
    asp.remove_stop_file()

    while True:
        try:
            if asp.get_amf_pid() == 0:
                if os.path.isfile(stop_file):
                    # Kill watchdog if stop file exists.
                    # (A parenthesised single argument prints identically
                    # under the Python 2 print statement.)
                    print("Stop file exists: SAFplus is stopping")
                    return
                # Restart AMF if stop file not found.
                print("Stop file not found: Starting AMF from Watchdog")
                start_ams()
        except Exception as e:
            # Best effort: the watchdog must outlive a failed poll or a
            # failed restart attempt, so report the error and carry on.
            print("Exception %s" % str(e))

        # The pause is guarded separately so that a failure above can never
        # skip it; otherwise a persistent error would turn this loop into a
        # busy spin that floods the log.
        try:
            wdSleep(SAFPLUS_RESTART_DELAY)
        except Exception as e:
            print("Exception %s" % str(e))
def amf_watchdog_loop():
    """Supervise AMF (and openhpid) and react when AMF goes away.

    The loop wakes every five seconds.  While ``asp.get_amf_pid()`` is
    nonzero only openhpid is looked after: once it has been seen with a
    pid, a later pid of 0 makes the watchdog kill any remnants and call
    ``asp.start_openhpid()``.

    When the AMF pid is 0, marker files in the ASP run directory choose
    the reaction:

    * restart or watchdog-restart file: zap ASP, wait
      ``SAFPLUS_RESTART_DELAY``, start ASP again, exit.
    * reboot file: run the custom reboot scripts and reboot the node,
      unless ``ASP_NODE_REBOOT_DISABLE`` is set, in which case zap ASP
      and exit.
    * restart-disable file: zap ASP and exit.
    * no marker: unless ``asp_admin_stop()`` is true, zap ASP and either
      restart it or remove the process lock file; then exit.

    Raises:
        SystemExit: with status 1 on every AMF-down path except the node
            reboot.
    """
    poll_secs = 5
    base_dir = asp.get_asp_run_dir()

    restart_marker = '/'.join((base_dir, ASP_RESTART_FILE))
    forced_restart_marker = '/'.join((base_dir, ASP_WATCHDOG_RESTART_FILE))
    reboot_marker = '/'.join((base_dir, ASP_REBOOT_FILE))
    no_restart_marker = '/'.join((base_dir, ASP_RESTART_DISABLE_FILE))

    # Stale markers from an earlier run are discarded up front.  The
    # watchdog-restart marker is deliberately not among them, exactly as
    # in the code this replaces.
    for stale in (restart_marker, reboot_marker, no_restart_marker):
        safe_remove(stale)

    openhpid_was_up = False

    while True:
        if asp.get_amf_pid() != 0:
            # AMF is up: the only job left is keeping openhpid alive.
            hpid = asp.get_openhpid_pid()
            if not openhpid_was_up:
                # Nothing to restore until openhpid has been seen once.
                openhpid_was_up = hpid != 0
            elif hpid == 0:
                # It ran before and is gone now: bring it back.
                asp.log.debug('AMF watchdog expected openhpid but did not find it. Restarting openhpid...')
                # Make sure no half-dead instance is left behind.
                os.popen('killall openhpid 2>/dev/null')
                asp.start_openhpid()
            else:
                asp.log.debug('AMF watchdog openhpid pid(%d) found as expected, nothing to do.' % hpid)
        else:
            stamp = time.strftime('%a %d %b %Y %H:%M:%S')
            asp.log.critical('AMF watchdog invoked on %s' % stamp)

            # Both markers are probed before either is acted upon.
            restart_asked = os.access(restart_marker, os.F_OK)
            restart_forced = os.access(forced_restart_marker, os.F_OK)

            if restart_asked or restart_forced:
                safe_remove(restart_marker)
                safe_remove(forced_restart_marker)
                asp.log.debug('AMF watchdog restarting ASP...')
                asp.zap_asp(False)
                # Give pending operations time to complete.  The TIPC
                # module is unloaded and ASP start reloads it: with tipc
                # 1.5.12, starting ASP right after a link was
                # re-established led to multicast link retransmit
                # failures (pending ACK) and a reset of all TIPC links.
                wdSleep(SAFPLUS_RESTART_DELAY)
                asp.start_asp(stop_watchdog=False, force_start=True)
                asp.create_asp_cmd_marker('start')
                sys.exit(1)
            elif os.access(reboot_marker, os.F_OK):
                safe_remove(reboot_marker)
                if getenv("ASP_NODE_REBOOT_DISABLE", 0) == 0:
                    node = asp.get_asp_node_name()
                    asp.log.debug('AMF watchdog rebooting %s...' % node)
                    asp.run_custom_scripts('reboot')
                    asp.proc_lock_file('remove')
                    os.system('reboot')
                else:
                    asp.zap_asp()
                    sys.exit(1)
            elif os.access(no_restart_marker, os.F_OK):
                safe_remove(no_restart_marker)
                asp.log.debug('AMF watchdog ignoring failure of %s '
                              'as node failfast/failover recovery action '
                              'was called on it and ASP_NODE_REBOOT_DISABLE '
                              'environment variable is set for it.'
                              % asp.get_asp_node_name())
                asp.zap_asp()
                sys.exit(1)
            else:
                asp.log.debug('AMF watchdog invocation default case')
                if not asp_admin_stop():
                    asp.zap_asp(False)
                    if not asp.should_restart_asp():
                        asp.proc_lock_file('remove')
                    else:
                        wdSleep(SAFPLUS_RESTART_DELAY)
                        asp.start_asp(stop_watchdog=False, force_start=True)
                        asp.create_asp_cmd_marker('start')
                # NOTE(review): the watchdog is taken to exit on every
                # default-case path, admin stop included - confirm.
                sys.exit(1)

        wdSleep(poll_secs)
def amf_watchdog_loop():
    """Monitor AMF and openhpid; recover or exit when AMF disappears.

    Polls every ``monitor_interval`` seconds.  While AMF has a pid, only
    openhpid is supervised: once it has been observed running, finding it
    with pid 0 triggers a ``killall`` followed by ``asp.start_openhpid()``.

    When the AMF pid is 0 the marker files in the ASP run directory decide
    what happens:

    * restart / watchdog-restart file: zap ASP, wait
      ``SAFPLUS_RESTART_DELAY``, start ASP, exit.
    * reboot file: run the custom reboot scripts and reboot the node,
      unless ``ASP_NODE_REBOOT_DISABLE`` is set (then zap ASP and exit).
    * restart-disable file: zap ASP and exit.
    * none of the above: unless ``asp_admin_stop()`` is true, zap ASP and
      either restart it (after ``SAFPLUS_RESTART_DELAY``) or remove the
      process lock file; then exit.

    Raises:
        SystemExit: with status 1 on every AMF-down path except the node
            reboot.
    """
    monitor_interval = 5
    run_dir = asp.get_asp_run_dir()
    restart_file = run_dir + '/' + ASP_RESTART_FILE
    watchdog_restart_file = run_dir + '/' + ASP_WATCHDOG_RESTART_FILE
    reboot_file = run_dir + '/' + ASP_REBOOT_FILE
    restart_disable_file = run_dir + '/' + ASP_RESTART_DISABLE_FILE

    # Drop markers left over from a previous run.  The watchdog-restart
    # file is intentionally left alone here, as before.
    safe_remove(restart_file)
    safe_remove(reboot_file)
    safe_remove(restart_disable_file)

    # Becomes True the first time openhpid is seen with a pid; only then
    # is its absence treated as a failure worth repairing.
    seen_openhpid = False

    while True:
        pid = asp.get_amf_pid()
        if pid == 0:
            asp.log.critical('AMF watchdog invoked on %s' %
                             time.strftime('%a %d %b %Y %H:%M:%S'))
            is_restart = os.access(restart_file, os.F_OK)
            is_forced_restart = os.access(watchdog_restart_file, os.F_OK)
            if is_restart or is_forced_restart:
                safe_remove(restart_file)
                safe_remove(watchdog_restart_file)
                asp.log.debug('AMF watchdog restarting ASP...')
                asp.zap_asp(False)
                # Give time for pending ops to complete.  We unload the
                # TIPC module and let ASP start reload it, since it has
                # been observed with tipc 1.5.12 that starting ASP right
                # after a link re-establishment results in multicast link
                # retransmit failures due to pending ACKs, which in turn
                # resets all the TIPC links.
                wdSleep(SAFPLUS_RESTART_DELAY)
                asp.start_asp(stop_watchdog=False, force_start=True)
                asp.create_asp_cmd_marker('start')
                sys.exit(1)
            elif os.access(reboot_file, os.F_OK):
                safe_remove(reboot_file)
                # NOTE(review): if getenv is os.getenv, any value of the
                # variable - even "0" - compares unequal to the integer 0
                # and therefore disables the reboot; confirm intent.
                if getenv("ASP_NODE_REBOOT_DISABLE", 0) != 0:
                    asp.zap_asp()
                    sys.exit(1)
                else:
                    asp.log.debug('AMF watchdog rebooting %s...'
                                  % asp.get_asp_node_name())
                    asp.run_custom_scripts('reboot')
                    asp.proc_lock_file('remove')
                    os.system('reboot')
            elif os.access(restart_disable_file, os.F_OK):
                safe_remove(restart_disable_file)
                asp.log.debug('AMF watchdog ignoring failure of %s '
                              'as node failfast/failover recovery action '
                              'was called on it and ASP_NODE_REBOOT_DISABLE '
                              'environment variable is set for it.'
                              % asp.get_asp_node_name())
                asp.zap_asp()
                sys.exit(1)
            else:
                asp.log.debug('AMF watchdog invocation default case')
                if not asp_admin_stop():
                    asp.zap_asp(False)
                    if asp.should_restart_asp():
                        # Same settling delay as the explicit restart
                        # path above, for the same reason.
                        wdSleep(SAFPLUS_RESTART_DELAY)
                        asp.start_asp(stop_watchdog=False, force_start=True)
                        asp.create_asp_cmd_marker('start')
                    else:
                        asp.proc_lock_file('remove')
                # NOTE(review): the watchdog is taken to exit on every
                # default-case path, admin stop included - confirm.
                sys.exit(1)
        else:
            # pid is nonzero => amf is up; handle openhpid here.
            openhpid_pid = asp.get_openhpid_pid()
            if seen_openhpid:
                if openhpid_pid == 0:
                    # openhpid is DOWN and we have seen it before, so we
                    # should bring it back.
                    asp.log.debug(
                        'AMF watchdog expected openhpid but did not find it. Restarting openhpid...'
                    )
                    # Zap it first to make sure it is really dead.
                    os.popen('killall openhpid 2>/dev/null')
                    asp.start_openhpid()
                else:
                    asp.log.debug(
                        'AMF watchdog openhpid pid(%d) found as expected, nothing to do.'
                        % openhpid_pid)
            else:
                if openhpid_pid != 0:
                    seen_openhpid = True

        wdSleep(monitor_interval)