def auto_failover(self, interval): """Automatic failover Wrapper class for running automatic failover. See run_automatic_failover for details on implementation. This method ensures the registration/deregistration occurs regardless of exception or errors. interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.command.failover_console import FailoverConsole failover_mode = self.options.get("failover_mode", "auto") force = self.options.get("force", False) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report( "Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "If this is an error, restart the console with --force. " print "Failover mode changed to 'FAIL' for this instance. " print "Console will start in 10 seconds.", sys.stdout.flush() for i in range(0, 9): time.sleep(1) sys.stdout.write('.') sys.stdout.flush() print "starting Console." time.sleep(1) try: res = self.run_auto_failover(console, interval) except: raise finally: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) except: pass return res
def auto_failover(self, interval): """Automatic failover Wrapper class for running automatic failover. See run_automatic_failover for details on implementation. This method ensures the registration/deregistration occurs regardless of exception or errors. interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ failover_mode = self.options.get("failover_mode", "auto") force = self.options.get("force", False) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Check privileges self._report("# Checking privileges.") errors = self.topology.check_privileges(failover_mode != 'fail') if len(errors): for error in errors: msg = ("User {0} on {1}@{2} does not have sufficient " "privileges to execute the {3} command " "(required: {4}).").format(error[0], error[1], error[2], 'failover', error[3]) print("# ERROR: {0}".format(msg)) self._report(msg, logging.CRITICAL, False) raise UtilRplError("Not enough privileges to execute command.") # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report("Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "If this is an error, restart the console with --force. " print "Failover mode changed to 'FAIL' for this instance. " print "Console will start in 10 seconds.", sys.stdout.flush() i = 0 while i < 9: time.sleep(1) sys.stdout.write('.') sys.stdout.flush() i += 1 print "starting Console." time.sleep(1) try: res = self.run_auto_failover(console, failover_mode) except: raise finally: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(True, False) self._report("Failover console stopped.", logging.INFO, False) except: pass return res
def auto_failover(self, interval): """Automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.command.failover_console import FailoverConsole from mysql.utilities.common.tools import ping_host from mysql.utilities.common.tools import execute_script failover_mode = self.options.get("failover_mode", "auto") pingtime = self.options.get("pingtime", 3) timeout = self.options.get("timeout", 3) exec_fail = self.options.get("exec_fail", None) force = self.options.get("force", False) post_fail = self.options.get("post_fail", None) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Check privileges self._report("# Checking privileges.") errors = self.topology.check_privileges(failover_mode != 'fail') if len(errors): msg = "User %s on %s does not have sufficient privileges to " + \ "execute the %s command." for error in errors: self._report(msg % (error[0], error[1], command), logging.CRITICAL) raise UtilRplError("Not enough privileges to execute command.") # Test failover script. If it doesn't exist, fail. no_exec_fail_msg = "Failover check script cannot be found. Please " + \ "check the path and filename for accuracy and " + \ "restart the failover console." if exec_fail is not None and not os.path.exists(fail_check): self._report(no_exec_fail_msg, logging.CRITICAL, False) raise UtilRplError(no_exec_fail_msg) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report("Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "Failover mode changed to 'FAIL'. Console will start in 5 seconds." time.sleep(5) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # If a failover script is provided, check it else check master # using connectivity checks. if exec_fail is not None: # Execute failover check script if not os.path.exists(exec_fail): self._report(no_exec_fail_msg, logging.CRITICAL, False) raise UtilRplError(no_exec_fail_msg) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(script) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report("# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for timeout seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." % \ timeout self._report(msg, logging.INFO, False) time.sleep(timeout) try: self.topology.master.connect() except: self._report("Cannot reconnect to master.", logging.INFO, False) # Check the master again. If no connection or lost connection, # try ping and if still not alive, failover. This performs the # timeout threshold for detecting a down master. if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % \ "Master has failed and automatic failover is not enabled. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % "An error was encountered " + \ "during failover. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False) # discover slaves if option was specified at startup elif self.options.get("discover", None) is not None \ and not first_pass: # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) return True
def auto_failover(self, interval): """Automatic failover Wrapper class for running automatic failover. See run_automatic_failover for details on implementation. This method ensures the registration/deregistration occurs regardless of exception or errors. interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.command.failover_console import FailoverConsole failover_mode = self.options.get("failover_mode", "auto") force = self.options.get("force", False) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report("Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "If this is an error, restart the console with --force. " print "Failover mode changed to 'FAIL' for this instance. " print "Console will start in 10 seconds.", sys.stdout.flush() for i in range(0, 9): time.sleep(1) sys.stdout.write('.') sys.stdout.flush() print "starting Console." time.sleep(1) try: res = self.run_auto_failover(console, interval); except: raise finally: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) except: pass return res