class RplCommands(object): """Replication commands. This class supports the following replication commands. elect - perform best slave election and report best slave failover - conduct failover from master to best slave as specified by the user. This option performs best slave election. gtid - show status of global transaction id variables health - display the replication health reset - stop and reset all slaves start - start all slaves stop - stop all slaves switchover - perform slave promotion as specified by the user to a specific slave. Requires --master and the --candidate options. """ def __init__(self, master_vals, slave_vals, options, skip_conn_err=True): """Constructor master_vals[in] master server connection dictionary slave_vals[in] list of slave server connection dictionaries options[in] options dictionary skip_conn_err[in] if True, do not fail on connection failure Default = True """ from mysql.utilities.common.topology import Topology self.master_vals = master_vals self.options = options self.quiet = self.options.get("quiet", False) self.logging = self.options.get("logging", False) self.candidates = self.options.get("candidates", None) # Replace all local host IP addresses (i.e. 127.0.0.1) by localhost for candidate in self.candidates: if candidate['host'] == '127.0.0.1': candidate['host'] = 'localhost' for slave in slave_vals: if slave['host'] == '127.0.0.1': slave['host'] = 'localhost' self.rpl_user = self.options.get("rpl_user", None) self.topology = Topology(master_vals, slave_vals, self.options, skip_conn_err) def _report(self, message, level=logging.INFO, print_msg=True): """Log message if logging is on This method will log the message presented if the log is turned on. Specifically, if options['log_file'] is not None. It will also print the message to stdout. message[in] message to be printed level[in] level of message to log. Default = INFO print_msg[in] if True, print the message to stdout. Default = True """ # First, print the message. if print_msg and not self.quiet: print message # Now log message if logging turned on if self.logging: logging.log(int(level), message.strip("#").strip(' ')) def _show_health(self): """Run a command on a list of slaves. This method will display the replication health of the topology. This includes the following for each server. - host : host name - port : connection port - role : "MASTER" or "SLAVE" - state : UP = connected, WARN = cannot connect but can ping, DOWN = cannot connect nor ping - gtid : ON = gtid supported and turned on, OFF = supported but not enabled, NO = not supported - rpl_health : (master) binlog enabled, (slave) IO tread is running, SQL thread is running, no errors, slave delay < max_delay, read log pos + max_position < master's log position Note: Will show 'ERROR' if there are multiple errors encountered otherwise will display the health check that failed. If verbosity is set, it will show the following additional information. (master) - server version, binary log file, position (slaves) - server version, master's binary log file, master's log position, IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay, IO_Error_Num, IO_Error """ from mysql.utilities.common.format import print_list format = self.options.get("format", "grid") quiet = self.options.get("quiet", False) cols, rows = self.topology.get_health() if not quiet: print "#" print "# Replication Topology Health:" # Print health report print_list(sys.stdout, format, cols, rows) return def _show_gtid_data(self): """Display the GTID lists from the servers. 
This method displays the three GTID lists for all of the servers. Each server is listed with its entries in each list. If a list has no entries, that list is not printed. """ from mysql.utilities.common.format import print_list if not self.topology.gtid_enabled(): self._report( "# WARNING: GTIDs are not supported on this topology.", logging.WARN) return format = self.options.get("format", "grid") # Get UUIDs uuids = self.topology.get_server_uuids() if len(uuids): print "#" print "# UUIDS for all servers:" print_list(sys.stdout, format, ['host', 'port', 'role', 'uuid'], uuids) # Get GTID lists executed, purged, owned = self.topology.get_gtid_data() if len(executed): print "#" print "# Transactions executed on the server:" print_list(sys.stdout, format, _GTID_COLS, executed) if len(purged): print "#" print "# Transactions purged from the server:" print_list(sys.stdout, format, _GTID_COLS, purged) if len(owned): print "#" print "# Transactions owned by another server:" print_list(sys.stdout, format, _GTID_COLS, owned) def _check_host_references(self): """Check to see if using all host or all IP addresses Returns bool - True = all references are consistent """ from mysql.utilities.common.options import hostname_is_ip uses_ip = hostname_is_ip(self.topology.master.host) for slave_dict in self.topology.slaves: slave = slave_dict['instance'] if slave is not None: host, port = slave.get_master_host_port() if uses_ip != hostname_is_ip(slave.host) or \ uses_ip != hostname_is_ip(host): return False return True def _switchover(self): """Perform switchover from master to candidate slave This method switches the role of master to a candidate slave. The candidate is specified via the --candidate option. Returns bool - True = no errors, False = errors reported. """ # Check for --master-info-repository=TABLE if rpl_user is None if not self._check_master_info_type(): return False # Check for mixing IP and hostnames if not self._check_host_references(): print "# WARNING: %s" % _HOST_IP_WARNING self._report(_HOST_IP_WARNING, logging.WARN, False) # Check prerequisites - need valid candidate candidate = self.options.get("new_master", None) if candidate is None: msg = "No candidate specified." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) self._report(" ".join([ "# Performing switchover from master at", "%s:%s" % (self.master_vals['host'], self.master_vals['port']), "to slave at %s:%s." % (candidate['host'], candidate['port']) ])) if not self.topology.switchover(candidate): self._report("# Errors found. Switchover aborted.", logging.ERROR) return False return True def _elect_slave(self): """Perform best slave election This method determines which slave is the best candidate for GTID-enabled failover. If called for a non-GTID topology, a warning is issued. 
""" if not self.topology.gtid_enabled(): self._report( "# WARNING: slave election requires GTID_MODE=ON " "for all servers.", logging.WARN) return # Check for mixing IP and hostnames if not self._check_host_references(): print "# WARNING: %s" % _HOST_IP_WARNING self._report(_HOST_IP_WARNING, logging.WARN, False) candidates = self.options.get("candidates", None) if candidates is None or len(candidates) == 0: self._report("# Electing candidate slave from known slaves.") else: self._report("# Electing candidate slave from candidate list " "then slaves list.") best_slave = self.topology.find_best_slave(candidates) if best_slave is None: self._report( "ERROR: No slave found that meets eligilibility " "requirements.", logging.ERROR) return self._report("# Best slave found is located on %s:%s." % (best_slave['host'], best_slave['port'])) def _failover(self, strict=False): """Perform failover This method executes GTID-enabled failover. If called for a non-GTID topology, a warning is issued. strict[in] if True, use only the candidate list for slave election and fail if no candidates are viable. Default = False Returns bool - True = failover succeeded, False = errors found """ if not self.topology.gtid_enabled(): self._report( "# WARNING: slave election requires GTID_MODE=ON " "for all servers.", logging.WARN) return # Check for --master-info-repository=TABLE if rpl_user is None if not self._check_master_info_type(): return False self._report("# Performing failover.") if not self.topology.failover(self.candidates, strict): self._report("# Errors found.", logging.ERROR) return False return True def _check_master_info_type(self, halt=True): """Check for master information set to TABLE if rpl_user not provided halt[in] if True, raise error on failure. Default is True Returns bool - True if rpl_user is specified or False if rpl_user not specified and at least one slave does not have --master-info-repository=TABLE. """ error = "You must specify either the --rpl-user or set all slaves " + \ "to use --master-info-repository=TABLE." # Check for --master-info-repository=TABLE if rpl_user is None if self.rpl_user is None: if not self.topology.check_master_info_type("TABLE"): if halt: raise UtilRplError(error) self._report(error, logging.ERROR) return False return True def execute_command(self, command): """Execute a replication admin command This method executes one of the valid replication administration commands as described above. command[in] command to execute Returns bool - True = success, raise error on failure """ # Raise error if command is not valid if not command in _VALID_COMMANDS: msg = "'%s' is not a valid command." % command self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Check privileges self._report("# Checking privileges.") full_check = command in ['failover', 'elect', 'switchover'] errors = self.topology.check_privileges(full_check) if len(errors): msg = "User %s on %s does not have sufficient privileges to " + \ "execute the %s command." for error in errors: self._report(msg % (error[0], error[1], command), logging.CRITICAL) raise UtilRplError("Not enough privileges to execute command.") self._report("Executing %s command..." 
% command, logging.INFO, False) # Execute the command if command in _SLAVE_COMMANDS: if command == 'reset': self.topology.run_cmd_on_slaves('stop') self.topology.run_cmd_on_slaves(command) elif command in 'gtid': self._show_gtid_data() elif command == 'health': self._show_health() elif command == 'switchover': self._switchover() elif command == 'elect': self._elect_slave() elif command == 'failover': self._failover() else: msg = "Command '%s' is not implemented." % command self._report(msg, logging.CRITICAL) raise UtilRplError(msg) if command in ['switchover', 'failover'] and \ not self.options.get("no_health", False): self._show_health() self._report("# ...done.") return True def auto_failover(self, interval): """Automatic failover Wrapper class for running automatic failover. See run_automatic_failover for details on implementation. This method ensures the registration/deregistration occurs regardless of exception or errors. interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.command.failover_console import FailoverConsole failover_mode = self.options.get("failover_mode", "auto") force = self.options.get("force", False) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report( "Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "If this is an error, restart the console with --force. " print "Failover mode changed to 'FAIL' for this instance. " print "Console will start in 10 seconds.", sys.stdout.flush() for i in range(0, 9): time.sleep(1) sys.stdout.write('.') sys.stdout.flush() print "starting Console." time.sleep(1) try: res = self.run_auto_failover(console, interval) except: raise finally: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) except: pass return res def run_auto_failover(self, console, interval): """Run automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail console[in] instance of the failover console class interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ import time from mysql.utilities.common.tools import ping_host from mysql.utilities.common.tools import execute_script failover_mode = self.options.get("failover_mode", "auto") pingtime = self.options.get("pingtime", 3) timeout = int(self.options.get("timeout", 300)) exec_fail = self.options.get("exec_fail", None) post_fail = self.options.get("post_fail", None) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." 
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], 'failover'),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print "# WARNING: %s" % _HOST_IP_WARNING
            self._report(_HOST_IP_WARNING, logging.WARN, False)
            print "#\n# Failover console will start in 10 seconds."
            time.sleep(10)

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = "Failover check script cannot be found. Please " + \
                           "check the path and filename for accuracy and " + \
                           "restart the failover console."
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode,
                     logging.INFO, False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case the master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        self._report("Cannot reconnect to master.",
                                     logging.INFO, False)

                # Check the master again. If no connection or lost connection,
                # try ping and if still not alive, failover. This performs the
                # timeout threshold for detecting a down master.
if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % \ "Master has failed and automatic failover is not enabled. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % "An error was encountered " + \ "during failover. " self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script( post_fail, False, [old_host, old_port, self.master.host, self.master.port]) # discover slaves if option was specified at startup elif self.options.get("discover", None) is not None and \ (not first_pass or self.options.get("rediscover", False)): # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False return True
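# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). The connection
# dictionaries and option values below are assumptions inferred from the
# constructor and execute_command() defined above; real callers (for example
# the mysqlrpladmin front end) build them from command-line options.
def _example_health_check():
    master = {'host': 'master1.example.com', 'port': 3306}
    slaves = [{'host': 'slave1.example.com', 'port': 3306},
              {'host': 'slave2.example.com', 'port': 3306}]
    options = {'quiet': False, 'logging': False, 'candidates': [],
               'rpl_user': None, 'format': 'grid'}
    rpl_cmds = RplCommands(master, slaves, options, skip_conn_err=True)
    # 'health' prints the replication topology health report.
    rpl_cmds.execute_command('health')
# ---------------------------------------------------------------------------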
class RplCommands(object): """Replication commands. This class supports the following replication commands. elect - perform best slave election and report best slave failover - conduct failover from master to best slave as specified by the user. This option performs best slave election. gtid - show status of global transaction id variables health - display the replication health reset - stop and reset all slaves start - start all slaves stop - stop all slaves switchover - perform slave promotion as specified by the user to a specific slave. Requires --master and the --candidate options. """ def __init__(self, master_vals, slave_vals, options, skip_conn_err=True): """Constructor master_vals[in] master server connection dictionary slave_vals[in] list of slave server connection dictionaries options[in] options dictionary skip_conn_err[in] if True, do not fail on connection failure Default = True """ # A sys.stdout copy, that can be used later to turn on/off stdout self.stdout_copy = sys.stdout self.stdout_devnull = open(os.devnull, "w") # Disable stdout when running --daemon with start, stop or restart daemon = options.get("daemon") if daemon: if daemon in ("start", "nodetach"): print("Starting failover daemon...") elif daemon == "stop": print("Stopping failover daemon...") else: print("Restarting failover daemon...") # Disable stdout if daemon not nodetach if daemon != "nodetach": sys.stdout = self.stdout_devnull self.master = None self.master_vals = master_vals self.options = options self.quiet = self.options.get("quiet", False) self.logging = self.options.get("logging", False) self.candidates = self.options.get("candidates", None) self.verbose = self.options.get("verbose", None) self.rpl_user = self.options.get("rpl_user", None) self.ssl_ca = options.get("ssl_ca", None) self.ssl_cert = options.get("ssl_cert", None) self.ssl_key = options.get("ssl_key", None) if self.ssl_ca or self.ssl_cert or self.ssl_key: self.ssl = True try: self.topology = Topology(master_vals, slave_vals, self.options, skip_conn_err) except Exception as err: if daemon and daemon != "nodetach": # Turn on sys.stdout sys.stdout = self.stdout_copy raise UtilRplError(str(err)) def _report(self, message, level=logging.INFO, print_msg=True): """Log message if logging is on This method will log the message presented if the log is turned on. Specifically, if options['log_file'] is not None. It will also print the message to stdout. message[in] message to be printed level[in] level of message to log. Default = INFO print_msg[in] if True, print the message to stdout. Default = True """ # First, print the message. if print_msg and not self.quiet: print message # Now log message if logging turned on if self.logging: logging.log(int(level), message.strip("#").strip(' ')) def _show_health(self): """Run a command on a list of slaves. This method will display the replication health of the topology. This includes the following for each server. - host : host name - port : connection port - role : "MASTER" or "SLAVE" - state : UP = connected, WARN = cannot connect but can ping, DOWN = cannot connect nor ping - gtid : ON = gtid supported and turned on, OFF = supported but not enabled, NO = not supported - rpl_health : (master) binlog enabled, (slave) IO tread is running, SQL thread is running, no errors, slave delay < max_delay, read log pos + max_position < master's log position Note: Will show 'ERROR' if there are multiple errors encountered otherwise will display the health check that failed. 
        If verbosity is set, it will show the following additional
        information.

          (master)
            - server version, binary log file, position

          (slaves)
            - server version, master's binary log file, master's log position,
              IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay,
              IO_Error_Num, IO_Error
        """
        fmt = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()

        if not quiet:
            print "#"
            print "# Replication Topology Health:"

        # Print health report
        print_list(sys.stdout, fmt, cols, rows)

        return

    def _show_gtid_data(self):
        """Display the GTID lists from the servers.

        This method displays the three GTID lists for all of the servers.
        Each server is listed with its entries in each list. If a list has
        no entries, that list is not printed.
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: GTIDs are not supported on this "
                         "topology.", logging.WARN)
            return

        fmt = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print "#"
            print "# UUIDS for all servers:"
            print_list(sys.stdout, fmt, ['host', 'port', 'role', 'uuid'],
                       uuids)

        # Get GTID lists
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print "#"
            print "# Transactions executed on the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, executed)
        if len(purged):
            print "#"
            print "# Transactions purged from the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, purged)
        if len(owned):
            print "#"
            print "# Transactions owned by another server:"
            print_list(sys.stdout, fmt, _GTID_COLS, owned)

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses

        Returns bool - True = all references are consistent
        """
        uses_ip = hostname_is_ip(self.topology.master.host)
        for slave_dict in self.topology.slaves:
            slave = slave_dict['instance']
            if slave is not None:
                host_port = slave.get_master_host_port()
                host = None
                if host_port:
                    host = host_port[0]
                if (not host or uses_ip != hostname_is_ip(slave.host) or
                        uses_ip != hostname_is_ip(host)):
                    return False
        return True

    def _switchover(self):
        """Perform switchover from master to candidate slave

        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.

        Returns bool - True = no errors, False = errors reported.
        """
        # Check prerequisites - need valid candidate
        candidate = self.options.get("new_master", None)
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check that the new master is not the actual master
        if (self.topology.master.is_alias(candidate['host']) and
                self.master_vals['port'] == candidate['port']):
            err_msg = ERROR_SAME_MASTER.format(candidate['host'],
                                               candidate['port'],
                                               self.master_vals['host'],
                                               self.master_vals['port'])
            self._report(err_msg, logging.WARN)
            self._report(err_msg, logging.CRITICAL)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        # Can only check errant transactions if GTIDs are enabled.
if self.topology.gtid_enabled(): # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: force = self.options.get('force') print("# ERROR: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.ERROR, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.ERROR, False) # Raise an exception (to stop) if tolerant mode is OFF if not force: raise UtilRplError("{0} Note: If you want to ignore this " "issue, although not advised, please " "use the utility with the --force " "option.".format(_ERRANT_TNX_ERROR)) else: warn_msg = ("Errant transactions check skipped (GTID not enabled " "for the whole topology).") print("# WARNING: {0}".format(warn_msg)) self._report(warn_msg, logging.WARN, False) self._report(" ".join(["# Performing switchover from master at", "%s:%s" % (self.master_vals['host'], self.master_vals['port']), "to slave at %s:%s." % (candidate['host'], candidate['port'])])) if not self.topology.switchover(candidate): self._report("# Errors found. Switchover aborted.", logging.ERROR) return False return True def _elect_slave(self): """Perform best slave election This method determines which slave is the best candidate for GTID-enabled failover. If called for a non-GTID topology, a warning is issued. """ if not self.topology.gtid_enabled(): warn_msg = _GTID_ON_REQ.format(action='Slave election') print("# WARNING: {0}".format(warn_msg)) self._report(warn_msg, logging.WARN, False) return # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) candidates = self.options.get("candidates", None) if candidates is None or len(candidates) == 0: self._report("# Electing candidate slave from known slaves.") else: self._report("# Electing candidate slave from candidate list " "then slaves list.") best_slave = self.topology.find_best_slave(candidates) if best_slave is None: self._report("ERROR: No slave found that meets eligilibility " "requirements.", logging.ERROR) return self._report("# Best slave found is located on %s:%s." % (best_slave['host'], best_slave['port'])) def _failover(self, strict=False, options=None): """Perform failover This method executes GTID-enabled failover. If called for a non-GTID topology, a warning is issued. strict[in] if True, use only the candidate list for slave election and fail if no candidates are viable. Default = False options[in] options dictionary. 
Returns bool - True = failover succeeded, False = errors found """ if options is None: options = {} srv_list = self.topology.get_servers_with_gtid_not_on() if srv_list: err_msg = _GTID_ON_REQ.format(action='Slave election') print("# ERROR: {0}".format(err_msg)) self._report(err_msg, logging.ERROR, False) for srv in srv_list: msg = "# - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0], srv[1]) self._report(msg, logging.ERROR) self._report(err_msg, logging.CRITICAL, False) raise UtilRplError(err_msg) # Check for --master-info-repository=TABLE if rpl_user is None if not self._check_master_info_type(): return False # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: force = options.get('force') print("# ERROR: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.ERROR, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.ERROR, False) # Raise an exception (to stop) if tolerant mode is OFF if not force: raise UtilRplError("{0} Note: If you want to ignore this " "issue, although not advised, please use " "the utility with the --force option." "".format(_ERRANT_TNX_ERROR)) self._report("# Performing failover.") if not self.topology.failover(self.candidates, strict, stop_on_error=True): self._report("# Errors found.", logging.ERROR) return False return True def _check_master_info_type(self, halt=True): """Check for master information set to TABLE if rpl_user not provided halt[in] if True, raise error on failure. Default is True Returns bool - True if rpl_user is specified or False if rpl_user not specified and at least one slave does not have --master-info-repository=TABLE. """ error = "You must specify either the --rpl-user or set all slaves " + \ "to use --master-info-repository=TABLE." # Check for --master-info-repository=TABLE if rpl_user is None if self.rpl_user is None: if not self.topology.check_master_info_type("TABLE"): if halt: raise UtilRplError(error) self._report(error, logging.ERROR) return False return True def check_host_references(self): """Public method to access self.check_host_references() """ return self._check_host_references() def execute_command(self, command, options=None): """Execute a replication admin command This method executes one of the valid replication administration commands as described above. command[in] command to execute options[in] options dictionary. Returns bool - True = success, raise error on failure """ if options is None: options = {} # Raise error if command is not valid if command not in _VALID_COMMANDS: msg = "'%s' is not a valid command." % command self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Check privileges self._report("# Checking privileges.") full_check = command in ['failover', 'elect', 'switchover'] errors = self.topology.check_privileges(full_check) if len(errors): msg = "User %s on %s does not have sufficient privileges to " + \ "execute the %s command." for error in errors: self._report(msg % (error[0], error[1], command), logging.CRITICAL) raise UtilRplError("Not enough privileges to execute command.") self._report("Executing %s command..." 
% command, logging.INFO, False) # Execute the command if command in _SLAVE_COMMANDS: if command == 'reset': self.topology.run_cmd_on_slaves('stop') self.topology.run_cmd_on_slaves(command) elif command in 'gtid': self._show_gtid_data() elif command == 'health': self._show_health() elif command == 'switchover': self._switchover() elif command == 'elect': self._elect_slave() elif command == 'failover': self._failover(options=options) else: msg = "Command '%s' is not implemented." % command self._report(msg, logging.CRITICAL) raise UtilRplError(msg) if command in ['switchover', 'failover'] and \ not self.options.get("no_health", False): self._show_health() self._report("# ...done.") return True def auto_failover(self, interval): """Automatic failover Wrapper class for running automatic failover. See run_automatic_failover for details on implementation. This method ensures the registration/deregistration occurs regardless of exception or errors. interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ failover_mode = self.options.get("failover_mode", "auto") force = self.options.get("force", False) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Check privileges self._report("# Checking privileges.") errors = self.topology.check_privileges(failover_mode != 'fail') if len(errors): for error in errors: msg = ("User {0} on {1}@{2} does not have sufficient " "privileges to execute the {3} command " "(required: {4}).").format(error[0], error[1], error[2], 'failover', error[3]) print("# ERROR: {0}".format(msg)) self._report(msg, logging.CRITICAL, False) raise UtilRplError("Not enough privileges to execute command.") # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report("Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "If this is an error, restart the console with --force. " print "Failover mode changed to 'FAIL' for this instance. " print "Console will start in 10 seconds.", sys.stdout.flush() i = 0 while i < 9: time.sleep(1) sys.stdout.write('.') sys.stdout.flush() i += 1 print "starting Console." time.sleep(1) try: res = self.run_auto_failover(console, failover_mode) except: raise finally: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(True, False) self._report("Failover console stopped.", logging.INFO, False) except: pass return res def auto_failover_as_daemon(self): """Automatic failover Wrapper class for running automatic failover as daemon. This method ensures the registration/deregistration occurs regardless of exception or errors. 
Returns bool - True = success, raises exception on error """ # Initialize failover daemon failover_daemon = FailoverDaemon(self) res = None try: action = self.options.get("daemon") if action == "start": res = failover_daemon.start() elif action == "stop": res = failover_daemon.stop() elif action == "restart": res = failover_daemon.restart() else: # Start failover deamon in foreground res = failover_daemon.start(detach_process=False) except: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) failover_daemon.register_instance(True, False) self._report("Failover daemon stopped.", logging.INFO, False) except: pass return res def run_auto_failover(self, console, failover_mode="auto"): """Run automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail console[in] instance of the failover console class. Returns bool - True = success, raises exception on error """ pingtime = self.options.get("pingtime", 3) exec_fail = self.options.get("exec_fail", None) post_fail = self.options.get("post_fail", None) pedantic = self.options.get('pedantic', False) fail_retry = self.options.get('fail_retry', None) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Require --master-info-repository=TABLE for all slaves if not self.topology.check_master_info_type("TABLE"): msg = "Failover requires --master-info-repository=TABLE for " + \ "all slaves." self._report(msg, logging.ERROR, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) print("#\n# Failover console will start in {0} seconds.".format( WARNING_SLEEP_TIME)) time.sleep(WARNING_SLEEP_TIME) # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON if pedantic: raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the --pedantic " "option.".format(_ERRANT_TNX_ERROR)) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # Use try block in case master class has gone away. try: old_host = self.master.host old_port = self.master.port except: old_host = "UNKNOWN" old_port = "UNKNOWN" # If a failover script is provided, check it else check master # using connectivity checks. 
if exec_fail is not None: # Execute failover check script if not os.path.isfile(exec_fail): message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format( path=exec_fail) self._report(message, logging.CRITICAL, False) raise UtilRplError(message) elif not os.access(exec_fail, os.X_OK): message = INSUFFICIENT_FILE_PERMISSIONS.format( path=exec_fail, permissions='execute') self._report(message, logging.CRITICAL, False) raise UtilRplError(message) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(exec_fail, None, [old_host, old_port], self.verbose) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report("# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for pingtime seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." % \ pingtime self._report(msg, logging.INFO, False) time.sleep(pingtime) try: self.topology.master.connect() except: pass # If user specified a master fail retry, wait for the # predetermined time and attempt to check the master again. if fail_retry is not None and \ not self.topology.master.is_alive(): msg = "Master is still not reachable. Waiting for %s " \ "seconds to retry detection." % fail_retry self._report(msg, logging.INFO, False) time.sleep(fail_retry) try: self.topology.master.connect() except: pass # Check the master again. If no connection or lost connection, # try ping. This performs the timeout threshold for detecting # a down master. If still not alive, try to reconnect and if # connection fails after 3 attempts, failover. if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True i = 0 while i < 3: try: self.topology.master.connect() failover = False # Master is now connected again break except: pass time.sleep(pingtime) i += 1 if failover: self._report("Failed to reconnect to the master after " "3 attemps.", logging.INFO) else: self._report("Master is Ok. Resuming watch.", logging.INFO) if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % ("Master has failed and automatic " "failover is not enabled. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % ("An error was encountered " "during failover. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." 
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port, self.master.host, self.master.port]) # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance on the new master self._report("Registering instance on master.", logging.INFO, False) failover_mode = console.register_instance() # discover slaves if option was specified at startup elif (self.options.get("discover", None) is not None and not first_pass): # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: if pedantic: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the " "--pedantic " "option.".format(_ERRANT_TNX_ERROR)) else: if self.logging: warn_msg = ("{0} Check log for more " "details.".format(_ERRANT_TNX_ERROR)) else: warn_msg = _ERRANT_TNX_ERROR console.add_warning('errant_tnx', warn_msg) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) self._report(errant_msg, logging.WARN, False) else: console.del_warning('errant_tnx') res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False return True
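# ---------------------------------------------------------------------------
# Illustrative sketch of driving automatic failover (not part of the original
# module). The option keys shown ('failover_mode', 'pingtime', 'exec_fail',
# 'post_fail') are the ones read by auto_failover() and run_auto_failover()
# above; the host values and the interval are placeholders.
def _example_auto_failover():
    master = {'host': 'master1.example.com', 'port': 3306}
    slaves = [{'host': 'slave1.example.com', 'port': 3306}]
    options = {'quiet': False, 'logging': True, 'candidates': [],
               'rpl_user': None, 'failover_mode': 'auto', 'pingtime': 3,
               'exec_fail': None, 'post_fail': None, 'force': False}
    rpl_cmds = RplCommands(master, slaves, options)
    # Poll the topology every 15 seconds and fail over to the best slave
    # once the master is confirmed to be down.
    rpl_cmds.auto_failover(15)
# ---------------------------------------------------------------------------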
class RplCommands(object): """Replication commands. This class supports the following replication commands. elect - perform best slave election and report best slave failover - conduct failover from master to best slave as specified by the user. This option performs best slave election. gtid - show status of global transaction id variables health - display the replication health reset - stop and reset all slaves start - start all slaves stop - stop all slaves switchover - perform slave promotion as specified by the user to a specific slave. Requires --master and the --candidate options. """ def __init__(self, master_vals, slave_vals, options, skip_conn_err=True): """Constructor master_vals[in] master server connection dictionary slave_vals[in] list of slave server connection dictionaries options[in] options dictionary skip_conn_err[in] if True, do not fail on connection failure Default = True """ from mysql.utilities.common.topology import Topology self.master_vals = master_vals self.options = options self.quiet = self.options.get("quiet", False) self.logging = self.options.get("logging", False) self.candidates = self.options.get("candidates", None) self.topology = Topology(master_vals, slave_vals, self.options, skip_conn_err) def _report(self, message, level=logging.INFO, print_msg=True): """Log message if logging is on This method will log the message presented if the log is turned on. Specifically, if options['log_file'] is not None. It will also print the message to stdout. message[in] message to be printed level[in] level of message to log. Default = INFO print_msg[in] if True, print the message to stdout. Default = True """ # First, print the message. if print_msg and not self.quiet: print message # Now log message if logging turned on if self.logging: logging.log(int(level), message.strip("#").strip(' ')) def _show_health(self): """Run a command on a list of slaves. This method will display the replication health of the topology. This includes the following for each server. - host : host name - port : connection port - role : "MASTER" or "SLAVE" - state : UP = connected, WARN = cannot connect but can ping, DOWN = cannot connect nor ping - gtid : ON = gtid supported and turned on, OFF = supported but not enabled, NO = not supported - rpl_health : (master) binlog enabled, (slave) IO tread is running, SQL thread is running, no errors, slave delay < max_delay, read log pos + max_position < master's log position Note: Will show 'ERROR' if there are multiple errors encountered otherwise will display the health check that failed. If verbosity is set, it will show the following additional information. (master) - server version, binary log file, position (slaves) - server version, master's binary log file, master's log position, IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay, IO_Error_Num, IO_Error """ from mysql.utilities.common.format import print_list format = self.options.get("format", "grid") quiet = self.options.get("quiet", False) cols, rows = self.topology.get_health() if not quiet: print "#" print "# Replication Topology Health:" # Print health report print_list(sys.stdout, format, cols, rows) return def _show_gtid_data(self): """Display the GTID lists from the servers. This method displays the three GTID lists for all of the servers. Each server is listed with its entries in each list. If a list has no entries, that list is not printed. 
""" from mysql.utilities.common.format import print_list if not self.topology.gtid_enabled(): self._report("# WARNING: GTIDs are not supported on this topology.", logging.WARN) return format = self.options.get("format", "grid") # Get UUIDs uuids = self.topology.get_server_uuids() if len(uuids): print "#" print "# UUIDS for all servers:" print_list(sys.stdout, format, ['host','port','role','uuid'], uuids) # Get GTID lists executed, purged, owned = self.topology.get_gtid_data() if len(executed): print "#" print "# Transactions executed on the server:" print_list(sys.stdout, format, _GTID_COLS, executed) if len(purged): print "#" print "# Transactions purged from the server:" print_list(sys.stdout, format, _GTID_COLS, purged) if len(owned): print "#" print "# Transactions owned by another server:" print_list(sys.stdout, format, _GTID_COLS, owned) def _switchover(self): """Perform switchover from master to candidate slave This method switches the role of master to a candidate slave. The candidate is specified via the --candidate option. Returns bool - True = no errors, False = errors reported. """ from mysql.utilities.exception import FormatError from mysql.utilities.common.options import parse_connection # Check prerequisites - need valid candidate candidate = self.options.get("new_master", None) if candidate is None: msg = "No candidate specified." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) self._report(" ".join(["# Performing switchover from master at", "%s:%s" % (self.master_vals['host'], self.master_vals['port']), "to slave at %s:%s." % (candidate['host'], candidate['port'])])) if not self.topology.switchover(candidate): self._report("# Errors found. Switchover aborted.", logging.ERROR) return False return True def _elect_slave(self): """Perform best slave election This method determines which slave is the best candidate for GTID-enabled failover. If called for a non-GTID topology, a warning is issued. """ if not self.topology.gtid_enabled(): self._report("# WARNING: slave election requires GTID_MODE=ON " "for all servers.", logging.WARN) return candidates = self.options.get("candidates", None) if candidates is None or len(candidates) == 0: self._report("# Electing candidate slave from known slaves.") else: self._report("# Electing candidate slave from candidate list " "then slaves list.") best_slave = self.topology.find_best_slave(candidates) if best_slave is None: self._report("ERROR: No slave found that meets eligilibility " "requirements.", logging.ERROR) return self._report("# Best slave found is located on %s:%s." % (best_slave['host'], best_slave['port'])) def _failover(self, strict=False): """Perform failover This method executes GTID-enabled failover. If called for a non-GTID topology, a warning is issued. strict[in] if True, use only the candidate list for slave election and fail if no candidates are viable. Default = False Returns bool - True = failover succeeded, False = errors found """ if not self.topology.gtid_enabled(): self._report("# WARNING: slave election requires GTID_MODE=ON " "for all servers.", logging.WARN) return self._report("# Performing failover.") if not self.topology.failover(self.candidates, strict): self._report("# Errors found.", logging.ERROR) return False return True def execute_command(self, command): """Execute a replication admin command This method executes one of the valid replication administration commands as described above. 
        command[in]        command to execute

        Returns bool - True = success, raise error on failure
        """
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        self._report("Executing %s command..." % command, logging.INFO, False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command in 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover()
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()

        self._report("# ...done.")

        return True

    def auto_failover(self, interval):
        """Automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three
        actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        interval[in]       time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        import time
        from mysql.utilities.command.failover_console import FailoverConsole
        from mysql.utilities.common.tools import ping_host
        from mysql.utilities.common.tools import execute_script

        failover_mode = self.options.get("failover_mode", "auto")
        pingtime = self.options.get("pingtime", 3)
        timeout = self.options.get("timeout", 3)
        exec_fail = self.options.get("exec_fail", None)
        force = self.options.get("force", False)
        post_fail = self.options.get("post_fail", None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], 'failover'),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = "Failover check script cannot be found. Please " + \
                           "check the path and filename for accuracy and " + \
                           "restart the failover console."
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)

        # Initialize a console
        console = FailoverConsole(self.topology.master,
                                  self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)

        # Register instance
        self._report("Registering instance on master.", logging.INFO, False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found for "
                         "master %s:%s." % (self.topology.master.host,
                                            self.topology.master.port),
                         logging.WARN)
            print "Failover mode changed to 'FAIL'. Console will start in 5 seconds."
            time.sleep(5)

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode,
                     logging.INFO, False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for timeout seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          timeout
                    self._report(msg, logging.INFO, False)
                    time.sleep(timeout)
                    try:
                        self.topology.master.connect()
                    except:
                        self._report("Cannot reconnect to master.",
                                     logging.INFO, False)

                # Check the master again. If no connection or lost connection,
                # try ping and if still not alive, failover. This performs the
                # timeout threshold for detecting a down master.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and "
                                             "automatic failover is not "
                                             "enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False)
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False)
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False) # discover slaves if option was specified at startup elif self.options.get("discover", None) is not None \ and not first_pass: # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) return True
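# ---------------------------------------------------------------------------
# Illustrative switchover sketch (not part of the original module). The
# 'new_master' entry mirrors the candidate dictionary read by _switchover()
# above; the host, port, and 'user:password' style rpl_user value are
# placeholders/assumptions, not values taken from the original source.
def _example_switchover():
    master = {'host': 'master1.example.com', 'port': 3306}
    slaves = [{'host': 'slave1.example.com', 'port': 3306}]
    options = {'quiet': False, 'logging': False, 'candidates': [],
               'rpl_user': 'rpl:rplpass',
               'new_master': {'host': 'slave1.example.com', 'port': 3306}}
    rpl_cmds = RplCommands(master, slaves, options)
    # Promote the slave named in options['new_master'] to be the new master.
    rpl_cmds.execute_command('switchover')
# ---------------------------------------------------------------------------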
class ReplicationMultiSource(Daemon): """Set up replication between a slave and multiple masters. This class implements multi-source replication using round-robin scheduling to set up replication between the slave and each of the masters in turn. This class also implements a POSIX daemon. """ def __init__(self, slave_vals, masters_vals, options): """Constructor. slave_vals[in] Slave server connection dictionary. masters_vals[in] List of master server connection dictionaries. options[in] Options dictionary. """ pidfile = options.get("pidfile", None) if pidfile is None: pidfile = "./rplms_daemon.pid" super(ReplicationMultiSource, self).__init__(pidfile) self.slave_vals = slave_vals self.masters_vals = masters_vals self.options = options self.quiet = self.options.get("quiet", False) self.logging = self.options.get("logging", False) self.rpl_user = self.options.get("rpl_user", None) self.verbosity = options.get("verbosity", 0) self.interval = options.get("interval", 15) self.switchover_interval = options.get("switchover_interval", 60) self.format = self.options.get("format", False) self.topology = None self.report_values = [ report.lower() for report in self.options["report_values"].split(",") ] # A sys.stdout copy, that can be used later to turn on/off stdout self.stdout_copy = sys.stdout self.stdout_devnull = open(os.devnull, "w") # Disable stdout when running --daemon with start, stop or restart self.daemon = options.get("daemon") if self.daemon: if self.daemon in ("start", "nodetach"): self._report("Starting multi-source replication daemon...", logging.INFO, False) elif self.daemon == "stop": self._report("Stopping multi-source replication daemon...", logging.INFO, False) else: self._report("Restarting multi-source replication daemon...", logging.INFO, False) # Disable stdout sys.stdout = self.stdout_devnull else: self._report("# Starting multi-source replication...", logging.INFO) print("# Press CTRL+C to quit.") # Check server versions try: self._check_server_versions() except UtilError as err: raise UtilRplError(err.errmsg) # Check user privileges try: self._check_privileges() except UtilError as err: msg = "Error checking user privileges: {0}".format(err.errmsg) self._report(msg, logging.CRITICAL, False) raise UtilRplError(err.errmsg) @staticmethod def _reconnect_server(server, pingtime=3): """Tries to reconnect to the server. This method tries to reconnect to the server and, if the connection fails after 3 attempts, returns False. server[in] Server instance. pingtime[in] Interval between connection attempts. """ if server and server.is_alive(): return True is_connected = False i = 0 while i < 3: try: server.connect() is_connected = True break except UtilError: pass time.sleep(pingtime) i += 1 return is_connected def _get_slave(self): """Get the slave server instance. Returns a Server instance of the slave from the replication topology. """ return self.topology.slaves[0]["instance"] def _get_master(self): """Get the current master server instance. Returns a Server instance of the current master from the replication topology. """ return self.topology.master def _check_server_versions(self): """Checks the server versions. 
""" if self.verbosity > 0: print("# Checking server versions.\n#") # Connection dictionary conn_dict = { "conn_info": None, "quiet": True, "verbose": self.verbosity > 0, } # Check masters version for master_vals in self.masters_vals: conn_dict["conn_info"] = master_vals master = Master(conn_dict) master.connect() if not master.check_version_compat(*_MIN_SERVER_VERSION): raise UtilRplError( ERROR_MIN_SERVER_VERSIONS.format( utility="mysqlrplms", min_version=".".join([str(val) for val in _MIN_SERVER_VERSION]), host=master.host, port=master.port ) ) master.disconnect() # Check slave version conn_dict["conn_info"] = self.slave_vals slave = Slave(conn_dict) slave.connect() if not slave.check_version_compat(*_MIN_SERVER_VERSION): raise UtilRplError( ERROR_MIN_SERVER_VERSIONS.format( utility="mysqlrplms", min_version=".".join([str(val) for val in _MIN_SERVER_VERSION]), host=slave.host, port=slave.port ) ) slave.disconnect() def _check_privileges(self): """Check required privileges to perform the multi-source replication. This method check if the used users for the slave and masters have the required privileges to perform the multi-source replication. The following privileges are required: - on slave: SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE AND GRANT OPTION; - on the master: SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE AND GRANT OPTION. An exception is thrown if users doesn't have enough privileges. """ if self.verbosity > 0: print("# Checking users privileges for replication.\n#") # Connection dictionary conn_dict = { "conn_info": None, "quiet": True, "verbose": self.verbosity > 0, } # Check privileges for master. master_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',), ('REPLICATION SLAVE',), ('GRANT OPTION',)] master_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE " "AND GRANT OPTION") for master_vals in self.masters_vals: conn_dict["conn_info"] = master_vals master = Master(conn_dict) master.connect() user_obj = User(master, "{0}@{1}".format(master.user, master.host)) for any_priv_tuple in master_priv: has_privilege = any( [user_obj.has_privilege('*', '*', priv) for priv in any_priv_tuple] ) if not has_privilege: msg = ERROR_USER_WITHOUT_PRIVILEGES.format( user=master.user, host=master.host, port=master.port, operation='perform replication', req_privileges=master_priv_str ) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) master.disconnect() # Check privileges for slave slave_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',), ('REPLICATION SLAVE',), ('GRANT OPTION',)] slave_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE " "AND GRANT OPTION") conn_dict["conn_info"] = self.slave_vals slave = Slave(conn_dict) slave.connect() user_obj = User(slave, "{0}@{1}".format(slave.user, slave.host)) for any_priv_tuple in slave_priv: has_privilege = any( [user_obj.has_privilege('*', '*', priv) for priv in any_priv_tuple] ) if not has_privilege: msg = ("User '{0}' on '{1}@{2}' does not have sufficient " "privileges to perform replication (required: {3})." "".format(slave.user, slave.host, slave.port, slave_priv_str)) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) slave.disconnect() def _check_host_references(self): """Check to see if using all host or all IP addresses. Returns bool - True = all references are consistent. 
""" uses_ip = hostname_is_ip(self.topology.master.host) slave = self._get_slave() host_port = slave.get_master_host_port() host = None if host_port: host = host_port[0] if (not host or uses_ip != hostname_is_ip(slave.host) or uses_ip != hostname_is_ip(host)): return False return True def _setup_replication(self, master_vals, use_rpl_setup=True): """Setup replication among a master and a slave. master_vals[in] Master server connection dictionary. use_rpl_setup[in] Use Replication.setup() if True otherwise use switch_master() on the slave. This is used to control the first pass in the masters round-robin scheduling. """ conn_options = { "src_name": "master", "dest_name": "slave", "version": "5.0.0", "unique": True, } (master, slave,) = connect_servers(master_vals, self.slave_vals, conn_options) rpl_options = self.options.copy() rpl_options["verbosity"] = self.verbosity > 0 # Start from beginning only on the first pass if rpl_options.get("from_beginning", False) and not use_rpl_setup: rpl_options["from_beginning"] = False # Create an instance of the replication object rpl = Replication(master, slave, rpl_options) if use_rpl_setup: # Check server ids errors = rpl.check_server_ids() for error in errors: self._report(error, logging.ERROR, True) # Check for server_id uniqueness errors = rpl.check_server_uuids() for error in errors: self._report(error, logging.ERROR, True) # Check InnoDB compatibility errors = rpl.check_innodb_compatibility(self.options) for error in errors: self._report(error, logging.ERROR, True) # Checking storage engines errors = rpl.check_storage_engines(self.options) for error in errors: self._report(error, logging.ERROR, True) # Check master for binary logging errors = rpl.check_master_binlog() if errors != []: raise UtilRplError(errors[0]) # Setup replication if not rpl.setup(self.rpl_user, 10): msg = "Cannot setup replication." self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) else: # Parse user and password (support login-paths) try: (r_user, r_pass,) = parse_user_password(self.rpl_user) except FormatError: raise UtilError(USER_PASSWORD_FORMAT.format("--rpl-user")) # Switch master and start slave slave.switch_master(master, r_user, r_pass) slave.start({'fetch': False}) # Disconnect from servers master.disconnect() slave.disconnect() def _switch_master(self, master_vals, use_rpl_setup=True): """Switches replication to a new master. This method stops replication with the old master if exists and starts the replication with a new one. master_vals[in] Master server connection dictionary. use_rpl_setup[in] Used to control the first pass in the masters round-robin scheduling. """ if self.topology: # Stop slave master = self._get_master() if master.is_alive(): master.disconnect() slave = self._get_slave() if not slave.is_alive() and not self._reconnect_server(slave): msg = "Failed to connect to the slave." self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) slave.stop() slave.disconnect() self._report("# Switching to master '{0}:{1}'." 
"".format(master_vals["host"], master_vals["port"]), logging.INFO, True) try: # Setup replication on the new master self._setup_replication(master_vals, use_rpl_setup) # Create a Topology object self.topology = Topology(master_vals, [self.slave_vals], self.options) except UtilError as err: msg = "Error while switching master: {0}".format(err.errmsg) self._report(msg, logging.CRITICAL, False) raise UtilRplError(err.errmsg) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = ("Topology must support global transaction ids and have " "GTID_MODE=ON.") self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) def _report(self, message, level=logging.INFO, print_msg=True): """Log message if logging is on. This method will log the message presented if the log is turned on. Specifically, if options['log_file'] is not None. It will also print the message to stdout. message[in] Message to be printed. level[in] Level of message to log. Default = INFO. print_msg[in] If True, print the message to stdout. Default = True. """ # First, print the message. if print_msg and not self.quiet: print(message) # Now log message if logging turned on if self.logging: logging.log(int(level), message.strip("#").strip(" ")) def _format_health_data(self): """Return health data from topology. Returns tuple - (columns, rows). """ if self.topology: try: health_data = self.topology.get_health() current_master = self._get_master() # Get data for the remaining masters for master_vals in self.masters_vals: # Discard the current master if master_vals["host"] == current_master.host and \ master_vals["port"] == current_master.port: continue # Connect to the master conn_dict = { "conn_info": master_vals, "quiet": True, "verbose": self.verbosity > 0, } master = Master(conn_dict) master.connect() # Get master health rpl_health = master.check_rpl_health() master_data = [ master.host, master.port, "MASTER", get_server_state(master, master.host, 3, self.verbosity > 0), master.supports_gtid(), "OK" if rpl_health[0] else ", ".join(rpl_health[1]), ] # Get master status master_status = master.get_status() if len(master_status): master_log, master_log_pos = master_status[0][0:2] else: master_log = None master_log_pos = 0 # Show additional details if verbosity is turned on if self.verbosity > 0: master_data.extend([master.get_version(), master_log, master_log_pos, "", "", "", "", "", "", "", "", ""]) health_data[1].append(master_data) return health_data except UtilError as err: msg = "Cannot get health data: {0}".format(err) self._report(msg, logging.ERROR, False) raise UtilRplError(msg) return ([], []) def _format_uuid_data(self): """Return the server's uuids. Returns tuple - (columns, rows). """ if self.topology: try: return (_GEN_UUID_COLS, self.topology.get_server_uuids()) except UtilError as err: msg = "Cannot get UUID data: {0}".format(err) self._report(msg, logging.ERROR, False) raise UtilRplError(msg) return ([], []) def _format_gtid_data(self): """Return the GTID information from the topology. Returns tuple - (columns, rows). 
""" if self.topology: try: return (_GEN_GTID_COLS, self.topology.get_gtid_data()) except UtilError as err: msg = "Cannot get GTID data: {0}".format(err) self._report(msg, logging.ERROR, False) raise UtilRplError(msg) return ([], []) def _log_data(self, title, labels, data, print_format=True): """Helper method to log data. title[in] Title to log. labels[in] List of labels. data[in] List of data rows. """ self._report("# {0}".format(title), logging.INFO) for row in data: msg = ", ".join( ["{0}: {1}".format(*col) for col in zip(labels, row)] ) self._report("# {0}".format(msg), logging.INFO, False) if print_format: print_list(sys.stdout, self.format, labels, data) def _log_master_status(self, master): """Logs the master information. master[in] Master server instance. This method logs the master information from SHOW MASTER STATUS. """ # If no master present, don't print anything. if master is None: return print("#") self._report("# {0}:".format("Current Master Information"), logging.INFO) try: status = master.get_status()[0] except UtilError: msg = "Cannot get master status" self._report(msg, logging.ERROR, False) raise UtilRplError(msg) cols = ("Binary Log File", "Position", "Binlog_Do_DB", "Binlog_Ignore_DB") rows = (status[0] or "N/A", status[1] or "N/A", status[2] or "N/A", status[3] or "N/A") print_list(sys.stdout, self.format, cols, [rows]) self._report("# {0}".format( ", ".join(["{0}: {1}".format(*item) for item in zip(cols, rows)]), ), logging.INFO, False) # Display gtid executed set master_gtids = [] for gtid in status[4].split("\n"): if gtid: # Add each GTID to a tuple to match the required format to # print the full GRID list correctly. master_gtids.append((gtid.strip(","),)) try: if len(master_gtids) > 1: gtid_executed = "{0}[...]".format(master_gtids[0][0]) else: gtid_executed = master_gtids[0][0] except IndexError: gtid_executed = "None" self._report("# GTID Executed Set: {0}".format(gtid_executed), logging.INFO) def stop_replication(self): """Stops multi-source replication. Stop the slave if topology is available. """ if self.topology: # Get the slave instance slave = self._get_slave() # If slave is not connected, try to reconnect and stop replication if self._reconnect_server(slave): slave.stop() slave.disconnect() if self.daemon: self._report("Multi-source replication daemon stopped.", logging.INFO, False) else: print("") self._report("# Multi-source replication stopped.", logging.INFO, True) def stop(self): """Stops the daemon. Stop slave if topology is available and then stop the daemon. """ self.stop_replication() super(ReplicationMultiSource, self).stop() def run(self): """Run the multi-source replication using the round-robin scheduling. This method implements the multi-source replication by using time slices for each master. 
""" num_masters = len(self.masters_vals) use_rpl_setup = True # pylint: disable=R0101 while True: # Round-robin scheduling on the masters for idx in range(num_masters): # Get the new master values and switch for the next one try: master_vals = self.masters_vals[idx] self._switch_master(master_vals, use_rpl_setup) except UtilError as err: msg = ("Error while switching master: {0}" "".format(err.errmsg)) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) # Get the new master and slave instances master = self._get_master() slave = self._get_slave() switchover_timeout = time.time() + self.switchover_interval while switchover_timeout > time.time(): # If servers not connected, try to reconnect if not self._reconnect_server(master): msg = ("Failed to connect to the master '{0}:{1}'." "".format(master_vals["host"], master_vals["port"])) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) if not self._reconnect_server(slave): msg = "Failed to connect to the slave." self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) # Report self._log_master_status(master) if "health" in self.report_values: (health_labels, health_data,) = \ self._format_health_data() if health_data: print("#") self._log_data("Health Status:", health_labels, health_data) if "gtid" in self.report_values: (gtid_labels, gtid_data,) = self._format_gtid_data() for i, row in enumerate(gtid_data): if row: print("#") self._log_data("GTID Status - {0}" "".format(_GTID_LISTS[i]), gtid_labels, row) if "uuid" in self.report_values: (uuid_labels, uuid_data,) = self._format_uuid_data() if uuid_data: print("#") self._log_data("UUID Status:", uuid_labels, uuid_data) # Disconnect servers master.disconnect() slave.disconnect() # Wait for reporting interval time.sleep(self.interval) # Use Replication.setup() only for the first round use_rpl_setup = False
class RplCommands(object): """Replication commands. This class supports the following replication commands. elect - perform best slave election and report best slave failover - conduct failover from master to best slave as specified by the user. This option performs best slave election. gtid - show status of global transaction id variables health - display the replication health reset - stop and reset all slaves start - start all slaves stop - stop all slaves switchover - perform slave promotion as specified by the user to a specific slave. Requires --master and the --candidate options. """ def __init__(self, master_vals, slave_vals, options, skip_conn_err=True): """Constructor master_vals[in] master server connection dictionary slave_vals[in] list of slave server connection dictionaries options[in] options dictionary skip_conn_err[in] if True, do not fail on connection failure Default = True """ # A sys.stdout copy, that can be used later to turn on/off stdout self.stdout_copy = sys.stdout self.stdout_devnull = open(os.devnull, "w") # Disable stdout when running --daemon with start, stop or restart daemon = options.get("daemon") if daemon: if daemon in ("start", "nodetach"): print("Starting failover daemon...") elif daemon == "stop": print("Stopping failover daemon...") else: print("Restarting failover daemon...") # Disable stdout if daemon not nodetach if daemon != "nodetach": sys.stdout = self.stdout_devnull self.master = None self.master_vals = master_vals self.options = options self.quiet = self.options.get("quiet", False) self.logging = self.options.get("logging", False) self.candidates = self.options.get("candidates", None) self.verbose = self.options.get("verbose", None) self.rpl_user = self.options.get("rpl_user", None) self.ssl_ca = options.get("ssl_ca", None) self.ssl_cert = options.get("ssl_cert", None) self.ssl_key = options.get("ssl_key", None) if self.ssl_ca or self.ssl_cert or self.ssl_key: self.ssl = True try: self.topology = Topology(master_vals, slave_vals, self.options, skip_conn_err) except Exception as err: if daemon and daemon != "nodetach": # Turn on sys.stdout sys.stdout = self.stdout_copy raise UtilRplError(str(err)) def _report(self, message, level=logging.INFO, print_msg=True): """Log message if logging is on This method will log the message presented if the log is turned on. Specifically, if options['log_file'] is not None. It will also print the message to stdout. message[in] message to be printed level[in] level of message to log. Default = INFO print_msg[in] if True, print the message to stdout. Default = True """ # First, print the message. if print_msg and not self.quiet: print message # Now log message if logging turned on if self.logging: logging.log(int(level), message.strip("#").strip(' ')) def _show_health(self): """Run a command on a list of slaves. This method will display the replication health of the topology. This includes the following for each server. - host : host name - port : connection port - role : "MASTER" or "SLAVE" - state : UP = connected, WARN = cannot connect but can ping, DOWN = cannot connect nor ping - gtid : ON = gtid supported and turned on, OFF = supported but not enabled, NO = not supported - rpl_health : (master) binlog enabled, (slave) IO tread is running, SQL thread is running, no errors, slave delay < max_delay, read log pos + max_position < master's log position Note: Will show 'ERROR' if there are multiple errors encountered otherwise will display the health check that failed. 
If verbosity is set, it will show the following additional information. (master) - server version, binary log file, position (slaves) - server version, master's binary log file, master's log position, IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay, IO_Error_Num, IO_Error """ fmt = self.options.get("format", "grid") quiet = self.options.get("quiet", False) cols, rows = self.topology.get_health() if not quiet: print "#" print "# Replication Topology Health:" # Print health report print_list(sys.stdout, fmt, cols, rows) return def _show_gtid_data(self): """Display the GTID lists from the servers. This method displays the three GTID lists for all of the servers. Each server is listed with its entries in each list. If a list has no entries, that list is not printed. """ if not self.topology.gtid_enabled(): self._report("# WARNING: GTIDs are not supported on this " "topology.", logging.WARN) return fmt = self.options.get("format", "grid") # Get UUIDs uuids = self.topology.get_server_uuids() if len(uuids): print "#" print "# UUIDS for all servers:" print_list(sys.stdout, fmt, ['host', 'port', 'role', 'uuid'], uuids) # Get GTID lists executed, purged, owned = self.topology.get_gtid_data() if len(executed): print "#" print "# Transactions executed on the server:" print_list(sys.stdout, fmt, _GTID_COLS, executed) if len(purged): print "#" print "# Transactions purged from the server:" print_list(sys.stdout, fmt, _GTID_COLS, purged) if len(owned): print "#" print "# Transactions owned by another server:" print_list(sys.stdout, fmt, _GTID_COLS, owned) def _check_host_references(self): """Check to see if using all host or all IP addresses Returns bool - True = all references are consistent """ uses_ip = hostname_is_ip(self.topology.master.host) for slave_dict in self.topology.slaves: slave = slave_dict['instance'] if slave is not None: host_port = slave.get_master_host_port() host = None if host_port: host = host_port[0] if (not host or uses_ip != hostname_is_ip(slave.host) or uses_ip != hostname_is_ip(host)): return False return True def _switchover(self): """Perform switchover from master to candidate slave This method switches the role of master to a candidate slave. The candidate is specified via the --candidate option. Returns bool - True = no errors, False = errors reported. """ # Check new master is not actual master - need valid candidate candidate = self.options.get("new_master", None) if (self.topology.master.is_alias(candidate['host']) and self.master_vals['port'] == candidate['port']): err_msg = ERROR_SAME_MASTER.format(candidate['host'], candidate['port'], self.master_vals['host'], self.master_vals['port']) self._report(err_msg, logging.WARN) self._report(err_msg, logging.CRITICAL) raise UtilRplError(err_msg) # Check for --master-info-repository=TABLE if rpl_user is None if not self._check_master_info_type(): return False # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) # Check prerequisites if candidate is None: msg = "No candidate specified." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Can only check errant transactions if GTIDs are enabled. 
if self.topology.gtid_enabled(): # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: force = self.options.get('force') print("# ERROR: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.ERROR, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.ERROR, False) # Raise an exception (to stop) if tolerant mode is OFF if not force: raise UtilRplError("{0} Note: If you want to ignore this " "issue, although not advised, please " "use the utility with the --force " "option.".format(_ERRANT_TNX_ERROR)) else: warn_msg = ("Errant transactions check skipped (GTID not enabled " "for the whole topology).") print("# WARNING: {0}".format(warn_msg)) self._report(warn_msg, logging.WARN, False) self._report(" ".join(["# Performing switchover from master at", "%s:%s" % (self.master_vals['host'], self.master_vals['port']), "to slave at %s:%s." % (candidate['host'], candidate['port'])])) if not self.topology.switchover(candidate): self._report("# Errors found. Switchover aborted.", logging.ERROR) return False return True def _elect_slave(self): """Perform best slave election This method determines which slave is the best candidate for GTID-enabled failover. If called for a non-GTID topology, a warning is issued. """ if not self.topology.gtid_enabled(): warn_msg = _GTID_ON_REQ.format(action='Slave election') print("# WARNING: {0}".format(warn_msg)) self._report(warn_msg, logging.WARN, False) return # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) candidates = self.options.get("candidates", None) if candidates is None or len(candidates) == 0: self._report("# Electing candidate slave from known slaves.") else: self._report("# Electing candidate slave from candidate list " "then slaves list.") best_slave = self.topology.find_best_slave(candidates) if best_slave is None: self._report("ERROR: No slave found that meets eligilibility " "requirements.", logging.ERROR) return self._report("# Best slave found is located on %s:%s." % (best_slave['host'], best_slave['port'])) def _failover(self, strict=False, options=None): """Perform failover This method executes GTID-enabled failover. If called for a non-GTID topology, a warning is issued. strict[in] if True, use only the candidate list for slave election and fail if no candidates are viable. Default = False options[in] options dictionary. 
Returns bool - True = failover succeeded, False = errors found """ if options is None: options = {} srv_list = self.topology.get_servers_with_gtid_not_on() if srv_list: err_msg = _GTID_ON_REQ.format(action='Slave election') print("# ERROR: {0}".format(err_msg)) self._report(err_msg, logging.ERROR, False) for srv in srv_list: msg = "# - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0], srv[1]) self._report(msg, logging.ERROR) self._report(err_msg, logging.CRITICAL, False) raise UtilRplError(err_msg) # Check for --master-info-repository=TABLE if rpl_user is None if not self._check_master_info_type(): return False # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: force = options.get('force') print("# ERROR: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.ERROR, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.ERROR, False) # Raise an exception (to stop) if tolerant mode is OFF if not force: raise UtilRplError("{0} Note: If you want to ignore this " "issue, although not advised, please use " "the utility with the --force option." "".format(_ERRANT_TNX_ERROR)) self._report("# Performing failover.") if not self.topology.failover(self.candidates, strict, stop_on_error=True): self._report("# Errors found.", logging.ERROR) return False return True def _check_master_info_type(self, halt=True): """Check for master information set to TABLE if rpl_user not provided halt[in] if True, raise error on failure. Default is True Returns bool - True if rpl_user is specified or False if rpl_user not specified and at least one slave does not have --master-info-repository=TABLE. """ error = "You must specify either the --rpl-user or set all slaves " + \ "to use --master-info-repository=TABLE." # Check for --master-info-repository=TABLE if rpl_user is None if self.rpl_user is None: if not self.topology.check_master_info_type("TABLE"): if halt: raise UtilRplError(error) self._report(error, logging.ERROR) return False return True def check_host_references(self): """Public method to access self.check_host_references() """ return self._check_host_references() def execute_command(self, command, options=None): """Execute a replication admin command This method executes one of the valid replication administration commands as described above. command[in] command to execute options[in] options dictionary. Returns bool - True = success, raise error on failure """ if options is None: options = {} # Raise error if command is not valid if command not in _VALID_COMMANDS: msg = "'%s' is not a valid command." % command self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Check privileges self._report("# Checking privileges.") full_check = command in ['failover', 'elect', 'switchover'] errors = self.topology.check_privileges(full_check) if len(errors): msg = "User %s on %s does not have sufficient privileges to " + \ "execute the %s command." for error in errors: self._report(msg % (error[0], error[1], command), logging.CRITICAL) raise UtilRplError("Not enough privileges to execute command.") self._report("Executing %s command..." 
% command, logging.INFO, False) # Execute the command if command in _SLAVE_COMMANDS: if command == 'reset': self.topology.run_cmd_on_slaves('stop') self.topology.run_cmd_on_slaves(command) elif command in 'gtid': self._show_gtid_data() elif command == 'health': self._show_health() elif command == 'switchover': self._switchover() elif command == 'elect': self._elect_slave() elif command == 'failover': self._failover(options=options) else: msg = "Command '%s' is not implemented." % command self._report(msg, logging.CRITICAL) raise UtilRplError(msg) if command in ['switchover', 'failover'] and \ not self.options.get("no_health", False): self._show_health() self._report("# ...done.") return True def auto_failover(self, interval): """Automatic failover Wrapper class for running automatic failover. See run_automatic_failover for details on implementation. This method ensures the registration/deregistration occurs regardless of exception or errors. interval[in] time in seconds to wait to check status of servers Returns bool - True = success, raises exception on error """ failover_mode = self.options.get("failover_mode", "auto") force = self.options.get("force", False) # Initialize a console console = FailoverConsole(self.topology.master, self.topology.get_health, self.topology.get_gtid_data, self.topology.get_server_uuids, self.options) # Check privileges self._report("# Checking privileges.") errors = self.topology.check_privileges(failover_mode != 'fail') if len(errors): for error in errors: msg = ("User {0} on {1}@{2} does not have sufficient " "privileges to execute the {3} command " "(required: {4}).").format(error[0], error[1], error[2], 'failover', error[3]) print("# ERROR: {0}".format(msg)) self._report(msg, logging.CRITICAL, False) raise UtilRplError("Not enough privileges to execute command.") # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance self._report("Registering instance on master.", logging.INFO, False) old_mode = failover_mode failover_mode = console.register_instance(force) if failover_mode != old_mode: self._report("Multiple instances of failover console found for " "master %s:%s." % (self.topology.master.host, self.topology.master.port), logging.WARN) print "If this is an error, restart the console with --force. " print "Failover mode changed to 'FAIL' for this instance. " print "Console will start in 10 seconds.", sys.stdout.flush() i = 0 while i < 9: time.sleep(1) sys.stdout.write('.') sys.stdout.flush() i += 1 print "starting Console." time.sleep(1) try: res = self.run_auto_failover(console, failover_mode) except: raise finally: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(True, False) self._report("Failover console stopped.", logging.INFO, False) except: pass return res def auto_failover_as_daemon(self): """Automatic failover Wrapper class for running automatic failover as daemon. This method ensures the registration/deregistration occurs regardless of exception or errors. 
Returns bool - True = success, raises exception on error """ # Initialize failover daemon failover_daemon = FailoverDaemon(self) res = None try: action = self.options.get("daemon") if action == "start": res = failover_daemon.start() elif action == "stop": res = failover_daemon.stop() elif action == "restart": res = failover_daemon.restart() else: # Start failover deamon in foreground res = failover_daemon.start(detach_process=False) except: try: # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) failover_daemon.register_instance(True, False) self._report("Failover daemon stopped.", logging.INFO, False) except: pass return res def run_auto_failover(self, console, failover_mode="auto"): """Run automatic failover This method implements the automatic failover facility. It uses the FailoverConsole class from the failover_console.py to implement all user interface commands and uses the existing failover() method of this class to conduct failover. When the master goes down, the method can perform one of three actions: 1) failover to list of candidates first then slaves 2) failover to list of candidates only 3) fail console[in] instance of the failover console class. Returns bool - True = success, raises exception on error """ pingtime = self.options.get("pingtime", 3) exec_fail = self.options.get("exec_fail", None) post_fail = self.options.get("post_fail", None) pedantic = self.options.get('pedantic', False) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = "Topology must support global transaction ids " + \ "and have GTID_MODE=ON." self._report(msg, logging.CRITICAL) raise UtilRplError(msg) # Require --master-info-repository=TABLE for all slaves if not self.topology.check_master_info_type("TABLE"): msg = "Failover requires --master-info-repository=TABLE for " + \ "all slaves." self._report(msg, logging.ERROR, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) print("#\n# Failover console will start in {0} seconds.".format( WARNING_SLEEP_TIME)) time.sleep(WARNING_SLEEP_TIME) # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON if pedantic: raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the --pedantic " "option.".format(_ERRANT_TNX_ERROR)) self._report("Failover console started.", logging.INFO, False) self._report("Failover mode = %s." % failover_mode, logging.INFO, False) # Main loop - loop and fire on interval. done = False first_pass = True failover = False while not done: # Use try block in case master class has gone away. try: old_host = self.master.host old_port = self.master.port except: old_host = "UNKNOWN" old_port = "UNKNOWN" # If a failover script is provided, check it else check master # using connectivity checks. 
if exec_fail is not None: # Execute failover check script if not os.path.isfile(exec_fail): message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format( path=exec_fail) self._report(message, logging.CRITICAL, False) raise UtilRplError(message) elif not os.access(exec_fail, os.X_OK): message = INSUFFICIENT_FILE_PERMISSIONS.format( path=exec_fail, permissions='execute') self._report(message, logging.CRITICAL, False) raise UtilRplError(message) else: self._report("# Spawning external script for failover " "checking.") res = execute_script(exec_fail, None, [old_host, old_port], self.verbose) if res == 0: self._report("# Failover check script completed Ok. " "Failover averted.") else: self._report("# Failover check script failed. " "Failover initiated", logging.WARN) failover = True else: # Check the master. If not alive, wait for pingtime seconds # and try again. if self.topology.master is not None and \ not self.topology.master.is_alive(): msg = "Master may be down. Waiting for %s seconds." % \ pingtime self._report(msg, logging.INFO, False) time.sleep(pingtime) try: self.topology.master.connect() except: pass # Check the master again. If no connection or lost connection, # try ping. This performs the timeout threshold for detecting # a down master. If still not alive, try to reconnect and if # connection fails after 3 attempts, failover. if self.topology.master is None or \ not ping_host(self.topology.master.host, pingtime) or \ not self.topology.master.is_alive(): failover = True i = 0 while i < 3: try: self.topology.master.connect() failover = False # Master is now connected again break except: pass time.sleep(pingtime) i += 1 if failover: self._report("Failed to reconnect to the master after " "3 attemps.", logging.INFO) if failover: self._report("Master is confirmed to be down or unreachable.", logging.CRITICAL, False) try: self.topology.master.disconnect() except: pass console.clear() if failover_mode == 'auto': self._report("Failover starting in 'auto' mode...") res = self.topology.failover(self.candidates, False) elif failover_mode == 'elect': self._report("Failover starting in 'elect' mode...") res = self.topology.failover(self.candidates, True) else: msg = _FAILOVER_ERROR % ("Master has failed and automatic " "failover is not enabled. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg, _FAILOVER_ERRNO) if not res: msg = _FAILOVER_ERROR % ("An error was encountered " "during failover. ") self._report(msg, logging.CRITICAL, False) # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port]) raise UtilRplError(msg) self.master = self.topology.master console.master = self.master self.topology.remove_discovered_slaves() self.topology.discover_slaves() console.list_data = None print "\nFailover console will restart in 5 seconds." 
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False, [old_host, old_port, self.master.host, self.master.port]) # Unregister existing instances from slaves self._report("Unregistering existing instances from slaves.", logging.INFO, False) console.unregister_slaves(self.topology) # Register instance on the new master self._report("Registering instance on master.", logging.INFO, False) failover_mode = console.register_instance() # discover slaves if option was specified at startup elif (self.options.get("discover", None) is not None and not first_pass): # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None # Check existence of errant transactions on slaves errant_tnx = self.topology.find_errant_transactions() if errant_tnx: if pedantic: print("# WARNING: {0}".format(_ERRANT_TNX_ERROR)) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) print("# {0}".format(errant_msg)) self._report(errant_msg, logging.WARN, False) # Raise an exception (to stop) if pedantic mode is ON raise UtilRplError("{0} Note: If you want to ignore this " "issue, please do not use the " "--pedantic " "option.".format(_ERRANT_TNX_ERROR)) else: if self.logging: warn_msg = ("{0} Check log for more " "details.".format(_ERRANT_TNX_ERROR)) else: warn_msg = _ERRANT_TNX_ERROR console.add_warning('errant_tnx', warn_msg) self._report(_ERRANT_TNX_ERROR, logging.WARN, False) for host, port, tnx_set in errant_tnx: errant_msg = (" - For slave '{0}@{1}': " "{2}".format(host, port, ", ".join(tnx_set))) self._report(errant_msg, logging.WARN, False) else: console.del_warning('errant_tnx') res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False return True
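# ---------------------------------------------------------------------------
# Editor's illustrative sketch (assumptions, not shipped code): using
# RplCommands programmatically for a health report followed by a switchover.
# The option keys mirror the ones the methods above read (format, quiet,
# logging, candidates, new_master, rpl_user); the owning command-line utility
# normally builds this dictionary from its parsed options, and the hosts,
# ports and credentials below are placeholders.
def _example_rpl_admin():
    """Report topology health, then promote the first slave."""
    master = {"host": "master1", "port": 3306, "user": "root",
              "passwd": "secret"}
    slaves = [
        {"host": "slave1", "port": 3306, "user": "root", "passwd": "secret"},
        {"host": "slave2", "port": 3306, "user": "root", "passwd": "secret"},
    ]
    options = {
        "format": "grid",
        "quiet": False,
        "logging": False,
        "verbosity": 0,
        "candidates": None,
        "rpl_user": "rpl:rplpass",
        "new_master": slaves[0],          # only used by 'switchover'
    }
    rpl_cmds = RplCommands(master, slaves, options)
    rpl_cmds.execute_command("health", options)      # display topology health
    rpl_cmds.execute_command("switchover", options)  # promote slaves[0]
# ---------------------------------------------------------------------------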