def __init__(self, master_vals, slave_vals, options, skip_conn_err=True):
    """Constructor

    master_vals[in]    master server connection dictionary
    slave_vals[in]     list of slave server connection dictionaries
    options[in]        options dictionary
    skip_conn_err[in]  if True, do not fail on connection failure
                       Default = True
    """
    from mysql.utilities.common.topology import Topology

    self.master_vals = master_vals
    self.options = options
    self.quiet = self.options.get("quiet", False)
    self.logging = self.options.get("logging", False)
    self.candidates = self.options.get("candidates", None)

    # Replace all local host IP addresses (i.e. 127.0.0.1) with localhost.
    # Guard against a missing candidates list ('candidates' defaults to
    # None above).
    for candidate in self.candidates or []:
        if candidate['host'] == '127.0.0.1':
            candidate['host'] = 'localhost'
    for slave in slave_vals:
        if slave['host'] == '127.0.0.1':
            slave['host'] = 'localhost'

    self.rpl_user = self.options.get("rpl_user", None)
    self.topology = Topology(master_vals, slave_vals, self.options,
                             skip_conn_err)
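
# A minimal sketch (hypothetical values, not part of the original module) of
# the connection dictionaries the constructor above expects. A host of
# '127.0.0.1' is rewritten to 'localhost' so that later host-name
# comparisons (e.g. _check_host_references) see consistent values.
_EXAMPLE_MASTER_VALS = {"host": "127.0.0.1", "port": 3306}  # becomes 'localhost'
_EXAMPLE_SLAVE_VALS = [{"host": "localhost", "port": 3307},
                       {"host": "localhost", "port": 3308}]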
def _switch_master(self, master_vals, use_rpl_setup=True):
    """Switches replication to a new master.

    This method stops replication with the old master if it exists and
    starts the replication with a new one.

    master_vals[in]    Master server connection dictionary.
    use_rpl_setup[in]  Used to control the first pass in the masters
                       round-robin scheduling.
    """
    if self.topology:
        # Stop slave
        master = self._get_master()
        if master.is_alive():
            master.disconnect()
        slave = self._get_slave()
        if not slave.is_alive() and not self._reconnect_server(slave):
            msg = "Failed to connect to the slave."
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(msg)
        slave.stop()
        slave.disconnect()

    self._report("# Switching to master '{0}:{1}'."
                 "".format(master_vals["host"], master_vals["port"]),
                 logging.INFO, True)

    try:
        # Setup replication on the new master
        self._setup_replication(master_vals, use_rpl_setup)

        # Create a Topology object
        self.topology = Topology(master_vals, [self.slave_vals],
                                 self.options)
    except UtilError as err:
        msg = "Error while switching master: {0}".format(err.errmsg)
        self._report(msg, logging.CRITICAL, False)
        raise UtilRplError(err.errmsg)

    # Only works for GTID_MODE=ON
    if not self.topology.gtid_enabled():
        msg = ("Topology must support global transaction ids and have "
               "GTID_MODE=ON.")
        self._report(msg, logging.CRITICAL, False)
        raise UtilRplError(msg)

    # Check for mixing IP and hostnames
    if not self._check_host_references():
        print("# WARNING: {0}".format(HOST_IP_WARNING))
        self._report(HOST_IP_WARNING, logging.WARN, False)
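
# A minimal sketch, not part of the original module, of how the masters
# round-robin scheduling mentioned in the docstring above might drive
# _switch_master: cycle through the configured masters, performing the full
# replication setup only on the first pass. The 'rpl', 'masters' and
# 'interval' names are hypothetical.
def _example_round_robin(rpl, masters, interval):
    import itertools
    import time as _time
    use_rpl_setup = True
    for master_vals in itertools.cycle(masters):
        rpl._switch_master(master_vals, use_rpl_setup)
        use_rpl_setup = False  # only the first pass sets up replication
        _time.sleep(interval)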
# Reconstructed import header: these module paths follow the inline imports
# used elsewhere in this module. The FailoverDaemon path is an assumption;
# the message constants (_VALID_COMMANDS, _SLAVE_COMMANDS, _GTID_COLS,
# HOST_IP_WARNING, WARNING_SLEEP_TIME, ERROR_SAME_MASTER, _ERRANT_TNX_ERROR,
# _GTID_ON_REQ, _FAILOVER_ERROR, _FAILOVER_ERRNO,
# EXTERNAL_SCRIPT_DOES_NOT_EXIST, INSUFFICIENT_FILE_PERMISSIONS,
# ERROR_USER_WITHOUT_PRIVILEGES) and the gtid_set_itemize helper are defined
# elsewhere in the original module and are not reproduced here.
import logging
import os
import sys
import time

from mysql.utilities.exception import UtilError, UtilRplError
from mysql.utilities.common.format import print_list
from mysql.utilities.common.options import hostname_is_ip
from mysql.utilities.common.tools import execute_script, ping_host
from mysql.utilities.common.topology import Topology
from mysql.utilities.command.failover_console import FailoverConsole
from mysql.utilities.command.failover_daemon import FailoverDaemon  # assumed path


class RplCommands(object):
    """Replication commands.

    This class supports the following replication commands.

    elect       - perform best slave election and report best slave
    failover    - conduct failover from master to best slave as specified
                  by the user. This option performs best slave election.
    gtid        - show status of global transaction id variables
    health      - display the replication health
    reset       - stop and reset all slaves
    start       - start all slaves
    stop        - stop all slaves
    switchover  - perform slave promotion as specified by the user to a
                  specific slave. Requires --master and the --candidate
                  options.
    """

    def __init__(self, master_vals, slave_vals, options,
                 skip_conn_err=True):
        """Constructor

        master_vals[in]    master server connection dictionary
        slave_vals[in]     list of slave server connection dictionaries
        options[in]        options dictionary
        skip_conn_err[in]  if True, do not fail on connection failure
                           Default = True
        """
        # A sys.stdout copy, that can be used later to turn on/off stdout
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        daemon = options.get("daemon")
        if daemon:
            if daemon in ("start", "nodetach"):
                print("Starting failover daemon...")
            elif daemon == "stop":
                print("Stopping failover daemon...")
            else:
                print("Restarting failover daemon...")
            # Disable stdout if daemon not nodetach
            if daemon != "nodetach":
                sys.stdout = self.stdout_devnull

        self.master = None
        self.master_vals = master_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.candidates = self.options.get("candidates", None)
        self.verbose = self.options.get("verbose", None)
        self.rpl_user = self.options.get("rpl_user", None)
        self.ssl_ca = options.get("ssl_ca", None)
        self.ssl_cert = options.get("ssl_cert", None)
        self.ssl_key = options.get("ssl_key", None)
        # Default to False so the attribute always exists.
        self.ssl = False
        if self.ssl_ca or self.ssl_cert or self.ssl_key:
            self.ssl = True

        try:
            self.topology = Topology(master_vals, slave_vals, self.options,
                                     skip_conn_err)
        except Exception as err:
            if daemon and daemon != "nodetach":
                # Turn on sys.stdout
                sys.stdout = self.stdout_copy
            raise UtilRplError(str(err))

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on

        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]    message to be printed
        level[in]      level of message to log. Default = INFO
        print_msg[in]  if True, print the message to stdout.
                       Default = True
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print(message)
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(' '))

    def _show_health(self):
        """Run a command on a list of slaves.

        This method will display the replication health of the topology.
        This includes the following for each server.

        - host       : host name
        - port       : connection port
        - role       : "MASTER" or "SLAVE"
        - state      : UP = connected, WARN = cannot connect but can ping,
                       DOWN = cannot connect nor ping
        - gtid       : ON = gtid supported and turned on, OFF = supported
                       but not enabled, NO = not supported
        - rpl_health : (master) binlog enabled,
                       (slave) IO thread is running, SQL thread is running,
                       no errors, slave delay < max_delay, read log pos +
                       max_position < master's log position

        Note: Will show 'ERROR' if there are multiple errors encountered,
        otherwise will display the health check that failed.

        If verbosity is set, it will show the following additional
        information.

        (master)
          - server version, binary log file, position

        (slaves)
          - server version, master's binary log file, master's log
            position, IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay,
            IO_Error_Num, IO_Error
        """
        fmt = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()
        if not quiet:
            print("#")
            print("# Replication Topology Health:")

        # Print health report
        print_list(sys.stdout, fmt, cols, rows)

    def _show_gtid_data(self):
        """Display the GTID lists from the servers.

        This method displays the three GTID lists for all of the servers.
        Each server is listed with its entries in each list. If a list has
        no entries, that list is not printed.
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: GTIDs are not supported on this "
                         "topology.", logging.WARN)
            return

        fmt = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print("#")
            print("# UUIDS for all servers:")
            print_list(sys.stdout, fmt, ['host', 'port', 'role', 'uuid'],
                       uuids)

        # Get GTID lists
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print("#")
            print("# Transactions executed on the server:")
            print_list(sys.stdout, fmt, _GTID_COLS, executed)
        if len(purged):
            print("#")
            print("# Transactions purged from the server:")
            print_list(sys.stdout, fmt, _GTID_COLS, purged)
        if len(owned):
            print("#")
            print("# Transactions owned by another server:")
            print_list(sys.stdout, fmt, _GTID_COLS, owned)

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses

        Returns bool - True = all references are consistent
        """
        uses_ip = hostname_is_ip(self.topology.master.host)
        for slave_dict in self.topology.slaves:
            slave = slave_dict['instance']
            if slave is not None:
                host_port = slave.get_master_host_port()
                host = None
                if host_port:
                    host = host_port[0]
                if (not host or uses_ip != hostname_is_ip(slave.host) or
                        uses_ip != hostname_is_ip(host)):
                    return False
        return True

    def _switchover(self):
        """Perform switchover from master to candidate slave

        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.

        Returns bool - True = no errors, False = errors reported.
        """
        # Check prerequisites - need valid candidate
        candidate = self.options.get("new_master", None)
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check that the new master is not the actual master
        if (self.topology.master.is_alias(candidate['host']) and
                self.master_vals['port'] == candidate['port']):
            err_msg = ERROR_SAME_MASTER.format(candidate['host'],
                                               candidate['port'],
                                               self.master_vals['host'],
                                               self.master_vals['port'])
            self._report(err_msg, logging.CRITICAL)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        # Can only check errant transactions if GTIDs are enabled.
        if self.topology.gtid_enabled():
            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                force = self.options.get('force')
                print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
                self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port,
                                               ", ".join(tnx_set)))
                    print("# {0}".format(errant_msg))
                    self._report(errant_msg, logging.ERROR, False)
                # Raise an exception (to stop) if tolerant mode is OFF
                if not force:
                    raise UtilRplError("{0} Note: If you want to ignore "
                                       "this issue, although not advised, "
                                       "please use the utility with the "
                                       "--force option."
                                       "".format(_ERRANT_TNX_ERROR))
        else:
            warn_msg = ("Errant transactions check skipped (GTID not "
                        "enabled for the whole topology).")
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)

        self._report(" ".join(["# Performing switchover from master at",
                               "%s:%s" % (self.master_vals['host'],
                                          self.master_vals['port']),
                               "to slave at %s:%s." %
                               (candidate['host'], candidate['port'])]))
        if not self.topology.switchover(candidate):
            self._report("# Errors found. Switchover aborted.",
                         logging.ERROR)
            return False

        return True

    def _elect_slave(self):
        """Perform best slave election

        This method determines which slave is the best candidate for
        GTID-enabled failover. If called for a non-GTID topology, a warning
        is issued.
        """
        if not self.topology.gtid_enabled():
            warn_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)
            return

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        candidates = self.options.get("candidates", None)
        if candidates is None or len(candidates) == 0:
            self._report("# Electing candidate slave from known slaves.")
        else:
            self._report("# Electing candidate slave from candidate list "
                         "then slaves list.")
        best_slave = self.topology.find_best_slave(candidates)
        if best_slave is None:
            self._report("ERROR: No slave found that meets eligibility "
                         "requirements.", logging.ERROR)
            return

        self._report("# Best slave found is located on %s:%s." %
                     (best_slave['host'], best_slave['port']))

    def _failover(self, strict=False, options=None):
        """Perform failover

        This method executes GTID-enabled failover. If called for a
        non-GTID topology, a warning is issued.

        strict[in]   if True, use only the candidate list for slave
                     election and fail if no candidates are viable.
                     Default = False
        options[in]  options dictionary.

        Returns bool - True = failover succeeded, False = errors found
        """
        if options is None:
            options = {}
        srv_list = self.topology.get_servers_with_gtid_not_on()
        if srv_list:
            err_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# ERROR: {0}".format(err_msg))
            self._report(err_msg, logging.ERROR, False)
            for srv in srv_list:
                msg = "# - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0],
                                                            srv[1])
                self._report(msg, logging.ERROR)
            self._report(err_msg, logging.CRITICAL, False)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            force = options.get('force')
            print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.ERROR, False)
            # Raise an exception (to stop) if tolerant mode is OFF
            if not force:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, although not advised, please "
                                   "use the utility with the --force "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("# Performing failover.")
        if not self.topology.failover(self.candidates, strict,
                                      stop_on_error=True):
            self._report("# Errors found.", logging.ERROR)
            return False
        return True

    def _check_master_info_type(self, halt=True):
        """Check for master information set to TABLE if rpl_user not
        provided

        halt[in]  if True, raise error on failure. Default is True

        Returns bool - True if rpl_user is specified or False if rpl_user
                       not specified and at least one slave does not have
                       --master-info-repository=TABLE.
        """
        error = ("You must specify either the --rpl-user or set all slaves "
                 "to use --master-info-repository=TABLE.")
        # Check for --master-info-repository=TABLE if rpl_user is None
        if self.rpl_user is None:
            if not self.topology.check_master_info_type("TABLE"):
                if halt:
                    raise UtilRplError(error)
                self._report(error, logging.ERROR)
                return False
        return True

    def check_host_references(self):
        """Public method to access self._check_host_references()."""
        return self._check_host_references()

    def execute_command(self, command, options=None):
        """Execute a replication admin command

        This method executes one of the valid replication administration
        commands as described above.

        command[in]  command to execute
        options[in]  options dictionary.

        Returns bool - True = success, raise error on failure
        """
        if options is None:
            options = {}
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = ("User %s on %s does not have sufficient privileges to "
                   "execute the %s command.")
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        self._report("Executing %s command..." % command, logging.INFO,
                     False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command == 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover(options=options)
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()

        self._report("# ...done.")

        return True

    def auto_failover(self, interval):
        """Automatic failover

        Wrapper method for running automatic failover. See
        run_auto_failover for details on implementation. This method
        ensures the registration/deregistration occurs regardless of
        exception or errors.

        interval[in]  time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        failover_mode = self.options.get("failover_mode", "auto")
        force = self.options.get("force", False)

        # Initialize a console
        console = FailoverConsole(self.topology.master,
                                  self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            for error in errors:
                msg = ("User {0} on {1}@{2} does not have sufficient "
                       "privileges to execute the {3} command "
                       "(required: {4}).").format(error[0], error[1],
                                                  error[2], 'failover',
                                                  error[3])
                print("# ERROR: {0}".format(msg))
                self._report(msg, logging.CRITICAL, False)
            raise UtilRplError("Not enough privileges to execute command.")

        # Unregister existing instances from slaves
        self._report("Unregistering existing instances from slaves.",
                     logging.INFO, False)
        console.unregister_slaves(self.topology)

        # Register instance
        self._report("Registering instance on master.", logging.INFO,
                     False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found "
                         "for master %s:%s." %
                         (self.topology.master.host,
                          self.topology.master.port), logging.WARN)
            print("If this is an error, restart the console with --force.")
            print("Failover mode changed to 'FAIL' for this instance.")
            sys.stdout.write("Console will start in 10 seconds.")
            sys.stdout.flush()
            for _ in range(9):
                time.sleep(1)
                sys.stdout.write('.')
                sys.stdout.flush()
            print("starting Console.")
            time.sleep(1)

        try:
            res = self.run_auto_failover(console, failover_mode)
        finally:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.",
                             logging.INFO, False)
                console.register_instance(True, False)
                self._report("Failover console stopped.", logging.INFO,
                             False)
            except:
                pass

        return res

    def auto_failover_as_daemon(self):
        """Automatic failover

        Wrapper method for running automatic failover as a daemon. This
        method ensures the registration/deregistration occurs regardless
        of exception or errors.

        Returns bool - True = success, raises exception on error
        """
        # Initialize failover daemon
        failover_daemon = FailoverDaemon(self)
        res = None
        try:
            action = self.options.get("daemon")
            if action == "start":
                res = failover_daemon.start()
            elif action == "stop":
                res = failover_daemon.stop()
            elif action == "restart":
                res = failover_daemon.restart()
            else:
                # Start failover daemon in foreground
                res = failover_daemon.start(detach_process=False)
        except:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.",
                             logging.INFO, False)
                failover_daemon.register_instance(True, False)
                self._report("Failover daemon stopped.", logging.INFO,
                             False)
            except:
                pass
        return res

    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses
        the FailoverConsole class from the failover_console.py to
        implement all user interface commands and uses the existing
        failover() method of this class to conduct failover.

        When the master goes down, the method can perform one of three
        actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]        instance of the failover console class.
        failover_mode[in]  failover mode to use ('auto', 'elect' or
                           'fail'). Default = 'auto'.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)
        fail_retry = self.options.get('fail_retry', None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = ("Topology must support global transaction ids and have "
                   "GTID_MODE=ON.")
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = ("Failover requires --master-info-repository=TABLE for "
                   "all slaves.")
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} "
                  "seconds.".format(WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the "
                                   "--pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port],
                                         self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed "
                                     "Ok. Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated.", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime
                # seconds and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # If user specified a master fail retry, wait for the
                # predetermined time and attempt to check the master again.
                if fail_retry is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master is still not reachable. Waiting for %s " \
                          "seconds to retry detection." % fail_retry
                    self._report(msg, logging.INFO, False)
                    time.sleep(fail_retry)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost
                # connection, try ping. This performs the timeout
                # threshold for detecting a down master. If still not
                # alive, try to reconnect and if connection fails after 3
                # attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            # Master is now connected again
                            failover = False
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master "
                                     "after 3 attempts.", logging.INFO)
                    else:
                        self._report("Master is Ok. Resuming watch.",
                                     logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or "
                             "unreachable.", logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass

                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and "
                                             "automatic failover is not "
                                             "enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)

                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print("\nFailover console will restart in 5 seconds.")
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host,
                                          self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from "
                             "slaves.", logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.",
                             logging.INFO, False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                    and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)
                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore "
                                       "this issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
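
# A minimal usage sketch (hypothetical connection values, not part of the
# original module): instantiate RplCommands and run a couple of the
# commands documented in the class docstring.
def _example_admin_session():
    example_master = {"host": "master-host", "port": 3306}
    example_slaves = [{"host": "slave-host-1", "port": 3306},
                      {"host": "slave-host-2", "port": 3306}]
    example_options = {"quiet": False, "logging": False, "verbosity": 0}
    rpl = RplCommands(example_master, example_slaves, example_options)
    rpl.execute_command("health")  # show topology health
    rpl.execute_command("gtid")    # show GTID status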
def skip_slaves_trx(gtid_set, slaves_cnx_val, options):
    """Skip transactions on slaves.

    This method skips the given transactions (GTID set) on all the
    specified slaves. That is, an empty transaction is injected for each
    GTID in the given set for each of the slaves. In case a slave already
    has an executed transaction for a given GTID then that GTID is ignored
    for this slave.

    gtid_set[in]        String representing the set of GTIDs to skip.
    slaves_cnx_val[in]  List of the dictionaries with the connection values
                        for each target slave.
    options[in]         Dictionary of options (dry_run, verbosity).

    Throws an UtilError exception if an error occurs during the execution.
    """
    verbosity = options.get('verbosity')
    dryrun = options.get('dry_run')

    # Connect to slaves.
    rpl_topology = Topology(None, slaves_cnx_val, options)

    # Check required privileges.
    errors = rpl_topology.check_privileges(skip_master=True)
    if errors:
        err_details = ''
        for err in errors:
            err_msg = ERROR_USER_WITHOUT_PRIVILEGES.format(
                user=err[0], host=err[1], port=err[2],
                operation='inject empty transactions',
                req_privileges=err[3])
            err_details = '{0}{1}\n'.format(err_details, err_msg)
        err_details = err_details.strip()
        raise UtilRplError("Not enough privileges.\n"
                           "{0}".format(err_details))

    # GTID must be enabled on all servers.
    srv_list = rpl_topology.get_servers_with_gtid_not_on()
    if srv_list:
        if verbosity:
            print("# Slaves with GTID not enabled:")
            for srv in srv_list:
                msg = "# - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0],
                                                            srv[1])
                print(msg)
        raise UtilRplError(_GTID_ON_REQ.format(action='Transaction skip'))

    if dryrun:
        print("#")
        print("# WARNING: Executing utility in dry run mode (read only).")

    # Get GTID set that can be skipped, i.e., not in GTID_EXECUTED.
    gtids_by_slave = rpl_topology.slaves_gtid_subtract_executed(gtid_set)

    # Output GTID set that will be skipped.
    print("#")
    print("# GTID set to be skipped for each server:")
    has_gtid_to_skip = False
    for host, port, gtids_to_skip in gtids_by_slave:
        if not gtids_to_skip:
            gtids_to_skip = 'None'
        else:
            # Set flag to indicate that there is at least one GTID to skip.
            has_gtid_to_skip = True
        print("# - {0}@{1}: {2}".format(host, port, gtids_to_skip))

    # Create dictionary to directly access the slaves instances.
    slaves_dict = rpl_topology.get_slaves_dict()

    # Skip transactions for the given list of slaves.
    print("#")
    if has_gtid_to_skip:
        for host, port, gtids_to_skip in gtids_by_slave:
            if gtids_to_skip:
                # Decompose GTID set into a list of single transactions.
                gtid_items = gtid_set_itemize(gtids_to_skip)
                dryrun_mark = '(dry run) ' if dryrun else ''
                print("# {0}Injecting empty transactions for '{1}:{2}'"
                      "...".format(dryrun_mark, host, port))
                slave_key = '{0}@{1}'.format(host, port)
                slave_srv = slaves_dict[slave_key]['instance']
                for uuid, trx_list in gtid_items:
                    for trx_num in trx_list:
                        trx_to_skip = '{0}:{1}'.format(uuid, trx_num)
                        if verbosity:
                            print("# - {0}".format(trx_to_skip))
                        if not dryrun:
                            # Inject empty transaction.
                            slave_srv.inject_empty_trx(
                                trx_to_skip, gtid_next_automatic=False)
                if not dryrun:
                    slave_srv.set_gtid_next_automatic()
    else:
        print("# No transaction to skip.")

    print("#\n#...done.\n#")
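
# A short usage sketch (hypothetical UUID, hosts and options, not part of
# the original module): preview skipping transactions 5-7 from one source
# UUID on two slaves with dry_run, then apply the skip for real.
def _example_skip_trx():
    example_gtid_set = "3e11fa47-71ca-11e1-9e33-c80aa9429562:5-7"
    example_slaves_cnx = [{"host": "slave-host-1", "port": 3306},
                          {"host": "slave-host-2", "port": 3306}]
    # Dry run first: reports what would be skipped without changing state.
    skip_slaves_trx(example_gtid_set, example_slaves_cnx,
                    {"dry_run": True, "verbosity": 1})
    # Apply: injects an empty transaction for each GTID not yet executed.
    skip_slaves_trx(example_gtid_set, example_slaves_cnx,
                    {"dry_run": False, "verbosity": 1})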
time.sleep(5) console.clear() failover = False # Execute post failover script self.topology.run_script(post_fail, False) # discover slaves if option was specified at startup elif self.options.get("discover", None) is not None \ and not first_pass: # Force refresh of health list if new slaves found if self.topology.discover_slaves(): console.list_data = None res = console.display_console() if res is not None: # None = normal timeout, keep going if not res: return False # Errors detected done = True # User has quit first_pass = False # Unregister instance self._report("Unregistering instance on master.", logging.INFO, False) console.register_instance(False, False) self._report("Failover console stopped.", logging.INFO, False) return True
class ReplicationMultiSource(Daemon):
    """Set up replication between a slave and multiple masters.

    This class implements multi-source replication using round-robin
    scheduling to set up replication between the slave and each master in
    turn. This class also implements a POSIX daemon.
    """
    def __init__(self, slave_vals, masters_vals, options):
        """Constructor.

        slave_vals[in]     Slave server connection dictionary.
        masters_vals[in]   List of master server connection dictionaries.
        options[in]        Options dictionary.
        """
        pidfile = options.get("pidfile", None)
        if pidfile is None:
            pidfile = "./rplms_daemon.pid"
        super(ReplicationMultiSource, self).__init__(pidfile)

        self.slave_vals = slave_vals
        self.masters_vals = masters_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.rpl_user = self.options.get("rpl_user", None)
        self.verbosity = options.get("verbosity", 0)
        self.interval = options.get("interval", 15)
        self.switchover_interval = options.get("switchover_interval", 60)
        self.format = self.options.get("format", False)
        self.topology = None
        self.report_values = [
            report.lower()
            for report in self.options["report_values"].split(",")
        ]

        # A sys.stdout copy, that can be used later to turn on/off stdout
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        self.daemon = options.get("daemon")
        if self.daemon:
            if self.daemon in ("start", "nodetach"):
                self._report("Starting multi-source replication daemon...",
                             logging.INFO, False)
            elif self.daemon == "stop":
                self._report("Stopping multi-source replication daemon...",
                             logging.INFO, False)
            else:
                self._report("Restarting multi-source replication daemon...",
                             logging.INFO, False)
            # Disable stdout
            sys.stdout = self.stdout_devnull
        else:
            self._report("# Starting multi-source replication...",
                         logging.INFO)
            print("# Press CTRL+C to quit.")

        # Check server versions
        try:
            self._check_server_versions()
        except UtilError as err:
            raise UtilRplError(err.errmsg)

        # Check user privileges
        try:
            self._check_privileges()
        except UtilError as err:
            msg = "Error checking user privileges: {0}".format(err.errmsg)
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(err.errmsg)

    @staticmethod
    def _reconnect_server(server, pingtime=3):
        """Try to reconnect to the server.

        This method tries to reconnect to the server and, if the connection
        fails after 3 attempts, returns False.

        server[in]      Server instance.
        pingtime[in]    Interval between connection attempts.
        """
        if server and server.is_alive():
            return True
        is_connected = False
        i = 0
        while i < 3:
            try:
                server.connect()
                is_connected = True
                break
            except UtilError:
                pass
            time.sleep(pingtime)
            i += 1
        return is_connected

    def _get_slave(self):
        """Get the slave server instance.

        Returns a Server instance of the slave from the replication
        topology.
        """
        return self.topology.slaves[0]["instance"]

    def _get_master(self):
        """Get the current master server instance.

        Returns a Server instance of the current master from the
        replication topology.
        """
        return self.topology.master
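    # Illustrative sketch (not part of the class): the retry loop in
    # _reconnect_server() can be exercised on its own. The Server instance
    # and connection values below are hypothetical.
    #
    #   from mysql.utilities.common.server import Server
    #   srv = Server({"conn_info": {"user": "root", "host": "host1",
    #                               "port": 3306}})
    #   # Returns True if the server is already alive or reconnects within
    #   # 3 attempts (waiting pingtime seconds between attempts), False
    #   # otherwise.
    #   if not ReplicationMultiSource._reconnect_server(srv, pingtime=3):
    #       print("server unreachable")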
""" if self.verbosity > 0: print("# Checking server versions.\n#") # Connection dictionary conn_dict = { "conn_info": None, "quiet": True, "verbose": self.verbosity > 0, } # Check masters version for master_vals in self.masters_vals: conn_dict["conn_info"] = master_vals master = Master(conn_dict) master.connect() if not master.check_version_compat(*_MIN_SERVER_VERSION): raise UtilRplError( ERROR_MIN_SERVER_VERSIONS.format( utility="mysqlrplms", min_version=".".join([str(val) for val in _MIN_SERVER_VERSION]), host=master.host, port=master.port ) ) master.disconnect() # Check slave version conn_dict["conn_info"] = self.slave_vals slave = Slave(conn_dict) slave.connect() if not slave.check_version_compat(*_MIN_SERVER_VERSION): raise UtilRplError( ERROR_MIN_SERVER_VERSIONS.format( utility="mysqlrplms", min_version=".".join([str(val) for val in _MIN_SERVER_VERSION]), host=slave.host, port=slave.port ) ) slave.disconnect() def _check_privileges(self): """Check required privileges to perform the multi-source replication. This method check if the used users for the slave and masters have the required privileges to perform the multi-source replication. The following privileges are required: - on slave: SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE AND GRANT OPTION; - on the master: SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE AND GRANT OPTION. An exception is thrown if users doesn't have enough privileges. """ if self.verbosity > 0: print("# Checking users privileges for replication.\n#") # Connection dictionary conn_dict = { "conn_info": None, "quiet": True, "verbose": self.verbosity > 0, } # Check privileges for master. master_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',), ('REPLICATION SLAVE',), ('GRANT OPTION',)] master_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE " "AND GRANT OPTION") for master_vals in self.masters_vals: conn_dict["conn_info"] = master_vals master = Master(conn_dict) master.connect() user_obj = User(master, "{0}@{1}".format(master.user, master.host)) for any_priv_tuple in master_priv: has_privilege = any( [user_obj.has_privilege('*', '*', priv) for priv in any_priv_tuple] ) if not has_privilege: msg = ERROR_USER_WITHOUT_PRIVILEGES.format( user=master.user, host=master.host, port=master.port, operation='perform replication', req_privileges=master_priv_str ) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) master.disconnect() # Check privileges for slave slave_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',), ('REPLICATION SLAVE',), ('GRANT OPTION',)] slave_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE " "AND GRANT OPTION") conn_dict["conn_info"] = self.slave_vals slave = Slave(conn_dict) slave.connect() user_obj = User(slave, "{0}@{1}".format(slave.user, slave.host)) for any_priv_tuple in slave_priv: has_privilege = any( [user_obj.has_privilege('*', '*', priv) for priv in any_priv_tuple] ) if not has_privilege: msg = ("User '{0}' on '{1}@{2}' does not have sufficient " "privileges to perform replication (required: {3})." "".format(slave.user, slave.host, slave.port, slave_priv_str)) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) slave.disconnect() def _check_host_references(self): """Check to see if using all host or all IP addresses. Returns bool - True = all references are consistent. 
""" uses_ip = hostname_is_ip(self.topology.master.host) slave = self._get_slave() host_port = slave.get_master_host_port() host = None if host_port: host = host_port[0] if (not host or uses_ip != hostname_is_ip(slave.host) or uses_ip != hostname_is_ip(host)): return False return True def _setup_replication(self, master_vals, use_rpl_setup=True): """Setup replication among a master and a slave. master_vals[in] Master server connection dictionary. use_rpl_setup[in] Use Replication.setup() if True otherwise use switch_master() on the slave. This is used to control the first pass in the masters round-robin scheduling. """ conn_options = { "src_name": "master", "dest_name": "slave", "version": "5.0.0", "unique": True, } (master, slave,) = connect_servers(master_vals, self.slave_vals, conn_options) rpl_options = self.options.copy() rpl_options["verbosity"] = self.verbosity > 0 # Start from beginning only on the first pass if rpl_options.get("from_beginning", False) and not use_rpl_setup: rpl_options["from_beginning"] = False # Create an instance of the replication object rpl = Replication(master, slave, rpl_options) if use_rpl_setup: # Check server ids errors = rpl.check_server_ids() for error in errors: self._report(error, logging.ERROR, True) # Check for server_id uniqueness errors = rpl.check_server_uuids() for error in errors: self._report(error, logging.ERROR, True) # Check InnoDB compatibility errors = rpl.check_innodb_compatibility(self.options) for error in errors: self._report(error, logging.ERROR, True) # Checking storage engines errors = rpl.check_storage_engines(self.options) for error in errors: self._report(error, logging.ERROR, True) # Check master for binary logging errors = rpl.check_master_binlog() if errors != []: raise UtilRplError(errors[0]) # Setup replication if not rpl.setup(self.rpl_user, 10): msg = "Cannot setup replication." self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) else: # Parse user and password (support login-paths) try: (r_user, r_pass,) = parse_user_password(self.rpl_user) except FormatError: raise UtilError(USER_PASSWORD_FORMAT.format("--rpl-user")) # Switch master and start slave slave.switch_master(master, r_user, r_pass) slave.start({'fetch': False}) # Disconnect from servers master.disconnect() slave.disconnect() def _switch_master(self, master_vals, use_rpl_setup=True): """Switches replication to a new master. This method stops replication with the old master if exists and starts the replication with a new one. master_vals[in] Master server connection dictionary. use_rpl_setup[in] Used to control the first pass in the masters round-robin scheduling. """ if self.topology: # Stop slave master = self._get_master() if master.is_alive(): master.disconnect() slave = self._get_slave() if not slave.is_alive() and not self._reconnect_server(slave): msg = "Failed to connect to the slave." self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) slave.stop() slave.disconnect() self._report("# Switching to master '{0}:{1}'." 
"".format(master_vals["host"], master_vals["port"]), logging.INFO, True) try: # Setup replication on the new master self._setup_replication(master_vals, use_rpl_setup) # Create a Topology object self.topology = Topology(master_vals, [self.slave_vals], self.options) except UtilError as err: msg = "Error while switching master: {0}".format(err.errmsg) self._report(msg, logging.CRITICAL, False) raise UtilRplError(err.errmsg) # Only works for GTID_MODE=ON if not self.topology.gtid_enabled(): msg = ("Topology must support global transaction ids and have " "GTID_MODE=ON.") self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) # Check for mixing IP and hostnames if not self._check_host_references(): print("# WARNING: {0}".format(HOST_IP_WARNING)) self._report(HOST_IP_WARNING, logging.WARN, False) def _report(self, message, level=logging.INFO, print_msg=True): """Log message if logging is on. This method will log the message presented if the log is turned on. Specifically, if options['log_file'] is not None. It will also print the message to stdout. message[in] Message to be printed. level[in] Level of message to log. Default = INFO. print_msg[in] If True, print the message to stdout. Default = True. """ # First, print the message. if print_msg and not self.quiet: print(message) # Now log message if logging turned on if self.logging: logging.log(int(level), message.strip("#").strip(" ")) def _format_health_data(self): """Return health data from topology. Returns tuple - (columns, rows). """ if self.topology: try: health_data = self.topology.get_health() current_master = self._get_master() # Get data for the remaining masters for master_vals in self.masters_vals: # Discard the current master if master_vals["host"] == current_master.host and \ master_vals["port"] == current_master.port: continue # Connect to the master conn_dict = { "conn_info": master_vals, "quiet": True, "verbose": self.verbosity > 0, } master = Master(conn_dict) master.connect() # Get master health rpl_health = master.check_rpl_health() master_data = [ master.host, master.port, "MASTER", get_server_state(master, master.host, 3, self.verbosity > 0), master.supports_gtid(), "OK" if rpl_health[0] else ", ".join(rpl_health[1]), ] # Get master status master_status = master.get_status() if len(master_status): master_log, master_log_pos = master_status[0][0:2] else: master_log = None master_log_pos = 0 # Show additional details if verbosity is turned on if self.verbosity > 0: master_data.extend([master.get_version(), master_log, master_log_pos, "", "", "", "", "", "", "", "", ""]) health_data[1].append(master_data) return health_data except UtilError as err: msg = "Cannot get health data: {0}".format(err) self._report(msg, logging.ERROR, False) raise UtilRplError(msg) return ([], []) def _format_uuid_data(self): """Return the server's uuids. Returns tuple - (columns, rows). """ if self.topology: try: return (_GEN_UUID_COLS, self.topology.get_server_uuids()) except UtilError as err: msg = "Cannot get UUID data: {0}".format(err) self._report(msg, logging.ERROR, False) raise UtilRplError(msg) return ([], []) def _format_gtid_data(self): """Return the GTID information from the topology. Returns tuple - (columns, rows). 
""" if self.topology: try: return (_GEN_GTID_COLS, self.topology.get_gtid_data()) except UtilError as err: msg = "Cannot get GTID data: {0}".format(err) self._report(msg, logging.ERROR, False) raise UtilRplError(msg) return ([], []) def _log_data(self, title, labels, data, print_format=True): """Helper method to log data. title[in] Title to log. labels[in] List of labels. data[in] List of data rows. """ self._report("# {0}".format(title), logging.INFO) for row in data: msg = ", ".join( ["{0}: {1}".format(*col) for col in zip(labels, row)] ) self._report("# {0}".format(msg), logging.INFO, False) if print_format: print_list(sys.stdout, self.format, labels, data) def _log_master_status(self, master): """Logs the master information. master[in] Master server instance. This method logs the master information from SHOW MASTER STATUS. """ # If no master present, don't print anything. if master is None: return print("#") self._report("# {0}:".format("Current Master Information"), logging.INFO) try: status = master.get_status()[0] except UtilError: msg = "Cannot get master status" self._report(msg, logging.ERROR, False) raise UtilRplError(msg) cols = ("Binary Log File", "Position", "Binlog_Do_DB", "Binlog_Ignore_DB") rows = (status[0] or "N/A", status[1] or "N/A", status[2] or "N/A", status[3] or "N/A") print_list(sys.stdout, self.format, cols, [rows]) self._report("# {0}".format( ", ".join(["{0}: {1}".format(*item) for item in zip(cols, rows)]), ), logging.INFO, False) # Display gtid executed set master_gtids = [] for gtid in status[4].split("\n"): if gtid: # Add each GTID to a tuple to match the required format to # print the full GRID list correctly. master_gtids.append((gtid.strip(","),)) try: if len(master_gtids) > 1: gtid_executed = "{0}[...]".format(master_gtids[0][0]) else: gtid_executed = master_gtids[0][0] except IndexError: gtid_executed = "None" self._report("# GTID Executed Set: {0}".format(gtid_executed), logging.INFO) def stop_replication(self): """Stops multi-source replication. Stop the slave if topology is available. """ if self.topology: # Get the slave instance slave = self._get_slave() # If slave is not connected, try to reconnect and stop replication if self._reconnect_server(slave): slave.stop() slave.disconnect() if self.daemon: self._report("Multi-source replication daemon stopped.", logging.INFO, False) else: print("") self._report("# Multi-source replication stopped.", logging.INFO, True) def stop(self): """Stops the daemon. Stop slave if topology is available and then stop the daemon. """ self.stop_replication() super(ReplicationMultiSource, self).stop() def run(self): """Run the multi-source replication using the round-robin scheduling. This method implements the multi-source replication by using time slices for each master. 
""" num_masters = len(self.masters_vals) use_rpl_setup = True # pylint: disable=R0101 while True: # Round-robin scheduling on the masters for idx in range(num_masters): # Get the new master values and switch for the next one try: master_vals = self.masters_vals[idx] self._switch_master(master_vals, use_rpl_setup) except UtilError as err: msg = ("Error while switching master: {0}" "".format(err.errmsg)) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) # Get the new master and slave instances master = self._get_master() slave = self._get_slave() switchover_timeout = time.time() + self.switchover_interval while switchover_timeout > time.time(): # If servers not connected, try to reconnect if not self._reconnect_server(master): msg = ("Failed to connect to the master '{0}:{1}'." "".format(master_vals["host"], master_vals["port"])) self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) if not self._reconnect_server(slave): msg = "Failed to connect to the slave." self._report(msg, logging.CRITICAL, False) raise UtilRplError(msg) # Report self._log_master_status(master) if "health" in self.report_values: (health_labels, health_data,) = \ self._format_health_data() if health_data: print("#") self._log_data("Health Status:", health_labels, health_data) if "gtid" in self.report_values: (gtid_labels, gtid_data,) = self._format_gtid_data() for i, row in enumerate(gtid_data): if row: print("#") self._log_data("GTID Status - {0}" "".format(_GTID_LISTS[i]), gtid_labels, row) if "uuid" in self.report_values: (uuid_labels, uuid_data,) = self._format_uuid_data() if uuid_data: print("#") self._log_data("UUID Status:", uuid_labels, uuid_data) # Disconnect servers master.disconnect() slave.disconnect() # Wait for reporting interval time.sleep(self.interval) # Use Replication.setup() only for the first round use_rpl_setup = False
class RplCommands(object):
    """Replication commands.

    This class supports the following replication commands.

    elect       - perform best slave election and report best slave
    failover    - conduct failover from master to best slave as specified
                  by the user. This option performs best slave election.
    gtid        - show status of global transaction id variables
    health      - display the replication health
    reset       - stop and reset all slaves
    start       - start all slaves
    stop        - stop all slaves
    switchover  - perform slave promotion as specified by the user to a
                  specific slave. Requires --master and the --candidate
                  options.
    """
    def __init__(self, master_vals, slave_vals, options,
                 skip_conn_err=True):
        """Constructor

        master_vals[in]    master server connection dictionary
        slave_vals[in]     list of slave server connection dictionaries
        options[in]        options dictionary
        skip_conn_err[in]  if True, do not fail on connection failure
                           Default = True
        """
        # A sys.stdout copy, that can be used later to turn on/off stdout
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        daemon = options.get("daemon")
        if daemon:
            if daemon in ("start", "nodetach"):
                print("Starting failover daemon...")
            elif daemon == "stop":
                print("Stopping failover daemon...")
            else:
                print("Restarting failover daemon...")
            # Disable stdout unless daemon is nodetach
            if daemon != "nodetach":
                sys.stdout = self.stdout_devnull

        self.master = None
        self.master_vals = master_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.candidates = self.options.get("candidates", None)
        self.verbose = self.options.get("verbose", None)
        self.rpl_user = self.options.get("rpl_user", None)
        self.ssl_ca = options.get("ssl_ca", None)
        self.ssl_cert = options.get("ssl_cert", None)
        self.ssl_key = options.get("ssl_key", None)
        self.ssl = False
        if self.ssl_ca or self.ssl_cert or self.ssl_key:
            self.ssl = True

        try:
            self.topology = Topology(master_vals, slave_vals, self.options,
                                     skip_conn_err)
        except Exception as err:
            if daemon and daemon != "nodetach":
                # Turn on sys.stdout
                sys.stdout = self.stdout_copy
            raise UtilRplError(str(err))

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on

        This method will log the message presented if the log is turned
        on. Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]    message to be printed
        level[in]      level of message to log. Default = INFO
        print_msg[in]  if True, print the message to stdout.
                       Default = True
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print message
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(' '))
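    # Illustrative sketch (not part of the class): _report() assumes the
    # caller configured the logging module when options['log_file'] is
    # set, e.g. with something like the following (hypothetical file
    # name):
    #
    #   import logging
    #   logging.basicConfig(filename="failover.log", level=logging.INFO,
    #                       format="%(asctime)s %(levelname)s %(message)s")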
    def _show_health(self):
        """Display the replication health of the topology.

        This includes the following for each server.

        - host       : host name
        - port       : connection port
        - role       : "MASTER" or "SLAVE"
        - state      : UP = connected, WARN = cannot connect but can ping,
                       DOWN = cannot connect nor ping
        - gtid       : ON = gtid supported and turned on, OFF = supported
                       but not enabled, NO = not supported
        - rpl_health : (master) binlog enabled,
                       (slave) IO thread is running, SQL thread is
                       running, no errors, slave delay < max_delay,
                       read log pos + max_position < master's log position
                       Note: Will show 'ERROR' if there are multiple
                       errors encountered, otherwise will display the
                       health check that failed.

        If verbosity is set, it will show the following additional
        information.

        (master)
          - server version, binary log file, position

        (slaves)
          - server version, master's binary log file, master's log
            position, IO_Thread, SQL_Thread, Secs_Behind,
            Remaining_Delay, IO_Error_Num, IO_Error
        """
        fmt = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()

        if not quiet:
            print "#"
            print "# Replication Topology Health:"

        # Print health report
        print_list(sys.stdout, fmt, cols, rows)

        return

    def _show_gtid_data(self):
        """Display the GTID lists from the servers.

        This method displays the three GTID lists for all of the servers.
        Each server is listed with its entries in each list. If a list has
        no entries, that list is not printed.
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: GTIDs are not supported on this "
                         "topology.", logging.WARN)
            return

        fmt = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print "#"
            print "# UUIDS for all servers:"
            print_list(sys.stdout, fmt, ['host', 'port', 'role', 'uuid'],
                       uuids)

        # Get GTID lists
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print "#"
            print "# Transactions executed on the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, executed)
        if len(purged):
            print "#"
            print "# Transactions purged from the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, purged)
        if len(owned):
            print "#"
            print "# Transactions owned by another server:"
            print_list(sys.stdout, fmt, _GTID_COLS, owned)

    def _check_host_references(self):
        """Check whether server references consistently use host names or
        IP addresses.

        Returns bool - True = all references are consistent
        """
        uses_ip = hostname_is_ip(self.topology.master.host)
        for slave_dict in self.topology.slaves:
            slave = slave_dict['instance']
            if slave is not None:
                host_port = slave.get_master_host_port()
                host = None
                if host_port:
                    host = host_port[0]
                if (not host or uses_ip != hostname_is_ip(slave.host) or
                   uses_ip != hostname_is_ip(host)):
                    return False
        return True
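    # Illustrative sketch (not part of the class):
    # _check_host_references() flags topologies that mix host names and IP
    # addresses, because slaves record their master via CHANGE MASTER TO
    # values and a literal comparison of '127.0.0.1' with 'localhost'
    # would not match. A rough stand-in for hostname_is_ip() could look
    # like this (simplified assumption; the real helper lives in
    # mysql.utilities.common.tools):
    #
    #   import re
    #   def hostname_is_ip(host):
    #       return re.match(r"^\d{1,3}(\.\d{1,3}){3}$", host) is not None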
    def _switchover(self):
        """Perform switchover from master to candidate slave

        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.

        Returns bool - True = no errors, False = errors reported.
        """
        # Check prerequisites - need valid candidate
        candidate = self.options.get("new_master", None)
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check that the new master is not the actual master
        if (self.topology.master.is_alias(candidate['host']) and
           self.master_vals['port'] == candidate['port']):
            err_msg = ERROR_SAME_MASTER.format(candidate['host'],
                                               candidate['port'],
                                               self.master_vals['host'],
                                               self.master_vals['port'])
            self._report(err_msg, logging.CRITICAL)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        # Can only check errant transactions if GTIDs are enabled.
        if self.topology.gtid_enabled():
            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                force = self.options.get('force')
                print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
                self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port,
                                               ", ".join(tnx_set)))
                    print("# {0}".format(errant_msg))
                    self._report(errant_msg, logging.ERROR, False)
                # Raise an exception (to stop) if tolerant mode is OFF
                if not force:
                    raise UtilRplError("{0} Note: If you want to ignore "
                                       "this issue, although not advised, "
                                       "please use the utility with the "
                                       "--force option."
                                       "".format(_ERRANT_TNX_ERROR))
        else:
            warn_msg = ("Errant transactions check skipped (GTID not "
                        "enabled for the whole topology).")
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)

        self._report(" ".join(["# Performing switchover from master at",
                               "%s:%s" % (self.master_vals['host'],
                                          self.master_vals['port']),
                               "to slave at %s:%s." %
                               (candidate['host'], candidate['port'])]))
        if not self.topology.switchover(candidate):
            self._report("# Errors found. Switchover aborted.",
                         logging.ERROR)
            return False

        return True

    def _elect_slave(self):
        """Perform best slave election

        This method determines which slave is the best candidate for
        GTID-enabled failover. If called for a non-GTID topology, a
        warning is issued.
        """
        if not self.topology.gtid_enabled():
            warn_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)
            return

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        candidates = self.options.get("candidates", None)
        if candidates is None or len(candidates) == 0:
            self._report("# Electing candidate slave from known slaves.")
        else:
            self._report("# Electing candidate slave from candidate list "
                         "then slaves list.")
        best_slave = self.topology.find_best_slave(candidates)
        if best_slave is None:
            self._report("ERROR: No slave found that meets eligibility "
                         "requirements.", logging.ERROR)
            return

        self._report("# Best slave found is located on %s:%s." %
                     (best_slave['host'], best_slave['port']))
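    # Illustrative sketch (not part of the class): an "errant transaction"
    # is a GTID present in a slave's gtid_executed set but absent from the
    # candidate master's. A simplified per-slave check under that
    # assumption, using hypothetical Server objects with exec_query():
    #
    #   def has_errant_tnx(slave, master):
    #       master_set = master.exec_query(
    #           "SELECT @@GLOBAL.gtid_executed")[0][0]
    #       res = slave.exec_query(
    #           "SELECT GTID_SUBTRACT(@@GLOBAL.gtid_executed, '{0}')"
    #           "".format(master_set))
    #       return bool(res[0][0])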
    def _failover(self, strict=False, options=None):
        """Perform failover

        This method executes GTID-enabled failover. If called for a
        non-GTID topology, a warning is issued.

        strict[in]     if True, use only the candidate list for slave
                       election and fail if no candidates are viable.
                       Default = False
        options[in]    options dictionary.

        Returns bool - True = failover succeeded, False = errors found
        """
        if options is None:
            options = {}
        srv_list = self.topology.get_servers_with_gtid_not_on()
        if srv_list:
            err_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# ERROR: {0}".format(err_msg))
            self._report(err_msg, logging.ERROR, False)
            for srv in srv_list:
                msg = "# - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0],
                                                            srv[1])
                self._report(msg, logging.ERROR)
            self._report(err_msg, logging.CRITICAL, False)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            force = options.get('force')
            print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.ERROR, False)
            # Raise an exception (to stop) if tolerant mode is OFF
            if not force:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, although not advised, please "
                                   "use the utility with the --force "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("# Performing failover.")
        if not self.topology.failover(self.candidates, strict,
                                      stop_on_error=True):
            self._report("# Errors found.", logging.ERROR)
            return False
        return True

    def _check_master_info_type(self, halt=True):
        """Check for master information set to TABLE if rpl_user not
        provided

        halt[in]    if True, raise error on failure. Default is True

        Returns bool - True if rpl_user is specified, or False if rpl_user
        is not specified and at least one slave does not have
        --master-info-repository=TABLE.
        """
        error = "You must specify either the --rpl-user option or set " + \
                "all slaves to use --master-info-repository=TABLE."
        # Check for --master-info-repository=TABLE if rpl_user is None
        if self.rpl_user is None:
            if not self.topology.check_master_info_type("TABLE"):
                if halt:
                    raise UtilRplError(error)
                self._report(error, logging.ERROR)
                return False
        return True

    def check_host_references(self):
        """Public method to access self._check_host_references().
        """
        return self._check_host_references()
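    # Illustrative sketch (not part of the class): when no --rpl-user is
    # given, the replication credentials must be recoverable from the
    # slaves themselves, which requires master-info-repository=TABLE. The
    # per-slave test boils down to a query like this (hypothetical Server
    # object with exec_query()):
    #
    #   res = slave.exec_query("SHOW VARIABLES LIKE "
    #                          "'master_info_repository'")
    #   uses_table = res and res[0][1].upper() == "TABLE"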
    def execute_command(self, command, options=None):
        """Execute a replication admin command

        This method executes one of the valid replication administration
        commands as described above.

        command[in]    command to execute
        options[in]    options dictionary.

        Returns bool - True = success, raise error on failure
        """
        if options is None:
            options = {}
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        self._report("Executing %s command..." % command, logging.INFO,
                     False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command == 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover(options=options)
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()

        self._report("# ...done.")

        return True

    def auto_failover(self, interval):
        """Automatic failover

        Wrapper method for running automatic failover. See
        run_auto_failover for details on the implementation. This method
        ensures the registration/deregistration occurs regardless of
        exceptions or errors.

        interval[in]   time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        failover_mode = self.options.get("failover_mode", "auto")
        force = self.options.get("force", False)

        # Initialize a console
        console = FailoverConsole(self.topology.master,
                                  self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            for error in errors:
                msg = ("User {0} on {1}@{2} does not have sufficient "
                       "privileges to execute the {3} command "
                       "(required: {4}).").format(error[0], error[1],
                                                  error[2], 'failover',
                                                  error[3])
                print("# ERROR: {0}".format(msg))
                self._report(msg, logging.CRITICAL, False)
            raise UtilRplError("Not enough privileges to execute command.")

        # Unregister existing instances from slaves
        self._report("Unregistering existing instances from slaves.",
                     logging.INFO, False)
        console.unregister_slaves(self.topology)

        # Register instance
        self._report("Registering instance on master.", logging.INFO,
                     False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found "
                         "for master %s:%s." %
                         (self.topology.master.host,
                          self.topology.master.port), logging.WARN)
            print "If this is an error, restart the console with --force."
            print "Failover mode changed to 'FAIL' for this instance."
            print "Console will start in 10 seconds.",
            sys.stdout.flush()
            i = 0
            while i < 9:
                time.sleep(1)
                sys.stdout.write('.')
                sys.stdout.flush()
                i += 1
            print "Starting console."
            time.sleep(1)

        try:
            res = self.run_auto_failover(console, failover_mode)
        except:
            raise
        finally:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.",
                             logging.INFO, False)
                console.register_instance(True, False)
                self._report("Failover console stopped.", logging.INFO,
                             False)
            except:
                pass

        return res
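    # Illustrative usage sketch (hypothetical connection values; in the
    # utilities these dictionaries are parsed from the command line):
    #
    #   master = {"user": "root", "host": "master1", "port": 3306}
    #   slaves = [{"user": "root", "host": "slave1", "port": 3306}]
    #   options = {"quiet": False, "logging": False, "verbosity": 0}
    #   rpl = RplCommands(master, slaves, options)
    #   rpl.execute_command("health")    # print the health report
    #   rpl.execute_command("gtid")      # show the GTID lists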
    def auto_failover_as_daemon(self):
        """Automatic failover as daemon

        Wrapper method for running automatic failover as a daemon.

        This method ensures the registration/deregistration occurs
        regardless of exception or errors.

        Returns bool - True = success, raises exception on error
        """
        # Initialize failover daemon
        failover_daemon = FailoverDaemon(self)
        res = None

        try:
            action = self.options.get("daemon")
            if action == "start":
                res = failover_daemon.start()
            elif action == "stop":
                res = failover_daemon.stop()
            elif action == "restart":
                res = failover_daemon.restart()
            else:
                # Start failover daemon in foreground
                res = failover_daemon.start(detach_process=False)
        except:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.",
                             logging.INFO, False)
                failover_daemon.register_instance(True, False)
                self._report("Failover daemon stopped.", logging.INFO, False)
            except:
                pass
            # Re-raise the original error after cleanup, as documented
            raise

        return res

    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three
        actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]        instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = ("Topology must support global transaction ids and have "
                   "GTID_MODE=ON.")
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = ("Failover requires --master-info-repository=TABLE for "
                   "all slaves.")
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
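            # Two detection paths follow: an external check script, whose
            # exit status decides (0 = master healthy, non-zero = initiate
            # failover), or the built-in probe, which pings the master and
            # retries the connection up to 3 times at pingtime-second
            # intervals before declaring it down.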
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated.", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost
                # connection, try ping. This performs the timeout threshold
                # for detecting a down master. If still not alive, try to
                # reconnect and if connection fails after 3 attempts,
                # failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1
                    if failover:
                        self._report("Failed to reconnect to the master "
                                     "after 3 attempts.", logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or "
                             "unreachable.", logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and "
                                             "automatic failover is not "
                                             "enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
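                # Failover succeeded: adopt the new master, re-point the
                # console at it, and rediscover slaves so the health view
                # reflects the new topology.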
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print("\nFailover console will restart in 5 seconds.")
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host,
                                          self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.",
                             logging.INFO, False)
                failover_mode = console.register_instance()

            # Discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)
                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore "
                                       "this issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
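    # A minimal sketch of an external --exec-fail check script honoring the
    # exit-status contract used by run_auto_failover above (0 = master is
    # healthy, non-zero = initiate failover). The monitor credentials and
    # the use of mysql.connector are assumptions for illustration, not part
    # of this module:
    #
    #   #!/usr/bin/env python
    #   import sys
    #   import mysql.connector
    #
    #   host, port = sys.argv[1], int(sys.argv[2])
    #   try:
    #       conn = mysql.connector.connect(host=host, port=port,
    #                                      user='monitor',
    #                                      password='secret',
    #                                      connection_timeout=3)
    #       conn.close()
    #       sys.exit(0)   # master reachable; avert failover
    #   except mysql.connector.Error:
    #       sys.exit(1)   # unreachable; trigger failover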
    def purge(self):
        """The Purge Method

        Determines the latest log file to purge among all the slaves, which
        becomes the target file to purge binary logs to, in case no other
        file is specified.
        """
        # Create a topology object to verify the connection between master
        # and slave servers.
        self.topology = Topology(self.master_cnx_val, self.slaves_cnx_val,
                                 self.options, skip_conn_err=False)
        self.master = self.topology.master
        self.slaves = self.topology.slaves

        # Check required privileges
        check_privileges(self.master, BINLOG_OP_PURGE,
                         ["SUPER", "REPLICATION SLAVE"],
                         BINLOG_OP_PURGE_DESC, self.verbosity, self._report)

        # Get binlog info
        binlog_file_name, active_binlog_file, active_binlog_index = (
            get_binlog_info(self.master, reporter=self._report,
                            server_name="master", verbosity=self.verbosity))

        # Verify this master has at least one slave.
        if not self.slaves:
            errormsg = (_CAN_NOT_VERIFY_SLAVES_STATUS.format(
                host=self.master.host, port=self.master.port))
            raise UtilError(errormsg)

        # Verify the given slaves are connected to this master.
        if self.slaves_cnx_val and self.slaves:
            for slave in self.slaves:
                slave['instance'].is_configured_for_master(self.master,
                                                           verify_state=False,
                                                           raise_error=True)
                # IO running verification for --slaves option
                if not slave['instance'].is_connected():
                    if self.verbosity:
                        self._report("# Slave '{0}:{1}' IO not running"
                                     "".format(slave['host'],
                                               slave['port']))
                    raise UtilError(_CAN_NOT_VERIFY_SLAVE_STATUS.format(
                        host=slave['host'], port=slave['port']))

        target_binlog_index = self.get_target_binlog_index(binlog_file_name)

        index_last_in_use = determine_purgeable_binlogs(
            active_binlog_index, self.slaves, reporter=self._report,
            verbosity=self.verbosity)

        self._purge(index_last_in_use, active_binlog_file, binlog_file_name,
                    target_binlog_index, server=self.master,
                    server_is_master=True)
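    # Illustrative usage sketch (hedged: the class name below is
    # hypothetical; it stands for whatever binlog-admin class defines this
    # purge() method and its master_cnx_val/slaves_cnx_val attributes):
    #
    #   purger = BinlogPurge(master_cnx_val, slaves_cnx_val, options)
    #   purger.purge()
    #
    # purge() raises UtilError when the master has no connected slaves, so
    # binary logs still needed by a replica are never removed.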