def execute_command(self, command, options=None):
    """Execute a replication admin command

    This method executes one of the valid replication administration
    commands as described above.

    command[in]        command to execute
    options[in]        options dictionary.

    Returns bool - True = success, raise error on failure
    """
    if options is None:
        options = {}
    # Raise error if command is not valid
    if command not in _VALID_COMMANDS:
        msg = "'%s' is not a valid command." % command
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)

    # Check privileges
    self._report("# Checking privileges.")
    # Only the commands that modify the topology need the full check.
    full_check = command in ['failover', 'elect', 'switchover']
    errors = self.topology.check_privileges(full_check)
    if errors:
        msg = "User %s on %s does not have sufficient privileges to " + \
              "execute the %s command."
        for error in errors:
            self._report(msg % (error[0], error[1], command),
                         logging.CRITICAL)
        raise UtilRplError("Not enough privileges to execute command.")

    self._report("Executing %s command..." % command, logging.INFO, False)

    # Execute the command
    if command in _SLAVE_COMMANDS:
        if command == 'reset':
            self.topology.run_cmd_on_slaves('stop')
        self.topology.run_cmd_on_slaves(command)
    # BUG FIX: was `command in 'gtid'`, a substring test that also
    # matched 'g', 't', 'i', 'd', 'gt', 'ti', 'id', etc.
    elif command == 'gtid':
        self._show_gtid_data()
    elif command == 'health':
        self._show_health()
    elif command == 'switchover':
        self._switchover()
    elif command == 'elect':
        self._elect_slave()
    elif command == 'failover':
        self._failover(options=options)
    else:
        msg = "Command '%s' is not implemented." % command
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)

    # Show the resulting topology health unless suppressed.
    if command in ['switchover', 'failover'] and \
       not self.options.get("no_health", False):
        self._show_health()

    self._report("# ...done.")
    return True
def daemonize(self):
    """Creates the daemon.

    It will fork a child process and then exit parent. By performing a
    double fork, set the current process's user id, change the current
    working directory, set the current numeric umask, redirect standard
    streams and write the pid to a file.
    """
    def redirect_stream(system_stream, target_stream):
        """Redirect a system stream to a specified file.

        system_stream[in]  stream to redirect (sys.stdin/stdout/stderr)
        target_stream[in]  file-like object to redirect to, or None to
                           redirect to os.devnull
        """
        if target_stream is None:
            target_f = os.open(os.devnull, os.O_RDWR)
        else:
            target_f = target_stream.fileno()
        # Point the system stream's descriptor at the target descriptor.
        os.dup2(target_f, system_stream.fileno())

    def fork_then_exit_parent(error_message):
        """Fork a child process, then exit the parent process.

        error_message[in]  message reported if the fork fails
        """
        try:
            pid = os.fork()
            if pid > 0:
                # Parent: exit immediately without running cleanup
                # handlers; the child carries on as the daemon.
                os._exit(0)  # pylint: disable=W0212
        except OSError as err:
            msg = "{0}: [{1}] {2}".format(error_message, err.errno,
                                          err.strerror)
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

    # Fork
    fork_then_exit_parent("Failed first fork.")
    try:
        # Detach from the controlling terminal and sanitize the process
        # environment before the second fork.
        os.setsid()
        os.chdir(self.chdir)
        os.umask(self.umask)
    except Exception as err:
        msg = "Unable to change directory ({0})".format(err)
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)

    # Double fork
    fork_then_exit_parent("Failed second fork.")

    # Redirect streams
    redirect_stream(sys.stdin, self.stdin)
    redirect_stream(sys.stdout, self.stdout)
    redirect_stream(sys.stderr, self.stderr)

    # write pidfile
    atexit.register(self.delete_pidfile)
    pid = str(os.getpid())
    try:
        with open(self.pidfile, "w") as f:
            f.write("{0}\n".format(pid))
    except IOError as err:
        msg = "Unable to write pidfile: {0}".format(err.strerror)
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)
def _failover(self, strict=False, options=None):
    """Perform failover

    This method executes GTID-enabled failover. If called for a non-GTID
    topology, a warning is issued.

    strict[in]     if True, use only the candidate list for slave
                   election and fail if no candidates are viable.
                   Default = False
    options[in]    options dictionary.

    Returns bool - True = failover succeeded, False = errors found
    """
    if options is None:
        options = {}

    # Failover requires GTID_MODE=ON on every server in the topology.
    no_gtid_servers = self.topology.get_servers_with_gtid_not_on()
    if no_gtid_servers:
        err_msg = _GTID_ON_REQ.format(action='Slave election')
        print("# ERROR: {0}".format(err_msg))
        self._report(err_msg, logging.ERROR, False)
        for srv_host, srv_port, gtid_mode in no_gtid_servers:
            self._report("# - GTID_MODE={0} on {1}:{2}".format(
                gtid_mode, srv_host, srv_port), logging.ERROR)
        self._report(err_msg, logging.CRITICAL, False)
        raise UtilRplError(err_msg)

    # Check for --master-info-repository=TABLE if rpl_user is None
    if not self._check_master_info_type():
        return False

    # Check existence of errant transactions on slaves
    errant_tnx = self.topology.find_errant_transactions()
    if errant_tnx:
        force = options.get('force')
        print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
        self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
        for host, port, tnx_set in errant_tnx:
            errant_msg = (" - For slave '{0}@{1}': "
                          "{2}".format(host, port, ", ".join(tnx_set)))
            print("# {0}".format(errant_msg))
            self._report(errant_msg, logging.ERROR, False)
        # Raise an exception (to stop) if tolerant mode is OFF
        if not force:
            raise UtilRplError("{0} Note: If you want to ignore this "
                               "issue, although not advised, please use "
                               "the utility with the --force option."
                               "".format(_ERRANT_TNX_ERROR))

    self._report("# Performing failover.")
    if not self.topology.failover(self.candidates, strict,
                                  stop_on_error=True):
        self._report("# Errors found.", logging.ERROR)
        return False
    return True
def _switch_master(self, master_vals, use_rpl_setup=True):
    """Switches replication to a new master.

    This method stops replication with the old master if exists and
    starts the replication with a new one.

    master_vals[in]      Master server connection dictionary.
    use_rpl_setup[in]    Used to control the first pass in the masters
                         round-robin scheduling.
    """
    if self.topology:
        # Stop slave
        old_master = self._get_master()
        if old_master.is_alive():
            old_master.disconnect()
        slave = self._get_slave()
        # A dead slave connection must be re-established before STOP
        # SLAVE can be issued.
        if not slave.is_alive() and not self._reconnect_server(slave):
            msg = "Failed to connect to the slave."
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(msg)
        slave.stop()
        slave.disconnect()

    self._report(
        "# Switching to master '{0}:{1}'."
        "".format(master_vals["host"], master_vals["port"]),
        logging.INFO, True)

    try:
        # Setup replication on the new master
        self._setup_replication(master_vals, use_rpl_setup)
        # Create a Topology object
        self.topology = Topology(master_vals, [self.slave_vals],
                                 self.options)
    except UtilError as err:
        msg = "Error while switching master: {0}".format(err.errmsg)
        self._report(msg, logging.CRITICAL, False)
        raise UtilRplError(err.errmsg)

    # Only works for GTID_MODE=ON
    if not self.topology.gtid_enabled():
        msg = ("Topology must support global transaction ids and have "
               "GTID_MODE=ON.")
        self._report(msg, logging.CRITICAL, False)
        raise UtilRplError(msg)

    # Check for mixing IP and hostnames
    if not self._check_host_references():
        print("# WARNING: {0}".format(HOST_IP_WARNING))
        self._report(HOST_IP_WARNING, logging.WARN, False)
def _switchover(self):
    """Perform switchover from master to candidate slave

    This method switches the role of master to a candidate slave. The
    candidate is specified via the --candidate option.

    Returns bool - True = no errors, False = errors reported.
    """
    # Check for --master-info-repository=TABLE if rpl_user is None
    if not self._check_master_info_type():
        return False

    # Check for mixing IP and hostnames
    if not self._check_host_references():
        # CONSISTENCY FIX: use the print() function form like the rest
        # of this module instead of a Python-2-only print statement
        # (identical output for a single argument).
        print("# WARNING: %s" % _HOST_IP_WARNING)
        self._report(_HOST_IP_WARNING, logging.WARN, False)

    # Check prerequisites - need valid candidate
    candidate = self.options.get("new_master", None)
    if candidate is None:
        msg = "No candidate specified."
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)

    self._report(" ".join([
        "# Performing switchover from master at",
        "%s:%s" % (self.master_vals['host'], self.master_vals['port']),
        "to slave at %s:%s." % (candidate['host'], candidate['port'])
    ]))
    if not self.topology.switchover(candidate):
        self._report("# Errors found. Switchover aborted.",
                     logging.ERROR)
        return False
    return True
def _format_gtid_data(self):
    """Get the formatted GTID data

    This method sets the member list_data to the GTID list to populate
    the list. A subsequent call to _print_list() displays the new list.
    """
    rows = []

    # Get GTID lists
    # Cycle self.gtid_list through 0..3; each call advances to the next
    # GTID list to display.
    self.gtid_list += 1
    if self.gtid_list > 3:
        self.gtid_list = 0
    if self.gtid_list == 0 and self.master_gtids:
        # List 0 is the master's executed GTID set.
        self.comment = _MASTER_GTID_LIST
        rows = self.master_gtids
    elif self.get_gtid_data:
        # Lists 1-3 come from the get_gtid_data callback; index into its
        # result with gtid_list - 1.
        try:
            gtid_data = self.get_gtid_data()
        except Exception as err:
            raise UtilRplError("Cannot get GTID data: {0}".format(err))
        self.comment = _GTID_LISTS[self.gtid_list - 1]
        rows = gtid_data[self.gtid_list - 1]
    # Reset the scrolling window so the new list shows from the top.
    self.start_list = 0
    self.end_list = len(rows)
    self.report_mode = 'G'
    # The master list has its own column headers; lists 1-3 share one
    # generic set.
    if self.gtid_list == 0:
        return (_MASTER_GTID_COLS, rows)
    else:
        return (_GEN_GTID_COLS, rows)
def _format_health_data(self):
    """Return health data from topology.

    Returns tuple - (columns, rows).
    """
    if self.topology:
        try:
            # Health rows for the current master and its slaves.
            health_data = self.topology.get_health()
            current_master = self._get_master()
            # Get data for the remaining masters
            for master_vals in self.masters_vals:
                # Discard the current master
                if master_vals["host"] == current_master.host and \
                   master_vals["port"] == current_master.port:
                    continue
                # Connect to the master
                conn_dict = {
                    "conn_info": master_vals,
                    "quiet": True,
                    "verbose": self.verbosity > 0,
                }
                master = Master(conn_dict)
                master.connect()
                # Get master health
                rpl_health = master.check_rpl_health()
                master_data = [
                    master.host,
                    master.port,
                    "MASTER",
                    get_server_state(master, master.host, 3,
                                     self.verbosity > 0),
                    master.supports_gtid(),
                    "OK" if rpl_health[0] else ", ".join(rpl_health[1]),
                ]
                # Get master status
                master_status = master.get_status()
                if len(master_status):
                    master_log, master_log_pos = master_status[0][0:2]
                else:
                    master_log = None
                    master_log_pos = 0
                # Show additional details if verbosity is turned on
                if self.verbosity > 0:
                    # Pad with empty strings so the row matches the
                    # verbose column layout used for the slave rows.
                    master_data.extend([
                        master.get_version(), master_log,
                        master_log_pos,
                        "", "", "", "", "", "", "", "", ""
                    ])
                # health_data[1] is the row list; append this master.
                health_data[1].append(master_data)
            return health_data
        except UtilError as err:
            msg = "Cannot get health data: {0}".format(err)
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)
    # No topology available: nothing to report.
    return ([], [])
def stop(self):
    """Stops the daemon.

    It will stop the daemon by sending a signal.SIGTERM to the process.
    """
    # Get the pid from the pidfile
    try:
        with open(self.pidfile, "rb") as pid_file:
            self.pid = int(pid_file.read().strip())
    except IOError:
        self._report("pidfile {0} does not exist.".format(self.pidfile),
                     logging.ERROR)
        return False
    except ValueError:
        self._report("Invalid pid in pidfile {0}.".format(self.pidfile),
                     logging.ERROR)
        return False

    # Kill the daemon process: keep signaling until it is gone, then
    # clean up the pidfile.
    try:
        while True:
            os.kill(self.pid, signal.SIGTERM)
            time.sleep(0.1)
    except OSError as err:
        strerror = err.strerror
        if err.errno == 3:
            # errno 3 == ESRCH: no such process — the daemon is down.
            if os.path.exists(self.pidfile):
                self.delete_pidfile()
        else:
            msg = "Unable to delete pidfile: {0}".format(strerror)
            self._report(msg, logging.ERROR)
            raise UtilRplError(msg)
    return True
def start(self, detach_process=True):
    """Starts the daemon.

    Runs the automatic failover, it will start the daemon if
    detach_process is True.
    """
    # Check failover instances running
    self.check_instance()

    if detach_process:
        # Check for a pidfile presence
        try:
            with open(self.pidfile, "rb") as pid_file:
                self.pid = int(pid_file.read().strip())
        except (IOError, SystemExit, ValueError):
            # Missing or unreadable pidfile means no daemon to collide
            # with.
            self.pid = None

        if self.pid:
            # Daemon already runs
            msg = ("pidfile {0} already exists. The daemon is already "
                   "running?".format(self.pidfile))
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Start the daemon
        self.daemonize()

    # Run automatic failover
    return self.run()
def _check_server_versions(self):
    """Checks the server versions.
    """
    if self.verbosity > 0:
        print("# Checking server versions.\n#")

    def verify(server):
        # Raise if the server is below the minimum supported version.
        if not server.check_version_compat(*_MIN_SERVER_VERSION):
            raise UtilRplError(
                ERROR_MIN_SERVER_VERSIONS.format(
                    utility="mysqlrplms",
                    min_version=".".join(
                        [str(val) for val in _MIN_SERVER_VERSION]),
                    host=server.host,
                    port=server.port
                )
            )

    # Connection dictionary
    conn_dict = {
        "conn_info": None,
        "quiet": True,
        "verbose": self.verbosity > 0,
    }

    # Check masters version
    for master_vals in self.masters_vals:
        conn_dict["conn_info"] = master_vals
        master = Master(conn_dict)
        master.connect()
        verify(master)
        master.disconnect()

    # Check slave version
    conn_dict["conn_info"] = self.slave_vals
    slave = Slave(conn_dict)
    slave.connect()
    verify(slave)
    slave.disconnect()
def delete_pidfile(self):
    """Remove the daemon's pidfile from disk.

    Raises UtilRplError if the file cannot be removed.
    """
    try:
        os.remove(self.pidfile)
    except (OSError, IOError) as error:
        msg = "Unable to delete pidfile: {0}".format(error.strerror)
        self._report(msg, logging.ERROR)
        raise UtilRplError(msg)
def rpl_test(self):
    """Execute test.
    """
    # Check rpl_user
    self.report_test("Replication user exists?")
    # The replication user can only be checked when the slave reports a
    # master in SHOW SLAVE STATUS.
    slave_status = self.rpl.slave.get_status()
    if slave_status is None or slave_status == []:
        raise UtilRplError("Slave is not connected to a master.")
    return self.rpl.master.check_rpl_user(slave_status[0][_RPL_USER],
                                          self.rpl.slave.host)
def binlog_enabled(self):
    """Check binary logging status for the client.

    Returns bool - True - binary logging is ON, False = OFF
    """
    result = self.show_server_variable("log_bin")
    if not result:
        raise UtilRplError("Cannot retrieve status of log_bin variable.")
    # The variable value is the second column of the first row; the
    # server reports "OFF" or "0" when binary logging is disabled.
    return result[0][1] not in ("OFF", "0")
def _log_master_status(self, master):
    """Logs the master information.

    master[in]     Master server instance.

    This method logs the master information from SHOW MASTER STATUS.
    """
    # If no master present, don't print anything.
    if master is None:
        return

    print("#")
    self._report("# {0}:".format("Current Master Information"),
                 logging.INFO)

    try:
        status = master.get_status()[0]
    except UtilError:
        msg = "Cannot get master status"
        self._report(msg, logging.ERROR, False)
        raise UtilRplError(msg)

    cols = ("Binary Log File", "Position", "Binlog_Do_DB",
            "Binlog_Ignore_DB")
    # Replace empty status fields with "N/A" for display.
    rows = tuple(value or "N/A" for value in status[0:4])
    print_list(sys.stdout, self.format, cols, [rows])
    self._report(
        "# {0}".format(
            ", ".join(["{0}: {1}".format(*item)
                       for item in zip(cols, rows)]),
        ), logging.INFO, False)

    # Display gtid executed set
    # Wrap each GTID in a tuple to match the required format to print
    # the full GTID list correctly.
    master_gtids = [(gtid.strip(","),)
                    for gtid in status[4].split("\n") if gtid]
    if not master_gtids:
        gtid_executed = "None"
    elif len(master_gtids) > 1:
        gtid_executed = "{0}[...]".format(master_gtids[0][0])
    else:
        gtid_executed = master_gtids[0][0]
    self._report("# GTID Executed Set: {0}".format(gtid_executed),
                 logging.INFO)
def fork_then_exit_parent(error_message):
    """Fork a child process, then exit the parent process.

    error_message[in]  message reported if the fork fails

    NOTE(review): this function takes no `self` parameter yet calls
    `self._report`; it only works as a closure nested inside a method
    where `self` is in scope (see daemonize()). Confirm it is not
    called at module level.
    """
    try:
        pid = os.fork()
        if pid > 0:
            # Parent: exit immediately without running cleanup handlers;
            # the child continues as the daemon process.
            os._exit(0)  # pylint: disable=W0212
    except OSError as err:
        msg = "{0}: [{1}] {2}".format(error_message, err.errno,
                                      err.strerror)
        self._report(msg, logging.CRITICAL)
        raise UtilRplError(msg)
def get_server_id(self):
    """Retrieve the server id.

    Returns int - server id.

    Raises UtilRplError if the value cannot be read from the server.
    """
    try:
        res = self.show_server_variable("server_id")
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit.
        raise UtilRplError("Cannot retrieve server id from "
                           "%s." % self.role)
    # The variable value is the second column of the first row.
    return int(res[0][1])
def _format_uuid_data(self):
    """Return the server's uuids.

    Returns tuple - (columns, rows).
    """
    # Without a topology there is nothing to report.
    if not self.topology:
        return ([], [])
    try:
        return (_GEN_UUID_COLS, self.topology.get_server_uuids())
    except UtilError as err:
        msg = "Cannot get UUID data: {0}".format(err)
        self._report(msg, logging.ERROR, False)
        raise UtilRplError(msg)
def _format_uuid_data(self):
    """Return the server's uuids.

    Returns tuple - (columns, rows)
    """
    # Without a registered data source there is nothing to report.
    if self.get_uuid_data is None:
        return ([], [])
    try:
        return (_GEN_UUID_COLS, self.get_uuid_data())
    except Exception as err:
        msg = "Cannot get UUID data: {0}".format(err)
        self._report(msg, logging.ERROR)
        raise UtilRplError(msg)
def _format_gtid_data(self):
    """Return the GTID information from the topology.

    Returns tuple - (columns, rows)
    """
    # Without a registered data source there is nothing to report.
    if self.get_gtid_data is None:
        return ([], [])
    try:
        return (_GEN_GTID_COLS, self.get_gtid_data())
    except Exception as err:
        msg = "Cannot get GTID data: {0}".format(err)
        self._report(msg, logging.ERROR)
        raise UtilRplError(msg)
def _format_health_data(self):
    """Return health data from topology.

    Returns tuple - (columns, rows)
    """
    # Without a registered data source there is nothing to report.
    if self.get_health_data is None:
        return ([], [])
    try:
        return self.get_health_data()
    except Exception as err:
        msg = "Cannot get health data: {0}".format(err)
        self._report(msg, logging.ERROR)
        raise UtilRplError(msg)
def _print_master_status(self):
    """Display the master information

    This method displays the master information from SHOW MASTER STATUS.
    """
    from mysql.utilities.common.format import format_tabular_list
    # If no master present, don't print anything.
    if self.master is None:
        return
    try:
        status = self.master.get_status()[0]
        if self.logging:
            logging.info("Master status: binlog: %s, position:%s" %
                         (status[0], status[1]))
    # NOTE(review): bare except also swallows KeyboardInterrupt and
    # SystemExit — consider narrowing to Exception.
    except:
        raise UtilRplError("Cannot get master status")
    # Python 2 print statements below; trailing commas suppress the
    # newline deliberately.
    print "Master Information"
    print "------------------"
    cols = ("Binary Log File", "Position", "Binlog_Do_DB",
            "Binlog_Ignore_DB")
    fmt_opts = {
        "print_header" : True,
        "separator" : None,
        "quiet" : True,
        "print_footer" : False,
    }
    # Truncate long binlog file names to keep the tabular layout intact.
    logfile = status[0][0:20] if len(status[0]) > 20 else status[0]
    rows = [(logfile, status[1], status[2], status[3])]
    format_tabular_list(sys.stdout, cols, rows, fmt_opts)

    # Display gtid executed set
    self.master_gtids = []
    for gtid in status[4].split("\n"):
        if len(gtid):
            # Add each GTID to a tuple to match the required format to
            # print the full GRID list correctly.
            self.master_gtids.append((gtid.strip(","),))
    print "\nGTID Executed Set"
    try:
        # Print the first GTID; stay on the same line so "[...]" can
        # follow when the set has more entries.
        print self.master_gtids[0][0],
    except IndexError:
        print "None",
    if len(self.master_gtids) > 1:
        print "[...]"
    else:
        print
    print
    # Keep the console line counter in sync with what was printed.
    self.rows_printed += 7
def get_server_uuid(self):
    """Retrieve the server uuid.

    Returns string - server uuid, or None if the variable is not set.

    Raises UtilRplError if the value cannot be read from the server.
    """
    try:
        res = self.show_server_variable("server_uuid")
        if res is None or res == []:
            return None
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit.
        raise UtilRplError("Cannot retrieve server_uuid from "
                           "%s." % self.role)
    # The variable value is the second column of the first row.
    return res[0][1]
def __init__(self, master_vals, slave_vals, options, skip_conn_err=True): """Constructor master_vals[in] master server connection dictionary slave_vals[in] list of slave server connection dictionaries options[in] options dictionary skip_conn_err[in] if True, do not fail on connection failure Default = True """ # A sys.stdout copy, that can be used later to turn on/off stdout self.stdout_copy = sys.stdout self.stdout_devnull = open(os.devnull, "w") # Disable stdout when running --daemon with start, stop or restart daemon = options.get("daemon") if daemon: if daemon in ("start", "nodetach"): print("Starting failover daemon...") elif daemon == "stop": print("Stopping failover daemon...") else: print("Restarting failover daemon...") # Disable stdout if daemon not nodetach if daemon != "nodetach": sys.stdout = self.stdout_devnull self.master = None self.master_vals = master_vals self.options = options self.quiet = self.options.get("quiet", False) self.logging = self.options.get("logging", False) self.candidates = self.options.get("candidates", None) self.verbose = self.options.get("verbose", None) self.rpl_user = self.options.get("rpl_user", None) self.ssl_ca = options.get("ssl_ca", None) self.ssl_cert = options.get("ssl_cert", None) self.ssl_key = options.get("ssl_key", None) if self.ssl_ca or self.ssl_cert or self.ssl_key: self.ssl = True try: self.topology = Topology(master_vals, slave_vals, self.options, skip_conn_err) except Exception as err: if daemon and daemon != "nodetach": # Turn on sys.stdout sys.stdout = self.stdout_copy raise UtilRplError(str(err))
def _log_master_status(self):
    """Logs the master information

    This method logs the master information from SHOW MASTER STATUS.
    """
    # If no master present, don't print anything.
    if self.master is None:
        return

    logging.info("Master Information")
    try:
        status = self.master.get_status()[0]
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt and SystemExit.
        msg = "Cannot get master status"
        self._report(msg, logging.ERROR)
        raise UtilRplError(msg)

    cols = ("Binary Log File", "Position", "Binlog_Do_DB",
            "Binlog_Ignore_DB")
    # Replace empty status fields with "N/A" for display.
    rows = (status[0] or "N/A", status[1] or "N/A", status[2] or "N/A",
            status[3] or "N/A")
    logging.info(
        ", ".join(["{0}: {1}".format(*item) for item in zip(cols, rows)])
    )

    # Display gtid executed set
    self.master_gtids = []
    for gtid in status[4].split("\n"):
        if gtid:
            # Add each GTID to a tuple to match the required format to
            # print the full GRID list correctly.
            self.master_gtids.append((gtid.strip(","),))
    try:
        if len(self.master_gtids) > 1:
            gtid_executed = "{0}[...]".format(self.master_gtids[0][0])
        else:
            gtid_executed = self.master_gtids[0][0]
    except IndexError:
        # Empty GTID set.
        gtid_executed = "None"
    logging.info("GTID Executed Set: {0}".format(gtid_executed))
def _format_health_data(self):
    """Get the formatted health data

    This method sets the member list_data to the health list to populate
    the list. A subsequent call to _print_list() displays the new list.
    """
    # Get health information
    if self.get_health_data is None:
        # No callback registered: nothing to display.
        return ([], [])
    try:
        health_data = self.get_health_data()
    except Exception as err:
        raise UtilRplError("Cannot get health data: {0}".format(err))
    # Reset the scrolling window over the freshly fetched rows
    # (health_data[1] holds the rows).
    self.start_list = 0
    self.end_list = len(health_data[1])
    self.report_mode = 'H'
    return health_data
def _check_master_info_type(self, halt=True):
    """Check for master information set to TABLE if rpl_user not provided

    halt[in]       if True, raise error on failure. Default is True

    Returns bool - True if rpl_user is specified or False if rpl_user
                   not specified and at least one slave does not have
                   --master-info-repository=TABLE.
    """
    error = "You must specify either the --rpl-user or set all slaves " + \
            "to use --master-info-repository=TABLE."
    # An explicit replication user makes the repository type irrelevant.
    if self.rpl_user is not None:
        return True
    # Check for --master-info-repository=TABLE if rpl_user is None
    if self.topology.check_master_info_type("TABLE"):
        return True
    if halt:
        raise UtilRplError(error)
    self._report(error, logging.ERROR)
    return False
def check_gtid_executed(self, operation="copy"):
    """Check to see if the gtid_executed variable is clear

    If the value is not clear, raise an error with appropriate
    instructions for the user to correct the issue.

    operation[in]  Name of the operation (copy, import, etc.)
                   default = copy
    """
    res = self.exec_query("SHOW GLOBAL VARIABLES LIKE 'gtid_executed'")[0]
    # An empty value means the target is safe for the GTID operation.
    if not res[1].strip():
        return
    raise UtilRplError(
        ("The {0} operation contains GTID statements "
         "that require the global gtid_executed system variable on the "
         "target to be empty (no value). The gtid_executed value must "
         "be reset by issuing a RESET MASTER command on the target "
         "prior to attempting the {0} operation. "
         "Once the global gtid_executed value is cleared, you may "
         "retry the {0}.").format(operation))
def check_gtid_version(self):
    """Determine if server supports latest GTID changes

    This method checks the server to ensure it contains the latest
    changes to the GTID variables (from version 5.6.9).

    Raises UtilRplError when errors occur.
    """
    problems = []
    if self.supports_gtid() != "ON":
        problems.append(" GTID is not enabled.")
    if not self.check_version_compat(5, 6, 9):
        problems.append(" Server version must be 5.6.9 or greater.")
    res = self.exec_query("SHOW VARIABLES LIKE 'gtid_executed'")
    if res == [] or res[0][0] != "gtid_executed":
        problems.append(" Missing gtid_executed system variable.")
    if problems:
        # Prepend the generic GTID error header for this server.
        details = "\n".join(problems)
        raise UtilRplError(
            "\n".join([_GTID_ERROR % (self.host, self.port), details]))
def _format_uuid_data(self):
    """Get the formatted UUID data

    This method sets the member list_data to the UUID list to populate
    the list. A subsequent call to _print_list() displays the new list.
    """
    uuid_rows = []
    # Get UUID information
    if self.get_uuid_data is not None:
        self.comment = _UUID_LIST
        try:
            uuid_rows = self.get_uuid_data()
        except Exception as err:
            raise UtilRplError("Cannot get UUID data: {0}".format(err))
    # Reset the scrolling window for the new list.
    self.start_list = 0
    self.end_list = len(uuid_rows)
    self.report_mode = 'U'
    return (_GEN_UUID_COLS, uuid_rows)
def start(self, detach_process=True):
    """Starts the daemon.

    Runs the automatic failover, it will start the daemon if
    detach_process is True.
    """
    # Check privileges
    self._report("# Checking privileges.")
    # A full privilege check is required for every mode except 'fail'.
    priv_errors = self.rpl.topology.check_privileges(self.mode != "fail")
    if priv_errors:
        fmt = ("User {0} on {1} does not have sufficient privileges to "
               "execute the {2} command.")
        for error in priv_errors:
            self._report(fmt.format(error[0], error[1], "failover"),
                         logging.CRITICAL)
        raise UtilRplError("Not enough privileges to execute command.")

    # Check failover instances running
    self.check_instance()

    # Start the daemon
    return super(FailoverDaemon, self).start(detach_process)