Example #1
class RplCommands(object):
    """Replication commands.

    This class supports the following replication commands.

    elect       - perform best slave election and report best slave
    failover    - conduct failover from master to best slave as specified
                  by the user. This option performs best slave election.
    gtid        - show status of global transaction id variables
    health      - display the replication health
    reset       - stop and reset all slaves
    start       - start all slaves
    stop        - stop all slaves
    switchover  - perform slave promotion as specified by the user to a
                  specific slave. Requires --master and the --candidate
                  options.
    """
    def __init__(self, master_vals, slave_vals, options, skip_conn_err=True):
        """Constructor

        master_vals[in]    master server connection dictionary
        slave_vals[in]     list of slave server connection dictionaries
        options[in]        options dictionary
        skip_conn_err[in]  if True, do not fail on connection failure
                           Default = True
        """
        from mysql.utilities.common.topology import Topology

        self.master = None
        self.master_vals = master_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.verbose = self.options.get("verbose", None)
        self.candidates = self.options.get("candidates", None)

        # Replace all local host IP addresses (i.e. 127.0.0.1) by localhost
        for candidate in self.candidates or []:
            if candidate['host'] == '127.0.0.1':
                candidate['host'] = 'localhost'
        for slave in slave_vals:
            if slave['host'] == '127.0.0.1':
                slave['host'] = 'localhost'

        self.rpl_user = self.options.get("rpl_user", None)
        self.topology = Topology(master_vals, slave_vals, self.options,
                                 skip_conn_err)

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on

        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]    message to be printed
        level[in]      level of message to log. Default = INFO
        print_msg[in]  if True, print the message to stdout. Default = True
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print message
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(' '))

    def _show_health(self):
        """Run a command on a list of slaves.

        This method will display the replication health of the topology. This
        includes the following for each server.

          - host       : host name
          - port       : connection port
          - role       : "MASTER" or "SLAVE"
          - state      : UP = connected, WARN = cannot connect but can ping,
                         DOWN = cannot connect nor ping
          - gtid       : ON = gtid supported and turned on, OFF = supported
                         but not enabled, NO = not supported
          - rpl_health : (master) binlog enabled,
                         (slave) IO thread is running, SQL thread is running,
                         no errors, slave delay < max_delay,
                         read log pos + max_position < master's log position
                         Note: Will show 'ERROR' if there are multiple
                         errors encountered otherwise will display the
                         health check that failed.

        If verbosity is set, it will show the following additional information.

          (master)
            - server version, binary log file, position

          (slaves)
            - server version, master's binary log file, master's log position,
              IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay,
              IO_Error_Num, IO_Error
        """
        from mysql.utilities.common.format import print_list

        format = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()

        if not quiet:
            print "#"
            print "# Replication Topology Health:"

        # Print health report
        print_list(sys.stdout, format, cols, rows)

        return

    def _show_gtid_data(self):
        """Display the GTID lists from the servers.

        This method displays the three GTID lists for all of the servers. Each
        server is listed with its entries in each list. If a list has no
        entries, that list is not printed.
        """
        from mysql.utilities.common.format import print_list

        if not self.topology.gtid_enabled():
            self._report(
                "# WARNING: GTIDs are not supported on this topology.",
                logging.WARN)
            return

        format = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print "#"
            print "# UUIDS for all servers:"
            print_list(sys.stdout, format, ['host', 'port', 'role', 'uuid'],
                       uuids)

        # Get GTID lists
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print "#"
            print "# Transactions executed on the server:"
            print_list(sys.stdout, format, _GTID_COLS, executed)
        if len(purged):
            print "#"
            print "# Transactions purged from the server:"
            print_list(sys.stdout, format, _GTID_COLS, purged)
        if len(owned):
            print "#"
            print "# Transactions owned by another server:"
            print_list(sys.stdout, format, _GTID_COLS, owned)

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses

        Returns bool - True = all references are consistent
        """
        from mysql.utilities.common.options import hostname_is_ip

        uses_ip = hostname_is_ip(self.topology.master.host)
        for slave_dict in self.topology.slaves:
            slave = slave_dict['instance']
            if slave is not None:
                host, port = slave.get_master_host_port()
                if uses_ip != hostname_is_ip(slave.host) or \
                   uses_ip != hostname_is_ip(host):
                    return False
        return True

    def _switchover(self):
        """Perform switchover from master to candidate slave

        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.

        Returns bool - True = no errors, False = errors reported.
        """
        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print "# WARNING: %s" % _HOST_IP_WARNING
            self._report(_HOST_IP_WARNING, logging.WARN, False)

        # Check prerequisites - need valid candidate
        candidate = self.options.get("new_master", None)
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        self._report(" ".join([
            "# Performing switchover from master at",
            "%s:%s" % (self.master_vals['host'], self.master_vals['port']),
            "to slave at %s:%s." % (candidate['host'], candidate['port'])
        ]))
        if not self.topology.switchover(candidate):
            self._report("# Errors found. Switchover aborted.", logging.ERROR)
            return False

        return True

    def _elect_slave(self):
        """Perform best slave election

        This method determines which slave is the best candidate for
        GTID-enabled failover. If called for a non-GTID topology, a warning
        is issued.
        """
        if not self.topology.gtid_enabled():
            self._report(
                "# WARNING: slave election requires GTID_MODE=ON "
                "for all servers.", logging.WARN)
            return

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print "# WARNING: %s" % _HOST_IP_WARNING
            self._report(_HOST_IP_WARNING, logging.WARN, False)

        candidates = self.options.get("candidates", None)
        if candidates is None or len(candidates) == 0:
            self._report("# Electing candidate slave from known slaves.")
        else:
            self._report("# Electing candidate slave from candidate list "
                         "then slaves list.")
        best_slave = self.topology.find_best_slave(candidates)
        if best_slave is None:
            self._report(
                "ERROR: No slave found that meets eligilibility "
                "requirements.", logging.ERROR)
            return

        self._report("# Best slave found is located on %s:%s." %
                     (best_slave['host'], best_slave['port']))

    def _failover(self, strict=False):
        """Perform failover

        This method executes GTID-enabled failover. If called for a non-GTID
        topology, a warning is issued.

        strict[in]     if True, use only the candidate list for slave
                       election and fail if no candidates are viable.
                       Default = False

        Returns bool - True = failover succeeded, False = errors found
        """
        if not self.topology.gtid_enabled():
            self._report(
                "# WARNING: slave election requires GTID_MODE=ON "
                "for all servers.", logging.WARN)
            return False

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        self._report("# Performing failover.")
        if not self.topology.failover(self.candidates, strict):
            self._report("# Errors found.", logging.ERROR)
            return False
        return True

    def _check_master_info_type(self, halt=True):
        """Check for master information set to TABLE if rpl_user not provided

        halt[in]       if True, raise error on failure. Default is True

        Returns bool - True if rpl_user is specified or False if rpl_user not
                       specified and at least one slave does not have
                       --master-info-repository=TABLE.
        """
        error = "You must specify either the --rpl-user or set all slaves " + \
                "to use --master-info-repository=TABLE."
        # Check for --master-info-repository=TABLE if rpl_user is None
        if self.rpl_user is None:
            if not self.topology.check_master_info_type("TABLE"):
                if halt:
                    raise UtilRplError(error)
                self._report(error, logging.ERROR)
                return False
        return True

    def execute_command(self, command):
        """Execute a replication admin command

        This method executes one of the valid replication administration
        commands as described above.

        command[in]        command to execute

        Returns bool - True = success, raise error on failure
        """
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        self._report("Executing %s command..." % command, logging.INFO, False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command == 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover()
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()

        self._report("# ...done.")

        return True

    def auto_failover(self, interval):
        """Automatic failover

        Wrapper class for running automatic failover. See
        run_automatic_failover for details on implementation.

        This method ensures the registration/deregistration occurs
        regardless of exception or errors.

        interval[in]   time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        import time
        from mysql.utilities.command.failover_console import FailoverConsole

        failover_mode = self.options.get("failover_mode", "auto")
        force = self.options.get("force", False)

        # Initialize a console
        console = FailoverConsole(self.topology.master,
                                  self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids, self.options)

        # Register instance
        self._report("Registering instance on master.", logging.INFO, False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report(
                "Multiple instances of failover console found for "
                "master %s:%s." %
                (self.topology.master.host, self.topology.master.port),
                logging.WARN)
            print "If this is an error, restart the console with --force. "
            print "Failover mode changed to 'FAIL' for this instance. "
            print "Console will start in 10 seconds.",
            sys.stdout.flush()
            for i in range(0, 9):
                time.sleep(1)
                sys.stdout.write('.')
                sys.stdout.flush()
            print "starting Console."
            time.sleep(1)

        try:
            res = self.run_auto_failover(console, interval)
        except:
            raise
        finally:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.", logging.INFO,
                             False)
                console.register_instance(False, False)
                self._report("Failover console stopped.", logging.INFO, False)
            except:
                pass

        return res

    def run_auto_failover(self, console, interval):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class
        interval[in]   time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        import time
        from mysql.utilities.common.tools import ping_host
        from mysql.utilities.common.tools import execute_script

        failover_mode = self.options.get("failover_mode", "auto")
        pingtime = self.options.get("pingtime", 3)
        timeout = int(self.options.get("timeout", 300))
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], 'failover'),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print "# WARNING: %s" % _HOST_IP_WARNING
            self._report(_HOST_IP_WARNING, logging.WARN, False)
            print "#\n# Failover console will start in 10 seconds."
            time.sleep(10)

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = "Failover check script cannot be found. Please " + \
                           "check the path and filename for accuracy and " + \
                           "restart the failover console."
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None, [old_host, old_port],
                                         self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report(
                            "# Failover check script failed. "
                            "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        self._report("Cannot reconnect to master.",
                                     logging.INFO, False)

                # Check the master again. If no connection or lost connection,
                # try ping and if still not alive, failover. This performs the
                # timeout threshold for detecting a down master.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % \
                          "Master has failed and automatic failover is not enabled. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % "An error was encountered " + \
                          "during failover. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(
                    post_fail, False,
                    [old_host, old_port, self.master.host, self.master.port])

            # discover slaves if option was specified at startup
            elif self.options.get("discover", None) is not None and \
                (not first_pass or self.options.get("rediscover", False)):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            res = console.display_console()
            if res is not None:  # None = normal timeout, keep going
                if not res:
                    return False  # Errors detected
                done = True  # User has quit
            first_pass = False

        return True
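
The listing above is only the class definition; a minimal driver sketch may help show how it is meant to be used. The snippet below is an illustration, not part of the original module: it assumes it runs in the same module as RplCommands (so Topology, print_list and the module constants are in scope), that logging has been configured, and that the connection dictionaries also carry 'user' and 'passwd' keys for the connector layer. Only 'host' and 'port' are read directly by the code shown, so the host names and credentials are placeholders.

import logging

logging.basicConfig(filename='rpladmin.log', level=logging.INFO)

# Placeholder connection dictionaries (only 'host' and 'port' are accessed
# by RplCommands itself).
master_vals = {'host': 'db-master.example.com', 'port': 3306,
               'user': 'root', 'passwd': 'secret'}
slave_vals = [
    {'host': 'db-slave1.example.com', 'port': 3306,
     'user': 'root', 'passwd': 'secret'},
    {'host': 'db-slave2.example.com', 'port': 3306,
     'user': 'root', 'passwd': 'secret'},
]

# Option keys mirror the self.options.get(...) calls made by the class.
options = {
    'quiet': False,            # print status messages to stdout
    'logging': True,           # mirror messages to the logging module
    'format': 'grid',          # layout handed to print_list()
    'rpl_user': 'rpl:rplpass',     # any non-None value satisfies
                                   # _check_master_info_type()
    'candidates': slave_vals,      # preferred slaves for elect/failover
}

rpl = RplCommands(master_vals, slave_vals, options)
rpl.execute_command('health')      # any entry of _VALID_COMMANDS works here
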
Example #2
class RplCommands(object):
    """Replication commands.

    This class supports the following replication commands.

    elect       - perform best slave election and report best slave
    failover    - conduct failover from master to best slave as specified
                  by the user. This option performs best slave election.
    gtid        - show status of global transaction id variables
    health      - display the replication health
    reset       - stop and reset all slaves
    start       - start all slaves
    stop        - stop all slaves
    switchover  - perform slave promotion as specified by the user to a
                  specific slave. Requires --master and the --candidate
                  options.
    """

    def __init__(self, master_vals, slave_vals, options,
                 skip_conn_err=True):
        """Constructor

        master_vals[in]    master server connection dictionary
        slave_vals[in]     list of slave server connection dictionaries
        options[in]        options dictionary
        skip_conn_err[in]  if True, do not fail on connection failure
                           Default = True
        """
        # A sys.stdout copy, that can be used later to turn on/off stdout
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        daemon = options.get("daemon")
        if daemon:
            if daemon in ("start", "nodetach"):
                print("Starting failover daemon...")
            elif daemon == "stop":
                print("Stopping failover daemon...")
            else:
                print("Restarting failover daemon...")
            # Disable stdout if daemon not nodetach
            if daemon != "nodetach":
                sys.stdout = self.stdout_devnull

        self.master = None
        self.master_vals = master_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.candidates = self.options.get("candidates", None)
        self.verbose = self.options.get("verbose", None)
        self.rpl_user = self.options.get("rpl_user", None)
        self.ssl_ca = options.get("ssl_ca", None)
        self.ssl_cert = options.get("ssl_cert", None)
        self.ssl_key = options.get("ssl_key", None)
        # SSL is considered enabled when any SSL option is provided
        self.ssl = False
        if self.ssl_ca or self.ssl_cert or self.ssl_key:
            self.ssl = True

        try:
            self.topology = Topology(master_vals, slave_vals, self.options,
                                     skip_conn_err)
        except Exception as err:
            if daemon and daemon != "nodetach":
                # Turn on sys.stdout
                sys.stdout = self.stdout_copy
            raise UtilRplError(str(err))

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on

        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]    message to be printed
        level[in]      level of message to log. Default = INFO
        print_msg[in]  if True, print the message to stdout. Default = True
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print message
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(' '))

    def _show_health(self):
        """Run a command on a list of slaves.

        This method will display the replication health of the topology. This
        includes the following for each server.

          - host       : host name
          - port       : connection port
          - role       : "MASTER" or "SLAVE"
          - state      : UP = connected, WARN = cannot connect but can ping,
                         DOWN = cannot connect nor ping
          - gtid       : ON = gtid supported and turned on, OFF = supported
                         but not enabled, NO = not supported
          - rpl_health : (master) binlog enabled,
                         (slave) IO thread is running, SQL thread is running,
                         no errors, slave delay < max_delay,
                         read log pos + max_position < master's log position
                         Note: Will show 'ERROR' if there are multiple
                         errors encountered otherwise will display the
                         health check that failed.

        If verbosity is set, it will show the following additional information.

          (master)
            - server version, binary log file, position

          (slaves)
            - server version, master's binary log file, master's log position,
              IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay,
              IO_Error_Num, IO_Error
        """
        fmt = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()

        if not quiet:
            print "#"
            print "# Replication Topology Health:"

        # Print health report
        print_list(sys.stdout, fmt, cols, rows)

        return

    def _show_gtid_data(self):
        """Display the GTID lists from the servers.

        This method displays the three GTID lists for all of the servers. Each
        server is listed with its entries in each list. If a list has no
        entries, that list is not printed.
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: GTIDs are not supported on this "
                         "topology.", logging.WARN)
            return

        fmt = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print "#"
            print "# UUIDS for all servers:"
            print_list(sys.stdout, fmt, ['host', 'port', 'role', 'uuid'],
                       uuids)

        # Get GTID lists
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print "#"
            print "# Transactions executed on the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, executed)
        if len(purged):
            print "#"
            print "# Transactions purged from the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, purged)
        if len(owned):
            print "#"
            print "# Transactions owned by another server:"
            print_list(sys.stdout, fmt, _GTID_COLS, owned)

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses

        Returns bool - True = all references are consistent
        """

        uses_ip = hostname_is_ip(self.topology.master.host)
        for slave_dict in self.topology.slaves:
            slave = slave_dict['instance']
            if slave is not None:
                host_port = slave.get_master_host_port()
                host = None
                if host_port:
                    host = host_port[0]
                if (not host or uses_ip != hostname_is_ip(slave.host) or
                   uses_ip != hostname_is_ip(host)):
                    return False
        return True

    def _switchover(self):
        """Perform switchover from master to candidate slave

        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.

        Returns bool - True = no errors, False = errors reported.
        """
        # Check new master is not actual master - need valid candidate
        candidate = self.options.get("new_master", None)
        if (candidate is not None and
                self.topology.master.is_alias(candidate['host']) and
                self.master_vals['port'] == candidate['port']):
            err_msg = ERROR_SAME_MASTER.format(candidate['host'],
                                               candidate['port'],
                                               self.master_vals['host'],
                                               self.master_vals['port'])
            self._report(err_msg, logging.WARN)
            self._report(err_msg, logging.CRITICAL)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        # Check prerequisites
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Can only check errant transactions if GTIDs are enabled.
        if self.topology.gtid_enabled():
            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                force = self.options.get('force')
                print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
                self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port, ", ".join(tnx_set)))
                    print("# {0}".format(errant_msg))
                    self._report(errant_msg, logging.ERROR, False)
                # Raise an exception (to stop) if tolerant mode is OFF
                if not force:
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, although not advised, please "
                                       "use the utility with the --force "
                                       "option.".format(_ERRANT_TNX_ERROR))
        else:
            warn_msg = ("Errant transactions check skipped (GTID not enabled "
                        "for the whole topology).")
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)

        self._report(" ".join(["# Performing switchover from master at",
                     "%s:%s" % (self.master_vals['host'],
                                self.master_vals['port']),
                               "to slave at %s:%s." %
                               (candidate['host'], candidate['port'])]))
        if not self.topology.switchover(candidate):
            self._report("# Errors found. Switchover aborted.", logging.ERROR)
            return False

        return True

    def _elect_slave(self):
        """Perform best slave election

        This method determines which slave is the best candidate for
        GTID-enabled failover. If called for a non-GTID topology, a warning
        is issued.
        """
        if not self.topology.gtid_enabled():
            warn_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)
            return

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        candidates = self.options.get("candidates", None)
        if candidates is None or len(candidates) == 0:
            self._report("# Electing candidate slave from known slaves.")
        else:
            self._report("# Electing candidate slave from candidate list "
                         "then slaves list.")
        best_slave = self.topology.find_best_slave(candidates)
        if best_slave is None:
            self._report("ERROR: No slave found that meets eligilibility "
                         "requirements.", logging.ERROR)
            return

        self._report("# Best slave found is located on %s:%s." %
                     (best_slave['host'], best_slave['port']))

    def _failover(self, strict=False, options=None):
        """Perform failover

        This method executes GTID-enabled failover. If called for a non-GTID
        topology, a warning is issued.

        strict[in]     if True, use only the candidate list for slave
                       election and fail if no candidates are viable.
                       Default = False
        options[in]    options dictionary.

        Returns bool - True = failover succeeded, False = errors found
        """
        if options is None:
            options = {}
        srv_list = self.topology.get_servers_with_gtid_not_on()
        if srv_list:
            err_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# ERROR: {0}".format(err_msg))
            self._report(err_msg, logging.ERROR, False)
            for srv in srv_list:
                msg = "#  - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0],
                                                             srv[1])
                self._report(msg, logging.ERROR)

            self._report(err_msg, logging.CRITICAL, False)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            force = options.get('force')
            print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.ERROR, False)
            # Raise an exception (to stop) if tolerant mode is OFF
            if not force:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, although not advised, please use "
                                   "the utility with the --force option."
                                   "".format(_ERRANT_TNX_ERROR))

        self._report("# Performing failover.")
        if not self.topology.failover(self.candidates, strict,
                                      stop_on_error=True):
            self._report("# Errors found.", logging.ERROR)
            return False
        return True

    def _check_master_info_type(self, halt=True):
        """Check for master information set to TABLE if rpl_user not provided

        halt[in]       if True, raise error on failure. Default is True

        Returns bool - True if rpl_user is specified or False if rpl_user not
                       specified and at least one slave does not have
                       --master-info-repository=TABLE.
        """
        error = "You must specify either the --rpl-user or set all slaves " + \
                "to use --master-info-repository=TABLE."
        # Check for --master-info-repository=TABLE if rpl_user is None
        if self.rpl_user is None:
            if not self.topology.check_master_info_type("TABLE"):
                if halt:
                    raise UtilRplError(error)
                self._report(error, logging.ERROR)
                return False
        return True

    def check_host_references(self):
        """Public method to access self.check_host_references()
        """
        return self._check_host_references()

    def execute_command(self, command, options=None):
        """Execute a replication admin command

        This method executes one of the valid replication administration
        commands as described above.

        command[in]        command to execute
        options[in]        options dictionary.

        Returns bool - True = success, raise error on failure
        """
        if options is None:
            options = {}
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        self._report("Executing %s command..." % command, logging.INFO, False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command == 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover(options=options)
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()

        self._report("# ...done.")

        return True

    def auto_failover(self, interval):
        """Automatic failover

        Wrapper class for running automatic failover. See
        run_automatic_failover for details on implementation.

        This method ensures the registration/deregistration occurs
        regardless of exception or errors.

        interval[in]   time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        failover_mode = self.options.get("failover_mode", "auto")
        force = self.options.get("force", False)

        # Initialize a console
        console = FailoverConsole(self.topology.master,
                                  self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            for error in errors:
                msg = ("User {0} on {1}@{2} does not have sufficient "
                       "privileges to execute the {3} command "
                       "(required: {4}).").format(error[0], error[1], error[2],
                                                  'failover', error[3])
                print("# ERROR: {0}".format(msg))
                self._report(msg, logging.CRITICAL, False)
            raise UtilRplError("Not enough privileges to execute command.")

        # Unregister existing instances from slaves
        self._report("Unregistering existing instances from slaves.",
                     logging.INFO, False)
        console.unregister_slaves(self.topology)

        # Register instance
        self._report("Registering instance on master.", logging.INFO, False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found for "
                         "master %s:%s." % (self.topology.master.host,
                                            self.topology.master.port),
                         logging.WARN)
            print "If this is an error, restart the console with --force. "
            print "Failover mode changed to 'FAIL' for this instance. "
            print "Console will start in 10 seconds.",
            sys.stdout.flush()
            i = 0
            while i < 9:
                time.sleep(1)
                sys.stdout.write('.')
                sys.stdout.flush()
                i += 1
            print "starting Console."
            time.sleep(1)

        try:
            res = self.run_auto_failover(console, failover_mode)
        except:
            raise
        finally:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.", logging.INFO,
                             False)
                console.register_instance(True, False)
                self._report("Failover console stopped.", logging.INFO, False)
            except:
                pass

        return res

    def auto_failover_as_daemon(self):
        """Automatic failover

        Wrapper class for running automatic failover as daemon.

        This method ensures the registration/deregistration occurs
        regardless of exception or errors.

        Returns bool - True = success, raises exception on error
        """
        # Initialize failover daemon
        failover_daemon = FailoverDaemon(self)
        res = None

        try:
            action = self.options.get("daemon")
            if action == "start":
                res = failover_daemon.start()
            elif action == "stop":
                res = failover_daemon.stop()
            elif action == "restart":
                res = failover_daemon.restart()
            else:
                # Start failover daemon in foreground
                res = failover_daemon.start(detach_process=False)
        except:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.", logging.INFO,
                             False)
                failover_daemon.register_instance(True, False)
                self._report("Failover daemon stopped.", logging.INFO, False)
            except:
                pass

        return res

    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)
        fail_retry = self.options.get('fail_retry', None)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover script is provided, check it else check master
            # using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # If user specified a master fail retry, wait for the
                # predetermined time and attempt to check the master again.
                if fail_retry is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master is still not reachable. Waiting for %s " \
                          "seconds to retry detection." % fail_retry
                    self._report(msg, logging.INFO, False)
                    time.sleep(fail_retry)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is now connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)
                    else:
                        self._report("Master is Ok. Resuming watch.",
                                     logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and automatic "
                                             "failover is not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host, self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.", logging.INFO,
                             False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
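A minimal usage sketch of how this automatic failover loop might be driven, assuming a GTID-enabled topology. The connection dictionaries are abbreviated (real ones also carry credentials), the option keys simply mirror the values the method reads above, and all host and port values are placeholders.

# Hypothetical driver for auto_failover(); all values are placeholders.
master = {'host': 'master-host', 'port': 3306}
slaves = [{'host': 'slave1-host', 'port': 3306},
          {'host': 'slave2-host', 'port': 3306}]
options = {
    'failover_mode': 'auto',   # 'auto', 'elect', or 'fail'
    'pingtime': 3,             # seconds between ping/reconnect attempts
    'timeout': 3,              # seconds to wait before re-checking the master
    'candidates': slaves,      # preferred promotion candidates
    'quiet': False,
    'logging': False,
}
rpl = RplCommands(master, slaves, options)
rpl.auto_failover(15)          # check the topology every 15 seconds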
Example No. 3
0
class RplCommands(object):
    """Replication commands.
    
    This class supports the following replication commands.
    
    elect       - perform best slave election and report best slave
    failover    - conduct failover from master to best slave as specified
                  by the user. This option performs best slave election.
    gtid        - show status of global transaction id variables
    health      - display the replication health 
    reset       - stop and reset all slaves
    start       - start all slaves
    stop        - stop all slaves
    switchover  - perform slave promotion as specified by the user to a
                  specific slave. Requires --master and the --candidate
                  options.
    """
    
    def __init__(self, master_vals, slave_vals, options,
                 skip_conn_err=True):
        """Constructor

        master_vals[in]    master server connection dictionary
        slave_vals[in]     list of slave server connection dictionaries
        options[in]        options dictionary
        skip_conn_err[in]  if True, do not fail on connection failure 
                           Default = True                           
        """
        from mysql.utilities.common.topology import Topology
        
        self.master_vals = master_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.candidates = self.options.get("candidates", None)
        self.topology = Topology(master_vals, slave_vals, self.options,
                                 skip_conn_err)
        
    
    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on
        
        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.
        
        message[in]    message to be printed
        level[in]      level of message to log. Default = INFO
        print_msg[in]  if True, print the message to stdout. Default = True
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print message
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(' '))


    def _show_health(self):
        """Run a command on a list of slaves.
        
        This method will display the replication health of the topology. This
        includes the following for each server.
        
          - host       : host name
          - port       : connection port
          - role       : "MASTER" or "SLAVE"
          - state      : UP = connected, WARN = cannot connect but can ping,
                         DOWN = cannot connect nor ping
          - gtid       : ON = gtid supported and turned on, OFF = supported
                         but not enabled, NO = not supported
          - rpl_health : (master) binlog enabled,
                         (slave) IO thread is running, SQL thread is running,
                         no errors, slave delay < max_delay,
                         read log pos + max_position < master's log position
                         Note: displays 'ERROR' if multiple errors are
                         encountered; otherwise displays the specific health
                         check that failed.
        
        If verbosity is set, it will show the following additional information.
        
          (master)
            - server version, binary log file, position
           
          (slaves)
            - server version, master's binary log file, master's log position,
              IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay,
              IO_Error_Num, IO_Error
        """
        from mysql.utilities.common.format import print_list
        
        format = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()    
    
        if not quiet:
            print "#"
            print "# Replication Topology Health:"
    
        # Print health report
        print_list(sys.stdout, format, cols, rows)
    
        return
    
    
    def _show_gtid_data(self):
        """Display the GTID lists from the servers.
        
        This method displays the three GTID lists for all of the servers. Each
        server is listed with its entries in each list. If a list has no
        entries, that list is not printed.
        """
        from mysql.utilities.common.format import print_list
        
        if not self.topology.gtid_enabled():
            self._report("# WARNING: GTIDs are not supported on this topology.",
                         logging.WARN)
            return
    
        format = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print "#"
            print "# UUIDS for all servers:"
            print_list(sys.stdout, format, ['host','port','role','uuid'], uuids)

        # Get GTID lists    
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print "#"
            print "# Transactions executed on the server:"
            print_list(sys.stdout, format, _GTID_COLS, executed)
        if len(purged):
            print "#"
            print "# Transactions purged from the server:"
            print_list(sys.stdout, format, _GTID_COLS, purged)
        if len(owned):
            print "#"
            print "# Transactions owned by another server:"
            print_list(sys.stdout, format, _GTID_COLS, owned)


    def _switchover(self):
        """Perform switchover from master to candidate slave
        
        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.
        
        Returns bool - True = no errors, False = errors reported.
        """
        from mysql.utilities.exception import FormatError
        from mysql.utilities.common.options import parse_connection
        
        # Check prerequisites - need valid candidate
        candidate = self.options.get("new_master", None)
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)
            
        self._report(" ".join(["# Performing switchover from master at",
                     "%s:%s" % (self.master_vals['host'],
                     self.master_vals['port']), "to slave at %s:%s." %
                     (candidate['host'], candidate['port'])]))
        if not self.topology.switchover(candidate):
            self._report("# Errors found. Switchover aborted.", logging.ERROR)
            return False
        
        return True


    def _elect_slave(self):
        """Perform best slave election
        
        This method determines which slave is the best candidate for
        GTID-enabled failover. If called for a non-GTID topology, a warning
        is issued.
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: slave election requires GTID_MODE=ON "
                         "for all servers.", logging.WARN)
            return
        candidates = self.options.get("candidates", None)
        if candidates is None or len(candidates) == 0:
            self._report("# Electing candidate slave from known slaves.")
        else:
            self._report("# Electing candidate slave from candidate list "
                         "then slaves list.")
        best_slave = self.topology.find_best_slave(candidates)
        if best_slave is None:
            self._report("ERROR: No slave found that meets eligilibility "
                         "requirements.", logging.ERROR)
            return
        
        self._report("# Best slave found is located on %s:%s." %
                     (best_slave['host'], best_slave['port']))


    def _failover(self, strict=False):
        """Perform failover
        
        This method executes GTID-enabled failover. If called for a non-GTID
        topology, a warning is issued.
        
        strict[in]     if True, use only the candidate list for slave
                       election and fail if no candidates are viable.
                       Default = False
                       
        Returns bool - True = failover succeeded, False = errors found
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: slave election requires GTID_MODE=ON "
                         "for all servers.", logging.WARN)
            return
        self._report("# Performing failover.")
        if not self.topology.failover(self.candidates, strict):
            self._report("# Errors found.", logging.ERROR)
            return False
        return True
        

    def execute_command(self, command):
        """Execute a replication admin command
        
        This method executes one of the valid replication administration
        commands as described above.
        
        command[in]        command to execute
    
        Returns bool - True = success, raise error on failure
        """    
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)
            
        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                                    logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")
   
        self._report("Executing %s command..." % command, logging.INFO, False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
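            # A reset requires the slaves to be stopped first.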
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command == 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover()
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)
            
        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()
        
        self._report("# ...done.")
    
        return True


    def auto_failover(self, interval):
        """Automatic failover
        
        This method implements the automatic failover facility. It uses the
        FailoverConsole class from the failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.
        
        When the master goes down, the method can perform one of three actions:
        
        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail
            
        interval[in]   time in seconds to wait to check status of servers
        
        Returns bool - True = success, raises exception on error
        """
        import time
        from mysql.utilities.command.failover_console import FailoverConsole
        from mysql.utilities.common.tools import ping_host
        from mysql.utilities.common.tools import execute_script
        
        failover_mode = self.options.get("failover_mode", "auto")
        pingtime = self.options.get("pingtime", 3)
        timeout = self.options.get("timeout", 3)
        exec_fail = self.options.get("exec_fail", None)
        force = self.options.get("force", False)
        post_fail = self.options.get("post_fail", None)
                
        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], 'failover'),
                                    logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        # Test failover script. If it doesn't exist, fail.
        no_exec_fail_msg = "Failover check script cannot be found. Please " + \
                           "check the path and filename for accuracy and " + \
                           "restart the failover console."
        if exec_fail is not None and not os.path.exists(exec_fail):
            self._report(no_exec_fail_msg, logging.CRITICAL, False)
            raise UtilRplError(no_exec_fail_msg)
               
        # Initialize a console
        console = FailoverConsole(self.topology.master, self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)
        
        # Register instance
        self._report("Registering instance on master.", logging.INFO, False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found for "
                         "master %s:%s." % (self.topology.master.host,
                                            self.topology.master.port),
                         logging.WARN)
            print "Failover mode changed to 'FAIL'. Console will start in 5 seconds."
            time.sleep(5)
        
        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO, False)
        
        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # If a failover check script is provided, run it; otherwise check
            # the master using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.exists(exec_fail):
                    self._report(no_exec_fail_msg, logging.CRITICAL, False)
                    raise UtilRplError(no_exec_fail_msg)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for timeout seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          timeout
                    self._report(msg, logging.INFO, False)
                    time.sleep(timeout)
                    try:
                        self.topology.master.connect()
                    except:
                        self._report("Cannot reconnect to master.",
                                     logging.INFO, False)
                        
                # Check the master again. If the connection is gone or cannot
                # be re-established, try to ping the host; if it is still not
                # alive, initiate failover. This implements the timeout
                # threshold for detecting a down master.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
            
            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % \
                          "Master has failed and automatic failover is not enabled. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False)
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % "An error was encountered " + \
                          "during failover. "
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False)
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False)

            # discover slaves if option was specified at startup
            elif self.options.get("discover", None) is not None \
                and not first_pass:
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        # Unregister instance
        self._report("Unregistering instance on master.", logging.INFO, False)
        console.register_instance(False, False)
        self._report("Failover console stopped.", logging.INFO, False)

        return True
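For the one-shot administrative commands implemented by execute_command(), a hedged usage sketch; connection values are placeholders, and 'new_master' and 'format' are the option keys read by _switchover() and _show_health() above.

# Illustrative only; real connection dictionaries also carry credentials.
options = {'quiet': False, 'logging': False, 'format': 'grid',
           'candidates': None,
           'new_master': {'host': 'slave1-host', 'port': 3306}}
rpl = RplCommands({'host': 'master-host', 'port': 3306},
                  [{'host': 'slave1-host', 'port': 3306}],
                  options)
rpl.execute_command('health')        # print the replication health report
rpl.execute_command('switchover')    # promote the slave given in 'new_master'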
Example No. 4
0
class ReplicationMultiSource(Daemon):
    """Setup replication among a slave and multiple masters.

    This class implements multi-source replication using round-robin
    scheduling to set up replication between all masters and the slave.

    This class also implements a POSIX daemon.
    """
    def __init__(self, slave_vals, masters_vals, options):
        """Constructor.

        slave_vals[in]     Slave server connection dictionary.
        masters_vals[in]   List of master server connection dictionaries.
        options[in]        Options dictionary.
        """
        pidfile = options.get("pidfile", None)
        if pidfile is None:
            pidfile = "./rplms_daemon.pid"
        super(ReplicationMultiSource, self).__init__(pidfile)

        self.slave_vals = slave_vals
        self.masters_vals = masters_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.rpl_user = self.options.get("rpl_user", None)
        self.verbosity = options.get("verbosity", 0)
        self.interval = options.get("interval", 15)
        self.switchover_interval = options.get("switchover_interval", 60)
        self.format = self.options.get("format", False)
        self.topology = None
        self.report_values = [
            report.lower() for report in
            self.options["report_values"].split(",")
        ]

        # A sys.stdout copy, that can be used later to turn on/off stdout
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        self.daemon = options.get("daemon")
        if self.daemon:
            if self.daemon in ("start", "nodetach"):
                self._report("Starting multi-source replication daemon...",
                             logging.INFO, False)
            elif self.daemon == "stop":
                self._report("Stopping multi-source replication daemon...",
                             logging.INFO, False)
            else:
                self._report("Restarting multi-source replication daemon...",
                             logging.INFO, False)

            # Disable stdout
            sys.stdout = self.stdout_devnull
        else:
            self._report("# Starting multi-source replication...",
                         logging.INFO)
            print("# Press CTRL+C to quit.")

        # Check server versions
        try:
            self._check_server_versions()
        except UtilError as err:
            raise UtilRplError(err.errmsg)

        # Check user privileges
        try:
            self._check_privileges()
        except UtilError as err:
            msg = "Error checking user privileges: {0}".format(err.errmsg)
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(err.errmsg)

    @staticmethod
    def _reconnect_server(server, pingtime=3):
        """Tries to reconnect to the server.

        This method tries to reconnect to the server; if the connection fails
        after 3 attempts, it returns False.

        server[in]      Server instance.
        pingtime[in]    Interval between connection attempts.
        """
        if server and server.is_alive():
            return True
        is_connected = False
        i = 0
        while i < 3:
            try:
                server.connect()
                is_connected = True
                break
            except UtilError:
                pass
            time.sleep(pingtime)
            i += 1
        return is_connected

    def _get_slave(self):
        """Get the slave server instance.

        Returns a Server instance of the slave from the replication topology.
        """
        return self.topology.slaves[0]["instance"]

    def _get_master(self):
        """Get the current master server instance.

        Returns a Server instance of the current master from the replication
        topology.
        """
        return self.topology.master

    def _check_server_versions(self):
        """Checks the server versions.
        """
        if self.verbosity > 0:
            print("# Checking server versions.\n#")

        # Connection dictionary
        conn_dict = {
            "conn_info": None,
            "quiet": True,
            "verbose": self.verbosity > 0,
        }

        # Check masters version
        for master_vals in self.masters_vals:
            conn_dict["conn_info"] = master_vals
            master = Master(conn_dict)
            master.connect()
            if not master.check_version_compat(*_MIN_SERVER_VERSION):
                raise UtilRplError(
                    ERROR_MIN_SERVER_VERSIONS.format(
                        utility="mysqlrplms",
                        min_version=".".join([str(val) for val in
                                              _MIN_SERVER_VERSION]),
                        host=master.host,
                        port=master.port
                    )
                )
            master.disconnect()

        # Check slave version
        conn_dict["conn_info"] = self.slave_vals
        slave = Slave(conn_dict)
        slave.connect()
        if not slave.check_version_compat(*_MIN_SERVER_VERSION):
            raise UtilRplError(
                ERROR_MIN_SERVER_VERSIONS.format(
                    utility="mysqlrplms",
                    min_version=".".join([str(val) for val in
                                          _MIN_SERVER_VERSION]),
                    host=slave.host,
                    port=slave.port
                )
            )
        slave.disconnect()

    def _check_privileges(self):
        """Check required privileges to perform the multi-source replication.

        This method checks whether the users for the slave and masters have
        the required privileges to perform the multi-source replication.
        The following privileges are required:
            - on slave: SUPER, SELECT, INSERT, UPDATE, REPLICATION
                        SLAVE AND GRANT OPTION;
            - on the master: SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE
                             AND GRANT OPTION.
        An exception is raised if the users do not have enough privileges.
        """
        if self.verbosity > 0:
            print("# Checking users privileges for replication.\n#")

        # Connection dictionary
        conn_dict = {
            "conn_info": None,
            "quiet": True,
            "verbose": self.verbosity > 0,
        }

        # Check privileges for master.
        master_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',),
                       ('REPLICATION SLAVE',), ('GRANT OPTION',)]
        master_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE "
                           "AND GRANT OPTION")

        for master_vals in self.masters_vals:
            conn_dict["conn_info"] = master_vals
            master = Master(conn_dict)
            master.connect()

            user_obj = User(master, "{0}@{1}".format(master.user, master.host))
            for any_priv_tuple in master_priv:
                has_privilege = any(
                    [user_obj.has_privilege('*', '*', priv)
                        for priv in any_priv_tuple]
                )
                if not has_privilege:
                    msg = ERROR_USER_WITHOUT_PRIVILEGES.format(
                        user=master.user, host=master.host, port=master.port,
                        operation='perform replication',
                        req_privileges=master_priv_str
                    )
                    self._report(msg, logging.CRITICAL, False)
                    raise UtilRplError(msg)
            master.disconnect()

        # Check privileges for slave
        slave_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',),
                      ('REPLICATION SLAVE',), ('GRANT OPTION',)]
        slave_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE "
                          "AND GRANT OPTION")

        conn_dict["conn_info"] = self.slave_vals
        slave = Slave(conn_dict)
        slave.connect()

        user_obj = User(slave, "{0}@{1}".format(slave.user, slave.host))
        for any_priv_tuple in slave_priv:
            has_privilege = any(
                [user_obj.has_privilege('*', '*', priv)
                    for priv in any_priv_tuple]
            )
            if not has_privilege:
                msg = ("User '{0}' on '{1}@{2}' does not have sufficient "
                       "privileges to perform replication (required: {3})."
                       "".format(slave.user, slave.host, slave.port,
                                 slave_priv_str))
                self._report(msg, logging.CRITICAL, False)
                raise UtilRplError(msg)
        slave.disconnect()

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses.

        Returns bool - True = all references are consistent.
        """
        uses_ip = hostname_is_ip(self.topology.master.host)
        slave = self._get_slave()
        host_port = slave.get_master_host_port()
        host = None
        if host_port:
            host = host_port[0]
        if (not host or uses_ip != hostname_is_ip(slave.host) or
           uses_ip != hostname_is_ip(host)):
            return False
        return True

    def _setup_replication(self, master_vals, use_rpl_setup=True):
        """Setup replication among a master and a slave.

        master_vals[in]      Master server connection dictionary.
        use_rpl_setup[in]    Use Replication.setup() if True otherwise use
                             switch_master() on the slave. This is used to
                             control the first pass in the masters round-robin
                             scheduling.
        """
        conn_options = {
            "src_name": "master",
            "dest_name": "slave",
            "version": "5.0.0",
            "unique": True,
        }
        (master, slave,) = connect_servers(master_vals, self.slave_vals,
                                           conn_options)
        rpl_options = self.options.copy()
        rpl_options["verbosity"] = self.verbosity > 0

        # Start from beginning only on the first pass
        if rpl_options.get("from_beginning", False) and not use_rpl_setup:
            rpl_options["from_beginning"] = False

        # Create an instance of the replication object
        rpl = Replication(master, slave, rpl_options)

        if use_rpl_setup:
            # Check server ids
            errors = rpl.check_server_ids()
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Check for server UUID uniqueness
            errors = rpl.check_server_uuids()
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Check InnoDB compatibility
            errors = rpl.check_innodb_compatibility(self.options)
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Checking storage engines
            errors = rpl.check_storage_engines(self.options)
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Check master for binary logging
            errors = rpl.check_master_binlog()
            if errors != []:
                raise UtilRplError(errors[0])

            # Setup replication
            if not rpl.setup(self.rpl_user, 10):
                msg = "Cannot setup replication."
                self._report(msg, logging.CRITICAL, False)
                raise UtilRplError(msg)
        else:
            # Parse user and password (support login-paths)
            (r_user, r_pass,) = parse_user_password(self.rpl_user)

            # Switch master and start slave
            slave.switch_master(master, r_user, r_pass)
            slave.start({'fetch': False})

        # Disconnect from servers
        master.disconnect()
        slave.disconnect()

    def _switch_master(self, master_vals, use_rpl_setup=True):
        """Switches replication to a new master.

        This method stops replication with the old master, if one exists, and
        starts replication with the new one.

        master_vals[in]      Master server connection dictionary.
        use_rpl_setup[in]    Used to control the first pass in the masters
                             round-robin scheduling.
        """
        if self.topology:
            # Stop slave
            master = self._get_master()
            if master.is_alive():
                master.disconnect()
            slave = self._get_slave()
            if not slave.is_alive() and not self._reconnect_server(slave):
                msg = "Failed to connect to the slave."
                self._report(msg, logging.CRITICAL, False)
                raise UtilRplError(msg)
            slave.stop()
            slave.disconnect()

        self._report("# Switching to master '{0}:{1}'."
                     "".format(master_vals["host"],
                               master_vals["port"]), logging.INFO, True)

        try:
            # Setup replication on the new master
            self._setup_replication(master_vals, use_rpl_setup)

            # Create a Topology object
            self.topology = Topology(master_vals, [self.slave_vals],
                                     self.options)
        except UtilError as err:
            msg = "Error while switching master: {0}".format(err.errmsg)
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(err.errmsg)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = ("Topology must support global transaction ids and have "
                   "GTID_MODE=ON.")
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on.

        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]      Message to be printed.
        level[in]        Level of message to log. Default = INFO.
        print_msg[in]    If True, print the message to stdout. Default = True.
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print(message)
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(" "))

    def _format_health_data(self):
        """Return health data from topology.

        Returns tuple - (columns, rows).
        """
        if self.topology:
            try:
                health_data = self.topology.get_health()
                current_master = self._get_master()

                # Get data for the remaining masters
                for master_vals in self.masters_vals:
                    # Discard the current master
                    if master_vals["host"] == current_master.host and \
                       master_vals["port"] == current_master.port:
                        continue

                    # Connect to the master
                    conn_dict = {
                        "conn_info": master_vals,
                        "quiet": True,
                        "verbose": self.verbosity > 0,
                    }
                    master = Master(conn_dict)
                    master.connect()

                    # Get master health
                    rpl_health = master.check_rpl_health()

                    master_data = [
                        master.host,
                        master.port,
                        "MASTER",
                        get_server_state(master, master.host, 3,
                                         self.verbosity > 0),
                        master.supports_gtid(),
                        "OK" if rpl_health[0] else ", ".join(rpl_health[1]),
                    ]

                    # Get master status
                    master_status = master.get_status()
                    if len(master_status):
                        master_log, master_log_pos = master_status[0][0:2]
                    else:
                        master_log = None
                        master_log_pos = 0

                    # Show additional details if verbosity is turned on
                    if self.verbosity > 0:
                        master_data.extend([master.get_version(), master_log,
                                            master_log_pos, "", "", "", "", "",
                                            "", "", "", ""])
                    health_data[1].append(master_data)
                return health_data
            except UtilError as err:
                msg = "Cannot get health data: {0}".format(err)
                self._report(msg, logging.ERROR, False)
                raise UtilRplError(msg)
        return ([], [])

    def _format_uuid_data(self):
        """Return the server's uuids.

        Returns tuple - (columns, rows).
        """
        if self.topology:
            try:
                return (_GEN_UUID_COLS, self.topology.get_server_uuids())
            except UtilError as err:
                msg = "Cannot get UUID data: {0}".format(err)
                self._report(msg, logging.ERROR, False)
                raise UtilRplError(msg)
        return ([], [])

    def _format_gtid_data(self):
        """Return the GTID information from the topology.

        Returns tuple - (columns, rows).
        """
        if self.topology:
            try:
                return (_GEN_GTID_COLS, self.topology.get_gtid_data())
            except UtilError as err:
                msg = "Cannot get GTID data: {0}".format(err)
                self._report(msg, logging.ERROR, False)
                raise UtilRplError(msg)
        return ([], [])

    def _log_data(self, title, labels, data, print_format=True):
        """Helper method to log data.

        title[in]     Title to log.
        labels[in]    List of labels.
        data[in]      List of data rows.
        """
        self._report("# {0}".format(title), logging.INFO)
        for row in data:
            msg = ", ".join(
                ["{0}: {1}".format(*col) for col in zip(labels, row)]
            )
            self._report("# {0}".format(msg), logging.INFO, False)
        if print_format:
            print_list(sys.stdout, self.format, labels, data)

    def _log_master_status(self, master):
        """Logs the master information.

        master[in]    Master server instance.

        This method logs the master information from SHOW MASTER STATUS.
        """
        # If no master present, don't print anything.
        if master is None:
            return

        print("#")
        self._report("# {0}:".format("Current Master Information"),
                     logging.INFO)

        try:
            status = master.get_status()[0]
        except UtilError:
            msg = "Cannot get master status"
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        cols = ("Binary Log File", "Position", "Binlog_Do_DB",
                "Binlog_Ignore_DB")
        rows = (status[0] or "N/A", status[1] or "N/A", status[2] or "N/A",
                status[3] or "N/A")

        print_list(sys.stdout, self.format, cols, [rows])

        self._report("# {0}".format(
            ", ".join(["{0}: {1}".format(*item) for item in zip(cols, rows)]),
        ), logging.INFO, False)

        # Display gtid executed set
        master_gtids = []
        for gtid in status[4].split("\n"):
            if gtid:
                # Add each GTID to a tuple to match the required format to
                # print the full GTID list correctly.
                master_gtids.append((gtid.strip(","),))

        try:
            if len(master_gtids) > 1:
                gtid_executed = "{0}[...]".format(master_gtids[0][0])
            else:
                gtid_executed = master_gtids[0][0]
        except IndexError:
            gtid_executed = "None"

        self._report("# GTID Executed Set: {0}".format(gtid_executed),
                     logging.INFO)

    def stop_replication(self):
        """Stops multi-source replication.

        Stop the slave if topology is available.
        """
        if self.topology:
            # Get the slave instance
            slave = self._get_slave()
            # If slave is not connected, try to reconnect and stop replication
            if self._reconnect_server(slave):
                slave.stop()
                slave.disconnect()
        if self.daemon:
            self._report("Multi-source replication daemon stopped.",
                         logging.INFO, False)
        else:
            print("")
            self._report("# Multi-source replication stopped.",
                         logging.INFO, True)

    def stop(self):
        """Stops the daemon.

        Stop slave if topology is available and then stop the daemon.
        """
        self.stop_replication()
        super(ReplicationMultiSource, self).stop()

    def run(self):
        """Run the multi-source replication using the round-robin scheduling.

        This method implements the multi-source replication by using time
        slices for each master.
        """
        num_masters = len(self.masters_vals)
        use_rpl_setup = True

        while True:
            # Round-robin scheduling on the masters
            for idx in range(num_masters):
                # Get the new master values and switch for the next one
                try:
                    master_vals = self.masters_vals[idx]
                    self._switch_master(master_vals, use_rpl_setup)
                except UtilError as err:
                    msg = ("Error while switching master: {0}"
                           "".format(err.errmsg))
                    self._report(msg, logging.CRITICAL, False)
                    raise UtilRplError(msg)

                # Get the new master and slave instances
                master = self._get_master()
                slave = self._get_slave()

                switchover_timeout = time.time() + self.switchover_interval

                while switchover_timeout > time.time():
                    # If servers not connected, try to reconnect
                    if not self._reconnect_server(master):
                        msg = ("Failed to connect to the master '{0}:{1}'."
                               "".format(master_vals["host"],
                                         master_vals["port"]))
                        self._report(msg, logging.CRITICAL, False)
                        raise UtilRplError(msg)

                    if not self._reconnect_server(slave):
                        msg = "Failed to connect to the slave."
                        self._report(msg, logging.CRITICAL, False)
                        raise UtilRplError(msg)

                    # Report
                    self._log_master_status(master)
                    if "health" in self.report_values:
                        (health_labels, health_data,) = \
                            self._format_health_data()
                        if health_data:
                            print("#")
                            self._log_data("Health Status:", health_labels,
                                           health_data)
                    if "gtid" in self.report_values:
                        (gtid_labels, gtid_data,) = self._format_gtid_data()
                        for i, row in enumerate(gtid_data):
                            if row:
                                print("#")
                                self._log_data("GTID Status - {0}"
                                               "".format(_GTID_LISTS[i]),
                                               gtid_labels, row)

                    if "uuid" in self.report_values:
                        (uuid_labels, uuid_data,) = self._format_uuid_data()
                        if uuid_data:
                            print("#")
                            self._log_data("UUID Status:", uuid_labels,
                                           uuid_data)

                    # Disconnect servers
                    master.disconnect()
                    slave.disconnect()

                    # Wait for reporting interval
                    time.sleep(self.interval)

            # Use Replication.setup() only for the first round
            use_rpl_setup = False
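A hedged sketch of driving the round-robin scheduler above. Option keys mirror those read in __init__(); connection values and the replication account are placeholders, and calling run() directly for the non-daemon case is an assumption.

# Illustrative only; real connection dictionaries also carry credentials.
slave = {'host': 'slave-host', 'port': 3306}
masters = [{'host': 'master1-host', 'port': 3306},
           {'host': 'master2-host', 'port': 3306}]
options = {
    'quiet': False, 'logging': False, 'verbosity': 0,
    'rpl_user': 'rpl_user:rpl_pass',    # parsed by parse_user_password()
    'interval': 15,                     # reporting interval in seconds
    'switchover_interval': 60,          # time slice per master in seconds
    'format': 'grid',
    'report_values': 'health,gtid,uuid',
    'daemon': None,                     # or 'start', 'stop', 'restart', 'nodetach'
    'pidfile': None,
}
# Note: the constructor already connects to the servers to validate versions
# and privileges.
rplms = ReplicationMultiSource(slave, masters, options)
rplms.run()                             # cycles through the masters until stopped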
Example No. 5
0
class ReplicationMultiSource(Daemon):
    """Setup replication among a slave and multiple masters.

    This class implements multi-source replication using round-robin
    scheduling to set up replication between all masters and the slave.

    This class also implements a POSIX daemon.
    """
    def __init__(self, slave_vals, masters_vals, options):
        """Constructor.

        slave_vals[in]     Slave server connection dictionary.
        masters_vals[in]   List of master server connection dictionaries.
        options[in]        Options dictionary.
        """
        pidfile = options.get("pidfile", None)
        if pidfile is None:
            pidfile = "./rplms_daemon.pid"
        super(ReplicationMultiSource, self).__init__(pidfile)

        self.slave_vals = slave_vals
        self.masters_vals = masters_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.rpl_user = self.options.get("rpl_user", None)
        self.verbosity = options.get("verbosity", 0)
        self.interval = options.get("interval", 15)
        self.switchover_interval = options.get("switchover_interval", 60)
        self.format = self.options.get("format", False)
        self.topology = None
        self.report_values = [
            report.lower() for report in
            self.options["report_values"].split(",")
        ]

        # A sys.stdout copy, that can be used later to turn on/off stdout
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        self.daemon = options.get("daemon")
        if self.daemon:
            if self.daemon in ("start", "nodetach"):
                self._report("Starting multi-source replication daemon...",
                             logging.INFO, False)
            elif self.daemon == "stop":
                self._report("Stopping multi-source replication daemon...",
                             logging.INFO, False)
            else:
                self._report("Restarting multi-source replication daemon...",
                             logging.INFO, False)

            # Disable stdout
            sys.stdout = self.stdout_devnull
        else:
            self._report("# Starting multi-source replication...",
                         logging.INFO)
            print("# Press CTRL+C to quit.")

        # Check server versions
        try:
            self._check_server_versions()
        except UtilError as err:
            raise UtilRplError(err.errmsg)

        # Check user privileges
        try:
            self._check_privileges()
        except UtilError as err:
            msg = "Error checking user privileges: {0}".format(err.errmsg)
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(err.errmsg)

    @staticmethod
    def _reconnect_server(server, pingtime=3):
        """Tries to reconnect to the server.

        This method tries to reconnect to the server; if the connection fails
        after 3 attempts, it returns False.

        server[in]      Server instance.
        pingtime[in]    Interval between connection attempts.
        """
        if server and server.is_alive():
            return True
        is_connected = False
        i = 0
        while i < 3:
            try:
                server.connect()
                is_connected = True
                break
            except UtilError:
                pass
            time.sleep(pingtime)
            i += 1
        return is_connected

    def _get_slave(self):
        """Get the slave server instance.

        Returns a Server instance of the slave from the replication topology.
        """
        return self.topology.slaves[0]["instance"]

    def _get_master(self):
        """Get the current master server instance.

        Returns a Server instance of the current master from the replication
        topology.
        """
        return self.topology.master

    def _check_server_versions(self):
        """Checks the server versions.
        """
        if self.verbosity > 0:
            print("# Checking server versions.\n#")

        # Connection dictionary
        conn_dict = {
            "conn_info": None,
            "quiet": True,
            "verbose": self.verbosity > 0,
        }

        # Check masters version
        for master_vals in self.masters_vals:
            conn_dict["conn_info"] = master_vals
            master = Master(conn_dict)
            master.connect()
            if not master.check_version_compat(*_MIN_SERVER_VERSION):
                raise UtilRplError(
                    ERROR_MIN_SERVER_VERSIONS.format(
                        utility="mysqlrplms",
                        min_version=".".join([str(val) for val in
                                              _MIN_SERVER_VERSION]),
                        host=master.host,
                        port=master.port
                    )
                )
            master.disconnect()

        # Check slave version
        conn_dict["conn_info"] = self.slave_vals
        slave = Slave(conn_dict)
        slave.connect()
        if not slave.check_version_compat(*_MIN_SERVER_VERSION):
            raise UtilRplError(
                ERROR_MIN_SERVER_VERSIONS.format(
                    utility="mysqlrplms",
                    min_version=".".join([str(val) for val in
                                          _MIN_SERVER_VERSION]),
                    host=slave.host,
                    port=slave.port
                )
            )
        slave.disconnect()

    def _check_privileges(self):
        """Check required privileges to perform the multi-source replication.

        This method checks whether the users for the slave and masters have
        the required privileges to perform the multi-source replication.
        The following privileges are required:
            - on slave: SUPER, SELECT, INSERT, UPDATE, REPLICATION
                        SLAVE AND GRANT OPTION;
            - on the master: SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE
                             AND GRANT OPTION.
        An exception is raised if the users do not have enough privileges.
        """
        if self.verbosity > 0:
            print("# Checking users privileges for replication.\n#")

        # Connection dictionary
        conn_dict = {
            "conn_info": None,
            "quiet": True,
            "verbose": self.verbosity > 0,
        }

        # Check privileges for master.
        master_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',),
                       ('REPLICATION SLAVE',), ('GRANT OPTION',)]
        master_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE "
                           "AND GRANT OPTION")

        for master_vals in self.masters_vals:
            conn_dict["conn_info"] = master_vals
            master = Master(conn_dict)
            master.connect()

            user_obj = User(master, "{0}@{1}".format(master.user, master.host))
            for any_priv_tuple in master_priv:
                has_privilege = any(
                    [user_obj.has_privilege('*', '*', priv)
                     for priv in any_priv_tuple]
                )
                if not has_privilege:
                    msg = ERROR_USER_WITHOUT_PRIVILEGES.format(
                        user=master.user, host=master.host, port=master.port,
                        operation='perform replication',
                        req_privileges=master_priv_str
                    )
                    self._report(msg, logging.CRITICAL, False)
                    raise UtilRplError(msg)
            master.disconnect()

        # Check privileges for slave
        slave_priv = [('SUPER',), ('SELECT',), ('INSERT',), ('UPDATE',),
                      ('REPLICATION SLAVE',), ('GRANT OPTION',)]
        slave_priv_str = ("SUPER, SELECT, INSERT, UPDATE, REPLICATION SLAVE "
                          "AND GRANT OPTION")

        conn_dict["conn_info"] = self.slave_vals
        slave = Slave(conn_dict)
        slave.connect()

        user_obj = User(slave, "{0}@{1}".format(slave.user, slave.host))
        for any_priv_tuple in slave_priv:
            has_privilege = any(
                [user_obj.has_privilege('*', '*', priv)
                 for priv in any_priv_tuple]
            )
            if not has_privilege:
                msg = ("User '{0}' on '{1}@{2}' does not have sufficient "
                       "privileges to perform replication (required: {3})."
                       "".format(slave.user, slave.host, slave.port,
                                 slave_priv_str))
                self._report(msg, logging.CRITICAL, False)
                raise UtilRplError(msg)
        slave.disconnect()

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses.

        Returns bool - True = all references are consistent.
        """
        uses_ip = hostname_is_ip(self.topology.master.host)
        slave = self._get_slave()
        host_port = slave.get_master_host_port()
        host = None
        if host_port:
            host = host_port[0]
        if (not host or uses_ip != hostname_is_ip(slave.host) or
                uses_ip != hostname_is_ip(host)):
            return False
        return True
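
    # A minimal sketch (hypothetical helper, not the utility's hostname_is_ip)
    # of the consistency rule applied above: every reference in the topology
    # must be either an IP address or a hostname, never a mix of both.
    #
    #   import re
    #   def looks_like_ip(host):
    #       return re.match(r"^\d{1,3}(\.\d{1,3}){3}$", host) is not None
    #   refs = ["192.168.0.10", "db-slave-1"]      # sample references
    #   consistent = len({looks_like_ip(r) for r in refs}) == 1   # False here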

    def _setup_replication(self, master_vals, use_rpl_setup=True):
        """Setup replication among a master and a slave.

        master_vals[in]      Master server connection dictionary.
        use_rpl_setup[in]    Use Replication.setup() if True otherwise use
                             switch_master() on the slave. This is used to
                             control the first pass in the masters round-robin
                             scheduling.
        """
        conn_options = {
            "src_name": "master",
            "dest_name": "slave",
            "version": "5.0.0",
            "unique": True,
        }
        (master, slave,) = connect_servers(master_vals, self.slave_vals,
                                           conn_options)
        rpl_options = self.options.copy()
        rpl_options["verbosity"] = self.verbosity > 0

        # Start from beginning only on the first pass
        if rpl_options.get("from_beginning", False) and not use_rpl_setup:
            rpl_options["from_beginning"] = False

        # Create an instance of the replication object
        rpl = Replication(master, slave, rpl_options)

        if use_rpl_setup:
            # Check server ids
            errors = rpl.check_server_ids()
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Check for server UUID uniqueness
            errors = rpl.check_server_uuids()
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Check InnoDB compatibility
            errors = rpl.check_innodb_compatibility(self.options)
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Checking storage engines
            errors = rpl.check_storage_engines(self.options)
            for error in errors:
                self._report(error, logging.ERROR, True)

            # Check master for binary logging
            errors = rpl.check_master_binlog()
            if errors != []:
                raise UtilRplError(errors[0])

            # Setup replication
            if not rpl.setup(self.rpl_user, 10):
                msg = "Cannot setup replication."
                self._report(msg, logging.CRITICAL, False)
                raise UtilRplError(msg)
        else:
            # Parse user and password (support login-paths)
            try:
                (r_user, r_pass,) = parse_user_password(self.rpl_user)
            except FormatError:
                raise UtilError(USER_PASSWORD_FORMAT.format("--rpl-user"))

            # Switch master and start slave
            slave.switch_master(master, r_user, r_pass)
            slave.start({'fetch': False})

        # Disconnect from servers
        master.disconnect()
        slave.disconnect()
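
    # Usage sketch (hypothetical call sequence): during the first round-robin
    # pass each master goes through the full Replication.setup() checks; on
    # later passes only the lighter switch_master()/start() path runs.
    #
    #   self._setup_replication(masters_vals[0], use_rpl_setup=True)   # first round
    #   self._setup_replication(masters_vals[1], use_rpl_setup=True)
    #   self._setup_replication(masters_vals[0], use_rpl_setup=False)  # later rounds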

    def _switch_master(self, master_vals, use_rpl_setup=True):
        """Switches replication to a new master.

        This method stops replication with the old master if exists and
        starts the replication with a new one.

        master_vals[in]      Master server connection dictionary.
        use_rpl_setup[in]    Used to control the first pass in the masters
                             round-robin scheduling.
        """
        if self.topology:
            # Stop slave
            master = self._get_master()
            if master.is_alive():
                master.disconnect()
            slave = self._get_slave()
            if not slave.is_alive() and not self._reconnect_server(slave):
                msg = "Failed to connect to the slave."
                self._report(msg, logging.CRITICAL, False)
                raise UtilRplError(msg)
            slave.stop()
            slave.disconnect()

        self._report("# Switching to master '{0}:{1}'."
                     "".format(master_vals["host"],
                               master_vals["port"]), logging.INFO, True)

        try:
            # Setup replication on the new master
            self._setup_replication(master_vals, use_rpl_setup)

            # Create a Topology object
            self.topology = Topology(master_vals, [self.slave_vals],
                                     self.options)
        except UtilError as err:
            msg = "Error while switching master: {0}".format(err.errmsg)
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(err.errmsg)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = ("Topology must support global transaction ids and have "
                   "GTID_MODE=ON.")
            self._report(msg, logging.CRITICAL, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on.

        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]      Message to be printed.
        level[in]        Level of message to log. Default = INFO.
        print_msg[in]    If True, print the message to stdout. Default = True.
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print(message)
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(" "))

    def _format_health_data(self):
        """Return health data from topology.

        Returns tuple - (columns, rows).
        """
        if self.topology:
            try:
                health_data = self.topology.get_health()
                current_master = self._get_master()

                # Get data for the remaining masters
                for master_vals in self.masters_vals:
                    # Discard the current master
                    if master_vals["host"] == current_master.host and \
                       master_vals["port"] == current_master.port:
                        continue

                    # Connect to the master
                    conn_dict = {
                        "conn_info": master_vals,
                        "quiet": True,
                        "verbose": self.verbosity > 0,
                    }
                    master = Master(conn_dict)
                    master.connect()

                    # Get master health
                    rpl_health = master.check_rpl_health()

                    master_data = [
                        master.host,
                        master.port,
                        "MASTER",
                        get_server_state(master, master.host, 3,
                                         self.verbosity > 0),
                        master.supports_gtid(),
                        "OK" if rpl_health[0] else ", ".join(rpl_health[1]),
                    ]

                    # Get master status
                    master_status = master.get_status()
                    if len(master_status):
                        master_log, master_log_pos = master_status[0][0:2]
                    else:
                        master_log = None
                        master_log_pos = 0

                    # Show additional details if verbosity is turned on
                    if self.verbosity > 0:
                        master_data.extend([master.get_version(), master_log,
                                            master_log_pos, "", "", "", "", "",
                                            "", "", "", ""])
                    health_data[1].append(master_data)
                return health_data
            except UtilError as err:
                msg = "Cannot get health data: {0}".format(err)
                self._report(msg, logging.ERROR, False)
                raise UtilRplError(msg)
        return ([], [])

    def _format_uuid_data(self):
        """Return the server's uuids.

        Returns tuple - (columns, rows).
        """
        if self.topology:
            try:
                return (_GEN_UUID_COLS, self.topology.get_server_uuids())
            except UtilError as err:
                msg = "Cannot get UUID data: {0}".format(err)
                self._report(msg, logging.ERROR, False)
                raise UtilRplError(msg)
        return ([], [])

    def _format_gtid_data(self):
        """Return the GTID information from the topology.

        Returns tuple - (columns, rows).
        """
        if self.topology:
            try:
                return (_GEN_GTID_COLS, self.topology.get_gtid_data())
            except UtilError as err:
                msg = "Cannot get GTID data: {0}".format(err)
                self._report(msg, logging.ERROR, False)
                raise UtilRplError(msg)
        return ([], [])

    def _log_data(self, title, labels, data, print_format=True):
        """Helper method to log data.

        title[in]           Title to log.
        labels[in]          List of labels.
        data[in]            List of data rows.
        print_format[in]    If True, also print the data using the current
                            format. Default = True.
        """
        self._report("# {0}".format(title), logging.INFO)
        for row in data:
            msg = ", ".join(
                ["{0}: {1}".format(*col) for col in zip(labels, row)]
            )
            self._report("# {0}".format(msg), logging.INFO, False)
        if print_format:
            print_list(sys.stdout, self.format, labels, data)

    def _log_master_status(self, master):
        """Logs the master information.

        master[in]    Master server instance.

        This method logs the master information from SHOW MASTER STATUS.
        """
        # If no master present, don't print anything.
        if master is None:
            return

        print("#")
        self._report("# {0}:".format("Current Master Information"),
                     logging.INFO)

        try:
            status = master.get_status()[0]
        except UtilError:
            msg = "Cannot get master status"
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        cols = ("Binary Log File", "Position", "Binlog_Do_DB",
                "Binlog_Ignore_DB")
        rows = (status[0] or "N/A", status[1] or "N/A", status[2] or "N/A",
                status[3] or "N/A")

        print_list(sys.stdout, self.format, cols, [rows])

        self._report("# {0}".format(
            ", ".join(["{0}: {1}".format(*item) for item in zip(cols, rows)]),
        ), logging.INFO, False)

        # Display gtid executed set
        master_gtids = []
        for gtid in status[4].split("\n"):
            if gtid:
                # Add each GTID to a tuple to match the required format to
                # print the full GTID list correctly.
                master_gtids.append((gtid.strip(","),))

        try:
            if len(master_gtids) > 1:
                gtid_executed = "{0}[...]".format(master_gtids[0][0])
            else:
                gtid_executed = master_gtids[0][0]
        except IndexError:
            gtid_executed = "None"

        self._report("# GTID Executed Set: {0}".format(gtid_executed),
                     logging.INFO)
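
    # Illustration (hypothetical values) of the truncation applied above: when
    # the executed GTID set spans multiple entries, only the first entry is
    # reported, followed by "[...]".
    #
    #   status[4] = "uuid1:1-5,\nuuid2:1-19"       # two executed GTID ranges
    #   -> "# GTID Executed Set: uuid1:1-5[...]"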

    def stop_replication(self):
        """Stops multi-source replication.

        Stop the slave if topology is available.
        """
        if self.topology:
            # Get the slave instance
            slave = self._get_slave()
            # If slave is not connected, try to reconnect and stop replication
            if self._reconnect_server(slave):
                slave.stop()
                slave.disconnect()
        if self.daemon:
            self._report("Multi-source replication daemon stopped.",
                         logging.INFO, False)
        else:
            print("")
            self._report("# Multi-source replication stopped.",
                         logging.INFO, True)

    def stop(self):
        """Stops the daemon.

        Stop slave if topology is available and then stop the daemon.
        """
        self.stop_replication()
        super(ReplicationMultiSource, self).stop()

    def run(self):
        """Run the multi-source replication using the round-robin scheduling.

        This method implements the multi-source replication by using time
        slices for each master.
        """
        num_masters = len(self.masters_vals)
        use_rpl_setup = True

        # pylint: disable=R0101
        while True:
            # Round-robin scheduling on the masters
            for idx in range(num_masters):
                # Get the new master values and switch for the next one
                try:
                    master_vals = self.masters_vals[idx]
                    self._switch_master(master_vals, use_rpl_setup)
                except UtilError as err:
                    msg = ("Error while switching master: {0}"
                           "".format(err.errmsg))
                    self._report(msg, logging.CRITICAL, False)
                    raise UtilRplError(msg)

                # Get the new master and slave instances
                master = self._get_master()
                slave = self._get_slave()

                switchover_timeout = time.time() + self.switchover_interval

                while switchover_timeout > time.time():
                    # If servers not connected, try to reconnect
                    if not self._reconnect_server(master):
                        msg = ("Failed to connect to the master '{0}:{1}'."
                               "".format(master_vals["host"],
                                         master_vals["port"]))
                        self._report(msg, logging.CRITICAL, False)
                        raise UtilRplError(msg)

                    if not self._reconnect_server(slave):
                        msg = "Failed to connect to the slave."
                        self._report(msg, logging.CRITICAL, False)
                        raise UtilRplError(msg)

                    # Report
                    self._log_master_status(master)
                    if "health" in self.report_values:
                        (health_labels, health_data,) = \
                            self._format_health_data()
                        if health_data:
                            print("#")
                            self._log_data("Health Status:", health_labels,
                                           health_data)
                    if "gtid" in self.report_values:
                        (gtid_labels, gtid_data,) = self._format_gtid_data()
                        for i, row in enumerate(gtid_data):
                            if row:
                                print("#")
                                self._log_data("GTID Status - {0}"
                                               "".format(_GTID_LISTS[i]),
                                               gtid_labels, row)

                    if "uuid" in self.report_values:
                        (uuid_labels, uuid_data,) = self._format_uuid_data()
                        if uuid_data:
                            print("#")
                            self._log_data("UUID Status:", uuid_labels,
                                           uuid_data)

                    # Disconnect servers
                    master.disconnect()
                    slave.disconnect()

                    # Wait for reporting interval
                    time.sleep(self.interval)

            # Use Replication.setup() only for the first round
            use_rpl_setup = False
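
# A self-contained sketch (not taken from the utility) of the round-robin
# time-slice scheduling that run() implements above: each master gets a fixed
# slice of time, and within that slice status is reported at a shorter
# interval before switching to the next master.

import time


def round_robin(masters, slice_seconds=6, report_interval=2):
    """Cycle through masters, reporting within each master's time slice."""
    while True:
        for master in masters:
            print("# Switching to master {0}".format(master))
            deadline = time.time() + slice_seconds
            while time.time() < deadline:
                print("#   reporting health for {0}".format(master))
                time.sleep(report_interval)

# round_robin(["m1:3306", "m2:3306"])  # runs until interrupted, like run()
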
Example No. 6
0
class RplCommands(object):
    """Replication commands.

    This class supports the following replication commands.

    elect       - perform best slave election and report best slave
    failover    - conduct failover from master to best slave as specified
                  by the user. This option performs best slave election.
    gtid        - show status of global transaction id variables
    health      - display the replication health
    reset       - stop and reset all slaves
    start       - start all slaves
    stop        - stop all slaves
    switchover  - perform slave promotion as specified by the user to a
                  specific slave. Requires --master and the --candidate
                  options.
    """

    def __init__(self, master_vals, slave_vals, options,
                 skip_conn_err=True):
        """Constructor

        master_vals[in]    master server connection dictionary
        slave_vals[in]     list of slave server connection dictionaries
        options[in]        options dictionary
        skip_conn_err[in]  if True, do not fail on connection failure
                           Default = True
        """
        # A copy of sys.stdout that can be used later to turn stdout on/off
        self.stdout_copy = sys.stdout
        self.stdout_devnull = open(os.devnull, "w")

        # Disable stdout when running --daemon with start, stop or restart
        daemon = options.get("daemon")
        if daemon:
            if daemon in ("start", "nodetach"):
                print("Starting failover daemon...")
            elif daemon == "stop":
                print("Stopping failover daemon...")
            else:
                print("Restarting failover daemon...")
            # Disable stdout if daemon not nodetach
            if daemon != "nodetach":
                sys.stdout = self.stdout_devnull

        self.master = None
        self.master_vals = master_vals
        self.options = options
        self.quiet = self.options.get("quiet", False)
        self.logging = self.options.get("logging", False)
        self.candidates = self.options.get("candidates", None)
        self.verbose = self.options.get("verbose", None)
        self.rpl_user = self.options.get("rpl_user", None)
        self.ssl_ca = options.get("ssl_ca", None)
        self.ssl_cert = options.get("ssl_cert", None)
        self.ssl_key = options.get("ssl_key", None)
        if self.ssl_ca or self.ssl_cert or self.ssl_key:
            self.ssl = True

        try:
            self.topology = Topology(master_vals, slave_vals, self.options,
                                     skip_conn_err)
        except Exception as err:
            if daemon and daemon != "nodetach":
                # Turn on sys.stdout
                sys.stdout = self.stdout_copy
            raise UtilRplError(str(err))

    def _report(self, message, level=logging.INFO, print_msg=True):
        """Log message if logging is on

        This method will log the message presented if the log is turned on.
        Specifically, if options['log_file'] is not None. It will also
        print the message to stdout.

        message[in]    message to be printed
        level[in]      level of message to log. Default = INFO
        print_msg[in]  if True, print the message to stdout. Default = True
        """
        # First, print the message.
        if print_msg and not self.quiet:
            print message
        # Now log message if logging turned on
        if self.logging:
            logging.log(int(level), message.strip("#").strip(' '))

    def _show_health(self):
        """Run a command on a list of slaves.

        This method will display the replication health of the topology. This
        includes the following for each server.

          - host       : host name
          - port       : connection port
          - role       : "MASTER" or "SLAVE"
          - state      : UP = connected, WARN = cannot connect but can ping,
                         DOWN = cannot connect nor ping
          - gtid       : ON = gtid supported and turned on, OFF = supported
                         but not enabled, NO = not supported
          - rpl_health : (master) binlog enabled,
                         (slave) IO thread is running, SQL thread is running,
                         no errors, slave delay < max_delay,
                         read log pos + max_position < master's log position
                         Note: Will show 'ERROR' if there are multiple
                         errors encountered otherwise will display the
                         health check that failed.

        If verbosity is set, it will show the following additional information.

          (master)
            - server version, binary log file, position

          (slaves)
            - server version, master's binary log file, master's log position,
              IO_Thread, SQL_Thread, Secs_Behind, Remaining_Delay,
              IO_Error_Num, IO_Error
        """
        fmt = self.options.get("format", "grid")
        quiet = self.options.get("quiet", False)

        cols, rows = self.topology.get_health()

        if not quiet:
            print "#"
            print "# Replication Topology Health:"

        # Print health report
        print_list(sys.stdout, fmt, cols, rows)

        return

    def _show_gtid_data(self):
        """Display the GTID lists from the servers.

        This method displays the three GTID lists for all of the servers. Each
        server is listed with its entries in each list. If a list has no
        entries, that list is not printed.
        """
        if not self.topology.gtid_enabled():
            self._report("# WARNING: GTIDs are not supported on this "
                         "topology.", logging.WARN)
            return

        fmt = self.options.get("format", "grid")

        # Get UUIDs
        uuids = self.topology.get_server_uuids()
        if len(uuids):
            print "#"
            print "# UUIDS for all servers:"
            print_list(sys.stdout, fmt, ['host', 'port', 'role', 'uuid'],
                       uuids)

        # Get GTID lists
        executed, purged, owned = self.topology.get_gtid_data()
        if len(executed):
            print "#"
            print "# Transactions executed on the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, executed)
        if len(purged):
            print "#"
            print "# Transactions purged from the server:"
            print_list(sys.stdout, fmt, _GTID_COLS, purged)
        if len(owned):
            print "#"
            print "# Transactions owned by another server:"
            print_list(sys.stdout, fmt, _GTID_COLS, owned)

    def _check_host_references(self):
        """Check to see if using all host or all IP addresses

        Returns bool - True = all references are consistent
        """

        uses_ip = hostname_is_ip(self.topology.master.host)
        for slave_dict in self.topology.slaves:
            slave = slave_dict['instance']
            if slave is not None:
                host_port = slave.get_master_host_port()
                host = None
                if host_port:
                    host = host_port[0]
                if (not host or uses_ip != hostname_is_ip(slave.host) or
                   uses_ip != hostname_is_ip(host)):
                    return False
        return True

    def _switchover(self):
        """Perform switchover from master to candidate slave

        This method switches the role of master to a candidate slave. The
        candidate is specified via the --candidate option.

        Returns bool - True = no errors, False = errors reported.
        """
        # Check prerequisites - a valid candidate is required
        candidate = self.options.get("new_master", None)
        if candidate is None:
            msg = "No candidate specified."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check that the new master is not the actual master
        if (self.topology.master.is_alias(candidate['host']) and
                self.master_vals['port'] == candidate['port']):
            err_msg = ERROR_SAME_MASTER.format(candidate['host'],
                                               candidate['port'],
                                               self.master_vals['host'],
                                               self.master_vals['port'])
            self._report(err_msg, logging.WARN)
            self._report(err_msg, logging.CRITICAL)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        # Can only check errant transactions if GTIDs are enabled.
        if self.topology.gtid_enabled():
            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                force = self.options.get('force')
                print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
                self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
                for host, port, tnx_set in errant_tnx:
                    errant_msg = (" - For slave '{0}@{1}': "
                                  "{2}".format(host, port, ", ".join(tnx_set)))
                    print("# {0}".format(errant_msg))
                    self._report(errant_msg, logging.ERROR, False)
                # Raise an exception (to stop) if tolerant mode is OFF
                if not force:
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, although not advised, please "
                                       "use the utility with the --force "
                                       "option.".format(_ERRANT_TNX_ERROR))
        else:
            warn_msg = ("Errant transactions check skipped (GTID not enabled "
                        "for the whole topology).")
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)

        self._report(" ".join(["# Performing switchover from master at",
                     "%s:%s" % (self.master_vals['host'],
                                self.master_vals['port']),
                               "to slave at %s:%s." %
                               (candidate['host'], candidate['port'])]))
        if not self.topology.switchover(candidate):
            self._report("# Errors found. Switchover aborted.", logging.ERROR)
            return False

        return True

    def _elect_slave(self):
        """Perform best slave election

        This method determines which slave is the best candidate for
        GTID-enabled failover. If called for a non-GTID topology, a warning
        is issued.
        """
        if not self.topology.gtid_enabled():
            warn_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# WARNING: {0}".format(warn_msg))
            self._report(warn_msg, logging.WARN, False)
            return

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)

        candidates = self.options.get("candidates", None)
        if candidates is None or len(candidates) == 0:
            self._report("# Electing candidate slave from known slaves.")
        else:
            self._report("# Electing candidate slave from candidate list "
                         "then slaves list.")
        best_slave = self.topology.find_best_slave(candidates)
        if best_slave is None:
            self._report("ERROR: No slave found that meets eligilibility "
                         "requirements.", logging.ERROR)
            return

        self._report("# Best slave found is located on %s:%s." %
                     (best_slave['host'], best_slave['port']))

    def _failover(self, strict=False, options=None):
        """Perform failover

        This method executes GTID-enabled failover. If called for a non-GTID
        topology, a warning is issued.

        strict[in]     if True, use only the candidate list for slave
                       election and fail if no candidates are viable.
                       Default = False
        options[in]    options dictionary.

        Returns bool - True = failover succeeded, False = errors found
        """
        if options is None:
            options = {}
        srv_list = self.topology.get_servers_with_gtid_not_on()
        if srv_list:
            err_msg = _GTID_ON_REQ.format(action='Slave election')
            print("# ERROR: {0}".format(err_msg))
            self._report(err_msg, logging.ERROR, False)
            for srv in srv_list:
                msg = "#  - GTID_MODE={0} on {1}:{2}".format(srv[2], srv[0],
                                                             srv[1])
                self._report(msg, logging.ERROR)

            self._report(err_msg, logging.CRITICAL, False)
            raise UtilRplError(err_msg)

        # Check for --master-info-repository=TABLE if rpl_user is None
        if not self._check_master_info_type():
            return False

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            force = options.get('force')
            print("# ERROR: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.ERROR, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.ERROR, False)
            # Raise an exception (to stop) if tolerant mode is OFF
            if not force:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, although not advised, please use "
                                   "the utility with the --force option."
                                   "".format(_ERRANT_TNX_ERROR))

        self._report("# Performing failover.")
        if not self.topology.failover(self.candidates, strict,
                                      stop_on_error=True):
            self._report("# Errors found.", logging.ERROR)
            return False
        return True

    def _check_master_info_type(self, halt=True):
        """Check for master information set to TABLE if rpl_user not provided

        halt[in]       if True, raise error on failure. Default is True

        Returns bool - True if rpl_user is specified or all slaves use
                       --master-info-repository=TABLE; False otherwise
                       (or an error is raised when halt is True).
        """
        error = "You must specify either the --rpl-user or set all slaves " + \
                "to use --master-info-repository=TABLE."
        # Check for --master-info-repository=TABLE if rpl_user is None
        if self.rpl_user is None:
            if not self.topology.check_master_info_type("TABLE"):
                if halt:
                    raise UtilRplError(error)
                self._report(error, logging.ERROR)
                return False
        return True
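
    # For reference (an assumption about the server-side setting being
    # checked): each slave's repository type can be inspected directly with
    #
    #   SHOW GLOBAL VARIABLES LIKE 'master_info_repository';   -- must be TABLE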

    def check_host_references(self):
        """Public method to access self.check_host_references()
        """
        return self._check_host_references()

    def execute_command(self, command, options=None):
        """Execute a replication admin command

        This method executes one of the valid replication administration
        commands as described above.

        command[in]        command to execute
        options[in]        options dictionary.

        Returns bool - True = success, raise error on failure
        """
        if options is None:
            options = {}
        # Raise error if command is not valid
        if command not in _VALID_COMMANDS:
            msg = "'%s' is not a valid command." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Check privileges
        self._report("# Checking privileges.")
        full_check = command in ['failover', 'elect', 'switchover']
        errors = self.topology.check_privileges(full_check)
        if len(errors):
            msg = "User %s on %s does not have sufficient privileges to " + \
                  "execute the %s command."
            for error in errors:
                self._report(msg % (error[0], error[1], command),
                             logging.CRITICAL)
            raise UtilRplError("Not enough privileges to execute command.")

        self._report("Executing %s command..." % command, logging.INFO, False)

        # Execute the command
        if command in _SLAVE_COMMANDS:
            if command == 'reset':
                self.topology.run_cmd_on_slaves('stop')
            self.topology.run_cmd_on_slaves(command)
        elif command == 'gtid':
            self._show_gtid_data()
        elif command == 'health':
            self._show_health()
        elif command == 'switchover':
            self._switchover()
        elif command == 'elect':
            self._elect_slave()
        elif command == 'failover':
            self._failover(options=options)
        else:
            msg = "Command '%s' is not implemented." % command
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        if command in ['switchover', 'failover'] and \
           not self.options.get("no_health", False):
            self._show_health()

        self._report("# ...done.")

        return True

    def auto_failover(self, interval):
        """Automatic failover

        Wrapper method for running automatic failover. See
        run_auto_failover for details on implementation.

        This method ensures the registration/deregistration occurs
        regardless of exception or errors.

        interval[in]   time in seconds to wait to check status of servers

        Returns bool - True = success, raises exception on error
        """
        failover_mode = self.options.get("failover_mode", "auto")
        force = self.options.get("force", False)

        # Initialize a console
        console = FailoverConsole(self.topology.master,
                                  self.topology.get_health,
                                  self.topology.get_gtid_data,
                                  self.topology.get_server_uuids,
                                  self.options)

        # Check privileges
        self._report("# Checking privileges.")
        errors = self.topology.check_privileges(failover_mode != 'fail')
        if len(errors):
            for error in errors:
                msg = ("User {0} on {1}@{2} does not have sufficient "
                       "privileges to execute the {3} command "
                       "(required: {4}).").format(error[0], error[1], error[2],
                                                  'failover', error[3])
                print("# ERROR: {0}".format(msg))
                self._report(msg, logging.CRITICAL, False)
            raise UtilRplError("Not enough privileges to execute command.")

        # Unregister existing instances from slaves
        self._report("Unregistering existing instances from slaves.",
                     logging.INFO, False)
        console.unregister_slaves(self.topology)

        # Register instance
        self._report("Registering instance on master.", logging.INFO, False)
        old_mode = failover_mode
        failover_mode = console.register_instance(force)
        if failover_mode != old_mode:
            self._report("Multiple instances of failover console found for "
                         "master %s:%s." % (self.topology.master.host,
                                            self.topology.master.port),
                         logging.WARN)
            print "If this is an error, restart the console with --force. "
            print "Failover mode changed to 'FAIL' for this instance. "
            print "Console will start in 10 seconds.",
            sys.stdout.flush()
            i = 0
            while i < 9:
                time.sleep(1)
                sys.stdout.write('.')
                sys.stdout.flush()
                i += 1
            print "starting Console."
            time.sleep(1)

        try:
            res = self.run_auto_failover(console, failover_mode)
        finally:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.", logging.INFO,
                             False)
                console.register_instance(True, False)
                self._report("Failover console stopped.", logging.INFO, False)
            except:
                pass

        return res

    def auto_failover_as_daemon(self):
        """Automatic failover

        Wrapper method for running automatic failover as a daemon.

        This method ensures the registration/deregistration occurs
        regardless of exception or errors.

        Returns bool - True = success, raises exception on error
        """
        # Initialize failover daemon
        failover_daemon = FailoverDaemon(self)
        res = None

        try:
            action = self.options.get("daemon")
            if action == "start":
                res = failover_daemon.start()
            elif action == "stop":
                res = failover_daemon.stop()
            elif action == "restart":
                res = failover_daemon.restart()
            else:
                # Start failover daemon in foreground
                res = failover_daemon.start(detach_process=False)
        except:
            try:
                # Unregister instance
                self._report("Unregistering instance on master.", logging.INFO,
                             False)
                failover_daemon.register_instance(True, False)
                self._report("Failover daemon stopped.", logging.INFO, False)
            except:
                pass

        return res

    def run_auto_failover(self, console, failover_mode="auto"):
        """Run automatic failover

        This method implements the automatic failover facility. It uses the
        FailoverConsole class from failover_console.py to implement all
        user interface commands and uses the existing failover() method of
        this class to conduct failover.

        When the master goes down, the method can perform one of three actions:

        1) failover to list of candidates first then slaves
        2) failover to list of candidates only
        3) fail

        console[in]    instance of the failover console class.

        Returns bool - True = success, raises exception on error
        """
        pingtime = self.options.get("pingtime", 3)
        exec_fail = self.options.get("exec_fail", None)
        post_fail = self.options.get("post_fail", None)
        pedantic = self.options.get('pedantic', False)

        # Only works for GTID_MODE=ON
        if not self.topology.gtid_enabled():
            msg = "Topology must support global transaction ids " + \
                  "and have GTID_MODE=ON."
            self._report(msg, logging.CRITICAL)
            raise UtilRplError(msg)

        # Require --master-info-repository=TABLE for all slaves
        if not self.topology.check_master_info_type("TABLE"):
            msg = "Failover requires --master-info-repository=TABLE for " + \
                  "all slaves."
            self._report(msg, logging.ERROR, False)
            raise UtilRplError(msg)

        # Check for mixing IP and hostnames
        if not self._check_host_references():
            print("# WARNING: {0}".format(HOST_IP_WARNING))
            self._report(HOST_IP_WARNING, logging.WARN, False)
            print("#\n# Failover console will start in {0} seconds.".format(
                WARNING_SLEEP_TIME))
            time.sleep(WARNING_SLEEP_TIME)

        # Check existence of errant transactions on slaves
        errant_tnx = self.topology.find_errant_transactions()
        if errant_tnx:
            print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
            self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
            for host, port, tnx_set in errant_tnx:
                errant_msg = (" - For slave '{0}@{1}': "
                              "{2}".format(host, port, ", ".join(tnx_set)))
                print("# {0}".format(errant_msg))
                self._report(errant_msg, logging.WARN, False)
            # Raise an exception (to stop) if pedantic mode is ON
            if pedantic:
                raise UtilRplError("{0} Note: If you want to ignore this "
                                   "issue, please do not use the --pedantic "
                                   "option.".format(_ERRANT_TNX_ERROR))

        self._report("Failover console started.", logging.INFO, False)
        self._report("Failover mode = %s." % failover_mode, logging.INFO,
                     False)

        # Main loop - loop and fire on interval.
        done = False
        first_pass = True
        failover = False
        while not done:
            # Use try block in case master class has gone away.
            try:
                old_host = self.master.host
                old_port = self.master.port
            except:
                old_host = "UNKNOWN"
                old_port = "UNKNOWN"

            # If a failover check script is provided, run it; otherwise check
            # the master using connectivity checks.
            if exec_fail is not None:
                # Execute failover check script
                if not os.path.isfile(exec_fail):
                    message = EXTERNAL_SCRIPT_DOES_NOT_EXIST.format(
                        path=exec_fail)
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                elif not os.access(exec_fail, os.X_OK):
                    message = INSUFFICIENT_FILE_PERMISSIONS.format(
                        path=exec_fail, permissions='execute')
                    self._report(message, logging.CRITICAL, False)
                    raise UtilRplError(message)
                else:
                    self._report("# Spawning external script for failover "
                                 "checking.")
                    res = execute_script(exec_fail, None,
                                         [old_host, old_port], self.verbose)
                    if res == 0:
                        self._report("# Failover check script completed Ok. "
                                     "Failover averted.")
                    else:
                        self._report("# Failover check script failed. "
                                     "Failover initiated", logging.WARN)
                        failover = True
            else:
                # Check the master. If not alive, wait for pingtime seconds
                # and try again.
                if self.topology.master is not None and \
                   not self.topology.master.is_alive():
                    msg = "Master may be down. Waiting for %s seconds." % \
                          pingtime
                    self._report(msg, logging.INFO, False)
                    time.sleep(pingtime)
                    try:
                        self.topology.master.connect()
                    except:
                        pass

                # Check the master again. If no connection or lost connection,
                # try ping. This performs the timeout threshold for detecting
                # a down master. If still not alive, try to reconnect and if
                # connection fails after 3 attempts, failover.
                if self.topology.master is None or \
                   not ping_host(self.topology.master.host, pingtime) or \
                   not self.topology.master.is_alive():
                    failover = True
                    i = 0
                    while i < 3:
                        try:
                            self.topology.master.connect()
                            failover = False  # Master is now connected again
                            break
                        except:
                            pass
                        time.sleep(pingtime)
                        i += 1

                    if failover:
                        self._report("Failed to reconnect to the master after "
                                     "3 attemps.", logging.INFO)

            if failover:
                self._report("Master is confirmed to be down or unreachable.",
                             logging.CRITICAL, False)
                try:
                    self.topology.master.disconnect()
                except:
                    pass
                console.clear()
                if failover_mode == 'auto':
                    self._report("Failover starting in 'auto' mode...")
                    res = self.topology.failover(self.candidates, False)
                elif failover_mode == 'elect':
                    self._report("Failover starting in 'elect' mode...")
                    res = self.topology.failover(self.candidates, True)
                else:
                    msg = _FAILOVER_ERROR % ("Master has failed and automatic "
                                             "failover is not enabled. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg, _FAILOVER_ERRNO)
                if not res:
                    msg = _FAILOVER_ERROR % ("An error was encountered "
                                             "during failover. ")
                    self._report(msg, logging.CRITICAL, False)
                    # Execute post failover script
                    self.topology.run_script(post_fail, False,
                                             [old_host, old_port])
                    raise UtilRplError(msg)
                self.master = self.topology.master
                console.master = self.master
                self.topology.remove_discovered_slaves()
                self.topology.discover_slaves()
                console.list_data = None
                print "\nFailover console will restart in 5 seconds."
                time.sleep(5)
                console.clear()
                failover = False
                # Execute post failover script
                self.topology.run_script(post_fail, False,
                                         [old_host, old_port,
                                          self.master.host, self.master.port])

                # Unregister existing instances from slaves
                self._report("Unregistering existing instances from slaves.",
                             logging.INFO, False)
                console.unregister_slaves(self.topology)

                # Register instance on the new master
                self._report("Registering instance on master.", logging.INFO,
                             False)
                failover_mode = console.register_instance()

            # discover slaves if option was specified at startup
            elif (self.options.get("discover", None) is not None
                  and not first_pass):
                # Force refresh of health list if new slaves found
                if self.topology.discover_slaves():
                    console.list_data = None

            # Check existence of errant transactions on slaves
            errant_tnx = self.topology.find_errant_transactions()
            if errant_tnx:
                if pedantic:
                    print("# WARNING: {0}".format(_ERRANT_TNX_ERROR))
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        print("# {0}".format(errant_msg))
                        self._report(errant_msg, logging.WARN, False)

                    # Raise an exception (to stop) if pedantic mode is ON
                    raise UtilRplError("{0} Note: If you want to ignore this "
                                       "issue, please do not use the "
                                       "--pedantic "
                                       "option.".format(_ERRANT_TNX_ERROR))
                else:
                    if self.logging:
                        warn_msg = ("{0} Check log for more "
                                    "details.".format(_ERRANT_TNX_ERROR))
                    else:
                        warn_msg = _ERRANT_TNX_ERROR
                    console.add_warning('errant_tnx', warn_msg)
                    self._report(_ERRANT_TNX_ERROR, logging.WARN, False)
                    for host, port, tnx_set in errant_tnx:
                        errant_msg = (" - For slave '{0}@{1}': "
                                      "{2}".format(host, port,
                                                   ", ".join(tnx_set)))
                        self._report(errant_msg, logging.WARN, False)
            else:
                console.del_warning('errant_tnx')

            res = console.display_console()
            if res is not None:    # None = normal timeout, keep going
                if not res:
                    return False   # Errors detected
                done = True        # User has quit
            first_pass = False

        return True
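
# A minimal standalone sketch (assumed behavior, not the utility's own code) of
# the down-master detection used in run_auto_failover() above: when the master
# appears dead, retry the connection a few times with `pingtime` seconds
# between attempts before declaring that failover is needed.

import time


def master_needs_failover(master, pingtime=3, attempts=3):
    """Return True if the master stayed unreachable after retrying."""
    if master.is_alive():
        return False
    for _ in range(attempts):
        try:
            master.connect()
            return False              # reconnected, failover averted
        except Exception:
            time.sleep(pingtime)      # wait before the next attempt
    return True                       # still unreachable -> fail over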